From ed8961b30976be8c5fc117d0b748a432e094f72d Mon Sep 17 00:00:00 2001 From: Austin Shafer Date: Thu, 1 Feb 2024 17:55:04 -0500 Subject: [PATCH 1/5] backend/drm: always create multigpu renderers Multi-gpu code needs a context for each GPU in the system, but as of now we only create renderers for secondary devices. This always creates the multigpu renderer so we can use it for copying. --- backend/drm/backend.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/backend/drm/backend.c b/backend/drm/backend.c index 27e5585dc..848f1046e 100644 --- a/backend/drm/backend.c +++ b/backend/drm/backend.c @@ -53,9 +53,7 @@ static void backend_destroy(struct wlr_backend *backend) { wl_list_remove(&drm->dev_change.link); wl_list_remove(&drm->dev_remove.link); - if (drm->parent) { - finish_drm_renderer(&drm->mgpu_renderer); - } + finish_drm_renderer(&drm->mgpu_renderer); finish_drm_resources(drm); @@ -210,22 +208,20 @@ struct wlr_backend *wlr_drm_backend_create(struct wlr_session *session, goto error_event; } - if (drm->parent) { - if (!init_drm_renderer(drm, &drm->mgpu_renderer)) { - wlr_log(WLR_ERROR, "Failed to initialize renderer"); - goto error_resources; - } - - // We'll perform a multi-GPU copy for all submitted buffers, we need - // to be able to texture from them - struct wlr_renderer *renderer = drm->mgpu_renderer.wlr_rend; - const struct wlr_drm_format_set *texture_formats = - wlr_renderer_get_dmabuf_texture_formats(renderer); - if (texture_formats == NULL) { - wlr_log(WLR_ERROR, "Failed to query renderer texture formats"); - goto error_mgpu_renderer; - } + if (!init_drm_renderer(drm, &drm->mgpu_renderer)) { + wlr_log(WLR_ERROR, "Failed to initialize renderer"); + goto error_resources; + } + // We'll perform a multi-GPU copy for all submitted buffers, we need + // to be able to texture from them + struct wlr_renderer *renderer = drm->mgpu_renderer.wlr_rend; + const struct wlr_drm_format_set *texture_formats = + 
wlr_renderer_get_dmabuf_texture_formats(renderer); + // Some configurations (alpine CI job) will have a renderer here that does not + // support dmabuf formats. We don't want to fail creation of the drm backend + // as a result of this, we simply don't populate the format set in that case. + if (texture_formats) { // Forbid implicit modifiers, because their meaning changes from one // GPU to another. for (size_t i = 0; i < texture_formats->len; i++) { @@ -245,8 +241,6 @@ struct wlr_backend *wlr_drm_backend_create(struct wlr_session *session, return &drm->backend; -error_mgpu_renderer: - finish_drm_renderer(&drm->mgpu_renderer); error_resources: finish_drm_resources(drm); error_event: From 5a38cc57c821748a84e9299cca896e66ff97ca89 Mon Sep 17 00:00:00 2001 From: Austin Shafer Date: Wed, 27 Jul 2022 16:07:21 -0400 Subject: [PATCH 2/5] Add new struct wlr_multi_gpu For development to continue on systems (such as optimus laptops) that have multiple GPUs, we need a way to reference all of the renderers that have been created for devices in the system. A wlr_multi_gpu struct holds renderers coming from two different sources: 1. the primary renderer (given to us by the compositor) 2. drm sub backends, each of which has a renderer created for cross-GPU copies. This change provides a way to access all of these from the same place. 
--- backend/drm/renderer.c | 1 + backend/multi/backend.c | 104 ++++++++++++++++++++++++++++++ include/backend/drm/drm.h | 1 + include/backend/multi.h | 21 ++++++ include/wlr/backend/multi.h | 3 + include/wlr/render/wlr_renderer.h | 8 +-- render/wlr_renderer.c | 8 +++ 7 files changed, 142 insertions(+), 4 deletions(-) diff --git a/backend/drm/renderer.c b/backend/drm/renderer.c index e4aadc106..af62e8191 100644 --- a/backend/drm/renderer.c +++ b/backend/drm/renderer.c @@ -3,6 +3,7 @@ #include #include #include +#include "backend/backend.h" #include "backend/drm/drm.h" #include "backend/drm/fb.h" #include "backend/drm/renderer.h" diff --git a/backend/multi/backend.c b/backend/multi/backend.c index 740e1d6fa..caa6089b1 100644 --- a/backend/multi/backend.c +++ b/backend/multi/backend.c @@ -2,11 +2,15 @@ #include #include #include +#include +#include #include #include #include +#include "render/wlr_renderer.h" #include "backend/backend.h" #include "backend/multi.h" +#include "render/allocator/allocator.h" struct subbackend_state { struct wlr_backend *backend; @@ -58,6 +62,7 @@ static void multi_backend_destroy(struct wlr_backend *wlr_backend) { wl_container_of(backend->backends.next, sub, link); wlr_backend_destroy(sub->backend); } + wlr_multi_gpu_destroy(backend->multi_gpu); free(backend); } @@ -118,6 +123,7 @@ struct wlr_backend *wlr_multi_backend_create(struct wl_event_loop *loop) { } wl_list_init(&backend->backends); + backend->multi_gpu = wlr_multi_gpu_create(); wlr_backend_init(&backend->backend, &backend_impl); wl_signal_init(&backend->events.backend_add); @@ -225,3 +231,101 @@ void wlr_multi_for_each_backend(struct wlr_backend *_backend, callback(sub->backend, data); } } + +/* + * Create a wlr_multi_gpu struct and populate it with a renderer and allocator for each + * device in the system. This is done by finding all DRM nodes using drmGetDevices2. 
+ */ +struct wlr_multi_gpu *wlr_multi_gpu_create(void) { + int flags = 0; + struct wlr_multi_gpu *multi_gpu = NULL; + int devices_len = drmGetDevices2(flags, NULL, 0); + + if (devices_len < 0) { + wlr_log(WLR_ERROR, "drmGetDevices2 failed: %s", strerror(-devices_len)); + return NULL; + } + drmDevice **devices = calloc(devices_len, sizeof(*devices)); + if (devices == NULL) { + wlr_log_errno(WLR_ERROR, "Allocation failed"); + goto out; + } + devices_len = drmGetDevices2(flags, devices, devices_len); + if (devices_len < 0) { + wlr_log(WLR_ERROR, "drmGetDevices2 failed: %s", strerror(-devices_len)); + goto out; + } + + multi_gpu = calloc(1, sizeof(struct wlr_multi_gpu)); + if (!multi_gpu) { + goto out; + } + wl_list_init(&multi_gpu->devices); + + for (int i = 0; i < devices_len; i++) { + drmDevice *dev = devices[i]; + if (dev->available_nodes & (1 << DRM_NODE_RENDER)) { + const char *name = dev->nodes[DRM_NODE_RENDER]; + wlr_log(WLR_DEBUG, "Opening DRM render node '%s'", name); + int fd = open(name, O_RDWR | O_CLOEXEC); + if (fd < 0) { + wlr_log_errno(WLR_ERROR, "Failed to open '%s'", name); + goto out; + } + + // Create a renderer/allocator and add it as a new device + struct wlr_renderer *renderer = renderer_autocreate_with_drm_fd(fd); + if (!renderer) { + wlr_log(WLR_ERROR, "Failed to create multi-GPU renderer"); + goto fail; + } + + struct wlr_allocator *allocator = + allocator_autocreate_with_drm_fd(WLR_BUFFER_CAP_DMABUF, renderer, fd); + if (!allocator) { + wlr_log(WLR_ERROR, "Failed to create multi-GPU allocator"); + wlr_renderer_destroy(renderer); + goto fail; + } + + struct wlr_multi_gpu_device *device = calloc(1, sizeof(struct wlr_multi_gpu_device)); + if (!device) { + wlr_allocator_destroy(allocator); + wlr_renderer_destroy(renderer); + goto fail; + } + wl_list_insert(&multi_gpu->devices, &device->link); + device->renderer = renderer; + device->allocator = allocator; + } + } + + goto out; + +fail: + wlr_multi_gpu_destroy(multi_gpu); + multi_gpu = NULL; + 
+out: + for (int i = 0; i < devices_len; i++) { + drmFreeDevice(&devices[i]); + } + if (devices) { + free(devices); + } + + return multi_gpu; +} + +void wlr_multi_gpu_destroy(struct wlr_multi_gpu *multi_gpu) { + struct wlr_multi_gpu_device *device; + // Remove and destroy all devices + wl_list_for_each(device, &multi_gpu->devices, link) { + wlr_allocator_destroy(device->allocator); + wlr_renderer_destroy(device->renderer); + wl_list_remove(&device->link); + free(device); + } + + free(multi_gpu); +} diff --git a/include/backend/drm/drm.h b/include/backend/drm/drm.h index cea53f441..30486beb8 100644 --- a/include/backend/drm/drm.h +++ b/include/backend/drm/drm.h @@ -107,6 +107,7 @@ struct wlr_drm_backend { /* Only initialized on multi-GPU setups */ struct wlr_drm_renderer mgpu_renderer; + struct wlr_multi_gpu *multi_gpu; struct wlr_session *session; diff --git a/include/backend/multi.h b/include/backend/multi.h index 3ffd81406..993ccbe4b 100644 --- a/include/backend/multi.h +++ b/include/backend/multi.h @@ -4,10 +4,31 @@ #include #include #include +#include +#include + +struct wlr_multi_gpu_device { + struct wlr_renderer *renderer; + struct wlr_allocator *allocator; + struct wl_list link; +}; + +/* + * Helper struct for tracking multiple renderers. This solves the + * problem of us having many renderers (primary, plus individual + * secondary GPU drm renderers) but not tracking them in one location. + * We can use this struct to access renderers for each GPU in + * the system all from one place. Will be populated by the renderer + * the compositor makes, plus every time a drm mgpu renderer is made. 
+ */ +struct wlr_multi_gpu { + struct wl_list devices; +}; struct wlr_multi_backend { struct wlr_backend backend; + struct wlr_multi_gpu *multi_gpu; struct wl_list backends; struct wl_listener event_loop_destroy; diff --git a/include/wlr/backend/multi.h b/include/wlr/backend/multi.h index c4322d98b..8ae5e4763 100644 --- a/include/wlr/backend/multi.h +++ b/include/wlr/backend/multi.h @@ -32,4 +32,7 @@ bool wlr_multi_is_empty(struct wlr_backend *backend); void wlr_multi_for_each_backend(struct wlr_backend *backend, void (*callback)(struct wlr_backend *backend, void *data), void *data); +struct wlr_multi_gpu *wlr_multi_gpu_create(void); +void wlr_multi_gpu_destroy(struct wlr_multi_gpu *multi_gpu); + #endif diff --git a/include/wlr/render/wlr_renderer.h b/include/wlr/render/wlr_renderer.h index 08333a529..5a31ca7bb 100644 --- a/include/wlr/render/wlr_renderer.h +++ b/include/wlr/render/wlr_renderer.h @@ -14,13 +14,10 @@ #include #include #include +#include -struct wlr_backend; struct wlr_renderer_impl; -struct wlr_drm_format_set; struct wlr_buffer; -struct wlr_box; -struct wlr_fbox; /** * A renderer for basic 2D operations. @@ -39,6 +36,9 @@ struct wlr_renderer { // private state const struct wlr_renderer_impl *impl; + + /* The GPU list we are a part of, may be null if not created from multi backend */ + struct wlr_multi_gpu *multi_gpu; }; /** diff --git a/render/wlr_renderer.c b/render/wlr_renderer.c index 513fecbd7..d20366a95 100644 --- a/render/wlr_renderer.c +++ b/render/wlr_renderer.c @@ -25,6 +25,7 @@ #endif // WLR_HAS_VULKAN_RENDERER #include "backend/backend.h" +#include "backend/multi.h" #include "render/pixel_format.h" #include "render/wlr_renderer.h" #include "util/env.h" @@ -285,6 +286,13 @@ out: if (own_drm_fd && drm_fd >= 0) { close(drm_fd); } + // If we have a multi GPU environment, then track this renderer + // for cross-GPU imports. 
+ if (renderer && backend && wlr_backend_is_multi(backend)) { + struct wlr_multi_backend *multi = (struct wlr_multi_backend *)backend; + renderer->multi_gpu = multi->multi_gpu; + } + return renderer; } From a446e1801f8373e98556f76a902c0b190f77ad79 Mon Sep 17 00:00:00 2001 From: Austin Shafer Date: Wed, 27 Jul 2022 09:46:02 -0400 Subject: [PATCH 3/5] Add wlr_texture_set When performing cross-GPU operations we need to keep track of where a buffer has been imported and where it has been copied. This is what a texture set does. It keeps a dictionary of renderers and if a wlr_texture has been created for them. The user can request a texture for a particular renderer, and it will perform any blits needed and cache the result. --- include/wlr/render/wlr_texture.h | 120 ++++++++++ render/wlr_texture.c | 386 +++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+) diff --git a/include/wlr/render/wlr_texture.h b/include/wlr/render/wlr_texture.h index 1e352c6e6..af243fb79 100644 --- a/include/wlr/render/wlr_texture.h +++ b/include/wlr/render/wlr_texture.h @@ -18,6 +18,7 @@ struct wlr_buffer; struct wlr_renderer; struct wlr_texture_impl; +struct wlr_multi_gpu; struct wlr_texture { const struct wlr_texture_impl *impl; @@ -82,4 +83,123 @@ void wlr_texture_destroy(struct wlr_texture *texture); struct wlr_texture *wlr_texture_from_buffer(struct wlr_renderer *renderer, struct wlr_buffer *buffer); +struct wlr_texture_renderer_pair { + struct wlr_renderer *renderer; + struct wlr_texture *texture; + struct wlr_allocator *allocator; +}; + +/** + * The texture set provides a mapping between renderers and the texture + * imported into them. You can use it to query a texture for a particular + * renderer and it will handle importing and any blitting that needs to + * take place. 
+ */ +struct wlr_texture_set { + /* The buffer this texture set was made from */ + struct wlr_buffer *buffer; + struct wl_listener buffer_release; + + /** + * Index into pairings of the device that this texture directly + * imports into. This texture is "native" to that device, and + * will have to be blitted to other gpus. + * + * This will be -1 if no buffer has been imported yet. + */ + int32_t native_pair; + struct wlr_multi_gpu *multi_gpu; + /* + * This will cache the result of creating a linear-layout version of + * this texture on the native device. This can then be imported into + * the other GPUs. + */ + uint32_t format; + void *pixel_data; + + uint32_t width; + uint32_t height; + + /* This is the size of the pairings array */ + int pairing_count; + struct wlr_texture_renderer_pair *pairings; +}; + +/** + * Create an empty texture set. When setting up our wlr_multi_gpu struct we put + * all renderers into a list. This lets us iterate them from here. If this + * request is made on a renderer not in the multi-GPU set, then the list will + * be of length 1, and the renderer will be the only entry in the set. + * + * A buffer must be imported for this set to be used. + */ +struct wlr_texture_set *wlr_texture_set_create(struct wlr_renderer *renderer, + struct wlr_allocator *allocator); + +/** + * Add a renderer to the set. This adds an entry to the set tracking this renderer + * in the set's internal list. No texture is created for this renderer. + */ +void wlr_texture_set_add_renderer(struct wlr_texture_set *set, struct wlr_renderer *renderer, + struct wlr_allocator *allocator); + +/* + * Imports a buffer into the texture set. This initializes the native_pair + * internal state and returns true if the buffer was imported on at least one + * of the renderers in the set. + * + * This should only be called once per texture set initialization. 
+ */ +bool wlr_texture_set_import_buffer(struct wlr_texture_set *set, struct wlr_buffer *buffer); + +/** + * Create a new texture set from a DMA-BUF. The returned texture is immutable. + * The dmabuf will be imported on only one of the mgpu renderers in the system, + * no copies will be made. Returns NULL if the dmabuf could not be imported into + * any renderer. + */ +struct wlr_texture_set *wlr_texture_set_from_dmabuf(struct wlr_renderer *renderer, + struct wlr_dmabuf_attributes *attribs); + +/** + * Create a new texture set from a buffer. + */ +struct wlr_texture_set *wlr_texture_set_from_buffer(struct wlr_renderer *renderer, + struct wlr_buffer *buffer); + +/** + * Request a wlr_texture for this resource that is compatible with the given + * renderer. This allows for on-demand cross-GPU blits in multi-GPU setups. + * The texture will have been imported into the renderer that corresponds to + * its native device. If a texture is requeseted with a different renderer, + * this function will perform a blit and return the appropriate texture. + * + * Textures are cached, so if multiple requests with a non-native renderer + * are made there will be only one blit. + */ +struct wlr_texture *wlr_texture_set_get_tex_for_renderer(struct wlr_texture_set *set, + struct wlr_renderer *renderer); + +/** + * Get the wlr_texture corresponding to the texture's local GPU. This is the GPU it + * is directly importable into. + */ +struct wlr_texture *wlr_texture_set_get_native_texture(struct wlr_texture_set *set); + +/** + * Get the linear pixel data for the backing texture. + */ +void *wlr_texture_set_get_linear_data(struct wlr_texture_set *set); + +/** + * Update all textures in a set with the contents of the next buffer. This will call + * wlr_texture_update_from_buffer for each texture in the set. 
+ */ +bool wlr_texture_set_update_from_buffer(struct wlr_texture_set *set, + struct wlr_buffer *next, const pixman_region32_t *damage); + +/** + * Destroys the texture set and all textures held inside it. + */ +void wlr_texture_set_destroy(struct wlr_texture_set *set); #endif diff --git a/render/wlr_texture.c b/render/wlr_texture.c index 3526ee140..1b3339edf 100644 --- a/render/wlr_texture.c +++ b/render/wlr_texture.c @@ -3,10 +3,17 @@ #include #include #include +#include #include #include #include "render/pixel_format.h" +#include +#include #include "types/wlr_buffer.h" +#include "backend/multi.h" +#include "backend/drm/drm.h" +#include "render/drm_format_set.h" +#include "render/wlr_renderer.h" void wlr_texture_init(struct wlr_texture *texture, struct wlr_renderer *renderer, const struct wlr_texture_impl *impl, uint32_t width, uint32_t height) { @@ -116,6 +123,26 @@ struct wlr_texture *wlr_texture_from_buffer(struct wlr_renderer *renderer, if (!renderer->impl->texture_from_buffer) { return NULL; } + + struct wlr_dmabuf_attributes dmabuf; + /* + * If this is a dmabuf backed buffer then get the format/modifier for it and + * compare it with the set supported by the renderer + */ + if (wlr_buffer_get_dmabuf(buffer, &dmabuf)) { + const struct wlr_drm_format_set *formats = wlr_renderer_get_dmabuf_texture_formats(renderer); + if (!formats) { + wlr_log(WLR_DEBUG, "Could not get DRM format set for renderer"); + return NULL; + } + + if (!wlr_drm_format_set_has(formats, dmabuf.format, dmabuf.modifier)) { + wlr_log(WLR_DEBUG, "Renderer could not import buffer with format 0x%x and modifier 0x%lx", + dmabuf.format, dmabuf.modifier); + return NULL; + } + } + return renderer->impl->texture_from_buffer(renderer, buffer); } @@ -135,3 +162,362 @@ bool wlr_texture_update_from_buffer(struct wlr_texture *texture, } return texture->impl->update_from_buffer(texture, buffer, damage); } + +struct wlr_texture_set *wlr_texture_set_from_dmabuf(struct wlr_renderer *renderer, + struct 
wlr_dmabuf_attributes *attribs) { + struct wlr_dmabuf_buffer *buffer = dmabuf_buffer_create(attribs); + if (buffer == NULL) { + return NULL; + } + + struct wlr_texture_set *set = + wlr_texture_set_from_buffer(renderer, &buffer->base); + + // By this point, the renderer should have locked the buffer if it still + // needs to access it in the future. + dmabuf_buffer_drop(buffer); + + return set; +} + +static void texture_set_handle_buffer_release(struct wl_listener *listener, void *data) { + struct wlr_texture_set *set = wl_container_of(listener, set, buffer_release); + set->buffer = NULL; + wl_list_remove(&set->buffer_release.link); +} + +static void wlr_texture_set_add_pair(struct wlr_texture_set *set, struct wlr_renderer *renderer, + struct wlr_allocator *allocator) { + + set->pairings = realloc(set->pairings, + sizeof(struct wlr_texture_renderer_pair) * (set->pairing_count + 1)); + if (!set->pairings) { + return; + } + + memset(&set->pairings[set->pairing_count], 0, sizeof(struct wlr_texture_renderer_pair)); + set->pairings[set->pairing_count].renderer = renderer; + set->pairings[set->pairing_count].allocator = allocator; + set->pairing_count++; +} + +void wlr_texture_set_add_renderer(struct wlr_texture_set *set, struct wlr_renderer *renderer, + struct wlr_allocator *allocator) { + if (!renderer) { + return; + } + + wlr_texture_set_add_pair(set, renderer, allocator); + + if (renderer->multi_gpu) { + set->multi_gpu = renderer->multi_gpu; + /* Now add each mgpu renderer to the set */ + struct wlr_multi_gpu_device *device; + wl_list_for_each(device, &renderer->multi_gpu->devices, link) { + wlr_texture_set_add_pair(set, device->renderer, device->allocator); + } + } +} + +/* + * When setting up our wlr_multi_gpu struct we put all renderers into a list. This lets us + * iterate them from here. If this request is made on a renderer not in the multi-GPU set, + * then the list will be of length 1, and the renderer will be the only entry in the set. 
+ */ +struct wlr_texture_set *wlr_texture_set_create(struct wlr_renderer *renderer, + struct wlr_allocator *allocator) { + struct wlr_texture_set *set = calloc(1, sizeof(struct wlr_texture_set)); + if (!set) { + return NULL; + } + set->native_pair = -1; + + wlr_texture_set_add_renderer(set, renderer, allocator); + + return set; +} + +/* + * Helper for importing a buffer into the texture set. This initializes + * the native_pair internal state. + */ +bool wlr_texture_set_import_buffer(struct wlr_texture_set *set, struct wlr_buffer *buffer) { + set->buffer = buffer; + // Don't lock our buffer since it gets in the way of releasing shm buffers immediately + // Instead keep a reference to the buffer but register a handler to notify us when + // it is released and clear the pointer. + set->buffer_release.notify = texture_set_handle_buffer_release; + wl_signal_add(&set->buffer->events.release, &set->buffer_release); + + buffer = wlr_buffer_lock(buffer); + bool ret = false; + + /* + * For each renderer, try to create a texture. Go in order, since the first + * entry is always the "primary" renderer that the user created this texture set with. + * The odds are highest that it is importable into that renderer, so start with that + * one. 
+ */ + for (int i = 0; i < set->pairing_count; i++) { + assert(!set->pairings[i].texture); + set->pairings[i].texture = wlr_texture_from_buffer(set->pairings[i].renderer, buffer); + /* If we got a match, mark this renderer as the "native" one the buffer is local to */ + if (set->pairings[i].texture) { + /* Cache the width and height so other places don't have to search for it in pairings */ + set->width = set->pairings[i].texture->width; + set->height = set->pairings[i].texture->height; + set->native_pair = i; + ret = true; + goto buffer_unlock; + } + } + +buffer_unlock: + wlr_buffer_unlock(buffer); + return ret; +} + +struct wlr_texture_set *wlr_texture_set_from_buffer(struct wlr_renderer *renderer, + struct wlr_buffer *buffer) { + /* Get an empty texture set */ + struct wlr_texture_set *set = wlr_texture_set_create(renderer, NULL); + if (!set) { + return NULL; + } + + if (!wlr_texture_set_import_buffer(set, buffer)) { + goto fail; + } + + return set; + +fail: + /* If the buffer couldn't be imported into any renderer in the system, return NULL */ + wlr_texture_set_destroy(set); + return NULL; +} + +static struct wlr_buffer *texture_set_blit_gpu_buffer(struct wlr_texture_set *set, + struct wlr_renderer *renderer) { + struct wlr_renderer *native_renderer = set->pairings[set->native_pair].renderer; + struct wlr_allocator *native_allocator = set->pairings[set->native_pair].allocator; + struct wlr_texture *native_texture = set->pairings[set->native_pair].texture; + assert(native_texture); + + // If the user didn't give us an allocator for this renderer then this path can't be used. 
+ if (!native_allocator) { + return NULL; + } + + // Now intersect our DRM formats + const struct wlr_drm_format_set *src_formats = wlr_renderer_get_render_formats(native_renderer); + if (!src_formats) { + wlr_log(WLR_ERROR, "Failed to get primary renderer DRM formats"); + return NULL; + } + + const struct wlr_drm_format_set *dst_formats = wlr_renderer_get_dmabuf_texture_formats(renderer); + if (!dst_formats) { + wlr_log(WLR_ERROR, "Failed to get destination renderer DRM formats"); + return NULL; + } + + // Get the argb8 mods to use for our new buffer + struct wlr_drm_format argb_format = {0}; + if (!wlr_drm_format_intersect(&argb_format, + wlr_drm_format_set_get(dst_formats, DRM_FORMAT_ARGB8888), + wlr_drm_format_set_get(src_formats, DRM_FORMAT_ARGB8888)) + || argb_format.len == 0) { + wlr_log(WLR_ERROR, "Failed to intersect DRM formats"); + return NULL; + } + + // Allocate a new buffer on the source renderer, we will blit the original texture + // to this and then return it so the caller can import it. 
+ struct wlr_buffer *buffer = wlr_allocator_create_buffer( + native_allocator, set->width, set->height, &argb_format); + wlr_drm_format_finish(&argb_format); + if (!buffer) { + wlr_log(WLR_ERROR, "Failed to allocate buffer on source GPU"); + return NULL; + } + + struct wlr_render_pass *pass = wlr_renderer_begin_buffer_pass(native_renderer, buffer, NULL); + if (!pass) { + wlr_log(WLR_ERROR, "Failed to create a render pass"); + goto drop_buffer; + } + + wlr_render_pass_add_texture(pass, &(struct wlr_render_texture_options) { + .texture = native_texture, + }); + + if (!wlr_render_pass_submit(pass)) { + wlr_log(WLR_ERROR, "Failed to render to buffer"); + goto drop_buffer; + } + + return buffer; + +drop_buffer: + wlr_buffer_drop(buffer); + return NULL; +} + +void *wlr_texture_set_get_linear_data(struct wlr_texture_set *set) { + struct wlr_renderer *native_renderer = set->pairings[set->native_pair].renderer; + struct wlr_texture *native_texture = set->pairings[set->native_pair].texture; + assert(native_texture); + int stride = native_texture->width * 4; + + if (set->pixel_data) { + return set->pixel_data; + } + + set->pixel_data = malloc(native_texture->height * stride); + if (!set->pixel_data) { + return NULL; + } + + struct wlr_buffer *buffer = set->buffer; + if (!set->buffer) { + // If the buffer this set was created with has already been released, blit ourselves + // a new one. 
+ buffer = texture_set_blit_gpu_buffer(set, native_renderer); + if (!buffer) { + wlr_log(WLR_DEBUG, "Cannot get linear data, wlr_texture_set's buffer was released"); + return NULL; + } + } + wlr_buffer_lock(buffer); + + /* Make a buffer with a linear layout and the same format */ + set->format = wlr_texture_preferred_read_format(native_texture); + if (set->format == DRM_FORMAT_INVALID) { + wlr_buffer_unlock(buffer); + return NULL; + } + + bool result = wlr_texture_read_pixels(native_texture, &(struct wlr_texture_read_pixels_options) { + .format = DRM_FORMAT_ARGB8888, + .stride = stride, + .data = set->pixel_data, + }); + wlr_buffer_unlock(buffer); + if (!result) { + return NULL; + } + + wlr_log(WLR_DEBUG, "Copied GPU vidmem buffer to linear sysmem buffer"); + return set->pixel_data; +} + +struct wlr_texture *wlr_texture_set_get_tex_for_renderer(struct wlr_texture_set *set, + struct wlr_renderer *renderer) { + /* Find the entry for this renderer */ + struct wlr_texture_renderer_pair *pair = NULL; + for (int i = 0; i < set->pairing_count; i++) { + if (set->pairings[i].renderer == renderer) { + pair = &set->pairings[i]; + } + } + + /* + * If we have not seen this renderer then add an entry for it so + * we can cache the results of this copy. + */ + if (!pair) { + wlr_texture_set_add_pair(set, renderer, NULL); + pair = &set->pairings[set->pairing_count - 1]; + } + + /* If we already have a texture for this renderer, return it */ + if (pair->texture) { + return pair->texture; + } + + /* + * First try to directly import the texture. We must have a valid buffer + * to lock in order to do this. If the buffer has been released (as is the + * case with shm buffers) then we will have to perform a fallback copy. + */ + if (set->buffer) { + wlr_buffer_lock(set->buffer); + pair->texture = wlr_texture_from_buffer(renderer, set->buffer); + wlr_buffer_unlock(set->buffer); + if (pair->texture) { + return pair->texture; + } + } + + /* + * Directly importing didn't work. 
The next thing to try is blitting to a compatible + * GPU texture and then importing that. + */ + struct wlr_buffer *buffer = texture_set_blit_gpu_buffer(set, renderer); + if (buffer) { + pair->texture = wlr_texture_from_buffer(renderer, buffer); + wlr_buffer_drop(buffer); + if (pair->texture) { + return pair->texture; + } + } + + /* + * If the above didn't work then we can try a CPU fallback. This is much more expensive + * but should always work. The reason we need this is that sometimes we have to copy + * from GPU A to GPU B, but GPU A can't render to any modifiers that GPU B supports. This + * happens on NVIDIA (among others) where you cannot render to a linear texture, but need + * to convert to linear so that you can import it anywhere. + * + * Get our linear pixel data so we can import it into the target renderer. + * */ + void *pixel_data = wlr_texture_set_get_linear_data(set); + if (!pixel_data) { + return NULL; + } + + /* import the linear texture into our renderer */ + uint32_t stride = set->width * 4; + pair->texture = wlr_texture_from_pixels(renderer, DRM_FORMAT_ARGB8888, stride, set->width, + set->height, pixel_data); + + return pair->texture; +} + +struct wlr_texture *wlr_texture_set_get_native_texture(struct wlr_texture_set *set) { + return set->pairings[set->native_pair].texture; +} + +bool wlr_texture_set_update_from_buffer(struct wlr_texture_set *set, + struct wlr_buffer *next, const pixman_region32_t *damage) { + /* Call wlr_texture_write_pixels on each valid texture in the set */ + for (int i = 0; i < set->pairing_count; i++) { + if (set->pairings[i].texture) { + if (!wlr_texture_update_from_buffer(set->pairings[i].texture, + next, damage)) { + return false; + } + } + } + + return true; +} + +void wlr_texture_set_destroy(struct wlr_texture_set *set) { + if (set->buffer) { + wl_list_remove(&set->buffer_release.link); + } + free(set->pixel_data); + + for (int i = 0; i < set->pairing_count; i++) { + if (set->pairings[i].texture) { + 
wlr_texture_destroy(set->pairings[i].texture); + } + } + + if (set) { + free(set->pairings); + free(set); + } +} From 0925a529abfa333afab26279bde676dd67c85eb6 Mon Sep 17 00:00:00 2001 From: Austin Shafer Date: Wed, 27 Jul 2022 12:02:09 -0400 Subject: [PATCH 4/5] start using texture sets There are really two main uses of texture sets: checking if a dmabuf is importable, and actually importing it for the client buffer's texture --- include/wlr/types/wlr_buffer.h | 2 +- include/wlr/types/wlr_linux_dmabuf_v1.h | 3 ++ types/buffer/client.c | 14 ++++----- types/scene/wlr_scene.c | 3 +- types/wlr_compositor.c | 9 ++++-- types/wlr_linux_dmabuf_v1.c | 42 ++++++++++++++++++++----- 6 files changed, 54 insertions(+), 19 deletions(-) diff --git a/include/wlr/types/wlr_buffer.h b/include/wlr/types/wlr_buffer.h index de3aeec3d..95d5aa51e 100644 --- a/include/wlr/types/wlr_buffer.h +++ b/include/wlr/types/wlr_buffer.h @@ -142,7 +142,7 @@ struct wlr_client_buffer { * The buffer's texture, if any. A buffer will not have a texture if the * client destroys the buffer before it has been released. */ - struct wlr_texture *texture; + struct wlr_texture_set *texture_set; /** * The buffer this client buffer was created from. NULL if destroyed. */ diff --git a/include/wlr/types/wlr_linux_dmabuf_v1.h b/include/wlr/types/wlr_linux_dmabuf_v1.h index cf967f952..92106314f 100644 --- a/include/wlr/types/wlr_linux_dmabuf_v1.h +++ b/include/wlr/types/wlr_linux_dmabuf_v1.h @@ -63,6 +63,9 @@ struct wlr_linux_dmabuf_v1 { int main_device_fd; // to sanity check FDs sent by clients, -1 if unavailable + // This is only set when the compositor isn't providing a custom renderer. 
+ struct wlr_renderer *main_renderer; + struct wl_listener display_destroy; bool (*check_dmabuf_callback)(struct wlr_dmabuf_attributes *attribs, void *data); diff --git a/types/buffer/client.c b/types/buffer/client.c index 4cfa57a89..68a233d59 100644 --- a/types/buffer/client.c +++ b/types/buffer/client.c @@ -25,7 +25,7 @@ static struct wlr_client_buffer *client_buffer_from_buffer( static void client_buffer_destroy(struct wlr_buffer *buffer) { struct wlr_client_buffer *client_buffer = client_buffer_from_buffer(buffer); wl_list_remove(&client_buffer->source_destroy.link); - wlr_texture_destroy(client_buffer->texture); + wlr_texture_set_destroy(client_buffer->texture_set); free(client_buffer); } @@ -56,21 +56,21 @@ static void client_buffer_handle_source_destroy(struct wl_listener *listener, struct wlr_client_buffer *wlr_client_buffer_create(struct wlr_buffer *buffer, struct wlr_renderer *renderer) { - struct wlr_texture *texture = wlr_texture_from_buffer(renderer, buffer); - if (texture == NULL) { + struct wlr_texture_set *texture_set = wlr_texture_set_from_buffer(renderer, buffer); + if (texture_set == NULL) { wlr_log(WLR_ERROR, "Failed to create texture"); return NULL; } struct wlr_client_buffer *client_buffer = calloc(1, sizeof(*client_buffer)); if (client_buffer == NULL) { - wlr_texture_destroy(texture); + wlr_texture_set_destroy(texture_set); return NULL; } wlr_buffer_init(&client_buffer->base, &client_buffer_impl, - texture->width, texture->height); + buffer->width, buffer->height); client_buffer->source = buffer; - client_buffer->texture = texture; + client_buffer->texture_set = texture_set; wl_signal_add(&buffer->events.destroy, &client_buffer->source_destroy); client_buffer->source_destroy.notify = client_buffer_handle_source_destroy; @@ -89,5 +89,5 @@ bool wlr_client_buffer_apply_damage(struct wlr_client_buffer *client_buffer, return false; } - return wlr_texture_update_from_buffer(client_buffer->texture, next, damage); + return 
wlr_texture_set_update_from_buffer(client_buffer->texture_set, next, damage); } diff --git a/types/scene/wlr_scene.c b/types/scene/wlr_scene.c index 7e5b31363..f5b992689 100644 --- a/types/scene/wlr_scene.c +++ b/types/scene/wlr_scene.c @@ -878,7 +878,8 @@ static struct wlr_texture *scene_buffer_get_texture( struct wlr_client_buffer *client_buffer = wlr_client_buffer_get(scene_buffer->buffer); if (client_buffer != NULL) { - return client_buffer->texture; + return wlr_texture_set_get_tex_for_renderer(client_buffer->texture_set, + renderer); } scene_buffer->texture = diff --git a/types/wlr_compositor.c b/types/wlr_compositor.c index 791eb77cb..57ca4bfa4 100644 --- a/types/wlr_compositor.c +++ b/types/wlr_compositor.c @@ -445,7 +445,11 @@ static void surface_apply_damage(struct wlr_surface *surface) { } static void surface_update_opaque_region(struct wlr_surface *surface) { - if (!wlr_surface_has_buffer(surface)) { + /* + * The surface's client_buffer may not have a texture imported yet, + * but if it has a texture set it is tracking a valid buffer. 
+ */ + if (!wlr_surface_has_buffer(surface) || !surface->buffer->texture_set) { pixman_region32_clear(&surface->opaque_region); return; } @@ -817,7 +821,8 @@ struct wlr_texture *wlr_surface_get_texture(struct wlr_surface *surface) { if (surface->buffer == NULL) { return NULL; } - return surface->buffer->texture; + return wlr_texture_set_get_tex_for_renderer(surface->buffer->texture_set, + surface->renderer); } bool wlr_surface_has_buffer(struct wlr_surface *surface) { diff --git a/types/wlr_linux_dmabuf_v1.c b/types/wlr_linux_dmabuf_v1.c index 13e82760c..6c399ba2b 100644 --- a/types/wlr_linux_dmabuf_v1.c +++ b/types/wlr_linux_dmabuf_v1.c @@ -211,16 +211,39 @@ static bool check_import_dmabuf(struct wlr_dmabuf_attributes *attribs, void *dat return true; } - // TODO: check number of planes - for (int i = 0; i < attribs->n_planes; i++) { - uint32_t handle = 0; - if (drmPrimeFDToHandle(linux_dmabuf->main_device_fd, attribs->fd[i], &handle) != 0) { - wlr_log_errno(WLR_DEBUG, "Failed to import DMA-BUF FD"); + /* + * Some compositors will be using this linux dmabuf manager with custom renderers, + * while others will use a wlroots-managed wlr_renderer. When checking if a dmabuf + * is valid for import we should treat these differently. In the first case we just + * need to check if the dmabuf is importable into the DRM device, in the wlroots-managed + * renderer case we should check if this dmabuf can be imported into the renderer. + * + * In the case where we have a wlr_renderer we need to check if a texture set can + * be created in order to handle multi-gpu systems. The texture set will handle ensuring + * that the dmabuf is importable on one GPU in the system, instead of only checking + * the main device. 
+ */ + if (linux_dmabuf->main_renderer) { + struct wlr_texture_set *set = + wlr_texture_set_from_dmabuf(linux_dmabuf->main_renderer, attribs); + if (!set) { return false; } - if (drmCloseBufferHandle(linux_dmabuf->main_device_fd, handle) != 0) { - wlr_log_errno(WLR_ERROR, "Failed to close buffer handle"); - return false; + // We can import the image, good. No need to keep it since wlr_surface will + // import it again on commit. + wlr_texture_set_destroy(set); + } else { + // TODO: check number of planes + for (int i = 0; i < attribs->n_planes; i++) { + uint32_t handle = 0; + if (drmPrimeFDToHandle(linux_dmabuf->main_device_fd, attribs->fd[i], &handle) != 0) { + wlr_log_errno(WLR_DEBUG, "Failed to import DMA-BUF FD"); + return false; + } + if (drmCloseBufferHandle(linux_dmabuf->main_device_fd, handle) != 0) { + wlr_log_errno(WLR_ERROR, "Failed to close buffer handle"); + return false; + } } } return true; @@ -1001,6 +1024,9 @@ struct wlr_linux_dmabuf_v1 *wlr_linux_dmabuf_v1_create_with_renderer(struct wl_d struct wlr_linux_dmabuf_v1 *linux_dmabuf = wlr_linux_dmabuf_v1_create(display, version, &feedback); wlr_linux_dmabuf_feedback_v1_finish(&feedback); + + linux_dmabuf->main_renderer = renderer; + return linux_dmabuf; } From eb812f34f02ae63f8a5875609abeae8b2e876575 Mon Sep 17 00:00:00 2001 From: Austin Shafer Date: Tue, 2 Aug 2022 13:06:31 -0400 Subject: [PATCH 5/5] Perform direct scanout on secondary GPUs This change attempts to import the buffer directly into the target DRM context, and only performs drm_surface_blit if that doesn't work. Also advertise all scanout mods instead of only the ones compatible with the main dev, since we can now scan them out.
--- backend/drm/drm.c | 56 ++++++++++++++++++++++++++-------- backend/drm/renderer.c | 24 +++++++++++---- include/backend/drm/renderer.h | 2 +- types/wlr_linux_dmabuf_v1.c | 17 +++-------- 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/backend/drm/drm.c b/backend/drm/drm.c index 41a38f6dd..085a9fbb7 100644 --- a/backend/drm/drm.c +++ b/backend/drm/drm.c @@ -562,6 +562,7 @@ static void drm_connector_state_finish(struct wlr_drm_connector_state *state) { static bool drm_connector_state_update_primary_fb(struct wlr_drm_connector *conn, struct wlr_drm_connector_state *state) { + bool ok; struct wlr_drm_backend *drm = conn->backend; assert(state->base->committed & WLR_OUTPUT_STATE_BUFFER); @@ -571,34 +572,63 @@ static bool drm_connector_state_update_primary_fb(struct wlr_drm_connector *conn struct wlr_drm_plane *plane = crtc->primary; struct wlr_buffer *source_buf = state->base->buffer; + struct wlr_buffer *local_buf = wlr_buffer_lock(source_buf); - struct wlr_buffer *local_buf; - if (drm->parent) { + /* + * First try to import the buffer. We can have a decent degree of + * confidence this will work for a couple reasons: + * 1. Apps running on the dGPU in PRIME setups will be submitting + * buffers with linear modifiers, so that they can be imported + * on the primary GPU. This means they are directly importable + * here as well. This gives a nice FPS boost. + * 2. When the dGPU app supports reacting to dmabuf feedback it will + * be using dGPU modifiers, again meaning it can be imported into + * the dGPU directly for an additional nice perf boost. + * + * The fallback drm_surface_blit path will only be hit when the + * app is running fullscreen with dGPU (non-linear) modifiers and + * we start using rendered composition again. For a frame we will + * do the fallback before the app reallocs its buffers back to + * linear to be compatible with the primary GPU. 
+ */ + ok = drm_fb_import(&state->primary_fb, drm, local_buf, + &crtc->primary->formats); + + /* + * If trying to import this buffer directly didn't work then try + * to perform a blit to a mgpu drm surface and import that instead. + */ + if (!ok && drm->parent) { struct wlr_drm_format format = {0}; if (!drm_plane_pick_render_format(plane, &format, &drm->mgpu_renderer)) { wlr_log(WLR_ERROR, "Failed to pick primary plane format"); - return false; + ok = false; + goto release_buf; } // TODO: fallback to modifier-less buffer allocation - bool ok = init_drm_surface(&plane->mgpu_surf, &drm->mgpu_renderer, + ok = init_drm_surface(&plane->mgpu_surf, &drm->mgpu_renderer, source_buf->width, source_buf->height, &format); wlr_drm_format_finish(&format); if (!ok) { - return false; + ok = false; + goto release_buf; } - local_buf = drm_surface_blit(&plane->mgpu_surf, source_buf); - if (local_buf == NULL) { - return false; + struct wlr_buffer *drm_buf = drm_surface_blit(&plane->mgpu_surf, + &drm->parent->mgpu_renderer, source_buf); + if (drm_buf == NULL) { + ok = false; + goto release_buf; } - } else { - local_buf = wlr_buffer_lock(source_buf); + ok = drm_fb_import(&state->primary_fb, drm, drm_buf, + &plane->formats); + wlr_buffer_unlock(drm_buf); } - bool ok = drm_fb_import(&state->primary_fb, drm, local_buf, - &plane->formats); +release_buf: wlr_buffer_unlock(local_buf); + if (!ok) { wlr_drm_conn_log(conn, WLR_DEBUG, "Failed to import buffer for scan-out"); @@ -1012,7 +1042,7 @@ static bool drm_connector_set_cursor(struct wlr_output *output, return false; } - local_buf = drm_surface_blit(&plane->mgpu_surf, buffer); + local_buf = drm_surface_blit(&plane->mgpu_surf, &drm->parent->mgpu_renderer, buffer); if (local_buf == NULL) { return false; } diff --git a/backend/drm/renderer.c b/backend/drm/renderer.c index af62e8191..ace2d1559 100644 --- a/backend/drm/renderer.c +++ b/backend/drm/renderer.c @@ -75,7 +75,7 @@ bool init_drm_surface(struct wlr_drm_surface *surf, } struct 
wlr_buffer *drm_surface_blit(struct wlr_drm_surface *surf, - struct wlr_buffer *buffer) { + struct wlr_drm_renderer *parent_renderer, struct wlr_buffer *buffer) { struct wlr_renderer *renderer = surf->renderer->wlr_rend; if (surf->swapchain->width != buffer->width || @@ -84,11 +84,23 @@ struct wlr_buffer *drm_surface_blit(struct wlr_drm_surface *surf, return NULL; } - struct wlr_texture *tex = wlr_texture_from_buffer(renderer, buffer); - if (tex == NULL) { - wlr_log(WLR_ERROR, "Failed to import source buffer into multi-GPU renderer"); + struct wlr_texture_set *set = wlr_texture_set_create(renderer, NULL); + if (set == NULL) { + wlr_log(WLR_ERROR, "Failed to import source buffer multi-GPU texture set"); return NULL; } + /* Add the parent renderer so the texture set can use it for copies */ + wlr_texture_set_add_renderer(set, parent_renderer->wlr_rend, parent_renderer->allocator); + if (!wlr_texture_set_import_buffer(set, buffer)) { + wlr_log(WLR_ERROR, "Failed to import source buffer multi-GPU texture set"); + goto error_tex; + } + + struct wlr_texture *tex = wlr_texture_set_get_tex_for_renderer(set, renderer); + if (tex == NULL) { + wlr_log(WLR_ERROR, "Failed to export source buffer for multi-GPU renderer"); + goto error_tex; + } struct wlr_buffer *dst = wlr_swapchain_acquire(surf->swapchain, NULL); if (!dst) { @@ -111,14 +123,14 @@ struct wlr_buffer *drm_surface_blit(struct wlr_drm_surface *surf, goto error_dst; } - wlr_texture_destroy(tex); + wlr_texture_set_destroy(set); return dst; error_dst: wlr_buffer_unlock(dst); error_tex: - wlr_texture_destroy(tex); + wlr_texture_set_destroy(set); return NULL; } diff --git a/include/backend/drm/renderer.h b/include/backend/drm/renderer.h index f53f720bc..04710c61e 100644 --- a/include/backend/drm/renderer.h +++ b/include/backend/drm/renderer.h @@ -32,7 +32,7 @@ bool init_drm_surface(struct wlr_drm_surface *surf, void finish_drm_surface(struct wlr_drm_surface *surf); struct wlr_buffer *drm_surface_blit(struct 
wlr_drm_surface *surf, - struct wlr_buffer *buffer); + struct wlr_drm_renderer *parent_renderer, struct wlr_buffer *buffer); bool drm_plane_pick_render_format(struct wlr_drm_plane *plane, struct wlr_drm_format *fmt, struct wlr_drm_renderer *renderer); diff --git a/types/wlr_linux_dmabuf_v1.c b/types/wlr_linux_dmabuf_v1.c index 6c399ba2b..d695acd04 100644 --- a/types/wlr_linux_dmabuf_v1.c +++ b/types/wlr_linux_dmabuf_v1.c @@ -1096,15 +1096,6 @@ static bool devid_from_fd(int fd, dev_t *devid) { return true; } -static bool is_secondary_drm_backend(struct wlr_backend *backend) { -#if WLR_HAS_DRM_BACKEND - return wlr_backend_is_drm(backend) && - wlr_drm_backend_get_parent(backend) != NULL; -#else - return false; -#endif -} - bool wlr_linux_dmabuf_feedback_v1_init_with_options(struct wlr_linux_dmabuf_feedback_v1 *feedback, const struct wlr_linux_dmabuf_feedback_v1_init_options *options) { assert(options->main_renderer != NULL); @@ -1147,8 +1138,7 @@ bool wlr_linux_dmabuf_feedback_v1_init_with_options(struct wlr_linux_dmabuf_feed wlr_log(WLR_ERROR, "Failed to intersect renderer and scanout formats"); goto error; } - } else if (options->scanout_primary_output != NULL && - !is_secondary_drm_backend(options->scanout_primary_output->backend)) { + } else if (options->scanout_primary_output != NULL) { int backend_drm_fd = wlr_backend_get_drm_fd(options->scanout_primary_output->backend); if (backend_drm_fd < 0) { wlr_log(WLR_ERROR, "Failed to get backend DRM FD"); @@ -1174,8 +1164,9 @@ bool wlr_linux_dmabuf_feedback_v1_init_with_options(struct wlr_linux_dmabuf_feed tranche->target_device = backend_dev; tranche->flags = ZWP_LINUX_DMABUF_FEEDBACK_V1_TRANCHE_FLAGS_SCANOUT; - if (!wlr_drm_format_set_intersect(&tranche->formats, scanout_formats, renderer_formats)) { - wlr_log(WLR_ERROR, "Failed to intersect renderer and scanout formats"); + // Copy our scanout formats to the scanout tranche + if (!wlr_drm_format_set_copy(&tranche->formats, scanout_formats)) { + wlr_log(WLR_ERROR, 
"Failed to copy scanout formats"); goto error; } }