From 1ec740692f03bb1aa3cc52401c7274f45eb5a088 Mon Sep 17 00:00:00 2001
From: Kenny Levinsen <kl@kl.wtf>
Date: Sun, 12 Apr 2026 17:10:40 +0200
Subject: [PATCH 1/3] render/vulkan: New staging buffer implementation

Implement a ring-buffer that uses timeline points to track and release
allocated spans, replacing the buffer with a larger one if it fills and
shrinking it if it has been 4x too large for many collections.
---
 include/render/vulkan.h  |  65 +++++---
 render/vulkan/pass.c     |  15 +-
 render/vulkan/renderer.c | 336 +++++++++++++++++++++++++--------------
 render/vulkan/texture.c  |   6 +-
 4 files changed, 267 insertions(+), 155 deletions(-)

diff --git a/include/render/vulkan.h b/include/render/vulkan.h
index 021749c27..12a7ff173 100644
--- a/include/render/vulkan.h
+++ b/include/render/vulkan.h
@@ -284,8 +284,6 @@ struct wlr_vk_command_buffer {
 	uint64_t timeline_point;
 	// Textures to destroy after the command buffer completes
 	struct wl_list destroy_textures; // wlr_vk_texture.destroy_link
-	// Staging shared buffers to release after the command buffer completes
-	struct wl_list stage_buffers; // wlr_vk_shared_buffer.link
 	// Color transform to unref after the command buffer completes
 	struct wlr_color_transform *color_transform;
 
@@ -352,7 +350,7 @@ struct wlr_vk_renderer {
 	struct {
 		struct wlr_vk_command_buffer *cb;
 		uint64_t last_timeline_point;
-		struct wl_list buffers; // wlr_vk_shared_buffer.link
+		struct wl_list buffers; // wlr_vk_stage_buffer.link
 	} stage;
 
 	struct {
@@ -453,14 +451,27 @@ struct wlr_vk_render_pass {
 struct wlr_vk_render_pass *vulkan_begin_render_pass(struct wlr_vk_renderer *renderer,
 	struct wlr_vk_render_buffer *buffer, const struct wlr_buffer_pass_options *options);
 
-// Suballocates a buffer span with the given size that can be mapped
-// and used as staging buffer. The allocation is implicitly released when the
-// stage cb has finished execution. The start of the span will be a multiple
-// of the given alignment.
+// Suballocates a buffer span with the given size from the staging ring buffer
+// that is mapped for CPU access. vulkan_stage_mark_submit must be called after
+// allocations are made to mark the timeline point after which the allocations
+// will be released. The start of the span will be a multiple of alignment.
 struct wlr_vk_buffer_span vulkan_get_stage_span(
 	struct wlr_vk_renderer *renderer, VkDeviceSize size,
 	VkDeviceSize alignment);
 
+// Returns unused bytes at the end of a buffer span back to the ring buffer.
+// This allows the caller to allocate for worst-case consumption and return the
+// unused remainder. This must not be called after vulkan_stage_mark_submit,
+// and is a no-op unless span is the most recent allocation on its buffer.
+void vulkan_return_stage_span(struct wlr_vk_buffer_span *span,
+	VkDeviceSize return_size);
+
+// Records a watermark at the specified timeline point on every staging buffer
+// that has outstanding allocations. Once the timeline point has been reached,
+// the covered spans are released by vulkan_stage_buffer_reclaim.
+void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer,
+	uint64_t timeline_point);
+
 // Tries to allocate a texture descriptor set. Will additionally
 // return the pool it was allocated from when successful (for freeing it later).
 struct wlr_vk_descriptor_pool *vulkan_alloc_texture_ds(
@@ -544,29 +555,45 @@ struct wlr_vk_descriptor_pool {
 	struct wl_list link; // wlr_vk_renderer.descriptor_pools
 };
 
-struct wlr_vk_allocation {
-	VkDeviceSize start;
-	VkDeviceSize size;
+struct wlr_vk_stage_watermark {
+	VkDeviceSize head;
+	uint64_t timeline_point;
 };
 
-// List of suballocated staging buffers.
-// Used to upload to/read from device local images.
-struct wlr_vk_shared_buffer {
-	struct wl_list link; // wlr_vk_renderer.stage.buffers or wlr_vk_command_buffer.stage_buffers
+// Ring buffer for staging transfers
+struct wlr_vk_stage_buffer {
+	struct wl_list link; // wlr_vk_renderer.stage.buffers
+	bool active;
 	VkBuffer buffer;
 	VkDeviceMemory memory;
 	VkDeviceSize buf_size;
 	void *cpu_mapping;
-	struct wl_array allocs; // struct wlr_vk_allocation
-	int64_t last_used_ms;
+
+	VkDeviceSize head;
+	VkDeviceSize tail;
+
+	struct wl_array watermarks; // struct wlr_vk_stage_watermark
+	VkDeviceSize peak_utilization;
+	int underutil_count;
 };
 
-// Suballocated range on a buffer.
+// Suballocated range on a staging ring buffer.
 struct wlr_vk_buffer_span {
-	struct wlr_vk_shared_buffer *buffer;
-	struct wlr_vk_allocation alloc;
+	struct wlr_vk_stage_buffer *buffer;
+	VkDeviceSize offset;
+	VkDeviceSize size;
 };
 
+// Suballocate a span of size bytes from a staging ring buffer, with the
+// returned offset rounded up to the given alignment. Returns the byte offset
+// of the allocation, or (VkDeviceSize)-1 if the buffer is too full to fit it.
+VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf,
+	VkDeviceSize size, VkDeviceSize alignment);
+
+// Free all allocations covered by watermarks whose timeline point has been
+// reached. Returns true if the buffer is now fully drained.
+bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf,
+	uint64_t current_point);
 
 // Prepared form for a color transform
 struct wlr_vk_color_transform {
diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c
index 01e8fbd7a..04cb16cf9 100644
--- a/render/vulkan/pass.c
+++ b/render/vulkan/pass.c
@@ -595,14 +595,7 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) {
 
 	free(render_wait);
 
-	struct wlr_vk_shared_buffer *stage_buf, *stage_buf_tmp;
-	wl_list_for_each_safe(stage_buf, stage_buf_tmp, &renderer->stage.buffers, link) {
-		if (stage_buf->allocs.size == 0) {
-			continue;
-		}
-		wl_list_remove(&stage_buf->link);
-		wl_list_insert(&stage_cb->stage_buffers, &stage_buf->link);
-	}
+	vulkan_stage_mark_submit(renderer, render_timeline_point);
 
 	if (!vulkan_sync_render_pass_release(renderer, pass)) {
 		wlr_log(WLR_ERROR, "Failed to sync render buffer");
@@ -1056,13 +1049,13 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer,
 	size_t size = dim_len * dim_len * dim_len * bytes_per_block;
 	struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer,
 		size, bytes_per_block);
-	if (!span.buffer || span.alloc.size != size) {
+	if (!span.buffer || span.size != size) {
 		wlr_log(WLR_ERROR, "Failed to retrieve staging buffer");
 		goto fail_imageview;
 	}
 
 	float sample_range = 1.0f / (dim_len - 1);
-	char *map = (char *)span.buffer->cpu_mapping + span.alloc.start;
+	char *map = (char *)span.buffer->cpu_mapping + span.offset;
 	float *dst = (float *)map;
 	for (size_t b_index = 0; b_index < dim_len; b_index++) {
 		for (size_t g_index = 0; g_index < dim_len; g_index++) {
@@ -1092,7 +1085,7 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer,
 		VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT,
 		VK_ACCESS_TRANSFER_WRITE_BIT);
 	VkBufferImageCopy copy = {
-		.bufferOffset = span.alloc.start,
+		.bufferOffset = span.offset,
 		.imageExtent.width = dim_len,
 		.imageExtent.height = dim_len,
 		.imageExtent.depth = dim_len,
diff --git a/render/vulkan/renderer.c b/render/vulkan/renderer.c
index 434ab4769..e8e44b3f4 100644
--- a/render/vulkan/renderer.c
+++ b/render/vulkan/renderer.c
@@ -1,6 +1,5 @@
 #include <assert.h>
 #include <fcntl.h>
-#include <math.h>
 #include <poll.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -8,6 +7,7 @@
 #include <unistd.h>
 #include <drm_fourcc.h>
 #include <vulkan/vulkan.h>
+#include <wayland-util.h>
 #include <wlr/render/color.h>
 #include <wlr/render/interface.h>
 #include <wlr/types/wlr_drm.h>
@@ -26,11 +26,9 @@
 #include "render/vulkan/shaders/texture.frag.h"
 #include "render/vulkan/shaders/quad.frag.h"
 #include "render/vulkan/shaders/output.frag.h"
-#include "types/wlr_buffer.h"
-#include "util/time.h"
+#include "util/array.h"
 
 // TODO:
-// - simplify stage allocation, don't track allocations but use ringbuffer-like
 // - use a pipeline cache (not sure when to save though, after every pipeline
 //   creation?)
 // - create pipelines as derivatives of each other
@@ -187,18 +185,13 @@ static void destroy_render_format_setup(struct wlr_vk_renderer *renderer,
 	free(setup);
 }
 
-static void shared_buffer_destroy(struct wlr_vk_renderer *r,
-		struct wlr_vk_shared_buffer *buffer) {
+static void stage_buffer_destroy(struct wlr_vk_renderer *r,
+		struct wlr_vk_stage_buffer *buffer) {
 	if (!buffer) {
 		return;
 	}
 
-	if (buffer->allocs.size > 0) {
-		wlr_log(WLR_ERROR, "shared_buffer_finish: %zu allocations left",
-			buffer->allocs.size / sizeof(struct wlr_vk_allocation));
-	}
-
-	wl_array_release(&buffer->allocs);
+	wl_array_release(&buffer->watermarks);
 	if (buffer->cpu_mapping) {
 		vkUnmapMemory(r->dev->dev, buffer->memory);
 		buffer->cpu_mapping = NULL;
@@ -214,75 +207,12 @@ static void shared_buffer_destroy(struct wlr_vk_renderer *r,
 	free(buffer);
 }
 
-struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r,
-		VkDeviceSize size, VkDeviceSize alignment) {
-	// try to find free span
-	// simple greedy allocation algorithm - should be enough for this usecase
-	// since all allocations are freed together after the frame
-	struct wlr_vk_shared_buffer *buf;
-	wl_list_for_each_reverse(buf, &r->stage.buffers, link) {
-		VkDeviceSize start = 0u;
-		if (buf->allocs.size > 0) {
-			const struct wlr_vk_allocation *allocs = buf->allocs.data;
-			size_t allocs_len = buf->allocs.size / sizeof(struct wlr_vk_allocation);
-			const struct wlr_vk_allocation *last = &allocs[allocs_len - 1];
-			start = last->start + last->size;
-		}
-
-		assert(start <= buf->buf_size);
-
-		// ensure the proposed start is a multiple of alignment
-		start += alignment - 1 - ((start + alignment - 1) % alignment);
-
-		if (buf->buf_size - start < size) {
-			continue;
-		}
-
-		struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a));
-		if (a == NULL) {
-			wlr_log_errno(WLR_ERROR, "Allocation failed");
-			goto error_alloc;
-		}
-
-		*a = (struct wlr_vk_allocation){
-			.start = start,
-			.size = size,
-		};
-		return (struct wlr_vk_buffer_span) {
-			.buffer = buf,
-			.alloc = *a,
-		};
-	}
-
-	if (size > max_stage_size) {
-		wlr_log(WLR_ERROR, "cannot vulkan stage buffer: "
-			"requested size (%zu bytes) exceeds maximum (%zu bytes)",
-			(size_t)size, (size_t)max_stage_size);
-		goto error_alloc;
-	}
-
-	// we didn't find a free buffer - create one
-	// size = clamp(max(size * 2, prev_size * 2), min_size, max_size)
-	VkDeviceSize bsize = size * 2;
-	bsize = bsize < min_stage_size ? min_stage_size : bsize;
-	if (!wl_list_empty(&r->stage.buffers)) {
-		struct wl_list *last_link = r->stage.buffers.prev;
-		struct wlr_vk_shared_buffer *prev = wl_container_of(
-			last_link, prev, link);
-		VkDeviceSize last_size = 2 * prev->buf_size;
-		bsize = bsize < last_size ? last_size : bsize;
-	}
-
-	if (bsize > max_stage_size) {
-		wlr_log(WLR_INFO, "vulkan stage buffers have reached max size");
-		bsize = max_stage_size;
-	}
-
-	// create buffer
-	buf = calloc(1, sizeof(*buf));
+static struct wlr_vk_stage_buffer *stage_buffer_create(
+		struct wlr_vk_renderer *r, VkDeviceSize bsize) {
+	struct wlr_vk_stage_buffer *buf = calloc(1, sizeof(*buf));
 	if (!buf) {
 		wlr_log_errno(WLR_ERROR, "Allocation failed");
-		goto error_alloc;
+		return NULL;
 	}
 
 	wl_list_init(&buf->link);
@@ -319,7 +249,7 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r,
 	};
 	res = vkAllocateMemory(r->dev->dev, &mem_info, NULL, &buf->memory);
 	if (res != VK_SUCCESS) {
-		wlr_vk_error("vkAllocatorMemory", res);
+		wlr_vk_error("vkAllocateMemory", res);
 		goto error;
 	}
 
@@ -335,34 +265,209 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r,
 		goto error;
 	}
 
-	struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a));
-	if (a == NULL) {
-		wlr_log_errno(WLR_ERROR, "Allocation failed");
+	buf->active = true;
+	buf->buf_size = bsize;
+	return buf;
+
+error:
+	stage_buffer_destroy(r, buf);
+	return NULL;
+}
+
+// Frees spans whose watermark has been reached; true if the buffer is drained.
+bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf,
+		uint64_t current_point) {
+	// Update utilization metrics before cleaning, so that peak_utilization
+	// records the largest occupancy seen, including any wrapped-around part
+	VkDeviceSize occupied = buf->head >= buf->tail
+		? buf->head - buf->tail
+		: buf->buf_size - buf->tail + buf->head;
+	if (occupied > buf->peak_utilization) {
+		buf->peak_utilization = occupied;
+	}
+
+	size_t completed = 0;
+	struct wlr_vk_stage_watermark *mark;
+	wl_array_for_each(mark, &buf->watermarks) {
+		if (mark->timeline_point > current_point) {
+			break; // stop at the first watermark that is still pending
+		}
+		buf->tail = mark->head;
+		completed++;
+	}
+
+	if (completed > 0) {
+		completed *= sizeof(struct wlr_vk_stage_watermark); // count to bytes
+		if (completed == buf->watermarks.size) {
+			buf->watermarks.size = 0;
+		} else {
+			array_remove_at(&buf->watermarks, 0, completed);
+		}
+	}
+
+	return buf->head == buf->tail;
+}
+
+VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf,
+		VkDeviceSize size, VkDeviceSize alignment) {
+	// Decide whether free space wraps from the real head: alignment padding can
+	// push the rounded-up head past tail, which means "no room", not "unwrapped"
+	bool unwrapped = buf->head >= buf->tail;
+	VkDeviceSize rem = buf->head % alignment;
+	VkDeviceSize head = rem != 0 ? buf->head + (alignment - rem) : buf->head;
+
+	// Free space ends at buf_size (unwrapped) or at tail (wrapped). Comparisons
+	// are strict because head == tail is reserved for the empty buffer
+	VkDeviceSize end = unwrapped ? buf->buf_size : buf->tail;
+	if (head < end && size < end - head) {
+		// Regular allocation from the aligned head
+		buf->head = head + size;
+		return head;
+	} else if (unwrapped && size < buf->tail) {
+		// Not enough room at the end: wrap around and allocate at offset 0
+		buf->head = size;
+		return 0;
+	}
+
+	return (VkDeviceSize)-1;
+}
+
+struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r,
+		VkDeviceSize size, VkDeviceSize alignment) {
+	// A ring can never be filled completely, so size must be < max_stage_size
+	if (size >= max_stage_size) {
+		wlr_log(WLR_ERROR, "cannot allocate stage buffer: "
+			"requested size (%zu bytes) exceeds maximum (%zu bytes)",
+			(size_t)size, (size_t)max_stage_size);
 		goto error;
 	}
 
-	buf->buf_size = bsize;
-	wl_list_insert(&r->stage.buffers, &buf->link);
+	// Try the active buffers first, retiring every one that cannot fit the span
+	struct wlr_vk_stage_buffer *buf;
+	VkDeviceSize max_buf_size = min_stage_size;
+	wl_list_for_each(buf, &r->stage.buffers, link) {
+		if (!buf->active) {
+			continue;
+		}
+		VkDeviceSize offset = vulkan_stage_buffer_alloc(buf, size, alignment);
+		if (offset != (VkDeviceSize)-1) {
+			return (struct wlr_vk_buffer_span) {
+				.buffer = buf,
+				.offset = offset,
+				.size = size,
+			};
+		}
+		if (buf->buf_size > max_buf_size) {
+			max_buf_size = buf->buf_size;
+		}
+
+		// Buffer is full, retire it; the GC destroys it once it has drained
+		buf->active = false;
+	}
+	// Grow: twice the largest retired buffer and at least twice the request
+	VkDeviceSize bsize = max_buf_size * 2;
+	while (size * 2 > bsize) {
+		bsize *= 2;
+	}
+	if (bsize > max_stage_size) {
+		wlr_log(WLR_INFO, "vulkan stage buffer has reached max size");
+		bsize = max_stage_size;
+	}
+
+	struct wlr_vk_stage_buffer *new_buf = stage_buffer_create(r, bsize);
+	if (new_buf == NULL) {
+		goto error;
+	}
+	wl_list_insert(&r->stage.buffers, &new_buf->link);
+
+	VkDeviceSize offset = vulkan_stage_buffer_alloc(new_buf, size, alignment);
+	assert(offset != (VkDeviceSize)-1);
 
-	*a = (struct wlr_vk_allocation){
-		.start = 0,
-		.size = size,
-	};
 	return (struct wlr_vk_buffer_span) {
-		.buffer = buf,
-		.alloc = *a,
+		.buffer = new_buf,
+		.offset = offset,
+		.size = size,
 	};
 
 error:
-	shared_buffer_destroy(r, buf);
-
-error_alloc:
 	return (struct wlr_vk_buffer_span) {
 		.buffer = NULL,
-		.alloc = (struct wlr_vk_allocation) {0, 0},
+		.offset = 0,
+		.size = 0,
 	};
 }
 
+void vulkan_return_stage_span(struct wlr_vk_buffer_span *span, VkDeviceSize return_size) {
+	assert(return_size <= span->size);
+	if (span->buffer->head == span->offset + span->size) {
+		// The span is still the most recent allocation: shrink it and rewind head
+		span->size -= return_size;
+		span->buffer->head = span->offset + span->size;
+	}
+}
+
+void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer,
+		uint64_t timeline_point) {
+	struct wlr_vk_stage_buffer *buf;
+	wl_list_for_each(buf, &renderer->stage.buffers, link) {
+		if (buf->head == buf->tail) {
+			continue; // empty buffer, nothing to release later
+		}
+		// Everything before the current head is tied to timeline_point
+		struct wlr_vk_stage_watermark *mark = wl_array_add(
+			&buf->watermarks, sizeof(*mark));
+		if (mark == NULL) {
+			wlr_log_errno(WLR_ERROR, "Allocation failed");
+			continue;
+		}
+
+		*mark = (struct wlr_vk_stage_watermark){
+			.head = buf->head,
+			.timeline_point = timeline_point,
+		};
+	}
+}
+
+static void vulkan_stage_buffer_gc(struct wlr_vk_renderer *renderer, uint64_t current_point) {
+	struct wlr_vk_stage_buffer *buf, *buf_tmp;
+	wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) {
+		if (!vulkan_stage_buffer_reclaim(buf, current_point)) {
+			// There are still unreleased allocations on this buffer
+			continue;
+		}
+		if (!buf->active) {
+			stage_buffer_destroy(renderer, buf); // retired and now drained
+			continue;
+		}
+		if (buf->buf_size < min_stage_size * 2) {
+			// Buffers smaller than twice the minimum size are never shrunk
+			continue;
+		}
+
+		// Note: We use 1/4th as the underutilization threshold, and when
+		// underutilized for 100 consecutive drained GC runs we halve the buffer
+		if (buf->peak_utilization > buf->buf_size / 4) {
+			buf->underutil_count = 0;
+		} else {
+			buf->underutil_count++;
+		}
+		buf->peak_utilization = 0;
+
+		if (buf->underutil_count < 100) {
+			continue;
+		}
+		// Swap in a buffer of half the size; the drained one can go right away
+		struct wlr_vk_stage_buffer *shrunk = stage_buffer_create(renderer, buf->buf_size / 2);
+		if (shrunk == NULL) {
+			// We'll just keep using the old buffer for now
+			continue;
+		}
+
+		wl_list_insert(&renderer->stage.buffers, &shrunk->link);
+		stage_buffer_destroy(renderer, buf);
+	}
+}
+
 VkCommandBuffer vulkan_record_stage_cb(struct wlr_vk_renderer *renderer) {
 	if (renderer->stage.cb == NULL) {
 		renderer->stage.cb = vulkan_acquire_command_buffer(renderer);
@@ -465,16 +570,21 @@ bool vulkan_submit_stage_wait(struct wlr_vk_renderer *renderer, int wait_sync_fi
 		submit_info.pWaitDstStageMask = &wait_stage;
 	}
 
+	vulkan_stage_mark_submit(renderer, timeline_point);
+
 	VkResult res = vkQueueSubmit(renderer->dev->queue, 1, &submit_info, VK_NULL_HANDLE);
 	if (res != VK_SUCCESS) {
 		wlr_vk_error("vkQueueSubmit", res);
 		return false;
 	}
 
-	// NOTE: don't release stage allocations here since they may still be
-	// used for reading. Will be done next frame.
+	if (!vulkan_wait_command_buffer(cb, renderer)) {
+		return false;
+	}
 
-	return vulkan_wait_command_buffer(cb, renderer);
+	// We did a blocking wait so this is now the current point
+	vulkan_stage_buffer_gc(renderer, timeline_point);
+	return true;
 }
 
 struct wlr_vk_format_props *vulkan_format_props_from_drm(
@@ -508,7 +618,6 @@ static bool init_command_buffer(struct wlr_vk_command_buffer *cb,
 		.vk = vk_cb,
 	};
 	wl_list_init(&cb->destroy_textures);
-	wl_list_init(&cb->stage_buffers);
 	return true;
 }
 
@@ -534,7 +643,7 @@ bool vulkan_wait_command_buffer(struct wlr_vk_command_buffer *cb,
 }
 
 static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb,
-		struct wlr_vk_renderer *renderer, int64_t now) {
+		struct wlr_vk_renderer *renderer) {
 	struct wlr_vk_texture *texture, *texture_tmp;
 	wl_list_for_each_safe(texture, texture_tmp, &cb->destroy_textures, destroy_link) {
 		wl_list_remove(&texture->destroy_link);
@@ -542,15 +651,6 @@ static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb,
 		wlr_texture_destroy(&texture->wlr_texture);
 	}
 
-	struct wlr_vk_shared_buffer *buf, *buf_tmp;
-	wl_list_for_each_safe(buf, buf_tmp, &cb->stage_buffers, link) {
-		buf->allocs.size = 0;
-		buf->last_used_ms = now;
-
-		wl_list_remove(&buf->link);
-		wl_list_insert(&renderer->stage.buffers, &buf->link);
-	}
-
 	if (cb->color_transform) {
 		wlr_color_transform_unref(cb->color_transform);
 		cb->color_transform = NULL;
@@ -569,22 +669,14 @@ static struct wlr_vk_command_buffer *get_command_buffer(
 		return NULL;
 	}
 
-
-	// Garbage collect any buffers that have remained unused for too long
-	int64_t now = get_current_time_msec();
-	struct wlr_vk_shared_buffer *buf, *buf_tmp;
-	wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) {
-		if (buf->allocs.size == 0 && buf->last_used_ms + 10000 < now) {
-			shared_buffer_destroy(renderer, buf);
-		}
-	}
+	vulkan_stage_buffer_gc(renderer, current_point);
 
 	// Destroy textures for completed command buffers
 	for (size_t i = 0; i < VULKAN_COMMAND_BUFFERS_CAP; i++) {
 		struct wlr_vk_command_buffer *cb = &renderer->command_buffers[i];
 		if (cb->vk != VK_NULL_HANDLE && !cb->recording &&
 				cb->timeline_point <= current_point) {
-			release_command_buffer_resources(cb, renderer, now);
+			release_command_buffer_resources(cb, renderer);
 		}
 	}
 
@@ -1187,7 +1279,7 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) {
 		if (cb->vk == VK_NULL_HANDLE) {
 			continue;
 		}
-		release_command_buffer_resources(cb, renderer, 0);
+		release_command_buffer_resources(cb, renderer);
 		if (cb->binary_semaphore != VK_NULL_HANDLE) {
 			vkDestroySemaphore(renderer->dev->dev, cb->binary_semaphore, NULL);
 		}
@@ -1199,9 +1291,9 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) {
 	}
 
 	// stage.cb automatically freed with command pool
-	struct wlr_vk_shared_buffer *buf, *tmp_buf;
+	struct wlr_vk_stage_buffer *buf, *tmp_buf;
 	wl_list_for_each_safe(buf, tmp_buf, &renderer->stage.buffers, link) {
-		shared_buffer_destroy(renderer, buf);
+		stage_buffer_destroy(renderer, buf);
 	}
 
 	struct wlr_vk_texture *tex, *tex_tmp;
diff --git a/render/vulkan/texture.c b/render/vulkan/texture.c
index c6365c90b..9298de804 100644
--- a/render/vulkan/texture.c
+++ b/render/vulkan/texture.c
@@ -72,16 +72,16 @@ static bool write_pixels(struct wlr_vk_texture *texture,
 
 	// get staging buffer
 	struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, bsize, format_info->bytes_per_block);
-	if (!span.buffer || span.alloc.size != bsize) {
+	if (!span.buffer || span.size != bsize) {
 		wlr_log(WLR_ERROR, "Failed to retrieve staging buffer");
 		free(copies);
 		return false;
 	}
-	char *map = (char*)span.buffer->cpu_mapping + span.alloc.start;
+	char *map = (char*)span.buffer->cpu_mapping + span.offset;
 
 	// upload data
 
-	uint32_t buf_off = span.alloc.start;
+	uint32_t buf_off = span.offset;
 	for (int i = 0; i < rects_len; i++) {
 		pixman_box32_t rect = rects[i];
 		uint32_t width = rect.x2 - rect.x1;

From 3c9f1e35b145e795068ba37cef51e92c668359ef Mon Sep 17 00:00:00 2001
From: Kenny Levinsen <kl@kl.wtf>
Date: Sun, 12 Apr 2026 17:47:32 +0200
Subject: [PATCH 2/3] render/vulkan: Add unit-test for staging buffer

---
 test/meson.build                |  22 +++
 test/test_vulkan_stage_buffer.c | 234 ++++++++++++++++++++++++++++++++
 2 files changed, 256 insertions(+)
 create mode 100644 test/test_vulkan_stage_buffer.c

diff --git a/test/meson.build b/test/meson.build
index f51b2c02c..9c622e3ef 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -1,8 +1,30 @@
+# Used to test internal symbols
+lib_wlr_internal = static_library(
+	versioned_name + '-internal',
+	objects: lib_wlr.extract_all_objects(recursive: false),
+	dependencies: wlr_deps,
+	include_directories: [wlr_inc],
+	install: false,
+)
+
 test(
 	'box',
 	executable('test-box', 'test_box.c', dependencies: wlroots),
 )
 
+if features.get('vulkan-renderer')
+	test(
+		'vulkan_stage_buffer',
+		executable(
+			'test-vulkan-stage-buffer',
+			'test_vulkan_stage_buffer.c',
+			link_with: lib_wlr_internal,
+			dependencies: wlr_deps,
+			include_directories: wlr_inc,
+		),
+	)
+endif
+
 benchmark(
 	'scene',
 	executable('bench-scene', 'bench_scene.c', dependencies: wlroots),
diff --git a/test/test_vulkan_stage_buffer.c b/test/test_vulkan_stage_buffer.c
new file mode 100644
index 000000000..cceefa8de
--- /dev/null
+++ b/test/test_vulkan_stage_buffer.c
@@ -0,0 +1,234 @@
+#include <assert.h>
+#include <stdio.h>
+#include <wayland-util.h>
+
+#include "render/vulkan.h"
+
+#define BUF_SIZE 1024
+#define ALLOC_FAIL ((VkDeviceSize)-1)
+
+static void stage_buffer_init(struct wlr_vk_stage_buffer *buf) {
+	*buf = (struct wlr_vk_stage_buffer){
+		.buf_size = BUF_SIZE,
+	};
+	wl_array_init(&buf->watermarks);
+}
+
+static void stage_buffer_finish(struct wlr_vk_stage_buffer *buf) {
+	wl_array_release(&buf->watermarks);
+}
+
+static void push_watermark(struct wlr_vk_stage_buffer *buf,
+		uint64_t timeline_point) {
+	struct wlr_vk_stage_watermark *mark = wl_array_add(
+		&buf->watermarks, sizeof(*mark));
+	assert(mark != NULL);
+	*mark = (struct wlr_vk_stage_watermark){
+		.head = buf->head,
+		.timeline_point = timeline_point,
+	};
+}
+
+static size_t watermark_count(const struct wlr_vk_stage_buffer *buf) {
+	return buf->watermarks.size / sizeof(struct wlr_vk_stage_watermark);
+}
+
+static void test_alloc_simple(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	assert(buf.head == 100);
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 100);
+	assert(buf.head == 300);
+	assert(buf.tail == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_alignment(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 7, 1) == 0);
+	assert(buf.head == 7);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 4, 16) == 16);
+	assert(buf.head == 20);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 8, 8) == 24);
+	assert(buf.head == 32);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_limit(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// We do not allow allocations that would cause head to equal tail
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE, 1) == ALLOC_FAIL);
+	assert(buf.head == 0);
+
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE-1, 1) == 0);
+	assert(buf.head == BUF_SIZE-1);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_wrap(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// Fill the first 924 bytes
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE - 100, 1) == 0);
+	push_watermark(&buf, 1);
+
+	// Allocate 50 of the 100 bytes that remain at the end of the buffer
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 924);
+	push_watermark(&buf, 2);
+
+	// First, check that we don't wrap while the start is still in use
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == ALLOC_FAIL);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == ALLOC_FAIL);
+
+	// Free the beginning of the buffer and try to wrap again
+	vulkan_stage_buffer_reclaim(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 0);
+	assert(buf.tail == 924);
+	assert(buf.head == 50);
+
+	// Check that freeing from the end of the buffer still works
+	vulkan_stage_buffer_reclaim(&buf, 2);
+	assert(buf.tail == 974);
+	assert(buf.head == 50);
+
+	// Check that allocations still work
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 50);
+	assert(buf.tail == 974);
+	assert(buf.head == 150);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_empty(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// Fresh buffer with no watermarks and head == tail == 0 is drained.
+	assert(vulkan_stage_buffer_reclaim(&buf, 0));
+	assert(buf.tail == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_pending_not_completed(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+
+	// current point hasn't reached the watermark yet.
+	assert(!vulkan_stage_buffer_reclaim(&buf, 0));
+	assert(buf.tail == 0);
+	assert(watermark_count(&buf) == 1);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_partial(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100);
+	push_watermark(&buf, 2);
+
+	// Only the first watermark is reached.
+	assert(!vulkan_stage_buffer_reclaim(&buf, 1));
+	assert(buf.tail == 100);
+	assert(watermark_count(&buf) == 1);
+
+	const struct wlr_vk_stage_watermark *remaining = buf.watermarks.data;
+	assert(remaining[0].head == 200);
+	assert(remaining[0].timeline_point == 2);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_all(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100);
+	push_watermark(&buf, 2);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 200);
+	push_watermark(&buf, 3);
+
+	assert(vulkan_stage_buffer_reclaim(&buf, 100));
+	assert(buf.tail == 300);
+	assert(watermark_count(&buf) == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+
+static void test_peak_utilization(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(buf.peak_utilization == 0);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 100);
+	// Utilization is sampled by reclaim, even when no watermark is released
+	vulkan_stage_buffer_reclaim(&buf, 0);
+	assert(buf.peak_utilization == 300);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_peak_utilization_wrap(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// 200 bytes used, 100 bytes from wrap
+	buf.head = BUF_SIZE - 100;
+	buf.tail = buf.head - 200;
+
+	// With 100 bytes left, we wrap to front and waste 100 bytes
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 0);
+	vulkan_stage_buffer_reclaim(&buf, 0);
+	assert(buf.head == 200);
+	assert(buf.tail == BUF_SIZE - 300);
+
+	// 200 bytes initial + 100 bytes wasted + 200 bytes allocated = 500
+	assert(buf.peak_utilization == 500);
+
+	stage_buffer_finish(&buf);
+}
+
+int main(void) {
+#ifdef NDEBUG
+	fprintf(stderr, "NDEBUG must be disabled for tests\n");
+	return 1;
+#endif
+
+	test_alloc_simple();
+	test_alloc_alignment();
+	test_alloc_limit();
+	test_alloc_wrap();
+
+	test_reclaim_empty();
+	test_reclaim_pending_not_completed();
+	test_reclaim_partial();
+	test_reclaim_all();
+
+	test_peak_utilization();
+	test_peak_utilization_wrap();
+
+	return 0;
+}

From b16dd0178bff1125650368512ef6fc495da841f4 Mon Sep 17 00:00:00 2001
From: Kenny Levinsen <kl@kl.wtf>
Date: Sun, 12 Apr 2026 17:47:40 +0200
Subject: [PATCH 3/3] render/vulkan: Use instanced draws instead of scissors

Similar to what we have already done for gles2. To simplify things we
use the staging ring buffer for the vertex buffers by extending the
usage bits, rather than introducing a separate pool.
---
 render/vulkan/pass.c              | 155 +++++++++++++++++++++++++-----
 render/vulkan/renderer.c          |  34 ++++---
 render/vulkan/shaders/common.vert |   3 +
 3 files changed, 156 insertions(+), 36 deletions(-)

diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c
index 04cb16cf9..2dca9d0f3 100644
--- a/render/vulkan/pass.c
+++ b/render/vulkan/pass.c
@@ -2,7 +2,9 @@
 #include <drm_fourcc.h>
 #include <stdlib.h>
 #include <unistd.h>
+#include <wlr/util/box.h>
 #include <wlr/util/log.h>
+#include <wlr/util/transform.h>
 #include <wlr/render/color.h>
 #include <wlr/render/drm_syncobj.h>
 
@@ -285,6 +287,20 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) {
 		int clip_rects_len;
 		const pixman_box32_t *clip_rects = pixman_region32_rectangles(
 			clip, &clip_rects_len);
+
+		float identity[4] = { 0.0f, 0.0f, 1.0f, 1.0f };
+		struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer,
+			sizeof(identity), sizeof(identity));
+		if (!span.buffer) {
+			pass->failed = true;
+			goto error;
+		}
+
+		memcpy((char *)span.buffer->cpu_mapping + span.offset, identity, sizeof(identity));
+
+		VkDeviceSize vb_offset = span.offset;
+		vkCmdBindVertexBuffers(render_cb->vk, 0, 1, &span.buffer->buffer, &vb_offset);
+
 		for (int i = 0; i < clip_rects_len; i++) {
 			VkRect2D rect;
 			convert_pixman_box_to_vk_rect(&clip_rects[i], &rect);
@@ -656,20 +672,6 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass,
 
 	int clip_rects_len;
 	const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len);
-	// Record regions possibly updated for use in second subpass
-	for (int i = 0; i < clip_rects_len; i++) {
-		struct wlr_box clip_box = {
-			.x = clip_rects[i].x1,
-			.y = clip_rects[i].y1,
-			.width = clip_rects[i].x2 - clip_rects[i].x1,
-			.height = clip_rects[i].y2 - clip_rects[i].y1,
-		};
-		struct wlr_box intersection;
-		if (!wlr_box_intersection(&intersection, &options->box, &clip_box)) {
-			continue;
-		}
-		render_pass_mark_box_updated(pass, &intersection);
-	}
 
 	struct wlr_box box;
 	wlr_render_rect_options_get_box(options, pass->render_buffer->wlr_buffer, &box);
@@ -692,6 +694,45 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass,
 			break;
 		}
 
+		if (clip_rects_len == 0) {
+			break;
+		}
+
+		const VkDeviceSize instance_size = 4 * sizeof(float);
+		struct wlr_vk_buffer_span span = vulkan_get_stage_span(pass->renderer,
+			clip_rects_len * instance_size, 16);
+		if (!span.buffer) {
+			pass->failed = true;
+			break;
+		}
+		float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset);
+		int instance_count = 0;
+		for (int i = 0; i < clip_rects_len; i++) {
+			struct wlr_box clip_box = {
+				.x = clip_rects[i].x1,
+				.y = clip_rects[i].y1,
+				.width = clip_rects[i].x2 - clip_rects[i].x1,
+				.height = clip_rects[i].y2 - clip_rects[i].y1,
+			};
+			struct wlr_box intersection;
+			if (!wlr_box_intersection(&intersection, &box, &clip_box)) {
+				continue;
+			}
+			render_pass_mark_box_updated(pass, &intersection);
+			instance_data[instance_count * 4 + 0] = (float)(intersection.x - box.x) / box.width;
+			instance_data[instance_count * 4 + 1] = (float)(intersection.y - box.y) / box.height;
+			instance_data[instance_count * 4 + 2] = (float)intersection.width / box.width;
+			instance_data[instance_count * 4 + 3] = (float)intersection.height / box.height;
+			instance_count++;
+		}
+		if (instance_count < clip_rects_len) {
+			vulkan_return_stage_span(&span,
+				(clip_rects_len - instance_count) * instance_size);
+			if (instance_count == 0) {
+				break;
+			}
+		}
+
 		struct wlr_vk_vert_pcr_data vert_pcr_data = {
 			.uv_off = { 0, 0 },
 			.uv_size = { 1, 1 },
@@ -705,12 +746,17 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass,
 			VK_SHADER_STAGE_FRAGMENT_BIT, sizeof(vert_pcr_data), sizeof(float) * 4,
 			linear_color);
 
-		for (int i = 0; i < clip_rects_len; i++) {
-			VkRect2D rect;
-			convert_pixman_box_to_vk_rect(&clip_rects[i], &rect);
-			vkCmdSetScissor(cb, 0, 1, &rect);
-			vkCmdDraw(cb, 4, 1, 0, 0);
-		}
+		VkDeviceSize vb_offset = span.offset;
+		vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset);
+
+		VkRect2D full_scissor = {
+			.extent = {
+				.width = pass->render_buffer->wlr_buffer->width,
+				.height = pass->render_buffer->wlr_buffer->height,
+			},
+		};
+		vkCmdSetScissor(cb, 0, 1, &full_scissor);
+		vkCmdDraw(cb, 4, instance_count, 0, 0);
 		break;
 	case WLR_RENDER_BLEND_MODE_NONE:;
 		VkClearAttachment clear_att = {
@@ -727,6 +773,18 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass,
 			.layerCount = 1,
 		};
 		for (int i = 0; i < clip_rects_len; i++) {
+			struct wlr_box clip_box = {
+				.x = clip_rects[i].x1,
+				.y = clip_rects[i].y1,
+				.width = clip_rects[i].x2 - clip_rects[i].x1,
+				.height = clip_rects[i].y2 - clip_rects[i].y1,
+			};
+			// Clip against the resolved box (as the blend path does), not options->box
+			struct wlr_box intersection;
+			if (!wlr_box_intersection(&intersection, &box, &clip_box)) {
+				continue;
+			}
+			render_pass_mark_box_updated(pass, &intersection);
 			convert_pixman_box_to_vk_rect(&clip_rects[i], &clear_rect.rect);
 			vkCmdClearAttachments(cb, 1, &clear_att, 1, &clear_rect);
 		}
@@ -888,12 +946,23 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass,
 
 	int clip_rects_len;
 	const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len);
-	for (int i = 0; i < clip_rects_len; i++) {
-		VkRect2D rect;
-		convert_pixman_box_to_vk_rect(&clip_rects[i], &rect);
-		vkCmdSetScissor(cb, 0, 1, &rect);
-		vkCmdDraw(cb, 4, 1, 0, 0);
 
+	if (clip_rects_len == 0) {
+		goto out;
+	}
+
+	const VkDeviceSize instance_size = 4 * sizeof(float);
+	struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer,
+		clip_rects_len * instance_size, 16);
+	if (!span.buffer) {
+		pass->failed = true;
+		goto out;
+	}
+	float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset);
+	int instance_count = 0;
+	enum wl_output_transform inv_transform =
+		wlr_output_transform_invert(options->transform);
+	for (int i = 0; i < clip_rects_len; i++) {
 		struct wlr_box clip_box = {
 			.x = clip_rects[i].x1,
 			.y = clip_rects[i].y1,
@@ -905,8 +974,44 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass,
 			continue;
 		}
 		render_pass_mark_box_updated(pass, &intersection);
+
+		struct wlr_fbox norm = {
+			.x = (double)(intersection.x - dst_box.x) / dst_box.width,
+			.y = (double)(intersection.y - dst_box.y) / dst_box.height,
+			.width = (double)intersection.width / dst_box.width,
+			.height = (double)intersection.height / dst_box.height,
+		};
+
+		if (options->transform != WL_OUTPUT_TRANSFORM_NORMAL) {
+			wlr_fbox_transform(&norm, &norm, inv_transform, 1.0, 1.0);
+		}
+
+		instance_data[instance_count * 4 + 0] = (float)norm.x;
+		instance_data[instance_count * 4 + 1] = (float)norm.y;
+		instance_data[instance_count * 4 + 2] = (float)norm.width;
+		instance_data[instance_count * 4 + 3] = (float)norm.height;
+		instance_count++;
+	}
+	if (instance_count < clip_rects_len) {
+		vulkan_return_stage_span(&span,
+			(clip_rects_len - instance_count) * instance_size);
 	}
 
+	if (instance_count > 0) {
+		VkDeviceSize vb_offset = span.offset;
+		vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset);
+
+		VkRect2D full_scissor = {
+			.extent = {
+				.width = pass->render_buffer->wlr_buffer->width,
+				.height = pass->render_buffer->wlr_buffer->height,
+			},
+		};
+		vkCmdSetScissor(cb, 0, 1, &full_scissor);
+		vkCmdDraw(cb, 4, instance_count, 0, 0);
+	}
+
+out:
 	texture->last_used_cb = pass->command_buffer;
 
 	pixman_region32_fini(&clip);
diff --git a/render/vulkan/renderer.c b/render/vulkan/renderer.c
index e8e44b3f4..38e8ac9f4 100644
--- a/render/vulkan/renderer.c
+++ b/render/vulkan/renderer.c
@@ -222,7 +222,8 @@ static struct wlr_vk_stage_buffer *stage_buffer_create(
 		.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
 		.size = bsize,
 		.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT |
-			VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+			VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+			VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
 		.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
 	};
 	res = vkCreateBuffer(r->dev->dev, &buf_info, NULL, &buf->buffer);
@@ -1930,6 +1931,25 @@ static bool pipeline_key_equals(const struct wlr_vk_pipeline_key *a,
 	return true;
 }
 
+static const VkVertexInputBindingDescription instance_vert_binding = { // per-instance rect data
+	.binding = 0,
+	.stride = sizeof(float) * 4, // four floats per instance
+	.inputRate = VK_VERTEX_INPUT_RATE_INSTANCE, // stepped per instance, not per vertex
+};
+static const VkVertexInputAttributeDescription instance_vert_attr = {
+	.location = 0, // `inst_rect` input in shaders/common.vert
+	.binding = 0,
+	.format = VK_FORMAT_R32G32B32A32_SFLOAT, // consumed as a vec4
+	.offset = 0,
+};
+static const VkPipelineVertexInputStateCreateInfo instance_vert_input = { // used by both pipeline setups
+	.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+	.vertexBindingDescriptionCount = 1,
+	.pVertexBindingDescriptions = &instance_vert_binding,
+	.vertexAttributeDescriptionCount = 1,
+	.pVertexAttributeDescriptions = &instance_vert_attr,
+};
+
 // Initializes the pipeline for rendering textures and using the given
 // VkRenderPass and VkPipelineLayout.
 struct wlr_vk_pipeline *setup_get_or_create_pipeline(
@@ -2061,10 +2081,6 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline(
 		.dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]),
 	};
 
-	VkPipelineVertexInputStateCreateInfo vertex = {
-		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-	};
-
 	VkGraphicsPipelineCreateInfo pinfo = {
 		.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
 		.layout = pipeline_layout->vk,
@@ -2079,7 +2095,7 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline(
 		.pMultisampleState = &multisample,
 		.pViewportState = &viewport,
 		.pDynamicState = &dynamic,
-		.pVertexInputState = &vertex,
+		.pVertexInputState = &instance_vert_input,
 	};
 
 	VkPipelineCache cache = VK_NULL_HANDLE;
@@ -2178,10 +2194,6 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer,
 		.dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]),
 	};
 
-	VkPipelineVertexInputStateCreateInfo vertex = {
-		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-	};
-
 	VkGraphicsPipelineCreateInfo pinfo = {
 		.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
 		.pNext = NULL,
@@ -2196,7 +2208,7 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer,
 		.pMultisampleState = &multisample,
 		.pViewportState = &viewport,
 		.pDynamicState = &dynamic,
-		.pVertexInputState = &vertex,
+		.pVertexInputState = &instance_vert_input,
 	};
 
 	VkPipelineCache cache = VK_NULL_HANDLE;
diff --git a/render/vulkan/shaders/common.vert b/render/vulkan/shaders/common.vert
index f1579790d..82ea9658c 100644
--- a/render/vulkan/shaders/common.vert
+++ b/render/vulkan/shaders/common.vert
@@ -8,11 +8,14 @@ layout(push_constant, row_major) uniform UBO {
 	vec2 uv_size;
 } data;
 
+layout(location = 0) in vec4 inst_rect;
+
 layout(location = 0) out vec2 uv;
 
 void main() {
 	vec2 pos = vec2(float((gl_VertexIndex + 1) & 2) * 0.5f,
 		float(gl_VertexIndex & 2) * 0.5f);
+	pos = inst_rect.xy + pos * inst_rect.zw; // place quad corner inside the instance rect (xy offset, zw size)
 	uv = data.uv_offset + pos * data.uv_size;
 	gl_Position = data.proj * vec4(pos, 0.0, 1.0);
 }