diff --git a/include/render/vulkan.h b/include/render/vulkan.h index 021749c27..12a7ff173 100644 --- a/include/render/vulkan.h +++ b/include/render/vulkan.h @@ -284,8 +284,6 @@ struct wlr_vk_command_buffer { uint64_t timeline_point; // Textures to destroy after the command buffer completes struct wl_list destroy_textures; // wlr_vk_texture.destroy_link - // Staging shared buffers to release after the command buffer completes - struct wl_list stage_buffers; // wlr_vk_shared_buffer.link // Color transform to unref after the command buffer completes struct wlr_color_transform *color_transform; @@ -352,7 +350,7 @@ struct wlr_vk_renderer { struct { struct wlr_vk_command_buffer *cb; uint64_t last_timeline_point; - struct wl_list buffers; // wlr_vk_shared_buffer.link + struct wl_list buffers; // wlr_vk_stage_buffer.link } stage; struct { @@ -453,14 +451,27 @@ struct wlr_vk_render_pass { struct wlr_vk_render_pass *vulkan_begin_render_pass(struct wlr_vk_renderer *renderer, struct wlr_vk_render_buffer *buffer, const struct wlr_buffer_pass_options *options); -// Suballocates a buffer span with the given size that can be mapped -// and used as staging buffer. The allocation is implicitly released when the -// stage cb has finished execution. The start of the span will be a multiple -// of the given alignment. +// Suballocates a buffer span with the given size from the staging ring buffer +// that is mapped for CPU access. vulkan_stage_mark_submit must be called after +// allocations are made to mark the timeline point after which the allocations +// will be released. The start of the span will be a multiple of alignment. struct wlr_vk_buffer_span vulkan_get_stage_span( struct wlr_vk_renderer *renderer, VkDeviceSize size, VkDeviceSize alignment); +// Returns unused bytes at the end of a buffer span back to the ring buffer. +// This allows the caller to allocate for worst-case consumption and return the +// unused remainder. 
This must not be called after vulkan_stage_mark_submit, +// and only works for the last made allocation. +void vulkan_return_stage_span(struct wlr_vk_buffer_span *span, + VkDeviceSize return_size); + +// Records a watermark on all staging buffers with new allocations with the +// specified timeline point. Once the timeline point is passed, the span will +// be reclaimed by vulkan_stage_buffer_reclaim. +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point); + // Tries to allocate a texture descriptor set. Will additionally // return the pool it was allocated from when successful (for freeing it later). struct wlr_vk_descriptor_pool *vulkan_alloc_texture_ds( @@ -544,29 +555,45 @@ struct wlr_vk_descriptor_pool { struct wl_list link; // wlr_vk_renderer.descriptor_pools }; -struct wlr_vk_allocation { - VkDeviceSize start; - VkDeviceSize size; +struct wlr_vk_stage_watermark { + VkDeviceSize head; + uint64_t timeline_point; }; -// List of suballocated staging buffers. -// Used to upload to/read from device local images. -struct wlr_vk_shared_buffer { - struct wl_list link; // wlr_vk_renderer.stage.buffers or wlr_vk_command_buffer.stage_buffers +// Ring buffer for staging transfers +struct wlr_vk_stage_buffer { + struct wl_list link; // wlr_vk_renderer.stage.buffers + bool active; VkBuffer buffer; VkDeviceMemory memory; VkDeviceSize buf_size; void *cpu_mapping; - struct wl_array allocs; // struct wlr_vk_allocation - int64_t last_used_ms; + + VkDeviceSize head; + VkDeviceSize tail; + + struct wl_array watermarks; // struct wlr_vk_stage_watermark + VkDeviceSize peak_utilization; + int underutil_count; }; -// Suballocated range on a buffer. +// Suballocated range on a staging ring buffer. 
struct wlr_vk_buffer_span { - struct wlr_vk_shared_buffer *buffer; - struct wlr_vk_allocation alloc; + struct wlr_vk_stage_buffer *buffer; + VkDeviceSize offset; + VkDeviceSize size; }; +// Suballocate a span of size bytes from a staging ring buffer, with the +// returned offset rounded up to the given alignment. Returns the byte offset +// of the allocation, or (VkDeviceSize)-1 if the buffer is too full to fit it. +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment); + +// Free all allocations covered by watermarks whose timeline point has been +// reached. Returns true if the buffer is now fully drained. +bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point); // Prepared form for a color transform struct wlr_vk_color_transform { diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index 01e8fbd7a..2dca9d0f3 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -2,7 +2,9 @@ #include #include #include +#include #include +#include #include #include @@ -285,6 +287,20 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { int clip_rects_len; const pixman_box32_t *clip_rects = pixman_region32_rectangles( clip, &clip_rects_len); + + float identity[4] = { 0.0f, 0.0f, 1.0f, 1.0f }; + struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, + sizeof(identity), sizeof(identity)); + if (!span.buffer) { + pass->failed = true; + goto error; + } + + memcpy((char *)span.buffer->cpu_mapping + span.offset, identity, sizeof(identity)); + + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(render_cb->vk, 0, 1, &span.buffer->buffer, &vb_offset); + for (int i = 0; i < clip_rects_len; i++) { VkRect2D rect; convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); @@ -595,14 +611,7 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { free(render_wait); - struct wlr_vk_shared_buffer *stage_buf, *stage_buf_tmp; - 
wl_list_for_each_safe(stage_buf, stage_buf_tmp, &renderer->stage.buffers, link) { - if (stage_buf->allocs.size == 0) { - continue; - } - wl_list_remove(&stage_buf->link); - wl_list_insert(&stage_cb->stage_buffers, &stage_buf->link); - } + vulkan_stage_mark_submit(renderer, render_timeline_point); if (!vulkan_sync_render_pass_release(renderer, pass)) { wlr_log(WLR_ERROR, "Failed to sync render buffer"); @@ -663,20 +672,6 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, int clip_rects_len; const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len); - // Record regions possibly updated for use in second subpass - for (int i = 0; i < clip_rects_len; i++) { - struct wlr_box clip_box = { - .x = clip_rects[i].x1, - .y = clip_rects[i].y1, - .width = clip_rects[i].x2 - clip_rects[i].x1, - .height = clip_rects[i].y2 - clip_rects[i].y1, - }; - struct wlr_box intersection; - if (!wlr_box_intersection(&intersection, &options->box, &clip_box)) { - continue; - } - render_pass_mark_box_updated(pass, &intersection); - } struct wlr_box box; wlr_render_rect_options_get_box(options, pass->render_buffer->wlr_buffer, &box); @@ -699,6 +694,45 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, break; } + if (clip_rects_len == 0) { + break; + } + + const VkDeviceSize instance_size = 4 * sizeof(float); + struct wlr_vk_buffer_span span = vulkan_get_stage_span(pass->renderer, + clip_rects_len * instance_size, 16); + if (!span.buffer) { + pass->failed = true; + break; + } + float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset); + int instance_count = 0; + for (int i = 0; i < clip_rects_len; i++) { + struct wlr_box clip_box = { + .x = clip_rects[i].x1, + .y = clip_rects[i].y1, + .width = clip_rects[i].x2 - clip_rects[i].x1, + .height = clip_rects[i].y2 - clip_rects[i].y1, + }; + struct wlr_box intersection; + if (!wlr_box_intersection(&intersection, &box, &clip_box)) { + continue; + } + 
render_pass_mark_box_updated(pass, &intersection); + instance_data[instance_count * 4 + 0] = (float)(intersection.x - box.x) / box.width; + instance_data[instance_count * 4 + 1] = (float)(intersection.y - box.y) / box.height; + instance_data[instance_count * 4 + 2] = (float)intersection.width / box.width; + instance_data[instance_count * 4 + 3] = (float)intersection.height / box.height; + instance_count++; + } + if (instance_count < clip_rects_len) { + vulkan_return_stage_span(&span, + (clip_rects_len - instance_count) * instance_size); + if (instance_count == 0) { + break; + } + } + struct wlr_vk_vert_pcr_data vert_pcr_data = { .uv_off = { 0, 0 }, .uv_size = { 1, 1 }, @@ -712,12 +746,17 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, VK_SHADER_STAGE_FRAGMENT_BIT, sizeof(vert_pcr_data), sizeof(float) * 4, linear_color); - for (int i = 0; i < clip_rects_len; i++) { - VkRect2D rect; - convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); - vkCmdSetScissor(cb, 0, 1, &rect); - vkCmdDraw(cb, 4, 1, 0, 0); - } + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset); + + VkRect2D full_scissor = { + .extent = { + .width = pass->render_buffer->wlr_buffer->width, + .height = pass->render_buffer->wlr_buffer->height, + }, + }; + vkCmdSetScissor(cb, 0, 1, &full_scissor); + vkCmdDraw(cb, 4, instance_count, 0, 0); break; case WLR_RENDER_BLEND_MODE_NONE:; VkClearAttachment clear_att = { @@ -734,6 +773,18 @@ static void render_pass_add_rect(struct wlr_render_pass *wlr_pass, .layerCount = 1, }; for (int i = 0; i < clip_rects_len; i++) { + struct wlr_box clip_box = { + .x = clip_rects[i].x1, + .y = clip_rects[i].y1, + .width = clip_rects[i].x2 - clip_rects[i].x1, + .height = clip_rects[i].y2 - clip_rects[i].y1, + }; + struct wlr_box intersection; + if (!wlr_box_intersection(&intersection, &options->box, &clip_box)) { + continue; + } + render_pass_mark_box_updated(pass, &intersection); + 
convert_pixman_box_to_vk_rect(&clip_rects[i], &clear_rect.rect); vkCmdClearAttachments(cb, 1, &clear_att, 1, &clear_rect); } @@ -895,12 +946,23 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, int clip_rects_len; const pixman_box32_t *clip_rects = pixman_region32_rectangles(&clip, &clip_rects_len); - for (int i = 0; i < clip_rects_len; i++) { - VkRect2D rect; - convert_pixman_box_to_vk_rect(&clip_rects[i], &rect); - vkCmdSetScissor(cb, 0, 1, &rect); - vkCmdDraw(cb, 4, 1, 0, 0); + if (clip_rects_len == 0) { + goto out; + } + + const VkDeviceSize instance_size = 4 * sizeof(float); + struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, + clip_rects_len * instance_size, 16); + if (!span.buffer) { + pass->failed = true; + goto out; + } + float *instance_data = (float *)((char *)span.buffer->cpu_mapping + span.offset); + int instance_count = 0; + enum wl_output_transform inv_transform = + wlr_output_transform_invert(options->transform); + for (int i = 0; i < clip_rects_len; i++) { struct wlr_box clip_box = { .x = clip_rects[i].x1, .y = clip_rects[i].y1, @@ -912,8 +974,44 @@ static void render_pass_add_texture(struct wlr_render_pass *wlr_pass, continue; } render_pass_mark_box_updated(pass, &intersection); + + struct wlr_fbox norm = { + .x = (double)(intersection.x - dst_box.x) / dst_box.width, + .y = (double)(intersection.y - dst_box.y) / dst_box.height, + .width = (double)intersection.width / dst_box.width, + .height = (double)intersection.height / dst_box.height, + }; + + if (options->transform != WL_OUTPUT_TRANSFORM_NORMAL) { + wlr_fbox_transform(&norm, &norm, inv_transform, 1.0, 1.0); + } + + instance_data[instance_count * 4 + 0] = (float)norm.x; + instance_data[instance_count * 4 + 1] = (float)norm.y; + instance_data[instance_count * 4 + 2] = (float)norm.width; + instance_data[instance_count * 4 + 3] = (float)norm.height; + instance_count++; + } + if (instance_count < clip_rects_len) { + vulkan_return_stage_span(&span, + 
(clip_rects_len - instance_count) * instance_size); } + if (instance_count > 0) { + VkDeviceSize vb_offset = span.offset; + vkCmdBindVertexBuffers(cb, 0, 1, &span.buffer->buffer, &vb_offset); + + VkRect2D full_scissor = { + .extent = { + .width = pass->render_buffer->wlr_buffer->width, + .height = pass->render_buffer->wlr_buffer->height, + }, + }; + vkCmdSetScissor(cb, 0, 1, &full_scissor); + vkCmdDraw(cb, 4, instance_count, 0, 0); + } + +out: texture->last_used_cb = pass->command_buffer; pixman_region32_fini(&clip); @@ -1056,13 +1154,13 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, size_t size = dim_len * dim_len * dim_len * bytes_per_block; struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, size, bytes_per_block); - if (!span.buffer || span.alloc.size != size) { + if (!span.buffer || span.size != size) { wlr_log(WLR_ERROR, "Failed to retrieve staging buffer"); goto fail_imageview; } float sample_range = 1.0f / (dim_len - 1); - char *map = (char *)span.buffer->cpu_mapping + span.alloc.start; + char *map = (char *)span.buffer->cpu_mapping + span.offset; float *dst = (float *)map; for (size_t b_index = 0; b_index < dim_len; b_index++) { for (size_t g_index = 0; g_index < dim_len; g_index++) { @@ -1092,7 +1190,7 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); VkBufferImageCopy copy = { - .bufferOffset = span.alloc.start, + .bufferOffset = span.offset, .imageExtent.width = dim_len, .imageExtent.height = dim_len, .imageExtent.depth = dim_len, diff --git a/render/vulkan/renderer.c b/render/vulkan/renderer.c index 434ab4769..38e8ac9f4 100644 --- a/render/vulkan/renderer.c +++ b/render/vulkan/renderer.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -8,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -26,11 +26,9 @@ #include 
"render/vulkan/shaders/texture.frag.h" #include "render/vulkan/shaders/quad.frag.h" #include "render/vulkan/shaders/output.frag.h" -#include "types/wlr_buffer.h" -#include "util/time.h" +#include "util/array.h" // TODO: -// - simplify stage allocation, don't track allocations but use ringbuffer-like // - use a pipeline cache (not sure when to save though, after every pipeline // creation?) // - create pipelines as derivatives of each other @@ -187,18 +185,13 @@ static void destroy_render_format_setup(struct wlr_vk_renderer *renderer, free(setup); } -static void shared_buffer_destroy(struct wlr_vk_renderer *r, - struct wlr_vk_shared_buffer *buffer) { +static void stage_buffer_destroy(struct wlr_vk_renderer *r, + struct wlr_vk_stage_buffer *buffer) { if (!buffer) { return; } - if (buffer->allocs.size > 0) { - wlr_log(WLR_ERROR, "shared_buffer_finish: %zu allocations left", - buffer->allocs.size / sizeof(struct wlr_vk_allocation)); - } - - wl_array_release(&buffer->allocs); + wl_array_release(&buffer->watermarks); if (buffer->cpu_mapping) { vkUnmapMemory(r->dev->dev, buffer->memory); buffer->cpu_mapping = NULL; @@ -214,75 +207,12 @@ static void shared_buffer_destroy(struct wlr_vk_renderer *r, free(buffer); } -struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, - VkDeviceSize size, VkDeviceSize alignment) { - // try to find free span - // simple greedy allocation algorithm - should be enough for this usecase - // since all allocations are freed together after the frame - struct wlr_vk_shared_buffer *buf; - wl_list_for_each_reverse(buf, &r->stage.buffers, link) { - VkDeviceSize start = 0u; - if (buf->allocs.size > 0) { - const struct wlr_vk_allocation *allocs = buf->allocs.data; - size_t allocs_len = buf->allocs.size / sizeof(struct wlr_vk_allocation); - const struct wlr_vk_allocation *last = &allocs[allocs_len - 1]; - start = last->start + last->size; - } - - assert(start <= buf->buf_size); - - // ensure the proposed start is a multiple of 
alignment - start += alignment - 1 - ((start + alignment - 1) % alignment); - - if (buf->buf_size - start < size) { - continue; - } - - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; - } - - *a = (struct wlr_vk_allocation){ - .start = start, - .size = size, - }; - return (struct wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, - }; - } - - if (size > max_stage_size) { - wlr_log(WLR_ERROR, "cannot vulkan stage buffer: " - "requested size (%zu bytes) exceeds maximum (%zu bytes)", - (size_t)size, (size_t)max_stage_size); - goto error_alloc; - } - - // we didn't find a free buffer - create one - // size = clamp(max(size * 2, prev_size * 2), min_size, max_size) - VkDeviceSize bsize = size * 2; - bsize = bsize < min_stage_size ? min_stage_size : bsize; - if (!wl_list_empty(&r->stage.buffers)) { - struct wl_list *last_link = r->stage.buffers.prev; - struct wlr_vk_shared_buffer *prev = wl_container_of( - last_link, prev, link); - VkDeviceSize last_size = 2 * prev->buf_size; - bsize = bsize < last_size ? 
last_size : bsize; - } - - if (bsize > max_stage_size) { - wlr_log(WLR_INFO, "vulkan stage buffers have reached max size"); - bsize = max_stage_size; - } - - // create buffer - buf = calloc(1, sizeof(*buf)); +static struct wlr_vk_stage_buffer *stage_buffer_create( + struct wlr_vk_renderer *r, VkDeviceSize bsize) { + struct wlr_vk_stage_buffer *buf = calloc(1, sizeof(*buf)); if (!buf) { wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; + return NULL; } wl_list_init(&buf->link); @@ -292,7 +222,8 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .size = bsize, .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | - VK_BUFFER_USAGE_TRANSFER_SRC_BIT, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, }; res = vkCreateBuffer(r->dev->dev, &buf_info, NULL, &buf->buffer); @@ -319,7 +250,7 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, }; res = vkAllocateMemory(r->dev->dev, &mem_info, NULL, &buf->memory); if (res != VK_SUCCESS) { - wlr_vk_error("vkAllocatorMemory", res); + wlr_vk_error("vkAllocateMemory", res); goto error; } @@ -335,34 +266,209 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, goto error; } - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); + buf->active = true; + buf->buf_size = bsize; + return buf; + +error: + stage_buffer_destroy(r, buf); + return NULL; +} + +// Returns true if the buffer is fully drained. +bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point) { + + // Update utilization metrics before cleaning + VkDeviceSize occupied = buf->head >= buf->tail + ? 
buf->head - buf->tail + : buf->buf_size - buf->tail + buf->head; + if (occupied > buf->peak_utilization) { + buf->peak_utilization = occupied; + } + + size_t completed = 0; + struct wlr_vk_stage_watermark *mark; + wl_array_for_each(mark, &buf->watermarks) { + if (mark->timeline_point > current_point) { + break; + } + buf->tail = mark->head; + completed++; + } + + if (completed > 0) { + completed *= sizeof(struct wlr_vk_stage_watermark); + if (completed == buf->watermarks.size) { + buf->watermarks.size = 0; + } else { + array_remove_at(&buf->watermarks, 0, completed); + } + } + + return buf->head == buf->tail; +} + +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment) { + VkDeviceSize head = buf->head; + + // Round up to the next multiple of alignment + VkDeviceSize rem = head % alignment; + if (rem != 0) { + head += alignment - rem; + } + + VkDeviceSize end = head >= buf->tail ? buf->buf_size : buf->tail; + if (head + size < end) { + // Regular allocation head till end of available space + buf->head = head + size; + return head; + } else if (size < buf->tail && head >= buf->tail) { + // First allocation after wrap-around + buf->head = size; + return 0; + } + + return (VkDeviceSize)-1; +} + +struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, + VkDeviceSize size, VkDeviceSize alignment) { + if (size > max_stage_size) { + wlr_log(WLR_ERROR, "cannot allocate stage buffer: " + "requested size (%zu bytes) exceeds maximum (%zu bytes)", + (size_t)size, (size_t)max_stage_size); goto error; } - buf->buf_size = bsize; - wl_list_insert(&r->stage.buffers, &buf->link); + // Try to reclaim and allocate from the active buffer + struct wlr_vk_stage_buffer *buf; + VkDeviceSize max_buf_size = min_stage_size; + wl_list_for_each(buf, &r->stage.buffers, link) { + if (!buf->active) { + continue; + } + VkDeviceSize offset = vulkan_stage_buffer_alloc(buf, size, alignment); + if (offset != 
(VkDeviceSize)-1) { + return (struct wlr_vk_buffer_span) { + .buffer = buf, + .offset = offset, + .size = size, + }; + } + if (buf->buf_size > max_buf_size) { + max_buf_size = buf->buf_size; + } + + // Buffer is full, retire it + buf->active = false; + } + + VkDeviceSize bsize = max_buf_size * 2; + while (size * 2 > bsize) { + bsize *= 2; + } + if (bsize > max_stage_size) { + wlr_log(WLR_INFO, "vulkan stage buffer has reached max size"); + bsize = max_stage_size; + } + + struct wlr_vk_stage_buffer *new_buf = stage_buffer_create(r, bsize); + if (new_buf == NULL) { + goto error; + } + + wl_list_insert(&r->stage.buffers, &new_buf->link); + + VkDeviceSize offset = vulkan_stage_buffer_alloc(new_buf, size, alignment); + assert(offset != (VkDeviceSize)-1); - *a = (struct wlr_vk_allocation){ - .start = 0, - .size = size, - }; return (struct wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, + .buffer = new_buf, + .offset = offset, + .size = size, }; error: - shared_buffer_destroy(r, buf); - -error_alloc: return (struct wlr_vk_buffer_span) { .buffer = NULL, - .alloc = (struct wlr_vk_allocation) {0, 0}, + .offset = 0, + .size = 0, }; } +void vulkan_return_stage_span(struct wlr_vk_buffer_span *span, VkDeviceSize return_size) { + assert(return_size <= span->size); + if (span->buffer->head == span->offset + span->size) { + // Only the most recent allocation can be shrunk: check that the buffer + // head still points at the end of our span, then move it back + span->size -= return_size; + span->buffer->head = span->offset + span->size; + } +} + +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point) { + struct wlr_vk_stage_buffer *buf; + wl_list_for_each(buf, &renderer->stage.buffers, link) { + if (buf->head == buf->tail) { + continue; + } + + struct wlr_vk_stage_watermark *mark = wl_array_add( + &buf->watermarks, sizeof(*mark)); + if (mark == NULL) { + wlr_log_errno(WLR_ERROR, "Allocation failed"); + continue; + } + + *mark = (struct wlr_vk_stage_watermark){ + .head = buf->head, + .timeline_point = 
timeline_point, + }; + } +} + +static void vulkan_stage_buffer_gc(struct wlr_vk_renderer *renderer, uint64_t current_point) { + struct wlr_vk_stage_buffer *buf, *buf_tmp; + wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) { + if (!vulkan_stage_buffer_reclaim(buf, current_point)) { + // There are active allocations on this buffer + continue; + } + if (!buf->active) { + stage_buffer_destroy(renderer, buf); + continue; + } + if (buf->buf_size < min_stage_size * 2) { + // We will neither shrink nor deallocate the first buffer + continue; + } + + // Note: We use 1/4th as the underutilization threshold, and when + // underutilized for 100 GC runs we cut the buffer size in half + if (buf->peak_utilization > buf->buf_size / 4) { + buf->underutil_count = 0; + } else { + buf->underutil_count++; + } + buf->peak_utilization = 0; + + if (buf->underutil_count < 100) { + continue; + } + + struct wlr_vk_stage_buffer *shrunk = stage_buffer_create(renderer, buf->buf_size / 2); + if (shrunk == NULL) { + // We'll just keep using the old buffer for now + continue; + } + + wl_list_insert(&renderer->stage.buffers, &shrunk->link); + stage_buffer_destroy(renderer, buf); + } +} + VkCommandBuffer vulkan_record_stage_cb(struct wlr_vk_renderer *renderer) { if (renderer->stage.cb == NULL) { renderer->stage.cb = vulkan_acquire_command_buffer(renderer); @@ -465,16 +571,21 @@ bool vulkan_submit_stage_wait(struct wlr_vk_renderer *renderer, int wait_sync_fi submit_info.pWaitDstStageMask = &wait_stage; } + vulkan_stage_mark_submit(renderer, timeline_point); + VkResult res = vkQueueSubmit(renderer->dev->queue, 1, &submit_info, VK_NULL_HANDLE); if (res != VK_SUCCESS) { wlr_vk_error("vkQueueSubmit", res); return false; } - // NOTE: don't release stage allocations here since they may still be - // used for reading. Will be done next frame. 
+	if (!vulkan_wait_command_buffer(cb, renderer)) {
+		return false;
+	}
 
-	return vulkan_wait_command_buffer(cb, renderer);
+	// We did a blocking wait so this is now the current point
+	vulkan_stage_buffer_gc(renderer, timeline_point);
+	return true;
 }
 
 struct wlr_vk_format_props *vulkan_format_props_from_drm(
@@ -508,7 +619,6 @@ static bool init_command_buffer(struct wlr_vk_command_buffer *cb,
 		.vk = vk_cb,
 	};
 	wl_list_init(&cb->destroy_textures);
-	wl_list_init(&cb->stage_buffers);
 	return true;
 }
 
@@ -534,7 +644,7 @@ bool vulkan_wait_command_buffer(struct wlr_vk_command_buffer *cb,
 }
 
 static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb,
-		struct wlr_vk_renderer *renderer, int64_t now) {
+		struct wlr_vk_renderer *renderer) {
 	struct wlr_vk_texture *texture, *texture_tmp;
 	wl_list_for_each_safe(texture, texture_tmp, &cb->destroy_textures, destroy_link) {
 		wl_list_remove(&texture->destroy_link);
@@ -542,15 +652,6 @@ static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb,
 		wlr_texture_destroy(&texture->wlr_texture);
 	}
 
-	struct wlr_vk_shared_buffer *buf, *buf_tmp;
-	wl_list_for_each_safe(buf, buf_tmp, &cb->stage_buffers, link) {
-		buf->allocs.size = 0;
-		buf->last_used_ms = now;
-
-		wl_list_remove(&buf->link);
-		wl_list_insert(&renderer->stage.buffers, &buf->link);
-	}
-
 	if (cb->color_transform) {
 		wlr_color_transform_unref(cb->color_transform);
 		cb->color_transform = NULL;
@@ -569,22 +670,14 @@ static struct wlr_vk_command_buffer *get_command_buffer(
 
 		return NULL;
 	}
-
-	// Garbage collect any buffers that have remained unused for too long
-	int64_t now = get_current_time_msec();
-	struct wlr_vk_shared_buffer *buf, *buf_tmp;
-	wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) {
-		if (buf->allocs.size == 0 && buf->last_used_ms + 10000 < now) {
-			shared_buffer_destroy(renderer, buf);
-		}
-	}
+	vulkan_stage_buffer_gc(renderer, current_point);
 
 	// Destroy textures for completed command buffers
 	for (size_t i = 0; i < VULKAN_COMMAND_BUFFERS_CAP; i++) {
 		struct wlr_vk_command_buffer *cb = &renderer->command_buffers[i];
 		if (cb->vk != VK_NULL_HANDLE && !cb->recording &&
 				cb->timeline_point <= current_point) {
-			release_command_buffer_resources(cb, renderer, now);
+			release_command_buffer_resources(cb, renderer);
 		}
 	}
 
@@ -1187,7 +1280,7 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) {
 		if (cb->vk == VK_NULL_HANDLE) {
 			continue;
 		}
-		release_command_buffer_resources(cb, renderer, 0);
+		release_command_buffer_resources(cb, renderer);
 		if (cb->binary_semaphore != VK_NULL_HANDLE) {
 			vkDestroySemaphore(renderer->dev->dev, cb->binary_semaphore, NULL);
 		}
@@ -1199,9 +1292,9 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) {
 	}
 
 	// stage.cb automatically freed with command pool
-	struct wlr_vk_shared_buffer *buf, *tmp_buf;
+	struct wlr_vk_stage_buffer *buf, *tmp_buf;
 	wl_list_for_each_safe(buf, tmp_buf, &renderer->stage.buffers, link) {
-		shared_buffer_destroy(renderer, buf);
+		stage_buffer_destroy(renderer, buf);
 	}
 
 	struct wlr_vk_texture *tex, *tex_tmp;
@@ -1838,6 +1931,25 @@ static bool pipeline_key_equals(const struct wlr_vk_pipeline_key *a,
 	return true;
 }
 
+static const VkVertexInputBindingDescription instance_vert_binding = {
+	.binding = 0,
+	.stride = sizeof(float) * 4,
+	.inputRate = VK_VERTEX_INPUT_RATE_INSTANCE,
+};
+static const VkVertexInputAttributeDescription instance_vert_attr = {
+	.location = 0,
+	.binding = 0,
+	.format = VK_FORMAT_R32G32B32A32_SFLOAT,
+	.offset = 0,
+};
+static const VkPipelineVertexInputStateCreateInfo instance_vert_input = {
+	.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+	.vertexBindingDescriptionCount = 1,
+	.pVertexBindingDescriptions = &instance_vert_binding,
+	.vertexAttributeDescriptionCount = 1,
+	.pVertexAttributeDescriptions = &instance_vert_attr,
+};
+
 // Initializes the pipeline for rendering textures and using the given
 // VkRenderPass and VkPipelineLayout.
 struct wlr_vk_pipeline *setup_get_or_create_pipeline(
@@ -1969,10 +2081,6 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline(
 		.dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]),
 	};
 
-	VkPipelineVertexInputStateCreateInfo vertex = {
-		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-	};
-
 	VkGraphicsPipelineCreateInfo pinfo = {
 		.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
 		.layout = pipeline_layout->vk,
@@ -1987,7 +2095,7 @@ struct wlr_vk_pipeline *setup_get_or_create_pipeline(
 		.pMultisampleState = &multisample,
 		.pViewportState = &viewport,
 		.pDynamicState = &dynamic,
-		.pVertexInputState = &vertex,
+		.pVertexInputState = &instance_vert_input,
 	};
 
 	VkPipelineCache cache = VK_NULL_HANDLE;
@@ -2086,10 +2194,6 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer,
 		.dynamicStateCount = sizeof(dyn_states) / sizeof(dyn_states[0]),
 	};
 
-	VkPipelineVertexInputStateCreateInfo vertex = {
-		.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
-	};
-
 	VkGraphicsPipelineCreateInfo pinfo = {
 		.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
 		.pNext = NULL,
@@ -2104,7 +2208,7 @@ static bool init_blend_to_output_pipeline(struct wlr_vk_renderer *renderer,
 		.pMultisampleState = &multisample,
 		.pViewportState = &viewport,
 		.pDynamicState = &dynamic,
-		.pVertexInputState = &vertex,
+		.pVertexInputState = &instance_vert_input,
 	};
 
 	VkPipelineCache cache = VK_NULL_HANDLE;
diff --git a/render/vulkan/shaders/common.vert b/render/vulkan/shaders/common.vert
index f1579790d..82ea9658c 100644
--- a/render/vulkan/shaders/common.vert
+++ b/render/vulkan/shaders/common.vert
@@ -8,11 +8,14 @@ layout(push_constant, row_major) uniform UBO {
 	vec2 uv_size;
 } data;
 
+layout(location = 0) in vec4 inst_rect;
+
 layout(location = 0) out vec2 uv;
 
 void main() {
 	vec2 pos = vec2(float((gl_VertexIndex + 1) & 2) * 0.5f,
 		float(gl_VertexIndex & 2) * 0.5f);
+	pos = inst_rect.xy + pos * inst_rect.zw;
 	uv = data.uv_offset + pos * data.uv_size;
 	gl_Position = data.proj * vec4(pos, 0.0, 1.0);
 }
diff --git a/render/vulkan/texture.c b/render/vulkan/texture.c
index c6365c90b..9298de804 100644
--- a/render/vulkan/texture.c
+++ b/render/vulkan/texture.c
@@ -72,16 +72,16 @@ static bool write_pixels(struct wlr_vk_texture *texture,
 	// get staging buffer
 	struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, bsize,
 		format_info->bytes_per_block);
-	if (!span.buffer || span.alloc.size != bsize) {
+	if (!span.buffer || span.size != bsize) {
 		wlr_log(WLR_ERROR, "Failed to retrieve staging buffer");
 		free(copies);
 		return false;
 	}
 
-	char *map = (char*)span.buffer->cpu_mapping + span.alloc.start;
+	char *map = (char*)span.buffer->cpu_mapping + span.offset;
 
 	// upload data
-	uint32_t buf_off = span.alloc.start;
+	uint32_t buf_off = span.offset;
 	for (int i = 0; i < rects_len; i++) {
 		pixman_box32_t rect = rects[i];
 		uint32_t width = rect.x2 - rect.x1;
diff --git a/test/meson.build b/test/meson.build
index f51b2c02c..9c622e3ef 100644
--- a/test/meson.build
+++ b/test/meson.build
@@ -1,8 +1,30 @@
+# Used to test internal symbols
+lib_wlr_internal = static_library(
+	versioned_name + '-internal',
+	objects: lib_wlr.extract_all_objects(recursive: false),
+	dependencies: wlr_deps,
+	include_directories: [wlr_inc],
+	install: false,
+)
+
 test(
 	'box',
 	executable('test-box', 'test_box.c', dependencies: wlroots),
 )
 
+if features.get('vulkan-renderer')
+	test(
+		'vulkan_stage_buffer',
+		executable(
+			'test-vulkan-stage-buffer',
+			'test_vulkan_stage_buffer.c',
+			link_with: lib_wlr_internal,
+			dependencies: wlr_deps,
+			include_directories: wlr_inc,
+		),
+	)
+endif
+
 benchmark(
 	'scene',
 	executable('bench-scene', 'bench_scene.c', dependencies: wlroots),
diff --git a/test/test_vulkan_stage_buffer.c b/test/test_vulkan_stage_buffer.c
new file mode 100644
index 000000000..cceefa8de
--- /dev/null
+++ b/test/test_vulkan_stage_buffer.c
@@ -0,0 +1,234 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "render/vulkan.h"
+
+#define BUF_SIZE 1024
+#define ALLOC_FAIL ((VkDeviceSize)-1)
+
+static void stage_buffer_init(struct wlr_vk_stage_buffer *buf) {
+	*buf = (struct wlr_vk_stage_buffer){
+		.buf_size = BUF_SIZE,
+	};
+	wl_array_init(&buf->watermarks);
+}
+
+static void stage_buffer_finish(struct wlr_vk_stage_buffer *buf) {
+	wl_array_release(&buf->watermarks);
+}
+
+static void push_watermark(struct wlr_vk_stage_buffer *buf,
+		uint64_t timeline_point) {
+	struct wlr_vk_stage_watermark *mark = wl_array_add(
+		&buf->watermarks, sizeof(*mark));
+	assert(mark != NULL);
+	*mark = (struct wlr_vk_stage_watermark){
+		.head = buf->head,
+		.timeline_point = timeline_point,
+	};
+}
+
+static size_t watermark_count(const struct wlr_vk_stage_buffer *buf) {
+	return buf->watermarks.size / sizeof(struct wlr_vk_stage_watermark);
+}
+
+static void test_alloc_simple(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	assert(buf.head == 100);
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 100);
+	assert(buf.head == 300);
+	assert(buf.tail == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_alignment(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 7, 1) == 0);
+	assert(buf.head == 7);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 4, 16) == 16);
+	assert(buf.head == 20);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 8, 8) == 24);
+	assert(buf.head == 32);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_limit(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// We do not allow allocations that would cause head to equal tail
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE, 1) == ALLOC_FAIL);
+	assert(buf.head == 0);
+
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE-1, 1) == 0);
+	assert(buf.head == BUF_SIZE-1);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_alloc_wrap(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// Fill the first 924 bytes
+	assert(vulkan_stage_buffer_alloc(&buf, BUF_SIZE - 100, 1) == 0);
+	push_watermark(&buf, 1);
+
+	// Fill the end of the buffer
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 924);
+	push_watermark(&buf, 2);
+
+	// First, check that we don't wrap prematurely
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == ALLOC_FAIL);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == ALLOC_FAIL);
+
+	// Free the beginning of the buffer and try to wrap again
+	vulkan_stage_buffer_reclaim(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 50, 1) == 0);
+	assert(buf.tail == 924);
+	assert(buf.head == 50);
+
+	// Check that freeing from the end of the buffer still works
+	vulkan_stage_buffer_reclaim(&buf, 2);
+	assert(buf.tail == 974);
+	assert(buf.head == 50);
+
+	// Check that allocations still work
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 50);
+	assert(buf.tail == 974);
+	assert(buf.head == 150);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_empty(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// Fresh buffer with no watermarks and head == tail == 0 is drained.
+	assert(vulkan_stage_buffer_reclaim(&buf, 0));
+	assert(buf.tail == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_pending_not_completed(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+
+	// current point hasn't reached the watermark yet.
+	assert(!vulkan_stage_buffer_reclaim(&buf, 0));
+	assert(buf.tail == 0);
+	assert(watermark_count(&buf) == 1);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_partial(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100);
+	push_watermark(&buf, 2);
+
+	// Only the first watermark is reached.
+	assert(!vulkan_stage_buffer_reclaim(&buf, 1));
+	assert(buf.tail == 100);
+	assert(watermark_count(&buf) == 1);
+
+	const struct wlr_vk_stage_watermark *remaining = buf.watermarks.data;
+	assert(remaining[0].head == 200);
+	assert(remaining[0].timeline_point == 2);
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_reclaim_all(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	push_watermark(&buf, 1);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 100);
+	push_watermark(&buf, 2);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 200);
+	push_watermark(&buf, 3);
+
+	assert(vulkan_stage_buffer_reclaim(&buf, 100));
+	assert(buf.tail == 300);
+	assert(watermark_count(&buf) == 0);
+
+	stage_buffer_finish(&buf);
+}
+
+
+static void test_peak_utilization(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	assert(buf.peak_utilization == 0);
+	assert(vulkan_stage_buffer_alloc(&buf, 100, 1) == 0);
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 100);
+	vulkan_stage_buffer_reclaim(&buf, 0);
+	assert(buf.peak_utilization == 300);
+
+
+	stage_buffer_finish(&buf);
+}
+
+static void test_peak_utilization_wrap(void) {
+	struct wlr_vk_stage_buffer buf;
+	stage_buffer_init(&buf);
+
+	// 200 bytes used, 100 bytes from wrap
+	buf.head = BUF_SIZE - 100;
+	buf.tail = buf.head - 200;
+
+	// With 100 bytes left, we wrap to front and waste 100 bytes
+	assert(vulkan_stage_buffer_alloc(&buf, 200, 1) == 0);
+	vulkan_stage_buffer_reclaim(&buf, 0);
+	assert(buf.head == 200);
+	assert(buf.tail == BUF_SIZE - 300);
+
+	// 200 bytes initial + 100 bytes wasted + 200 bytes allocated = 500
+	assert(buf.peak_utilization == 500);
+
+	stage_buffer_finish(&buf);
+}
+
+int main(void) {
+#ifdef NDEBUG
+	fprintf(stderr, "NDEBUG must be disabled for tests\n");
+	return 1;
+#endif
+
+	test_alloc_simple();
+	test_alloc_alignment();
+	test_alloc_limit();
+	test_alloc_wrap();
+
+	test_reclaim_empty();
+	test_reclaim_pending_not_completed();
+	test_reclaim_partial();
+	test_reclaim_all();
+
+	test_peak_utilization();
+	test_peak_utilization_wrap();
+
+	return 0;
+}