From 1ec740692f03bb1aa3cc52401c7274f45eb5a088 Mon Sep 17 00:00:00 2001 From: Kenny Levinsen Date: Sun, 12 Apr 2026 17:10:40 +0200 Subject: [PATCH] render/vulkan: New staging buffer implementation Implement a ring-buffer that uses timeline points to track and release allocated spans, upgrading the buffer if it fills and shrinking it if it has been 4x too large for many collections. --- include/render/vulkan.h | 65 +++++--- render/vulkan/pass.c | 15 +- render/vulkan/renderer.c | 336 +++++++++++++++++++++++++-------------- render/vulkan/texture.c | 6 +- 4 files changed, 267 insertions(+), 155 deletions(-) diff --git a/include/render/vulkan.h b/include/render/vulkan.h index 021749c27..12a7ff173 100644 --- a/include/render/vulkan.h +++ b/include/render/vulkan.h @@ -284,8 +284,6 @@ struct wlr_vk_command_buffer { uint64_t timeline_point; // Textures to destroy after the command buffer completes struct wl_list destroy_textures; // wlr_vk_texture.destroy_link - // Staging shared buffers to release after the command buffer completes - struct wl_list stage_buffers; // wlr_vk_shared_buffer.link // Color transform to unref after the command buffer completes struct wlr_color_transform *color_transform; @@ -352,7 +350,7 @@ struct wlr_vk_renderer { struct { struct wlr_vk_command_buffer *cb; uint64_t last_timeline_point; - struct wl_list buffers; // wlr_vk_shared_buffer.link + struct wl_list buffers; // wlr_vk_stage_buffer.link } stage; struct { @@ -453,14 +451,27 @@ struct wlr_vk_render_pass { struct wlr_vk_render_pass *vulkan_begin_render_pass(struct wlr_vk_renderer *renderer, struct wlr_vk_render_buffer *buffer, const struct wlr_buffer_pass_options *options); -// Suballocates a buffer span with the given size that can be mapped -// and used as staging buffer. The allocation is implicitly released when the -// stage cb has finished execution. The start of the span will be a multiple -// of the given alignment. 
+// Suballocates a buffer span with the given size from the staging ring buffer +// that is mapped for CPU access. vulkan_stage_mark_submit must be called after +// allocations are made to mark the timeline point after which the allocations +// will be released. The start of the span will be a multiple of alignment. struct wlr_vk_buffer_span vulkan_get_stage_span( struct wlr_vk_renderer *renderer, VkDeviceSize size, VkDeviceSize alignment); +// Returns unused bytes at the end of a buffer span back to the ring buffer. +// This allows the caller to allocate for worst-case consumption and return the +// unused remainder. This must not be called after vulkan_stage_mark_submit, +// and only works for the last made allocation. +void vulkan_return_stage_span(struct wlr_vk_buffer_span *span, + VkDeviceSize return_size); + +// Records a watermark on all staging buffers with new allocations with the +// specified timeline point. Once the timeline point is passed, the span will +// be reclaimed by vulkan_stage_buffer_reclaim. +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point); + // Tries to allocate a texture descriptor set. Will additionally // return the pool it was allocated from when successful (for freeing it later). struct wlr_vk_descriptor_pool *vulkan_alloc_texture_ds( @@ -544,29 +555,45 @@ struct wlr_vk_descriptor_pool { struct wl_list link; // wlr_vk_renderer.descriptor_pools }; -struct wlr_vk_allocation { - VkDeviceSize start; - VkDeviceSize size; +struct wlr_vk_stage_watermark { + VkDeviceSize head; + uint64_t timeline_point; }; -// List of suballocated staging buffers. -// Used to upload to/read from device local images. 
-struct wlr_vk_shared_buffer { - struct wl_list link; // wlr_vk_renderer.stage.buffers or wlr_vk_command_buffer.stage_buffers +// Ring buffer for staging transfers +struct wlr_vk_stage_buffer { + struct wl_list link; // wlr_vk_renderer.stage.buffers + bool active; VkBuffer buffer; VkDeviceMemory memory; VkDeviceSize buf_size; void *cpu_mapping; - struct wl_array allocs; // struct wlr_vk_allocation - int64_t last_used_ms; + + VkDeviceSize head; + VkDeviceSize tail; + + struct wl_array watermarks; // struct wlr_vk_stage_watermark + VkDeviceSize peak_utilization; + int underutil_count; }; -// Suballocated range on a buffer. +// Suballocated range on a staging ring buffer. struct wlr_vk_buffer_span { - struct wlr_vk_shared_buffer *buffer; - struct wlr_vk_allocation alloc; + struct wlr_vk_stage_buffer *buffer; + VkDeviceSize offset; + VkDeviceSize size; }; +// Suballocate a span of size bytes from a staging ring buffer, with the +// returned offset rounded up to the given alignment. Returns the byte offset +// of the allocation, or (VkDeviceSize)-1 if the buffer is too full to fit it. +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment); + +// Free all allocations covered by watermarks whose timeline point has been +// reached. Returns true if the buffer is now fully drained. 
+bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point); // Prepared form for a color transform struct wlr_vk_color_transform { diff --git a/render/vulkan/pass.c b/render/vulkan/pass.c index 01e8fbd7a..04cb16cf9 100644 --- a/render/vulkan/pass.c +++ b/render/vulkan/pass.c @@ -595,14 +595,7 @@ static bool render_pass_submit(struct wlr_render_pass *wlr_pass) { free(render_wait); - struct wlr_vk_shared_buffer *stage_buf, *stage_buf_tmp; - wl_list_for_each_safe(stage_buf, stage_buf_tmp, &renderer->stage.buffers, link) { - if (stage_buf->allocs.size == 0) { - continue; - } - wl_list_remove(&stage_buf->link); - wl_list_insert(&stage_cb->stage_buffers, &stage_buf->link); - } + vulkan_stage_mark_submit(renderer, render_timeline_point); if (!vulkan_sync_render_pass_release(renderer, pass)) { wlr_log(WLR_ERROR, "Failed to sync render buffer"); @@ -1056,13 +1049,13 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, size_t size = dim_len * dim_len * dim_len * bytes_per_block; struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, size, bytes_per_block); - if (!span.buffer || span.alloc.size != size) { + if (!span.buffer || span.size != size) { wlr_log(WLR_ERROR, "Failed to retrieve staging buffer"); goto fail_imageview; } float sample_range = 1.0f / (dim_len - 1); - char *map = (char *)span.buffer->cpu_mapping + span.alloc.start; + char *map = (char *)span.buffer->cpu_mapping + span.offset; float *dst = (float *)map; for (size_t b_index = 0; b_index < dim_len; b_index++) { for (size_t g_index = 0; g_index < dim_len; g_index++) { @@ -1092,7 +1085,7 @@ static bool create_3d_lut_image(struct wlr_vk_renderer *renderer, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT); VkBufferImageCopy copy = { - .bufferOffset = span.alloc.start, + .bufferOffset = span.offset, .imageExtent.width = dim_len, .imageExtent.height = dim_len, .imageExtent.depth = dim_len, diff --git 
a/render/vulkan/renderer.c b/render/vulkan/renderer.c index 434ab4769..e8e44b3f4 100644 --- a/render/vulkan/renderer.c +++ b/render/vulkan/renderer.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -8,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -26,11 +26,9 @@ #include "render/vulkan/shaders/texture.frag.h" #include "render/vulkan/shaders/quad.frag.h" #include "render/vulkan/shaders/output.frag.h" -#include "types/wlr_buffer.h" -#include "util/time.h" +#include "util/array.h" // TODO: -// - simplify stage allocation, don't track allocations but use ringbuffer-like // - use a pipeline cache (not sure when to save though, after every pipeline // creation?) // - create pipelines as derivatives of each other @@ -187,18 +185,13 @@ static void destroy_render_format_setup(struct wlr_vk_renderer *renderer, free(setup); } -static void shared_buffer_destroy(struct wlr_vk_renderer *r, - struct wlr_vk_shared_buffer *buffer) { +static void stage_buffer_destroy(struct wlr_vk_renderer *r, + struct wlr_vk_stage_buffer *buffer) { if (!buffer) { return; } - if (buffer->allocs.size > 0) { - wlr_log(WLR_ERROR, "shared_buffer_finish: %zu allocations left", - buffer->allocs.size / sizeof(struct wlr_vk_allocation)); - } - - wl_array_release(&buffer->allocs); + wl_array_release(&buffer->watermarks); if (buffer->cpu_mapping) { vkUnmapMemory(r->dev->dev, buffer->memory); buffer->cpu_mapping = NULL; @@ -214,75 +207,12 @@ static void shared_buffer_destroy(struct wlr_vk_renderer *r, free(buffer); } -struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, - VkDeviceSize size, VkDeviceSize alignment) { - // try to find free span - // simple greedy allocation algorithm - should be enough for this usecase - // since all allocations are freed together after the frame - struct wlr_vk_shared_buffer *buf; - wl_list_for_each_reverse(buf, &r->stage.buffers, link) { - VkDeviceSize start = 0u; - if (buf->allocs.size > 0) { - 
const struct wlr_vk_allocation *allocs = buf->allocs.data; - size_t allocs_len = buf->allocs.size / sizeof(struct wlr_vk_allocation); - const struct wlr_vk_allocation *last = &allocs[allocs_len - 1]; - start = last->start + last->size; - } - - assert(start <= buf->buf_size); - - // ensure the proposed start is a multiple of alignment - start += alignment - 1 - ((start + alignment - 1) % alignment); - - if (buf->buf_size - start < size) { - continue; - } - - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; - } - - *a = (struct wlr_vk_allocation){ - .start = start, - .size = size, - }; - return (struct wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, - }; - } - - if (size > max_stage_size) { - wlr_log(WLR_ERROR, "cannot vulkan stage buffer: " - "requested size (%zu bytes) exceeds maximum (%zu bytes)", - (size_t)size, (size_t)max_stage_size); - goto error_alloc; - } - - // we didn't find a free buffer - create one - // size = clamp(max(size * 2, prev_size * 2), min_size, max_size) - VkDeviceSize bsize = size * 2; - bsize = bsize < min_stage_size ? min_stage_size : bsize; - if (!wl_list_empty(&r->stage.buffers)) { - struct wl_list *last_link = r->stage.buffers.prev; - struct wlr_vk_shared_buffer *prev = wl_container_of( - last_link, prev, link); - VkDeviceSize last_size = 2 * prev->buf_size; - bsize = bsize < last_size ? 
last_size : bsize; - } - - if (bsize > max_stage_size) { - wlr_log(WLR_INFO, "vulkan stage buffers have reached max size"); - bsize = max_stage_size; - } - - // create buffer - buf = calloc(1, sizeof(*buf)); +static struct wlr_vk_stage_buffer *stage_buffer_create( + struct wlr_vk_renderer *r, VkDeviceSize bsize) { + struct wlr_vk_stage_buffer *buf = calloc(1, sizeof(*buf)); if (!buf) { wlr_log_errno(WLR_ERROR, "Allocation failed"); - goto error_alloc; + return NULL; } wl_list_init(&buf->link); @@ -319,7 +249,7 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, }; res = vkAllocateMemory(r->dev->dev, &mem_info, NULL, &buf->memory); if (res != VK_SUCCESS) { - wlr_vk_error("vkAllocatorMemory", res); + wlr_vk_error("vkAllocateMemory", res); goto error; } @@ -335,34 +265,209 @@ struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, goto error; } - struct wlr_vk_allocation *a = wl_array_add(&buf->allocs, sizeof(*a)); - if (a == NULL) { - wlr_log_errno(WLR_ERROR, "Allocation failed"); + buf->active = true; + buf->buf_size = bsize; + return buf; + +error: + stage_buffer_destroy(r, buf); + return NULL; +} + +// Returns true if the buffer is fully drained. +bool vulkan_stage_buffer_reclaim(struct wlr_vk_stage_buffer *buf, + uint64_t current_point) { + + // Update utilization metrics before cleaning + VkDeviceSize occupied = buf->head >= buf->tail + ? 
buf->head - buf->tail + : buf->buf_size - buf->tail + buf->head; + if (occupied > buf->peak_utilization) { + buf->peak_utilization = occupied; + } + + size_t completed = 0; + struct wlr_vk_stage_watermark *mark; + wl_array_for_each(mark, &buf->watermarks) { + if (mark->timeline_point > current_point) { + break; + } + buf->tail = mark->head; + completed++; + } + + if (completed > 0) { + completed *= sizeof(struct wlr_vk_stage_watermark); + if (completed == buf->watermarks.size) { + buf->watermarks.size = 0; + } else { + array_remove_at(&buf->watermarks, 0, completed); + } + } + + return buf->head == buf->tail; +} + +VkDeviceSize vulkan_stage_buffer_alloc(struct wlr_vk_stage_buffer *buf, + VkDeviceSize size, VkDeviceSize alignment) { + VkDeviceSize head = buf->head; + + // Round up to the next multiple of alignment; wrap state below is judged from the unaligned buf->head + VkDeviceSize rem = head % alignment; + if (rem != 0) { + head += alignment - rem; + } + + VkDeviceSize end = buf->head >= buf->tail ? buf->buf_size : buf->tail; + if (head + size < end) { + // Regular allocation head till end of available space + buf->head = head + size; + return head; + } else if (size < buf->tail && buf->head >= buf->tail) { + // First allocation after wrap-around, only valid while the ring has not wrapped yet + buf->head = size; + return 0; + } + + return (VkDeviceSize)-1; +} + +struct wlr_vk_buffer_span vulkan_get_stage_span(struct wlr_vk_renderer *r, + VkDeviceSize size, VkDeviceSize alignment) { + if (size >= max_stage_size) { + wlr_log(WLR_ERROR, "cannot allocate stage buffer: " + "requested size (%zu bytes) exceeds maximum (%zu bytes)", + (size_t)size, (size_t)max_stage_size); goto error; } - buf->buf_size = bsize; - wl_list_insert(&r->stage.buffers, &buf->link); + // Try to reclaim and allocate from the active buffer + struct wlr_vk_stage_buffer *buf; + VkDeviceSize max_buf_size = min_stage_size; + wl_list_for_each(buf, &r->stage.buffers, link) { + if (!buf->active) { + continue; + } + VkDeviceSize offset = vulkan_stage_buffer_alloc(buf, size, alignment); + if (offset != 
(VkDeviceSize)-1) { + return (struct wlr_vk_buffer_span) { + .buffer = buf, + .offset = offset, + .size = size, + }; + } + if (buf->buf_size > max_buf_size) { + max_buf_size = buf->buf_size; + } + + // Buffer is full, retire it + buf->active = false; + } + + VkDeviceSize bsize = max_buf_size * 2; + while (size * 2 > bsize) { + bsize *= 2; + } + if (bsize > max_stage_size) { + wlr_log(WLR_INFO, "vulkan stage buffer has reached max size"); + bsize = max_stage_size; + } + + struct wlr_vk_stage_buffer *new_buf = stage_buffer_create(r, bsize); + if (new_buf == NULL) { + goto error; + } + + wl_list_insert(&r->stage.buffers, &new_buf->link); + + VkDeviceSize offset = vulkan_stage_buffer_alloc(new_buf, size, alignment); + assert(offset != (VkDeviceSize)-1); - *a = (struct wlr_vk_allocation){ - .start = 0, - .size = size, - }; return (struct wlr_vk_buffer_span) { - .buffer = buf, - .alloc = *a, + .buffer = new_buf, + .offset = offset, + .size = size, }; error: - shared_buffer_destroy(r, buf); - -error_alloc: return (struct wlr_vk_buffer_span) { .buffer = NULL, - .alloc = (struct wlr_vk_allocation) {0, 0}, + .offset = 0, + .size = 0, }; } +void vulkan_return_stage_span(struct wlr_vk_buffer_span *span, VkDeviceSize return_size) { + assert(return_size <= span->size); + if (span->buffer->head == span->offset + span->size) { + // If the current buffer head is our current buffer, move the head back + span->size -= return_size; + span->buffer->head = span->offset + span->size; + } +} + +void vulkan_stage_mark_submit(struct wlr_vk_renderer *renderer, + uint64_t timeline_point) { + struct wlr_vk_stage_buffer *buf; + wl_list_for_each(buf, &renderer->stage.buffers, link) { + if (buf->head == buf->tail) { + continue; + } + + struct wlr_vk_stage_watermark *mark = wl_array_add( + &buf->watermarks, sizeof(*mark)); + if (mark == NULL) { + wlr_log_errno(WLR_ERROR, "Allocation failed"); + continue; + } + + *mark = (struct wlr_vk_stage_watermark){ + .head = buf->head, + .timeline_point = 
timeline_point, + }; + } +} + +static void vulkan_stage_buffer_gc(struct wlr_vk_renderer *renderer, uint64_t current_point) { + struct wlr_vk_stage_buffer *buf, *buf_tmp; + wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) { + if (!vulkan_stage_buffer_reclaim(buf, current_point)) { + // There are active allocations on this buffer + continue; + } + if (!buf->active) { + stage_buffer_destroy(renderer, buf); + continue; + } + if (buf->buf_size < min_stage_size * 2) { + // We will neither shrink nor deallocate the first buffer + continue; + } + + // Note: We use 1/4th as the underutilization threshold, and when + // underutilized for 100 GC runs we cut the buffer size in half + if (buf->peak_utilization > buf->buf_size / 4) { + buf->underutil_count = 0; + } else { + buf->underutil_count++; + } + buf->peak_utilization = 0; + + if (buf->underutil_count < 100) { + continue; + } + + struct wlr_vk_stage_buffer *shrunk = stage_buffer_create(renderer, buf->buf_size / 2); + if (shrunk == NULL) { + // We'll just keep using the old buffer for now + continue; + } + + wl_list_insert(&renderer->stage.buffers, &shrunk->link); + stage_buffer_destroy(renderer, buf); + } +} + VkCommandBuffer vulkan_record_stage_cb(struct wlr_vk_renderer *renderer) { if (renderer->stage.cb == NULL) { renderer->stage.cb = vulkan_acquire_command_buffer(renderer); @@ -465,16 +570,21 @@ bool vulkan_submit_stage_wait(struct wlr_vk_renderer *renderer, int wait_sync_fi submit_info.pWaitDstStageMask = &wait_stage; } + vulkan_stage_mark_submit(renderer, timeline_point); + VkResult res = vkQueueSubmit(renderer->dev->queue, 1, &submit_info, VK_NULL_HANDLE); if (res != VK_SUCCESS) { wlr_vk_error("vkQueueSubmit", res); return false; } - // NOTE: don't release stage allocations here since they may still be - // used for reading. Will be done next frame. 
+ if (!vulkan_wait_command_buffer(cb, renderer)) { + return false; + } - return vulkan_wait_command_buffer(cb, renderer); + // We did a blocking wait so this is now the current point + vulkan_stage_buffer_gc(renderer, timeline_point); + return true; } struct wlr_vk_format_props *vulkan_format_props_from_drm( @@ -508,7 +618,6 @@ static bool init_command_buffer(struct wlr_vk_command_buffer *cb, .vk = vk_cb, }; wl_list_init(&cb->destroy_textures); - wl_list_init(&cb->stage_buffers); return true; } @@ -534,7 +643,7 @@ bool vulkan_wait_command_buffer(struct wlr_vk_command_buffer *cb, } static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb, - struct wlr_vk_renderer *renderer, int64_t now) { + struct wlr_vk_renderer *renderer) { struct wlr_vk_texture *texture, *texture_tmp; wl_list_for_each_safe(texture, texture_tmp, &cb->destroy_textures, destroy_link) { wl_list_remove(&texture->destroy_link); @@ -542,15 +651,6 @@ static void release_command_buffer_resources(struct wlr_vk_command_buffer *cb, wlr_texture_destroy(&texture->wlr_texture); } - struct wlr_vk_shared_buffer *buf, *buf_tmp; - wl_list_for_each_safe(buf, buf_tmp, &cb->stage_buffers, link) { - buf->allocs.size = 0; - buf->last_used_ms = now; - - wl_list_remove(&buf->link); - wl_list_insert(&renderer->stage.buffers, &buf->link); - } - if (cb->color_transform) { wlr_color_transform_unref(cb->color_transform); cb->color_transform = NULL; @@ -569,22 +669,14 @@ static struct wlr_vk_command_buffer *get_command_buffer( return NULL; } - - // Garbage collect any buffers that have remained unused for too long - int64_t now = get_current_time_msec(); - struct wlr_vk_shared_buffer *buf, *buf_tmp; - wl_list_for_each_safe(buf, buf_tmp, &renderer->stage.buffers, link) { - if (buf->allocs.size == 0 && buf->last_used_ms + 10000 < now) { - shared_buffer_destroy(renderer, buf); - } - } + vulkan_stage_buffer_gc(renderer, current_point); // Destroy textures for completed command buffers for (size_t i = 0; i < 
VULKAN_COMMAND_BUFFERS_CAP; i++) { struct wlr_vk_command_buffer *cb = &renderer->command_buffers[i]; if (cb->vk != VK_NULL_HANDLE && !cb->recording && cb->timeline_point <= current_point) { - release_command_buffer_resources(cb, renderer, now); + release_command_buffer_resources(cb, renderer); } } @@ -1187,7 +1279,7 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) { if (cb->vk == VK_NULL_HANDLE) { continue; } - release_command_buffer_resources(cb, renderer, 0); + release_command_buffer_resources(cb, renderer); if (cb->binary_semaphore != VK_NULL_HANDLE) { vkDestroySemaphore(renderer->dev->dev, cb->binary_semaphore, NULL); } @@ -1199,9 +1291,9 @@ static void vulkan_destroy(struct wlr_renderer *wlr_renderer) { } // stage.cb automatically freed with command pool - struct wlr_vk_shared_buffer *buf, *tmp_buf; + struct wlr_vk_stage_buffer *buf, *tmp_buf; wl_list_for_each_safe(buf, tmp_buf, &renderer->stage.buffers, link) { - shared_buffer_destroy(renderer, buf); + stage_buffer_destroy(renderer, buf); } struct wlr_vk_texture *tex, *tex_tmp; diff --git a/render/vulkan/texture.c b/render/vulkan/texture.c index c6365c90b..9298de804 100644 --- a/render/vulkan/texture.c +++ b/render/vulkan/texture.c @@ -72,16 +72,16 @@ static bool write_pixels(struct wlr_vk_texture *texture, // get staging buffer struct wlr_vk_buffer_span span = vulkan_get_stage_span(renderer, bsize, format_info->bytes_per_block); - if (!span.buffer || span.alloc.size != bsize) { + if (!span.buffer || span.size != bsize) { wlr_log(WLR_ERROR, "Failed to retrieve staging buffer"); free(copies); return false; } - char *map = (char*)span.buffer->cpu_mapping + span.alloc.start; + char *map = (char*)span.buffer->cpu_mapping + span.offset; // upload data - uint32_t buf_off = span.alloc.start; + uint32_t buf_off = span.offset; for (int i = 0; i < rects_len; i++) { pixman_box32_t rect = rects[i]; uint32_t width = rect.x2 - rect.x1;