diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2aaf488..65f0bbab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,6 +68,13 @@
 ## Unreleased
 ### Added
+
+* Improved performance and reduced input latency on compositors
+  that do not release SHM buffers immediately ([#2188][2188]).
+
+[2188]: https://codeberg.org/dnkl/foot/issues/2188
+
+
 ### Changed
 
 * SHM buffer sizes are now rounded up to nearest page size, and their
diff --git a/config.c b/config.c
index 459a1de9..06817247 100644
--- a/config.c
+++ b/config.c
@@ -2848,9 +2848,11 @@ parse_section_tweak(struct context *ctx)
 #endif
     }
 
-    else if (streq(key, "min-stride-alignment")) {
+    else if (streq(key, "min-stride-alignment"))
         return value_to_uint32(ctx, 10, &conf->tweak.min_stride_alignment);
-    }
+
+    else if (streq(key, "pre-apply-damage"))
+        return value_to_bool(ctx, &conf->tweak.preapply_damage);
 
     else {
         LOG_CONTEXTUAL_ERR("not a valid option: %s", key);
@@ -3501,6 +3503,7 @@ config_load(struct config *conf, const char *conf_path,
             .sixel = true,
             .surface_bit_depth = SHM_BITS_AUTO,
             .min_stride_alignment = 256,
+            .preapply_damage = true,
         },
 
         .touch = {
diff --git a/config.h b/config.h
index 11439d3a..5b7ff11e 100644
--- a/config.h
+++ b/config.h
@@ -436,6 +436,7 @@ struct config {
         bool sixel;
         enum shm_bit_depth surface_bit_depth;
         uint32_t min_stride_alignment;
+        bool preapply_damage;
     } tweak;
 
     struct {
diff --git a/doc/foot.ini.5.scd b/doc/foot.ini.5.scd
index 56b76be7..7b08d5d4 100644
--- a/doc/foot.ini.5.scd
+++ b/doc/foot.ini.5.scd
@@ -2093,6 +2093,41 @@
 any of these options.
 
 	Default: _auto_
 
+*pre-apply-damage*
+	Boolean. When enabled, foot will attempt to "pre-apply" the damage
+	from the last frame when foot is forced to double-buffer
+	(i.e. when the compositor does not release SHM buffers
+	immediately). The rest of this description assumes the compositor
+	does not release buffers immediately.
+
+	When this option is disabled, each time foot needs to render a
+	frame, it has to first copy over areas that changed in the last
+	frame (i.e. all changes between the last two frames). This is
+	basically a *memcpy*(3), which can be slow if the changed area is
+	large. It is also done on the main thread, which means foot cannot
+	do anything else at the same time; no other rendering, no VT
+	parsing. After the changes have been brought over to the new
+	frame, foot proceeds with rendering the cells that have changed
+	between the last frame and the new frame.
+
+	When this option is enabled, the changes between the last two
+	frames are brought over to what will become the next frame before
+	foot starts rendering the next frame. As soon as the compositor
+	releases the previous buffer (typically right after foot has
+	pushed a new frame), foot kicks off a thread that copies over the
+	changes to the newly released buffer. Since this is done in a
+	thread, foot can continue processing input at the same time.
+	Later, when it is time to render a new frame, the changes have
+	already been transferred, and foot can immediately start with the
+	actual rendering.
+
+	Thus, having this option enabled improves both performance
+	(copying the last two frames' changes is threaded) and input
+	latency (rendering the next frame no longer has to first bring
+	over the changes between the last two frames).
+
+	Default: _yes_
+
 # SEE ALSO
 
 *foot*(1), *footclient*(1)
diff --git a/pgo/pgo.c b/pgo/pgo.c
index 757dcd06..4ff4111c 100644
--- a/pgo/pgo.c
+++ b/pgo/pgo.c
@@ -74,6 +74,8 @@ void render_refresh_icon(struct terminal *term) {}
 
 void render_overlay(struct terminal *term) {}
 
+void render_buffer_release_callback(struct buffer *buf, void *data) {}
+
 bool
 render_xcursor_is_valid(const struct seat *seat, const char *cursor)
 {
@@ -206,7 +208,8 @@ enum shm_bit_depth shm_chain_bit_depth(const struct buffer_chain *chain) { retur
 struct buffer_chain *
 shm_chain_new(
     struct wayland *wayl, bool scrollable, size_t pix_instances,
-    enum shm_bit_depth desired_bit_depth)
+    enum shm_bit_depth desired_bit_depth,
+    void (*release_cb)(struct buffer *buf, void *data), void *cb_data)
 {
     return NULL;
 }
diff --git a/render.c b/render.c
index 1c24bafa..35752125 100644
--- a/render.c
+++ b/render.c
@@ -2224,6 +2224,56 @@ render_worker_thread(void *_ctx)
 
             case -2:
                 return 0;
+
+            case -3: {
+                if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
+                    clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.start);
+
+                mtx_lock(&term->render.workers.preapplied_damage.lock);
+                buf = term->render.workers.preapplied_damage.buf;
+                xassert(buf != NULL);
+
+                if (likely(term->render.last_buf != NULL)) {
+                    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+                    pixman_region32_t dmg;
+                    pixman_region32_init(&dmg);
+
+                    if (buf->age == 0)
+                        ; /* No need to do anything */
+                    else if (buf->age == 1)
+                        pixman_region32_copy(&dmg,
+                                             &term->render.last_buf->dirty[0]);
+                    else
+                        pixman_region32_init_rect(&dmg, 0, 0, buf->width,
+                                                  buf->height);
+
+                    pixman_image_set_clip_region32(buf->pix[my_id], &dmg);
+                    pixman_image_composite32(PIXMAN_OP_SRC,
+                                             term->render.last_buf->pix[my_id],
+                                             NULL, buf->pix[my_id], 0, 0, 0, 0, 0,
+                                             0, buf->width, buf->height);
+
+                    pixman_region32_fini(&dmg);
+
+                    buf->age = 0;
+                    shm_unref(term->render.last_buf);
+                    shm_addref(buf);
+                    term->render.last_buf = buf;
+
+                    mtx_lock(&term->render.workers.preapplied_damage.lock);
+                }
+
+                term->render.workers.preapplied_damage.buf = NULL;
+                cnd_signal(&term->render.workers.preapplied_damage.cond);
+                mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+                if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
+                    clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.stop);
+
+                frame_done = true;
+                break;
+            }
             }
         }
     };
@@ -2231,6 +2281,22 @@
     return -1;
 }
 
+static void
+wait_for_preapply_damage(struct terminal *term)
+{
+    if (!term->render.preapply_last_frame_damage)
+        return;
+    if (term->render.workers.preapplied_damage.buf == NULL)
+        return;
+
+    mtx_lock(&term->render.workers.preapplied_damage.lock);
+    while (term->render.workers.preapplied_damage.buf != NULL) {
+        cnd_wait(&term->render.workers.preapplied_damage.cond,
+                 &term->render.workers.preapplied_damage.lock);
+    }
+    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+}
+
 struct csd_data
 get_csd_data(const struct terminal *term, enum csd_surface surf_idx)
 {
@@ -3113,14 +3179,6 @@ force_full_repaint(struct terminal *term, struct buffer *buf)
 static void
 reapply_old_damage(struct terminal *term, struct buffer *new, struct buffer *old)
 {
-    static int counter = 0;
-    static bool have_warned = false;
-    if (!have_warned && ++counter > 5) {
-        LOG_WARN("compositor is not releasing buffers immediately; "
-                 "expect lower rendering performance");
-        have_warned = true;
-    }
-
     if (new->age > 1) {
         memcpy(new->data, old->data, new->height * new->stride);
         return;
     }
@@ -3251,7 +3309,18 @@ grid_render(struct terminal *term)
     if (term->shutdown.in_progress)
         return;
 
-    struct timespec start_time, start_double_buffering = {0}, stop_double_buffering = {0};
+    struct timespec start_time;
+    struct timespec start_wait_preapply = {0}, stop_wait_preapply = {0};
+    struct timespec start_double_buffering = {0}, stop_double_buffering = {0};
+
+    /* Might be a thread doing pre-applied damage */
+    if (unlikely(term->render.preapply_last_frame_damage &&
+                 term->render.workers.preapplied_damage.buf != NULL))
+    {
+        clock_gettime(CLOCK_MONOTONIC, &start_wait_preapply);
+        wait_for_preapply_damage(term);
+        clock_gettime(CLOCK_MONOTONIC, &stop_wait_preapply);
+    }
 
     if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
         clock_gettime(CLOCK_MONOTONIC, &start_time);
@@ -3269,6 +3338,8 @@ grid_render(struct terminal *term)
     dirty_old_cursor(term);
     dirty_cursor(term);
 
+    LOG_DBG("buffer age: %u (%p)", buf->age, (void *)buf);
+
     if (term->render.last_buf == NULL ||
         term->render.last_buf->width != buf->width ||
         term->render.last_buf->height != buf->height ||
@@ -3285,9 +3356,27 @@ grid_render(struct terminal *term)
         xassert(term->render.last_buf->width == buf->width);
         xassert(term->render.last_buf->height == buf->height);
 
+        if (++term->render.frames_since_last_immediate_release > 10) {
+            static bool have_warned = false;
+
+            if (!term->render.preapply_last_frame_damage &&
+                term->conf->tweak.preapply_damage &&
+                term->render.workers.count > 0)
+            {
+                LOG_INFO("enabling pre-applied frame damage");
+                term->render.preapply_last_frame_damage = true;
+            } else if (!have_warned) {
+                LOG_WARN("compositor is not releasing buffers immediately; "
+                         "expect lower rendering performance");
+                have_warned = true;
+            }
+        }
+
         clock_gettime(CLOCK_MONOTONIC, &start_double_buffering);
         reapply_old_damage(term, buf, term->render.last_buf);
         clock_gettime(CLOCK_MONOTONIC, &stop_double_buffering);
+    } else if (!term->render.preapply_last_frame_damage) {
+        term->render.frames_since_last_immediate_release = 0;
     }
 
     if (term->render.last_buf != NULL) {
@@ -3515,27 +3604,40 @@ grid_render(struct terminal *term)
         struct timespec end_time;
         clock_gettime(CLOCK_MONOTONIC, &end_time);
 
+        struct timespec wait_time;
+        timespec_sub(&stop_wait_preapply, &start_wait_preapply, &wait_time);
+
         struct timespec render_time;
         timespec_sub(&end_time, &start_time, &render_time);
 
         struct timespec double_buffering_time;
         timespec_sub(&stop_double_buffering, &start_double_buffering, &double_buffering_time);
 
+        struct timespec preapply_damage;
+        timespec_sub(&term->render.workers.preapplied_damage.stop,
+                     &term->render.workers.preapplied_damage.start,
+                     &preapply_damage);
+
         struct timespec total_render_time;
         timespec_add(&render_time, &double_buffering_time, &total_render_time);
+        timespec_add(&wait_time, &total_render_time, &total_render_time);
 
         switch (term->conf->tweak.render_timer) {
         case RENDER_TIMER_LOG:
         case RENDER_TIMER_BOTH:
             LOG_INFO(
                 "frame rendered in %lds %9ldns "
-                "(%lds %9ldns rendering, %lds %9ldns double buffering)",
+                "(%lds %9ldns wait, %lds %9ldns rendering, %lds %9ldns double buffering) not included: %lds %ldns pre-apply damage",
                 (long)total_render_time.tv_sec,
                 total_render_time.tv_nsec,
+                (long)wait_time.tv_sec,
+                wait_time.tv_nsec,
                 (long)render_time.tv_sec,
                 render_time.tv_nsec,
                 (long)double_buffering_time.tv_sec,
-                double_buffering_time.tv_nsec);
+                double_buffering_time.tv_nsec,
+                (long)preapply_damage.tv_sec,
+                preapply_damage.tv_nsec);
             break;
 
         case RENDER_TIMER_OSD:
@@ -4295,6 +4397,7 @@ delayed_reflow_of_normal_grid(struct terminal *term)
     term->interactive_resizing.old_hide_cursor = false;
 
     /* Invalidate render pointers */
+    wait_for_preapply_damage(term);
     shm_unref(term->render.last_buf);
     term->render.last_buf = NULL;
     term->render.last_cursor.row = NULL;
@@ -4869,6 +4972,7 @@ damage_view:
     tll_free(term->normal.scroll_damage);
     tll_free(term->alt.scroll_damage);
 
+    wait_for_preapply_damage(term);
    shm_unref(term->render.last_buf);
     term->render.last_buf = NULL;
     term_damage_view(term);
@@ -5267,3 +5371,77 @@ render_xcursor_set(struct seat *seat, struct terminal *term,
     seat->pointer.xcursor_pending = true;
     return true;
 }
+
+void
+render_buffer_release_callback(struct buffer *buf, void *data)
+{
+    /*
+     * Called from shm.c when a buffer is released.
+     *
+     * We use it to pre-apply the last frame's damage to the released
+     * buffer, when we're forced to double buffer (the compositor
+     * doesn't release buffers immediately).
+     *
+     * The timeline is thus:
+     * 1. We render and push a new frame
+     * 2. Some (hopefully short) time after that, the compositor releases the previous buffer
+     * 3. We're called, and kick off the thread that copies the changes from (1) to the just freed buffer
+     * 4. Time passes...
+     * 5. The compositor calls our frame callback, signalling to us that it's time to start rendering the next frame
+     * 6. Hopefully, our thread is already done copying the changes; otherwise we stall, waiting for it
+     * 7. We render the frame as if the compositor does immediate releases.
+     *
+     * What's the gain? Reduced latency: by applying the previous
+     * frame's damage as soon as possible, we shorten the time it
+     * takes to render the frame after the frame callback.
+     *
+     * This means the compositor can, in theory, push the frame
+     * callback closer to the vblank deadline, and thus reduce input
+     * latency. Unfortunately, not all compositors adapt like this
+     * (most don't, in fact). But some allow the user to manually
+     * configure the deadline.
+     */
+    struct terminal *term = data;
+
+    if (likely(buf->age != 1))
+        return;
+
+    if (likely(!term->render.preapply_last_frame_damage))
+        return;
+
+    if (term->render.last_buf == NULL)
+        return;
+
+    if (term->render.last_buf->age != 0)
+        return;
+
+    if (buf->width != term->render.last_buf->width)
+        return;
+
+    if (buf->height != term->render.last_buf->height)
+        return;
+
+    xassert(term->render.workers.count > 0);
+    xassert(term->render.last_buf != NULL);
+
+    xassert(term->render.last_buf->age == 0);
+    xassert(term->render.last_buf != buf);
+
+    mtx_lock(&term->render.workers.preapplied_damage.lock);
+    if (term->render.workers.preapplied_damage.buf != NULL) {
+        mtx_unlock(&term->render.workers.preapplied_damage.lock);
+        return;
+    }
+
+    xassert(term->render.workers.preapplied_damage.buf == NULL);
+    term->render.workers.preapplied_damage.buf = buf;
+    term->render.workers.preapplied_damage.start = (struct timespec){0};
+    term->render.workers.preapplied_damage.stop = (struct timespec){0};
+    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+    mtx_lock(&term->render.workers.lock);
+    sem_post(&term->render.workers.start);
+    xassert(tll_length(term->render.workers.queue) == 0);
+    tll_push_back(term->render.workers.queue, -3);
+    mtx_unlock(&term->render.workers.lock);
+}
diff --git a/render.h b/render.h
index 81d2a905..e21eaca8 100644
--- a/render.h
+++ b/render.h
@@ -47,3 +47,5 @@ struct csd_data {
 };
 
 struct csd_data get_csd_data(const struct terminal *term, enum csd_surface surf_idx);
+
+void render_buffer_release_callback(struct buffer *buf, void *data);
diff --git a/shm.c b/shm.c
index 31ea67ed..72b32f16 100644
--- a/shm.c
+++ b/shm.c
@@ -87,6 +87,9 @@ struct buffer_private {
 
     bool with_alpha;
     bool scrollable;
+
+    void (*release_cb)(struct buffer *buf, void *data);
+    void *cb_data;
 };
 
 struct buffer_chain {
@@ -100,6 +103,9 @@ struct buffer_chain {
 
     pixman_format_code_t pixman_fmt_with_alpha;
     enum wl_shm_format shm_format_with_alpha;
+
+    void (*release_cb)(struct buffer *buf, void *data);
+    void *cb_data;
 };
 
 static tll(struct buffer_private *) deferred;
@@ -232,6 +238,10 @@ buffer_release(void *data, struct wl_buffer *wl_buffer)
         xassert(found);
         if (!found)
             LOG_WARN("deferred delete: buffer not on the 'deferred' list");
+    } else {
+        if (buffer->release_cb != NULL) {
+            buffer->release_cb(&buffer->public, buffer->cb_data);
+        }
     }
 }
 
@@ -516,6 +526,8 @@ get_new_buffers(struct buffer_chain *chain, size_t count,
             .offset = 0,
             .size = sizes[i],
             .scrollable = chain->scrollable,
+            .release_cb = chain->release_cb,
+            .cb_data = chain->cb_data,
         };
 
         if (!instantiate_offset(buf, offset)) {
@@ -623,7 +635,7 @@ shm_get_buffer(struct buffer_chain *chain, int width, int height, bool with_alph
                  * reuse. Pick the "youngest" one, and mark the
                  * other one for purging */
                 if (buf->public.age < cached->public.age) {
-                    shm_unref(&cached->public);
+                    //shm_unref(&cached->public);
                     cached = buf;
                 } else {
                     /*
@@ -634,8 +646,8 @@ shm_get_buffer(struct buffer_chain *chain, int width, int height, bool with_alph
                      * should be safe; "our" tll_foreach() already
                      * holds the next pointer.
                      */
-                    if (buffer_unref_no_remove_from_chain(buf))
-                        tll_remove(chain->bufs, it);
+                    //if (buffer_unref_no_remove_from_chain(buf))
+                    //    tll_remove(chain->bufs, it);
                 }
             }
         }
@@ -994,7 +1006,8 @@ shm_unref(struct buffer *_buf)
 
 struct buffer_chain *
 shm_chain_new(struct wayland *wayl, bool scrollable, size_t pix_instances,
-              enum shm_bit_depth desired_bit_depth)
+              enum shm_bit_depth desired_bit_depth,
+              void (*release_cb)(struct buffer *buf, void *data), void *cb_data)
 {
     pixman_format_code_t pixman_fmt_without_alpha = PIXMAN_x8r8g8b8;
     enum wl_shm_format shm_fmt_without_alpha = WL_SHM_FORMAT_XRGB8888;
@@ -1090,6 +1103,9 @@ shm_chain_new(struct wayland *wayl, bool scrollable, size_t pix_instances,
 
         .pixman_fmt_with_alpha = pixman_fmt_with_alpha,
         .shm_format_with_alpha = shm_fmt_with_alpha,
+
+        .release_cb = release_cb,
+        .cb_data = cb_data,
     };
     return chain;
 }
diff --git a/shm.h b/shm.h
index 6050f1c7..84eb4386 100644
--- a/shm.h
+++ b/shm.h
@@ -50,7 +50,8 @@ void shm_set_min_stride_alignment(size_t min_stride_alignment);
 
 struct buffer_chain;
 struct buffer_chain *shm_chain_new(
     struct wayland *wayl, bool scrollable, size_t pix_instances,
-    enum shm_bit_depth desired_bit_depth);
+    enum shm_bit_depth desired_bit_depth,
+    void (*release_cb)(struct buffer *buf, void *data), void *cb_data);
 void shm_chain_free(struct buffer_chain *chain);
 enum shm_bit_depth shm_chain_bit_depth(const struct buffer_chain *chain);
diff --git a/terminal.c b/terminal.c
index 60506d07..36f8513b 100644
--- a/terminal.c
+++ b/terminal.c
@@ -719,6 +719,9 @@ initialize_render_workers(struct terminal *term)
         goto err_sem_destroy;
     }
 
+    mtx_init(&term->render.workers.preapplied_damage.lock, mtx_plain);
+    cnd_init(&term->render.workers.preapplied_damage.cond);
+
     term->render.workers.threads = xcalloc(
         term->render.workers.count, sizeof(term->render.workers.threads[0]));
 
@@ -1356,13 +1359,13 @@ term_init(const struct config *conf, struct fdm *fdm, struct reaper *reaper,
         .render = {
             .chains = {
                 .grid = shm_chain_new(wayl, true, 1 + conf->render_worker_count,
-                                      desired_bit_depth),
-                .search = shm_chain_new(wayl, false, 1 ,desired_bit_depth),
-                .scrollback_indicator = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .render_timer = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .url = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .csd = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .overlay = shm_chain_new(wayl, false, 1, desired_bit_depth),
+                                      desired_bit_depth, &render_buffer_release_callback, term),
+                .search = shm_chain_new(wayl, false, 1 ,desired_bit_depth, NULL, NULL),
+                .scrollback_indicator = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .render_timer = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .url = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .csd = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .overlay = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
             },
             .scrollback_lines = conf->scrollback.lines,
             .app_sync_updates.timer_fd = app_sync_updates_fd,
@@ -1893,6 +1896,8 @@ term_destroy(struct terminal *term)
         }
     }
     free(term->render.workers.threads);
+    mtx_destroy(&term->render.workers.preapplied_damage.lock);
+    cnd_destroy(&term->render.workers.preapplied_damage.cond);
     mtx_destroy(&term->render.workers.lock);
     sem_destroy(&term->render.workers.start);
     sem_destroy(&term->render.workers.done);
diff --git a/terminal.h b/terminal.h
index 88371b07..364d57b3 100644
--- a/terminal.h
+++ b/terminal.h
@@ -706,6 +706,14 @@ struct terminal {
             tll(int) queue;
             thrd_t *threads;
             struct buffer *buf;
+
+            struct {
+                mtx_t lock;
+                cnd_t cond;
+                struct buffer *buf;
+                struct timespec start;
+                struct timespec stop;
+            } preapplied_damage;
         } workers;
 
         /* Last rendered cursor position */
@@ -716,6 +724,8 @@ struct terminal {
         } last_cursor;
 
         struct buffer *last_buf;    /* Buffer we rendered to last time */
+        size_t frames_since_last_immediate_release;
+        bool preapply_last_frame_damage;
 
         enum overlay_style last_overlay_style;
         struct buffer *last_overlay_buf;
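
The following is a minimal, self-contained sketch (not part of the patch) of the release-callback handshake the render.c changes above implement. All names here (fake_buffer, preapply_worker, on_buffer_release, wait_for_preapply) are hypothetical stand-ins, the pixman composite is reduced to a whole-buffer memcpy, and the existing render-worker queue (the -3 token) is replaced by a dedicated thread; it only illustrates how ownership of preapplied_damage.buf is handed between the release callback, the copying thread, and the renderer via a mutex and condition variable.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <threads.h>

/* Stand-in for the real SHM buffers; only what the handshake needs. */
struct fake_buffer {
    unsigned char pixels[64];
    int age;                     /* 0 = up to date, >0 = frames behind */
};

static struct {
    mtx_t lock;
    cnd_t cond;
    struct fake_buffer *pending; /* buffer waiting to have damage pre-applied */
    struct fake_buffer *last;    /* buffer we rendered to last frame */
    bool shutdown;
} preapply;

/* Worker thread: copy the last frame into the just-released buffer. */
static int
preapply_worker(void *arg)
{
    (void)arg;
    mtx_lock(&preapply.lock);
    while (!preapply.shutdown) {
        if (preapply.pending == NULL) {
            cnd_wait(&preapply.cond, &preapply.lock);
            continue;
        }

        /* The real patch drops the lock here and clips the copy to the
         * damaged region; the sketch simply copies everything. */
        memcpy(preapply.pending->pixels, preapply.last->pixels,
               sizeof(preapply.pending->pixels));
        preapply.pending->age = 0;

        preapply.pending = NULL;
        cnd_broadcast(&preapply.cond);   /* wake a renderer waiting below */
    }
    mtx_unlock(&preapply.lock);
    return 0;
}

/* Roughly render_buffer_release_callback(): queue the job on buffer release. */
static void
on_buffer_release(struct fake_buffer *released)
{
    mtx_lock(&preapply.lock);
    if (preapply.last != NULL && preapply.pending == NULL) {
        preapply.pending = released;
        cnd_broadcast(&preapply.cond);
    }
    mtx_unlock(&preapply.lock);
}

/* Roughly wait_for_preapply_damage(): called before rendering the next frame. */
static void
wait_for_preapply(void)
{
    mtx_lock(&preapply.lock);
    while (preapply.pending != NULL)
        cnd_wait(&preapply.cond, &preapply.lock);
    mtx_unlock(&preapply.lock);
}

int
main(void)
{
    static struct fake_buffer a, b;

    mtx_init(&preapply.lock, mtx_plain);
    cnd_init(&preapply.cond);

    memset(a.pixels, 0xff, sizeof(a.pixels)); /* "frame 1" was rendered into a */
    preapply.last = &a;
    b.age = 1;

    thrd_t worker;
    thrd_create(&worker, &preapply_worker, NULL);

    on_buffer_release(&b);  /* compositor hands buffer b back to us */
    wait_for_preapply();    /* next frame: b already matches a */

    printf("b pre-applied: %s\n", b.pixels[0] == 0xff ? "yes" : "no");

    mtx_lock(&preapply.lock);
    preapply.shutdown = true;
    cnd_broadcast(&preapply.cond);
    mtx_unlock(&preapply.lock);
    thrd_join(worker, NULL);

    cnd_destroy(&preapply.cond);
    mtx_destroy(&preapply.lock);
    return 0;
}

Under this scheme the renderer only blocks if the copy is still in flight when the next frame starts, which mirrors the wait_for_preapply_damage() call added at the top of grid_render().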