render: when double-buffering, pre-apply previous frame's damage early

Foot likes it when the compositor releases buffers immediately, as that
means we only have to re-render the cells that have changed since the
last frame.

For various reasons, not all compositors do this. In this case, foot
is typically forced to switch between two buffers, i.e. double-buffer.

In this case, each frame starts with copying over the damage from the
previous frame, to the new frame. Then we start rendering the updated
cells.

Bringing over the previous frame's damage can be slow, if the changed
area was large (e.g. when scrolling one or a few lines, or on full
screen updates). It's also done single-threaded. Thus it not only
slows down frame rendering, but pauses everything else (i.e. input
processing). All in all, it reduces performance and increases input
latency.

But we don't have to wait until it's time to render a frame to copy
over the previous frame's damage. We can do that as soon as the
compositor has released the buffer (for the frame _before_ the
previous frame). And we can do this in a thread.

This frees up foot to continue processing input, and reduces frame
rendering time since we can now start rendering the modified cells
immediately, without first doing a large memcpy(3).

In worst case scenarios (or perhaps we should consider them best case
scenarios...), I've seen up to a 10x performance increase in frame
rendering times (this obviously does *not* include the time it takes
to copy over the previous frame's damage, since that affects neither
input processing nor frame rendering).

Implemented by adding a callback mechanism to the shm abstraction
layer. Use it for the grid buffers, and kick off a thread that copies
the previous frame's damage, and resets the buffer's age to 0 (so that
foot understands it can start rendering to it immediately when it later
needs to render a frame).

Since we have no certain way of knowing if a compositor releases buffers
immediately or not, use a bit of heuristics; if we see 10 consecutive
non-immediate releases (that is, we reset the counter as soon as we do
see an immediate release), this new "pre-apply damage" logic is
enabled. It can be force-disabled with tweak.pre-apply-damage=no.

We also need to take care to wait for the thread before resetting the
render's "last_buf" pointer (or we'll SEGFAULT in the thread...).

We must also ensure we wait for the thread to finish before we start
rendering a new frame. Under normal circumstances, the wait time is
always 0, the thread has almost always finished long before we need to
render the next frame. But it _can_ happen.

Closes #2188
This commit is contained in:
Daniel Eklöf 2025-10-05 10:48:36 +02:00
parent bb314425ef
commit 299186a654
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
11 changed files with 287 additions and 26 deletions

200
render.c
View file

@ -2224,6 +2224,56 @@ render_worker_thread(void *_ctx)
case -2:
return 0;
case -3: {
if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.start);
mtx_lock(&term->render.workers.preapplied_damage.lock);
buf = term->render.workers.preapplied_damage.buf;
xassert(buf != NULL);
if (likely(term->render.last_buf != NULL)) {
mtx_unlock(&term->render.workers.preapplied_damage.lock);
pixman_region32_t dmg;
pixman_region32_init(&dmg);
if (buf->age == 0)
; /* No need to do anything */
else if (buf->age == 1)
pixman_region32_copy(&dmg,
&term->render.last_buf->dirty[0]);
else
pixman_region32_init_rect(&dmg, 0, 0, buf->width,
buf->height);
pixman_image_set_clip_region32(buf->pix[my_id], &dmg);
pixman_image_composite32(PIXMAN_OP_SRC,
term->render.last_buf->pix[my_id],
NULL, buf->pix[my_id], 0, 0, 0, 0, 0,
0, buf->width, buf->height);
pixman_region32_fini(&dmg);
buf->age = 0;
shm_unref(term->render.last_buf);
shm_addref(buf);
term->render.last_buf = buf;
mtx_lock(&term->render.workers.preapplied_damage.lock);
}
term->render.workers.preapplied_damage.buf = NULL;
cnd_signal(&term->render.workers.preapplied_damage.cond);
mtx_unlock(&term->render.workers.preapplied_damage.lock);
if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.stop);
frame_done = true;
break;
}
}
}
};
@ -2231,6 +2281,22 @@ render_worker_thread(void *_ctx)
return -1;
}
/*
 * Blocks until the worker thread has finished any in-flight "pre-apply
 * damage" job, i.e. until preapplied_damage.buf has been reset to NULL.
 *
 * Returns immediately when pre-applied damage has not been enabled.
 *
 * The "is a job in flight?" test is intentionally done with the lock
 * held. The worker updates term->render.last_buf and the buffer's age
 * *before* it clears preapplied_damage.buf under the lock; peeking at
 * the pointer without the lock is a data race, and gives no guarantee
 * that those updates are visible to us when we see NULL. Callers
 * typically unref/reset term->render.last_buf right after this returns,
 * so they must observe the worker's final value.
 */
static void
wait_for_preapply_damage(struct terminal *term)
{
    if (!term->render.preapply_last_frame_damage)
        return;

    mtx_lock(&term->render.workers.preapplied_damage.lock);

    /* Loop to cope with spurious wakeups */
    while (term->render.workers.preapplied_damage.buf != NULL) {
        cnd_wait(&term->render.workers.preapplied_damage.cond,
                 &term->render.workers.preapplied_damage.lock);
    }

    mtx_unlock(&term->render.workers.preapplied_damage.lock);
}
struct csd_data
get_csd_data(const struct terminal *term, enum csd_surface surf_idx)
{
@ -3113,14 +3179,6 @@ force_full_repaint(struct terminal *term, struct buffer *buf)
static void
reapply_old_damage(struct terminal *term, struct buffer *new, struct buffer *old)
{
static int counter = 0;
static bool have_warned = false;
if (!have_warned && ++counter > 5) {
LOG_WARN("compositor is not releasing buffers immediately; "
"expect lower rendering performance");
have_warned = true;
}
if (new->age > 1) {
memcpy(new->data, old->data, new->height * new->stride);
return;
@ -3251,7 +3309,18 @@ grid_render(struct terminal *term)
if (term->shutdown.in_progress)
return;
struct timespec start_time, start_double_buffering = {0}, stop_double_buffering = {0};
struct timespec start_time;
struct timespec start_wait_preapply = {0}, stop_wait_preapply = {0};
struct timespec start_double_buffering = {0}, stop_double_buffering = {0};
/* Might be a thread doing pre-applied damage */
if (unlikely(term->render.preapply_last_frame_damage &&
term->render.workers.preapplied_damage.buf != NULL))
{
clock_gettime(CLOCK_MONOTONIC, &start_wait_preapply);
wait_for_preapply_damage(term);
clock_gettime(CLOCK_MONOTONIC, &stop_wait_preapply);
}
if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
clock_gettime(CLOCK_MONOTONIC, &start_time);
@ -3269,6 +3338,8 @@ grid_render(struct terminal *term)
dirty_old_cursor(term);
dirty_cursor(term);
LOG_DBG("buffer age: %u (%p)", buf->age, (void *)buf);
if (term->render.last_buf == NULL ||
term->render.last_buf->width != buf->width ||
term->render.last_buf->height != buf->height ||
@ -3285,9 +3356,27 @@ grid_render(struct terminal *term)
xassert(term->render.last_buf->width == buf->width);
xassert(term->render.last_buf->height == buf->height);
if (++term->render.frames_since_last_immediate_release > 10) {
static bool have_warned = false;
if (!term->render.preapply_last_frame_damage &&
term->conf->tweak.preapply_damage &&
term->render.workers.count > 0)
{
LOG_INFO("enabling pre-applied frame damage");
term->render.preapply_last_frame_damage = true;
} else if (!have_warned) {
LOG_WARN("compositor is not releasing buffers immediately; "
"expect lower rendering performance");
have_warned = true;
}
}
clock_gettime(CLOCK_MONOTONIC, &start_double_buffering);
reapply_old_damage(term, buf, term->render.last_buf);
clock_gettime(CLOCK_MONOTONIC, &stop_double_buffering);
} else if (!term->render.preapply_last_frame_damage) {
term->render.frames_since_last_immediate_release = 0;
}
if (term->render.last_buf != NULL) {
@ -3515,27 +3604,40 @@ grid_render(struct terminal *term)
struct timespec end_time;
clock_gettime(CLOCK_MONOTONIC, &end_time);
struct timespec wait_time;
timespec_sub(&stop_wait_preapply, &start_wait_preapply, &wait_time);
struct timespec render_time;
timespec_sub(&end_time, &start_time, &render_time);
struct timespec double_buffering_time;
timespec_sub(&stop_double_buffering, &start_double_buffering, &double_buffering_time);
struct timespec preapply_damage;
timespec_sub(&term->render.workers.preapplied_damage.stop,
&term->render.workers.preapplied_damage.start,
&preapply_damage);
struct timespec total_render_time;
timespec_add(&render_time, &double_buffering_time, &total_render_time);
timespec_add(&wait_time, &total_render_time, &total_render_time);
switch (term->conf->tweak.render_timer) {
case RENDER_TIMER_LOG:
case RENDER_TIMER_BOTH:
LOG_INFO(
"frame rendered in %lds %9ldns "
"(%lds %9ldns rendering, %lds %9ldns double buffering)",
"(%lds %9ldns wait, %lds %9ldns rendering, %lds %9ldns double buffering) not included: %lds %ldns pre-apply damage",
(long)total_render_time.tv_sec,
total_render_time.tv_nsec,
(long)wait_time.tv_sec,
wait_time.tv_nsec,
(long)render_time.tv_sec,
render_time.tv_nsec,
(long)double_buffering_time.tv_sec,
double_buffering_time.tv_nsec);
double_buffering_time.tv_nsec,
(long)preapply_damage.tv_sec,
preapply_damage.tv_nsec);
break;
case RENDER_TIMER_OSD:
@ -4295,6 +4397,7 @@ delayed_reflow_of_normal_grid(struct terminal *term)
term->interactive_resizing.old_hide_cursor = false;
/* Invalidate render pointers */
wait_for_preapply_damage(term);
shm_unref(term->render.last_buf);
term->render.last_buf = NULL;
term->render.last_cursor.row = NULL;
@ -4869,6 +4972,7 @@ damage_view:
tll_free(term->normal.scroll_damage);
tll_free(term->alt.scroll_damage);
wait_for_preapply_damage(term);
shm_unref(term->render.last_buf);
term->render.last_buf = NULL;
term_damage_view(term);
@ -5267,3 +5371,77 @@ render_xcursor_set(struct seat *seat, struct terminal *term,
seat->pointer.xcursor_pending = true;
return true;
}
void
render_buffer_release_callback(struct buffer *buf, void *data)
{
/*
* Called from shm.c when a buffer is released
*
* We use it to pre-apply last-frame's damage to it, when we're
* forced to double buffer (compositor doesn't release buffers
* immediately).
*
* The timeline is thus:
* 1. We render and push a new frame
* 2. Some (hopefully short) time after that, the compositor releases the previous buffer
* 3. We're called, and kick off the thread that copies the changes from (1) to the just freed buffer
* 4. Time passes....
* 5. The compositor calls our frame callback, signalling to us that it's time to start rendering the next frame
* 6. Hopefully, our thread is already done with copying the changes, otherwise we stall, waiting for it
* 7. We render the frame as if the compositor does immediate releases.
*
* What's the gain? Reduced latency, by applying the previous
* frame's damage as soon as possible, we shorten the time it
* takes to render the frame after the frame callback.
*
* This means the compositor can, in theory, push the frame
* callback closer to the vblank deadline, and thus reduce input
* latency. Not all compositors (most, in fact?) don't adapt like
* this, unfortunately. But some allows the user to manually
* configure the deadline.
*/
struct terminal *term = data;
if (likely(buf->age != 1))
return;
if (likely(!term->render.preapply_last_frame_damage))
return;
if (term->render.last_buf == NULL)
return;
if (term->render.last_buf->age != 0)
return;
if (buf->width != term->render.last_buf->width)
return;
if (buf->height != term->render.last_buf->height)
return;
xassert(term->render.workers.count > 0);
xassert(term->render.last_buf != NULL);
xassert(term->render.last_buf->age == 0);
xassert(term->render.last_buf != buf);
mtx_lock(&term->render.workers.preapplied_damage.lock);
if (term->render.workers.preapplied_damage.buf != NULL) {
mtx_unlock(&term->render.workers.preapplied_damage.lock);
return;
}
xassert(term->render.workers.preapplied_damage.buf == NULL);
term->render.workers.preapplied_damage.buf = buf;
term->render.workers.preapplied_damage.start = (struct timespec){0};
term->render.workers.preapplied_damage.stop = (struct timespec){0};
mtx_unlock(&term->render.workers.preapplied_damage.lock);
mtx_lock(&term->render.workers.lock);
sem_post(&term->render.workers.start);
xassert(tll_length(term->render.workers.queue) == 0);
tll_push_back(term->render.workers.queue, -3);
mtx_unlock(&term->render.workers.lock);
}