diff --git a/CHANGELOG.md b/CHANGELOG.md
index a2aaf488..65f0bbab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -68,6 +68,13 @@
 ## Unreleased
 ### Added
+
+* Improved performance and reduced input latency on compositors
+  that do not release SHM buffers immediately ([#2188][2188]).
+
+[2188]: https://codeberg.org/dnkl/foot/issues/2188
+
+
 ### Changed
 
 * SHM buffer sizes are now rounded up to nearest page size, and their
diff --git a/config.c b/config.c
index 459a1de9..06817247 100644
--- a/config.c
+++ b/config.c
@@ -2848,9 +2848,11 @@ parse_section_tweak(struct context *ctx)
 #endif
     }
 
-    else if (streq(key, "min-stride-alignment")) {
+    else if (streq(key, "min-stride-alignment"))
         return value_to_uint32(ctx, 10, &conf->tweak.min_stride_alignment);
-    }
+
+    else if (streq(key, "pre-apply-damage"))
+        return value_to_bool(ctx, &conf->tweak.preapply_damage);
 
     else {
         LOG_CONTEXTUAL_ERR("not a valid option: %s", key);
@@ -3501,6 +3503,7 @@ config_load(struct config *conf, const char *conf_path,
             .sixel = true,
             .surface_bit_depth = SHM_BITS_AUTO,
             .min_stride_alignment = 256,
+            .preapply_damage = true,
         },
 
         .touch = {
diff --git a/config.h b/config.h
index 11439d3a..5b7ff11e 100644
--- a/config.h
+++ b/config.h
@@ -436,6 +436,7 @@ struct config {
         bool sixel;
         enum shm_bit_depth surface_bit_depth;
         uint32_t min_stride_alignment;
+        bool preapply_damage;
     } tweak;
 
     struct {
diff --git a/doc/foot.ini.5.scd b/doc/foot.ini.5.scd
index 56b76be7..7b08d5d4 100644
--- a/doc/foot.ini.5.scd
+++ b/doc/foot.ini.5.scd
@@ -2093,6 +2093,41 @@
 any of these options.
 
 	Default: _auto_
 
+*pre-apply-damage*
+	Boolean. When enabled, foot will attempt to "pre-apply" the damage
+	from the last frame when foot is forced to double-buffer
+	(i.e. when the compositor does not release SHM buffers
+	immediately). The rest of this description assumes the compositor
+	does not release buffers immediately.
+
+	When this option is disabled, each time foot needs to render a
+	frame, it has to first copy over areas that changed in the last
+	frame (i.e. all changes between the last two frames). This is
+	basically a *memcpy*(3), which can be slow if the changed area is
+	large. It is also done on the main thread, which means foot cannot
+	do anything else at the same time; no other rendering, no VT
+	parsing. After the changes have been brought over to the new
+	frame, foot proceeds with rendering the cells that have changed
+	between the last frame and the new frame.
+
+	When this option is enabled, the changes between the last two
+	frames are brought over to what will become the next frame before
+	foot starts rendering the next frame. As soon as the compositor
+	releases the previous buffer (typically right after foot has
+	pushed a new frame), foot kicks off a thread that copies over the
+	changes to the newly released buffer. Since this is done in a
+	thread, foot can continue processing input at the same time.
+	Later, when it is time to render a new frame, the changes have
+	already been transferred, and foot can immediately start with the
+	actual rendering.
+
+	Thus, having this option enabled improves both performance
+	(copying the last two frames' changes is threaded) and input
+	latency (rendering the next frame no longer has to first bring
+	over the changes between the last two frames).
+
+	Default: _yes_
+
 # SEE ALSO
 
 *foot*(1), *footclient*(1)
diff --git a/pgo/pgo.c b/pgo/pgo.c
index 757dcd06..4ff4111c 100644
--- a/pgo/pgo.c
+++ b/pgo/pgo.c
@@ -74,6 +74,8 @@ void render_refresh_icon(struct terminal *term) {}
 
 void render_overlay(struct terminal *term) {}
 
+void render_buffer_release_callback(struct buffer *buf, void *data) {}
+
 bool
 render_xcursor_is_valid(const struct seat *seat, const char *cursor)
 {
@@ -206,7 +208,8 @@ enum shm_bit_depth shm_chain_bit_depth(const struct buffer_chain *chain) { retur
 struct buffer_chain *
 shm_chain_new(
     struct wayland *wayl, bool scrollable, size_t pix_instances,
-    enum shm_bit_depth desired_bit_depth)
+    enum shm_bit_depth desired_bit_depth,
+    void (*release_cb)(struct buffer *buf, void *data), void *cb_data)
 {
     return NULL;
 }
diff --git a/render.c b/render.c
index 1c24bafa..35752125 100644
--- a/render.c
+++ b/render.c
@@ -2224,6 +2224,56 @@ render_worker_thread(void *_ctx)
 
             case -2:
                 return 0;
+
+            case -3: {
+                if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
+                    clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.start);
+
+                mtx_lock(&term->render.workers.preapplied_damage.lock);
+                buf = term->render.workers.preapplied_damage.buf;
+                xassert(buf != NULL);
+
+                if (likely(term->render.last_buf != NULL)) {
+                    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+                    pixman_region32_t dmg;
+                    pixman_region32_init(&dmg);
+
+                    if (buf->age == 0)
+                        ; /* No need to do anything */
+                    else if (buf->age == 1)
+                        pixman_region32_copy(&dmg,
+                                             &term->render.last_buf->dirty[0]);
+                    else
+                        pixman_region32_init_rect(&dmg, 0, 0, buf->width,
+                                                  buf->height);
+
+                    pixman_image_set_clip_region32(buf->pix[my_id], &dmg);
+                    pixman_image_composite32(PIXMAN_OP_SRC,
+                                             term->render.last_buf->pix[my_id],
+                                             NULL, buf->pix[my_id], 0, 0, 0, 0, 0,
+                                             0, buf->width, buf->height);
+
+                    pixman_region32_fini(&dmg);
+
+                    buf->age = 0;
+                    shm_unref(term->render.last_buf);
+                    shm_addref(buf);
+                    term->render.last_buf = buf;
+
+                    mtx_lock(&term->render.workers.preapplied_damage.lock);
+                }
+
+                term->render.workers.preapplied_damage.buf = NULL;
+                cnd_signal(&term->render.workers.preapplied_damage.cond);
+                mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+                if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
+                    clock_gettime(CLOCK_MONOTONIC, &term->render.workers.preapplied_damage.stop);
+
+                frame_done = true;
+                break;
+            }
             }
         }
     };
@@ -2231,6 +2281,22 @@
     return -1;
 }
 
+static void
+wait_for_preapply_damage(struct terminal *term)
+{
+    if (!term->render.preapply_last_frame_damage)
+        return;
+    if (term->render.workers.preapplied_damage.buf == NULL)
+        return;
+
+    mtx_lock(&term->render.workers.preapplied_damage.lock);
+    while (term->render.workers.preapplied_damage.buf != NULL) {
+        cnd_wait(&term->render.workers.preapplied_damage.cond,
+                 &term->render.workers.preapplied_damage.lock);
+    }
+    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+}
+
 struct csd_data
 get_csd_data(const struct terminal *term, enum csd_surface surf_idx)
 {
@@ -3113,14 +3179,6 @@ force_full_repaint(struct terminal *term, struct buffer *buf)
 static void
 reapply_old_damage(struct terminal *term, struct buffer *new, struct buffer *old)
 {
-    static int counter = 0;
-    static bool have_warned = false;
-    if (!have_warned && ++counter > 5) {
-        LOG_WARN("compositor is not releasing buffers immediately; "
-                 "expect lower rendering performance");
-        have_warned = true;
-    }
-
     if (new->age > 1) {
         memcpy(new->data, old->data, new->height * new->stride);
         return;
     }
@@ -3251,7 +3309,18 @@ grid_render(struct terminal *term)
     if (term->shutdown.in_progress)
         return;
 
-    struct timespec start_time, start_double_buffering = {0}, stop_double_buffering = {0};
+    struct timespec start_time;
+    struct timespec start_wait_preapply = {0}, stop_wait_preapply = {0};
+    struct timespec start_double_buffering = {0}, stop_double_buffering = {0};
+
+    /* Might be a thread doing pre-applied damage */
+    if (unlikely(term->render.preapply_last_frame_damage &&
+                 term->render.workers.preapplied_damage.buf != NULL))
+    {
+        clock_gettime(CLOCK_MONOTONIC, &start_wait_preapply);
+        wait_for_preapply_damage(term);
+        clock_gettime(CLOCK_MONOTONIC, &stop_wait_preapply);
+    }
 
     if (term->conf->tweak.render_timer != RENDER_TIMER_NONE)
         clock_gettime(CLOCK_MONOTONIC, &start_time);
@@ -3269,6 +3338,8 @@ grid_render(struct terminal *term)
     dirty_old_cursor(term);
     dirty_cursor(term);
 
+    LOG_DBG("buffer age: %u (%p)", buf->age, (void *)buf);
+
     if (term->render.last_buf == NULL ||
         term->render.last_buf->width != buf->width ||
         term->render.last_buf->height != buf->height ||
@@ -3285,9 +3356,27 @@ grid_render(struct terminal *term)
         xassert(term->render.last_buf->width == buf->width);
         xassert(term->render.last_buf->height == buf->height);
 
+        if (++term->render.frames_since_last_immediate_release > 10) {
+            static bool have_warned = false;
+
+            if (!term->render.preapply_last_frame_damage &&
+                term->conf->tweak.preapply_damage &&
+                term->render.workers.count > 0)
+            {
+                LOG_INFO("enabling pre-applied frame damage");
+                term->render.preapply_last_frame_damage = true;
+            } else if (!have_warned) {
+                LOG_WARN("compositor is not releasing buffers immediately; "
+                         "expect lower rendering performance");
+                have_warned = true;
+            }
+        }
+
         clock_gettime(CLOCK_MONOTONIC, &start_double_buffering);
         reapply_old_damage(term, buf, term->render.last_buf);
         clock_gettime(CLOCK_MONOTONIC, &stop_double_buffering);
+    } else if (!term->render.preapply_last_frame_damage) {
+        term->render.frames_since_last_immediate_release = 0;
     }
 
     if (term->render.last_buf != NULL) {
@@ -3515,27 +3604,40 @@ grid_render(struct terminal *term)
         struct timespec end_time;
         clock_gettime(CLOCK_MONOTONIC, &end_time);
 
+        struct timespec wait_time;
+        timespec_sub(&stop_wait_preapply, &start_wait_preapply, &wait_time);
+
         struct timespec render_time;
         timespec_sub(&end_time, &start_time, &render_time);
 
         struct timespec double_buffering_time;
         timespec_sub(&stop_double_buffering, &start_double_buffering, &double_buffering_time);
 
+        struct timespec preapply_damage;
+        timespec_sub(&term->render.workers.preapplied_damage.stop,
+                     &term->render.workers.preapplied_damage.start,
+                     &preapply_damage);
+
         struct timespec total_render_time;
         timespec_add(&render_time, &double_buffering_time, &total_render_time);
+        timespec_add(&wait_time, &total_render_time, &total_render_time);
 
         switch (term->conf->tweak.render_timer) {
         case RENDER_TIMER_LOG:
         case RENDER_TIMER_BOTH:
             LOG_INFO(
                 "frame rendered in %lds %9ldns "
-                "(%lds %9ldns rendering, %lds %9ldns double buffering)",
+                "(%lds %9ldns wait, %lds %9ldns rendering, %lds %9ldns double buffering) not included: %lds %ldns pre-apply damage",
                 (long)total_render_time.tv_sec,
                 total_render_time.tv_nsec,
+                (long)wait_time.tv_sec,
+                wait_time.tv_nsec,
                 (long)render_time.tv_sec,
                 render_time.tv_nsec,
                 (long)double_buffering_time.tv_sec,
-                double_buffering_time.tv_nsec);
+                double_buffering_time.tv_nsec,
+                (long)preapply_damage.tv_sec,
+                preapply_damage.tv_nsec);
             break;
 
         case RENDER_TIMER_OSD:
@@ -4295,6 +4397,7 @@ delayed_reflow_of_normal_grid(struct terminal *term)
     term->interactive_resizing.old_hide_cursor = false;
 
     /* Invalidate render pointers */
+    wait_for_preapply_damage(term);
     shm_unref(term->render.last_buf);
     term->render.last_buf = NULL;
     term->render.last_cursor.row = NULL;
@@ -4869,6 +4972,7 @@ damage_view:
     tll_free(term->normal.scroll_damage);
     tll_free(term->alt.scroll_damage);
 
+    wait_for_preapply_damage(term);
    shm_unref(term->render.last_buf);
     term->render.last_buf = NULL;
     term_damage_view(term);
@@ -5267,3 +5371,77 @@ render_xcursor_set(struct seat *seat, struct terminal *term,
     seat->pointer.xcursor_pending = true;
     return true;
 }
+
+void
+render_buffer_release_callback(struct buffer *buf, void *data)
+{
+    /*
+     * Called from shm.c when a buffer is released.
+     *
+     * We use it to pre-apply the last frame's damage to the released
+     * buffer, when we're forced to double buffer (the compositor
+     * doesn't release buffers immediately).
+     *
+     * The timeline is thus:
+     * 1. We render and push a new frame
+     * 2. Some (hopefully short) time after that, the compositor releases the previous buffer
+     * 3. We're called, and kick off the thread that copies the changes from (1) to the just freed buffer
+     * 4. Time passes...
+     * 5. The compositor calls our frame callback, signalling to us that it's time to start rendering the next frame
+     * 6. Hopefully, our thread is already done copying the changes; otherwise we stall, waiting for it
+     * 7. We render the frame as if the compositor does immediate releases.
+     *
+     * What's the gain? Reduced latency: by applying the previous
+     * frame's damage as soon as possible, we shorten the time it
+     * takes to render the frame after the frame callback.
+     *
+     * This means the compositor can, in theory, push the frame
+     * callback closer to the vblank deadline, and thus reduce input
+     * latency. Unfortunately, not all compositors adapt like this
+     * (most don't, in fact). But some allow the user to manually
+     * configure the deadline.
+     */
+    struct terminal *term = data;
+
+    if (likely(buf->age != 1))
+        return;
+
+    if (likely(!term->render.preapply_last_frame_damage))
+        return;
+
+    if (term->render.last_buf == NULL)
+        return;
+
+    if (term->render.last_buf->age != 0)
+        return;
+
+    if (buf->width != term->render.last_buf->width)
+        return;
+
+    if (buf->height != term->render.last_buf->height)
+        return;
+
+    xassert(term->render.workers.count > 0);
+    xassert(term->render.last_buf != NULL);
+
+    xassert(term->render.last_buf->age == 0);
+    xassert(term->render.last_buf != buf);
+
+    mtx_lock(&term->render.workers.preapplied_damage.lock);
+    if (term->render.workers.preapplied_damage.buf != NULL) {
+        mtx_unlock(&term->render.workers.preapplied_damage.lock);
+        return;
+    }
+
+    xassert(term->render.workers.preapplied_damage.buf == NULL);
+    term->render.workers.preapplied_damage.buf = buf;
+    term->render.workers.preapplied_damage.start = (struct timespec){0};
+    term->render.workers.preapplied_damage.stop = (struct timespec){0};
+    mtx_unlock(&term->render.workers.preapplied_damage.lock);
+
+    mtx_lock(&term->render.workers.lock);
+    sem_post(&term->render.workers.start);
+    xassert(tll_length(term->render.workers.queue) == 0);
+    tll_push_back(term->render.workers.queue, -3);
+    mtx_unlock(&term->render.workers.lock);
+}
diff --git a/render.h b/render.h
index 81d2a905..e21eaca8 100644
--- a/render.h
+++ b/render.h
@@ -47,3 +47,5 @@ struct csd_data {
 };
 
 struct csd_data get_csd_data(const struct terminal *term, enum csd_surface surf_idx);
+
+void render_buffer_release_callback(struct buffer *buf, void *data);
diff --git a/shm.c b/shm.c
index 31ea67ed..72b32f16 100644
--- a/shm.c
+++ b/shm.c
@@ -87,6 +87,9 @@ struct buffer_private {
 
     bool with_alpha;
     bool scrollable;
+
+    void (*release_cb)(struct buffer *buf, void *data);
+    void *cb_data;
 };
 
 struct buffer_chain {
@@ -100,6 +103,9 @@ struct buffer_chain {
 
     pixman_format_code_t pixman_fmt_with_alpha;
     enum wl_shm_format shm_format_with_alpha;
+
+    void (*release_cb)(struct buffer *buf, void *data);
+    void *cb_data;
 };
 
 static tll(struct buffer_private *) deferred;
@@ -232,6 +238,10 @@ buffer_release(void *data, struct wl_buffer *wl_buffer)
         xassert(found);
         if (!found)
             LOG_WARN("deferred delete: buffer not on the 'deferred' list");
+    } else {
+        if (buffer->release_cb != NULL) {
+            buffer->release_cb(&buffer->public, buffer->cb_data);
+        }
     }
 }
 
@@ -516,6 +526,8 @@ get_new_buffers(struct buffer_chain *chain, size_t count,
             .offset = 0,
             .size = sizes[i],
             .scrollable = chain->scrollable,
+            .release_cb = chain->release_cb,
+            .cb_data = chain->cb_data,
         };
 
         if (!instantiate_offset(buf, offset)) {
@@ -623,7 +635,7 @@ shm_get_buffer(struct buffer_chain *chain, int width, int height, bool with_alph
                  * reuse. Pick the "youngest" one, and mark the
                  * other one for purging */
                 if (buf->public.age < cached->public.age) {
-                    shm_unref(&cached->public);
+                    //shm_unref(&cached->public);
                     cached = buf;
                 } else {
                     /*
@@ -634,8 +646,8 @@ shm_get_buffer(struct buffer_chain *chain, int width, int height, bool with_alph
                      * should be safe; "our" tll_foreach() already
                      * holds the next pointer.
                      */
-                    if (buffer_unref_no_remove_from_chain(buf))
-                        tll_remove(chain->bufs, it);
+                    //if (buffer_unref_no_remove_from_chain(buf))
+                    //    tll_remove(chain->bufs, it);
                 }
             }
         }
@@ -994,7 +1006,8 @@ shm_unref(struct buffer *_buf)
 
 struct buffer_chain *
 shm_chain_new(struct wayland *wayl, bool scrollable, size_t pix_instances,
-              enum shm_bit_depth desired_bit_depth)
+              enum shm_bit_depth desired_bit_depth,
+              void (*release_cb)(struct buffer *buf, void *data), void *cb_data)
 {
     pixman_format_code_t pixman_fmt_without_alpha = PIXMAN_x8r8g8b8;
     enum wl_shm_format shm_fmt_without_alpha = WL_SHM_FORMAT_XRGB8888;
@@ -1090,6 +1103,9 @@ shm_chain_new(struct wayland *wayl, bool scrollable, size_t pix_instances,
 
         .pixman_fmt_with_alpha = pixman_fmt_with_alpha,
         .shm_format_with_alpha = shm_fmt_with_alpha,
+
+        .release_cb = release_cb,
+        .cb_data = cb_data,
     };
     return chain;
 }
diff --git a/shm.h b/shm.h
index 6050f1c7..84eb4386 100644
--- a/shm.h
+++ b/shm.h
@@ -50,7 +50,8 @@ void shm_set_min_stride_alignment(size_t min_stride_alignment);
 
 struct buffer_chain;
 struct buffer_chain *shm_chain_new(
     struct wayland *wayl, bool scrollable, size_t pix_instances,
-    enum shm_bit_depth desired_bit_depth);
+    enum shm_bit_depth desired_bit_depth,
+    void (*release_cb)(struct buffer *buf, void *data), void *cb_data);
 void shm_chain_free(struct buffer_chain *chain);
 enum shm_bit_depth shm_chain_bit_depth(const struct buffer_chain *chain);
diff --git a/terminal.c b/terminal.c
index 60506d07..36f8513b 100644
--- a/terminal.c
+++ b/terminal.c
@@ -719,6 +719,9 @@ initialize_render_workers(struct terminal *term)
         goto err_sem_destroy;
     }
 
+    mtx_init(&term->render.workers.preapplied_damage.lock, mtx_plain);
+    cnd_init(&term->render.workers.preapplied_damage.cond);
+
     term->render.workers.threads = xcalloc(
         term->render.workers.count, sizeof(term->render.workers.threads[0]));
 
@@ -1356,13 +1359,13 @@ term_init(const struct config *conf, struct fdm *fdm, struct reaper *reaper,
         .render = {
             .chains = {
                 .grid = shm_chain_new(wayl, true, 1 + conf->render_worker_count,
-                                      desired_bit_depth),
-                .search = shm_chain_new(wayl, false, 1 ,desired_bit_depth),
-                .scrollback_indicator = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .render_timer = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .url = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .csd = shm_chain_new(wayl, false, 1, desired_bit_depth),
-                .overlay = shm_chain_new(wayl, false, 1, desired_bit_depth),
+                                      desired_bit_depth, &render_buffer_release_callback, term),
+                .search = shm_chain_new(wayl, false, 1 ,desired_bit_depth, NULL, NULL),
+                .scrollback_indicator = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .render_timer = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .url = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .csd = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
+                .overlay = shm_chain_new(wayl, false, 1, desired_bit_depth, NULL, NULL),
             },
             .scrollback_lines = conf->scrollback.lines,
             .app_sync_updates.timer_fd = app_sync_updates_fd,
@@ -1893,6 +1896,8 @@ term_destroy(struct terminal *term)
         }
     }
     free(term->render.workers.threads);
+    mtx_destroy(&term->render.workers.preapplied_damage.lock);
+    cnd_destroy(&term->render.workers.preapplied_damage.cond);
     mtx_destroy(&term->render.workers.lock);
     sem_destroy(&term->render.workers.start);
     sem_destroy(&term->render.workers.done);
diff --git a/terminal.h b/terminal.h
index 88371b07..364d57b3 100644
--- a/terminal.h
+++ b/terminal.h
@@ -706,6 +706,14 @@ struct terminal {
             tll(int) queue;
             thrd_t *threads;
             struct buffer *buf;
+
+            struct {
+                mtx_t lock;
+                cnd_t cond;
+                struct buffer *buf;
+                struct timespec start;
+                struct timespec stop;
+            } preapplied_damage;
         } workers;
 
         /* Last rendered cursor position */
@@ -716,6 +724,8 @@ struct terminal {
         } last_cursor;
 
         struct buffer *last_buf;    /* Buffer we rendered to last time */
+        size_t frames_since_last_immediate_release;
+        bool preapply_last_frame_damage;
 
         enum overlay_style last_overlay_style;
         struct buffer *last_overlay_buf;
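
The following is a minimal, self-contained sketch (not part of the patch) of the release-callback handshake the render.c changes above implement. All names here (fake_buffer, preapply_worker, on_buffer_release, wait_for_preapply) are hypothetical stand-ins, the pixman composite is reduced to a whole-buffer memcpy, and the existing render-worker queue (the -3 token) is replaced by a dedicated thread; it only illustrates how ownership of preapplied_damage.buf is handed between the release callback, the copying thread, and the renderer via a mutex and condition variable.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <threads.h>

/* Stand-in for the real SHM buffers; only what the handshake needs. */
struct fake_buffer {
    unsigned char pixels[64];
    int age;                     /* 0 = up to date, >0 = frames behind */
};

static struct {
    mtx_t lock;
    cnd_t cond;
    struct fake_buffer *pending; /* buffer waiting to have damage pre-applied */
    struct fake_buffer *last;    /* buffer we rendered to last frame */
    bool shutdown;
} preapply;

/* Worker thread: copy the last frame into the just-released buffer. */
static int
preapply_worker(void *arg)
{
    (void)arg;
    mtx_lock(&preapply.lock);
    while (!preapply.shutdown) {
        if (preapply.pending == NULL) {
            cnd_wait(&preapply.cond, &preapply.lock);
            continue;
        }

        /* The real patch drops the lock here and clips the copy to the
         * damaged region; the sketch simply copies everything. */
        memcpy(preapply.pending->pixels, preapply.last->pixels,
               sizeof(preapply.pending->pixels));
        preapply.pending->age = 0;

        preapply.pending = NULL;
        cnd_broadcast(&preapply.cond);   /* wake a renderer waiting below */
    }
    mtx_unlock(&preapply.lock);
    return 0;
}

/* Roughly render_buffer_release_callback(): queue the job on buffer release. */
static void
on_buffer_release(struct fake_buffer *released)
{
    mtx_lock(&preapply.lock);
    if (preapply.last != NULL && preapply.pending == NULL) {
        preapply.pending = released;
        cnd_broadcast(&preapply.cond);
    }
    mtx_unlock(&preapply.lock);
}

/* Roughly wait_for_preapply_damage(): called before rendering the next frame. */
static void
wait_for_preapply(void)
{
    mtx_lock(&preapply.lock);
    while (preapply.pending != NULL)
        cnd_wait(&preapply.cond, &preapply.lock);
    mtx_unlock(&preapply.lock);
}

int
main(void)
{
    static struct fake_buffer a, b;

    mtx_init(&preapply.lock, mtx_plain);
    cnd_init(&preapply.cond);

    memset(a.pixels, 0xff, sizeof(a.pixels)); /* "frame 1" was rendered into a */
    preapply.last = &a;
    b.age = 1;

    thrd_t worker;
    thrd_create(&worker, &preapply_worker, NULL);

    on_buffer_release(&b);  /* compositor hands buffer b back to us */
    wait_for_preapply();    /* next frame: b already matches a */

    printf("b pre-applied: %s\n", b.pixels[0] == 0xff ? "yes" : "no");

    mtx_lock(&preapply.lock);
    preapply.shutdown = true;
    cnd_broadcast(&preapply.cond);
    mtx_unlock(&preapply.lock);
    thrd_join(worker, NULL);

    cnd_destroy(&preapply.cond);
    mtx_destroy(&preapply.lock);
    return 0;
}

Under this scheme the renderer only blocks if the copy is still in flight when the next frame starts, which mirrors the wait_for_preapply_damage() call added at the top of grid_render().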