From 878e07da59855d62e89eba5f5f479a5ee598998e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Tue, 18 Mar 2025 14:37:28 +0100 Subject: [PATCH] vt: utf8: don't discard current byte when an invalid UTF-8 sequence is detected Example: printf "pok\xe9mon\n" would result in 'pokon' - the 'm' has been discarded along with E9. While correct, in some sense, it's perhaps not intuitive. This patch changes the VT parser to instead discard everything up to the invalid byte, but then try the invalid byte from the ground state. This way, plain ASCII or longer (and valid) UTF-8 sequences that follow an invalid UTF-8 sequence are printed as expected instead of being discarded. --- CHANGELOG.md | 4 ++++ vt.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 30f4dc75..ac022c4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,10 @@ ## Unreleased ### Added ### Changed + +* UTF-8 error recovery now discards fewer bytes. + + ### Deprecated ### Removed ### Fixed diff --git a/vt.c b/vt.c index 9c758c55..173b59a6 100644 --- a/vt.c +++ b/vt.c @@ -1041,7 +1041,7 @@ state_utf8_21_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ... 0xbf: action_utf8_22(term, data); return STATE_GROUND; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } } @@ -1051,7 +1051,7 @@ state_utf8_31_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ... 0xbf: action_utf8_32(term, data); return STATE_UTF8_32; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } } @@ -1061,7 +1061,7 @@ state_utf8_32_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ...
0xbf: action_utf8_33(term, data); return STATE_GROUND; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } } @@ -1071,7 +1071,7 @@ state_utf8_41_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ... 0xbf: action_utf8_42(term, data); return STATE_UTF8_42; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } } @@ -1081,7 +1081,7 @@ state_utf8_42_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ... 0xbf: action_utf8_43(term, data); return STATE_UTF8_43; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } } @@ -1091,7 +1091,7 @@ state_utf8_43_switch(struct terminal *term, uint8_t data) switch (data) { /* exit current enter new state */ case 0x80 ... 0xbf: action_utf8_44(term, data); return STATE_GROUND; - default: return STATE_GROUND; + default: return state_ground_switch(term, data); } }