vt: utf8: don't discard current byte when an invalid UTF-8 sequence is detected

Example:

  printf "pok\xe9mon\n"

would result in 'pokon': E9 is the lead byte of a three-byte UTF-8
sequence, 'm' is not a valid continuation byte, and so the 'm' was
discarded along with E9.

While this is correct in some sense, it is perhaps not intuitive.

This patch changes the VT parser to instead discard everything up to,
but not including, the invalid byte, and then retry that byte from the
ground state. This way, when an invalid UTF-8 sequence is followed by
either plain ASCII or a longer (and valid) UTF-8 sequence, the
following bytes are printed as expected instead of being discarded.
This commit is contained in:
Daniel Eklöf 2025-03-18 14:37:28 +01:00
parent 6813b321f5
commit 878e07da59
No known key found for this signature in database
GPG key ID: 5BBD4992C116573F
2 changed files with 10 additions and 6 deletions

View file

@ -62,6 +62,10 @@
## Unreleased ## Unreleased
### Added ### Added
### Changed ### Changed
* UTF-8 error recovery now discards fewer bytes.
### Deprecated ### Deprecated
### Removed ### Removed
### Fixed ### Fixed

12
vt.c
View file

@ -1041,7 +1041,7 @@ state_utf8_21_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_22(term, data); return STATE_GROUND; case 0x80 ... 0xbf: action_utf8_22(term, data); return STATE_GROUND;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }
@ -1051,7 +1051,7 @@ state_utf8_31_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_32(term, data); return STATE_UTF8_32; case 0x80 ... 0xbf: action_utf8_32(term, data); return STATE_UTF8_32;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }
@ -1061,7 +1061,7 @@ state_utf8_32_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_33(term, data); return STATE_GROUND; case 0x80 ... 0xbf: action_utf8_33(term, data); return STATE_GROUND;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }
@ -1071,7 +1071,7 @@ state_utf8_41_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_42(term, data); return STATE_UTF8_42; case 0x80 ... 0xbf: action_utf8_42(term, data); return STATE_UTF8_42;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }
@ -1081,7 +1081,7 @@ state_utf8_42_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_43(term, data); return STATE_UTF8_43; case 0x80 ... 0xbf: action_utf8_43(term, data); return STATE_UTF8_43;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }
@ -1091,7 +1091,7 @@ state_utf8_43_switch(struct terminal *term, uint8_t data)
switch (data) { switch (data) {
/* exit current enter new state */ /* exit current enter new state */
case 0x80 ... 0xbf: action_utf8_44(term, data); return STATE_GROUND; case 0x80 ... 0xbf: action_utf8_44(term, data); return STATE_GROUND;
default: return STATE_GROUND; default: return state_ground_switch(term, data);
} }
} }