From 085c60a33423bf46f64f225e438f62ad12fff8c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= Date: Wed, 26 Jun 2024 18:30:17 +0200 Subject: [PATCH] scripts: generate-emoji-variation-sequences: don't assume single codepoint sequences Right now (Unicode 15.1), all valid variation sequences consist of a single Unicode codepoint (followed by either VS-15 or VS-16). Don't assume this is the case. We don't actually handle longer sequences. But now we at least catch such cases and error out. --- scripts/generate-emoji-variation-sequences.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/generate-emoji-variation-sequences.py b/scripts/generate-emoji-variation-sequences.py index 2c71594c..e05b6290 100644 --- a/scripts/generate-emoji-variation-sequences.py +++ b/scripts/generate-emoji-variation-sequences.py @@ -30,11 +30,18 @@ def main(): if line[0] == '#': continue - cp, vs, _ = line.split(' ', maxsplit=2) - cp = int(cp, 16) - vs = int(vs, 16) + # Example: "0023 FE0E ; text style; # (1.1) NUMBER SIGN" + cps, _ = line.split(';', maxsplit=1) # cps = "0023 FE0F " + cps = cps.strip().split(' ') # cps = ["0023", "FE0F"] - assert vs == 0xfe0e or vs == 0xfe0f + if len(cps) != 2: + raise NotImplementedError(f'emoji variation sequences with more than one base codepoint: {cps}') + + cp, vs = cps # cp = "0023", vs = "FE0F" + cp = int(cp, 16) # cp = 0x23 + vs = int(vs, 16) # vs = 0xfe0f + + assert vs in [0xfe0e, 0xfe0f] if cp not in codepoints: codepoints[cp] = Codepoint(cp)