From 085c60a33423bf46f64f225e438f62ad12fff8c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Ekl=C3=B6f?= <daniel@ekloef.se>
Date: Wed, 26 Jun 2024 18:30:17 +0200
Subject: [PATCH] scripts: generate-emoji-variation-sequences: don't assume
 single codepoint sequences

Right now (Unicode 15.1), all valid variation sequences consist of a
single Unicode codepoint (followed by either VS-15 or VS-16).

Don't assume this is the case.

We still don't actually handle longer sequences. But now we at least
detect such sequences, and error out instead of silently misparsing them.
---
 scripts/generate-emoji-variation-sequences.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/scripts/generate-emoji-variation-sequences.py b/scripts/generate-emoji-variation-sequences.py
index 2c71594c..e05b6290 100644
--- a/scripts/generate-emoji-variation-sequences.py
+++ b/scripts/generate-emoji-variation-sequences.py
@@ -30,11 +30,18 @@ def main():
         if line[0] == '#':
             continue
 
-        cp, vs, _ = line.split(' ', maxsplit=2)
-        cp = int(cp, 16)
-        vs = int(vs, 16)
+        # Example: "0023 FE0F  ; emoji style; # (1.1) NUMBER SIGN"
+        cps, _ = line.split(';', maxsplit=1)  # cps = "0023 FE0F  "
+        cps = cps.strip().split(' ')          # cps = ["0023", "FE0F"]
 
-        assert vs == 0xfe0e or vs == 0xfe0f
+        if len(cps) != 2:
+            raise NotImplementedError(f'emoji variation sequences with more than one base codepoint: {cps}')
+
+        cp, vs = cps       # cp = "0023", vs = "FE0F"
+        cp = int(cp, 16)   # cp = 0x23
+        vs = int(vs, 16)   # vs = 0xfe0f
+
+        assert vs in [0xfe0e, 0xfe0f]
 
         if cp not in codepoints:
             codepoints[cp] = Codepoint(cp)