From f3066b1935fb4abacfadf80c878e744862b46d7e Mon Sep 17 00:00:00 2001
From: Boris Faure <billiob@gmail.com>
Date: Mon, 14 Aug 2023 15:15:39 +0200
Subject: [PATCH] tools: allow user to decide whether emoji are double width

---
 tools/unicode_dbl_width.py | 222 ++++++++++++++++++++++---------------
 1 file changed, 131 insertions(+), 91 deletions(-)

diff --git a/tools/unicode_dbl_width.py b/tools/unicode_dbl_width.py
index 6c5cf138..5f93e0b5 100755
--- a/tools/unicode_dbl_width.py
+++ b/tools/unicode_dbl_width.py
@@ -12,14 +12,16 @@ import xml.etree.ElementTree as ET
 
 URange = namedtuple('unicode_range', ['width', 'start', 'end'])
 
-def get_ranges(xmlfile, emoji_as_wide):
+def get_ranges(xmlfile):
     tree = ET.parse(xmlfile)
     root = tree.getroot()
     repertoire = root.find("{http://www.unicode.org/ns/2003/ucd/1.0}repertoire")
     chars = repertoire.findall("{http://www.unicode.org/ns/2003/ucd/1.0}char")
 
-    ranges = []
-    r = URange('N', 0, 0)
+    ranges_basic = []
+    ranges_emoji_double = []
+    r_basic = URange('N', 0, 0)
+    r_emoji_dbl = URange('N', 0, 0)
     for c in chars:
         ea = c.get('ea')
         if ea in ('Na', 'H'):
@@ -29,26 +31,35 @@ def get_ranges(xmlfile, emoji_as_wide):
         cp = c.get('cp')
         if not cp:
             continue
-        if emoji_as_wide:
-            ext_pic = c.get('ExtPict')
-            emoji = c.get('Emoji')
-            if emoji == 'Y' and ext_pic == 'Y' and ea != 'A':
-                ea = 'W'
-            else:
-                blk = c.get('blk')
-                if blk == 'Misc_Pictographs':
-                    ea = 'W'
-
         cp = int(cp, 16)
-        if ea != r[0]:
-            ranges.append(r)
-            r = URange(ea, cp, cp)
+
+        # basic: extend the current range or start a new one, keyed on the 'ea' value alone
+        if ea != r_basic[0]:
+            ranges_basic.append(r_basic)
+            r_basic = URange(ea, cp, cp)
         else:
-            r = r._replace(end=cp)
+            r_basic = r_basic._replace(end=cp)
 
-    ranges.append(r)
+        # emoji as wide: force 'W' for Emoji+ExtPict (unless 'A') or Misc_Pictographs, tracked separately
+        ext_pic = c.get('ExtPict')
+        emoji = c.get('Emoji')
+        if emoji == 'Y' and ext_pic == 'Y' and ea != 'A':
+            ea = 'W'
+        else:
+            blk = c.get('blk')
+            if blk == 'Misc_Pictographs':
+                ea = 'W'
+        if ea != r_emoji_dbl[0]:
+            ranges_emoji_double.append(r_emoji_dbl)
+            r_emoji_dbl = URange(ea, cp, cp)
+        else:
+            r_emoji_dbl = r_emoji_dbl._replace(end=cp)
+
+    ranges_basic.append(r_basic)
+    ranges_emoji_double.append(r_emoji_dbl)
+
+    return (ranges_basic, ranges_emoji_double)
 
-    return ranges
 
 def merge_ranges(ranges, is_same_width):
     res = []
@@ -69,109 +80,138 @@ def skip_ranges(ranges, width_skipped):
             res.append(r)
     return res
 
-def gen_header(cur_range, file_header):
+def gen_header(mininum_codepoint, file_header):
     file_header.write(
 """/* XXX: Code generated by tool unicode_dbl_width.py */
 #ifndef TERMINOLOGY_TERMPTY_DBL_H_
 #define TERMINOLOGY_TERMPTY_DBL_H_ 1
 
-Eina_Bool _termpty_is_wide(const Eina_Unicode g);
-Eina_Bool _termpty_is_ambigous_wide(const Eina_Unicode g);
+Eina_Bool _termpty_is_wide(const Eina_Unicode g, Eina_Bool emoji_dbl_width);
+Eina_Bool _termpty_is_ambigous_wide(const Eina_Unicode g, Eina_Bool emoji_dbl_width);
 
 static inline Eina_Bool
 _termpty_is_dblwidth_get(const Termpty *ty, const Eina_Unicode g)
 {
    /* optimize for latin1 non-ambiguous */
 """)
-    file_header.write(f"   if (g <= 0x{cur_range.end:X})")
+    file_header.write(f"   if (g <= 0x{mininum_codepoint:X})")
     file_header.write(
 """
      return EINA_FALSE;
    if (!ty->termstate.cjk_ambiguous_wide)
-     return _termpty_is_wide(g);
+     return _termpty_is_wide(g, ty->config->emoji_dbl_width);
    else
-     return _termpty_is_ambigous_wide(g);
+     return _termpty_is_ambigous_wide(g, ty->config->emoji_dbl_width);
 }
 
 #endif
 """)
 
-def gen_ambigous(ranges, file_source):
+def gen_ambigous(ranges_basic, ranges_emoji_double, file_source):
+    def handle_ranges(ranges):
+        def is_same_width(r1, r2):
+            if r1.width == 'N':
+                return r2.width == 'N'
+            else:
+                return r2.width in ('A', 'W')
+        ranges = merge_ranges(ranges[1:], is_same_width)
+        ranges = skip_ranges(ranges, ('N',))
+        fallthrough = " EINA_FALLTHROUGH;"
+        for idx, r in enumerate(ranges):
+            if r.width == 'N':
+                continue;
+            if idx == len(ranges) -1:
+                fallthrough = ""
+            if r.start == r.end:
+                file_source.write(f"           case 0x{r.start:X}:{fallthrough}\n")
+            else:
+                file_source.write(f"           case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n")
+
     file_source.write(
 """
 __attribute__((const))
 Eina_Bool
-_termpty_is_ambigous_wide(Eina_Unicode g)
+_termpty_is_ambigous_wide(Eina_Unicode g, Eina_Bool emoji_dbl_width)
 {
-    switch (g)
-      {
+   if (emoji_dbl_width)
+     {
+        switch (g)
+          {
 """)
-    def is_same_width(r1, r2):
-        if r1.width == 'N':
-            return r2.width == 'N'
-        else:
-            return r2.width in ('A', 'W')
-    ranges = merge_ranges(ranges[1:], is_same_width)
-    ranges = skip_ranges(ranges, ('N',))
-
-    fallthrough = " EINA_FALLTHROUGH;"
-    for idx, r in enumerate(ranges):
-        if r.width == 'N':
-            continue;
-        if idx == len(ranges) -1:
-            fallthrough = ""
-        if r.start == r.end:
-            file_source.write(f"       case 0x{r.start:X}:{fallthrough}\n")
-        else:
-            file_source.write(f"       case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n")
-
+    handle_ranges(ranges_emoji_double)
     file_source.write(
-"""
-        return EINA_TRUE;
-    }
-   return EINA_FALSE;
-}
+"""             return EINA_TRUE;
+         }
+     }
+   else
+     {
+        switch (g)
+          {
 """)
-
-def gen_wide(ranges, file_source):
+    handle_ranges(ranges_basic)
     file_source.write(
-"""
-__attribute__((const))
-Eina_Bool
-_termpty_is_wide(Eina_Unicode g)
-{
-    switch (g)
-      {
-""")
-    def is_same_width(r1, r2):
-        if r1.width in ('N', 'A'):
-            return r2.width in ('N', 'A')
-        else:
-            return r2.width == 'W'
-    ranges = merge_ranges(ranges[1:], is_same_width)
-    ranges = skip_ranges(ranges, ('N', 'A'))
-    fallthrough = " EINA_FALLTHROUGH;"
-    for idx, r in enumerate(ranges):
-        if r.width in ('N', 'A'):
-            continue;
-        if idx == len(ranges) -1:
-            fallthrough = ""
-        if r.start == r.end:
-            file_source.write(f"       case 0x{r.start:X}:{fallthrough}\n")
-        else:
-            file_source.write(f"       case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n")
-
-    file_source.write(
-"""
-        return EINA_TRUE;
-    }
+"""             return EINA_TRUE;
+          }
+     }
    return EINA_FALSE;
 }
 """)
 
 
-def gen_c(ranges, file_header, file_source):
-    gen_header(ranges[0], file_header)
+def gen_wide(ranges_basic, ranges_emoji_double, file_source):
+    def handle_ranges(ranges):
+        def is_same_width(r1, r2):
+            if r1.width in ('N', 'A'):
+                return r2.width in ('N', 'A')
+            else:
+                return r2.width == 'W'
+        ranges = merge_ranges(ranges[1:], is_same_width)
+        ranges = skip_ranges(ranges, ('N', 'A'))
+        fallthrough = " EINA_FALLTHROUGH;"
+        for idx, r in enumerate(ranges):
+            if r.width in ('N', 'A'):
+                continue;
+            if idx == len(ranges) -1:
+                fallthrough = ""
+            if r.start == r.end:
+                file_source.write(f"        case 0x{r.start:X}:{fallthrough}\n")
+            else:
+                file_source.write(f"        case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n")
+
+    file_source.write(
+"""
+__attribute__((const))
+Eina_Bool
+_termpty_is_wide(Eina_Unicode g, Eina_Bool emoji_dbl_width)
+{
+   if (emoji_dbl_width)
+     {
+        switch (g)
+          {
+""")
+    handle_ranges(ranges_emoji_double)
+    file_source.write(
+"""             return EINA_TRUE;
+          }
+     }
+   else
+     {
+        switch (g)
+          {
+""")
+    handle_ranges(ranges_basic)
+    file_source.write(
+"""             return EINA_TRUE;
+          }
+     }
+   return EINA_FALSE;
+}
+""")
+
+
+def gen_c(ranges_basic, ranges_emoji_double, file_header, file_source):
+    mininum_codepoint = min(ranges_basic[0].end, ranges_emoji_double[0].end)
+    gen_header(mininum_codepoint, file_header)
     file_source.write(
 """/* XXX: Code generated by tool unicode_dbl_width.py */
 #include "private.h"
@@ -180,8 +220,8 @@ def gen_c(ranges, file_header, file_source):
 #include "termpty.h"
 #include "termptydbl.h"
 """)
-    gen_ambigous(ranges, file_source)
-    gen_wide(ranges, file_source)
+    gen_ambigous(ranges_basic, ranges_emoji_double, file_source)
+    gen_wide(ranges_basic, ranges_emoji_double, file_source)
 
 parser = argparse.ArgumentParser(description='Generate code handling different widths of unicode codepoints.')
 parser.add_argument('xml', type=argparse.FileType('r'))
@@ -190,5 +230,5 @@ parser.add_argument('source', type=argparse.FileType('w'))
 
 args = parser.parse_args()
 
-ranges = get_ranges(args.xml, False)
-gen_c(ranges, args.header, args.source)
+(ranges_basic, ranges_emoji_double) = get_ranges(args.xml)
+gen_c(ranges_basic, ranges_emoji_double, args.header, args.source)