From f3066b1935fb4abacfadf80c878e744862b46d7e Mon Sep 17 00:00:00 2001 From: Boris Faure Date: Mon, 14 Aug 2023 15:15:39 +0200 Subject: [PATCH] tools: allow user to decide whether emoji are double width --- tools/unicode_dbl_width.py | 222 ++++++++++++++++++++++--------------- 1 file changed, 131 insertions(+), 91 deletions(-) diff --git a/tools/unicode_dbl_width.py b/tools/unicode_dbl_width.py index 6c5cf138..5f93e0b5 100755 --- a/tools/unicode_dbl_width.py +++ b/tools/unicode_dbl_width.py @@ -12,14 +12,16 @@ import xml.etree.ElementTree as ET URange = namedtuple('unicode_range', ['width', 'start', 'end']) -def get_ranges(xmlfile, emoji_as_wide): +def get_ranges(xmlfile): tree = ET.parse(xmlfile) root = tree.getroot() repertoire = root.find("{http://www.unicode.org/ns/2003/ucd/1.0}repertoire") chars = repertoire.findall("{http://www.unicode.org/ns/2003/ucd/1.0}char") - ranges = [] - r = URange('N', 0, 0) + ranges_basic = [] + ranges_emoji_double = [] + r_basic = URange('N', 0, 0) + r_emoji_dbl = URange('N', 0, 0) for c in chars: ea = c.get('ea') if ea in ('Na', 'H'): @@ -29,26 +31,35 @@ def get_ranges(xmlfile, emoji_as_wide): cp = c.get('cp') if not cp: continue - if emoji_as_wide: - ext_pic = c.get('ExtPict') - emoji = c.get('Emoji') - if emoji == 'Y' and ext_pic == 'Y' and ea != 'A': - ea = 'W' - else: - blk = c.get('blk') - if blk == 'Misc_Pictographs': - ea = 'W' - cp = int(cp, 16) - if ea != r[0]: - ranges.append(r) - r = URange(ea, cp, cp) + + # basic + if ea != r_basic[0]: + ranges_basic.append(r_basic) + r_basic = URange(ea, cp, cp) else: - r = r._replace(end=cp) + r_basic = r_basic._replace(end=cp) - ranges.append(r) + # emoji as wide + ext_pic = c.get('ExtPict') + emoji = c.get('Emoji') + if emoji == 'Y' and ext_pic == 'Y' and ea != 'A': + ea = 'W' + else: + blk = c.get('blk') + if blk == 'Misc_Pictographs': + ea = 'W' + if ea != r_emoji_dbl[0]: + ranges_emoji_double.append(r_emoji_dbl) + r_emoji_dbl = URange(ea, cp, cp) + else: + r_emoji_dbl = r_emoji_dbl._replace(end=cp) + + ranges_basic.append(r_basic) + ranges_emoji_double.append(r_emoji_dbl) + + return (ranges_basic, ranges_emoji_double) - return ranges def merge_ranges(ranges, is_same_width): res = [] @@ -69,109 +80,138 @@ def skip_ranges(ranges, width_skipped): res.append(r) return res -def gen_header(cur_range, file_header): +def gen_header(mininum_codepoint, file_header): file_header.write( """/* XXX: Code generated by tool unicode_dbl_width.py */ #ifndef TERMINOLOGY_TERMPTY_DBL_H_ #define TERMINOLOGY_TERMPTY_DBL_H_ 1 -Eina_Bool _termpty_is_wide(const Eina_Unicode g); -Eina_Bool _termpty_is_ambigous_wide(const Eina_Unicode g); +Eina_Bool _termpty_is_wide(const Eina_Unicode g, Eina_Bool emoji_dbl_width); +Eina_Bool _termpty_is_ambigous_wide(const Eina_Unicode g, Eina_Bool emoji_dbl_width); static inline Eina_Bool _termpty_is_dblwidth_get(const Termpty *ty, const Eina_Unicode g) { /* optimize for latin1 non-ambiguous */ """) - file_header.write(f" if (g <= 0x{cur_range.end:X})") + file_header.write(f" if (g <= 0x{mininum_codepoint:X})") file_header.write( """ return EINA_FALSE; if (!ty->termstate.cjk_ambiguous_wide) - return _termpty_is_wide(g); + return _termpty_is_wide(g, ty->config->emoji_dbl_width); else - return _termpty_is_ambigous_wide(g); + return _termpty_is_ambigous_wide(g, ty->config->emoji_dbl_width); } #endif """) -def gen_ambigous(ranges, file_source): +def gen_ambigous(ranges_basic, ranges_emoji_double, file_source): + def handle_ranges(ranges): + def is_same_width(r1, r2): + if r1.width == 'N': + return r2.width == 'N' + else: + return r2.width in ('A', 'W') + ranges = merge_ranges(ranges[1:], is_same_width) + ranges = skip_ranges(ranges, ('N',)) + fallthrough = " EINA_FALLTHROUGH;" + for idx, r in enumerate(ranges): + if r.width == 'N': + continue; + if idx == len(ranges) -1: + fallthrough = "" + if r.start == r.end: + file_source.write(f" case 0x{r.start:X}:{fallthrough}\n") + else: + file_source.write(f" case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n") + file_source.write( """ __attribute__((const)) Eina_Bool -_termpty_is_ambigous_wide(Eina_Unicode g) +_termpty_is_ambigous_wide(Eina_Unicode g, Eina_Bool emoji_dbl_width) { - switch (g) - { + if (emoji_dbl_width) + { + switch (g) + { """) - def is_same_width(r1, r2): - if r1.width == 'N': - return r2.width == 'N' - else: - return r2.width in ('A', 'W') - ranges = merge_ranges(ranges[1:], is_same_width) - ranges = skip_ranges(ranges, ('N',)) - - fallthrough = " EINA_FALLTHROUGH;" - for idx, r in enumerate(ranges): - if r.width == 'N': - continue; - if idx == len(ranges) -1: - fallthrough = "" - if r.start == r.end: - file_source.write(f" case 0x{r.start:X}:{fallthrough}\n") - else: - file_source.write(f" case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n") - + handle_ranges(ranges_emoji_double) file_source.write( -""" - return EINA_TRUE; - } - return EINA_FALSE; -} +""" return EINA_TRUE; + } + } + else + { + switch (g) + { """) - -def gen_wide(ranges, file_source): + handle_ranges(ranges_basic) file_source.write( -""" -__attribute__((const)) -Eina_Bool -_termpty_is_wide(Eina_Unicode g) -{ - switch (g) - { -""") - def is_same_width(r1, r2): - if r1.width in ('N', 'A'): - return r2.width in ('N', 'A') - else: - return r2.width == 'W' - ranges = merge_ranges(ranges[1:], is_same_width) - ranges = skip_ranges(ranges, ('N', 'A')) - fallthrough = " EINA_FALLTHROUGH;" - for idx, r in enumerate(ranges): - if r.width in ('N', 'A'): - continue; - if idx == len(ranges) -1: - fallthrough = "" - if r.start == r.end: - file_source.write(f" case 0x{r.start:X}:{fallthrough}\n") - else: - file_source.write(f" case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n") - - file_source.write( -""" - return EINA_TRUE; - } +""" return EINA_TRUE; + } + } return EINA_FALSE; } """) -def gen_c(ranges, file_header, file_source): - gen_header(ranges[0], file_header) +def gen_wide(ranges_basic, ranges_emoji_double, file_source): + def handle_ranges(ranges): + def is_same_width(r1, r2): + if r1.width in ('N', 'A'): + return r2.width in ('N', 'A') + else: + return r2.width == 'W' + ranges = merge_ranges(ranges[1:], is_same_width) + ranges = skip_ranges(ranges, ('N', 'A')) + fallthrough = " EINA_FALLTHROUGH;" + for idx, r in enumerate(ranges): + if r.width in ('N', 'A'): + continue; + if idx == len(ranges) -1: + fallthrough = "" + if r.start == r.end: + file_source.write(f" case 0x{r.start:X}:{fallthrough}\n") + else: + file_source.write(f" case 0x{r.start:X} ... 0x{r.end:X}:{fallthrough}\n") + + file_source.write( +""" +__attribute__((const)) +Eina_Bool +_termpty_is_wide(Eina_Unicode g, Eina_Bool emoji_dbl_width) +{ + if (emoji_dbl_width) + { + switch (g) + { +""") + handle_ranges(ranges_emoji_double) + file_source.write( +""" return EINA_TRUE; + } + } + else + { + switch (g) + { +""") + handle_ranges(ranges_basic) + file_source.write( +""" return EINA_TRUE; + } + } + return EINA_FALSE; +} +""") + + +def gen_c(ranges_basic, ranges_emoji_double, file_header, file_source): + mininum_codepoint = min(ranges_basic[0].end, ranges_emoji_double[0].end) + gen_header(mininum_codepoint, file_header) file_source.write( """/* XXX: Code generated by tool unicode_dbl_width.py */ #include "private.h" @@ -180,8 +220,8 @@ def gen_c(ranges, file_header, file_source): #include "termpty.h" #include "termptydbl.h" """) - gen_ambigous(ranges, file_source) - gen_wide(ranges, file_source) + gen_ambigous(ranges_basic, ranges_emoji_double, file_source) + gen_wide(ranges_basic, ranges_emoji_double, file_source) parser = argparse.ArgumentParser(description='Generate code handling different widths of unicode codepoints.') parser.add_argument('xml', type=argparse.FileType('r')) @@ -190,5 +230,5 @@ parser.add_argument('source', type=argparse.FileType('w')) args = parser.parse_args() -ranges = get_ranges(args.xml, False) -gen_c(ranges, args.header, args.source) +(ranges_basic, ranges_emoji_double) = get_ranges(args.xml) +gen_c(ranges_basic, ranges_emoji_double, args.header, args.source)