matrixion/scripts/emoji_codegen.py

#!/usr/bin/env python3

import sys
import re
from unidecode import unidecode
from jinja2 import Template


class Emoji:
    r"""A single emoji record used when rendering the generated C++ table.

    Attributes set by the constructor:
      code        -- the input code points rewritten as consecutive
                     ``\UXXXXXXXX`` escape sequences (literal backslashes).
      shortname   -- stored exactly as given.
      unicodename -- stored exactly as given.
    """

    def __init__(self, code, shortname, unicodename):
        # `code` is a string of hex code points separated by single spaces;
        # each one is left-padded with zeros to 8 characters and prefixed
        # with a literal "\U".
        codepoints = code.strip().split(' ')
        escapes = ['\\U' + format(cp, '0>8') for cp in codepoints]
        self.code = ''.join(escapes)
        self.shortname = shortname
        self.unicodename = unicodename

def generate_qml_list(**kwargs):
    """Print the C++ definition of ``emoji::Provider::emoji`` to stdout.

    Every keyword argument is treated as one category. The keyword name,
    capitalized, is emitted both as a section comment and as the
    ``emoji::Emoji::Category`` enumerator; the value is iterated and each
    item's ``code``, ``shortname`` and ``unicodename`` are written into an
    ``Emoji{...}`` initializer.
    """
    template = Template('''
const QVector<Emoji> emoji::Provider::emoji = {
    {%- for c in kwargs.items() %}
    // {{ c[0].capitalize() }}
    {%- for e in c[1] %}
    Emoji{QStringLiteral(u"{{ e.code }}"), QStringLiteral(u"{{ e.shortname }}"), QStringLiteral(u"{{ e.unicodename }}"), emoji::Emoji::Category::{{ c[0].capitalize() }}},
    {%- endfor %}
    {%- endfor %}
};
    ''')
    # The template refers to the whole mapping under the name "kwargs".
    rendered = template.render(kwargs=kwargs)
    print(rendered)
if __name__ == '__main__':
    # Script entry point.
    #
    # argv[1]: path to Unicode's emoji-test.txt
    # argv[2]: path to a shortcode override file, one "codepoints:shortcode"
    #          pair per line
    #
    # The generated C++ table is written to stdout.
    if len(sys.argv) < 3:
        # stdout carries the generated code (it is normally piped or
        # appended to a file), so the usage text must go to stderr.
        print('usage: emoji_codegen.py /path/to/emoji-test.txt /path/to/shortcodes.txt',
              file=sys.stderr)
        sys.exit(1)

    filename = sys.argv[1]
    shortcodefilename = sys.argv[2]

    people = []
    nature = []
    food = []
    activity = []
    travel = []
    objects = []
    symbols = []
    flags = []

    # Maps the "# group:" headings of emoji-test.txt onto the output lists.
    # Several groups deliberately share one list.
    categories = {
        'Smileys & Emotion': people,
        'People & Body': people,
        'Animals & Nature': nature,
        'Food & Drink': food,
        'Travel & Places': travel,
        'Activities': activity,
        'Objects': objects,
        'Symbols': symbols,
        'Flags': flags,
        'Component': symbols,
    }

    # Ordered (suffix, prefix) rewrite rules applied to the lower-cased
    # Unicode name when no explicit shortcode exists. Every rule is tried in
    # turn (they are cumulative, not exclusive): a matching suffix is removed
    # and the prefix, if any, is put in front of what remains.
    suffix_rules = (
        (' (blood type)', ''),
        (': red hair', 'red_haired_'),
        (': curly hair', 'curly_haired_'),
        # Previously misspelled as "white_haried_".
        (': white hair', 'white_haired_'),
        (': bald', 'bald_'),
        (': beard', 'bearded_'),
        (' face', ''),
        (' button', ''),
        (' banknote', ''),
    )
    flag_prefix = 'flag: '

    # Loop-invariant patterns, compiled once.
    segment_re = re.compile(r'\s+[#;] ')
    name_re = re.compile(r'^(\S+) E\d+\.\d+ (.*)$')
    non_word_re = re.compile(r'\W')
    underscore_run_re = re.compile(r'_{2,}')

    # Explicit shortcodes, keyed by the code point string exactly as it
    # appears in the first column of emoji-test.txt.
    shortcodeDict = {}
    with open(shortcodefilename, 'r', encoding="utf8") as shortcode_file:
        for line in shortcode_file:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            codepoints, shortname = entry.split(':')
            shortcodeDict[codepoints] = shortname

    current_category = ''
    with open(filename, 'r', encoding="utf8") as emoji_file:
        for line in emoji_file:
            if line.startswith('# group:'):
                current_category = line.split(':', 1)[1].strip()

            if not line or line.startswith('#'):
                continue

            # Data lines look like "<code points> ; <status> # <char> E<ver> <name>".
            segments = segment_re.split(line.strip())
            if len(segments) != 3:
                continue

            code, qualification, charAndName = segments

            # skip unqualified versions of same unicode
            if qualification != 'fully-qualified':
                continue

            name = name_re.match(charAndName).group(2)

            # TODO: Handle skintone modifiers in a sane way. Until then the
            # skin tone variants are kept like any other emoji.
            if code in shortcodeDict:
                shortname = shortcodeDict[code]
            else:
                shortname = name.lower()
                for suffix, prefix in suffix_rules:
                    if shortname.endswith(suffix):
                        shortname = prefix + shortname[:-len(suffix)]

                # "flag: france" -> "france flag"
                if shortname.startswith(flag_prefix):
                    shortname = shortname[len(flag_prefix):] + " flag"
                shortname = shortname.replace("u.s.", "us")
                shortname = shortname.replace("&", "and")

                # Turn every non-word character (spaces, hyphens,
                # punctuation) into "_", trim underscores at BOTH ends, then
                # collapse runs. The earlier regex-based trim only removed
                # leading underscores because its greedy group swallowed the
                # trailing ones.
                shortname = non_word_re.sub('_', shortname)
                shortname = shortname.strip('_')
                shortname = underscore_run_re.sub('_', shortname)
                shortname = unidecode(shortname)
            categories[current_category].append(Emoji(code, shortname, name))

    # Use xclip to pipe the output to clipboard.
    # e.g. ./emoji_codegen.py emoji-test.txt shortcodes.txt | xclip -sel clip
    # alternatively - delete the var from src/emoji/Provider.cpp, and do
    # ./emoji_codegen.py emoji-test.txt shortcodes.txt >> src/emoji/Provider.cpp
    generate_qml_list(people=people, nature=nature, food=food, activity=activity, travel=travel, objects=objects, symbols=symbols, flags=flags)
Add full emoji support 2017-04-23 21:31:08 +03:00			`#!/usr/bin/env python3`

			`import sys`
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`import re`
Add transforms and shortcodes to emoji Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-21 20:25:39 +03:00			`from unidecode import unidecode`
Add full emoji support 2017-04-23 21:31:08 +03:00			`from jinja2 import Template`


			`class Emoji(object):`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`def __init__(self, code, shortname, unicodename):`
Use fully qualified emoji and use literals 2022-01-01 08:16:37 +03:00			`self.code = ''.join(['\\U'+c.rjust(8, '0') for c in code.strip().split(' ')])`
Add full emoji support 2017-04-23 21:31:08 +03:00			`self.shortname = shortname`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`self.unicodename = unicodename`
Add full emoji support 2017-04-23 21:31:08 +03:00
Add new QML-based emoji picker (work in progress) This is necessary to support having a picker within QML. Eventually, this should replace the existing widget-based one. 2020-05-13 07:35:26 +03:00			`def generate_qml_list(**kwargs):`
			`tmpl = Template('''`
			`const QVector<Emoji> emoji::Provider::emoji = {`
			`{%- for c in kwargs.items() %}`
			`// {{ c[0].capitalize() }}`
			`{%- for e in c[1] %}`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`Emoji{QStringLiteral(u"{{ e.code }}"), QStringLiteral(u"{{ e.shortname }}"), QStringLiteral(u"{{ e.unicodename }}"), emoji::Emoji::Category::{{ c[0].capitalize() }}},`
Add new QML-based emoji picker (work in progress) This is necessary to support having a picker within QML. Eventually, this should replace the existing widget-based one. 2020-05-13 07:35:26 +03:00			`{%- endfor %}`
			`{%- endfor %}`
			`};`
			`''')`
			`d = dict(kwargs=kwargs)`
			`print(tmpl.render(d))`
Add full emoji support 2017-04-23 21:31:08 +03:00			`if __name__ == '__main__':`
Add transforms and shortcodes to emoji Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-21 20:25:39 +03:00			`if len(sys.argv) < 3:`
			`print('usage: emoji_codegen.py /path/to/emoji-test.txt /path/to/shortcodes.txt')`
Add full emoji support 2017-04-23 21:31:08 +03:00			`sys.exit(1)`

			`filename = sys.argv[1]`
Add transforms and shortcodes to emoji Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-21 20:25:39 +03:00			`shortcodefilename = sys.argv[2]`
Add full emoji support 2017-04-23 21:31:08 +03:00
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`people = []`
			`nature = []`
			`food = []`
			`activity = []`
			`travel = []`
			`objects = []`
			`symbols = []`
			`flags = []`

			`categories = {`
			`'Smileys & Emotion': people,`
			`'People & Body': people,`
			`'Animals & Nature': nature,`
			`'Food & Drink': food,`
			`'Travel & Places': travel,`
			`'Activities': activity,`
			`'Objects': objects,`
			`'Symbols': symbols,`
More codes and rules 2022-04-24 19:14:23 +03:00			`'Flags': flags,`
			`'Component': symbols`
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`}`
Add transforms and shortcodes to emoji Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-21 20:25:39 +03:00			`shortcodeDict = {}`
			`# for my sanity - this strips newlines`
			`for line in open(shortcodefilename, 'r', encoding="utf8"):`
			`longname, shortname = line.strip().split(':')`
			`shortcodeDict[longname] = shortname`
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`current_category = ''`
Add Unicode 14.0 emoji 2021-09-25 09:19:44 +03:00			`for line in open(filename, 'r', encoding="utf8"):`
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`if line.startswith('# group:'):`
			`current_category = line.split(':', 1)[1].strip()`

			`if not line or line.startswith('#'):`
			`continue`
Add full emoji support 2017-04-23 21:31:08 +03:00
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`segments = re.split(r'\s+[#;] ', line.strip())`
			`if len(segments) != 3:`
			`continue`
Add full emoji support 2017-04-23 21:31:08 +03:00
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`code, qualification, charAndName = segments`
Add full emoji support 2017-04-23 21:31:08 +03:00
Use fully qualified emoji and use literals 2022-01-01 08:16:37 +03:00			`# skip unqualified versions of same unicode`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`if qualification != 'fully-qualified':`
Update emoji to unicode 13.0 2020-01-24 06:18:14 +03:00			`continue`
More codes and rules 2022-04-24 19:14:23 +03:00
Add full emoji support 2017-04-23 21:31:08 +03:00
Properly add regional indicators, with ZWNJ 2022-04-23 20:22:42 +03:00			`char, name = re.match(r'^(\S+) E\d+\.\d+ (.*)$', charAndName).groups()`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`shortname = name`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`# until skin tone is handled, keep them around`
More codes and rules 2022-04-24 19:14:23 +03:00			`# discard skin tone variants for sanity`
			`# __contains__ is so stupid i hate prototype languages`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`# if name.__contains__("skin tone") and qualification != 'component':`
			`# continue`
			`# if qualification == 'component' and not name.__contains__("skin tone"):`
			`# continue`
Keep old codes alongside new ones Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-22 17:46:43 +03:00			`#TODO: Handle skintone modifiers in a sane way`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`basicallyTheSame = False`
			`if code in shortcodeDict:`
			`shortname = shortcodeDict[code]`
More codes and rules 2022-04-24 19:14:23 +03:00			`else:`
			`shortname = shortname.lower()`
			`if shortname.endswith(' (blood type)'):`
			`shortname = shortname[:-13]`
			`if shortname.endswith(': red hair'):`
			`shortname = "red_haired_" + shortname[:-10]`
			`if shortname.endswith(': curly hair'):`
			`shortname = "curly_haired_" + shortname[:-12]`
			`if shortname.endswith(': white hair'):`
			`shortname = "white_haried_" + shortname[:-12]`
			`if shortname.endswith(': bald'):`
			`shortname = "bald_" + shortname[:-6]`
			`if shortname.endswith(': beard'):`
			`shortname = "bearded_" + shortname[:-7]`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`if shortname.endswith(' face'):`
			`shortname = shortname[:-5]`
More codes and rules 2022-04-24 19:14:23 +03:00			`if shortname.endswith(' button'):`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`shortname = shortname[:-7]`
More codes and rules 2022-04-24 19:14:23 +03:00			`if shortname.endswith(' banknote'):`
			`shortname = shortname[:-9]`

			`# FIXME: Is there a better way to do this?`
			`matchobj = re.match(r'^flag: (.*)$', shortname)`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`if shortname.startswith("flag: "):`
			`country = shortname[5:]`
More codes and rules 2022-04-24 19:14:23 +03:00			`shortname = country + " flag"`
			`shortname = shortname.replace("u.s.", "us")`
			`shortname = shortname.replace("&", "and")`
Change name to codepoint 2022-04-27 19:45:45 +03:00
			`if shortname == name.lower():`
			`basicallyTheSame = True`

Allow search with unicode names 2022-04-22 19:49:57 +03:00			`shortname = shortname.replace("-", "_")`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`shortname = re.sub(r'\W', '_', shortname)`
More codes and rules 2022-04-24 19:14:23 +03:00			`shortname, = re.match(r'^_*(.+)_*$', shortname).groups()`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`shortname = re.sub(r'_{2,}', '_', shortname)`
			`shortname = unidecode(shortname)`
Change name to codepoint 2022-04-27 19:45:45 +03:00			`# if basicallyTheSame:`
			`# shortname = ""`
Allow search with unicode names 2022-04-22 19:49:57 +03:00			`categories[current_category].append(Emoji(code, shortname, name))`
Add full emoji support 2017-04-23 21:31:08 +03:00
			`# Use xclip to pipe the output to clipboard.`
			`# e.g ./codegen.py emoji.json \| xclip -sel clip`
Add transforms and shortcodes to emoji Signed-off-by: BulbyVR <26726264+TheDrawingCoder-Gamer@users.noreply.github.com> 2022-04-21 20:25:39 +03:00			`# alternatively - delete the var from src/emoji/Provider.cpp, and do ./codegen.py emojis shortcodes >> src/emoji/Provider.cpp`
Fix emoji update script 2021-01-24 01:25:52 +03:00			`generate_qml_list(people=people, nature=nature, food=food, activity=activity, travel=travel, objects=objects, symbols=symbols, flags=flags)`