-
Notifications
You must be signed in to change notification settings - Fork 7
/
update_unicode_table.py
181 lines (154 loc) · 6.7 KB
/
update_unicode_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# (c) 2013-2021 Sebastian Humenda
# This code is licenced under the terms of the LGPL-3+, see the file COPYING for
# more details.
"""This script auto-generates gleetex/unicode.py.
The purpose is to provide a table with mappings from unicode points to
their LaTeX equivalent. This way, formulas can be converted using
LaTeX2e, but the end-user can still use unicode in their formulas. The
unicode version can also be used to make the alternative text more
readable.
"""
import collections
import enum
import os
import shlex
import shutil
import sys
import urllib.request
import xml.etree.ElementTree as ET
################################################################################
# Constants
class LaTeXMode(enum.Enum):
    # Exception: not a constant itself, but defined in the constants section
    # because BAD_COMMANDS uses its members as keys.
    # NOTE: the docstring below is read via LaTeXMode.__doc__ by
    # generate_python_src_file() and written into the generated module, so
    # changing its text changes the generated file.
    """Represent either math or text mode.
    Math mode in LaTeX is e.g. everything between $ and $.
    """
    textmode = 0
    mathmode = 1
# URL of the XML file (unicode.xml from the w3c/xml-entities repository) which
# is downloaded and used to generate the python source file
UNICODE_TABLE_URL = (
    'https://raw.githubusercontent.com/w3c/xml-entities/gh-pages/unicode.xml'
)
# A mapping of code points whose extracted LaTeX commands must be replaced.
# If one of these code points is found in the XML data, the commands given
# here override the extracted ones for the listed mode(s).
BAD_COMMANDS = {
    # decimal_codepoint: {LaTeXMode: new version}
    178: {LaTeXMode.mathmode: '^2'},  # U+00B2 SUPERSCRIPT TWO
    179: {LaTeXMode.mathmode: '^3'},  # U+00B3 SUPERSCRIPT THREE
    181: {LaTeXMode.mathmode: '\\mu'},  # U+00B5 MICRO SIGN
    185: {LaTeXMode.mathmode: '^1'},  # U+00B9 SUPERSCRIPT ONE
    8211: {LaTeXMode.mathmode: '\\mathrm{\\textendash}'},  # U+2013 EN DASH
    8722: {LaTeXMode.mathmode: '-'},  # U+2212 MINUS SIGN
}
################################################################################
def get_unicode_table_xml():
    """Download the XML document at UNICODE_TABLE_URL and return its parsed
    root element."""
    with urllib.request.urlopen(UNICODE_TABLE_URL) as response:
        raw_xml = response.read()
    return ET.fromstring(raw_xml)
def create_unicode_latex_table(root):
    """Iterate over the XML tree and extract all characters for the unicode
    table.

    The resulting table has the decimal unicode code point as key. The value
    is a dict which maps a `LaTeXMode` member to the LaTeX command (a
    string) for that mode. Certain code points are ignored, to prevent
    replacing normal or control characters.

    :param root: root element of the parsed XML document; it must contain a
        `charlist` element holding the `character` elements
    :return: dict of the form {code_point: {LaTeXMode: command}}
    """
    unicode_table = {}
    for character in root.find('charlist').iterfind('character'):
        childtags = {node.tag for node in character}
        # skip characters without LaTeX alternative
        if childtags.isdisjoint(('latex', 'AMS', 'mathlatex')):
            continue
        attr = character.attrib.get
        # if no mode (text or math) was specified, ignore character
        if attr('mode') not in ('text', 'math', 'mixed', 'other'):
            continue
        # without a decimal code point there is nothing to use as table key
        dec = attr('dec')
        if dec is None:
            continue
        # a defined character may have multiple codepoints (called ids); add
        # each of the ids as a separate entry to the table
        ids = tuple(map(int, dec.split('-')))
        # ignore ASCII and a few control unicode characters; compare each
        # code point explicitly, so that code point 0 is rejected as well
        if any(code_point < 161 for code_point in ids):
            continue
        # extract textmode, mathmode and AMS commands:
        commands = {}
        if 'latex' in childtags:
            commands[LaTeXMode.textmode] = character.find('latex').text
        if 'AMS' in childtags:
            commands[LaTeXMode.mathmode] = character.find('AMS').text
        # only take the LaTeX command from <mathlatex/>, if no AMS tag is
        # present and no set was specified. A `set` is an attempt to specify
        # the LaTeX package which needs to be loaded.
        if 'mathlatex' in childtags and LaTeXMode.mathmode not in commands:
            mathnode = character.find('mathlatex')
            if 'set' not in mathnode.attrib:
                commands[LaTeXMode.mathmode] = mathnode.text
        if not commands:  # neither a usable text mode nor math mode command
            continue
        for code_point in ids:
            # every code point gets its own copy, so that an override for one
            # code point does not leak into the entries of its siblings
            entry = dict(commands)
            # some code points are not usable for our purposes, so update
            # the control sequences, if appropriate
            entry.update(BAD_COMMANDS.get(code_point, {}))
            unicode_table[code_point] = entry
    return unicode_table
def serialize_table(table):
    """Render the given unicode table as python source text, i.e. an
    assignment of a dict literal to the name `unicode_table`.

    The entries are emitted in ascending order of their decimal code point.
    """
    lines = ['unicode_table = {']
    for code_point in sorted(table):
        replacements = table[code_point]
        # the modes are written by hand in a fixed order (text mode first,
        # then math mode); this keeps the git diff of the output minimal
        items = [
            'LaTeXMode.%s: %r' % (mode.name, replacements[mode])
            for mode in (LaTeXMode.textmode, LaTeXMode.mathmode)
            if mode in replacements
        ]
        lines.append('%s: {%s},' % (code_point, ', '.join(items)))
    return '\n '.join(lines) + '\n }\n'
def generate_python_src_file(table, python_table):
    """Assemble the complete text of an importable python source file.

    The result consists of a module doc string, the `enum` import, a
    re-declaration of the `LaTeXMode` enum and the serialized table.

    :param table: the unicode table; only its number of entries is used
    :param python_table: the table serialized as python source text
    :return: the whole source file as a single string
    """
    member_names = [name for name in dir(LaTeXMode) if not name.startswith('_')]
    member_lines = [
        '%s = %s' % (name, getattr(LaTeXMode, name).value)
        for name in member_names
    ]
    # the enum is re-declared in the output with the doc string of the
    # original class
    enum_header = 'class LaTeXMode(enum.Enum):\n """%s"""\n ' % LaTeXMode.__doc__
    enum_def = enum_header + '\n '.join(member_lines)
    template = """\"\"\"
DO NOT ALTER THIS FILE IN ANY WAY, IT IS GENERATED AUTOMATICALLY. SEE THE SCRIPT
`update_unicode_table.py` FOR MORE INFORMATION.
This file contains a table of unicode code point to LaTeX command mapping. It
has %s entries and was derived from
<%s>.\"\"\"
#pylint: disable=too-many-lines,missing-docstring\n\n
import enum\n
%s\n\n%s\n"""
    return template % (len(table), UNICODE_TABLE_URL, enum_def, python_table)
def main():
    """Download the unicode table, generate `gleetex/unicode.py` from it and
    format the result with `black`, if that program is installed.

    The script has to be run from the GladTeX source root. The process exits
    with status 1 if the `gleetex` directory is missing, otherwise with the
    exit status of `black` (0 if `black` is not installed).
    """
    # local import: only required for the optional formatter run
    import subprocess

    if not os.path.exists('gleetex'):
        print(
            'Error: Generator script must be run from GladTeX source root.',
            file=sys.stderr,
        )
        # stop here, writing the output file below would fail anyway
        sys.exit(1)
    table = create_unicode_latex_table(get_unicode_table_xml())
    python_table = serialize_table(table)
    path = os.path.join('gleetex', 'unicode.py')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(generate_python_src_file(table, python_table))
    exit_code = 0
    black = shutil.which('black')
    if black:
        # subprocess.run() reports the real exit status; os.system() returns
        # an encoded wait status on Unix (e.g. 256 for exit status 1), which
        # sys.exit() would truncate to 0 and thereby hide the failure
        exit_code = subprocess.run([black, path], check=False).returncode
    sys.exit(exit_code)


if __name__ == '__main__':
    main()