-
Notifications
You must be signed in to change notification settings - Fork 1
/
cqt.js
98 lines (92 loc) · 3.19 KB
/
cqt.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Ampersand variants (ASCII, small, fullwidth); each becomes " and " in step 3.
const AMPERS_PAT = /[&\uFE60\uFF06]/g;

// Runs of line/paragraph separators, zero-width and no-break spaces, the
// ideographic space, CR, LF and TAB; each run becomes one space in step 4.
const SPECIALIZED_WHITESPACE_PAT = /[\u2028\u2029\u200B\uFEFF\u00A0\u3000\r\n\t]+/g;

// Two or more consecutive whitespace characters; collapsed to one space in step 4.
const MULTI_WHITESPACE_PAT = /\s{2,}/g;

// Runs of non-ASCII dash-like characters; each run becomes "-" in step 5.i.
const DASHCHARS_PAT = /[\u058A\u05BE\u1400\u1806\u2010\u2011\u2012\u2013\u2014\u2015\u2e17\u2e1a\u2e3a\u2e3b\u2e40\u2e5d\u301c\u3030\u30a0\ufe31\ufe32\ufe58\ufe63\uff0d]+/g;

// Two or more ASCII hyphens; collapsed to a single "-" in step 5.ii.
const MULTI_HYPHENS_PAT = /-{2,}/g;

// [CJK punctuation, ASCII replacement] pairs applied in step 5.iii.
const CJK_PUNCT_PAIRS = [
  ["\u3001", ","],
  ["\u3002", "."],
];

// Four or more consecutive dots; shortened to "..." in step 5.v.
const LONG_DOTS_PAT = /[.]{4,}/g;

// ASCII double quote, curly quotes, guillemets and CJK angle/corner brackets;
// each becomes an ASCII apostrophe in step 5.vii.
const QUOTERS_PAT = /["\u2018\u2019\u201C\u201D\u00AB\u00BB\u2039\u203A\u3008\u3009\u300A\u300B\u300C\u300D]/g;

// Any run of whitespace, captured as group 1; inspected in step 5.viii.
const ANY_WHITESPACE_PAT = /(\s+)/g;

// [symbol, ASCII replacement] pairs applied in step 5.ix.
// Code points above U+FFFF must use the \u{...} escape: "\u1f60A" would be
// parsed as U+1F60 followed by the letter "A" and would never match the emoji.
const AUTOCORRECT_PAIRS = [
  ["\u{1F60A}", ":-)"],
  ["\u{1F610}", ":-|"],
  ["\u2639", ":-("],
  ["\u{1F603}", ":-D"],
  ["\u{1F61D}", ":-p"],
  ["\u{1F632}", ":-o"],
  ["\u{1F609}", ";-)"],
  ["\u2764", "<3"],
  ["\u{1F494}", "</3"],
  ["\u00a9", "(c)"],
  ["\u00ae", "(R)"],
  ["\u2022", "*"],
];

// [nose-less emoticon, canonical emoticon] pairs applied in step 5.x.
const ASCII_EMOJI_PAIRS = [
  [":)", ":-)"],
  [":|", ":-|"],
  [":(", ":-("],
  [":D", ":-D"],
  [":p", ":-p"],
  [":o", ":-o"],
  [";)", ";-)"],
];
/**
 * Canonicalizes text and returns the result encoded as UTF-8.
 *
 * The input is NFKC-normalized, ampersands become " and ", whitespace is
 * collapsed and trimmed, and punctuation/symbols are rewritten to ASCII
 * forms (steps 5.i-5.x below).
 *
 * @param {string} plaintext - Text to canonicalize.
 * @returns {Uint8Array} UTF-8 bytes of the canonicalized text.
 */
function algorithm_1_14(plaintext) {
  // A single punctuation character (Unicode general category P). No `g`
  // flag, so `.test()` keeps no state between calls.
  const punctuationPat = /\p{P}/u;

  // Replaces every literal occurrence of `needle`. Split/join needs no regex
  // escaping and never interprets `$` sequences in the replacement.
  const replaceLiteral = (text, needle, replacement) =>
    text.split(needle).join(replacement);

  // Step 4: Normalize whitespace.
  function step4(whitespace_anomalies) {
    return whitespace_anomalies
      .replace(SPECIALIZED_WHITESPACE_PAT, ' ')
      .trim()
      .replace(MULTI_WHITESPACE_PAT, ' ');
  }

  // Step 5: Replace punctuations and symbols.
  function step5(punct_anomalies) {
    // 5.i: exotic dashes -> "-"
    let out = punct_anomalies.replace(DASHCHARS_PAT, '-');
    // 5.ii: runs of hyphens -> "-"
    out = out.replace(MULTI_HYPHENS_PAT, '-');
    // 5.iii: CJK comma / full stop -> ASCII
    for (const [cjk, ascii] of CJK_PUNCT_PAIRS) {
      out = replaceLiteral(out, cjk, ascii);
    }
    // 5.iv: horizontal ellipsis -> "..."
    out = out.replace(/\u2026/g, '...');
    // 5.v: four or more dots -> "..."
    out = out.replace(LONG_DOTS_PAT, '...');
    // 5.vi: fraction slash -> "/"
    out = out.replace(/\u2044/g, '/');
    // 5.vii: quote-like characters -> "'"
    out = out.replace(QUOTERS_PAT, "'");
    // 5.viii: drop whitespace that touches punctuation on either side.
    // The run must be replaced with '' here; returning the captured group
    // would hand back the same whitespace and make this step a no-op.
    out = out.replace(ANY_WHITESPACE_PAT, (run, _captured, index, str) => {
      const end = index + run.length;
      // Neighbours are inspected as single UTF-16 code units.
      const followsPunct = index > 0 && punctuationPat.test(str[index - 1]);
      const precedesPunct = end < str.length && punctuationPat.test(str[end]);
      return followsPunct || precedesPunct ? '' : run;
    });
    // 5.ix: symbols and emoji -> ASCII equivalents
    for (const [autocorrect, ascii] of AUTOCORRECT_PAIRS) {
      out = replaceLiteral(out, autocorrect, ascii);
    }
    // 5.x: nose-less emoticons -> canonical form
    for (const [noncanonical, canonical] of ASCII_EMOJI_PAIRS) {
      out = replaceLiteral(out, noncanonical, canonical);
    }
    return out;
  }

  // Step 1: Assume input is already unicode, so no conversion needed.
  // Step 2: Normalize using NFKC.
  // Step 3: Replace ampersands with " and ".
  const withoutAmpersands = plaintext.normalize('NFKC').replace(AMPERS_PAT, ' and ');
  const canonical = step5(step4(withoutAmpersands));

  // Step 6: Convert to UTF-8 (JavaScript strings are UTF-16).
  // TextEncoder always produces UTF-8, so it takes no encoding argument.
  return new TextEncoder().encode(canonical);
}