-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
unicodedomino_kernel_better_decode.def
159 lines (150 loc) · 3.19 KB
/
unicodedomino_kernel_better_decode.def
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
% -*- mode: tex -*-
%-
% See unicodedomino.sty for copyright and licence terms. Furthermore
% this file is dual-licenced under the LPPL version 1.3c or later.
%-
% Fix check for illegal sequences to fail overlong encoded sequences
% as well as codepoints outside of the UCS range [0;10FFFF]; improve
% UTF-8 decoding with fixed-up check code and a safer UTF-8 decoder.
% This is https://github.com/latex3/latex2e/pull/83 (rejected).
% New check for illegal sequences.
%   #1  whatever precedes the first colon (ignored)
%   #2  first octet of the sequence
%   #3  all remaining octets, up to the terminating \empty
% Expands to nothing if the sequence is acceptable and to `1' if it
% is not; the callers evaluate that with \if\relax<expansion>\relax.
% Accepted sequences:
%   "00.."7F
%   "C2.."DF  "80.."BF
%   "E0       "A0.."BF  "80.."BF
%   "E1.."EF  "80.."BF  "80.."BF
%   "F0       "90.."BF  "80.."BF  "80.."BF
%   "F1.."F3  "80.."BF  "80.."BF  "80.."BF
%   "F4       "80.."8F  "80.."BF  "80.."BF
% NOTE(review): "ED "A0.."BF (the UTF-16 surrogate range) passes this
% check; confirm whether that is intended.
% At least one octet must follow the colon: with none, #2 would grab
% the \empty delimiter itself.
\gdef\UTFviii@checkseq#1:#2#3\empty{%
  \ifnum`#2<"80 %
    % ASCII octet: valid only if nothing follows it
    \ifx\empty#3\empty%
    \else%
      1%
    \fi%
  \else%
    \ifx\empty#3\empty%
      % an octet >= "80 with nothing behind it can never be valid;
      % rejecting it here also keeps the helpers below from taking
      % the \empty delimiter as their octet argument
      1%
    \else%
      \ifnum`#2<"C2 %
        % "80.."BF stray trail octet, "C0/"C1 overlong lead octet
        1%
      \else%
        \ifnum`#2<"E0 %
          % one "80.."BF
          \UTFviii@check@one#3\empty%
        \else%
          \ifnum`#2<"E1 %
            % "A0.."BF + one "80.."BF
            \UTFviii@check@two"A0.#3\empty%
          \else%
            \ifnum`#2<"F0 %
              % two "80.."BF
              \UTFviii@check@two"80.#3\empty%
            \else%
              \ifnum`#2<"F1 %
                % "90.."BF + two "80.."BF
                \UTFviii@check@three"90."BF.#3\empty%
              \else%
                \ifnum`#2<"F4 %
                  % three "80.."BF
                  \UTFviii@check@three"80."BF.#3\empty%
                \else%
                  \ifnum`#2<"F5 %
                    % "80.."8F + two "80.."BF
                    \UTFviii@check@three"80."8F.#3\empty%
                  \else%
                    % "F5.."FF never occur in UTF-8
                    1%
                  \fi%
                \fi%
              \fi%
            \fi%
          \fi%
        \fi%
      \fi%
    \fi%
  \fi%
}%
% Validate the final trail octet of a sequence.
%   #1  the octet to check, must lie in "80.."BF
%   #2  anything between that octet and the \empty delimiter; must be
%       empty, i.e. no surplus octets are allowed
% Expands to `1' if either condition is violated, to nothing otherwise.
\gdef\UTFviii@check@one#1#2\empty{%
  \ifx\empty#2\empty%
    % no surplus octets: only the range of #1 is left to test
    \ifnum`#1>"BF %
      1%
    \else%
      \ifnum`#1<"80 %
        1%
      \fi%
    \fi%
  \else%
    % surplus octets after the last expected one
    1%
  \fi%
}%
% Validate the second-to-last trail octet, then hand over the rest.
%   #1  lowest value accepted for that octet, written as a TeX number
%       followed by a period (the callers in this file pass "80 or "A0)
%   #2  the octet to check, must lie in #1.."BF
%   #3  what follows up to \empty; must not be empty and is passed on
%       to \UTFviii@check@one
% Expands to `1' if #3 is empty or #2 is out of range, otherwise to
% whatever \UTFviii@check@one yields for #3.
\gdef\UTFviii@check@two#1.#2#3\empty{%
  \ifx\empty#3\empty%
    % sequence ends one octet early
    1%
  \else%
    \ifnum`#2>"BF %
      1%
    \else%
      \ifnum`#2<#1 %
        1%
      \else%
        \UTFviii@check@one#3\empty%
      \fi%
    \fi%
  \fi%
}%
% Validate the third-to-last trail octet, then hand over the rest.
%   #1  lowest value accepted for that octet  (TeX number, then `.')
%   #2  highest value accepted for that octet (TeX number, then `.')
%   #3  the octet to check, must lie in #1..#2
%   #4  what follows up to \empty; must not be empty and is passed on
%       to \UTFviii@check@two with a lower bound of "80
% Expands to `1' if #4 is empty or #3 is out of range, otherwise to
% whatever \UTFviii@check@two yields for #4.
\gdef\UTFviii@check@three#1.#2.#3#4\empty{%
  \ifx\empty#4\empty%
    % nothing follows the first trail octet
    1%
  \else%
    \ifnum`#3>#2 %
      1%
    \else%
      \ifnum`#3<#1 %
        1%
      \else%
        \UTFviii@check@two"80.#4\empty%
      \fi%
    \fi%
  \fi%
}%
% changed calling API for \UTFviii@checkseq (empty+relax)
% Dispatch on a control sequence that stands for an octet sequence.
%   #1  a single control sequence; \string#1 has to contain a colon
%       followed by the octets, because that is what the parameter
%       text of \UTFviii@checkseq (#1:#2#3\empty) splits on
% If #1 has a meaning other than \relax it is simply executed.
% Otherwise the octets are validated: a well-formed sequence ends in
% \UTFviii@undefined@err{#1}, an ill-formed one in the "Invalid UTF-8
% byte sequence" package error.
% NOTE(review): this macro is set with \def while the other macros in
% this file use \gdef -- confirm that the difference is intended.
\def\UTFviii@defined#1{%
\ifx#1\relax%
% \UTFviii@checkseq expands to nothing for a valid sequence, so \if
% then compares \relax with the trailing \relax (true); for an invalid
% one it compares \relax with `1' (false)
\if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax%
\UTFviii@undefined@err{#1}% from v1.2a 2018/03/24
% not needed in unicodedomino_compat.def though because the
% \UTFviii@defined macro is reimplemented by unicodedomino.sty
\else%
\PackageError{inputenc}{Invalid UTF-8 byte sequence}%
\UTFviii@invalid@help
\fi%
\else%
% \expandafter closes the conditional before #1 is carried out
\expandafter#1%
\fi%
}%
% Decode octets to codepoint number, safer.
%   #1  the octets of one UTF-8 sequence, delimited by \relax
% Expands to the decimal digits of the \numexpr that \UTFviii@decode
% builds from the octets (2097151 if the sequence is rejected there).
% The expression is closed with an explicit \relax, which \numexpr
% removes; without it \numexpr would go on expanding the tokens that
% follow the call while looking for a further operator, and would
% absorb them if they happened to continue the expression.
\gdef\decode@UTFviii#1\relax{%
\the\numexpr(\UTFviii@decode0:#1\relax)\relax%
}%
% safer decode, returns "1FFFFF for illegal sequences
%   #1  `<prefix>:<octets>', delimited by \relax; \decode@UTFviii
%       passes `0' as the prefix.  Only the first token of #1 is hit
%       by \string, the octets are handed on as they are.
% Expands to a fragment meant to be evaluated inside \numexpr: the
% arithmetic produced by \UTFviii@dec@lead if \UTFviii@checkseq
% accepts the sequence (i.e. expands to nothing), and the constant
% 2097151 (= "1FFFFF) if it does not.
\gdef\UTFviii@decode#1\relax{%
\if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax%
\UTFviii@dec@lead#1\relax%
\else%
% sentinel for rejected sequences
2097151%
\fi%
}%
% Start the arithmetic for one sequence from its lead octet.
%   #1  whatever precedes the first colon (ignored)
%   #2  lead octet; the caller has validated the sequence, so it is
%       one of "00.."7F or "C2.."F4
%   #3  trail octets (empty for a single-octet sequence), up to \relax
% A single octet expands to its own character number.  Otherwise one
% opening parenthesis per trail octet is emitted together with the
% payload bits of the lead octet, and \UTFviii@dec@trail supplies the
% matching closing parentheses while folding in the trail octets.
\gdef\UTFviii@dec@lead#1:#2#3\relax{%
  \ifnum`#2>"7F %
    \ifnum`#2>"EF %
      % four-octet sequence
      (((`#2-"F0%
    \else%
      \ifnum`#2>"DF %
        % three-octet sequence
        ((`#2-"E0%
      \else%
        % two-octet sequence
        (`#2-"C0%
      \fi%
    \fi%
    \UTFviii@dec@trail#3\relax%
  \else%
    % single octet
    `#2%
  \fi%
}%
% Fold the trail octets into the running expression, one per step.
%   #1  current trail octet
%   #2  trail octets still to come (possibly none), up to \relax
% Every step closes one pending parenthesis, multiplies what has been
% accumulated so far by 64 and adds the low six bits of #1; the macro
% then calls itself for as long as #2 is not empty.
\gdef\UTFviii@dec@trail#1#2\relax{%
  )*64+(`#1-"80)%
  \ifx\relax#2\relax%
    % no octets left: recursion ends here
  \else%
    \UTFviii@dec@trail#2\relax%
  \fi%
}%