Skip to content

Commit dec3158

Browse files
committed
Preprocessing UCD to JSON lines
1 parent dda5c8a commit dec3158

File tree

7 files changed

+103
-19
lines changed

7 files changed

+103
-19
lines changed

download.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,7 @@ else
7070
fi
7171

7272
cd ${TMP_DIR}
73-
unzip -j UCD.zip --exclude "ReadMe.txt"
73+
unzip -j UCD.zip -x ReadMe.txt -x emoji/ReadMe.txt
7474
unzip -j ucd.all.flat.zip
7575

76-
echo "INFO: converting ucd to json"
77-
yq -p=xml -o=json "${TMP_DIR}/ucd.all.flat.xml" > "${TMP_DIR}/ucd.all.flat.json"
78-
7976
echo "INFO: download complete at $(date -u +%Y-%m-%dT%H:%M:%SZ)"

local-deploy.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/usr/bin/env bash
#
# deploy the files to the local source repo
#
# Compresses ${ORIG_DIR}/ucd-lines.json into ${DEST_DIR}/ucd-lines.json.gz and
# copies ${ORIG_DIR}/Unihan.txt.gz next to it.
#
# Exit codes:
#   1 - the output directory (${ORIG_DIR}) does not exist
#   2 - the destination directory (${DEST_DIR}) does not exist
#

set -o errexit
set -o pipefail
set -o nounset

echo "INFO: local deploy starting at $(date -u +%Y-%m-%dT%H:%M:%SZ)"

ORIG_DIR=./output
if [ ! -d "${ORIG_DIR}" ]; then
    echo "ERROR: output files not created in ${ORIG_DIR}"
    exit 1
fi

DEST_DIR=~/workspace/ff/www.fileformat.info/src/main/java/org/unicode
# Test the destination itself: re-testing ORIG_DIR here would make this guard
# unreachable and let a missing destination surface only as a failed redirect.
if [ ! -d "${DEST_DIR}" ]; then
    echo "ERROR: destination directory not found ${DEST_DIR}"
    exit 2
fi

# --stdout leaves the uncompressed JSON in ORIG_DIR untouched and lets the
# compressed copy be written straight into the destination.
gzip \
    --stdout \
    "${ORIG_DIR}/ucd-lines.json" \
    > "${DEST_DIR}/ucd-lines.json.gz"

# -p preserves mode and timestamps of the already-compressed Unihan file.
cp -p "${ORIG_DIR}/Unihan.txt.gz" "${DEST_DIR}/Unihan.txt.gz"

echo "INFO: local deploy complete at $(date -u +%Y-%m-%dT%H:%M:%SZ)"

run.sh

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,20 @@ set -o errexit
77
set -o pipefail
88
set -o nounset
99

10-
1110
echo "INFO: run starting at $(date -u +%Y-%m-%dT%H:%M:%SZ)"
1211

12+
if [ ! -d "./output" ]; then
13+
echo "INFO: creating output directory ./output"
14+
mkdir -p ./output
15+
fi
16+
17+
if [ ! -d "./node_modules" ]; then
18+
echo "INFO: installing npm dependencies"
19+
npm install
20+
fi
21+
22+
1323
npx tsc
1424
node dist/main.js
1525

16-
echo "INFO: run complete at $(date -u +%Y-%m-%dT%H:%M:%SZ)"
26+
echo "INFO: run complete at $(date -u +%Y-%m-%dT%H:%M:%SZ)"

src/CodepointData.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
/**
 * Per-codepoint record assembled from the Unicode Character Database.
 *
 * The base record is built by XmlToMap from the flat UCD XML attributes;
 * other processors (e.g. ProcessIndexTxt) append to the optional lists.
 * Required fields come first, optional enrichment fields after.
 */
type CodepointData = {
    // --- always present ---

    /** UCD `age` attribute. */
    age: string;
    /** UCD `bc` attribute. */
    bidi: string;
    /** UCD `blk` attribute. */
    block: string;
    /** UCD `gc` attribute. */
    category: string;
    /** UCD `cp` attribute; also the key of the codepoint map. */
    code: string;
    /** UCD `ccc` attribute. */
    combine: string;
    /** UCD `na`, falling back to `na1`, then to the first name-alias. */
    name: string;
    /** UCD `sc` attribute. */
    script: string;
    /** UCD `na1` when non-empty, otherwise the same value as `name`. */
    title: string;

    // --- optional ---

    approximations?: string[];
    /** Keyed by "uppercase" / "lowercase" / "titlecase"; omitted when there are no mappings. */
    caseVariants?: Record<string, string[]>;
    comments?: string[];
    /** UCD `dm` attribute split on spaces; omitted when empty or "#". */
    decomposition?: string[];
    indexEntries?: string[];
    /** UCD `bmg` attribute; omitted when empty. */
    mirror?: string;
    notes?: string[];
    /** UCD `na1` attribute; omitted when empty. */
    oldname?: string;
    related?: string[];
    scriptExtensions?: string[];
    tags?: string[];
    variants?: string[];
};
1724

src/ProcessIndexTxt.ts

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,6 @@ async function ProcessIndexTxt( filePath: string, cpMap: {[code: string]: Codepo
3636
continue;
3737
}
3838

39-
if (cpData.name == indexEntry) {
40-
// console.log(`DEBUG: skipping index entry ${indexEntry} for code ${code} since it is the same as the name`);
41-
continue;
42-
}
43-
4439
if (!cpData.indexEntries) {
4540
cpData.indexEntries = [];
4641
}

src/ProcessScriptExtensionsTxt.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ async function ProcessScriptExtensionsTxt(
4040
endCode = startCode;
4141
}
4242

43-
console.log(`DEBUG: processing codepoints ${codes} (${startCode}-${endCode}) for scripts ${scripts.join(", ")}`);
43+
//console.log(`DEBUG: processing codepoints ${codes} (${startCode}-${endCode}) for scripts ${scripts.join(", ")}`);
4444

4545
for (let cp = startCode; cp <= endCode; cp++) {
4646
const codeHex = cp.toString(16).toUpperCase().padStart(4, "0");

src/XmlToMap.ts

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ async function XmlToMap(xmlPath: string) {
142142
var name = charData.na || charData.na1;
143143
if (!name && charData['name-alias']) {
144144
name = charData['name-alias'][0].alias;
145+
console.log(
146+
`WARN: codepoint ${charData.cp} missing name, using name-alias '${name}'`
147+
);
145148
}
146149

147150
if (name.endsWith('#')) {
@@ -185,19 +188,59 @@ async function XmlToMap(xmlPath: string) {
185188
console.log(`INFO: name-alias data: ${JSON.stringify(charData['name-alias'])}`);
186189
}
187190

191+
if (charData.JSN && charData.JSN.length > 0) {
192+
notes.push(`Hangul Syllable Type: ${charData.JSN}`);
193+
}
194+
195+
const caseVariants: { [key: string]: string[] } = {};
196+
if (charData.suc && charData.suc.length > 0 && charData.suc !== "#") {
197+
caseVariants["uppercase"] = [ charData.ucp ];
198+
}
199+
if (charData.slc && charData.slc.length > 0 && charData.slc !== "#") {
200+
caseVariants["lowercase"] = [ charData.lcp ];
201+
}
202+
if (charData.stc && charData.stc.length > 0 && charData.stc !== "#") {
203+
caseVariants["titlecase"] = [ charData.tcp ];
204+
}
205+
if (charData.uc && charData.uc.length > 0 && charData.uc !== "#") {
206+
caseVariants["uppercase"] = charData.uc.split(" ");
207+
}
208+
if (charData.lc && charData.lc.length > 0 && charData.lc !== "#") {
209+
caseVariants["lowercase"] = charData.lc.split(" ");
210+
}
211+
if (charData.tc && charData.tc.length > 0 && charData.tc !== "#") {
212+
caseVariants["titlecase"] = charData.tc.split(" ");
213+
}
214+
var decomposition: string[] | undefined = undefined;
215+
if (charData.dm && charData.dm.length > 0 && charData.dm !== "#") {
216+
decomposition = charData.dm.split(" ");
217+
}
218+
219+
if (charData.bpt && charData.bpt.length > 0 && charData.bpt !== 'n') {
220+
notes.push(`Bidi Paired Bracket Type: ${charData.bpt == 'o' ? 'Open' : 'Closed'}`);
221+
notes.push(`Bidi Paired Bracket: U+${charData.bpb}`);
222+
}
223+
188224
var cpData: CodepointData = {
189-
code: charData.cp,
190225
name,
226+
title: charData.na1 || name,
191227
age: charData.age,
228+
bidi: charData.bc,
192229
block: charData.blk,
230+
caseVariants: Object.keys(caseVariants).length > 0 ? caseVariants : undefined,
193231
category: charData.gc,
232+
code: charData.cp,
233+
combine: charData.ccc,
234+
decomposition,
235+
mirror: charData.bmg === '' ? undefined : charData.bmg,
236+
oldname: charData.na1 === '' ? undefined : charData.na1,
194237
script: charData.sc,
195238
tags,
196239
notes,
197240
comments: [],
198241
indexEntries: [],
199-
related : [],
200-
variants : [],
242+
related: [],
243+
variants: [],
201244
};
202245

203246
cpMap[charData.cp] = cpData;

0 commit comments

Comments
 (0)