-
Notifications
You must be signed in to change notification settings - Fork 10
/
generate_json_from_html.js
392 lines (366 loc) · 12.3 KB
/
generate_json_from_html.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
// @ts-nocheck
var fs = require('fs');
var html2json = require('himalaya');
const { exit } = require('process');
const { assert } = require('console');
// Directory whose entries are listed and read as HTML input by the driver code below.
const inputPath = "D:/网站爬取/www.daorenjia.com"
// Root directory under which the generated JSON files are written.
const outputPath = "./json"
/**
 * Maps a numeric id to the name of its first-level category.
 *
 * The categories are consecutive ranges checked in ascending order, so only
 * the upper bound of each range is tested. There is no lower bound: any id
 * that compares `<= 11` (including 0 and negative numbers) lands in the
 * first category.
 *
 * @param {number} id - Category number (the driver passes the first number
 *   parsed from the file name).
 * @returns {string} Name of the first-level category.
 * @throws {Error} When `id` is greater than 31 or does not compare as a
 *   number (e.g. NaN).
 */
function getDmClass_L1(id) {
    // [inclusive upper bound, category name], in ascending order.
    const upperBounds = [
        [11, "三洞真经"],
        [20, "四辅真经"],
        [24, "道教论集"],
        [26, "道法众术"],
        [29, "道教科仪"],
        [31, "道史仙传"],
    ];
    for (const [maxId, name] of upperBounds) {
        if (id <= maxId) {
            return name;
        }
    }
    // Throw a real Error (not a bare string) so the failure carries a stack trace.
    throw new Error("编号不存在!");
}
// Second-level category names keyed by numeric id (8..31).
// A Map with number keys keeps the strict matching of the former switch:
// the string "8" does not match the number 8.
const DM_CLASS_L2_NAMES = new Map([
    [8, "洞真上清经"],
    [9, "洞玄灵宝经"],
    [10, "洞神三皇经"],
    [11, "三洞经教"],
    [12, "太平部诸经"],
    [13, "太玄部经诀"],
    [14, "正一部经籙"],
    [15, "道德真经"],
    [16, "四子真经"],
    [17, "黄帝阴符经"],
    [18, "道教易学"],
    [19, "太清金丹经"],
    [20, "太清摄养经"],
    [21, "诸子文集"],
    [22, "道学论著"],
    [23, "全真文集"],
    [24, "道教类书"],
    [25, "道法诸经"],
    [26, "道法总集"],
    [27, "科戒威仪"],
    [28, "灵宝诸斋仪"],
    [29, "灯仪法忏章表"],
    [30, "神仙高道传"],
    [31, "仙境名山志"],
]);

/**
 * Maps a numeric id to the name of its second-level category.
 *
 * @param {number} id - Category number; must be one of the integers 8..31.
 * @returns {string} Name of the second-level category.
 * @throws {Error} When `id` is not a known category number (including
 *   numeric strings, which are not converted).
 */
function getDmClass_L2(id) {
    const name = DM_CLASS_L2_NAMES.get(id);
    if (name === undefined) {
        // Throw a real Error (not a bare string) so the failure carries a stack trace.
        throw new Error("编号不存在!");
    }
    return name;
}
/**
 * Writes `buffer` to the file at `path`, first creating any missing parent
 * directories.
 *
 * Only "/" is treated as a directory separator. When `path` has no directory
 * part (no "/" at all, or only a leading "/"), no directory is created and
 * the file is written directly; previously this case produced an empty
 * directory name and failed in `mkdirSync`.
 *
 * @param {string} path - Destination file path using "/" separators.
 * @param {string|Buffer} buffer - Data handed to `fs.writeFileSync`.
 */
const writeFileSyncRecursive = function (path, buffer) {
    const lastSlash = path.lastIndexOf("/");
    if (lastSlash > 0) {
        fs.mkdirSync(path.substring(0, lastSlash), { recursive: true });
    }
    fs.writeFileSync(path, buffer);
};
// NOTE: the built-in Array.prototype.indexOf is intentionally no longer
// overridden. The former replacement compared with `==` and ignored the
// `fromIndex` argument, which changed the behaviour of every array in the
// process (including arrays used by third-party modules).

/**
 * Array.prototype.remove(val): removes the first element strictly equal to
 * `val` from the array, in place. Does nothing when `val` is not present.
 *
 * Defined as a non-enumerable property so that it does not show up in
 * `for...in` loops over arrays.
 */
Object.defineProperty(Array.prototype, "remove", {
    value: function remove(val) {
        const index = this.indexOf(val);
        if (index > -1) {
            this.splice(index, 1);
        }
    },
    writable: true,
    configurable: true,
    enumerable: false,
});
// Module-level state shared between the parse* functions and the driver code.
// NOTE: fileCnt is not referenced anywhere else in the visible code.
var fileCnt = 0;
// Set by parseSingle / parseMultiContent after passing through commonReplace.
var description;
// Child nodes of the content container selected by parseSingle / parseMultiContent.
var paragraphs_raw;
// Cleaned paragraph strings collected by parseSingle / parseMultiContent.
var paragraphs;
// Set by parseSingle / parseMultiContent after passing through commonReplace.
var title;
// Chapter number detected by parseMultiIndexChar; the driver resets it to 0
// after each chapter file, and a value > 0 makes parseMultiIndexChar stop looking.
var multiIndexNum = 0;
/**
 * Normalises a title/description string.
 *
 * - Every opening `<sup>`, `<sub>`, `<up>`, `<ub>` tag becomes "(" and every
 *   matching closing tag becomes ")".
 * - All whitespace is removed.
 * - Every "黃" is replaced by "黄".
 *
 * All replacements are global; previously only the first tag and the first
 * "黃" were replaced because the patterns lacked the `g` flag.
 *
 * @param {string} str - Raw text.
 * @returns {string} Normalised text.
 */
function commonReplace(str) {
    return str
        .replace(/<(?:sup|sub|up|ub)>/g, "(")
        .replace(/<\/(?:sup|sub|up|ub)>/g, ")")
        // Strip whitespace.
        .replace(/\s/g, "")
        // Character-variant fix-up.
        .replace(/黃/g, "黄");
}
/**
 * Collects the text contained in a parsed HTML node and all of its
 * descendants, concatenated in document order.
 *
 * @param {object} jsonIn - A node from the parsed HTML tree. The function
 *   reads `tagName`, `type`, `content` and `children`.
 * @returns {string} Concatenated text. `img` elements and nodes that have
 *   no `children` array (for example comment nodes) contribute "".
 */
function getContentRecursive(jsonIn) {
    // Images carry no text.
    if (jsonIn.tagName === "img") {
        return "";
    }
    if (jsonIn.type === "text") {
        return jsonIn.content;
    }
    // Nodes that are neither text nor containers have no `children`;
    // reading `.children.length` on them used to throw a TypeError.
    if (!Array.isArray(jsonIn.children)) {
        return "";
    }
    let text = "";
    for (const child of jsonIn.children) {
        text += getContentRecursive(child);
    }
    return text;
}
/**
 * Extracts title, description and body paragraphs from the parsed tree of a
 * stand-alone page and stores them in the module-level variables `title`,
 * `description`, `paragraphs_raw` and `paragraphs`.
 *
 * The nodes are addressed by fixed child indexes, so the function only works
 * for pages with exactly the expected layout.
 * NOTE(review): `title` is interpolated into a RegExp without escaping; a
 * title containing regex metacharacters changes what the pattern matches.
 *
 * @param {Array} jsonIn - Top-level node list returned by the HTML parser.
 */
function parseSingle(jsonIn) {
    const infoNodes = jsonIn[2].children[1].children;
    title = infoNodes[11].attributes[1].value;
    title = commonReplace(title);
    description = infoNodes[13].attributes[1].value;
    description = commonReplace(description);

    const bodyNodes = jsonIn[2].children[3].children;
    paragraphs_raw = bodyNodes[1].children[7].children[3].children[1].children;
    paragraphs = [];

    paragraphs_raw.forEach((node) => {
        if (node.tagName !== "p") {
            return;
        }
        const rawText = getContentRecursive(node);
        // Skip the introduction paragraph embedded in the body text; the
        // short gap limit keeps accidental matches to a minimum.
        const introPattern = new RegExp("经名.{0,5}" + title);
        if (introPattern.test(rawText)) {
            return;
        }
        const cleaned = rawText.replace(/ |“|”|\r\n|·/ig, "");
        if (cleaned !== "" && cleaned !== title) {
            paragraphs.push(cleaned);
        }
    });
}
/**
 * Looks for a chapter number written in Chinese numerals in `str` and, if
 * one is found, stores its value in the module-level `multiIndexNum`.
 *
 * Only the first run (1-10 characters) of 一..九, 十, 百, 千 and 零 is
 * considered. A unit with no digit directly in front of it counts once, so
 * "十五" is 15 and "一百十五" is 115 (the latter used to yield 105).
 *
 * NOTE(review): the search is not anchored, so a numeral character that
 * appears anywhere in the text (not only in a chapter heading) is picked up.
 *
 * @param {string} str - Text of one paragraph.
 * @returns {number} 1 when nothing was done (no numeral in `str`, or
 *   `multiIndexNum` is already > 0); 0 when `multiIndexNum` was set from
 *   `str`.
 */
function parseMultiIndexChar(str) {
    // Matches 一..九, 十, 百, 千 and 零.
    const numeralRun = /[一二三四五六七八九十百千零]{1,10}/.exec(str);
    // No numeral, or the chapter number is already known.
    if (!numeralRun || multiIndexNum > 0) {
        return 1;
    }

    const digitValues = {
        零: 0, 一: 1, 二: 2, 三: 3, 四: 4, 五: 5, 六: 6, 七: 7, 八: 8, 九: 9,
    };
    const unitValues = { 十: 10, 百: 100, 千: 1000 };

    let total = 0;
    let pendingDigit = 0;
    let hasPendingDigit = false;
    for (const ch of numeralRun[0]) {
        const digit = digitValues[ch];
        if (digit !== undefined) {
            pendingDigit = digit;
            hasPendingDigit = true;
            continue;
        }
        // The pattern only admits digits and units, so `ch` is a unit here.
        // A bare unit ("十", the "十" in "一百十五") stands for one of it.
        total += (hasPendingDigit ? pendingDigit : 1) * unitValues[ch];
        pendingDigit = 0;
        hasPendingDigit = false;
    }
    // A trailing digit is the ones place.
    total += pendingDigit;

    multiIndexNum = total;
    return 0;
}
/**
 * Extracts title, description and body paragraphs from the parsed tree of
 * one chapter page and stores them in the module-level variables `title`,
 * `description`, `paragraphs_raw` and `paragraphs`.
 *
 * Every `<p>` text is first offered to `parseMultiIndexChar`; a paragraph
 * that it consumes as the chapter number (return value 0) is not kept.
 * The nodes are addressed by fixed child indexes, so the function only works
 * for pages with exactly the expected layout.
 * NOTE(review): `title` is interpolated into two RegExps without escaping.
 *
 * @param {Array} jsonIn - Top-level node list returned by the HTML parser.
 */
function parseMultiContent(jsonIn) {
    const infoNodes = jsonIn[2].children[1].children;
    title = infoNodes[9].attributes[1].value;
    title = commonReplace(title);
    description = infoNodes[11].attributes[1].value;
    description = commonReplace(description);

    paragraphs_raw = jsonIn[2].children[3].children[5].children[3].children;
    paragraphs = [];

    paragraphs_raw.forEach((node) => {
        if (node.tagName !== "p") {
            return;
        }
        const rawText = getContentRecursive(node);
        // Introduction paragraph embedded in the body text.
        const introPattern = new RegExp("经名.{0,5}" + title);
        // Closing line at the end of the body text.
        const closingPattern = new RegExp(title + ".{0,5}竟");
        // Must run for every paragraph: it records the chapter number.
        if (!parseMultiIndexChar(rawText)) {
            return;
        }
        if (introPattern.test(rawText) || closingPattern.test(rawText)) {
            return;
        }
        const cleaned = rawText.replace(/ |“|”|\r\n|·/ig, "");
        if (cleaned !== "" && cleaned !== title) {
            paragraphs.push(cleaned);
        }
    });
}
//////////////////////////////
// Main script starts here: split the directory listing into three groups.
//   multiContentList - chapter files of multi-chapter texts ("daoz<a>-<b>-<c>")
//   multiIndexList   - their index pages ("daozang<a>-<b>"), currently unused
//   fileList         - everything else whose name contains "dao"
var fileList = fs.readdirSync(inputPath).sort();
var fileList_copy = fileList.slice();
var multiContentList = [];
var multiIndexList = [];
fileList_copy.forEach(fileName => {
    const chapterMatch = /daoz([0-9]*)-([0-9]*)-([0-9]*)/.exec(fileName);
    if (!chapterMatch) {
        return;
    }
    multiContentList.push(fileName);
    const indexName = "daozang" + chapterMatch[1] + "-" + chapterMatch[2];
    // The listing is sorted, so chapters of one text are adjacent and
    // comparing with the last entry is enough to avoid duplicates.
    if (multiIndexList[multiIndexList.length - 1] !== indexName) {
        multiIndexList.push(indexName);
    }
});
// One pass with a Set replaces the repeated in-place removals, so this step
// no longer depends on a patched Array.prototype.
const excludedNames = new Set(multiContentList.concat(multiIndexList));
fileList = fileList.filter(fileName => fileName.indexOf("dao") >= 0 && !excludedNames.has(fileName));
// Process the chapter files. Chapters of the same text are adjacent after
// sorting; they are accumulated in `jsonOut` and written out as one file as
// soon as a chapter of the next text shows up.
multiContentList = multiContentList.sort();
var lastProceedIndexName = "";
var lastFilePath = "";
var jsonOut = {};
multiContentList.forEach(fileName => {
    const nameParts = /daoz([0-9]*)-([0-9]*)-([0-9]*)/.exec(fileName);
    const indexName = "daozang" + nameParts[1] + "-" + nameParts[2];
    const htmlFile = fs.readFileSync(inputPath + "/" + fileName, { encoding: 'utf8' });
    const jsonIn = html2json.parse(htmlFile);

    const startsNewText = indexName !== lastProceedIndexName;
    if (startsNewText) {
        // Flush the previous text, unless this is the very first chapter.
        if (lastProceedIndexName !== "") {
            writeFileSyncRecursive(lastFilePath, JSON.stringify(jsonOut, null, 4));
            console.log(lastFilePath);
        }
        jsonOut = {};
        lastProceedIndexName = indexName;
    }

    // Fills the module-level title / description / paragraphs / multiIndexNum.
    parseMultiContent(jsonIn);

    if (startsNewText) {
        // Title, description and output path come from the first chapter of a text.
        const class1 = getDmClass_L1(Number(nameParts[1]));
        const class2 = getDmClass_L2(Number(nameParts[1]));
        lastFilePath = outputPath + "/" + class1 + "/" + class2 + "/" + title + "_multi.json";
        jsonOut.title = title;
        jsonOut.description = description;
        jsonOut.chapters = [];
    }

    jsonOut.chapters.push({ index: multiIndexNum, paragraphs: paragraphs });
    // Reset so the next chapter file detects its own number.
    multiIndexNum = 0;
});
// Flush the last text. Skipped when there were no chapter files at all,
// in which case there is no output path to write to.
if (lastFilePath !== "") {
    writeFileSyncRecursive(lastFilePath, JSON.stringify(jsonOut, null, 4));
}
jsonOut = {};
// Convert every remaining stand-alone page into its own JSON file.
for (const fileName of fileList) {
    const htmlFile = fs.readFileSync(`${inputPath}/${fileName}`, { encoding: 'utf8' });
    // Some documents are only offered as PDF; skip those pages.
    if (htmlFile.indexOf("PDF版") >= 0) {
        continue;
    }
    const jsonIn = html2json.parse(htmlFile);
    // Fills the module-level title / description / paragraphs.
    parseSingle(jsonIn);

    // The first number in the file name selects both category levels.
    const nameParts = /daozang([0-9]*)-([0-9]*)/g.exec(fileName);
    const categoryId = Number(nameParts[1]);
    const class1 = getDmClass_L1(categoryId);
    const class2 = getDmClass_L2(categoryId);

    const jsonOut = { title: title, description: description, paragraphs: paragraphs };
    const outFileName = `${outputPath}/${class1}/${class2}/${title}.json`;
    writeFileSyncRecursive(outFileName, JSON.stringify(jsonOut, null, 4));
    console.log(outFileName);
}