forked from pingcap/docs-cn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_by_toc.py
executable file
·177 lines (150 loc) · 5.81 KB
/
merge_by_toc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
# coding: utf8
#
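# Generate an all-in-one Markdown file for ``docs-cn``.
# Tip: non-ASCII (e.g. Chinese) file names are not supported.
# If the table of contents references the same .md file (or a sub heading of
# it) more than once, only the first occurrence is used.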
from __future__ import print_function, unicode_literals
import re
import os
# Ordered work list filled in by stage 1. Each entry is a 3-tuple
# (kind, level, payload) where kind is 'FILE' (payload = path of a chapter),
# 'TOC' (payload = heading text) or 'RAW' (payload = text copied verbatim).
followups = []
# True while the stage-1 scan is between the first and the second "## " line
# of the entry file, i.e. inside the table-of-contents section.
in_toc = False
# Pieces of the final document; stage 4 joins them with newlines.
contents = []
# Markdown link `[text](target#fragment)`.
# Groups: 1 = text, 2 = target, 3 = optional "#fragment".
hyper_link_pattern = re.compile(r'\[(.*?)\]\((.*?)(#.*?)?\)')
# A list bullet (`-` / `+`) followed by a Markdown link.
# Groups: 1 = bullet, 2 = text, 3 = target, 4 = optional "#fragment".
toc_line_pattern = re.compile(r'([\-\+]+)\s\[(.*?)\]\((.*?)(#.*?)?\)')
# Markdown image `![alt](target)`. Groups: 1 = alt text, 2 = target.
image_link_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)')
# Optional indentation plus the bullet character(s), followed by one
# whitespace character. Group 1 = indentation + bullet.
level_pattern = re.compile(r'(\s*[\-\+]+)\s')
# Match every heading marker: a run of '#' at the very start of the text or
# right after a newline, followed by whitespace. re.MULTILINE is not set, so
# '^' only matches at the start of the string; the '\n#+' alternative covers
# all later lines. Group 1 includes that leading newline when present.
# NOTE: the name is misspelled ("patthern") but is referenced elsewhere in
# this file, so it must stay as is.
heading_patthern = re.compile(r'(^#+|\n#+)\s')
# File whose bullet list drives the whole merge.
entry_file = "TOC.md"
# stage 1, parse toc
#
# Scan the entry file and translate the bullet list found between its first
# and second "## " headings into the ordered `followups` work list:
#   ('FILE', level, path)  - a chapter file to inline,
#   ('TOC',  level, text)  - a plain heading (bullet without a local .md link),
#   ('RAW',  0,     text)  - everything after the TOC section, copied verbatim.
# The files are UTF-8 (they contain Chinese text), so the encoding is given
# explicitly instead of relying on the platform default.
with open(entry_file, encoding='utf-8') as fp:
    level = 0
    for line in fp:
        if not in_toc and line.startswith("## "):
            in_toc = True
            print("in toc")
        elif in_toc and line.startswith('## '):
            in_toc = False
            # the next "## " heading ends the TOC section
            break
        elif in_toc and not line.startswith('#') and line.strip():
            print(line)
            bullet = level_pattern.search(line)
            if bullet is None:
                # Not a list item (e.g. a stray paragraph inside the TOC
                # section): there is no nesting level to derive, skip it
                # instead of crashing with an IndexError.
                continue
            # Nesting level from the indentation width (two spaces per
            # level); `[:-1]` drops the bullet character itself.
            level = len(bullet.group(1)[:-1]) // 2 + 1
            matches = toc_line_pattern.findall(line)
            if matches:
                for match in matches:
                    fpath = match[2]
                    if fpath.endswith('.md'):
                        # TOC paths are written from the repository root with
                        # a leading slash; strip it only when it is really
                        # there so "foo.md" does not become "oo.md".
                        if fpath.startswith('/'):
                            fpath = fpath[1:]
                        key = ('FILE', level, fpath)
                        if key not in followups:
                            print(key)
                            followups.append(key)
                    elif fpath.startswith('http'):
                        # external link: keep the line as a heading, minus
                        # the list marker (`- ` / `+ `)
                        followups.append(('TOC', level, line.strip()[2:]))
            else:
                # bullet without a link: a pure section title
                name = line.strip().split(None, 1)[-1]
                key = ('TOC', level, name)
                if key not in followups:
                    print(key)
                    followups.append(key)
    # Whatever follows the TOC section (the overview part) is inserted
    # verbatim right after the first entry.
    followups.insert(1, ("RAW", 0, fp.read()))
for k in followups:
    print(k)
# stage 2, get file heading
#
# Map every chapter path to the anchor of its first heading so that links
# between chapters can later be rewritten to in-document "#anchor" links.
file_link_name = {}
title_pattern = re.compile(r'(^#+)\s.*')
for tp, lv, f in followups:
    if tp != 'FILE':
        continue
    # Reset for every file: a chapter without any heading must not inherit
    # the heading of the previously processed chapter.
    tag = ""
    try:
        # `with` closes the handle; iterating lazily stops reading as soon as
        # the first heading is found.
        with open(f, encoding='utf-8') as chapter_fp:
            for line in chapter_fp:
                if line.startswith("#"):
                    tag = line.strip()
                    break
    except (OSError, UnicodeError) as e:
        # best effort: an unreadable chapter simply gets an empty anchor
        print(e)
    if tag.startswith('# '):
        tag = tag[2:]
    elif tag.startswith('## '):
        tag = tag[3:]
    # Simplified anchor slug: lower-case, spaces replaced by dashes.
    file_link_name[f] = tag.lower().replace(' ', '-')
print(file_link_name)
def replace_link_wrap(chapter, name):
    """Rewrite the Markdown links of one chapter for the merged document.

    Args:
        chapter: full Markdown text of the chapter.
        name: path of the chapter file, in the same form as the keys of
            ``file_link_name`` (relative to the repository root).

    Returns:
        The chapter text in which
        * links to other ``.md`` files point to an in-document anchor
          (``#fragment`` when the link carries one, otherwise the anchor of
          the target file's first heading taken from ``file_link_name``),
        * links to pictures point into ``./media/``,
        * every other link, every external URL and every ``.md`` link that
          cannot be resolved is left exactly as written.

    Note: only the anchor is kept, so identical headings in different
    documents collide.
    """
    chapter_dir = os.path.dirname(name)
    image_suffixes = ('.png', '.jpeg', '.svg', '.gif', '.jpg')

    def resolve(link):
        # "/a/b.md" is relative to the repository root (the TOC keys have
        # their leading slash stripped); "b.md", "./b.md" and "../b.md" are
        # relative to the directory that contains the current chapter.
        if link.startswith('/'):
            return os.path.normpath(link.lstrip('/'))
        return os.path.normpath(os.path.join(chapter_dir, link))

    def replace_link(match):
        full = match.group(0)
        link_name = match.group(1)
        link = match.group(2)
        frag = match.group(3)
        if '://' in link:
            # external URL: never turn it into a local anchor or media path
            return full
        if link.endswith('.md') or '.md#' in link:
            if not frag:
                anchor = file_link_name.get(resolve(link))
                if anchor is None:
                    # Target is not part of the merged document; keep the
                    # original link rather than emitting "[text](None)".
                    return full
                frag = '#' + anchor
            return '[%s](%s)' % (link_name, frag)
        elif link.endswith(image_suffixes):
            # special handling for pictures: normalise any "../../media/"
            # style prefix to "./media/"
            img_link = re.sub(r'[\.\/]*media\/', './media/', link)
            return '[%s](%s)' % (link_name, img_link)
        else:
            return full

    return hyper_link_pattern.sub(replace_link, chapter)
def replace_heading_func(diff_level=0):
    """Build a ``re.sub`` callback that shifts Markdown heading levels.

    Args:
        diff_level: number of levels to add to (positive) or remove from
            (negative) every matched heading marker. ``0`` leaves the
            matched text untouched.

    Returns:
        A function taking a match whose text contains the run of ``#``
        characters (optionally preceded by a newline and followed by one
        whitespace character) and returning the replacement marker. For a
        non-zero shift the replacement always has the form
        ``"\\n" + "#" * new_level + " "``.
    """
    def replace_heading(match):
        if diff_level == 0:
            return match.group(0)
        # Never go below one '#': with a negative shift a shallow heading
        # would otherwise lose all its '#' and silently become plain text.
        new_level = max(1, match.group(0).count('#') + diff_level)
        return '\n' + '#' * new_level + ' '
    return replace_heading
def replace_img_link(match):
    """``re.sub`` callback that points ``.png`` images into ``./media/``.

    Args:
        match: a match of a Markdown image ``![alt](target)`` where group 1
            is the alt text and group 2 the target.

    Returns:
        ``![alt](./media/<file name>)`` for ``.png`` targets. Any other
        image is returned unchanged; returning ``None`` here would make
        ``re.sub`` drop the image from the document.
    """
    full = match.group(0)
    link_name = match.group(1)
    link = match.group(2)
    if link.endswith('.png'):
        fname = os.path.basename(link)
        return '![%s](./media/%s)' % (link_name, fname)
    return full
# stage 3, concat files
#
# Turn the work list into the pieces of the final document: headings for
# 'TOC' entries, verbatim text for 'RAW' entries, and for 'FILE' entries the
# chapter text with rewritten links and re-levelled headings.
for type_, level, name in followups:
    if type_ == 'TOC':
        contents.append("\n{} {}\n".format('#' * level, name))
    elif type_ == 'RAW':
        contents.append(name)
    elif type_ == 'FILE':
        # Only the read can legitimately fail (missing or undecodable file);
        # keep the best-effort behaviour for that and nothing else.
        try:
            with open(name, encoding='utf-8') as fp:
                chapter = fp.read()
        except (OSError, UnicodeError) as e:
            print(e)
            print("generate file error: ignore!")
            continue
        chapter = replace_link_wrap(chapter, name)
        # fix heading level: shift all headings so that the chapter's first
        # heading lands on the level given by the TOC. A chapter without any
        # heading is included unchanged instead of being dropped.
        first_heading = heading_patthern.search(chapter)
        if first_heading is None:
            diff_level = 0
        else:
            diff_level = level - first_heading.group(1).count('#')
        print(name, type_, level, diff_level)
        chapter = heading_patthern.sub(replace_heading_func(diff_level), chapter)
        contents.append(chapter)
        contents.append('')  # add an empty line
# stage 4, generate final doc.md
#
# The merged document contains non-ASCII text, so it is written as UTF-8
# explicitly rather than in whatever the platform's default encoding is.
target_doc_file = 'doc.md'
with open(target_doc_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(contents))