Skip to content

Commit 830f180

Browse files
Janardhanam
authored and committed
Converts as many files as are passed as arguments
Skips already-converted files; progress bar added; UTF-8 problem fixed (for common characters)
1 parent 1efb578 commit 830f180

File tree

1 file changed

+84
-63
lines changed

1 file changed

+84
-63
lines changed

HTML-Beauti.py

Lines changed: 84 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3,97 +3,118 @@
33

44
def file_replace_text(fname, toreplace, replacement):
55
#print("opening " + fname)
6-
with open(fname, 'r', encoding='ANSI') as file:
6+
with open(fname, 'r', encoding='utf-8') as file:
77
filedata = file.read()
88
# Replace the text in file
99
filedata = filedata.replace(toreplace, replacement)
1010
# Write the file out again
11-
with open(fname, 'w') as file:
11+
with open(fname, 'w', encoding='utf-8') as file:
1212
file.write(filedata)
1313

1414
def main():
1515
if len(sys.argv) <= 1:
16-
KBID = str(input("Enter KB number to beautify: "))
17-
else:
16+
print("No parameters detected. Aborting.")
17+
input("Press Enter to continue...")
18+
elif len(sys.argv) == 1:
1819
#print("ARG detected: "+sys.argv[1])
19-
KBID = sys.argv[1]
20-
20+
kbarg = sys.argv[1]
21+
else:
22+
kbarg = sys.argv[1:]
23+
2124
dirname = os.path.dirname(__file__)
2225
# print(dirname)
23-
folderpath = os.path.join(dirname, KBID)
24-
# print(folderpath)
2526

26-
# list all .html files in chosen directory (there should only be one)
27-
os.chdir(folderpath)
28-
for file in glob.glob('*.html'):
29-
filename = file
30-
os.chdir('..')
27+
for KBID in kbarg:
28+
print('Beauti.pying ' + KBID, end=" ")
29+
30+
folderpath = os.path.join(dirname, KBID)
31+
# print(folderpath)
3132

32-
filepath = os.path.join(folderpath, filename)
33+
# list all .html files in chosen directory (there should only be one)
34+
os.chdir(folderpath)
35+
for file in glob.glob('*.html'):
36+
filename = file
37+
os.chdir('..')
3338

34-
#if .old version of file already, abort
35-
if os.path.exists(filepath+'.old'):
36-
print('File already converted. Aborting.')
37-
sys.exit()
39+
filepath = os.path.join(folderpath, filename)
3840

39-
print ("Cleaning " + filename)
41+
#if .old version of file already, abort
42+
if os.path.exists(filepath+'.old'):
43+
print('File already converted. Skipping...') #TODO: skip instead of abort
44+
continue
4045

41-
#removing smart quotes
42-
file_replace_text(filepath, '“', '"')
43-
file_replace_text(filepath, '”', '"')
46+
print(filename, end=" ")
4447

45-
#start html cleaning with beautifulsoup
46-
with open(filepath, encoding='utf-8') as fp:
47-
soup = BeautifulSoup(fp, "html.parser")
48+
#removing smart quotes
49+
file_replace_text(filepath, '“', '"')
50+
file_replace_text(filepath, '”', '"')
51+
file_replace_text(filepath, '’', '')
52+
#file_replace_text(filepath, '', '"')
4853

49-
#shameless self-promotion
50-
creditMe = soup.new_tag('meta', content='Converted to HTML by Vinay Janardhanam')
51-
soup.head.append(creditMe)
54+
print(".", end=" ")
5255

53-
#remove style tag from header
54-
for s in soup('style'):
55-
s.extract()
56+
#start html cleaning with beautifulsoup
57+
with open(filepath, encoding='utf-8') as fp:
58+
soup = BeautifulSoup(fp, "html.parser")
5659

57-
#replace <h1> with <h3>
58-
for h1 in soup('h1'):
59-
h1.name = 'h3'
60+
print(".", end=" ")
6061

61-
#replace <h2> with <h4>
62-
for h2 in soup('h2'):
63-
h2.name = 'h4'
62+
#shameless self-promotion
63+
creditMe = soup.new_tag('meta', content='Converted to HTML by Vinay Janardhanam')
64+
soup.head.append(creditMe)
6465

65-
#remove unnecessary attributes
66-
for tag in soup():
67-
for attribute in ['class', 'id', 'name', 'style']:
68-
del tag[attribute]
69-
70-
#changes image path for KB site
71-
print(filename)
72-
filename = os.path.splitext(filename)[0]
73-
print(filename)
74-
for image in soup.findAll('img'):
75-
image['src'] = image['src'].replace(filename, "/images/group87/"+KBID)
66+
#remove style tag from header
67+
for s in soup('style'):
68+
s.extract()
69+
70+
#replace <h1> with <h3>
71+
for h1 in soup('h1'):
72+
h1.name = 'h3'
73+
74+
#replace <h2> with <h4>
75+
for h2 in soup('h2'):
76+
h2.name = 'h4'
77+
78+
#remove unnecessary attributes
79+
for tag in soup():
80+
for attribute in ['class', 'id', 'name', 'style']:
81+
del tag[attribute]
82+
83+
print(".", end=" ")
84+
85+
#changes image path for KB site
86+
#print(filename)
87+
filename = os.path.splitext(filename)[0]
88+
#print(filename)
89+
for image in soup.findAll('img'):
90+
image['src'] = image['src'].replace(filename, "/images/group87/"+KBID)
91+
92+
#self explanitory :)
93+
soup.prettify()
94+
95+
#do string processing after here
96+
soup_string = str(soup.encode("utf-8"))
97+
98+
print(".", end=" ")
7699

77-
#self explanitory :)
78-
soup.prettify()
100+
#removes all <p><br/></p> tags
101+
soup_string = soup_string.replace('<p><br/></p>', '')
79102

80-
#do string processing after here
81-
soup_string = str(soup)
103+
#encapsulates <p> tags in <body> with a <div>
104+
soup_string = soup_string.replace('<body>', '<body>\n<div>')
105+
soup_string = soup_string.replace('</body>', '</div>\n</body>')
82106

83-
#removes all <p><br/></p> tags
84-
soup_string = soup_string.replace('<p><br/></p>', '')
107+
print(".", end=" ")
85108

86-
#encapsulates <p> tags in <body> with a <div>
87-
soup_string = soup_string.replace('<body>', '<body>\n<div>')
88-
soup_string = soup_string.replace('</body>', '</div>\n</body>')
109+
#exports file with .new at end
110+
with open(filepath+'.new', 'w') as file:
111+
file.write(str(soup_string))
89112

90-
#exports file with .new at end
91-
with open(filepath+'.new', 'w') as file:
92-
file.write(str(soup_string))
113+
#preserves original html file (minus the dumb smart quotes) with a .old extension
114+
os.rename(filepath,filepath+'.old')
115+
os.rename(filepath+'.new', filepath)
93116

94-
#preserves original html file (minus the dumb smart quotes) with a .old extension
95-
os.rename(filepath,filepath+'.old')
96-
os.rename(filepath+'.new', filepath)
117+
print('done!')
97118

98119

99120
if __name__ == "__main__":

0 commit comments

Comments
 (0)