@@ -50,13 +50,6 @@ def main():
5050
5151        print (filename , end = " " )
5252
53-         #removing smart quotes 
54-         file_replace_text (filepath , '“' , '"' )
55-         file_replace_text (filepath , '”' , '"' )
56-         file_replace_text (filepath , '’' , '' )
57-         #file_replace_text(filepath, '', '"') 
58- 
59-         print ("." , end = " " )
6053
6154        #start html cleaning with beautifulsoup 
6255        with  open (filepath , encoding = 'utf-8' ) as  fp :
@@ -68,6 +61,8 @@ def main():
6861        creditMe  =  soup .new_tag ('meta' , content = 'Converted to HTML by Vinay Janardhanam' )
6962        soup .head .append (creditMe )
7063
64+         print ("." , end = " " )
65+ 
7166        #remove style tag from header 
7267        for  s  in  soup ('style' ):
7368            s .extract ()
@@ -80,6 +75,8 @@ def main():
8075        for  h2  in  soup ('h2' ):
8176            h2 .name  =  'h4' 
8277
78+         print ("." , end = " " )
79+ 
8380        #remove unnecessary attributes 
8481        for  tag  in  soup ():
8582            for  attribute  in  ['class' , 'id' , 'name' , 'style' ]:
@@ -94,31 +91,25 @@ def main():
9491        for  image  in  soup .findAll ('img' ):
9592            image ['src' ] =  image ['src' ].replace (filename , "/images/group87/" + KBID )
9693
97-         #self explanitory :) 
98-         soup .prettify ()
99- 
100-         #do string processing after here 
101-         soup_string  =  str (soup .encode ("utf-8" ))
102- 
10394        print ("." , end = " " )
10495
105-         #removes all <p><br/></p> tags 
106-         soup_string  =  soup_string .replace ('<p><br/></p>' , '' )
107- 
108-         #encapsulates <p> tags in <body> with a <div> 
109-         soup_string  =  soup_string .replace ('<body>' , '<body>\n <div>' )
110-         soup_string  =  soup_string .replace ('</body>' , '</div>\n </body>' )
96+         #exports file with .new at end 
97+         with  open (filepath + '.new' , 'w' , encoding = "utf-8" ) as  file :
98+             file .write (str (soup ))
11199
112100        print ("." , end = " " )
113101
114-         #exports file with .new at end 
115-         with  open (filepath + '.new' , 'w' ) as  file :
116-             file .write (str (soup_string ))
117- 
118102        #preserves the original, unmodified html file with a .old extension 
119103        os .rename (filepath ,filepath + '.old' )
120104        os .rename (filepath + '.new' , filepath )
121105
106+         print ("." , end = " " )
107+ 
108+         file_replace_text (filepath , '<p><br/></p>' , '' )
109+         file_replace_text (filepath , '<body>' , '<body>\n <div>' )
110+         file_replace_text (filepath , '</body>' , '</div>\n </body>' )
111+         file_replace_text (filepath , '' , '->' )
112+ 
122113        print ('done!' )
123114
124115
0 commit comments