|  | 
|  | 1 | +from bs4 import BeautifulSoup | 
|  | 2 | + | 
|  | 3 | + | 
|  | 4 | + | 
|  | 5 | + | 
|  | 6 | + | 
|  | 7 | + | 
|  | 8 | + | 
|  | 9 | + | 
|  | 10 | + | 
|  | 11 | + | 
def file_replace_rext(filename: str, toreplace: str, replacement: str) -> None:
    """Replace every occurrence of a substring in a UTF-8 text file, in place.

    The whole file is read into memory, all occurrences of ``toreplace`` are
    substituted with ``replacement``, and the result is written back to the
    same path.

    Args:
        filename: Path of the file to rewrite.
        toreplace: Literal text to search for (not a regular expression).
        replacement: Text to substitute for each occurrence.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        filedata = file.read()

    filedata = filedata.replace(toreplace, replacement)

    # Write with the same encoding used for reading; relying on the platform
    # default can raise UnicodeEncodeError or corrupt non-ASCII characters.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(filedata)
|  | 20 | + | 
# Normalise typographic ("smart") double quotes in todo.html to plain ASCII
# double quotes, rewriting the file in place.
with open('todo.html', 'r', encoding='utf-8') as file:
    filedata = file.read()

# Replace the “smart quotes”
filedata = filedata.replace('“', '"').replace('”', '"')

# Write back with the same encoding used for reading; relying on the platform
# default can raise UnicodeEncodeError or corrupt non-ASCII characters.
with open('todo.html', 'w', encoding='utf-8') as file:
    file.write(filedata)
|  | 29 | + | 
# TODO: verify whether removing the entire <style> tag is the desired behaviour

# Parse the (already quote-normalised) document.
with open("todo.html", encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, "html.parser")

# Print the document title. A document without <title> prints an empty line
# instead of failing with AttributeError on None.
title = soup.title.get_text() if soup.title is not None else ''
print(title)

# Remove the first <style> tag found (if any); find() returns None when the
# document has no <style>, so guard before extracting.
style_tag = soup.find('style')
if style_tag is not None:
    style_tag.extract()

# Strip these attributes from every tag in the document.
for tag in soup():
    for attribute in ['class', 'id', 'name', 'style']:
        del tag[attribute]

# do string processing after here
soup_string = str(soup)

# Collapse paragraphs that contain only a line break into a bare <br/>.
soup_string = soup_string.replace('<p><br/></p>', '<br/>')

# Wrap the body content in a single <div>. Only do so when both literal
# markers are present; otherwise (e.g. a <body> tag that still carries other
# attributes) rewriting just '</body>' would leave an unbalanced </div>.
if '<body>' in soup_string and '</body>' in soup_string:
    soup_string = soup_string.replace('<body>', '<body>\n<div>')
    soup_string = soup_string.replace('</body>', '</div>\n</body>')

# Export the result as UTF-8, matching the encoding the input was read with.
with open('done.html', 'w', encoding='utf-8') as file:
    file.write(soup_string)
0 commit comments