|  | 
| 3 | 3 | 
 | 
def file_replace_text(fname, toreplace, replacement):
    """Replace every occurrence of a substring in a UTF-8 text file, in place.

    Args:
        fname: Path of the file to modify.
        toreplace: Substring to search for.
        replacement: Text substituted for each occurrence of ``toreplace``.

    Raises:
        OSError: If the file cannot be read or written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline='' turns off newline translation in both directions, so a CRLF
    # file stays CRLF: only the requested text is changed, not line endings.
    with open(fname, 'r', encoding='utf-8', newline='') as file:
        filedata = file.read()

    # Replace the text in file
    updated = filedata.replace(toreplace, replacement)

    # Nothing matched: leave the file untouched rather than rewriting it.
    if updated == filedata:
        return

    # Write the file out again
    with open(fname, 'w', encoding='utf-8', newline='') as file:
        file.write(updated)
| 13 | 13 | 
 | 
def _beautify_kb(dirname, KBID):
    """Clean the single exported .html file inside the ``KBID`` folder.

    The cleaned markup replaces the original file; the original (with smart
    quotes already normalised) is kept next to it with a ``.old`` extension.

    Args:
        dirname: Directory that contains the KB folders.
        KBID: Name of the KB folder; also used in the rewritten image paths.

    Returns:
        True if the file was converted, False if the KB was skipped.
    """
    print('Beauti.pying ' + KBID, end=" ")

    folderpath = os.path.join(dirname, KBID)

    # list all .html files in chosen directory (there should only be one).
    # Globbing on the full path avoids changing the working directory, and
    # glob.escape keeps special characters in the folder name literal.
    html_files = glob.glob(os.path.join(glob.escape(folderpath), '*.html'))
    if not html_files:
        # A missing folder or a folder without an export must not leave
        # `filename` unbound (or stale from the previous KB in the batch).
        print('No .html file found. Skipping...')
        return False
    filepath = html_files[-1]
    filename = os.path.basename(filepath)

    # if .old version of file exists already, skip this KB
    if os.path.exists(filepath + '.old'):
        print('File already converted. Skipping...')
        return False

    print(filename, end=" ")

    # removing smart quotes
    file_replace_text(filepath, '“', '"')
    file_replace_text(filepath, '”', '"')
    # NOTE(review): this deletes the curly apostrophe outright ("don’t" ->
    # "dont"); presumably "'" was intended -- confirm before changing.
    file_replace_text(filepath, '’', '')

    print(".", end=" ")

    # start html cleaning with beautifulsoup
    with open(filepath, encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, "html.parser")

    print(".", end=" ")

    # shameless self-promotion (only possible when the document has a <head>)
    if soup.head is not None:
        creditMe = soup.new_tag('meta', content='Converted to HTML by Vinay Janardhanam')
        soup.head.append(creditMe)

    # remove style tag from header
    for s in soup('style'):
        s.extract()

    # replace <h1> with <h3>
    for h1 in soup('h1'):
        h1.name = 'h3'

    # replace <h2> with <h4>
    for h2 in soup('h2'):
        h2.name = 'h4'

    # remove unnecessary attributes
    for tag in soup():
        for attribute in ['class', 'id', 'name', 'style']:
            del tag[attribute]

    print(".", end=" ")

    # changes image path for KB site
    stem = os.path.splitext(filename)[0]
    for image in soup.find_all('img'):
        src = image.get('src')
        if src is not None:  # tolerate <img> tags without a src attribute
            image['src'] = src.replace(stem, "/images/group87/" + KBID)

    # do string processing after here.
    # str(soup) yields the markup as text; wrapping soup.encode() in str()
    # would write the bytes repr (b'...' with escaped newlines) to the file.
    # The former bare soup.prettify() call was dropped: it returns a new
    # string and does not modify the tree, so it had no effect.
    soup_string = str(soup)

    print(".", end=" ")

    # removes all <p><br/></p> tags
    soup_string = soup_string.replace('<p><br/></p>', '')

    # encapsulates <p> tags in <body> with a <div>
    soup_string = soup_string.replace('<body>', '<body>\n<div>')
    soup_string = soup_string.replace('</body>', '</div>\n</body>')

    print(".", end=" ")

    # exports file with .new at end; explicit UTF-8 so non-ASCII characters
    # do not depend on the platform's default encoding
    with open(filepath + '.new', 'w', encoding='utf-8') as file:
        file.write(soup_string)

    # preserves original html file (minus the dumb smart quotes) with a .old extension
    os.rename(filepath, filepath + '.old')
    os.rename(filepath + '.new', filepath)

    print('done!')
    return True


def main():
    """Beautify the exported HTML of every KB folder named on the command line.

    Each command-line argument is treated as the name of a folder located
    next to this script. With no arguments, a message is shown and nothing
    is processed.
    """
    # Always a list, so one argument and many arguments take the same path.
    kbarg = sys.argv[1:]
    if not kbarg:
        print("No parameters detected. Aborting.")
        try:
            input("Press Enter to continue...")
        except (EOFError, OSError):
            # No usable stdin (closed or redirected): the pause is only a
            # courtesy for interactive runs, so there is nothing to wait for.
            pass
        return

    dirname = os.path.dirname(__file__)

    for KBID in kbarg:
        _beautify_kb(dirname, KBID)
| 97 | 118 | 
 | 
| 98 | 119 | 
 | 
| 99 | 120 | if __name__ == "__main__": | 
|  | 
0 commit comments