|
| 1 | +""" This module uses 'python-docx' package to analyze a Word Document ('.docx') |
| 2 | + This module can be imported in other programs but the preferred way is to run it |
| 3 | + as a Script from the command line like this: |
| 4 | + |
| 5 | + $ python extract.py <name-of-word-document-file> |
| 6 | + |
| 7 | + The basic function of the program is to analyze the word document and |
| 8 | + collect bold, italic and underlined words from it and then |
| 9 | + after analyzing write these collected words at the end of the word document. |
| 10 | + So the program first reads the word document, collects bold, italic |
| 11 | + and underlined words from it and then writes the collected words at |
| 12 | + the very end of the same word document |
| 13 | + Copyright 2023 Kashaan Mahmood |
| 14 | + License: MIT License |
| 15 | + https://opensource.org/license/mit/ |
| 16 | + """ |
| 17 | + |
| 18 | + |
| 19 | +from docx import Document |
| 20 | +from docx.api import Document |
| 21 | +from docx.enum.text import WD_ALIGN_PARAGRAPH |
| 22 | + |
| 23 | +# global variables: total_words starts at 0; wordsList is the extracted-words text that write_data() builds and writes into the document |
| 24 | +total_words = 0 |
| 25 | +wordsList = "" |
| 26 | + |
| 27 | + |
def get_total_words(docxFile):
    """Return the number of whitespace-separated words in the document.

    Args:
        docxFile: The docx file handed to ``Document()``.

    Returns:
        The word count summed over every paragraph in
        ``document.paragraphs``.
    """
    document = Document(docxFile)
    total = 0
    for paragraph in document.paragraphs:
        # A word whose formatting changes part-way through is stored as
        # several runs. Join the runs before splitting so such a word is
        # counted once instead of once per run.
        paragraph_text = "".join(run.text for run in paragraph.runs)
        total += len(paragraph_text.split())
    return total
| 36 | + |
| 37 | + |
# Characters (and one multi-character sequence) stripped from extracted text.
# Entries are removed in list order, so "— — " must stay ahead of the bare
# "—": otherwise the dashes would be removed first and the spaces between
# them left behind.
unwanted_characters = [
    '"',
    "'",
    "‘",  # left single quote, the counterpart of the right one below
    "’",
    "“",
    "”",  # right double quote, the counterpart of the left one above
    ":",
    "\n",
    "-",
    "— — ",
    "—",
    ".",
    ",",
    ";",
    "!",
    "?",
]


def remove_unwanted(words):
    """Return *words* with every entry of ``unwanted_characters`` removed.

    Args:
        words: The text to clean.

    Returns:
        The cleaned text. Nothing is inserted in place of a removed
        character, so text on either side of it is joined together.
    """
    for unwanted in unwanted_characters:
        # str.replace is a no-op when the substring is absent, so no
        # membership pre-check is needed.
        words = words.replace(unwanted, "")
    return words
| 62 | + |
| 63 | + |
def analyze(docxFile):
    """Collect the bold, italic and underlined runs of a docx file.

    Each styled run is filed under exactly one key naming the combination
    of styles it carries: "b", "i", "u", "bi", "bu", "iu" or "biu".

    Args:
        docxFile: The docx file handed to ``Document()``.

    Returns:
        A list of seven single-key dicts, in the order b, i, u, bi, bu, iu,
        biu. Each maps its key to the list of cleaned run texts found with
        that style combination.
    """
    document = Document(docxFile)

    style_keys = ("b", "i", "u", "bi", "bu", "iu", "biu")
    collect = [{key: []} for key in style_keys]
    # Direct key -> list lookup into the entries of `collect`.
    buckets = {key: entry[key] for key, entry in zip(style_keys, collect)}

    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            # Build the key from the run's flags; an unstyled run yields "".
            key = (
                ("b" if run.bold else "")
                + ("i" if run.italic else "")
                + ("u" if run.underline else "")
            )
            if not key:
                continue

            filtered_text = remove_unwanted(run.text)
            # A styled run holding only punctuation or whitespace has
            # nothing left to report; skipping it avoids blank entries in
            # the written word list.
            if not filtered_text.strip():
                continue

            buckets[key].append(filtered_text)

    return collect
| 112 | + |
| 113 | + |
def write_data(docxFile, data):
    """Append the words collected by analyze() to the end of the docx file.

    A centred title paragraph, one paragraph listing the words grouped by
    style, and a centred closing paragraph are added, then the document is
    saved back to ``docxFile``.

    Args:
        docxFile: The docx file to open with ``Document()`` and save to.
        data: The list of dicts returned by ``analyze()``, each mapping a
            style key ("b", "i", "u", "bi", "bu", "iu", "biu") to a list
            of words.

    Returns:
        None. The generated text is also stored in the module-level
        ``wordsList``.
    """
    global wordsList

    categories = {
        "b": "\nBold Words:-",
        "i": "\n\nItalicized Words:-",
        "u": "\n\nUnderlined Words:-",
        "bi": "\n\nBold & Italicized Words:-",
        "bu": "\n\nBold & Underlined Words:-",
        "biu": "\n\nBold & Italicized & Underlined Words:-",
        "iu": "\n\nItalicized & Underlined Words:-",
    }

    # Build the text from scratch on every call so a second call does not
    # inherit the previous document's words. Every known key is handled,
    # including "i", and empty categories are left out.
    sections = []
    for group in data:
        for key, collected in group.items():
            if key in categories and collected:
                sections.append(f"{categories[key]}\n{', '.join(collected)}")
    wordsList = "".join(sections)

    document = Document(docxFile)

    title_p = document.add_paragraph("\n========== Extracted Words ==========\n")
    title_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    write_p = document.add_paragraph()
    write_p.add_run(wordsList)

    ending_p = document.add_paragraph("\n===================\n")
    ending_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    document.save(docxFile)
| 177 | + |
| 178 | + |
| 179 | +# function calls inside main() |
| 180 | + |
| 181 | + |
def main(docx_path=None):
    """Analyze a docx file and append its styled words to the same file.

    Args:
        docx_path: The document to process. When omitted, the module-level
            ``docx`` name assigned by the script entry point is used.

    Raises:
        ValueError: If no document was passed and no module-level ``docx``
            is set.
    """
    if docx_path is None:
        # Fall back to the name the `__main__` section assigns, so calling
        # main() with no argument keeps working when run as a script.
        docx_path = globals().get("docx")
    if docx_path is None:
        raise ValueError(
            "no docx file given: pass a path to main() or run this module "
            "as a script with the file name as its argument"
        )

    data = analyze(docx_path)
    write_data(docx_path, data)
| 187 | + |
| 188 | + |
if __name__ == "__main__":
    from sys import argv
    import time

    # The document path is the single required command-line argument; exit
    # with a usage message instead of an IndexError when it is missing.
    if len(argv) < 2:
        raise SystemExit(f"usage: python {argv[0]} <name-of-word-document-file>")

    # get docx file (main() reads this module-level name)
    docx = argv[1]

    print(f"Started at {time.strftime('%X')}...")
    # calling main()
    main()
    print(f"Finished at {time.strftime('%X')}...")
0 commit comments