Merge pull request larymak#270 from Kashaan-M/main

larymak · web-flow · commit bcd97fbb649e · 2023-03-14T08:05:04.000+03:00
Analyze word document
diff --git a/AUTOMATION/analyzing and writing .docx file/README.md b/AUTOMATION/analyzing and writing .docx file/README.md
@@ -0,0 +1,20 @@
+# Analyze any `.docx` file for bold, underlined, italicized text
+This program helps you find all the bold, underlined and italicized text in a word document.
+
+First create a new folder, then create a file named `extract.py` inside it and copy and paste the code into it.
+Then you need to install `python-docx`
+```bash
+$ pip install python-docx
+```
+Copy your Word document, for example `process_design_notes.docx`, into the current working directory (CWD).
+
+The CWD should now have two files i.e. **extract.py** and **process_design_notes.docx**.
+
+Open a terminal or command prompt in CWD and type
+```bash
+#for linux
+python3 extract.py process_design_notes.docx
+#for windows
+python extract.py process_design_notes.docx
+```
+After you run the above command, the program analyzes the Word document and appends the extracted bold, italicized and underlined words to the end of it.
diff --git a/AUTOMATION/analyzing and writing .docx file/extract.py b/AUTOMATION/analyzing and writing .docx file/extract.py
@@ -0,0 +1,199 @@
+""" This module uses 'python-docx' package to analyze a Word Document ('.docx')
+    This module can be imported in other programs but the preferred way is to run it 
+    as a Script from the command line like this:
+    
+    $ python extract.py <name-of-word-document-file>
+    
+    The basic function of the program is to analyze the word document and 
+    collect bold, italic and underlined words from it and then 
+    after analyzing write these collected words at the end of the word document.
+    So the program first reads the word document, collects bold, italic 
+    and underlined words from it and then writes the collected words at
+    the very end of the same word document 
+    Copyright 2023 Kashaan Mahmood
+    License: MIT License
+             https://opensource.org/license/mit/
+    """
+
+
+from docx import Document
+from docx.api import Document
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+
+# global variables (module-level state)
+total_words = 0  # presumably meant for get_total_words(); no function assigns it
+wordsList = ""  # report text that write_data() builds and writes to the document
+
+
+# count the words of a docx file
+def get_total_words(docxFile):
+    """Return how many whitespace-separated words the paragraph runs hold."""
+    doc = Document(docxFile)
+    run_texts = (run.text for para in doc.paragraphs for run in para.runs)
+    # str.split() without an argument splits on any run of whitespace
+    counts = (len(text.split()) for text in run_texts)
+    return sum(counts)
+
+
+# Punctuation stripped from extracted text.  Entries are removed in list order,
+# so keep "— — " ahead of the lone "—" or the sequence can never match.
+unwanted_characters = [
+    '"',
+    "'",
+    "‘",  # left single curly quote, partner of the entry below
+    "’",
+    "“",
+    "”",  # right double curly quote, partner of the entry above
+    ":",
+    "\n",
+    "-",
+    "— — ",
+    "—",
+    ".", ",", ";", "!", "?",
+]
+
+
+def remove_unwanted(words):
+    """Return `words` with every entry of `unwanted_characters` removed."""
+    # str.replace is a no-op when the target is absent, so no membership test
+    for unwanted in unwanted_characters:
+        words = words.replace(unwanted, "")
+    return words
+
+
+def analyze(docxFile):
+    """Collect the bold, italicized and underlined run texts of a docx file.
+
+    Each run of each paragraph is filed under the exact combination of bold,
+    italic and underline flags that is truthy on it, so a run carrying several
+    flags lands once under the combined key (for example "bi") and never under
+    the single-flag keys as well.  python-docx reports None for a flag that is
+    inherited rather than set on the run itself; such formatting is therefore
+    not detected.
+
+    Args:
+        docxFile: path of the Word document, passed to `Document`.
+
+    Returns:
+        A list of seven single-key dicts in the order b, i, u, bi, bu, iu,
+        biu.  Each key maps to the matching run texts, cleaned by
+        `remove_unwanted`, in document order.  Runs whose cleaned text is
+        empty or whitespace-only are left out so they cannot show up as
+        blank entries in the report.
+    """
+
+    document = Document(docxFile)
+
+    # write_data() walks this list of single-key dicts, so both the layout
+    # and the order of the categories have to stay as they are
+    order = ("b", "i", "u", "bi", "bu", "iu", "biu")
+    collect = [{key: []} for key in order]
+    slot_of = {key: index for index, key in enumerate(order)}
+
+    for p in document.paragraphs:
+        for run in p.runs:
+            # spell the category key from the flags, always in b-i-u order
+            key = (
+                ("b" if run.bold else "")
+                + ("i" if run.italic else "")
+                + ("u" if run.underline else "")
+            )
+            if not key:
+                continue  # unformatted run
+
+            filtered_text = remove_unwanted(run.text)
+            # formatted punctuation or spacing cleans down to nothing useful
+            if not filtered_text.strip():
+                continue
+
+            collect[slot_of[key]][key].append(filtered_text)
+
+    return collect
+
+
+def _format_words(data):
+    """Return the report text for the `collect` list produced by analyze().
+
+    Each non-empty category becomes its header line followed by its words
+    joined with ", ".  Categories keep the order they have in `data`; empty
+    categories and keys without a header are skipped.
+    """
+    categories = {
+        "b": "\nBold Words:-",
+        "i": "\n\nItalicized Words:-",
+        "u": "\n\nUnderlined Words:-",
+        "bi": "\n\nBold & Italicized Words:-",
+        "bu": "\n\nBold & Underlined Words:-",
+        "biu": "\n\nBold & Italicized & Underlined Words:-",
+        "iu": "\n\nItalicized & Underlined Words:-",
+    }
+    sections = []
+    for group in data:
+        for key, words in group.items():
+            if words and key in categories:
+                sections.append(categories[key] + "\n" + ", ".join(words))
+    return "".join(sections)
+
+
+def write_data(docxFile, data):
+    """Append the words collected by analyze() to the end of the docx file.
+
+    Adds a centered title paragraph, one paragraph listing every non-empty
+    category (italic-only words included) and a centered closing rule, then
+    saves the document back to `docxFile`.
+
+    Args:
+        docxFile: path of the Word document; it is overwritten in place.
+        data: the `collect` list returned by analyze().
+
+    Returns:
+        None.
+
+    Side effects:
+        The module-level `wordsList` is set to the text written by this call.
+        It is rebuilt on every call rather than appended to, so calling the
+        function again does not repeat earlier output.
+    """
+
+    global wordsList
+
+    document = Document(docxFile)
+
+    # centered banner that opens the appended section
+    title_p = document.add_paragraph("\n========== Extracted Words ==========\n")
+    title_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
+
+    # build the text in one go and publish it through the global, so the
+    # module attribute always mirrors what was last written
+    wordsList = _format_words(data)
+    write_p = document.add_paragraph()
+    write_p.add_run(wordsList)
+
+    ending_p = document.add_paragraph("\n===================\n")
+    ending_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
+
+    # save over the file that was read
+    document.save(docxFile)
+
+
+# entry point used by the command line script
+
+
+def main(docxFile=None):
+    """Analyze `docxFile` and append the found words to it (default: `docx`)."""
+    if docxFile is None:
+        docxFile = docx  # module-level path set by the `__main__` block
+    write_data(docxFile, analyze(docxFile))
+
+
+if __name__ == "__main__":
+    from sys import argv
+    import time
+
+    # exit with a usage hint rather than an IndexError when the path is missing
+    if len(argv) < 2:
+        raise SystemExit(f"usage: python {argv[0]} <name-of-word-document-file>")
+    docx = argv[1]  # main() reads this module-level name
+    print(f"Started at {time.strftime('%X')}...")
+    main()
+    print(f"Finished at {time.strftime('%X')}...")
diff --git a/README.md b/README.md
@@ -115,4 +115,6 @@ guide [HERE](https://github.com/larymak/Python-project-Scripts/blob/main/CONTRIB
 | 64    | [Umbrella Reminder](https://github.com/larymak/Python-project-Scripts/tree/main/TIME%20SCRIPTS/Umbrella%20Reminder)                                   | [Edula Vinay Kumar Reddy](https://github.com/vinayedula)    |
 | 65    | [Image to PDF](https://github.com/larymak/Python-project-Scripts/tree/main/IMAGES%20%26%20PHOTO%20SCRIPTS/Image%20to%20PDF)                       | [Vedant Chainani](https://github.com/Envoy-VC)              |
 | 66    | [KeyLogger](https://github.com/larymak/Python-project-Scripts/tree/main/OTHERS/KeyLogger)                                                         | [Akhil](https://github.com/akhil-chagarlamudi)              |
-| 67    | [PDF Text Extractor](https://github.com/SamAddy/Python-project-Scripts/tree/main/PYTHON%20APPS/PDF-Text-Extractor)                                                         | [Samuel Addison](https://github.com/SamAddy)              | 
+| 67    | [PDF Text Extractor](https://github.com/SamAddy/Python-project-Scripts/tree/main/PYTHON%20APPS/PDF-Text-Extractor)                                                         | [Samuel Addison](https://github.com/SamAddy)              |
+| 68    | [Analyze docx file](https://github.com/larymak/Python-project-Scripts/tree/main/AUTOMATION/analyzing%20and%20writing%20.docx%20file)                                     | [Kashaan Mahmood](https://github.com/Kashaan-M)            |
+