Skip to content

Commit bcd97fb

Browse files
authored
Merge pull request larymak#270 from Kashaan-M/main
Analyze word document
2 parents ef61719 + 21d3a22 commit bcd97fb

File tree

3 files changed

+222
-1
lines changed

3 files changed

+222
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Analyze any `.docx` file for bold, underlined, italicized text
2+
This program helps you find all the bold, underlined and italicized text in a word document.
3+
4+
First, create a new folder, create a file named `extract.py` inside it, and paste the code into that file.
5+
Then you need to install `python-docx`
6+
```bash
7+
$ pip install python-docx
8+
```
9+
Copy your Word document, for example `process_design_notes.docx`, into the current working directory (CWD).
10+
11+
The CWD should now have two files i.e. **extract.py** and **process_design_notes.docx**.
12+
13+
Open a terminal or command prompt in CWD and type
14+
```bash
15+
#for linux
16+
python3 extract.py process_design_notes.docx
17+
#for windows
18+
python extract.py process_design_notes.docx
19+
```
20+
After you run the above command, the program will process the Word document and append the extracted bold, italicized and underlined words to the end of it.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
""" This module uses 'python-docx' package to analyze a Word Document ('.docx')
2+
This module can be imported in other programs but the preferred way is to run it
3+
as a Script from the command line like this:
4+
5+
$ python extract.py <name-of-word-document-file>
6+
7+
The basic function of the program is to analyze the word document and
8+
collect bold, italic and underlined words from it and then
9+
after analyzing write these collected words at the end of the word document.
10+
So the program first reads the word document, collects bold, italic
11+
and underlined words from it and then writes the collected words at
12+
the very end of the same word document
13+
Copyright 2023 Kashaan Mahmood
14+
License: MIT License
15+
https://opensource.org/license/mit/
16+
"""
17+
18+
19+
from docx import Document
20+
from docx.api import Document
21+
from docx.enum.text import WD_ALIGN_PARAGRAPH
22+
23+
# Module-level state shared by the functions below.
# total_words: initialised to 0; no function visible here assigns a new value to it.
# wordsList: the text that write_data() builds and adds to the document.
24+
total_words = 0
25+
wordsList = ""
26+
27+
28+
# calculate total words in docx
29+
def get_total_words(docxFile):
    """Return the number of whitespace-separated words in the document body.

    Args:
        docxFile: value handed unchanged to ``Document()`` (the script passes
            the path of a '.docx' file).

    Returns:
        The total word count over ``document.paragraphs``.
    """
    document = Document(docxFile)
    # Count on each paragraph's full text instead of run by run: a word whose
    # formatting changes part-way through is stored as several runs, and
    # counting per run would report it as several words.
    return sum(len(paragraph.text.split()) for paragraph in document.paragraphs)
36+
37+
38+
# Characters (and one character sequence) stripped from extracted text.
# Order matters: the spaced double dash "— — " has to be tried before the
# single "—", otherwise its spaces would be left behind.
unwanted_characters = [
    '"',
    "'",
    "‘",  # opening single curly quote, partner of "’"
    "’",
    "“",
    "”",  # closing double curly quote, partner of "“"
    ":",
    "\n",
    "-",
    "— — ",
    "—",
    ".",
    ",",
    ";",
    "!",
    "?",
]


def remove_unwanted(words):
    """Return *words* with every entry of ``unwanted_characters`` removed.

    Args:
        words: the text to clean.

    Returns:
        A new string; entries are removed in list order and nothing is
        inserted in their place.
    """
    for unwanted in unwanted_characters:
        # str.replace is a no-op when the substring is absent, so no
        # membership test is needed first.
        words = words.replace(unwanted, "")
    return words
62+
63+
64+
def analyze(docxFile):
    """Collect the bold, italicized and underlined text of a docx file.

    Every run of every paragraph in ``document.paragraphs`` is inspected.
    A run with at least one of the three formats set has its text passed
    through ``remove_unwanted()`` and stored under the key naming exactly
    the formats it carries.

    Returns:
        A list of seven single-key dicts, in the order
        ``b, i, u, bi, bu, iu, biu``; each value is the list of cleaned
        run texts for that combination.
    """
    document = Document(docxFile)

    keys = ("b", "i", "u", "bi", "bu", "iu", "biu")
    collect = [{key: []} for key in keys]
    position = {key: index for index, key in enumerate(keys)}

    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            # Build the category key from the formats that are switched on;
            # the letters are always appended in the order b, i, u.
            key = ""
            if run.bold:
                key += "b"
            if run.italic:
                key += "i"
            if run.underline:
                key += "u"

            if not key:
                # Run carries none of the three formats: nothing to collect.
                continue

            collect[position[key]][key].append(remove_unwanted(run.text))

    return collect
112+
113+
114+
def write_data(docxFile, data):
    """Append the words collected by ``analyze()`` to the end of the docx file.

    The file is opened, a centred title paragraph, one paragraph holding all
    categories, and a centred closing paragraph are added, and the document
    is saved back to the same location.

    Args:
        docxFile: value handed to ``Document()`` and to ``document.save()``.
        data: the list of ``{key: [texts]}`` dicts returned by ``analyze()``.
            Empty lists and unknown keys are skipped.

    Returns:
        None.
    """
    global wordsList

    categories = {
        "b": "\nBold Words:-",
        "i": "\n\nItalicized Words:-",
        "u": "\n\nUnderlined Words:-",
        "bi": "\n\nBold & Italicized Words:-",
        "bu": "\n\nBold & Underlined Words:-",
        "biu": "\n\nBold & Italicized & Underlined Words:-",
        "iu": "\n\nItalicized & Underlined Words:-",
    }

    document = Document(docxFile)

    title_p = document.add_paragraph("\n========== Extracted Words ==========\n")
    title_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    write_p = document.add_paragraph()

    # Build the text from scratch on every call. Looking the key up in
    # `categories` (instead of a hand-written if/elif chain) covers every
    # category, including the italic-only one.
    sections = []
    for group in data:
        for key, found in group.items():
            if key in categories and found:
                sections.append(f"{categories[key]}\n{', '.join(found)}")

    # The module-level name is still assigned so that it holds the text that
    # was written, but it no longer carries over content from earlier calls.
    wordsList = "".join(sections)
    write_p.add_run(wordsList)

    ending_p = document.add_paragraph("\n===================\n")
    ending_p.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

    document.save(docxFile)
177+
178+
179+
# function calls inside main()
180+
181+
182+
def main(docx_path=None):
    """Analyze a Word document and append the extracted words to it.

    Args:
        docx_path: path of the '.docx' file. When omitted, the first
            command-line argument is used, so a bare ``main()`` call keeps
            working when the module is run as a script.

    Raises:
        SystemExit: with a usage message when no path was passed and none
            was given on the command line.
    """
    if docx_path is None:
        from sys import argv

        if len(argv) < 2:
            raise SystemExit("usage: python extract.py <name-of-word-document-file>")
        docx_path = argv[1]

    data = analyze(docx_path)
    write_data(docx_path, data)


if __name__ == "__main__":
    import time

    print(f"Started at {time.strftime('%X')}...")
    # main() reads the document path from the command line itself
    main()
    print(f"Finished at {time.strftime('%X')}...")

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -115,4 +115,6 @@ guide [HERE](https://github.com/larymak/Python-project-Scripts/blob/main/CONTRIB
115115
| 64 | [Umbrella Reminder](https://github.com/larymak/Python-project-Scripts/tree/main/TIME%20SCRIPTS/Umbrella%20Reminder) | [Edula Vinay Kumar Reddy](https://github.com/vinayedula) |
116116
| 65 | [Image to PDF](https://github.com/larymak/Python-project-Scripts/tree/main/IMAGES%20%26%20PHOTO%20SCRIPTS/Image%20to%20PDF) | [Vedant Chainani](https://github.com/Envoy-VC) |
117117
| 66 | [KeyLogger](https://github.com/larymak/Python-project-Scripts/tree/main/OTHERS/KeyLogger) | [Akhil](https://github.com/akhil-chagarlamudi) |
118-
| 67 | [PDF Text Extractor](https://github.com/SamAddy/Python-project-Scripts/tree/main/PYTHON%20APPS/PDF-Text-Extractor) | [Samuel Addison](https://github.com/SamAddy) |
118+
| 67 | [PDF Text Extractor](https://github.com/SamAddy/Python-project-Scripts/tree/main/PYTHON%20APPS/PDF-Text-Extractor) | [Samuel Addison](https://github.com/SamAddy)
119+
| 68 | [Analyze docx file](https://github.com/larymak/Python-project-Scripts/tree/main/AUTOMATION/analyzing%20and%20writing%20.docx%20file) | [Kashaan Mahmood](https://github.com/Kashaan-M)
120+

0 commit comments

Comments
 (0)