forked from anandalisha/Awesome_Python_Scripts
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtext_extractor.py
37 lines (24 loc) · 928 Bytes
/
text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
"""Text Extractor in Python.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18fBAudnned52qJ59xIjZ-xLTNgNCVvCz
"""
# Install the required module for text extraction before running this script.
# This is a shell command, not Python, so it must not appear as a bare
# statement in a .py file:
#     pip install PyPDF2

# Import the module
import PyPDF2

# Open the PDF we want to extract text from
reader = PyPDF2.PdfFileReader('/datalab/OPEN SOURCE PROJECT.pdf')

# getNumPages() gives the number of pages in the PDF
page_count = reader.getNumPages()

# getPage() takes a zero-based page index, so iterate from 0 to include the
# first page. extractText() returns the text of the corresponding page.
# The per-page strings are joined once instead of growing a string with +=.
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names; newer releases rename them (PdfReader, .pages,
# extract_text) -- confirm against the installed PyPDF2 version.
pdf_text = "".join(
    reader.getPage(page_index).extractText()
    for page_index in range(page_count)
)

# Create "extracted_text.txt" and write the extracted text to it
with open('extracted_text.txt', 'w', encoding='utf-8') as f:
    f.write(pdf_text)