# BasicPythonScripts/Text Extractor from PDF/text_extractor.py

# -*- coding: utf-8 -*-
"""Text Extractor in Python.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18fBAudnned52qJ59xIjZ-xLTNgNCVvCz
"""

# Install the required module for text extraction before running this script.
# This is a shell command (a notebook cell), not Python: as a bare statement
# it would be a syntax error, so it is kept here as a comment. Run it in a
# terminal, or prefix it with "!" inside a notebook cell:
#     pip install PyPDF2

# Import the Module
import PyPDF2

# Open the PDF whose text we want to extract
reader = PyPDF2.PdfFileReader('/datalab/OPEN SOURCE PROJECT.pdf')
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names; newer PyPDF2 releases (3.x) appear to have replaced
# them with PdfReader / len(reader.pages) / reader.pages[i] / extract_text().
# Confirm which PyPDF2 version this script is expected to run against.

# getNumPages() gives the number of pages in our pdf
pgno = reader.getNumPages()

# getPage() is zero-indexed, so iterate over 0 .. pgno - 1 to cover every
# page; starting the range at 1 would silently skip the first page.
# The per-page text is joined once rather than concatenated with += per page.
pdf_text = "".join(
    reader.getPage(page_index).extractText() for page_index in range(pgno)
)

# Create "extracted_text.txt" and write the extracted text to it.
# UTF-8 is specified explicitly so the output does not depend on the
# platform's default encoding.
with open('extracted_text.txt', 'w', encoding='utf-8') as f:
    f.write(pdf_text)