Add files via upload

Sakalya100 · web-flow · commit f2b7cfe04944 · 2021-06-22T23:34:01.000+05:30
diff --git a/BasicPythonScripts/Text Extractor from PDF/text_extractor.py b/BasicPythonScripts/Text Extractor from PDF/text_extractor.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+"""Text Extractor in Python.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+    https://colab.research.google.com/drive/18fBAudnned52qJ59xIjZ-xLTNgNCVvCz
+"""
+
+#Installing the required module for Text Extraction
+pip install PyPDF2
+
+# Import the Module
+import PyPDF2
+
+# Reading our PDF, for extracting text
+a = PyPDF2.PdfFileReader('/datalab/OPEN SOURCE PROJECT.pdf')
+
+# Initialize an empty string
+pdf_text = ""
+
+# getNumPages() gives the number of pages in our pdf
+pgno = a.getNumPages()
+
+
+# Parsing through each page number
+for i in range(1, pgno):
+    
+    # extractText() will give us the text associated with the corresponding ith page
+    pdf_text += a.getPage(i).extractText()
+    
+    
+# create "extracted_text.txt" file, where we write our extracted text
+# using the string pdf_text
+
+with open('extracted_text.txt', 'w', encoding = 'utf-8') as f:
+    f.write(pdf_text)