File tree 1 file changed +37
-0
lines changed
1 file changed +37
-0
lines changed Original file line number Diff line number Diff line change
1
+ # -*- coding: utf-8 -*-
2
+ """Text Extractor in Python.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/18fBAudnned52qJ59xIjZ-xLTNgNCVvCz
8
+ """
9
+
10
# Install the required module for text extraction.
# This is a shell command (it came from a notebook cell), not Python, so it is
# kept as a comment; run it in a terminal or notebook before this script:
#   pip install PyPDF2

# Import the module
import PyPDF2

# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy camelCase PyPDF2 names; newer PyPDF2 releases deprecate them in
# favour of PdfReader / pages / extract_text -- confirm the installed version.

# Open the PDF we want to extract text from
a = PyPDF2.PdfFileReader('/datalab/OPEN SOURCE PROJECT.pdf')

# getNumPages() gives the number of pages in our PDF
pgno = a.getNumPages()

# Page indices are zero-based, so walk 0 .. pgno - 1 to include the first
# page (starting at 1 would silently drop it). extractText() returns the text
# of the i-th page; the pieces are joined once instead of growing a string
# with repeated "+=".
pdf_text = "".join(a.getPage(i).extractText() for i in range(pgno))

# Create "extracted_text.txt" and write the extracted text into it
with open('extracted_text.txt', 'w', encoding='utf-8') as f:
    f.write(pdf_text)
You can’t perform that action at this time.
0 commit comments