forked from openstate/open-raadsinformatie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf.py
107 lines (89 loc) · 3.05 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import subprocess
import tempfile
import re
from cStringIO import StringIO
from urllib2 import HTTPError
from OpenSSL.SSL import ZeroReturnError
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import requests
from ocd_backend import settings
def convert(fname, pages=None):
if not pages:
pagenums = set()
else:
pagenums = set(pages)
output = StringIO()
manager = PDFResourceManager()
converter = TextConverter(manager, output, laparams=LAParams())
interpreter = PDFPageInterpreter(manager, converter)
infile = file(fname, 'rb')
for page in PDFPage.get_pages(infile, pagenums):
# For some reason PDFMiner chokes on pages that have a single,
# but detailed image in them. Work around by skipping pages with
# a large media box
mediabox = [0, 0, 0, 0]
try:
mediabox = page.mediabox
except AttributeError as e:
pass
try:
mediabox_pixels = mediabox[2] * mediabox[3]
except IndexError as e:
mediabox_pixels = 0
if mediabox_pixels <= settings.PDF_MAX_MEDIABOX_PIXELS:
print "Processing page %s" % (page,)
interpreter.process_page(page)
else:
print "Skipped page %s" % (page,)
infile.close()
converter.close()
text = output.getvalue()
output.close()
return text
class PDFToTextMixin(object):
"""
Interface for converting a PDF file into text format using pdftotext
"""
def pdf_clean_text(self, text):
return text
# return re.sub(r'\s+', u' ', text)
def pdf_get_contents(self, url, max_pages=20):
"""
Convenience method to download a PDF file and converting it to text.
"""
tf = self.pdf_download(url)
if tf is not None:
return self.pdf_to_text(tf.name, max_pages)
else:
return u'' # FIXME: should be something else ...
def pdf_download(self, url):
"""
Downloads a given url to a tempfile.
"""
print "Downloading %s" % (url,)
try:
# GO has no wildcard domain for SSL
r = self.http_session.get(url, verify=False)
tf = tempfile.NamedTemporaryFile()
tf.write(r.content)
tf.seek(0)
return tf
except HTTPError as e:
print "Something went wrong downloading %s" % (url,)
except ZeroReturnError as e:
print "SSL Zero return error %s" % (url,)
except Exception as e:
print "Some other exception %s" % (url,)
def pdf_to_text(self, path, max_pages=20):
"""
Method to convert a given PDF file into text file using a subprocess
"""
if max_pages > 0:
content = convert(path, range(0, max_pages))
else:
content = convert(path)
return unicode(self.pdf_clean_text(content.decode('utf-8')))