Skip to content

Commit dda0b85

Browse files
Michael AignerMichael Aigner
authored and committed
added PDFConverter base
1 parent 4896afa commit dda0b85

File tree

4 files changed

+335
-0
lines changed

4 files changed

+335
-0
lines changed

converter/LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2020 Michael Benedikt Aigner
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

converter/PDFContentConverter.py

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,276 @@
1+
import pandas as pd
2+
from pdfminer.converter import PDFPageAggregator
3+
from pdfminer.layout import LAParams, LTTextLine, LTTextBox, LTImage, \
4+
LTFigure, LTLine, LTRect, LTCurve
5+
from pdfminer.pdfdocument import PDFDocument
6+
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
7+
from pdfminer.pdfpage import PDFPage
8+
from pdfminer.pdfparser import PDFParser
9+
import re
10+
import numpy as np
11+
from converter.util import constants
12+
13+
14+
class PDFContentConverter(object):
    """Convert the text lines and drawn elements of a PDF into tabular form.

    The PDF is laid out page by page with pdfminer. Every non-empty text line
    becomes one row of a pandas DataFrame (see ``_COLUMNS``); lines,
    rectangles and curves are collected in ``rect_boxes`` and images/figures
    in ``plot_boxes``. Vertical coordinates are stored as
    ``page_height - y``.

    Typical use::

        converter = PDFContentConverter("file.pdf")
        result = converter.convert()
    """

    # Column names of the DataFrame built by convert(). The order matches the
    # row lists assembled in _append_text_line().
    _COLUMNS = [
        "id", "page", "text",
        "x_0", "x_1", "y_0", "y_1",
        "pos_x", "pos_y", "abs_pos",
        "original_font", "font_name", "code", "bold", "italic", "font_size",
        "masked", "frequency_hist",
        "len_text", "n_tokens",
        "tag", "box",
    ]

    def __init__(self, pdf):
        """Store the path of the PDF; nothing is parsed until requested.

        :param pdf: path of the PDF file, opened in binary mode on parsing.
        """
        self.pdf = pdf
        self.font_size = None    # height of the most recently seen text line
        self.font_name = None    # cleaned font name of that text line
        self.box_id = -1         # running text-box index, restarted per page
        self.rect_boxes = []     # [kind, page, x0, x1, y_top, y_bottom] line entries
        self.plot_boxes = []     # same layout for images and figures
        self.res = None          # list of text-line rows
        self.pandas = None       # DataFrame built by convert()
        self.media_boxes = None  # page index -> crop/media box coordinates
        self.n = None            # number of pages parsed

    @staticmethod
    def _build_la_params():
        """Return the pdfminer LAParams selected by ``converter.util.constants``."""
        if not constants.USE_CUSTOM_PDF_PARAMETERS:
            return LAParams(detect_vertical=True)
        return LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                        line_overlap=constants.DEFAULT_LINE_OVERLAP,
                        line_margin=constants.DEFAULT_LINE_MARGIN,
                        word_margin=constants.DEFAULT_WORD_MARGIN,
                        char_margin=constants.DEFAULT_CHAR_MARGIN,
                        boxes_flow=constants.DEFAULT_BOXES_FLOW)

    def parse_document(self):
        """Parse the PDF and collect text rows and page boxes.

        All per-document state (``res``, ``media_boxes``, ``n``,
        ``rect_boxes``, ``plot_boxes``) is reset first, so calling this
        method repeatedly does not accumulate duplicate entries.

        :return: tuple ``(res, media_boxes)``; ``res`` stays empty when the
            document is not extractable.
        """
        self.res = []  # result set
        self.media_boxes = dict()  # media coordinate dictionary
        self.n = 0  # page count
        self.rect_boxes = []
        self.plot_boxes = []
        la_params = self._build_la_params()

        # Pages are produced lazily from the open file, so all page
        # processing has to happen inside the ``with`` block. The context
        # manager guarantees the handle is closed even if parsing fails.
        with open(self.pdf, "rb") as pdf:
            pdf_parser = PDFParser(pdf)
            pdf_document = PDFDocument(pdf_parser)

            if pdf_document.is_extractable:
                resource_manager = PDFResourceManager()
                page_aggregator = PDFPageAggregator(resource_manager,
                                                    laparams=la_params)
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      page_aggregator)

                for page in PDFPage.create_pages(pdf_document):
                    page_interpreter.process_page(page)
                    layout = page_aggregator.get_result()
                    crop_box = page.cropbox
                    page_box = page.mediabox
                    self.media_boxes[self.n] = {
                        "x0": crop_box[0], "y0": crop_box[1],
                        "x1": crop_box[2], "y1": crop_box[3],
                        "x0page": page_box[0], "y0page": page_box[1],
                        "x1page": page_box[2], "y1page": page_box[3],
                    }
                    self.box_id = -1  # box numbering restarts on every page
                    self.res = self.get_objects(layout._objs, self.res,
                                                self.n, self.media_boxes)
                    self.n += 1

        return self.res, self.media_boxes

    def convert(self):
        """Parse the PDF and build the DataFrame of text lines.

        :return: dict with keys ``"content"`` (DataFrame), ``"media_boxes"``
            and ``"page_count"``. When no text line was found the tuple
            ``(None, None)`` is returned instead and ``self.pandas`` stays
            ``None``.
        """
        self.res, self.media_boxes = self.parse_document()
        if not self.res:
            return None, None
        self.pandas = pd.DataFrame(self.res, columns=self._COLUMNS)
        self.pandas = self.pandas.apply(
            lambda row: self.create_surrounding_element_features(
                row, self.rect_boxes, min=3),
            axis=1)

        return {"content": self.pandas,
                "media_boxes": self.media_boxes,
                "page_count": self.n}

    @staticmethod
    def _parse_font_name(raw_name):
        """Split a raw font name into ``(name, code, bold, italic)``.

        A ``"CODE+Name"`` value is split at ``"+"``: the first part is the
        code and the second part the name. ``bold``/``italic`` are 1 when the
        name contains ``"Bold"``/``"Italic"``; those markers and the
        characters ``+``, ``-`` and ``,`` are then removed from the name.
        """
        code = ""
        name = raw_name
        if "+" in name:
            parts = name.split("+")
            code = parts[0]
            name = parts[1]
        italic = 1 if "Italic" in name else 0
        bold = 1 if "Bold" in name else 0
        for marker in ("+", "-", "Bold", "Italic", ","):
            name = name.replace(marker, "")
        return name, code, bold, italic

    def _append_text_line(self, obj, res, n, page_height):
        """Append the row for one text line to ``res`` unless it is blank."""
        y1 = page_height - obj.y1
        y0 = page_height - obj.y0
        pos_x = (obj.x0 + obj.x1) / 2
        pos_y = (y0 + y1) / 2

        # The height of the line is used as the font size.
        self.font_size = abs(y1 - y0)
        raw_text = obj.get_text()
        rgb = self.get_rgb(raw_text)  # computed on the uncleaned text
        text = self.clean_text(raw_text)
        masked_text = self.mask_text(text)
        tag = self.get_tag(text)

        # Use the first child that carries a font name; children without a
        # ``fontname`` attribute are skipped instead of raising.
        font_name_original = next(
            (child.fontname for child in obj._objs
             if hasattr(child, "fontname")),
            "")
        self.font_name, code, bold, italic = \
            self._parse_font_name(font_name_original)

        if len(text.replace(" ", "")) != 0:  # filter empty text
            res.append(
                [
                    len(res), n, raw_text.replace('\n', ' '),
                    obj.x0, obj.x1,
                    y0, y1,
                    pos_x, pos_y,
                    (pos_x, page_height - pos_y - page_height * n),
                    font_name_original,
                    self.font_name, code, bold, italic,
                    self.font_size,
                    masked_text, rgb,
                    len(text), len(text.split(" ")),
                    tag, self.box_id
                ]
            )

    def get_objects(self, layout_objs, res, n, media_boxes):
        """Walk layout objects of page ``n`` and collect rows and elements.

        Text lines are appended to ``res``; text boxes increase ``box_id``
        and are descended into; every other object is handed to
        ``add_visual_elements`` with a kind derived from its class.

        :param layout_objs: iterable of pdfminer layout objects.
        :param res: list of rows, extended in place.
        :param n: zero-based page index.
        :param media_boxes: dict holding ``media_boxes[n]["y1page"]``.
        :return: ``res``.
        """
        page_height = media_boxes[n]["y1page"]
        for obj in layout_objs:
            if isinstance(obj, LTTextLine):
                self._append_text_line(obj, res, n, page_height)
            elif isinstance(obj, LTTextBox):
                self.box_id = self.box_id + 1
                self.get_objects(obj._objs, res, n, media_boxes)
            else:
                kind = ""
                if isinstance(obj, (LTRect, LTCurve, LTLine)):
                    # Shapes wider and taller than 10 units count as
                    # rectangles, anything thinner as a single line.
                    kind = "rectangle" if obj.height > 10 and obj.width > 10 \
                        else "line"
                elif isinstance(obj, LTFigure):
                    kind = "figure"
                elif isinstance(obj, LTImage):
                    kind = "image"
                self.add_visual_elements(kind, n, obj, page_height)

        return res

    def clean_text(self, text):
        """Replace line breaks by spaces, collapse space runs and strip."""
        text = text.replace("\\x0", " ").replace('\n', ' ').replace('\r', ' ')
        text = re.sub(" +", " ", text)
        return text.strip()

    def get_tag(self, text):
        """Return ``"key"`` when ``text`` ends with a colon, else ``"value"``."""
        if len(text) > 0 and text[-1] == ":":
            return "key"
        return "value"

    def mask_text(self, text):
        """Replace every run of digits by ``#`` and lower-case the text."""
        text = re.sub(r"\d+", "#", text)
        return text.lower()

    def get_rgb(self, text):
        """Return the character-class shares of ``text`` as a 4-tuple.

        The tuple holds the fractions of letters, digits, other symbols and
        sentence punctuation (``, . ! ?`` followed by whitespace or the end
        of the text), each relative to ``len(text)``. An empty text yields
        ``(0, 0, 0, 0)``.
        """
        len_all = len(text)
        len_text = len(re.findall("[A-Za-zÄÖÜäöü]", text))
        len_digits = len(re.findall("[0-9]", text))
        len_text_symbols = len(re.findall(r"[,\.!\?](\s|$)", text))
        len_symbols = len_all - len_text - len_digits - len_text_symbols
        if len_all > 0:
            return (len_text / len_all,  # text
                    len_digits / len_all,  # digits
                    len_symbols / len_all,  # symbols
                    len_text_symbols / len_all)  # text symbols
        return (0, 0, 0, 0)

    def add_visual_elements(self, type, num_pages, obj, page_height):
        """Record a drawn element in ``rect_boxes`` or ``plot_boxes``.

        :param type: ``"line"``, ``"rectangle"``, ``"image"`` or
            ``"figure"``; any other value is ignored. (The name shadows the
            builtin but is kept for compatibility with existing callers.)
        :param num_pages: page index stored with the element.
        :param obj: object exposing ``x0``, ``x1``, ``y0`` and ``y1``.
        :param page_height: height used to convert the y coordinates.
        """
        top = page_height - obj.y1
        bottom = page_height - obj.y0
        if type == "line":
            # add single line
            self.rect_boxes.append(["line", num_pages,
                                    round(obj.x0), round(obj.x1),
                                    round(top), round(bottom)])

        elif type == "rectangle":
            # A rectangle is stored as its four edges, each one unit thick,
            # so that it can be matched like ordinary lines.
            # bottom
            self.rect_boxes.append(["line", num_pages,
                                    round(obj.x0), round(obj.x1),
                                    round(bottom), round(bottom + 1)])
            # left
            self.rect_boxes.append(["line", num_pages,
                                    round(obj.x0 - 1), round(obj.x0),
                                    round(top), round(bottom)])
            # right
            self.rect_boxes.append(["line", num_pages,
                                    round(obj.x1), round(obj.x1 + 1),
                                    round(top), round(bottom)])
            # top
            self.rect_boxes.append(["line", num_pages,
                                    round(obj.x0), round(obj.x1),
                                    round(top - 1), round(top)])

        elif type == "image" or type == "figure":
            self.plot_boxes.append([type, num_pages,
                                    obj.x0, obj.x1, top, bottom])

    def create_surrounding_element_features(self, location, elements, min):
        """Add ``in_element_ids`` and ``in_element`` to a text-line row.

        :param location: row (mapping) with ``page``, ``x_0``, ``x_1``,
            ``y_0`` and ``y_1``; modified in place.
        :param elements: list of element entries such as ``rect_boxes``.
        :param min: minimum number of sides (out of four) that must have a
            neighbouring line for the row to count as inside a rectangle.
        :return: the modified ``location``.
        """
        lines = self.get_surrounding_lines(location, elements)
        location["in_element_ids"] = lines
        location["in_element"] = \
            "rectangle" if lines.count(-1) <= 4 - min else "none"
        return location

    def get_surrounding_rectangles(self, location, elements):
        """Return the indices of ``"rectangle"`` entries enclosing ``location``.

        NOTE(review): ``add_visual_elements`` stores rectangles as ``"line"``
        entries, so with ``rect_boxes`` as input this always returns ``None``.
        The vertical comparisons also look inverted relative to the stored
        top/bottom order - confirm before relying on this method.

        :return: list of indices, or ``None`` when nothing matches.
        """
        rectangles = [entry for entry in elements
                      if entry[0] == "rectangle"
                      and entry[1] == location["page"]]
        rect_ids = []
        for i, rect in enumerate(rectangles):
            if location["x_0"] >= rect[2] and location["x_1"] <= rect[3] and \
                    location["y_0"] >= rect[5] and location["y_1"] <= rect[4]:
                rect_ids.append(i)
        return rect_ids if rect_ids != [] else None

    def get_surrounding_lines(self, location, elements):
        """Find the nearest line on each side of a text line.

        Only ``"line"`` entries on the same page as ``location`` are
        considered.

        :return: list ``[left, right, top, bottom]`` holding the matching
            coordinate of the nearest line on that side (not an index), or
            ``-1`` where no line qualifies.
        """
        lines = [entry for entry in elements
                 if entry[0] == "line" and entry[1] == location["page"]]
        left_dist = right_dist = bottom_dist = top_dist = np.inf
        left_id = right_id = bottom_id = top_id = -1

        for line in lines:
            # A horizontal neighbour has to span the full width of the text.
            spans_width = line[2] <= location["x_0"] \
                and location["x_1"] <= line[3]
            # A vertical neighbour has to overlap the text vertically.
            overlaps_height = line[5] >= location["y_0"] \
                and location["y_1"] >= line[4]

            # top
            if spans_width and location["y_1"] >= line[4]:
                dist = location["y_1"] - line[4]
                if dist < top_dist:
                    top_dist = dist
                    top_id = line[4]
            # bottom
            if spans_width and location["y_0"] <= line[5]:
                dist = line[5] - location["y_0"]
                if dist < bottom_dist:
                    bottom_dist = dist
                    bottom_id = line[5]
            # left (a tolerance of 2 units is allowed)
            if location["x_0"] + 2 >= line[2] and overlaps_height:
                dist = location["x_0"] - line[2]
                if dist < left_dist:
                    left_dist = dist
                    left_id = line[2]
            # right (a tolerance of 2 units is allowed)
            if location["x_1"] - 2 <= line[3] and overlaps_height:
                dist = line[3] - location["x_1"]
                if dist < right_dist:
                    right_dist = dist
                    right_id = line[3]

        return [left_id, right_id, top_id, bottom_id]

    def pdf2pandas(self):
        """Return the DataFrame of text lines, converting the PDF if needed."""
        if self.pandas is None:
            self.convert()
        return self.pandas

    def get_media_boxes(self):
        """Return the per-page box dictionary, converting the PDF if needed."""
        if self.media_boxes is None:
            self.convert()
        return self.media_boxes

    def get_page_count(self):
        """Return the number of parsed pages, converting the PDF if needed."""
        if self.n is None:
            self.convert()
        return self.n

converter/util/StorageUtil.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
import pickle
2+
3+
4+
def save_object(obj, path, name):
    """Pickle ``obj`` into the file ``path + name + '.pkl'``.

    ``path`` is concatenated as-is, so it has to end with a path separator
    when it names a directory. The highest available pickle protocol is used.
    """
    target = "".join([path, name, '.pkl'])
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
7+
8+
9+
def load_object(path, name):
    """Load and return the object pickled in ``path + name + '.pkl'``.

    ``path`` is concatenated as-is, so it has to end with a path separator
    when it names a directory.

    NOTE: unpickling can execute arbitrary code; only load files from a
    trusted source.
    """
    source = "".join([path, name, '.pkl'])
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
12+
13+
14+
def get_file_name(path):
    """Return the part of ``path`` after the last ``"/"``.

    A path without ``"/"`` is returned unchanged; a path ending in ``"/"``
    yields an empty string.
    """
    _, _, file_name = path.rpartition("/")
    return file_name
17+
18+
19+
def replace_file_type(file_name, new_type):
    """Return ``file_name`` with its extension replaced by ``new_type``.

    Only the text after the last ``"."`` is exchanged; identical text
    elsewhere in the name is left alone (``"pdf_report.pdf"`` becomes
    ``"pdf_report.csv"``). When the name has no extension - no ``"."`` at
    all, or the last ``"."`` belongs to a directory part - ``"." + new_type``
    is appended.

    :param file_name: file name, optionally with a leading path.
    :param new_type: new extension without the leading dot.
    """
    stem, dot, extension = file_name.rpartition(".")
    if not dot or "/" in extension:
        # No real extension to replace: append the new one instead.
        return file_name + "." + new_type
    return stem + "." + new_type
22+
23+
24+
def cut_file_type(file_name):
    """Return ``file_name`` without its trailing extension.

    Only the last ``"."`` and the text after it are removed; identical text
    earlier in the name is kept (``"my.pdfs.pdf"`` becomes ``"my.pdfs"``).
    A name without an extension - no ``"."`` at all, or the last ``"."``
    belongs to a directory part - is returned unchanged.

    :param file_name: file name, optionally with a leading path.
    """
    stem, dot, extension = file_name.rpartition(".")
    if not dot or "/" in extension:
        return file_name
    return stem

converter/util/constants.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
2+
# Relative directory paths for CSV and PDF files.
CSV_PATH = "./csv/"
PDF_PATH = "./pdf/"

# pdfminer layout-analysis parameters.
# When USE_CUSTOM_PDF_PARAMETERS is true, PDFContentConverter passes the
# DEFAULT_* values below to pdfminer's LAParams under the matching keyword
# (detect_vertical, line_overlap, line_margin, word_margin, char_margin,
# boxes_flow); otherwise only detect_vertical=True is set.
USE_CUSTOM_PDF_PARAMETERS = True
DEFAULT_DETECT_VERTICAL = True
DEFAULT_LINE_OVERLAP = 0.5
DEFAULT_LINE_MARGIN = 0.15
DEFAULT_WORD_MARGIN = 0.1
DEFAULT_CHAR_MARGIN = 0.5
DEFAULT_BOXES_FLOW = 0.5

0 commit comments

Comments
 (0)