|
| 1 | +import pandas as pd |
| 2 | +from pdfminer.converter import PDFPageAggregator |
| 3 | +from pdfminer.layout import LAParams, LTTextLine, LTTextBox, LTImage, \ |
| 4 | + LTFigure, LTLine, LTRect, LTCurve |
| 5 | +from pdfminer.pdfdocument import PDFDocument |
| 6 | +from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager |
| 7 | +from pdfminer.pdfpage import PDFPage |
| 8 | +from pdfminer.pdfparser import PDFParser |
| 9 | +import re |
| 10 | +import numpy as np |
| 11 | +from converter.util import constants |
| 12 | + |
| 13 | + |
class PDFContentConverter(object):
    """Convert the text lines of a PDF into a pandas DataFrame of layout features.

    Each non-empty text line found by pdfminer becomes one row holding its
    text, coordinates, font information and a few simple text statistics.
    Lines, rectangles and curves are collected in ``rect_boxes``; images and
    figures are collected in ``plot_boxes``.
    """

    def __init__(self, pdf):
        """Store the path of the PDF; nothing is parsed until requested.

        :param pdf: path of the PDF file (passed to ``open(..., "rb")``).
        """
        self.pdf = pdf
        self.font_size = None   # height of the most recently parsed text line
        self.font_name = None   # cleaned font name of the most recent text line
        self.box_id = -1        # running index of the current text box on a page
        self.rect_boxes = []    # [type, page, x0, x1, y_top, y_bottom] line entries
        self.plot_boxes = []    # [type, page, x0, x1, y_top, y_bottom] images/figures
        self.res = None         # list of row lists, filled by parse_document()
        self.pandas = None      # DataFrame built by convert()
        self.media_boxes = None  # page index -> crop box / media box coordinates
        self.n = None           # number of processed pages

    def parse_document(self):
        """Parse the PDF and collect text rows and visual elements.

        All per-document state (``res``, ``media_boxes``, ``n``,
        ``rect_boxes``, ``plot_boxes``) is reset first so that calling this
        method repeatedly does not accumulate duplicates.

        :return: tuple ``(rows, media_boxes)``; both are empty when the
            document is not extractable.
        """
        self.res = []  # result set
        self.media_boxes = dict()  # media coordinate dictionary
        self.n = 0  # page count
        self.rect_boxes = []
        self.plot_boxes = []

        if constants.USE_CUSTOM_PDF_PARAMETERS:
            la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                                 line_overlap=constants.DEFAULT_LINE_OVERLAP,
                                 line_margin=constants.DEFAULT_LINE_MARGIN,
                                 word_margin=constants.DEFAULT_WORD_MARGIN,
                                 char_margin=constants.DEFAULT_CHAR_MARGIN,
                                 boxes_flow=constants.DEFAULT_BOXES_FLOW)
        else:
            la_params = LAParams(detect_vertical=True)

        # The page iterator is consumed inside the ``with`` block, so the
        # file stays open for as long as pages are being processed and is
        # closed afterwards, even if parsing raises.
        with open(self.pdf, "rb") as pdf:
            pdf_parser = PDFParser(pdf)
            pdf_document = PDFDocument(pdf_parser)

            if pdf_document.is_extractable:
                resource_manager = PDFResourceManager()
                page_aggregator = PDFPageAggregator(resource_manager,
                                                    laparams=la_params)
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      page_aggregator)

                for page in PDFPage.create_pages(pdf_document):
                    page_interpreter.process_page(page)
                    layout = page_aggregator.get_result()
                    crop_box = page.cropbox
                    page_box = page.mediabox
                    self.media_boxes[self.n] = {
                        "x0": crop_box[0], "y0": crop_box[1],
                        "x1": crop_box[2], "y1": crop_box[3],
                        "x0page": page_box[0], "y0page": page_box[1],
                        "x1page": page_box[2], "y1page": page_box[3],
                    }
                    self.box_id = -1  # text box numbering restarts on every page
                    self.res = self.get_objects(layout._objs, self.res,
                                                self.n, self.media_boxes)
                    self.n += 1

        return self.res, self.media_boxes

    def convert(self):
        """Parse the document and build the feature DataFrame.

        :return: ``(None, None)`` when no text row was extracted, otherwise a
            dict with the keys ``"content"`` (DataFrame), ``"media_boxes"``
            and ``"page_count"``.
        """
        self.res, self.media_boxes = self.parse_document()
        if not self.res:
            return None, None
        self.pandas = pd.DataFrame(self.res)
        self.pandas.columns = ["id", "page", "text",
                               "x_0", "x_1", "y_0", "y_1",
                               "pos_x", "pos_y", "abs_pos",
                               "original_font", "font_name", "code", "bold", "italic", "font_size",
                               "masked", "frequency_hist",
                               "len_text", "n_tokens",
                               "tag", "box"]
        self.pandas = self.pandas.apply(
            lambda row: self.create_surrounding_element_features(row, self.rect_boxes, min=3),
            axis=1)

        return {"content": self.pandas,
                "media_boxes": self.media_boxes,
                "page_count": self.n}

    def get_objects(self, layout_objs, res, n, media_boxes):
        """Walk layout objects of one page and append a row per text line.

        Text boxes are descended into recursively; every other object is
        handed to :meth:`add_visual_elements`.

        :param layout_objs: iterable of pdfminer layout objects.
        :param res: list of rows; extended in place.
        :param n: index of the current page.
        :param media_boxes: dict as built by :meth:`parse_document`.
        :return: ``res``.
        """
        page_height = media_boxes[n]["y1page"]
        for obj in layout_objs:
            if isinstance(obj, LTTextLine):
                # y values are flipped by subtracting them from the page height
                y1 = page_height - obj.y1
                y0 = page_height - obj.y0
                pos_x = (obj.x0 + obj.x1) / 2
                pos_y = (y0 + y1) / 2

                self.font_size = abs(y1 - y0)
                raw_text = obj.get_text()
                rgb = self.get_rgb(raw_text)
                text = self.clean_text(raw_text)
                masked_text = self.mask_text(text)
                tag = self.get_tag(text)

                # Take the font from the first child that carries one; a
                # line whose first child has no ``fontname`` (or that has no
                # children) falls back to an empty name instead of raising.
                font_name_original = next(
                    (child.fontname for child in obj._objs
                     if hasattr(child, "fontname")),
                    "")
                self.font_name = font_name_original
                code = ""
                if "+" in self.font_name:
                    parts = self.font_name.split("+")
                    code = parts[0]
                    self.font_name = parts[1]
                italic = 1 if "Italic" in self.font_name else 0
                bold = 1 if "Bold" in self.font_name else 0
                self.font_name = self.font_name.replace("+", "").replace("-", "").\
                    replace("Bold", "").replace("Italic", "").replace(",", "")

                if len(text.replace(" ", "")) != 0:  # filter empty text
                    res.append(
                        [
                            len(res), n, raw_text.replace('\n', ' '),
                            obj.x0, obj.x1,
                            y0, y1,
                            pos_x, pos_y,
                            (pos_x, page_height - pos_y - page_height * n),
                            font_name_original,
                            self.font_name, code, bold, italic,
                            self.font_size,
                            masked_text, rgb,
                            len(text), len(text.split(" ")),
                            tag, self.box_id
                        ]
                    )
            elif isinstance(obj, LTTextBox):
                self.box_id = self.box_id + 1
                self.get_objects(obj._objs, res, n, media_boxes)
            else:
                kind = ""
                if isinstance(obj, (LTRect, LTCurve, LTLine)):
                    kind = "rectangle" if obj.height > 10 and obj.width > 10 else "line"
                elif isinstance(obj, LTFigure):
                    kind = "figure"
                elif isinstance(obj, LTImage):
                    kind = "image"
                # objects of any other class keep kind "" and are ignored below
                self.add_visual_elements(kind, n, obj, page_height)

        return res

    def clean_text(self, text):
        """Replace line breaks and the literal ``\\x0`` sequence by spaces,
        collapse runs of spaces and strip the result."""
        text = text.replace("\\x0", " ").replace('\n', ' ').replace('\r', ' ')
        text = re.sub(" +", " ", text)
        text = text.strip()
        return text

    def get_tag(self, text):
        """Return ``"key"`` if ``text`` ends with a colon, else ``"value"``."""
        if len(text) > 0 and text[-1] == ":":
            return "key"
        return "value"

    def mask_text(self, text):
        """Replace every run of digits by ``#`` and lower-case the text."""
        text = re.sub(r"\d+", "#", text)
        text = text.lower()
        return text

    def get_rgb(self, text):
        """Return the character-class shares of ``text`` as a 4-tuple.

        The tuple holds the fractions of letters, digits, remaining symbols
        and sentence punctuation (``, . ! ?`` followed by whitespace or the
        end of the string). An empty string yields ``(0, 0, 0, 0)``.
        """
        len_all = len(text)
        len_text = len(re.findall("[A-Za-zÄÖÜäöü]", text))
        len_digits = len(re.findall("[0-9]", text))
        len_text_symbols = len(re.findall(r"[,\.!\?](\s|$)", text))
        len_symbols = len_all - len_text - len_digits - len_text_symbols
        if len_all > 0:
            return (len_text / len_all,  # text
                    len_digits / len_all,  # digits
                    len_symbols / len_all,  # symbols
                    len_text_symbols / len_all)  # text symbols
        else:
            return (0, 0, 0, 0)

    def add_visual_elements(self, type, num_pages, obj, page_height):
        """Record a visual element in ``rect_boxes`` or ``plot_boxes``.

        :param type: ``"line"``, ``"rectangle"``, ``"image"`` or ``"figure"``;
            any other value is ignored. (The name shadows the builtin but is
            kept so keyword callers continue to work.)
        :param num_pages: index of the page the element belongs to.
        :param obj: object exposing ``x0``, ``x1``, ``y0`` and ``y1``.
        :param page_height: height used to flip the y coordinates.
        """
        top = page_height - obj.y1
        bottom = page_height - obj.y0

        if type == "line":
            # add single line
            self.rect_boxes.append(["line", num_pages, round(obj.x0), round(obj.x1),
                                    round(top), round(bottom)])

        elif type == "rectangle":
            # A rectangle is stored as its four edges, each a one unit thick
            # "line" entry; no entry of type "rectangle" is ever stored.
            edge = "line"
            # bottom
            self.rect_boxes.append([edge, num_pages,
                                    round(obj.x0), round(obj.x1),
                                    round(page_height - obj.y0), round(page_height - obj.y0 + 1)])
            # left
            self.rect_boxes.append([edge, num_pages,
                                    round(obj.x0 - 1), round(obj.x0),
                                    round(page_height - obj.y1), round(page_height - obj.y0)])
            # right
            self.rect_boxes.append([edge, num_pages,
                                    round(obj.x1), round(obj.x1 + 1),
                                    round(page_height - obj.y1), round(page_height - obj.y0)])
            # top
            self.rect_boxes.append([edge, num_pages,
                                    round(obj.x0), round(obj.x1),
                                    round(page_height - obj.y1 - 1), round(page_height - obj.y1)])

        elif type == "image" or type == "figure":
            self.plot_boxes.append([type, num_pages,
                                    obj.x0, obj.x1, top, bottom])

    def create_surrounding_element_features(self, location, elements, min):
        """Add ``in_element_ids`` and ``in_element`` to ``location``.

        :param location: row (mapping with item assignment) of a text line.
        :param elements: list of line entries as stored in ``rect_boxes``.
        :param min: minimum number of sides (out of four) on which a line
            must be found for the text to count as inside a rectangle.
        :return: the modified ``location``.
        """
        lines = self.get_surrounding_lines(location, elements)
        location["in_element_ids"] = lines
        location["in_element"] = "rectangle" if lines.count(-1) <= 4 - min else "none"
        return location

    def get_surrounding_rectangles(self, location, elements):
        """Return the indices of ``"rectangle"`` elements enclosing ``location``.

        NOTE(review): ``add_visual_elements`` stores rectangles as four
        ``"line"`` entries, so with ``rect_boxes`` as input this filter finds
        nothing and the method returns ``None``.

        :return: list of indices into the filtered rectangle list, or
            ``None`` when there is no match.
        """
        rectangles = [el for el in elements
                      if el[0] == "rectangle" and el[1] == location["page"]]
        rect_ids = []
        for i, rect in enumerate(rectangles):
            if location["x_0"] >= rect[2] and location["x_1"] <= rect[3] and \
                    location["y_0"] >= rect[5] and location["y_1"] <= rect[4]:
                rect_ids.append(i)
        return rect_ids if rect_ids != [] else None

    def get_surrounding_lines(self, location, elements):
        """Find the closest line on each side of ``location`` on its page.

        :param location: mapping with ``page``, ``x_0``, ``x_1``, ``y_0`` and
            ``y_1``.
        :param elements: list of entries as stored in ``rect_boxes``.
        :return: ``[left, right, top, bottom]`` where each item is the
            coordinate of the closest line on that side (``x0`` for left,
            ``x1`` for right, element index 4 for top, index 5 for bottom)
            or ``-1`` if no line qualifies.
        """
        lines = [el for el in elements
                 if el[0] == "line" and el[1] == location["page"]]
        left_dist = np.inf
        left_id = -1
        right_dist = np.inf
        right_id = -1
        bottom_dist = np.inf
        bottom_id = -1
        top_dist = np.inf
        top_id = -1

        for line in lines:
            # top
            if line[2] <= location["x_0"] and location["x_1"] <= line[3] and location["y_1"] >= line[4]:
                dist = location["y_1"] - line[4]
                if dist < top_dist:
                    top_dist = dist
                    top_id = line[4]
            # bottom
            if line[2] <= location["x_0"] and location["x_1"] <= line[3] and location["y_0"] <= line[5]:
                dist = line[5] - location["y_0"]
                if dist < bottom_dist:
                    bottom_dist = dist
                    bottom_id = line[5]
            # left (a tolerance of 2 units is allowed)
            if location["x_0"] + 2 >= line[2] and line[5] >= location["y_0"] and location["y_1"] >= line[4]:
                dist = location["x_0"] - line[2]
                if dist < left_dist:
                    left_dist = dist
                    left_id = line[2]
            # right (a tolerance of 2 units is allowed)
            if location["x_1"] - 2 <= line[3] and line[5] >= location["y_0"] and location["y_1"] >= line[4]:
                dist = line[3] - location["x_1"]
                if dist < right_dist:
                    right_dist = dist
                    right_id = line[3]

        ids = [left_id, right_id, top_id, bottom_id]
        return ids

    def pdf2pandas(self):
        """Return the feature DataFrame, converting the PDF on first use."""
        if self.pandas is None:
            self.convert()
        return self.pandas

    def get_media_boxes(self):
        """Return the media box dictionary, converting the PDF on first use."""
        if self.media_boxes is None:
            self.convert()
        return self.media_boxes

    def get_page_count(self):
        """Return the number of processed pages, converting the PDF on first use."""
        if self.n is None:
            self.convert()
        return self.n
0 commit comments