Skip to content

Commit 6d2d010

Browse files
authored
Merge pull request #24 from jsvine/develop
v0.5.1; minor fixes/tweaks, adds line quick-draw
2 parents ba9d7ab + e204313 commit 6d2d010

File tree

7 files changed

+81
-32
lines changed

7 files changed

+81
-32
lines changed

CHANGELOG.md

+12
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. Currently g
44

55
The format is based on [Keep a Changelog](http://keepachangelog.com/).
66

7+
## [0.5.1] - 2017-02-26
8+
### Added
9+
- Quick-draw `PageImage` methods: `.draw_vline`, `.draw_vlines`, `.draw_hline`, and `.draw_hlines`.
10+
- Boolean parameter `keep_blank_chars` for `.extract_words(...)` and `TableFinder` settings.
11+
12+
### Changed
13+
- Increased default `text_tolerance` and `intersection_tolerance` TableFinder values from 1 to 3.
14+
15+
### Fixed
16+
- Properly handle conversion of PDFs with transparency to `pillow` images.
17+
- Properly handle `pandas` DataFrames as inputs to multi-draw commands (e.g., `PageImage.draw_rects(...)`).
18+
719
## [0.5.0] - 2017-02-25
820
### Added
921
- Visual debugging features, via `Page.to_image(...)` and `PageImage`. (Introduces `wand` and `pillow` as package requirements.)

README.md

+12-8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# PDFPlumber `v0.5.0`
1+
# PDFPlumber `v0.5.1`
22

33
Plumb a PDF for detailed information about each text character, rectangle, and line. Plus: Table extraction and visual debugging.
44

@@ -189,11 +189,13 @@ im = my_pdf.page[0].to_image(resolution=150)
189189

190190
You can pass explicit coordinates or any `pdfplumber` PDF object (e.g., char, line, rect) to these methods.
191191

192-
| Single-object method | Bulk method |
193-
|----------------------|-------------|
194-
|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`|
195-
|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`|
196-
|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`|
192+
| Single-object method | Bulk method | Description |
193+
|----------------------|-------------|-------------|
194+
|`im.draw_line(line, stroke={color}, stroke_width=1)`| `im.draw_lines(list_of_lines, **kwargs)`| Draws a line from a `line`-like object, or a 4-tuple bounding box.|
195+
|`im.draw_vline(location, stroke={color}, stroke_width=1)`| `im.draw_vlines(list_of_locations, **kwargs)`| Draws a vertical line at the x-coordinate indicated by `location`.|
196+
|`im.draw_hline(location, stroke={color}, stroke_width=1)`| `im.draw_hlines(list_of_locations, **kwargs)`| Draws a horizontal line at the y-coordinate indicated by `location`.|
197+
|`im.draw_rect(bbox_or_obj, fill={color}, stroke={color}, stroke_width=1)`| `im.draw_rects(list_of_rects, **kwargs)`| Draws a rectangle from a `rect`, `char`, etc., or 4-tuple bounding box.|
198+
|`im.draw_circle(center_or_obj, radius=5, fill={color}, stroke={color})`| `im.draw_circles(list_of_circles, **kwargs)`| Draws a circle at `(x, y)` coordinate or at the center of a `char`, `rect`, etc.|
197199

198200
Note: The methods above are built on Pillow's [`ImageDraw` methods](http://pillow.readthedocs.io/en/latest/reference/ImageDraw.html), but the parameters have been tweaked for consistency with SVG's `fill`/`stroke`/`stroke_width` nomenclature.
199201

@@ -242,10 +244,11 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
242244
"join_tolerance": 3,
243245
"edge_min_length": 3,
244246
"text_word_threshold": 3,
245-
"text_tolerance": 1,
247+
"keep_blank_chars": False,
248+
"text_tolerance": 3,
246249
"text_x_tolerance": None,
247250
"text_y_tolerance": None,
248-
"intersection_tolerance": 1,
251+
"intersection_tolerance": 3,
249252
"intersection_x_tolerance": None,
250253
"intersection_y_tolerance": None,
251254
}
@@ -261,6 +264,7 @@ By default, `extract_tables` uses the page's vertical and horizontal lines (or r
261264
|`"join_tolerance"`| Line segments on the same infinite line, and whose ends are within `join_tolerance` of one another, will be "joined" into a single line segment.|
262265
|`"edge_min_length"`| Edges shorter than `edge_min_length` will be discarded before attempting to reconstruct the table.|
263266
|`"text_word_threshold"`| When using the `text` strategy, at least `text_word_threshold` words must share the same alignment.|
267+
|`"keep_blank_chars"`| When using the `text` strategy, consider `" "` chars to be *parts* of words and not word-separators.|
264268
|`"text_tolerance"`, `"text_x_tolerance"`, `"text_y_tolerance"`| When the `text` strategy searches for words, it will expect the individual letters in each word to be no more than `text_tolerance` pixels apart.|
265269
|`"intersection_tolerance"`, `"intersection_x_tolerance"`, `"intersection_y_tolerance"`| When combining edges into cells, orthogonal edges must be within `intersection_tolerance` pixels to be considered intersecting.|
266270

pdfplumber/_version.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
version_info = (0, 5, 0)
1+
version_info = (0, 5, 1)
22
__version__ = '.'.join(map(str, version_info))

pdfplumber/display.py

+41-5
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,12 @@ def get_page_image(pdf_path, page_no, resolution):
2424
page_path = "{0}[{1}]".format(pdf_path, page_no)
2525
with wand.image.Image(filename=page_path, resolution=resolution) as img:
2626
with img.convert("png") as png:
27-
im = PIL.Image.open(BytesIO(png.make_blob())).convert("RGB")
28-
return im
27+
im = PIL.Image.open(BytesIO(png.make_blob()))
28+
if "transparency" in im.info:
29+
converted = im.convert("RGBA").convert("RGB")
30+
else:
31+
converted = im.convert("RGB")
32+
return converted
2933

3034
class PageImage(object):
3135
def __init__(self, page, original=None, resolution=DEFAULT_RESOLUTION):
@@ -100,10 +104,42 @@ def draw_line(self, points_or_line,
100104
return self
101105

102106
def draw_lines(self, list_of_lines, **kwargs):
103-
for x in list_of_lines:
107+
for x in utils.to_list(list_of_lines):
104108
self.draw_line(x, **kwargs)
105109
return self
106110

111+
def draw_vline(self, location,
112+
stroke=DEFAULT_STROKE,
113+
stroke_width=DEFAULT_STROKE_WIDTH):
114+
points = (location, self.page.bbox[1], location, self.page.bbox[3])
115+
self.draw.line(
116+
self._reproject_bbox(points),
117+
fill=stroke,
118+
width=stroke_width
119+
)
120+
return self
121+
122+
def draw_vlines(self, locations, **kwargs):
123+
for x in utils.to_list(locations):
124+
self.draw_vline(x, **kwargs)
125+
return self
126+
127+
def draw_hline(self, location,
128+
stroke=DEFAULT_STROKE,
129+
stroke_width=DEFAULT_STROKE_WIDTH):
130+
points = (self.page.bbox[0], location, self.page.bbox[2], location)
131+
self.draw.line(
132+
self._reproject_bbox(points),
133+
fill=stroke,
134+
width=stroke_width
135+
)
136+
return self
137+
138+
def draw_hlines(self, locations, **kwargs):
139+
for x in utils.to_list(locations):
140+
self.draw_hline(x, **kwargs)
141+
return self
142+
107143
def draw_rect(self, bbox_or_obj,
108144
fill=DEFAULT_FILL,
109145
stroke=DEFAULT_STROKE,
@@ -142,7 +178,7 @@ def draw_rect(self, bbox_or_obj,
142178
return self
143179

144180
def draw_rects(self, list_of_rects, **kwargs):
145-
for x in list_of_rects:
181+
for x in utils.to_list(list_of_rects):
146182
self.draw_rect(x, **kwargs)
147183
return self
148184

@@ -168,7 +204,7 @@ def draw_circle(self, center_or_obj,
168204
return self
169205

170206
def draw_circles(self, list_of_circles, **kwargs):
171-
for x in list_of_circles:
207+
for x in utils.to_list(list_of_circles):
172208
self.draw_circle(x, **kwargs)
173209
return self
174210

pdfplumber/page.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ def extract_text(self,
127127

128128
def extract_words(self,
129129
x_tolerance=utils.DEFAULT_X_TOLERANCE,
130-
y_tolerance=utils.DEFAULT_Y_TOLERANCE):
130+
y_tolerance=utils.DEFAULT_Y_TOLERANCE,
131+
keep_blank_chars=False):
131132

132133
return utils.extract_words(self.chars,
133134
x_tolerance=x_tolerance,

pdfplumber/table.py

+9-15
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,7 @@ def get_group(edge):
8787
return edges
8888

8989
def words_to_edges_h(words,
90-
word_threshold=3,
91-
join_tolerance=DEFAULT_JOIN_TOLERANCE,
92-
snap_tolerance=DEFAULT_SNAP_TOLERANCE):
90+
word_threshold=3):
9391
"""
9492
Find (imaginary) horizontal lines that connect the tops of at least `word_threshold` words.
9593
"""
@@ -116,14 +114,10 @@ def words_to_edges_h(words,
116114
"orientation": "h"
117115
} for r in rects ]
118116

119-
return merge_edges(edges,
120-
join_tolerance=join_tolerance,
121-
snap_tolerance=snap_tolerance)
117+
return edges
122118

123119
def words_to_edges_v(words,
124-
word_threshold=3,
125-
join_tolerance=DEFAULT_JOIN_TOLERANCE,
126-
snap_tolerance=DEFAULT_SNAP_TOLERANCE):
120+
word_threshold=3):
127121
"""
128122
Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words.
129123
"""
@@ -185,9 +179,7 @@ def words_to_edges_v(words,
185179
"orientation": "v"
186180
} ]
187181

188-
return merge_edges(edges,
189-
join_tolerance=join_tolerance,
190-
snap_tolerance=snap_tolerance)
182+
return edges
191183

192184
def edges_to_intersections(edges, x_tolerance=1, y_tolerance=1):
193185
"""
@@ -404,10 +396,11 @@ def char_in_bbox(char, bbox):
404396
"join_tolerance": DEFAULT_JOIN_TOLERANCE,
405397
"edge_min_length": 3,
406398
"text_word_threshold": 3,
407-
"text_tolerance": 1,
399+
"keep_blank_chars": False,
400+
"text_tolerance": 3,
408401
"text_x_tolerance": None,
409402
"text_y_tolerance": None,
410-
"intersection_tolerance": 1,
403+
"intersection_tolerance": 3,
411404
"intersection_x_tolerance": None,
412405
"intersection_y_tolerance": None,
413406
}
@@ -479,7 +472,8 @@ def get_edges(self):
479472
yt = settings["text_tolerance"]
480473
words = self.page.extract_words(
481474
x_tolerance=xt,
482-
y_tolerance=yt
475+
y_tolerance=yt,
476+
keep_blank_chars=settings["keep_blank_chars"]
483477
)
484478

485479
def v_edge_desc_to_edge(desc):

pdfplumber/utils.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,9 @@ def bbox_to_rect(bbox):
130130

131131
def extract_words(chars,
132132
x_tolerance=DEFAULT_X_TOLERANCE,
133-
y_tolerance=DEFAULT_Y_TOLERANCE):
133+
y_tolerance=DEFAULT_Y_TOLERANCE,
134+
keep_blank_chars=False
135+
):
134136

135137
x_tolerance = decimalize(x_tolerance)
136138
y_tolerance = decimalize(y_tolerance)
@@ -153,7 +155,7 @@ def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
153155
current_word = []
154156

155157
for char in chars_sorted:
156-
if get_text(char) == " ":
158+
if not keep_blank_chars and get_text(char) == " ":
157159
if len(current_word) > 0:
158160
words.append(current_word)
159161
current_word = []

0 commit comments

Comments
 (0)