Skip to content

Commit d41a14a

Browse files
holtskinner and galz10 authored
chore: minor refactoring for readability/simplicity (#42)
- Fixed type mismatch on `_table_wrapper_from_documentai_table`
- Simplified boolean checks
- Removed extra local variables
- Updated function names/docs for clarity

Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
1 parent ee19b02 commit d41a14a

File tree

4 files changed

+32
-46
lines changed

4 files changed

+32
-46
lines changed

packages/google-cloud-documentai-toolbox/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ pip-log.txt
2929
.nox
3030
.cache
3131
.pytest_cache
32+
.mypy_cache/
3233

3334

3435
# Mac

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/document.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ def from_document_path(
250250
251251
Args:
252252
document_path (str):
253-
Required. The path to the resp.
253+
Required. The path to the document.json file.
254254
Returns:
255255
Document:
256256
A document from local document_path.
@@ -316,22 +316,16 @@ def search_pages(
316316
A list of Pages.
317317
318318
"""
319-
if (target_string is None and pattern is None) or (
320-
target_string is not None and pattern is not None
321-
):
319+
if (target_string and pattern) or (not target_string and not pattern):
322320
raise ValueError(
323321
"Exactly one of target_string and pattern must be specified."
324322
)
325323

326324
found_pages = []
327325
for page in self.pages:
328326
for paragraph in page.paragraphs:
329-
if target_string is not None and target_string in paragraph.text:
330-
found_pages.append(page)
331-
break
332-
elif (
333-
pattern is not None
334-
and re.search(pattern, paragraph.text) is not None
327+
if (target_string and target_string in paragraph.text) or (
328+
pattern and re.search(pattern, paragraph.text)
335329
):
336330
found_pages.append(page)
337331
break

packages/google-cloud-documentai-toolbox/google/cloud/documentai_toolbox/wrappers/page.py

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,14 @@ def to_dataframe(self) -> pd.DataFrame:
5454
The DataFrame of the table.
5555
5656
"""
57-
dataframe = None
58-
5957
if not self.body_rows:
60-
dataframe = pd.DataFrame(columns=self.header_rows)
61-
else:
62-
if self.header_rows != []:
63-
dataframe = pd.DataFrame(self.body_rows)
64-
dataframe.columns = self.header_rows
65-
else:
58+
return pd.DataFrame(columns=self.header_rows)
6659

67-
dataframe = pd.DataFrame(self.body_rows)
68-
dataframe.columns = [None] * len(self.body_rows[0])
60+
dataframe = pd.DataFrame(self.body_rows)
61+
if self.header_rows:
62+
dataframe.columns = self.header_rows
63+
else:
64+
dataframe.columns = [None] * len(self.body_rows[0])
6965

7066
return dataframe
7167

@@ -102,13 +98,13 @@ def sample_table_to_csv():
10298

10399

104100
def _table_wrapper_from_documentai_table(
105-
documentai_table: List[documentai.Document.Page.Table], text: str
101+
documentai_table: documentai.Document.Page.Table, text: str
106102
) -> Table:
107103
r"""Returns a Table.
108104
109105
Args:
110-
documentai_tables (List[documentai.Document.Page.Table]):
111-
Required. A list of documentai.Document.Page.Table.
106+
documentai_table (documentai.Document.Page.Table):
107+
Required. A documentai.Document.Page.Table.
112108
text (str):
113109
Required. UTF-8 encoded text in reading order
114110
from the document.
@@ -119,22 +115,17 @@ def _table_wrapper_from_documentai_table(
119115
120116
"""
121117

122-
header_rows = []
123-
body_rows = []
124-
125-
header_rows = _table_row_from_documentai_table_row(
118+
header_rows = _table_rows_from_documentai_table_rows(
126119
table_rows=documentai_table.header_rows, text=text
127120
)
128-
body_rows = _table_row_from_documentai_table_row(
121+
body_rows = _table_rows_from_documentai_table_rows(
129122
table_rows=documentai_table.body_rows, text=text
130123
)
131124

132-
result = Table(
125+
return Table(
133126
documentai_table=documentai_table, body_rows=body_rows, header_rows=header_rows
134127
)
135128

136-
return result
137-
138129

139130
@dataclasses.dataclass
140131
class Paragraph:
@@ -185,13 +176,11 @@ def _text_from_element_with_layout(
185176

186177
result_text = ""
187178

188-
if element_with_layout.layout.text_anchor.text_segments == []:
179+
if not element_with_layout.layout.text_anchor.text_segments:
189180
return ""
190-
else:
191-
for text_segment in element_with_layout.layout.text_anchor.text_segments:
192-
result_text += text[
193-
int(text_segment.start_index) : int(text_segment.end_index)
194-
]
181+
182+
for text_segment in element_with_layout.layout.text_anchor.text_segments:
183+
result_text += text[int(text_segment.start_index) : int(text_segment.end_index)]
195184

196185
return result_text
197186

@@ -254,10 +243,10 @@ def _get_lines(lines: List[documentai.Document.Page.Line], text: str) -> List[Li
254243
return result
255244

256245

257-
def _table_row_from_documentai_table_row(
246+
def _table_rows_from_documentai_table_rows(
258247
table_rows: List[documentai.Document.Page.Table.TableRow], text: str
259248
) -> List[str]:
260-
r"""Returns a list rows from table_rows.
249+
r"""Returns a list of rows from table_rows.
261250
262251
Args:
263252
table_rows (List[documentai.Document.Page.Table.TableRow]):

packages/google-cloud-documentai-toolbox/tests/unit/test_page.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222

2323
@pytest.fixture
2424
def docproto():
25-
with open("tests/unit/resources/0/toolbox_invoice_test-0.json", "r") as f:
25+
with open(
26+
"tests/unit/resources/0/toolbox_invoice_test-0.json", "r", encoding="utf-8"
27+
) as f:
2628
return documentai.Document.from_json(f.read())
2729

2830

@@ -117,22 +119,22 @@ def test_table_wrapper_from_documentai_table(docproto):
117119
assert len(table.header_rows[0]) == 4
118120

119121

120-
def test_header_for_table_row_from_documentai_table_row(docproto):
122+
def test_header_for_table_rows_from_documentai_table_rows(docproto):
121123
docproto_page = docproto.pages[0]
122124

123-
header_row = page._table_row_from_documentai_table_row(
125+
header_rows = page._table_rows_from_documentai_table_rows(
124126
table_rows=docproto_page.tables[0].header_rows, text=docproto.text
125127
)
126-
assert header_row == [["Item Description", "Quantity", "Price", "Amount"]]
128+
assert header_rows == [["Item Description", "Quantity", "Price", "Amount"]]
127129

128130

129-
def test_body_for_table_row_from_documentai_table_row(docproto):
131+
def test_body_for_table_rows_from_documentai_table_rows(docproto):
130132
docproto_page = docproto.pages[0]
131133

132-
body_row = page._table_row_from_documentai_table_row(
134+
body_rows = page._table_rows_from_documentai_table_rows(
133135
table_rows=docproto_page.tables[0].body_rows, text=docproto.text
134136
)
135-
assert body_row == [
137+
assert body_rows == [
136138
["Tool A", "500", "$1.00", "$500.00"],
137139
["Service B", "1", "$900.00", "$900.00"],
138140
["Resource C", "50", "$12.00", "$600.00"],

0 commit comments

Comments
 (0)