Skip to content

Commit 1182f66

Browse files
authored
Maintain Rows and Columns indices order (#28)
* Maintain col & row order * Display JobId & Wait message * Bug Fix release * md syntax fix
1 parent 5424423 commit 1182f66

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

ExtractTable/__init__.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,22 @@ def trigger_process(self, fp: BinaryIO, dup_check: bool = False, **kwargs) -> di
105105

106106
# GetResult if JobId is present in the response
107107
# Usually happens when processing PDF files or idempotent requests
108-
if 'JobId' in resp:
108+
if 'JobId' in resp and resp.get("JobStatus", "") == JobStatus.PROCESSING:
109+
if max_wait_time > 0:
110+
print("[Info]: Waiting to retrieve the output; JobId:", resp['JobId'])
111+
else:
112+
print("[Info]: JobId:", resp['JobId'])
109113
resp = self.get_result(resp['JobId'], max_wait_time=max_wait_time)
110114

111115
return resp
112116

113-
def bigfile_upload(self, filename):
114-
resp = self._make_request('post', HOST.BIGFILE, data={"filename": filename})
117+
def bigfile_upload(self, filepath):
118+
"""
119+
To aid big file processing by uploading the file first and triggering the process next
120+
:param filepath: filepath
121+
:return: a signed URL to upload the file
122+
"""
123+
resp = self._make_request('post', HOST.BIGFILE, data={"filename": filepath})
115124

116125
return resp
117126

@@ -132,7 +141,7 @@ def process_file(
132141
Example: '1,3,4' or '1,4-end' or 'all'.
133142
:param output_format: dataframe as default; Check `ExtractTable._OUTPUT_FORMATS` to see available options
134143
:param dup_check: Idempotent requests handler
135-
:param indexing: If row index is needed
144+
:param indexing: Whether to output row & column indices in the outputs other than df
136145
:param kwargs:
137146
max_wait_time: int, optional (default: 300);
138147
Maximum Time to wait before returning to the client
@@ -151,13 +160,13 @@ def process_file(
151160
with open(infile.filepath, 'rb') as fp:
152161
trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
153162
except ClientFileSizeError:
154-
big_gen = self.bigfile_upload(filename=os.path.basename(filepath))
163+
big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
155164
with open(filepath, 'rb') as ifile:
156165
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
157166
trigger_resp = self.trigger_process(None, signed_filename=os.path.basename(filepath), **kwargs)
158167

159168
for _type, _obj in trigger_resp.items():
160169
self.__setattr__(_type, _obj)
161170

162-
result = ConvertTo(data=trigger_resp, fmt=output_format, index=indexing).output
171+
result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
163172
return result

ExtractTable/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (2, 0, 0)
1+
VERSION = (2, 0, 1)
22
PRERELEASE = None # "alpha", "beta" or "rc"
33
REVISION = None
44

ExtractTable/common.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import tempfile
66
import warnings
7+
import collections
78

89
import pandas as pd
910

@@ -13,27 +14,32 @@ class ConvertTo:
1314
FORMATS = {"df", "dataframe", "json", "csv", "dict"}
1415
DEFAULT = "df"
1516

16-
def __init__(self, data: dict, fmt: str = DEFAULT, index: bool = False):
17+
def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
1718
"""
1819
1920
:param data: Tabular JSON data from server
2021
:param fmt: format to be converted into
21-
:param index: row index consideration in the output
22+
:param indexing: row & column index consideration in the output
2223
"""
2324
self.data = data
24-
self.output = self._converter(fmt.lower(), index=index)
25+
self.output = self._converter(fmt.lower(), indexing=indexing)
2526

26-
def _converter(self, fmt: str, index: bool = False) -> list:
27+
def _converter(self, fmt: str, indexing: bool = False) -> list:
2728
"""
2829
Actual conversion takes place here using Pandas
2930
:param fmt: format to be converted into
30-
:param index: row index consideration in the output
31+
:param indexing: row & column index consideration in the output
3132
:return: list of tables from converted into the requested output format
3233
"""
33-
# To convert the column indices to int to maintain the correct order on a table with more than 9 columns
34-
dfs = [pd.DataFrame.from_dict(
35-
{int(k): v for k, v in table["TableJson"].items()}, orient="index"
36-
) for table in self.data.get("Tables", [])]
34+
dfs = []
35+
for table in self.data.get("Tables", []):
36+
tmp = {int(k): v for k, v in table["TableJson"].items()}
37+
# To convert column indices to int to maintain the table order with more than 9 columns
38+
cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
39+
# To convert row indices to int and maintain the table order with more than 9 rows
40+
tmp = collections.OrderedDict(sorted(tmp.items()))
41+
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
42+
3743
if fmt in ("df", "dataframe"):
3844
return dfs
3945
elif fmt == "dict":
@@ -43,7 +49,7 @@ def _converter(self, fmt: str, index: bool = False) -> list:
4349
output_location = []
4450
for tbl_n, df in enumerate(dfs):
4551
csv_name = os.path.join(save_folder, f"_table_{tbl_n+1}.csv")
46-
df.to_csv(csv_name, index=index)
52+
df.to_csv(csv_name, index=indexing, header=indexing)
4753
output_location.append(csv_name)
4854
return output_location
4955
elif fmt == "json":

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ It's up to you now to explore the ways.
5050

5151
# Explore
5252
check the complete server response of the latest job with `et_sess.ServerResponse.json()`
53-
```json
53+
```javascript
5454
{
5555
"JobStatus": <string>, # Status of the triggered Process @ JOB-LEVEL
5656
"Pages": <integer>, # Number of pages processed in this request @ PAGE-LEVEL

example-code.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@
307307
"source": [
308308
"> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n",
309309
"\n",
310-
"```json\n",
310+
"```javascript\n",
311311
"{\n",
312312
" \"JobStatus\": <string>, # Status of the triggered Process @ JOB-LEVEL\n",
313313
" \"Pages\": <integer>, # Number of pages processed in this request @ PAGE-LEVEL\n",

0 commit comments

Comments
 (0)