Skip to content

Commit 1182f66

Browse files
authored
Maintain Rows and Columns indices order (#28)
* Maintain col & row order * Display JobId & Wait message * Bug Fix release * md syntax fix
1 parent 5424423 commit 1182f66

File tree

5 files changed

+34
-19
lines changed

5 files changed

+34
-19
lines changed

ExtractTable/__init__.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,13 +105,22 @@ def trigger_process(self, fp: BinaryIO, dup_check: bool = False, **kwargs) -> di
105105

106106
# GetResult if JobId is present in the response
107107
# Usually happens when processing PDF files or idempotent requests
108-
if 'JobId' in resp:
108+
if 'JobId' in resp and resp.get("JobStatus", "") == JobStatus.PROCESSING:
109+
if max_wait_time > 0:
110+
print("[Info]: Waiting to retrieve the output; JobId:", resp['JobId'])
111+
else:
112+
print("[Info]: JobId:", resp['JobId'])
109113
resp = self.get_result(resp['JobId'], max_wait_time=max_wait_time)
110114

111115
return resp
112116

113-
def bigfile_upload(self, filename):
114-
resp = self._make_request('post', HOST.BIGFILE, data={"filename": filename})
117+
def bigfile_upload(self, filepath):
118+
"""
119+
To aid big file processing by uploading the file first and triggering the process next
120+
:param filepath: filepath
121+
:return: a signed URL to upload the file
122+
"""
123+
resp = self._make_request('post', HOST.BIGFILE, data={"filename": filepath})
115124

116125
return resp
117126

@@ -132,7 +141,7 @@ def process_file(
132141
Example: '1,3,4' or '1,4-end' or 'all'.
133142
:param output_format: dataframe as default; Check `ExtractTable._OUTPUT_FORMATS` to see available options
134143
:param dup_check: Idempotent requests handler
135-
:param indexing: If row index is needed
144+
:param indexing: Whether to output row & column indices in the outputs other than df
136145
:param kwargs:
137146
max_wait_time: int, optional (default: 300);
138147
Maximum Time to wait before returning to the client
@@ -151,13 +160,13 @@ def process_file(
151160
with open(infile.filepath, 'rb') as fp:
152161
trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
153162
except ClientFileSizeError:
154-
big_gen = self.bigfile_upload(filename=os.path.basename(filepath))
163+
big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
155164
with open(filepath, 'rb') as ifile:
156165
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
157166
trigger_resp = self.trigger_process(None, signed_filename=os.path.basename(filepath), **kwargs)
158167

159168
for _type, _obj in trigger_resp.items():
160169
self.__setattr__(_type, _obj)
161170

162-
result = ConvertTo(data=trigger_resp, fmt=output_format, index=indexing).output
171+
result = ConvertTo(data=trigger_resp, fmt=output_format, indexing=indexing).output
163172
return result

ExtractTable/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (2, 0, 0)
1+
VERSION = (2, 0, 1)
22
PRERELEASE = None # "alpha", "beta" or "rc"
33
REVISION = None
44

ExtractTable/common.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import tempfile
66
import warnings
7+
import collections
78

89
import pandas as pd
910

@@ -13,27 +14,32 @@ class ConvertTo:
1314
FORMATS = {"df", "dataframe", "json", "csv", "dict"}
1415
DEFAULT = "df"
1516

16-
def __init__(self, data: dict, fmt: str = DEFAULT, index: bool = False):
17+
def __init__(self, data: dict, fmt: str = DEFAULT, indexing: bool = False):
1718
"""
1819
1920
:param data: Tabular JSON data from server
2021
:param fmt: format to be converted into
21-
:param index: row index consideration in the output
22+
:param indexing: row & column index consideration in the output
2223
"""
2324
self.data = data
24-
self.output = self._converter(fmt.lower(), index=index)
25+
self.output = self._converter(fmt.lower(), indexing=indexing)
2526

26-
def _converter(self, fmt: str, index: bool = False) -> list:
27+
def _converter(self, fmt: str, indexing: bool = False) -> list:
2728
"""
2829
Actual conversion takes place here using Pandas
2930
:param fmt: format to be converted into
30-
:param index: row index consideration in the output
31+
:param indexing: row & column index consideration in the output
3132
:return: list of tables from converted into the requested output format
3233
"""
33-
# To convert the column indices to int to maintain the correct order on a table with more than 9 columns
34-
dfs = [pd.DataFrame.from_dict(
35-
{int(k): v for k, v in table["TableJson"].items()}, orient="index"
36-
) for table in self.data.get("Tables", [])]
34+
dfs = []
35+
for table in self.data.get("Tables", []):
36+
tmp = {int(k): v for k, v in table["TableJson"].items()}
37+
# To convert column indices to int to maintain the table order with more than 9 columns
38+
cols = [str(x) for x in sorted([int(x) for x in tmp[0]])]
39+
# To convert row indices to int and maintain the table order with more than 9 rows
40+
tmp = collections.OrderedDict(sorted(tmp.items()))
41+
dfs.append(pd.DataFrame.from_dict(tmp, orient="index", columns=cols))
42+
3743
if fmt in ("df", "dataframe"):
3844
return dfs
3945
elif fmt == "dict":
@@ -43,7 +49,7 @@ def _converter(self, fmt: str, index: bool = False) -> list:
4349
output_location = []
4450
for tbl_n, df in enumerate(dfs):
4551
csv_name = os.path.join(save_folder, f"_table_{tbl_n+1}.csv")
46-
df.to_csv(csv_name, index=index)
52+
df.to_csv(csv_name, index=indexing, header=indexing)
4753
output_location.append(csv_name)
4854
return output_location
4955
elif fmt == "json":

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ It's up to you now to explore the ways.
5050

5151
# Explore
5252
check the complete server response of the latest job with `et_sess.ServerResponse.json()`
53-
```json
53+
```javascript
5454
{
5555
"JobStatus": <string>, # Status of the triggered Process @ JOB-LEVEL
5656
"Pages": <integer>, # Number of pages processed in this request @ PAGE-LEVEL

example-code.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,7 @@
307307
"source": [
308308
"> **Understand the output**: The response of a triggered job is a JSON object in the below format. Note that the response depends on the plan type of the API Key.\n",
309309
"\n",
310-
"```json\n",
310+
"```javascript\n",
311311
"{\n",
312312
" \"JobStatus\": <string>, # Status of the triggered Process @ JOB-LEVEL\n",
313313
" \"Pages\": <integer>, # Number of pages processed in this request @ PAGE-LEVEL\n",

0 commit comments

Comments
 (0)