Skip to content

Commit 5424423

Browse files
authored
v2.0 (#26)
* Extended Error Types * to support big files for processing * correct typehint * v2.0 usage * Big file processing support
1 parent 4d05317 commit 5424423

File tree

11 files changed

+453
-284
lines changed

11 files changed

+453
-284
lines changed

ExtractTable/FileOperations/__init__.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import requests
1010
import PyPDF2
1111

12-
from ..exceptions import ClientFileError
12+
from ..exceptions import *
1313

1414

1515
class CheckFile:
@@ -25,13 +25,13 @@ def type_error(self) -> ty.Union[Exception, None]:
2525
"""To check file extension"""
2626
if self.filepath.lower().endswith(self.__SUPPORTED_EXTENSIONS__):
2727
return
28-
raise ClientFileError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}")
28+
raise ClientFileTypeError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}")
2929

3030
def size_error(self) -> ty.Union[Exception, None]:
3131
# 1027 to create some buffer
3232
if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
3333
return
34-
raise ClientFileError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
34+
raise ClientFileSizeError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
3535

3636

3737
class PrepareInput:
@@ -73,7 +73,7 @@ def pdf_separator(self, gather_pages: set):
7373

7474
@staticmethod
7575
def _get_pages(filepath: os.PathLike, pages: str) -> set:
76-
# Credits to camelot library - customized
76+
# Credits to camelot-py library - customized
7777
"""Converts pages string to list of ints.
7878
7979
Parameters
@@ -86,8 +86,7 @@ def _get_pages(filepath: os.PathLike, pages: str) -> set:
8686
8787
Returns
8888
-------
89-
P : list
90-
List of int page numbers.
89+
List of int page numbers.
9190
9291
"""
9392
page_numbers = []

ExtractTable/__init__.py

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,163 @@
1-
from .client import ExtractTable
1+
"""
2+
Any Request or Response of a transaction must take place here
3+
"""
4+
from urllib import parse as urlparse
5+
import os
6+
import typing as ty
7+
from typing import BinaryIO
8+
import time
9+
import warnings
10+
11+
import requests as rq
12+
13+
from .FileOperations import PrepareInput
14+
from .config import HOST, JobStatus
15+
from .parsers import ValidateResponse
16+
from .common import ConvertTo
17+
from .exceptions import ClientFileSizeError
18+
19+
20+
class ExtractTable:
    """Client for the ExtractTable service.

    Holds an authenticated ``requests`` session and exposes helpers to check
    the API key usage, trigger a file for processing, poll for the result of
    a job and convert the server response into the requested output format.
    """
    from .__version__ import __version__
    _OUTPUT_FORMATS: set = ConvertTo.FORMATS
    _DEFAULT: str = ConvertTo.DEFAULT
    _WARNINGS: bool = True
    _WAIT_UNTIL_OUTPUT: bool = True
    VERSION = f"ExtractTable_{__version__}"

    def __init__(self, api_key: str):
        """
        Starts by creating a session
        :param api_key: API Key received from https://extracttable.com
        """
        self.api_key = api_key
        self._session = rq.Session()
        self._session.headers['x-api-key'] = self.api_key

        # Helpful if the user wants to dig into the actual server response.
        # Holds the Response class as a placeholder until the first request is made.
        self.ServerResponse = rq.Response

    def _make_request(
            self,
            method,
            host: str,
            params: ty.Optional[dict] = None,
            data: ty.Optional[dict] = None,
            **kwargs
    ) -> dict:
        """
        Create a server request and parse the response for validation
        :param method: Request method
        :param host: endpoint to send the request; a bare host or a full http(s) URL
        :param params: query params for the request
        :param data: form data for the requests
        :param kwargs: Any other that a server accepts
        :return: json response of the server
        """
        # Drop every instance attribute left over from a previous transaction
        # (process_file copies the response keys onto the instance); only the
        # credentials and the session survive between requests.
        for attr_name in list(self.__dict__):
            if attr_name not in ("api_key", "_session"):
                delattr(self, attr_name)

        # When a full URL is given keep only its network location; the request
        # is always sent over https to the root of that host.
        host = host if not host.startswith("http") else host.split("/")[2]
        url = urlparse.urlunparse(('https', host, '', '', '', ''))
        self.ServerResponse = self._session.request(method, url, params=params, data=data, **kwargs)
        ValidateResponse(resp=self.ServerResponse, show_warn=self._WARNINGS)

        return self.ServerResponse.json()

    def check_usage(self) -> dict:
        """
        Check the usage of the API Key
        :return: the ``usage`` entry of the validator response
        """
        resp = self._make_request('get', HOST.VALIDATOR)

        return resp['usage']

    def get_result(self, job_id: str, wait_time: int = 10, max_wait_time: int = 300) -> dict:
        """
        Retrieve the tabular data of a triggered job based on the JobId
        :param job_id: JobId received from an already triggered process
        :param wait_time: Time to wait before making another request; values below 10 are raised to 10
        :param max_wait_time: Maximum Time to wait before returning to the client
        :return: Tabular JSON when processed successful else helpful user info
        """
        params = {'JobId': job_id}
        resp = self._make_request('get', HOST.RESULT, params=params)

        # Normalise both values once so the time actually slept and the time
        # deducted from the budget are always the same number.
        wait_time = max(10, int(wait_time))
        max_wait_time = int(max_wait_time)

        # Loop to retrieve the output until max_wait_time is reached
        while self._WAIT_UNTIL_OUTPUT and resp["JobStatus"] == JobStatus.PROCESSING and max_wait_time > 0:
            time.sleep(wait_time)
            max_wait_time -= wait_time
            resp = self._make_request('get', HOST.RESULT, params=params)

        return resp

    def trigger_process(self, fp: ty.Optional[BinaryIO], dup_check: bool = False, **kwargs) -> dict:
        """
        Trigger the document to the server for processing
        :param fp: Binary file data of the input file; not sent when ``signed_filename`` is given
        :param dup_check: helps to handle idempotent requests
        :param kwargs: any other form-data to be sent to the server;
            ``max_wait_time`` is consumed here and forwarded to ``get_result``
        :return: Tabular JSON when processed successful else helpful user info
        """
        max_wait_time = kwargs.pop('max_wait_time', 300)
        data = {'dup_check': dup_check, "library": kwargs.pop("library", self.VERSION)}
        data.update(kwargs)
        if "signed_filename" in data:
            # The file was uploaded separately; only the form data is posted
            resp = self._make_request('post', HOST.TRIGGER, data=data)
        else:
            resp = self._make_request('post', HOST.TRIGGER, data=data, files={'input': fp})

        # GetResult if JobId is present in the response
        # Usually happens when processing PDF files or idempotent requests
        if 'JobId' in resp:
            resp = self.get_result(resp['JobId'], max_wait_time=max_wait_time)

        return resp

    def bigfile_upload(self, filename) -> dict:
        """
        Ask the server for the details needed to upload a big file
        :param filename: name of the file to be uploaded
        :return: json response of the server; ``process_file`` reads its ``url`` and ``fields`` entries
        """
        resp = self._make_request('post', HOST.BIGFILE, data={"filename": filename})

        return resp

    def process_file(
            self,
            filepath: ty.Union[str, bytes, os.PathLike],
            pages: str = "1",
            output_format: str = "df",
            dup_check: bool = False,
            indexing: bool = False,
            **kwargs
    ) -> list:
        """
        Trigger the file for processing and returns the tabular data in the user requested output format
        :param filepath: Location of the file
        :param pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.
        :param output_format: dataframe as default; Check `ExtractTable._OUTPUT_FORMATS` to see available options
        :param dup_check: Idempotent requests handler
        :param indexing: If row index is needed
        :param kwargs:
            max_wait_time: int, optional (default: 300);
                Maximum Time to wait before returning to the client
            any other form-data to be sent to the server for future considerations
        :return: user requested output in list;
        :raises requests.HTTPError: when the upload of a big file is rejected
        """
        # Raise a warning if unknown format is requested
        # NOTE(review): the message announces a fallback but output_format is passed on
        # unchanged; the actual fallback is whatever ConvertTo does - confirm they agree.
        if output_format not in self._OUTPUT_FORMATS:
            default_format = "dict"
            warn_msg = f"Found: {output_format} as output_format; Allowed only {self._OUTPUT_FORMATS}. " \
                       f"Assigned default format: {default_format}"
            warnings.warn(warn_msg)

        try:
            with PrepareInput(filepath, pages=pages) as infile:
                with open(infile.filepath, 'rb') as fp:
                    trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
        except ClientFileSizeError:
            # File is over the size threshold: upload it separately, then
            # trigger the processing by referring to the uploaded file name.
            filename = os.path.basename(filepath)
            big_gen = self.bigfile_upload(filename=filename)
            with open(filepath, 'rb') as ifile:
                upload_resp = rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
            # Do not trigger a job for a file the upload endpoint refused
            upload_resp.raise_for_status()
            trigger_resp = self.trigger_process(None, signed_filename=filename, dup_check=dup_check, **kwargs)

        # Expose every key of the response as an attribute of the client;
        # they are cleared again by the next request.
        for key, value in trigger_resp.items():
            setattr(self, key, value)

        result = ConvertTo(data=trigger_resp, fmt=output_format, index=indexing).output
        return result

ExtractTable/__version__.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
1-
VERSION = (1, 2, 1)
1+
VERSION = (2, 0, 0)
22
PRERELEASE = None # "alpha", "beta" or "rc"
3-
REVISION = 2
3+
REVISION = None
44

55

6-
def generate_version(version, prerelease=None, revision=None):
7-
version_parts = [".".join(map(str, version))]
8-
if prerelease is not None:
9-
version_parts.append("-{}".format(prerelease))
10-
if revision is not None:
11-
version_parts.append(".{}".format(revision))
6+
def generate_version():
    """Build the package version string from the module-level constants.

    The dotted ``VERSION`` tuple comes first, followed by ``-<PRERELEASE>``
    and ``.<REVISION>`` for whichever of the two is not ``None``.
    """
    rendered = ".".join(str(component) for component in VERSION)
    prerelease_suffix = "" if PRERELEASE is None else f"-{PRERELEASE}"
    revision_suffix = "" if REVISION is None else f".{REVISION}"
    return rendered + prerelease_suffix + revision_suffix
1313

1414

1515
__title__ = "ExtractTable"
16-
__description__ = "Extract tabular data from images and scanned PDFs"
16+
__description__ = "Extract tabular data from images and scanned PDFs. Easily convert image to table, convert pdf to table"
1717
__url__ = "https://github.com/ExtractTable/ExtractTable-py"
18-
__version__ = generate_version(VERSION, prerelease=PRERELEASE, revision=REVISION)
18+
__version__ = generate_version()
1919
__author__ = "Saradhi"
2020
__author_email__ = "saradhi@extracttable.com"
2121
__license__ = "Apache License 2.0"

ExtractTable/client.py

Lines changed: 0 additions & 143 deletions
This file was deleted.

ExtractTable/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def _converter(self, fmt: str, index: bool = False) -> list:
3333
# To convert the column indices to int to maintain the correct order on a table with more than 9 columns
3434
dfs = [pd.DataFrame.from_dict(
3535
{int(k): v for k, v in table["TableJson"].items()}, orient="index"
36-
) for table in self.data["Tables"]]
36+
) for table in self.data.get("Tables", [])]
3737
if fmt in ("df", "dataframe"):
3838
return dfs
3939
elif fmt == "dict":

0 commit comments

Comments
 (0)