Skip to content

Commit a8d6b26

Browse files
authored
Big files download (#49)
* Download big outputs
* [B]: Fix processing of split PDFs
1 parent b3847ba commit a8d6b26

File tree

5 files changed: 20 additions and 22 deletions.

ExtractTable/FileOperations/__init__.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -19,19 +19,17 @@ class CheckFile:
1919
def __init__(self, filepath: ty.Union[os.PathLike, str]):
2020
self.filepath = filepath
2121
self.type_error()
22-
self.size_error()
22+
self.is_big = self.is_big_size()
2323

2424
def type_error(self) -> ty.Union[Exception, None]:
2525
"""To check file extension"""
2626
if self.filepath.lower().endswith(self.__SUPPORTED_EXTENSIONS__):
2727
return
2828
raise ClientFileTypeError(Message=f"Allowed file types are {self.__SUPPORTED_EXTENSIONS__}")
2929

30-
def size_error(self) -> ty.Union[Exception, None]:
30+
def is_big_size(self) -> bool:
3131
# 1027 to create some buffer
32-
if os.stat(self.filepath).st_size <= self.__THRESHOLD_SIZE__*1027*1027:
33-
return
34-
raise ClientFileSizeError(Message=f"File Size greater than the threshold {self.__THRESHOLD_SIZE__} Mb.")
32+
return os.stat(self.filepath).st_size > self.__THRESHOLD_SIZE__*1027*1027
3533

3634

3735
class PrepareInput:
@@ -55,11 +53,10 @@ def __init__(self, filepath: ty.Union[os.PathLike, str], pages: str):
5553
print("[Info]: Aggregating user defined pages..", self.pages)
5654
gather_pages = self._get_pages(self.filepath, pages)
5755
self.filepath = self.pdf_separator(gather_pages)
58-
CheckFile(self.filepath)
5956

6057
def pdf_separator(self, gather_pages: set):
6158
"""PDF Splitter"""
62-
merged_pdf = os.path.join(self.temp_dir, str(self.pages) + os.path.basename(self.filepath))
59+
merged_pdf = os.path.join(self.temp_dir, str(self.pages) + "_" + os.path.basename(self.filepath))
6360
with open(merged_pdf, 'wb') as out_file:
6461
pdf_reader = PyPDF2.PdfFileReader(self.filepath)
6562
pdf_writer = PyPDF2.PdfFileWriter()

ExtractTable/__init__.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import requests as rq
1313

14-
from .FileOperations import PrepareInput
14+
from .FileOperations import PrepareInput, CheckFile
1515
from .config import HOST, JobStatus
1616
from .parsers import ValidateResponse
1717
from .common import ConvertTo
@@ -98,6 +98,10 @@ def get_result(self, job_id: str, wait_time: int = 10, max_wait_time: int = 300)
9898
time.sleep(max(10, int(wait_time)))
9999
max_wait_time -= wait_time
100100
resp = self._make_request('get', HOST.RESULT, params=params)
101+
102+
if resp.get('DownloadUrl', ''):
103+
self.ServerResponse = rq.get(resp['DownloadUrl'])
104+
self.server_response = resp = self.ServerResponse.json()
101105

102106
return resp
103107

@@ -171,15 +175,16 @@ def process_file(
171175
# To use the reference when saving the output
172176
self.__setattr__('input_filename', os.path.basename(filepath))
173177

174-
try:
175-
with PrepareInput(filepath, pages=pages) as infile:
176-
with open(infile.filepath, 'rb') as fp:
178+
with PrepareInput(filepath, pages=pages) as infile:
179+
with open(infile.filepath, 'rb') as fp:
180+
is_big_file = CheckFile(infile.filepath).is_big
181+
if not is_big_file:
177182
trigger_resp = self.trigger_process(fp, dup_check=dup_check, **kwargs)
178-
except ClientFileSizeError:
179-
big_gen = self.bigfile_upload(filepath=os.path.basename(filepath))
180-
with open(filepath, 'rb') as ifile:
181-
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
182-
trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)
183+
else:
184+
big_gen = self.bigfile_upload(filepath=os.path.basename(infile.filepath))
185+
with open(infile.filepath, 'rb') as ifile:
186+
rq.post(big_gen['url'], data=big_gen['fields'], files={'file': ifile})
187+
trigger_resp = self.trigger_process(None, signed_filename=big_gen["fields"]["key"], dup_check=dup_check, **kwargs)
183188

184189
for _type, _obj in trigger_resp.items():
185190
self.__setattr__(_type, _obj)

ExtractTable/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (2, 2, 0)
1+
VERSION = (2, 3, 1)
22
PRERELEASE = None # "alpha", "beta" or "rc"
33
REVISION = None
44

requirements.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
with open('README.md', 'r') as f:
1111
readme = f.read()
1212

13-
with open("requirements.txt") as fh:
14-
requires = [x.strip() for x in fh.readlines()]
13+
requires = ['requests>=2.21', 'pandas>=0.24', 'PyPDF2>=1.26']
1514

1615

1716
def setup_package():

0 commit comments

Comments
 (0)