Skip to content

Commit e7ad566

Browse files
authored
Save Correction Output (#50)
* Save the corrected output to a folder
1 parent a8d6b26 commit e7ad566

File tree

3 files changed

+69
-24
lines changed

3 files changed

+69
-24
lines changed

ExtractTable/__init__.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -211,13 +211,19 @@ def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv
211211

212212
if output_folder:
213213
if not os.path.exists(output_folder):
214-
output_folder = os.path.split(table_outputs_path[0])[0]
215-
warnings.warn(f"Your output_folder not exists. Saving the outputs to {output_folder}")
216-
else:
217-
for each_tbl_path in table_outputs_path:
218-
shutil.move(each_tbl_path, os.path.join(output_folder, input_fname+os.path.basename(each_tbl_path)))
214+
try:
215+
os.mkdir(output_folder)
216+
except Exception as e:
217+
warnings.warn(f"[Warn]: {str(e)}")
218+
warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")
219+
output_folder = os.path.dirname(table_outputs_path[0])
219220
else:
220-
output_folder = os.path.split(table_outputs_path[0])[0]
221+
output_folder = os.path.dirname(table_outputs_path[0])
222+
223+
if output_folder != os.path.dirname(table_outputs_path[0]):
224+
for each_tbl_path in table_outputs_path:
225+
shutil.move(each_tbl_path,
226+
os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
221227

222228
for each_page in self.server_response.get("Lines", []):
223229
page_txt_fname = os.path.join(output_folder, f"{input_fname}_Page_{str(each_page['Page'])}.txt")

ExtractTable/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = (2, 3, 1)
1+
VERSION = (2, 4, 0)
22
PRERELEASE = None # "alpha", "beta" or "rc"
33
REVISION = None
44

ExtractTable/common.py

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44
import os
55
import re
6+
import shutil
67
import tempfile
78
import warnings
89
import collections
@@ -84,6 +85,7 @@ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
8485
Default assumes all dataframes from the extracttable response, `et_resp`.
8586
If both `et_resp` and `dataframes` are provided, the latter is considered for the processing
8687
"""
88+
self.et_resp = et_resp
8789
if et_resp:
8890
self.dataframes = ConvertTo(server_response=et_resp).output
8991

@@ -134,6 +136,7 @@ def split_merged_rows(self) -> List[pd.DataFrame]:
134136
reformat.append(row)
135137

136138
self.dataframes[df_idx] = pd.DataFrame(reformat)
139+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
137140

138141
return self.dataframes
139142

@@ -147,12 +150,11 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
147150
"""
148151
# TODO: Should we consider delimiter_pattern for the split?
149152
for df_idx, df in enumerate(self.dataframes):
150-
if not columns_idx:
151-
columns_idx = df.columns
153+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
154+
cols_idx = [str(x) for x in cols_idx]
152155

153-
columns_idx = [str(x) for x in columns_idx]
154156
reformat = []
155-
for col_idx in columns_idx:
157+
for col_idx in cols_idx:
156158
tmp = df[col_idx].str.split(expand=True)
157159

158160
if not any([not any(tmp.isna().any()), force_split]) or tmp.shape[-1] == 1:
@@ -163,6 +165,7 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
163165
reformat.extend([tmp[each].tolist() for each in tmp.columns])
164166

165167
self.dataframes[df_idx] = pd.DataFrame(reformat).T
168+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
166169

167170
return self.dataframes
168171

@@ -185,11 +188,10 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
185188
decimal_position = int(decimal_position)
186189

187190
for df_idx, df in enumerate(self.dataframes):
188-
if not columns_idx:
189-
columns_idx = df.columns
190-
columns_idx = [str(x) for x in columns_idx]
191+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
192+
cols_idx = [str(x) for x in cols_idx]
191193

192-
for col_idx in columns_idx:
194+
for col_idx in cols_idx:
193195
digits = df[col_idx].str.count(pat=r'\d').sum()
194196
chars = df[col_idx].str.count(pat=r'[\w]').sum()
195197

@@ -220,6 +222,8 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
220222
df[col_idx][i] = df[col_idx][i][:-(decimal_position+1)] + decimal_separator + df[col_idx][i][-decimal_position:]
221223

222224
self.dataframes[df_idx] = df
225+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
226+
223227
return self.dataframes
224228

225229
def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
@@ -233,11 +237,10 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
233237
"""
234238
date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
235239
for df_idx, df in enumerate(self.dataframes):
236-
if not columns_idx:
237-
columns_idx = df.columns
238-
columns_idx = [str(x) for x in columns_idx]
240+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
241+
cols_idx = [str(x) for x in cols_idx]
239242

240-
for col_idx in columns_idx:
243+
for col_idx in cols_idx:
241244
dates = df[col_idx].str.count(pat=date_regex).sum()
242245

243246
if not (dates >= len(df) * 0.75):
@@ -249,6 +252,7 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
249252
df[col_idx].replace(regex={date_regex: r'\1%s\4%s\6' % (delimiter, delimiter)}, inplace=True)
250253

251254
self.dataframes[df_idx] = df
255+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
252256

253257
return self.dataframes
254258

@@ -263,14 +267,49 @@ def fix_characters(self, columns_idx: List[int] = None, replace_ref: dict = {}):
263267
:return: corrected list of dataframes
264268
"""
265269
for df_idx, df in enumerate(self.dataframes):
266-
if not columns_idx:
267-
columns_idx = df.columns
268-
columns_idx = [str(x) for x in columns_idx]
270+
cols_idx = df.columns if not columns_idx else columns_idx.copy()
271+
cols_idx = [str(x) for x in cols_idx]
269272

270-
for col_idx in columns_idx:
273+
for col_idx in cols_idx:
271274
for find_ch, repl_ch in replace_ref.items():
272275
df[col_idx] = df[col_idx].str.replace(str(find_ch), str(repl_ch))
273276

274277
self.dataframes[df_idx] = df
275-
278+
self.et_resp['Tables'][df_idx]['TableJson'] = self.dataframes[df_idx].to_dict(orient='index')
276279
return self.dataframes
280+
281+
def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv", indexing: bool = False):
282+
"""
283+
Save the objects of session data to user preferred location or a default folder
284+
:param output_folder: user preferred output location; default tmp directory
285+
:param output_format: needed only for tables CSV or XLSX
286+
:param indexing: row & column index consideration in the output
287+
:return: location of the output
288+
"""
289+
input_fname = "corrected_"
290+
291+
output_format = output_format.lower()
292+
if output_format not in ("csv", "xlsx"):
293+
output_format = "csv"
294+
warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")
295+
296+
table_outputs_path = ConvertTo(server_response=self.et_resp, output_format=output_format, indexing=indexing).output
297+
298+
if output_folder:
299+
if not os.path.exists(output_folder):
300+
try:
301+
os.mkdir(output_folder)
302+
except Exception as e:
303+
warnings.warn(f"[Warn]: {str(e)}")
304+
warnings.warn(f"Failed to created output_folder not exists. Saving the outputs to {output_folder}")
305+
output_folder = os.path.dirname(table_outputs_path[0])
306+
else:
307+
output_folder = os.path.dirname(table_outputs_path[0])
308+
309+
if output_folder != os.path.dirname(table_outputs_path[0]):
310+
for each_tbl_path in table_outputs_path:
311+
shutil.move(each_tbl_path,
312+
os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)))
313+
314+
return output_folder
315+

0 commit comments

Comments
 (0)