3
3
"""
4
4
import os
5
5
import re
6
+ import shutil
6
7
import tempfile
7
8
import warnings
8
9
import collections
@@ -84,6 +85,7 @@ def __init__(self, et_resp: dict = None, dataframes: List[pd.DataFrame] = None):
84
85
Default assumes all dataframes from the extracttable response, `et_resp`.
85
86
If both `et_resp` and `dataframes` are provided, the latter is considered for processing
86
87
"""
88
+ self .et_resp = et_resp
87
89
if et_resp :
88
90
self .dataframes = ConvertTo (server_response = et_resp ).output
89
91
@@ -134,6 +136,7 @@ def split_merged_rows(self) -> List[pd.DataFrame]:
134
136
reformat .append (row )
135
137
136
138
self .dataframes [df_idx ] = pd .DataFrame (reformat )
139
+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
137
140
138
141
return self .dataframes
139
142
@@ -147,12 +150,11 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
147
150
"""
148
151
# TODO: Should we consider delimiter_pattern for the split?
149
152
for df_idx , df in enumerate (self .dataframes ):
150
- if not columns_idx :
151
- columns_idx = df . columns
153
+ cols_idx = df . columns if not columns_idx else columns_idx . copy ()
154
+ cols_idx = [ str ( x ) for x in cols_idx ]
152
155
153
- columns_idx = [str (x ) for x in columns_idx ]
154
156
reformat = []
155
- for col_idx in columns_idx :
157
+ for col_idx in cols_idx :
156
158
tmp = df [col_idx ].str .split (expand = True )
157
159
158
160
if not any ([not any (tmp .isna ().any ()), force_split ]) or tmp .shape [- 1 ] == 1 :
@@ -163,6 +165,7 @@ def split_merged_columns(self, columns_idx: List[int] = None, force_split: bool
163
165
reformat .extend ([tmp [each ].tolist () for each in tmp .columns ])
164
166
165
167
self .dataframes [df_idx ] = pd .DataFrame (reformat ).T
168
+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
166
169
167
170
return self .dataframes
168
171
@@ -185,11 +188,10 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
185
188
decimal_position = int (decimal_position )
186
189
187
190
for df_idx , df in enumerate (self .dataframes ):
188
- if not columns_idx :
189
- columns_idx = df .columns
190
- columns_idx = [str (x ) for x in columns_idx ]
191
+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
192
+ cols_idx = [str (x ) for x in cols_idx ]
191
193
192
- for col_idx in columns_idx :
194
+ for col_idx in cols_idx :
193
195
digits = df [col_idx ].str .count (pat = r'\d' ).sum ()
194
196
chars = df [col_idx ].str .count (pat = r'[\w]' ).sum ()
195
197
@@ -220,6 +222,8 @@ def fix_decimal_format(self, columns_idx: List[int] = None, decimal_separator: s
220
222
df [col_idx ][i ] = df [col_idx ][i ][:- (decimal_position + 1 )] + decimal_separator + df [col_idx ][i ][- decimal_position :]
221
223
222
224
self .dataframes [df_idx ] = df
225
+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
226
+
223
227
return self .dataframes
224
228
225
229
def fix_date_format (self , columns_idx : List [int ] = None , delimiter : str = "/" ):
@@ -233,11 +237,10 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
233
237
"""
234
238
date_regex = r'(\d{2}(\d{2})?)(\W)(\d{2}|[A-Za-z]{3,9})(\W)(\d{2}(\d{2})?)\b'
235
239
for df_idx , df in enumerate (self .dataframes ):
236
- if not columns_idx :
237
- columns_idx = df .columns
238
- columns_idx = [str (x ) for x in columns_idx ]
240
+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
241
+ cols_idx = [str (x ) for x in cols_idx ]
239
242
240
- for col_idx in columns_idx :
243
+ for col_idx in cols_idx :
241
244
dates = df [col_idx ].str .count (pat = date_regex ).sum ()
242
245
243
246
if not (dates >= len (df ) * 0.75 ):
@@ -249,6 +252,7 @@ def fix_date_format(self, columns_idx: List[int] = None, delimiter: str = "/"):
249
252
df [col_idx ].replace (regex = {date_regex : r'\1%s\4%s\6' % (delimiter , delimiter )}, inplace = True )
250
253
251
254
self .dataframes [df_idx ] = df
255
+ self .et_resp ['Tables' ][df_idx ]['TableJson' ] = self .dataframes [df_idx ].to_dict (orient = 'index' )
252
256
253
257
return self .dataframes
254
258
@@ -263,14 +267,49 @@ def fix_characters(self, columns_idx: List[int] = None, replace_ref: dict = {}):
263
267
:return: corrected list of dataframes
264
268
"""
265
269
for df_idx , df in enumerate (self .dataframes ):
266
- if not columns_idx :
267
- columns_idx = df .columns
268
- columns_idx = [str (x ) for x in columns_idx ]
270
+ cols_idx = df .columns if not columns_idx else columns_idx .copy ()
271
+ cols_idx = [str (x ) for x in cols_idx ]
269
272
270
- for col_idx in columns_idx :
273
+ for col_idx in cols_idx :
271
274
for find_ch , repl_ch in replace_ref .items ():
272
275
df [col_idx ] = df [col_idx ].str .replace (str (find_ch ), str (repl_ch ))
273
276
274
277
self .dataframes [df_idx ] = df
275
-
278
+ self . et_resp [ 'Tables' ][ df_idx ][ 'TableJson' ] = self . dataframes [ df_idx ]. to_dict ( orient = 'index' )
276
279
return self .dataframes
280
+
281
def save_output(self, output_folder: os.PathLike = "", output_format: str = "csv", indexing: bool = False):
    """
    Save the objects of session data to user preferred location or a default folder

    The tables in ``self.et_resp`` are exported through ``ConvertTo``, whose ``output`` is
    treated here as the list of generated file paths. When a usable ``output_folder`` is
    given, every generated file is moved into it with a ``corrected_`` prefix on its name;
    otherwise the files stay in the folder where they were generated.

    :param output_folder: user preferred output location; created (including missing parents)
        when it does not exist. Empty or not creatable falls back to the folder of the generated files
    :param output_format: needed only for tables CSV or XLSX; anything else falls back to "csv" with a warning
    :param indexing: row & column index consideration in the output
    :return: location of the output
    """
    input_fname = "corrected_"

    output_format = output_format.lower()
    if output_format not in ("csv", "xlsx"):
        output_format = "csv"
        warnings.warn("Invalid 'output_format' given. Defaulted to 'csv'")

    table_outputs_path = ConvertTo(server_response=self.et_resp, output_format=output_format, indexing=indexing).output

    # Folder the converter wrote into; used as the fallback destination.
    default_folder = os.path.dirname(table_outputs_path[0])

    if not output_folder:
        return default_folder

    if not os.path.isdir(output_folder):
        try:
            # makedirs also builds missing parent folders, which plain mkdir refuses to do.
            os.makedirs(output_folder, exist_ok=True)
        except (OSError, ValueError) as e:
            # Best effort: keep the outputs where they were generated instead of failing the save.
            warnings.warn(f"[Warn]: {e}")
            warnings.warn(
                f"Failed to create output_folder '{output_folder}'. Saving the outputs to {default_folder}"
            )
            return default_folder

    if output_folder != default_folder:
        for each_tbl_path in table_outputs_path:
            shutil.move(
                each_tbl_path,
                os.path.join(output_folder, input_fname + os.path.basename(each_tbl_path)),
            )

    return output_folder
315
+
0 commit comments