-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_extraction.py
78 lines (64 loc) · 3.04 KB
/
data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import boto3
import pandas as pd
import requests
import tabula  # For extracting tables from PDF

import database_utils
class DataExtractor:
    """Helpers that load data from several sources into pandas DataFrames.

    Sources handled here: a SQL table reached through an engine, tables
    embedded in a PDF, a per-store HTTP API, a CSV object in S3 and a JSON
    document served over HTTPS.
    """

    # Remote locations used by the extraction methods.
    NUMBER_OF_STORES_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores'
    STORE_DETAILS_URL = 'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/{store_number}'
    DATE_DETAILS_URL = 'https://data-handling-public.s3.eu-west-1.amazonaws.com/date_details.json'
    PRODUCTS_BUCKET = 'data-handling-public'
    PRODUCTS_KEY = 'products.csv'

    # Seconds to wait on any single HTTP request; without a timeout
    # ``requests`` can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def read_rds_table(self, engine, table_name) -> pd.DataFrame:
        """Read the whole table ``table_name`` into a DataFrame.

        Args:
            engine: Object whose ``begin()`` returns a context manager
                yielding a connection accepted by ``pd.read_sql_table``.
            table_name: Name of the table to read.

        Returns:
            The table contents as a DataFrame.
        """
        with engine.begin() as conn:
            return pd.read_sql_table(table_name, con=conn)

    def retrieve_pdf_data(self, link) -> pd.DataFrame:
        """Extract every table from all pages of the PDF at ``link``.

        Args:
            link: Path or URL handed to ``tabula.read_pdf``.

        Returns:
            All extracted tables concatenated into one DataFrame, or an
            empty DataFrame when no table was found.
        """
        tables = tabula.read_pdf(link, pages='all')
        return self._concat_frames(tables)

    def API_key(self) -> dict:
        """Return the request headers carrying the store API key."""
        # NOTE(review): the key is committed in source; consider loading it
        # from configuration or the environment instead.
        return {'x-api-key': 'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'}

    def retrieve_stores_data(self) -> pd.DataFrame:
        """Download the details of every store and combine them.

        Store numbers ``0 .. list_number_of_stores() - 1`` are requested one
        by one and each JSON response is flattened with
        ``pd.json_normalize``.

        Returns:
            One DataFrame holding a row per store (each row keeps the index
            it had in its own single-store frame), or an empty DataFrame
            when the API reports zero stores.

        Raises:
            requests.HTTPError: If any request returns an error status.
        """
        headers = self.API_key()
        frames = [
            pd.json_normalize(
                self._get_json(
                    self.STORE_DETAILS_URL.format(store_number=store_number),
                    headers=headers,
                )
            )
            for store_number in range(self.list_number_of_stores())
        ]
        return self._concat_frames(frames)

    def list_number_of_stores(self):
        """Return the ``number_stores`` value reported by the stores API.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        payload = self._get_json(self.NUMBER_OF_STORES_URL, headers=self.API_key())
        return payload['number_stores']

    def extract_from_s3(self) -> "pd.DataFrame | None":
        """Download ``products.csv`` from the S3 bucket and parse it.

        Returns:
            The parsed CSV as a DataFrame when the response reports HTTP
            status 200; otherwise ``None`` (a message is printed either
            way).
        """
        s3_client = boto3.client("s3")
        response = s3_client.get_object(
            Bucket=self.PRODUCTS_BUCKET,
            Key=self.PRODUCTS_KEY,
        )
        status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
        if status != 200:
            print(f"Unsuccessful S3 get_object response. Status - {status}")
            return None
        print(f"Successful S3 get_object response. Status - {status}")
        return pd.read_csv(response.get("Body"))

    def extract_from_s3_by_link(self) -> pd.DataFrame:
        """Download ``date_details.json`` over HTTPS and tabulate it.

        The document is treated as a mapping of column name to a mapping of
        row key to value; row keys are discarded and values keep their
        order.

        Returns:
            A DataFrame with one column per top-level key.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        return self._frame_from_columns(self._get_json(self.DATE_DETAILS_URL))

    def _get_json(self, url, headers=None):
        """GET ``url`` and return its decoded JSON body, failing on HTTP errors."""
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Surface 4xx/5xx here instead of parsing an error payload as data.
        response.raise_for_status()
        return response.json()

    @staticmethod
    def _concat_frames(frames) -> pd.DataFrame:
        """Concatenate ``frames``; an empty input yields an empty DataFrame."""
        frames = list(frames)
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.concat(frames) if frames else pd.DataFrame()

    @staticmethod
    def _frame_from_columns(columns) -> pd.DataFrame:
        """Build a DataFrame from ``{column: {row_key: value}}``, dropping row keys."""
        return pd.DataFrame(
            {name: list(cells.values()) for name, cells in columns.items()}
        )
if __name__ == "__main__":
    # Manual smoke run: extract the card-details PDF and preview the result.
    pdf_link = "https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf"

    # One extractor is enough; the class keeps no per-instance state.
    extractor = DataExtractor()

    # NOTE(review): this script previously began by reading the
    # 'legacy_users' table, but called read_rds_table with the table name
    # only. The method also requires an engine as its first argument, so the
    # call raised TypeError and nothing below it ever ran. Re-add that step
    # once an engine is built here (presumably via database_utils - confirm
    # its API).

    # Retrieve and print the extracted data from the PDF
    extracted_data = extractor.retrieve_pdf_data(pdf_link)
    if extracted_data is not None:
        print(extracted_data.head())  # Display the first few rows of the extracted data