-
Notifications
You must be signed in to change notification settings - Fork 1
/
populate_database.py
68 lines (58 loc) · 2.39 KB
/
populate_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import sqlite3
from datetime import datetime
import pandas as pd
import glob
# Directory containing the CSV files
directory = r'/Users/dokigbo/Downloads/vso_health_summer_project/vso_health_checks_python'
# Connect to the SQLite database
conn = sqlite3.connect('vso_files.db')
cur = conn.cursor()
# Drop the existing check_files_python table if it exists
# Create the check_files_python table with source_name
cur.execute('''
CREATE TABLE check_files_python (
id INTEGER PRIMARY KEY AUTOINCREMENT,
provider TEXT NOT NULL,
source TEXT NOT NULL,
instrument TEXT NOT NULL,
source_name TEXT NOT NULL,
status INTEGER,
check_date TEXT,
UNIQUE (provider, source, instrument, check_date)
)
''')
# Function to insert data into the check_files_python table from a dataframe
def insert_check_file_data(df, check_date):
for index, row in df.iterrows():
source_name = f"{row['Provider']}-{row['Source']}-{row['Instrument']}"
try:
cur.execute('''
INSERT INTO check_files_python (provider, source, instrument, source_name, status, check_date)
VALUES (?, ?, ?, ?, ?, ?)
''', (row['Provider'], row['Source'], row['Instrument'], source_name, row['Status'], check_date))
except sqlite3.IntegrityError:
print(f"Duplicate entry found for {row['Provider']}, {row['Source']}, {row['Instrument']}, {check_date}. Skipping.")
conn.commit()
# Use glob to get all CSV files in the directory
path = os.path.join(directory, "*.csv")
for fname in glob.glob(path):
print(fname) # Full path to the CSV file
# Extract information from the filename
base = os.path.basename(fname) # File name without directory path
parts = base.split('_') # Split the file name into components
if len(parts) >= 4:
# Ensure the date part is actually a date
try:
check_date_raw = parts[3] # Extract the date part (YYYYMMDD format)
# Convert the date part to YYYY-MM-DD format
check_date = datetime.strptime(check_date_raw, '%Y%m%d').strftime('%Y-%m-%d')
except ValueError:
print(f"Filename {base} contains an invalid date format.")
continue
# Load the CSV data into a DataFrame
df = pd.read_csv(fname)
# Insert data into the database
insert_check_file_data(df, check_date)
# Close the connection
conn.close()