Skip to content

Commit

Permalink
Finishes ETL python script based on notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
Adriano Godoy committed May 30, 2021
1 parent 8dcfc83 commit 37d07cb
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 49 deletions.
14 changes: 3 additions & 11 deletions etl.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@
"metadata": {},
"outputs": [],
"source": [
"time_data = (t, t.dt.hour, t.dt.day, t.dt.isocalendar().week, t.dt.month, t.dt.year, t.dt.weekday)\n",
"column_labels = (\"timestamp\", \"hour\", \"day\", \"week_of_year\", \"month\", \"year\", \"weekday\")"
"time_data = (t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)\n",
"column_labels = (\"timestamp\", \"hour\", \"day\", \"week\", \"month\", \"year\", \"weekday\")"
]
},
{
Expand Down Expand Up @@ -329,7 +329,7 @@
"metadata": {},
"outputs": [],
"source": [
"user_df = df[[\"firstName\", \"gender\", \"lastName\", \"level\", \"userId\"]]"
"user_df = df[[\"userId\", \"firstName\", \"gender\", \"lastName\", \"level\"]]"
]
},
{
Expand Down Expand Up @@ -391,7 +391,6 @@
"\n",
" # insert songplay record\n",
" songplay_data = (pd.to_datetime(row.ts, unit='ms'), row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent)\n",
" print(songplay_data)\n",
" cur.execute(songplay_table_insert, songplay_data)\n",
" conn.commit()"
]
Expand Down Expand Up @@ -427,13 +426,6 @@
"# Implement `etl.py`\n",
"Use what you've completed in this notebook to implement `etl.py`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
22 changes: 11 additions & 11 deletions etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,37 +7,37 @@

def process_song_file(cur, filepath):
    """Read one song JSON file and insert its song and artist records.

    Args:
        cur: Database cursor; only ``cur.execute`` is called on it.
        filepath: Path to a line-delimited JSON song file.

    Only the first record of the file is inserted. An empty file raises
    ``IndexError`` from the ``[0]`` lookup.
    """
    # open song file
    df = pd.read_json(filepath, lines=True)

    # insert song record
    # Column order must match the column list in song_table_insert.
    song_columns = ["song_id", "title", "duration", "year", "artist_id"]
    song_data = df[song_columns].values[0].tolist()
    cur.execute(song_table_insert, song_data)

    # insert artist record
    # Column order must match the column list in artist_table_insert.
    artist_columns = [
        "artist_id",
        "artist_name",
        "artist_latitude",
        "artist_longitude",
        "artist_location",
    ]
    artist_data = df[artist_columns].values[0].tolist()
    cur.execute(artist_table_insert, artist_data)


def process_log_file(cur, filepath):
# open log file
df =
df = pd.read_json(filepath, lines=True)

# filter by NextSong action
df =
df = df.loc[df["page"] == "NextSong"]

# convert timestamp column to datetime
t =
t = pd.to_datetime(df['ts'], unit='ms')

# insert time data records
time_data =
column_labels =
time_df =
time_data = (t, t.dt.hour, t.dt.day, t.dt.week, t.dt.month, t.dt.year, t.dt.weekday)
column_labels = ("timestamp", "hour", "day", "week", "month", "year", "weekday")
time_df = pd.DataFrame(dict(zip(column_labels, time_data)))

for i, row in time_df.iterrows():
cur.execute(time_table_insert, list(row))

# load user table
user_df =
user_df = df[["userId", "firstName", "gender", "lastName", "level"]]

# insert user records
for i, row in user_df.iterrows():
Expand All @@ -56,7 +56,7 @@ def process_log_file(cur, filepath):
songid, artistid = None, None

# insert songplay record
songplay_data =
songplay_data = (pd.to_datetime(row.ts, unit='ms'), row.userId, row.level, songid, artistid, row.sessionId, row.location, row.userAgent)
cur.execute(songplay_table_insert, songplay_data)


Expand Down
31 changes: 17 additions & 14 deletions sql_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,57 +8,60 @@

# CREATE TABLES


# DDL for the songplays table. songplay_id is a SERIAL primary key, so it is
# generated by the database rather than supplied by the insert statement.
songplay_table_create = ("""
CREATE TABLE songplays (songplay_id SERIAL PRIMARY KEY, start_time TIMESTAMP, user_id INT, level TEXT, song_id TEXT, artist_id TEXT, session_id INT, location TEXT, user_agent TEXT)
""")

# DDL for the users table, keyed by user_id.
user_table_create = ("""
CREATE TABLE users (user_id INT PRIMARY KEY, first_name TEXT, gender CHAR, last_name TEXT, level TEXT)
""")

# DDL for the songs table, keyed by song_id. artist_id is nullable and is not
# declared as a foreign key (the constraint below is left commented out).
song_table_create = ("""
CREATE TABLE songs (song_id TEXT PRIMARY KEY, title TEXT, duration DECIMAL, year INT, artist_id TEXT NULL);
""")
# CONSTRAINT fk_artist FOREIGN KEY(artist_id) REFERENCES artists(artist_id)

# DDL for the artists table, keyed by artist_id.
artist_table_create = ("""
CREATE TABLE artists (artist_id TEXT PRIMARY KEY, name TEXT, latitude TEXT, longitude TEXT, location TEXT);
""")

# DDL for the time table. start_time is the primary key, which is what the
# ON CONFLICT (start_time) clause of the time insert statement relies on.
time_table_create = ("""
CREATE TABLE time (start_time TIMESTAMP PRIMARY KEY, hour INT, day INT, week INT, month INT, year INT, weekday INT)
""")

# INSERT RECORDS

# Parameterized insert for songplays; takes 8 values in the listed column
# order. songplay_id is omitted because the table generates it.
songplay_table_insert = ("""
INSERT INTO songplays (start_time, user_id, level, song_id, artist_id, session_id, location, user_agent) VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
""")

# Parameterized insert for users; takes 5 values in the listed column order.
# A row whose user_id already exists is skipped, so an existing user's stored
# level is not updated by later inserts.
user_table_insert = ("""
INSERT INTO users (user_id, first_name, gender, last_name, level) VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (user_id) DO NOTHING;
""")

# Parameterized insert for songs; takes 5 values in the listed column order.
# A row whose song_id already exists is skipped.
song_table_insert = ("""
INSERT INTO songs (song_id, title, duration, year, artist_id) VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (song_id) DO NOTHING;
""")

# Parameterized insert for artists; takes 5 values in the listed column order.
# A row whose artist_id already exists is skipped.
artist_table_insert = ("""
INSERT INTO artists (artist_id, name, latitude, longitude, location) VALUES (%s, %s, %s, %s, %s)
ON CONFLICT (artist_id) DO NOTHING;
""")

# Parameterized insert for time; takes 7 values in the listed column order.
# A row whose start_time already exists is skipped.
time_table_insert = ("""
INSERT INTO time (start_time, hour, day, week, month, year, weekday) VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (start_time) DO NOTHING;
""")

# FIND SONGS

# Looks up (songid, artistid) for a song by title, artist name and duration,
# with the three parameters supplied in that order. Because the WHERE clause
# filters on a.name, songs without a matching artist row are never returned
# even though the join is written as LEFT JOIN.
song_select = ("""
SELECT s.song_id songid, s.artist_id artistid
FROM songs s
LEFT JOIN artists a ON (s.artist_id = a.artist_id)
WHERE s.title = %s AND a.name = %s AND s.duration = %s
""")

Expand Down
Loading

0 comments on commit 37d07cb

Please sign in to comment.