From 87d26f48db1147c24b811f9fbc2bd79f8b22a94b Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Sat, 2 Nov 2024 08:57:26 -0700
Subject: [PATCH 1/6] replaced example docstring in wiki_feature_info with
 example function, replaced parallel lists with a dataclass in wiki_query

---
 Pipfile.lock                                |  2 +-
 .../data_processing/wiki_to_netflix.py      | 73 +++++++------------
 .../data_processing/wiki_to_netflix_test.py | 20 +++++
 3 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/Pipfile.lock b/Pipfile.lock
index a5d3a5a..9ad305e 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "32ea889604fc1353459ed33e39527f602f37f1f89142456036b79da15d2be380"
+            "sha256": "4538e3fe34bc8b2bf4f190a63bbb7c2fd7a6fa658af76c8b717463ba913c9571"
         },
         "pipfile-spec": 6,
         "requires": {
diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index ee97d46..0530e6f 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -2,6 +2,7 @@
 import os
 import sys
 import time
+from dataclasses import dataclass
 
 import requests
 from tqdm import tqdm
@@ -11,6 +12,13 @@ class WikidataServiceTimeoutException(Exception):
     pass
 
 
+@dataclass
+class movieData:
+    movie_id: str
+    genre: str
+    director: str
+
+
 data_dir = os.path.join(os.path.dirname(__file__), "../../data")
 out_dir = os.path.join(os.path.dirname(__file__), "../../out")
 user_agent = "Noisebridge MovieBot 0.0.1/Audiodude "
@@ -49,36 +57,15 @@ def create_netflix_csv(csv_name, data_list):
 
 def wiki_feature_info(data, key):
     """
-    Extracts movie info from Wikidata query results.
+    Extracts movie information from a Wikidata query result.
 
     Parameters:
-    data (dict): A dictionary representing the JSON response from a SPARQL query, where:
-        movie-related data is under 'results' -> 'bindings' -> '[key]' -> 'value'.
-        Example:
-        {
-            "results": {
-                "bindings": [
-                    {
-                        "item": {
-                            "type": "uri",
-                            "value": "http://www.wikidata.org/entity/Q12345"
-                        },
-                        "genreLabel": {
-                            "type": "literal",
-                            "value": "Science Fiction"
-                        }
-                    },
-                    {
-                    ...
-                    },
-                ]
-            }
-        }
-    key (str): The key for the information to extract (e.g. 'item', 'genreLabel', 'directorLabel').
+    data (dict): JSON response from a SPARQL query; see example in get_example_json_sparql_response().
+    key (str): The key for the information to extract (e.g., 'item', 'genreLabel', 'directorLabel').
 
     Returns:
-    None: If the key is not present or no results are available
-    List: If the key is 'genreLabel', returns a list of unique genre labels.
+    None: If the key is not present or no results are available.
+    list: If the key is 'genreLabel', returns a list of unique genre labels.
     String: If the Key is present, return the movie ID of the first binding, in other words the first row in query result
     """
     if (
@@ -161,21 +148,13 @@ def wiki_query(data_csv, user_agent):
     Formats SPARQL query for Wiki data
 
     Parameters:
-    data_csv (list of lists): A list of rows containing movie data, where:
-        row 1: movie ID (not used in query)
-        row 2: release year
-        row 3: movie title
+    data_csv (list of lists): Rows of movie data with [movie ID, release year, title].
     user_agent (str): used to identify our script when sending requests to Wikidata SPARQL API.
 
     Returns:
-        wiki_movie_ids, wiki_genres, wiki_directors (tuple), where:
-            wiki_movie_ids (list): List of movie IDs
-            wiki_genres (list): List of genres
-            wiki_directors (list): List of Directors
+        list of movieData: A list of movieData instances with movie IDs, genres, and directors.
     """
-    wiki_movie_ids = []
-    wiki_genres = []
-    wiki_directors = []
+    wiki_data_list = []
 
     for row in tqdm(data_csv):
         if row[1] is None:
@@ -189,10 +168,7 @@ def wiki_query(data_csv, user_agent):
         response = requests.post(
             "https://query.wikidata.org/sparql",
             headers={"User-Agent": user_agent},
-            data={
-                "query": SPARQL,
-                "format": "json",
-            },
+            data={"query": SPARQL, "format": "json"},
             timeout=20,
         )
         break
@@ -202,18 +178,23 @@ def wiki_query(data_csv, user_agent):
         tries += 1
         if tries > 5:
             raise WikidataServiceTimeoutException(
-                f"Tried {tries} time, could not reach Wikidata "
+                f"Tried {tries} times, could not reach Wikidata "
                 f"(movie: {row[2]} {row[1]})"
             )
 
     response.raise_for_status()
     data = response.json()
 
-        wiki_movie_ids.append(wiki_feature_info(data, "item"))
-        wiki_genres.append(wiki_feature_info(data, "genreLabel"))
-        wiki_directors.append(wiki_feature_info(data, "directorLabel"))
+        # Create movieData instance and add to the list
+        wiki_data_list.append(
+            movieData(
+                movie_id=wiki_feature_info(data, "item"),
+                genre=wiki_feature_info(data, "genreLabel"),
+                director=wiki_feature_info(data, "directorLabel"),
+            )
+        )
 
-    return wiki_movie_ids, wiki_genres, wiki_directors
+    return wiki_data_list
 
 
 def process_data(test=False):
diff --git a/mediabridge/data_processing/wiki_to_netflix_test.py b/mediabridge/data_processing/wiki_to_netflix_test.py
index 37bffd1..b7ea413 100644
--- a/mediabridge/data_processing/wiki_to_netflix_test.py
+++ b/mediabridge/data_processing/wiki_to_netflix_test.py
@@ -5,3 +5,23 @@ def test_format_sparql_query():
     QUERY = format_sparql_query("The Room", 2003)
 
     assert QUERY == EXPECTED_SPARQL_QUERY
+
+
+def get_example_json_sparql_response():
+    """
+    Returns an example response structure for testing.
+    """
+    return {
+        "results": {
+            "bindings": [
+                {
+                    "item": {
+                        "type": "uri",
+                        "value": "http://www.wikidata.org/entity/Q12345",
+                    },
+                    "genreLabel": {"type": "literal", "value": "Science Fiction"},
+                    "directorLabel": {"type": "literal", "value": "John Doe"},
+                }
+            ]
+        }
+    }

From 25e83a02c9cb479ca5d7c904db967bd14df08ca8 Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Sat, 2 Nov 2024 09:01:21 -0700
Subject: [PATCH 2/6] forgot to save

---
 mediabridge/data_processing/wiki_to_netflix.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index 0530e6f..b4f85d3 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -178,14 +178,13 @@ def wiki_query(data_csv, user_agent):
         tries += 1
         if tries > 5:
             raise WikidataServiceTimeoutException(
-                f"Tried {tries} times, could not reach Wikidata "
+                f"Tried {tries} time, could not reach Wikidata "
                 f"(movie: {row[2]} {row[1]})"
            )
 
     response.raise_for_status()
     data = response.json()
 
-        # Create movieData instance and add to the list
         wiki_data_list.append(
             movieData(

From 57a7ab37a5ce052393a061aae61c648409ecbd06 Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Sat, 2 Nov 2024 09:27:26 -0700
Subject: [PATCH 3/6] fixed capitalization

---
 mediabridge/data_processing/wiki_to_netflix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index b4f85d3..00f5584 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -13,7 +13,7 @@ class WikidataServiceTimeoutException(Exception):
 
 
 @dataclass
-class movieData:
+class MovieData:
     movie_id: str
     genre: str
     director: str

From f7d0d6880b33f6513cd88b50c9ef0cbbbe0bff25 Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Sat, 2 Nov 2024 09:28:53 -0700
Subject: [PATCH 4/6] fixed object name

---
 mediabridge/data_processing/wiki_to_netflix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index 00f5584..8e54276 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -186,7 +186,7 @@ def wiki_query(data_csv, user_agent):
     data = response.json()
 
     wiki_data_list.append(
-        movieData(
+        MovieData(
             movie_id=wiki_feature_info(data, "item"),
             genre=wiki_feature_info(data, "genreLabel"),
             director=wiki_feature_info(data, "directorLabel"),

From da75a6e18ba5afd227aeb4c5c07064e9c197c42c Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Fri, 8 Nov 2024 09:04:55 -0800
Subject: [PATCH 5/6] refactored process_data to use dataclass

---
 .../data_processing/wiki_to_netflix.py        | 50 ++++++++++++------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index 8e54276..4ffbfbc 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -2,7 +2,8 @@
 import os
 import sys
 import time
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import List, Optional
 
 import requests
 from tqdm import tqdm
@@ -14,11 +15,13 @@ class WikidataServiceTimeoutException(Exception):
 
 @dataclass
 class MovieData:
-    movie_id: str
-    genre: str
-    director: str
+    movie_id: Optional[str]
+    genre: List[str]
+    director: Optional[str]
 
 
+# need Genres, Directors, Title, year?
+
 data_dir = os.path.join(os.path.dirname(__file__), "../../data")
 out_dir = os.path.join(os.path.dirname(__file__), "../../out")
 user_agent = "Noisebridge MovieBot 0.0.1/Audiodude "
@@ -198,41 +201,54 @@ def wiki_query(data_csv, user_agent):
 
 def process_data(test=False):
     """
-    Reads movie titles and release years from the Netflix data set (which should be downloaded and placed in the repo),
-    then tries to match them with data from Wikidata. For any matches, a CSV file is written.
+    Processes Netflix movie data by enriching it with information from Wikidata and writes the results to a CSV file.
+
+    Netflix data is converted from a generator to a list to avoid exhausting it; without this, nothing was printed to the CSV file.
     """
     missing_count = 0
     processed_data = []
 
-    netflix_data = read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
+    netflix_data = list(
+        read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
+    )
 
     netflix_csv = os.path.join(out_dir, "movie_titles.csv")
 
-    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(
-        netflix_data, user_agent
-    )
+    enriched_movies = wiki_query(netflix_data, user_agent)
 
-    num_rows = len(wiki_movie_ids_list)
+    num_rows = len(enriched_movies)
 
     for index, row in enumerate(netflix_data):
         netflix_id, year, title = row
-        if wiki_movie_ids_list[index] is None:
+        movie_data = enriched_movies[index]
+        if movie_data.movie_id is None:
             missing_count += 1
+        if movie_data.genre:
+            genres = "; ".join(movie_data.genre)
+        else:
+            genres = ""
+        if movie_data.director:
+            director = movie_data.director
+        else:
+            director = ""
         movie = [
             netflix_id,
-            wiki_movie_ids_list[index],
+            movie_data.movie_id,
             title,
             year,
-            wiki_genres_list[index],
-            wiki_directors_list[index],
+            genres,
+            director,
         ]
         processed_data.append(movie)
 
+    print("Processed Data:")
+    for movie in processed_data:
+        print(movie)
+
     create_netflix_csv(netflix_csv, processed_data)
 
-    print(f"missing: {missing_count} ({missing_count / num_rows * 100}%)")
+    print(f"missing: {missing_count} ({missing_count / num_rows * 100:.2f}%)")
     print(
-        f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)"
+        f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100:.2f}%)"
     )
     print(f"total: {num_rows}")

From 4279fc974438d69c999ff542de8d1146f47c0883 Mon Sep 17 00:00:00 2001
From: JamesKohlsRepo
Date: Fri, 8 Nov 2024 09:07:01 -0800
Subject: [PATCH 6/6] removed unused import

---
 mediabridge/data_processing/wiki_to_netflix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mediabridge/data_processing/wiki_to_netflix.py b/mediabridge/data_processing/wiki_to_netflix.py
index 4ffbfbc..bc361ed 100644
--- a/mediabridge/data_processing/wiki_to_netflix.py
+++ b/mediabridge/data_processing/wiki_to_netflix.py
@@ -2,7 +2,7 @@
 import os
 import sys
 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import List, Optional
 
 import requests
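
A quick sketch for reviewers, not part of the patch series: after PATCH 5,
wiki_query returns a list of MovieData records instead of three parallel
lists, so an unmatched title shows up as a single movie_id-is-None check.
The rows_from helper below is hypothetical (introduced here only for
illustration); it condenses the genre-join and empty-string fallbacks that
process_data applies before writing the CSV, using the sample values from
get_example_json_sparql_response().

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class MovieData:
        movie_id: Optional[str]  # Wikidata entity ID; None when no match was found
        genre: List[str]         # unique genre labels; may be empty
        director: Optional[str]  # director label; None when absent


    def rows_from(movies: List[MovieData]) -> List[List[str]]:
        """Build CSV-ready rows with the same fallbacks process_data uses."""
        rows = []
        for m in movies:
            genres = "; ".join(m.genre) if m.genre else ""
            director = m.director if m.director else ""
            rows.append([m.movie_id or "", genres, director])
        return rows


    sample = [
        MovieData("Q12345", ["Science Fiction"], "John Doe"),  # matched title
        MovieData(None, [], None),  # title with no Wikidata match
    ]
    for row in rows_from(sample):
        print(row)

Running the sketch prints ['Q12345', 'Science Fiction', 'John Doe'] and
['', '', ''], showing how one object per title replaces three parallel-list
lookups that could drift out of sync.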