Skip to content

Commit

Permalink
Merge pull request #38 from JamesKohlsRepo/main
Browse files Browse the repository at this point in the history
Anti-pattern correction and Docstring Adjustments
  • Loading branch information
JamesKohlsRepo authored Nov 9, 2024
2 parents 1733fcb + 4279fc9 commit 1b94f49
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 59 deletions.
2 changes: 1 addition & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

112 changes: 54 additions & 58 deletions mediabridge/data_processing/wiki_to_netflix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os
import sys
import time
from dataclasses import dataclass
from typing import List, Optional

import requests
from tqdm import tqdm
Expand All @@ -11,6 +13,15 @@ class WikidataServiceTimeoutException(Exception):
pass


@dataclass
class MovieData:
    """Movie attributes extracted from a Wikidata SPARQL query response."""

    movie_id: Optional[str]  # Wikidata entity ID of the first result row, or None if no match was found
    genre: Optional[List[str]]  # unique genre labels; None when the key is absent from the response — see wiki_feature_info
    director: Optional[str]  # director label, or None when not returned by the query


# need Genres, Directors, Title, year?

data_dir = os.path.join(os.path.dirname(__file__), "../../data")
out_dir = os.path.join(os.path.dirname(__file__), "../../out")
user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <audiodude@gmail.com>"
Expand Down Expand Up @@ -49,36 +60,15 @@ def create_netflix_csv(csv_name, data_list):

def wiki_feature_info(data, key):
"""
Extracts movie info from Wikidata query results.
Extracts movie information from a Wikidata query result.
Parameters:
data (dict): A dictionary representing the JSON response from a SPARQL query, where:
movie-related data is under 'results' -> 'bindings' -> '[key]' -> 'value'.
Example:
{
"results": {
"bindings": [
{
"item": {
"type": "uri",
"value": "http://www.wikidata.org/entity/Q12345"
},
"genreLabel": {
"type": "literal",
"value": "Science Fiction"
}
},
{
...
},
]
}
}
key (str): The key for the information to extract (e.g. 'item', 'genreLabel', 'directorLabel').
data (dict): JSON response from a SPARQL query, see example in get_example_json_sparql_response().
key (str): The key for the information to extract (e.g., 'item', 'genreLabel', 'directorLabel').
Returns:
None: If the key is not present or no results are available
List: If the key is 'genreLabel', returns a list of unique genre labels.
None: If the key is not present or no results are available.
list: If the key is 'genreLabel', returns a list of unique genre labels.
str: If the key is present, returns the movie ID of the first binding (in other words, the first row in the query result).
"""
if (
Expand Down Expand Up @@ -161,21 +151,13 @@ def wiki_query(data_csv, user_agent):
Formats SPARQL query for Wiki data
Parameters:
data_csv (list of lists): A list of rows containing movie data, where:
row 1: movie ID (not used in query)
row 2: release year
row 3: movie title
data_csv (list of lists): Rows of movie data with [movie ID, release year, title].
user_agent (str): used to identify our script when sending requests to Wikidata SPARQL API.
Returns:
wiki_movie_ids, wiki_genres, wiki_directors (tuple), where:
wiki_movie_ids (list): List of movie IDs
wiki_genres (list): List of genres
wiki_directors (list): List of Directors
list of MovieData: A list of MovieData instances with movie IDs, genres, and directors.
"""
wiki_movie_ids = []
wiki_genres = []
wiki_directors = []
wiki_data_list = []

for row in tqdm(data_csv):
if row[1] is None:
Expand All @@ -189,10 +171,7 @@ def wiki_query(data_csv, user_agent):
response = requests.post(
"https://query.wikidata.org/sparql",
headers={"User-Agent": user_agent},
data={
"query": SPARQL,
"format": "json",
},
data={"query": SPARQL, "format": "json"},
timeout=20,
)
break
Expand All @@ -209,50 +188,67 @@ def wiki_query(data_csv, user_agent):
response.raise_for_status()
data = response.json()

wiki_movie_ids.append(wiki_feature_info(data, "item"))
wiki_genres.append(wiki_feature_info(data, "genreLabel"))
wiki_directors.append(wiki_feature_info(data, "directorLabel"))
wiki_data_list.append(
MovieData(
movie_id=wiki_feature_info(data, "item"),
genre=wiki_feature_info(data, "genreLabel"),
director=wiki_feature_info(data, "directorLabel"),
)
)

return wiki_movie_ids, wiki_genres, wiki_directors
return wiki_data_list


def process_data(test=False):
"""
Reads movie titles and release years from the Netflix data set (which should be downloaded and placed in the repo),
then tries to match them with data from Wikidata. For any matches, a CSV file is written.
Processes Netflix movie data by enriching it with information from Wikidata and writes the results to a CSV file.
Netflix data was converted from a generator to a list to avoid exhaustion; previously the generator was consumed before writing, so nothing would print to the CSV file.
"""
missing_count = 0
processed_data = []

netflix_data = read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
netflix_data = list(
read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)
)

netflix_csv = os.path.join(out_dir, "movie_titles.csv")

wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(
netflix_data, user_agent
)
enriched_movies = wiki_query(netflix_data, user_agent)

num_rows = len(wiki_movie_ids_list)
num_rows = len(enriched_movies)

for index, row in enumerate(netflix_data):
netflix_id, year, title = row
if wiki_movie_ids_list[index] is None:
movie_data = enriched_movies[index]
if movie_data.movie_id is None:
missing_count += 1
if movie_data.genre:
genres = "; ".join(movie_data.genre)
else:
genres = ""
if movie_data.director:
director = movie_data.director
else:
director = ""
movie = [
netflix_id,
wiki_movie_ids_list[index],
movie_data.movie_id,
title,
year,
wiki_genres_list[index],
wiki_directors_list[index],
genres,
director,
]
processed_data.append(movie)

print("Processed Data:")
for movie in processed_data:
print(movie)

create_netflix_csv(netflix_csv, processed_data)

print(f"missing: {missing_count} ({missing_count / num_rows * 100}%)")
print(f"missing: {missing_count} ({missing_count / num_rows * 100:.2f}%)")
print(
f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)"
f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100:.2f}%)"
)
print(f"total: {num_rows}")

Expand Down
20 changes: 20 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,23 @@
def test_format_sparql_query():
    # Golden-text check: the generated SPARQL for a known title/year must match
    # the expected query verbatim. EXPECTED_SPARQL_QUERY and format_sparql_query
    # are presumably imported at the top of this test module — confirm.
    QUERY = format_sparql_query("The Room", 2003)
    assert QUERY == EXPECTED_SPARQL_QUERY


def get_example_json_sparql_response():
    """
    Build a canned Wikidata SPARQL JSON response containing a single result
    binding (item, genreLabel, directorLabel), for use as a test fixture.
    """
    # One result row, shaped like the real endpoint's 'bindings' entries.
    binding = {
        "item": {
            "type": "uri",
            "value": "http://www.wikidata.org/entity/Q12345",
        },
        "genreLabel": {"type": "literal", "value": "Science Fiction"},
        "directorLabel": {"type": "literal", "value": "John Doe"},
    }
    return {"results": {"bindings": [binding]}}

0 comments on commit 1b94f49

Please sign in to comment.