The project involves a detailed examination of Netflix's content to extract valuable insights and address various business questions. By leveraging SQL queries, we aim to analyze the distribution of content types (movies vs. TV shows), identify common ratings, explore content based on release years, countries, and durations, and categorize content using specific criteria and keywords.
- Analyze distribution between movies and TV shows on Netflix.
- Identify prevalent ratings for movies and TV shows.
- Examine content based on release years, countries, and durations.
- Explore and categorize content using specific criteria and keywords.
The dataset used in this project is sourced from Kaggle:
- Dataset Link: Movies and TV Shows Dataset
CREATE TABLE netflix
(
show_id VARCHAR(5),
type VARCHAR(7),
title VARCHAR(104),
director VARCHAR(208),
casts VARCHAR(771),
country VARCHAR(123),
date_added DATE,
release_year INT,
rating VARCHAR(8),
duration VARCHAR(10),
listed_in VARCHAR(79),
description VARCHAR(250)
);
SELECT
types,
COUNT(*) AS Total_type
FROM
netflix
GROUP BY
types;
Goal: Analyze the distribution of content types on Netflix.
WITH Ratings AS (
SELECT
types,
rating,
count(rating),
RANK() OVER(PARTITION BY types ORDER BY COUNT(*) DESC) AS ranking
FROM
netflix
GROUP BY
types, rating
)
SELECT
types,
rating
FROM
Ratings
WHERE
ranking = 1
Goal: Find the most common rating for each type of content.
SELECT
*
FROM
netflix
WHERE
types = 'Movie'
AND
release_year = 2021
Goal: Compile a list of all movies that were released in the year 2021.
WITH country_data AS (
SELECT
TRIM(SUBSTR(country, 1, INSTR(country, ',') - 1)) AS country
FROM
netflix
WHERE
country LIKE '%,%'
UNION ALL
SELECT
TRIM(country) AS country
FROM
netflix
WHERE
country NOT LIKE '%,%'
AND country IS NOT NULL
AND country <> ''
)
SELECT
country,
COUNT(*) AS content_count
FROM
country_data
GROUP BY
country
ORDER BY
content_count DESC
LIMIT 5;
Goal: Determine the top 5 countries with the most extensive content libraries available on Netflix.
SELECT
types,
MAX(duration) AS Longest_movie
FROM
netflix
WHERE
types = 'Movie'
Goal: Find the movie with the longest duration in the dataset.
SELECT
*
FROM
netflix
WHERE
date_added >= DATE('now', '-3 years');
Goal: List content added to Netflix in the past 3 years.
SELECT
*
FROM
netflix
WHERE
director LIKE '%Martin Scorsese%' COLLATE NOCASE
Goal: Compile a list of movies and TV shows directed by a specific director like Martin Scorsese.
SELECT
*
FROM
netflix
WHERE
types = 'TV Show'
AND
duration > '4 Seasons'
Goal: Identify all TV shows with over 4 seasons.
WITH each_genre AS (
SELECT
TRIM(SUBSTRING(listed_in, 1, INSTR(listed_in, ',') - 1)) AS genre
FROM
netflix
WHERE
listed_in LIKE '%,%'
UNION ALL
SELECT
TRIM(listed_in) AS genre
FROM
netflix
WHERE
listed_in NOT LIKE '%,%'
)
SELECT
genre,
COUNT(*) AS count
FROM
each_genre
GROUP BY
genre
ORDER BY
count DESC
Goal: Determine the count of content items in each genre category.
WITH YearlyReleaseCounts AS (
SELECT
release_year,
COUNT(*) AS num_releases
FROM
netflix
GROUP BY
release_year
),
TotalYears AS (
SELECT COUNT(DISTINCT release_year) AS total_years
FROM netflix
)
SELECT
SUM(num_releases) / (SELECT total_years FROM TotalYears) AS avg_releases_per_year
FROM
YearlyReleaseCounts;
Goal: Determine the average annual content release count within the dataset.
SELECT
*
FROM
netflix
WHERE
types = 'Movie'
AND
listed_in LIKE '%Documentaries%' COLLATE NOCASE
Goal: Compile a list of movies classified as documentaries.
SELECT
*
FROM
netflix
WHERE
director IS NULL OR director = '';
Goal: Identify content without a specified director.
SELECT
*
FROM
netflix
WHERE
types = 'Movie'
AND
casts LIKE '%Leonardo DiCaprio%' COLLATE NOCASE
Goal: Determine the total number of movies in which actor 'Leonardo DiCaprio' has appeared.
WITH SplitActors AS (
SELECT
TRIM(SUBSTR(casts, 1, INSTR(casts, ',') - 1)) AS actor
FROM
netflix
WHERE
casts LIKE '%,%'
UNION ALL
SELECT
TRIM(casts) AS actor
FROM
netflix
WHERE
casts NOT LIKE '%,%'
AND
casts IS NOT NULL
AND
casts <> ''
)
SELECT
actor,
COUNT(*) AS total_appearances
FROM
SplitActors
WHERE
actor IS NOT NULL AND actor <> ''
GROUP BY
actor
ORDER BY
total_appearances DESC
LIMIT 5;
Goal: Identify the top 5 actors with the most appearances in movies within the dataset.
15. Categorize the content based on the presence of the keywords 'detective' and 'investigation' in the description field.
WITH MoviesData AS (
SELECT
*,
CASE
WHEN
description LIKE '%detective%' COLLATE NOCASE
OR
description LIKE '%investigation%' COLLATE NOCASE
THEN
'Investigative Film'
ELSE
'General'
END AS movie_category
FROM
netflix
)
SELECT
movie_category,
COUNT(*) AS total_movies_in_category
FROM
MoviesData
GROUP BY
movie_category
Goal: Categorize content as 'Investigate Film' if it contains 'detective' or 'investigation' and 'General' otherwise. Count the number of items in each category.
- The dataset encompasses a broad range of movies and TV shows with varied ratings and genres.
- Insights into common ratings offer a glimpse into the target audience for the content.
- Examination of top countries and average content releases per year.
- Categorizing content based on specific keywords aids in understanding the content landscape on Netflix.
- This analysis offers a holistic view of Netflix's content, supporting content strategy and decision-making.