Commit 519f01b

Adding imdb script
1 parent 8865698 commit 519f01b

File tree

1 file changed: +129 −0 lines changed


IMDB.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
#-------------------------------------------------------------------------------
# Name:        Scraping and Visualization of IMDB Data
# Author:      Prashant Pandey
#
#-------------------------------------------------------------------------------


# encoding=utf8
# Importing the necessary packages
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
from time import sleep, time
from random import randint
from warnings import warn
import matplotlib.pyplot as plt


################### PART 1 - DATA AND VISUALIZATION #########################

pages = [str(i) for i in range(1,5)]
years_url = [str(i) for i in range(2000,2018)]
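# The Accept-Language header asks IMDB for English-language pages so titles are not localized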
headers = {"Accept-Language": "en-US, en;q=0.5"}

# Declaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0
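
# With 18 years x 4 pages, the loop below makes at most 72 requests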

# For every year in the interval 2000-2017
for year_url in years_url:

    # For every page in the interval 1-4
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        #clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Scrape the name
                name = container.h3.a.text
                names.append(name)

                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text
                years.append(year)

                # Scrape the IMDB rating
                imdb = float(container.strong.text)
                imdb_ratings.append(imdb)

                # Scrape the Metascore
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Scrape the number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))
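
# Assemble the scraped lists into a single DataFrame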
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})

# Keep the columns in a fixed order
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]

# Notebook-style inspection of the raw year strings; has no visible effect when run as a plain script
movie_ratings['year'].unique()
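# Keep only the 4-digit year from strings like '(2017)' and convert it to int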
movie_ratings.loc[:, 'year'] = movie_ratings['year'].str[-5:-1].astype(int)
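# Rescale the IMDB rating (0-10) to a 0-100 range so it can be compared with the Metascore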
movie_ratings['n_imdb'] = movie_ratings['imdb'] * 10

# Save the cleaned data set
movie_ratings.to_csv('movie_ratings.csv', encoding='utf-8')
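
# Three side-by-side histograms: raw IMDB ratings, Metascores, and both on a shared 0-100 scale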
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (16,4))
ax1, ax2, ax3 = fig.axes

ax1.hist(movie_ratings['imdb'], bins = 10, range = (0,10)) # bin range = 1
ax1.set_title('IMDB rating')

ax2.hist(movie_ratings['metascore'], bins = 10, range = (0,100)) # bin range = 10
ax2.set_title('Metascore')

ax3.hist(movie_ratings['n_imdb'], bins = 10, range = (0,100), histtype = 'step', label = 'IMDB (x10)')
ax3.hist(movie_ratings['metascore'], bins = 10, range = (0,100), histtype = 'step', label = 'Metascore')
ax3.legend(loc = 'upper left')
ax3.set_title('The Two Normalized Distributions')
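
# Remove the top and right spines for a cleaner look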
for ax in fig.axes:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

plt.show()
