|
| 1 | +#------------------------------------------------------------------------------- |
| 2 | +# Name:        Scrape and Visualize IMDB Data
| 3 | +# Author: Prashant Pandey |
| 4 | +# |
| 5 | +#------------------------------------------------------------------------------- |
| 6 | + |
| 7 | + |
| 8 | + |
| 9 | +# encoding=utf8 |
| 10 | +#importing the necessary packages |
| 11 | +from bs4 import BeautifulSoup |
| 12 | +from requests import get |
| 13 | +import pandas as pd |
| 14 | +from time import sleep, time |
| 15 | +from random import randint |
| 16 | +from warnings import warn |
| 17 | +import matplotlib.pyplot as plt |
| 18 | + |
| 19 | + |
################### PART 1 - DATA AND VISUALIZATION #########################

# Query-string fragments: result pages 1-4 for each release year 2000-2017,
# and a header asking IMDB for English-language titles.
pages = [str(page) for page in range(1, 5)]
years_url = [str(year) for year in range(2000, 2018)]
headers = {"Accept-Language": "en-US, en;q=0.5"}

# Accumulators for the scraped fields — one entry per movie, kept aligned.
names, years, imdb_ratings, metascores, votes = [], [], [], [], []

# Loop-monitoring state: wall-clock start time and a running request counter.
start_time = time()
requests = 0
| 36 | + |
# Scrape every (year, page) combination: 18 years x 4 pages = 72 requests,
# appending one entry per movie to the module-level lists declared above.
for year_url in years_url:

    for page in pages:

        # Request one page of results for this release year.
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
                       '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause 8-15 seconds between requests to stay polite to the server.
        sleep(randint(8,15))

        # Monitor the request rate.
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))

        # Throw a warning for non-200 status codes but keep going (best effort).
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop if the number of requests exceeds the expected 72.
        if requests > 72:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup.
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the (up to 50) movie containers from a single page.
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        for container in mv_containers:
            # Only keep movies that have a Metascore, so all lists stay aligned.
            if container.find('div', class_ = 'ratings-metascore') is not None:

                # Name
                names.append(container.h3.a.text)

                # Release year (raw string such as '(2000)'; cleaned up later)
                years.append(container.h3.find('span', class_ = 'lister-item-year').text)

                # IMDB rating (0-10 float)
                imdb_ratings.append(float(container.strong.text))

                # Metascore (0-100 int)
                m_score = container.find('span', class_ = 'metascore').text
                metascores.append(int(m_score))

                # Number of votes
                vote = container.find('span', attrs = {'name':'nv'})['data-value']
                votes.append(int(vote))

    # BUGFIX: the inner `break` only exits the page loop, so after warning about
    # too many requests the script used to continue with the next year anyway.
    # Propagate the stop condition so the year loop terminates too.
    if requests > 72:
        break
| 98 | + |
# Assemble the scraped columns into one DataFrame, one row per movie.
movie_ratings = pd.DataFrame({'movie': names,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})

# Fix the column order explicitly (dict insertion order was not guaranteed
# before Python 3.7, so the DataFrame columns could come out shuffled).
movie_ratings = movie_ratings[['movie', 'year', 'imdb', 'metascore', 'votes']]

# BUGFIX: dropped a dead `movie_ratings['year'].unique()` statement whose
# result was discarded — a leftover interactive-inspection line with no effect.

# Clean 'year': raw strings look like '(2000)' or '(I) (2000)'; the slice
# [-5:-1] extracts the 4-digit year in both cases.
movie_ratings.loc[:, 'year'] = movie_ratings['year'].str[-5:-1].astype(int)

# Normalize the 0-10 IMDB rating onto the Metascore's 0-100 scale.
movie_ratings['n_imdb'] = movie_ratings['imdb'] * 10

# Persist the cleaned data (index column included, matching prior output).
movie_ratings.to_csv('movie_ratings.csv', encoding='utf-8')
| 110 | + |
# Three-panel comparison of the IMDB and Metascore rating distributions.
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (16,4))
ax1, ax2, ax3 = fig.axes

# IMDB ratings on their native 0-10 scale (bin width = 1).
ax1.hist(movie_ratings['imdb'], bins = 10, range = (0,10))
ax1.set_title('IMDB rating')

# Metascores on their native 0-100 scale (bin width = 10).
ax2.hist(movie_ratings['metascore'], bins = 10, range = (0,100))
ax2.set_title('Metascore')

# Overlay both distributions on the common 0-100 scale.
# BUGFIX: legend() needs labeled artists — without `label=` on the hist calls
# matplotlib warns "No handles with labels found" and draws an empty legend.
ax3.hist(movie_ratings['n_imdb'], bins = 10, range = (0,100), histtype = 'step',
         label = 'IMDB rating (x10)')
ax3.hist(movie_ratings['metascore'], bins = 10, range = (0,100), histtype = 'step',
         label = 'Metascore')
ax3.legend(loc = 'upper left')
ax3.set_title('The Two Normalized Distributions')

# De-clutter: hide the top and right spines on every panel.
for ax in fig.axes:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

plt.show()