-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb_1.py
61 lines (46 loc) · 1.75 KB
/
imdb_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import requests
import pprint
from bs4 import BeautifulSoup
# IMDb chart page listing the top-rated Indian movies; this is the page that
# gets downloaded and parsed below.
URL = "https://www.imdb.com/india/top-rated-indian-movies/"

# Download the page once at import time. The timeout keeps a stalled
# connection from hanging the script indefinitely, and raise_for_status()
# reports an HTTP error here instead of letting an error page be parsed and
# fail later with an unrelated AttributeError.
sample = requests.get(URL, timeout=30)
sample.raise_for_status()

# Parsed document shared by the scraping code in this module.
soup = BeautifulSoup(sample.text, "html.parser")
def scrape_top_list():
    """Turn the downloaded chart page into a list of movie records.

    Reads the module-level ``soup`` and walks every ``<tr>`` found in the
    ``tbody.lister-list`` table inside ``div.lister``.

    Returns:
        A list of dicts, one per table row in page order, each with the keys
        ``position`` (int), ``name`` (str), ``years`` (int),
        ``rating`` (float) and ``url`` (str, prefixed with
        ``https://www.imdb.com``).

    Raises:
        AttributeError: if an expected element is missing from the page
            (a ``find`` lookup returned ``None``).
        ValueError: if a rank, year or rating cannot be converted to a number.
    """
    lister = soup.find('div', class_='lister')
    table_body = lister.find('tbody', class_='lister-list')

    all_movies = []
    for row in table_body.find_all('tr'):
        # Look the title cell up once: rank, name, year and link all come
        # from this single <td>.
        title_cell = row.find('td', class_="titleColumn")

        # Everything before the first '.' in the cell text is treated as the
        # rank; if there is no '.', the whole stripped text is used.
        rank = title_cell.get_text().strip().partition('.')[0]
        name = title_cell.a.get_text()
        # Characters 1..4 of the span text are taken as the year
        # (presumably the text looks like "(YYYY)" -- verify against the page).
        year = title_cell.span.get_text()[1:5]
        rating = row.find('td', class_="ratingColumn").strong.get_text()
        link = "https://www.imdb.com" + title_cell.a['href']

        # Build a fresh dict per row so records never share state.
        all_movies.append({
            'position': int(rank),
            'name': str(name),
            'years': int(year),
            'rating': float(rating),
            'url': link,
        })
    return all_movies
# Run the scraper when the module is loaded and keep the resulting list of
# movie dicts in a module-level name.
scrapped_movies = scrape_top_list()
# Debug aid: uncomment the next line to pretty-print the scraped records.
# pprint.pprint(scrapped_movies)