-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
56 lines (42 loc) · 1.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from film_content_parser import obtain_film_object
from parser_config import check_film_object, watched_included
from html_creator import create_html_file
def get_watched_films(file_path):
    """Return the non-empty lines of the text file at *file_path*.

    The file is read in full and split on newline characters; empty lines
    (including the one produced by a trailing newline) are dropped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty lines, in file order. The list is empty when
        the file has no non-empty lines.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
    """
    # The context manager closes the handle even if reading fails.
    with open(file_path, 'r') as watched_films_txt:
        watched_names = watched_films_txt.read().split('\n')
    return [name for name in watched_names if name != '']
# Read 'watched_films.txt' only when watched_included() is false; otherwise
# watched_films stays None.
watched_films = (
    None if watched_included() else get_watched_films('watched_films.txt')
)
# Seconds to pause after navigation so the browser can finish loading the page.
TIME_FACTOR = 3
# URL of the IMDb list to process.
list_url = "https://www.imdb.com/list/ls041732779/?ref_=tt_rls_5"

print("Opening a webdriver")
driver = webdriver.Chrome()
try:
    driver.get(list_url)
    print("Waiting the website to be loaded")
    # Fixed pause before the page source is read.
    time.sleep(TIME_FACTOR)
    # page_source is already text, so it is handed to the parser as-is rather
    # than being round-tripped through encoded bytes.
    page_html = driver.page_source.strip()
finally:
    # The browser is only needed to fetch the page; quit() ends the whole
    # driver session even when loading fails, so no browser is left behind.
    driver.quit()

soup = BeautifulSoup(page_html, 'lxml')

# One element per film entry in the list.
film_contents = soup.find_all("div", class_="lister-item mode-detail")

header_tag = soup.find("h1", class_='header list-name')
if header_tag is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError("Could not find the list header in the page: " + list_url)
list_header = header_tag.text

print("Parsing and querying films")
wanted_films = []
for film_entry in film_contents:
    img_source = film_entry.find(
        'div', class_='lister-item-image ribbonize'
    ).find('img')
    film_body = film_entry.find('div', class_='lister-item-content')
    current_film = obtain_film_object(film_body, img_source)
    # Keep only the films accepted by check_film_object.
    if check_film_object(current_film, watched_films):
        wanted_films.append(current_film)

create_html_file(wanted_films, list_header)
print("New html created with the name ", list_header)