-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathdownload.py
82 lines (67 loc) · 3.69 KB
/
download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from multiprocessing import Pool
import os
import time
from counter_and_status_bar import MultiProcessingCounterAndStatusBar
from downloaders import MediaDownloader
import fire
from utils import check_max_simultaneous_downloads, get_default_download_dir
from web_handler import RedditStateHandler
def start_download(subreddit: str,
                   sort_time: str,
                   sort_by: str,
                   download_dir: str = None,
                   max_post_downloads: int = 10,
                   max_trials: int = 20,
                   max_simultaneous_downloads: int = 16,
                   ):
    """
    Download media from the posts of a subreddit using a pool of worker processes.

    The subreddit listing is fetched page by page. The posts of every page are
    handed to ``MediaDownloader.download_post`` through a ``multiprocessing.Pool``
    until the shared counter reaches ``max_post_downloads`` or no further page
    is available. The counter/status bar is always closed before returning.

    :param subreddit: Name of the subreddit.
    :param sort_time: Sort time - oneOf(all, year, month, week, day).
    :param sort_by: Sort by - oneOf(top, hot, rising, new).
    :param download_dir: Path to the download directory. All files will be downloaded
        into a sub-directory named after the subreddit. When None, the directory
        returned by ``get_default_download_dir()`` is used.
    :param max_post_downloads: Maximum number of posts to download. Default 10.
    :param max_trials: Maximum number of trials to fetch subreddit data. Default 20.
    :param max_simultaneous_downloads: Maximum number of simultaneous downloads. Default 16.
    """
    # If download directory is not specified, fall back to the default directory.
    if download_dir is None:
        download_dir = get_default_download_dir()

    # Set download dir to subreddit name inside provided download dir.
    download_dir = os.path.join(download_dir, subreddit)

    # Check max_simultaneous_downloads.
    max_simultaneous_downloads = check_max_simultaneous_downloads(max_simultaneous_downloads)

    # Setup web handler for reddit.
    reddit_state_handler = RedditStateHandler(subreddit=subreddit,
                                              sort_by=sort_by,
                                              sort_time=sort_time,
                                              max_trials=max_trials)

    # Initialize the multiprocessing counter.
    MultiProcessingCounterAndStatusBar.init(total_count=max_post_downloads,
                                            subreddit_name=subreddit,
                                            sort_by=sort_by,
                                            sort_time=sort_time,
                                            local_download_dir=download_dir)

    # try/finally so the counter/status bar is closed on every exit path,
    # including the early return below and unexpected errors.
    try:
        # Loop till we reach max_post_downloads.
        while MultiProcessingCounterAndStatusBar.global_counter.value < max_post_downloads:
            # Get subreddit as json.
            subreddit_data = reddit_state_handler.webpage_as_json
            posts_info = subreddit_data.get('data', {}).get('children', None)
            if posts_info is None:
                print(f'Unable to find posts for {subreddit}. Please try again! :(')
                return

            # Initialize the media downloader.
            MediaDownloader.init(local_download_dir=download_dir,
                                 global_counter=MultiProcessingCounterAndStatusBar,
                                 max_posts_download=max_post_downloads)

            # Setup page post data and download using multiprocessing.
            posts_data = [content['data'] for content in posts_info]

            # Clamp the chunk size to [1, 64]. Without the lower bound the value is 0
            # whenever max_post_downloads < max_simultaneous_downloads, and
            # Pool.map(..., chunksize=0) dispatches no tasks at all.
            chunk_size = max(1, min(64, max_post_downloads // max_simultaneous_downloads))

            with Pool(max_simultaneous_downloads) as multi_processor:
                multi_processor.map(MediaDownloader.download_post, posts_data, chunksize=chunk_size)

            # Go to next page if all posts on current page are exhausted and we are still running.
            next_page_available = reddit_state_handler.try_to_goto_next_page()
            if not next_page_available:
                break
    finally:
        MultiProcessingCounterAndStatusBar.close_counter()
if __name__ == '__main__':
    # Script entry point: hand start_download to python-fire, which runs it
    # with arguments taken from the command line.
    fire.Fire(component=start_download)