22
22
import list_urls
23
23
import extract_item_info
24
24
import extract_mp3
25
+ import download_mp3s
25
26
26
27
27
28
# Directory for scraped output (jukebox.jsonl and downloaded MP3s),
# located next to this script.
DATA_DIR = pathlib.Path(__file__).parent / "data"

# Previously a single hard-coded search-results page:
# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
# Template for one page of LoC National Jukebox search results:
# 100 items per page (c=100), newest first (sb=date_desc),
# page number substituted into sp={}.
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
33
33
34
34
35
35
def download_and_extract_item (base_url ):
36
36
print (f"Fetching content from: { base_url } " )
37
37
# https://guides.loc.gov/digital-scholarship/faq
38
38
# Stay within 20 requests per minute rate limit.
39
39
time .sleep (3 )
40
- response = requests .get (base_url )
41
40
42
41
try :
42
+ response = requests .get (base_url , timeout = 10 )
43
43
response .raise_for_status () # Raise an exception for HTTP errors (4xx or 5xx)
44
44
except requests .exceptions .RequestException as e :
45
45
print (f"Error fetching URL: { e } " )
@@ -52,23 +52,44 @@ def download_and_extract_item(base_url):
52
52
return item
53
53
54
54
55
def download_page(page_number):
    """Scrape one page of National Jukebox search results into jukebox.jsonl.

    Fetches the item-detail URLs listed on results page ``page_number``,
    skips any URL already recorded in the JSONL file from a previous run,
    and appends one JSON record per newly downloaded item.

    Raises:
        requests.exceptions.HTTPError: propagated from fetching the results
            page (the caller uses a 404 to detect the last page).
    """
    target_url = target_url_template.format(page_number)
    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)

    # URLs already saved on a previous run; empty set when starting fresh.
    visited_urls = set()
    jukebox_path = DATA_DIR / "jukebox.jsonl"

    if jukebox_path.exists():
        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
        # FIX: the fallback used to be `{}` (an empty *dict*); use set() so
        # visited_urls has a consistent type on both branches.
        visited_urls = frozenset(jukebox["URL"].to_list()) if "URL" in jukebox.columns else set()

    # FIX: the old loop requeued a failed URL unconditionally, so one
    # permanently broken item could spin forever. Cap retries per URL.
    max_retries = 3
    retries = {}

    # Reuse jukebox_path instead of rebuilding the same path inline.
    with open(jukebox_path, "a") as data_file:
        pending = list(item_urls)
        while pending:
            item_url = pending.pop(0)
            if item_url in visited_urls:
                continue

            item = download_and_extract_item(item_url)
            if item is None:
                # Likely a transient fetch failure: retry later, but give up
                # on this URL after max_retries attempts.
                retries[item_url] = retries.get(item_url, 0) + 1
                if retries[item_url] < max_retries:
                    pending.append(item_url)
                continue

            json.dump(item, data_file, indent=None)
            data_file.write("\n")
            # Flush after every record so an interrupted run loses at most
            # the item currently in flight.
            data_file.flush()
81
+
82
+
83
if __name__ == "__main__":
    # Resume point: pages before 4 were scraped in a previous run.
    page_number = 4
    while True:
        print(f"Page {page_number}")
        try:
            download_page(page_number)
            download_mp3s.download_all()
        except requests.exceptions.HTTPError as exc:
            # Requesting a results page past the end returns 404:
            # treat that as "done".
            if exc.response.status_code == 404:
                print("Reached last page?")
                break
            # FIX: other HTTP errors were silently swallowed and the loop
            # simply moved on to the next page, hiding failures — re-raise.
            raise
        page_number += 1
71
95
72
- json .dump (item , data_file , indent = None )
73
- data_file .write ("\n " )
74
- data_file .flush ()
0 commit comments