Skip to content

Commit f81a082

Browse files
authored
Merge pull request #6 from tswast/national-jukebox-more
download all files from national jukebox
2 parents a995eae + 03f814e commit f81a082

File tree

2 files changed

+60
-31
lines changed

2 files changed

+60
-31
lines changed

2025/national-jukebox/download_first_page.py renamed to 2025/national-jukebox/download_all.py

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -22,24 +22,24 @@
2222
import list_urls
2323
import extract_item_info
2424
import extract_mp3
25+
import download_mp3s
2526

2627

2728
DATA_DIR = pathlib.Path(__file__).parent / "data"
2829

2930

3031
# target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
31-
target_url = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp=2"
32-
item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
32+
target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
3333

3434

3535
def download_and_extract_item(base_url):
3636
print(f"Fetching content from: {base_url}")
3737
# https://guides.loc.gov/digital-scholarship/faq
3838
# Stay within 20 requests per minute rate limit.
3939
time.sleep(3)
40-
response = requests.get(base_url)
4140

4241
try:
42+
response = requests.get(base_url, timeout=10)
4343
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
4444
except requests.exceptions.RequestException as e:
4545
print(f"Error fetching URL: {e}")
@@ -52,23 +52,44 @@ def download_and_extract_item(base_url):
5252
return item
5353

5454

55-
visited_urls = {}
56-
jukebox_path = DATA_DIR / "jukebox.jsonl"
5755

58-
if jukebox_path.exists():
59-
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
60-
visited_urls = frozenset(jukebox["URL"].to_list()) if "URL" in jukebox.columns else {}
56+
def download_page(page_number, *, max_attempts=3):
    """Fetch one listing page and append its unrecorded items to jukebox.jsonl.

    The listing URL is built from ``target_url_template`` and passed to
    ``list_urls.get_national_jukebox_song_detail_urls``; any exception raised
    by that call propagates to the caller. Each returned item URL that is not
    already present in the "URL" column of ``DATA_DIR / "jukebox.jsonl"`` is
    fetched with ``download_and_extract_item`` and written as one JSON line,
    flushed immediately after writing.

    Args:
        page_number: Value substituted into the ``sp`` query parameter of the
            listing URL.
        max_attempts: How many times a single item URL is tried before it is
            skipped. A failed attempt is one where
            ``download_and_extract_item`` returns ``None``.
    """
    # Local import so this block does not depend on a file-level import.
    from collections import deque

    target_url = target_url_template.format(page_number)
    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)

    jukebox_path = DATA_DIR / "jukebox.jsonl"

    # Always a plain set, whether or not an earlier run left a data file.
    visited_urls = set()
    if jukebox_path.exists():
        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
        if "URL" in jukebox.columns:
            visited_urls.update(jukebox["URL"].to_list())

    # Queue of (url, attempt number). A deque gives O(1) pops from the front,
    # and failed URLs go to the back so other items are tried first.
    pending = deque((item_url, 1) for item_url in item_urls)

    with open(jukebox_path, "a") as data_file:
        while pending:
            item_url, attempt = pending.popleft()
            if item_url in visited_urls:
                continue

            item = download_and_extract_item(item_url)
            if item is None:
                # Bounded retry: an item that keeps failing must not keep the
                # loop alive forever.
                if attempt < max_attempts:
                    pending.append((item_url, attempt + 1))
                else:
                    print(f"Giving up on {item_url} after {max_attempts} attempts")
                continue

            json.dump(item, data_file, indent=None)
            data_file.write("\n")
            data_file.flush()

            # Avoid writing the same item twice if the listing repeats a URL.
            visited_urls.add(item_url)
81+
82+
83+
if __name__ == "__main__":
    # Walk the listing pages one after another, downloading item metadata and
    # then the MP3s, until the site answers 404 for a page.
    # NOTE(review): the crawl starts at page 4, presumably because earlier
    # pages were fetched in a previous run -- confirm before reusing.
    page_number = 4

    # Abort after this many failing pages in a row instead of silently
    # skipping pages forever when the server keeps returning errors.
    max_consecutive_failures = 3
    consecutive_failures = 0

    while True:
        print(f"Page {page_number}")
        try:
            download_page(page_number)
            download_mp3s.download_all()
        except requests.exceptions.HTTPError as exc:
            # HTTPError.response can be None, so read the status defensively.
            status_code = getattr(exc.response, "status_code", None)
            if status_code == 404:
                print("Reached last page?")
                break

            # Any other HTTP error: report it rather than swallow it, then
            # move on to the next page as before.
            consecutive_failures += 1
            print(f"HTTP error on page {page_number} (status {status_code}): {exc}")
            if consecutive_failures >= max_consecutive_failures:
                raise
        else:
            consecutive_failures = 0
        page_number += 1
7195

72-
json.dump(item, data_file, indent=None)
73-
data_file.write("\n")
74-
data_file.flush()

2025/national-jukebox/download_mp3s.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,9 @@ def download_mp3(base_url):
2828
# https://guides.loc.gov/digital-scholarship/faq
2929
# Stay within 20 requests per minute rate limit.
3030
time.sleep(3)
31-
response = requests.get(base_url)
3231

3332
try:
33+
response = requests.get(base_url)
3434
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
3535
except requests.exceptions.RequestException as e:
3636
print(f"Error fetching URL: {e}")
@@ -39,17 +39,25 @@ def download_mp3(base_url):
3939
return response.content
4040

4141

42-
jukebox_path = DATA_DIR / "jukebox.jsonl"
43-
jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
42+
def download_all(start_row=100):
    """Download the MP3 for each row of jukebox.jsonl that has no local file.

    Reads ``DATA_DIR / "jukebox.jsonl"`` as JSON lines and, for every row from
    ``start_row`` onward, derives a file name from the row's "URL" value and
    fetches the row's "MP3 URL" with ``download_mp3`` unless the target file
    already exists. Rows whose download returns ``None`` are skipped.

    Args:
        start_row: Positional index of the first row to process. The default
            of 100 keeps the previously hard-coded behaviour; pass 0 to
            process every row.
    """
    jukebox_path = DATA_DIR / "jukebox.jsonl"
    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")

    for _, row in jukebox.iloc[start_row:].iterrows():
        # The file name is the second-to-last "/"-separated piece of the URL.
        # NOTE(review): this presumably relies on item URLs ending in "/" --
        # confirm against the data.
        jukebox_id = row["URL"].split("/")[-2]
        mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
        if mp3_path.exists():
            continue

        mp3_bytes = download_mp3(row["MP3 URL"])
        if mp3_bytes is None:
            continue

        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated .mp3 that the exists() check above
        # would treat as already downloaded.
        partial_path = mp3_path.with_name(mp3_path.name + ".part")
        with open(partial_path, "wb") as mp3_file:
            mp3_file.write(mp3_bytes)
        partial_path.replace(mp3_path)
        print(f"Wrote {mp3_path}")


if __name__ == "__main__":
    download_all()

0 commit comments

Comments
 (0)