Merge pull request #6 from tswast/national-jukebox-more

tswast · web-flow · commit f81a082a762a · 2025-07-10T16:55:24.000-05:00
download all files from national jukebox
diff --git a/2025/national-jukebox/download_all.py b/2025/national-jukebox/download_all.py
@@ -22,24 +22,24 @@
 import list_urls
 import extract_item_info
 import extract_mp3
+import download_mp3s
 
 
 DATA_DIR = pathlib.Path(__file__).parent / "data"
 
 
 # target_url = "https://www.loc.gov/collections/national-jukebox/?sb=date_desc&c=100"
-target_url = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp=2"
-item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
+target_url_template = "https://www.loc.gov/collections/national-jukebox/?c=100&sb=date_desc&sp={}"
 
 
 def download_and_extract_item(base_url):
     print(f"Fetching content from: {base_url}")
     # https://guides.loc.gov/digital-scholarship/faq
     # Stay within 20 requests per minute rate limit.
     time.sleep(3)
-    response = requests.get(base_url)
 
     try:
+        response = requests.get(base_url, timeout=10)
         response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
     except requests.exceptions.RequestException as e:
         print(f"Error fetching URL: {e}")
@@ -52,23 +52,60 @@ def download_and_extract_item(base_url):
     return item
 
 
-visited_urls = {}
-jukebox_path = DATA_DIR / "jukebox.jsonl"
 
-if jukebox_path.exists():
-    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
-    visited_urls = frozenset(jukebox["URL"].to_list()) if "URL" in jukebox.columns else {}
+def download_page(page_number, max_attempts=3):
+    """Append a record for each unvisited item on one listing page to jukebox.jsonl.
+
+    Items whose download fails are re-queued, but only up to max_attempts tries
+    each, so a permanently failing URL cannot keep this loop running forever.
+    """
+    target_url = target_url_template.format(page_number)
+    item_urls = list_urls.get_national_jukebox_song_detail_urls(target_url)
 
+    visited_urls = frozenset()
+    jukebox_path = DATA_DIR / "jukebox.jsonl"
 
-with open(DATA_DIR / "jukebox.jsonl", "a") as data_file:
-    for item_url in item_urls:
-        if item_url in visited_urls:
-            continue
+    if jukebox_path.exists():
+        jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
+        if "URL" in jukebox.columns:
+            visited_urls = frozenset(jukebox["URL"].to_list())
 
-        item = download_and_extract_item(item_url)
-        if item is None:
-            continue
+    attempts = {}
+    with open(jukebox_path, "a") as data_file:
+        while item_urls:
+            item_url = item_urls.pop(0)
+            if item_url in visited_urls:
+                continue
+
+            item = download_and_extract_item(item_url)
+            if item is None:
+                # Retry later, but give up once the attempt budget is spent.
+                attempts[item_url] = attempts.get(item_url, 0) + 1
+                if attempts[item_url] < max_attempts:
+                    item_urls.append(item_url)
+                else:
+                    print(f"Giving up on {item_url} after {max_attempts} attempts")
+                continue
+
+            json.dump(item, data_file, indent=None)
+            data_file.write("\n")
+            data_file.flush()
+
+
+if __name__ == "__main__":
+    page_number = 4
+    while True:
+        print(f"Page {page_number}")
+        try:
+            download_page(page_number)
+            download_mp3s.download_all()
+        except requests.exceptions.HTTPError as exc:
+            # A 404 is taken to mean we have run past the final listing page.
+            if exc.response is not None and exc.response.status_code == 404:
+                print("Reached last page?")
+                break
+            # Other HTTP errors (e.g. rate limiting) would otherwise advance the
+            # page counter forever without ever reaching the 404 exit; surface them.
+            raise
+        page_number += 1
 
-        json.dump(item, data_file, indent=None)
-        data_file.write("\n")
-        data_file.flush()
diff --git a/2025/national-jukebox/download_mp3s.py b/2025/national-jukebox/download_mp3s.py
@@ -28,9 +28,9 @@ def download_mp3(base_url):
     # https://guides.loc.gov/digital-scholarship/faq
     # Stay within 20 requests per minute rate limit.
     time.sleep(3)
-    response = requests.get(base_url)
 
     try:
+        response = requests.get(base_url, timeout=10)  # never wait forever on a stalled server
         response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
     except requests.exceptions.RequestException as e:
         print(f"Error fetching URL: {e}")
@@ -39,17 +39,30 @@ def download_mp3(base_url):
     return response.content
 
 
-jukebox_path = DATA_DIR / "jukebox.jsonl"
-jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
+def download_all(start_row=100):
+    """Download the MP3 for each row of jukebox.jsonl that has no local file yet.
+
+    Rows before start_row are skipped; the default of 100 keeps the previously
+    hard-coded slice. Pass 0 to consider every row.
+    """
+    jukebox_path = DATA_DIR / "jukebox.jsonl"
+    jukebox = pandas.read_json(jukebox_path, lines=True, orient="records")
 
-# for _, row in jukebox.iterrows():
-for _, row in jukebox.iloc[100:].iterrows():
-    jukebox_id = row["URL"].split("/")[-2]
-    mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
-    if mp3_path.exists():
-        continue
+    for _, row in jukebox.iloc[start_row:].iterrows():
+        jukebox_id = row["URL"].split("/")[-2]
+        mp3_path = (DATA_DIR / jukebox_id).with_suffix(".mp3")
+        if mp3_path.exists():
+            continue
 
-    mp3_bytes = download_mp3(row["MP3 URL"])
-    with open(mp3_path, "wb") as mp3_file:
-        mp3_file.write(mp3_bytes)
-    print(f"Wrote {mp3_path}")
+        mp3_bytes = download_mp3(row["MP3 URL"])
+        if mp3_bytes is None:
+            # Presumably a failed download; write nothing so a later run retries it.
+            continue
+
+        with open(mp3_path, "wb") as mp3_file:
+            mp3_file.write(mp3_bytes)
+        print(f"Wrote {mp3_path}")
+
+
+if __name__ == "__main__":
+    download_all()