-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch_test_data.py
More file actions
51 lines (46 loc) · 1.6 KB
/
Copy pathfetch_test_data.py
File metadata and controls
51 lines (46 loc) · 1.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""Fetch new test data for validation."""
import os
import requests
import time
DATA_DIR = "data/validation"
# Make sure the output directory exists before anything tries to write to it.
os.makedirs(DATA_DIR, exist_ok=True)

# 1. FINANCIAL - Apple 10-K (different company)
print("Fetching Apple 10-K...")
# Project-local loader; the import is kept at this point in the script, i.e.
# after the output directory has been created.
from src.loaders import SECLoader

ticker, form_type = "AAPL", "10-K"
loader = SECLoader(download_dir=DATA_DIR)
try:
    # Request only the single filing selected by num_filings=1.
    docs = loader.download(ticker, form_type, num_filings=1)
    print(f" Downloaded: {len(docs)} Apple filings")
except Exception as e:
    # Best effort: report the failure and let the script continue with the
    # remaining data sources instead of aborting.
    print(f" Error: {e}")
# 2. TECHNICAL - FastAPI docs
print("\nFetching FastAPI documentation...")
fastapi_urls = [
    "https://fastapi.tiangolo.com/tutorial/first-steps/",
    "https://fastapi.tiangolo.com/tutorial/path-params/",
    "https://fastapi.tiangolo.com/tutorial/query-params/",
]
for url in fastapi_urls:
    try:
        resp = requests.get(url, timeout=10)
        # Treat HTTP 4xx/5xx as failures so an error page is never saved as
        # if it were valid validation data.
        resp.raise_for_status()
        # Name the file after the last path segment of the URL; stripping the
        # trailing slash first makes this work with or without one.
        filename = url.rstrip("/").rsplit("/", 1)[-1] + ".html"
        # Explicit UTF-8 so the write does not depend on the platform's
        # default encoding (which can fail on non-ASCII HTML).
        with open(os.path.join(DATA_DIR, filename), "w", encoding="utf-8") as f:
            f.write(resp.text)
        print(f" Saved: {filename}")
        # Short pause between successful requests.
        time.sleep(0.5)
    except (requests.RequestException, OSError) as e:
        # Network/HTTP errors and file-system errors are reported per URL;
        # the loop then moves on to the next page.
        print(f" Error fetching {url}: {e}")
# 3. LEGAL - GitHub Terms of Service
print("\nFetching GitHub ToS...")
github_tos_url = "https://docs.github.com/en/site-policy/github-terms/github-terms-of-service"
try:
    resp = requests.get(github_tos_url, timeout=10)
    # Treat HTTP 4xx/5xx as failures so an error page is never saved as if it
    # were the real Terms of Service document.
    resp.raise_for_status()
    # Explicit UTF-8 so the write does not depend on the platform's default
    # encoding (which can fail on non-ASCII HTML).
    with open(os.path.join(DATA_DIR, "github_tos.html"), "w", encoding="utf-8") as f:
        f.write(resp.text)
    print(" Saved: github_tos.html")
except (requests.RequestException, OSError) as e:
    # Network/HTTP errors and file-system errors are reported; the script
    # continues to the final summary.
    print(f" Error: {e}")
# Summary: list everything now present in the output directory with its size.
# The header is built from DATA_DIR so it cannot drift from the directory that
# is actually listed.
print(f"\nDone! Files in {DATA_DIR}/:")
# os.listdir returns entries in arbitrary order; sort for a stable listing.
for name in sorted(os.listdir(DATA_DIR)):
    # NOTE(review): entries may include sub-directories (e.g. created by the
    # SEC loader); for those, getsize reports the directory entry's own size,
    # not the size of its contents.
    size = os.path.getsize(os.path.join(DATA_DIR, name))
    print(f" {name}: {size:,} bytes")