Skip to content

Commit ee1e964

Browse files
authored
Merge pull request #2 from digithree/fix-keyerror-list-missing
Comprehensive fixes: API connectivity, error handling, retry logic, and progress tracking
2 parents fd5f2ac + 4277c4d commit ee1e964

File tree

3 files changed

+421
-35
lines changed

3 files changed

+421
-35
lines changed

pocket_to_sqlite/cli.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -81,26 +81,31 @@ def auth(auth):
8181
)
8282
@click.option("--all", is_flag=True, help="Fetch all items (not just new ones)")
8383
@click.option("-s", "--silent", is_flag=True, help="Don't show progress bar")
84-
@click.option("--debug", is_flag=True, help="Enable debug logging")
def fetch(db_path, auth, all, silent, debug):
    "Save Pocket data to a SQLite database"
    # NOTE: the docstring above is kept verbatim on purpose; click displays a
    # command's docstring as its help text, so editing it changes CLI output.
    #
    # Parameters (supplied by the click decorators):
    #   db_path -- path handed to sqlite_utils.Database
    #   auth    -- path to a JSON file holding the Pocket credentials
    #   all     -- when set, ignore existing rows and start from offset 0
    #   silent  -- when set, never show the progress bar
    #   debug   -- when set, configure DEBUG-level logging and print extra info
    if debug:
        # Imported lazily so logging is only configured when requested.
        import logging

        logging.basicConfig(level=logging.DEBUG, format="%(levelname)s: %(message)s")
        print("Debug logging enabled")

    # Read the credentials inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(auth) as auth_file:
        credentials = json.load(auth_file)
    db = sqlite_utils.Database(db_path)

    # For incremental fetch, start from the number of items already in DB
    start_offset = 0
    if not all and "items" in db.table_names():
        start_offset = db["items"].count
        if debug:
            print(f"Found {start_offset} existing items, starting from offset {start_offset}")

    # Named `items` rather than `fetch` so the local does not shadow this
    # command function.
    items = utils.FetchItems(credentials, start_offset=start_offset)
    if (all or start_offset == 0) and not silent:
        # A full fetch knows its total up front, so a progress bar is possible.
        total_items = utils.fetch_stats(credentials)["count_list"]
        with click.progressbar(items, length=total_items, show_pos=True) as bar:
            utils.save_items(bar, db)
    else:
        # No progress bar
        print("Fetching items from offset {}".format(start_offset))
        utils.save_items(items, db)
    utils.ensure_fts(db)

pocket_to_sqlite/utils.py

Lines changed: 63 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2,27 +2,44 @@
22
import requests
33
import json
44
import time
5+
import logging
6+
import hashlib
57
from sqlite_utils.db import AlterError, ForeignKey
68

79

810
def save_items(items, db):
11+
count = 0
912
for item in items:
13+
count += 1
14+
logging.debug(f"Processing item {count}: {item.get('item_id', 'unknown')}")
1015
transform(item)
1116
authors = item.pop("authors", None)
1217
items_authors_to_save = []
1318
if authors:
1419
authors_to_save = []
1520
for details in authors.values():
21+
# Handle both numeric and string author_ids
22+
author_id_raw = details["author_id"]
23+
try:
24+
# Try to use as integer (normal case)
25+
author_id = int(author_id_raw)
26+
author_name = details["name"]
27+
except ValueError:
28+
# String author_id - treat it as the name and generate unique ID
29+
author_name = author_id_raw
30+
# Generate deterministic integer ID from the string
31+
author_id = int(hashlib.md5(author_id_raw.encode()).hexdigest()[:8], 16)
32+
1633
authors_to_save.append(
1734
{
18-
"author_id": int(details["author_id"]),
19-
"name": details["name"],
35+
"author_id": author_id,
36+
"name": author_name,
2037
"url": details["url"],
2138
}
2239
)
2340
items_authors_to_save.append(
2441
{
25-
"author_id": int(details["author_id"]),
42+
"author_id": author_id,
2643
"item_id": int(details["item_id"]),
2744
}
2845
)
@@ -64,36 +81,35 @@ def transform(item):
6481

6582

6683
def ensure_fts(db):
    """Enable full-text search on the ``items`` table if it is not set up yet.

    Nothing happens when an ``items_fts`` table already exists, or when the
    database has no ``items`` table at all; in the latter case the ``items``
    table is never touched.

    Args:
        db: Database object exposing ``table_names()`` and ``db[name]``
            table access (a ``sqlite_utils.Database`` in this project).
    """
    # Query the table list once and reuse it for both membership checks.
    table_names = db.table_names()
    if "items" in table_names and "items_fts" not in table_names:
        db["items"].enable_fts(["resolved_title", "excerpt"], create_triggers=True)
6986

7087

7188
def fetch_stats(auth, timeout=30):
    """Fetch account statistics from the Pocket ``/v3/stats`` endpoint.

    Args:
        auth: Mapping containing ``pocket_consumer_key`` and
            ``pocket_access_token``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the CLI
            on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response carries an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.post(
        "https://getpocket.com/v3/stats",
        data={
            "consumer_key": auth["pocket_consumer_key"],
            "access_token": auth["pocket_access_token"],
        },
        headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF8"},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
8197

8298

8399
class FetchItems:
84100
def __init__(
85-
self, auth, since=None, page_size=500, sleep=2, retry_sleep=3, record_since=None
101+
self, auth, start_offset=0, page_size=50, sleep=2, retry_sleep=3
86102
):
87103
self.auth = auth
88-
self.since = since
104+
self.start_offset = start_offset
89105
self.page_size = page_size
90106
self.sleep = sleep
91107
self.retry_sleep = retry_sleep
92-
self.record_since = record_since
93108

94109
def __iter__(self):
95-
offset = 0
110+
offset = self.start_offset
96111
retries = 0
112+
logging.debug(f"Starting fetch with start_offset={self.start_offset}, page_size={self.page_size}")
97113
while True:
98114
args = {
99115
"consumer_key": self.auth["pocket_consumer_key"],
@@ -104,9 +120,11 @@ def __iter__(self):
104120
"count": self.page_size,
105121
"offset": offset,
106122
}
107-
if self.since is not None:
108-
args["since"] = self.since
109-
response = requests.get("https://getpocket.com/v3/get", args)
123+
124+
logging.debug(f"Making API request to /v3/get with offset={offset}")
125+
headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF8"}
126+
response = requests.post("https://getpocket.com/v3/get", data=args, headers=headers)
127+
logging.debug(f"API response status: {response.status_code}")
110128
if response.status_code == 503 and retries < 5:
111129
print("Got a 503, retrying...")
112130
retries += 1
@@ -116,13 +134,36 @@ def __iter__(self):
116134
retries = 0
117135
response.raise_for_status()
118136
page = response.json()
119-
items = list((page["list"] or {}).values())
120-
next_since = page["since"]
121-
if self.record_since and next_since:
122-
self.record_since(next_since)
137+
logging.debug(f"API response keys: {list(page.keys())}")
138+
139+
# Check for API errors (error key present AND has a non-None value)
140+
error_msg = page.get('error')
141+
if error_msg is not None:
142+
logging.error(f"API returned error: {page}")
143+
144+
# Handle payload too large by reducing page size
145+
if "413" in str(error_msg) or "Payload Too Large" in str(error_msg):
146+
if self.page_size > 10:
147+
new_page_size = max(10, self.page_size // 2)
148+
logging.warning(f"Payload too large, reducing page size from {self.page_size} to {new_page_size}")
149+
self.page_size = new_page_size
150+
continue # Retry with smaller page size
151+
else:
152+
raise Exception(f"Pocket API error: Even minimum page size (10) is too large: {error_msg}")
153+
154+
raise Exception(f"Pocket API error: {error_msg}")
155+
156+
items = list((page.get("list") or {}).values())
157+
logging.debug(f"Found {len(items)} items in this page")
158+
159+
next_since = page.get("since")
160+
logging.debug(f"Next since value: {next_since}")
123161
if not items:
162+
logging.debug("No more items found, breaking from loop")
124163
break
164+
logging.debug(f"Yielding {len(items)} items")
125165
yield from items
126166
offset += self.page_size
167+
logging.debug(f"Updated offset to {offset}")
127168
if self.sleep:
128169
time.sleep(self.sleep)

0 commit comments

Comments
 (0)