Merged
31 changes: 18 additions & 13 deletions pocket_to_sqlite/cli.py
@@ -81,26 +81,31 @@ def auth(auth):
)
@click.option("--all", is_flag=True, help="Fetch all items (not just new ones)")
@click.option("-s", "--silent", is_flag=True, help="Don't show progress bar")
def fetch(db_path, auth, all, silent):
@click.option("--debug", is_flag=True, help="Enable debug logging")
def fetch(db_path, auth, all, silent, debug):
"Save Pocket data to a SQLite database"
if debug:
import logging
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')
print("Debug logging enabled")

auth = json.load(open(auth))
db = sqlite_utils.Database(db_path)
last_since = None
if not all and db["since"].exists():
last_since = db["since"].get(1)["since"]
fetch = utils.FetchItems(
auth,
since=last_since,
record_since=lambda since: db["since"].insert(
{"id": 1, "since": since}, replace=True, pk="id"
),
)
if (all or last_since is None) and not silent:

# For incremental fetch, start from the number of items already in DB
start_offset = 0
if not all and "items" in db.table_names():
start_offset = db["items"].count
if debug:
print(f"Found {start_offset} existing items, starting from offset {start_offset}")

fetch = utils.FetchItems(auth, start_offset=start_offset)
if (all or start_offset == 0) and not silent:
total_items = utils.fetch_stats(auth)["count_list"]
with click.progressbar(fetch, length=total_items, show_pos=True) as bar:
utils.save_items(bar, db)
else:
# No progress bar
print("Fetching items since {}".format(last_since))
print("Fetching items from offset {}".format(start_offset))
utils.save_items(fetch, db)
utils.ensure_fts(db)
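
A minimal sketch, not part of this PR, of exercising the updated command via click's test runner; "pocket.db" and "auth.json" are placeholder paths, and the -a/--auth option is assumed from the options defined above this hunk:

from click.testing import CliRunner
from pocket_to_sqlite.cli import fetch

runner = CliRunner()
# On an empty database this fetches everything and shows the progress bar;
# a later run resumes from the current row count of the items table, and
# --debug turns on the verbose request/response logging added above.
result = runner.invoke(fetch, ["pocket.db", "-a", "auth.json", "--debug"])
print(result.exit_code, result.output)
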
85 changes: 63 additions & 22 deletions pocket_to_sqlite/utils.py
@@ -2,27 +2,44 @@
import requests
import json
import time
import logging
import hashlib
from sqlite_utils.db import AlterError, ForeignKey


def save_items(items, db):
count = 0
for item in items:
count += 1
logging.debug(f"Processing item {count}: {item.get('item_id', 'unknown')}")
transform(item)
authors = item.pop("authors", None)
items_authors_to_save = []
if authors:
authors_to_save = []
for details in authors.values():
# Handle both numeric and string author_ids
author_id_raw = details["author_id"]
try:
# Try to use as integer (normal case)
author_id = int(author_id_raw)
author_name = details["name"]
except ValueError:
# String author_id - treat it as the name and generate unique ID
author_name = author_id_raw
# Generate deterministic integer ID from the string
author_id = int(hashlib.md5(author_id_raw.encode()).hexdigest()[:8], 16)

authors_to_save.append(
{
"author_id": int(details["author_id"]),
"name": details["name"],
"author_id": author_id,
"name": author_name,
"url": details["url"],
}
)
items_authors_to_save.append(
{
"author_id": int(details["author_id"]),
"author_id": author_id,
"item_id": int(details["item_id"]),
}
)
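
A standalone sketch of the fallback above, showing how a non-numeric author_id is treated as the author's name and mapped to a stable integer via the first eight hex digits of its MD5 hash, so repeated runs assign the same key:

import hashlib

def fallback_author_id(author_id_raw):
    # Numeric IDs pass through unchanged; string IDs get a deterministic integer.
    try:
        return int(author_id_raw)
    except ValueError:
        return int(hashlib.md5(author_id_raw.encode()).hexdigest()[:8], 16)

print(fallback_author_id("12345"))     # 12345
print(fallback_author_id("Jane Doe"))  # same integer on every run
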
@@ -64,36 +81,35 @@ def transform(item):


def ensure_fts(db):
if "items_fts" not in db.table_names():
if "items_fts" not in db.table_names() and "items" in db.table_names():
db["items"].enable_fts(["resolved_title", "excerpt"], create_triggers=True)


def fetch_stats(auth):
response = requests.get(
"https://getpocket.com/v3/stats",
{
"consumer_key": auth["pocket_consumer_key"],
"access_token": auth["pocket_access_token"],
},
)
headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF8"}
data = {
"consumer_key": auth["pocket_consumer_key"],
"access_token": auth["pocket_access_token"],
}
response = requests.post("https://getpocket.com/v3/stats", data=data, headers=headers)
response.raise_for_status()
return response.json()
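
A minimal usage sketch of the reworked fetch_stats(); the auth dict mirrors the keys loaded from auth.json in cli.py, with placeholder values:

from pocket_to_sqlite.utils import fetch_stats

auth = {
    "pocket_consumer_key": "CONSUMER_KEY",
    "pocket_access_token": "ACCESS_TOKEN",
}
stats = fetch_stats(auth)
print(stats["count_list"])  # total saved items, used as the progress bar length
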


class FetchItems:
def __init__(
self, auth, since=None, page_size=500, sleep=2, retry_sleep=3, record_since=None
self, auth, start_offset=0, page_size=50, sleep=2, retry_sleep=3
):
self.auth = auth
self.since = since
self.start_offset = start_offset
self.page_size = page_size
self.sleep = sleep
self.retry_sleep = retry_sleep
self.record_since = record_since

def __iter__(self):
offset = 0
offset = self.start_offset
retries = 0
logging.debug(f"Starting fetch with start_offset={self.start_offset}, page_size={self.page_size}")
while True:
args = {
"consumer_key": self.auth["pocket_consumer_key"],
@@ -104,9 +120,11 @@ def __iter__(self):
"count": self.page_size,
"offset": offset,
}
if self.since is not None:
args["since"] = self.since
response = requests.get("https://getpocket.com/v3/get", args)

logging.debug(f"Making API request to /v3/get with offset={offset}")
headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF8"}
response = requests.post("https://getpocket.com/v3/get", data=args, headers=headers)
logging.debug(f"API response status: {response.status_code}")
if response.status_code == 503 and retries < 5:
print("Got a 503, retrying...")
retries += 1
@@ -116,13 +134,36 @@ def __iter__(self):
retries = 0
response.raise_for_status()
page = response.json()
items = list((page["list"] or {}).values())
next_since = page["since"]
if self.record_since and next_since:
self.record_since(next_since)
logging.debug(f"API response keys: {list(page.keys())}")

# Check for API errors (error key present AND has a non-None value)
error_msg = page.get('error')
if error_msg is not None:
logging.error(f"API returned error: {page}")

# Handle payload too large by reducing page size
if "413" in str(error_msg) or "Payload Too Large" in str(error_msg):
if self.page_size > 10:
new_page_size = max(10, self.page_size // 2)
logging.warning(f"Payload too large, reducing page size from {self.page_size} to {new_page_size}")
self.page_size = new_page_size
continue # Retry with smaller page size
else:
raise Exception(f"Pocket API error: Even minimum page size (10) is too large: {error_msg}")

raise Exception(f"Pocket API error: {error_msg}")

items = list((page.get("list") or {}).values())
logging.debug(f"Found {len(items)} items in this page")

next_since = page.get("since")
logging.debug(f"Next since value: {next_since}")
if not items:
logging.debug("No more items found, breaking from loop")
break
logging.debug(f"Yielding {len(items)} items")
yield from items
offset += self.page_size
logging.debug(f"Updated offset to {offset}")
if self.sleep:
time.sleep(self.sleep)
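
A rough end-to-end sketch, under the same assumptions as cli.py (placeholder paths; the items table may not exist yet), of driving the offset-based fetcher directly from the utils module:

import json
import sqlite_utils
from pocket_to_sqlite.utils import FetchItems, save_items, ensure_fts

auth = json.load(open("auth.json"))
db = sqlite_utils.Database("pocket.db")
# Resume from the number of rows already saved, as the CLI now does.
start_offset = db["items"].count if "items" in db.table_names() else 0
save_items(FetchItems(auth, start_offset=start_offset), db)
ensure_fts(db)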