Skip to content

Commit f2a6519

Browse files
digithreeclaude
andcommitted
Support both numeric and string author ID schemas
Handle both numeric and string author IDs when saving Pocket items:

- Numeric author_ids are handled as before (existing schema).
- String author_ids (alternative schema):
  * The string is treated as the author name.
  * A deterministic integer ID is generated from an MD5 hash of the string.
  * The integer author_id constraint in the database is maintained.
- Comprehensive test coverage is added for:
  * String author IDs becoming names with generated IDs.
  * Mixed numeric/string author IDs in the same item.
  * Consistent ID generation for the same string values.

This supports the ~5-10% of Pocket items that use the alternative author schema without breaking existing database structure or functionality.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 78e3559 commit f2a6519

File tree

2 files changed

+120
-3
lines changed

2 files changed

+120
-3
lines changed

pocket_to_sqlite/utils.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import json
44
import time
55
import logging
6+
import hashlib
67
from sqlite_utils.db import AlterError, ForeignKey
78

89

@@ -17,16 +18,28 @@ def save_items(items, db):
1718
if authors:
1819
authors_to_save = []
1920
for details in authors.values():
21+
# Handle both numeric and string author_ids
22+
author_id_raw = details["author_id"]
23+
try:
24+
# Try to use as integer (normal case)
25+
author_id = int(author_id_raw)
26+
author_name = details["name"]
27+
except ValueError:
28+
# String author_id - treat it as the name and generate unique ID
29+
author_name = author_id_raw
30+
# Generate deterministic integer ID from the string
31+
author_id = int(hashlib.md5(author_id_raw.encode()).hexdigest()[:8], 16)
32+
2033
authors_to_save.append(
2134
{
22-
"author_id": int(details["author_id"]),
23-
"name": details["name"],
35+
"author_id": author_id,
36+
"name": author_name,
2437
"url": details["url"],
2538
}
2639
)
2740
items_authors_to_save.append(
2841
{
29-
"author_id": int(details["author_id"]),
42+
"author_id": author_id,
3043
"item_id": int(details["item_id"]),
3144
}
3245
)

tests/test_save_pocket.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,110 @@ def test_fetch_items_handles_error_none_success():
220220
assert items[0]["item_id"] == "1"
221221

222222

223+
def test_save_items_handles_string_author_ids():
    """A non-numeric author_id is stored as the author's name under an integer ID."""
    # Alternative-schema author: "author_id" carries the display name, and the
    # "name" field is expected to be ignored when the row is saved.
    string_author = {
        "author_id": "Sandra E. Garcia",
        "name": "Original Name",
        "url": "http://example.com",
        "item_id": "123",
    }
    item = {
        "item_id": "123",
        "title": "Test Item",
        "authors": {"1": string_author},
    }
    database = sqlite_utils.Database(":memory:")

    utils.save_items([item], database)

    # Exactly one item row and one author row should have been written.
    assert database["items"].count == 1
    assert database["authors"].count == 1

    saved_rows = list(database["authors"].rows)
    saved = saved_rows[0]
    # The stored ID must be an integer and the string must have become the name.
    assert isinstance(saved["author_id"], int)
    assert saved["name"] == "Sandra E. Garcia"
    assert saved["url"] == "http://example.com"
252+
253+
254+
def test_save_items_handles_mixed_author_id_types():
    """One item may carry both a numeric author_id and a string author_id."""
    # First author uses the regular schema (numeric ID passed as a string).
    numeric_author = {
        "author_id": "456",
        "name": "John Doe",
        "url": "http://example.com",
        "item_id": "123",
    }
    # Second author uses the alternative schema (the ID field holds the name).
    string_author = {
        "author_id": "Jane Smith",
        "name": "Original Name",
        "url": "http://example2.com",
        "item_id": "123",
    }
    item = {
        "item_id": "123",
        "title": "Test Item",
        "authors": {"1": numeric_author, "2": string_author},
    }
    database = sqlite_utils.Database(":memory:")

    utils.save_items([item], database)

    # One item row, and one author row per author entry.
    assert database["items"].count == 1
    assert database["authors"].count == 2

    by_name = {}
    for row in database["authors"].rows:
        by_name[row["name"]] = row

    # The numeric ID is kept as-is (converted to int).
    assert by_name["John Doe"]["author_id"] == 456

    # The string ID becomes the name and gets its own generated integer ID.
    assert "Jane Smith" in by_name
    generated_id = by_name["Jane Smith"]["author_id"]
    assert isinstance(generated_id, int)
    assert generated_id != 456
293+
294+
295+
def test_string_author_id_generates_consistent_ids():
    """Saving the same string author_id into separate databases yields the same ID."""
    import copy

    item_template = {
        "item_id": "123",
        "title": "Test Item",
        "authors": {
            "1": {
                "author_id": "Sandra E. Garcia",
                "name": "Original Name",
                "url": "http://example.com",
                "item_id": "123",
            }
        },
    }

    # Save an independent deep copy of the item into each of two fresh
    # in-memory databases, so neither save can affect the other's input.
    saved_authors = []
    for _ in range(2):
        database = sqlite_utils.Database(":memory:")
        utils.save_items([copy.deepcopy(item_template)], database)
        author_rows = list(database["authors"].rows)
        saved_authors.append(author_rows[0])

    first, second = saved_authors
    # Same input string must map to the same generated author_id.
    assert first["author_id"] == second["author_id"]
    assert first["name"] == "Sandra E. Garcia"
    assert second["name"] == "Sandra E. Garcia"
325+
326+
223327
def test_ensure_fts_with_no_items_table():
224328
"""Test that ensure_fts handles case when items table doesn't exist."""
225329
db = sqlite_utils.Database(":memory:")

0 commit comments

Comments
 (0)