entry_dedupe: fix flip-flopping between duplicate new entries. #340
lemon24 authored Jun 23, 2024
1 parent d7428e4 commit 37f739c
Showing 3 changed files with 69 additions and 13 deletions.
CHANGES.rst: 7 changes (6 additions, 1 deletion)
@@ -15,6 +15,11 @@ Unreleased
:class:`EntryCounts` and :class:`EntrySearchCounts`.
Thanks to `chenthur`_ for the pull request.
(:issue:`283`)
* Fix bug in :mod:`~reader.plugins.entry_dedupe` causing entries to flip-flop
if there were multiple *new* duplicates of the same entry
(on the first update, one entry remains, on the second update, the other);
related to the bug fixed in `version 3.2 <Version 3.2_>`_.
(:issue:`340`)

.. _chenthur: https://github.com/chenthur

@@ -354,7 +359,7 @@ Released 2022-09-14
became optional.
(:issue:`96`)
* Fix bug in :mod:`~reader.plugins.entry_dedupe` causing updates to fail
if there were multiple *new* duplicates of the same issue.
if there were multiple *new* duplicates of the same entry.
(:issue:`292`)
* Fix bug in :mod:`~reader.plugins.readtime`
and :mod:`~reader.plugins.mark_as_read` causing updates to fail
src/reader/plugins/entry_dedupe.py: 42 changes (35 additions, 7 deletions)
@@ -113,14 +113,15 @@
from collections import Counter
from collections import defaultdict
from collections import deque
from datetime import datetime
from datetime import timezone
from itertools import groupby
from typing import NamedTuple

from reader._utils import BetterStrPartial as partial
from reader.exceptions import EntryNotFoundError
from reader.exceptions import TagNotFoundError
from reader.types import EntryUpdateStatus
from reader.types import Feed


log = logging.getLogger('reader.plugins.entry_dedupe')
@@ -260,11 +261,41 @@ def _after_entry_update(reader, entry, status, *, dry_run=False):
if _is_duplicate_full(entry, e)
]
if not duplicates:
log.debug("entry_dedupe: no duplicates for %s", entry.resource_id)
return

try:
entry = reader.get_entry(entry)
except EntryNotFoundError:
log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)
return

def group_key(e):
# unlike _get_entry_groups, we cannot rely on e.last_updated,
# because for duplicates in the feed, we'd end up flip-flopping
# (on the first update, entry 1 is deleted and entry 2 remains;
# on the second update, entry 1 remains because it's new,
# and entry 2 is deleted because it's not modified,
# has lower last_updated, and no update hook runs for it; repeat).
#
# it would be more correct to sort by (is in new feed, last_retrieved),
# but as of 3.14, we don't know about existing but not modified entries
# (the hook isn't called), and entries don't have last_retrieved.
#
# also see test_duplicates_in_feed / #340.
#
return e.updated or e.published or DEFAULT_UPDATED, e.id

group = [entry] + duplicates
group.sort(key=group_key, reverse=True)
entry, *duplicates = group

_dedupe_entries(reader, entry, duplicates, dry_run=dry_run)


DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)


def _get_same_group_entries(reader, entry):
# to make this better, we could do something like
# reader.search_entries(f'title: {fts5_escape(entry.title)}'),
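
The comment block above is the heart of the fix: the surviving entry must be chosen by a key that does not change between updates. A minimal, self-contained sketch of that idea (the toy Entry tuple, dates, and ids below are illustrative, not reader's actual data model):

```python
from collections import namedtuple
from datetime import datetime, timezone

# Toy stand-in for reader's Entry; fields and values are made up for illustration.
Entry = namedtuple('Entry', 'id updated published last_updated')

DEFAULT_UPDATED = datetime(1970, 1, 1, tzinfo=timezone.utc)


def group_key(e):
    # same shape as the key above: derived only from feed-provided data,
    # so it is identical no matter when the feed was last retrieved
    return e.updated or e.published or DEFAULT_UPDATED, e.id


def survivor(entries, key):
    # the entry that remains after dedupe, under a given sort key
    return max(entries, key=key).id


# first update: both duplicates are new; '2' happens to be stored last
first = [
    Entry('1', None, None, datetime(2024, 6, 1, tzinfo=timezone.utc)),
    Entry('2', None, None, datetime(2024, 6, 2, tzinfo=timezone.utc)),
]
# second update: '1' comes back as new (fresh last_updated), '2' is unchanged
second = [
    Entry('1', None, None, datetime(2024, 6, 3, tzinfo=timezone.utc)),
    Entry('2', None, None, datetime(2024, 6, 2, tzinfo=timezone.utc)),
]

print(survivor(first, lambda e: e.last_updated), survivor(second, lambda e: e.last_updated))
print(survivor(first, group_key), survivor(second, group_key))
```

Sorting by last_updated prints `2 1` (the survivor alternates, i.e. flip-flops), while the content-derived key prints `2 2` (the same entry survives every time).
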
@@ -347,6 +378,7 @@ def by_title(e):
continue

while group:
# keep the latest entry, consider the rest duplicates
group.sort(key=lambda e: e.last_updated, reverse=True)
entry, *rest = group

@@ -496,10 +528,6 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
[e.id for e in duplicates],
)

# in case entry is EntryData, not Entry
if hasattr(entry, 'as_entry'):
entry = entry.as_entry(feed=Feed(entry.feed_url))

# don't do anything until we know all actions were generated successfully
actions = list(_make_actions(reader, entry, duplicates))
# FIXME: what if this fails with EntryNotFoundError?
@@ -510,8 +538,8 @@ def _dedupe_entries(reader, entry, duplicates, *, dry_run):
for action in actions:
action()
log.info("entry_dedupe: %s", action)
except EntryNotFoundError as e:
if entry.resource_id != e.resource_id: # pragma: no cover
except EntryNotFoundError as e: # pragma: no cover
if entry.resource_id != e.resource_id:
raise
log.info("entry_dedupe: entry %r was deleted, aborting", entry.resource_id)

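The _dedupe_entries hunks above also show the plugin's defensive two-phase structure: generate every action first (via partial), and only then apply them, aborting quietly if the surviving entry itself has disappeared in the meantime. A generic sketch of that pattern, using a hypothetical in-memory Store and a local EntryNotFoundError stand-in rather than reader's internals:

```python
from functools import partial
import logging

log = logging.getLogger("dedupe-sketch")


class EntryNotFoundError(LookupError):
    # stand-in for reader.exceptions.EntryNotFoundError; illustrative only
    def __init__(self, entry_id):
        super().__init__(entry_id)
        self.entry_id = entry_id


class Store:
    # tiny in-memory stand-in for reader's storage; illustrative only
    def __init__(self, tags_by_entry):
        self.tags = dict(tags_by_entry)

    def copy_tags(self, src, dst):
        for entry_id in (src, dst):
            if entry_id not in self.tags:
                raise EntryNotFoundError(entry_id)
        self.tags[dst] = self.tags[dst] + self.tags[src]

    def delete(self, entry_id):
        if entry_id not in self.tags:
            raise EntryNotFoundError(entry_id)
        del self.tags[entry_id]


def dedupe(store, entry_id, duplicate_ids):
    # phase 1: generate all actions up front; if this fails, nothing was applied
    actions = []
    for dup in duplicate_ids:
        actions.append(partial(store.copy_tags, dup, entry_id))
        actions.append(partial(store.delete, dup))

    # phase 2: apply; abort only if the *surviving* entry vanished, else re-raise
    try:
        for action in actions:
            action()
    except EntryNotFoundError as e:
        if e.entry_id != entry_id:
            raise
        log.info("entry %r was deleted, aborting", entry_id)


store = Store({'1': ['a'], '2': ['b'], '3': ['c']})
dedupe(store, '3', ['1', '2'])
print(store.tags)  # {'3': ['c', 'a', 'b']}
```
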
tests/test_plugins_entry_dedupe.py: 33 changes (28 additions, 5 deletions)
@@ -755,12 +755,13 @@ def test_recent_sort_copying(make_reader, db_path):


@pytest.mark.parametrize('update_after_one', [False, True])
def test_duplicates_in_new_feed(make_reader, update_after_one):
@pytest.mark.parametrize('with_dates, expected_id', [(False, '3'), (True, '2')])
def test_duplicates_in_feed(make_reader, update_after_one, with_dates, expected_id):
reader = make_reader(':memory:', plugins=['reader.entry_dedupe'])
reader._parser = parser = Parser()

reader.add_feed(parser.feed(1))
one = parser.entry(1, 1, title='title', summary='summary')
one = parser.entry(1, '1', title='title', summary='summary')

if update_after_one:
reader.update_feeds()
@@ -769,10 +770,32 @@ def test_duplicates_in_new_feed(make_reader, update_after_one):
reader.mark_entry_as_important(one)
reader.set_tag(one, 'key', 'value')

parser.entry(1, 2, title='title', summary='summary')
parser.entry(1, 3, title='title', summary='summary')
parser.entry(
1,
'2',
title='title',
summary='summary',
updated=datetime(2010, 1, 2) if with_dates else None,
)
parser.entry(
1,
'3',
title='title',
summary='summary',
published=datetime(2010, 1, 1) if with_dates else None,
)

# shouldn't fail
reader.update_feeds()
assert {e.id for e in reader.get_entries()} == {expected_id}

assert [eval(e.id)[1] for e in reader.get_entries()] == [3]
# shouldn't flip flop
# https://github.com/lemon24/reader/issues/340
reader.update_feeds()
assert {e.id for e in reader.get_entries()} == {expected_id}

if update_after_one:
entry = reader.get_entry(('1', expected_id))
assert entry.read
assert entry.important
assert dict(reader.get_tags(entry)) == {'key': 'value'}

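For context, the plugin this commit fixes is enabled through make_reader's plugins argument, the same way the test above does. A minimal usage sketch (the database path and feed URL are placeholders, not part of this commit):

```python
from reader import make_reader

# "db.sqlite" and the feed URL are placeholders
reader = make_reader("db.sqlite", plugins=["reader.entry_dedupe"])
reader.add_feed("https://example.com/feed.xml")

# duplicates are merged during the update; after this fix, repeated updates
# keep the same surviving entry instead of alternating between duplicates
reader.update_feeds()
reader.update_feeds()

for entry in reader.get_entries():
    print(entry.id, entry.title)
```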