From 7d395fdd4b17d245377c6e3cf3110014535b1207 Mon Sep 17 00:00:00 2001 From: David Mandelberg Date: Wed, 1 Nov 2023 17:22:56 -0400 Subject: [PATCH] Start on related media filter --- rock_paper_sand/proto/config.proto | 9 ++ rock_paper_sand/wikidata.py | 61 ++++++++++ rock_paper_sand/wikidata_test.py | 178 ++++++++++++++++++++++++++++- 3 files changed, 247 insertions(+), 1 deletion(-) diff --git a/rock_paper_sand/proto/config.proto b/rock_paper_sand/proto/config.proto index c59b4cc..1d1b606 100644 --- a/rock_paper_sand/proto/config.proto +++ b/rock_paper_sand/proto/config.proto @@ -78,6 +78,15 @@ message WikidataFilter { // If specified, only items with one of these release statuses will match. repeated ReleaseStatus release_statuses = 1; + + // Placeholder for future options for related_media. + message RelatedMedia {} + + // Filters for top-level media to find potential additional parts that aren't + // present in the config file, and existing parts that aren't related to the + // top-level Wikidata item. E.g., this can be used to find sequels to an item, + // or to find all items in a franchise. + RelatedMedia related_media = 2; } // Filters using the JustWatch API. In general, all specified/non-default fields diff --git a/rock_paper_sand/wikidata.py b/rock_paper_sand/wikidata.py index c65cdc5..2b31ef0 100644 --- a/rock_paper_sand/wikidata.py +++ b/rock_paper_sand/wikidata.py @@ -18,6 +18,7 @@ import contextlib import dataclasses import datetime +import pprint import re import typing from typing import Any @@ -28,6 +29,7 @@ from rock_paper_sand import exceptions from rock_paper_sand import media_filter +from rock_paper_sand import media_item from rock_paper_sand import network from rock_paper_sand import wikidata_value from rock_paper_sand.proto import config_pb2 @@ -428,6 +430,60 @@ def __init__( self._config = filter_config self._api = api + def _related_media( + self, request: media_filter.FilterRequest + ) -> Set[media_filter.ResultExtra]: + if request.item.has_parent: + return frozenset() + items_from_config = frozenset( + item.wikidata_item + for item in media_item.iter_all_items((request.item,)) + if item.wikidata_item is not None + ) + assert request.item.wikidata_item is not None # Already checked. + unprocessed: set[wikidata_value.Item] = {request.item.wikidata_item} + processed: set[wikidata_value.Item] = set() + loose: set[wikidata_value.Item] = set() + while unprocessed: + if len(unprocessed) + len(processed) > 1000: + processed_str = sorted(map(str, processed)) + unprocessed_str = sorted(map(str, unprocessed)) + raise ValueError( + "Too many related media items reached from " + f"{request.item.wikidata_item}:\n" + f"Processed: {pprint.pformat(processed_str)}\n" + f"Unprocessed: {pprint.pformat(unprocessed_str)}" + ) + current = unprocessed.pop() + processed.add(current) + related = self._api.related_media(current) + unprocessed.update( + parent for parent in related.parents if parent not in processed + ) + unprocessed.update(related.siblings - processed) + unprocessed.update( + child for child in related.children if child not in processed + ) + loose.update(related.loose) + unprocessed.update((related.loose & items_from_config) - processed) + return { + *( + media_filter.ResultExtraString(f"related item: {item}") + for item in processed - items_from_config + ), + *( + media_filter.ResultExtraString(f"loosely-related item: {item}") + for item in (loose - processed - items_from_config) + ), + *( + media_filter.ResultExtraString( + "item in config file that's not related to " + f"{request.item.wikidata_item}: {item}" + ) + for item in items_from_config - processed - loose + ), + } + def filter_implementation( self, request: media_filter.FilterRequest ) -> media_filter.FilterResult: @@ -446,4 +502,9 @@ def filter_implementation( not in self._config.release_statuses ): return media_filter.FilterResult(False) + if self._config.HasField("related_media"): + related_media_extra = self._related_media(request) + if not related_media_extra: + return media_filter.FilterResult(False) + extra_information.update(related_media_extra) return media_filter.FilterResult(True, extra=extra_information) diff --git a/rock_paper_sand/wikidata_test.py b/rock_paper_sand/wikidata_test.py index 8fbdf48..da564d2 100644 --- a/rock_paper_sand/wikidata_test.py +++ b/rock_paper_sand/wikidata_test.py @@ -954,16 +954,166 @@ def setUp(self) -> None: }, expected_result=media_filter.FilterResult(True), ), + dict( + testcase_name="related_media_ignores_non_top_level", + filter_config={"relatedMedia": {}}, + item={"name": "foo", "wikidata": "Q1"}, + parent_fully_qualified_name="foo", + expected_result=media_filter.FilterResult(False), + ), + dict( + testcase_name="related_media_none", + filter_config={"relatedMedia": {}}, + item={"name": "foo", "wikidata": "Q1"}, + api_related_media={ + "Q1": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children=set(), + loose=set(), + ), + }, + expected_result=media_filter.FilterResult(False), + ), + dict( + testcase_name="related_media_not_loose", + filter_config={"relatedMedia": {}}, + item={ + "name": "foo", + "wikidata": "Q1", + "parts": [{"name": "bar", "wikidata": "Q4"}], + }, + api_related_media={ + "Q1": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children={ + wikidata_value.Item("Q2"), + wikidata_value.Item("Q3"), + }, + loose=set(), + ), + "Q2": wikidata.RelatedMedia( + parents={wikidata_value.Item("Q1")}, + siblings={wikidata_value.Item("Q3")}, + children=set(), + loose=set(), + ), + "Q3": wikidata.RelatedMedia( + parents={wikidata_value.Item("Q1")}, + siblings={ + wikidata_value.Item("Q2"), + wikidata_value.Item("Q4"), + }, + children=set(), + loose=set(), + ), + "Q4": wikidata.RelatedMedia( + parents={wikidata_value.Item("Q5")}, + siblings={wikidata_value.Item("Q3")}, + children=set(), + loose=set(), + ), + "Q5": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children={wikidata_value.Item("Q4")}, + loose=set(), + ), + }, + expected_result=media_filter.FilterResult( + True, + extra={ + media_filter.ResultExtraString( + "related item: https://www.wikidata.org/wiki/Q2" + ), + media_filter.ResultExtraString( + "related item: https://www.wikidata.org/wiki/Q3" + ), + # Q4 is in the config, so not shown here. + media_filter.ResultExtraString( + "related item: https://www.wikidata.org/wiki/Q5" + ), + }, + ), + ), + dict( + testcase_name="related_media_loose", + filter_config={"relatedMedia": {}}, + item={ + "name": "foo", + "wikidata": "Q1", + "parts": [{"name": "bar", "wikidata": "Q2"}], + }, + api_related_media={ + "Q1": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children=set(), + # Q2 is upgraded to non-loose, because it's also in the + # config. + loose={wikidata_value.Item("Q2")}, + ), + "Q2": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children=set(), + loose={wikidata_value.Item("Q3")}, + ), + }, + expected_result=media_filter.FilterResult( + True, + extra={ + media_filter.ResultExtraString( + "loosely-related item: https://www.wikidata.org/wiki/Q3" + ), + }, + ), + ), + dict( + testcase_name="related_media_config_has_unrelated", + filter_config={"relatedMedia": {}}, + item={ + "name": "foo", + "wikidata": "Q1", + "parts": [{"name": "bar", "wikidata": "Q2"}], + }, + api_related_media={ + "Q1": wikidata.RelatedMedia( + parents=set(), + siblings=set(), + children=set(), + loose=set(), + ), + }, + expected_result=media_filter.FilterResult( + True, + extra={ + media_filter.ResultExtraString( + "item in config file that's not related to " + "https://www.wikidata.org/wiki/Q1: " + "https://www.wikidata.org/wiki/Q2" + ), + }, + ), + ), ) def test_filter( self, *, filter_config: Any, item: Any, + parent_fully_qualified_name: str | None = None, api_items: Mapping[str, Any] = immutabledict.immutabledict(), + api_related_media: Mapping[str, wikidata.RelatedMedia] = ( + immutabledict.immutabledict() + ), expected_result: media_filter.FilterResult, ) -> None: self._mock_api.item.side_effect = lambda item_id: api_items[item_id.id] + self._mock_api.related_media.side_effect = ( + lambda item_id: api_related_media[item_id.id] + ) test_filter = wikidata.Filter( json_format.ParseDict(filter_config, config_pb2.WikidataFilter()), api=self._mock_api, @@ -972,13 +1122,39 @@ def test_filter( result = test_filter.filter( media_filter.FilterRequest( media_item.MediaItem.from_config( - json_format.ParseDict(item, config_pb2.MediaItem()) + json_format.ParseDict(item, config_pb2.MediaItem()), + parent_fully_qualified_name=parent_fully_qualified_name, ) ) ) self.assertEqual(expected_result, result) + def test_too_many_related_items(self) -> None: + self._mock_api.related_media.return_value = wikidata.RelatedMedia( + parents=set(), + siblings={wikidata_value.Item(f"Q{n}") for n in range(1001)}, + children=set(), + loose=set(), + ) + test_filter = wikidata.Filter( + json_format.ParseDict( + {"relatedMedia": {}}, config_pb2.WikidataFilter() + ), + api=self._mock_api, + ) + request = media_filter.FilterRequest( + media_item.MediaItem.from_config( + json_format.ParseDict( + {"name": "foo", "wikidata": "Q99999"}, + config_pb2.MediaItem(), + ) + ) + ) + + with self.assertRaisesRegex(ValueError, "Too many related media items"): + test_filter.filter(request) + if __name__ == "__main__": absltest.main()