Skip to content

Commit

Permalink
Start on related media filter
Browse files Browse the repository at this point in the history
  • Loading branch information
dseomn committed Nov 1, 2023
1 parent 8267881 commit 7d395fd
Show file tree
Hide file tree
Showing 3 changed files with 247 additions and 1 deletion.
9 changes: 9 additions & 0 deletions rock_paper_sand/proto/config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ message WikidataFilter {

// If specified, only items with one of these release statuses will match.
repeated ReleaseStatus release_statuses = 1;

// Placeholder for future options for related_media. The message currently
// declares no fields; the related-media check is enabled by the presence of
// the related_media field itself, so an empty message is enough to turn it
// on.
message RelatedMedia {}

// Filters for top-level media to find potential additional parts that aren't
// present in the config file, and existing parts that aren't related to the
// top-level Wikidata item. E.g., this can be used to find sequels to an item,
// or to find all items in a franchise.
//
// Media items that have a parent never match this filter. A top-level item
// matches only when at least one such discrepancy (or a loosely-related
// item) is found; each finding is reported as extra information on the
// filter result.
RelatedMedia related_media = 2;
}

// Filters using the JustWatch API. In general, all specified/non-default fields
Expand Down
61 changes: 61 additions & 0 deletions rock_paper_sand/wikidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import contextlib
import dataclasses
import datetime
import pprint
import re
import typing
from typing import Any
Expand All @@ -28,6 +29,7 @@

from rock_paper_sand import exceptions
from rock_paper_sand import media_filter
from rock_paper_sand import media_item
from rock_paper_sand import network
from rock_paper_sand import wikidata_value
from rock_paper_sand.proto import config_pb2
Expand Down Expand Up @@ -428,6 +430,60 @@ def __init__(
self._config = filter_config
self._api = api

def _related_media(
    self, request: media_filter.FilterRequest
) -> Set[media_filter.ResultExtra]:
    """Compares the config's items against Wikidata's related media.

    Starting at the request item's Wikidata item, repeatedly asks the API
    for related media and follows every parent, sibling and child. Loosely
    related items are only followed when they also appear in the config.

    Args:
        request: Filter request. Only items without a parent are examined.

    Returns:
        An empty set if the item has a parent. Otherwise, one extra per
        reached item that's missing from the config, per loosely-related
        item that was neither reached nor configured, and per configured
        item that was neither reached nor loosely related.

    Raises:
        ValueError: More than 1000 items were discovered during traversal.
    """
    top_level = request.item
    if top_level.has_parent:
        return frozenset()

    in_config = frozenset(
        config_item.wikidata_item
        for config_item in media_item.iter_all_items((top_level,))
        if config_item.wikidata_item is not None
    )
    root = top_level.wikidata_item
    assert root is not None  # Already checked.

    pending: set[wikidata_value.Item] = {root}
    visited: set[wikidata_value.Item] = set()
    loosely_related: set[wikidata_value.Item] = set()
    while pending:
        # Bail out before the traversal grows without bound.
        if len(pending) + len(visited) > 1000:
            visited_report = pprint.pformat(
                sorted(str(item) for item in visited)
            )
            pending_report = pprint.pformat(
                sorted(str(item) for item in pending)
            )
            raise ValueError(
                "Too many related media items reached from "
                f"{root}:\n"
                f"Processed: {visited_report}\n"
                f"Unprocessed: {pending_report}"
            )
        current = pending.pop()
        visited.add(current)
        related = self._api.related_media(current)
        loosely_related.update(related.loose)
        # Loose relations are only traversed if the config mentions them.
        for discovered in (
            related.parents,
            related.siblings,
            related.children,
            related.loose & in_config,
        ):
            pending.update(
                item for item in discovered if item not in visited
            )

    missing_from_config = visited - in_config
    only_loosely_related = loosely_related - visited - in_config
    unrelated_in_config = in_config - visited - loosely_related

    extras: set[media_filter.ResultExtra] = set()
    extras.update(
        media_filter.ResultExtraString(f"related item: {item}")
        for item in missing_from_config
    )
    extras.update(
        media_filter.ResultExtraString(f"loosely-related item: {item}")
        for item in only_loosely_related
    )
    extras.update(
        media_filter.ResultExtraString(
            "item in config file that's not related to "
            f"{root}: {item}"
        )
        for item in unrelated_in_config
    )
    return extras

def filter_implementation(
self, request: media_filter.FilterRequest
) -> media_filter.FilterResult:
Expand All @@ -446,4 +502,9 @@ def filter_implementation(
not in self._config.release_statuses
):
return media_filter.FilterResult(False)
if self._config.HasField("related_media"):
related_media_extra = self._related_media(request)
if not related_media_extra:
return media_filter.FilterResult(False)
extra_information.update(related_media_extra)
return media_filter.FilterResult(True, extra=extra_information)
178 changes: 177 additions & 1 deletion rock_paper_sand/wikidata_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,16 +954,166 @@ def setUp(self) -> None:
},
expected_result=media_filter.FilterResult(True),
),
dict(
testcase_name="related_media_ignores_non_top_level",
filter_config={"relatedMedia": {}},
item={"name": "foo", "wikidata": "Q1"},
parent_fully_qualified_name="foo",
expected_result=media_filter.FilterResult(False),
),
dict(
testcase_name="related_media_none",
filter_config={"relatedMedia": {}},
item={"name": "foo", "wikidata": "Q1"},
api_related_media={
"Q1": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children=set(),
loose=set(),
),
},
expected_result=media_filter.FilterResult(False),
),
dict(
testcase_name="related_media_not_loose",
filter_config={"relatedMedia": {}},
item={
"name": "foo",
"wikidata": "Q1",
"parts": [{"name": "bar", "wikidata": "Q4"}],
},
api_related_media={
"Q1": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children={
wikidata_value.Item("Q2"),
wikidata_value.Item("Q3"),
},
loose=set(),
),
"Q2": wikidata.RelatedMedia(
parents={wikidata_value.Item("Q1")},
siblings={wikidata_value.Item("Q3")},
children=set(),
loose=set(),
),
"Q3": wikidata.RelatedMedia(
parents={wikidata_value.Item("Q1")},
siblings={
wikidata_value.Item("Q2"),
wikidata_value.Item("Q4"),
},
children=set(),
loose=set(),
),
"Q4": wikidata.RelatedMedia(
parents={wikidata_value.Item("Q5")},
siblings={wikidata_value.Item("Q3")},
children=set(),
loose=set(),
),
"Q5": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children={wikidata_value.Item("Q4")},
loose=set(),
),
},
expected_result=media_filter.FilterResult(
True,
extra={
media_filter.ResultExtraString(
"related item: https://www.wikidata.org/wiki/Q2"
),
media_filter.ResultExtraString(
"related item: https://www.wikidata.org/wiki/Q3"
),
# Q4 is in the config, so not shown here.
media_filter.ResultExtraString(
"related item: https://www.wikidata.org/wiki/Q5"
),
},
),
),
dict(
testcase_name="related_media_loose",
filter_config={"relatedMedia": {}},
item={
"name": "foo",
"wikidata": "Q1",
"parts": [{"name": "bar", "wikidata": "Q2"}],
},
api_related_media={
"Q1": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children=set(),
# Q2 is upgraded to non-loose, because it's also in the
# config.
loose={wikidata_value.Item("Q2")},
),
"Q2": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children=set(),
loose={wikidata_value.Item("Q3")},
),
},
expected_result=media_filter.FilterResult(
True,
extra={
media_filter.ResultExtraString(
"loosely-related item: https://www.wikidata.org/wiki/Q3"
),
},
),
),
dict(
testcase_name="related_media_config_has_unrelated",
filter_config={"relatedMedia": {}},
item={
"name": "foo",
"wikidata": "Q1",
"parts": [{"name": "bar", "wikidata": "Q2"}],
},
api_related_media={
"Q1": wikidata.RelatedMedia(
parents=set(),
siblings=set(),
children=set(),
loose=set(),
),
},
expected_result=media_filter.FilterResult(
True,
extra={
media_filter.ResultExtraString(
"item in config file that's not related to "
"https://www.wikidata.org/wiki/Q1: "
"https://www.wikidata.org/wiki/Q2"
),
},
),
),
)
def test_filter(
self,
*,
filter_config: Any,
item: Any,
parent_fully_qualified_name: str | None = None,
api_items: Mapping[str, Any] = immutabledict.immutabledict(),
api_related_media: Mapping[str, wikidata.RelatedMedia] = (
immutabledict.immutabledict()
),
expected_result: media_filter.FilterResult,
) -> None:
self._mock_api.item.side_effect = lambda item_id: api_items[item_id.id]
self._mock_api.related_media.side_effect = (
lambda item_id: api_related_media[item_id.id]
)
test_filter = wikidata.Filter(
json_format.ParseDict(filter_config, config_pb2.WikidataFilter()),
api=self._mock_api,
Expand All @@ -972,13 +1122,39 @@ def test_filter(
result = test_filter.filter(
media_filter.FilterRequest(
media_item.MediaItem.from_config(
json_format.ParseDict(item, config_pb2.MediaItem())
json_format.ParseDict(item, config_pb2.MediaItem()),
parent_fully_qualified_name=parent_fully_qualified_name,
)
)
)

self.assertEqual(expected_result, result)

def test_too_many_related_items(self) -> None:
    """Checks that an oversized related-media graph raises ValueError."""
    # Every lookup reports the same 1001 siblings, so the very first
    # expansion from the starting item already discovers too many items.
    oversized_siblings = {
        wikidata_value.Item(f"Q{n}") for n in range(1001)
    }
    self._mock_api.related_media.return_value = wikidata.RelatedMedia(
        parents=set(),
        siblings=oversized_siblings,
        children=set(),
        loose=set(),
    )
    filter_config = json_format.ParseDict(
        {"relatedMedia": {}}, config_pb2.WikidataFilter()
    )
    test_filter = wikidata.Filter(filter_config, api=self._mock_api)
    item_config = json_format.ParseDict(
        {"name": "foo", "wikidata": "Q99999"},
        config_pb2.MediaItem(),
    )
    request = media_filter.FilterRequest(
        media_item.MediaItem.from_config(item_config)
    )

    with self.assertRaisesRegex(ValueError, "Too many related media items"):
        test_filter.filter(request)


if __name__ == "__main__":
absltest.main()

0 comments on commit 7d395fd

Please sign in to comment.