Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Science Museum ingester with API changes #4105

Merged
merged 5 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 27 additions & 19 deletions catalog/dags/providers/provider_api_scripts/science_museum.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,21 +115,22 @@ def get_record_data(self, record):
):
return None

title = attributes.get("summary_title")
title = ScienceMuseumDataIngester._get_first_list_value("title", attributes)
creator = self._get_creator_info(attributes)
metadata = self._get_metadata(attributes)
images = []
for image_data in multimedia:
if not (foreign_identifier := image_data.get("admin", {}).get("uid")):
if not (foreign_identifier := image_data.get("@admin", {}).get("uid")):
continue
processed = image_data.get("processed")
processed = image_data.get("@processed")
if not isinstance(processed, dict):
continue
(
url,
height,
width,
filetype,
filesize,
) = self._get_image_info(processed)
if not url:
continue
Expand All @@ -144,6 +145,7 @@ def get_record_data(self, record):
"height": height,
"width": width,
"filetype": filetype,
"filesize": filesize,
"license_info": license_info,
"creator": creator,
"title": title,
Expand All @@ -154,22 +156,18 @@ def get_record_data(self, record):

@staticmethod
def _get_creator_info(attributes):
creator_info = None
if (life_cycle := attributes.get("lifecycle")) is not None:
creation = life_cycle.get("creation")
if isinstance(creation, list):
maker = creation[0].get("maker")
if isinstance(maker, list):
creator_info = maker[0].get("summary_title")
return creator_info
if not (maker := attributes.get("creation", {}).get("maker", [])):
return None

return maker[0].get("summary", {}).get("title", None)
Comment on lines +159 to +162
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a lot of "Unknown maker" values in the creator column (the test data is a good example), which I think would be more accurate to leave them as NULL instead.

Suggested change
if not (maker := attributes.get("creation", {}).get("maker", [])):
return None
return maker[0].get("summary", {}).get("title", None)
if not (maker := attributes.get("creation", {}).get("maker", [])):
return None
creator = maker[0].get("summary", {}).get("title", None)
return creator if creator != "Unknown maker" else None

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes sense, because most other creators are also unknown here :)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I can't decide -- theoretically there could be a difference between "no maker information was provided by the source" and "an authoritative (museum) source confirmed the maker is unknown". But maybe that's not a useful distinction. I would be curious if we do something similar for any of our other sources 🤔

I'll make a separate issue for this, mostly because it looks like we do have "Unknown maker" in production data at the moment.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#4145 created!


@staticmethod
def check_url(url: str | None) -> str | None:
if not url:
return None
if url.startswith("http"):
return url
return f"https://coimages.sciencemuseumgroup.org.uk/images/{url}"
return f"https://coimages.sciencemuseumgroup.org.uk/{url}"

@staticmethod
def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]:
Expand All @@ -191,15 +189,25 @@ def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]:
@staticmethod
def _get_image_info(
processed: dict,
) -> tuple[str | None, int | None, int | None, str | None]:
height, width, filetype = None, None, None
) -> tuple[str | None, int | None, int | None, str | None, int | None]:
height, width, filetype, filesize = None, None, None, None
image_data = processed.get("large") or processed.get("medium", {})

url = ScienceMuseumDataIngester.check_url(image_data.get("location"))
if url:
filetype = image_data.get("format")
height, width = ScienceMuseumDataIngester._get_dimensions(image_data)
return url, height, width, filetype

if not (
filesize := int(
image_data.get("measurements", {})
.get("filesize", {})
.get("value", 0)
)
):
filesize = None

return url, height, width, filetype, filesize

@staticmethod
def _get_first_list_value(key: str, attributes: dict) -> str | None:
Expand All @@ -214,7 +222,7 @@ def _get_metadata(attributes):
for attr_key, metadata_key in [
("identifier", "accession number"),
("name", "name"),
("categories", "category"),
("category", "category"),
("description", "description"),
]:
val = ScienceMuseumDataIngester._get_first_list_value(attr_key, attributes)
Expand All @@ -223,7 +231,7 @@ def _get_metadata(attributes):

creditline = attributes.get("legal")
if isinstance(creditline, dict):
line = creditline.get("credit_line")
line = creditline.get("credit")
if line is not None:
metadata["creditline"] = line

Expand All @@ -233,9 +241,9 @@ def _get_metadata(attributes):
def _get_license_info(image_data) -> LicenseInfo | None:
# some items do not return license anywhere, but in the UI
# they look like CC
rights = image_data.get("source", {}).get("legal", {}).get("rights")
rights = image_data.get("legal", {}).get("rights")
if isinstance(rights, list):
license_name = rights[0].get("usage_terms")
license_name = rights[0].get("licence")
if not license_name:
return None
license_name = license_name.lower()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
{
"large": {
"resizable": true,
"@type": "image",
"format": "jpeg",
"modified": 1472014477000,
"location": "3/563/large_1999_0299_0001__0002_.jpg",
"location_is_relative": true,
"measurements": {
"filesize": {
"units": "bytes",
"value": 58772
},
"dimensions": [
{
"dimension": "height",
Expand All @@ -16,9 +23,6 @@
"value": 1536
}
]
},
"modified": 1472014477000,
"resizable": true,
"type": "image"
}
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
{
"measurements": {
"filesize": {
"units": "bytes",
"value": 58772
stacimc marked this conversation as resolved.
Show resolved Hide resolved
},
"dimensions": [
{
"dimension": "height",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
{
"medium": {
"resizable": true,
"@type": "image",
"format": "jpeg",
"modified": 1472014477000,
"location": "3/563/medium_1999_0299_0001__0002_.jpg",
"location_is_relative": true,
"measurements": {
Expand All @@ -16,9 +19,6 @@
"value": 866
}
]
},
"modified": 1472014477000,
"resizable": true,
"type": "image"
}
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"accession number": "1999-299/1",
"category": "SCM - Smoking",
"creditline": "McPhail, Mary",
"description": "Packet of 10 'Gold Flake' cigarettes by W D & HO Wills, England, 1920-1950",
"name": "cigarette packet"
"accession number": "A67864",
"category": "SCM - Classical & Medieval Medicine",
"description": "Small bronze amulet, possibly phallic, or stomach, probably Roman, from Italy, 200BC-200AD",
"name": "amulet",
"creditline": "Arte Antica e Moderna"
}
Loading