diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py index 05a066b696d..9514359a074 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py @@ -25,7 +25,7 @@ "api_key": API_KEY } -DEFAULT_QUERY_PARAM = { +DEFAULT_QUERY_PARAMS = { "has_images": 1, "rights_type_permissive": 1, "limit": LIMIT, @@ -55,19 +55,24 @@ def main(): def _get_query_param( offset=0, - default_query_param=DEFAULT_QUERY_PARAM + default_query_param=None ): + if default_query_param is None: + default_query_param = DEFAULT_QUERY_PARAMS query_param = default_query_param.copy() query_param.update(offset=offset) return query_param def _get_object_json( - headers=HEADERS, + headers=None, endpoint=ENDPOINT, retries=RETRIES, query_param=None ): + if headers is None: + headers = HEADERS.copy() + data = None for tries in range(retries): response = delay_request.get( endpoint, @@ -80,12 +85,8 @@ def _get_object_json( response_json.get("message", "").lower() == "success."): data = response_json.get("data") break - else: - data = None except Exception as e: logger.error(f"Error due to {e}") - data = None - return data diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py index 6e9ca1d976e..914a09221e5 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py @@ -62,7 +62,7 @@ def _get_response( endpoint=ENDPOINT, retries=RETRIES ): - response_json, total_images = None, 0 + response_json, total_images, tries = None, 0, 0 for tries in range(retries): response = delay_request.get( endpoint, @@ -89,9 +89,8 @@ def _get_response( return response_json, total_images -def _handle_response( - batch - ): +def _handle_response(batch): + total_images = 0 for data in batch: license_ = data.get('share_license_status', '').lower() if license_ != 'cc0': @@ -135,39 +134,32 @@ def _handle_response( return total_images -def _get_image_type( - image_data - ): +def _get_image_type(image_data): + key, image_url = None, None if image_data.get('web'): key = 'web' image_url = image_data.get('web').get('url', None) elif image_data.get('print'): key = 'print' image_url = image_data.get('print').get('url', None) - elif image_data.get('full'): key = 'full' image_url = image_data.get('full').get('url', None) - else: - image_url = None - - if image_url is None: - key = None return image_url, key def _get_metadata(data): - metadata = {} - - metadata['accession_number'] = data.get('accession_number', '') - metadata['technique'] = data.get('technique', '') - metadata['date'] = data.get('creation_date', '') - metadata['credit_line'] = data.get('creditline', '') - metadata['classification'] = data.get('type', '') - metadata['tombstone'] = data.get('tombstone', '') - metadata['culture'] = ','.join( - [i for i in data.get('culture', []) if i is not None] - ) + metadata = { + 'accession_number': data.get('accession_number', ''), + 'technique': data.get('technique', ''), + 'date': data.get('creation_date', ''), + 'credit_line': data.get('creditline', ''), + 'classification': data.get('type', ''), + 'tombstone': data.get('tombstone', ''), + 'culture': ','.join( + [i for i in data.get('culture', []) if i is not None] + )} + metadata = {k: v for k, v in metadata.items() if v is not None} return metadata diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py b/src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py index 062160d21ea..e4a6dbd3bfe 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py @@ -63,8 +63,6 @@ def main(date): def _get_pagewise(start_timestamp, end_timestamp): cursor = '*' - total_number_of_images = 0 - images_stored = 0 while cursor is not None: image_list, next_cursor, total_number_of_images = _get_image_list( @@ -94,6 +92,10 @@ def _get_image_list( endpoint=ENDPOINT, max_tries=6 # one original try, plus 5 retries ): + try_number = 0 + image_list, next_cursor, total_number_of_images = ( + None, None, None + ) for try_number in range(max_tries): query_param_dict = _build_query_param_dict( @@ -123,9 +125,7 @@ def _get_image_list( and (image_list is None or next_cursor is None) ): logger.warning('No more tries remaining. Returning None types.') - return None, None, None - else: - return image_list, next_cursor, total_number_of_images + return image_list, next_cursor, total_number_of_images def _extract_response_json(response): @@ -157,6 +157,7 @@ def _extract_image_list_from_json(response_json): def _process_image_list(image_list): prev_total = 0 + total_images = 0 for image_data in image_list: total_images = _process_image_data(image_data) if total_images is None: @@ -202,9 +203,9 @@ def _process_image_data(image_data, sub_providers=SUB_PROVIDERS, def _get_license_url(license_field): if len(license_field) > 1: logger.warning('More than one license field found') - for license in license_field: - if 'creativecommons' in license: - return license + for license_ in license_field: + if 'creativecommons' in license_: + return license_ return None diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/finnish_museums.py b/src/cc_catalog_airflow/dags/provider_api_scripts/finnish_museums.py index 9a87e346143..b56d33b6713 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/finnish_museums.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/finnish_museums.py @@ -100,9 +100,9 @@ def _process_object_list(object_list): def _process_object(obj, sub_providers=SUB_PROVIDERS, provider=PROVIDER): total_images = 0 - license = obj.get("imageRights") - if license is not None: - license_url = license.get("link") + license_url = obj.get("imageRights", {}).get("link") + if license_url is None: + return None foreign_identifier = obj.get("id") title = obj.get("title") building = obj.get("buildings")[0].get("value") @@ -137,9 +137,9 @@ def _get_raw_tags(obj): def _get_landing(obj, landing_url=LANDING_URL): l_url = None - id = obj.get("id") - if id: - l_url = landing_url + id + id_ = obj.get("id") + if id_: + l_url = landing_url + id_ return l_url diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py b/src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py index 77c67b31428..7305a7bf570 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py @@ -79,7 +79,7 @@ def main(date): date_type = DATE_TYPE for start_timestamp, end_timestamp in timestamp_pairs: - total_images = _process_interval( + _process_interval( start_timestamp, end_timestamp, date_type @@ -159,6 +159,8 @@ def _get_image_list( endpoint=ENDPOINT, max_tries=6 # one original try, plus 5 retries ): + image_list, total_pages = None, None + try_number = 0 for try_number in range(max_tries): query_param_dict = _build_query_param_dict( start_timestamp, @@ -181,9 +183,8 @@ def _get_image_list( if try_number == max_tries - 1 and ( (image_list is None) or (total_pages is None)): logger.warning('No more tries remaining. Returning Nonetypes.') - return None, None - else: - return image_list, total_pages + + return image_list, total_pages def _extract_response_json(response): diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py b/src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py index 89965e5d46d..59237841c4e 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/metropolitan_museum_of_art.py @@ -133,15 +133,15 @@ def _build_foreign_id(object_id, image_url): def _create_meta_data(object_json): - meta_data = {} - - meta_data['accession_number'] = object_json.get('accessionNumber', None) - meta_data['classification'] = object_json.get('classification', None) - meta_data['culture'] = object_json.get('culture', None) - meta_data['date'] = object_json.get('objectDate', None) - meta_data['medium'] = object_json.get('medium', None) - meta_data['credit_line'] = object_json.get('creditLine', None) - + meta_data = { + 'accession_number': object_json.get('accessionNumber'), + 'classification': object_json.get('classification'), + 'culture': object_json.get('culture'), + 'date': object_json.get('objectDate'), + 'medium': object_json.get('medium'), + 'credit_line': object_json.get('creditLine') + } + meta_data = {k: v for k, v in meta_data.items() if v is not None} return meta_data diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/museum_victoria.py b/src/cc_catalog_airflow/dags/provider_api_scripts/museum_victoria.py index ba711b4698e..b028110942f 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/museum_victoria.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/museum_victoria.py @@ -55,7 +55,7 @@ def main(): if type(results) == list: if len(results) > 0: - image_count = _handle_batch_objects(results) + _handle_batch_objects(results) page += 1 else: condition = False @@ -77,6 +77,7 @@ def _get_batch_objects( endpoint=ENDPOINT, params=None, headers=HEADERS, retries=RETRIES ): + data = None for retry in range(retries): response = delay_request.get( endpoint, @@ -88,8 +89,6 @@ def _get_batch_objects( if type(response_json) == list: data = response_json break - else: - data = None except Exception: data = None return data diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py index becbff9002a..b248015dfca 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py @@ -193,7 +193,6 @@ def _get_images( image_url_dimensions=IMAGE_URL_DIMENSIONS, thumbnail_dimensions=THUMBNAIL_DIMENSIONS ): - image_url, thumbnail_url = None, None image_type = { parse_qs(urlparse(img.get("$")).query)['t'][0]: img.get("$") for img in images diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py b/src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py index 3c9c984eab8..9bfdcfd4eed 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py @@ -43,7 +43,6 @@ def main(date='all'): which running the script will pull data. """ - param = None offset = 0 logger.info('Begin: PhyloPic API requests') @@ -114,8 +113,6 @@ def _get_total_images(): def _create_endpoint_for_IDs(**args): limit = LIMIT - offset = 0 - endpoint = '' if args.get('date'): # Get a list of objects uploaded/updated on a given date. @@ -154,12 +151,6 @@ def _get_meta_data(_uuid): logger.info(f'Processing UUID: {_uuid}') base_url = 'http://phylopic.org' - img_url = '' - thumbnail = '' - width = '' - height = '' - foreign_id = '' - foreign_url = '' meta_data = {} endpoint = f"http://phylopic.org/api/a/image/{_uuid}?options=credit+" \ "licenseURL+pngFiles+submitted+submitter+taxa+canonicalName" \ @@ -196,17 +187,17 @@ def _get_meta_data(_uuid): def _get_creator_details(result): credit_line = None pub_date = None - creator = '' - + creator = None first_name = result.get('submitter', {}).get('firstName') last_name = result.get('submitter', {}).get('lastName') - creator = f'{first_name} {last_name}'.strip() + if first_name and last_name: + creator = f'{first_name} {last_name}'.strip() if result.get('credit'): credit_line = result.get('credit').strip() pub_date = result.get('submitted').strip() - return (creator, credit_line, pub_date) + return creator, credit_line, pub_date def _get_taxa_details(result): @@ -215,15 +206,14 @@ def _get_taxa_details(result): taxa_list = None title = '' if taxa: - taxa = list(filter( - lambda x: x.get('canonicalName') is not None, taxa)) - taxa_list = list( - map(lambda x: x.get('canonicalName', {}).get('string', ''), taxa)) + taxa = [_.get('canonicalName') for _ in taxa + if _.get('canonicalName') is not None] + taxa_list = [_.get('string', '') for _ in taxa] if taxa_list: title = taxa_list[0] - return (taxa_list, title) + return taxa_list, title def _get_image_info(result, _uuid): @@ -234,6 +224,8 @@ def _get_image_info(result, _uuid): height = '' image_info = result.get('pngFiles') + img = [] + thb = [] if image_info: img = list(filter(lambda x: ( int(str(x.get('width', '0'))) >= 257), image_info)) @@ -257,7 +249,7 @@ def _get_image_info(result, _uuid): f'Image not detected in url: {base_url}/image/{_uuid}') return None, None, None, None else: - return (img_url, width, height, thumbnail) + return img_url, width, height, thumbnail if __name__ == '__main__': diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/raw_pixel.py b/src/cc_catalog_airflow/dags/provider_api_scripts/raw_pixel.py index f482d0230f9..4acc17eff8c 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/raw_pixel.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/raw_pixel.py @@ -91,9 +91,11 @@ def _get_title_owner(image): def _get_meta_data(image): + description = image.get("pinterest_description") meta_data = {} - meta_data["description"] = image.get("pinterest_description") - return {k: v for k, v in meta_data.items() if v is not None} + if description: + meta_data["description"] = description + return meta_data def _get_tags(image): @@ -113,7 +115,7 @@ def _get_tags(image): def _process_image_data(image): # verify the license and extract the metadata - license = "cc0" + license_ = "cc0" version = "1.0" foreign_id, foreign_url = _get_foreign_id_url(image) @@ -132,7 +134,7 @@ def _process_image_data(image): return image_store.add_item( foreign_landing_url=foreign_url, image_url=img_url, - license_=license, + license_=license_, license_version=str(version), foreign_identifier=str(foreign_id), width=str(width) if width else None, diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/science_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/science_museum.py index 14367dd9cb7..12a8f661739 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/science_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/science_museum.py @@ -109,6 +109,7 @@ def _get_batch_objects( retries=RETRIES, query_param=None ): + data = None for retry in range(retries): response = delay_request.get( endpoint, @@ -120,11 +121,8 @@ def _get_batch_objects( if "data" in response_json.keys(): data = response_json.get("data") break - else: - data = None except Exception as e: logger.error(f"Failed to due to {e}") - data = None return data @@ -135,10 +133,7 @@ def _handle_object_data(batch_data): if id_ in RECORD_IDS: continue RECORD_IDS.append(id_) - links = obj_.get("links") - - if links: - foreign_landing_url = links.get("self") + foreign_landing_url = obj_.get("links", {}).get("self") if foreign_landing_url is None: continue obj_attributes = obj_.get("attributes") diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py b/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py index eff5dde3451..095eefd6201 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py @@ -472,8 +472,7 @@ def _process_image_list( if __name__ == '__main__': logging.basicConfig( - format='{asctime} - {name} - {levelname}: {message}', - style='{', + format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', level=logging.INFO ) main() diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/staten_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/staten_museum.py index 359db951899..d9df21abcd7 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/staten_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/staten_museum.py @@ -43,7 +43,7 @@ def main(): ) if type(items) == list: if len(items) > 0: - image_count = _handle_items_data( + _handle_items_data( items ) offset += LIMIT @@ -72,6 +72,7 @@ def _get_batch_items( headers=HEADERS, retries=RETRIES ): + items = None for retry in range(retries): response = delay_request.get( endpoint, @@ -83,11 +84,8 @@ def _get_batch_items( if "items" in response_json.keys(): items = response_json.get("items") break - else: - items = None except Exception as e: logger.error(f"errored due to {e}") - items = None return items @@ -213,7 +211,9 @@ def _get_title(titles): def _get_metadata(item): meta_data = {} - meta_data["created_date"] = item.get("created") + created_date = item.get("created") + if created_date: + meta_data["created_date"] = created_date collection = item.get("collection") if type(collection) == list: meta_data["collection"] = ','.join(collection) diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/test_museum_victoria.py b/src/cc_catalog_airflow/dags/provider_api_scripts/test_museum_victoria.py index 722999adb9f..87358af468e 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/test_museum_victoria.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/test_museum_victoria.py @@ -90,8 +90,6 @@ def test_get_batch_objects_empty(): return_value=response_empty) as mock_call: actual_response = mv._get_batch_objects(params=query_param) - expected_param = [] - assert mock_call.call_count == 3 assert actual_response is None @@ -227,6 +225,5 @@ def test_handle_batch_objects_success(): with patch.object( mv.image_store, 'add_item') as mock_item: - actual_image_count = mv._handle_batch_objects(batch_objects) - + mv._handle_batch_objects(batch_objects) assert mock_item.call_count == 1 diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/test_nypl.py b/src/cc_catalog_airflow/dags/provider_api_scripts/test_nypl.py index f201eef71e7..ea5ebf23bcf 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/test_nypl.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/test_nypl.py @@ -106,7 +106,7 @@ def test_request_handler_failure(): actual_response = np._request_handler( params=query_param ) - + assert mock_call.call_count == 3 assert actual_response is None @@ -189,6 +189,7 @@ def test_handle_results_success(): np._handle_results(result) assert mock_item.call_count == 7 + assert mock_request.call_count == 1 def test_handle_results_failure(): @@ -199,7 +200,7 @@ def test_handle_results_failure(): with patch.object( np, '_request_handler', - return_value=item_response) as mock_request: + return_value=item_response): with patch.object( np.image_store, 'add_item') as mock_item: diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/walters_art_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/walters_art_museum.py index 5a410366f62..ff237ea01f4 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/walters_art_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/walters_art_museum.py @@ -73,7 +73,6 @@ def main(): - logger.info("Begin: Walters Art Museum provider script.") for class_param in QUERY_CLASSIFICATION: @@ -137,18 +136,17 @@ def _build_query_param( return query_params -def _extract_items_list_from_json(json_response_inpydict_form): +def _extract_items_list_from_json(json_response): if ( - json_response_inpydict_form is None - or str( - json_response_inpydict_form.get('ReturnStatus') - ).lower() != 'true' - or json_response_inpydict_form.get('Items') is None - or len(json_response_inpydict_form.get('Items')) == 0 + json_response is None + or str(json_response.get('ReturnStatus') + ).lower() != 'true' + or json_response.get('Items') is None + or len(json_response.get('Items')) == 0 ): items_list = None else: - items_list = json_response_inpydict_form.get('Items') + items_list = json_response.get('Items') return items_list @@ -175,15 +173,15 @@ def _process_image(img): meta_data = _get_image_meta_data(img) return image_store.add_item( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_url=license_url, - foreign_identifier=foreign_identifier, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, + foreign_landing_url=foreign_landing_url, + image_url=image_url, + thumbnail_url=thumbnail_url, + license_url=license_url, + foreign_identifier=foreign_identifier, + creator=creator, + creator_url=creator_url, + title=title, + meta_data=meta_data, ) @@ -191,20 +189,21 @@ def _get_creator_info(img): creator, creator_url = None, None creator = img.get("Creator") if creator: - creator_url = (f"{MUSEUM_SITE}/browse/{creator.lower()}") + creator_url = f"{MUSEUM_SITE}/browse/{creator.lower()}" return creator, creator_url def _get_image_meta_data(img): - image_meta_data = {} - image_meta_data["ObjectNumber"] = img.get("ObjectNumber") - image_meta_data["PublicAccessDate"] = img.get("PublicAccessDate") - image_meta_data["Collection"] = img.get("Collection") - image_meta_data["Medium"] = img.get("Medium") - image_meta_data["Classification"] = img.get("Classification") - image_meta_data["Description"] = img.get("Description") - image_meta_data["CreditLine"] = img.get("CreditLine") + image_meta_data = { + "ObjectNumber": img.get("ObjectNumber"), + "PublicAccessDate": img.get("PublicAccessDate"), + "Collection": img.get("Collection"), + "Medium": img.get("Medium"), + "Classification": img.get("Classification"), + "Description": img.get("Description"), + "CreditLine": img.get("CreditLine") + } return {k: v for k, v in image_meta_data.items() if v is not None} diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py index 2ab31cd6320..f1468a1387d 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py @@ -117,6 +117,7 @@ def _get_image_batch( continue_token=continue_token ) image_batch = None + new_continue_token = None for _ in range(MEAN_GLOBAL_USAGE_LIMIT): response_json = ( delayed_requester. @@ -130,8 +131,6 @@ def _get_image_batch( ) if response_json is None: - image_batch = None - new_continue_token = None break else: new_continue_token = response_json.pop('continue', {}) @@ -255,7 +254,6 @@ def _get_image_info_dict(image_data): def _check_mediatype(image_info, image_mediatypes=IMAGE_MEDIATYPES): - valid_mediatype = True image_mediatype = image_info.get('mediatype') if image_mediatype not in image_mediatypes: logger.debug( @@ -281,7 +279,7 @@ def _extract_date_info(image_info): .get('DateTime', {}) .get('value', '') ) - return (date_originally_created, last_modified_at_source) + return date_originally_created, last_modified_at_source def _extract_creator_info(image_info): @@ -293,14 +291,14 @@ def _extract_creator_info(image_info): ) if not artist_string: - return (None, None) + return None, None artist_elem = html.fromstring(artist_string) # We take all text to replicate what is shown on Wikimedia Commons artist_text = ''.join(artist_elem.xpath('//text()')).strip() url_list = list(artist_elem.iterlinks()) artist_url = _cleanse_url(url_list[0][2]) if url_list else None - return (artist_text, artist_url) + return artist_text, artist_url def _extract_category_info(image_info):