Skip to content

Commit

Permalink
Fix pep8 violations (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
obulat authored Jun 17, 2021
1 parent 85affa3 commit 0c7a507
Show file tree
Hide file tree
Showing 17 changed files with 114 additions and 138 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"api_key": API_KEY
}

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
"has_images": 1,
"rights_type_permissive": 1,
"limit": LIMIT,
Expand Down Expand Up @@ -55,19 +55,24 @@ def main():

def _get_query_param(
offset=0,
default_query_param=DEFAULT_QUERY_PARAM
default_query_param=None
):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param.update(offset=offset)
return query_param


def _get_object_json(
headers=HEADERS,
headers=None,
endpoint=ENDPOINT,
retries=RETRIES,
query_param=None
):
if headers is None:
headers = HEADERS.copy()
data = None
for tries in range(retries):
response = delay_request.get(
endpoint,
Expand All @@ -80,12 +85,8 @@ def _get_object_json(
response_json.get("message", "").lower() == "success."):
data = response_json.get("data")
break
else:
data = None
except Exception as e:
logger.error(f"Error due to {e}")
data = None

return data


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def _get_response(
endpoint=ENDPOINT,
retries=RETRIES
):
response_json, total_images = None, 0
response_json, total_images, tries = None, 0, 0
for tries in range(retries):
response = delay_request.get(
endpoint,
Expand All @@ -89,9 +89,8 @@ def _get_response(
return response_json, total_images


def _handle_response(
batch
):
def _handle_response(batch):
total_images = 0
for data in batch:
license_ = data.get('share_license_status', '').lower()
if license_ != 'cc0':
Expand Down Expand Up @@ -135,39 +134,32 @@ def _handle_response(
return total_images


def _get_image_type(
image_data
):
def _get_image_type(image_data):
key, image_url = None, None
if image_data.get('web'):
key = 'web'
image_url = image_data.get('web').get('url', None)
elif image_data.get('print'):
key = 'print'
image_url = image_data.get('print').get('url', None)

elif image_data.get('full'):
key = 'full'
image_url = image_data.get('full').get('url', None)
else:
image_url = None

if image_url is None:
key = None
return image_url, key


def _get_metadata(data):
metadata = {}

metadata['accession_number'] = data.get('accession_number', '')
metadata['technique'] = data.get('technique', '')
metadata['date'] = data.get('creation_date', '')
metadata['credit_line'] = data.get('creditline', '')
metadata['classification'] = data.get('type', '')
metadata['tombstone'] = data.get('tombstone', '')
metadata['culture'] = ','.join(
[i for i in data.get('culture', []) if i is not None]
)
metadata = {
'accession_number': data.get('accession_number', ''),
'technique': data.get('technique', ''),
'date': data.get('creation_date', ''),
'credit_line': data.get('creditline', ''),
'classification': data.get('type', ''),
'tombstone': data.get('tombstone', ''),
'culture': ','.join(
[i for i in data.get('culture', []) if i is not None]
)}
metadata = {k: v for k, v in metadata.items() if v is not None}
return metadata


Expand Down
17 changes: 9 additions & 8 deletions src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@ def main(date):

def _get_pagewise(start_timestamp, end_timestamp):
cursor = '*'
total_number_of_images = 0
images_stored = 0

while cursor is not None:
image_list, next_cursor, total_number_of_images = _get_image_list(
Expand Down Expand Up @@ -94,6 +92,10 @@ def _get_image_list(
endpoint=ENDPOINT,
max_tries=6 # one original try, plus 5 retries
):
try_number = 0
image_list, next_cursor, total_number_of_images = (
None, None, None
)
for try_number in range(max_tries):

query_param_dict = _build_query_param_dict(
Expand Down Expand Up @@ -123,9 +125,7 @@ def _get_image_list(
and (image_list is None or next_cursor is None)
):
logger.warning('No more tries remaining. Returning None types.')
return None, None, None
else:
return image_list, next_cursor, total_number_of_images
return image_list, next_cursor, total_number_of_images


def _extract_response_json(response):
Expand Down Expand Up @@ -157,6 +157,7 @@ def _extract_image_list_from_json(response_json):

def _process_image_list(image_list):
prev_total = 0
total_images = 0
for image_data in image_list:
total_images = _process_image_data(image_data)
if total_images is None:
Expand Down Expand Up @@ -202,9 +203,9 @@ def _process_image_data(image_data, sub_providers=SUB_PROVIDERS,
def _get_license_url(license_field):
if len(license_field) > 1:
logger.warning('More than one license field found')
for license in license_field:
if 'creativecommons' in license:
return license
for license_ in license_field:
if 'creativecommons' in license_:
return license_
return None


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,9 @@ def _process_object_list(object_list):

def _process_object(obj, sub_providers=SUB_PROVIDERS, provider=PROVIDER):
total_images = 0
license = obj.get("imageRights")
if license is not None:
license_url = license.get("link")
license_url = obj.get("imageRights", {}).get("link")
if license_url is None:
return None
foreign_identifier = obj.get("id")
title = obj.get("title")
building = obj.get("buildings")[0].get("value")
Expand Down Expand Up @@ -137,9 +137,9 @@ def _get_raw_tags(obj):

def _get_landing(obj, landing_url=LANDING_URL):
l_url = None
id = obj.get("id")
if id:
l_url = landing_url + id
id_ = obj.get("id")
if id_:
l_url = landing_url + id_
return l_url


Expand Down
9 changes: 5 additions & 4 deletions src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def main(date):
date_type = DATE_TYPE

for start_timestamp, end_timestamp in timestamp_pairs:
total_images = _process_interval(
_process_interval(
start_timestamp,
end_timestamp,
date_type
Expand Down Expand Up @@ -159,6 +159,8 @@ def _get_image_list(
endpoint=ENDPOINT,
max_tries=6 # one original try, plus 5 retries
):
image_list, total_pages = None, None
try_number = 0
for try_number in range(max_tries):
query_param_dict = _build_query_param_dict(
start_timestamp,
Expand All @@ -181,9 +183,8 @@ def _get_image_list(
if try_number == max_tries - 1 and (
(image_list is None) or (total_pages is None)):
logger.warning('No more tries remaining. Returning Nonetypes.')
return None, None
else:
return image_list, total_pages

return image_list, total_pages


def _extract_response_json(response):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ def _build_foreign_id(object_id, image_url):


def _create_meta_data(object_json):
meta_data = {}

meta_data['accession_number'] = object_json.get('accessionNumber', None)
meta_data['classification'] = object_json.get('classification', None)
meta_data['culture'] = object_json.get('culture', None)
meta_data['date'] = object_json.get('objectDate', None)
meta_data['medium'] = object_json.get('medium', None)
meta_data['credit_line'] = object_json.get('creditLine', None)

meta_data = {
'accession_number': object_json.get('accessionNumber'),
'classification': object_json.get('classification'),
'culture': object_json.get('culture'),
'date': object_json.get('objectDate'),
'medium': object_json.get('medium'),
'credit_line': object_json.get('creditLine')
}
meta_data = {k: v for k, v in meta_data.items() if v is not None}
return meta_data


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def main():

if type(results) == list:
if len(results) > 0:
image_count = _handle_batch_objects(results)
_handle_batch_objects(results)
page += 1
else:
condition = False
Expand All @@ -77,6 +77,7 @@ def _get_batch_objects(
endpoint=ENDPOINT, params=None,
headers=HEADERS, retries=RETRIES
):
data = None
for retry in range(retries):
response = delay_request.get(
endpoint,
Expand All @@ -88,8 +89,6 @@ def _get_batch_objects(
if type(response_json) == list:
data = response_json
break
else:
data = None
except Exception:
data = None
return data
Expand Down
1 change: 0 additions & 1 deletion src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ def _get_images(
image_url_dimensions=IMAGE_URL_DIMENSIONS,
thumbnail_dimensions=THUMBNAIL_DIMENSIONS
):
image_url, thumbnail_url = None, None
image_type = {
parse_qs(urlparse(img.get("$")).query)['t'][0]: img.get("$")
for img in images
Expand Down
30 changes: 11 additions & 19 deletions src/cc_catalog_airflow/dags/provider_api_scripts/phylopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def main(date='all'):
which running the script will pull data.
"""

param = None
offset = 0

logger.info('Begin: PhyloPic API requests')
Expand Down Expand Up @@ -114,8 +113,6 @@ def _get_total_images():

def _create_endpoint_for_IDs(**args):
limit = LIMIT
offset = 0
endpoint = ''

if args.get('date'):
# Get a list of objects uploaded/updated on a given date.
Expand Down Expand Up @@ -154,12 +151,6 @@ def _get_meta_data(_uuid):
logger.info(f'Processing UUID: {_uuid}')

base_url = 'http://phylopic.org'
img_url = ''
thumbnail = ''
width = ''
height = ''
foreign_id = ''
foreign_url = ''
meta_data = {}
endpoint = f"http://phylopic.org/api/a/image/{_uuid}?options=credit+" \
"licenseURL+pngFiles+submitted+submitter+taxa+canonicalName" \
Expand Down Expand Up @@ -196,17 +187,17 @@ def _get_meta_data(_uuid):
def _get_creator_details(result):
credit_line = None
pub_date = None
creator = ''

creator = None
first_name = result.get('submitter', {}).get('firstName')
last_name = result.get('submitter', {}).get('lastName')
creator = f'{first_name} {last_name}'.strip()
if first_name and last_name:
creator = f'{first_name} {last_name}'.strip()

if result.get('credit'):
credit_line = result.get('credit').strip()
pub_date = result.get('submitted').strip()

return (creator, credit_line, pub_date)
return creator, credit_line, pub_date


def _get_taxa_details(result):
Expand All @@ -215,15 +206,14 @@ def _get_taxa_details(result):
taxa_list = None
title = ''
if taxa:
taxa = list(filter(
lambda x: x.get('canonicalName') is not None, taxa))
taxa_list = list(
map(lambda x: x.get('canonicalName', {}).get('string', ''), taxa))
taxa = [_.get('canonicalName') for _ in taxa
if _.get('canonicalName') is not None]
taxa_list = [_.get('string', '') for _ in taxa]

if taxa_list:
title = taxa_list[0]

return (taxa_list, title)
return taxa_list, title


def _get_image_info(result, _uuid):
Expand All @@ -234,6 +224,8 @@ def _get_image_info(result, _uuid):
height = ''

image_info = result.get('pngFiles')
img = []
thb = []
if image_info:
img = list(filter(lambda x: (
int(str(x.get('width', '0'))) >= 257), image_info))
Expand All @@ -257,7 +249,7 @@ def _get_image_info(result, _uuid):
f'Image not detected in url: {base_url}/image/{_uuid}')
return None, None, None, None
else:
return (img_url, width, height, thumbnail)
return img_url, width, height, thumbnail


if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 0c7a507

Please sign in to comment.