From 48968b0fb2a062eb08e4aa15e554f1342b331481 Mon Sep 17 00:00:00 2001 From: Miguel Vieira Date: Fri, 24 Mar 2023 13:49:17 +0000 Subject: [PATCH] feat(project): change download_workflow to download thumbnails instead of downloading full images Add a function to convert an image URL to a thumbnail URL. The function is used by download_workflow to download thumbnails instead of full images to speed up the download process. --- tests/test_project.py | 14 ++++++++++++++ zoonyper/project.py | 28 +++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/test_project.py diff --git a/tests/test_project.py b/tests/test_project.py new file mode 100644 index 0000000..9a9f14b --- /dev/null +++ b/tests/test_project.py @@ -0,0 +1,14 @@ +import pytest + +from zoonyper.project import Project + + +class TestProject: + def setup_method(self): + self.project = Project() + + def test_get_thumbail_url(self): + assert self.project.get_thumbnail_url("") == "" + assert self.project.get_thumbnail_url("http://image.url") == f"{self.project.thumbnails_url}image.url" + assert self.project.get_thumbnail_url("https://image.url") == f"{self.project.thumbnails_url}image.url" + assert self.project.get_thumbnail_url("ftp://image.url") == "ftp://image.url" diff --git a/zoonyper/project.py b/zoonyper/project.py index 2968bf2..945a272 100644 --- a/zoonyper/project.py +++ b/zoonyper/project.py @@ -10,6 +10,7 @@ import json import os import random +import re import requests import time @@ -56,6 +57,9 @@ class Project(Utils): If specified, a list of column names to be parsed as datetime objects when reading the CSV files. The default value is "%Y-%m-%d", which will parse columns named "created_at" and "updated_at". + thumbnails_url : str, optional + Base URL used to download thumbnails; it defaults to + `https://thumbnails.zooniverse.org/100x100/`. 
Raises ------ @@ -103,6 +107,7 @@ def __init__( redact_users: bool = True, trim_paths: bool = True, parse_dates: str = "%Y-%m-%d", + thumbnails_url: str = "https://thumbnails.zooniverse.org/100x100/" ): """ Constructor method. @@ -168,6 +173,8 @@ def __init__( self.trim_paths = trim_paths self.parse_dates = parse_dates + self.thumbnails_url = thumbnails_url + @staticmethod def _user_logged_in(row: pd.Series) -> bool: """ @@ -707,7 +714,7 @@ def download_workflow( file_name = url.split("/")[-1] save_file = Path(current_dir / Path(file_name)) if not save_file.exists(): - r = requests.get(url, timeout=timeout) + r = requests.get(self.get_thumbnail_url(url), timeout=timeout) save_file.write_bytes(r.content) has_downloaded = True @@ -716,6 +723,25 @@ def download_workflow( return True + def get_thumbnail_url(self, image_url: str) -> str: + """ + Get the thumbnail URL for the given image URL. + + Parameters + ---------- + image_url : str + URL to get the thumbnail URL for. + + Returns + ------- + str + Thumbnail URL. + """ + if image_url: + return re.sub("^https?://", self.thumbnails_url, image_url) + + return image_url + @property def inactive_workflow_ids(self) -> list: """