From 0558184fb6af5603d515d8d45ded0f587ab1006d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Mon, 7 Oct 2024 15:07:04 +0200 Subject: [PATCH] chore: fix s3 synchronization script (#10867) - switch to new image folder hierarchy - use local JSONL dump (as downloading from static.openfoodfacts.org from off CT failed) Don't merge until it's tested on production. --- conf/systemd/sync_images_s3@.service | 3 ++- scripts/sync-s3-images/requirements.txt | 8 ++++---- scripts/sync-s3-images/sync_s3_images.py | 18 ++++++++++++------ 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/conf/systemd/sync_images_s3@.service b/conf/systemd/sync_images_s3@.service index 65c73c05a09d3..72558620132d9 100644 --- a/conf/systemd/sync_images_s3@.service +++ b/conf/systemd/sync_images_s3@.service @@ -8,5 +8,6 @@ OnFailure=email-failures@sync_images_s3__%i.service Type=oneshot User=off Group=off -ExecStart=/srv/%i/scripts/sync-s3-images/.venv/bin/python3 /srv/%i/scripts/sync-s3-images/sync_s3_images.py /mnt/%i/images/products +# Warning: this script doesn't work currently with non-off product type +ExecStart=/srv/%i/scripts/sync-s3-images/.venv/bin/python3 /srv/%i/scripts/sync-s3-images/sync_s3_images.py /mnt/off/images/products /mnt/off/html_data/openfoodfacts-products.jsonl.gz KillMode=process diff --git a/scripts/sync-s3-images/requirements.txt b/scripts/sync-s3-images/requirements.txt index 664e3097b4776..292b3565c2e7f 100644 --- a/scripts/sync-s3-images/requirements.txt +++ b/scripts/sync-s3-images/requirements.txt @@ -1,4 +1,4 @@ -openfoodfacts==0.1.12 -orjson==3.9.15 -boto3==1.34.22 -tqdm==4.66.3 \ No newline at end of file +openfoodfacts==1.1.3 +orjson==3.10.7 +boto3==1.35.32 +tqdm==4.66.5 \ No newline at end of file diff --git a/scripts/sync-s3-images/sync_s3_images.py b/scripts/sync-s3-images/sync_s3_images.py index b985a1ddec67e..1df4b6ac1de03 100644 --- a/scripts/sync-s3-images/sync_s3_images.py +++ b/scripts/sync-s3-images/sync_s3_images.py @@ -24,7 +24,8 @@ import boto3 import tqdm -from openfoodfacts import DatasetType, ProductDataset +from openfoodfacts import ProductDataset +from openfoodfacts.images import split_barcode logger = getLogger() handler = logging.StreamHandler() @@ -49,8 +50,7 @@ def generate_product_path(barcode: str) -> str: if not barcode.isdigit(): raise ValueError("unknown barcode format: {}".format(barcode)) - match = BARCODE_PATH_REGEX.fullmatch(barcode) - splitted_barcode = [x for x in match.groups() if x] if match else [barcode] + splitted_barcode = split_barcode(barcode) return "/".join(splitted_barcode) @@ -104,12 +104,13 @@ def get_sync_filepaths( yield barcode, product_dir / ocr_file_name -def run(image_dir: Path) -> None: +def run(image_dir: Path, dataset_path: Path) -> None: """Launch the synchronization. :param image_dir: directory where images are stored + :param dataset_path: path to the JSONL dataset """ - ds = ProductDataset(DatasetType.jsonl, force_download=True, download_newer=True) + ds = ProductDataset(dataset_path=dataset_path) logger.info("Fetching existing keys...") existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/")) logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys)) @@ -197,5 +198,10 @@ def run(image_dir: Path) -> None: type=Path, help="Directory where images are stored.", ) + parser.add_argument( + "dataset_path", + type=Path, + help="Directory where dataset is stored.", + ) args = parser.parse_args() - run(args.image_dir) + run(args.image_dir, args.dataset_path)