Skip to content

Commit

Permalink
chore: fix s3 synchronization script (#10867)
Browse files Browse the repository at this point in the history
- switch to new image folder hierarchy
- use a local JSONL dump (as downloading it from static.openfoodfacts.org
from the off CT failed)

Don't merge until it's tested on production.
  • Loading branch information
raphael0202 authored Oct 7, 2024
1 parent cb1e910 commit 0558184
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 11 deletions.
3 changes: 2 additions & 1 deletion conf/systemd/sync_images_s3@.service
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,6 @@ OnFailure=email-failures@sync_images_s3__%i.service
Type=oneshot
User=off
Group=off
ExecStart=/srv/%i/scripts/sync-s3-images/.venv/bin/python3 /srv/%i/scripts/sync-s3-images/sync_s3_images.py /mnt/%i/images/products
# Warning: this script currently doesn't work with non-off product types
ExecStart=/srv/%i/scripts/sync-s3-images/.venv/bin/python3 /srv/%i/scripts/sync-s3-images/sync_s3_images.py /mnt/off/images/products /mnt/off/html_data/openfoodfacts-products.jsonl.gz
KillMode=process
8 changes: 4 additions & 4 deletions scripts/sync-s3-images/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
openfoodfacts==0.1.12
orjson==3.9.15
boto3==1.34.22
tqdm==4.66.3
openfoodfacts==1.1.3
orjson==3.10.7
boto3==1.35.32
tqdm==4.66.5
18 changes: 12 additions & 6 deletions scripts/sync-s3-images/sync_s3_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@

import boto3
import tqdm
from openfoodfacts import DatasetType, ProductDataset
from openfoodfacts import ProductDataset
from openfoodfacts.images import split_barcode

logger = getLogger()
handler = logging.StreamHandler()
Expand All @@ -49,8 +50,7 @@ def generate_product_path(barcode: str) -> str:
if not barcode.isdigit():
raise ValueError("unknown barcode format: {}".format(barcode))

match = BARCODE_PATH_REGEX.fullmatch(barcode)
splitted_barcode = [x for x in match.groups() if x] if match else [barcode]
splitted_barcode = split_barcode(barcode)
return "/".join(splitted_barcode)


Expand Down Expand Up @@ -104,12 +104,13 @@ def get_sync_filepaths(
yield barcode, product_dir / ocr_file_name


def run(image_dir: Path) -> None:
def run(image_dir: Path, dataset_path: Path) -> None:
"""Launch the synchronization.
:param image_dir: directory where images are stored
:param dataset_path: path to the JSONL dataset
"""
ds = ProductDataset(DatasetType.jsonl, force_download=True, download_newer=True)
ds = ProductDataset(dataset_path=dataset_path)
logger.info("Fetching existing keys...")
existing_keys = set(obj.key for obj in bucket.objects.filter(Prefix="data/"))
logger.info("%d keys in openfoodfacts-images bucket", len(existing_keys))
Expand Down Expand Up @@ -197,5 +198,10 @@ def run(image_dir: Path) -> None:
type=Path,
help="Directory where images are stored.",
)
parser.add_argument(
"dataset_path",
type=Path,
help="Directory where dataset is stored.",
)
args = parser.parse_args()
run(args.image_dir)
run(args.image_dir, args.dataset_path)

0 comments on commit 0558184

Please sign in to comment.