Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,18 @@
from __future__ import annotations

import datetime
import io
import pathlib
import re
import weakref
from collections.abc import Callable, Iterable
from logging import DEBUG
from typing import Any

import libzim.writer # pyright: ignore
import PIL.Image

from zimscraperlib import logger
from zimscraperlib.constants import (
DEFAULT_DEV_ZIM_METADATA,
FRONT_ARTICLE_MIMETYPES,
Expand Down Expand Up @@ -146,7 +150,31 @@ def config_indexing(
self.__indexing_configured = True
return self

def _maybe_metadata_from_maybe_illustration(self, name, value):
    """Return the image format of an illustration metadata entry, if any.

    Only entries whose name is a ``str`` starting with ``Illustration_`` and
    whose value is ``bytes`` are inspected; the bytes are handed to Pillow to
    detect the image format.

    Args:
        name: metadata name.
        value: metadata value.

    Returns:
        The format reported by Pillow (``PNG`` for a PNG image), or ``None``
        when the entry is not an illustration, its value is not ``bytes``,
        or the bytes cannot be opened as an image.
    """
    if not isinstance(name, str) or not name.startswith("Illustration_"):
        return None
    if not isinstance(value, bytes):
        return None
    try:
        # The context manager releases the image's resources once the format
        # has been read.
        with PIL.Image.open(io.BytesIO(value)) as img:
            return img.format
    except Exception:
        # Deliberately best-effort: an unreadable illustration must not
        # raise from here; the caller treats None as "no image format".
        return None

def _log_metadata(self):
    """Write every metadata entry to the debug log.

    Does nothing unless the logger is enabled for DEBUG. Entries are visited
    in the sorted order of ``self._metadata.items()``. When
    ``_maybe_metadata_from_maybe_illustration`` returns a non-None result for
    an entry, that result is logged (marked ``MD``) instead of the value.
    """
    if not logger.isEnabledFor(DEBUG):
        return

    for key, raw in sorted(self._metadata.items()):
        image_info = self._maybe_metadata_from_maybe_illustration(key, raw)
        if image_info is None:
            logger.debug(f"Metadata: {key} = {raw}")
            continue
        logger.debug(f"Metadata: {key} MD = {image_info}")

def start(self):
self._log_metadata()

if not all(self._metadata.get(key) for key in MANDATORY_ZIM_METADATA_KEYS):
raise ValueError("Mandatory metadata are not all set.")

Expand Down
78 changes: 78 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import base64
import datetime
import io
import logging
import pathlib
import random
import shutil
import subprocess
import sys
import tempfile
import time
from unittest.mock import call, patch

import pytest
from libzim.writer import Compression # pyright: ignore
Expand Down Expand Up @@ -540,6 +542,82 @@ def test_check_metadata(tmp_path):
Creator(tmp_path, "").config_dev_metadata(LongDescription="T" * 5000).start()


@pytest.mark.parametrize(
    "tags",
    [
        "wikipedia;_category:wikipedia;_pictures:no;_videos:no;_details:yes;"
        "_ftindex:yes",
        [
            "wikipedia",
            "_category:wikipedia",
            "_pictures:no",
            "_videos:no",
            "_details:yes",
            "_ftindex:yes",
        ],
    ],
)
@patch("zimscraperlib.zim.creator.logger", autospec=True)
def test_start_logs_metadata_log_contents(mocked_logger, png_image, tags, tmp_path):
    """Check the debug lines emitted for metadata when start() is called.

    The module logger is replaced by a mock that reports only DEBUG as
    enabled. The expected calls show the illustration logged by image format
    (``MD = PNG``) and every other entry logged as ``name = value``.
    """

    def only_debug_enabled(level):
        return level == logging.DEBUG

    mocked_logger.isEnabledFor.side_effect = only_debug_enabled

    with open(png_image, "rb") as fh:
        png_data = fh.read()
    fpath = tmp_path / "test_config.zim"

    metadata = {
        "Name": "wikipedia_fr_football",
        "Title": "English Wikipedia",
        "Creator": "English speaking Wikipedia contributors",
        "Publisher": "Wikipedia user Foobar",
        "Date": "2009-11-21",
        "Description": "All articles (without images) from the english Wikipedia",
        "LongDescription": "This ZIM file contains all articles (without images)"
        " from the english Wikipedia by 2009-11-10. The topics are...",
        "Language": "eng",
        "License": "CC-BY",
        "Tags": tags,
        "Flavour": "nopic",
        "Source": "https://en.wikipedia.org/",
        "Scraper": "mwoffliner 1.2.3",
        "Illustration_48x48_at_1": png_data,
        "TestMetadata": "Test Metadata",
    }

    # start()/finish() are called explicitly: per the original author,
    # `with creator:` would call start() twice and complicate the assert.
    creator = Creator(fpath, "").config_metadata(**metadata)
    creator.start()
    creator.finish()

    expected_messages = [
        "Metadata: Creator = English speaking Wikipedia contributors",
        "Metadata: Date = 2009-11-21",
        "Metadata: Description = All articles (without images) from the "
        "english Wikipedia",
        "Metadata: Flavour = nopic",
        "Metadata: Illustration_48x48@1 MD = PNG",
        "Metadata: Language = eng",
        "Metadata: License = CC-BY",
        "Metadata: LongDescription = This ZIM file contains all articles "
        "(without images) from the english Wikipedia by 2009-11-10. "
        "The topics are...",
        "Metadata: Name = wikipedia_fr_football",
        "Metadata: Publisher = Wikipedia user Foobar",
        "Metadata: Relation = None",
        "Metadata: Scraper = mwoffliner 1.2.3",
        "Metadata: Source = https://en.wikipedia.org/",
        f"Metadata: Tags = {tags}",
        "Metadata: TestMetadata = Test Metadata",
        "Metadata: Title = English Wikipedia",
    ]
    mocked_logger.debug.assert_has_calls([call(msg) for msg in expected_messages])


def test_relax_metadata(tmp_path):
Creator(tmp_path, "", disable_metadata_checks=True).config_dev_metadata(
Description="T" * 90
Expand Down