Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically add metadata to Hugging Face Hub repos when uploading projects #793

Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -616,8 +616,15 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi
"--commit-message",
help="""The summary / title / first line of the generated commit.""",
)
@click.option(
"--modelcard/--no-modelcard",
default=True,
help="Update or create a ModelCard with upload.",
juhoinkinen marked this conversation as resolved.
Show resolved Hide resolved
)
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
def run_upload(
project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
"""
Upload selected projects and their vocabularies to a Hugging Face Hub repository.
\f
Expand Down Expand Up @@ -652,6 +659,9 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
)
except (HfHubHTTPError, HFValidationError) as err:
raise OperationFailedException(str(err))
else:
if modelcard:
hfh_util.upsert_modelcard(repo_id, projects, token, revision)
finally:
for fobj in fobjs:
fobj.close()
Expand Down Expand Up @@ -690,7 +700,9 @@ def run_download(project_ids_pattern, repo_id, token, revision, force):
`project_ids_pattern` from the specified Hugging Face Hub repository and
unzips the archives to `data/` directory and places the configuration files
to `projects.d/` directory. An authentication token and revision can
be given with options.
be given with options. If the README.md does not exist in the repository it is
created with default contents and metadata of the uploaded projects; if it exists,
juhoinkinen marked this conversation as resolved.
Show resolved Hide resolved
its metadata are updated as necessary.
"""

project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
Expand Down
30 changes: 20 additions & 10 deletions annif/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,28 @@
class AnnifConfigCFG:
"""Class for reading configuration in CFG/INI format"""

def __init__(self, filename: str = None, projstr: str = None) -> None:
    """Read a CFG/INI project configuration from a file or from a string.

    Args:
        filename: path of the configuration file to read. Takes precedence
            over `projstr` when both are given.
        projstr: configuration contents as a single string.

    Raises:
        ValueError: if neither `filename` nor `projstr` is given.
        ConfigurationException: if the configuration contains duplicate
            sections or options (raised by `_read_config`).
    """
    self._config = configparser.ConfigParser()
    # Replace the parser's default option-name transform
    # (presumably to keep option names case-sensitive; verify against
    # annif.util.identity).
    self._config.optionxform = annif.util.identity
    if filename is not None:
        logger.debug(f"Reading configuration file {filename} in CFG format")
        # NOTE(review): ConfigParser.read() silently skips files it cannot
        # open, so a missing file yields an empty configuration here.
        self._read_config(self._config.read, filename)
    elif projstr is not None:
        logger.debug("Reading configuration from a string in CFG format")
        self._read_config(self._config.read_string, projstr)
    else:
        # Previously this path failed with an UnboundLocalError because no
        # reader had been selected; fail with an explicit message instead.
        raise ValueError("Either filename or projstr must be given")

def _read_config(self, read_method, source):
encoding = "utf-8-sig"
try:
read_method(source, encoding)
except (
configparser.DuplicateOptionError,
configparser.DuplicateSectionError,
) as err:
raise ConfigurationException(err.message)

@property
def project_ids(self) -> list[str]:
Expand Down
89 changes: 89 additions & 0 deletions annif/hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from flask import current_app

import annif
from annif.config import AnnifConfigCFG
from annif.exception import OperationFailedException
from annif.project import Access, AnnifProject

Expand Down Expand Up @@ -238,3 +239,91 @@
config.read(config_path)
section = config.sections()[0]
return config[section]["vocab"]


def upsert_modelcard(repo_id, projects, token, revision):
    """Create or update the Model Card (README.md) of a Hugging Face Hub repo.

    An existing card is loaded and updated; otherwise a new card is created
    with default contents. The vocabulary languages of `projects` are merged
    into the card's `language` metadata, the "Projects" section of the card
    text is regenerated from the configurations found in the repository, and
    the card is pushed to the Hub.

    Args:
        repo_id: identifier of the Hub repository.
        projects: the uploaded projects; their `vocab_lang` values are added
            to the card metadata.
        token: authentication token passed to the Hub calls.
        revision: revision (branch) to list files from and to push to.
    """
    from huggingface_hub import ModelCard

    card_exists = "README.md" in _list_files_in_hf_hub(repo_id, token, revision)
    if card_exists:
        # NOTE(review): the card is loaded without token/revision; confirm
        # this is intended for private repos and non-default revisions.
        card = ModelCard.load(repo_id)
        commit_message = "Update README.md with Annif"
    else:
        card = _create_modelcard(repo_id)
        commit_message = "Create README.md with Annif"

    existing = card.data.language
    if isinstance(existing, str):
        # The metadata may hold a single language as a plain string;
        # iterating it directly would split it into characters.
        existing = [existing]
    # Keep the existing languages in their current order (deduplicated) and
    # append the new ones sorted, so repeated uploads give a stable list.
    languages = list(dict.fromkeys(existing or ()))
    new_langs = {proj.vocab_lang for proj in projects}
    languages.extend(sorted(new_langs.difference(languages)))
    card.data.language = languages

    configs = _get_existing_configs(repo_id, token, revision)
    card.text = _update_projects_section(card.text, configs)

    card.push_to_hub(
        repo_id=repo_id, token=token, revision=revision, commit_message=commit_message
    )


def _get_existing_configs(repo_id, token, revision):
    """Read all project configurations stored in a Hugging Face Hub repo.

    Every `*.cfg` file at the top level of the repository is read and the
    contents are combined into a single `AnnifConfigCFG` object.

    Args:
        repo_id: identifier of the Hub repository.
        token: authentication token for the Hub file system.
        revision: revision of the repository to read from.

    Returns:
        An `AnnifConfigCFG` built from the combined configuration text
        (empty when the repository has no `.cfg` files).
    """
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem(token=token)
    cfg_locations = fs.glob(f"{repo_id}/*.cfg", revision=revision)

    cfg_texts = [
        fs.read_text(cfg_file, token=token, revision=revision)
        for cfg_file in cfg_locations
    ]
    # Join with a newline so a file that lacks a trailing newline cannot
    # merge its last line with the next file's section header.
    return AnnifConfigCFG(projstr="\n".join(cfg_texts))


def _create_modelcard(repo_id):
    """Create a new Model Card with default contents for the given repo.

    The card gets a heading with the repository name (the part of `repo_id`
    after the last "/"), usage instructions for `annif download`, the
    pipeline tag "text-classification" and the tag "annif".

    Args:
        repo_id: identifier of the Hub repository, with or without a
            namespace prefix.

    Returns:
        The new `ModelCard` object (not yet pushed to the Hub).
    """
    from huggingface_hub import ModelCard

    # Take the last path component so that an id without a namespace
    # (no "/") does not raise an IndexError.
    repo_name = repo_id.split("/")[-1]

    # The command line is indented by four spaces so that Markdown renders
    # it as a code block.
    content = f"""
---

---

# {repo_name}

## Usage

Use the `annif download` command to download selected projects with Annif;
for example, to download all projects in this repository run

    annif download "*" {repo_id}

"""
    card = ModelCard(content)
    card.data.pipeline_tag = "text-classification"
    card.data.tags = ["annif"]
    return card


def _update_projects_section(text, configs):
    """Insert or replace the "## Projects" section of a Model Card text.

    The section consists of the "## Projects" heading followed by one fenced
    code block. If the heading is missing, the new section is appended to the
    text. If it is present, the heading and the fenced block that follows it
    are replaced; text before and after the section is preserved.

    Args:
        text: current Markdown text of the card.
        configs: project configurations used to build the new section.

    Returns:
        The card text with an up-to-date projects section.
    """
    heading = "## Projects\n"
    fence = "```"
    projects_section = _create_projects_section(configs)

    section_start = text.find(heading)
    if section_start == -1:  # no existing projects section, append it now
        # Make sure the heading starts on its own line.
        separator = "" if not text or text.endswith("\n") else "\n"
        return text + separator + projects_section

    body_start = section_start + len(heading)
    section_end = None
    open_fence = text.find(fence, body_start)
    # The fence belongs to this section only if no other heading starts
    # before it; text[body_start - 1] is the newline that ends the heading,
    # so a heading on the very next line is detected as well.
    if open_fence != -1 and "\n#" not in text[body_start - 1 : open_fence]:
        # Search for the closing fence after the opening one instead of
        # taking the last fence in the document, which would swallow any
        # content (and code blocks) that follows the projects section.
        close_fence = text.find(fence, open_fence + len(fence))
        if close_fence != -1:
            section_end = close_fence + len(fence)

    if section_end is None:
        # Heading without a complete code block: replace only the heading
        # line and keep whatever follows it.
        return text[:section_start] + projects_section + "\n" + text[body_start:]
    return text[:section_start] + projects_section + text[section_end:]


def _create_projects_section(configs):
content = "## Projects\n"
template = "{0:<19} {1:<23} {2:<15} {3:<11}\n"
header = template.format("Project ID", "Project Name", "Vocabulary ID", "Language")
content += "```\n" + header + "-" * len(header.strip()) + "\n"

for proj_id in configs.project_ids:
content += template.format(

Check warning on line 323 in annif/hfh_util.py

View check run for this annotation

Codecov / codecov/patch

annif/hfh_util.py#L323

Added line #L323 was not covered by tests
proj_id,
configs[proj_id]["name"],
configs[proj_id]["vocab"],
configs[proj_id]["language"],
)
return content + "```"
26 changes: 24 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,10 +1069,13 @@ def test_run_help():
assert "Run Annif in server mode for development." in result.output


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
def test_upload(
create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"])
assert not result.exception
assert create_commit.call_count == 1
Expand Down Expand Up @@ -1108,16 +1111,35 @@ def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
)
in create_commit.call_args_list
)
assert upsert_modelcard.call_count == 1


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload_many(
    create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
    """Uploading with a wildcard pattern makes one commit and one card upsert."""
    cli_args = ["upload", "dummy-*", "dummy-repo"]
    result = runner.invoke(annif.cli.cli, cli_args)

    assert not result.exception
    # All matching projects go into a single commit of 11 add operations.
    assert create_commit.call_count == 1
    assert CommitOperationAdd.call_count == 11
    assert upsert_modelcard.call_count == 1


@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
@mock.patch("annif.hfh_util.upsert_modelcard")
def test_upload_no_modelcard_upsert(
    upsert_modelcard, create_commit, CommitOperationAdd, preupload_lfs_files
):
    """With --no-modelcard the upload must not touch the Model Card."""
    cli_args = ["upload", "dummy-fi", "dummy-repo", "--no-modelcard"]
    result = runner.invoke(annif.cli.cli, cli_args)

    assert not result.exception
    assert upsert_modelcard.call_count == 0


def test_upload_nonexistent_repo():
Expand Down
68 changes: 68 additions & 0 deletions tests/test_hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,71 @@ def test_copy_project_config_overwrite(copy, exists):
assert copy.call_args == mock.call(
"tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"
)


@mock.patch("annif.hfh_util._list_files_in_hf_hub", return_value=["README.md"])
@mock.patch(
    "huggingface_hub.ModelCard",
)
@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=[])
def test_upsert_modelcard_existing_card(
    glob, ModelCard, _list_files_in_hf_hub, project
):
    """An existing README.md is loaded, updated and pushed, not recreated."""
    repo_id, token, revision = "annif-user/Annif-HFH-repo", "mytoken", "main"
    project.vocab_lang = "fi"
    existing_card = ModelCard.load.return_value
    existing_card.data.language = ["en"]  # Mock language in card

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    ModelCard.assert_not_called()  # Do not create a new card
    ModelCard.load.assert_called_once_with(repo_id)
    existing_card.push_to_hub.assert_called_once_with(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Update README.md with Annif",
    )
    # The project language is added next to the one already in the card.
    assert sorted(existing_card.data.language) == ["en", "fi"]


@mock.patch("annif.hfh_util._list_files_in_hf_hub", return_value=[])
@mock.patch(
    "huggingface_hub.ModelCard",
)
@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=[])
def test_upsert_modelcard_new_card(glob, ModelCard, _list_files_in_hf_hub, project):
    """Without a README.md in the repo a new card is created and pushed."""
    repo_id, token, revision = "annif-user/Annif-HFH-repo", "mytoken", "main"
    project.vocab_lang = "fi"

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    ModelCard.assert_called_once()
    new_card = ModelCard.return_value
    new_card.push_to_hub.assert_called_once_with(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Create README.md with Annif",
    )
    assert new_card.data.language == ["fi"]


@mock.patch(
    "huggingface_hub.ModelCard",
)
def test_create_modelcard(ModelCard):
    """A new card has the repo name as heading and the Annif metadata set."""
    card = annif.hfh_util._create_modelcard("annif-user/Annif-HFH-repo")

    card_content = ModelCard.call_args[0][0]
    assert "# Annif-HFH-repo" in card_content  # README heading
    assert card.data.pipeline_tag == "text-classification"
    assert card.data.tags == ["annif"]
Loading