Skip to content

Commit

Permalink
Merge pull request #793 from NatLibFi/issue790-automatically-add-meta…
Browse files Browse the repository at this point in the history
…data-to-hugging-face-hub-repos-when-uploading-projects

Automatically add metadata to Hugging Face Hub repos when uploading projects
  • Loading branch information
juhoinkinen authored Sep 27, 2024
2 parents ad6ea7a + e4febab commit 24485af
Show file tree
Hide file tree
Showing 6 changed files with 279 additions and 14 deletions.
16 changes: 14 additions & 2 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,8 +617,15 @@ def run_hyperopt(project_id, paths, docs_limit, trials, jobs, metric, results_fi
"--commit-message",
help="""The summary / title / first line of the generated commit.""",
)
@click.option(
"--modelcard/--no-modelcard",
default=True,
help="Update or create a Model Card with upload.",
)
@cli_util.common_options
def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
def run_upload(
project_ids_pattern, repo_id, token, revision, commit_message, modelcard
):
"""
Upload selected projects and their vocabularies to a Hugging Face Hub repository.
\f
Expand Down Expand Up @@ -653,6 +660,9 @@ def run_upload(project_ids_pattern, repo_id, token, revision, commit_message):
)
except (HfHubHTTPError, HFValidationError) as err:
raise OperationFailedException(str(err))
else:
if modelcard:
hfh_util.upsert_modelcard(repo_id, projects, token, revision)
finally:
for fobj in fobjs:
fobj.close()
Expand Down Expand Up @@ -691,7 +701,9 @@ def run_download(project_ids_pattern, repo_id, token, revision, force):
`project_ids_pattern` from the specified Hugging Face Hub repository and
unzips the archives to `data/` directory and places the configuration files
to `projects.d/` directory. An authentication token and revision can
be given with options.
be given with options. If README.md does not exist in the repository, it is
created with default contents and metadata of the uploaded projects; if it exists,
its metadata are updated as necessary.
"""

project_ids = hfh_util.get_matching_project_ids_from_hf_hub(
Expand Down
27 changes: 17 additions & 10 deletions annif/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,25 @@
class AnnifConfigCFG:
"""Class for reading configuration in CFG/INI format"""

def __init__(self, filename: str) -> None:
def __init__(self, filename: str = None, projstr: str = None) -> None:
self._config = configparser.ConfigParser()
self._config.optionxform = annif.util.identity
with open(filename, encoding="utf-8-sig") as projf:
try:
logger.debug(f"Reading configuration file {filename} in CFG format")
self._config.read_file(projf)
except (
configparser.DuplicateOptionError,
configparser.DuplicateSectionError,
) as err:
raise ConfigurationException(err.message)
if filename is not None:
logger.debug(f"Reading configuration file {filename} in CFG format")
self._read_config(self._config.read, filename)
elif projstr is not None:
logger.debug("Reading configuration from a string in CFG format")
self._read_config(self._config.read_string, projstr)

def _read_config(self, read_method, source):
    """Load configuration from `source` using the given parser read method.

    `read_method` is called as ``read_method(source, "utf-8-sig")``. The
    constructor passes either the parser's ``read`` method together with a
    file name, or its ``read_string`` method together with the
    configuration text.

    Raises ConfigurationException when the parser reports a duplicate
    option or a duplicate section.
    """
    # NOTE(review): ConfigParser.read_string takes a source label, not an
    # encoding, as its second positional parameter - confirm this is intended.
    encoding = "utf-8-sig"
    try:
        read_method(source, encoding)
    except (
        configparser.DuplicateOptionError,
        configparser.DuplicateSectionError,
    ) as err:
        # Chain explicitly so the underlying parser error stays attached to
        # the exception that callers see.
        raise ConfigurationException(err.message) from err

@property
def project_ids(self) -> list[str]:
Expand Down
95 changes: 95 additions & 0 deletions annif/hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from flask import current_app

import annif
from annif.config import AnnifConfigCFG
from annif.exception import OperationFailedException
from annif.project import Access, AnnifProject

Expand Down Expand Up @@ -238,3 +239,97 @@ def get_vocab_id_from_config(config_path: str) -> str:
config.read(config_path)
section = config.sections()[0]
return config[section]["vocab"]


def upsert_modelcard(repo_id, projects, token, revision):
    """Create or update the Model Card (README.md) of a Hugging Face Hub repository.

    The existing card is loaded from `repo_id`; when the repository has no
    README.md a new card with default contents is created instead. The
    vocabulary languages of `projects` are merged into the card's `language`
    metadata, the autoupdating projects section of the card text is
    regenerated from the configuration files found in the repository, and the
    card is pushed to the given `revision` using `token`.
    """
    from huggingface_hub import ModelCard
    from huggingface_hub.utils import EntryNotFoundError

    # NOTE(review): the card is loaded without the token; confirm whether
    # private repositories need it here as well.
    try:
        card = ModelCard.load(repo_id)
        commit_message = "Update README.md with Annif"
    except EntryNotFoundError:
        card = _create_modelcard(repo_id)
        commit_message = "Create README.md with Annif"

    existing = card.data.language
    if not existing:
        langs_existing = []
    elif isinstance(existing, str):
        # Card metadata may hold a single language code as a plain string;
        # iterating it would split the code into separate characters.
        langs_existing = [existing]
    else:
        langs_existing = list(existing)
    langs_to_add = [proj.vocab_lang for proj in projects]
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # metadata does not get reshuffled between uploads.
    card.data.language = list(dict.fromkeys(langs_existing + langs_to_add))

    configs = _get_existing_configs(repo_id, token, revision)
    card.text = _update_projects_section(card.text, configs)

    card.push_to_hub(
        repo_id=repo_id, token=token, revision=revision, commit_message=commit_message
    )


def _get_existing_configs(repo_id, token, revision):
    """Read all top-level ``*.cfg`` files of a Hub repository into one configuration.

    The files matching ``{repo_id}/*.cfg`` at `revision` are read as text and
    parsed together as a single AnnifConfigCFG object, which is returned.
    """
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem(token=token)
    cfg_locations = fs.glob(f"{repo_id}/*.cfg", revision=revision)

    cfg_texts = [
        fs.read_text(cfg_file, token=token, revision=revision)
        for cfg_file in cfg_locations
    ]
    # Join with a newline: a file without a trailing newline would otherwise
    # have its last line fused with the next file's section header.
    return AnnifConfigCFG(projstr="\n".join(cfg_texts))


def _create_modelcard(repo_id):
    """Build a new Model Card with default README contents for `repo_id`.

    The card text has the repository name as its heading and a usage section
    showing the `annif download` command. The card metadata gets the
    "text-classification" pipeline tag and the "annif" tag. The card is only
    constructed and returned here; it is not pushed to the Hub.
    """
    from huggingface_hub import ModelCard

    # Take the last path component so that a repository id without a
    # "namespace/" prefix does not raise IndexError.
    repo_name = repo_id.split("/")[-1]
    content = f"""
---
---
# {repo_name}
## Usage
Use the `annif download` command to download selected projects with Annif;
for example, to download all projects in this repository run
annif download "*" {repo_id}
"""
    card = ModelCard(content)
    card.data.pipeline_tag = "text-classification"
    card.data.tags = ["annif"]
    return card


# Markers delimiting the part of the card text that is regenerated on update;
# text outside the markers is left untouched.
AUTOUPDATING_START = "<!--- start-of-autoupdating-part --->"
AUTOUPDATING_END = "<!--- end-of-autoupdating-part --->"


def _update_projects_section(text, configs):
    """Return `text` with its autoupdating projects section replaced or appended.

    When `text` has no start marker, the freshly generated section is appended
    to the end. Otherwise the span from the first start marker to the last end
    marker that follows it is replaced. When a start marker has no end marker
    after it, only the start marker itself is replaced, so no surrounding text
    is lost or duplicated.
    """
    projects_section = _create_projects_section(configs)

    section_start_ind = text.find(AUTOUPDATING_START)
    if section_start_ind == -1:  # no existing projects section, append it now
        return text + projects_section

    # Look for the end marker only after the start marker; an unbounded rfind
    # returning -1 (or an index before the start) would give a bogus slice.
    end_marker_ind = text.rfind(AUTOUPDATING_END, section_start_ind)
    if end_marker_ind == -1:
        section_end_ind = section_start_ind + len(AUTOUPDATING_START)
    else:
        section_end_ind = end_marker_ind + len(AUTOUPDATING_END)
    return text[:section_start_ind] + projects_section + text[section_end_ind:]


def _create_projects_section(configs):
    """Render the projects table, wrapped in the autoupdating markers.

    `configs` must provide `project_ids` and item access by project id; each
    project entry is read for its "name", "vocab" and "language" values. The
    table is a fenced block of fixed-width columns with one row per project.
    """
    template = "{0:<19} {1:<23} {2:<15} {3:<8}\n"
    header = template.format("Project ID", "Project Name", "Vocabulary ID", "Language")

    parts = [
        f"{AUTOUPDATING_START}\n## Projects\n",
        "```\n",
        header,
        # Underline as wide as the header without its trailing newline.
        "-" * len(header.strip()),
        "\n",
    ]
    for proj_id in configs.project_ids:
        project = configs[proj_id]
        parts.append(
            template.format(
                proj_id,
                project["name"],
                project["vocab"],
                project["language"],
            )
        )
    parts.append("```\n")
    parts.append(AUTOUPDATING_END)
    return "".join(parts)
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def project(subject_index, datadir, registry, vocabulary):
proj.analyzer = annif.analyzer.get_analyzer("snowball(finnish)")
proj.language = "fi"
proj.vocab = vocabulary
proj.vocab_lang = "fi"
proj.subjects = subject_index
proj.datadir = str(datadir)
proj.registry = registry
Expand Down
26 changes: 24 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,10 +1069,13 @@ def test_run_help():
assert "Run Annif in server mode for development." in result.output


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
def test_upload(
create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
result = runner.invoke(annif.cli.cli, ["upload", "dummy-fi", "dummy-repo"])
assert not result.exception
assert create_commit.call_count == 1
Expand Down Expand Up @@ -1108,16 +1111,35 @@ def test_upload(create_commit, CommitOperationAdd, preupload_lfs_files):
)
in create_commit.call_args_list
)
assert upsert_modelcard.call_count == 1


@mock.patch("annif.hfh_util.upsert_modelcard")
@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
def test_upload_many(create_commit, CommitOperationAdd, preupload_lfs_files):
def test_upload_many(
create_commit, CommitOperationAdd, preupload_lfs_files, upsert_modelcard
):
result = runner.invoke(annif.cli.cli, ["upload", "dummy-*", "dummy-repo"])
assert not result.exception
assert create_commit.call_count == 1
assert CommitOperationAdd.call_count == 11
assert upsert_modelcard.call_count == 1


@mock.patch("huggingface_hub.HfApi.preupload_lfs_files")
@mock.patch("huggingface_hub.CommitOperationAdd")
@mock.patch("huggingface_hub.HfApi.create_commit")
@mock.patch("annif.hfh_util.upsert_modelcard")
def test_upload_no_modelcard_upsert(
    upsert_modelcard, create_commit, CommitOperationAdd, preupload_lfs_files
):
    """Check that `upload --no-modelcard` does not call upsert_modelcard."""
    # The Hub API calls are patched out, so nothing is actually uploaded.
    result = runner.invoke(
        annif.cli.cli, ["upload", "dummy-fi", "dummy-repo", "--no-modelcard"]
    )
    assert not result.exception
    assert upsert_modelcard.call_count == 0


def test_upload_nonexistent_repo():
Expand Down
128 changes: 128 additions & 0 deletions tests/test_hfh_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
from datetime import datetime, timezone
from unittest import mock

import huggingface_hub
from huggingface_hub.utils import EntryNotFoundError

import annif.hfh_util
from annif.config import AnnifConfigCFG


def test_archive_dir(testdatadir):
Expand Down Expand Up @@ -101,3 +105,127 @@ def test_copy_project_config_overwrite(copy, exists):
assert copy.call_args == mock.call(
"tests/huggingface-cache/dummy-fi.cfg", "projects.d/dummy-fi.cfg"
)


@mock.patch(
    "huggingface_hub.ModelCard.load",
    side_effect=EntryNotFoundError("mymessage"),
)
@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=[])
@mock.patch("huggingface_hub.ModelCard")
def test_upsert_modelcard_insert_new(ModelCard, glob, load, project):
    """Check that a new Model Card is created when loading raises EntryNotFoundError.

    The patched load raises EntryNotFoundError and the patched glob returns no
    configuration files.
    """
    repo_id = "annif-user/annif-repo"
    token = "mytoken"
    revision = "mybranch"

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    ModelCard.assert_called_once()
    assert "# annif-repo" in ModelCard.call_args[0][0]  # README heading

    # The card object is the return value of the patched ModelCard class.
    card = ModelCard.return_value
    assert card.data.language == ["fi"]
    assert card.data.pipeline_tag == "text-classification"
    assert card.data.tags == ["annif"]
    card.push_to_hub.assert_called_once_with(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Create README.md with Annif",
    )


@mock.patch("huggingface_hub.ModelCard.push_to_hub")
@mock.patch(
    "huggingface_hub.ModelCard.load",  # Mock language in existing card
    return_value=huggingface_hub.ModelCard("---\nlanguage:\n- en\n---"),
)
@mock.patch("huggingface_hub.HfFileSystem.glob", return_value=["dummy-en.cfg"])
@mock.patch(
    "huggingface_hub.HfFileSystem.read_text",
    return_value="""
[dummy-en]
name=Dummy English
language=en
vocab=dummy
""",
)
def test_upsert_modelcard_update_existing(read_text, glob, load, push_to_hub, project):
    """Check that an existing Model Card is updated and pushed.

    The patched load returns a card whose metadata lists "en"; the patched
    file system returns one configuration file describing project dummy-en.
    """
    repo_id = "annif-user/annif-repo"
    token = "mytoken"
    revision = "mybranch"

    annif.hfh_util.upsert_modelcard(repo_id, [project], token, revision)

    load.assert_called_once_with(repo_id)

    card = load.return_value
    # NOTE(review): the spacing inside this expected table row looks collapsed
    # compared with the fixed-width row format - confirm against the repository.
    retained_project_list_content = (
        "dummy-en Dummy English dummy en"
    )
    assert retained_project_list_content in card.text
    # Sorted before comparing, so the order of the merged languages is not checked.
    assert sorted(card.data.language) == ["en", "fi"]
    card.push_to_hub.assert_called_once_with(
        repo_id=repo_id,
        token=token,
        revision=revision,
        commit_message="Update README.md with Annif",
    )


def test_update_modelcard_projects_section_append_new():
    """Check that the projects section is appended to text that has no markers."""
    # An empty configuration gives a table with only the header and underline.
    empty_cfg = AnnifConfigCFG(projstr="")

    text = """This is some existing text in the card."""
    updated_text = annif.hfh_util._update_projects_section(text, empty_cfg)

    # NOTE(review): the column padding in the expected header line looks
    # collapsed here - confirm against the repository.
    expected_tail = """\
<!--- start-of-autoupdating-part --->
## Projects
```
Project ID Project Name Vocabulary ID Language
--------------------------------------------------------------------
```
<!--- end-of-autoupdating-part --->"""

    assert updated_text == text + expected_tail


def test_update_modelcard_projects_section_update_existing():
    """Check that an existing projects section is replaced in place.

    The text before and after the marked section must be unchanged.
    """
    cfg = AnnifConfigCFG(
        projstr="""\
[dummy-fi]
name=Dummy Finnish
language=fi
vocab=dummy"""
    )

    text_head = """This is some existing text in the card.\n"""

    # Initial section: a table with no project rows between the two markers.
    text_initial_projects = """\
<!--- start-of-autoupdating-part --->
## Projects
```
Project ID Project Name Vocabulary ID Language
--------------------------------------------------------------------
```
<!--- end-of-autoupdating-part --->\n"""

    text_tail = (
        "This is text after the Projects section; it should remain after updates."
    )

    text = text_head + text_initial_projects + text_tail
    updated_text = annif.hfh_util._update_projects_section(text, cfg)

    # NOTE(review): the column padding in the expected table lines looks
    # collapsed here - confirm against the repository.
    expected_updated_projects = """\
<!--- start-of-autoupdating-part --->
## Projects
```
Project ID Project Name Vocabulary ID Language
--------------------------------------------------------------------
dummy-fi Dummy Finnish dummy fi \n```
<!--- end-of-autoupdating-part --->
"""

    assert updated_text == text_head + expected_updated_projects + text_tail

0 comments on commit 24485af

Please sign in to comment.