This repository has been archived by the owner on Jun 30, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
build_metadata_file.py
executable file
·61 lines (45 loc) · 1.82 KB
/
build_metadata_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
""" This script clones the `datasets` repository into your current directory (or runs `git pull` if a `datasets`
directory is already there) and parses all currently available metadata from the `README.md` YAML headers and the
automatically generated JSON files.
It dumps the results in a `metadata_{current-commit-of-datasets}.json` file.
"""
import json
from pathlib import Path
from subprocess import check_call, check_output
from typing import Dict, Optional

import yaml

from apputils import new_state
def metadata_from_readme(f: Path) -> Optional[Dict]:
    """Parse the YAML header block at the top of a README file.

    The header is the run of lines enclosed between a first line that is
    exactly ``---`` and the next line that is exactly ``---``. Trailing
    whitespace is stripped from every line before the comparison.

    Args:
        f: Path of the README file to read.

    Returns:
        The value produced by ``yaml.safe_load`` for the header (an empty
        dict when that value is falsy), or ``None`` when the file is empty
        or does not start with a ``---``-delimited header.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        yaml.YAMLError: If the header is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with f.open(encoding="utf-8") as fi:
        content = [line.rstrip() for line in fi]

    # An empty file has no first line to inspect, hence no header.
    if not content or content[0] != "---":
        return None

    try:
        # Index of the closing delimiter, searching after the opening line.
        closing = content.index("---", 1)
    except ValueError:
        # Opening delimiter without a closing one: not a header.
        return None

    yamlblock = "\n".join(content[1:closing])
    return yaml.safe_load(yamlblock) or dict()
def load_ds_datas():
    """Fetch the `datasets` repository and collect metadata for each entry.

    Runs ``git pull`` inside ``./datasets`` when that directory already
    exists; otherwise clones https://github.com/huggingface/datasets.git
    into the current directory. Then, for every entry of
    ``datasets/datasets`` (sorted by name), reads the YAML header of its
    ``README.md`` and its ``dataset_infos.json``.

    Returns:
        A ``(head_sha, datasets_md)`` tuple. ``head_sha`` is the output of
        ``git rev-parse HEAD`` in the repository, decoded and stripped.
        ``datasets_md`` maps each entry name to a dict with two keys:
        ``metadata`` (the parsed README header, or ``new_state()`` when the
        header is missing, empty or unreadable) and ``infos`` (the parsed
        JSON file, or ``None`` when it is missing or unreadable).

    Raises:
        subprocess.CalledProcessError: If a git command exits with a
            non-zero status.
    """
    drepo = Path("datasets")
    if drepo.is_dir():
        check_call(["git", "pull"], cwd=drepo)
    else:
        check_call(["git", "clone", "https://github.com/huggingface/datasets.git"])
    head_sha = check_output(["git", "rev-parse", "HEAD"], cwd=drepo)

    datasets_md = dict()
    for ddir in sorted((drepo / "datasets").iterdir(), key=lambda d: d.name):
        # Best effort: a missing or malformed README must not abort the run.
        # `except Exception` (not a bare `except`) keeps Ctrl-C and
        # SystemExit able to stop the script.
        try:
            metadata = metadata_from_readme(ddir / "README.md")
        except Exception:
            metadata = None
        # Truthiness covers None and empty containers without calling len()
        # on a value that may not support it (e.g. a scalar YAML header).
        if not metadata:
            metadata = new_state()

        # Missing file -> OSError; invalid JSON or bad encoding -> ValueError.
        try:
            with (ddir / "dataset_infos.json").open(encoding="utf-8") as fi:
                infos = json.load(fi)
        except (OSError, ValueError):
            infos = None

        datasets_md[ddir.name] = dict(metadata=metadata, infos=infos)

    return head_sha.decode().strip(), datasets_md
if __name__ == "__main__":
    # Script entry point: collect all metadata, then dump it to a JSON file
    # whose name carries the commit hash of the `datasets` checkout.
    commit, all_metadata = load_ds_datas()
    out_name = f"metadata_{commit}.json"
    print(f"writing to '{out_name}'")
    with open(out_name, "w") as out:
        serialized = json.dumps(all_metadata)
        out.write(serialized)