Skip to content

Commit f7df55d

Browse files
authored
Add get_hf_hub and make_safe_directory_name (#1156)
* Add get_hf_hub and make_safe_directory_name * Update core.py * Update core.py * Update core.py * Update core.py
1 parent 3ce57f7 commit f7df55d

File tree

2 files changed

+62
-3
lines changed

2 files changed

+62
-3
lines changed

pythainlp/corpus/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"get_corpus_default_db",
2424
"get_corpus_path",
2525
"get_path_folder_corpus",
26+
"get_hf_hub",
2627
"path_pythainlp_corpus",
2728
"provinces",
2829
"remove",
@@ -41,6 +42,7 @@
4142
"thai_wikipedia_titles",
4243
"thai_words",
4344
"thai_wsd_dict",
45+
"make_safe_directory_name",
4446
]
4547

4648
import os
@@ -98,6 +100,8 @@ def corpus_db_path() -> str:
98100
get_corpus_default_db,
99101
get_corpus_path,
100102
get_path_folder_corpus,
103+
make_safe_directory_name,
104+
get_hf_hub,
101105
path_pythainlp_corpus,
102106
remove,
103107
) # these imports must come before other pythainlp.corpus.* imports

pythainlp/corpus/core.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import json
1010
import os
11+
import re
1112
from typing import Union
1213

1314
from pythainlp import __version__
@@ -584,9 +585,6 @@ def remove(name: str) -> bool:
584585
# FileNotFoundError: [Errno 2] No such file or directory:
585586
# '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc'
586587
"""
587-
if _CHECK_MODE == "1":
588-
print("PyThaiNLP is read-only mode. It can't remove corpus.")
589-
return False
590588
with open(corpus_db_path(), "r", encoding="utf-8-sig") as f:
591589
db = json.load(f)
592590
data = [
@@ -614,3 +612,60 @@ def remove(name: str) -> bool:
614612

615613
def get_path_folder_corpus(name, version, *path):
    """
    Build a path inside a corpus folder.

    Looks up the corpus location with :func:`get_corpus_path` and joins
    the extra path components onto it.

    :param name: corpus name, passed to :func:`get_corpus_path`
    :param version: corpus version, passed to :func:`get_corpus_path`
    :param path: further path components to append, in order
    :return: the joined path
    """
    corpus_root = get_corpus_path(name, version)
    return os.path.join(corpus_root, *path)
615+
616+
617+
# Characters that are not allowed in Windows file names, plus the ASCII
# control characters (0-31), which Windows also rejects.
_UNSAFE_DIR_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Device names reserved on Windows (matched case-insensitively).
_WINDOWS_RESERVED_NAMES = frozenset(
    ["CON", "PRN", "AUX", "NUL"]
    + [f"COM{i}" for i in range(1, 10)]
    + [f"LPT{i}" for i in range(1, 10)]
)


def make_safe_directory_name(name: str) -> str:
    """
    Make safe directory name

    Invalid characters are replaced with an underscore, leading and
    trailing spaces/periods are removed, and names that collide with a
    reserved Windows device name (with or without an extension, e.g.
    ``CON`` or ``nul.txt``) get an underscore prefix. If nothing is left
    after cleaning (e.g. ``""`` or ``".."``), ``"_"`` is returned so the
    result is never an empty path component.

    :param str name: directory name
    :return: safe directory name
    :rtype: str
    """
    # Replace invalid characters with an underscore
    safe_name = _UNSAFE_DIR_CHARS.sub("_", name)
    # Remove leading/trailing spaces or periods
    # (especially important for Windows)
    safe_name = safe_name.strip(" .")
    # An empty component would make os.path.join() return the parent
    # directory itself, so fall back to a placeholder name.
    if not safe_name:
        return "_"
    # Windows treats "NUL.txt" the same as "NUL", so only the part before
    # the first period is compared against the reserved names.
    stem = safe_name.split(".", 1)[0]
    if stem.upper() in _WINDOWS_RESERVED_NAMES:
        safe_name = f"_{safe_name}"  # Prepend underscore to avoid conflict
    return safe_name
634+
635+
636+
def get_hf_hub(
    repo_id: str, filename: Union[str, None] = None
) -> Union[str, bool]:
    """
    HuggingFace Hub in :mod:`pythainlp` data directory.

    Downloads from the HuggingFace Hub into a per-repository folder under
    the ``hf_models`` directory of the PyThaiNLP data path. The folder
    name is derived from ``repo_id`` with
    :func:`make_safe_directory_name`. When ``filename`` is given, only
    that file is fetched with ``hf_hub_download``; otherwise the whole
    repository is fetched with ``snapshot_download``.

    :param str repo_id: repo_id
    :param str filename: filename; if ``None``, download the whole repo
    :return: path returned by the download call, or ``False`` when
             PyThaiNLP is in read-only mode
    :rtype: str
    :raises ModuleNotFoundError: if ``huggingface-hub`` is not installed
    """
    if _CHECK_MODE == "1":
        print("PyThaiNLP is read-only mode. It can't download.")
        return False
    # Imported lazily so huggingface-hub stays an optional dependency.
    try:
        from huggingface_hub import hf_hub_download, snapshot_download
    except ModuleNotFoundError as e:
        raise ModuleNotFoundError(
            "\nhuggingface-hub isn't found!\n"
            "Please installing the package via "
            "'pip install huggingface-hub'.\n"
        ) from e
    except Exception as e:
        raise Exception(f"An unexpected error occurred: {e}") from e
    hf_root = get_full_data_path("hf_models")
    # repo_id contains "/" (e.g. "org/model"), so it is flattened into a
    # single safe directory name.
    name_dir = make_safe_directory_name(repo_id)
    root_project = os.path.join(hf_root, name_dir)
    if filename is not None:
        output_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=root_project,
        )
    else:
        output_path = snapshot_download(
            repo_id=repo_id,
            local_dir=root_project,
        )
    return output_path

0 commit comments

Comments
 (0)