|
8 | 8 |
|
9 | 9 | import json |
10 | 10 | import os |
| 11 | +import re |
11 | 12 | from typing import Union |
12 | 13 |
|
13 | 14 | from pythainlp import __version__ |
@@ -584,9 +585,6 @@ def remove(name: str) -> bool: |
584 | 585 | # FileNotFoundError: [Errno 2] No such file or directory: |
585 | 586 | # '/usr/local/lib/python3.6/dist-packages/pythainlp/corpus/ttc' |
586 | 587 | """ |
587 | | - if _CHECK_MODE == "1": |
588 | | - print("PyThaiNLP is read-only mode. It can't remove corpus.") |
589 | | - return False |
590 | 588 | with open(corpus_db_path(), "r", encoding="utf-8-sig") as f: |
591 | 589 | db = json.load(f) |
592 | 590 | data = [ |
@@ -614,3 +612,60 @@ def remove(name: str) -> bool: |
614 | 612 |
|
def get_path_folder_corpus(name, version, *path):
    """
    Build a path inside the folder of a corpus.

    :param name: corpus name, forwarded to ``get_corpus_path``
    :param version: corpus version, forwarded to ``get_corpus_path``
    :param path: extra path components appended after the corpus path
    :return: the corpus path joined with the given components
    """
    corpus_root = get_corpus_path(name, version)
    return os.path.join(corpus_root, *path)
| 615 | + |
| 616 | + |
def make_safe_directory_name(name: str) -> str:
    """
    Make safe directory name

    Characters that are not allowed in Windows file names are replaced with
    an underscore, leading/trailing spaces and periods are removed, and
    names that collide with Windows reserved device names get an underscore
    prefix. The result is never empty, so it can always be joined to a
    parent directory without resolving to the parent itself.

    :param str name: directory name
    :return: safe directory name
    :rtype: str
    """
    # Replace invalid characters (including ASCII control characters,
    # which are rejected by Windows and by os-level path calls) with
    # an underscore
    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", name)
    # Remove leading/trailing spaces or periods
    # (especially important for Windows)
    safe_name = safe_name.strip(" .")
    # Inputs such as "", "." or ".." collapse to an empty string; joining
    # an empty component would point at the parent directory itself.
    if not safe_name:
        return "_"
    # Prevent names that are reserved on Windows
    reserved_names = frozenset(
        {
            "CON", "PRN", "AUX", "NUL",
            "COM1", "COM2", "COM3", "COM4", "COM5",
            "COM6", "COM7", "COM8", "COM9",
            "LPT1", "LPT2", "LPT3", "LPT4", "LPT5",
            "LPT6", "LPT7", "LPT8", "LPT9",
        }
    )
    # A reserved name followed by an extension (e.g. "NUL.txt") is still
    # treated as the device on Windows, so compare the part before the
    # first period.
    stem = safe_name.split(".", 1)[0]
    if stem.upper() in reserved_names:
        safe_name = f"_{safe_name}"  # Prepend underscore to avoid conflict
    return safe_name
| 634 | + |
| 635 | + |
def get_hf_hub(
    repo_id: str, filename: Union[str, None] = None
) -> Union[str, bool]:
    """
    HuggingFace Hub in :mod:`pythainlp` data directory.

    Downloads a single file (when ``filename`` is given) or a snapshot of
    the whole repository (when it is not) into a ``hf_models``
    sub-directory of the data path, using a directory name derived from
    ``repo_id``.

    :param str repo_id: repo_id
    :param str filename: filename; if ``None``, the whole repository
        is downloaded
    :return: path returned by the download call, or ``False`` when
        running in read-only mode
    :rtype: Union[str, bool]
    :raises ModuleNotFoundError: if ``huggingface-hub`` is not installed
    """
    if _CHECK_MODE == "1":
        print("PyThaiNLP is read-only mode. It can't download.")
        return False
    # Imported lazily so the dependency is only needed when this
    # function is used. Any other import-time failure is left to
    # propagate unchanged so its original type and traceback are kept.
    try:
        from huggingface_hub import hf_hub_download, snapshot_download
    except ModuleNotFoundError as err:
        raise ModuleNotFoundError("""
        huggingface-hub isn't found!
        Please installing the package via 'pip install huggingface-hub'.
        """) from err
    hf_root = get_full_data_path("hf_models")
    # repo_id usually contains "/" (e.g. "org/model"); flatten it into a
    # single directory name.
    name_dir = make_safe_directory_name(repo_id)
    root_project = os.path.join(hf_root, name_dir)
    if filename is not None:
        output_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=root_project,
        )
    else:
        output_path = snapshot_download(
            repo_id=repo_id,
            local_dir=root_project,
        )
    return output_path
0 commit comments