3 changes: 0 additions & 3 deletions .vscode/settings.json

This file was deleted.

16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,16 @@
**Version history:**
- fastHan 1.1 combined with fastNLP 0.5.5 raises an ImportError. If you use fastHan 1.1, please use fastNLP 0.5.0.
- fastHan 1.2 fixes the fastNLP version-compatibility issue. In fastHan 1.2 and earlier, input sentences with leading or trailing **spaces or line breaks** trigger a bug; strip such characters from the input string with the strip function before passing it in.
- fastHan 1.3 automatically applies strip to the input string.
- fastHan 1.4 adds a user-dictionary feature (word segmentation only).
- fastHan 1.5
  - fixes a ValueError that could occur in the Parsing task
  - changes the return format; results are returned as a list by default
  - supports loading a model from a URL path
- fastHan 1.6
  - extends the user-dictionary feature to all tasks
  - can include position information in the return value
- fastHan 1.7
  - adds a finetune feature
- fastHan 1.8
  - switches to loading model files from the Hugging Face model hub (see the usage sketch after this list)
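The 1.8 entry above moves model downloads to the Hugging Face model hub. Below is a minimal, hypothetical usage sketch: the constructor signature matches `FastHan.__init__(self, model_type='base', url=None)` shown later in this diff, while the call with a `target` task name is assumed from the project documentation and is not part of this diff.

```
from fastHan import FastHan

# Constructing the model fetches the zipped checkpoint from the Hugging Face
# model hub (fdugzc/fasthan_base or fdugzc/fasthan_large) on first use and
# caches it locally.
model = FastHan(model_type='base')

# Task names such as 'CWS', 'POS', 'NER', 'Parsing' and this call signature are
# assumed from the upstream README; results are returned as lists by default.
print(model('郭靖是金庸笔下的一名男主角。', target='CWS'))
```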
22 changes: 1 addition & 21 deletions README.md
@@ -25,30 +25,10 @@ Zhichao Geng, Hang Yan, Xipeng Qiu and Xuanjing Huang, fastHan: A BERT-based Mul


## Installation Guide
fastHan requires the following dependency packages:

- torch>=1.0.0
- fastNLP>=0.5.5

**Version history:**
- fastHan 1.1 combined with fastNLP 0.5.5 raises an ImportError. If you use fastHan 1.1, please use fastNLP 0.5.0.
- fastHan 1.2 fixes the fastNLP version-compatibility issue. In fastHan 1.2 and earlier, input sentences with leading or trailing **spaces or line breaks** trigger a bug; strip such characters from the input string with the strip function before passing it in.
- fastHan 1.3 automatically applies strip to the input string.
- fastHan 1.4 adds a user-dictionary feature (word segmentation only).
- fastHan 1.5
  - fixes a ValueError that could occur in the Parsing task
  - changes the return format; results are returned as a list by default
  - supports loading a model from a URL path
- fastHan 1.6
  - extends the user-dictionary feature to all tasks
  - can include position information in the return value
- fastHan 1.7
  - adds a finetune feature

Run the following command to install:

```
pip install fastHan
pip install fastHan==1.8
```

Alternatively, install from GitHub:
29 changes: 12 additions & 17 deletions fastHan/FastModel.py
@@ -3,17 +3,16 @@
from shutil import copyfile

import torch
from fastNLP import Trainer, Vocabulary
from fastNLP import Trainer
from fastNLP.core.optimizer import AdamW
from fastNLP.io.file_utils import cached_path

from .model.bert import BertEmbedding
from .model.finetune_dataloader import (fastHan_CWS_Loader, fastHan_NER_Loader,
fastHan_Parsing_Loader,
fastHan_POS_loader)
from .model.model import CharModel
from .model.UserDict import UserDict

from .model.utils import hf_cached_path

class Token(object):
"""
@@ -85,7 +84,11 @@ class FastHan(object):
The FastHan class wraps CharModel, a BERT-based joint deep-learning model that handles the four tasks of CWS, \
POS, NER and dependency parsing; these four tasks share parameters.
"""

HF_URL_MAP = {
"base": 'fdugzc/fasthan_base',
"large": 'fdugzc/fasthan_large'
}
CACHE_SUB_DIR = "fasthan"

def __init__(self,model_type='base',url=None):
"""
@@ -96,6 +99,9 @@ def __init__(self,model_type='base',url=None):

:param str url: defaults to None; use this parameter to pass the path of a manually downloaded and unzipped model directory.
"""
if model_type not in ["base","large"]:
raise ValueError("model_type can only be base or large.")

self.device='cpu'
# get the model directory / download the model
if url is not None:
@@ -287,19 +293,8 @@ def set_cws_style(self,corpus):
corpus='CWS-'+corpus
self.tag_map['CWS']=self.corpus_map[corpus]

def _get_model(self,model_type):

# First check whether the model is already cached locally; download it if not.

if model_type=='base':
url='http://212.129.155.247/fasthan/fasthan_base.zip'
elif model_type=='large':
url='http://212.129.155.247/fasthan/fasthan_large.zip'
else:
raise ValueError("model_type can only be base or large.")

model_dir=cached_path(url,name='fasthan')
return model_dir
def _get_model(self, model_type):
return hf_cached_path(FastHan.HF_URL_MAP[model_type], FastHan.CACHE_SUB_DIR)

def _to_tensor(self,chars,target,seq_len):

20 changes: 20 additions & 0 deletions fastHan/model/utils.py
@@ -1,6 +1,9 @@
import os

from pathlib import Path
from typing import Union, Dict
from fastNLP.io.file_utils import get_cache_path, unzip_file
from transformers.utils import cached_file


def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
@@ -69,3 +72,20 @@ def get_tokenizer():
except Exception as e:
print('use raw tokenizer')
return lambda x: x.split()

# Return the path of the locally cached model directory.
# If there is no local cache, download from huggingface and unzip.
# Adapted from fastNLP.io.file_utils.cached_path and transformers.utils.cached_file.
def hf_cached_path(model_url: str, cache_sub_dir: str):
cache_dir = os.path.join(Path(get_cache_path()), cache_sub_dir)
os.makedirs(cache_dir, exist_ok=True)

# model_name is fasthan_base or fasthan_large
model_name = model_url.split("/")[-1]
target_path = os.path.join(cache_dir, model_name)

if model_name not in os.listdir(cache_dir):
# If no local cache exists, download from huggingface
zipped_file = cached_file(model_url, model_name+".zip")
unzip_file(zipped_file, cache_dir)
return target_path
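For reference, a minimal sketch of calling the new helper directly, mirroring how the rewritten `_get_model` in FastModel.py uses it; the repo id `fdugzc/fasthan_base` and the `fasthan` cache sub-directory are taken from `HF_URL_MAP` and `CACHE_SUB_DIR` in this same diff.

```
from fastHan.model.utils import hf_cached_path

# Resolve the cache directory for the base model: downloads and unzips
# fdugzc/fasthan_base into <fastNLP cache path>/fasthan/ if it is not cached
# yet, then returns that directory.
model_dir = hf_cached_path('fdugzc/fasthan_base', 'fasthan')
print(model_dir)
```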
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,2 +1,3 @@
torch>=1.0.0
FastNLP>=0.5.5
torch>=1.0.0, <2.0.0
FastNLP>=0.5.5, <1.0.0
transformers>=4.0.0, <=4.35.0
2 changes: 1 addition & 1 deletion setup.py
@@ -14,7 +14,7 @@

setup(
name='fastHan',
version='1.7',
version='1.8',
url='https://github.com/fastnlp/fastHan',
description=(
'A joint deep-learning model for Chinese word segmentation, POS tagging, dependency parsing and named entity recognition.'