Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions knlp/module/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# !/usr/bin/python
# -*- coding:UTF-8 -*-
# -----------------------------------------------------------------------#
# File Name: __init__.py
# Author: Junyi Li
# Mail: 4ljy@163.com
# Created Time: 2022-04-10
# Description:
# -----------------------------------------------------------------------#
58 changes: 58 additions & 0 deletions knlp/module/inference_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# !/usr/bin/python
# -*- coding:UTF-8 -*-
# -----------------------------------------------------------------------#
# File Name: inference_base
# Author: Junyi Li
# Mail: 4ljy@163.com
# Created Time: 2022-04-10
# Description:
# -----------------------------------------------------------------------#
from abc import ABC, abstractmethod


class InferenceBaseClass(ABC):
"""
This function offers the base class to do inference
Child class could rewrite any function for its specific function.
"""

def __init__(self):
pass

@abstractmethod
def load_model(self, model_path: str):
"""
This function could load model.

Args:
model_path:

Returns:

"""
pass

@abstractmethod
def predict(self, ):
"""
use model to do predict

Returns:

"""
pass

@abstractmethod
def predict_file(self, input_file_path: str, output_file_path: str):
"""
predict a file.

Args:
input_file_path: string
output_file_path: string

Returns:

"""

pass
66 changes: 66 additions & 0 deletions knlp/module/train_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# !/usr/bin/python
# -*- coding:UTF-8 -*-
# -----------------------------------------------------------------------#
# File Name: train_base
# Author: Junyi Li
# Mail: 4ljy@163.com
# Created Time: 2022-04-10
# Description:
# -----------------------------------------------------------------------#

from abc import ABC, abstractmethod


class TrainBaseClass(ABC):
"""
This function offers the base class to train a new model


"""

def __init__(self):
pass

@abstractmethod
def train(self):
"""
General training function
Returns:

"""
pass

@abstractmethod
def eval(self):
"""
General evaluation for the model trained
Returns:

"""
pass

@abstractmethod
def save_model(self, model_path: str):
"""
Save the model trained

Args:
model_path:

Returns:

"""
pass

@abstractmethod
def load_model(self, model_path: str):
"""
This function could load model.

Args:
model_path:

Returns:

"""
pass
181 changes: 179 additions & 2 deletions knlp/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,16 @@
# 2. 函数运行进度条
# 3. 计算函数运行时间的装饰器
# -----------------------------------------------------------------------#
import datetime
import os
import re
import shutil
import sys
import time
import shutil
import zipfile
import requests
from functools import wraps

import requests
from knlp.common.constant import KNLP_PATH


Expand Down Expand Up @@ -133,3 +135,178 @@ def close(self, words='done'):
print('')
print(words)
self.i = 0


class CommonUtil(object):
_emoji_pattern_cfg = re.compile(u'('
u'\ud83c[\udf00-\udfff]|'
u'\ud83d[\udc00-\ude4f\ude80-\udeff]|'
u'[\u2600-\u2B55])+', flags=re.UNICODE)

_replace_pattern_cfg = {
'float_t': re.compile('\d+\.\d+'),
'phone_t': re.compile(
r'1[0-9\*]{10}|\d{3}[-\s]\d{4}[-\s]\d{4}|\+861[0-9]{10}|[0-9]{3}-[0-9]{3}-[0-9]{4}|[0-9]{4}-[0-9]{7,8}|[8|6][0-9]{7}'),
'email_t': re.compile(r'[^@|\s]+@[^@]+\.[^@|\s]+'),
}

replace_patterns = [
('', re.compile(r'\[.*\]'))
]

punc = '!"#$%&\'()**+。,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~}'
replace_symbol = (r'[{}]+'.format(punc))
_replace_symbol_cfg = re.compile(replace_symbol)

_illegal_char_set = set([])

@classmethod
def keep_alphabate(cls, line):
cop = re.compile("[^a-z^A-Z]")
return cop.sub('', line)

@classmethod
def keep_meaning_thing(cls, line):
cop = re.compile(
"[^\u4e00-\u9fa5^.^a-z^A-Z^0-9^.^\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b”]")
return cop.sub('', line)

@classmethod
def keep_Chinese(cls, line):
cop = re.compile("[^\u4e00-\u9fa5”]")
return cop.sub('', line)

@classmethod
def keep_Chinese_number(cls, line):
cop = re.compile("[^\u4e00-\u9fa5^.^0-9”]")
return cop.sub('', line)

@classmethod
def rm_replace_symbol(cls, line):
res = cls._replace_symbol_cfg.sub('', line)
return res

@classmethod
def remove_emoji_char(cls, text_unicode):
res = cls._emoji_pattern_cfg.sub('', text_unicode)
return res

@classmethod
def rm_html(cls, line):
pat = re.compile('>(.*?)<')
return ''.join(pat.findall(line))

# 时间函数相关
@classmethod
def timestamp2ymd(cls, timestamp, format='%Y-%m-%d %H:%M:%S'):
# timestamp: 1494474258
# format:
# - '%Y-%m-%d %H:%M:%S'
# - '%Y-%m-%d'
# - '%Y%m%d'
x = time.localtime(timestamp)
return time.strftime(format, x)

@classmethod
def ymd2timestamp(cls, Ymd, format='%Y-%m-%d %H:%M:%S'):

# Ymd: 2017-04-13 00:00:00
# format:
# - '%Y-%m-%d %H:%M:%S'
# - '%Y-%m-%d'
# - '%Y%m%d'
# return 1494474258

return time.mktime(time.strptime(Ymd, format))

@classmethod
def get_timestamp(cls, now_flag=1, days=0):
# now_flag: 是否获取当前时间戳,精确到秒
# days: 某天开始时间戳
if now_flag == 1:
return time.time()
else:
today = datetime.date.today()
# 今天-num天时间戳
TimeStamp = today + datetime.timedelta(days=days)
return TimeStamp

def _get_charyype(cls, character):
# Input:
# unicode of a character
# Return:
# 0-number, 1-alpha, 2-Chinese, 3-others
char = character
if len(char) != 1:
return -1
if char in '1234567890':
return 0 # number
if (char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z'):
return 1
if re.match(u'[\u4E00-\u9FA5]', char):
return 2
return 3

@classmethod
def get_char_typenum(cls, text):
# Input:
# unicode of text
# Return:
# [#number, #alpha, #Chinese, #Others]
fNumOfNum, fNumOfAlpha, fNumOfChn, fNumOfOther = 0, 0, 0, 0
for char in text:
charType = cls._get_charyype(char)
if 0 == charType:
fNumOfNum += 1
elif 1 == charType:
fNumOfAlpha += 1
elif 2 == charType:
fNumOfChn += 1
elif 3 == charType:
fNumOfOther += 1
return fNumOfNum, fNumOfAlpha, fNumOfChn, fNumOfOther

@classmethod
def is_chinese(cls, uchar):
if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
return True
else:
return False

@classmethod
def is_alphabet(cls, uchar):
if (uchar >= u'\u0041' and uchar <= u'\u005a') or \
(uchar >= u'\u0061' and uchar <= u'\u007a'):
return True
else:
return False

@classmethod
def is_number(cls, uchar):
if uchar >= u'\u0030' and uchar <= u'\u0039':
return True
else:
return False

@classmethod
def q2b(cls, uchar):
"""全角转半角"""
inside_code = ord(uchar)
if inside_code == 0x3000:
inside_code = 0x0020
else:
inside_code -= 0xfee0
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
return uchar
return unichr(inside_code)

@classmethod
def str_q2b(cls, sequence):
"""把字符串全角转半角"""
sequence = sequence.decode('utf-8')
return "".join([cls.q2b(uchar) for uchar in sequence]).encode('utf-8').strip()

@classmethod
def str_q2b_u2l(cls, sequence):
"""格式化字符串,完成全角转半角,大写转小写的工作"""
return cls.str_q2b(sequence).lower().strip()