Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add: add idf embed and dict path custom support, update godoc #166

Merged
merged 2 commits into from
Jan 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions dict_1.16.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (seg *Segmenter) loadZhST(d string) (begin int, err error) {
return
}

// LoadDictEmbed load dictionary by embed file
// LoadDictEmbed load the dictionary by embed file
func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down Expand Up @@ -87,7 +87,7 @@ func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
return seg.loadZh()
}

// LoadDictStr load dictionary from string
// LoadDictStr load the dictionary from string
func (seg *Segmenter) LoadDictStr(dict string) error {
if seg.Dict == nil {
seg.Dict = NewDict()
Expand Down Expand Up @@ -128,7 +128,7 @@ func (seg *Segmenter) LoadDictStr(dict string) error {
return nil
}

// LoadStopEmbed load stop dictionary from embed file
// LoadStopEmbed load the stop dictionary from embed file
func (seg *Segmenter) LoadStopEmbed(dict ...string) (err error) {
if len(dict) > 0 {
d := dict[0]
Expand Down
3 changes: 3 additions & 0 deletions dict_embed.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ var (
zhT string
//go:embed data/dict/zh/s_1.txt
zhS string

//go:embed data/dict/zh/idf.txt
zhIdf string
)

//go:embed data/dict/zh/stop_tokens.txt
Expand Down
13 changes: 9 additions & 4 deletions dict_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ func (seg *Segmenter) Init() {
seg.TextFreq = "2.0"
}

// init the model of hmm cut
if !seg.NotLoadHMM {
seg.LoadModel()
}
Expand Down Expand Up @@ -154,7 +155,7 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath string
// load bool
)
Expand Down Expand Up @@ -216,15 +217,19 @@ func (seg *Segmenter) LoadDict(files ...string) error {
}

// GetCurrentFilePath get the current file path
func GetCurrentFilePath() string {
func (seg *Segmenter) GetCurrentFilePath() string {
if seg.DictPath != "" {
return seg.DictPath
}

_, filePath, _, _ := runtime.Caller(1)
return filePath
}

// GetIdfPath get the idf path
func GetIdfPath(files ...string) []string {
func (seg *Segmenter) GetIdfPath(files ...string) []string {
var (
dictDir = path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir = path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
dictPath = path.Join(dictDir, "dict/zh/idf.txt")
)

Expand Down
2 changes: 1 addition & 1 deletion hmm/idf/idf.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func (i *Idf) AddToken(text string, freq float64, pos ...string) error {
// LoadDict load the idf dictionary
func (i *Idf) LoadDict(files ...string) error {
if len(files) <= 0 {
files = gse.GetIdfPath(files...)
files = i.seg.GetIdfPath(files...)
}

return i.seg.LoadDict(files...)
Expand Down
6 changes: 6 additions & 0 deletions hmm/idf/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ func (t *TagExtracter) LoadIdf(fileName ...string) error {
return t.Idf.LoadDict(fileName...)
}

// LoadIdfStr loads and creates a new Idf dictionary from the given string.
func (t *TagExtracter) LoadIdfStr(str string) error {
t.Idf = NewIdf()
return t.Idf.seg.LoadDictStr(str)
}

// LoadStopWords load and create a new StopWord dictionary from the file.
func (t *TagExtracter) LoadStopWords(fileName ...string) error {
t.stopWord = NewStopWord()
Expand Down
32 changes: 11 additions & 21 deletions seg_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,15 @@ import (
"fmt"
)

// ToString segments to string 输出分词结果为字符串
// ToString converts a segments slice to a string and returns it
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
// two output modes:
//
// 普通模式(searchMode=false)输出一个分词 "山达尔星联邦共和国/ns "
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "山达尔星/nz 联邦/n 共和/nz 国/n 共和国/ns 联邦共和国/nt 山达尔星联邦共和国/ns "
// normal mode (searchMode=false)
// search mode(searchMode=true)
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见 Token 结构体的注释。
// default searchMode=false
// search mode is mainly used for search engines, and will output more results
func ToString(segs []Segment, searchMode ...bool) (output string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -76,16 +75,7 @@ func tokenToBytes(token *Token) (output []byte) {
return
}

// ToSlice segments to slice 输出分词结果到一个字符串 slice
//
// 有两种输出模式,以 "山达尔星联邦共和国" 为例
//
// 普通模式(searchMode=false)输出一个分词"[山达尔星联邦共和国]"
// 搜索模式(searchMode=true) 输出普通模式的再细致切分:
// "[山达尔星 联邦 共和 国 共和国 联邦共和国 山达尔星联邦共和国]"
//
// 默认 searchMode=false
// 搜索模式主要用于给搜索引擎提供尽可能多的关键字,详情请见Token结构体的注释。
// ToSlice converts segments to a slice and returns a string slice
func ToSlice(segs []Segment, searchMode ...bool) (output []string) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -121,7 +111,7 @@ func tokenToSlice(token *Token) (output []string) {
return
}

// ToPos segments to SegPos
// ToPos converts a segments slice to []SegPos
func ToPos(segs []Segment, searchMode ...bool) (output []SegPos) {
var mode bool
if len(searchMode) > 0 {
Expand Down Expand Up @@ -168,20 +158,20 @@ func tokenToPos(token *Token) (output []SegPos) {
return
}

// 将多个字元拼接一个字符串输出
// textToString concatenates multiple []Text elements into one string output
func textToString(text []Text) (output string) {
for _, word := range text {
output += string(word)
}
return
}

// 将多个字元拼接一个字符串输出
// textSliceToString joins a []Text slice and returns the resulting string
func textSliceToString(text []Text) string {
return Join(text)
}

// 返回多个字元的字节总长度
// return the total byte length of the text slice
func textSliceByteLen(text []Text) (length int) {
for _, word := range text {
length += len(word)
Expand Down
57 changes: 35 additions & 22 deletions segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ import (

// Segmenter define the segmenter structure
type Segmenter struct {
Dict *Dictionary
Load bool
DictSep string
Dict *Dictionary
Load bool
DictSep string
DictPath string

// NotLoadHMM option load the default hmm model config (Chinese char)
NotLoadHMM bool
Expand Down Expand Up @@ -57,7 +58,8 @@ type Segmenter struct {
StopWordMap map[string]bool
}

// jumper 该结构体用于记录 Viterbi 算法中某字元处的向前分词跳转信息
// jumper this structure is used to record the forward segmentation
// jump information at a character position in the Viterbi algorithm
type jumper struct {
minDistance float32
token *Token
Expand Down Expand Up @@ -87,7 +89,7 @@ func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
}

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
// specific case
// special cases
if len(bytes) == 0 {
// return []Segment{}
return nil
Expand All @@ -100,14 +102,16 @@ func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
// 搜索模式下该分词已无继续划分可能的情况
// The case where the division is no longer possible in the search mode
if searchMode && len(text) == 1 {
return nil
}

// jumpers 定义了每个字元处的向前跳转信息,
// 包括这个跳转对应的分词,
// 以及从文本段开始到该字元的最短路径值
// jumpers defines the forward jump information at each character,
// including the token corresponding to this jump,
// and the value of the shortest path from the start
// of the text segment to that character
//
jumpers := make([]jumper, len(text))

if seg.Dict == nil {
Expand All @@ -116,43 +120,49 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {

tokens := make([]*Token, seg.Dict.maxTokenLen)
for current := 0; current < len(text); current++ {
// 找到前一个字元处的最短路径,以便计算后续路径值
// find the shortest path of the previous token,
// to calculate the subsequent path values
var baseDistance float32
if current == 0 {
// 当本字元在文本首部时,基础距离应该是零
// When this character is at the beginning of the text,
// the base distance should be zero
baseDistance = 0
} else {
baseDistance = jumpers[current-1].minDistance
}

// 寻找所有以当前字元开头的分词
// find all the segments starting with this token
tx := text[current:minInt(current+seg.Dict.maxTokenLen, len(text))]
numTokens := seg.Dict.LookupTokens(tx, tokens)

// 对所有可能的分词,更新分词结束字元处的跳转信息
// Update the jump information at the end of the split word
// for all possible splits
for iToken := 0; iToken < numTokens; iToken++ {
location := current + len(tokens[iToken].text) - 1
if !searchMode || current != 0 || location != len(text)-1 {
updateJumper(&jumpers[location], baseDistance, tokens[iToken])
}
}

// 当前字元没有对应分词时补加一个伪分词
// Add a pseudo token if there is no corresponding token
// for the current character
if numTokens == 0 || len(tokens[0].text) > 1 {
updateJumper(&jumpers[current], baseDistance,
&Token{text: []Text{text[current]}, freq: 1, distance: 32, pos: "x"})
}
}

// 从后向前扫描第一遍得到需要添加的分词数目
// Scan the first pass from back to front
// to get the number of subwords to be added
numSeg := 0
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
numSeg++
index = location - 1
}

// 从后向前扫描第二遍添加分词到最终结果
// Scan from back to front for a second time
// to add the split to the final result
outputSegments := make([]Segment, numSeg)
for index := len(text) - 1; index >= 0; {
location := index - len(jumpers[index].token.text) + 1
Expand All @@ -161,7 +171,7 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
index = location - 1
}

// 计算各个分词的字节位置
// Calculate the byte position of each segment
bytePosition := 0
for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
outputSegments[iSeg].start = bytePosition
Expand All @@ -172,11 +182,14 @@ func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
return outputSegments
}

// updateJumper 更新跳转信息:
// 1. 当该位置从未被访问过时 (jumper.minDistance 为零的情况),或者
// 2. 当该位置的当前最短路径大于新的最短路径时
// updateJumper Update the jump information:
// 1. When the location has never been visited
// (the case where jumper.minDistance is zero), or
// 2. When the current shortest path at the location
// is greater than the new shortest path
//
// 将当前位置的最短路径值更新为 baseDistance 加上新分词的概率
// Update the shortest path value of the current location to baseDistance
// plus the distance of the new token
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
newDistance := baseDistance + token.distance
if jumper.minDistance == 0 || jumper.minDistance > newDistance {
Expand All @@ -202,7 +215,7 @@ func (seg *Segmenter) SplitTextToWords(text Text) []Text {
isNum := unicode.IsNumber(r) && !seg.Num
isAlpha := unicode.IsLetter(r) && !seg.Alpha
if size <= 2 && (isAlpha || isNum) {
// 当前是拉丁字母或数字(非中日韩文字)
// Currently is Latin alphabet or numbers (not in CJK)
if !inAlphanumeric {
alphanumericStart = current
inAlphanumeric = true
Expand Down
9 changes: 7 additions & 2 deletions segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ func TestToken(t *testing.T) {
}

func TestDictPaths(t *testing.T) {
var seg1 Segmenter
// seg.SkipLog = true
paths := DictPaths("./dictDir", "zh, jp")
tt.Expect(t, "3", len(paths))
Expand All @@ -238,9 +239,13 @@ func TestDictPaths(t *testing.T) {
tt.Expect(t, "3", len(paths))
tt.Equal(t, paths, paths1)

p := strings.ReplaceAll(GetCurrentFilePath(), "/segmenter_test.go", "") +
p := strings.ReplaceAll(seg1.GetCurrentFilePath(), "/segmenter_test.go", "") +
`/data/dict/zh/idf.txt`
tt.Equal(t, "["+p+"]", GetIdfPath([]string{}...))
tt.Equal(t, "["+p+"]", seg1.GetIdfPath([]string{}...))

seg1.DictPath = "testdata/zh"
tt.Equal(t, "testdata/zh", seg1.GetCurrentFilePath())
tt.Equal(t, "[testdata/data/dict/zh/idf.txt]", seg1.GetIdfPath([]string{}...))
}

func TestInAlphaNum(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (seg *Segmenter) LoadStop(files ...string) error {
seg.StopWordMap = make(map[string]bool)
}

dictDir := path.Join(path.Dir(GetCurrentFilePath()), "data")
dictDir := path.Join(path.Dir(seg.GetCurrentFilePath()), "data")
if len(files) <= 0 {
dictPath := path.Join(dictDir, "dict/zh/stop_word.txt")
files = append(files, dictPath)
Expand Down
Loading