Skip to content

use []string instead of chan string to optimize performance #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tokenizers/jieba.beleve/
2 changes: 1 addition & 1 deletion analyse/tag_extracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (t *TagExtracter) LoadStopWords(fileName string) error {
func (t *TagExtracter) ExtractTags(sentence string, topK int) (tags Segments) {
freqMap := make(map[string]float64)

for w := range t.seg.Cut(sentence, true) {
for _, w := range t.seg.Cut(sentence, true) {
w = strings.TrimSpace(w)
if utf8.RuneCountInString(w) < 2 {
continue
Expand Down
136 changes: 67 additions & 69 deletions finalseg/finalseg.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,88 +10,86 @@ var (
reSkip = regexp.MustCompile(`(\d+\.\d+|[a-zA-Z0-9]+)`)
)

// cutHan splits sentence into pieces using the per-rune tags returned by
// viterbi: a 'B' tag records the index where a piece begins, an 'E' tag emits
// the runes from that index through the current rune, and an 'S' tag emits
// the current rune on its own. Any runes left after the last 'E' or 'S' tag
// are emitted together as one final piece.
//
// NOTE(review): this hunk interleaves two versions of the function, one that
// sends pieces on a chan string from a goroutine and one that appends them to
// a []string. No +/- markers are visible in this view, so which version a
// line belongs to has to be read from the statement itself.
func cutHan(sentence string) chan string {
result := make(chan string)
go func() {
runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := posList[i]
switch pos {
case 'B':
begin = i
case 'E':
result <- string(runes[begin : i+1])
next = i + 1
case 'S':
result <- string(char)
next = i + 1
}
}
if next < len(runes) {
result <- string(runes[next:])
func cutHan(sentence string) []string {
result := make([]string, 0, 10)

runes := []rune(sentence)
_, posList := viterbi(runes, []byte{'B', 'M', 'E', 'S'})
begin, next := 0, 0
for i, char := range runes {
pos := posList[i]
switch pos {
case 'B':
begin = i
case 'E':
result = append(result, string(runes[begin:i+1]))
next = i + 1
case 'S':
result = append(result, string(char))
next = i + 1
}
close(result)
}()
}
if next < len(runes) {
result = append(result, string(runes[next:]))
}

return result
}

// Cut cuts sentence into words using Hidden Markov Model with Viterbi
// algorithm. It is used by Jiebago for unknown words.
//
// The input is consumed from the front. A reHan match at the start is
// segmented by cutHan; a reSkip match at the start (a decimal number or a run
// of ASCII letters and digits) is emitted whole; otherwise the text before
// whichever pattern matches first is emitted as one piece. When neither
// pattern matches, the remaining text is emitted and the loop ends.
//
// NOTE(review): this hunk interleaves two versions of the function, one that
// sends pieces on a chan string from a goroutine and one that appends them to
// a []string. No +/- markers are visible in this view, so which version a
// line belongs to has to be read from the statement itself.
func Cut(sentence string) chan string {
result := make(chan string)
func Cut(sentence string) []string {
result := make([]string, 0, 10)
s := sentence
var hans string
var hanLoc []int
var nonhanLoc []int
go func() {
for {
hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil {
if len(s) == 0 {
break
}
} else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:]
for han := range cutHan(hans) {
result <- han
}
continue

for {
hanLoc = reHan.FindStringIndex(s)
if hanLoc == nil {
if len(s) == 0 {
break
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result <- nonhans
continue
}
} else if hanLoc[0] == 0 {
hans = s[hanLoc[0]:hanLoc[1]]
s = s[hanLoc[1]:]
for _, han := range cutHan(hans) {
result = append(result, han)
}
var loc []int
if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result <- s
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
continue
}
nonhanLoc = reSkip.FindStringIndex(s)
if nonhanLoc == nil {
if len(s) == 0 {
break
}
} else if nonhanLoc[0] == 0 {
nonhans := s[nonhanLoc[0]:nonhanLoc[1]]
s = s[nonhanLoc[1]:]
if nonhans != "" {
result = append(result, nonhans)
continue
}
result <- s[:loc[0]]
s = s[loc[0]:]
}
close(result)
}()
var loc []int
if hanLoc == nil && nonhanLoc == nil {
if len(s) > 0 {
result = append(result, s)
break
}
} else if hanLoc == nil {
loc = nonhanLoc
} else if nonhanLoc == nil {
loc = hanLoc
} else if hanLoc[0] < nonhanLoc[0] {
loc = hanLoc
} else {
loc = nonhanLoc
}
result = append(result, s[:loc[0]])
s = s[loc[0]:]
}

return result
}
Loading