-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathmarkov_speaking.py
117 lines (108 loc) · 4.66 KB
/
markov_speaking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# coding=utf8
# last edit date: 2016/10/19
# author: Forec
# LICENSE
# Copyright (c) 2015-2017, Forec <forec@bupt.edu.cn>
# Permission to use, copy, modify, and/or distribute this code for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
import jieba
import codecs
import random
import re
class Markov:
    """Second-order Markov chain text generator.

    Training reads a text file and records, for every pair of adjacent
    words, the words that were seen directly after that pair.  ``say``
    then walks those transitions at random to produce a sentence.

    Attributes:
        dic: maps a key pair ``"word1 word2"`` (joined by one space) to the
            list of words observed after it; duplicates are kept, so more
            frequent successors are proportionally more likely to be picked.
        Cap: key pairs that may start a generated sentence.
        dictLen: number of distinct key pairs in ``dic`` after training.
        mode: ``0`` for space-separated text split on ASCII punctuation;
            any other value tokenises each line with ``jieba``.
        coding: text encoding used to read the training file.
    """

    def __init__(self, filepath=None, mode=0, coding="utf8"):
        """Create a generator and, if ``filepath`` is given, train on it.

        Args:
            filepath: path of the training text, or None to start untrained.
            mode: tokenisation mode (see the class docstring).
            coding: encoding of the training file.
        """
        self.dictLen = 0
        self.Cap = []
        self.mode = mode
        self.coding = coding
        self.dic = {}
        if filepath is not None:
            self.train(filepath, mode, coding)

    def train(self, filepath='', mode=0, coding="utf8"):
        """Discard anything learned so far and learn from ``filepath``.

        Args:
            filepath: path of the training text.  None or '' only resets
                the generator to its untrained state.
            mode: tokenisation mode (see the class docstring).
            coding: encoding of the training file.
        """
        self.dic = {}
        self.Cap = []
        # Reset the cached size too, so that say() does not trust a stale
        # count left over from a previous training run.
        self.dictLen = 0
        self.mode = mode
        self.coding = coding
        if filepath is None or filepath == '':
            return
        # Raw string: the pattern text is unchanged, but the backslashes are
        # no longer (invalid) Python string escapes.
        eg_puncmark = re.compile(r'[\,\.\!\;\?\~\`\#\$\%\@\^&\*\(\)\]\[]')
        zh_puncmark = re.compile('[,。!;]')
        with codecs.open(filepath, "r", coding) as f:
            for line in f:
                line = re.sub('[\r\n]', "", line)
                if mode == 0:
                    # Each punctuation-delimited sentence is its own chain.
                    for sentence in eg_puncmark.split(line):
                        # A real list is required here: the words are
                        # measured with len() and indexed, which the lazy
                        # object returned by filter() on Python 3 does not
                        # support.
                        words = [w for w in sentence.split(" ") if w != '']
                        self._learn(words, True)
                else:
                    # Tokens of every sentence on the line are pooled into
                    # one chain; tokens shorter than two characters (single
                    # characters, punctuation) are dropped.
                    words = []
                    for sentence in zh_puncmark.split(line):
                        for word in jieba.cut(sentence, cut_all=False):
                            if len(word) >= 2:
                                words.append(word)
                    # Only the first pair of a line may start a sentence.
                    if len(words) > 2:
                        self.Cap.append(words[0] + " " + words[1])
                    self._learn(words, False)
        self.dictLen = len(self.dic)

    def _learn(self, words, mark_capitalized):
        """Record every (words[i], words[i+1]) -> words[i+2] transition.

        When ``mark_capitalized`` is true, key pairs whose first character
        is upper-case are also registered in ``Cap`` as sentence starters.
        """
        for i in range(len(words) - 2):
            keypair = words[i] + " " + words[i + 1]
            if mark_capitalized and keypair[0].isupper():
                self.Cap.append(keypair)
            self.dic.setdefault(keypair, []).append(words[i + 2])

    def getDic(self):
        """Return the transition table (the live dict, not a copy)."""
        return self.dic

    def say(self, length=10):
        """Generate, print and return one random sentence.

        Args:
            length: maximum number of words appended after the two starting
                words; generation stops earlier when the current pair has
                no recorded successor.

        Returns:
            The generated sentence (without the trailing "." that is printed
            in mode 0), or None when two or fewer key pairs have been
            learned, in which case only a notice is printed.
        """
        if self.dictLen <= 2:
            print("I feel tired and I need food to say something.")
            return None

        # Prefer a registered sentence starter; if training found none
        # (e.g. an all-lowercase corpus) start from any learned pair rather
        # than failing on an empty list.
        starters = self.Cap if self.Cap else list(self.dic)
        keypair = random.choice(starters)
        parts = keypair.split(" ")
        words = [parts[0], parts[1]]

        remaining = length
        while remaining > 0:
            followers = self.dic.get(keypair)
            if not followers:
                break
            words.append(random.choice(followers))
            keypair = words[-2] + " " + words[-1]
            remaining -= 1

        if self.mode == 0:
            sentence = " ".join(words)
            print(sentence + ".")
        else:
            # jieba-tokenised text is written without separators.
            sentence = "".join(words)
            print(sentence)
        return sentence