removed unused files.

wuyunxiangwyx · Aug 21, 2016 · 70aae39 · 70aae39
1 parent 455b76b
commit 70aae39
Show file tree

Hide file tree

Showing 18 changed files with 1,459 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1 +1,103 @@
-# chatbot-backend
+# Chatbot
+
+Chatbot 可以將對話向量化，基於與規則庫間的主題相似度匹配，來依照使用者可能的需求提供答覆。
+
+## 匹配示例
+
+更多的樣例可以參照 `example/output.txt`
+
+    Case# 明天早上叫我起床。
+    ------------------
+    0.4521	鬧鐘      起床
+    0.3904	天氣      早上
+    0.3067	住宿      起床
+    0.1747	病症      起床
+    0.1580	購買      早上
+    0.1270	股票      早上
+    0.1096	觀光      早上
+
+    Case# 明天上海會不會下雨？
+    ------------------
+    0.5665	天氣      下雨
+    0.3918	鬧鐘      下雨
+    0.1807	病症      下雨
+    0.1362	住宿      下雨
+    0.0000	股票
+    0.0000	觀光
+    0.0000	購買
+
+## 環境需求
+
+* 安裝 python3 開發環境
+* 安裝 [gensim – Topic Modelling in Python](https://github.com/RaRe-Technologies/gensim)
+* 安裝 [jieba 结巴中文分词](https://github.com/fxsjy/jieba)
+* 準備已訓練好的中文詞向量，並依其所在路徑調整 `Console` 類別的初始化參數 `model_path`：
+```
+import console
+c = console.Console(model_path='your_model')
+```
+
+## 使用方式
+
+### 聊天機器人
+
+執行 `python3 chatbot.py` 即可觀看演示。
+
+### 計算匹配度
+
+```
+import console
+c = console.Console(model_path='your_model')
+speech = input('Input a sentence:')
+res,path = c.rule_match(speech) #取得已照相似度排序的規則
+c.write_output(speech,res,path)
+```
+
+## 規則格式
+
+規則採用 json 格式，樣板規則放置於 `RuleMatcher/rule` 中：
+
+```
+    {
+        "domain": 代表這個規則的抽象概念,
+        "response": [
+			對應到該規則後，
+            機器人所會給予的回覆，
+            機器人會隨機抽取一條 response
+        ],
+        "concepts": [
+            該規則的可能表示方式
+        ],
+        "children": [該規則的子規則，如購買 -> 購買飲料,購買衣服......]
+    }
+```
+
+### Example
+
+```
+    {
+        "domain": "購買",
+        "response": [
+        	"正在將您導向購物模組"
+        ],
+        "concepts": [
+            "購買","購物","訂購"
+        ],
+        "children": [
+            "購買生活用品",
+            "購買家電",
+            "購買食物",
+            "購買飲料",
+            "購買鞋子",
+            "購買衣服",
+            "購買電腦產品"
+        ]
+    },
+```
+
+## 開發日誌
+
+## TODO
+* 追加規則案例
+* 實作平台 adapter
+
diff --git a/RuleMatcher/__pycache__/__init__.cpython-35.pyc b/RuleMatcher/__pycache__/__init__.cpython-35.pyc
diff --git a/RuleMatcher/__pycache__/rulebase.cpython-35.pyc b/RuleMatcher/__pycache__/rulebase.cpython-35.pyc
diff --git a/RuleMatcher/rule/rule.json b/RuleMatcher/rule/rule.json
@@ -56,7 +56,7 @@
         "domain": "購買",
         "response": [],
         "concepts": [
-            "購買"
+            "購買","購物","訂購"
         ],
         "children": [
             "購買生活用品",

diff --git a/__pycache__/console.cpython-35.pyc b/__pycache__/console.cpython-35.pyc
diff --git a/demo.py b/demo.py
@@ -0,0 +1,5 @@
+import console
+c = console.Console()
+speech = input('Input a sentence:')
+res,path = c.rule_match(speech)
+c.write_output(speech,res,path)
diff --git a/medicine/__init__.py b/medicine/__init__.py
diff --git a/medicine/combineData.py b/medicine/combineData.py
@@ -0,0 +1,99 @@
+#coding=utf-8
+#author: Justin Yang
+
+import requests
+from bs4 import BeautifulSoup
+import lxml
+import time
+import sys
+import os
+
+def main():
+	symptomSet = loadDataWithSet("symptoms.txt")
+	# result = matchDiseaseWithSymptoms(symptomSet, "rawpage")
+	result = getDCPair("data")
+	writeDDPair2file('result/dcpair',result)
+	# writeDSPairs2file('result/without_suffix_6', result)
+
+def loadDataWithSet(path):
+
+	elementSet = set()
+	with open(path, 'r') as file:
+		for line in file:
+			element = line.strip('\n')
+			elementSet.add(element)
+	return elementSet
+
+def matchDiseaseWithSymptoms(sym,dir):
+
+	'''從原始頁面中抽取疾病與症狀的配對集
+	'''
+	dic = {}
+
+	for filename in os.listdir("./"+dir):
+		# IGNORE .DS_STORE
+		if not filename.startswith('.'):
+			soup = BeautifulSoup(open(dir+"/"+filename),"lxml")
+
+			# clean the url in 症状查詢專題 like http://cht.a-hospital.com/w/%E6%96%9C%E8%A7%86
+			for nonce in soup.find_all('table'):
+				nonce.clear()
+			for candidate in soup.select("p > a"):
+				if candidate.text in sym:
+					#d_xxx.txt
+					disease_mata = filename.split(".")[0]
+					disease = disease_mata.split("_")[1]
+					if candidate.text in dic:
+						dic[candidate.text].add(disease)
+					else:
+						dic[candidate.text] = set()
+						dic[candidate.text].add(disease)
+	return dic
+
+def getDCPair(dir):
+
+	'''取得疾病的對應科別，資料來自data，格式為:'symptom:disease1,disease2......'
+	'''
+
+	dic = {}
+	for department in os.listdir("./"+dir):
+		# IGNORE .DS_STORE
+		if not department.startswith('.'):
+			with open(dir+"/"+department) as input:
+				diseaseSet = set()
+				for line in input:
+					line = line.strip('\n')
+					line = line.strip(',')
+					diseaseList = line.split(':')[1].split(',')
+					for disease in diseaseList:
+						if disease not in diseaseSet:
+							diseaseSet.add(disease)
+				dic[department.split('.')[0]] = diseaseSet
+	return dic
+
+def writeDSPairs2file(filename,dic):
+
+	'''輸出症狀集，急性XXX與慢性XXX統一視為XXX
+	'''
+	with open(filename,'w',encoding='utf-8') as res:
+		for symptom,diseaseSet in dic.items():
+			res.write(symptom+":")
+			for disease in diseaseSet:
+				if disease != symptom and disease != "口臭":
+					if "急性" not in disease and "慢性" not in disease:
+						res.write(disease+",")
+			res.write('\n')
+
+def writeDDPair2file(filename,dic):
+
+	'''輸出疾病與部門的配對列表
+	'''
+	with open(filename,'w',encoding='utf-8') as output:
+		for department,diseaseSet in dic.items():
+			output.write(department+":")
+			for disease in diseaseSet:
+				output.write(disease+",")
+			output.write('\n')
+
+if __name__=="__main__":
+	main()