-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
87 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
.ACCESS_TOKEN
*.txt
suggestions.json
training_data/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
IMAGE_NAME := peaceiris/actions-suggest-related-links-models:latest

# Build the model-training Docker image.
.PHONY: build
build:
	# docker-compose build --build-arg NODE_VERSION=$(NODE_VERSION)
	# NOTE: docker build requires a build-context path; the original command
	# omitted it and fails with "requires exactly 1 argument".
	docker build -t $(IMAGE_NAME) .

# Open an interactive shell in the image with the repo mounted at /src.
.PHONY: run
run:
	# docker-compose run --rm dev bash
	docker run --rm -i -t -v $(PWD):/src $(IMAGE_NAME) bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<!-- START doctoc generated TOC please keep comment here to allow auto update --> | ||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | ||
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | ||
|
||
- [Models](#models) | ||
|
||
<!-- END doctoc generated TOC please keep comment here to allow auto update --> | ||
|
||
## Models | ||
|
||
```sh | ||
make build | ||
make run | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,67 @@ | ||
import argparse | ||
import fasttext | ||
import json | ||
import numpy as np | ||
|
||
# Command-line interface. NOTE: the flattened diff registered the "-d" short
# flag twice (--document and --documents), which makes argparse raise
# ArgumentError at startup; only the new-version --documents flag is kept.
parser = argparse.ArgumentParser()
parser.add_argument("-model", "--model_name", type=str, default="model.bin", help="outputted model name")
parser.add_argument("-d", "--documents", type=str, default='../training_data/training-data.json', help="path to documents")
parser.add_argument("-train", "--train_document", type=str, default='./train.txt', help="path to train document")
parser.add_argument("-test", "--test_document", type=str, default='./test.txt', help="path to test document")
args = parser.parse_args()
|
||
with open(args.document, "r") as f: | ||
terms = f.readlines() | ||
def cos_sim(v1, v2):
    """Cosine similarity between vectors *v1* and *v2*.

    Returns nan/inf if either vector has zero norm (unchanged from the
    original behavior).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denom
|
||
model = fasttext.train_supervised(input=terms) | ||
model.save_model(args.model_name) | ||
# Pipeline: build a fastText-labeled training file from the issues JSON,
# train a supervised model, rank every training issue by cosine similarity
# to the test document, and write the ranked suggestions to suggestions.json.

# load train data
# NOTE: the original leaked the file handle (open() without close);
# `with` closes it deterministically.
with open(args.documents, "r") as f:
    issues_load = json.load(f)

# load test data (one line per entry)
with open(args.test_document) as f:
    test = f.readlines()

# add a fastText "__label__<issue-number>" prefix to each issue body.
# Pull requests share the issues API; keep only real issues, whose
# html_url second-to-last path segment is 'issues'.
terms = ''
for issue in issues_load:
    if issue['html_url'].split('/')[-2] == 'issues':
        terms += f"__label__{issue['number']} {issue['body']}\n"

with open(args.train_document, 'w') as f:
    f.write(terms)

# train model on the labeled file just written
with open(args.train_document) as f:
    trains = f.readlines()
model = fasttext.train_supervised(input=args.train_document)

# test model: mean word vector of the test document
test_word_vector = np.mean([model[x] for word in test for x in word.split()], axis=0)

# score each training line against the test vector
results = []
for train in trains:
    tokens = train.split()  # tokens[0] is the __label__ tag; hoisted (was split twice)
    # tokens are whitespace-free, so the original's inner word.split() was a no-op
    train_word_vector = np.mean([model[x] for x in tokens[1:]], axis=0)
    results.append({
        'probability': cos_sim(train_word_vector, test_word_vector),
        'label': tokens[0],
    })

# highest similarity first
results.sort(key=lambda r: r['probability'], reverse=True)

# map each ranked label back to its issue metadata
suggestions = []
for result in results:
    for issue in issues_load:
        if issue['number'] == int(result['label'].split('__')[-1]):
            suggestions.append({
                'html_url': issue['html_url'],
                'title': issue['title'],
                'number': int(issue['html_url'].split('/')[-1]),
                'probability': float(result['probability']),
            })

suggestions = json.dumps(suggestions, indent=4)
with open('suggestions.json', 'w') as f:
    f.write(suggestions)

print(suggestions)