-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
87 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
.ACCESS_TOKEN
*.txt
suggestions.json
training_data/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
IMAGE_NAME := peaceiris/actions-suggest-related-links-models:latest

# Build the model-training Docker image.
.PHONY: build
build:
	# docker-compose build --build-arg NODE_VERSION=$(NODE_VERSION)
	# NOTE: docker build requires a build-context path; the original command
	# omitted it and fails with "requires exactly 1 argument".
	docker build -t $(IMAGE_NAME) .

# Open an interactive shell in the image with the repo mounted at /src.
.PHONY: run
run:
	# docker-compose run --rm dev bash
	docker run --rm -i -t -v $(PWD):/src $(IMAGE_NAME) bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<!-- START doctoc generated TOC please keep comment here to allow auto update --> | ||
<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | ||
**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | ||
|
||
- [Models](#models) | ||
|
||
<!-- END doctoc generated TOC please keep comment here to allow auto update --> | ||
|
||
## Models | ||
|
||
```sh | ||
make build | ||
make run | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,67 @@ | ||
import argparse | ||
import fasttext | ||
import json | ||
import numpy as np | ||
|
||
# Command-line interface. NOTE: the flattened diff registered the "-d" short
# flag twice (--document and --documents), which makes argparse raise
# ArgumentError at startup; only the new-version --documents flag is kept.
parser = argparse.ArgumentParser()
parser.add_argument("-model", "--model_name", type=str, default="model.bin", help="outputted model name")
parser.add_argument("-d", "--documents", type=str, default='../training_data/training-data.json', help="path to documents")
parser.add_argument("-train", "--train_document", type=str, default='./train.txt', help="path to train document")
parser.add_argument("-test", "--test_document", type=str, default='./test.txt', help="path to test document")
args = parser.parse_args()
|
||
with open(args.document, "r") as f: | ||
terms = f.readlines() | ||
def cos_sim(v1, v2):
    """Cosine similarity between vectors *v1* and *v2*.

    Returns nan/inf if either vector has zero norm (unchanged from the
    original behavior).
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denom
|
||
model = fasttext.train_supervised(input=terms) | ||
model.save_model(args.model_name) | ||
# Pipeline: build a fastText-labeled training file from the issues JSON,
# train a supervised model, rank every training issue by cosine similarity
# to the test document, and write the ranked suggestions to suggestions.json.

# load train data
# NOTE: the original leaked the file handle (open() without close);
# `with` closes it deterministically.
with open(args.documents, "r") as f:
    issues_load = json.load(f)

# load test data (one line per entry)
with open(args.test_document) as f:
    test = f.readlines()

# add a fastText "__label__<issue-number>" prefix to each issue body.
# Pull requests share the issues API; keep only real issues, whose
# html_url second-to-last path segment is 'issues'.
terms = ''
for issue in issues_load:
    if issue['html_url'].split('/')[-2] == 'issues':
        terms += f"__label__{issue['number']} {issue['body']}\n"

with open(args.train_document, 'w') as f:
    f.write(terms)

# train model on the labeled file just written
with open(args.train_document) as f:
    trains = f.readlines()
model = fasttext.train_supervised(input=args.train_document)

# test model: mean word vector of the test document
test_word_vector = np.mean([model[x] for word in test for x in word.split()], axis=0)

# score each training line against the test vector
results = []
for train in trains:
    tokens = train.split()  # tokens[0] is the __label__ tag; hoisted (was split twice)
    # tokens are whitespace-free, so the original's inner word.split() was a no-op
    train_word_vector = np.mean([model[x] for x in tokens[1:]], axis=0)
    results.append({
        'probability': cos_sim(train_word_vector, test_word_vector),
        'label': tokens[0],
    })

# highest similarity first
results.sort(key=lambda r: r['probability'], reverse=True)

# map each ranked label back to its issue metadata
suggestions = []
for result in results:
    for issue in issues_load:
        if issue['number'] == int(result['label'].split('__')[-1]):
            suggestions.append({
                'html_url': issue['html_url'],
                'title': issue['title'],
                'number': int(issue['html_url'].split('/')[-1]),
                'probability': float(result['probability']),
            })

suggestions = json.dumps(suggestions, indent=4)
with open('suggestions.json', 'w') as f:
    f.write(suggestions)

print(suggestions)