Skip to content
This repository was archived by the owner on Jan 14, 2020. It is now read-only.

Commit aa862f4

Browse files
Add 7.1. To collect all current open issues.
Add 7.2. Calculate cosine similarity between new issues and users.
1 parent 17fba61 commit aa862f4

File tree

3 files changed

+72902
-0
lines changed

3 files changed

+72902
-0
lines changed

python37/7.1. Get_New_Issues.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import urllib3
2+
import requests
3+
import Basic_Functions as bfs
4+
5+
# Silence urllib3's InsecureRequestWarning: the GitHub requests in this script
# are sent with verify=False (TLS certificate verification disabled).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6+
7+
# GitHub API oAuth
# WARNING(review): these credentials are committed to source control and
# should be treated as compromised -- rotate them. Replacement values can be
# supplied through the GITHUB_CLIENT_ID / GITHUB_CLIENT_SECRET environment
# variables; the literals are kept only as a fallback so existing runs keep
# working unchanged.
import os

CLIENT_ID = os.environ.get("GITHUB_CLIENT_ID", "6aca4b66775c629cbafd")
CLIENT_SECRET = os.environ.get(
    "GITHUB_CLIENT_SECRET", "2349a0a9266e81f9f4d7df62ca49a98ca357b20c")

# Repository whose issues are collected (owner/name on GitHub).
OWNER = "symfony"
REPO = "symfony"
# Label names handed to getIssueData() as ``required_labels``.
FIRST_TIMER_LABELS = ["Easy Pick"]
14+
15+
16+
# ==============Functions==============
17+
def _clean_issue_text(value):
    """Return *value* as one line of text with double quotes replaced by single quotes."""
    if value is None:
        # A JSON null (e.g. an issue without a description) becomes empty text
        # instead of the literal string "None".
        return ""
    return str(value).replace("\"", "'").replace('\n', "").replace('\r', "")


def getIssueData(owner="symfony", repo="symfony", required_labels=None):
    """Download all open issues of a GitHub repository.

    Pages through ``/repos/{owner}/{repo}/issues?state=open`` (100 issues per
    page) until an empty page is returned.

    Args:
        owner: Account that owns the repository.
        repo: Repository name.
        required_labels: Accepted for interface compatibility but currently
            NOT applied -- every issue returned by the API is recorded.

    Returns:
        dict mapping each issue's ``number`` to the issue object returned by
        the API, with ``title`` and ``body`` normalised to single-line text
        (double quotes replaced by single quotes, CR/LF removed).

    Raises:
        RuntimeError: if the API answers with something other than a JSON
            list (for example an error object).
    """
    dataset = {}
    state = "open"
    page = 0

    while True:
        page += 1
        # Example Page: https://api.github.com/repos/symfony/symfony/issues?state=open&page=1
        issues_url = "https://api.github.com/repos/{}/{}/issues?state={}&page={}&per_page=100".format(
            owner, repo, state, page)
        # The credentials are no longer part of the URL, so logging it does
        # not leak the client secret.
        print(issues_url)

        # OAuth app credentials are sent via HTTP basic auth; GitHub has
        # deprecated passing client_id/client_secret as query parameters.
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, but it should be removed if the
        # environment allows it.
        issues = requests.get(
            issues_url, auth=(CLIENT_ID, CLIENT_SECRET), verify=False).json()

        if not isinstance(issues, list):
            # Iterating a non-list payload as if it were issues would fail
            # later with an opaque TypeError; fail here with the payload.
            raise RuntimeError(
                "Unexpected response from {}: {!r}".format(issues_url, issues))

        # An empty page means every open issue has been read.
        if not issues:
            break

        for issue in issues:
            issue["title"] = _clean_issue_text(issue["title"])
            # Clean the body itself (it used to be overwritten with the title).
            issue["body"] = _clean_issue_text(issue.get("body"))
            dataset[issue["number"]] = issue

    return dataset
77+
78+
79+
# ==============Main==============
80+
# Collect the open issues of the configured repository.
dataset_master = getIssueData(
    owner=OWNER,
    repo=REPO,
    required_labels=FIRST_TIMER_LABELS,
)

# Persist them through the project's JSON helper.
bfs.writeJsonFile(
    data=dataset_master,
    name="all_open_issues_{}".format(REPO),
    folder="data/open_issues",
)

# Report how many issues were collected.
print(len(dataset_master))
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import Basic_Functions as bfs
2+
from sklearn.feature_extraction.text import TfidfVectorizer
3+
from sklearn.metrics.pairwise import cosine_similarity
4+
import math
5+
6+
# Repository whose stored data is read below.
REPO = "symfony"

# Key into ``users`` for whom a recommendation is produced.
TARGET_USER = "248818"

# Open issues of the repository, read through the project's JSON helper.
issues = bfs.readJsonFile(
    name="all_open_issues_{}".format(REPO),
    folder="data/open_issues",
)
# Per-user text, read through the same helper.
users = bfs.readJsonFile(
    name="processed_user_text_{}".format(REPO),
    folder="data/user_text",
)
12+
13+
14+
def recommendation(user="248818"):
    """Pick the open issue whose text is most similar to a user's text.

    The user's text fields are concatenated into one document. For each entry
    of the module-level ``issues`` mapping, a TF-IDF model is fitted on just
    that user document and the issue's "title body" text, and the cosine
    similarity between the two is taken as the issue's score. The user
    document and every score are printed.

    Args:
        user: Key into the module-level ``users`` mapping.

    Returns:
        dict with ``"number"`` (key of the best-scoring issue, ``""`` when
        ``issues`` is empty) and ``"score"`` (its similarity as a float).
    """
    profile = users[user]

    # Concatenate the profile fields in a fixed order, each stripped of CR/LF
    # and followed by a single space.
    field_names = ("commit_comments", "commits", "issue_comments", "pr_comments", "prs")
    user_content = "".join(
        profile[field].replace('\r', '').replace('\n', '') + " "
        for field in field_names
    )
    print(user_content)

    best = {"number": "", "score": 0.0}

    for issue_id, issue in issues.items():
        title = issue["title"].replace('\r', '').replace('\n', '')
        body = issue["body"].replace('\r', '').replace('\n', '')
        issue_text = "{} {}".format(title, body)

        # Row 0 is the user document, row 1 the issue document.
        matrix = TfidfVectorizer().fit_transform((user_content, issue_text))
        similarities = cosine_similarity(matrix[0:1], matrix).tolist()
        score = float(similarities[0][1])
        print(score)

        # The first issue is always accepted; later ones must score strictly higher.
        if best["number"] == "" or score > best["score"]:
            best["number"] = issue_id
            best["score"] = score

    return best
54+
55+
56+
# Run the recommendation for the configured user and report the winner.
result = recommendation(TARGET_USER)
print(
    "The best issue for user {} is issue No.{} (score: {})".format(
        TARGET_USER,
        result["number"],
        result["score"],
    )
)

0 commit comments

Comments
 (0)