Skip to content

Commit 02a36c6

Browse files
update files
0 parents  commit 02a36c6

7 files changed

+14587
-0
lines changed

Dev2vec_data_load_clean.ipynb

Lines changed: 907 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Dev2vec
2+
This is a replication package for the article
3+
<a href="https://www.sciencedirect.com/science/article/abs/pii/S0950584923000721"><strong>Dev2vec: Representing Domain Expertise of Developers in an Embedding Space</strong></a>
4+
5+
The paper aims to encode the expertise of developers, learned from different sources of information, into embedding vectors.<br />
6+
These vectors are learned with a doc2vec model trained on three different sources of information: repository metadata, issue-resolving history, and API calls.<br />
7+
![](https://github.com/ferdorart/EmbeddingVectors/blob/main/detailed_diagram.jpg)
8+
9+
10+
We name these models: dev2vec:repos, dev2vec:Issues and dev2vec:APIs <br />
11+
12+
### Download dev2vec:
13+
You can download the two models dev2vec:repos and dev2vec:Issues from <a href="https://doi.org/10.5281/zenodo.7580313"><strong>here</strong></a> <br />
14+
### Load our trained model:
15+
16+
```
17+
model = Doc2Vec.load("dev2vec_repos")
18+
model = Doc2Vec.load("dev2vec_issues")
19+
```
20+
21+
The model that is used to generate embedding vectors for dev2vec:APIs is the pretrained model from the article <a href="https://ieeexplore.ieee.org/abstract/document/9401957?casa_token=G8DjJLSm2sQAAAAA:3h8AEP8d0XLzSgHaVkSal9k7AyQ1pfXt18uuCCeIyiCMEmEKqlkgR1xsaoJj-iJIbGVP-hbeRg"><strong>Representation of Developer Expertise in Open Source Software</strong></a> <br />
22+
23+
-------------------------------------------------------------------------------------------------------------------------------------------------
24+
## Citation
25+
<a href="https://arxiv.org/abs/2207.05132"><strong>Dev2vec: Representing Domain Expertise of Developers in an Embedding Space</strong></a>
26+
```
27+
@article{dakhel2022dev2vec,
28+
title={Dev2vec: Representing Domain Expertise of Developers in an Embedding Space},
29+
author={Dakhel, Arghavan Moradi and Desmarais, Michel C and Khomh, Foutse},
30+
journal={arXiv preprint arXiv:2207.05132},
31+
year={2022}
32+
}
33+
```
34+
35+

detailed_diagram.jpg

153 KB
Loading

git_Collect_APIs.ipynb

Lines changed: 12520 additions & 0 deletions
Large diffs are not rendered by default.

git_issue_data_collection.ipynb

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd\n",
10+
"import requests \n",
11+
"from github import Github\n",
12+
"from github import RateLimitExceededException\n",
13+
"from github import UnknownObjectException\n",
14+
"from datetime import datetime, timedelta\n",
15+
"import pickle\n",
16+
"import time\n",
17+
"import itertools\n",
18+
"import os\n",
19+
"import calendar"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 21,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"user_roles = pd.read_csv('..\\dev_name_users.csv')"
29+
]
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"metadata": {},
34+
"source": [
35+
"## Collect Issue dataset"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 20,
41+
"metadata": {},
42+
"outputs": [],
43+
"source": [
44+
"class GitIssueScrapper:\n",
45+
"\n",
46+
" def __init__(self):\n",
47+
" auth_git = #put your github authentication key here\n",
48+
" self.g = Github(auth_git)\n",
49+
"\n",
50+
" def issue_info_extracter(self, users):\n",
51+
"\n",
52+
" #users_iter = iter(users)\n",
53+
"\n",
54+
" for u in users:\n",
55+
" try:\n",
56+
" user = self.g.get_user(u)\n",
57+
" try:\n",
58+
" name = user.name\n",
59+
"\n",
60+
" except:\n",
61+
" name = \"none\"\n",
62+
" \n",
63+
" try:\n",
64+
" u_id = user.login\n",
65+
"\n",
66+
" except:\n",
67+
" u_id = \"none\"\n",
68+
"\n",
69+
"\n",
70+
" try:\n",
71+
" reps = user.get_repos()\n",
72+
" except:\n",
73+
" reps = \"none\"\n",
74+
"\n",
75+
" all_issue_repo={}\n",
76+
" counter=0\n",
77+
" for repository in reps:\n",
78+
" full_issue={}\n",
79+
" cnt=0\n",
80+
" try:\n",
81+
" repo_parent = repository.parent\n",
82+
" except:\n",
83+
" repo_parent = \"None\"\n",
84+
"\n",
85+
" \n",
86+
"\n",
87+
" try:\n",
88+
" if repo_parent:\n",
89+
" mention_issues=repository.parent.get_issues(mentioned=user,state='all')\n",
90+
" create_issues=repository.parent.get_issues(creator=user,state='all')\n",
91+
" assigne_issues=repository.parent.get_issues(assignee=user,state='all')\n",
92+
"\n",
93+
" else:\n",
94+
" mention_issues=repository.get_issues(mentioned=user,state='all')\n",
95+
" create_issues=repository.get_issues(creator=user,state='all')\n",
96+
" assigne_issues=repository.get_issues(assignee=user,state='all')\n",
97+
" except:\n",
98+
" issues = []\n",
99+
"\n",
100+
" for issue in mention_issues:\n",
101+
" full_issue[cnt] = {'title':issue.title,'body':issue.body}\n",
102+
" cnt += 1\n",
103+
" \n",
104+
" for issue in create_issues:\n",
105+
" full_issue[cnt] = {'title':issue.title,'body':issue.body}\n",
106+
" cnt += 1\n",
107+
" \n",
108+
" for issue in assigne_issues:\n",
109+
" full_issue[cnt] = {'title':issue.title,'body':issue.body}\n",
110+
" cnt += 1\n",
111+
"\n",
112+
" all_issue_repo[counter] = full_issue\n",
113+
" counter += 1\n",
114+
"\n",
115+
" \n",
116+
" \n",
117+
" \n",
118+
"\n",
119+
"\n",
120+
" \n",
121+
" data = {}\n",
122+
"\n",
123+
" data['name'] = name\n",
124+
" data['u_id'] = u_id\n",
125+
" data['issues'] = all_issue_repo\n",
126+
"\n",
127+
" \n",
128+
" \n",
129+
" pickle.dump(data, open(('../' + u_id + '.pickle'), 'wb'))\n",
130+
" \n",
131+
"\n",
132+
" print(u_id, \"-------------------> done\")\n",
133+
"\n",
134+
" except RateLimitExceededException:\n",
135+
" now = datetime.now()\n",
136+
" current_time = now.strftime(\"%H:%M:%S\")\n",
137+
" search_rate_limit = self.g.get_rate_limit().search\n",
138+
" print('current time:{}'.format(current_time))\n",
139+
" #time.sleep(sleep_time)\n",
140+
" time.sleep(3600)\n",
141+
" continue\n",
142+
" except UnknownObjectException:\n",
143+
" continue\n",
144+
" \n",
145+
" def get_issue_details(self):\n",
146+
" # users = self.git.search_users('', location=self.location, language=self.language)\n",
147+
" #users = self.git.search_users('', location='montreal')\n",
148+
" self.issue_info_extracter(user_roles) "
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": 21,
154+
"metadata": {},
155+
"outputs": [
156+
{
157+
"name": "stdout",
158+
"output_type": "stream",
159+
"text": [
160+
"milindhg -------------------> done\n",
161+
"behnammodi -------------------> done\n",
162+
"morganjunlin -------------------> done\n",
163+
"crisgon -------------------> done\n",
164+
"cswanghan -------------------> done\n",
165+
"nacarty -------------------> done\n",
166+
"highflyer910 -------------------> done\n",
167+
"chiragkyal -------------------> done\n",
168+
"abodesigner -------------------> done\n",
169+
"janikgar -------------------> done\n",
170+
"bdetweiler -------------------> done\n",
171+
"ronilaukkarinen -------------------> done\n",
172+
"liuwenzhuang -------------------> done\n",
173+
"sravi4701 -------------------> done\n",
174+
"joelloyd -------------------> done\n",
175+
"JoabMendes -------------------> done\n",
176+
"ronapelbaum -------------------> done\n",
177+
"stevetarver -------------------> done\n",
178+
"current time:09:36:33\n",
179+
"BlueMona -------------------> done\n",
180+
"iamtekeste -------------------> done\n",
181+
"alexpnt -------------------> done\n",
182+
"lumue -------------------> done\n",
183+
"bradleybossard -------------------> done\n",
184+
"SethClydesdale -------------------> done\n",
185+
"alicj -------------------> done\n",
186+
"yessky -------------------> done\n",
187+
"egig -------------------> done\n",
188+
"cwalsh -------------------> done\n",
189+
"yongsen -------------------> done\n",
190+
"Ritesh-Kumar-4946 -------------------> done\n",
191+
"current time:10:47:20\n",
192+
"PratikLunagaria -------------------> done\n",
193+
"naieem -------------------> done\n",
194+
"sanjogshrestha -------------------> done\n",
195+
"celis -------------------> done\n",
196+
"mesarikaya -------------------> done\n",
197+
"dnlsyfq -------------------> done\n",
198+
"DanielB3 -------------------> done\n",
199+
"janogale -------------------> done\n",
200+
"Rhadow -------------------> done\n",
201+
"current time:11:58:04\n",
202+
"juanignaciosl -------------------> done\n",
203+
"membersheep -------------------> done\n",
204+
"moki-daniel -------------------> done\n",
205+
"sdlambert -------------------> done\n",
206+
"sambgordon -------------------> done\n",
207+
"james-work-account -------------------> done\n",
208+
"josephmtinangi -------------------> done\n"
209+
]
210+
}
211+
],
212+
"source": [
213+
"gg = GitIssueScrapper()\n",
214+
"gg.get_issue_details()"
215+
]
216+
},
217+
{
218+
"cell_type": "code",
219+
"execution_count": null,
220+
"metadata": {},
221+
"outputs": [],
222+
"source": []
223+
}
224+
],
225+
"metadata": {
226+
"kernelspec": {
227+
"display_name": "Python 3",
228+
"language": "python",
229+
"name": "python3"
230+
},
231+
"language_info": {
232+
"codemirror_mode": {
233+
"name": "ipython",
234+
"version": 3
235+
},
236+
"file_extension": ".py",
237+
"mimetype": "text/x-python",
238+
"name": "python",
239+
"nbconvert_exporter": "python",
240+
"pygments_lexer": "ipython3",
241+
"version": "3.7.4"
242+
}
243+
},
244+
"nbformat": 4,
245+
"nbformat_minor": 2
246+
}

0 commit comments

Comments
 (0)