Skip to content
This repository was archived by the owner on Jul 21, 2021. It is now read-only.

Commit bb970f0

Browse files
authored
Merge pull request #4 from linkvt/feature-pipeline-changes
Feature pipeline changes + DEV language extractor
2 parents 8a5b9a9 + 9c867cb commit bb970f0

13 files changed

+1956
-17
lines changed

Main.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
import argparse
22

3-
from classifier.FeatureExtractor import FeatureExtractor
3+
from classifier.DecisionTreeClassifier import DecisionTreeClassifier
44
from classifier.GithubAuthentification import GithubAuthentification
55
from classifier.InputParser import InputParser
6-
from classifier.DecisionTreeClassifier import DecisionTreeClassifier
6+
from classifier.feature_extraction.FeatureExtractionPipeline import FeatureExtractionPipeline
77

88
parser = argparse.ArgumentParser(description='Program which analyses github repositories into categories.')
9-
parser.add_argument('-f', '--file', dest="filepath", help='The file location of the input file', nargs='?', required=True,
9+
parser.add_argument('-f', '--file', dest="filepath", help='The file location of the input file', nargs='?',
10+
required=True,
1011
metavar='FILE')
11-
parser.add_argument('-t', '--train', dest="train", help='Specifies the program to train with the given data', action="store_true")
12+
parser.add_argument('-t', '--train', dest="train", help='Specifies the program to train with the given data',
13+
action="store_true")
1214

1315
args = parser.parse_args()
1416

@@ -25,11 +27,11 @@
2527
# build the samples
2628
for (url, current_label) in zip(splitted_urls, labels):
2729
current_repo = github_connection.get_repo(url)
28-
print('<Testing> Read repo name:{} with label {}'.format( current_repo.name, current_label))
29-
features = FeatureExtractor(current_repo).extract_features()
30+
print('<Testing> Read repo name:{} with label {}'.format(current_repo.name, current_label))
31+
features = FeatureExtractionPipeline(current_repo).extract_features()
3032
print('Extracted features: ', features)
3133
samples.append(features)
3234

3335
clf.fit(samples, labels)
34-
predicted_label = clf.predict([[5]])
35-
print("Predict label for repo with 5 forks: {}".format(predicted_label))
36+
predicted_label = clf.predict_with_values([[5, 1]])
37+
print('Prediction: ', predicted_label)

classifier/DecisionTreeClassifier.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,32 @@ def __init__(self):
66
self.clf = tree.DecisionTreeClassifier()
77

88
def fit(self, samples, labels):
    """
    Train the wrapped scikit tree on the given samples.

    :param samples: list of samples, each a list of feature objects, e.g.
        [[Feature1, Feature2], [Feature1, Feature2]]; every feature is
        reduced to its ``value`` by ``_map_input`` before fitting
    :param labels: labels handed unchanged to the tree's ``fit``
    :return: None; the fitted tree is stored back on ``self.clf``
    """
    feature_values = self._map_input(samples)
    self.clf = self.clf.fit(feature_values, labels)
1016

1117
def predict(self, samples):
    """
    Predict labels for samples given as lists of feature objects.

    :param samples: list of samples, each a list of objects exposing
        ``value``; they are converted by ``_map_input`` before prediction
    :return: the result of the wrapped classifier's ``predict``
    """
    feature_values = self._map_input(samples)
    return self.clf.predict(feature_values)
24+
25+
def predict_with_values(self, samples):
    """
    Predict labels for samples that are already plain value lists.

    The input is handed to the wrapped classifier as is, without any mapping.

    :param samples: list of samples in the form the classifier expects
    :return: the result of the wrapped classifier's ``predict``
    """
    prediction = self.clf.predict(samples)
    return prediction
27+
28+
def _map_input(self, samples):
29+
"""
30+
We need a specific input for the scikit classifier
31+
:param samples:
32+
:return:
33+
"""
34+
mapped_samples = []
35+
for sample in samples:
36+
mapped_samples.append([feature.value for feature in sample])
37+
return mapped_samples

classifier/Feature.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
class Feature:
    """Pair of a descriptive name and the value measured for it."""

    def __init__(self, name, value):
        """
        :param name: label describing the feature
        :param value: the feature's value
        """
        self.name = name
        self.value = value

    def __str__(self):
        # format() stringifies both parts, so a non-string name does not
        # raise TypeError the way '+' concatenation would.
        return 'Feature[{}, {}]'.format(self.name, self.value)

    def __repr__(self):
        # Same text as str() so lists of features print readably.
        return self.__str__()

classifier/FeatureExtractor.py

Lines changed: 0 additions & 6 deletions
This file was deleted.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import abc
2+
from itertools import chain
3+
4+
from classifier import Feature
5+
from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
6+
7+
8+
class FeatureCategory:
    """
    Each category represents its own weightable feature set.
    Such a set can, for example, be used for its own tree.
    """

    def __init__(self, repo):
        """
        :param repo: repository object stored on ``self.repo`` for the extractors
        """
        self.repo = repo
        # Extraction runs eagerly at construction; extract() only returns
        # the list built here.
        self.features = list(chain.from_iterable(
            extractor.extract_features() for extractor in self._get_feature_extractors()))

    # Annotations are strings so nothing is evaluated at definition time; the
    # former ``[Feature]`` list literal was not a valid type hint.
    # NOTE(review): ``from classifier import Feature`` may bind the submodule
    # rather than the class - confirm.
    def extract(self) -> 'list[Feature]':
        """
        :return: the flat list of features collected from all extractors
        """
        return self.features

    # Without an ABC metaclass this decorator is not enforced, so the explicit
    # raise below is what stops a subclass that forgets to override.
    @abc.abstractmethod
    def _get_feature_extractors(self) -> 'list[FeatureExtractor]':
        """
        :return: the extractors whose features make up this category
        :raises AssertionError: always, in this base class
        """
        raise AssertionError('Must be overridden')
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from itertools import chain
2+
3+
from github.Repository import Repository
4+
5+
from classifier.Feature import Feature
6+
from classifier.feature_extraction import FeatureCategory
7+
from classifier.feature_extraction.MainCategory import MainCategory
8+
9+
CATEGORIES = [
10+
MainCategory
11+
]
12+
13+
14+
class FeatureExtractionPipeline:
    """
    Runs every category listed in CATEGORIES against one repository and
    flattens their features into a single list.
    """

    def __init__(self, repo: Repository):
        self._repo = repo

    def extract_features(self) -> [Feature]:
        """
        :return: the features of all categories as one flat list, in CATEGORIES order
        """
        collected = []
        for category in self.extract_features_in_categories():
            collected.extend(category.extract())
        return collected

    def extract_features_in_categories(self) -> [FeatureCategory]:
        """
        :return: one instance per entry of CATEGORIES, each built with the repository
        """
        return [category_class(self._repo) for category_class in CATEGORIES]
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import abc
2+
3+
from github.Repository import Repository
4+
5+
from classifier import Feature
6+
7+
8+
class FeatureExtractor:
    """
    Base class for all feature extractors.

    Subclasses override ``extract_features``; the implementation here only raises.
    """

    # Annotations are strings so nothing is evaluated at definition time; the
    # former ``[Feature]`` list literal was not a valid type hint.
    def __init__(self, repo: 'Repository'):
        """
        :param repo: repository object kept on ``self._repo`` for subclasses
        """
        self._repo = repo

    # Without an ABC metaclass this decorator is not enforced, so the explicit
    # raise below is what signals a missing override.
    @abc.abstractmethod
    def extract_features(self) -> 'list[Feature]':
        """
        :return: the features extracted from the repository
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError('Main class should not be called for feature extraction!')
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from classifier.Feature import Feature
2+
from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
3+
4+
5+
class ForkExtractor(FeatureExtractor):
    """Extractor producing a single feature from the repository's ``forks`` attribute."""

    def extract_features(self) -> [Feature]:
        """
        :return: list holding one Feature whose value is ``self._repo.forks``
        """
        fork_feature = Feature('Test feature with forks', self._repo.forks)
        return [fork_feature]
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from classifier.feature_extraction.ForkExtractor import ForkExtractor
2+
from classifier.feature_extraction.FeatureCategory import FeatureCategory
3+
from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
4+
from classifier.feature_extraction.language.AllLanguageFeatureExtractor import AllLanguageFeatureExtractor
5+
from classifier.feature_extraction.language.LanguageDEVFeatureExtractor import LanguageDEVFeatureExtractor
6+
7+
8+
class MainCategory(FeatureCategory):
    """Category combining the fork, DEV-language and all-language extractors."""

    def _get_feature_extractors(self) -> [FeatureExtractor]:
        """
        :return: one instance of each extractor, all built with ``self.repo``
        """
        extractor_classes = (ForkExtractor, LanguageDEVFeatureExtractor, AllLanguageFeatureExtractor)
        return [extractor_class(self.repo) for extractor_class in extractor_classes]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import yaml
2+
from github.Repository import Repository
3+
4+
from classifier.Feature import Feature
5+
from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
6+
7+
8+
class AllLanguageFeatureExtractor(FeatureExtractor):
    """
    Turns a repository's language statistics into one feature per known language.

    The languages returned from github are mapped to the byte size of usage,
    e.g. {'Python': 98564, 'R': 4914}. Every known language becomes a feature
    whose value is its share of the total byte size (0 when unused).
    Currently there are 223 languages.
    """
    # Known language names; filled once from languages.yml and then shared by
    # all instances through the class attribute.
    LANGUAGES = []

    def __init__(self, repo: Repository):
        super().__init__(repo)
        self._init_languages()
        # Every known language starts at 0 so unused languages still yield a feature.
        self._languageToProbability = {el: 0 for el in AllLanguageFeatureExtractor.LANGUAGES}

    def _init_languages(self):
        """Load the language names from languages.yml unless they are already loaded."""
        if not AllLanguageFeatureExtractor.LANGUAGES:
            # The path is relative, so it is resolved against the working directory.
            with open('languages.yml', 'r') as f:
                # safe_load builds only plain YAML types; yaml.load without an
                # explicit Loader is unsafe and rejected by current PyYAML.
                doc = yaml.safe_load(f)
            # Iterating the loaded document yields its top-level keys.
            AllLanguageFeatureExtractor.LANGUAGES = list(doc)

    def dict(self):
        """
        :return: the mapping from language name to its current share
        """
        return self._languageToProbability

    def extract_features(self) -> [Feature]:
        """
        Compute each language's share of the repository and wrap it in features.

        Languages reported for the repository but missing from LANGUAGES are
        printed and skipped.

        :return: one Feature per known language, named 'Language: <name>'
        """
        languages = self._repo.get_languages()
        total_size = sum(languages.values())

        for language, size in languages.items():
            if language in self._languageToProbability:
                # A total of 0 would divide by zero; the share is then 0.
                self._languageToProbability[language] = size / total_size if total_size else 0
            else:
                print('Language "' + language + '" is not registered in the algorithm.')

        return [Feature('Language: ' + key, value) for key, value in self._languageToProbability.items()]

0 commit comments

Comments
 (0)