Merge pull request #4 from linkvt/feature-pipeline-changes

BlackDark · web-flow · commit bb970f08a322 · 2016-12-13T21:48:46.000+01:00
Feature pipeline changes + DEV language extractor
diff --git a/Main.py b/Main.py
@@ -1,14 +1,16 @@
 import argparse
 
-from classifier.FeatureExtractor import FeatureExtractor
+from classifier.DecisionTreeClassifier import DecisionTreeClassifier
 from classifier.GithubAuthentification import GithubAuthentification
 from classifier.InputParser import InputParser
-from classifier.DecisionTreeClassifier import DecisionTreeClassifier
+from classifier.feature_extraction.FeatureExtractionPipeline import FeatureExtractionPipeline
 
 parser = argparse.ArgumentParser(description='Program which analyses github repositories into categories.')
-parser.add_argument('-f', '--file', dest="filepath", help='The file location of the input file', nargs='?', required=True,
+parser.add_argument('-f', '--file', dest="filepath", help='The file location of the input file', nargs='?',
+                    required=True,
                     metavar='FILE')
-parser.add_argument('-t', '--train', dest="train", help='Specifies the program to train with the given data', action="store_true")
+parser.add_argument('-t', '--train', dest="train", help='Specifies the program to train with the given data',
+                    action="store_true")
 
 args = parser.parse_args()
 
@@ -25,11 +27,11 @@
     # build the samples
     for (url, current_label) in zip(splitted_urls, labels):
         current_repo = github_connection.get_repo(url)
-        print('<Testing> Read repo name:{} with label {}'.format( current_repo.name, current_label))
-        features = FeatureExtractor(current_repo).extract_features()
+        print('<Testing> Read repo name:{} with label {}'.format(current_repo.name, current_label))
+        features = FeatureExtractionPipeline(current_repo).extract_features()
         print('Extracted features: ', features)
         samples.append(features)
 
     clf.fit(samples, labels)
-    predicted_label = clf.predict([[5]])
-    print("Predict label for repo with 5 forks: {}".format(predicted_label))
+    predicted_label = clf.predict_with_values([[5, 1]])
+    print('Prediction: ', predicted_label)
diff --git a/classifier/DecisionTreeClassifier.py b/classifier/DecisionTreeClassifier.py
@@ -6,7 +6,42 @@ def __init__(self):
         self.clf = tree.DecisionTreeClassifier()
 
     def fit(self, samples, labels):
-        self.clf = self.clf.fit(samples, labels)
+        """
+        Train the wrapped classifier on samples made of Feature objects.
+
+        samples e.g. -> [[Feature1, Feature2], [Feature1, Feature2]]
+        :param samples: one list of Feature objects per sample
+        :param labels: one label per sample, passed on unchanged
+        :return: None, the fitted classifier is kept in self.clf
+        """
+        feature_values = self._map_input(samples)
+        self.clf = self.clf.fit(feature_values, labels)
 
     def predict(self, samples):
+        """
+        Predict labels for samples made of Feature objects.
+
+        :param samples: one list of Feature objects per sample
+        :return: the result of self.clf.predict for the mapped values
+        """
+        feature_values = self._map_input(samples)
+        return self.clf.predict(feature_values)
+
+    def predict_with_values(self, samples):
+        """
+        Predict labels for samples that already hold only the raw values
+        needed for scikit input; nothing is mapped before predicting.
+
+        :param samples: one list of plain values per sample
+        :return: the result of self.clf.predict
+        """
         return self.clf.predict(samples)
+
+    def _map_input(self, samples):
+        """
+        Build the plain-value input that the scikit classifier needs.
+
+        :param samples: iterable of samples, each an iterable of objects with a value attribute
+        :return: list with one list of values per sample
+        """
+        return [[feature.value for feature in sample] for sample in samples]
diff --git a/classifier/Feature.py b/classifier/Feature.py
@@ -1,4 +1,21 @@
 class Feature:
+    """
+    A named value produced by feature extraction.
+    """
+
     def __init__(self, name, value):
-        self._name = name
-        self._value = value
+        """
+        :param name: label of the feature, shown when the feature is printed
+        :param value: the value of the feature
+        """
+        self.name = name
+        self.value = value
+
+    def __str__(self):
+        # str.format converts both parts itself, so a non-string name does
+        # not raise TypeError the way '+' concatenation would.
+        return 'Feature[{}, {}]'.format(self.name, self.value)
+
+    def __repr__(self):
+        # Containers print their elements via repr, so mirror __str__.
+        return self.__str__()
diff --git a/classifier/FeatureExtractor.py b/classifier/FeatureExtractor.py
diff --git a/classifier/feature_extraction/FeatureCategory.py b/classifier/feature_extraction/FeatureCategory.py
@@ -0,0 +1,41 @@
+import abc
+from itertools import chain
+
+from classifier.Feature import Feature
+from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
+
+
+class FeatureCategory(abc.ABC):
+    """
+    Each category represents its own weightable feature set.
+    This can, for example, be used for separate trees.
+
+    abc.ABC is required for @abc.abstractmethod to be enforced: the decorator
+    has no effect unless the class's metaclass is ABCMeta.
+    """
+
+    def __init__(self, repo):
+        """
+        Run every extractor of the category and keep the combined features.
+
+        :param repo: repository object, stored as self.repo for the extractors
+        """
+        self.repo = repo
+        # Flatten the per-extractor feature lists into one list, keeping the
+        # order in which the extractors were returned.
+        extractors = self._get_feature_extractors()
+        self.features = list(chain.from_iterable(
+            extractor.extract_features() for extractor in extractors))
+
+    def extract(self) -> [Feature]:
+        """
+        :return: the features collected when the category was constructed
+        """
+        return self.features
+
+    @abc.abstractmethod
+    def _get_feature_extractors(self) -> [FeatureExtractor]:
+        """
+        :return: the extractor instances belonging to this category
+        """
+        raise AssertionError('Must be overridden')
diff --git a/classifier/feature_extraction/FeatureExtractionPipeline.py b/classifier/feature_extraction/FeatureExtractionPipeline.py
@@ -0,0 +1,37 @@
+from itertools import chain
+
+from github.Repository import Repository
+
+from classifier.Feature import Feature
+from classifier.feature_extraction.FeatureCategory import FeatureCategory
+from classifier.feature_extraction.MainCategory import MainCategory
+
+# Category classes the pipeline instantiates; their order is the output order.
+CATEGORIES = [
+    MainCategory
+]
+
+
+class FeatureExtractionPipeline:
+    """
+    Collects the features of one repository from every entry in CATEGORIES.
+    """
+
+    def __init__(self, repo: Repository):
+        """
+        :param repo: the repository handed to every category
+        """
+        self._repo = repo
+
+    def extract_features(self) -> [Feature]:
+        """
+        :return: the features of all categories, flattened into a single list
+        """
+        categories = self.extract_features_in_categories()
+        return list(chain.from_iterable(category.extract() for category in categories))
+
+    def extract_features_in_categories(self) -> [FeatureCategory]:
+        """
+        :return: one newly constructed instance per class in CATEGORIES
+        """
+        return [category(self._repo) for category in CATEGORIES]
diff --git a/classifier/feature_extraction/FeatureExtractor.py b/classifier/feature_extraction/FeatureExtractor.py
@@ -0,0 +1,27 @@
+import abc
+
+from github.Repository import Repository
+
+from classifier.Feature import Feature
+
+
+class FeatureExtractor(abc.ABC):
+    """
+    Base class for all feature extractors.
+
+    abc.ABC is required for @abc.abstractmethod to be enforced: the decorator
+    has no effect unless the class's metaclass is ABCMeta.
+    """
+
+    def __init__(self, repo: Repository):
+        """
+        :param repo: the repository the features are extracted from
+        """
+        self._repo = repo
+
+    @abc.abstractmethod
+    def extract_features(self) -> [Feature]:
+        """
+        :return: list of Feature objects
+        """
+        raise NotImplementedError('Main class should not be called for feature extraction!')
diff --git a/classifier/feature_extraction/ForkExtractor.py b/classifier/feature_extraction/ForkExtractor.py
@@ -0,0 +1,15 @@
+from classifier.Feature import Feature
+from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
+
+
+class ForkExtractor(FeatureExtractor):
+    """
+    Turns the repository's `forks` attribute into a single feature.
+    """
+
+    def extract_features(self) -> [Feature]:
+        """
+        :return: a one-element list holding the fork feature
+        """
+        fork_feature = Feature('Test feature with forks', self._repo.forks)
+        return [fork_feature]
diff --git a/classifier/feature_extraction/MainCategory.py b/classifier/feature_extraction/MainCategory.py
@@ -0,0 +1,18 @@
+from classifier.feature_extraction.ForkExtractor import ForkExtractor
+from classifier.feature_extraction.FeatureCategory import FeatureCategory
+from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
+from classifier.feature_extraction.language.AllLanguageFeatureExtractor import AllLanguageFeatureExtractor
+from classifier.feature_extraction.language.LanguageDEVFeatureExtractor import LanguageDEVFeatureExtractor
+
+
+class MainCategory(FeatureCategory):
+    """
+    Category made of the fork extractor and the two language extractors.
+    """
+
+    def _get_feature_extractors(self) -> [FeatureExtractor]:
+        """
+        :return: new extractor instances for self.repo, in this fixed order
+        """
+        extractor_classes = (ForkExtractor, LanguageDEVFeatureExtractor, AllLanguageFeatureExtractor)
+        return [extractor_class(self.repo) for extractor_class in extractor_classes]
diff --git a/classifier/feature_extraction/language/AllLanguageFeatureExtractor.py b/classifier/feature_extraction/language/AllLanguageFeatureExtractor.py
@@ -0,0 +1,60 @@
+import yaml
+from github.Repository import Repository
+
+from classifier.Feature import Feature
+from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
+
+
+class AllLanguageFeatureExtractor(FeatureExtractor):
+    """
+    The languages returned from github are mapped to the byte size of usage.
+    E.g. {'Python': 98564, 'R': 4914}
+    Currently there are 223 languages
+    """
+    # Language names read from languages.yml; filled once, shared by all instances.
+    LANGUAGES = []
+
+    def __init__(self, repo: Repository):
+        super().__init__(repo)
+        self._init_languages()
+        # Every known language starts with a share of 0.
+        self._languageToProbability = {el: 0 for el in AllLanguageFeatureExtractor.LANGUAGES}
+
+    def _init_languages(self):
+        """
+        Load the language names from languages.yml unless they are already loaded.
+        """
+        if not AllLanguageFeatureExtractor.LANGUAGES:
+            # NOTE: the path is relative, so it is resolved against the
+            # current working directory of the process.
+            with open('languages.yml', 'r') as f:
+                # safe_load only builds plain Python objects; yaml.load without
+                # an explicit Loader is deprecated and rejected by newer PyYAML.
+                doc = yaml.safe_load(f)
+            # An empty file parses to None; treat that as "no languages".
+            AllLanguageFeatureExtractor.LANGUAGES = list(doc or [])
+
+    def dict(self):
+        """
+        :return: the language -> share mapping of this instance
+        """
+        return self._languageToProbability
+
+    def extract_features(self) -> [Feature]:
+        """
+        Store, for each registered language of the repository, its share of the total size.
+
+        :return: one Feature per registered language; languages the repository does not use keep 0
+        """
+        languages = self._repo.get_languages()
+        total_size = sum(languages.values())
+
+        for language, size in languages.items():
+            if language in self._languageToProbability:
+                # Guard against a zero total so that reported sizes of 0
+                # cannot raise ZeroDivisionError.
+                self._languageToProbability[language] = size / total_size if total_size else 0
+            else:
+                print('Language "' + language + '" is not registered in the algorithm.')
+
+        return [Feature('Language: ' + key, value) for key, value in self._languageToProbability.items()]
diff --git a/classifier/feature_extraction/language/LanguageDEVFeatureExtractor.py b/classifier/feature_extraction/language/LanguageDEVFeatureExtractor.py
@@ -0,0 +1,21 @@
+from classifier.feature_extraction.language.LanguageFeatureExtractor import LanguageFeatureExtractor
+
+
+class LanguageDEVFeatureExtractor(LanguageFeatureExtractor):
+    """
+    Language extractor for the 'DEV' label, based on Python and Java.
+    """
+
+    def _get_relevant_languages(self) -> [str]:
+        """
+        :return: names of the languages that count for this label
+        """
+        dev_languages = ['Python', 'Java']
+        return dev_languages
+
+    def _get_category_label(self) -> str:
+        """
+        :return: the label that is put into the feature name
+        """
+        label = 'DEV'
+        return label
diff --git a/classifier/feature_extraction/language/LanguageFeatureExtractor.py b/classifier/feature_extraction/language/LanguageFeatureExtractor.py
@@ -0,0 +1,41 @@
+import abc
+
+from classifier.Feature import Feature
+from classifier.feature_extraction.FeatureExtractor import FeatureExtractor
+
+
+class LanguageFeatureExtractor(FeatureExtractor):
+    """
+    The languages returned from github are mapped to the byte size of usage.
+    E.g. {'Python': 98564, 'R': 4914}
+    """
+
+    def extract_features(self) -> [Feature]:
+        """
+        Compute which share of the repository's total language size belongs to
+        the languages named by _get_relevant_languages().
+
+        :return: a one-element list holding that share as a Feature
+        """
+        languages = self._repo.get_languages()
+        total_size = sum(languages.values())
+        relevant_size = sum(languages.get(language, 0) for language in self._get_relevant_languages())
+
+        # With no reported languages total_size is 0; report a share of 0.0
+        # instead of raising ZeroDivisionError.
+        share = relevant_size / total_size if total_size else 0.0
+        return [Feature('Language feature for ' + self._get_category_label(), share)]
+
+    @abc.abstractmethod
+    def _get_category_label(self) -> str:
+        """
+        :return: the label that is appended to the feature name
+        """
+        raise NotImplementedError('Should be implemented in subclasses!')
+
+    @abc.abstractmethod
+    def _get_relevant_languages(self) -> [str]:
+        """
+        :return: names of the languages whose sizes are summed up
+        """
+        raise NotImplementedError('Should be implemented in subclasses!')
diff --git a/languages.yml b/languages.yml