-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add pure python implementation for CodeChanges
- Loading branch information
1 parent
7e807a3
commit 8831e49
Showing
6 changed files
with
2,983 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
import utils | ||
from SourceCode import SourceCode | ||
from CommitPure import CommitPure | ||
|
||
|
||
class Code_Changes_Git_Pure(CommitPure):
    """Code_Changes metric for Git repositories, implemented without pandas.

    Objects are instantiated from a list of commit dictionaries obtained by
    Perceval from a set of repositories. All cleaning and filtering is
    delegated to CommitPure, which populates self.clean_data_list.
    """

    def __init__(self, data_list, date_range=(None, None), src_code_obj=None):
        """
        Initializes self.clean_data_list, a list of dictionaries where each
        element is a cleaned commit fetched by Perceval.

        :param data_list: A list of dictionaries, each element a line from the
            JSON file
        :param date_range: A tuple which represents the start and end date of
            interest
        :param src_code_obj: An object of SourceCode, to be used to determine
            what comprises source code.
        """
        super().__init__(data_list, date_range, src_code_obj)

    def total_count(self):
        """
        Get a naive count of the number of commits in the Perceval data.

        Note that some commits may be repeated and so total_count may
        overshoot.

        :returns: the length of self.clean_data_list
        """
        return len(self.clean_data_list)

    def compute(self, incl_empty=True, incl_merge=True, master_only=False):
        """Count number of commits of different types, like including empty
        commits or counting only those commits made on the master branch.

        :param incl_empty: Include empty commits
        :param incl_merge: Include merge commits (ignored when master_only
            is True)
        :param master_only: Include only commits made on master branch
        :returns: the number of distinct commit hashes that match
        """
        if master_only:
            return self._count_master_only(incl_empty)

        commits = self.clean_data_list
        if not incl_empty:
            commits = [commit for commit in commits
                       if commit['files_action'] != 0]
        if not incl_merge:
            commits = [commit for commit in commits if not commit['merge']]

        # The same commit can appear more than once in the data, so count
        # distinct hashes rather than list entries.
        return len({commit['hash'] for commit in commits})

    # compute_timeseries is yet to be finished for the non-pandas
    # implementation; please ignore this method.
    def compute_timeseries(self, period="month", plot_chart=False):
        """
        The metric value is computed for each fixed interval of time
        from the "since" date to the "until" date arguments, specified
        during object initiation.
        The fixed time interval can be either a month or a week.

        NOTE(review): this method still reads self.df, which nothing in this
        class assigns, so it is not usable until it is ported away from
        pandas. A period other than "month" or "week" is not handled either.

        :param period: A string which can be either "month" or "week"
        :param plot_chart: Plots a barchart of the timeseries if True
        :returns: a DataFrame with one row per period and a "count" column
        """
        df = self.df
        if period == "month":
            timeseries_series = df['created_date'] \
                .groupby([df['created_date'].dt.year.rename('year'),
                          df['created_date'].dt.month.rename('month')]) \
                .agg('count')

            all_periods = pd.DataFrame(
                pd.date_range(self.since, self.until, freq='M'),
                columns=["Dates"])
            all_periods = pd.DataFrame(
                [all_periods['Dates'].dt.year.rename("year"),
                 all_periods['Dates'].dt.month.rename("month")]).T

        elif period == "week":
            # NOTE(review): Series.dt.week is deprecated in recent pandas
            # releases -- confirm against the pandas version in use.
            timeseries_series = df['created_date'] \
                .groupby([df['created_date'].dt.year.rename('year'),
                          df['created_date'].dt.week.rename('week')]) \
                .agg('count')

            all_periods = pd.DataFrame(
                pd.date_range(self.since, self.until, freq='W'),
                columns=["Dates"])
            all_periods = pd.DataFrame(
                [all_periods['Dates'].dt.year.rename("year"),
                 all_periods['Dates'].dt.week.rename("week")]).T

        timeseries_df = pd.DataFrame(timeseries_series)
        timeseries_df.reset_index(inplace=True)
        timeseries_df.columns = ["year", period, "count"]
        # Outer merge so that periods without any commit show up with 0.
        merged_df = all_periods.merge(timeseries_df, how='outer').fillna(0)

        if plot_chart:
            plt.style.use('seaborn')
            merged_df.plot(y='count', use_index=True)
            plt.fill_between(y1=merged_df['count'], y2=0, x=merged_df.index)
            plt.title("Commit Timeseries")
            plt.show()

        return merged_df

    def _count_master_only(self, incl_empty=True):
        """
        Counts commits reachable from the master branch head.

        Starting from every commit whose refs contain
        'HEAD -> refs/heads/master', the parent links are followed until no
        unseen commit is left. Parents that are not present in
        self.clean_data_list (for instance because they were filtered out)
        are skipped, so the walk stops at them.

        :param incl_empty: when False, only commits that have at least one
            file entry with an 'action' key are counted
        :returns: the number of commits found on master
        """
        # Index the commits once instead of scanning the list per lookup.
        # setdefault keeps the first occurrence of a repeated hash.
        commits_by_hash = {}
        for commit in self.clean_data_list:
            commits_by_hash.setdefault(commit['hash'], commit)

        todo = {commit['hash'] for commit in self.clean_data_list
                if 'HEAD -> refs/heads/master' in commit['refs']}

        master = set()
        while todo:
            current = todo.pop()
            master.add(current)
            for parent in commits_by_hash[current]['parents']:
                if parent not in master and parent in commits_by_hash:
                    todo.add(parent)

        if incl_empty:
            return len(master)

        # Look each commit up by its own id; a commit counts as non-empty
        # when any of its file entries records an action.
        return sum(
            1 for commit_id in master
            if any('action' in file
                   for file in commits_by_hash[commit_id]['files'])
        )
|
||
|
||
if __name__ == "__main__":
    # Small manual run: count the non-empty commits in a local Perceval dump.
    # The SourceCode filter is built here but is not handed to the metric.
    tests_filter = SourceCode(["tests/"], "folder_exclude")
    perceval_items = utils.read_JSON_file('git-commits.json')
    metric = Code_Changes_Git_Pure(perceval_items)
    non_empty_commits = metric.compute(incl_empty=False)
    print(non_empty_commits)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import math | ||
import pandas as pd | ||
from MetricPure import MetricPure | ||
import utils | ||
|
||
|
||
class CommitPure(MetricPure):
    """
    Initializes self.clean_data_list, a list with commit dictionaries
    elements.

    :param data_list: A list of dictionaries.
        Each element is a Perceval dictionary, obtained from a JSON
        file or from Perceval directly.
    :param date_range: A tuple which represents the period of interest
        It is of the form (since, until), where since and until are strings.
        Either, or both can be None. If, for example, since is None, that
        would mean that all commits from the first commit to the last commit
        that falls inside the until range will be included.
    :param src_code_obj: An object of SourceCode.
        It is used to determine what comprises source code.
    """

    def __init__(self, data_list, date_range=(None, None), src_code_obj=None):
        super().__init__(data_list)

        self.clean_data_list = list()
        (self.since, self.until) = date_range

        # Keep only the commits the SourceCode object accepts (or all of
        # them when no SourceCode object was given).
        for commit in self.raw_list:
            commit = self._clean_commit(commit)
            if src_code_obj is None \
                    or src_code_obj.is_source_code(commit):
                self.clean_data_list.append(commit)

        if self.since:
            # Parse the bound once rather than once per commit.
            since_dt = utils.str_to_dt_data(self.since)
            self.clean_data_list = [commit for commit in self.clean_data_list
                                    if commit['created_date'] >= since_dt]
        else:
            self.since = utils.get_date(self.clean_data_list, "since")

        if self.until:
            # The until bound is exclusive.
            until_dt = utils.str_to_dt_data(self.until)
            self.clean_data_list = [commit for commit in self.clean_data_list
                                    if commit['created_date'] < until_dt]
        else:
            self.until = utils.get_date(self.clean_data_list, "until")

    def _clean_commit(self, line):
        """
        This method is for cleaning a raw commit fetched by Perceval.
        Since all attributes of the data are not of our importance, it is
        better to just keep the ones which are required.

        :param line: a raw (flattened) line fetched by Perceval, present in
            the JSON file. It is a dictionary.
        :returns: A cleaned commit, which is a dictionary
        :raises KeyError: if one of the required keys is missing from line
        """
        cleaned_line = {
            'repo': line['origin'],
            'hash': line['data_commit'],
            'author': line['data_Author'],
            'category': "commit",
            'created_date': utils.str_to_dt_data(line['data_AuthorDate']),
            'commit': line['data_Commit'],
            'commit_date': utils.str_to_dt_data(line['data_CommitDate']),
            'files_no': len(line['data_files']),
            'refs': line['data_refs'],
            'parents': line['data_parents'],
            'files': line['data_files']
        }

        # Number of file entries that actually record an action; a commit
        # with zero of them is treated as empty by the metrics.
        cleaned_line['files_action'] = sum(
            1 for file in line['data_files'] if 'action' in file)

        # A commit is a merge only when it carries a usable 'data_Merge'
        # value. A missing key, None or NaN all mean "not a merge"; treating
        # the missing key as an error would flag every commit as a merge.
        merge_info = line.get('data_Merge')
        merge_is_nan = isinstance(merge_info, float) and math.isnan(merge_info)
        cleaned_line['merge'] = merge_info is not None and not merge_is_nan
        return cleaned_line
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import pandas as pd | ||
|
||
|
||
class MetricPure():
    """
    Base class that turns Perceval data into a list of flat dictionaries.

    :param data_list: A list of dictionaries.
        Each element is a Perceval dictionary, obtained from a JSON
        file or from Perceval directly.
    """

    def __init__(self, data_list):
        self.raw_list = MetricPure._flatten_data(data_list)

    @staticmethod
    def _flatten_data(data_list):
        """
        Flatten one level of nesting in every Perceval dictionary.

        A value that is itself a dictionary is replaced by one entry per
        inner key, named "<outer key>_<inner key>". Any other value is
        copied unchanged. Deeper levels of nesting are left as they are.

        :param data_list: A list of dictionaries.
            Each element is a Perceval dictionary, obtained from a JSON
            file or from Perceval directly.
        :returns: a list with one flattened dictionary per input element.
        """
        flattened_rows = []

        for record in data_list:
            flat = {}
            for name, value in record.items():
                if not isinstance(value, dict):
                    flat[name] = value
                    continue
                for inner_name, inner_value in value.items():
                    flat[name + "_" + inner_name] = inner_value
            flattened_rows.append(flat)

        return flattened_rows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
class SourceCode:
    """Decides whether a commit touches anything considered source code."""

    def __init__(self, source_code_exclude_list=None, method="naive"):
        """
        A SourceCode object is used to help define source code.

        :param source_code_exclude_list: a list of either
            file extensions or directories to exclude
        :param method: a string which can be "naive",
            "folder_exclude" or "extension_exclude"

        For example,
        source_code_exclude_list = ["py", "other", "gitignore"]
        or source_code_exclude_list = ["tests/", "bin/", "drivers/base/"]
        """
        self.source_code_exclude_list = source_code_exclude_list
        self.method = method

    def is_source_code(self, commit):
        """
        Given a commit structure, and given a list of files to exclude
        using source_code_exclude_list while instantiating an object,
        decide whether all the files in a commit are to be excluded or not.

        :param commit: a commit dictionary with a 'files' list whose
            elements are dictionaries holding the path under 'file'.
        :returns: True if the commit counts as source code, False otherwise
        :raises KeyError: if self.method is not one of the known methods
        """
        options_dict = {
            "naive": self._naive_implementation,
            "folder_exclude": self._folder_exclude_implementation,
            "extension_exclude": self._extension_exclude_implementation
        }

        return options_dict[self.method](commit)

    def _naive_implementation(self, commit):
        """
        This implementation is naive, meaning that it assumes that
        all the files a commit deals with are part of the source code,
        irrespective of how the source code is defined.

        :param commit: a commit dictionary (not inspected).
        :returns: always True
        """
        return True

    def _folder_exclude_implementation(self, commit):
        """
        This implementation is based on the directory a file
        is present in, like "tests/" or "bin/".
        If all the files affected by a commit are present
        in directories which are mentioned in source_code_exclude_list,
        that commit will not be considered for analysis.

        :param commit: a commit dictionary with a 'files' list.
        :returns: True if at least one file lies outside every excluded
            directory, or if no exclude list was given; False otherwise
            (including for a commit without files)
        """
        if self.source_code_exclude_list is None:
            return True

        return any(
            not any(file['file'].startswith(path)
                    for path in self.source_code_exclude_list)
            for file in commit['files']
        )

    def _extension_exclude_implementation(self, commit):
        """
        This implementation is based on the extensions of the files involved
        in a commit, like "py", "json", etc.
        If all the files affected by a commit have extensions which are
        present in the source_code_exclude_list parameter, that commit will
        not be considered for the analysis.

        :param commit: a commit dictionary with a 'files' list.
        :returns: True if at least one file has an extension that is not
            excluded, or if no exclude list was given; False otherwise
            (including for a commit without files)
        """
        if self.source_code_exclude_list is None:
            return True

        extension_set = {SourceCode._get_extension(file)
                         for file in commit['files']}
        return bool(extension_set.difference(self.source_code_exclude_list))

    @staticmethod
    def _get_extension(file):
        """
        Given a file structure, which is a dictionary and an element
        of commit['files'], return the extension of that file.

        The extension is whatever follows the last "." of the file's base
        name, so "pkg/mod.test.py" yields "py" and ".gitignore" yields
        "gitignore". For files without a "." in their base name, like
        LICENCE or AUTHORS, the "other" extension is used.

        :param file: a file structure which is a dictionary, an element
            of commit["files"]
        :returns: the extension as a string, without the leading dot
        """
        # Only the last path component matters: a dot in a directory name
        # must not be mistaken for an extension separator.
        base_name = file['file'].rsplit('/', 1)[-1]
        if '.' in base_name:
            return base_name.rsplit('.', 1)[-1]
        return "other"
Oops, something went wrong.