Skip to content

Commit

Permalink
Add pure python implementation for CodeChanges
Browse files Browse the repository at this point in the history
  • Loading branch information
Polaris000 committed Jun 11, 2019
1 parent 7e807a3 commit 8831e49
Show file tree
Hide file tree
Showing 6 changed files with 2,983 additions and 0 deletions.
156 changes: 156 additions & 0 deletions Weekly_Work/Week3/Code_Changes_Git_Pure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import pandas as pd
import matplotlib.pyplot as plt
import utils
from SourceCode import SourceCode
from CommitPure import CommitPure


class Code_Changes_Git_Pure(CommitPure):
    """Class for Code_Changes for Git repositories.

    Objects are instantiated from a list of commit items obtained by
    Perceval from a set of repositories.

    :param data_list: A list of dictionaries, each element a line from the
        Perceval JSON file
    :param date_range: A tuple (since, until) delimiting the period of
        interest
    :param src_code_obj: An object of SourceCode, used to determine what
        comprises source code
    """

    def __init__(self, data_list, date_range=(None, None), src_code_obj=None):
        """
        Initializes self.clean_data_list, a list of dictionaries where each
        element is a cleaned commit fetched by Perceval.

        :param data_list: A list of dictionaries, each element a line from the
            JSON file
        :param date_range: A tuple which represents the start and end date of
            interest
        :param src_code_obj: An object of SourceCode, to be used to determine
            what comprises source code.
        """
        super().__init__(data_list, date_range, src_code_obj)

    def total_count(self):
        """
        Get a naive count of the number of commits in the Perceval data.

        Note that some commits may be repeated and so total_count may
        overshoot.

        :returns: the length of self.clean_data_list
        """
        return len(self.clean_data_list)

    def compute(self, incl_empty=True, incl_merge=True, master_only=False):
        """Count number of commits of different types, like including empty
        commits or counting only those commits made on the master branch.

        :param incl_empty: Include empty commits
        :param incl_merge: Include merge commits
        :param master_only: Include only commits made on master branch
        :returns: the number of distinct commit hashes that match
        """
        if master_only:
            # NOTE: incl_merge is not applied on the master-only path.
            return self._count_master_only(incl_empty)

        commits = self.clean_data_list
        if not incl_empty:
            commits = [commit for commit in commits
                       if commit['files_action'] != 0]
        if not incl_merge:
            commits = [commit for commit in commits if not commit['merge']]

        # Hashes are de-duplicated because the same commit may appear
        # more than once in the data.
        return len({commit['hash'] for commit in commits})

    # compute_timeseries yet to be finished for non_pandas
    # please ignore this method.
    def compute_timeseries(self, period="month", plot_chart=False):
        """
        The metric value is computed for each fixed interval of time
        from the "since" date to the "until" date arguments, specified
        during object initiation.
        The fixed time interval can be either a month or a week.

        NOTE(review): this method reads self.df, which is not set by any
        code visible in this class; it is still the pandas-based
        implementation and is expected to fail until it is ported.

        :param period: A string which can be either "month" or "week"
        :param plot_chart: Plots a barchart of the timeseries if True
        :returns: a DataFrame with "year", period and "count" columns
        :raises ValueError: if period is neither "month" nor "week"
        """
        if period not in ("month", "week"):
            raise ValueError('period must be either "month" or "week"')

        df = self.df
        if period == "month":
            timeseries_series = df['created_date'] \
                .groupby([df['created_date'].dt.year.rename('year'),
                          df['created_date'].dt.month.rename('month')]) \
                .agg('count')

            all_periods = pd.DataFrame(
                pd.date_range(self.since, self.until, freq='M'),
                columns=["Dates"])
            all_periods = pd.DataFrame(
                [all_periods['Dates'].dt.year.rename("year"),
                 all_periods['Dates'].dt.month.rename("month")]).T

        elif period == "week":
            timeseries_series = df['created_date'] \
                .groupby([df['created_date'].dt.year.rename('year'),
                          df['created_date'].dt.week.rename('week')]) \
                .agg('count')

            all_periods = pd.DataFrame(
                pd.date_range(self.since, self.until, freq='W'),
                columns=["Dates"])
            all_periods = pd.DataFrame(
                [all_periods['Dates'].dt.year.rename("year"),
                 all_periods['Dates'].dt.week.rename("week")]).T

        timeseries_df = pd.DataFrame(timeseries_series)
        timeseries_df.reset_index(inplace=True)
        timeseries_df.columns = ["year", period, "count"]
        # The outer merge keeps periods without any commit; their count
        # becomes 0.
        merged_df = all_periods.merge(timeseries_df, how='outer').fillna(0)

        if plot_chart:
            plt.style.use('seaborn')
            merged_df.plot(y='count', use_index=True)
            plt.fill_between(y1=merged_df['count'], y2=0, x=merged_df.index)
            plt.title("Commit Timeseries")
            plt.show()

        return merged_df

    def _count_master_only(self, incl_empty=True):
        """
        Counts commits reachable from the head of the master branch.

        :param incl_empty: Include empty commits (commits in which no file
            carries an 'action' entry); when False they are left out
        :returns: the number of matching commits on master
        """
        # Index commits by hash once; the first occurrence wins, as
        # repeated commits may be present in the data.
        commits_by_hash = {}
        for commit in self.clean_data_list:
            commits_by_hash.setdefault(commit['hash'], commit)

        todo = {commit['hash'] for commit in self.clean_data_list
                if 'HEAD -> refs/heads/master' in commit['refs']}

        # Walk the ancestry from the master head(s) through the parents.
        master = set()
        while todo:
            current = todo.pop()
            master.add(current)
            for parent in commits_by_hash[current]['parents']:
                # Parents missing from the cleaned data (e.g. filtered out
                # by date range or source code definition) cannot be
                # followed any further.
                if parent not in master and parent in commits_by_hash:
                    todo.add(parent)

        if incl_empty:
            return len(master)

        return sum(
            1 for commit_id in master
            if any('action' in file
                   for file in commits_by_hash[commit_id]['files']))


if __name__ == "__main__":
    # NOTE(review): sourcecode is built here but not handed to the metric
    # below -- confirm whether it was meant to be passed as src_code_obj.
    sourcecode = SourceCode(["tests/"], "folder_exclude")
    commits = utils.read_JSON_file('git-commits.json')
    metric = Code_Changes_Git_Pure(commits)
    # Print the number of non-empty commits.
    non_empty_count = metric.compute(incl_empty=False)
    print(non_empty_count)
93 changes: 93 additions & 0 deletions Weekly_Work/Week3/CommitPure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import math
import pandas as pd
from MetricPure import MetricPure
import utils


class CommitPure(MetricPure):
    """
    Initializes self.clean_data_list, a list with commit dictionaries
    elements.

    :param data_list: A list of dictionaries.
        Each element is a Perceval dictionary, obtained from a JSON
        file or from Perceval directly.
    :param date_range: A tuple which represents the period of interest
        It is of the form (since, until), where since and until are strings.
        Either, or both can be None. If, for example, since is None, that
        would mean that all commits from the first commit to the last commit
        which falls inside the until range will be included.
    :param src_code_obj: An object of SourceCode.
        It is used to determine what comprises source code.
    """

    def __init__(self, data_list, date_range=(None, None), src_code_obj=None):

        super().__init__(data_list)

        (self.since, self.until) = date_range

        # Clean every raw commit and keep the ones counted as source code.
        self.clean_data_list = list()
        for commit in self.raw_list:
            commit = self._clean_commit(commit)
            if src_code_obj is None \
                    or src_code_obj.is_source_code(commit):
                self.clean_data_list.append(commit)

        if self.since:
            # Convert the bound once instead of once per commit.
            since_date = utils.str_to_dt_data(self.since)
            self.clean_data_list = [commit for commit in self.clean_data_list
                                    if commit['created_date'] >= since_date]
        else:
            # presumably the earliest date in the data -- see utils.get_date
            self.since = utils.get_date(self.clean_data_list, "since")

        if self.until:
            until_date = utils.str_to_dt_data(self.until)
            # The until bound is exclusive.
            self.clean_data_list = [commit for commit in self.clean_data_list
                                    if commit['created_date'] < until_date]
        else:
            # presumably the latest date in the data -- see utils.get_date
            self.until = utils.get_date(self.clean_data_list, "until")

    def _clean_commit(self, line):
        """
        This method is for cleaning a raw commit fetched by Perceval.
        Since all attributes of the data are not of our importance, it is
        better to just keep the ones which are required.

        :param line: a raw line fetched by Perceval, present in the JSON file
            It is a (flattened) dictionary.
        :returns: A cleaned commit, which is a dictionary
        """
        files = line['data_files']

        cleaned_line = {
            'repo': line['origin'],
            'hash': line['data_commit'],
            'author': line['data_Author'],
            'category': "commit",
            'created_date': utils.str_to_dt_data(line['data_AuthorDate']),
            'commit': line['data_Commit'],
            'commit_date': utils.str_to_dt_data(line['data_CommitDate']),
            'files_no': len(files),
            'refs': line['data_refs'],
            'parents': line['data_parents'],
            'files': files
        }

        # Number of files which carry an 'action' entry; a commit where
        # this is 0 is treated as empty elsewhere.
        cleaned_line['files_action'] = sum(
            1 for file in files if 'action' in file)

        if 'data_Merge' not in line:
            # No Merge entry at all: this is not a merge commit.
            is_merge = False
        else:
            try:
                # A NaN placeholder also stands for "no merge".
                is_merge = not math.isnan(line['data_Merge'])
            except TypeError:
                # Any non-numeric value (e.g. a string) marks a merge.
                is_merge = True

        cleaned_line['merge'] = is_merge
        return cleaned_line
42 changes: 42 additions & 0 deletions Weekly_Work/Week3/MetricPure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pandas as pd


class MetricPure:
    """
    Hold the data fetched by Perceval as a list of flattened dictionaries,
    available as self.raw_list.

    :param data_list: A list of dictionaries.
        Each element is a Perceval dictionary, obtained from a JSON
        file or from Perceval directly.
    """

    def __init__(self, data_list):
        self.raw_list = MetricPure._flatten_data(data_list)

    @staticmethod
    def _flatten_data(data_list):
        """
        Flatten one level of nesting in every Perceval item.

        Each entry of a nested dictionary is moved to the top level under
        the name "<outer key>_<inner key>"; all other values are kept
        under their own key.

        :param data_list: A list of dictionaries.
            Each element is a Perceval dictionary, obtained from a JSON
            file or from Perceval directly.
        :returns: a list with one flattened dictionary per input item.
        """
        flattened = []

        for record in data_list:
            flat = {}
            for name, value in record.items():
                if not isinstance(value, dict):
                    flat[name] = value
                    continue
                # Only the first level is expanded; deeper dictionaries
                # are stored unchanged.
                for inner_name, inner_value in value.items():
                    flat[name + "_" + inner_name] = inner_value
            flattened.append(flat)

        return flattened
101 changes: 101 additions & 0 deletions Weekly_Work/Week3/SourceCode.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
class SourceCode:

    def __init__(self, source_code_exclude_list=None, method="naive"):
        """
        A SourceCode object is used to help define source code.

        :param source_code_exclude_list: a list of either
            file extensions or directories to exclude
        :param method: a string which can be "naive",
            "folder_exclude" or "extension_exclude"

        For example,
        source_code_exclude_list = ["py", "other", "gitignore"]
        or source_code_exclude_list = ["tests/", "bin/", "drivers/base/"]
        """
        self.source_code_exclude_list = source_code_exclude_list
        self.method = method

    def is_source_code(self, commit):
        """
        Given a commit structure, which is a dictionary with a 'files'
        list, and given a list of files to exclude using
        source_code_exclude_list while instantiating an object,
        decide whether all the files in a commit are to be excluded or not.

        :param commit: a commit structure (a dictionary with a 'files' key)
        :returns: True if the commit counts as source code, else False
        :raises KeyError: if self.method is not one of the known methods
        """
        options_dict = {
            "naive": self._naive_implementation,
            "folder_exclude": self._folder_exclude_implementation,
            "extension_exclude": self._extension_exclude_implementation
        }

        return options_dict[self.method](commit)

    def _naive_implementation(self, commit):
        """
        This implementation is naive, meaning that it assumes that
        all the files a commit deals with are part of the source code,
        irrespective of how the source code is defined.

        :param commit: a commit structure (unused)
        :returns: always True
        """
        return True

    def _folder_exclude_implementation(self, commit):
        """
        This implementation is based on the directory a file
        is present in, like "tests/" or "bin/".
        If all the files affected by a commit are present
        in directories which are mentioned in source_code_exclude_list,
        that commit will not be considered for analysis.

        :param commit: a commit structure (a dictionary with a 'files' key)
        :returns: True if at least one file lies outside the excluded
            directories, or if no exclude list was given
        """
        if self.source_code_exclude_list is None:
            return True

        # str.startswith accepts a tuple of prefixes.
        excluded_prefixes = tuple(self.source_code_exclude_list)
        # A commit without files yields False, i.e. it is excluded.
        return any(not file['file'].startswith(excluded_prefixes)
                   for file in commit['files'])

    def _extension_exclude_implementation(self, commit):
        """
        This implementation is based on the extensions of the files involved
        in a commit, like "py", "json", etc.
        If all the files affected by a commit have extensions which are
        present in the source_code_exclude_list parameter, that commit will
        not be considered for the analysis.

        :param commit: a commit structure (a dictionary with a 'files' key)
        :returns: True if at least one file has an extension which is not
            excluded, or if no exclude list was given
        """
        extension_set = {SourceCode._get_extension(file)
                         for file in commit['files']}

        if self.source_code_exclude_list is None:
            return True
        return bool(extension_set.difference(self.source_code_exclude_list))

    @staticmethod
    def _get_extension(file):
        """
        Given a file structure, which is a dictionary and an element
        of commit['files'], return the extension of that file.

        The extension is the text after the last "." of the file's base
        name, so "pkg/a.test.py" gives "py" and ".gitignore" gives
        "gitignore". For files without a ".xyz" extension, like LICENCE
        or AUTHORS, the "other" extension is used.

        :param file: a file structure which is a dictionary, an element
            of commit["files"]
        :returns: the extension as a string, without the leading dot
        """
        # Look only at the last path component so that dots in directory
        # names are not mistaken for an extension separator.
        base_name = file['file'].rsplit('/', 1)[-1]
        if '.' in base_name:
            return base_name.rsplit('.', 1)[-1]
        return "other"
Loading

0 comments on commit 8831e49

Please sign in to comment.