Skip to content

Commit

Permalink
Add docstrings partially
Browse files Browse the repository at this point in the history
Edit docstrings in half of *.py files.
Base the changes on [PEP 257](https://www.python.org/dev/peps/pep-0257).

Address issue datactive#341, partially.
  • Loading branch information
paulolimac committed Oct 30, 2018
1 parent 0f29919 commit 5f70f90
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 55 deletions.
13 changes: 8 additions & 5 deletions bigbang/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,7 @@ def load(path):


class Archive(object):

"""
A representation of a mailing list archive.
"""
"""A representation of a mailing list archive."""

data = None
activity = None
Expand All @@ -29,7 +26,7 @@ class Archive(object):

def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
"""
Initializes an Archive object.
Initialize an Archive object.
The behavior of the constructor depends on the type
of its first argument, data.
Expand Down Expand Up @@ -110,6 +107,7 @@ def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False):
raise mailman.MissingDataException('Archive after initial processing is empty. Was data collected properly?')

def resolve_entities(self,inplace=True):
"""Return data with resolved entities."""
if self.entities is None:
if self.activity is None:
self.get_activity()
Expand Down Expand Up @@ -139,6 +137,7 @@ def resolve_entities(self,inplace=True):
def get_activity(self,resolved=False):
"""
Get the activity matrix of an Archive.
Columns of the returned DataFrame are the Senders of emails.
Rows are indexed by ordinal date.
Cells are the number of emails sent by each sender on each data.
Expand All @@ -158,6 +157,7 @@ def get_activity(self,resolved=False):
return self.activity

def compute_activity(self, clean=True):
"""Return the computed activity."""
mdf = self.data

if clean:
Expand All @@ -181,6 +181,7 @@ def compute_activity(self, clean=True):
return activity

def get_threads(self, verbose=False):
"""Get threads."""

if self.threads is not None:
return self.threads
Expand Down Expand Up @@ -222,12 +223,14 @@ def get_threads(self, verbose=False):
return threads

def save(self, path, encoding='utf-8'):
    """Write the archive's message DataFrame to a CSV file.

    :param path: destination file path.
    :param encoding: text encoding for the output file (default utf-8).
    """
    # Pass sep/encoding by keyword: positional arguments to to_csv
    # beyond the path are deprecated/removed in modern pandas.
    self.data.to_csv(path, sep=",", encoding=encoding)


def find_footer(messages,number=1):
'''
Returns the footer of a DataFrame of emails.
A footer is a string occurring at the tail of most messages.
Messages can be a DataFrame or a Series
'''
Expand Down
26 changes: 17 additions & 9 deletions bigbang/entity_resolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
currID = 1;

def getID(name, email):
"""Get ID from a name and email."""

global currID;
global emailsID
global namesID
Expand Down Expand Up @@ -42,23 +44,29 @@ def getID(name, email):

return nameID;

def store(id, name, email) :
if id not in allID:
allID[id] = {"emails": list(), "names": list()}
fullID = allID[id];
namesID[name] = id;
emailsID[email] = id;
fullID["names"].append(name);
fullID["emails"].append(email);
def store(id, name, email):
    """Record *name* and *email* under the given entity *id*.

    Creates the id's record in the global registry on first use, then
    updates the name->id and email->id lookup tables and appends the
    new alias pair to the record.
    """
    record = allID.setdefault(id, {"emails": [], "names": []})
    namesID[name] = id
    emailsID[email] = id
    record["names"].append(name)
    record["emails"].append(email)


def name_for_id(id):
    """Return the first recorded name for *id*, or an "UNKNOWN <id>" placeholder."""
    record = allID.get(id)
    if record and record.get("names"):
        return record["names"][0]
    return "UNKNOWN " + str(id)

def entity_resolve(row, emailCol, nameCol):
"""Return a row with name and email by ID."""

emailAddress = row[emailCol].upper();
emailAddress = emailAddress.replace(" AT ", "@")

Expand All @@ -73,4 +81,4 @@ def entity_resolve(row, emailCol, nameCol):
if nameCol is not None :
name = row[nameCol].upper()
row["Person-ID"] = getID(name, emailAddress)
return row
return row
43 changes: 29 additions & 14 deletions bigbang/git_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,32 @@

ALL_ATTRIBUTES = CONFIG.all_attributes #["HEXSHA", "Committer Name", "Committer Email", "Commit Message", "Time", "Parent Commit", "Touched File"]

def cache_fixer(r): # Adds info from row to graph
def cache_fixer(r):
    """Normalize one cached commit row in place and return it.

    Converts the "Touched File" field from its serialized "[a, b]"
    string form back into a list of file names, and parses "Time"
    into a pandas datetime value.
    """
    files = r["Touched File"][1:-1].split(",")
    r["Touched File"] = [name.strip() for name in files]
    r["Time"] = pd.to_datetime(r["Time"])
    return r

"""
Class that stores an instance of a git repository given the address to that
repo relative to this file. It returns the data in multiple useful forms.
"""
class GitRepo(object):

""" A pandas DataFrame object indexed by time that stores
the raw form of the repo's commit data as a table where
each row is a commit and each col represents an attribute
of that commit (time, message, commiter name, committer email,
commit hexsha)
class GitRepo(object):
"""
Store a git repository given the address to that repo relative to this file.
It returns the data in many forms.
"""

def __init__(self, name, url=None, attribs = ALL_ATTRIBUTES, cache=None):
"""
Index a Pandas DataFrame object by time.
That stores the raw form of the repo's commit data as a table.
Each row in this table is a commit.
And each column represents an attribute of that commit:
(eg.: time, message, commiter name, committer email, commit hexsha).
"""

self._commit_data = None;
self.url = url;
self.repo = None
Expand Down Expand Up @@ -57,6 +65,7 @@ def __init__(self, name, url=None, attribs = ALL_ATTRIBUTES, cache=None):
self._commit_data = self._commit_data.apply(lambda row: entity_resolve(row, "Committer Email", "Committer Name"), axis=1)

def gen_data(self, repo, raw):
"""Generate data to repo."""

if not repo.active_branch.is_valid():
print("Found an empty repo: " + str(self.name))
Expand Down Expand Up @@ -101,6 +110,7 @@ def gen_data(self, repo, raw):


def populate_data(self, attribs = ALL_ATTRIBUTES):
"""Populate data."""
raw = dict()
for attrib in attribs:
raw[attrib] = list();
Expand All @@ -114,27 +124,33 @@ def populate_data(self, attribs = ALL_ATTRIBUTES):
self._commit_data = pd.DataFrame(raw, index = time_index);

def by_committer(self):
    """Return the number of commits per committer, sorted ascending.

    :return: Series indexed by committer name with commit counts.
    """
    # Series.order() was removed from pandas; sort_values() is the
    # supported equivalent.
    return self.commit_data.groupby('Committer Name').size().sort_values()

def commits_per_day(self):
    """Return the total number of commits made on each day.

    :return: Series indexed by day with commit counts.
    """
    ans = self.commit_data.groupby(self.commit_data.index).size()
    # resample(..., how=np.sum) was removed from pandas; the
    # method-chained form is the supported replacement.
    ans = ans.resample("D").sum()
    return ans

def commits_per_week(self):
    """Return the total number of commits made in each week.

    :return: Series indexed by week with commit counts.
    """
    ans = self.commits_per_day()
    # resample(..., how=np.sum) was removed from pandas; the
    # method-chained form is the supported replacement.
    ans = ans.resample("W").sum()
    return ans

def commits_per_day_full(self):
    """Return commit counts grouped by both day and committer name."""
    data = self.commit_data
    return data.groupby([data.index, "Committer Name"]).size()

@property
def commit_data(self):
    """The raw commit table (one row per commit), as a pandas DataFrame."""
    return self._commit_data

def commits_for_committer(self, committer_name):
"""Return commits for committer given the commiter name."""
full_info = self.commit_data
time_index = pd.DatetimeIndex(self.commit_data["Time"], periods = 24, freq = "H");

Expand All @@ -145,14 +161,13 @@ def commits_for_committer(self, committer_name):
return df

def merge_with_repo(self, other):
    """Append another repo's commit data to this repo's, in place.

    :param other: a GitRepo whose commit_data is concatenated onto ours.
    """
    # TODO: What if commits have the same time?
    # DataFrame.append() was removed in pandas 2.0; concat is the
    # supported replacement.
    self._commit_data = pd.concat([self.commit_data, other.commit_data])

class MultiGitRepo(GitRepo):
"""Repos must have a "Repo Name" column."""

"""
Repos must have a "Repo Name" column
"""
def __init__(self, repos, attribs=ALL_ATTRIBUTES):
self._commit_data = repos[0].commit_data.copy(deep=True);
for i in range(1, len(repos)):
Expand Down
15 changes: 12 additions & 3 deletions bigbang/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@


def messages_to_reply_graph(messages):
"""Return a graph given messages."""

G = nx.DiGraph()

Expand All @@ -32,6 +33,7 @@ def messages_to_reply_graph(messages):


def messages_to_interaction_graph(messages, verbose=False,clean=True):
"""Return a interactable graph given messages."""

IG = nx.DiGraph()

Expand Down Expand Up @@ -85,8 +87,8 @@ def messages_to_interaction_graph(messages, verbose=False,clean=True):
return IG


# turn an interaction graph into a weighted edge matrix
def interaction_graph_to_matrix(dg):
"""Turn an interaction graph into a weighted edge matrix."""
nodes = dg.nodes()

n_nodes = len(nodes)
Expand All @@ -103,9 +105,12 @@ def interaction_graph_to_matrix(dg):
return matrix


# Ulanowicz ecosystem health measures
# input is weighted adjacency matrix
def ascendancy(am):
"""
Ulanowicz ecosystem health measures
Input is weighted adjacency matrix.
"""

# total system throughput
tst = np.sum(am)

Expand All @@ -124,6 +129,7 @@ def ascendancy(am):


def capacity(am):
"""Return the capacity given a adjacency matrix."""
# total system throughput
tst = np.sum(am)

Expand All @@ -133,6 +139,7 @@ def capacity(am):


def overhead(am):
    """Return the system overhead (capacity minus ascendancy) of a weighted adjacency matrix."""
    # NOTE: capacity() and ascendancy() each rescan the matrix, so this
    # repeats work; could be more efficient.
    total_capacity = capacity(am)
    asc = ascendancy(am)
    return total_capacity - asc

Expand All @@ -141,6 +148,8 @@ def overhead(am):


def compute_ascendancy(messages, duration=50):
"""Compute ascendancy given messages."""

print('compute ascendancy')
dated_messages = {}

Expand Down
Loading

0 comments on commit 5f70f90

Please sign in to comment.