diff --git a/bigbang/archive.py b/bigbang/archive.py index 7352eacd..8ed0e30e 100644 --- a/bigbang/archive.py +++ b/bigbang/archive.py @@ -17,10 +17,7 @@ def load(path): class Archive(object): - - """ - A representation of a mailing list archive. - """ + """A representation of a mailing list archive.""" data = None activity = None @@ -29,7 +26,7 @@ class Archive(object): def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False): """ - Initializes an Archive object. + Initialize an Archive object. The behavior of the constructor depends on the type of its first argument, data. @@ -110,6 +107,7 @@ def __init__(self, data, archive_dir=CONFIG.mail_path, mbox=False): raise mailman.MissingDataException('Archive after initial processing is empty. Was data collected properly?') def resolve_entities(self,inplace=True): + """Return data with resolved entities.""" if self.entities is None: if self.activity is None: self.get_activity() @@ -139,6 +137,7 @@ def resolve_entities(self,inplace=True): def get_activity(self,resolved=False): """ Get the activity matrix of an Archive. + Columns of the returned DataFrame are the Senders of emails. Rows are indexed by ordinal date. Cells are the number of emails sent by each sender on each data. @@ -158,6 +157,7 @@ def get_activity(self,resolved=False): return self.activity def compute_activity(self, clean=True): + """Return the computed activity.""" mdf = self.data if clean: @@ -181,6 +181,7 @@ def compute_activity(self, clean=True): return activity def get_threads(self, verbose=False): + """Get threads.""" if self.threads is not None: return self.threads @@ -222,12 +223,14 @@ def get_threads(self, verbose=False): return threads def save(self, path,encoding='utf-8'): + """Save data to csv file.""" self.data.to_csv(path, ",",encoding=encoding) def find_footer(messages,number=1): ''' Returns the footer of a DataFrame of emails. + A footer is a string occurring at the tail of most messages. 
Messages can be a DataFrame or a Series ''' diff --git a/bigbang/entity_resolution.py b/bigbang/entity_resolution.py index f6be04c5..84fe5477 100644 --- a/bigbang/entity_resolution.py +++ b/bigbang/entity_resolution.py @@ -8,6 +8,8 @@ currID = 1; def getID(name, email): + """Get ID from a name and email.""" + global currID; global emailsID global namesID @@ -42,23 +44,29 @@ def getID(name, email): return nameID; -def store(id, name, email) : - if id not in allID: - allID[id] = {"emails": list(), "names": list()} - fullID = allID[id]; - namesID[name] = id; - emailsID[email] = id; - fullID["names"].append(name); - fullID["emails"].append(email); +def store(id, name, email): + """Store name and email by ID.""" + + if id not in allID: + allID[id] = {"emails": list(), "names": list()} + fullID = allID[id]; + namesID[name] = id; + emailsID[email] = id; + fullID["names"].append(name); + fullID["emails"].append(email); def name_for_id(id): + """Return name by ID.""" + if id in allID: if "names" in allID[id] and len(allID[id]["names"]) > 0: return allID[id]["names"][0] return "UNKNOWN " + str(id) def entity_resolve(row, emailCol, nameCol): + """Return a row with name and email by ID.""" + emailAddress = row[emailCol].upper(); emailAddress = emailAddress.replace(" AT ", "@") @@ -73,4 +81,4 @@ def entity_resolve(row, emailCol, nameCol): if nameCol is not None : name = row[nameCol].upper() row["Person-ID"] = getID(name, emailAddress) - return row \ No newline at end of file + return row diff --git a/bigbang/git_repo.py b/bigbang/git_repo.py index 3f75cfd2..da3f18da 100644 --- a/bigbang/git_repo.py +++ b/bigbang/git_repo.py @@ -11,24 +11,32 @@ ALL_ATTRIBUTES = CONFIG.all_attributes #["HEXSHA", "Committer Name", "Committer Email", "Commit Message", "Time", "Parent Commit", "Touched File"] -def cache_fixer(r): # Adds info from row to graph +def cache_fixer(r): + """Adds info from row to graph.""" r["Touched File"] = [x.strip() for x in r["Touched File"][1:-1].split(",")] 
r["Time"] = pd.to_datetime(r["Time"]); return r -""" -Class that stores an instance of a git repository given the address to that -repo relative to this file. It returns the data in multiple useful forms. -""" -class GitRepo(object): - """ A pandas DataFrame object indexed by time that stores - the raw form of the repo's commit data as a table where - each row is a commit and each col represents an attribute - of that commit (time, message, commiter name, committer email, - commit hexsha) +class GitRepo(object): + """ + Store a git repository given the address to that repo relative to this file. + + It returns the data in many forms. """ + def __init__(self, name, url=None, attribs = ALL_ATTRIBUTES, cache=None): + """ + Index a Pandas DataFrame object by time. + + It stores the raw form of the repo's commit data as a table. + + Each row in this table is a commit. + + Each column represents an attribute of that commit: + (e.g.: time, message, committer name, committer email, commit hexsha). 
+ """ + self._commit_data = None; self.url = url; self.repo = None @@ -57,6 +65,7 @@ def __init__(self, name, url=None, attribs = ALL_ATTRIBUTES, cache=None): self._commit_data = self._commit_data.apply(lambda row: entity_resolve(row, "Committer Email", "Committer Name"), axis=1) def gen_data(self, repo, raw): + """Generate data from a repo.""" if not repo.active_branch.is_valid(): print("Found an empty repo: " + str(self.name)) @@ -101,6 +110,7 @@ def gen_data(self, repo, raw): def populate_data(self, attribs = ALL_ATTRIBUTES): + """Populate data.""" raw = dict() for attrib in attribs: raw[attrib] = list(); @@ -114,27 +124,33 @@ def populate_data(self, attribs = ALL_ATTRIBUTES): self._commit_data = pd.DataFrame(raw, index = time_index); def by_committer(self): + """Return commit data grouped by committer.""" return self.commit_data.groupby('Committer Name').size().order() def commits_per_day(self): + """Return commits grouped by day.""" ans = self.commit_data.groupby(self.commit_data.index).size() ans = ans.resample("D", how=np.sum) return ans; def commits_per_week(self): + """Return commits grouped by week.""" ans = self.commits_per_day(); ans = ans.resample("W", how=np.sum) return ans; def commits_per_day_full(self): + """Return commits grouped by day and by committer.""" ans = self.commit_data.groupby([self.commit_data.index, "Committer Name" ]).size() return ans; @property def commit_data(self): + """Return commit data.""" return self._commit_data; def commits_for_committer(self, committer_name): + """Return commits for a committer given the committer name.""" full_info = self.commit_data time_index = pd.DatetimeIndex(self.commit_data["Time"], periods = 24, freq = "H"); @@ -145,14 +161,13 @@ def commits_for_committer(self, committer_name): return df def merge_with_repo(self, other): + """Append commit data from another repo.""" # TODO: What if commits have the same time? 
self._commit_data = self.commit_data.append(other.commit_data); class MultiGitRepo(GitRepo): + """Repos must have a "Repo Name" column.""" - """ - Repos must have a "Repo Name" column - """ def __init__(self, repos, attribs=ALL_ATTRIBUTES): self._commit_data = repos[0].commit_data.copy(deep=True); for i in range(1, len(repos)): diff --git a/bigbang/graph.py b/bigbang/graph.py index beedd5cd..5b502e41 100644 --- a/bigbang/graph.py +++ b/bigbang/graph.py @@ -10,6 +10,7 @@ def messages_to_reply_graph(messages): + """Return a graph given messages.""" G = nx.DiGraph() @@ -32,6 +33,7 @@ def messages_to_reply_graph(messages): def messages_to_interaction_graph(messages, verbose=False,clean=True): + """Return an interaction graph given messages.""" IG = nx.DiGraph() @@ -85,8 +87,8 @@ def messages_to_interaction_graph(messages, verbose=False,clean=True): return IG -# turn an interaction graph into a weighted edge matrix def interaction_graph_to_matrix(dg): + """Turn an interaction graph into a weighted edge matrix.""" nodes = dg.nodes() n_nodes = len(nodes) @@ -103,9 +105,12 @@ def interaction_graph_to_matrix(dg): return matrix -# Ulanowicz ecosystem health measures -# input is weighted adjacency matrix def ascendancy(am): + """ + Ulanowicz ecosystem health measures. + Input is weighted adjacency matrix. + """ + # total system throughput tst = np.sum(am) @@ -124,6 +129,7 @@ def ascendancy(am): def capacity(am): + """Return the capacity given an adjacency matrix.""" # total system throughput tst = np.sum(am) @@ -133,6 +139,7 @@ def capacity(am): def overhead(am): + """Return overhead given an adjacency matrix.""" # could be more efficient... 
return capacity(am) - ascendancy(am) @@ -141,6 +148,8 @@ def overhead(am): def compute_ascendancy(messages, duration=50): + """Compute ascendancy given messages.""" + print('compute ascendancy') dated_messages = {} diff --git a/bigbang/mailman.py b/bigbang/mailman.py index 603d7dff..d09e3c49 100644 --- a/bigbang/mailman.py +++ b/bigbang/mailman.py @@ -52,10 +52,9 @@ def __str__(self): def load_data(name,archive_dir=CONFIG.mail_path,mbox=False): """ - Loads the data associated with an archive name, given - as a string. + Load the data associated with an archive name, given as a string. - Attempts to open {archives-directory}/NAME.csv as data. + Attempt to open {archives-directory}/NAME.csv as data. Failing that, if the the name is a URL, it will try to derive the list name from that URL and load the .csv again. @@ -89,6 +88,8 @@ def load_data(name,archive_dir=CONFIG.mail_path,mbox=False): def collect_from_url(url, archive_dir=CONFIG.mail_path, notes=None): + """Collect data from a given url.""" + url = url.rstrip() try: has_archives = collect_archive_from_url(url, archive_dir=archive_dir, notes=notes) @@ -126,6 +127,7 @@ def collect_from_url(url, archive_dir=CONFIG.mail_path, notes=None): return None def urls_to_collect(urls_file): + """Collect urls given urls in a file.""" urls = [] for url in open(urls_file): url = url.strip() @@ -140,14 +142,16 @@ def urls_to_collect(urls_file): return urls def collect_from_file(urls_file, archive_dir=CONFIG.mail_path, notes=None): + """Collect urls from a file.""" urls = urls_to_collect(urls_file) for url in urls: collect_from_url(url, archive_dir=archive_dir, notes=notes) def get_list_name(url): """ - Returns the 'list name' from a canonical mailman archive url. - Otherwise returns the same URL. + Return the 'list name' from a canonical mailman archive url. + + Otherwise return the same URL. 
""" # TODO: it would be better to catch these non-url cases earlier url = url.rstrip() @@ -160,10 +164,11 @@ def get_list_name(url): def normalize_archives_url(url): """ - Given a URL, will try to infer, find or guess the most useful - archives URL. + Normalize url. + + Will try to infer, find or guess the most useful archives URL, given a URL. - Returns normalized URL, or the original URL if no improvement is found. + Return normalized URL, or the original URL if no improvement is found. """ # change new IETF mailarchive URLs to older, still available text .mail archives new_ietf_exp = re.compile('https://mailarchive\\.ietf\\.org/arch/search/' @@ -183,15 +188,14 @@ def normalize_archives_url(url): def archive_directory(base_dir, list_name): + """Return the archive directory for a list, creating it if it does not exist.""" arc_dir = os.path.join(base_dir, list_name) if not os.path.exists(arc_dir): os.makedirs(arc_dir) return arc_dir def populate_provenance(directory, list_name, list_url, notes=None): - """ - Creates a provenance metadata file for current mailing list collection. - """ + """Create a provenance metadata file for current mailing list collection.""" provenance = { 'list': { 'list_name': list_name, @@ -218,9 +222,7 @@ def populate_provenance(directory, list_name, list_url, notes=None): file_handle.close() def access_provenance(directory): - """ - Returns an object with provenance information located in the given directory, or None if no provenance was found. - """ + """Return an object with provenance information located in the given directory, or None if no provenance was found.""" file_path = os.path.join(directory, PROVENANCE_FILENAME) if os.path.isfile(file_path): # a provenance file already exists file_handle = file(file_path, 'r') @@ -229,9 +231,7 @@ def access_provenance(directory): - """ - Updates provenance file with given object. 
- """ + """Update provenance file with given object.""" file_path = os.path.join(directory, PROVENANCE_FILENAME) file_handle = file(file_path, 'w') yaml.dump(provenance, file_handle) @@ -240,10 +240,9 @@ def update_provenance(directory, provenance): def collect_archive_from_url(url, archive_dir=CONFIG.mail_path, notes=None): """ - Collects archives (generally tar.gz) files from mailmain - archive page. + Collect archive (generally tar.gz) files from the mailman archive page. - Returns True if archives were downloaded, False otherwise + Return True if archives were downloaded, False otherwise (for example if the page lists no accessible archive files). """ list_name = get_list_name(url) @@ -295,6 +294,7 @@ def collect_archive_from_url(url, archive_dir=CONFIG.mail_path): def unzip_archive(url, archive_dir=CONFIG.mail_path): + """Unzip archive files.""" arc_dir = archive_directory(archive_dir, get_list_name(url)) gzs = [os.path.join(arc_dir, fn) for fn @@ -327,6 +327,7 @@ def unzip_archive(url, archive_dir=CONFIG.mail_path): # The payload of a Message may be a String, a Message, or a list of Messages. # OR maybe it's never just a Message, but always a list of them. def recursive_get_payload(x): + """Get payloads recursively.""" if isinstance(x,str): return x elif isinstance(x,list): @@ -340,7 +341,7 @@ def recursive_get_payload(x): def open_list_archives(url, archive_dir=CONFIG.mail_path, mbox=False): """ - Returns a list of all email messages contained in the specified directory. + Return a list of all email messages contained in the specified directory. The argument *url* here is taken to be the name of a subdirectory of the directory specified in argument *archive_dir*. @@ -388,8 +389,9 @@ def open_list_archives(url, archive_dir=CONFIG.mail_path, mbox=False): def open_activity_summary(url, archive_dir=CONFIG.mail_path): """ - Opens the message activity summary for a particular mailing list (as specified by url) - and returns the dataframe. 
Returns None if no activity summary export file is found. + Open the message activity summary for a particular mailing list (as specified by url). + + Return the dataframe, or return None if no activity summary export file is found. """ list_name = get_list_name(url) arc_dir = archive_directory(archive_dir, list_name) @@ -408,6 +410,7 @@ def open_activity_summary(url, archive_dir=CONFIG.mail_path): return activity_frame def get_text(msg): + """Get text given a message.""" ## This code for character detection and dealing with exceptions is terrible ## It is in need of refactoring badly. - sb import chardet @@ -456,7 +459,6 @@ def messages_to_dataframe(messages): """ Turn a list of parsed messages into a dataframe of message data, indexed by message-id, with column-names from headers. - """ def safe_unicode(t): return t and unicode(t, 'utf-8', 'replace')