Commit: init

davidhin committed Mar 11, 2021
0 parents commit 9797ab0
Showing 12 changed files with 357 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
*.img
*.simg
.pytest_cache
__pycache__
storage
*.egg-info
output
.vscode/sftp.json
12 changes: 12 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,12 @@
{
    "jupyter.jupyterServerType": "local",
    "files.exclude": {
        ".pytest_cache": true,
        "**/*.egg-info": true,
        "**/__pycache__": true,
        "**/*.img": true,
        "**/*.simg": true,
        "**/*.sif": true
    },
    "python.pythonPath": "/home/david/Documents/miniconda/bin/python"
}
21 changes: 21 additions & 0 deletions Singularity
@@ -0,0 +1,21 @@
Bootstrap: docker
From: python:3.8-slim

%labels
    MAINTAINER admin
    WHATAMI admin

%files
    cli.sh /cli.sh
    requirements.txt /requirements.txt

%runscript
    exec /bin/bash /cli.sh "$@"

%post
    chmod u+x /cli.sh

    # Install build dependencies, then Python requirements
    apt-get update
    apt-get install -y build-essential
    pip install -r /requirements.txt
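
With Singularity installed, the image can be built and run roughly as follows (a sketch; the image name container.sif is an arbitrary choice, and arguments after the image name are forwarded to cli.sh by the %runscript):

sudo singularity build container.sif Singularity
singularity run container.sif -p initialise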
48 changes: 48 additions & 0 deletions cli.sh
@@ -0,0 +1,48 @@
#!/bin/bash

echo "Welcome to the CLI! \
Please use '-p initialise' if it's your first time. \
This CLI must be run from the root directory of the cloned repository."

usage() {
    echo "Usage: $0 \
[-h help] \
[-t run tests] \
[-p run program <initialise|path_to_file>] \
[-a arguments]" 1>&2
    exit 1
}

while getopts ":hp:a:t" opt; do
    case ${opt} in
        h)
            usage
            ;;
        p)
            p=${OPTARG}
            ;;
        a)
            a+=("${OPTARG}")
            ;;
        t)
            pytest tests/
            ;;
        \?)
            echo "Invalid option: -${OPTARG}"
            usage
            ;;
    esac
done
shift $((OPTIND - 1))

# Download data and install main code
if [[ "initialise" == "${p}" ]]; then
    pip install -e .
    exit 0
fi

# Run Python program
if [[ -z "${p}" ]]; then
    usage
else
    python3 -u "${p}" "${a[@]}"
fi
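
Typical invocations (illustrative examples, not part of this commit; the script path passed to -p assumes the files added here):

bash cli.sh -p initialise
bash cli.sh -t
bash cli.sh -p singghtorrent/analysis/main.py -a 1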
Empty file added hpc/logs/placeholder
Empty file.
1 change: 1 addition & 0 deletions readme.md
@@ -0,0 +1 @@
# Download GHTorrent Data
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
numpy==1.18.5
pandas==1.1.5
gdown==3.12.2
tqdm==4.58.0
pytest==6.2.2
fastparquet==0.5.0
3 changes: 3 additions & 0 deletions setup.py
@@ -0,0 +1,3 @@
from setuptools import find_packages, setup

setup(name="singghtorrent", version="1.0", packages=find_packages())
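
This three-line setup script is what the CLI's initialise step installs via pip install -e ., making the singghtorrent package (and its path helpers below) importable from anywhere.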
50 changes: 50 additions & 0 deletions singghtorrent/__init__.py
@@ -0,0 +1,50 @@
"""Set up project paths."""
from pathlib import Path


def project_root() -> Path:
"""Get project path."""
return Path(__file__).parent.parent


def storage_root() -> Path:
"""Get storage path."""
return Path(__file__).parent.parent / "storage"


def storage_external_root() -> Path:
"""Get storage external path."""
path = storage_root() / "external"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def storage_interim_root() -> Path:
"""Get storage interim path."""
path = storage_root() / "interim"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def storage_processed_root() -> Path:
"""Get storage procesTsed path."""
path = storage_root() / "processed"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def outputs_root() -> Path:
"""Get output path."""
path = Path(__file__).parent.parent / "output"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def get_path(path) -> Path:
"""Get path, if exists. If not, create it."""
Path(path).mkdir(exist_ok=True, parents=True)
return path


# https://stackoverflow.com/a/50194143/1889006
# https://stackoverflow.com/a/53465812/1889006
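
A minimal illustration of these helpers (assuming the package has been installed with pip install -e .):

import singghtorrent as sg

print(sg.project_root())           # repository root
print(sg.storage_external_root())  # storage/external, created on first call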
34 changes: 34 additions & 0 deletions singghtorrent/analysis/main.py
@@ -0,0 +1,34 @@
import sys
from pathlib import Path

import numpy as np
import singghtorrent as sg
from singghtorrent.helpers import dl_ghtorrent as dg

# Setup
NUM_JOBS = 200
JOB_ARRAY_NUMBER = int(sys.argv[1]) - 1  # job array indices start at 1
START_YEAR = 2015
END_YEAR = 2021

# Create paths
Path(sg.storage_external_root() / "ghtorrent/").mkdir(exist_ok=True)
Path(sg.storage_interim_root() / "ghtorrent").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "pr_comments/").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "commit_messages/").mkdir(exist_ok=True)

# Generate job array mapping
Path(sg.storage_interim_root() / "hpc_mapping/").mkdir(exist_ok=True)

# Get dates
all_dates = []
for year in range(START_YEAR, END_YEAR + 1):
    all_dates += dg.get_dates_for_year(year)

# Split the dates across NUM_JOBS jobs and take this job's slice
splits = np.array_split(all_dates, NUM_JOBS)  # Approx 3 hours each
split = splits[JOB_ARRAY_NUMBER]

# Download
for date in split:
    dg.download_github_day(date)
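
The script expects a 1-based job array index as its only argument, so a single slice can be run locally as (illustrative):

python -u singghtorrent/analysis/main.py 1

On an HPC cluster it would typically be launched as a job array (e.g. Slurm's --array=1-200 with $SLURM_ARRAY_TASK_ID as the argument); the exact scheduler is an assumption, though the hpc/logs directory suggests this usage.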
170 changes: 170 additions & 0 deletions singghtorrent/helpers/dl_ghtorrent.py
@@ -0,0 +1,170 @@
import gzip
import json
import os
from calendar import Calendar
from datetime import date
from glob import glob
from multiprocessing.pool import Pool

import pandas as pd
import requests
import singghtorrent as sg
from tqdm import tqdm


def should_skip(date: str, stage: str = ""):
    """Check whether work for this date can be skipped.

    By default, skip if either the interim parquet files or the raw
    download exist; with stage="interim", skip only if the parquet
    files exist.
    """
    ext_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
    if os.path.exists(df_prc_path) and os.path.exists(df_cm_path):
        print("Already interimmed.")
        return True
    elif stage == "interim":
        return False
    if os.path.exists(ext_path):
        print("Already downloaded.")
        return True
    return False


def download_gh_event(date: str):
    """Download one hourly event archive from GH Archive (data.gharchive.org).

    From: https://github.com/src-d/datasets/blob/master/ReviewComments/PR_review_comments_generation.ipynb

    Args:
        date (str): Date in YYYY-MM-DD-h format, e.g. 2021-01-01-0
    """
    url = "http://data.gharchive.org/{}.json.gz".format(date)
    saveurl = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    if should_skip(date):
        return
    r = requests.get(url)
    with open(saveurl, "wb") as f:
        f.write(r.content)


def get_github_data(path: str) -> tuple:
    """Get PR comments and commit messages from events."""
    COLUMNS = ["COMMENT_ID", "COMMIT_ID", "URL", "AUTHOR", "CREATED_AT", "BODY"]
    comments_list = []
    commits_list = []
    for line in tqdm(gzip.open(path).readlines()):
        event = json.loads(line)
        if event["type"] == "PullRequestReviewCommentEvent":
            comments_list.append(
                [
                    event["payload"]["comment"]["id"],
                    event["payload"]["comment"]["commit_id"],
                    event["payload"]["comment"]["html_url"],
                    event["payload"]["comment"]["user"]["login"],
                    event["payload"]["comment"]["created_at"],
                    event["payload"]["comment"]["body"],
                ]
            )
        if event["type"] == "PushEvent":
            commits_list += event["payload"]["commits"]
    pr_comments_df = pd.DataFrame(comments_list, columns=COLUMNS)
    commit_message_df = pd.DataFrame.from_records(commits_list).drop_duplicates(
        subset="sha"
    )[["message", "url"]]
    return pr_comments_df, commit_message_df


def download_github_data(date: str):
    """Download and parse PR comments and commits given YYYY-MM-DD-h."""
    download_gh_event(date)
    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
    if should_skip(date, "interim"):
        # Already parsed: read back so callers always receive the dataframes
        return pd.read_parquet(df_prc_path), pd.read_parquet(df_cm_path)
    ext_dl_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    df_prc, df_cm = get_github_data(ext_dl_path)
    df_prc.to_parquet(df_prc_path, index=False, compression="gzip")
    df_cm.to_parquet(df_cm_path, index=False, compression="gzip")
    return df_prc, df_cm


def delete_glob(globstr: str):
    """Delete files using glob."""
    for f in glob(globstr):
        os.remove(f)


def download_github_day(date: tuple):
    """Download a full day, given as a (year, month, day) tuple."""
    dates = generate_date_strs(date[0], date[1], date[2])
    date3 = "{}-{:02d}-{:02d}".format(date[0], date[1], date[2])
    proc_prc_path = (
        sg.storage_processed_root() / "pr_comments" / "{}-prc.parquet".format(date3)
    )
    cm_prc_path = (
        sg.storage_processed_root() / "commit_messages" / "{}-cm.parquet".format(date3)
    )
    if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
        return "Already processed {}".format(date3)

    for d in dates:
        download_github_data(d)
        if d.split("-")[3] == "23":  # after the last hour, merge the day
            prc_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-prc*".format(date3))
            )
            cm_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-cm*".format(date3))
            )
            if len(prc_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in prc_paths])
                df.to_parquet(proc_prc_path, index=False, compression="gzip")
            if len(cm_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in cm_paths])
                df.to_parquet(cm_prc_path, index=False, compression="gzip")
            if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
                delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
                delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
    print("Finished {}!".format(date))
    return


def generate_date_strs(year: int, month: int, day: int) -> list:
    """Generate the 24 hourly date strings for a given day."""
    return ["{}-{:02d}-{:02d}-{}".format(year, month, day, i) for i in range(24)]


def get_dates_for_year(year: int) -> list:
    """Return list of (year, month, day) tuples for given year, up to today."""
    dates = []
    today = date.today()
    for m in range(1, 13):
        month_dates = [i for i in Calendar().itermonthdays3(year, m) if i[1] == m]
        for d in month_dates:
            if date(*d) >= today:  # archives from today onwards are incomplete
                return dates
            dates.append(d)
    return dates


def download_pool_hours(year: int, month: int, day: int) -> tuple:
    """Download one day's data in parallel and return the dataframes."""
    pool = Pool(4)
    dates = generate_date_strs(year, month, day)
    pr_comments_df = []
    commit_messages_df = []
    for result in pool.imap_unordered(download_github_data, dates):
        pr_comments_df.append(result[0])
        commit_messages_df.append(result[1])
    pool.close()
    pool.join()
    return pd.concat(pr_comments_df), pd.concat(commit_messages_df)
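
A usage sketch for these helpers (the date is arbitrary):

from singghtorrent.helpers import dl_ghtorrent as dg

# Download, parse, and merge all 24 hourly archives for 1 Jan 2021
dg.download_github_day((2021, 1, 1))

# Or fetch one day with 4 worker processes and keep the frames in memory
pr_comments, commit_messages = dg.download_pool_hours(2021, 1, 1)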
4 changes: 4 additions & 0 deletions tests/pytest.ini
@@ -0,0 +1,4 @@
[pytest]
filterwarnings =
    ignore::DeprecationWarning:tensorflow.*:
    ignore::DeprecationWarning:tensorboard.*:
