-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 9797ab0
Showing
12 changed files
with
357 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
*.img | ||
*.simg | ||
.pytest_cache | ||
__pycache__ | ||
storage | ||
*.egg-info | ||
output | ||
.vscode/sftp.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
{ | ||
"jupyter.jupyterServerType": "local", | ||
"files.exclude": { | ||
".pytest_cache": true, | ||
"**/*.egg-info": true, | ||
"**/__pycache__": true, | ||
"**/*.img": true, | ||
"**/*.simg": true, | ||
"**/*.sif": true | ||
}, | ||
"python.pythonPath": "/home/david/Documents/miniconda/bin/python" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Bootstrap:docker
From:python:3.8-slim

# Image metadata labels.
%labels
MAINTAINER admin
WHATAMI admin

# Copy the CLI entry point and the Python requirements into the image.
%files
cli.sh /cli.sh
requirements.txt /requirements.txt

# Running the container delegates straight to the CLI script.
%runscript
exec /bin/bash /cli.sh "$@"

# Build-time setup: make the CLI executable and install dependencies.
%post
chmod u+x /cli.sh

# Install dependencies here
apt update
apt install -y build-essential
pip install -r /requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#!/bin/bash

# Container entry point (see %runscript in the Singularity recipe).
# Dispatches to: initialisation (-p initialise), the test suite (-t),
# or an arbitrary Python program (-p FILE) with optional arguments (-a).

echo "Welcome to the CLI! \
Please use '-p initialise' if it's your first time. \
This CLI must be run in the directory folder of the cloned repository."

# Print usage to stderr and exit non-zero.
usage() {
    echo "Usage: $0 \
[-h help] \
[-t run tests] \
[-p run program <initialise|path_to_file>] \
[-a arguments]" 1>&2
    exit 1
}

# Parse options. Leading ':' enables silent error handling via the \? case.
while getopts ":hp:a:t" opt; do
    case ${opt} in
        h)
            usage
            ;;
        p)
            p=${OPTARG}
            ;;
        a)
            # -a may be repeated; accumulate values into an array.
            a+=("${OPTARG}")
            ;;
        t)
            # NOTE(review): tests run immediately when -t is parsed,
            # then option processing continues.
            pytest tests/
            ;;
        \?)
            echo "Invalid option"
            usage
            ;;
    esac
done
shift $((OPTIND - 1))

# Download data and install main code
if [[ "initialise" == "${p}" ]]; then
    pip install -e .
    exit 0
fi

# Run Python Program
if [[ -z "${p}" ]]; then
    usage
else
    # -u: unbuffered output so logs stream through the container.
    python3 -u "${p}" "${a[@]}"
fi
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Download GHTorrent Data |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
numpy==1.18.5 | ||
pandas==1.1.5 | ||
gdown==3.12.2 | ||
tqdm==4.58.0 | ||
pytest==6.2.2 | ||
fastparquet==0.5.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""Minimal packaging script for the ``singghtorrent`` package."""
from setuptools import find_packages, setup

# Installed via `pip install -e .` from cli.sh's initialise step.
setup(name="singghtorrent", version="1.0", packages=find_packages())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
"""Set up project paths.""" | ||
from pathlib import Path | ||
|
||
|
||
def project_root() -> Path:
    """Return the repository root (two levels above this module)."""
    here = Path(__file__)
    return here.parent.parent
|
||
|
||
def storage_root() -> Path:
    """Return the shared ``storage`` directory under the project root."""
    return Path(__file__).parents[1] / "storage"
|
||
|
||
def storage_external_root() -> Path:
    """Return ``storage/external``, creating it (and parents) if missing."""
    external_dir = storage_root() / "external"
    external_dir.mkdir(exist_ok=True, parents=True)
    return external_dir
|
||
|
||
def storage_interim_root() -> Path:
    """Return ``storage/interim``, creating it (and parents) if missing."""
    interim_dir = storage_root() / "interim"
    interim_dir.mkdir(exist_ok=True, parents=True)
    return interim_dir
|
||
|
||
def storage_processed_root() -> Path:
    """Return ``storage/processed``, creating it (and parents) if missing."""
    processed_dir = storage_root() / "processed"
    processed_dir.mkdir(exist_ok=True, parents=True)
    return processed_dir
|
||
|
||
def outputs_root() -> Path:
    """Return the ``output`` directory at the project root, creating it if missing."""
    out_dir = Path(__file__).parents[1] / "output"
    out_dir.mkdir(exist_ok=True, parents=True)
    return out_dir
|
||
|
||
def get_path(path) -> Path:
    """Return *path* as a ``Path``, creating the directory if it does not exist.

    Args:
        path: Directory path (``str`` or ``Path``).

    Returns:
        Path: The directory, always as a ``Path`` object.
    """
    # BUGFIX: previously the argument was returned unchanged, so a str input
    # came back as a str despite the `-> Path` annotation. Convert once.
    path = Path(path)
    path.mkdir(exist_ok=True, parents=True)
    return path
|
||
|
||
# https://stackoverflow.com/a/50194143/1889006 | ||
# https://stackoverflow.com/a/53465812/1889006 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
"""HPC job-array driver: download a slice of GH Archive days for this job."""
import sys
from pathlib import Path

import numpy as np
import singghtorrent as sg
from singghtorrent.helpers import dl_ghtorrent as dg

# Setup: the full date range is split across NUM_JOBS array jobs.
NUM_JOBS = 200
# Job-array indices are 1-based on the scheduler; convert to 0-based.
JOB_ARRAY_NUMBER = int(sys.argv[1]) - 1
START_YEAR = 2015
END_YEAR = 2021

# Create paths (idempotent; parents created by the sg.*_root() helpers).
Path(sg.storage_external_root() / "ghtorrent/").mkdir(exist_ok=True)
Path(sg.storage_interim_root() / "ghtorrent").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "pr_comments/").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "commit_messages/").mkdir(exist_ok=True)

# Generate job array mapping
Path(sg.storage_interim_root() / "hpc_mapping/").mkdir(exist_ok=True)

# Get dates: every (year, month, day) tuple from START_YEAR through END_YEAR.
all_dates = []
for year in range(START_YEAR, END_YEAR + 1):
    all_dates += dg.get_dates_for_year(year)

# Get NUM_JOBS: pick this job's slice of dates.
splits = np.array_split(all_dates, NUM_JOBS)  # Approx 3 hours each
split = splits[JOB_ARRAY_NUMBER]

# Download each day in the slice.
for date in split:
    dg.download_github_day(date)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,170 @@ | ||
import gzip | ||
import json | ||
import os | ||
from calendar import Calendar | ||
from datetime import date | ||
from glob import glob | ||
from multiprocessing.pool import Pool | ||
|
||
import pandas as pd | ||
import requests | ||
import singghtorrent as sg | ||
from tqdm import tqdm | ||
|
||
|
||
def should_skip(date: str, stage: str = ""):
    """Check hierarchically if data for *date* is already finished.

    Returns True when both interim parquet files exist, or — unless
    ``stage == "interim"`` — when the raw archive is already downloaded.
    """
    interim_dir = sg.storage_interim_root() / "ghtorrent"
    prc_exists = os.path.exists(interim_dir / "{}-prc.parquet".format(date))
    cm_exists = os.path.exists(interim_dir / "{}-cm.parquet".format(date))
    if prc_exists and cm_exists:
        print("Already interimmed.")
        return True
    if stage == "interim":
        return False
    raw_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    if os.path.exists(raw_path):
        print("Already downloaded.")
        return True
    return False
|
||
|
||
def download_gh_event(date: str):
    """Download one hourly event archive from gharchive.org.

    From: https://github.com/src-d/datasets/blob/master/ReviewComments/\
    PR_review_comments_generation.ipynb
    Date format in YYYY-MM-DD-hh

    Args:
        date (str): Date like 2021-01-01-0
    """
    if should_skip(date):
        return
    url = "http://data.gharchive.org/{}.json.gz".format(date)
    saveurl = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    r = requests.get(url)
    # BUGFIX: fail loudly on HTTP errors instead of silently writing an
    # error page to disk as .json.gz, which breaks gzip parsing downstream.
    r.raise_for_status()
    with open(saveurl, "wb") as f:
        f.write(r.content)
|
||
|
||
def get_github_data(path: str) -> pd.DataFrame:
    """Extract PR review comments and commit messages from one event archive.

    Args:
        path: Path to a GH Archive ``.json.gz`` file (one JSON event per line).

    Returns:
        Tuple of ``(pr_comments_df, commit_message_df)``.
    """
    COLUMNS = ["COMMENT_ID", "COMMIT_ID", "URL", "AUTHOR", "CREATED_AT", "BODY"]
    comments_list = []
    commits_list = []
    # Iterate the gzip stream lazily instead of readlines(), so the whole
    # (often very large) archive is never held in memory at once.
    with gzip.open(path) as fh:
        for line in tqdm(fh):
            event = json.loads(line)
            if event["type"] == "PullRequestReviewCommentEvent":
                comment = event["payload"]["comment"]
                comments_list.append(
                    [
                        comment["id"],
                        comment["commit_id"],
                        comment["html_url"],
                        comment["user"]["login"],
                        comment["created_at"],
                        comment["body"],
                    ]
                )
            if event["type"] == "PushEvent":
                commits_list += event["payload"]["commits"]
    pr_comments_df = pd.DataFrame(comments_list, columns=COLUMNS)
    # BUGFIX: drop_duplicates(subset="sha") raises KeyError on an empty frame
    # (an hour with no PushEvents); build the empty frame explicitly instead.
    if commits_list:
        commit_message_df = pd.DataFrame.from_records(commits_list).drop_duplicates(
            subset="sha"
        )[["message", "url"]]
    else:
        commit_message_df = pd.DataFrame(columns=["message", "url"])
    return pr_comments_df, commit_message_df
|
||
|
||
def download_github_data(date: str):
    """Download and parse PR comments/commit messages for one YYYY-MM-DD-hh hour.

    Args:
        date: Hourly date string like ``2021-01-01-0``.

    Returns:
        Tuple ``(df_prc, df_cm)``. When the hour was already processed, the
        interim parquet files are read back so callers always get dataframes.
    """
    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
    download_gh_event(date)
    if should_skip(date, "interim"):
        # BUGFIX: previously returned None here (and below), which broke
        # download_pool_hours' indexing of result[0]/result[1].
        return pd.read_parquet(df_prc_path), pd.read_parquet(df_cm_path)
    ext_dl_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    df_prc, df_cm = get_github_data(ext_dl_path)
    df_prc.to_parquet(df_prc_path, index=0, compression="gzip")
    df_cm.to_parquet(df_cm_path, index=0, compression="gzip")
    return df_prc, df_cm
|
||
|
||
def delete_glob(globstr: str):
    """Remove every file matching the glob pattern *globstr*."""
    matches = glob(globstr)
    for match in matches:
        os.remove(match)
|
||
|
||
def download_github_day(date: tuple):
    """Download, parse and consolidate one full day of GH Archive data.

    Downloads all 24 hourly archives for ``(year, month, day)``, concatenates
    the per-hour interim parquet files into one processed file per day for PR
    comments and commit messages, then removes the hourly intermediates.

    Args:
        date (tuple): ``(year, month, day)`` integers.
    """
    dates = generate_date_strs(date[0], date[1], date[2])
    # Day stamp like "2021-01-01", used in all per-day file names.
    date3 = "{}-{:02d}-{:02d}".format(date[0], date[1], date[2])
    proc_prc_path = (
        sg.storage_processed_root() / "pr_comments" / "{}-prc.parquet".format(date3)
    )
    cm_prc_path = (
        sg.storage_processed_root() / "commit_messages" / "{}-cm.parquet".format(date3)
    )
    # Fast path: day already consolidated — just clear leftovers and exit.
    if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
        return "Already processed {}".format(date3)

    for d in dates:
        download_github_data(d)
        # After the final hour ("23"), try to merge the 24 hourly files.
        if d.split("-")[3] == "23":
            prc_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-prc*".format(date3))
            )
            cm_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-cm*".format(date3))
            )
            # Only consolidate when all 24 hours are present.
            if len(prc_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in prc_paths])
                df.to_parquet(proc_prc_path, index=0, compression="gzip")
            if len(cm_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in cm_paths])
                df.to_parquet(cm_prc_path, index=0, compression="gzip")
    # Clean up hourly files only once both daily outputs exist.
    if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
    print("Finished {}!".format(date))
    return
|
||
|
||
def generate_date_strs(year: int, month: int, day: int) -> list:
    """Generate the 24 hourly GH Archive date strings for one day.

    Args:
        year: Four-digit year.
        month: Month (1-12).
        day: Day of month.

    Returns:
        list: Strings like ``"2021-01-01-0"`` ... ``"2021-01-01-23"``; the
        hour is not zero-padded, matching gharchive.org file names.
    """
    # BUGFIX: the return annotation previously said `str` but a list of 24
    # strings is returned.
    return ["{}-{:02d}-{:02d}-{}".format(year, month, day, i) for i in range(24)]
|
||
|
||
def get_dates_for_year(year: int) -> list:
    """Return all (year, month, day) tuples for *year*, stopping before today.

    Args:
        year: The year to enumerate.

    Returns:
        list: ``(year, month, day)`` tuples in order; dates on or after today
        are excluded (their archives are not complete yet).
    """
    dates = []
    today = date.today()
    cutoff = (today.year, today.month, today.day)
    for m in range(1, 13):
        # itermonthdays3 pads with neighbouring-month days; keep month m only.
        month_dates = [d for d in Calendar().itermonthdays3(year, m) if d[1] == m]
        for d in month_dates:
            # BUGFIX: the original compared year, month and day element-wise
            # (d[0] >= y and d[1] >= m and d[2] >= day), which fails to stop
            # for a future date whose day-of-month is smaller than today's
            # (e.g. next month's 1st). Lexicographic tuple comparison is the
            # correct chronological ordering for (y, m, d) tuples.
            if d >= cutoff:
                return dates
            dates.append(d)
    return dates
|
||
|
||
def download_pool_hours(year: str, month: str, day: str) -> pd.DataFrame:
    """Download one day's 24 hourly archives in parallel and return dataframes.

    Args:
        year: Year component formatted into the hourly date strings.
        month: Month component.
        day: Day component.

    Returns:
        Tuple of ``(pr_comments_df, commit_messages_df)``, each the
        concatenation of the 24 per-hour frames.

    NOTE(review): this indexes ``result[0]``/``result[1]``, but
    ``download_github_data`` as written returns ``None`` on every path —
    confirm it is meant to return ``(df_prc, df_cm)``.
    """
    pool = Pool(4)  # 4 worker processes for the 24 hourly downloads
    dates = generate_date_strs(year, month, day)
    pr_comments_df = []
    commit_messages_df = []
    for result in pool.imap_unordered(download_github_data, dates):
        pr_comments_df.append(result[0])
        commit_messages_df.append(result[1])
    pool.close()
    pool.join()
    return pd.concat(pr_comments_df), pd.concat(commit_messages_df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[pytest] | ||
filterwarnings = | ||
ignore::DeprecationWarning:tensorflow.*: | ||
ignore::DeprecationWarning:tensorboard.*: |