diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a2d08a9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+*.img
+*.simg
+.pytest_cache
+__pycache__
+storage
+*.egg-info
+output
+.vscode/sftp.json
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..1b56022
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,12 @@
+{
+    "jupyter.jupyterServerType": "local",
+    "files.exclude": {
+        ".pytest_cache": true,
+        "**/*.egg-info": true,
+        "**/__pycache__": true,
+        "**/*.img": true,
+        "**/*.simg": true,
+        "**/*.sif": true
+    },
+    "python.pythonPath": "/home/david/Documents/miniconda/bin/python"
+}
diff --git a/Singularity b/Singularity
new file mode 100644
index 0000000..83a7ee1
--- /dev/null
+++ b/Singularity
@@ -0,0 +1,21 @@
+Bootstrap: docker
+From: python:3.8-slim
+
+%labels
+    MAINTAINER admin
+    WHATAMI admin
+
+%files
+    cli.sh /cli.sh
+    requirements.txt /requirements.txt
+
+%runscript
+    exec /bin/bash /cli.sh "$@"
+
+%post
+    chmod u+x /cli.sh
+
+    # Install build tools and Python dependencies
+    apt-get update
+    apt-get install -y build-essential
+    pip install -r /requirements.txt
diff --git a/cli.sh b/cli.sh
new file mode 100644
index 0000000..9ec6344
--- /dev/null
+++ b/cli.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+echo "Welcome to the CLI! \
+Please use '-p initialise' if it's your first time. \
+This CLI must be run from the root directory of the cloned repository."
+
+usage() {
+    echo "Usage: $0 \
+[-h help] \
+[-t run tests] \
+[-p run program] \
+[-a arguments]" 1>&2
+    exit 1
+}
+
+while getopts ":hp:a:t" opt; do
+    case ${opt} in
+        h)
+            usage
+            ;;
+        p)
+            p=${OPTARG}
+            ;;
+        a)
+            a+=("${OPTARG}")
+            ;;
+        t)
+            pytest tests/
+            exit $?
+            ;;
+        \?)
+            echo "Invalid option: -${OPTARG}"
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+# Install the package in editable mode on first use
+if [[ "initialise" == "${p}" ]]; then
+    pip install -e .
+    exit 0
+fi
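+
+# Hypothetical usage (script path and argument are examples, not fixed here):
+#   ./cli.sh -p singghtorrent/analysis/main.py -a 1    # run one job-array split
+#   ./cli.sh -t                                        # run the test suite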
+
+# Run a Python program with optional arguments
+if [[ -z "${p}" ]]; then
+    usage
+else
+    python3 -u "${p}" "${a[@]}"
+fi
diff --git a/hpc/logs/placeholder b/hpc/logs/placeholder
new file mode 100644
index 0000000..e69de29
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..69e512d
--- /dev/null
+++ b/readme.md
@@ -0,0 +1 @@
+# Download GHTorrent Data
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ee7d8ea
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+numpy==1.18.5
+pandas==1.1.5
+gdown==3.12.2
+tqdm==4.58.0
+pytest==6.2.2
+requests==2.25.1  # used by helpers/dl_ghtorrent.py; pin is an assumption
+fastparquet==0.5.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6a91a06
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,3 @@
+from setuptools import find_packages, setup
+
+setup(name="singghtorrent", version="1.0", packages=find_packages())
diff --git a/singghtorrent/__init__.py b/singghtorrent/__init__.py
new file mode 100644
index 0000000..5a42bff
--- /dev/null
+++ b/singghtorrent/__init__.py
@@ -0,0 +1,50 @@
+"""Set up project paths."""
+from pathlib import Path
+
+
+def project_root() -> Path:
+    """Get the project root path."""
+    return Path(__file__).parent.parent
+
+
+def storage_root() -> Path:
+    """Get the storage path."""
+    return Path(__file__).parent.parent / "storage"
+
+
+def storage_external_root() -> Path:
+    """Get the external storage path."""
+    path = storage_root() / "external"
+    Path(path).mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def storage_interim_root() -> Path:
+    """Get the interim storage path."""
+    path = storage_root() / "interim"
+    Path(path).mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def storage_processed_root() -> Path:
+    """Get the processed storage path."""
+    path = storage_root() / "processed"
+    Path(path).mkdir(exist_ok=True, parents=True)
+    return path
+
+
+def outputs_root() -> Path:
+    """Get the output path."""
+    path = Path(__file__).parent.parent / "output"
+    Path(path).mkdir(exist_ok=True, parents=True)
+    return path
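+
+# Usage sketch (illustrative only; the file name below is an assumption):
+#   import singghtorrent as sg
+#   raw = sg.storage_external_root() / "ghtorrent/2021-01-01-0.json.gz"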
+
+
+def get_path(path) -> Path:
+    """Get a path, creating it if it does not exist."""
+    path = Path(path)
+    path.mkdir(exist_ok=True, parents=True)
+    return path
+
+
+# https://stackoverflow.com/a/50194143/1889006
+# https://stackoverflow.com/a/53465812/1889006
diff --git a/singghtorrent/analysis/main.py b/singghtorrent/analysis/main.py
new file mode 100644
index 0000000..3482ae2
--- /dev/null
+++ b/singghtorrent/analysis/main.py
@@ -0,0 +1,34 @@
+import sys
+from pathlib import Path
+
+import numpy as np
+import singghtorrent as sg
+from singghtorrent.helpers import dl_ghtorrent as dg
+
+# Setup
+NUM_JOBS = 200
+JOB_ARRAY_NUMBER = int(sys.argv[1]) - 1  # job-array indices are 1-based
+START_YEAR = 2015
+END_YEAR = 2021
+
+# Create paths
+Path(sg.storage_external_root() / "ghtorrent/").mkdir(exist_ok=True)
+Path(sg.storage_interim_root() / "ghtorrent").mkdir(exist_ok=True)
+Path(sg.storage_processed_root() / "pr_comments/").mkdir(exist_ok=True)
+Path(sg.storage_processed_root() / "commit_messages/").mkdir(exist_ok=True)
+
+# Generate job array mapping
+Path(sg.storage_interim_root() / "hpc_mapping/").mkdir(exist_ok=True)
+
+# Get dates
+all_dates = []
+for year in range(START_YEAR, END_YEAR + 1):
+    all_dates += dg.get_dates_for_year(year)
+
+# Split the dates across NUM_JOBS job-array tasks
+splits = np.array_split(all_dates, NUM_JOBS)  # Approx 3 hours each
+split = splits[JOB_ARRAY_NUMBER]
+
+# Download
+for date in split:
+    dg.download_github_day(date)
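+
+# Assumed invocation (scheduler-specific, not defined in this diff): a job
+# array passes a 1-based task index, e.g. `python singghtorrent/analysis/main.py 7`
+# downloads the 7th of the NUM_JOBS date splits.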
diff --git a/singghtorrent/helpers/dl_ghtorrent.py b/singghtorrent/helpers/dl_ghtorrent.py
new file mode 100644
index 0000000..423503e
--- /dev/null
+++ b/singghtorrent/helpers/dl_ghtorrent.py
@@ -0,0 +1,170 @@
+import gzip
+import json
+import os
+from calendar import Calendar
+from datetime import date
+from glob import glob
+from multiprocessing.pool import Pool
+
+import pandas as pd
+import requests
+import singghtorrent as sg
+from tqdm import tqdm
+
+
+def should_skip(date: str, stage: str = "") -> bool:
+    """Check hierarchically whether the data for a date is already finished."""
+    ext_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
+    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
+    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
+    if os.path.exists(df_prc_path) and os.path.exists(df_cm_path):
+        print("Already interimmed.")
+        return True
+    elif stage == "interim":
+        return False
+    if os.path.exists(ext_path):
+        print("Already downloaded.")
+        return True
+    return False
+
+
+def download_gh_event(date: str):
+    """Download one hour of GitHub event data from GH Archive.
+
+    Adapted from: https://github.com/src-d/datasets/blob/master/ReviewComments/\
+    PR_review_comments_generation.ipynb
+
+    Args:
+        date (str): Date in YYYY-MM-DD-h format (hour 0-23, not zero-padded),
+            like 2021-01-01-0
+    """
+    url = "http://data.gharchive.org/{}.json.gz".format(date)
+    saveurl = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
+    if should_skip(date):
+        return
+    r = requests.get(url)
+    r.raise_for_status()
+    with open(saveurl, "wb") as f:
+        f.write(r.content)
+
+
+def get_github_data(path: str) -> tuple:
+    """Get PR comments and commit messages from an hourly event archive."""
+    COLUMNS = ["COMMENT_ID", "COMMIT_ID", "URL", "AUTHOR", "CREATED_AT", "BODY"]
+    comments_list = []
+    commits_list = []
+    for line in tqdm(gzip.open(path).readlines()):
+        event = json.loads(line)
+        if event["type"] == "PullRequestReviewCommentEvent":
+            comments_list.append(
+                [
+                    event["payload"]["comment"]["id"],
+                    event["payload"]["comment"]["commit_id"],
+                    event["payload"]["comment"]["html_url"],
+                    event["payload"]["comment"]["user"]["login"],
+                    event["payload"]["comment"]["created_at"],
+                    event["payload"]["comment"]["body"],
+                ]
+            )
+        if event["type"] == "PushEvent":
+            commits_list += event["payload"]["commits"]
+    pr_comments_df = pd.DataFrame(comments_list, columns=COLUMNS)
+    # Explicit columns keep this from raising on hours with no push events
+    commit_message_df = pd.DataFrame.from_records(
+        commits_list, columns=["sha", "message", "url"]
+    ).drop_duplicates(subset="sha")[["message", "url"]]
+    return pr_comments_df, commit_message_df
+
+
+def download_github_data(date: str) -> tuple:
+    """Download and parse PR comments and commit messages given YYYY-MM-DD-h."""
+    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
+    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
+    download_gh_event(date)
+    if should_skip(date, "interim"):
+        return pd.read_parquet(df_prc_path), pd.read_parquet(df_cm_path)
+    ext_dl_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
+    df_prc, df_cm = get_github_data(ext_dl_path)
+    df_prc.to_parquet(df_prc_path, index=0, compression="gzip")
+    df_cm.to_parquet(df_cm_path, index=0, compression="gzip")
+    return df_prc, df_cm
+
+
+def delete_glob(globstr: str):
+    """Delete files matching a glob pattern."""
+    for f in glob(globstr):
+        os.remove(f)
+
+
+def download_github_day(date: tuple):
+    """Download a full (year, month, day) date and merge its hourly files."""
+    dates = generate_date_strs(date[0], date[1], date[2])
+    date3 = "{}-{:02d}-{:02d}".format(date[0], date[1], date[2])
+    proc_prc_path = (
+        sg.storage_processed_root() / "pr_comments" / "{}-prc.parquet".format(date3)
+    )
+    proc_cm_path = (
+        sg.storage_processed_root() / "commit_messages" / "{}-cm.parquet".format(date3)
+    )
+    if os.path.exists(proc_prc_path) and os.path.exists(proc_cm_path):
+        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
+        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
+        return "Already processed {}".format(date3)
+
+    for d in dates:
+        download_github_data(d)
+        if d.split("-")[3] == "23":
+            prc_paths = glob(
+                str(sg.storage_interim_root() / "ghtorrent/{}-*-prc*".format(date3))
+            )
+            cm_paths = glob(
+                str(sg.storage_interim_root() / "ghtorrent/{}-*-cm*".format(date3))
+            )
+            if len(prc_paths) == 24:
+                df = pd.concat([pd.read_parquet(i) for i in prc_paths])
+                df.to_parquet(proc_prc_path, index=0, compression="gzip")
+            if len(cm_paths) == 24:
+                df = pd.concat([pd.read_parquet(i) for i in cm_paths])
+                df.to_parquet(proc_cm_path, index=0, compression="gzip")
+    if os.path.exists(proc_prc_path) and os.path.exists(proc_cm_path):
+        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
+        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
+    print("Finished {}!".format(date3))
+    return
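+
+# Flow sketch (the date below is illustrative): download_github_day((2021, 1, 1))
+# fetches hours 0-23, merges the 24 interim parquet files into one per-day file
+# for PR comments and one for commit messages, then deletes the intermediates.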
"ghtorrent/{}-*".format(date3))) + print("Finished {}!".format(date)) + return + + +def generate_date_strs(year: int, month: int, day: int) -> str: + """Automatically generate date strings.""" + return ["{}-{:02d}-{:02d}-{}".format(year, month, day, i) for i in range(24)] + + +def get_dates_for_year(year: int) -> list: + """Return list of dates for given year.""" + early_stop = False + dates = [] + today = date.today() + now_year, now_month, now_day = today.year, today.month, today.day + for m in range(1, 13): + interim_dates = list(Calendar().itermonthdays3(year, m)) + interim_dates = [i for i in interim_dates if i[1] == m] + processed_dates = [] + for d in interim_dates: + if d[0] >= now_year and d[1] >= now_month and d[2] >= now_day: + early_stop = True + break + processed_dates.append(d) + dates += processed_dates + if early_stop: + return dates + return dates + + +def download_pool_hours(year: str, month: str, day: str) -> pd.DataFrame: + """Download data in parallel and return dataframe.""" + pool = Pool(4) + dates = generate_date_strs(year, month, day) + pr_comments_df = [] + commit_messages_df = [] + for result in pool.imap_unordered(download_github_data, dates): + pr_comments_df.append(result[0]) + commit_messages_df.append(result[1]) + pool.close() + pool.join() + return pd.concat(pr_comments_df), pd.concat(commit_messages_df) diff --git a/tests/pytest.ini b/tests/pytest.ini new file mode 100644 index 0000000..a935078 --- /dev/null +++ b/tests/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning:tensorflow.*: + ignore::DeprecationWarning:tensorboard.*: