Commit: init

davidhin committed Mar 11, 2021
0 parents commit 9797ab0
Showing 12 changed files with 357 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
*.img
*.simg
.pytest_cache
__pycache__
storage
*.egg-info
output
.vscode/sftp.json
12 changes: 12 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,12 @@
{
    "jupyter.jupyterServerType": "local",
    "files.exclude": {
        ".pytest_cache": true,
        "**/*.egg-info": true,
        "**/__pycache__": true,
        "**/*.img": true,
        "**/*.simg": true,
        "**/*.sif": true
    },
    "python.pythonPath": "/home/david/Documents/miniconda/bin/python"
}
21 changes: 21 additions & 0 deletions Singularity
@@ -0,0 +1,21 @@
Bootstrap: docker
From: python:3.8-slim

%labels
    MAINTAINER admin
    WHATAMI admin

%files
    cli.sh /cli.sh
    requirements.txt /requirements.txt

%runscript
    exec /bin/bash /cli.sh "$@"

%post
    chmod u+x /cli.sh

    # Install build dependencies, then Python requirements
    apt-get update
    apt-get install -y build-essential
    pip install -r /requirements.txt
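
With Singularity installed, the image can be built and run roughly as follows (a sketch; the image name container.sif is an arbitrary choice, and arguments after the image name are forwarded to cli.sh by the %runscript):

sudo singularity build container.sif Singularity
singularity run container.sif -p initialise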
48 changes: 48 additions & 0 deletions cli.sh
@@ -0,0 +1,48 @@
#!/bin/bash

echo "Welcome to the CLI! \
Please use '-p initialise' if it's your first time. \
This CLI must be run from the root directory of the cloned repository."

usage() {
    echo "Usage: $0 \
[-h help] \
[-t run tests] \
[-p run program <initialise|path_to_file>] \
[-a arguments]" 1>&2
    exit 1
}

while getopts ":hp:a:t" opt; do
    case ${opt} in
        h)
            usage
            ;;
        p)
            p=${OPTARG}
            ;;
        a)
            a+=("${OPTARG}")
            ;;
        t)
            pytest tests/
            ;;
        \?)
            echo "Invalid option: -${OPTARG}"
            usage
            ;;
    esac
done
shift $((OPTIND - 1))

# Download data and install main code
if [[ "initialise" == "${p}" ]]; then
    pip install -e .
    exit 0
fi

# Run Python program
if [[ -z "${p}" ]]; then
    usage
else
    python3 -u "${p}" "${a[@]}"
fi
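
Typical invocations (illustrative examples, not part of this commit; the script path passed to -p assumes the files added here):

bash cli.sh -p initialise
bash cli.sh -t
bash cli.sh -p singghtorrent/analysis/main.py -a 1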
Empty file added hpc/logs/placeholder
Empty file.
1 change: 1 addition & 0 deletions readme.md
@@ -0,0 +1 @@
# Download GHTorrent Data
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
numpy==1.18.5
pandas==1.1.5
gdown==3.12.2
tqdm==4.58.0
pytest==6.2.2
fastparquet==0.5.0
3 changes: 3 additions & 0 deletions setup.py
@@ -0,0 +1,3 @@
from setuptools import find_packages, setup

setup(name="singghtorrent", version="1.0", packages=find_packages())
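
This three-line setup script is what the CLI's initialise step installs via pip install -e ., making the singghtorrent package (and its path helpers below) importable from anywhere.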
50 changes: 50 additions & 0 deletions singghtorrent/__init__.py
@@ -0,0 +1,50 @@
"""Set up project paths."""
from pathlib import Path


def project_root() -> Path:
"""Get project path."""
return Path(__file__).parent.parent


def storage_root() -> Path:
"""Get storage path."""
return Path(__file__).parent.parent / "storage"


def storage_external_root() -> Path:
"""Get storage external path."""
path = storage_root() / "external"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def storage_interim_root() -> Path:
"""Get storage interim path."""
path = storage_root() / "interim"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def storage_processed_root() -> Path:
"""Get storage procesTsed path."""
path = storage_root() / "processed"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def outputs_root() -> Path:
"""Get output path."""
path = Path(__file__).parent.parent / "output"
Path(path).mkdir(exist_ok=True, parents=True)
return path


def get_path(path) -> Path:
"""Get path, if exists. If not, create it."""
Path(path).mkdir(exist_ok=True, parents=True)
return path


# https://stackoverflow.com/a/50194143/1889006
# https://stackoverflow.com/a/53465812/1889006
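
A minimal illustration of these helpers (assuming the package has been installed with pip install -e .):

import singghtorrent as sg

print(sg.project_root())           # repository root
print(sg.storage_external_root())  # storage/external, created on first call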
34 changes: 34 additions & 0 deletions singghtorrent/analysis/main.py
@@ -0,0 +1,34 @@
import sys
from pathlib import Path

import numpy as np
import singghtorrent as sg
from singghtorrent.helpers import dl_ghtorrent as dg

# Setup
NUM_JOBS = 200
JOB_ARRAY_NUMBER = int(sys.argv[1]) - 1  # job array indices start at 1
START_YEAR = 2015
END_YEAR = 2021

# Create paths
Path(sg.storage_external_root() / "ghtorrent/").mkdir(exist_ok=True)
Path(sg.storage_interim_root() / "ghtorrent").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "pr_comments/").mkdir(exist_ok=True)
Path(sg.storage_processed_root() / "commit_messages/").mkdir(exist_ok=True)

# Generate job array mapping
Path(sg.storage_interim_root() / "hpc_mapping/").mkdir(exist_ok=True)

# Get dates
all_dates = []
for year in range(START_YEAR, END_YEAR + 1):
    all_dates += dg.get_dates_for_year(year)

# Split the dates across NUM_JOBS jobs and take this job's slice
splits = np.array_split(all_dates, NUM_JOBS)  # Approx 3 hours each
split = splits[JOB_ARRAY_NUMBER]

# Download
for date in split:
    dg.download_github_day(date)
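
The script expects a 1-based job array index as its only argument, so a single slice can be run locally as (illustrative):

python -u singghtorrent/analysis/main.py 1

On an HPC cluster it would typically be launched as a job array (e.g. Slurm's --array=1-200 with $SLURM_ARRAY_TASK_ID as the argument); the exact scheduler is an assumption, though the hpc/logs directory suggests this usage.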
170 changes: 170 additions & 0 deletions singghtorrent/helpers/dl_ghtorrent.py
@@ -0,0 +1,170 @@
import gzip
import json
import os
from calendar import Calendar
from datetime import date
from glob import glob
from multiprocessing.pool import Pool

import pandas as pd
import requests
import singghtorrent as sg
from tqdm import tqdm


def should_skip(date: str, stage: str = ""):
    """Check whether work for this date can be skipped.

    By default, skip if either the interim parquet files or the raw
    download exist; with stage="interim", skip only if the parquet
    files exist.
    """
    ext_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
    if os.path.exists(df_prc_path) and os.path.exists(df_cm_path):
        print("Already interimmed.")
        return True
    elif stage == "interim":
        return False
    if os.path.exists(ext_path):
        print("Already downloaded.")
        return True
    return False


def download_gh_event(date: str):
    """Download one hourly event archive from GH Archive (data.gharchive.org).

    From: https://github.com/src-d/datasets/blob/master/ReviewComments/PR_review_comments_generation.ipynb

    Args:
        date (str): Date in YYYY-MM-DD-h format, e.g. 2021-01-01-0
    """
    url = "http://data.gharchive.org/{}.json.gz".format(date)
    saveurl = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    if should_skip(date):
        return
    r = requests.get(url)
    with open(saveurl, "wb") as f:
        f.write(r.content)


def get_github_data(path: str) -> tuple:
    """Get PR comments and commit messages from events."""
    COLUMNS = ["COMMENT_ID", "COMMIT_ID", "URL", "AUTHOR", "CREATED_AT", "BODY"]
    comments_list = []
    commits_list = []
    for line in tqdm(gzip.open(path).readlines()):
        event = json.loads(line)
        if event["type"] == "PullRequestReviewCommentEvent":
            comments_list.append(
                [
                    event["payload"]["comment"]["id"],
                    event["payload"]["comment"]["commit_id"],
                    event["payload"]["comment"]["html_url"],
                    event["payload"]["comment"]["user"]["login"],
                    event["payload"]["comment"]["created_at"],
                    event["payload"]["comment"]["body"],
                ]
            )
        if event["type"] == "PushEvent":
            commits_list += event["payload"]["commits"]
    pr_comments_df = pd.DataFrame(comments_list, columns=COLUMNS)
    commit_message_df = pd.DataFrame.from_records(commits_list).drop_duplicates(
        subset="sha"
    )[["message", "url"]]
    return pr_comments_df, commit_message_df


def download_github_data(date: str):
    """Download and parse PR comments and commits given YYYY-MM-DD-h."""
    download_gh_event(date)
    df_prc_path = sg.storage_interim_root() / "ghtorrent/{}-prc.parquet".format(date)
    df_cm_path = sg.storage_interim_root() / "ghtorrent/{}-cm.parquet".format(date)
    if should_skip(date, "interim"):
        # Already parsed: read back so callers always receive the dataframes
        return pd.read_parquet(df_prc_path), pd.read_parquet(df_cm_path)
    ext_dl_path = sg.storage_external_root() / "ghtorrent/{}.json.gz".format(date)
    df_prc, df_cm = get_github_data(ext_dl_path)
    df_prc.to_parquet(df_prc_path, index=False, compression="gzip")
    df_cm.to_parquet(df_cm_path, index=False, compression="gzip")
    return df_prc, df_cm


def delete_glob(globstr: str):
    """Delete files using glob."""
    for f in glob(globstr):
        os.remove(f)


def download_github_day(date: tuple):
    """Download a full day, given as a (year, month, day) tuple."""
    dates = generate_date_strs(date[0], date[1], date[2])
    date3 = "{}-{:02d}-{:02d}".format(date[0], date[1], date[2])
    proc_prc_path = (
        sg.storage_processed_root() / "pr_comments" / "{}-prc.parquet".format(date3)
    )
    cm_prc_path = (
        sg.storage_processed_root() / "commit_messages" / "{}-cm.parquet".format(date3)
    )
    if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
        delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
        delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
        return "Already processed {}".format(date3)

    for d in dates:
        download_github_data(d)
        if d.split("-")[3] == "23":  # after the last hour, merge the day
            prc_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-prc*".format(date3))
            )
            cm_paths = glob(
                str(sg.storage_interim_root() / "ghtorrent/{}-*-cm*".format(date3))
            )
            if len(prc_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in prc_paths])
                df.to_parquet(proc_prc_path, index=False, compression="gzip")
            if len(cm_paths) == 24:
                df = pd.concat([pd.read_parquet(i) for i in cm_paths])
                df.to_parquet(cm_prc_path, index=False, compression="gzip")
            if os.path.exists(proc_prc_path) and os.path.exists(cm_prc_path):
                delete_glob(str(sg.storage_interim_root() / "ghtorrent/{}-*".format(date3)))
                delete_glob(str(sg.storage_external_root() / "ghtorrent/{}-*".format(date3)))
    print("Finished {}!".format(date))
    return


def generate_date_strs(year: int, month: int, day: int) -> list:
    """Generate the 24 hourly date strings for a given day."""
    return ["{}-{:02d}-{:02d}-{}".format(year, month, day, i) for i in range(24)]


def get_dates_for_year(year: int) -> list:
    """Return list of (year, month, day) tuples for given year, up to today."""
    dates = []
    today = date.today()
    for m in range(1, 13):
        month_dates = [i for i in Calendar().itermonthdays3(year, m) if i[1] == m]
        for d in month_dates:
            if date(*d) >= today:  # archives from today onwards are incomplete
                return dates
            dates.append(d)
    return dates


def download_pool_hours(year: int, month: int, day: int) -> tuple:
    """Download one day's data in parallel and return the dataframes."""
    pool = Pool(4)
    dates = generate_date_strs(year, month, day)
    pr_comments_df = []
    commit_messages_df = []
    for result in pool.imap_unordered(download_github_data, dates):
        pr_comments_df.append(result[0])
        commit_messages_df.append(result[1])
    pool.close()
    pool.join()
    return pd.concat(pr_comments_df), pd.concat(commit_messages_df)
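
A usage sketch for these helpers (the date is arbitrary):

from singghtorrent.helpers import dl_ghtorrent as dg

# Download, parse, and merge all 24 hourly archives for 1 Jan 2021
dg.download_github_day((2021, 1, 1))

# Or fetch one day with 4 worker processes and keep the frames in memory
pr_comments, commit_messages = dg.download_pool_hours(2021, 1, 1)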
4 changes: 4 additions & 0 deletions tests/pytest.ini
@@ -0,0 +1,4 @@
[pytest]
filterwarnings =
    ignore::DeprecationWarning:tensorflow.*:
    ignore::DeprecationWarning:tensorboard.*:
