Fix figure #39

Merged
5 commits merged on May 14, 2024
2 changes: 2 additions & 0 deletions .github/workflows/track-clones.yml
@@ -27,6 +27,8 @@ jobs:
env:
SECRET_TOKEN: ${{ secrets.SECRET_TOKEN }}
run: python scripts/clone-tracking/fetch.py
- name: Run cumulative.py
run: python scripts/clone-tracking/cumulative.py
- name: Commit download statistics
run: |
git config --global user.name 'tsalo'
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
9 changes: 5 additions & 4 deletions scripts/clone-tracking/create_graph.py
@@ -7,6 +7,7 @@
from bokeh.models import ColumnDataSource, CustomJS, DatetimeTickFormatter, Select
from bokeh.plotting import curdoc, figure, output_file, show


if __name__ == "__main__":
script_dir = os.path.dirname(__file__)
stats_dir = os.path.join(script_dir, "../../_data/clone-tracking")
@@ -19,14 +20,14 @@
os.path.join(
script_dir,
"../../_data/clone-tracking/cumulative/all_repos_cumulative.csv",
)
)
),
),
)
df["date"] = pd.to_datetime(df["date"])
REPOS = list(df)[1:]
repos = [col for col in df.columns if col != "date"]

# dropdown menu
select = Select(title="Repository", value="Overall", options=REPOS)
select = Select(title="Repository", value="Overall", options=repos)

def get_data(source_data, repo):
df = source_data[["date", repo]]
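
The REPOS -> repos change replaces a positional slice with an explicit column filter, so the repository list no longer depends on "date" being the first column of the cumulative CSV. A minimal sketch of the difference, using a hypothetical DataFrame whose column names and values are illustrative only:

import pandas as pd

# Hypothetical cumulative table; the real CSV has one column per tracked repository.
df = pd.DataFrame(
    {
        "ReproBrainChart/repo_a": [1, 2],
        "date": ["2024-05-01", "2024-05-02"],  # "date" need not come first
        "Overall": [1, 2],
    }
)

old_repos = list(df)[1:]  # drops whichever column happens to come first
new_repos = [col for col in df.columns if col != "date"]  # drops exactly the "date" column

print(old_repos)  # ['date', 'Overall']
print(new_repos)  # ['ReproBrainChart/repo_a', 'Overall']
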
38 changes: 38 additions & 0 deletions scripts/clone-tracking/cumulative.py
@@ -0,0 +1,38 @@
"""Fetch download statistics for a GitHub repository and output to a directory."""

import os
from glob import glob

import pandas as pd


def main(folder):
"""Combine cumulative statistics for all repositories into a single file."""
repo_wise_cumulative_files = sorted(
glob(os.path.join(folder, "ReproBrainChart_*_cumulative.csv"))
)
dfs = []
for file_ in repo_wise_cumulative_files:
repo_name = os.path.basename(file_).split("_cumulative")[0]
repo_name = repo_name.replace("_", "/", 1)
df = pd.read_csv(file_, index_col="date")
df = df.rename({"clone_count": repo_name}, axis=1)
dfs.append(df)

# Concatenate all repo-wise cumulative stats into a single DataFrame
df_cum = pd.concat(dfs, axis=1)
df_cum.fillna(0, inplace=True)
df_cum = df_cum.sort_index()

# Calculate the overall cumulative clone count
df_cum["Overall"] = df_cum.sum(axis=1)

# update overall cumulative stats across all repos
overall_cumulative_clones_file = os.path.join(folder, "all_repos_cumulative.csv")
df_cum.to_csv(overall_cumulative_clones_file, index_label="date")


if __name__ == "__main__":
script_dir = os.path.dirname(__file__)
stats_dir = os.path.abspath(os.path.join(script_dir, "../../_data/clone-tracking/cumulative"))
main(stats_dir)
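
The new cumulative.py reduces to a column-wise concat of the per-repo cumulative tables followed by a row-wise sum. A minimal sketch of that logic on two hypothetical per-repo tables (repository names and counts are illustrative only):

import pandas as pd

# Two hypothetical per-repo cumulative tables, indexed by date
repo_a = pd.DataFrame({"ReproBrainChart/repo_a": [1, 3]}, index=["2024-05-01", "2024-05-02"])
repo_b = pd.DataFrame({"ReproBrainChart/repo_b": [2]}, index=["2024-05-02"])

df_cum = pd.concat([repo_a, repo_b], axis=1)  # one column per repo, aligned on date
df_cum.fillna(0, inplace=True)                # a date missing for a repo counts as 0
df_cum = df_cum.sort_index()
df_cum["Overall"] = df_cum.sum(axis=1)        # overall cumulative clone count

print(df_cum)
# 2024-05-01: repo_a=1.0, repo_b=0.0, Overall=1.0
# 2024-05-02: repo_a=3.0, repo_b=2.0, Overall=5.0
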
153 changes: 73 additions & 80 deletions scripts/clone-tracking/fetch.py
@@ -1,6 +1,5 @@
"""Fetch download statistics for a GitHub repository and output to a directory."""

import argparse
import os
from datetime import date
from datetime import datetime as dt
@@ -11,12 +10,19 @@


def main(repo):
"""Fetch download statistics for a GitHub repository and output to a directory."""
"""Fetch download statistics for a GitHub repository and output to a directory.

Parameters
----------
repo : str
The name of the repository in the format "owner/repo".
"""
print(f"Fetching clone statistics for {repo}...")
token = os.environ.get("SECRET_TOKEN")
g = Github(token)
repoobj = g.get_repo(repo)

# List clones for the repository
df_clones = clones_to_df(fetch_clones(repoobj))
owner_name, repo_name = repo.split("/")

@@ -28,96 +34,60 @@ def main(repo):
os.makedirs(os.path.join(stats_dir, daily_dir), exist_ok=True)
os.makedirs(os.path.join(stats_dir, cum_dir), exist_ok=True)

daily_path = os.path.join(
daily_clones_file = os.path.join(
stats_dir,
daily_dir,
f"{owner_name}_{repo_name}_daily_clones.csv",
)
if os.path.isfile(daily_clones_file):
df_clones_historical = pd.read_csv(daily_clones_file, index_col="date")
else:
df_clones_historical = pd.DataFrame(columns=["clone_count"])

if len(df_clones):
df_latest_clones = df_clones.tail(1)
last_download_date = df_clones.tail(1).index.date[0]

if not os.path.isfile(daily_path):
patch_df(df_clones).to_csv(daily_path)

# if latest clone timestamp is not today's date, that means there were
# no clones today and we should just put 0 for "number of clones"
elif last_download_date != date.today():
df_todays_clones = pd.DataFrame(
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.to_csv(daily_path, mode="a", header=False)

else:
df_latest_clones.to_csv(daily_path, mode="a", header=False)

elif os.path.isfile(daily_path):
df_todays_clones = pd.DataFrame(
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.to_csv(daily_path, mode="a", header=False)

# if this script is run for the first time and no clones were made
# in the past 2 weeks, create a csv storing today's clone count (i.e. 0)
# Merge df_clones and df_clones_historical.
df_clones = pd.concat([df_clones_historical, df_clones], axis=0)
# Sort by clone count so the rows with the highest counts are first
df_clones = df_clones.sort_values("clone_count", ascending=False)
# Drop duplicate index rows, retaining the highest clone count,
# which should be the most accurate
df_clones = df_clones.loc[~df_clones.index.duplicated(keep="first")]
# Sort by date
df_clones = df_clones.sort_index()
else:
df_todays_clones = pd.DataFrame(
data={"clone_count": [0]}, index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.index.name = "date"
df_todays_clones.to_csv(daily_path)
df_clones = df_clones_historical.copy()

# Fill in missing dates with 0s
df_clones = patch_df(df_clones)
# Sort by date again (just to be safe)
df_clones = df_clones.sort_index()

df_clones.to_csv(daily_clones_file, index_label="date")

# generate cumulative downloads for this repo + output to directory
cum_path = os.path.join(
stats_dir, cum_dir, f"{owner_name}_{repo_name}_cum_clones.csv"
cumulative_clones_file = os.path.join(
stats_dir,
cum_dir,
f"{owner_name}_{repo_name}_cum_clones.csv",
)
df_cum = pd.read_csv(daily_path)
df_cum = df_clones.copy()
df_cum["clone_count"] = df_cum["clone_count"].cumsum()
df_cum.to_csv(cum_path, mode="w+", index=False)

# update overall cumulative stats across all repos
overall_cum_path = os.path.join(stats_dir, cum_dir, "all_repos_cumulative.csv")
update_overall_cumulative(df_cum, overall_cum_path, repo)


def update_overall_cumulative(df_add, path, repo_name):
"""Update cumulative statistics for all repositories."""
df_latest_clones = df_add.tail(1)
todays_date = df_latest_clones.iat[0, 0]
todays_clone_count = df_latest_clones.iloc[0, 1]
df_add = df_add.rename({"clone_count": repo_name}, axis=1)

if not os.path.exists(path):
df_add.insert(loc=1, column="Overall", value=df_add[repo_name])
df_add.to_csv(path, index=False)

# if column for this repo already exists in csv
elif repo_name in pd.read_csv(path):
df_overall = pd.read_csv(path)
# if csv already contains row for today
if todays_date in df_overall["date"].values:
df_overall.at[len(df_overall.index) - 1, repo_name] = todays_clone_count
df_overall.at[len(df_overall.index) - 1, "Overall"] += todays_clone_count
df_overall.to_csv(path, index=False)
else:
df_new_row = pd.DataFrame(columns=list(df_overall))
df_new_row.at[0, "date"] = todays_date
df_new_row.at[0, repo_name] = todays_clone_count
df_new_row.at[0, "Overall"] = todays_clone_count
df_new_row.to_csv(path, mode="a", header=False, index=False)
else:
df_overall = pd.read_csv(path)
df_add = df_add.set_index("date")
df_overall = df_overall.set_index("date")

df_overall = pd.concat([df_overall, df_add], axis=1).sort_index()
df_overall.fillna(0, inplace=True)
df_overall["Overall"] += df_overall[repo_name]
df_overall.to_csv(path)
df_cum.to_csv(cumulative_clones_file, index_label="date")


def patch_df(df):
"""Fill in dates where no clones were made with 0's."""
"""Fill in dates where no clones were made with 0's.

Parameters
----------
df : pd.DataFrame
The DataFrame containing clone statistics.

Returns
-------
pd.DataFrame
The DataFrame with missing dates filled in.
"""
cur_date = df.index[0].date()
todays_date = dt.now().date()
row = 0
@@ -140,7 +110,19 @@ def patch_df(df):


def clones_to_df(clones):
"""Convert clone statistics to a DataFrame."""
"""Convert clone statistics to a DataFrame.

Parameters
----------
clones : list of github.RepositoryClone
The clone statistics.

Returns
-------
pd.DataFrame
The clone statistics as a DataFrame.
The index is the date of the clone and the column is clone_count.
"""
timestamps = []
total_clone_counts = []

@@ -157,7 +139,18 @@


def fetch_clones(repo):
"""Fetch clone statistics for a repository."""
"""Fetch clone statistics for a repository.

Parameters
----------
repo : github.Repository.Repository
The repository object.

Returns
-------
list of github.RepositoryClone
The clone statistics.
"""
clones = repo.get_clones_traffic()
return clones["clones"]

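The core of the rewritten main() is how freshly fetched counts (the traffic endpoint only covers roughly the last two weeks) are merged into the historical daily CSV: the two tables are concatenated, sorted by clone_count in descending order, and duplicate dates are dropped so the higher count survives. A minimal sketch of that idiom on toy data (dates and counts are illustrative only):

import pandas as pd

historical = pd.DataFrame(
    {"clone_count": [5, 2]}, index=pd.to_datetime(["2024-05-01", "2024-05-02"])
)
fresh = pd.DataFrame(
    {"clone_count": [4, 7]}, index=pd.to_datetime(["2024-05-02", "2024-05-03"])
)

merged = pd.concat([historical, fresh], axis=0)
merged = merged.sort_values("clone_count", ascending=False)  # highest counts first
merged = merged.loc[~merged.index.duplicated(keep="first")]  # keep the highest count per date
merged = merged.sort_index()

print(merged["clone_count"].tolist())  # [5, 4, 7] -- 2024-05-02 keeps 4, not 2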