Fix figure #39

Merged
5 commits merged on May 14, 2024
2 changes: 2 additions & 0 deletions .github/workflows/track-clones.yml
@@ -27,6 +27,8 @@ jobs:
env:
SECRET_TOKEN: ${{ secrets.SECRET_TOKEN }}
run: python scripts/clone-tracking/fetch.py
- name: Run cumulative.py
run: python scripts/clone-tracking/cumulative.py
- name: Commit download statistics
run: |
git config --global user.name 'tsalo'
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
@@ -1,6 +1,5 @@
date,clone_count
2024-05-01,0
2024-05-01,0
2024-05-02,0
2024-05-03,0
2024-05-04,0
9 changes: 5 additions & 4 deletions scripts/clone-tracking/create_graph.py
@@ -7,6 +7,7 @@
from bokeh.models import ColumnDataSource, CustomJS, DatetimeTickFormatter, Select
from bokeh.plotting import curdoc, figure, output_file, show


if __name__ == "__main__":
script_dir = os.path.dirname(__file__)
stats_dir = os.path.join(script_dir, "../../_data/clone-tracking")
@@ -19,14 +20,14 @@
os.path.join(
script_dir,
"../../_data/clone-tracking/cumulative/all_repos_cumulative.csv",
)
)
),
),
)
df["date"] = pd.to_datetime(df["date"])
REPOS = list(df)[1:]
repos = [col for col in df.columns if col != "date"]

# dropdown menu
select = Select(title="Repository", value="Overall", options=REPOS)
select = Select(title="Repository", value="Overall", options=repos)

def get_data(source_data, repo):
df = source_data[["date", repo]]
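
The REPOS -> repos change replaces a positional slice with an explicit column filter, so the repository list no longer depends on "date" being the first column of the cumulative CSV. A minimal sketch of the difference, using a hypothetical DataFrame whose column names and values are illustrative only:

import pandas as pd

# Hypothetical cumulative table; the real CSV has one column per tracked repository.
df = pd.DataFrame(
    {
        "ReproBrainChart/repo_a": [1, 2],
        "date": ["2024-05-01", "2024-05-02"],  # "date" need not come first
        "Overall": [1, 2],
    }
)

old_repos = list(df)[1:]  # drops whichever column happens to come first
new_repos = [col for col in df.columns if col != "date"]  # drops exactly the "date" column

print(old_repos)  # ['date', 'Overall']
print(new_repos)  # ['ReproBrainChart/repo_a', 'Overall']
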
38 changes: 38 additions & 0 deletions scripts/clone-tracking/cumulative.py
@@ -0,0 +1,38 @@
"""Fetch download statistics for a GitHub repository and output to a directory."""

import os
from glob import glob

import pandas as pd


def main(folder):
"""Combine cumulative statistics for all repositories into a single file."""
repo_wise_cumulative_files = sorted(
glob(os.path.join(folder, "ReproBrainChart_*_cumulative.csv"))
)
dfs = []
for file_ in repo_wise_cumulative_files:
repo_name = os.path.basename(file_).split("_cumulative")[0]
repo_name = repo_name.replace("_", "/", 1)
df = pd.read_csv(file_, index_col="date")
df = df.rename({"clone_count": repo_name}, axis=1)
dfs.append(df)

# Concatenate all repo-wise cumulative stats into a single DataFrame
df_cum = pd.concat(dfs, axis=1)
df_cum.fillna(0, inplace=True)
df_cum = df_cum.sort_index()

# Calculate the overall cumulative clone count
df_cum["Overall"] = df_cum.sum(axis=1)

# update overall cumulative stats across all repos
overall_cumulative_clones_file = os.path.join(folder, "all_repos_cumulative.csv")
df_cum.to_csv(overall_cumulative_clones_file, index_label="date")


if __name__ == "__main__":
script_dir = os.path.dirname(__file__)
stats_dir = os.path.abspath(os.path.join(script_dir, "../../_data/clone-tracking/cumulative"))
main(stats_dir)
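
The new cumulative.py reduces to a column-wise concat of the per-repo cumulative tables followed by a row-wise sum. A minimal sketch of that logic on two hypothetical per-repo tables (repository names and counts are illustrative only):

import pandas as pd

# Two hypothetical per-repo cumulative tables, indexed by date
repo_a = pd.DataFrame({"ReproBrainChart/repo_a": [1, 3]}, index=["2024-05-01", "2024-05-02"])
repo_b = pd.DataFrame({"ReproBrainChart/repo_b": [2]}, index=["2024-05-02"])

df_cum = pd.concat([repo_a, repo_b], axis=1)  # one column per repo, aligned on date
df_cum.fillna(0, inplace=True)                # a date missing for a repo counts as 0
df_cum = df_cum.sort_index()
df_cum["Overall"] = df_cum.sum(axis=1)        # overall cumulative clone count

print(df_cum)
# 2024-05-01: repo_a=1.0, repo_b=0.0, Overall=1.0
# 2024-05-02: repo_a=3.0, repo_b=2.0, Overall=5.0
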
153 changes: 73 additions & 80 deletions scripts/clone-tracking/fetch.py
@@ -1,6 +1,5 @@
"""Fetch download statistics for a GitHub repository and output to a directory."""

import argparse
import os
from datetime import date
from datetime import datetime as dt
@@ -11,12 +10,19 @@


def main(repo):
"""Fetch download statistics for a GitHub repository and output to a directory."""
"""Fetch download statistics for a GitHub repository and output to a directory.

Parameters
----------
repo : str
The name of the repository in the format "owner/repo".
"""
print(f"Fetching clone statistics for {repo}...")
token = os.environ.get("SECRET_TOKEN")
g = Github(token)
repoobj = g.get_repo(repo)

# List clones for the repository
df_clones = clones_to_df(fetch_clones(repoobj))
owner_name, repo_name = repo.split("/")

@@ -28,96 +34,60 @@ def main(repo):
os.makedirs(os.path.join(stats_dir, daily_dir), exist_ok=True)
os.makedirs(os.path.join(stats_dir, cum_dir), exist_ok=True)

daily_path = os.path.join(
daily_clones_file = os.path.join(
stats_dir,
daily_dir,
f"{owner_name}_{repo_name}_daily_clones.csv",
)
if os.path.isfile(daily_clones_file):
df_clones_historical = pd.read_csv(daily_clones_file, index_col="date")
else:
df_clones_historical = pd.DataFrame(columns=["clone_count"])

if len(df_clones):
df_latest_clones = df_clones.tail(1)
last_download_date = df_clones.tail(1).index.date[0]

if not os.path.isfile(daily_path):
patch_df(df_clones).to_csv(daily_path)

# if latest clone timestamp is not today's date, that means there were
# no clones today and we should just put 0 for "number of clones"
elif last_download_date != date.today():
df_todays_clones = pd.DataFrame(
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.to_csv(daily_path, mode="a", header=False)

else:
df_latest_clones.to_csv(daily_path, mode="a", header=False)

elif os.path.isfile(daily_path):
df_todays_clones = pd.DataFrame(
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.to_csv(daily_path, mode="a", header=False)

# if this script is run for the first time and no clones were made
# in the past 2 weeks, create a csv storing today's clone count (i.e. 0)
# Merge df_clones and df_clones_historical.
df_clones = pd.concat([df_clones_historical, df_clones], axis=0)
# Sort by clone count so the rows with the highest counts are first
df_clones = df_clones.sort_values("clone_count", ascending=False)
# Drop duplicate index rows, retaining the highest clone count,
# which should be the most accurate
df_clones = df_clones.loc[~df_clones.index.duplicated(keep="first")]
# Sort by date
df_clones = df_clones.sort_index()
else:
df_todays_clones = pd.DataFrame(
data={"clone_count": [0]}, index=pd.DatetimeIndex(data=[dt.now().date()])
)
df_todays_clones.index.name = "date"
df_todays_clones.to_csv(daily_path)
df_clones = df_clones_historical.copy()

# Fill in missing dates with 0s
df_clones = patch_df(df_clones)
# Sort by date again (just to be safe)
df_clones = df_clones.sort_index()

df_clones.to_csv(daily_clones_file, index_label="date")

# generate cumulative downloads for this repo + output to directory
cum_path = os.path.join(
stats_dir, cum_dir, f"{owner_name}_{repo_name}_cum_clones.csv"
cumulative_clones_file = os.path.join(
stats_dir,
cum_dir,
f"{owner_name}_{repo_name}_cum_clones.csv",
)
df_cum = pd.read_csv(daily_path)
df_cum = df_clones.copy()
df_cum["clone_count"] = df_cum["clone_count"].cumsum()
df_cum.to_csv(cum_path, mode="w+", index=False)

# update overall cumulative stats across all repos
overall_cum_path = os.path.join(stats_dir, cum_dir, "all_repos_cumulative.csv")
update_overall_cumulative(df_cum, overall_cum_path, repo)


def update_overall_cumulative(df_add, path, repo_name):
"""Update cumulative statistics for all repositories."""
df_latest_clones = df_add.tail(1)
todays_date = df_latest_clones.iat[0, 0]
todays_clone_count = df_latest_clones.iloc[0, 1]
df_add = df_add.rename({"clone_count": repo_name}, axis=1)

if not os.path.exists(path):
df_add.insert(loc=1, column="Overall", value=df_add[repo_name])
df_add.to_csv(path, index=False)

# if column for this repo already exists in csv
elif repo_name in pd.read_csv(path):
df_overall = pd.read_csv(path)
# if csv already contains row for today
if todays_date in df_overall["date"].values:
df_overall.at[len(df_overall.index) - 1, repo_name] = todays_clone_count
df_overall.at[len(df_overall.index) - 1, "Overall"] += todays_clone_count
df_overall.to_csv(path, index=False)
else:
df_new_row = pd.DataFrame(columns=list(df_overall))
df_new_row.at[0, "date"] = todays_date
df_new_row.at[0, repo_name] = todays_clone_count
df_new_row.at[0, "Overall"] = todays_clone_count
df_new_row.to_csv(path, mode="a", header=False, index=False)
else:
df_overall = pd.read_csv(path)
df_add = df_add.set_index("date")
df_overall = df_overall.set_index("date")

df_overall = pd.concat([df_overall, df_add], axis=1).sort_index()
df_overall.fillna(0, inplace=True)
df_overall["Overall"] += df_overall[repo_name]
df_overall.to_csv(path)
df_cum.to_csv(cumulative_clones_file, index_label="date")


def patch_df(df):
"""Fill in dates where no clones were made with 0's."""
"""Fill in dates where no clones were made with 0's.

Parameters
----------
df : pd.DataFrame
The DataFrame containing clone statistics.

Returns
-------
pd.DataFrame
The DataFrame with missing dates filled in.
"""
cur_date = df.index[0].date()
todays_date = dt.now().date()
row = 0
@@ -140,7 +110,19 @@ def patch_df(df):


def clones_to_df(clones):
"""Convert clone statistics to a DataFrame."""
"""Convert clone statistics to a DataFrame.

Parameters
----------
clones : list of github.RepositoryClone
The clone statistics.

Returns
-------
pd.DataFrame
The clone statistics as a DataFrame.
The index is the date of the clone and the column is clone_count.
"""
timestamps = []
total_clone_counts = []

@@ -157,7 +139,18 @@


def fetch_clones(repo):
"""Fetch clone statistics for a repository."""
"""Fetch clone statistics for a repository.

Parameters
----------
repo : github.Repository.Repository
The repository object.

Returns
-------
list of github.RepositoryClone
The clone statistics.
"""
clones = repo.get_clones_traffic()
return clones["clones"]

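The core of the rewritten main() is how freshly fetched counts (the traffic endpoint only covers roughly the last two weeks) are merged into the historical daily CSV: the two tables are concatenated, sorted by clone_count in descending order, and duplicate dates are dropped so the higher count survives. A minimal sketch of that idiom on toy data (dates and counts are illustrative only):

import pandas as pd

historical = pd.DataFrame(
    {"clone_count": [5, 2]}, index=pd.to_datetime(["2024-05-01", "2024-05-02"])
)
fresh = pd.DataFrame(
    {"clone_count": [4, 7]}, index=pd.to_datetime(["2024-05-02", "2024-05-03"])
)

merged = pd.concat([historical, fresh], axis=0)
merged = merged.sort_values("clone_count", ascending=False)  # highest counts first
merged = merged.loc[~merged.index.duplicated(keep="first")]  # keep the highest count per date
merged = merged.sort_index()

print(merged["clone_count"].tolist())  # [5, 4, 7] -- 2024-05-02 keeps 4, not 2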