Skip to content

Commit 96c146c

Browse files
authored
Fix figure (#39)
1 parent 64b60da commit 96c146c

File tree

7 files changed

+118
-87
lines changed

7 files changed

+118
-87
lines changed

.github/workflows/track-clones.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ jobs:
2727
env:
2828
SECRET_TOKEN: ${{ secrets.SECRET_TOKEN }}
2929
run: python scripts/clone-tracking/fetch.py
30+
- name: Run cumulative.py
31+
run: python scripts/clone-tracking/cumulative.py
3032
- name: Commit download statistics
3133
run: |
3234
git config --global user.name 'tsalo'

_data/clone-tracking/cumulative/ReproBrainChart_HBN_XCP_cum_clones.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
date,clone_count
22
2024-05-01,0
3-
2024-05-01,0
43
2024-05-02,0
54
2024-05-03,0
65
2024-05-04,0

_data/clone-tracking/cumulative/ReproBrainChart_PACCT_BIDS_cum_clones.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
date,clone_count
22
2024-05-01,0
3-
2024-05-01,0
43
2024-05-02,0
54
2024-05-03,0
65
2024-05-04,0

_data/clone-tracking/cumulative/ReproBrainChart_PACCT_CPAC_cum_clones.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
date,clone_count
22
2024-05-01,0
3-
2024-05-01,0
43
2024-05-02,0
54
2024-05-03,0
65
2024-05-04,0

scripts/clone-tracking/create_graph.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from bokeh.models import ColumnDataSource, CustomJS, DatetimeTickFormatter, Select
88
from bokeh.plotting import curdoc, figure, output_file, show
99

10+
1011
if __name__ == "__main__":
1112
script_dir = os.path.dirname(__file__)
1213
stats_dir = os.path.join(script_dir, "../../_data/clone-tracking")
@@ -19,14 +20,14 @@
1920
os.path.join(
2021
script_dir,
2122
"../../_data/clone-tracking/cumulative/all_repos_cumulative.csv",
22-
)
23-
)
23+
),
24+
),
2425
)
2526
df["date"] = pd.to_datetime(df["date"])
26-
REPOS = list(df)[1:]
27+
repos = [col for col in df.columns if col != "date"]
2728

2829
# dropdown menu
29-
select = Select(title="Repository", value="Overall", options=REPOS)
30+
select = Select(title="Repository", value="Overall", options=repos)
3031

3132
def get_data(source_data, repo):
3233
df = source_data[["date", repo]]

scripts/clone-tracking/cumulative.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""Fetch download statistics for a GitHub repository and output to a directory."""
2+
3+
import os
4+
from glob import glob
5+
6+
import pandas as pd
7+
8+
def main(folder):
    """Combine cumulative statistics for all repositories into a single file.

    Parameters
    ----------
    folder : str
        Directory containing the per-repository cumulative CSV files
        written by fetch.py as ``<owner>_<repo>_cum_clones.csv``.
    """
    # fetch.py names the per-repo files "<owner>_<repo>_cum_clones.csv"
    # (and the data directory contains e.g.
    # ReproBrainChart_HBN_XCP_cum_clones.csv), so glob for that suffix.
    # The previous pattern ("*_cumulative.csv") matched nothing, which made
    # pd.concat([]) raise ValueError.
    repo_wise_cumulative_files = sorted(
        glob(os.path.join(folder, "ReproBrainChart_*_cum_clones.csv"))
    )
    dfs = []
    for file_ in repo_wise_cumulative_files:
        # "<owner>_<repo>_cum_clones.csv" -> "<owner>/<repo>"
        repo_name = os.path.basename(file_).split("_cum_clones")[0]
        repo_name = repo_name.replace("_", "/", 1)
        df = pd.read_csv(file_, index_col="date")
        df = df.rename({"clone_count": repo_name}, axis=1)
        dfs.append(df)

    if not dfs:
        # No per-repo files yet; leave any existing combined file untouched.
        return

    # Concatenate all repo-wise cumulative stats into a single DataFrame,
    # aligning on the date index; dates missing for a repo become 0.
    df_cum = pd.concat(dfs, axis=1)
    df_cum = df_cum.fillna(0)
    df_cum = df_cum.sort_index()

    # Calculate the overall cumulative clone count across all repositories
    df_cum["Overall"] = df_cum.sum(axis=1)

    # update overall cumulative stats across all repos
    overall_cumulative_clones_file = os.path.join(folder, "all_repos_cumulative.csv")
    df_cum.to_csv(overall_cumulative_clones_file, index_label="date")
if __name__ == "__main__":
    # Resolve the cumulative-stats directory relative to this script's
    # own location so the script works from any working directory.
    here = os.path.dirname(__file__)
    cumulative_dir = os.path.abspath(
        os.path.join(here, "../../_data/clone-tracking/cumulative")
    )
    main(cumulative_dir)

scripts/clone-tracking/fetch.py

Lines changed: 73 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""Fetch download statistics for a GitHub repository and output to a directory."""
22

3-
import argparse
43
import os
54
from datetime import date
65
from datetime import datetime as dt
@@ -11,12 +10,19 @@
1110

1211

1312
def main(repo):
14-
"""Fetch download statistics for a GitHub repository and output to a directory."""
13+
"""Fetch download statistics for a GitHub repository and output to a directory.
14+
15+
Parameters
16+
----------
17+
repo : str
18+
The name of the repository in the format "owner/repo".
19+
"""
1520
print(f"Fetching clone statistics for {repo}...")
1621
token = os.environ.get("SECRET_TOKEN")
1722
g = Github(token)
1823
repoobj = g.get_repo(repo)
1924

25+
# List clones for the repository
2026
df_clones = clones_to_df(fetch_clones(repoobj))
2127
owner_name, repo_name = repo.split("/")
2228

@@ -28,96 +34,60 @@ def main(repo):
2834
os.makedirs(os.path.join(stats_dir, daily_dir), exist_ok=True)
2935
os.makedirs(os.path.join(stats_dir, cum_dir), exist_ok=True)
3036

31-
daily_path = os.path.join(
37+
daily_clones_file = os.path.join(
3238
stats_dir,
3339
daily_dir,
3440
f"{owner_name}_{repo_name}_daily_clones.csv",
3541
)
42+
if os.path.isfile(daily_clones_file):
43+
df_clones_historical = pd.read_csv(daily_clones_file, index_col="date")
44+
else:
45+
df_clones_historical = pd.DataFrame(columns=["clone_count"])
3646

3747
if len(df_clones):
38-
df_latest_clones = df_clones.tail(1)
39-
last_download_date = df_clones.tail(1).index.date[0]
40-
41-
if not os.path.isfile(daily_path):
42-
patch_df(df_clones).to_csv(daily_path)
43-
44-
# if latest clone timestamp is not today's date, that means there were
45-
# no clones today and we should just put 0 for "number of clones"
46-
elif last_download_date != date.today():
47-
df_todays_clones = pd.DataFrame(
48-
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
49-
)
50-
df_todays_clones.to_csv(daily_path, mode="a", header=False)
51-
52-
else:
53-
df_latest_clones.to_csv(daily_path, mode="a", header=False)
54-
55-
elif os.path.isfile(daily_path):
56-
df_todays_clones = pd.DataFrame(
57-
data=[0], index=pd.DatetimeIndex(data=[dt.now().date()])
58-
)
59-
df_todays_clones.to_csv(daily_path, mode="a", header=False)
60-
61-
# if this script is run for the first time and no clones were made
62-
# in the past 2 weeks, create a csv storing today's clone count (i.e. 0)
48+
# Merge df_clones and df_clones_historical.
49+
df_clones = pd.concat([df_clones_historical, df_clones], axis=0)
50+
# Sort by clone count so the rows with the highest counts are first
51+
df_clones = df_clones.sort_values("clone_count", ascending=False)
52+
# Drop duplicate index rows, retaining the highest clone count,
53+
# which should be the most accurate
54+
df_clones = df_clones.loc[~df_clones.index.duplicated(keep="first")]
55+
# Sort by date
56+
df_clones = df_clones.sort_index()
6357
else:
64-
df_todays_clones = pd.DataFrame(
65-
data={"clone_count": [0]}, index=pd.DatetimeIndex(data=[dt.now().date()])
66-
)
67-
df_todays_clones.index.name = "date"
68-
df_todays_clones.to_csv(daily_path)
58+
df_clones = df_clones_historical.copy()
59+
60+
# Fill in missing dates with 0s
61+
df_clones = patch_df(df_clones)
62+
# Sort by date again (just to be safe)
63+
df_clones = df_clones.sort_index()
64+
65+
df_clones.to_csv(daily_clones_file, index_label="date")
6966

7067
# generate cumulative downloads for this repo + output to directory
71-
cum_path = os.path.join(
72-
stats_dir, cum_dir, f"{owner_name}_{repo_name}_cum_clones.csv"
68+
cumulative_clones_file = os.path.join(
69+
stats_dir,
70+
cum_dir,
71+
f"{owner_name}_{repo_name}_cum_clones.csv",
7372
)
74-
df_cum = pd.read_csv(daily_path)
73+
df_cum = df_clones.copy()
7574
df_cum["clone_count"] = df_cum["clone_count"].cumsum()
76-
df_cum.to_csv(cum_path, mode="w+", index=False)
77-
78-
# update overall cumulative stats across all repos
79-
overall_cum_path = os.path.join(stats_dir, cum_dir, "all_repos_cumulative.csv")
80-
update_overall_cumulative(df_cum, overall_cum_path, repo)
81-
82-
83-
def update_overall_cumulative(df_add, path, repo_name):
84-
"""Update cumulative statistics for all repositories."""
85-
df_latest_clones = df_add.tail(1)
86-
todays_date = df_latest_clones.iat[0, 0]
87-
todays_clone_count = df_latest_clones.iloc[0, 1]
88-
df_add = df_add.rename({"clone_count": repo_name}, axis=1)
89-
90-
if not os.path.exists(path):
91-
df_add.insert(loc=1, column="Overall", value=df_add[repo_name])
92-
df_add.to_csv(path, index=False)
93-
94-
# if column for this repo already exists in csv
95-
elif repo_name in pd.read_csv(path):
96-
df_overall = pd.read_csv(path)
97-
# if csv already contains row for today
98-
if todays_date in df_overall["date"].values:
99-
df_overall.at[len(df_overall.index) - 1, repo_name] = todays_clone_count
100-
df_overall.at[len(df_overall.index) - 1, "Overall"] += todays_clone_count
101-
df_overall.to_csv(path, index=False)
102-
else:
103-
df_new_row = pd.DataFrame(columns=list(df_overall))
104-
df_new_row.at[0, "date"] = todays_date
105-
df_new_row.at[0, repo_name] = todays_clone_count
106-
df_new_row.at[0, "Overall"] = todays_clone_count
107-
df_new_row.to_csv(path, mode="a", header=False, index=False)
108-
else:
109-
df_overall = pd.read_csv(path)
110-
df_add = df_add.set_index("date")
111-
df_overall = df_overall.set_index("date")
112-
113-
df_overall = pd.concat([df_overall, df_add], axis=1).sort_index()
114-
df_overall.fillna(0, inplace=True)
115-
df_overall["Overall"] += df_overall[repo_name]
116-
df_overall.to_csv(path)
75+
df_cum.to_csv(cumulative_clones_file, index_label="date")
11776

11877

11978
def patch_df(df):
120-
"""Fill in dates where no clones were made with 0's."""
79+
"""Fill in dates where no clones were made with 0's.
80+
81+
Parameters
82+
----------
83+
df : pd.DataFrame
84+
The DataFrame containing clone statistics.
85+
86+
Returns
87+
-------
88+
pd.DataFrame
89+
The DataFrame with missing dates filled in.
90+
"""
12191
cur_date = df.index[0].date()
12292
todays_date = dt.now().date()
12393
row = 0
@@ -140,7 +110,19 @@ def patch_df(df):
140110

141111

142112
def clones_to_df(clones):
143-
"""Convert clone statistics to a DataFrame."""
113+
"""Convert clone statistics to a DataFrame.
114+
115+
Parameters
116+
----------
117+
clones : list of github.RepositoryClone
118+
The clone statistics.
119+
120+
Returns
121+
-------
122+
pd.DataFrame
123+
The clone statistics as a DataFrame.
124+
The index is the date of the clone and the column is clone_count.
125+
"""
144126
timestamps = []
145127
total_clone_counts = []
146128

@@ -157,7 +139,18 @@ def clones_to_df(clones):
157139

158140

159141
def fetch_clones(repo):
    """Fetch clone statistics for a repository.

    Parameters
    ----------
    repo : github.Repository.Repository
        The repository object.

    Returns
    -------
    list of github.RepositoryClone
        The clone statistics.
    """
    # get_clones_traffic returns a mapping; its "clones" entry holds the
    # per-day clone records we care about.
    return repo.get_clones_traffic()["clones"]
163156

0 commit comments

Comments
 (0)