1
1
"""Fetch download statistics for a GitHub repository and output to a directory."""
2
2
3
- import argparse
4
3
import os
5
4
from datetime import date
6
5
from datetime import datetime as dt
11
10
12
11
13
12
def main (repo ):
14
- """Fetch download statistics for a GitHub repository and output to a directory."""
13
+ """Fetch download statistics for a GitHub repository and output to a directory.
14
+
15
+ Parameters
16
+ ----------
17
+ repo : str
18
+ The name of the repository in the format "owner/repo".
19
+ """
15
20
print (f"Fetching clone statistics for { repo } ..." )
16
21
token = os .environ .get ("SECRET_TOKEN" )
17
22
g = Github (token )
18
23
repoobj = g .get_repo (repo )
19
24
25
+ # List clones for the repository
20
26
df_clones = clones_to_df (fetch_clones (repoobj ))
21
27
owner_name , repo_name = repo .split ("/" )
22
28
@@ -28,96 +34,60 @@ def main(repo):
28
34
os .makedirs (os .path .join (stats_dir , daily_dir ), exist_ok = True )
29
35
os .makedirs (os .path .join (stats_dir , cum_dir ), exist_ok = True )
30
36
31
- daily_path = os .path .join (
37
+ daily_clones_file = os .path .join (
32
38
stats_dir ,
33
39
daily_dir ,
34
40
f"{ owner_name } _{ repo_name } _daily_clones.csv" ,
35
41
)
42
+ if os .path .isfile (daily_clones_file ):
43
+ df_clones_historical = pd .read_csv (daily_clones_file , index_col = "date" )
44
+ else :
45
+ df_clones_historical = pd .DataFrame (columns = ["clone_count" ])
36
46
37
47
if len (df_clones ):
38
- df_latest_clones = df_clones .tail (1 )
39
- last_download_date = df_clones .tail (1 ).index .date [0 ]
40
-
41
- if not os .path .isfile (daily_path ):
42
- patch_df (df_clones ).to_csv (daily_path )
43
-
44
- # if latest clone timestamp is not today's date, that means there were
45
- # no clones today and we should just put 0 for "number of clones"
46
- elif last_download_date != date .today ():
47
- df_todays_clones = pd .DataFrame (
48
- data = [0 ], index = pd .DatetimeIndex (data = [dt .now ().date ()])
49
- )
50
- df_todays_clones .to_csv (daily_path , mode = "a" , header = False )
51
-
52
- else :
53
- df_latest_clones .to_csv (daily_path , mode = "a" , header = False )
54
-
55
- elif os .path .isfile (daily_path ):
56
- df_todays_clones = pd .DataFrame (
57
- data = [0 ], index = pd .DatetimeIndex (data = [dt .now ().date ()])
58
- )
59
- df_todays_clones .to_csv (daily_path , mode = "a" , header = False )
60
-
61
- # if this script is run for the first time and no clones were made
62
- # in the past 2 weeks, create a csv storing today's clone count (i.e. 0)
48
+ # Merge df_clones and df_clones_historical.
49
+ df_clones = pd .concat ([df_clones_historical , df_clones ], axis = 0 )
50
+ # Sort by clone count so the rows with the highest counts are first
51
+ df_clones = df_clones .sort_values ("clone_count" , ascending = False )
52
+ # Drop duplicate index rows, retaining the highest clone count,
53
+ # which should be the most accurate
54
+ df_clones = df_clones .loc [~ df_clones .index .duplicated (keep = "first" )]
55
+ # Sort by date
56
+ df_clones = df_clones .sort_index ()
63
57
else :
64
- df_todays_clones = pd .DataFrame (
65
- data = {"clone_count" : [0 ]}, index = pd .DatetimeIndex (data = [dt .now ().date ()])
66
- )
67
- df_todays_clones .index .name = "date"
68
- df_todays_clones .to_csv (daily_path )
58
+ df_clones = df_clones_historical .copy ()
59
+
60
+ # Fill in missing dates with 0s
61
+ df_clones = patch_df (df_clones )
62
+ # Sort by date again (just to be safe)
63
+ df_clones = df_clones .sort_index ()
64
+
65
+ df_clones .to_csv (daily_clones_file , index_label = "date" )
69
66
70
67
# generate cumulative downloads for this repo + output to directory
71
- cum_path = os .path .join (
72
- stats_dir , cum_dir , f"{ owner_name } _{ repo_name } _cum_clones.csv"
68
+ cumulative_clones_file = os .path .join (
69
+ stats_dir ,
70
+ cum_dir ,
71
+ f"{ owner_name } _{ repo_name } _cum_clones.csv" ,
73
72
)
74
- df_cum = pd . read_csv ( daily_path )
73
+ df_cum = df_clones . copy ( )
75
74
df_cum ["clone_count" ] = df_cum ["clone_count" ].cumsum ()
76
- df_cum .to_csv (cum_path , mode = "w+" , index = False )
77
-
78
- # update overall cumulative stats across all repos
79
- overall_cum_path = os .path .join (stats_dir , cum_dir , "all_repos_cumulative.csv" )
80
- update_overall_cumulative (df_cum , overall_cum_path , repo )
81
-
82
-
83
- def update_overall_cumulative (df_add , path , repo_name ):
84
- """Update cumulative statistics for all repositories."""
85
- df_latest_clones = df_add .tail (1 )
86
- todays_date = df_latest_clones .iat [0 , 0 ]
87
- todays_clone_count = df_latest_clones .iloc [0 , 1 ]
88
- df_add = df_add .rename ({"clone_count" : repo_name }, axis = 1 )
89
-
90
- if not os .path .exists (path ):
91
- df_add .insert (loc = 1 , column = "Overall" , value = df_add [repo_name ])
92
- df_add .to_csv (path , index = False )
93
-
94
- # if column for this repo already exists in csv
95
- elif repo_name in pd .read_csv (path ):
96
- df_overall = pd .read_csv (path )
97
- # if csv already contains row for today
98
- if todays_date in df_overall ["date" ].values :
99
- df_overall .at [len (df_overall .index ) - 1 , repo_name ] = todays_clone_count
100
- df_overall .at [len (df_overall .index ) - 1 , "Overall" ] += todays_clone_count
101
- df_overall .to_csv (path , index = False )
102
- else :
103
- df_new_row = pd .DataFrame (columns = list (df_overall ))
104
- df_new_row .at [0 , "date" ] = todays_date
105
- df_new_row .at [0 , repo_name ] = todays_clone_count
106
- df_new_row .at [0 , "Overall" ] = todays_clone_count
107
- df_new_row .to_csv (path , mode = "a" , header = False , index = False )
108
- else :
109
- df_overall = pd .read_csv (path )
110
- df_add = df_add .set_index ("date" )
111
- df_overall = df_overall .set_index ("date" )
112
-
113
- df_overall = pd .concat ([df_overall , df_add ], axis = 1 ).sort_index ()
114
- df_overall .fillna (0 , inplace = True )
115
- df_overall ["Overall" ] += df_overall [repo_name ]
116
- df_overall .to_csv (path )
75
+ df_cum .to_csv (cumulative_clones_file , index_label = "date" )
117
76
118
77
119
78
def patch_df (df ):
120
- """Fill in dates where no clones were made with 0's."""
79
+ """Fill in dates where no clones were made with 0's.
80
+
81
+ Parameters
82
+ ----------
83
+ df : pd.DataFrame
84
+ The DataFrame containing clone statistics.
85
+
86
+ Returns
87
+ -------
88
+ pd.DataFrame
89
+ The DataFrame with missing dates filled in.
90
+ """
121
91
cur_date = df .index [0 ].date ()
122
92
todays_date = dt .now ().date ()
123
93
row = 0
@@ -140,7 +110,19 @@ def patch_df(df):
140
110
141
111
142
112
def clones_to_df (clones ):
143
- """Convert clone statistics to a DataFrame."""
113
+ """Convert clone statistics to a DataFrame.
114
+
115
+ Parameters
116
+ ----------
117
+ clones : list of github.RepositoryClone
118
+ The clone statistics.
119
+
120
+ Returns
121
+ -------
122
+ pd.DataFrame
123
+ The clone statistics as a DataFrame.
124
+ The index is the date of the clone and the column is clone_count.
125
+ """
144
126
timestamps = []
145
127
total_clone_counts = []
146
128
@@ -157,7 +139,18 @@ def clones_to_df(clones):
157
139
158
140
159
141
def fetch_clones (repo ):
160
- """Fetch clone statistics for a repository."""
142
+ """Fetch clone statistics for a repository.
143
+
144
+ Parameters
145
+ ----------
146
+ repo : github.Repository.Repository
147
+ The repository object.
148
+
149
+ Returns
150
+ -------
151
+ list of github.RepositoryClone
152
+ The clone statistics.
153
+ """
161
154
clones = repo .get_clones_traffic ()
162
155
return clones ["clones" ]
163
156
0 commit comments