# Copyright 2018 Twitter, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import json

import requests

import graphql_queries

PATH_TO_DATA = "data"
GITHUB_USERNAME = os.environ["GH_USERNAME"]
GITHUB_OAUTH_TOKEN = os.environ["OAUTH_TOKEN"]
GITHUB_API_ENDPOINT = "https://api.github.com/graphql"

print("LOG: Assuming the current path to be the root of the metrics repository.")

# Stats substituted into the network SVG; 'N/A' until fetched below
SVG_NO_OF_MEMBERS = 'N/A'
SVG_NO_OF_REPOS = 'N/A'
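
# The script assumes GH_USERNAME and OAUTH_TOKEN are exported in the
# environment; a typical invocation might look like (hypothetical values):
#   GH_USERNAME=octocat OAUTH_TOKEN=<personal-access-token> python fetch_projects.py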

def fetch_one_page(query_string, variables):
    """
    Query the GitHub GraphQL API and return the parsed JSON response.
    """
    headers = {
        "Content-Type": "application/json",
    }
    r = requests.post(
        GITHUB_API_ENDPOINT,
        json={"query": query_string, "variables": variables},
        headers=headers,
        auth=(GITHUB_USERNAME, GITHUB_OAUTH_TOKEN),
    )
    if r.status_code == 200:
        return r.json()
    raise Exception("Error in GitHub API query. Status code: {}, Response: {}".format(r.status_code, r.json()))
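
# For reference, the org query is consumed below as if it returns roughly
# this shape (a sketch; the exact fields live in graphql_queries):
#   {"data": {"organization": {
#       "membersWithRole": {"totalCount": ...},
#       "repositories": {"edges": [...],
#                        "pageInfo": {"hasNextPage": ..., "endCursor": ...}}}}}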

all_org_edges = []  # All the repos in the tracked orgs, with their stats

# Read repos-to-include.txt
all_orgs = []   # Orgs tracked in full, e.g. 'twitter'
all_repos = []  # Individually tracked repositories, e.g. ('pantsbuild', 'pants')
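
# repos-to-include.txt holds one "owner/repo" entry per line, where a repo of
# "*" tracks the whole org. Hypothetical contents:
#   twitter/*
#   pantsbuild/pants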
with open("repos-to-include.txt", "r") as f:
    for line in f:
        owner, repo = line.rstrip("\n").split("/")
        if repo == "*":
            all_orgs.append(owner)
        else:
            all_repos.append((owner, repo))

print("LOG: Orgs to track", all_orgs)
print("LOG: Repos to track", all_repos)

for org in all_orgs:
    # Combine the paginated responses from the API
    has_next_page = False
    end_cursor = None
    num_of_pages = 0
    while True:
        print("LOG: Page number", num_of_pages)
        variables = json.dumps({"owner": org, "endCursor": end_cursor})
        print("LOG: Sending request for", org)
        response = fetch_one_page(graphql_queries.org_all_repos, variables)
        print("LOG: Received response for", org)
        if org == 'twitter':
            SVG_NO_OF_MEMBERS = response["data"]["organization"]["membersWithRole"]["totalCount"]
        repository_edges = response["data"]["organization"]["repositories"]["edges"]
        all_org_edges.extend(repository_edges)
        page_info = response["data"]["organization"]["repositories"]["pageInfo"]
        has_next_page = page_info["hasNextPage"]
        print("LOG: has_next_page", has_next_page)
        end_cursor = page_info["endCursor"]
        print("LOG: end_cursor", end_cursor)
        num_of_pages += 1
        if not has_next_page:
            break
print("LOG: Fetched all the org repositories. Count:", len(all_org_edges))
# print("LOG: First record")
# print(all_org_edges[0])

# Fetch individual repositories' data
all_repo_edges = []  # All individually tracked repos
for repo in all_repos:
    # These are fetched in a single page each; endCursor stays None
    variables = json.dumps({"owner": repo[0], "repo": repo[1], "endCursor": None})
    response = fetch_one_page(graphql_queries.repo_wise, variables)
    all_repo_edges.append(response["data"])
print("LOG: Fetched all the individual repos as well. Count:", len(all_repo_edges))

# Repos to exclude
repos_to_exclude = set()
with open("repos-to-exclude.txt", "r") as f:
    for line in f:
        repos_to_exclude.add(line.rstrip("\n"))
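
# repos-to-exclude.txt is assumed to list full "owner/repo" names, one per
# line, matched against nameWithOwner below.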
print("LOG: Removing private repositories")
public_repos = []
for edge in all_org_edges:
if not edge["node"]["isPrivate"]:
public_repos.append(edge)
for edge in all_repo_edges:
if not edge["repository"]["isPrivate"]:
public_repos.append({'node': edge['repository']})
SVG_NO_OF_REPOS = len(public_repos)
print("LOG: Number of public repos", len(public_repos))

DATA_JSON = {}
for repo in public_repos:
    repo_full_name = repo["node"]["nameWithOwner"]
    if repo_full_name in repos_to_exclude:
        print("LOG: Excluding", repo_full_name)
        continue
    DATA_JSON[repo_full_name] = repo["node"]

    # Flatten the list of languages into a space-separated string
    languages_dict = {}
    for item in DATA_JSON[repo_full_name]["languages"]["edges"]:
        languages_dict[item["node"]["name"]] = item["size"]
    total_bytes = sum(languages_dict.values())
    if total_bytes:  # Guard against repos with no detected languages
        for lang in languages_dict:
            languages_dict[lang] /= total_bytes  # True division yields a float (Python 3)
    # Keep only languages that make up more than 5% of the code
    languages = []
    for item, value in languages_dict.items():
        if value > 0.05:
            languages.append(item)
    DATA_JSON[repo_full_name]["languages"] = " ".join(languages)

    # Flatten the list of repository topics
    _topics = DATA_JSON[repo_full_name]["repositoryTopics"]["edges"]
    topics = []
    for item in _topics:
        topics.append(item["node"]["topic"]["name"])
    DATA_JSON[repo_full_name]["repositoryTopics"] = " ".join(topics)

    # Flatten stargazer and watcher counts
    DATA_JSON[repo_full_name]["stargazers"] = DATA_JSON[repo_full_name]["stargazers"]["totalCount"]
    DATA_JSON[repo_full_name]["watchers"] = DATA_JSON[repo_full_name]["watchers"]["totalCount"]
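
# At this point each DATA_JSON entry is flat; with hypothetical values a
# record looks roughly like:
#   {"nameWithOwner": "twitter/scrooge", "languages": "Scala Java",
#    "repositoryTopics": "thrift code-generation",
#    "stargazers": 750, "watchers": 60, ...}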

# Save to the data directory
file_path = os.path.join(PATH_TO_DATA, "projects.json")
with open(file_path, "w") as f:
    json.dump(DATA_JSON, f, sort_keys=True, indent=2)
print("LOG: Saved to", file_path)
# Update the SVG
print("LOG: No of members", SVG_NO_OF_MEMBERS)
print("LOG: No of repos", SVG_NO_OF_REPOS)
with open("static/assets/network_raw.svg") as f:
    network_svg = f.read()
network_svg = network_svg.replace("{$members}", str(SVG_NO_OF_MEMBERS))
network_svg = network_svg.replace("{$Repos}", str(SVG_NO_OF_REPOS))
with open("static/assets/network.svg", "w") as f:
    f.write(network_svg)
print("LOG: assets/network.svg updated!")