Skip to content

Commit 7710fd3

Browse files
committed
Initial commit
0 parents  commit 7710fd3

14 files changed

+1192084
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
packages/

all.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

analysis.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
from __future__ import print_function, division

import pandas as pd
import networkx as nx
from networkx.drawing.nx_pydot import write_dot
import matplotlib.pyplot as plt
from matplotlib import patches

import seaborn as sns
import operator
import numpy as np

# Global plotting style applied to every figure produced by this module.
sns.set_context('notebook', font_scale=1.5)
sns.set_style('white')

# Edge list of (package_name, requirement) pairs scraped from PyPI.
# NOTE(review): assumes requirements.csv was produced by dependecies.py -- verify.
requirements = pd.read_csv('requirements.csv')
16+
17+
def make_graph(df, min_edges=0):
    """Build a directed package-dependency graph from a requirements table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'package_name' and 'requirement' columns.
    min_edges : int
        Nodes whose total degree is <= min_edges are removed (default 0).

    Returns
    -------
    networkx.DiGraph with an edge package -> requirement.
    """
    DG = nx.DiGraph()
    DG.add_nodes_from(df.package_name.unique())
    edges = df.loc[df.requirement.notnull(), ['package_name', 'requirement']].values
    DG.add_edges_from(edges)

    # Remove bad nodes produced by the scrape (placeholder names / NaN).
    DG.remove_nodes_from(['.', 'nan', np.nan])

    # networkx 1.x returns a dict from degree(); 2.x returns a (node, degree)
    # view.  dict() normalises both, so the old bare try/except (which
    # silently skipped the filtering on 2.x) is no longer needed.
    deg = dict(DG.degree())
    to_remove = [n for n, d in deg.items() if d <= min_edges]
    DG.remove_nodes_from(to_remove)
    return DG
35+
36+
#DG = make_graph(requirements, min_edges=10)
#write_dot(DG, 'requirements_graph.dot')

#dep_graph = make_graph(requirements, min_edges=0)

#print(len(dep_graph.node))
# Build the full, unfiltered dependency graph once at import time; the
# plotting helpers below all read this module-level G.
G = make_graph(requirements)
print(G.number_of_edges())
44+
45+
def dependency_graph():
    """Plot the empirical CDF of per-package dependency (out-degree) counts
    for the module-level graph G."""
    counts = [len(G.out_edges([node])) for node in G if len(G[node]) != 0]
    cleaned = zero_to_nan(counts)
    cleaned = np.sort(cleaned)
    print(cleaned)
    print(len(cleaned))
    # Empirical CDF: i-th smallest value maps to i / (n - 1).
    cdf = 1. * np.arange(len(cleaned)) / (len(cleaned) - 1)
    plt.plot(cleaned, cdf, marker='.', linestyle='none')
    _ = plt.xlabel('Dependencies')
    _ = plt.ylabel('CDF')
    plt.margins(0.02)
    plt.show()
62+
63+
def zero_to_nan(values):
    """Return a copy of *values* with every zero replaced by NaN."""
    nan = float('nan')
    result = []
    for item in values:
        result.append(nan if item == 0 else item)
    return result
65+
66+
def pageRank():
    """Compute PageRank over the module-level graph G.

    Returns the scores only (not the node names), sorted descending.
    """
    # The original initialised pr = {} and immediately overwrote it -- dead code.
    pr = nx.pagerank(G)
    return sorted(pr.values(), reverse=True)
72+
73+
# Accumulates, per start node, the maximum DFS depth reached.  Appended to
# by dfs_depth() and later sorted/consumed by deplist().
max_d = []

def dfs_depth(G, source=None, depth_limit=None):
    """Iterative depth-first traversal yielding (parent, child) edges.

    Adapted from networkx's dfs_edges, with one addition: for each start
    node it records the deepest level reached into the module-level max_d
    list (a side effect, not a return value).

    Parameters
    ----------
    G : graph; source : optional start node (default: all nodes);
    depth_limit : optional maximum depth (default: len(G)).
    """
    if source is None:
        nodes = G
    else:
        nodes = [source]
    visited = set()
    if depth_limit is None:
        depth_limit = len(G)
    for start in nodes:
        print(start)
        if start in visited:
            continue
        max_depth = 0
        visited.add(start)
        # Stack entries: (node, remaining depth budget, child iterator).
        stack = [(start, depth_limit, iter(G[start]))]
        while stack:
            parent, depth_now, children = stack[-1]
            try:
                child = next(children)
                if child not in visited:
                    yield parent, child
                    visited.add(child)
                    if depth_now > 1:
                        # depth_limit - depth_now + 1 is the current depth
                        # from the start node; track the deepest one seen.
                        if((depth_limit - depth_now + 1)>max_depth):
                            max_depth = depth_limit - depth_now + 1
                        stack.append((child, depth_now - 1, iter(G[child])))
            except StopIteration:
                # Current node exhausted -- backtrack.
                stack.pop()
        global max_d
        max_d.append(max_depth)
105+
106+
def deplist(pr):
    # Calculate all the dependencies, dependents
    """Collect per-node graph statistics for G and plot density estimates.

    For every node gathers: out-degree (dependencies), in-degree
    (dependents), reachable DFS edge count, and max DFS depth (via the
    dfs_depth side effect on the module-level max_d).  *pr* is a list of
    PageRank scores already sorted descending (see pageRank()).  Only the
    top 1000 values of each statistic are plotted.
    """
    dcon = {}
    list1 = []
    list2 = []
    list3 = []
    list4 = []
    for node in G:
        print(node)
        #temp = {node:len(G.out_edges(node))}
        list1.append(node)
        list2.append(len(G.out_edges(node)))
        list3.append(len(G.in_edges(node)))
        list4.append(len(list(nx.dfs_edges(G,node))))
        # Generator must be drained for its max_d side effect to happen.
        list(dfs_depth(G, node))
        #dcon.update(temp)
    # NOTE: each column is sorted independently, so rows do not correspond
    # to a single package -- these are marginal distributions only.
    list2 = sorted(list2, reverse=True)
    list3 = sorted(list3, reverse=True)
    list4 = sorted(list4, reverse=True)
    global max_d
    max_d = sorted(max_d, reverse=True)
    df = pd.DataFrame(data={"Dependencies":list2[:1000], "Dependents":list3[:1000], "DFS-Edges":list4[:1000], "Max-Depth":max_d[:1000], "Page Rank":pr[:1000]})
    df.plot(kind="density", subplots=True, layout=(3,2), sharex=False)
    plt.show()
130+
131+
#pr = pageRank()
132+
#deplist(pr)
133+
134+
#dependency_graph()
135+
136+
print(G.in_edges())

# BUG FIX: the computation of sorted_dict was commented out, so the code
# below raised NameError at import.  dict() normalises the networkx
# 1.x (dict) / 2.x (view) return of in_degree().
sorted_dict = sorted(dict(G.in_degree()).items(), key=operator.itemgetter(1))[::-1]

# Horizontal bar chart of the N most-depended-upon packages.
N = 10
x = np.arange(N)
y = np.array([d[1] for d in sorted_dict[:N]])
xlabels = [d[0] for d in sorted_dict[:N]][::-1]
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

ax.barh(x[::-1], y, height=1.0)
ax.set_yticks(x + 0.5)
_ = ax.set_yticklabels(xlabels)
ax.set_xlabel('Number of Connections')
ax.set_title('Graph Degree')
fig.subplots_adjust(left=0.27, bottom=0.1, top=0.95)
fig.show()
153+

authors.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os, re
2+
3+
def author():
    """Scan packages/*/setup.py and collect {package_name: [author, email]}.

    Side effects only: changes the working directory while scanning and
    prints the collected mapping at the end (matching the original's
    behaviour of returning nothing).
    """
    os.chdir("packages")
    print(os.getcwd())
    # BUG FIX: this was initialised as a list, so main_dic.update(dic) below
    # raised AttributeError, which the old bare except swallowed -- the
    # result was always empty.
    main_dic = {}
    for pkg_dir in os.listdir():
        # BUG FIX: the loop variable was named `f` and then shadowed by the
        # open() file handle below.
        os.chdir(pkg_dir)
        print(os.listdir())
        try:
            with open("setup.py", "r") as fh:
                lines = fh.readlines()
        except OSError:
            # No readable setup.py in this directory -- skip the package.
            os.chdir("../")
            continue
        dic = {}
        for line in lines:
            t = []
            if 'name=' in line:
                ln = "".join(line.split())
                ln = re.search("name='(.+?)',", ln)
                if ln:
                    ln = ln.group(1)
                    ln = ln.strip('\"')
                    ln = ln.strip("\'")
                    dic[ln] = []
            # NOTE(review): assumes name= appears before author= in
            # setup.py; otherwise ln is unbound here -- confirm.
            if 'author=' in line:
                la = "".join(line.split())
                la = re.search("author='(.+?)',", la)
                if la:
                    la = la.group(1)
                    la = la.strip('\"')
                    la = la.strip("\'")
                    t.append(la)
                    dic[ln] = t
            if 'author_email=' in line:
                le = "".join(line.split())
                le = re.search("author_email='(.+?)',", le)
                if le:
                    le = le.group(1)
                    le = le.strip('\"')
                    le = le.strip("\'")
                    t.append(le)
                    dic[ln] = t
                    # Email is the last field we need from this setup.py.
                    break
        print(dic)
        main_dic.update(dic)
        os.chdir("../")
    print(main_dic)
53+
54+
def main():
    """Entry point: run the setup.py author scan."""
    author()


if __name__ == "__main__":
    main()

base_dep.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import pandas as pd
2+
from collections import defaultdict
3+
import numpy as np
4+
5+
df = pd.read_csv("requirements.csv")
6+
7+
class Tree(object):
    """Simple n-ary tree of package names used to walk dependency chains."""

    def __init__(self, name):
        self.name = name
        self.children = []

    def __contains__(self, obj):
        # True if obj matches this node's name or any descendant's.
        return obj == self.name or any(obj in c for c in self.children)

    def add(self, obj):
        """Append *obj* as a new child unless it already occurs anywhere in
        the tree.  Return True if it was added, False otherwise."""
        if not self.__contains__(obj):
            self.children.append(Tree(obj))
            return True
        return False

    def get_base_requirements(self):
        """Return the unique names of all leaf descendants, i.e. packages
        that have no requirements of their own.

        BUG FIX: the original called child.children() -- a list is not
        callable (TypeError) -- and its recursion also skipped the child's
        own leaf children.  Recursing on the child directly is both correct
        and simpler.
        """
        base = []
        for child in self.children:
            if len(child.children) == 0:
                base.append(child.name)
            else:
                base.extend(child.get_base_requirements())
        return np.unique(base)
31+
32+
33+
def get_requirements(package):
    """Return the non-null requirement names recorded for *package* in the
    module-level df (as a numpy array)."""
    mask = (df.package_name == package) & df.requirement.notnull()
    return df.loc[mask, 'requirement'].values
35+
36+
37+
def get_dependency_tree(package, tree):
    """Recursively add every transitive requirement of *package* to *tree*.

    Already-present requirements are skipped (Tree.add returns False),
    which also prevents infinite recursion on cycles.  Returns the tree.
    """
    for requirement in get_requirements(package):
        if tree.add(requirement):
            # Newly seen requirement: walk its own requirements too.
            tree = get_dependency_tree(requirement, tree)
    return tree
46+
47+
# Walk every package's full dependency tree and record its base (leaf)
# requirements, then persist the result as base_requirements.csv.
datadict = defaultdict(list)
for i, package in enumerate(df.package_name.unique()):
    if i % 100 == 0:
        print('Package {}: {}'.format(i + 1, package))
    try:
        deptree = get_dependency_tree(package, Tree(package))
    except Exception as err:
        # BUG FIX: the original bare `except:` also caught KeyboardInterrupt,
        # and the bare `raise ValueError` discarded the original traceback.
        print('Failure getting base dependencies for {}'.format(package))
        raise ValueError('dependency walk failed for {}'.format(package)) from err
    for dependency in deptree.get_base_requirements():
        datadict['package_name'].append(package)
        datadict['requirements'].append(dependency)

base_df = pd.DataFrame(data=datadict)
base_df.head()

base_df.to_csv('base_requirements.csv', index=False)

converter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import pandas as pd

# Convert the scraped requirements table to JSON records for downloads.py.
# BUG FIX: DataFrame.to_json returns None when given a path, so the original
# print(r.to_json(...)) only ever printed "None".
r = pd.read_csv("requirements.csv")
r.to_json("all.json", orient='records')
print("wrote all.json ({} records)".format(len(r)))

dependecies.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Build requirements.csv: parse a concatenated requirements.txt dump
# (blank-line-separated sections, one per downloaded package) and join each
# "<package>-<version>" entry back to its PyPI name/version via the PyPI
# XML-RPC API.
import pandas as pd
from collections import defaultdict
import os
import requirements
import numpy as np
import xmlrpc.client as xc

client = xc.ServerProxy('https://pypi.python.org/pypi')
packages = client.list_packages()

datadict = defaultdict(list)
with open('requirements.txt', 'r') as infile:
    new_package = True
    for line in infile:
        if line.strip() == '':
            # Blank line closes the current package's section.
            new_package = True
            # NOTE(review): package_name is unbound if the file starts with
            # a blank line (NameError) -- assumes it never does; confirm.
            print(package_name)
            if package_name not in datadict['package']:
                # Package with no parsed requirements: record a NaN row so
                # it still appears in the output table.
                datadict['package'].append(package_name)
                datadict['requirement'].append(np.nan)
            continue

        if new_package:
            # If this is the case, the current line gives the name of the package
            package_name = os.path.basename(line).strip()
            new_package = False
        else:
            # This line gives a requirement for the current package
            try:
                print(line)
                for req in requirements.parse(line.strip()):
                    datadict['package'].append(package_name)
                    datadict['requirement'].append(req.name)
            except ValueError:
                # Unparseable requirement specifier -- skip it.
                pass


# Convert to dataframe
df = pd.DataFrame(data=datadict)
df.head()

# Second pass: resolve each "<name>-<version>" string against the live PyPI
# package list to split it into separate name/version columns.
df['package_name'] = np.nan
df['package_version'] = np.nan
for i, package in enumerate(packages):
    try:
        if i % 100 == 0:
            print('Package {}: {}'.format(i+1, package))
        for release in client.package_releases(package):
            try:
                pkg_str = '{}-{}'.format(package, release)
                idx = df.loc[df.package == pkg_str].index
                if len(idx) > 0:
                    df.loc[idx, 'package_name'] = package
                    df.loc[idx, 'package_version'] = release
            except:
                # Best-effort: ignore any per-release failure.
                pass
    except:
        # Best-effort: ignore any per-package XML-RPC failure.
        pass
df.head()

df.to_csv('requirements.csv', index=False)

downloads.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import json
2+
import requests
3+
import bs4
4+
from collections import OrderedDict
5+
6+
url = "https://pepy.tech/project/"
7+
8+
def download_counts():
    """Scrape pepy.tech for each package's download count and write the
    mapping to downloads_counts.json.

    Reads package names from all.json (records orient, as produced by
    converter.py); performs one HTTP request per package.
    """
    with open("all.json", "r") as f:
        pack = json.load(f)

    # A dictionary to store download counts for all the packages
    dic = {}
    # Deduplicate package names while preserving first-seen order.
    packages = list(OrderedDict.fromkeys(item["package_name"] for item in pack))
    for item in packages:
        print(item)
        try:
            res = requests.get(url + item)
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            # td is the tag they used. And taking 1st element from that
            td = soup.findAll("td")
            print(td[0].getText())
            # Direct assignment replaces the original temp-dict-then-update
            # round trip.
            dic[item] = td[0].getText()
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt); scraping stays best-effort per package.
            print("couldn't find")
    with open("downloads_counts.json", "w") as f:
        json.dump(dic, f)
38+
39+
def main():
    """Entry point: scrape and persist the download counts."""
    download_counts()


if __name__ == "__main__":
    main()

downloads_counts.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)