Skip to content

Commit

Permalink
update: add more analysis
Browse files Browse the repository at this point in the history
  • Loading branch information
terryyz committed May 16, 2024
1 parent cf7f2e4 commit 38ad209
Show file tree
Hide file tree
Showing 9 changed files with 1,652 additions and 40 deletions.
50 changes: 50 additions & 0 deletions analysis/3rd_party_libs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"Crypto": "pycryptodome",
"PIL": "pillow",
"blake3": "blake3",
"bs4": "beautifulsoup4",
"chardet": "chardet",
"cryptography": "cryptography",
"dateutil": "python-dateutil",
"django": "django",
"docx": "python-docx",
"faker": "faker",
"flask": "flask",
"flask_login": "flask-login",
"flask_mail": "Flask-Mail",
"flask_restful": "flask-restful",
"folium": "folium",
"geopy": "geopy",
"keras": "keras",
"librosa": "librosa",
"lxml": "lxml",
"matplotlib": "matplotlib",
"mechanize": "mechanize",
"nltk": "nltk",
"numpy": "numpy",
"openpyxl": "openpyxl",
"pandas": "pandas",
"prettytable": "prettytable",
"psutil": "psutil",
"pytesseract": "pytesseract",
"pytz": "pytz",
"requests": "requests",
"rsa": "rsa",
"scipy": "scipy",
"seaborn": "seaborn",
"sendgrid": "sendgrid",
"sklearn": "scikit-learn",
"soundfile": "soundfile",
"statsmodels": "statsmodels",
"tensorflow": "tensorflow",
"texttable": "texttable",
"werkzeug": "werkzeug",
"wordninja": "wordninja",
"wtforms": "WTForms",
"xlwt": "xlwt",
"xmltodict": "xmltodict",
"yaml": "PyYAML",
"flask_wtf": "Flask-WTF",
"gensim": "gensim",
"python_http_client": "python-http-client"
}
164 changes: 164 additions & 0 deletions analysis/HumanEval.jsonl

Large diffs are not rendered by default.

279 changes: 240 additions & 39 deletions analysis/benchmark_analysis.ipynb

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions analysis/download_stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"requests": 407179468,
"python-dateutil": 317542652,
"PyYAML": 266619413,
"numpy": 238728858,
"cryptography": 228003149,
"pandas": 190247110,
"rsa": 176030185,
"pytz": 155795840,
"flask": 100804266,
"scipy": 98509718,
"psutil": 93922127,
"werkzeug": 93236214,
"beautifulsoup4": 91662537,
"pillow": 86446419,
"lxml": 79545989,
"openpyxl": 76199497,
"scikit-learn": 61481156,
"chardet": 59122948,
"matplotlib": 58474767,
"xmltodict": 40772510,
"pycryptodome": 26702817,
"tensorflow": 21654531,
"nltk": 18161557,
"keras": 16838960,
"seaborn": 16540691,
"statsmodels": 16508856,
"django": 13188158,
"prettytable": 12583066,
"faker": 12116411,
"texttable": 7645216,
"python-http-client": 6660018,
"sendgrid": 6094785,
"Flask-WTF": 5828864,
"flask-login": 5493873,
"WTForms": 5088533,
"gensim": 4886183,
"geopy": 4711066,
"python-docx": 3982122,
"xlwt": 3704863,
"soundfile": 2472954,
"librosa": 2445908,
"pytesseract": 2004509,
"flask-restful": 1563911,
"folium": 998664,
"Flask-Mail": 372163,
"mechanize": 281683,
"blake3": 209166,
"wordninja": 183705
}
Binary file added analysis/download_stats.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,000 changes: 1,000 additions & 0 deletions analysis/ds1000.jsonl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion analysis/lib2domain.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"Crypto": "Cryptography", "PIL": "Visualization", "array": "General", "base64": "Cryptography", "binascii": "Cryptography", "bisect": "General", "blake3": "Cryptography", "bs4": "Network", "calendar": "Time", "cgi": "Network", "chardet": "Network", "cmath": "Computation", "codecs": "Cryptography", "collections": "General", "cryptography": "Cryptography", "csv": "System", "ctypes": "System", "datetime": "Time", "dateutil": "Time", "difflib": "General", "django": "Network", "docx": "System", "email": "Network", "faker": "General", "flask": "Network", "flask_login": "Network", "flask_mail": "Network", "flask_restful": "Network", "fnmatch": "General", "folium": "Visualization", "functools": "General", "geopy": "General", "getpass": "System", "glob": "System", "gzip": "System", "hashlib": "Cryptography", "heapq": "General", "hmac": "Cryptography", "html": "Network", "http": "Network", "importlib": "General", "inspect": "General", "io": "System", "ipaddress": "Network", "itertools": "General", "json": "System", "keras": "Computation", "librosa": "Computation", "logging": "System", "lxml": "Network", "math": "Computation", "matplotlib": "Visualization", "mechanize": "Network", "mimetypes": "Network", "multiprocessing": "System", "nltk": "Computation", "numpy": "Computation", "openpyxl": "System", "operator": "General", "os": "System", "pandas": "Computation", "pathlib": "System", "pickle": "System", "pkgutil": "General", "platform": "System", "prettytable": "General", "psutil": "System", "pytesseract": "Computation", "pytz": "Time", "queue": "General", "random": "General", "re": "General", "requests": "Network", "rsa": "Cryptography", "scipy": "Computation", "seaborn": "Visualization", "secrets": "Cryptography", "select": "System", "sendgrid": "Network", "shutil": "System", "sklearn": "Computation", "smtplib": "Network", "socket": "Network", "soundfile": "Computation", "sqlite3": "System", "ssl": "Network", "statistics": "Computation", "statsmodels": "Computation", 
"string": "General", "struct": "System", "subprocess": "System", "sys": "System", "tarfile": "System", "tensorflow": "Computation", "texttable": "General", "textwrap": "General", "threading": "System", "time": "Time", "turtle": "Visualization", "types": "General", "unicodedata": "General", "urllib": "Network", "uuid": "General", "warnings": "General", "werkzeug": "Network", "wordninja": "Computation", "wtforms": "Network", "xlwt": "System", "xml": "Network", "xmltodict": "Network", "yaml": "System", "zipfile": "System"}
{"Crypto": "Cryptography", "PIL": "Visualization", "array": "General", "base64": "Cryptography", "binascii": "Cryptography", "bisect": "General", "blake3": "Cryptography", "bs4": "Network", "calendar": "Time", "cgi": "Network", "chardet": "Network", "cmath": "Computation", "codecs": "Cryptography", "collections": "General", "cryptography": "Cryptography", "csv": "System", "ctypes": "System", "datetime": "Time", "dateutil": "Time", "difflib": "General", "django": "Network", "docx": "System", "email": "Network", "faker": "General", "flask": "Network", "flask_login": "Network", "flask_mail": "Network", "flask_restful": "Network", "fnmatch": "General", "folium": "Visualization", "functools": "General", "geopy": "General", "getpass": "System", "glob": "System", "gzip": "System", "hashlib": "Cryptography", "heapq": "General", "hmac": "Cryptography", "html": "Network", "http": "Network", "importlib": "General", "inspect": "General", "io": "System", "ipaddress": "Network", "itertools": "General", "json": "System", "keras": "Computation", "librosa": "Computation", "logging": "System", "lxml": "Network", "math": "Computation", "matplotlib": "Visualization", "mechanize": "Network", "mimetypes": "Network", "multiprocessing": "System", "nltk": "Computation", "numpy": "Computation", "openpyxl": "System", "operator": "General", "os": "System", "pandas": "Computation", "pathlib": "System", "pickle": "System", "pkgutil": "General", "platform": "System", "prettytable": "General", "psutil": "System", "pytesseract": "Computation", "pytz": "Time", "queue": "General", "random": "General", "re": "General", "requests": "Network", "rsa": "Cryptography", "scipy": "Computation", "seaborn": "Visualization", "secrets": "Cryptography", "select": "System", "sendgrid": "Network", "shutil": "System", "sklearn": "Computation", "smtplib": "Network", "socket": "Network", "soundfile": "Computation", "sqlite3": "System", "ssl": "Network", "statistics": "Computation", "statsmodels": "Computation", 
"string": "General", "struct": "System", "subprocess": "System", "sys": "System", "tarfile": "System", "tensorflow": "Computation", "texttable": "General", "textwrap": "General", "threading": "System", "time": "Time", "turtle": "Visualization", "types": "General", "unicodedata": "General", "urllib": "Network", "uuid": "General", "warnings": "General", "werkzeug": "Network", "wordninja": "Computation", "wtforms": "Network", "xlwt": "System", "xml": "Network", "xmltodict": "Network", "yaml": "System", "zipfile": "System", "Levenshtein": "Computation", "ast": "General", "configparser": "System", "cv2": "Computation", "decimal": "General", "enum": "General", "errno": "System", "flask_wtf": "Network", "ftplib": "Network", "gensim": "Computation", "geopandas": "Computation", "holidays": "Time", "mpl_toolkits": "Visualization", "natsort": "General", "pyquery": "Network", "python_http_client": "Network", "regex": "General", "shapely": "Computation", "shlex": "System", "signal": "System", "skimage": "Computation", "sympy": "Computation", "textblob": "Computation", "typing": "General", "wikipedia": "Network", "wordcloud": "Visualization", "zlib": "System"}
73 changes: 73 additions & 0 deletions analysis/pypi_download_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import sys
import json
import subprocess
from tqdm import tqdm
from numpy import mean, median
from matplotlib import pyplot as plt
def get_pypi_stats(package_name):
    """
    Function to get PyPI download stats for a given package using pypinfo.

    Runs ``pypinfo --json <package_name>`` and reads the download count from
    the first row of the JSON it prints.

    Args:
        package_name: Name of the project to query. It is handed to pypinfo
            as one single argument and is never interpreted by a shell.

    Returns:
        The ``download_count`` value of the first entry in the ``rows`` list
        of pypinfo's JSON output, or ``None`` when pypinfo cannot be started,
        exits with a non-zero status, or prints output that does not contain
        that field.
    """
    # An argument list (no shell=True) keeps characters such as ';', '|' or
    # '$(...)' in the package name from being executed as shell syntax.
    command = ["pypinfo", "--json", str(package_name)]

    try:
        # Running the command and capturing the output
        result = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e.stderr}", file=sys.stderr)
        return None
    except OSError as e:
        # Without a shell, a missing or non-executable pypinfo surfaces here
        # (e.g. FileNotFoundError) instead of as a non-zero exit status.
        print(f"An error occurred: {e}", file=sys.stderr)
        return None

    # Print the output
    print(result.stdout)

    try:
        parsed_result = json.loads(result.stdout)
        return parsed_result["rows"][0]["download_count"]
    except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
        # Empty "rows", missing keys or non-JSON output: report it the same
        # way as a failed command rather than crashing the caller's loop.
        print(f"An error occurred: {e!r}", file=sys.stderr)
        return None

if __name__ == "__main__":
    # Inputs: the library -> domain mapping and the standard-library list.
    # NOTE(review): only membership tests are done on standard_lib, so its
    # exact JSON shape (list vs. mapping) is not visible from this script.
    with open("analysis/lib2domain.json") as mapping_file:
        lib2domain = json.load(mapping_file)

    with open("analysis/standard_lib.json") as stdlib_file:
        standard_lib = json.load(stdlib_file)

    # Write out every library from the mapping that is also in standard_lib,
    # keeping the mapping's key order.
    with open("analysis/used_std_libs.json", "w") as out_file:
        libs = [lib for lib in lib2domain if lib in standard_lib]
        json.dump(libs, out_file, indent=4)

    # The disabled steps below are kept for reference: they write the
    # complementary list of non-standard libraries and query pypinfo for each
    # entry of 3rd_party_libs.json to build download_stats.json.
    # with open("analysis/3rd_party_libs.json","w") as f:
    #     libs = []
    #     for lib in lib2domain.keys():
    #         if lib not in standard_lib:
    #             libs.append(lib)
    #     json.dump(libs,f,indent=4)

    # with open("analysis/3rd_party_libs.json") as f:
    #     libs = json.load(f)

    # download_stats = {}
    # for lib in tqdm(list(libs.values())[:]):
    #     print(f"Getting download stats for {lib}")
    #     download_stats[lib] = get_pypi_stats(lib)
    # sorted_download_stats = dict(sorted(download_stats.items(), key=lambda x: x[1], reverse=True))

    # with open("analysis/download_stats.json", "w") as f:
    #     json.dump(sorted_download_stats, f, indent=4)

    # Load the previously saved per-package download counts.
    with open("analysis/download_stats.json") as stats_file:
        download_stats = json.load(stats_file)

    counts = list(download_stats.values())

    # Summary statistics over all packages.
    print(f"Mean download stats: {mean(counts)}")
    print(f"Median download stats: {median(counts)}")

    # Histogram of the download counts, saved next to the JSON data.
    plt.hist(counts, bins=50, color="blue", edgecolor="black")
    plt.xlabel("Download Stats")
    plt.ylabel("Frequency")
    plt.title("Distribution of Download Stats")
    plt.savefig("analysis/download_stats.png")



74 changes: 74 additions & 0 deletions analysis/used_std_libs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
[
"array",
"base64",
"binascii",
"bisect",
"calendar",
"cgi",
"cmath",
"codecs",
"collections",
"csv",
"ctypes",
"datetime",
"difflib",
"email",
"fnmatch",
"functools",
"getpass",
"glob",
"gzip",
"hashlib",
"heapq",
"hmac",
"html",
"http",
"importlib",
"inspect",
"io",
"ipaddress",
"itertools",
"json",
"logging",
"math",
"mimetypes",
"multiprocessing",
"operator",
"os",
"pathlib",
"pickle",
"pkgutil",
"platform",
"queue",
"random",
"re",
"secrets",
"select",
"shutil",
"smtplib",
"socket",
"sqlite3",
"ssl",
"statistics",
"string",
"struct",
"subprocess",
"sys",
"tarfile",
"textwrap",
"threading",
"time",
"turtle",
"types",
"unicodedata",
"urllib",
"uuid",
"warnings",
"xml",
"zipfile",
"decimal",
"enum",
"typing",
"unittest",
"zlib"
]

0 comments on commit 38ad209

Please sign in to comment.