-
Notifications
You must be signed in to change notification settings - Fork 10
/
scrape_nba_statistics.py
executable file
·82 lines (75 loc) · 3.83 KB
/
scrape_nba_statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re
import urllib
from time import sleep
import json
import pandas as pd
from itertools import chain
# This method finds the urls for each of the rosters in the NBA using regexes.
def build_team_urls():
# Open the espn teams webpage and extract the names of each roster available.
f = urllib.request.urlopen('https://www.espn.com/nba/teams')
teams_source = f.read().decode('utf-8')
teams = dict(re.findall("www\.espn\.com/nba/team/_/name/(\w+)/(.+?)\",", teams_source))
# Using the names of the rosters, create the urls of each roster
roster_urls = []
for key in teams.keys():
# each roster webpage follows this general pattern.
roster_urls.append('https://www.espn.com/nba/team/roster/_/name/' + key + '/' + teams[key])
teams[key] = str(teams[key])
return dict(zip(teams.values(), roster_urls))
# This method gets a dictionary of player information from a given roster URL
def get_player_info(roster_url):
f = urllib.request.urlopen(roster_url)
roster_source = f.read().decode('utf-8')
sleep(0.5)
player_regex = ('\{\"name\"\:\"(\w+\s\w+)\",\"href\"\:\"https?\://www\.espn\.com/nba/player/.*?\",(.*?)\}')
player_info = re.findall(player_regex, roster_source)
player_dict = dict()
for player in player_info:
player_dict[player[0]] = json.loads("{"+player[1]+"}")
return(player_dict)
# scrape player information from rosters
rosters = build_team_urls()
all_players = dict()
for team in rosters.keys():
print("Gathering player info for team: " + team)
all_players[team] = get_player_info(rosters[team])
# loop through each team, create a pandas DataFrame, and append
all_players_df = pd.DataFrame()
for team in all_players.keys():
team_df = pd.DataFrame.from_dict(all_players[team], orient = "index")
team_df['team'] = team
all_players_df = all_players_df.append(team_df)
all_players_df.to_csv("NBA_roster_info_all_players.csv")
# scrape career statistics
print ("Now gathering career stats on all players (may take a while):")
career_stats_df = pd.DataFrame(columns = ["GP","GS","MIN","FGM", "FGA","FG%","3PTM","3PTA","3P%","FTM","FTA","FT%","OR","DR","REB","AST","BLK","STL","PF","TO","PTS"])
for player_index in all_players_df.index:
url = "https://www.espn.com/nba/player/stats/_/id/" + str(all_players_df.loc[player_index]['id'])
f = urllib.request.urlopen(url)
sleep(0.3)
player_source = f.read().decode('utf-8')
# extract career stats using this regex
stats_regex = ('\[\"Career\",\"\",(.*?)\]\},\{\"ttl\"\:\"Regular Season Totals\"')
career_info = re.findall(stats_regex, player_source)
try:
# convert the stats to a list of floats, and add the entry to the DataFrame
career_info = career_info[0].replace("\"", "").split(",")
career_info = list(chain.from_iterable([i.split("-") for i in career_info]))
career_info = list(map(float,career_info))
career_stats_df = career_stats_df.append(pd.Series(career_info, index = career_stats_df.columns, name=player_index))
except:
# if no career stats were returned, the player was a rookie with no games played
print(player_index + " has no info, ", end = "")
career_stats_df.to_csv("NBA_player_career_stats_all_players.csv")
# join and clean the data
all_stats_df = all_players_df.join(career_stats_df)
def convert_height(height):
split_height = height.split(" ")
feet = float(split_height[0].replace("\'",""))
inches = float(split_height[1].replace("\"",""))
return (feet*12 + inches)
all_stats_df['height'] = [convert_height(x) for x in all_stats_df['height']]
all_stats_df['weight'] = [float(x.split(" ")[0]) for x in all_stats_df['weight']]
all_stats_df['salary'] = [int(re.sub(r'[^\d.]+', '', s)) if isinstance(s, str) else s for s in all_stats_df['salary'].values]
all_stats_df.to_csv("NBA_player_info_and_stats_joined_clean.csv")