
Commit

finish analysis and publish
erilu committed Mar 31, 2020
1 parent 45159bf commit e61fa54
Showing 9 changed files with 18,997 additions and 269 deletions.
34 changes: 17 additions & 17 deletions NBA_player_info_and_stats_joined_mar2020.csv


24 changes: 12 additions & 12 deletions NBA_roster_info_all_players_mar2020.csv


2,856 changes: 2,853 additions & 3 deletions README.md


15,814 changes: 15,814 additions & 0 deletions docs/scrape-nba-statistics-notebook.html


70 changes: 0 additions & 70 deletions scrape_nba_salaries.py

This file was deleted.

82 changes: 82 additions & 0 deletions scrape_nba_statistics.py
@@ -0,0 +1,82 @@
import re
import urllib.request
from time import sleep
import json
import pandas as pd
from itertools import chain
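
# Overview (added for clarity): this script scrapes ESPN's NBA team, roster, and player stats
# pages and writes three CSVs: roster info for all players, career stats for all players, and
# a joined, cleaned table combining both.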

# This method finds the urls for each of the rosters in the NBA using regexes.
def build_team_urls():
    # Open the espn teams webpage and extract the names of each roster available.
    f = urllib.request.urlopen('http://www.espn.com/nba/teams')
    teams_source = f.read().decode('utf-8')
    teams = dict(re.findall(r"www\.espn\.com/nba/team/_/name/(\w+)/(.+?)\",", teams_source))
    # Using the names of the rosters, create the urls of each roster
    roster_urls = []
    for key in teams.keys():
        # each roster webpage follows this general pattern.
        roster_urls.append('http://www.espn.com/nba/team/roster/_/name/' + key + '/' + teams[key])
        teams[key] = str(teams[key])
    return dict(zip(teams.values(), roster_urls))
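
# Note (added for clarity): build_team_urls() returns a dict keyed by the roster slug captured
# from the ESPN teams page, mapped to that team's roster URL, roughly of the form
#   {'boston-celtics': 'http://www.espn.com/nba/team/roster/_/name/bos/boston-celtics', ...}
# (illustrative entry; the actual keys and abbreviations are whatever the regex finds on the page).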

# This method gets a dictionary of player information from a given roster URL.
def get_player_info(roster_url):
    f = urllib.request.urlopen(roster_url)
    roster_source = f.read().decode('utf-8')
    sleep(0.5)
    player_regex = r'\{\"name\"\:\"(\w+\s\w+)\",\"href\"\:\"http\://www\.espn\.com/nba/player/.*?\",(.*?)\}'
    player_info = re.findall(player_regex, roster_source)
    player_dict = dict()
    for player in player_info:
        player_dict[player[0]] = json.loads("{" + player[1] + "}")
    return player_dict
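
# Note (added for clarity): get_player_info() returns a dict keyed by player name, where each
# value is the attribute dict parsed from the JSON embedded in the roster page (fields such as
# 'id', 'height', 'weight', and 'salary' are relied on further down).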

# scrape player information from rosters
rosters = build_team_urls()
all_players = dict()
for team in rosters.keys():
    print("Gathering player info for team: " + team)
    all_players[team] = get_player_info(rosters[team])

# loop through each team, create a pandas DataFrame, and append
all_players_df = pd.DataFrame()
for team in all_players.keys():
    team_df = pd.DataFrame.from_dict(all_players[team], orient="index")
    team_df['team'] = team
    all_players_df = all_players_df.append(team_df)
all_players_df.to_csv("NBA_roster_info_all_players.csv")
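
# Note (added for clarity): DataFrame.append() was standard when this was written, but it was
# deprecated in pandas 1.4 and removed in 2.0; on newer pandas, use
#   all_players_df = pd.concat([all_players_df, team_df])
# (and an analogous pd.concat call where career_stats_df is built below).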

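# Note (added for clarity): the stats_regex below pulls the "Career" row out of the
# "Regular Season Totals" data that ESPN embeds as JSON in each player's stats page.
# Shooting stats in that row arrive as made-attempted pairs (e.g. an FGM-FGA value like
# "193-439"), which is why each field is split on "-" before converting everything to floats.
# The exact page markup may have changed since this commit.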
# scrape career statistics
print("Now gathering career stats on all players (may take a while):")
career_stats_df = pd.DataFrame(columns=["GP", "GS", "MIN", "FGM", "FGA", "FG%", "3PTM", "3PTA", "3P%", "FTM", "FTA", "FT%", "OR", "DR", "REB", "AST", "BLK", "STL", "PF", "TO", "PTS"])
for player_index in all_players_df.index:
    url = "https://www.espn.com/nba/player/stats/_/id/" + str(all_players_df.loc[player_index]['id'])
    f = urllib.request.urlopen(url)
    sleep(0.3)
    player_source = f.read().decode('utf-8')
    # extract career stats using this regex
    stats_regex = r'\[\"Career\",\"\",(.*?)\]\},\{\"ttl\"\:\"Regular Season Totals\"'
    career_info = re.findall(stats_regex, player_source)
    try:
        # convert the stats to a list of floats, and add the entry to the DataFrame
        career_info = career_info[0].replace("\"", "").split(",")
        career_info = list(chain.from_iterable([i.split("-") for i in career_info]))
        career_info = list(map(float, career_info))
        career_stats_df = career_stats_df.append(pd.Series(career_info, index=career_stats_df.columns, name=player_index))
    except (IndexError, ValueError):
        # if no career stats were returned, the player was a rookie with no games played
        print(player_index + " has no info, ", end="")
career_stats_df.to_csv("NBA_player_career_stats_all_players.csv")

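# Note (added for clarity): the cleanup below assumes roster fields formatted roughly as
# height 6' 7", weight "215 lbs", and salary "$1,234,567" (formats inferred from the parsing
# code, not re-verified against ESPN): height becomes total inches, weight keeps the leading
# number, and salary is stripped down to its digits.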
# join and clean the data
all_stats_df = all_players_df.join(career_stats_df)

def convert_height(height):
    split_height = height.split(" ")
    feet = float(split_height[0].replace("\'", ""))
    inches = float(split_height[1].replace("\"", ""))
    return feet * 12 + inches

all_stats_df['height'] = [convert_height(x) for x in all_stats_df['height']]
all_stats_df['weight'] = [float(x.split(" ")[0]) for x in all_stats_df['weight']]
all_stats_df['salary'] = [int(re.sub(r'[^\d.]+', '', s)) if isinstance(s, str) else s for s in all_stats_df['salary'].values]

all_stats_df.to_csv("NBA_player_info_and_stats_joined_clean.csv")
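
# Example (added for clarity, commented out so it is not part of the scraping run): loading the
# joined output for downstream analysis; the filename and the "PTS" column match what is
# written and defined above.
#   df = pd.read_csv("NBA_player_info_and_stats_joined_clean.csv", index_col=0)
#   df.sort_values("PTS", ascending=False).head(10)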
