Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 9 changed files with 18,997 additions and 269 deletions.
scrape-nba-salaries-notebook.ipynb → scrape-nba-statistics-notebook.ipynb: 386 changes (219 additions, 167 deletions). Large diffs are not rendered by default.
Binary file added (+26.6 KB): scrape-nba-statistics-notebook_files/scrape-nba-statistics-notebook_101_0.png
Binary file added (+42 KB): scrape-nba-statistics-notebook_files/scrape-nba-statistics-notebook_95_0.png
This file was deleted.
@@ -0,0 +1,82 @@
import re
import urllib.request
from time import sleep
import json
import pandas as pd
from itertools import chain

# This method finds the urls for each of the rosters in the NBA using regexes.
def build_team_urls():
    # Open the espn teams webpage and extract the names of each roster available.
    f = urllib.request.urlopen('http://www.espn.com/nba/teams')
    teams_source = f.read().decode('utf-8')
    teams = dict(re.findall(r"www\.espn\.com/nba/team/_/name/(\w+)/(.+?)\",", teams_source))
    # Using the names of the rosters, create the urls of each roster.
    roster_urls = []
    for key in teams.keys():
        # each roster webpage follows this general pattern.
        roster_urls.append('http://www.espn.com/nba/team/roster/_/name/' + key + '/' + teams[key])
        teams[key] = str(teams[key])
    return dict(zip(teams.values(), roster_urls))
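
# Illustrative example of the expected return value (the slugs and abbreviations
# shown are assumptions, not taken from the source):
#   {'boston-celtics': 'http://www.espn.com/nba/team/roster/_/name/bos/boston-celtics',
#    'utah-jazz': 'http://www.espn.com/nba/team/roster/_/name/utah/utah-jazz', ...}
# i.e. a mapping of roster slug -> roster URL, assuming espn.com still serves team
# links in the pattern matched by the regex above.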

# This method gets a dictionary of player information from a given roster URL.
def get_player_info(roster_url):
    f = urllib.request.urlopen(roster_url)
    roster_source = f.read().decode('utf-8')
    sleep(0.5)
    player_regex = r'\{\"name\"\:\"(\w+\s\w+)\",\"href\"\:\"http\://www\.espn\.com/nba/player/.*?\",(.*?)\}'
    player_info = re.findall(player_regex, roster_source)
    player_dict = dict()
    for player in player_info:
        player_dict[player[0]] = json.loads("{" + player[1] + "}")
    return player_dict
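
# Illustrative example of the expected shape (names and values are placeholders):
# each entry maps a player name to the JSON attributes embedded in the roster page,
# and the code below relies on at least the 'id', 'height', 'weight', and 'salary'
# keys being present:
#   {'Player Name': {'id': '1234567', 'height': '6\' 8"', 'weight': '210 lbs',
#                    'salary': '$10,000,000', ...}, ...}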

# scrape player information from rosters
rosters = build_team_urls()
all_players = dict()
for team in rosters.keys():
    print("Gathering player info for team: " + team)
    all_players[team] = get_player_info(rosters[team])

# loop through each team, create a pandas DataFrame, and append
all_players_df = pd.DataFrame()
for team in all_players.keys():
    team_df = pd.DataFrame.from_dict(all_players[team], orient="index")
    team_df['team'] = team
    all_players_df = all_players_df.append(team_df)
all_players_df.to_csv("NBA_roster_info_all_players.csv")
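
# Note: DataFrame.append was removed in pandas 2.0. On newer pandas the same frame
# can be built with pd.concat; an equivalent sketch (assuming the all_players dict
# built above):
#   all_players_df = pd.concat(
#       [pd.DataFrame.from_dict(players, orient="index").assign(team=team)
#        for team, players in all_players.items()]
#   )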

# scrape career statistics
print("Now gathering career stats on all players (may take a while):")
career_stats_df = pd.DataFrame(columns=["GP", "GS", "MIN", "FGM", "FGA", "FG%", "3PTM", "3PTA", "3P%", "FTM",
                                        "FTA", "FT%", "OR", "DR", "REB", "AST", "BLK", "STL", "PF", "TO", "PTS"])
for player_index in all_players_df.index:
    url = "https://www.espn.com/nba/player/stats/_/id/" + str(all_players_df.loc[player_index]['id'])
    f = urllib.request.urlopen(url)
    sleep(0.3)
    player_source = f.read().decode('utf-8')
    # extract career stats using this regex
    stats_regex = r'\[\"Career\",\"\",(.*?)\]\},\{\"ttl\"\:\"Regular Season Totals\"'
    career_info = re.findall(stats_regex, player_source)
    try:
        # convert the stats to a list of floats, and add the entry to the DataFrame
        career_info = career_info[0].replace("\"", "").split(",")
        career_info = list(chain.from_iterable([i.split("-") for i in career_info]))
        career_info = list(map(float, career_info))
        career_stats_df = career_stats_df.append(pd.Series(career_info, index=career_stats_df.columns, name=player_index))
    except:
        # if no career stats were returned, the player was a rookie with no games played
        print(player_index + " has no info, ", end="")
career_stats_df.to_csv("NBA_player_career_stats_all_players.csv")
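
# Illustrative note (the exact format is assumed from the parsing above): the
# captured career row is a comma-separated string of quoted values in which the
# shooting totals arrive as hyphenated made-attempted pairs (e.g. "5.0-11.0" for
# FGM-FGA), so stripping the quotes, splitting on "," and then on "-" yields the
# 21 floats expected by career_stats_df's columns.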

# join and clean the data
all_stats_df = all_players_df.join(career_stats_df)

# convert heights such as 6' 8" to a total number of inches
def convert_height(height):
    split_height = height.split(" ")
    feet = float(split_height[0].replace("\'", ""))
    inches = float(split_height[1].replace("\"", ""))
    return feet * 12 + inches

all_stats_df['height'] = [convert_height(x) for x in all_stats_df['height']]
all_stats_df['weight'] = [float(x.split(" ")[0]) for x in all_stats_df['weight']]
all_stats_df['salary'] = [int(re.sub(r'[^\d.]+', '', s)) if isinstance(s, str) else s for s in all_stats_df['salary'].values]

all_stats_df.to_csv("NBA_player_info_and_stats_joined_clean.csv")
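
# Optional sanity check (illustrative only, not part of the scrape itself): the
# joined CSV written above can be read back and spot-checked, e.g.
#   check_df = pd.read_csv("NBA_player_info_and_stats_joined_clean.csv", index_col=0)
#   print(check_df[['team', 'height', 'weight', 'salary', 'PTS']].head())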