Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Model v2 results #5

Merged
merged 7 commits into from
Mar 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 50 additions & 49 deletions join_outputs_to_slots.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

folder = 'data'
results_folder = 'results'
prediction_year = 2018
predictions_file = '/submission-2.csv'
slots_output_file = '/less-readable-predictions-by-slots-2.csv'
slots_readable_file = '/readable-predictions-by-slots-2.csv'
# rows of human-readable output, collected during the run and written to a results csv at the end
readable_outputs = []

Expand Down Expand Up @@ -148,53 +150,52 @@ def get_slot_winner( pred_df, slot_row ):
return winner_data


if __name__ == "__main__":
# slots data
slot_dtypes = {
'StrongTeamID': str,
'WeakTeamID': str
}
slots = pd.read_csv(folder + '/NCAATourneySlots_Detailed_2018.csv', dtype=slot_dtypes)
slots.head()
# model prediction outputs
predictions = pd.read_csv(results_folder + '/submission.csv')
predictions.head()

# format pred data to join with slots
pred_formatted = format_pred_outputs( predictions )
pred_formatted.head()

# for each slot
for row in slots.itertuples(index=False):
print("Joining slots to predictions")
# slots data
slot_dtypes = {
'StrongTeamID': str,
'WeakTeamID': str
}
slots = pd.read_csv(folder + '/NCAATourneySlots_Detailed_2018.csv', dtype=slot_dtypes)
slots.head()
# model prediction outputs
predictions = pd.read_csv(results_folder + predictions_file)
predictions.head()

# format pred data to join with slots
pred_formatted = format_pred_outputs( predictions )
pred_formatted.head()

# for each slot
for row in slots.itertuples(index=False):
print("Joining slots to predictions")
readable_outputs.append(
['------------------------------------------']
)
if row.Slot == 'R7WIN':
readable_outputs.append(
['------------------------------------------']
[
'Overall 2018 champion: %s' %
(row.StrongTeamName)
]
)
if row.Slot == 'R7WIN':
readable_outputs.append(
[
'Overall 2018 champion: %s' %
(row.StrongTeamName)
]
)
break
# join slots and pred to get higher prob team
# returns [StrongTeamID, StrongTeamName]
slot_winner = get_slot_winner(pred_formatted, row)

# save higher team in appropriate slot for next round
next_slot = row.NextSlot
seed_type = row.NextSeed

# assign the updated winner data in the next slot
slots.loc[slots['Slot'] == next_slot, [seed_type + 'TeamID',seed_type + 'TeamName']] = slot_winner[0], slot_winner[1]

slots.tail()
# output updated slot data to csv in results folder
print("Write updated slots data to file")
slots.to_csv(results_folder + '/less-readable-predictions-by-slots.csv', index=False)
# create readable results csv
print("Writing %d readable bracket results." % len(slots))
with open(results_folder + '/readable-predictions-by-slots.csv', 'w') as f:
writer = csv.writer(f)
writer.writerows(readable_outputs)
break
# join slots and pred to get higher prob team
# returns [StrongTeamID, StrongTeamName]
slot_winner = get_slot_winner(pred_formatted, row)

# save higher team in appropriate slot for next round
next_slot = row.NextSlot
seed_type = row.NextSeed

# assign the updated winner data in the next slot
slots.loc[slots['Slot'] == next_slot, [seed_type + 'TeamID',seed_type + 'TeamName']] = slot_winner[0], slot_winner[1]

slots.tail()
# output updated slot data to csv in results folder
print("Write updated slots data to file")
slots.to_csv(results_folder + slots_output_file, index=False)
# create readable results csv
print("Writing %d readable bracket results." % len(slots))
with open(results_folder + slots_readable_file, 'w') as f:
writer = csv.writer(f)
writer.writerows(readable_outputs)
62 changes: 16 additions & 46 deletions new_model.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@

# coding: utf-8

# In[24]:

import math
import csv
import random
Expand All @@ -11,8 +7,15 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, linear_model


# In[4]:
base_elo = 1600
team_elos = {} # Reset each year.
team_stats = {}
X = []
y = []
submission_data = []
folder = 'data'
results_folder = 'results'
prediction_year = 2018

def calc_elo(win_team, lose_team, season):
winner_rank = get_elo(season, win_team)
Expand Down Expand Up @@ -181,7 +184,7 @@ def build_season_data(all_data):
'pf': row['WPF'],
'form':row['Wform'],
'power_5':row['WTeam_p5'],
'high_rank':row['WTeam_rank']
'high_rank':row['WTeam_rank']
}
stat_2_fields = {
'score': row['LScore'],
Expand Down Expand Up @@ -213,57 +216,41 @@ def build_season_data(all_data):
return X


# In[5]:

base_elo = 1600
team_elos = {} # Reset each year.
team_stats = {}
X = []
y = []
submission_data = []
folder = 'data'
results_folder = 'results'
prediction_year = 2018
stat_fields = ['score', 'fga', 'fgp', 'fga3', '3pp', 'ftp', 'or', 'dr',
'ast', 'to', 'stl', 'blk', 'pf', 'form', 'power_5', 'high_rank']

labels = ['Season', 't1','t1elo', 't1score', 't1fga', 't1fgp', 't1fga3', 't13pp', 't1ftp', 't1or', 't1dr',
't1ast', 't1to', 't1stl', 't1blk', 't1pf', 't1form', 't1p5', 't1rank',
't2', 't2elo', 't2score', 't2fga',
't2', 't2elo', 't2score', 't2fga',
't2fgp', 't2fga3', 't23pp', 't2ftp', 't2or', 't2dr',
't2ast', 't2to', 't2stl', 't2blk', 't2pf', 't2form', 't2p5', 't2rank', 't2_win']

initialize_data()


# In[12]:

# read data
season_data = pd.read_csv(folder + '/RegularSeasonDetailedResults.csv')
season_data.columns
season_data.shape


# In[13]:

tourney_data = pd.read_csv(folder + '/NCAATourneyDetailedResults_2003_2017.csv')
tourney_data.columns
tourney_data.shape


# In[14]:

conferences = pd.read_csv('Data/TeamConferences.csv')
conferences.drop('ConfAbbrev', axis = 1, inplace = True)


# In[15]:

massey = pd.read_csv('Data/MasseyOrdinals.csv')
preseason_rank = massey[['Season', 'RankingDayNum', 'TeamID', 'OrdinalRank']].groupby(['Season', 'TeamID'], as_index = False).agg(min)


# In[17]:

# combine data
frames = [season_data, tourney_data]
Expand All @@ -286,15 +273,9 @@ def build_season_data(all_data):
all_data['rank_diff'] = all_data['WTeam_rank'] - all_data['LTeam_rank']
print(all_data.shape)


# In[18]:

# Build the working data.
df = build_season_data(all_data)


# In[19]:

preds = pd.DataFrame(df, columns = labels)

print(preds.shape)
Expand All @@ -307,11 +288,9 @@ def build_season_data(all_data):
preds[['t2rank', 't1rank']].head()


# In[25]:

print("Fitting on %d samples." % len(X))

model = sklearn.linear_model.LogisticRegression()
model = linear_model.LogisticRegression()
# model = RandomForestClassifier(max_depth = 2)

# Check accuracy.
Expand All @@ -320,9 +299,6 @@ def build_season_data(all_data):

model.fit(X, y)


# In[26]:

# Now predict tournament matchups.
print("Getting teams.")
seeds = pd.read_csv(folder + '/NCAATourneySeeds.csv')
Expand All @@ -340,12 +316,12 @@ def build_season_data(all_data):
if team_1 < team_2:
prediction = predict_winner(
team_1, team_2, model, prediction_year, stat_fields)
label = str(prediction_year) + '_' + str(team_1) + '_' + str(team_2)
label = str(prediction_year) + '_' + str(team_1) + '_' + str(team_2)
submission_data.append([label, prediction[0][0]])

# Write the results.
print("Writing %d results." % len(submission_data))
with open(results_folder + '/submission.csv', 'w') as f:
with open(results_folder + '/submission-2.csv', 'w') as f:
writer = csv.writer(f)
writer.writerow(['id', 'pred'])
writer.writerows(submission_data)
Expand Down Expand Up @@ -375,15 +351,9 @@ def build_season_data(all_data):
(team_id_map[winning], team_id_map[losing], proba)
]
)
with open(results_folder + '/readable-predictions.csv', 'w') as f:
with open(results_folder + '/readable-predictions-2.csv', 'w') as f:
writer = csv.writer(f)
writer.writerows(readable)
with open(results_folder + '/less-readable-predictions.csv', 'w') as f:
with open(results_folder + '/less-readable-predictions-2.csv', 'w') as f:
writer = csv.writer(f)
writer.writerows(less_readable)


# In[ ]:



Loading