
Commit dfd2ddc

Modified the features used by the model to make it more relevant and accurate.
Add rolling statistics, momentum indicators, and lag features to better capture player performance trends. Improve data preprocessing to filter out incomplete data and ensure feature integrity.
1 parent 2ebc9af commit dfd2ddc
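
The feature-engineering work itself lives in the two notebooks below, whose diffs are not rendered. As a rough illustration only, here is a minimal pandas sketch of the three feature families the message names (rolling statistics, momentum indicators, and lag features). Column names ('element', 'season', 'GW', 'total_points') follow src/preprocess.py in this commit; the lag depths, window sizes, and momentum definition are assumptions, not the notebooks' actual choices.

```python
import pandas as pd

def add_form_features(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch only: per-player lag, rolling, and momentum features.

    Assumes one row per (element, season, GW), i.e. resolve_duplicates()
    has already collapsed double gameweeks into single records.
    """
    df = df.sort_values(['element', 'season', 'GW']).copy()
    points = df.groupby(['element', 'season'])['total_points']

    # Lag features: raw points from each of the previous three gameweeks.
    # shift() keeps the current gameweek's outcome out of its own features.
    for k in (1, 2, 3):
        df[f'points_lag_{k}'] = points.shift(k)

    # Rolling statistics: mean of past points over short and long windows
    # (window sizes 3 and 5 are assumptions).
    for w in (3, 5):
        df[f'points_roll_mean_{w}'] = points.transform(
            lambda s, w=w: s.shift(1).rolling(w, min_periods=1).mean()
        )

    # Momentum indicator (one possible definition): short-term form minus
    # longer-term form; positive values suggest an improving player.
    df['points_momentum'] = df['points_roll_mean_3'] - df['points_roll_mean_5']
    return df
```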

File tree

6 files changed: +1053, -248 lines


Notebooks/01_data_exploration.ipynb

Lines changed: 292 additions & 69 deletions
Large diffs are not rendered by default.

Notebooks/02_feature_engineering.ipynb

Lines changed: 707 additions & 154 deletions
Large diffs are not rendered by default.
Three binary files changed (115 KB, 119 KB, 46.1 KB); previews not rendered.

src/preprocess.py

Lines changed: 54 additions & 25 deletions
```diff
@@ -128,15 +128,15 @@ def standardize_position_labels(df: pd.DataFrame) -> pd.DataFrame:
 
 def resolve_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     """
-    Resolves duplicate player-gameweek records.
+    Resolves duplicate player-gameweek records with AGGREGATION strategy.
 
-    Duplicate resolution strategy:
-    1. If duplicates have identical data, keep first occurrence
-    2. If duplicates differ in fixture ID, they represent legitimate separate matches
-    3. If duplicates differ in statistics for same fixture, keep record with more minutes
+    Strategy Change:
+    - Double gameweeks (multiple fixtures) are LEGITIMATE in FPL
+    - Instead of keeping one fixture, AGGREGATE stats across both
+    - Mathematical justification: y_total = sum(y_fixture_i) for i in fixtures
 
-    The mathematical justification: we require unique observations in feature space
-    where each (player, gameweek) tuple maps to exactly one feature vector.
+    Example: Player scores 2 points in fixture A, 5 in fixture B
+    Result: Single record with 7 total points for that gameweek
 
     Parameters
     ----------
@@ -146,14 +146,14 @@ def resolve_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     Returns
     -------
     pd.DataFrame
-        Dataframe with duplicates resolved
+        Dataframe with aggregated gameweek statistics
     """
     df = df.copy()
 
     initial_rows = len(df)
 
     # Identify duplicates based on player-gameweek combination
-    duplicate_mask = df.duplicated(subset=['element', 'GW'], keep=False)
+    duplicate_mask = df.duplicated(subset=['element', 'GW', 'season'], keep=False)
     n_duplicates = duplicate_mask.sum()
 
     if n_duplicates == 0:
@@ -162,33 +162,62 @@ def resolve_duplicates(df: pd.DataFrame) -> pd.DataFrame:
 
     logger.info(f"Found {n_duplicates} duplicate player-gameweek records")
 
-    # Check if duplicates represent different fixtures
+    # Check if duplicates represent different fixtures (double gameweeks)
     if 'fixture' in df.columns:
         duplicates_df = df[duplicate_mask].copy()
-
-        # Group by player-gameweek and check fixture variance
-        fixture_variance = duplicates_df.groupby(['element', 'GW'])['fixture'].nunique()
+        fixture_variance = duplicates_df.groupby(['element', 'GW', 'season'])['fixture'].nunique()
         multiple_fixtures = (fixture_variance > 1).sum()
 
         if multiple_fixtures > 0:
-            logger.info(f"{multiple_fixtures} player-gameweek pairs have multiple fixtures (legitimate)")
-            # These are legitimate - players who played multiple matches in one gameweek
-            # We keep all records for these cases
+            logger.info(f"{multiple_fixtures} double gameweeks detected - aggregating stats")
+
+            # Columns to SUM (cumulative stats across fixtures)
+            sum_cols = [
+                'total_points', 'minutes', 'goals_scored', 'assists',
+                'clean_sheets', 'goals_conceded', 'own_goals', 'penalties_saved',
+                'penalties_missed', 'yellow_cards', 'red_cards', 'saves',
+                'bonus', 'bps', 'influence', 'creativity', 'threat',
+                'expected_goals', 'expected_assists', 'expected_goal_involvements',
+                'expected_goals_conceded', 'transfers_in', 'transfers_out'
+            ]
+
+            # Columns to AVERAGE (intensity metrics)
+            avg_cols = ['ict_index']
+
+            # Columns to KEEP (should be identical across fixtures)
+            keep_cols = ['element', 'name', 'position', 'team', 'GW', 'season', 'value']
+
+            # Filter to existing columns
+            sum_cols = [c for c in sum_cols if c in df.columns]
+            avg_cols = [c for c in avg_cols if c in df.columns]
+            keep_cols = [c for c in keep_cols if c in df.columns]
+
+            # Aggregate double gameweeks
+            agg_dict = {col: 'sum' for col in sum_cols}
+            agg_dict.update({col: 'mean' for col in avg_cols})
+            agg_dict.update({col: 'first' for col in keep_cols if col not in ['element', 'GW', 'season']})
+
+            df_agg = df[duplicate_mask].groupby(['element', 'GW', 'season'], as_index=False).agg(agg_dict)
+
+            # Keep non-duplicates as-is
+            df_single = df[~duplicate_mask].copy()
+
+            # Combine
+            df = pd.concat([df_single, df_agg], ignore_index=True)
 
-            # Only remove duplicates where fixture is same
-            df = df.sort_values(['element', 'GW', 'fixture', 'minutes'], ascending=[True, True, True, False])
-            df = df.drop_duplicates(subset=['element', 'GW', 'fixture'], keep='first')
+            logger.info(f"Aggregated {n_duplicates} records into {len(df_agg)} gameweek summaries")
         else:
-            # All duplicates are for same fixture - keep record with most minutes
-            df = df.sort_values(['element', 'GW', 'minutes'], ascending=[True, True, False])
-            df = df.drop_duplicates(subset=['element', 'GW'], keep='first')
+            # Same-fixture duplicates - keep record with most minutes
+            logger.info("Same-fixture duplicates detected - keeping highest minutes")
+            df = df.sort_values(['element', 'GW', 'season', 'minutes'], ascending=[True, True, True, False])
+            df = df.drop_duplicates(subset=['element', 'GW', 'season'], keep='first')
     else:
-        # No fixture column - simple deduplication keeping first
-        df = df.drop_duplicates(subset=['element', 'GW'], keep='first')
+        # No fixture column - simple deduplication
+        df = df.drop_duplicates(subset=['element', 'GW', 'season'], keep='first')
 
     final_rows = len(df)
     removed = initial_rows - final_rows
-    logger.info(f"Removed {removed} duplicate records ({removed/initial_rows*100:.2f}%)")
+    logger.info(f"Final dataset: {final_rows:,} rows ({removed} aggregated)")
 
     return df
```
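
To make the aggregation path concrete, a toy double gameweek run through the new function should reproduce the docstring's 2 + 5 = 7 example. This sketch assumes resolve_duplicates is importable from src.preprocess as laid out in this repo; the player, fixture IDs, and stats are invented:

```python
import pandas as pd
from src.preprocess import resolve_duplicates  # assumed import path

# One player, two fixtures in the same gameweek (a double gameweek)
df = pd.DataFrame({
    'element':      [101, 101],
    'GW':           [29, 29],
    'season':       ['2023-24', '2023-24'],
    'fixture':      [311, 318],            # hypothetical fixture IDs
    'name':         ['Example Player'] * 2,
    'minutes':      [90, 78],
    'total_points': [2, 5],
    'ict_index':    [4.0, 10.0],
})

out = resolve_duplicates(df)
# Expected: one row for (101, GW 29) with total_points == 7 and
# minutes == 168 (summed) and ict_index == 7.0 (averaged), matching
# the docstring's "2 in fixture A, 5 in fixture B -> 7 total" example.
```

Note that per-fixture detail is not carried through: the 'fixture' column is absent from agg_dict, so aggregated rows no longer reference individual matches, which is the point of the change: downstream features see exactly one observation per player-gameweek.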
