@@ -128,15 +128,15 @@ def standardize_position_labels(df: pd.DataFrame) -> pd.DataFrame:
128128
129129def resolve_duplicates (df : pd .DataFrame ) -> pd .DataFrame :
130130 """
131- Resolves duplicate player-gameweek records.
131+ Resolves duplicate player-gameweek records with an AGGREGATION strategy.
132132
133- Duplicate resolution strategy :
134- 1. If duplicates have identical data, keep first occurrence
135- 2. If duplicates differ in fixture ID, they represent legitimate separate matches
136- 3. If duplicates differ in statistics for same fixture, keep record with more minutes
133+ Strategy change:
134+ - Double gameweeks (multiple fixtures) are LEGITIMATE in FPL
135+ - Instead of keeping one fixture, AGGREGATE stats across both
136+ - Mathematical justification: y_total = sum(y_fixture_i) for i in fixtures
137137
138- The mathematical justification: we require unique observations in feature space
139- where each (player, gameweek) tuple maps to exactly one feature vector.
138+ Example: a player scores 2 points in fixture A and 5 in fixture B.
139+ Result: a single record with 7 total points for that gameweek.
140140
141141 Parameters
142142 ----------
@@ -146,14 +146,14 @@ def resolve_duplicates(df: pd.DataFrame) -> pd.DataFrame:
146146 Returns
147147 -------
148148 pd.DataFrame
149- Dataframe with duplicates resolved
149+ Dataframe with aggregated gameweek statistics
150150 """
151151 df = df .copy ()
152152
153153 initial_rows = len (df )
154154
155155 # Identify duplicates based on player-gameweek combination
156- duplicate_mask = df .duplicated (subset = ['element' , 'GW' ], keep = False )
156+ duplicate_mask = df .duplicated (subset = ['element' , 'GW' , 'season' ], keep = False )
157157 n_duplicates = duplicate_mask .sum ()
158158
159159 if n_duplicates == 0 :
@@ -162,33 +162,62 @@ def resolve_duplicates(df: pd.DataFrame) -> pd.DataFrame:
162162
163163 logger .info (f"Found { n_duplicates } duplicate player-gameweek records" )
164164
165- # Check if duplicates represent different fixtures
165+ # Check if duplicates represent different fixtures (double gameweeks)
166166 if 'fixture' in df .columns :
167167 duplicates_df = df [duplicate_mask ].copy ()
168-
169- # Group by player-gameweek and check fixture variance
170- fixture_variance = duplicates_df .groupby (['element' , 'GW' ])['fixture' ].nunique ()
168+ fixture_variance = duplicates_df .groupby (['element' , 'GW' , 'season' ])['fixture' ].nunique ()
171169 multiple_fixtures = (fixture_variance > 1 ).sum ()
172170
173171 if multiple_fixtures > 0 :
174- logger .info (f"{ multiple_fixtures } player-gameweek pairs have multiple fixtures (legitimate)" )
175- # These are legitimate - players who played multiple matches in one gameweek
176- # We keep all records for these cases
172+ logger .info (f"{ multiple_fixtures } double gameweeks detected - aggregating stats" )
173+
174+ # Columns to SUM (cumulative stats across fixtures)
175+ sum_cols = [
176+ 'total_points' , 'minutes' , 'goals_scored' , 'assists' ,
177+ 'clean_sheets' , 'goals_conceded' , 'own_goals' , 'penalties_saved' ,
178+ 'penalties_missed' , 'yellow_cards' , 'red_cards' , 'saves' ,
179+ 'bonus' , 'bps' , 'influence' , 'creativity' , 'threat' ,
180+ 'expected_goals' , 'expected_assists' , 'expected_goal_involvements' ,
181+ 'expected_goals_conceded' , 'transfers_in' , 'transfers_out'
182+ ]
183+
184+ # Columns to AVERAGE (intensity metrics).
185+ # NOTE(review): ict_index is averaged here while its components
186+ # (influence, creativity, threat) are summed above — confirm this
187+ # inconsistency is intended.
185+ avg_cols = ['ict_index' ]
186+
187+ # Columns to KEEP (should be identical across fixtures)
188+ keep_cols = ['element' , 'name' , 'position' , 'team' , 'GW' , 'season' , 'value' ]
189+
190+ # Filter to existing columns
191+ sum_cols = [c for c in sum_cols if c in df .columns ]
192+ avg_cols = [c for c in avg_cols if c in df .columns ]
193+ keep_cols = [c for c in keep_cols if c in df .columns ]
194+
195+ # Aggregate double gameweeks.
196+ # NOTE(review): this groups ALL duplicate rows, including same-fixture
197+ # exact duplicates, whose stats 'sum' would double-count — confirm
198+ # upstream dedup, or filter to multi-fixture groups before summing.
196+ agg_dict = {col : 'sum' for col in sum_cols }
197+ agg_dict .update ({col : 'mean' for col in avg_cols })
198+ agg_dict .update ({col : 'first' for col in keep_cols if col not in ['element' , 'GW' , 'season' ]})
199+
200+ df_agg = df [duplicate_mask ].groupby (['element' , 'GW' , 'season' ], as_index = False ).agg (agg_dict )
201+
202+ # Keep non-duplicates as-is
203+ df_single = df [~ duplicate_mask ].copy ()
204+
205+ # Combine.
206+ # NOTE(review): df_agg carries only the sum/avg/keep columns, so any
207+ # other column in df becomes NaN for aggregated rows after concat —
208+ # verify downstream code tolerates this.
206+ df = pd .concat ([df_single , df_agg ], ignore_index = True )
177207
178- # Only remove duplicates where fixture is same
179- df = df .sort_values (['element' , 'GW' , 'fixture' , 'minutes' ], ascending = [True , True , True , False ])
180- df = df .drop_duplicates (subset = ['element' , 'GW' , 'fixture' ], keep = 'first' )
208+ logger .info (f"Aggregated { n_duplicates } records into { len (df_agg )} gameweek summaries" )
181209 else :
182- # All duplicates are for same fixture - keep record with most minutes
183- df = df .sort_values (['element' , 'GW' , 'minutes' ], ascending = [True , True , False ])
184- df = df .drop_duplicates (subset = ['element' , 'GW' ], keep = 'first' )
210+ # Same-fixture duplicates - keep record with most minutes
211+ logger .info ("Same-fixture duplicates detected - keeping highest minutes" )
212+ df = df .sort_values (['element' , 'GW' , 'season' , 'minutes' ], ascending = [True , True , True , False ])
213+ df = df .drop_duplicates (subset = ['element' , 'GW' , 'season' ], keep = 'first' )
185214 else :
186- # No fixture column - simple deduplication keeping first
187- df = df .drop_duplicates (subset = ['element' , 'GW' ], keep = 'first' )
215+ # No fixture column - simple deduplication
216+ df = df .drop_duplicates (subset = ['element' , 'GW' , 'season' ], keep = 'first' )
188217
189218 final_rows = len (df )
190219 removed = initial_rows - final_rows
191- logger .info (f"Removed { removed } duplicate records ({ removed / initial_rows * 100 :.2f } % )" )
220+ logger .info (f"Final dataset: { final_rows :, } rows ({ removed } aggregated )" )
192221
193222 return df
194223
0 commit comments