Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,16 @@ def signal_handler(sig, frame):
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

# Command-line flags only exist when this file is the entry point. When the
# module is imported instead (e.g. by gunicorn), fall back to a stand-in
# object that exposes the same flags with their defaults.
if __name__ != "__main__":
    class DefaultArgs:
        # Flag values used when no command line was parsed.
        no_scrape = False
        no_daily_sun = False

    args = DefaultArgs()
else:
    args = parse_args()

# Only run scraping tasks if not disabled
if not args.no_scrape:
from flask_apscheduler import APScheduler
Expand Down
25 changes: 25 additions & 0 deletions src/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,31 @@ def setup_database_indexes():

# Index for sorting operations
game_collection.create_index([("date", -1)], background=True)

# Index to have unique games so we won't add duplicates
game_collection.create_index(
[
("sport", 1),
("gender", 1),
("date", 1),
("opponent_id", 1),
("state", 1),
],
unique=True,
background=True
)

# Additional index for tournament games (without opponent_id)
game_collection.create_index(
[
("sport", 1),
("gender", 1),
("date", 1),
("city", 1),
("state", 1),
],
background=True
)

print("✅ MongoDB indexes created successfully")
except Exception as e:
Expand Down
78 changes: 78 additions & 0 deletions src/repositories/game_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,56 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state):

return [Game.from_dict(game) for game in games]

@staticmethod
def find_by_tournament_key_fields(city, date, gender, location, sport, state):
    """
    Look up games by date, gender, sport and venue, ignoring opponent_id.

    Intended for tournament games whose opponent may still be a
    placeholder team. ``date``, ``gender`` and ``sport`` are matched as
    given. For ``city``, ``state`` and ``location`` a truthy argument is
    matched as given, while a falsy one (``None`` or ``""``) is queried
    as ``None``.

    Returns:
        None when nothing matches, a single Game when exactly one
        document matches, otherwise a list of Game objects.
    """
    query = {"date": date, "gender": gender, "sport": sport}

    # NOTE(review): presumably TBD/TBA venue values are normalized to None
    # before being stored, which is why falsy inputs are queried as None
    # -- confirm against the scraper.
    for field, value in (("city", city), ("state", state), ("location", location)):
        query[field] = {"$in": [value or None]}

    matches = [Game.from_dict(doc) for doc in db["game"].find(query)]

    if not matches:
        return None
    if len(matches) == 1:
        return matches[0]
    return matches

@staticmethod
def find_by_sport(sport):
"""
Expand All @@ -156,3 +206,31 @@ def find_by_sport_gender(sport, gender):
game_collection = db["game"]
games = game_collection.find({"sport": sport, "gender": gender})
return [Game.from_dict(game) for game in games]

@staticmethod
def find_games_by_sport_gender_after_date(sport, gender, after_date=None):
    """
    Fetch the games of one sport and gender, optionally limited by date.

    When ``after_date`` is truthy, only games whose ``utc_date`` is
    strictly greater than it are returned; a falsy ``after_date``
    disables the date filter.

    Returns:
        A list of Game objects built with ``Game.from_dict``.
    """
    criteria = {"sport": sport, "gender": gender}
    if after_date:
        criteria["utc_date"] = {"$gt": after_date}

    return [Game.from_dict(doc) for doc in db["game"].find(criteria)]

@staticmethod
def delete_games_by_ids(game_ids):
    """
    Remove every game whose ``_id`` appears in ``game_ids``.

    Returns:
        The ``deleted_count`` reported by the delete operation.
    """
    id_filter = {"_id": {"$in": game_ids}}
    return db["game"].delete_many(id_filter).deleted_count
39 changes: 32 additions & 7 deletions src/scrapers/games_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
from src.utils.convert_to_utc import convert_to_utc
from src.utils.constants import *
from src.scrapers.game_details_scrape import scrape_game
from src.utils.helpers import get_dominant_color
from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss
import base64
import re
import html
from src.database import db
import threading


Expand Down Expand Up @@ -164,6 +164,8 @@ def process_game_data(game_data):
Args:
game_data (dict): A dictionary containing the game data.
"""

game_data = normalize_game_data(game_data)
location_data = game_data["location"].split("\n")
geo_location = location_data[0]
if (",") not in geo_location:
Expand Down Expand Up @@ -232,16 +234,28 @@ def process_game_data(game_data):
if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final):
game_data["score_breakdown"] = game_data["score_breakdown"][::-1]

# finds any existing game with the same key fields regardless of time
curr_game = GameService.get_game_by_key_fields(
# Try to find by tournament key fields to handle placeholder teams
curr_game = GameService.get_game_by_tournament_key_fields(
city,
game_data["date"],
game_data["gender"],
location,
team.id,
game_data["sport"],
state
)

# If no tournament game found, try the regular lookup with opponent_id
if not curr_game:
curr_game = GameService.get_game_by_key_fields(
city,
game_data["date"],
game_data["gender"],
location,
team.id,
game_data["sport"],
state
)

if isinstance(curr_game, list):
if curr_game:
curr_game = curr_game[0]
Expand All @@ -253,8 +267,19 @@ def process_game_data(game_data):
"result": game_data["result"],
"box_score": game_data["box_score"],
"score_breakdown": game_data["score_breakdown"],
"utc_date": utc_date_str
"utc_date": utc_date_str,
"city": city,
"location": location,
"state": state
}

current_team = TeamService.get_team_by_id(curr_game.opponent_id)
if current_team and is_tournament_placeholder_team(current_team.name):
updates["opponent_id"] = team.id

if is_cornell_loss(game_data["result"]) and game_data["utc_date"]:
GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"])

GameService.update_game(curr_game.id, updates)
return

Expand All @@ -272,5 +297,5 @@ def process_game_data(game_data):
"score_breakdown": game_data["score_breakdown"],
"utc_date": utc_date_str
}

GameService.create_game(game_data)
59 changes: 59 additions & 0 deletions src/services/game_service.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from src.repositories.game_repository import GameRepository
from src.models.game import Game
from src.services.team_service import TeamService
from src.utils.helpers import is_tournament_placeholder_team


class GameService:
Expand Down Expand Up @@ -33,6 +34,7 @@ def create_game(data):
opponent_id = data.get("opponent_id")
if not TeamService.get_team_by_id(opponent_id):
raise ValueError(f"Opponent team with id {opponent_id} does not exist.")

game = Game(**data)
GameRepository.insert(game)
return game
Expand Down Expand Up @@ -69,6 +71,16 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta
city, date, gender, location, opponent_id, sport, state
)

@staticmethod
def get_game_by_tournament_key_fields(city, date, gender, location, sport, state):
    """
    Look up a tournament game by venue and date, without using opponent_id.

    Delegates to ``GameRepository.find_by_tournament_key_fields`` and
    returns whatever it returns. Useful when the stored game may still
    reference a placeholder opponent.
    """
    return GameRepository.find_by_tournament_key_fields(
        city=city,
        date=date,
        gender=gender,
        location=location,
        sport=sport,
        state=state,
    )

@staticmethod
def get_games_by_sport(sport):
"""
Expand All @@ -89,3 +101,50 @@ def get_games_by_sport_gender(sport, gender):
Retrieves all game by its sport and gender.
"""
return GameRepository.find_by_sport_gender(sport, gender)

@staticmethod
def get_tournament_games_by_sport_gender(sport, gender, after_date=None):
    """
    Return the games of a sport and gender whose opponent is a placeholder.

    Candidate games come from
    ``GameRepository.find_games_by_sport_gender_after_date`` (optionally
    restricted by ``after_date``). A game is kept when its opponent team
    exists and its name is a tournament placeholder.
    """
    def has_placeholder_opponent(game):
        # Games whose opponent team cannot be found are not placeholders.
        opponent = TeamService.get_team_by_id(game.opponent_id)
        return bool(opponent and is_tournament_placeholder_team(opponent.name))

    candidates = GameRepository.find_games_by_sport_gender_after_date(
        sport, gender, after_date
    )
    return [game for game in candidates if has_placeholder_opponent(game)]

@staticmethod
def delete_tournament_games_by_sport_gender(sport, gender, after_date=None):
    """
    Delete the games of a sport and gender whose opponent is a placeholder.

    The games to delete are selected by
    ``GameService.get_tournament_games_by_sport_gender`` so that lookup
    and deletion always agree on what counts as a tournament game.

    Args:
        sport: Sport to filter on.
        gender: Gender to filter on.
        after_date: Optional cut-off forwarded to the lookup; when given,
            only games after it are considered.

    Returns:
        The number of games deleted (0 when nothing matched).
    """
    placeholder_games = GameService.get_tournament_games_by_sport_gender(
        sport, gender, after_date
    )
    if not placeholder_games:
        # Nothing to delete; skip the database round trip.
        return 0

    return GameRepository.delete_games_by_ids(
        [game.id for game in placeholder_games]
    )

@staticmethod
def handle_tournament_loss(sport, gender, loss_date):
    """
    React to a Cornell tournament loss by deleting later placeholder games.

    Args:
        sport (str): The sport of the team that lost.
        gender (str): The gender of the team that lost.
        loss_date: Cut-off forwarded as ``after_date`` to the deletion.
            NOTE(review): previously documented as a datetime -- confirm
            it matches the type of the stored ``utc_date`` values.

    Returns:
        The number of games deleted.
    """
    return GameService.delete_tournament_games_by_sport_gender(
        sport, gender, loss_date
    )
18 changes: 14 additions & 4 deletions src/services/team_service.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from src.repositories import TeamRepository
from src.models.team import Team


class TeamService:
@staticmethod
def get_all_teams():
Expand All @@ -13,14 +12,25 @@ def get_all_teams():
@staticmethod
def create_team(team_data):
    """
    Insert a team, or update the stored one when the name already exists.

    Args:
        team_data (dict): Team fields; must contain a non-empty "name".

    Returns:
        Team: The newly inserted team, or the already existing team
        object (as fetched before the update was applied).

    Raises:
        ValueError: If ``team_data`` has no name.
    """
    team_name = team_data.get("name")
    if not team_name:
        raise ValueError("Team name is required to create a team.")

    match = TeamService.get_team_by_name(team_name)
    if not match:
        # No team with this name yet: create it.
        new_team = Team(**team_data)
        TeamRepository.insert(new_team)
        return new_team

    # The name lookup may yield several teams; the first one is updated.
    if isinstance(match, list):
        match = match[0]

    TeamService.update_team(match.id, team_data)
    return match
Expand Down
42 changes: 41 additions & 1 deletion src/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,44 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
return hex_color
except Exception as e:
logging.error(f"Error in get_dominant_color for {image_url}: {e}")
return default_color
return default_color

def normalize_game_data(data: dict):
    """
    Replace placeholder values ("TBA"/"TBD") in a game record with None.

    Only the "time", "city" and "state" fields are inspected. Matching is
    case-insensitive and ignores surrounding whitespace, so "Tbd" and
    " TBA " are treated like "TBD" and "TBA". Non-string values are left
    untouched.

    Args:
        data: The game record; it is modified in place.

    Returns:
        The same ``data`` dict, for call chaining.
    """
    placeholders = {"tba", "tbd"}

    for field in ("time", "city", "state"):
        value = data.get(field)
        # Only strings can be placeholders; the type check also avoids a
        # TypeError from hashing unhashable values such as lists.
        if isinstance(value, str) and value.strip().lower() in placeholders:
            data[field] = None

    return data

def is_tournament_placeholder_team(team_name: str):
    """
    Check if a team name is a tournament placeholder.

    Placeholders are round names (e.g. "Quarterfinals") or "TBD" that
    stand in for an opponent that is not yet known. The comparison is
    case-insensitive and ignores surrounding whitespace, so differently
    capitalised variants of a listed name are recognised too.

    Args:
        team_name: The team name to test; non-string values are never
            placeholders.

    Returns:
        True if ``team_name`` is one of the known placeholder names.
    """
    placeholder_team_names = (
        "First Round", "Second Round", "Third Round", "Quarterfinals",
        "College Cup Semifinals", "College Cup Championship Game",
        "ECAC Hockey First Round", "ECAC Hockey Quarterfinals",
        "ECAC Hockey Semifinals", "ECAC Hockey Championship Game",
        "Regional Semifinals", "Regional Championship", "National Semifinals",
        "TBD", "National Championship", "NCAA Wrestling Championships",
        "NCAA Northeast Regional Championships",
        "NCAA Cross Country Championships",
    )

    if not isinstance(team_name, str):
        return False

    # casefold() on both sides makes the match independent of letter case.
    known = {name.casefold() for name in placeholder_team_names}
    return team_name.strip().casefold() in known

def is_cornell_loss(result: str):
    """
    Check if the result indicates a Cornell loss.

    The result string is split into alphabetic words and each word is
    compared, case-insensitively, against the known loss indicators.
    Whole-word matching keeps a capital "L" inside another word (for
    example an opponent name in a win result) from counting as a loss.

    Args:
        result: The result text of a game; may be empty or None.

    Returns:
        True if a loss indicator is present, False otherwise (including
        for empty or missing results).
    """
    import re  # local import: this block cannot see the module's import list

    if not result:
        return False

    # NOTE(review): "defeated" is kept from the original indicator list even
    # though its direction is ambiguous ("Cornell defeated X") -- confirm.
    loss_indicators = {"l", "loss", "lost", "defeated"}
    words = re.findall(r"[A-Za-z]+", result)
    return any(word.lower() in loss_indicators for word in words)