Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions fast_flights/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,80 @@

# Type alias: a value of this type must be the literal string 'html' or 'js'.
DataSource = Literal['html', 'js']


def _parse_aria_label(label: str) -> dict:
    """Parse flight details from aria-label as fallback when CSS selectors fail.

    Google Flights includes a structured aria-label on each flight item,
    e.g.: 'From 2359 US dollars. Nonstop flight with Alaska. Leaves San Jose
    Mineta International Airport at 2:25 PM on Sunday, February 15 and arrives
    at Kona International Airport at 6:13 PM on Sunday, February 15. Total
    duration 5 hr 48 min.'

    This is used as a fallback when the CSS class names in the HTML differ from
    what the parser expects (Google obfuscates class names differently depending
    on the browser fingerprint / TLS fingerprint used by the HTTP client).

    Times are accepted both in 12-hour form with an AM/PM suffix ('2:25 PM')
    and in 24-hour form without one ('14:25'). The duration is recognised
    whether or not it is followed by a period.

    Args:
        label: The raw aria-label text of a flight item.

    Returns:
        A dict with the keys 'name', 'departure', 'arrival' and 'duration'
        (strings; "" when that piece is not found in the label) and 'stops'
        (an int, or the string "Unknown" when no stop count is found).
    """
    # Clock time as it appears in the label. The AM/PM suffix is optional so
    # 24-hour labels parse too; \s* before it also covers U+202F (narrow
    # no-break space), which \s matches for str patterns.
    clock = r'\d+:\d+(?:\s*(?:AM|PM))?'

    def _when(match) -> str:
        """Render a (time, date) match as '<time> on <short date>', or ''."""
        if not match:
            return ""
        # Collapse any whitespace run (including U+202F) to one plain space.
        time_str = re.sub(r'\s+', ' ', match.group(1))
        return f"{time_str} on {_shorten_date(match.group(2))}"

    result = {}

    # Airline name: "flight with <airline>. Leaves"
    m = re.search(r'flight with (.+?)\.\s*Leaves', label)
    result['name'] = m.group(1) if m else ""

    # Departure time: "Leaves <airport> at <time> on <day> and arrives"
    # Greedy .+ skips past airport names that may themselves contain " at ".
    m = re.search(rf'Leaves .+ at ({clock}) on (.+?) and arrives', label)
    result['departure'] = _when(m)

    # Arrival time: "arrives at <airport> at <time> on <day>. Total"
    m = re.search(rf'arrives .+ at ({clock}) on (.+?)\.\s*Total', label)
    result['arrival'] = _when(m)

    # Duration: "Total duration <dur>." -- the closing period is optional when
    # the duration is the last thing in the label.
    m = re.search(r'Total duration (.+?)\s*(?:\.|$)', label)
    result['duration'] = m.group(1) if m else ""

    # Stops: "Nonstop" or "1 stop" or "2 stops"
    m = re.search(r'(Nonstop|\d+ stops?)\s+flight', label)
    if m:
        stops_text = m.group(1)
        result['stops'] = 0 if stops_text == "Nonstop" else int(stops_text.split()[0])
    else:
        result['stops'] = "Unknown"

    return result


# Full English weekday / month names mapped to their three-letter forms.
# Every abbreviation is simply the first three letters of the full name, so
# both tables are derived from ordered name tuples.
_DAY_ABBREVS = {
    day: day[:3]
    for day in (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
}
_MONTH_ABBREVS = {
    month: month[:3]
    for month in (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
}


def _shorten_date(date_str: str) -> str:
    """Abbreviate weekday and month names, e.g. 'Sunday, February 15' -> 'Sun, Feb 15'.

    Every full name listed in _DAY_ABBREVS, then every one in _MONTH_ABBREVS,
    is substituted wherever it occurs in the text; anything else is returned
    untouched.
    """
    shortened = date_str
    # Weekday table first, then the month table, each in its own key order.
    for table in (_DAY_ABBREVS, _MONTH_ABBREVS):
        for long_name in table:
            shortened = shortened.replace(long_name, table[long_name])
    return shortened

# Default cookies embedded into the app to help bypass common consent gating.
# These are used only if the caller does not supply cookies (binary) and
# does not provide cookies via request_kwargs.
Expand Down Expand Up @@ -275,6 +349,30 @@ def safe(n: Optional[LexborNode]):
except ValueError:
stops_fmt = "Unknown"

# Fallback: if CSS selectors missed any key fields, parse from
# aria-label. Google serves different obfuscated class names
# depending on the browser/TLS fingerprint, but aria-label always
# contains structured flight data regardless.
if not name or not departure_time or not arrival_time or not duration or stops_fmt == "Unknown":
# Check the item element itself first, then descendants
aria = item.attributes.get("aria-label", "") or ""
if not aria or "flight" not in aria:
aria_el = item.css_first("[aria-label*='flight']")
if aria_el:
aria = aria_el.attributes.get("aria-label", "") or ""
if aria and "flight" in aria:
parsed = _parse_aria_label(aria)
if not name:
name = parsed.get('name', '')
if not departure_time:
departure_time = parsed.get('departure', '')
if not arrival_time:
arrival_time = parsed.get('arrival', '')
if not duration:
duration = parsed.get('duration', '')
if stops_fmt == "Unknown":
stops_fmt = parsed.get('stops', 'Unknown')

flights.append(
{
"is_best": is_best_flight,
Expand Down