Skip to content

fix(perf-issues): Accept sanitized span descriptions #46187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 39 additions & 12 deletions src/sentry/utils/performance_issues/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,19 +167,46 @@ def get_duration_between_spans(first_span: Span, second_span: Span):


def get_url_from_span(span: Span) -> str:
"""
Parses the span data and pulls out the URL. Accounts for different SDKs and
different versions of SDKs formatting and parsing the URL contents
differently.
"""

data = span.get("data") or {}
url = data.get("url") or ""
if not url:
# If data is missing, fall back to description
description = span.get("description") or ""
parts = description.split(" ", 1)
if len(parts) == 2:
url = parts[1]

if type(url) is dict:
url = url.get("pathname") or ""

return url

# The most modern version is to provide URL information in the span
# data
url_data = data.get("url")

if type(url_data) is dict:
# Some transactions mysteriously provide the URL as a dict that looks
# like JavaScript's URL object
url = url_data.get("pathname") or ""
url += url_data.get("search") or ""
return url

if type(url_data) is str:
# Usually the URL is a regular string, and so is the query. This
# is the standardized format for all SDKs, and is the preferred
# format going forward. Otherwise, if `http.query` is absent, `url`
# contains the query.
url = url_data
query_data = data.get("http.query")
if type(query_data) is str and len(query_data) > 0:
url += f"?{query_data}"

return url

# Attempt to parse the full URL from the span description, in case
# the previous approaches did not yield a good result
description = span.get("description") or ""
parts = description.split(" ", 1)
if len(parts) == 2:
url = parts[1]
return url

return ""


def fingerprint_spans(spans: List[Span], unique_only: bool = False):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,29 +157,32 @@ def is_span_eligible(cls, span: Span) -> bool:
if description.strip()[:3].upper() != "GET":
return False

url = get_url_from_span(span)

# GraphQL URLs have complicated queries in them. Until we parse those
# queries to check for what's duplicated, we can't tell what is being
# duplicated. Ignore them for now
if "graphql" in description:
if "graphql" in url:
return False

# Next.js infixes its data URLs with a build ID. (e.g.,
# /_next/data/<uuid>/some-endpoint) This causes a fingerprinting
# explosion, since every deploy would change this ID and create new
# fingerprints. Since we're not parameterizing URLs yet, we need to
# exclude them
if "_next/data" in description:
if "_next/data" in url:
return False

url = get_url_from_span(span)

# Next.js error pages cause an N+1 API Call that isn't useful to anyone
if "__nextjs_original-stack-frame" in url:
return False

if not url:
return False

# Once most users update their SDKs to use the latest standard, we
# won't have to do this, since the URLs will be sent in as `span.data`
# in a parsed format
parsed_url = urlparse(str(url))

if parsed_url.netloc in cls.HOST_DENYLIST:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ def test_allows_eligible_spans(span):
{
"span_id": "a",
"op": "http.client",
"description": "GET http://service.io/resource",
"description": "GET /resource.js",
"hash": "a",
"data": {"url": "/resource.js"},
},
Expand All @@ -353,6 +353,16 @@ def test_allows_eligible_spans(span):
"description": "GET http://service.io/resource?graphql=somequery",
"hash": "a",
},
{
"span_id": "a",
"op": "http.client",
"description": "GET http://service.io/resource", # New JS SDK removes query string from description
"hash": "a",
"data": {
"http.query": "graphql=somequery",
"url": "http://service.io/resource",
},
},
{
"span_id": "a",
"op": "http.client",
Expand Down