chore: Add heuristics for issue tracker (#12703)
Fixes #11147 🦕
Fixes #12679
ohmayr authored May 22, 2024
1 parent 995bdaf commit 3650261
Showing 2 changed files with 156 additions and 29 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
@@ -20,6 +20,7 @@ on:
# * is a special character in YAML so you have to quote this string
# Run this GitHub Action every Tuesday at 7 AM UTC
- cron: '0 7 * * 2'
  # Also allow this workflow to be triggered manually from the Actions tab
  workflow_dispatch:

permissions: read-all

184 changes: 155 additions & 29 deletions scripts/updateapilist.py
@@ -16,9 +16,14 @@

import os
import logging
import re

import requests

from dataclasses import dataclass
from typing import List, Optional

# Configure logging so messages are emitted to the console.
logging.basicConfig(level=logging.INFO)

class MissingGithubToken(ValueError):
"""Raised when the GITHUB_TOKEN environment variable is not set"""
@@ -57,9 +62,20 @@ class MissingGithubToken(ValueError):
# BASE_API defines the base API for Github.
BASE_API = "https://api.github.com"

# GITHUB_ISSUES defines the issues URL for a repository on GitHub.
GITHUB_ISSUES = "https://github.com/{repo}/issues"

# BASE_ISSUE_TRACKER defines the base URL for the issue tracker.
BASE_ISSUE_TRACKER = "https://issuetracker.google.com"

# This issue-tracker component appears in some saved searches used to list API-side
# issues. However, when we construct URLs for filing new issues (which in some cases
# we do by analyzing the query string of a saved search), we want to make sure the
# new issue is filed against a more specific component, not this generic one.
GENERIC_ISSUE_TRACKER_COMPONENT = "187065"

# This sentinel value is used to mark cache fields that have not been computed yet.
NOT_COMPUTED = -1
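
Why -1 rather than None? None is itself a legitimate computed result for these caches (a client may simply have no issue tracker), so the "not computed yet" marker has to be a value that can never be a real result. A minimal sketch of the pattern, with hypothetical names:

# Minimal sketch of the NOT_COMPUTED caching pattern (hypothetical class).
class CachedExample:
    def __init__(self):
        self._cached_value = NOT_COMPUTED  # None would be ambiguous here

    @property
    def value(self):
        if self._cached_value != NOT_COMPUTED:
            return self._cached_value  # may legitimately be None
        self._cached_value = self._compute()  # compute once, reuse afterwards
        return self._cached_value

    def _compute(self):
        return None  # "no result" is a valid, cacheable outcome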

class CloudClient:
repo: str = None
@@ -68,13 +84,97 @@ class CloudClient:
distribution_name: str = None
issue_tracker: str = None

def __init__(self, repo: dict):
self.repo = repo["repo"]
# For now, strip out "Google Cloud" to standardize the titles
self.title = repo["name_pretty"].replace("Google ", "").replace("Cloud ", "")
self.release_level = repo["release_level"]
self.distribution_name = repo["distribution_name"]
self.issue_tracker = repo.get("issue_tracker")
self._cached_component_id = NOT_COMPUTED
self._cached_template_id = NOT_COMPUTED
self._cached_saved_search_id = NOT_COMPUTED

@property
def saved_search_id(self):
if self._cached_saved_search_id != NOT_COMPUTED:
return self._cached_saved_search_id
if not self.issue_tracker:
self._cached_saved_search_id = None
else:
match = re.search(r'savedsearches/(\d+)', self.issue_tracker)
self._cached_saved_search_id = match.group(1) if match else None
return self._cached_saved_search_id

@property
def saved_search_response_text(self):
if not self.saved_search_id:
return None
url = f"{BASE_ISSUE_TRACKER}/action/saved_searches/{self.saved_search_id}"
response = _fetch_response(url)
return response.text if response else None

@property
def issue_tracker_component_id(self):
if self._cached_component_id != NOT_COMPUTED:
return self._cached_component_id

# First, check if the issue tracker is a saved search:
query_string = self.saved_search_response_text or self.issue_tracker
if not query_string:
self._cached_component_id = None
else:
# Try to match 'component=' in the query string
query_match = re.search(r'\bcomponent=(\d+)', query_string)
if query_match:
self._cached_component_id = query_match.group(1)
else:
                # If not found, collect each 'componentid:' match in the query string,
                # skipping the generic component. If more than one specific component
                # remains, there is no single component to file against, so record None
                # and log an error.
                query_match = re.findall(r'\bcomponentid:(\d+)', query_string)
                for component_id in query_match:
                    if component_id == GENERIC_ISSUE_TRACKER_COMPONENT:
                        continue
                    if self._cached_component_id != NOT_COMPUTED:
                        self._cached_component_id = None
                        logging.error(f"More than one component ID found for issue tracker: {self.issue_tracker}")
                        break
                    self._cached_component_id = component_id
        # Normalize: if nothing was found, cache None rather than the sentinel.
        self._cached_component_id = self._cached_component_id if self._cached_component_id != NOT_COMPUTED else None
return self._cached_component_id

@property
def issue_tracker_template_id(self):
if self._cached_template_id != NOT_COMPUTED:
return self._cached_template_id
if not self.issue_tracker:
self._cached_template_id = None
else:
match = re.search(r'(?:\?|&)template=(\d+)', self.issue_tracker)
self._cached_template_id = match.group(1) if match else None
return self._cached_template_id

@property
def show_client_issues(self):
return GITHUB_ISSUES.format(repo=self.repo)

@property
def file_api_issue(self):
if self.issue_tracker_component_id:
link = f"{BASE_ISSUE_TRACKER}/issues/new?component={self.issue_tracker_component_id}"
if self.issue_tracker_template_id:
link += f"&template={self.issue_tracker_template_id}"
return link
return None

@property
def show_api_issues(self):
if self.saved_search_id:
# Return the original issue_tracker content, which already links to the saved search.
return self.issue_tracker
elif self.issue_tracker_component_id:
return f"{BASE_ISSUE_TRACKER}/issues?q=componentid:{self.issue_tracker_component_id}"
return None

# For sorting, we want to sort by release level, then API pretty_name
def __lt__(self, other):
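
To make the new heuristics concrete, here is an illustrative sketch; the metadata payloads below are fabricated, and only the query-string fields matter for ID extraction:

# Illustrative only: fabricated .repo-metadata.json payloads.
_metadata = {
    "repo": "googleapis/python-example",
    "name_pretty": "Google Cloud Example",
    "release_level": "stable",
    "distribution_name": "google-cloud-example",
}

client = CloudClient(dict(
    _metadata,
    issue_tracker="https://issuetracker.google.com/issues?q=componentid:555555&template=999999",
))
print(client.show_client_issues)  # https://github.com/googleapis/python-example/issues
print(client.show_api_issues)     # https://issuetracker.google.com/issues?q=componentid:555555
print(client.file_api_issue)      # https://issuetracker.google.com/issues/new?component=555555&template=999999

# The generic component is skipped when extracting a component ID...
generic = CloudClient(dict(_metadata, issue_tracker="componentid:187065 componentid:555555"))
print(generic.issue_tracker_component_id)  # "555555"

# ...and more than one specific component is treated as ambiguous.
ambiguous = CloudClient(dict(_metadata, issue_tracker="componentid:111 componentid:222"))
print(ambiguous.issue_tracker_component_id)  # None, and an error is logged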
@@ -95,15 +195,35 @@ class Extractor:
def client_for_repo(self, repo_slug) -> Optional[CloudClient]:
path = self.path_format.format(repo_slug=repo_slug)
url = f"{RAW_CONTENT_BASE_URL}/{path}/{REPO_METADATA_FILENAME}"
        _, metadata = _fetch_and_parse_response(url)
        if not metadata:
            return None
        return CloudClient(metadata)

def get_clients_from_batch_response(self, response_json) -> List[CloudClient]:
return [self.client_for_repo(repo[self.response_key]) for repo in response_json if allowed_repo(repo)]

def _fetch_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None) -> Optional[requests.Response]:
try:
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
return response
except requests.RequestException as e:
logging.error(f"Request failed for URL {url}: {e}")
return None

def _parse_response(response: requests.Response) -> Optional[dict]:
try:
return response.json()
except ValueError as e:
logging.error(f"JSON decoding failed for URL {response.url}: {e}")
return None

def _fetch_and_parse_response(url: str, headers: Optional[dict] = None, params: Optional[dict] = None):
response = _fetch_response(url, headers, params)
if not response:
return None, None
return response, _parse_response(response)
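
A short usage sketch for these helpers (the endpoint call is hypothetical): returning the (response, parsed) pair lets callers consume the parsed JSON while still reaching response metadata such as the Link header:

# Hypothetical call; both returned values may be None and should be checked.
response, payload = _fetch_and_parse_response(
    f"{BASE_API}/repos/googleapis/google-cloud-python",
    headers={"Accept": "application/vnd.github+json"},
)
if payload:
    logging.info(f"Default branch: {payload.get('default_branch')}")
if response:
    logging.info(f"Next page: {response.links.get('next', {}).get('url')}")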

def replace_content_in_readme(content_rows: List[str]) -> None:
START_MARKER = ".. API_TABLE_START"
@@ -135,16 +255,17 @@ def client_row(client: CloudClient) -> str:
url = f"https://github.com/{client.repo}"
if client.repo == MONO_REPO:
url += f"/tree/main/packages/{client.distribution_name}"

    _show_api_issues = client.show_api_issues
    _file_api_issue = client.file_api_issue
    content_row = [
        f" * - `{client.title} <{url}>`_\n",
        f" - {client.release_level}\n",
        f" - |PyPI-{client.distribution_name}|\n",
        f" - `API Issues <{_show_api_issues}>`_\n" if _show_api_issues else " -\n",
        f" - `File an API Issue <{_file_api_issue}>`_\n" if _file_api_issue else " -\n",
        f" - `Client Library Issues <{client.show_client_issues}>`_\n"
    ]

return (content_row, pypi_badge)
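
The conditional expressions above keep the generated RST list-table rectangular: every row must emit one cell per column, so a missing link degrades to an empty cell instead of shifting the remaining cells left. A reduced sketch of the pattern with hypothetical inputs:

# Reduced sketch: each column appears exactly once per row, populated or empty.
def _issue_cells(show_url, file_url):
    return [
        f" - `API Issues <{show_url}>`_\n" if show_url else " -\n",
        f" - `File an API Issue <{file_url}>`_\n" if file_url else " -\n",
    ]

print("".join(_issue_cells("https://issuetracker.google.com/issues?q=componentid:555555", None)), end="")
# - `API Issues <https://issuetracker.google.com/issues?q=componentid:555555>`_
# -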


@@ -157,7 +278,9 @@ def generate_table_contents(clients: List[CloudClient]) -> List[str]:
" * - Client\n",
" - Release Level\n",
" - Version\n",
" - API Issue Tracker\n",
" - API Issues\n",
" - File an API Issue\n",
" - Client Library Issues\n",
]

pypi_links = ["\n"]
@@ -181,30 +304,33 @@ def mono_repo_clients(token: str) -> List[CloudClient]:
# all mono repo clients
url = f"{BASE_API}/repos/{MONO_REPO}/contents/packages"
headers = {'Authorization': f'token {token}'}
    _, packages = _fetch_and_parse_response(url, headers)
    if not packages:
        return []
    mono_repo_extractor = Extractor(path_format=MONO_REPO_PATH_FORMAT, response_key=PACKAGE_RESPONSE_KEY)

    return mono_repo_extractor.get_clients_from_batch_response(packages)


def split_repo_clients(token: str) -> List[CloudClient]:

    clients = []
    url = f"{BASE_API}/search/repositories?page=1"
    headers = {'Authorization': f'token {token}'}
    params = {'per_page': 100, "q": "python- in:name org:googleapis"}

    while url:
        response, metadata = _fetch_and_parse_response(url, headers, params)
        if not metadata:
            break
        repositories = metadata.get("items", [])
        if len(repositories) == 0:
            break

        split_repo_extractor = Extractor(path_format=SPLIT_REPO_PATH_FORMAT, response_key=REPO_RESPONSE_KEY)
        clients.extend(split_repo_extractor.get_clients_from_batch_response(repositories))

        # Check for the 'next' link in the response headers for pagination
        url = response.links.get('next', {}).get('url')

    return clients

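For reference, requests parses the HTTP Link header into the response.links mapping keyed by rel, which is what drives the pagination loop above. A minimal illustration with a made-up header value:

# Fabricated GitHub-style Link header, parsed by requests into response.links.
r = requests.Response()
r.headers["Link"] = '<https://api.github.com/search/repositories?page=2>; rel="next"'
print(r.links)  # {'next': {'url': 'https://api.github.com/search/repositories?page=2', 'rel': 'next'}}
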

def get_token():
