Skip to content

Commit

Permalink
Add a function to find missing cherry picks in different branches (py…
Browse files Browse the repository at this point in the history
…torch#1752)

* Add a function to find missing cherry picks in different branches

* Add the functionality to find PRs by sha

* Update the logic to find the universe of potentially missing PRs using the milestone id

* Update the logic to find missing cherry picks from previous release that do not exist in the current release

* Update tge logic to use previous release commits to find missing cherry picks in the current issue for release.
  • Loading branch information
juliagmt-google authored Apr 2, 2024
1 parent 38314c2 commit bf55c5e
Showing 1 changed file with 66 additions and 17 deletions.
83 changes: 66 additions & 17 deletions analytics/github_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import enum
import os


class IssueState(enum.Enum):
OPEN = "open"
CLOSED = "closed"
Expand All @@ -25,20 +24,23 @@ class GitCommit:
author: str
author_date: datetime
commit_date: Optional[datetime]
pr_url: str

def __init__(self,
commit_hash: str,
author: str,
author_date: datetime,
title: str,
body: str,
pr_url : str,
commit_date: Optional[datetime] = None) -> None:
self.commit_hash = commit_hash
self.author = author
self.author_date = author_date
self.commit_date = commit_date
self.title = title
self.body = body
self.pr_url = pr_url

def __contains__(self, item: Any) -> bool:
return item in self.body or item in self.title
Expand Down Expand Up @@ -137,12 +139,20 @@ def parse_fuller_format(lines: Union[str, List[str]]) -> GitCommit:
assert lines[3].startswith("Commit: ")
assert lines[4].startswith("CommitDate: ")
assert len(lines[5]) == 0

prUrl = ""
for line in lines:
if "Pull Request resolved:" in line:
prUrl = line.split("Pull Request resolved:")[1].strip()
break

return GitCommit(commit_hash=lines[0].split()[1].strip(),
author=lines[1].split(":", 1)[1].strip(),
author_date=datetime.fromtimestamp(int(lines[2].split(":", 1)[1].strip())),
commit_date=datetime.fromtimestamp(int(lines[4].split(":", 1)[1].strip())),
title=lines[6].strip(),
body="\n".join(lines[7:]),
pr_url=prUrl,
)


Expand Down Expand Up @@ -218,7 +228,6 @@ def fetch_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[s
print(f"Rate limit exceeded: {err.headers['X-RateLimit-Used']}/{err.headers['X-RateLimit-Limit']}")
raise


def fetch_multipage_json(url: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
if params is None:
params = {}
Expand All @@ -236,7 +245,6 @@ def gh_get_milestones(org='pytorch', project='pytorch', state: IssueState = Issu
url = f'https://api.github.com/repos/{org}/{project}/milestones'
return fetch_multipage_json(url, {"state": state})


def gh_get_milestone_issues(org: str, project: str, milestone_idx: int, state: IssueState = IssueState.OPEN):
url = f'https://api.github.com/repos/{org}/{project}/issues'
return fetch_multipage_json(url, {"milestone": milestone_idx, "state": state})
Expand All @@ -253,6 +261,10 @@ def gh_get_ref_statuses(org: str, project: str, ref: str) -> Dict[str, Any]:
rc["statuses"] += nrc["statuses"]
return rc

def get_issue_comments(org: str, project: str, issue_number : int):
url = f'https://api.github.com/repos/{org}/{project}/issues/{issue_number}/comments'

return fetch_multipage_json(url)

def extract_statuses_map(json: Dict[str, Any]):
return {s["context"]: s["state"] for s in json["statuses"]}
Expand Down Expand Up @@ -372,6 +384,28 @@ def get_commits_dict(x, y):
continue
print(f'{html_url};{issue["title"]};{state}')

def commits_missing_in_release(repo: GitRepo, branch: str, orig_branch: str, minor_release: str, milestone_idx: int, cut_off_date : datetime, issue_num : int) -> None:
def get_commits_dict(x, y):
return build_commit_dict(repo.get_commit_list(x, y))
main_commits = get_commits_dict(minor_release, 'main')
prev_release_commits = get_commits_dict(orig_branch, branch)
current_issue_comments = get_issue_comments('pytorch', 'pytorch',issue_num) # issue comments for the release tracker as cherry picks
print(f"len(main_commits)={len(main_commits)}")
print(f"len(prev_release_commits)={len(prev_release_commits)}")
print(f"len(current_issue_comments)={len(current_issue_comments)}")
print(f"issue_num: {issue_num}, len(issue_comments)={len(current_issue_comments)}")
print("URL;Title;Status")

# Iterate over the previous release branch to find potentially missing cherry picks in the current issue.
for commit in prev_release_commits.values():
not_cherry_picked_in_current_issue = any(commit.pr_url not in issue_comment['body'] for issue_comment in current_issue_comments)
for main_commit in main_commits.values():
if main_commit.pr_url == commit.pr_url :
mentioned_after_cut_off_date = cut_off_date < main_commit.commit_date
if not_cherry_picked_in_current_issue and mentioned_after_cut_off_date:
# Commits that are release only, which exist in previous release branch and not in main.
print(f'{commit.pr_url};{commit.title};{commit.commit_date}')
break

def analyze_stacks(repo: GitRepo) -> None:
from tqdm.contrib.concurrent import thread_map
Expand All @@ -398,6 +432,7 @@ def parse_arguments():
default=os.path.expanduser("~/git/pytorch/pytorch"))
parser.add_argument("--milestone-id", type=str)
parser.add_argument("--branch", type=str)
parser.add_argument("--minor-release", type=str)
parser.add_argument("--remote",
type=str,
help="Remote to base off of",
Expand All @@ -406,7 +441,10 @@ def parse_arguments():
parser.add_argument("--print-reverts", action="store_true")
parser.add_argument("--contributor-stats", action="store_true")
parser.add_argument("--missing-in-branch", action="store_true")
parser.add_argument("--missing-in-release", action="store_true")
parser.add_argument("--analyze-stacks", action="store_true")
parser.add_argument('--date', type=lambda d: datetime.strptime(d, '%Y-%m-%d'))
parser.add_argument("--issue-num", type=int)
return parser.parse_args()


Expand All @@ -427,26 +465,37 @@ def main():
if args.analyze_stacks:
analyze_stacks(repo)
return

# Use milestone idx or search it along milestone titles
try:
milestone_idx = int(args.milestone_id)
except ValueError:
milestone_idx = -1
milestones = gh_get_milestones()
for milestone in milestones:
if milestone.get('title', '') == args.milestone_id:
milestone_idx = int(milestone.get('number', '-2'))
if milestone_idx < 0:
print(f'Could not find milestone {args.milestone_id}')
return

if args.missing_in_branch:
# Use milestone idx or search it along milestone titles
try:
milestone_idx = int(args.milestone_id)
except ValueError:
milestone_idx = -1
milestones = gh_get_milestones()
for milestone in milestones:
if milestone.get('title', '') == args.milestone_id:
milestone_idx = int(milestone.get('number', '-2'))
if milestone_idx < 0:
print(f'Could not find milestone {args.milestone_id}')
return

commits_missing_in_branch(repo,
args.branch,
args.branch,
f'orig/{args.branch}',
milestone_idx)
return

if args.missing_in_release:
commits_missing_in_release(repo,
args.branch,
f'orig/{args.branch}',
args.minor_release,
milestone_idx,
args.date,
args.issue_num
)
return

print(f"Parsing git history with remote {remote}...", end='', flush=True)
start_time = time.time()
Expand Down

0 comments on commit bf55c5e

Please sign in to comment.