llvm-capstone/llvm/utils/Reviewing/find_interesting_reviews.py

#!/usr/bin/env python
from __future__ import print_function
import argparse
import email.mime.multipart
import email.mime.text
import logging
import os.path
import pickle
import re
import smtplib
import subprocess
import sys
from datetime import datetime, timedelta
from phabricator import Phabricator
# Setting up a virtualenv to run this script can be done by running the
# following commands:
# $ virtualenv venv
# $ . ./venv/bin/activate
# $ pip install Phabricator
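#
# Illustrative invocation (a sketch only: the addresses below are placeholders,
# not real accounts; see the argument parser in main() for the full set of
# options):
# $ python find_interesting_reviews.py \
#       --email-addresses someone@example.org someone.else@example.org \
#       --email-report someone@example.org \
#       --sender review-bot@example.org \
#       --verbose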
GIT_REPO_METADATA = (("llvm-monorepo", "https://github.com/llvm/llvm-project"),)
# The below PhabXXX classes represent objects as modelled by Phabricator.
# The classes can be serialized to disk, to try and make sure that we don't
# needlessly have to re-fetch lots of data from Phabricator, as that would
# make this script unusably slow.
class PhabObject:
OBJECT_KIND = None
def __init__(self, id):
self.id = id
class PhabObjectCache:
def __init__(self, PhabObjectClass):
self.PhabObjectClass = PhabObjectClass
self.most_recent_info = None
self.oldest_info = None
self.id2PhabObjects = {}
def get_name(self):
return self.PhabObjectClass.OBJECT_KIND + "sCache"
def get(self, id):
if id not in self.id2PhabObjects:
self.id2PhabObjects[id] = self.PhabObjectClass(id)
return self.id2PhabObjects[id]
def get_ids_in_cache(self):
return list(self.id2PhabObjects.keys())
def get_objects(self):
return list(self.id2PhabObjects.values())
DEFAULT_DIRECTORY = "PhabObjectCache"
def _get_pickle_name(self, directory):
file_name = "Phab" + self.PhabObjectClass.OBJECT_KIND + "s.pickle"
return os.path.join(directory, file_name)
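    # Note: with the OBJECT_KIND values used in this script, the pickle files
    # end up as e.g. PhabObjectCache/PhabReviews.pickle and
    # PhabObjectCache/PhabUsers.pickle.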
def populate_cache_from_disk(self, directory=DEFAULT_DIRECTORY):
"""
FIXME: consider if serializing to JSON would bring interoperability
advantages over serializing to pickle.
"""
try:
f = open(self._get_pickle_name(directory), "rb")
except IOError as err:
print("Could not find cache. Error message: {0}. Continuing...".format(err))
else:
with f:
try:
d = pickle.load(f)
self.__dict__.update(d)
except EOFError as err:
print(
"Cache seems to be corrupt. "
+ "Not using cache. Error message: {0}".format(err)
)
def write_cache_to_disk(self, directory=DEFAULT_DIRECTORY):
if not os.path.exists(directory):
os.makedirs(directory)
with open(self._get_pickle_name(directory), "wb") as f:
pickle.dump(self.__dict__, f)
print(
"wrote cache to disk, most_recent_info= {0}".format(
datetime.fromtimestamp(self.most_recent_info)
if self.most_recent_info is not None
else None
)
)
class PhabReview(PhabObject):
OBJECT_KIND = "Review"
def __init__(self, id):
PhabObject.__init__(self, id)
def update(self, title, dateCreated, dateModified, author):
self.title = title
self.dateCreated = dateCreated
self.dateModified = dateModified
self.author = author
def setPhabDiffs(self, phabDiffs):
self.phabDiffs = phabDiffs
class PhabUser(PhabObject):
OBJECT_KIND = "User"
def __init__(self, id):
PhabObject.__init__(self, id)
def update(self, phid, realName):
self.phid = phid
self.realName = realName
class PhabHunk:
def __init__(self, rest_api_hunk):
self.oldOffset = int(rest_api_hunk["oldOffset"])
self.oldLength = int(rest_api_hunk["oldLength"])
# self.actual_lines_changed_offset will contain the offsets of the
# lines that were changed in this hunk.
self.actual_lines_changed_offset = []
offset = self.oldOffset
inHunk = False
hunkStart = -1
contextLines = 3
for line in rest_api_hunk["corpus"].split("\n"):
if line.startswith("+"):
# line is a new line that got introduced in this patch.
# Do not record it as a changed line.
if inHunk is False:
inHunk = True
hunkStart = max(self.oldOffset, offset - contextLines)
continue
if line.startswith("-"):
# line was changed or removed from the older version of the
# code. Record it as a changed line.
if inHunk is False:
inHunk = True
hunkStart = max(self.oldOffset, offset - contextLines)
offset += 1
continue
# line is a context line.
if inHunk is True:
inHunk = False
hunkEnd = offset + contextLines
self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
offset += 1
if inHunk is True:
hunkEnd = offset + contextLines
self.actual_lines_changed_offset.append((hunkStart, hunkEnd))
# The above algorithm could result in adjacent or overlapping ranges
# being recorded into self.actual_lines_changed_offset.
# Merge the adjacent and overlapping ranges in there:
t = []
lastRange = None
for start, end in self.actual_lines_changed_offset + [
(sys.maxsize, sys.maxsize)
]:
if lastRange is None:
lastRange = (start, end)
else:
if lastRange[1] >= start:
lastRange = (lastRange[0], end)
else:
t.append(lastRange)
lastRange = (start, end)
self.actual_lines_changed_offset = t
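        # Illustrative walk-through (not executed): with oldOffset == 10 and a
        # corpus of " context\n-removed\n+added\n context\n", the only changed
        # old line is line 11; the recorded range is padded with up to
        # contextLines (3) lines on each side and clamped to oldOffset, so
        # actual_lines_changed_offset ends up as [(10, 15)]. With a single
        # range, the merging pass above is a no-op.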
class PhabChange:
def __init__(self, rest_api_change):
self.oldPath = rest_api_change["oldPath"]
self.hunks = [PhabHunk(h) for h in rest_api_change["hunks"]]
class PhabDiff(PhabObject):
OBJECT_KIND = "Diff"
def __init__(self, id):
PhabObject.__init__(self, id)
def update(self, rest_api_results):
self.revisionID = rest_api_results["revisionID"]
self.dateModified = int(rest_api_results["dateModified"])
self.dateCreated = int(rest_api_results["dateCreated"])
self.changes = [PhabChange(c) for c in rest_api_results["changes"]]
class ReviewsCache(PhabObjectCache):
def __init__(self):
PhabObjectCache.__init__(self, PhabReview)
class UsersCache(PhabObjectCache):
def __init__(self):
PhabObjectCache.__init__(self, PhabUser)
reviews_cache = ReviewsCache()
users_cache = UsersCache()
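# The Phabricator() constructor below is called without arguments; the
# phabricator package then falls back to the standard arcanist configuration
# (typically ~/.arcrc) for the Conduit host and API token. This is an
# assumption about the library's default behaviour, not something configured
# explicitly in this script.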
def init_phab_connection():
phab = Phabricator()
phab.update_interfaces()
return phab
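# update_cached_info() pages through a Conduit search endpoint from newest to
# oldest (following results["cursor"]["after"]) and stops once the requested
# number of days is covered, once the cache already contains both newer and
# older data, or once the endpoint runs out of results.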
def update_cached_info(
phab,
cache,
phab_query,
order,
record_results,
max_nr_entries_per_fetch,
max_nr_days_to_cache,
):
q = phab
LIMIT = max_nr_entries_per_fetch
for query_step in phab_query:
q = getattr(q, query_step)
results = q(order=order, limit=LIMIT)
most_recent_info, oldest_info = record_results(cache, results, phab)
oldest_info_to_fetch = datetime.fromtimestamp(most_recent_info) - timedelta(
days=max_nr_days_to_cache
)
most_recent_info_overall = most_recent_info
cache.write_cache_to_disk()
after = results["cursor"]["after"]
print("after: {0!r}".format(after))
print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
while (
after is not None and datetime.fromtimestamp(oldest_info) > oldest_info_to_fetch
):
need_more_older_data = (
cache.oldest_info is None
or datetime.fromtimestamp(cache.oldest_info) > oldest_info_to_fetch
)
print(
(
"need_more_older_data={0} cache.oldest_info={1} "
+ "oldest_info_to_fetch={2}"
).format(
need_more_older_data,
datetime.fromtimestamp(cache.oldest_info)
if cache.oldest_info is not None
else None,
oldest_info_to_fetch,
)
)
need_more_newer_data = (
cache.most_recent_info is None or cache.most_recent_info < most_recent_info
)
print(
(
"need_more_newer_data={0} cache.most_recent_info={1} "
+ "most_recent_info={2}"
).format(need_more_newer_data, cache.most_recent_info, most_recent_info)
)
if not need_more_older_data and not need_more_newer_data:
break
results = q(order=order, after=after, limit=LIMIT)
most_recent_info, oldest_info = record_results(cache, results, phab)
after = results["cursor"]["after"]
print("after: {0!r}".format(after))
print("most_recent_info: {0}".format(datetime.fromtimestamp(most_recent_info)))
cache.write_cache_to_disk()
cache.most_recent_info = most_recent_info_overall
if after is None:
# We did fetch all records. Mark the cache to contain all info since
# the start of time.
oldest_info = 0
cache.oldest_info = oldest_info
cache.write_cache_to_disk()
def record_reviews(cache, reviews, phab):
most_recent_info = None
oldest_info = None
for reviewInfo in reviews["data"]:
if reviewInfo["type"] != "DREV":
continue
id = reviewInfo["id"]
# phid = reviewInfo["phid"]
dateModified = int(reviewInfo["fields"]["dateModified"])
dateCreated = int(reviewInfo["fields"]["dateCreated"])
title = reviewInfo["fields"]["title"]
author = reviewInfo["fields"]["authorPHID"]
phabReview = cache.get(id)
if (
"dateModified" not in phabReview.__dict__
or dateModified > phabReview.dateModified
):
diff_results = phab.differential.querydiffs(revisionIDs=[id])
diff_ids = sorted(diff_results.keys())
phabDiffs = []
for diff_id in diff_ids:
diffInfo = diff_results[diff_id]
d = PhabDiff(diff_id)
d.update(diffInfo)
phabDiffs.append(d)
phabReview.update(title, dateCreated, dateModified, author)
phabReview.setPhabDiffs(phabDiffs)
print(
"Updated D{0} modified on {1} ({2} diffs)".format(
id, datetime.fromtimestamp(dateModified), len(phabDiffs)
)
)
if most_recent_info is None:
most_recent_info = dateModified
elif most_recent_info < dateModified:
most_recent_info = dateModified
if oldest_info is None:
oldest_info = dateModified
elif oldest_info > dateModified:
oldest_info = dateModified
return most_recent_info, oldest_info
def record_users(cache, users, phab):
most_recent_info = None
oldest_info = None
for info in users["data"]:
if info["type"] != "USER":
continue
id = info["id"]
phid = info["phid"]
dateModified = int(info["fields"]["dateModified"])
# dateCreated = int(info["fields"]["dateCreated"])
realName = info["fields"]["realName"]
phabUser = cache.get(id)
phabUser.update(phid, realName)
if most_recent_info is None:
most_recent_info = dateModified
elif most_recent_info < dateModified:
most_recent_info = dateModified
if oldest_info is None:
oldest_info = dateModified
elif oldest_info > dateModified:
oldest_info = dateModified
return most_recent_info, oldest_info
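# Each PHABCACHESINFO entry mirrors the parameters of update_cached_info():
# (cache, Conduit query path, result ordering, callback that records results
# into the cache, max nr of entries per fetch, max nr of days to cache).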
PHABCACHESINFO = (
(
reviews_cache,
("differential", "revision", "search"),
"updated",
record_reviews,
5,
7,
),
(users_cache, ("user", "search"), "newest", record_users, 100, 1000),
)
def load_cache():
for cache, phab_query, order, record_results, _, _ in PHABCACHESINFO:
cache.populate_cache_from_disk()
print(
"Loaded {0} nr entries: {1}".format(
cache.get_name(), len(cache.get_ids_in_cache())
)
)
print(
"Loaded {0} has most recent info: {1}".format(
cache.get_name(),
datetime.fromtimestamp(cache.most_recent_info)
if cache.most_recent_info is not None
else None,
)
)
def update_cache(phab):
load_cache()
for (
cache,
phab_query,
order,
record_results,
max_nr_entries_per_fetch,
max_nr_days_to_cache,
) in PHABCACHESINFO:
update_cached_info(
phab,
cache,
phab_query,
order,
record_results,
max_nr_entries_per_fetch,
max_nr_days_to_cache,
)
ids_in_cache = cache.get_ids_in_cache()
print("{0} objects in {1}".format(len(ids_in_cache), cache.get_name()))
cache.write_cache_to_disk()
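# Note: "days" is counted back from the most recently modified review found in
# the cache, not from the current wall-clock time.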
def get_most_recent_reviews(days):
newest_reviews = sorted(reviews_cache.get_objects(), key=lambda r: -r.dateModified)
if len(newest_reviews) == 0:
return newest_reviews
most_recent_review_time = datetime.fromtimestamp(newest_reviews[0].dateModified)
cut_off_date = most_recent_review_time - timedelta(days=days)
result = []
for review in newest_reviews:
if datetime.fromtimestamp(review.dateModified) < cut_off_date:
return result
result.append(review)
return result
# All of the above code is about fetching data from Phabricator and caching it
# on local disk. The below code contains the actual "business logic" for this
# script.
_userphid2realname = None
def get_real_name_from_author(user_phid):
global _userphid2realname
if _userphid2realname is None:
_userphid2realname = {}
for user in users_cache.get_objects():
_userphid2realname[user.phid] = user.realName
return _userphid2realname.get(user_phid, "unknown")
def print_most_recent_reviews(phab, days, filter_reviewers):
msgs = []
def add_msg(msg):
msgs.append(msg)
        print(msg)
newest_reviews = get_most_recent_reviews(days)
add_msg(
"These are the reviews that look interesting to be reviewed. "
+ "The report below has 2 sections. The first "
+ "section is organized per review; the second section is organized "
+ "per potential reviewer.\n"
)
oldest_review = newest_reviews[-1] if len(newest_reviews) > 0 else None
oldest_datetime = (
datetime.fromtimestamp(oldest_review.dateModified) if oldest_review else None
)
add_msg(
(
"The report below is based on analyzing the reviews that got "
+ "touched in the past {0} days (since {1}). "
+ "The script found {2} such reviews.\n"
).format(days, oldest_datetime, len(newest_reviews))
)
reviewer2reviews_and_scores = {}
for i, review in enumerate(newest_reviews):
matched_reviewers = find_reviewers_for_review(review)
matched_reviewers = filter_reviewers(matched_reviewers)
if len(matched_reviewers) == 0:
continue
add_msg(
(
"{0:>3}. https://reviews.llvm.org/D{1} by {2}\n {3}\n"
+ " Last updated on {4}"
).format(
i,
review.id,
get_real_name_from_author(review.author),
review.title,
datetime.fromtimestamp(review.dateModified),
)
)
for reviewer, scores in matched_reviewers:
add_msg(
" potential reviewer {0}, score {1}".format(
reviewer,
"(" + "/".join(["{0:.1f}%".format(s) for s in scores]) + ")",
)
)
if reviewer not in reviewer2reviews_and_scores:
reviewer2reviews_and_scores[reviewer] = []
reviewer2reviews_and_scores[reviewer].append((review, scores))
# Print out a summary per reviewer.
for reviewer in sorted(reviewer2reviews_and_scores.keys()):
reviews_and_scores = reviewer2reviews_and_scores[reviewer]
reviews_and_scores.sort(key=lambda rs: rs[1], reverse=True)
add_msg(
"\n\nSUMMARY FOR {0} (found {1} reviews):".format(
reviewer, len(reviews_and_scores)
)
)
for review, scores in reviews_and_scores:
add_msg(
"[{0}] https://reviews.llvm.org/D{1} '{2}' by {3}".format(
"/".join(["{0:.1f}%".format(s) for s in scores]),
review.id,
review.title,
get_real_name_from_author(review.author),
)
)
return "\n".join(msgs)
def get_git_cmd_output(cmd):
output = None
try:
logging.debug(cmd)
output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
logging.debug(str(e))
if output is None:
return None
return output.decode("utf-8", errors="ignore")
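# `git blame --line-porcelain` prints a block of header lines for every blamed
# line; one of those header lines has the form
#   author-mail <address@example.org>
# The regex below extracts the address between the angle brackets.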
reAuthorMail = re.compile("^author-mail <([^>]*)>.*$")
def parse_blame_output_line_porcelain(blame_output_lines):
email2nr_occurences = {}
if blame_output_lines is None:
return email2nr_occurences
for line in blame_output_lines:
m = reAuthorMail.match(line)
if m:
author_email_address = m.group(1)
if author_email_address not in email2nr_occurences:
email2nr_occurences[author_email_address] = 1
else:
email2nr_occurences[author_email_address] += 1
return email2nr_occurences
class BlameOutputCache:
def __init__(self):
self.cache = {}
def _populate_cache_for(self, cache_key):
assert cache_key not in self.cache
git_repo, base_revision, path = cache_key
cmd = (
"git -C {0} blame --encoding=utf-8 --date iso -f -e -w "
+ "--line-porcelain {1} -- {2}"
).format(git_repo, base_revision, path)
blame_output = get_git_cmd_output(cmd)
self.cache[cache_key] = (
blame_output.split("\n") if blame_output is not None else None
)
# FIXME: the blame cache could probably be made more effective still if
# instead of storing the requested base_revision in the cache, the last
# revision before the base revision this file/path got changed in gets
# stored. That way multiple project revisions for which this specific
    # file/path hasn't changed would get cache hits (instead of misses in
# the current implementation).
def get_blame_output_for(
self, git_repo, base_revision, path, start_line=-1, end_line=-1
):
cache_key = (git_repo, base_revision, path)
if cache_key not in self.cache:
self._populate_cache_for(cache_key)
assert cache_key in self.cache
all_blame_lines = self.cache[cache_key]
if all_blame_lines is None:
return None
if start_line == -1 and end_line == -1:
return all_blame_lines
assert start_line >= 0
assert end_line >= 0
assert end_line <= len(all_blame_lines)
assert start_line <= len(all_blame_lines)
assert start_line <= end_line
return all_blame_lines[start_line:end_line]
def get_parsed_git_blame_for(
self, git_repo, base_revision, path, start_line=-1, end_line=-1
):
return parse_blame_output_line_porcelain(
self.get_blame_output_for(
git_repo, base_revision, path, start_line, end_line
)
)
blameOutputCache = BlameOutputCache()
def find_reviewers_for_diff_heuristic(diff):
# Heuristic 1: assume good reviewers are the ones that touched the same
# lines before as this patch is touching.
# Heuristic 2: assume good reviewers are the ones that touched the same
# files before as this patch is touching.
reviewers2nr_lines_touched = {}
reviewers2nr_files_touched = {}
# Assume last revision before diff was modified is the revision the diff
# applies to.
assert len(GIT_REPO_METADATA) == 1
git_repo = os.path.join("git_repos", GIT_REPO_METADATA[0][0])
cmd = 'git -C {0} rev-list -n 1 --before="{1}" main'.format(
git_repo,
        datetime.fromtimestamp(diff.dateModified).strftime("%Y-%m-%d %H:%M:%S"),
)
base_revision = get_git_cmd_output(cmd).strip()
logging.debug("Base revision={0}".format(base_revision))
for change in diff.changes:
path = change.oldPath
# Compute heuristic 1: look at context of patch lines.
for hunk in change.hunks:
for start_line, end_line in hunk.actual_lines_changed_offset:
# Collect git blame results for authors in those ranges.
for (
reviewer,
nr_occurences,
) in blameOutputCache.get_parsed_git_blame_for(
git_repo, base_revision, path, start_line, end_line
).items():
if reviewer not in reviewers2nr_lines_touched:
reviewers2nr_lines_touched[reviewer] = 0
reviewers2nr_lines_touched[reviewer] += nr_occurences
# Compute heuristic 2: don't look at context, just at files touched.
        # Collect git blame results for all authors of the whole file.
for reviewer, nr_occurences in blameOutputCache.get_parsed_git_blame_for(
git_repo, base_revision, path
).items():
if reviewer not in reviewers2nr_files_touched:
reviewers2nr_files_touched[reviewer] = 0
reviewers2nr_files_touched[reviewer] += 1
# Compute "match scores"
total_nr_lines = sum(reviewers2nr_lines_touched.values())
total_nr_files = len(diff.changes)
reviewers_matchscores = [
(
reviewer,
(
reviewers2nr_lines_touched.get(reviewer, 0) * 100.0 / total_nr_lines
if total_nr_lines != 0
else 0,
reviewers2nr_files_touched[reviewer] * 100.0 / total_nr_files
if total_nr_files != 0
else 0,
),
)
        for reviewer, nr_files in reviewers2nr_files_touched.items()
]
reviewers_matchscores.sort(key=lambda i: i[1], reverse=True)
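    # The result is a list of (author e-mail, (line score, file score)) tuples;
    # both scores are percentages and the list is sorted so that the
    # highest-scoring potential reviewers come first.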
return reviewers_matchscores
def find_reviewers_for_review(review):
# Process the newest diff first.
diffs = sorted(review.phabDiffs, key=lambda d: d.dateModified, reverse=True)
if len(diffs) == 0:
        return []
diff = diffs[0]
matched_reviewers = find_reviewers_for_diff_heuristic(diff)
# Show progress, as this is a slow operation:
sys.stdout.write(".")
sys.stdout.flush()
logging.debug("matched_reviewers: {0}".format(matched_reviewers))
return matched_reviewers
def update_git_repos():
git_repos_directory = "git_repos"
for name, url in GIT_REPO_METADATA:
dirname = os.path.join(git_repos_directory, name)
if not os.path.exists(dirname):
cmd = "git clone {0} {1}".format(url, dirname)
output = get_git_cmd_output(cmd)
cmd = "git -C {0} pull --rebase".format(dirname)
output = get_git_cmd_output(cmd)
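# Note: smtplib.SMTP() followed by connect() with no arguments talks to a local
# SMTP server on localhost:25, so a local MTA (or a change here) is needed for
# the e-mail report to actually go out.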
def send_emails(email_addresses, sender, msg):
s = smtplib.SMTP()
s.connect()
for email_address in email_addresses:
email_msg = email.mime.multipart.MIMEMultipart()
email_msg["From"] = sender
email_msg["To"] = email_address
email_msg["Subject"] = "LLVM patches you may be able to review."
        email_msg.attach(email.mime.text.MIMEText(msg, "plain", "utf-8"))
# python 3.x: s.send_message(email_msg)
s.sendmail(email_msg["From"], email_msg["To"], email_msg.as_string())
s.quit()
def filter_reviewers_to_report_for(people_to_look_for):
# The below is just an example filter, to only report potential reviews
# to do for the people that will receive the report email.
return lambda potential_reviewers: [
r for r in potential_reviewers if r[0] in people_to_look_for
]
def main():
parser = argparse.ArgumentParser(
description="Match open reviews to potential reviewers."
)
parser.add_argument(
"--no-update-cache",
dest="update_cache",
action="store_false",
default=True,
help="Do not update cached Phabricator objects",
)
parser.add_argument(
"--email-report",
dest="email_report",
nargs="*",
default="",
help="A email addresses to send the report to.",
)
parser.add_argument(
"--sender",
dest="sender",
default="",
help="The email address to use in 'From' on messages emailed out.",
)
parser.add_argument(
"--email-addresses",
dest="email_addresses",
nargs="*",
help="The email addresses (as known by LLVM git) of "
+ "the people to look for reviews for.",
)
parser.add_argument("--verbose", "-v", action="count")
args = parser.parse_args()
if args.verbose >= 1:
logging.basicConfig(level=logging.DEBUG)
    people_to_look_for = list(args.email_addresses or [])
logging.debug(
"Will look for reviews that following contributors could "
+ "review: {}".format(people_to_look_for)
)
logging.debug("Will email a report to: {}".format(args.email_report))
phab = init_phab_connection()
if args.update_cache:
update_cache(phab)
load_cache()
update_git_repos()
msg = print_most_recent_reviews(
phab,
days=1,
filter_reviewers=filter_reviewers_to_report_for(people_to_look_for),
)
if args.email_report != []:
send_emails(args.email_report, args.sender, msg)
if __name__ == "__main__":
main()