diff --git a/.gitignore b/.gitignore
index 0baf01522..99a7d043b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 /_site
+*.env
+*.tmp
diff --git a/data/rust-lang/rust-clippy/pr-author-latency.csv b/data/rust-lang/rust-clippy/pr-author-latency.csv
new file mode 100644
index 000000000..b5b3623ea
--- /dev/null
+++ b/data/rust-lang/rust-clippy/pr-author-latency.csv
@@ -0,0 +1,2 @@
+is:open draft:false -label:S-blocked label:S-waiting-on-author __status_updated:{{param|relative_date}},>1|today,3..1|last 3 days,7..4|last week,14..8|last 2 weeks,30..15|last month,90..31|last 3 months,180..91|last 6 months,<180|more than 6 months
+2023-01-30,0,0,0,1,0,2,3,0
diff --git a/data/rust-lang/rust-clippy/pr-review-latency.csv b/data/rust-lang/rust-clippy/pr-review-latency.csv
new file mode 100644
index 000000000..d23b4d970
--- /dev/null
+++ b/data/rust-lang/rust-clippy/pr-review-latency.csv
@@ -0,0 +1,2 @@
+is:open draft:false -label:S-blocked label:S-waiting-on-review __status_updated:{{param|relative_date}},>1|today,3..1|last 3 days,7..4|last week,14..8|last 2 weeks,30..15|last month,90..31|last 3 months,180..91|last 6 months,<180|more than 6 months
+2023-01-30,0,5,4,0,7,7,5,7
diff --git a/data/rust-lang/rust/pr-author-latency.csv b/data/rust-lang/rust/pr-author-latency.csv
new file mode 100644
index 000000000..a8968ddaa
--- /dev/null
+++ b/data/rust-lang/rust/pr-author-latency.csv
@@ -0,0 +1,2 @@
+is:open draft:false -label:S-blocked label:S-waiting-on-author __status_updated:{{param|relative_date}},>1|today,3..1|last 3 days,7..4|last week,14..8|last 2 weeks,30..15|last month,90..31|last 3 months,180..91|last 6 months,<180|more than 6 months
+2023-01-30,1,12,7,15,24,29,12,0
diff --git a/data/rust-lang/rust/pr-review-latency.csv b/data/rust-lang/rust/pr-review-latency.csv
new file mode 100644
index 000000000..300bc9198
--- /dev/null
+++ b/data/rust-lang/rust/pr-review-latency.csv
@@ -0,0 +1,2 @@
+is:open draft:false -label:S-blocked label:S-waiting-on-review __status_updated:{{param|relative_date}},>1|today,3..1|last 3 days,7..4|last week,14..8|last 2 weeks,30..15|last month,90..31|last 3 months,180..91|last 6 months,<180|more than 6 months
+2023-01-30,1,24,18,35,46,49,27,0
diff --git a/index.md b/index.md
index 276b6fef6..d1a93e274 100644
--- a/index.md
+++ b/index.md
@@ -6,5 +6,7 @@ graphs:
   pr-activity: Last activity on pull requests
   pr-age: Pull requests creation dates
   pr-merged: Pull requests merged
+  pr-review-latency: Duration waiting on review
+  pr-author-latency: Duration waiting on author
 layout: graphs
 ---
diff --git a/rust-clippy.md b/rust-clippy.md
index 7ce43f14a..19c6aa9ca 100644
--- a/rust-clippy.md
+++ b/rust-clippy.md
@@ -5,5 +5,7 @@ graphs:
   pr-status: Pull requests status
   pr-activity: Last activity on pull requests
   pr-age: Pull requests creation dates
+  pr-review-latency: Duration waiting on review
+  pr-author-latency: Duration waiting on author
 layout: graphs
 ---
diff --git a/updater.py b/updater.py
index 90828f7e7..424054d08 100755
--- a/updater.py
+++ b/updater.py
@@ -19,26 +19,212 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+# pyright: strict
+
 import csv
-import datetime
-import json
+from datetime import datetime, timedelta, timezone
 import os
-import subprocess
 import sys
 import time
-import jinja2
+from typing import Callable
+from urllib.parse import urlparse, parse_qs
 
+import jinja2
 import requests
 
 API_URL = "https://api.github.com/search/issues"
 
+
+# Various insignificant comments from triage, merge conflicts, etc.
+# count as "updates" on GitHub, so `updated:` isn't an ideal way to
+# gauge the last activity on a PR.
+#
+# Instead, use the Issue Events API to find when the
+# status label (`S-*`) was most recently changed.
+def get_pr_status_updated(http_session: requests.Session, repo: str, events_url: str, pr_number: int) -> str | None:
+    """Get the timestamp of the last status label change for the given PR"""
+
+    page = 1
+
+    while True:
+        print(f"Fetching events for {repo}#{pr_number}")
+        res = http_session.get(events_url, params={"per_page": 100, "page": page})
+
+        # Properly handle rate limits
+        if res.status_code == 403:
+            wait = float(res.headers["X-RateLimit-Reset"]) - time.time() + 1
+            print("Rate limit reached, waiting %s seconds..." % int(wait), flush=True)
+            time.sleep(wait)
+            continue
+
+        # Make sure we got the last page
+        #
+        # In most cases, the `per_page` of 100 should avoid
+        # needing to issue another request.
+        last = res.links.get("last")
+        if last is not None and "url" in last:
+            parsed = urlparse(last["url"])
+            parsed_query = parse_qs(parsed.query)
+            last_page = int(parsed_query["page"][0])
+
+            if last_page > page:
+                page = last_page
+                continue
+
+        data = res.json()
+        if "errors" in data:
+            for error in data["errors"]:
+                print("Error while fetching events for '%s': %s" % (f"{repo}#{pr_number}", error["message"]))
+            exit(1)
+        else:
+            break
+
+    # Process events
+    data = list(data)  # data is a list of events
+
+    # Find the last 'labeled' event with a label name matching 'S-*'.
+    # Note that only the final page of events is scanned here; the status
+    # label is assumed to have changed within the most recent 100 events.
+    for i, event in enumerate(reversed(data)):
+        event_index = len(data) - 1 - i  # index of `event`, since we enumerate reversed
+
+        if event["event"] == "labeled":
+            label = str(event["label"]["name"])
+            if label.startswith("S-"):
+                # Continue iterating backwards to see if this label was the last one removed
+                found_prev_event = False
+                for prev_event in reversed(data[:event_index]):
+                    if prev_event["event"] == "unlabeled":
+                        prev_label = str(prev_event["label"]["name"])
+                        if prev_label == label:
+                            found_prev_event = True
+                            break
+                        elif prev_label.startswith("S-"):
+                            break
+
+                if found_prev_event:
+                    # Same label was just removed and added back, so keep searching
+                    continue
+                else:
+                    return event["created_at"]
+
+    return None
+
+
+# Convert ">1" to `lambda x: x < 1`
+# Convert "7..4" to `lambda x: 4 <= x <= 7`
+# Convert "<30" to `lambda x: x > 30`
+#
+# The comparisons look inverted because the values are relative dates:
+# a larger number of days means an older update.
+def relative_date_to_range(param: str) -> Callable[[int], bool]:
+    """Get a lambda that returns whether or not the argument
+    is within the relative date range"""
+
+    # Strip the pretty label, e.g. "14..8|last 2 weeks" -> "14..8"
+    if "|" in param:
+        value = param.split("|")[0]
+    else:
+        value = param
+
+    # Support date ranges
+    if ".." in value:
+        end, start = value.split("..", 1)
+        end, start = int(end), int(start)
+
+        return lambda x: start <= x <= end
+    else:
+        # Check the two-character operators before their one-character
+        # prefixes, otherwise ">=" and "<=" would never match
+        if value.startswith(">="):
+            value = value[len(">="):]
+            value = int(value)
+            return lambda x: x <= value
+        if value.startswith(">"):
+            value = value[len(">"):]
+            value = int(value)
+            return lambda x: x < value
+        if value.startswith("<="):
+            value = value[len("<="):]
+            value = int(value)
+            return lambda x: x >= value
+        if value.startswith("<"):
+            value = value[len("<"):]
+            value = int(value)
+            return lambda x: x > value
+
+        value = int(value)
+        return lambda x: x == value
+
+
+def status_updated(http_session: requests.Session, now: datetime, repo: str, query: str, params: list[str]):
+    """Fetch the time the status was last updated for each PR,
+    and compile into a histogram with bins defined by params"""
+
+    # `__status_updated:` must come last in the query
+    query = query.split("__status_updated:{{param|relative_date}}", 1)[0]
+    query = f"is:pr repo:{repo} {query}".strip()
+
+    bin_ranges = [relative_date_to_range(param) for param in params]
+    bins = [0 for _ in params]
+
+    # Iterate through the list of all matching PRs
+    page = 1
+    last_page = 1
+
+    while page <= last_page:
+        print(f"Querying {query}", flush=True)
+        res = http_session.get(API_URL, params={"q": query, "per_page": 100, "page": page})
+
+        # Properly handle rate limits
+        if res.status_code == 403:
+            wait = float(res.headers["X-RateLimit-Reset"]) - time.time() + 1
+            print("Rate limit reached, waiting %s seconds..." % int(wait), flush=True)
+            time.sleep(wait)
+            continue
+
+        data = res.json()
+        if "errors" in data:
+            for error in data["errors"]:
+                print("Error while searching for '%s': %s" % (query, error["message"]))
+            exit(1)
+
+        # Calculate the last page, rounding up (100 results per page)
+        last_page = (int(data["total_count"]) + 99) // 100
+
+        # Process each PR
+        for pr in data["items"]:
+            pr_number = int(pr["number"])
+            updated = get_pr_status_updated(http_session, repo, pr["events_url"], pr_number)
+
+            if updated is None:
+                updated = str(pr["updated_at"])
+                print(f"{repo}#{pr_number} status updated not found, using updated field instead: {updated}")
+            else:
+                print(f"{repo}#{pr_number} last updated at {updated}")
+
+            # `fromisoformat` can't parse a trailing "Z" before Python 3.11
+            updated = updated.replace("Z", "+00:00")
+
+            # Get the relative time period
+            diff = now.date() - datetime.fromisoformat(updated).date()
+
+            print(f"{repo}#{pr_number} not updated in {diff.days} days")
+
+            # Increment the bin that this diff fits in
+            for i, r in enumerate(bin_ranges):
+                if r(diff.days):
+                    param = params[i]
+                    print(f"{repo}#{pr_number} added to bin #{i} for '{param}'")
+                    bins[i] += 1
+                    break
+
+        page += 1
+
+    return bins
+
+
 # GitHub doesn't support relative dates on `created:` and `updated:`, so this
 # allows the CSV files to use `{{param|relative_date}}`
-def filter_relative_date(value):
-    def format_relative_date(date):
-        return str(datetime.date.today() - datetime.timedelta(days=int(date))) + "T00:00:00+00:00"
+def filter_relative_date(now: datetime, value: str):
+    def format_relative_date(date: str):
+        return str(now.date() - timedelta(days=int(date))) + "T00:00:00+00:00"
 
     # Support date ranges
     if ".." in value:
@@ -60,7 +246,7 @@ def format_relative_date(date):
     return cmp+format_relative_date(value)
 
 
-def get_issues_count(http_session, repo, jinja_env, query, param):
+def get_issues_count(http_session: requests.Session, repo: str, jinja_env: jinja2.Environment, query: str, param: str):
     """Get the number of issues with the provided label"""
     # Strip pretty labels from the query
     if "|" in param:
@@ -79,7 +265,7 @@ def get_issues_count(http_session, repo, jinja_env, query, param):
     # Properly handle rate limits
     if res.status_code == 403:
         wait = float(res.headers["X-RateLimit-Reset"]) - time.time() + 1
-        print("Rate limit reached, waiting %s seconds..." % int(wait))
+        print("Rate limit reached, waiting %s seconds..." % int(wait), flush=True)
         time.sleep(wait)
         continue
@@ -93,9 +279,10 @@ def get_issues_count(http_session, repo, jinja_env, query, param):
     return data["total_count"]
 
 
-def update_csv_file(http_session, repo, path):
+def update_csv_file(http_session: requests.Session, repo: str, path: str):
     """Add today's records to the provided csv file"""
-    today = str(datetime.date.today())
+    now = datetime.now(tz=timezone.utc)
+    today = str(now.date())
 
     # Load the CSV file in memory
     with open(path) as f:
@@ -103,16 +290,22 @@
     # If today already has its own row don't add another one
     if len(content) == 1 or content[1][0] != today:
-        content.insert(1, None)
+        content.insert(1, [])
     content[1] = [today]
 
     # Setup the Jinja2 environment
     jinja_env = jinja2.Environment()
-    jinja_env.filters["relative_date"] = filter_relative_date
+    jinja_env.filters["relative_date"] = lambda value: filter_relative_date(now, str(value))  # type: ignore
 
     query = content[0][0]
-    for param in content[0][1:]:
-        content[1].append(str(get_issues_count(http_session, repo, jinja_env, query, param)))
+
+    # Custom query requiring custom logic
+    if "__status_updated:{{param|relative_date}}" in query:
+        for bin in status_updated(http_session, now, repo, query, content[0][1:]):
+            content[1].append(str(bin))
+    else:
+        for param in content[0][1:]:
+            content[1].append(str(get_issues_count(http_session, repo, jinja_env, query, param)))
 
     with open(path, "w") as f:
         writer = csv.writer(f, lineterminator="\n")
@@ -121,6 +314,8 @@
 if __name__ == "__main__":
     http_session = requests.Session()
+    http_session.headers["Accept"] = "application/vnd.github+json"
+    http_session.headers["X-GitHub-Api-Version"] = "2022-11-28"
 
     if "GITHUB_TOKEN" in os.environ:
         http_session.auth = ('x-token', os.environ["GITHUB_TOKEN"])
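
---

Reviewer note: a minimal standalone sketch (not part of the patch) showing how
the bin predicates built by relative_date_to_range() are expected to classify
a PR, using the header parameters from the new CSV files above. The expected
bin is an assumption based on the inverted relative-date semantics described
in the code comments; the file name sketch.py is hypothetical.

    # sketch.py -- run next to updater.py
    from updater import relative_date_to_range

    params = [
        ">1|today", "3..1|last 3 days", "7..4|last week", "14..8|last 2 weeks",
        "30..15|last month", "90..31|last 3 months", "180..91|last 6 months",
        "<180|more than 6 months",
    ]
    bins = [relative_date_to_range(p) for p in params]

    # A PR whose status label last changed 10 days ago should land in the
    # "last 2 weeks" bin, since 8 <= 10 <= 14.
    days_since_status_change = 10
    matched = next(p for p, r in zip(params, bins) if r(days_since_status_change))
    assert matched == "14..8|last 2 weeks"
    print(matched.split("|")[1])  # -> last 2 weeks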