From 3d3b3eca483f4c730e0a1fa3d2017921569273f7 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Fri, 19 Jul 2024 09:33:48 -0400 Subject: [PATCH 1/2] Add report of Pbench Agent version statistics Add a new `--agent` option and generator to report on the Pbench Agent versions present on the server. By default we report on the "main" versions, like "0.50" and "0.73"; by adding `--detail` it'll report the much longer list of full agent versions ("v0.73-2g6da0cfc8d") as well as "nonsense" version metadata like "system". This also recognizes the `--since` and `--until` options to report on agent versions that appear within a specific time range. The report is sorted by the last date a given version was seen, which makes it a bit easier to see that only 0.73 has been used since March 08 2024. ```console Dataset statistics by Pbench Agent version: Count Version First Last ---------- ---------------------- ------------ ------------ 1 '0.37' Apr 13 2012 Apr 13 2012 68 '0.44' Jan 04 2018 Feb 06 2018 84 '0.46' Jan 03 2018 Mar 09 2018 1,341 '0.47' Jan 02 2018 Apr 03 2018 2,197 '0.49' Mar 21 2018 Aug 04 2018 1,388 '0.48' Feb 06 2018 Aug 14 2018 171 '0.51' Aug 10 2018 Aug 31 2018 4,962 '0.50' May 11 2018 Sep 25 2018 494 '0.52' Aug 24 2018 Jan 02 2019 1,942 '0.53' Sep 13 2018 May 29 2019 898 '0.58' Apr 08 2019 May 30 2019 246 '0.55' Jan 28 2019 Jun 06 2019 1,205 '0.54' Nov 27 2018 Jul 01 2019 1 '0.61' Jul 08 2019 Jul 08 2019 532 '0.57' Mar 15 2019 Aug 28 2019 382 '0.62' Jul 17 2019 Sep 10 2019 1,426 '0.56' Feb 11 2019 Oct 16 2019 1,067 '0.59' Apr 30 2019 Nov 12 2019 1,454 '0.63' Jul 31 2019 Dec 18 2019 2,151 '0.65' Sep 27 2019 Feb 21 2020 1,342 '0.64' Aug 27 2019 Mar 26 2020 1,587 '0.60' May 25 2019 May 22 2020 5,255 '0.66' Nov 07 2019 Jul 10 2020 4,596 '0.67' Jan 16 2020 Nov 30 2020 33 '0.70' Nov 18 2020 Jan 12 2021 7,427 '0.68' Apr 01 2020 Apr 27 2021 54,179 '0.69' Jun 25 2020 Mar 08 2023 44,870 '0.71' Oct 17 2020 Feb 28 7,073 '0.72' Jun 24 2022 Mar 08 3,977 '0.73' Aug 14 2023 
today ``` I won't capture the full list here (it's much longer), but the "nonsense" version report is currently: ```console Datasets with nonsensical version metadata: Count Version First Last ---------- ---------------------- ------------ ------------ 37 'system' Mar 30 2019 Apr 01 2019 54 'plugins:' Jan 26 2018 Apr 27 2021 5 '' Oct 02 2018 Dec 20 2021 3 'v(unknown)-g(unknown)' Dec 14 2020 Sep 30 2022 ``` --- lib/pbench/cli/server/report.py | 111 ++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/lib/pbench/cli/server/report.py b/lib/pbench/cli/server/report.py index ea80a87dc8..4482d55fda 100644 --- a/lib/pbench/cli/server/report.py +++ b/lib/pbench/cli/server/report.py @@ -1,4 +1,5 @@ from collections import defaultdict +from dataclasses import dataclass import datetime from operator import and_ from pathlib import Path @@ -514,6 +515,108 @@ def report_uploads(options: dict[str, Any]): summarize_dates(rows, options) +def report_agent(options: dict[str, Any]): + """Report dataset statistics by agent version""" + + v_pattern = re.compile(r"v?(?P<major>\d+\.\d+)(?:\.\d+)?(?:-\w+)") + + @dataclass + class Daterange: + first: Optional[datetime.datetime] = None + last: Optional[datetime.datetime] = None + + def add(self, date: datetime.datetime): + if self.first is None or date < self.first: + self.first = date + if self.last is None or date > self.last: + self.last = date + + def print_versions(target: dict[str, Daterange], counts: dict[str, int]): + click.echo(f" {'Count':^10s} {'Version':^22s} {'First':^12s} {'Last':^12s}") + click.echo(f" {'':-<10} {'':-<22} {'':-<12} {'':-<12}") + for version, dates in sorted(target.items(), key=lambda k: k[1].last): + count = counts[version] + first = humanize.naturaldate(dates.first) + last = humanize.naturaldate(dates.last) + click.echo(f" {count:>10,d} {version!r:^22s} {first:>12s} {last:>12s}") + + watcher.update("analyzing version patterns") + since = options.get("since") + until = options.get("until") 
+ + if since and until and since > until: + raise Exception("The --until value must be later than the --since value") + + rows = Database.db_session.query( + cast(Metadata.value["pbench", "date"].as_string(), TZDateTime).label("date"), + Metadata.value["pbench", "rpm-version"].as_string().label("version"), + ).filter(Metadata.key == "metalog") + + count = 0 + dateless = 0 + versionless = 0 + + versions = defaultdict(int) + majorversions = defaultdict(int) + nonversions = defaultdict(int) + range = defaultdict(Daterange) + majorrange = defaultdict(Daterange) + nonversionrange = defaultdict(Daterange) + + filters = [] + + # Create a subquery from our basic select parameters so that we can use + # the label (SQL "AS date") in our WHERE filter clauses. (In a direct query + # PostgreSQL doesn't allow filtering on renamed columns.) + subquery = rows.subquery() + query = Database.db_session.query(subquery.c.date, subquery.c.version).order_by( + subquery.c.date + ) + + if since: + verifier.status(f"Filter since {since}") + filters.append(subquery.c.date >= since) + if until: + verifier.status(f"Filter until {until}") + filters.append(subquery.c.date <= until) + if filters: + query = query.filter(*filters) + rows = query.execution_options(stream_results=True).yield_per(SQL_CHUNK) + + for row in rows: + count += 1 + date: datetime.datetime = row[0] + version = row[1] + if not isinstance(version, str): + versionless += 1 + continue + if not isinstance(date, datetime.datetime): + dateless += 1 + date = datetime.datetime.fromtimestamp(0.0) + m = v_pattern.match(version) + if m: + maj = m.group("major") + versions[version] += 1 + majorversions[maj] += 1 + range[version].add(date) + majorrange[maj].add(date) + else: + nonversions[version] += 1 + nonversionrange[version].add(date) + + click.echo("Dataset statistics by Pbench Agent version:") + print_versions(majorrange, majorversions) + if options.get("detail"): + click.echo("Dataset statistics by full Pbench Agent version:") + 
print_versions(range, versions) + click.echo("Datasets with nonsensical version metadata:") + print_versions(nonversionrange, nonversions) + if dateless: + click.echo(f"{dateless:,d} datasets lack a date") + if versionless: + click.echo(f"{versionless:,d} datasets lack a Pbench Agent version") + + def report_audit(): """Report audit log statistics.""" @@ -693,6 +796,12 @@ def report_states(): @click.command(name="pbench-report-generator") @pass_cli_context +@click.option( + "--agent", + default=False, + is_flag=True, + help="Display Pbench Agent version statistics", +) @click.option("--all", "-a", default=False, is_flag=True, help="Display full report") @click.option( "--archive", "-A", default=False, is_flag=True, help="Display archive statistics" @@ -790,6 +899,8 @@ def report(context: object, **kwargs): else: click.echo(f"Unexpected statistics option {stats}", err=True) rv = 1 + if kwargs.get("all") or kwargs.get("agent"): + report_agent(kwargs) if kwargs.get("all") or kwargs.get("audit"): report_audit() if kwargs.get("all") or kwargs.get("sql"): From 2d194710af2e180b0a885c4ee604841fcbe750a4 Mon Sep 17 00:00:00 2001 From: David Butenhof Date: Mon, 22 Jul 2024 11:32:00 -0400 Subject: [PATCH 2/2] Tweaks --- lib/pbench/cli/server/report.py | 55 ++++++++++++++++++++------------- server/requirements.txt | 1 + 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/lib/pbench/cli/server/report.py b/lib/pbench/cli/server/report.py index 4482d55fda..72856bd899 100644 --- a/lib/pbench/cli/server/report.py +++ b/lib/pbench/cli/server/report.py @@ -385,9 +385,6 @@ def summarize_dates(base_query: Query, options: dict[str, Any]): since = options.get("since") until = options.get("until") - if since and until and since > until: - raise Exception("The --until value must be later than the --since value") - by_year = defaultdict(int) by_month = defaultdict(int) by_day = defaultdict(int) @@ -518,7 +515,7 @@ def report_uploads(options: dict[str, Any]): def 
report_agent(options: dict[str, Any]): """Report dataset statistics by agent version""" - v_pattern = re.compile(r"v?(?P<major>\d+\.\d+)(?:\.\d+)?(?:-\w+)") + v_pattern = re.compile(r"(?P<major>\d+\.\d+)(?:\.\d+)?(?:-\w+)") @dataclass class Daterange: @@ -531,26 +528,40 @@ def add(self, date: datetime.datetime): if self.last is None or date > self.last: self.last = date - def print_versions(target: dict[str, Daterange], counts: dict[str, int]): - click.echo(f" {'Count':^10s} {'Version':^22s} {'First':^12s} {'Last':^12s}") - click.echo(f" {'':-<10} {'':-<22} {'':-<12} {'':-<12}") + def print_versions( + target: dict[str, Daterange], counts: dict[str, int], quote: bool = False + ): + cw = 10 + vw = 23 + dw = 11 + click.echo( + f" {'Count':^{cw}s} {'Version':^{vw}s} {'First':^{dw}s} {'Last':^{dw}s}" + ) + click.echo(f" {'':-<{cw}} {'':-<{vw}} {'':-<{dw}} {'':-<{dw}}") for version, dates in sorted(target.items(), key=lambda k: k[1].last): count = counts[version] first = humanize.naturaldate(dates.first) last = humanize.naturaldate(dates.last) - click.echo(f" {count:>10,d} {version!r:^22s} {first:>12s} {last:>12s}") + v = "'" + version + "'" if quote else version + click.echo(f" {count:>{cw},d} {v:^{vw}s} {first:>{dw}s} {last:>{dw}s}") watcher.update("analyzing version patterns") since = options.get("since") until = options.get("until") - if since and until and since > until: - raise Exception("The --until value must be later than the --since value") - - rows = Database.db_session.query( - cast(Metadata.value["pbench", "date"].as_string(), TZDateTime).label("date"), - Metadata.value["pbench", "rpm-version"].as_string().label("version"), - ).filter(Metadata.key == "metalog") + # Create a subquery from our basic select parameters so that we can use + # the label (SQL "AS date") in our WHERE filter clauses. (In a direct query + # PostgreSQL doesn't allow filtering on renamed columns.) 
+ subquery = ( + Database.db_session.query( + cast(Metadata.value["pbench", "date"].as_string(), TZDateTime).label( + "date" + ), + Metadata.value["pbench", "rpm-version"].as_string().label("version"), + ) + .filter(Metadata.key == "metalog") + .subquery() + ) count = 0 dateless = 0 @@ -565,10 +576,6 @@ def print_versions(target: dict[str, Daterange], counts: dict[str, int]): filters = [] - # Create a subquery from our basic select parameters so that we can use - # the label (SQL "AS date") in our WHERE filter clauses. (In a direct query - # PostgreSQL doesn't allow filtering on renamed columns.) - subquery = rows.subquery() query = Database.db_session.query(subquery.c.date, subquery.c.version).order_by( subquery.c.date ) @@ -593,7 +600,7 @@ def print_versions(target: dict[str, Daterange], counts: dict[str, int]): if not isinstance(date, datetime.datetime): dateless += 1 date = datetime.datetime.fromtimestamp(0.0) - m = v_pattern.match(version) + m = v_pattern.search(version) if m: maj = m.group("major") versions[version] += 1 @@ -610,7 +617,7 @@ def print_versions(target: dict[str, Daterange], counts: dict[str, int]): click.echo("Dataset statistics by full Pbench Agent version:") print_versions(range, versions) click.echo("Datasets with nonsensical version metadata:") - print_versions(nonversionrange, nonversions) + print_versions(nonversionrange, nonversions, quote=True) if dateless: click.echo(f"{dateless:,d} datasets lack a date") if versionless: @@ -875,6 +882,12 @@ def report(context: object, **kwargs): rv = 0 try: + + since = kwargs.get("since") + until = kwargs.get("until") + if since and until and since > until: + raise Exception("The --until value must be later than the --since value") + config = config_setup(context) logger = get_pbench_logger("pbench-report-generator", config) cache_m = CacheManager(config, logger) diff --git a/server/requirements.txt b/server/requirements.txt index 837ade3eff..8673bc0d28 100644 --- a/server/requirements.txt +++ 
b/server/requirements.txt @@ -13,6 +13,7 @@ flask-sqlalchemy gunicorn humanfriendly humanize +numpy<2.0 # Indirect: elasticsearch pquisby psycopg2 pyesbulk>=2.0.1