Evaluate Model Responses #19

Merged 4 commits on Oct 8, 2024
14 changes: 14 additions & 0 deletions parley/management/__init__.py
@@ -0,0 +1,14 @@
# parley.management
# Management and admin utilities for the parley app.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Mon Oct 07 15:48:19 2024 -0500
#
# Copyright (C) 2024 Rotational Labs, Inc.
# For license information, see LICENSE
#
# ID: __init__.py [] [email protected] $

"""
Management and admin utilities for the parley app.
"""
14 changes: 14 additions & 0 deletions parley/management/commands/__init__.py
@@ -0,0 +1,14 @@
# parley.management.commands
# Management CLI commands for the parley app.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Mon Oct 07 15:48:19 2024 -0500
#
# Copyright (C) 2024 Rotational Labs, Inc.
# For license information, see LICENSE
#
# ID: __init__.py [] [email protected] $

"""
Management CLI commands for the parley app.
"""
176 changes: 176 additions & 0 deletions parley/management/commands/analyze.py
@@ -0,0 +1,176 @@
# parley.management.commands.analyze
# Runs the metrics analytics commands and caches them on the model evaluations.
#
# Author: Benjamin Bengfort <[email protected]>
# Created: Mon Oct 07 15:50:03 2024 -0500
#
# Copyright (C) 2024 Rotational Labs, Inc.
# For license information, see LICENSE
#
# ID: analyze.py [] [email protected] $

"""
Runs the metrics analytics commands and caches them on the model evaluations.
"""

##########################################################################
## Imports
##########################################################################

from parley.models import ModelEvaluation, Sensitive

from parley.tasks import cache_metrics, evaluate_label_correct
from parley.tasks import evaluate_valid_output_type, evaluate_leaks_sensitive
from parley.tasks import evaluate_cyberjudge_label_correct, extract_cyberjudge_label

from django.core.management.base import BaseCommand, CommandError


class Command(BaseCommand):

    help = "Run metrics analytics tasks and cache them on model evaluations"

    def add_arguments(self, parser):
        parser.add_argument(
            "-C", "--cyberjudge", action="store_true",
            help="run the cyberjudge evaluations before metrics",
        )
        parser.add_argument(
            "-L", "--labels", action="store_true",
            help="run the simple label correctness evaluation before metrics"
        )
        parser.add_argument(
            "-O", "--output-type", action="store_true",
            help="evaluate output type format before metrics",
        )
        parser.add_argument(
            "-S", "--sensitive", action="store_true",
            help="evaluate sensitive leaks before metrics"
        )
        parser.add_argument(
            "-A", "--all", action="store_true",
            help="run the evaluation metrics across all model evaluations",
        )
        parser.add_argument(
            "-y", "--yes", action="store_true",
            help="do not prompt for input on the command line",
        )
        parser.add_argument(
            "-f", "--filter", type=str, metavar="name",
            help="filter the model evaluations to run based on evaluation name"
        )
        parser.add_argument(
            "model_evaluations", nargs="*", metavar="uuid",
            help="specify the model evaluation(s) to run analytics for"
        )
        return super().add_arguments(parser)

    def handle(self, *args, **opts):
        # Make sure the user doesn't shoot their thumb off.
        self.validate_input(*args, **opts)

        # Lookup the model evaluations specified by the user in the database
        model_evaluations = self.get_queryset(**opts)
        n_model_evals = model_evaluations.count()
        if n_model_evals == 0:
            raise CommandError("no model evaluations found for criteria")

        print(f"found {n_model_evals} model evaluations for criteria:")
        for me in model_evaluations:
            print(f" - {str(me)}")

        # Check that the user wants to continue
        if not opts["yes"]:
            if not self.confirm("continue with analysis?"):
                self.stdout.write(
                    self.style.WARNING("canceled operation by user")
                )
                return

        # Prepare pre-checks
        self._sensitive = None
        prechecks = self.prechecks(**opts)
        if len(prechecks) > 0:
            print(f"performing {len(prechecks)} prechecks on responses")

        for me in model_evaluations:
            # Perform any required pre-checks
            if len(prechecks) > 0:
                for response in me.responses():
                    for check in prechecks:
                        check(response)

            # Cache metrics for the evaluation
            cache_metrics(me)

        self.stdout.write(
            self.style.SUCCESS("successfully completed analysis")
        )

    def validate_input(self, *args, **opts):
        if opts["cyberjudge"] and opts["labels"]:
            raise CommandError("specify either cyberjudge or simple labeling")

        model_evaluations = opts["model_evaluations"]
        if len(model_evaluations) > 0 and opts["all"]:
            raise CommandError("specify either model evaluations or --all not both")

        if len(model_evaluations) > 0 and opts["filter"]:
            raise CommandError("specify either model evaluations or --filter not both")

        if opts["all"] and opts["filter"]:
            raise CommandError("specify either --all or --filter not both")

    def get_queryset(self, **opts):
        if opts["all"]:
            return ModelEvaluation.objects.all()

        if opts["filter"]:
            return ModelEvaluation.objects.filter(
                evaluation__name__icontains=opts["filter"]
            )

        if opts["model_evaluations"]:
            return ModelEvaluation.objects.filter(id__in=opts["model_evaluations"])

        # Without --all, --filter, or explicit uuids there is nothing to analyze.
        return ModelEvaluation.objects.none()

    def confirm(self, prompt):
        while True:
            result = input(f"{prompt} [Y/n] ").strip().lower()
            if not result:
                continue

            if result[0] == "y":
                return True

            if result[0] == "n":
                return False

    def prechecks(self, **opts):
        """
        Puts together the precheck functions.
        """
        prechecks = []
        precheck_funcs = (
            ("output_type", self.evaluate_output_type),
            ("sensitive", self.evaluate_sensitive),
            ("cyberjudge", self.evaluate_cyberjudge),
            ("labels", self.evaluate_labels),
        )

        # NOTE: it is assumed precheck_funcs is sorted by application order
        for name, fn in precheck_funcs:
            if opts[name]:
                prechecks.append(fn)

        return prechecks

    def evaluate_cyberjudge(self, response):
        extract_cyberjudge_label(response)
        evaluate_cyberjudge_label_correct(response)

    def evaluate_labels(self, response):
        evaluate_label_correct(response)

    def evaluate_output_type(self, response):
        evaluate_valid_output_type(response)

    def evaluate_sensitive(self, response):
        if self._sensitive is None:
            self._sensitive = list(Sensitive.objects.all())
        evaluate_leaks_sensitive(response, self._sensitive)
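
For reference, a minimal sketch of how the new command might be driven programmatically (assuming Django's standard call_command interface and that the parley app is installed); the "baseline" filter value is purely illustrative:

from django.core.management import call_command

# Run output-type and sensitive-leak prechecks plus cyberjudge labeling across
# all model evaluations without prompting for confirmation.
# Equivalent CLI: python manage.py analyze --all --yes -C -O -S
call_command("analyze", all=True, yes=True, cyberjudge=True, output_type=True, sensitive=True)

# Restrict the run to evaluations whose name contains "baseline" (hypothetical
# name) and only apply the simple label correctness precheck before metrics.
call_command("analyze", filter="baseline", yes=True, labels=True)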
121 changes: 121 additions & 0 deletions parley/migrations/0003_modify_modelevaluation_metrics_storage.py
@@ -0,0 +1,121 @@
# Generated by Django 5.1.1 on 2024-10-07 20:30

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("parley", "0002_prompt_title_sensitive_is_name"),
    ]

    operations = [
        migrations.RemoveField(
            model_name="modelevaluation",
            name="confabulations_processed",
        ),
        migrations.RemoveField(
            model_name="modelevaluation",
            name="labels_processed",
        ),
        migrations.RemoveField(
            model_name="modelevaluation",
            name="output_type_processed",
        ),
        migrations.RemoveField(
            model_name="modelevaluation",
            name="readable_processed",
        ),
        migrations.RemoveField(
            model_name="modelevaluation",
            name="sensitive_processed",
        ),
        migrations.RemoveField(
            model_name="modelevaluation",
            name="similarity_processed",
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_invalid_output_type",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_labeled_incorrectly",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_no_sensitive_leaks",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_not_confabulation",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_not_readable",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AddField(
            model_name="modelevaluation",
            name="n_not_similar",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_confabulations",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_labeled_correctly",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_leaks_sensitive",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_readable",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_similar",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
        migrations.AlterField(
            model_name="modelevaluation",
            name="n_valid_output_type",
            field=models.IntegerField(
                blank=True, default=None, editable=False, null=True
            ),
        ),
    ]
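
For context on the storage change, a hypothetical sketch (not the actual cache_metrics implementation) of reading the new nullable count columns back, where a null value now signals that a metric has not been computed, replacing the removed *_processed flags:

from parley.models import ModelEvaluation

def labeling_accuracy(me: ModelEvaluation):
    """
    Return the fraction of correctly labeled responses, or None if the
    label metrics have not been cached for this model evaluation yet.
    (Illustrative helper only; field names are taken from the migration.)
    """
    # With the *_processed booleans removed, a null count now means
    # "metric not yet computed" rather than a separate boolean flag.
    if me.n_labeled_correctly is None or me.n_labeled_incorrectly is None:
        return None

    total = me.n_labeled_correctly + me.n_labeled_incorrectly
    return me.n_labeled_correctly / total if total else None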