diff --git a/parlance/settings/base.py b/parlance/settings/base.py index 05dbfcf..ac60e99 100644 --- a/parlance/settings/base.py +++ b/parlance/settings/base.py @@ -44,7 +44,7 @@ PROJECT = CONFDIR.parent.parent.parent -def environ_setting(name, required=False, default=None): +def environ_setting(name, default=None, required=False): """ Fetch setting from the environment or use the default. If required is set to True then a warning is raised that Django is not configured properly. @@ -158,6 +158,9 @@ def parse_bool(val): STATIC_URL = "static/" STATICFILES_DIRS = (PROJECT / "static",) +# Media files (uploads) +MEDIA_URL = "uploads/" + TEMPLATES = [ { 'BACKEND': 'django.template.backends.django.DjangoTemplates', @@ -228,7 +231,7 @@ def parse_bool(val): ## Logging and Error Reporting ########################################################################## -ADMINS = [("Parlance Admin", environ_setting("ADMIN_EMAIL", ""))] +ADMINS = [("Parlance Admin", environ_setting("ADMIN_EMAIL", default=""))] SERVER_EMAIL = environ_setting("SERVER_EMAIL", default="") EMAIL_USE_TLS = True diff --git a/parlance/settings/container.py b/parlance/settings/container.py index 228dcd5..de33ca6 100644 --- a/parlance/settings/container.py +++ b/parlance/settings/container.py @@ -19,6 +19,7 @@ from .base import * # noqa from .base import PROJECT +from .base import environ_setting ########################################################################## @@ -34,5 +35,8 @@ ] ## Static files served by WhiteNoise -STATIC_ROOT = PROJECT / "assets" +STATIC_ROOT = environ_setting("STATIC_ROOT", default=PROJECT / "storage" / "static") STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage" + +## Media files and uploads +MEDIA_ROOT = environ_setting("MEDIA_ROOT", default=PROJECT / "storage" / "uploads") diff --git a/parlance/settings/development.py b/parlance/settings/development.py index 4537484..a632c0e 100644 --- a/parlance/settings/development.py +++ b/parlance/settings/development.py @@ -37,10 +37,10 @@ "http://127.0.0.1:8000", ] -MEDIA_ROOT = PROJECT / "tmp" / "media" +MEDIA_ROOT = PROJECT / "tmp" / "uploads" ## Static files served by WhiteNoise nostatic server -STATIC_ROOT = PROJECT / "tmp" / "assets" +STATIC_ROOT = PROJECT / "tmp" / "static" STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage" # Debugging email without SMTP diff --git a/parlance/settings/production.py b/parlance/settings/production.py index 6901168..7a5f9a8 100644 --- a/parlance/settings/production.py +++ b/parlance/settings/production.py @@ -51,9 +51,12 @@ SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https") ## Static files served by WhiteNoise -STATIC_ROOT = PROJECT / "assets" +STATIC_ROOT = environ_setting("STATIC_ROOT", default=PROJECT / "storage" / "static") STATICFILES_STORAGE = "whitenoise.storage.CompressedManifestStaticFilesStorage" +## Media files and uploads +MEDIA_ROOT = environ_setting("MEDIA_ROOT", default=PROJECT / "storage" / "uploads") + ########################################################################## ## Sentry Error Management diff --git a/parley/forms.py b/parley/forms.py index b928b87..a16739e 100644 --- a/parley/forms.py +++ b/parley/forms.py @@ -25,7 +25,7 @@ from collections import defaultdict from parley.exceptions import ParlanceUploadError from django.core.exceptions import ValidationError -from parley.models import LLM, Evaluation, Prompt, Response +from parley.models import LLM, Evaluation, Prompt, Response, Sensitive 
########################################################################## @@ -135,6 +135,7 @@ def handle_uploaded_file(self, td, f, counts): 'evaluation': Evaluation, 'prompt': Prompt, 'response': Response, + 'sensitive': Sensitive, }.get(row.pop('type').strip().lower(), None) if rtype is None: diff --git a/parley/migrations/0001_initial.py b/parley/migrations/0001_initial.py index 08f551f..1d53575 100644 --- a/parley/migrations/0001_initial.py +++ b/parley/migrations/0001_initial.py @@ -1,8 +1,10 @@ -# Generated by Django 5.1.1 on 2024-10-03 20:30 +# Generated by Django 5.1.1 on 2024-10-05 22:20 import django.db.models.deletion +import parley.models.llm import parley.validators import uuid +from django.conf import settings from django.db import migrations, models @@ -10,7 +12,9 @@ class Migration(migrations.Migration): initial = True - dependencies = [] + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] operations = [ migrations.CreateModel( @@ -65,6 +69,31 @@ class Migration(migrations.Migration): null=True, ), ), + ( + "similarity_metric", + models.CharField( + choices=[ + ("csti", "Cosine TF-IDF"), + ("cstf", "Cosine Term Frequency"), + ("jacc", "Jaccard"), + ("w2vc", "Cosine of Word2Vec Average"), + ("glve", "Cosine of GloVe Average"), + ("fast", "Cosine of FastText Average"), + ("d2vc", "Cosine of Doc2Vec"), + ("bert", "Cosine of BERT"), + ], + default="csti", + help_text="The similarity metric used to compare output to expected output", + max_length=4, + ), + ), + ( + "similarity_threshold", + models.FloatField( + default=0.5, + help_text="The similarity threshold to determine if an output is generally correct or not", + ), + ), ( "active", models.BooleanField( @@ -134,6 +163,16 @@ class Migration(migrations.Migration): null=True, ), ), + ( + "cover_image", + models.ImageField( + blank=True, + default=None, + help_text="A 4x3 image representing the model for the profile page", + null=True, + upload_to=parley.models.llm.llm_cover_upload_path, + ), + ), ( "model_config", models.JSONField( @@ -213,9 +252,162 @@ class Migration(migrations.Migration): "db_table": "llms", "ordering": ("-trained_on",), "get_latest_by": "trained_on", - "unique_together": {("name", "version")}, }, ), + migrations.CreateModel( + name="Sensitive", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + help_text="The timestamp that the object was created", + ), + ), + ( + "modified", + models.DateTimeField( + auto_now=True, + help_text="The timestamp that the object was last modified", + ), + ), + ( + "term", + models.CharField( + help_text="The search term to look for sensitive data in output", + max_length=255, + unique=True, + ), + ), + ( + "is_regex", + models.BooleanField( + default=False, + help_text="If the term is a regular expression to analyze the output on", + ), + ), + ], + options={ + "verbose_name": "sensitive", + "verbose_name_plural": "sensitive", + "db_table": "sensitive", + "ordering": ("-created",), + "get_latest_by": "created", + }, + ), + migrations.CreateModel( + name="ModelEvaluation", + fields=[ + ( + "id", + models.UUIDField( + default=uuid.uuid4, + editable=False, + help_text="The globally unique identifier of the object", + primary_key=True, + serialize=False, + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + help_text="The timestamp that the object was created", + ), + ), + ( + 
"modified", + models.DateTimeField( + auto_now=True, + help_text="The timestamp that the object was last modified", + ), + ), + ("metrics_cached", models.BooleanField(default=False, editable=False)), + ( + "metrics_last_cached_on", + models.DateTimeField(default=None, editable=False, null=True), + ), + ("n_prompts", models.IntegerField(default=0, editable=False)), + ("n_responses", models.IntegerField(default=0, editable=False)), + ( + "similarity_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_similar", models.IntegerField(default=0, editable=False)), + ( + "labels_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_labeled_correctly", models.IntegerField(default=0, editable=False)), + ( + "output_type_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_valid_output_type", models.IntegerField(default=0, editable=False)), + ( + "sensitive_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_leaks_sensitive", models.IntegerField(default=0, editable=False)), + ( + "confabulations_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_confabulations", models.IntegerField(default=0, editable=False)), + ( + "readable_processed", + models.BooleanField(default=False, editable=False), + ), + ("n_readable", models.IntegerField(default=0, editable=False)), + ( + "evaluation", + models.ForeignKey( + help_text="The evaluation associated with the model", + on_delete=django.db.models.deletion.CASCADE, + related_name="model_evaluations", + to="parley.evaluation", + ), + ), + ( + "model", + models.ForeignKey( + help_text="The LLM that needs to be evaluated", + on_delete=django.db.models.deletion.CASCADE, + related_name="model_evaluations", + to="parley.llm", + ), + ), + ], + options={ + "db_table": "model_evaluations", + "ordering": ("-created",), + "get_latest_by": "created", + "unique_together": {("model", "evaluation")}, + }, + ), + migrations.AddField( + model_name="llm", + name="evaluations", + field=models.ManyToManyField( + through="parley.ModelEvaluation", to="parley.evaluation" + ), + ), + migrations.AddField( + model_name="evaluation", + name="llms", + field=models.ManyToManyField( + through="parley.ModelEvaluation", to="parley.llm" + ), + ), migrations.CreateModel( name="Prompt", fields=[ @@ -278,20 +470,49 @@ class Migration(migrations.Migration): ), ), ( - "expected_output", + "expected_output_type", models.CharField( choices=[ - ("text", "Text"), + ("text", "text"), ("json", "JSON"), - ("img", "Image"), - ("viz", "Visualization"), + ("xml", "XML"), ("csv", "CSV"), + ("img", "image"), + ("dviz", "Data Visualization"), ], default="text", help_text="Specify the expected type of output for the prompt to validate", max_length=4, ), ), + ( + "expected_output", + models.TextField( + blank=True, + default=None, + help_text="Expected output for the prompt to use similarity scoring with", + null=True, + ), + ), + ( + "expected_label", + models.CharField( + blank=True, + default=None, + help_text="For classifiers, expected label that should be contained in output", + max_length=255, + null=True, + ), + ), + ( + "order", + models.IntegerField( + blank=True, + default=None, + help_text="Manually specify the order of the prompts for review", + null=True, + ), + ), ( "exclude", models.BooleanField( @@ -313,7 +534,7 @@ class Migration(migrations.Migration): "verbose_name": "prompt", "verbose_name_plural": "prompts", "db_table": "prompts", - "ordering": ("-created",), + "ordering": 
("order", "-created"), "get_latest_by": "created", }, ), @@ -352,11 +573,50 @@ class Migration(migrations.Migration): ), ), ( - "valid_output", + "output_similarity", + models.FloatField( + blank=True, + default=None, + help_text="The similarity score of this response to the expected output", + null=True, + ), + ), + ( + "is_similar", models.BooleanField( blank=True, default=None, - help_text="Based on the expected output, is it parseable; e.g. if the output is supposed to be JSON, is it correct?", + help_text="Was the similarity score greater than the threshold?", + null=True, + ), + ), + ( + "label", + models.CharField( + blank=True, + default=None, + help_text="For classifiers, label that is extracted from the output", + max_length=255, + null=True, + ), + ), + ( + "label_correct", + models.BooleanField( + blank=True, + default=None, + help_text="Was the output label correct based on the expected label (or almost correct for fuzzy label matching)", + null=True, + ), + ), + ( + "valid_output_type", + models.BooleanField( + blank=True, + default=None, + help_text=( + "Based on the expected output type, is it parseable; e.g. if the output is supposed to be JSON, can it be correctly decoded?", + ), null=True, ), ), @@ -384,7 +644,7 @@ class Migration(migrations.Migration): models.BooleanField( blank=True, default=None, - help_text="Does the output contain gramatically correct, understandable language?", + help_text="Does the output contain grammatically correct, understandable language?", null=True, ), ), @@ -442,4 +702,191 @@ class Migration(migrations.Migration): "get_latest_by": "created", }, ), + migrations.CreateModel( + name="ResponseReview", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + help_text="The timestamp that the object was created", + ), + ), + ( + "modified", + models.DateTimeField( + auto_now=True, + help_text="The timestamp that the object was last modified", + ), + ), + ( + "output_correct", + models.BooleanField( + blank=True, + default=None, + help_text="Was the output correct based on the expected output or the prompt (or almost correct for fuzzy qualitative correctness)?", + null=True, + ), + ), + ( + "label_correct", + models.BooleanField( + blank=True, + default=None, + help_text="Was the output label correct based on the expected label (or almost correct for fuzzy label matching)?", + null=True, + ), + ), + ( + "is_confabulation", + models.BooleanField( + blank=True, + default=None, + help_text="Is the output a hallucination or confabulation?", + null=True, + ), + ), + ( + "is_readable", + models.BooleanField( + blank=True, + default=None, + help_text="Does the output contain grammatically correct, understandable language?", + null=True, + ), + ), + ( + "response", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="reviews", + to="parley.response", + ), + ), + ], + options={ + "db_table": "response_reviews", + "ordering": ("-created",), + "get_latest_by": "created", + }, + ), + migrations.CreateModel( + name="ReviewTask", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created", + models.DateTimeField( + auto_now_add=True, + help_text="The timestamp that the object was created", + ), + ), + ( + "modified", + models.DateTimeField( + auto_now=True, + help_text="The timestamp that the 
object was last modified", + ), + ), + ( + "started_on", + models.DateTimeField( + default=None, + help_text="The timestamp that the review was start on, null if not started", + null=True, + ), + ), + ( + "completed_on", + models.DateTimeField( + default=None, + help_text="The timestamp that the review was completed, null if not completed", + null=True, + ), + ), + ( + "evaluation", + models.ForeignKey( + help_text="The evaluation the user is performing", + on_delete=django.db.models.deletion.CASCADE, + related_name="review_tasks", + to="parley.evaluation", + ), + ), + ( + "responses", + models.ManyToManyField( + through="parley.ResponseReview", to="parley.response" + ), + ), + ( + "user", + models.ForeignKey( + help_text="The user that is conducting the evaluation", + on_delete=django.db.models.deletion.CASCADE, + related_name="review_tasks", + to=settings.AUTH_USER_MODEL, + ), + ), + ], + options={ + "db_table": "review_tasks", + "ordering": ("-created",), + "get_latest_by": "created", + "unique_together": {("user", "evaluation")}, + }, + ), + migrations.AddField( + model_name="responsereview", + name="review", + field=models.ForeignKey( + help_text="The individual response reviews in a review", + on_delete=django.db.models.deletion.CASCADE, + related_name="response_reviews", + to="parley.reviewtask", + ), + ), + migrations.AddField( + model_name="response", + name="reviewers", + field=models.ManyToManyField( + through="parley.ResponseReview", to="parley.reviewtask" + ), + ), + migrations.AddField( + model_name="evaluation", + name="reviewers", + field=models.ManyToManyField( + through="parley.ReviewTask", to=settings.AUTH_USER_MODEL + ), + ), + migrations.AlterUniqueTogether( + name="llm", + unique_together={("name", "version")}, + ), + migrations.AlterUniqueTogether( + name="responsereview", + unique_together={("review", "response")}, + ), + migrations.AlterUniqueTogether( + name="response", + unique_together={("model", "prompt")}, + ), ] diff --git a/parley/models.py b/parley/models.py deleted file mode 100644 index 5306315..0000000 --- a/parley/models.py +++ /dev/null @@ -1,359 +0,0 @@ -# parley.models -# Parley app models and database definition. -# -# Author: Benjamin Bengfort -# Created: Tue Oct 01 17:59:43 2024 -0500 -# -# Copyright (C) 2024 Rotational Labs, Inc. -# For license information, see LICENSE -# -# ID: models.py [] benjamin@rotational.io $ - -""" -Parley app models and database definition. -""" - -########################################################################## -## Imports -########################################################################## - -import uuid - -from django.db import models -from django.urls import reverse -from .validators import validate_semver - - -########################################################################## -## Base Model -########################################################################## - -class BaseModel(models.Model): - """ - In order to make it easier to ingest data and audit records added to parlance, the - base model uses UUIDs as the primary key instead of sequences and adds timestamps - to track modifications to all objects in the system. 
- """ - - id = models.UUIDField( - primary_key=True, - default=uuid.uuid4, - editable=False, - help_text="The globally unique identifier of the object" - ) - - created = models.DateTimeField( - auto_now_add=True, - editable=False, - help_text="The timestamp that the object was created", - ) - - modified = models.DateTimeField( - auto_now=True, - editable=False, - help_text="The timestamp that the object was last modified", - ) - - class Meta: - abstract = True - - -########################################################################## -## LLM Models, Prompts, and Responses -########################################################################## - -class LLM(BaseModel): - """ - A record of an instantiated, trained LLM model to evaluate. - """ - - name = models.CharField( - default=None, null=False, blank=False, max_length=255, - help_text="The name of the model or model family being evaluated", - ) - - version = models.CharField( - default="0.1.0", null=False, blank=False, max_length=32, - help_text="The semantic version of the model for instance identification", - validators=[validate_semver], - ) - - description = models.TextField( - null=True, blank=True, default=None, - help_text="Any notes or other descriptive information about the model or training process", - ) - - model_config = models.JSONField( - null=True, default=None, blank=True, - help_text="Configuration for instantiating the model", - ) - - generation_config = models.JSONField( - null=True, default=None, blank=True, - help_text="The standardized generation config of the model", - ) - - quantization_info = models.JSONField( - null=True, default=None, blank=True, - help_text="Information about the quantization of the model, if any", - ) - - tokenizer_config = models.JSONField( - null=True, default=None, blank=True, - help_text="The standardized tokenization info of the model", - ) - - max_new_tokens = models.IntegerField( - null=True, default=None, editable=True, blank=True, - help_text="The maximum new tokens allowed during inferencing", - ) - - is_adapter_model = models.BooleanField( - null=True, default=None, blank=True, - help_text="Defines if this model is a base model or a LoRA", - ) - - trained_on = models.DateTimeField( - null=True, default=None, blank=True, - help_text="The timestamp that the model started training" - ) - - training_duration = models.DurationField( - null=True, default=None, blank=True, - help_text="The amount of time it took to train the model" - ) - - class Meta: - db_table = "llms" - ordering = ("-trained_on",) - get_latest_by = "trained_on" - verbose_name = "LLM" - verbose_name_plural = "LLMs" - unique_together = ("name", "version") - - @property - def training_completed(self): - if self.trained_on is None or self.training_duration is None: - return None - return self.trained_on + self.training_duration - - def __str__(self): - return self.name - - -class Evaluation(BaseModel): - """ - An Evaluation is a collection of related prompts that are used to perform a - qualitative LLM evaluation. All of the prompts in the evaluation should be handled - together, both when creating prompt outputs for a specific model and performing - qualitative model evaluations. 
- """ - - name = models.CharField( - default=None, - null=False, - blank=False, - max_length=255, - help_text="The descriptive name of the evaluation prompts collection", - ) - - task = models.CharField( - default=None, - null=False, - blank=False, - max_length=255, - help_text="A description of the expected task or agent being evaluated", - ) - - description = models.TextField( - null=True, - blank=True, - default=None, - help_text="Any notes or other descriptive information about the evaluation", - ) - - active = models.BooleanField( - default=True, null=False, - help_text="This prompt set should be used in evaluations of new models", - ) - - class Meta: - db_table = "evaluations" - ordering = ("-created",) - get_latest_by = "created" - verbose_name = "evaluation" - verbose_name_plural = "evaluations" - - def __str__(self): - return self.name - - -class Prompt(BaseModel): - """ - A prompt is a single instance of an input to an LLM. - """ - - TEXT = "text" - JSON = "json" - IMAGE = "img" - VIZ = "viz" - CSV = "csv" - - OUTPUT_CHOICES = { - TEXT: "Text", - JSON: "JSON", - IMAGE: "Image", - VIZ: "Visualization", - CSV: "CSV", - } - - system = models.TextField( - null=True, default=None, blank=True, - help_text="The system prompt specified to the LLM", - ) - - prompt = models.TextField( - null=False, default=None, blank=False, - help_text="The prompt used to generate an output from an LLM" - ) - - evaluation = models.ForeignKey( - "parley.Evaluation", - null=False, - on_delete=models.CASCADE, - related_name="prompts", - help_text="The evaluation that this prompt is a part of" - ) - - history = models.JSONField( - null=True, default=None, blank=True, - help_text="An array of prompt IDs that precede this prompt during evaluation" - ) - - notes = models.TextField( - null=True, - blank=True, - default=None, - help_text="Any notes or other descriptive information about the prompt", - ) - - expected_output = models.CharField( - max_length=4, - choices=OUTPUT_CHOICES, - default=TEXT, - help_text="Specify the expected type of output for the prompt to validate", - ) - - exclude = models.BooleanField( - default=False, - help_text="Exclude this prompt from evaluations and from metrics", - ) - - class Meta: - db_table = "prompts" - ordering = ("-created",) - get_latest_by = "created" - verbose_name = "prompt" - verbose_name_plural = "prompts" - - -class Response(BaseModel): - """ - A response is an output generated by an LLM to a specific prompt. - """ - - model = models.ForeignKey( - "parley.LLM", - null=False, - on_delete=models.RESTRICT, - related_name="responses", - help_text="The LLM that generated the specified response", - ) - - prompt = models.ForeignKey( - "parley.Prompt", - null=False, - on_delete=models.CASCADE, - related_name="responses", - help_text="The prompt that this is an LLM response to", - ) - - output = models.TextField( - null=False, - default=None, - blank=False, - help_text="The output generated by the LLM in response to the prompt", - ) - - # For text - check to make sure it contains expected characters. - valid_output = models.BooleanField( - null=True, - default=None, - blank=True, - help_text="Based on the expected output, is it parseable; e.g. 
if the output is supposed to be JSON, is it correct?", # noqa - ) - - leaks_sensitive = models.BooleanField( - null=True, - default=None, - blank=True, - verbose_name="leaks sensitive data", - help_text="Does the output contain sensitive data that should not be leaked?", - ) - - # TODO: set this based on annotator agreement - is_confabulation = models.BooleanField( - null=True, - default=None, - blank=True, - help_text="Is the output a hallucination or confabulation?", - ) - - # TODO: set this based on annotator agreement - is_readable = models.BooleanField( - null=True, - default=None, - blank=True, - help_text="Does the output contain gramatically correct, understandable language?", - ) - - max_new_tokens = models.IntegerField( - null=True, default=None, editable=True, blank=True, - help_text="Set this field if different from the model configuration", - ) - - inference_on = models.DateTimeField( - null=True, - blank=True, - default=None, - help_text="The timestamp that the LLM started the inferencing", - ) - - inference_duration = models.DurationField( - null=True, - default=None, - blank=True, - help_text="The amount of time it took to perform the inference", - ) - - class Meta: - db_table = "responses" - ordering = ("-created",) - get_latest_by = "created" - verbose_name = "response" - verbose_name_plural = "responses" - - def get_previous(self): - try: - return self.get_previous_by_created() - except self.DoesNotExist: - return None - - def get_next(self): - try: - return self.get_next_by_created() - except self.DoesNotExist: - return None - - def get_absolute_url(self): - return reverse("response-detail", args=(self.id,)) diff --git a/parley/models/__init__.py b/parley/models/__init__.py new file mode 100644 index 0000000..b7f9c79 --- /dev/null +++ b/parley/models/__init__.py @@ -0,0 +1,24 @@ +# parley.models +# Parley app models and database definition. +# +# Author: Benjamin Bengfort +# Created: Tue Oct 01 17:59:43 2024 -0500 +# +# Copyright (C) 2024 Rotational Labs, Inc. +# For license information, see LICENSE +# +# ID: __init__.py [] benjamin@rotational.io $ + +""" +Parley app models and database definition. +""" + +########################################################################## +## Imports +########################################################################## + +from .evaluation import * +from .llm import * +from .sensitive import * +from .enums import * +from .user import * diff --git a/parley/models/base.py b/parley/models/base.py new file mode 100644 index 0000000..82713b9 --- /dev/null +++ b/parley/models/base.py @@ -0,0 +1,77 @@ +# parley.models.base +# The abstract base model for all parley models. +# +# Author: Benjamin Bengfort +# Created: Sat Oct 05 15:11:26 2024 -0500 +# +# Copyright (C) 2024 Rotational Labs, Inc. +# For license information, see LICENSE +# +# ID: base.py [] benjamin@rotational.io $ + +""" +The abstract base model for all parley models. 
+""" + +########################################################################## +## Imports +########################################################################## + +import uuid + +from django.db import models + + +########################################################################## +## Base Model +########################################################################## + +class BaseModel(models.Model): + """ + In order to make it easier to ingest data and audit records added to parlance, the + base model uses UUIDs as the primary key instead of sequences and adds timestamps + to track modifications to all objects in the system. + """ + + id = models.UUIDField( + primary_key=True, + default=uuid.uuid4, + editable=False, + help_text="The globally unique identifier of the object", + ) + + created = models.DateTimeField( + auto_now_add=True, + editable=False, + help_text="The timestamp that the object was created", + ) + + modified = models.DateTimeField( + auto_now=True, + editable=False, + help_text="The timestamp that the object was last modified", + ) + + class Meta: + abstract = True + + +class TimestampedModel(models.Model): + """ + Adds created and modified timestamps to sub-models. + """ + + created = models.DateTimeField( + auto_now_add=True, + editable=False, + help_text="The timestamp that the object was created", + ) + + modified = models.DateTimeField( + auto_now=True, + editable=False, + help_text="The timestamp that the object was last modified", + ) + + class Meta: + abstract = True diff --git a/parley/models/enums.py b/parley/models/enums.py new file mode 100644 index 0000000..6eb0de1 --- /dev/null +++ b/parley/models/enums.py @@ -0,0 +1,52 @@ +# parley.models.enums +# Enum constants for use as model choices to simplify enum management. +# +# Author: Benjamin Bengfort +# Created: Sat Oct 05 15:45:20 2024 -0500 +# +# Copyright (C) 2024 Rotational Labs, Inc. +# For license information, see LICENSE +# +# ID: enums.py [] benjamin@rotational.io $ + +""" +Enum constants for use as model choices to simplify enum management. +""" + +########################################################################## +## Imports +########################################################################## + +from django.db import models +from django.utils.translation import gettext_lazy as _ + + +class SimilarityMetric(models.TextChoices): + """ + Similarity metrics define the automated comparisons between an LLM response and the + expected output as defined by the prompt. + """ + + COSINE_TFIDF = ("csti", _("Cosine TF-IDF")) + COSINE_TF = ("cstf", _("Cosine Term Frequency")) + JACCARD = ("jacc", _("Jaccard")) + WORD2VEC = ("w2vc", _("Cosine of Word2Vec Average")) + GLOVE = ("glve", _("Cosine of GloVe Average")) + FAST = ("fast", _("Cosine of FastText Average")) + DOC2VEC = ("d2vc", _("Cosine of Doc2Vec")) + BERT = ("bert", _("Cosine of BERT")) + + +class OutputFormat(models.TextChoices): + """ + OutputFormat indicates the expected serialized response from an LLM to a prompt. + By default, this is usually just simple text, but LLMs and generative AI can also + produce JSON, XML, CSV data or even images or data visualizations. 
+    """
+
+    TEXT = ("text", _("text"))
+    JSON = ("json", _("JSON"))
+    XML = ("xml", _("XML"))
+    CSV = ("csv", _("CSV"))
+    IMAGE = ("img", _("image"))
+    DATA_VIZ = ("dviz", _("Data Visualization"))
diff --git a/parley/models/evaluation.py b/parley/models/evaluation.py
new file mode 100644
index 0000000..22e4902
--- /dev/null
+++ b/parley/models/evaluation.py
@@ -0,0 +1,177 @@
+# parley.models.evaluation
+# Evaluations and the prompts associated with those evaluations.
+#
+# Author: Benjamin Bengfort
+# Created: Sat Oct 05 15:11:26 2024 -0500
+#
+# Copyright (C) 2024 Rotational Labs, Inc.
+# For license information, see LICENSE
+#
+# ID: evaluation.py [] benjamin@rotational.io $
+
+"""
+Evaluations and the prompts associated with those evaluations.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+from django.db import models
+
+from .base import BaseModel
+from .enums import SimilarityMetric, OutputFormat
+
+
+class Evaluation(BaseModel):
+    """
+    An Evaluation is a collection of related prompts that are used to perform a
+    qualitative LLM evaluation. All of the prompts in the evaluation should be handled
+    together, both when creating prompt outputs for a specific model and performing
+    qualitative model evaluations.
+    """
+
+    name = models.CharField(
+        default=None,
+        null=False,
+        blank=False,
+        max_length=255,
+        help_text="The descriptive name of the evaluation prompts collection",
+    )
+
+    task = models.CharField(
+        default=None,
+        null=False,
+        blank=False,
+        max_length=255,
+        help_text="A description of the expected task or agent being evaluated",
+    )
+
+    description = models.TextField(
+        null=True,
+        blank=True,
+        default=None,
+        help_text="Any notes or other descriptive information about the evaluation",
+    )
+
+    similarity_metric = models.CharField(
+        null=False,
+        max_length=4,
+        choices=SimilarityMetric,
+        default=SimilarityMetric.COSINE_TFIDF,
+        help_text="The similarity metric used to compare output to expected output",
+    )
+
+    similarity_threshold = models.FloatField(
+        default=0.5,
+        help_text="The similarity threshold to determine if an output is generally correct or not",
+    )
+
+    active = models.BooleanField(
+        default=True,
+        null=False,
+        help_text="This prompt set should be used in evaluations of new models",
+    )
+
+    llms = models.ManyToManyField(
+        "parley.LLM",
+        through="parley.ModelEvaluation",
+    )
+
+    reviewers = models.ManyToManyField(
+        "auth.User",
+        through="parley.ReviewTask",
+    )
+
+    class Meta:
+        db_table = "evaluations"
+        ordering = ("-created",)
+        get_latest_by = "created"
+        verbose_name = "evaluation"
+        verbose_name_plural = "evaluations"
+
+    def __str__(self):
+        return self.name
+
+
+class Prompt(BaseModel):
+    """
+    A prompt is a single instance of an input to an LLM.
+ """ + + system = models.TextField( + null=True, + default=None, + blank=True, + help_text="The system prompt specified to the LLM", + ) + + prompt = models.TextField( + null=False, + default=None, + blank=False, + help_text="The prompt used to generate an output from an LLM", + ) + + evaluation = models.ForeignKey( + "parley.Evaluation", + null=False, + on_delete=models.CASCADE, + related_name="prompts", + help_text="The evaluation that this prompt is a part of", + ) + + history = models.JSONField( + null=True, + default=None, + blank=True, + help_text="An array of prompt IDs that precede this prompt during evaluation", + ) + + notes = models.TextField( + null=True, + blank=True, + default=None, + help_text="Any notes or other descriptive information about the prompt", + ) + + expected_output_type = models.CharField( + max_length=4, + choices=OutputFormat, + default=OutputFormat.TEXT, + help_text="Specify the expected type of output for the prompt to validate", + ) + + expected_output = models.TextField( + null=True, + blank=True, + default=None, + help_text="Expected output for the prompt to use similarity scoring with", + ) + + expected_label = models.CharField( + null=True, + blank=True, + default=None, + max_length=255, + help_text="For classifiers, expected label that should be contained in output", + ) + + order = models.IntegerField( + null=True, + blank=True, + default=None, + help_text="Manually specify the order of the prompts for review", + ) + + exclude = models.BooleanField( + default=False, + help_text="Exclude this prompt from evaluations and from metrics", + ) + + class Meta: + db_table = "prompts" + ordering = ("order", "-created") + get_latest_by = "created" + verbose_name = "prompt" + verbose_name_plural = "prompts" diff --git a/parley/models/llm.py b/parley/models/llm.py new file mode 100644 index 0000000..157c38f --- /dev/null +++ b/parley/models/llm.py @@ -0,0 +1,438 @@ +# parley.models.llm +# LLM models for evaluation and their responses. +# +# Author: Benjamin Bengfort +# Created: Sat Oct 05 15:11:26 2024 -0500 +# +# Copyright (C) 2024 Rotational Labs, Inc. +# For license information, see LICENSE +# +# ID: llm.py [] benjamin@rotational.io $ + +""" +LLM models for evaluation and their responses. +""" + +########################################################################## +## Imports +########################################################################## + +import os +import json + +from .base import BaseModel +from django.db import models +from django.urls import reverse + +from parley.validators import validate_semver + + +########################################################################## +## Helpers +########################################################################## + +def llm_cover_upload_path(instance, filename): + _, ext = os.path.splitext(filename) + return os.path.join("covers", "llms", f"{instance.id}{ext}") + + +########################################################################## +## Models +########################################################################## + +class LLM(BaseModel): + """ + A record of an instantiated, trained LLM model to evaluate. 
+ """ + + name = models.CharField( + default=None, + null=False, + blank=False, + max_length=255, + help_text="The name of the model or model family being evaluated", + ) + + version = models.CharField( + default="0.1.0", + null=False, + blank=False, + max_length=32, + help_text="The semantic version of the model for instance identification", + validators=[validate_semver], + ) + + description = models.TextField( + null=True, + blank=True, + default=None, + help_text="Any notes or other descriptive information about the model or training process", + ) + + cover_image = models.ImageField( + null=True, + blank=True, + default=None, + upload_to=llm_cover_upload_path, + help_text="A 4x3 image representing the model for the profile page", + ) + + model_config = models.JSONField( + null=True, + default=None, + blank=True, + help_text="Configuration for instantiating the model", + ) + + generation_config = models.JSONField( + null=True, + default=None, + blank=True, + help_text="The standardized generation config of the model", + ) + + quantization_info = models.JSONField( + null=True, + default=None, + blank=True, + help_text="Information about the quantization of the model, if any", + ) + + tokenizer_config = models.JSONField( + null=True, + default=None, + blank=True, + help_text="The standardized tokenization info of the model", + ) + + max_new_tokens = models.IntegerField( + null=True, + default=None, + editable=True, + blank=True, + help_text="The maximum new tokens allowed during inferencing", + ) + + is_adapter_model = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Defines if this model is a base model or a LoRA", + ) + + trained_on = models.DateTimeField( + null=True, + default=None, + blank=True, + help_text="The timestamp that the model started training", + ) + + training_duration = models.DurationField( + null=True, + default=None, + blank=True, + help_text="The amount of time it took to train the model", + ) + + evaluations = models.ManyToManyField( + 'parley.Evaluation', through='parley.ModelEvaluation', + ) + + class Meta: + db_table = "llms" + ordering = ("-trained_on",) + get_latest_by = "trained_on" + verbose_name = "LLM" + verbose_name_plural = "LLMs" + unique_together = ("name", "version") + + @property + def training_completed(self): + if self.trained_on is None or self.training_duration is None: + return None + return self.trained_on + self.training_duration + + def __str__(self): + return self.name + + +class ModelEvaluation(BaseModel): + """ + Models must be linked to specific evaluations in order to understand the + performance of the model for the evaluation. Note that models are also linked to + evaluations via their responses, but this model makes it easier to track aggregate + data. This table represents a denormalization since the foreign keys are duplicated, + but it makes application semantics a lot simpler. + + This table ensures there is a many to many relationship between models and + evaluations. 
+ """ + + model = models.ForeignKey( + "parley.LLM", + null=False, + on_delete=models.CASCADE, + related_name="model_evaluations", + help_text="The LLM that needs to be evaluated", + ) + + evaluation = models.ForeignKey( + "parley.Evaluation", + null=False, + on_delete=models.CASCADE, + related_name="model_evaluations", + help_text="The evaluation associated with the model", + ) + + # Cache Info + metrics_cached = models.BooleanField( + default=False, editable=False, + ) + + # Cache Info + metrics_last_cached_on = models.DateTimeField( + default=None, null=True, editable=False, + ) + + # Cached metric + n_prompts = models.IntegerField( + default=0, editable=False, + ) + + # Cached metric + n_responses = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + similarity_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_similar = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + labels_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_labeled_correctly = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + output_type_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_valid_output_type = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + sensitive_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_leaks_sensitive = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + confabulations_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_confabulations = models.IntegerField( + default=0, editable=False, + ) + + # Processing Info + readable_processed = models.BooleanField( + default=False, editable=False, + ) + + # Cached metric + n_readable = models.IntegerField( + default=0, editable=False, + ) + + class Meta: + db_table = "model_evaluations" + ordering = ("-created",) + get_latest_by = "created" + unique_together = ('model', 'evaluation') + + @property + def image(self): + return self.model.image + + def responses(self): + return Response.objects.filter( + model=self.model, prompt__evaluation=self.evaluation + ) + + +class Response(BaseModel): + """ + A response is an output generated by an LLM to a specific prompt. 
+ """ + + model = models.ForeignKey( + "parley.LLM", + null=False, + on_delete=models.RESTRICT, + related_name="responses", + help_text="The LLM that generated the specified response", + ) + + prompt = models.ForeignKey( + "parley.Prompt", + null=False, + on_delete=models.CASCADE, + related_name="responses", + help_text="The prompt that this is an LLM response to", + ) + + output = models.TextField( + null=False, + default=None, + blank=False, + help_text="The output generated by the LLM in response to the prompt", + ) + + output_similarity = models.FloatField( + null=True, + default=None, + blank=True, + help_text="The similarity score of this response to the expected output", + ) + + is_similar = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Was the similarity score greater than the threshold?", + ) + + label = models.CharField( + null=True, + blank=True, + default=None, + max_length=255, + help_text="For classifiers, label that is extracted from the output", + ) + + label_correct = models.BooleanField( + null=True, + default=None, + blank=True, + help_text=( + "Was the output label correct based on the expected label " + "(or almost correct for fuzzy label matching)" + ), + ) + + # For text - check to make sure it contains expected characters. + valid_output_type = models.BooleanField( + null=True, + default=None, + blank=True, + help_text=( + "Based on the expected output type, is it parseable; e.g. if the " + "output is supposed to be JSON, can it be correctly decoded?", + ), + ) + + leaks_sensitive = models.BooleanField( + null=True, + default=None, + blank=True, + verbose_name="leaks sensitive data", + help_text="Does the output contain sensitive data that should not be leaked?", + ) + + # TODO: set this based on annotator agreement + is_confabulation = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Is the output a hallucination or confabulation?", + ) + + # TODO: set this based on annotator agreement + is_readable = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Does the output contain grammatically correct, understandable language?", + ) + + max_new_tokens = models.IntegerField( + null=True, + default=None, + editable=True, + blank=True, + help_text="Set this field if different from the model configuration", + ) + + inference_on = models.DateTimeField( + null=True, + blank=True, + default=None, + help_text="The timestamp that the LLM started the inferencing", + ) + + inference_duration = models.DurationField( + null=True, + default=None, + blank=True, + help_text="The amount of time it took to perform the inference", + ) + + reviewers = models.ManyToManyField( + "parley.ReviewTask", through="parley.ResponseReview" + ) + + class Meta: + db_table = "responses" + ordering = ("-created",) + get_latest_by = "created" + verbose_name = "response" + verbose_name_plural = "responses" + unique_together = ("model", "prompt") + + @property + def evaluation(self): + return self.prompt.evaluation + + def get_previous(self): + try: + return self.get_previous_by_created() + except self.DoesNotExist: + return None + + def get_next(self): + try: + return self.get_next_by_created() + except self.DoesNotExist: + return None + + def get_absolute_url(self): + return reverse("response-detail", args=(self.id,)) + + def validate_json(self): + output = self.output.strip() + output = output.removeprefix("```json").removesuffix("```").strip() + try: + json.loads(output) + return True + except json.JSONDecodeError: + 
+            return False
diff --git a/parley/models/sensitive.py b/parley/models/sensitive.py
new file mode 100644
index 0000000..33015bc
--- /dev/null
+++ b/parley/models/sensitive.py
@@ -0,0 +1,50 @@
+# parley.models.sensitive
+# Handling of sensitive data and validation.
+#
+# Author: Benjamin Bengfort
+# Created: Sat Oct 05 15:11:26 2024 -0500
+#
+# Copyright (C) 2024 Rotational Labs, Inc.
+# For license information, see LICENSE
+#
+# ID: sensitive.py [] benjamin@rotational.io $
+
+"""
+Handling of sensitive data and validation.
+"""
+
+##########################################################################
+## Imports
+##########################################################################
+
+from .base import TimestampedModel
+from django.db import models
+
+
+class Sensitive(TimestampedModel):
+    """
+    Any data or information that should be considered sensitive and not included
+    in any output across any evaluation. These can be specific strings or they can
+    be regular expressions to search the output for.
+    """
+
+    term = models.CharField(
+        blank=False,
+        null=False,
+        max_length=255,
+        unique=True,
+        help_text="The search term to look for sensitive data in output",
+    )
+
+    is_regex = models.BooleanField(
+        null=False,
+        default=False,
+        help_text="If the term is a regular expression to analyze the output on",
+    )
+
+    class Meta:
+        db_table = "sensitive"
+        ordering = ("-created",)
+        get_latest_by = "created"
+        verbose_name = "sensitive"
+        verbose_name_plural = "sensitive"
diff --git a/parley/models/user.py b/parley/models/user.py
new file mode 100644
index 0000000..9c2f06f
--- /dev/null
+++ b/parley/models/user.py
@@ -0,0 +1,131 @@
+# parley.models.user
+# Implements models needed to track user evaluations and reviews.
+#
+# Author: Benjamin Bengfort
+# Created: Sat Oct 05 16:41:47 2024 -0500
+#
+# Copyright (C) 2024 Rotational Labs, Inc.
+# For license information, see LICENSE
+#
+# ID: user.py [] benjamin@rotational.io $
+
+"""
+Implements models needed to track user evaluations and reviews.
+""" + +########################################################################## +## Imports +########################################################################## + +from .base import TimestampedModel + +from django.db import models + + +########################################################################## +## Models +########################################################################## + +class ReviewTask(TimestampedModel): + + user = models.ForeignKey( + 'auth.User', + null=False, + on_delete=models.CASCADE, + related_name="review_tasks", + help_text="The user that is conducting the evaluation", + ) + + evaluation = models.ForeignKey( + "parley.Evaluation", + null=False, + on_delete=models.CASCADE, + related_name="review_tasks", + help_text="The evaluation the user is performing", + ) + + responses = models.ManyToManyField( + 'parley.Response', through='parley.ResponseReview' + ) + + started_on = models.DateTimeField( + null=True, default=None, + help_text="The timestamp that the review was start on, null if not started" + ) + + completed_on = models.DateTimeField( + null=True, default=None, + help_text="The timestamp that the review was completed, null if not completed", + ) + + class Meta: + db_table = "review_tasks" + ordering = ("-created",) + get_latest_by = "created" + unique_together = ("user", "evaluation") + + @property + def is_started(self): + return self.started_on is not None + + @property + def is_completed(self): + return self.completed_on is not None + + +class ResponseReview(TimestampedModel): + + review = models.ForeignKey( + "parley.ReviewTask", + null=False, + on_delete=models.CASCADE, + related_name="response_reviews", + help_text="The individual response reviews in a review", + ) + + response = models.ForeignKey( + 'parley.Response', + null=False, + on_delete=models.CASCADE, + related_name=("reviews"), + ) + + output_correct = models.BooleanField( + null=True, + default=None, + blank=True, + help_text=( + "Was the output correct based on the expected output or the prompt " + "(or almost correct for fuzzy qualitative correctness)?" + ), + ) + + label_correct = models.BooleanField( + null=True, + default=None, + blank=True, + help_text=( + "Was the output label correct based on the expected label " + "(or almost correct for fuzzy label matching)?" + ), + ) + + is_confabulation = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Is the output a hallucination or confabulation?", + ) + + is_readable = models.BooleanField( + null=True, + default=None, + blank=True, + help_text="Does the output contain grammatically correct, understandable language?", + ) + + class Meta: + db_table = "response_reviews" + ordering = ("-created",) + get_latest_by = "created" + unique_together = ("review", "response") diff --git a/requirements.txt b/requirements.txt index 595d859..056b8ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,6 +48,9 @@ typing_extensions==4.12.2 # pyflakes==3.2.0 ## Python Environment -pip>=24.2 -setuptools>=75.1.0 -wheel>=0.44.0 \ No newline at end of file +pip==24.2 +setuptools==75.1.0 +wheel==0.44.0 + +## The following requirements were added by pip freeze: +pillow==10.4.0