diff --git a/factgenie/config/default_prompts.yml b/factgenie/config/default_prompts.yml
index 5f936220..5f200917 100644
--- a/factgenie/config/default_prompts.yml
+++ b/factgenie/config/default_prompts.yml
@@ -21,7 +21,7 @@ llm_eval: |
   ```

   Instructions for annotating the text:
-  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "type". The value of "reason" is the reason for the annotation. The value of "text" is the literal value of the text inside the highlighted span, so that the span can later be identified using string matching. The value of "type" is an integer index of the error based on the following list:
+  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "annotation_type". The value of "reason" is the reason for the annotation. The value of "text" is the literal value of the text inside the highlighted span, so that the span can later be identified using string matching. The value of "annotation_type" is an integer index of the error based on the following list:

   {error_list}

diff --git a/factgenie/config/llm-eval/example-ollama-llama3-eval.yaml b/factgenie/config/llm-eval/example-ollama-llama3-eval.yaml
index a309fc77..3f44efda 100644
--- a/factgenie/config/llm-eval/example-ollama-llama3-eval.yaml
+++ b/factgenie/config/llm-eval/example-ollama-llama3-eval.yaml
@@ -33,7 +33,7 @@ prompt_template: |
   ```
   {text}
   ```
-  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "error_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "error_type" is one of {0, 1, 2, 3} based on the following list:
+  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "annotation_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "annotation_type" is one of {0, 1, 2, 3} based on the following list:
   - 0: Incorrect fact: The fact in the text contradicts the data.
   - 1: Not checkable: The fact in the text cannot be checked in the data.
   - 2: Misleading: The fact in the text is misleading in the given context.
@@ -54,6 +54,6 @@ prompt_template: |
   Nokia 3310 is produced in Finland and features a 320x320 display. It is available in black color. The data seem to provide only partial information about the phone.
   ```
   output:
-  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "error_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "error_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "error_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "error_type": 3}] }
+  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "annotation_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "annotation_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "annotation_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "annotation_type": 3}] }
   ```
   Note that some details may not be mentioned in the text: do not count omissions as errors. Also do not be too strict: some facts can be less specific than in the data (rounded values, shortened or abbreviated text, etc.), do not count these as errors. If there are no errors in the text, "annotations" will be an empty list.
diff --git a/factgenie/config/llm-eval/example-openai-gpt-4o-mini-eval.yaml b/factgenie/config/llm-eval/example-openai-gpt-4o-mini-eval.yaml
index 00dd18f5..7f81d254 100644
--- a/factgenie/config/llm-eval/example-openai-gpt-4o-mini-eval.yaml
+++ b/factgenie/config/llm-eval/example-openai-gpt-4o-mini-eval.yaml
@@ -25,7 +25,7 @@ prompt_template: |
   ```
   {text}
   ```
-  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "error_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "error_type" is one of {0, 1, 2, 3} based on the following list:
+  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "annotation_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "annotation_type" is one of {0, 1, 2, 3} based on the following list:
   - 0: Incorrect fact: The fact in the text contradicts the data.
   - 1: Not checkable: The fact in the text cannot be checked in the data.
   - 2: Misleading: The fact in the text is misleading in the given context.
@@ -46,6 +46,6 @@ prompt_template: |
   Nokia 3310 is produced in Finland and features a 320x320 display. It is available in black color. The data seem to provide only partial information about the phone.
   ```
   output:
-  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "error_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "error_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "error_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "error_type": 3}] }
+  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "annotation_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "annotation_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "annotation_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "annotation_type": 3}] }
   ```
   Note that some details may not be mentioned in the text: do not count omissions as errors. Also do not be too strict: some facts can be less specific than in the data (rounded values, shortened or abbreviated text, etc.), do not count these as errors. If there are no errors in the text, "annotations" will be an empty list.
diff --git a/factgenie/config/llm-eval/example-vllm-llama3-eval.yaml b/factgenie/config/llm-eval/example-vllm-llama3-eval.yaml
index 87569ea8..b46703b6 100644
--- a/factgenie/config/llm-eval/example-vllm-llama3-eval.yaml
+++ b/factgenie/config/llm-eval/example-vllm-llama3-eval.yaml
@@ -33,7 +33,7 @@ prompt_template: |
   ```
   {text}
   ```
-  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "error_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "error_type" is one of {0, 1, 2, 3} based on the following list:
+  Output the errors as a JSON list "annotations" in which each object contains fields "reason", "text", and "annotation_type". The value of "text" is the text of the error. The value of "reason" is the reason for the error. The value of "annotation_type" is one of {0, 1, 2, 3} based on the following list:
   - 0: Incorrect fact: The fact in the text contradicts the data.
   - 1: Not checkable: The fact in the text cannot be checked in the data.
   - 2: Misleading: The fact in the text is misleading in the given context.
@@ -54,6 +54,6 @@ prompt_template: |
   Nokia 3310 is produced in Finland and features a 320x320 display. It is available in black color. The data seem to provide only partial information about the phone.
   ```
   output:
-  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "error_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "error_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "error_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "error_type": 3}] }
+  ```{ "annotations": [{"reason": "The country where the phone is produced is not mentioned in the data.", "text": "produced in Finland", "annotation_type": 1}, {"reason": "The data mentions that the display has resolution 320x240px.", "text": "320x320", "annotation_type": 0}, {"reason": "Misleadingly suggests that the phone is not available in other colors.", "text": "available in black color", "annotation_type": 2}, {"reason": "The note is irrelevant for the phone description.", "text": "The data seem to provide only partial information about the phone.", "annotation_type": 3}] }
   ```
   Note that some details may not be mentioned in the text: do not count omissions as errors. Also do not be too strict: some facts can be less specific than in the data (rounded values, shortened or abbreviated text, etc.), do not count these as errors. If there are no errors in the text, "annotations" will be an empty list.
diff --git a/factgenie/models.py b/factgenie/models.py
index 2f0ace28..b6d8afab 100644
--- a/factgenie/models.py
+++ b/factgenie/models.py
@@ -52,15 +52,17 @@ def from_config(config, mode):
     return classes[metric_type](config)


-class Annotation(BaseModel):
+class SpanAnnotation(BaseModel):
     text: str = Field(description="The text which is annotated.")
     # Do not name it "type" since it is a reserved keyword in JSON schema
-    error_type: int = Field(description="Index to the list of categories defined for the annotation campaign.")
+    annotation_type: int = Field(
+        description="Index to the list of span annotation types defined for the annotation campaign."
+    )
     reason: str = Field(description="The reason for the annotation.")


 class OutputAnnotations(BaseModel):
-    annotations: list[Annotation] = Field(description="The list of annotations.")
+    annotations: list[SpanAnnotation] = Field(description="The list of annotations.")


 class Model:
@@ -151,8 +153,8 @@ def parse_annotations(self, text, annotations_json):
             annotation_d = annotation.dict()
             # For backward compatibility, let's use the shorter "type"
             # We do not use the name "type" in the JSON schema for error types because it has a much broader sense in the schema (e.g. string or integer)
-            annotation_d["type"] = annotation.error_type
-            del annotation_d["error_type"]
+            annotation_d["type"] = annotation.annotation_type
+            del annotation_d["annotation_type"]
             # logging where the annotation starts to disambiguate errors on the same string in different places
             annotation_d["start"] = start_pos
             annotation_list.append(annotation_d)
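For reference, a minimal sketch of the renamed round trip (not part of the diff; assumes only `pydantic` and Python 3.9+): a response following the updated prompts validates against the renamed `SpanAnnotation` schema, and the backward-compatibility step from `parse_annotations` maps `annotation_type` back to the legacy `"type"` key. The `raw` payload is a made-up fragment based on the prompt's own example.

```python
import json

from pydantic import BaseModel, Field


# Mirrors the schema defined in factgenie/models.py after this diff.
class SpanAnnotation(BaseModel):
    text: str = Field(description="The text which is annotated.")
    annotation_type: int = Field(
        description="Index to the list of span annotation types defined for the annotation campaign."
    )
    reason: str = Field(description="The reason for the annotation.")


class OutputAnnotations(BaseModel):
    annotations: list[SpanAnnotation] = Field(description="The list of annotations.")


# Hypothetical model response, shaped like the example in the prompt templates.
raw = (
    '{"annotations": [{"reason": "The data mentions that the display has '
    'resolution 320x240px.", "text": "320x320", "annotation_type": 0}]}'
)

parsed = OutputAnnotations(**json.loads(raw))
for annotation in parsed.annotations:
    annotation_d = annotation.dict()
    # Same backward-compatibility mapping as parse_annotations():
    # downstream consumers still expect the shorter "type" key.
    annotation_d["type"] = annotation_d.pop("annotation_type")
    print(annotation_d)
    # -> {'text': '320x320', 'reason': '...', 'type': 0}
```

Because the prompts and the Pydantic schema now use the same field name, a model that echoes the schema verbatim parses without aliasing; only the stored dictionaries keep the legacy `"type"` key.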