add tests to trajectory eval chain #8909

Merged
@@ -5,6 +5,7 @@
 chain (LLMChain) to generate the reasoning and scores.
 """
 
+import re
 from typing import (
     Any,
     Dict,
@@ -74,15 +75,24 @@ def parse(self, text: str) -> TrajectoryEval:
 
         reasoning, score_str = reasoning.strip(), score_str.strip()
 
-        score_str = next(
-            (char for char in score_str if char.isdigit()), "0"
-        )  # Scan for first digit
-
-        if not 1 <= int(score_str) <= 5:
+        # Use regex to extract the score.
+        # This will get the number in the string, even if it is a float or more than 10.
+        # E.g. "Score: 1" will return 1, "Score: 3.5" will return 3.5, and
+        # "Score: 10" will return 10.
+        # The score should be an integer digit in the range 1-5.
+        _score = re.search(r"(\d+(\.\d+)?)", score_str)
+        # If the score is not found or is a float, raise an exception.
+        if _score is None or "." in _score.group(1):
+            raise OutputParserException(
+                f"Score is not an integer digit in the range 1-5: {text}"
+            )
+        score = int(_score.group(1))
+        # If the score is not in the range 1-5, raise an exception.
+        if not 1 <= score <= 5:
             raise OutputParserException(
                 f"Score is not a digit in the range 1-5: {text}"
             )
-        normalized_score = (int(score_str) - 1) / 4
+        normalized_score = (score - 1) / 4
         return TrajectoryEval(score=normalized_score, reasoning=reasoning)
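For context on the hunk above (not part of the diff): the old parser kept only the first digit it found, so a reply like "Score: 10" passed the 1-5 range check as 1 and "Score: 3.5" as 3, while the regex version rejects both. A minimal standalone sketch of the two extraction strategies, with ValueError standing in for OutputParserException:

import re


def old_extract(score_str: str) -> int:
    # Old behaviour: scan for the first digit, so "10" yields 1 and "3.5" yields 3,
    # both of which then slip past the 1-5 range check.
    digit = next((char for char in score_str if char.isdigit()), "0")
    if not 1 <= int(digit) <= 5:
        raise ValueError(f"Score is not a digit in the range 1-5: {score_str}")
    return int(digit)


def new_extract(score_str: str) -> int:
    # New behaviour: capture the whole number, then reject floats and out-of-range values.
    match = re.search(r"(\d+(\.\d+)?)", score_str)
    if match is None or "." in match.group(1):
        raise ValueError(f"Score is not an integer digit in the range 1-5: {score_str}")
    score = int(match.group(1))
    if not 1 <= score <= 5:
        raise ValueError(f"Score is not a digit in the range 1-5: {score_str}")
    return score


print(old_extract("10"))  # 1 -- silently wrong under the old scan
print(new_extract("2"))   # 2
# new_extract("10"), new_extract("3.5"), and new_extract("One") all raise ValueError.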


@@ -6,8 +6,12 @@
 from pydantic import Field
 
 from langchain.callbacks.manager import CallbackManagerForLLMRun
-from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryEvalChain
-from langchain.schema import AgentAction, BaseMessage
+from langchain.evaluation.agents.trajectory_eval_chain import (
+    TrajectoryEval,
+    TrajectoryEvalChain,
+    TrajectoryOutputParser,
+)
+from langchain.schema import AgentAction, BaseMessage, OutputParserException
 from langchain.tools.base import tool
 from tests.unit_tests.llms.fake_chat_model import FakeChatModel
 
@@ -53,6 +57,61 @@ def _call(
         return self.queries[prompt]
 
 
+def test_trajectory_output_parser_parse() -> None:
+    trajectory_output_parser = TrajectoryOutputParser()
+    text = """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 2"""
+    got = trajectory_output_parser.parse(text)
+    want = TrajectoryEval(
+        score=0.25,
+        reasoning="""Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.""",
+    )
+
+    assert got["score"] == want["score"]
+    assert got["reasoning"] == want["reasoning"]
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2."""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 9"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 10"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: 0.1"""
+        )
+
+    with pytest.raises(OutputParserException):
+        trajectory_output_parser.parse(
+            """Judgment: Given the good reasoning in the final answer
+but otherwise poor performance, we give the model a score of 2.
+
+Score: One"""
+        )
+
+
 def test_trajectory_eval_chain(
     intermediate_steps: List[Tuple[AgentAction, str]]
 ) -> None:
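As a usage note (not part of the diff): the new test calls TrajectoryOutputParser directly, and a minimal standalone call looks roughly like the sketch below. This assumes the parser accepts any text ending in a "Score: N" line, since the part of parse() that splits reasoning from the score is above the hunk shown in this diff.

from langchain.evaluation.agents.trajectory_eval_chain import TrajectoryOutputParser

parser = TrajectoryOutputParser()
result = parser.parse("The agent used the right tool and answered correctly.\n\nScore: 4")
# The score is validated as an integer in 1-5, then normalized via (score - 1) / 4.
assert result["score"] == 0.75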