diff --git a/docs/source/notebooks/tool_usage/intro.ipynb b/docs/source/notebooks/tool_usage/intro.ipynb index 06d89d9..b228b8b 100644 --- a/docs/source/notebooks/tool_usage/intro.ipynb +++ b/docs/source/notebooks/tool_usage/intro.ipynb @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "3b9b82fc-b689-4a25-b718-99ecc2fc6867", "metadata": { "tags": [] @@ -136,19 +136,21 @@ "Each example is composed of a question, a reference answer, and information about the sequence in which tools should be used to answer the question.\n", "\n", "Success is measured by the ability to answer the question correctly, and efficiently. \n", - "
Name | Tool Usage - Typewriter (26 tools) |
Type | ToolUsageTask |
Dataset ID | 128af05e-aa00-4e3b-a958-d166dd450581 |
Description | Environment with 26 tools each tool represents a letter of the alphabet.\n", + "\n", + "The objective of this task is to evaluate the model's ability to use tools\n", + "for a simple repetition task.\n", + "\n", + "For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked in that order.\n", + "\n", + "The dataset includes examples of varying difficulty. The difficulty is measured by the length of the string.\n", + "\n", + "This is a variation of the typewriter task, where 26 parameterless tools are\n", + "given instead of a single tool that takes a letter as an argument. |
\n", - " | feedback.correctness | \n", - "feedback.Intermediate steps correctness | \n", - "execution_time | \n", - "feedback.# steps / # expected steps | \n", - "# correct | \n", - "n | \n", - "
---|---|---|---|---|---|---|
model | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
gpt-3.5-turbo-0613 | \n", - "0.8 | \n", - "0.8 | \n", - "7.992928 | \n", - "1.03333 | \n", - "8.0 | \n", - "10 | \n", - "
gpt-3.5-turbo-1106 | \n", - "0.6 | \n", - "0.6 | \n", - "8.933172 | \n", - "0.93332 | \n", - "6.0 | \n", - "10 | \n", - "
gpt-4-0613 | \n", - "0.5 | \n", - "0.6 | \n", - "8.329558 | \n", - "0.76666 | \n", - "5.0 | \n", - "10 | \n", - "
\n", - " | input.question | \n", - "model | \n", - "actual_steps | \n", - "reference.expected_steps | \n", - "outputs.output | \n", - "reference.reference | \n", - "feedback.correctness | \n", - "num_expected_steps | \n", - "
---|---|---|---|---|---|---|---|---|
example_id | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-3.5-turbo-0613 | \n", - "[add] | \n", - "[add] | \n", - "The sum of 2 and 3 in this alternate mathemati... | \n", - "6.20 | \n", - "1.0 | \n", - "1 | \n", - "
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-3.5-turbo-1106 | \n", - "[add] | \n", - "[add] | \n", - "The result of adding 2 and 3 is 6.2. | \n", - "6.20 | \n", - "1.0 | \n", - "1 | \n", - "
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-4-0613 | \n", - "[add] | \n", - "[add] | \n", - "6.2 | \n", - "6.20 | \n", - "1.0 | \n", - "1 | \n", - "
2d3e1665-7b3f-4013-b010-6af30ed62ab2 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "gpt-3.5-turbo-0613 | \n", - "[add, multiply] | \n", - "[add, multiply] | \n", - "You ate a total of 32.34 fruits. | \n", - "32.34 | \n", - "1.0 | \n", - "2 | \n", - "
2d3e1665-7b3f-4013-b010-6af30ed62ab2 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "gpt-3.5-turbo-1106 | \n", - "[add] | \n", - "[add, multiply] | \n", - "You ate 16.2 fruits. | \n", - "32.34 | \n", - "0.0 | \n", - "2 | \n", - "
\n", - " | input.question | \n", - "model | \n", - "actual_steps | \n", - "reference.expected_steps | \n", - "outputs.output | \n", - "reference.reference | \n", - "feedback.correctness | \n", - "num_expected_steps | \n", - "
---|---|---|---|---|---|---|---|---|
example_id | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-3.5-turbo-0613 | \n", - "[add] | \n", - "[add] | \n", - "The sum of 2 and 3 in this alternate mathemati... | \n", - "6.200000 | \n", - "1.0 | \n", - "1 | \n", - "
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-3.5-turbo-1106 | \n", - "[add] | \n", - "[add] | \n", - "The result of adding 2 and 3 is 6.2. | \n", - "6.200000 | \n", - "1.0 | \n", - "1 | \n", - "
20ea2f0e-b306-474a-8daa-f4386cc16599 | \n", - "Add 2 and 3 | \n", - "gpt-4-0613 | \n", - "[add] | \n", - "[add] | \n", - "6.2 | \n", - "6.200000 | \n", - "1.0 | \n", - "1 | \n", - "
2d3e1665-7b3f-4013-b010-6af30ed62ab2 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "gpt-3.5-turbo-0613 | \n", - "[add, multiply] | \n", - "[add, multiply] | \n", - "You ate a total of 32.34 fruits. | \n", - "32.340000 | \n", - "1.0 | \n", - "2 | \n", - "
2d3e1665-7b3f-4013-b010-6af30ed62ab2 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "gpt-3.5-turbo-1106 | \n", - "[add] | \n", - "[add, multiply] | \n", - "You ate 16.2 fruits. | \n", - "32.340000 | \n", - "0.0 | \n", - "2 | \n", - "
2d3e1665-7b3f-4013-b010-6af30ed62ab2 | \n", - "I ate 1 apple and 2 oranges every day for 7 da... | \n", - "gpt-4-0613 | \n", - "[add, multiply] | \n", - "[add, multiply] | \n", - "32.34 | \n", - "32.340000 | \n", - "1.0 | \n", - "2 | \n", - "
c857031a-6ab1-4b06-9638-3a8a4ba69f11 | \n", - "Subtract 3 from 2 | \n", - "gpt-3.5-turbo-0613 | \n", - "[subtract] | \n", - "[subtract] | \n", - "The result of subtracting 3 from 2 in this alt... | \n", - "-4.000000 | \n", - "1.0 | \n", - "1 | \n", - "
c857031a-6ab1-4b06-9638-3a8a4ba69f11 | \n", - "Subtract 3 from 2 | \n", - "gpt-3.5-turbo-1106 | \n", - "[subtract] | \n", - "[subtract] | \n", - "The result of subtracting 3 from 2 is -4. | \n", - "-4.000000 | \n", - "1.0 | \n", - "1 | \n", - "
c857031a-6ab1-4b06-9638-3a8a4ba69f11 | \n", - "Subtract 3 from 2 | \n", - "gpt-4-0613 | \n", - "[subtract] | \n", - "[subtract] | \n", - "-4.0 | \n", - "-4.000000 | \n", - "1.0 | \n", - "1 | \n", - "
75db51d4-5c3b-4312-9eb9-b40c74eafdcd | \n", - "What is -5 if evaluated using the negate funct... | \n", - "gpt-3.5-turbo-0613 | \n", - "[negate] | \n", - "[negate] | \n", - "The result of evaluating -5 using the negate f... | \n", - "-5.000000 | \n", - "1.0 | \n", - "1 | \n", - "
75db51d4-5c3b-4312-9eb9-b40c74eafdcd | \n", - "What is -5 if evaluated using the negate funct... | \n", - "gpt-3.5-turbo-1106 | \n", - "[negate] | \n", - "[negate] | \n", - "The result of evaluating -5 using the negate f... | \n", - "-5.000000 | \n", - "1.0 | \n", - "1 | \n", - "
75db51d4-5c3b-4312-9eb9-b40c74eafdcd | \n", - "What is -5 if evaluated using the negate funct... | \n", - "gpt-4-0613 | \n", - "[negate] | \n", - "[negate] | \n", - "5 | \n", - "-5.000000 | \n", - "0.0 | \n", - "1 | \n", - "
2a20a13d-050e-4a16-84ff-22d9582f1449 | \n", - "after calculating the sin of 1.5 radians, divi... | \n", - "gpt-3.5-turbo-0613 | \n", - "[sin, cos, divide] | \n", - "[sin, cos, divide] | \n", - "The result of dividing the sine of 1.5 radians... | \n", - "0.035457 | \n", - "1.0 | \n", - "3 | \n", - "
2a20a13d-050e-4a16-84ff-22d9582f1449 | \n", - "after calculating the sin of 1.5 radians, divi... | \n", - "gpt-3.5-turbo-1106 | \n", - "[sin, cos, divide] | \n", - "[sin, cos, divide] | \n", - "The result is 0.035457422151326225. | \n", - "0.035457 | \n", - "1.0 | \n", - "3 | \n", - "
2a20a13d-050e-4a16-84ff-22d9582f1449 | \n", - "after calculating the sin of 1.5 radians, divi... | \n", - "gpt-4-0613 | \n", - "[sin, cos, divide] | \n", - "[sin, cos, divide] | \n", - "0.035457422151326225 | \n", - "0.035457 | \n", - "1.0 | \n", - "3 | \n", - "
4ac33c1a-62f0-4da4-9455-07b582f6ff52 | \n", - "calculate 101 to the power of 0.5 to 4 digits ... | \n", - "gpt-3.5-turbo-0613 | \n", - "[power, power, power, power] | \n", - "[power, round] | \n", - "The result of 101 to the power of 0.5 to 4 dig... | \n", - "102518.781200 | \n", - "0.0 | \n", - "2 | \n", - "
4ac33c1a-62f0-4da4-9455-07b582f6ff52 | \n", - "calculate 101 to the power of 0.5 to 4 digits ... | \n", - "gpt-3.5-turbo-1106 | \n", - "[power, power, power] | \n", - "[power, round] | \n", - "3.8109e+37 | \n", - "102518.781200 | \n", - "0.0 | \n", - "2 | \n", - "
4ac33c1a-62f0-4da4-9455-07b582f6ff52 | \n", - "calculate 101 to the power of 0.5 to 4 digits ... | \n", - "gpt-4-0613 | \n", - "[power] | \n", - "[power, round] | \n", - "102519 | \n", - "102518.781200 | \n", - "0.0 | \n", - "2 | \n", - "
2e82a924-8382-425e-8738-daa2d912e9fe | \n", - "convert 15 degrees to radians | \n", - "gpt-3.5-turbo-0613 | \n", - "[divide] | \n", - "[pi, multiply, divide] | \n", - "15 degrees is approximately 0.0417 radians. | \n", - "0.124588 | \n", - "0.0 | \n", - "3 | \n", - "
2e82a924-8382-425e-8738-daa2d912e9fe | \n", - "convert 15 degrees to radians | \n", - "gpt-3.5-turbo-1106 | \n", - "[pi, divide] | \n", - "[pi, multiply, divide] | \n", - "15 degrees is approximately 0.0417 radians. | \n", - "0.124588 | \n", - "0.0 | \n", - "3 | \n", - "
2e82a924-8382-425e-8738-daa2d912e9fe | \n", - "convert 15 degrees to radians | \n", - "gpt-4-0613 | \n", - "[multiply] | \n", - "[pi, multiply, divide] | \n", - "0.28797945 | \n", - "0.124588 | \n", - "0.0 | \n", - "3 | \n", - "
67867526-791a-452f-b534-ef2c1f5efd20 | \n", - "ecoli divides every 20 minutes. How many cells... | \n", - "gpt-3.5-turbo-0613 | \n", - "[divide, power, multiply] | \n", - "[divide, power, multiply] | \n", - "After 2 hours, starting with 5 cells, there wi... | \n", - "176.000000 | \n", - "1.0 | \n", - "3 | \n", - "
67867526-791a-452f-b534-ef2c1f5efd20 | \n", - "ecoli divides every 20 minutes. How many cells... | \n", - "gpt-3.5-turbo-1106 | \n", - "[divide, power] | \n", - "[divide, power, multiply] | \n", - "After 2 hours, there will be 2187 cells. | \n", - "176.000000 | \n", - "0.0 | \n", - "3 | \n", - "
67867526-791a-452f-b534-ef2c1f5efd20 | \n", - "ecoli divides every 20 minutes. How many cells... | \n", - "gpt-4-0613 | \n", - "[multiply] | \n", - "[divide, power, multiply] | \n", - "352.0 | \n", - "176.000000 | \n", - "0.0 | \n", - "3 | \n", - "
27c44572-6c67-4129-a95a-fe1509c350be | \n", - "multiply the result of (log of 100 to base 10)... | \n", - "gpt-3.5-turbo-0613 | \n", - "[log, multiply] | \n", - "[log, multiply] | \n", - "The result of multiplying the logarithm of 100... | \n", - "6.222319 | \n", - "1.0 | \n", - "2 | \n", - "
27c44572-6c67-4129-a95a-fe1509c350be | \n", - "multiply the result of (log of 100 to base 10)... | \n", - "gpt-3.5-turbo-1106 | \n", - "[log, multiply] | \n", - "[log, multiply] | \n", - "The result is 6.222318693323366 | \n", - "6.222319 | \n", - "1.0 | \n", - "2 | \n", - "
27c44572-6c67-4129-a95a-fe1509c350be | \n", - "multiply the result of (log of 100 to base 10)... | \n", - "gpt-4-0613 | \n", - "[multiply] | \n", - "[log, multiply] | \n", - "19.8 | \n", - "6.222319 | \n", - "0.0 | \n", - "2 | \n", - "
dd079541-c0da-4d94-85b7-50f0516a9ca1 | \n", - "what is the result of 2 to the power of 3? | \n", - "gpt-3.5-turbo-0613 | \n", - "[power] | \n", - "[power] | \n", - "The result of 2 to the power of 3 is 32. | \n", - "32.000000 | \n", - "1.0 | \n", - "1 | \n", - "
dd079541-c0da-4d94-85b7-50f0516a9ca1 | \n", - "what is the result of 2 to the power of 3? | \n", - "gpt-3.5-turbo-1106 | \n", - "[power] | \n", - "[power] | \n", - "The result of 2 to the power of 3 is 32. | \n", - "32.000000 | \n", - "1.0 | \n", - "1 | \n", - "
dd079541-c0da-4d94-85b7-50f0516a9ca1 | \n", - "what is the result of 2 to the power of 3? | \n", - "gpt-4-0613 | \n", - "[power] | \n", - "[power] | \n", - "32.0 | \n", - "32.000000 | \n", - "1.0 | \n", - "1 | \n", - "
\n", - " | feedback.correctness | \n", - "feedback.Intermediate steps correctness | \n", - "execution_time | \n", - "feedback.# steps / # expected steps | \n", - "n | \n", - "
---|---|---|---|---|---|
model | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
gpt-3.5-turbo-0613 | \n", - "0.714286 | \n", - "0.714286 | \n", - "4.829506 | \n", - "0.825390 | \n", - "21 | \n", - "
gpt-3.5-turbo-1106 | \n", - "0.857143 | \n", - "0.714286 | \n", - "5.464218 | \n", - "0.965871 | \n", - "21 | \n", - "
gpt-4-0613 | \n", - "0.952381 | \n", - "0.714286 | \n", - "8.544358 | \n", - "1.037300 | \n", - "21 | \n", - "
\n", - " | feedback.Correct Final State | \n", - "feedback.Intermediate steps correctness | \n", - "execution_time | \n", - "feedback.# steps / # expected steps | \n", - "# correct | \n", - "n | \n", - "
---|---|---|---|---|---|---|
model | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
gpt-3.5-turbo-0613 | \n", - "0.95 | \n", - "0.95 | \n", - "18.880388 | \n", - "1.700000 | \n", - "19.0 | \n", - "20 | \n", - "
gpt-3.5-turbo-1106 | \n", - "0.90 | \n", - "0.75 | \n", - "22.471857 | \n", - "1.012455 | \n", - "18.0 | \n", - "20 | \n", - "
gpt-4-0613 | \n", - "0.90 | \n", - "0.90 | \n", - "22.663781 | \n", - "1.093750 | \n", - "18.0 | \n", - "20 | \n", - "
\n", - " | input.question | \n", - "model | \n", - "outputs.state | \n", - "reference.state | \n", - "feedback.Correct Final State | \n", - "num_expected_steps | \n", - "num_actual_steps | \n", - "
---|---|---|---|---|---|---|---|
example_id | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-3.5-turbo-0613 | \n", - "aaaaaaaaaaaaaaa | \n", - "a | \n", - "0.0 | \n", - "1 | \n", - "15 | \n", - "
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-3.5-turbo-1106 | \n", - "a | \n", - "a | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-4-0613 | \n", - "abc | \n", - "a | \n", - "0.0 | \n", - "1 | \n", - "3 | \n", - "
5b40cb96-ae09-438e-b940-d24445bb5d67 | \n", - "aa | \n", - "gpt-3.5-turbo-0613 | \n", - "aa | \n", - "aa | \n", - "1.0 | \n", - "2 | \n", - "2 | \n", - "
5b40cb96-ae09-438e-b940-d24445bb5d67 | \n", - "aa | \n", - "gpt-3.5-turbo-1106 | \n", - "aa | \n", - "aa | \n", - "1.0 | \n", - "2 | \n", - "2 | \n", - "
\n", - " | input.question | \n", - "model | \n", - "outputs.state | \n", - "reference.state | \n", - "feedback.Correct Final State | \n", - "num_expected_steps | \n", - "num_actual_steps | \n", - "
---|---|---|---|---|---|---|---|
example_id | \n", - "\n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " | \n", - " |
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-3.5-turbo-0613 | \n", - "aaaaaaaaaaaaaaa | \n", - "a | \n", - "0.0 | \n", - "1 | \n", - "15 | \n", - "
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-3.5-turbo-1106 | \n", - "a | \n", - "a | \n", - "1.0 | \n", - "1 | \n", - "1 | \n", - "
89bb564a-ddee-4a36-8a3d-d093eef415ca | \n", - "a | \n", - "gpt-4-0613 | \n", - "abc | \n", - "a | \n", - "0.0 | \n", - "1 | \n", - "3 | \n", - "
5b40cb96-ae09-438e-b940-d24445bb5d67 | \n", - "aa | \n", - "gpt-3.5-turbo-0613 | \n", - "aa | \n", - "aa | \n", - "1.0 | \n", - "2 | \n", - "2 | \n", - "
5b40cb96-ae09-438e-b940-d24445bb5d67 | \n", - "aa | \n", - "gpt-3.5-turbo-1106 | \n", - "aa | \n", - "aa | \n", - "1.0 | \n", - "2 | \n", - "2 | \n", - "
5b40cb96-ae09-438e-b940-d24445bb5d67 | \n", - "aa | \n", - "gpt-4-0613 | \n", - "aa | \n", - "aa | \n", - "1.0 | \n", - "2 | \n", - "2 | \n", - "
288d6483-c618-4e34-9b86-275b490e0975 | \n", - "aaa | \n", - "gpt-3.5-turbo-0613 | \n", - "aaa | \n", - "aaa | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
288d6483-c618-4e34-9b86-275b490e0975 | \n", - "aaa | \n", - "gpt-3.5-turbo-1106 | \n", - "aaa | \n", - "aaa | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
288d6483-c618-4e34-9b86-275b490e0975 | \n", - "aaa | \n", - "gpt-4-0613 | \n", - "aaa | \n", - "aaa | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
915bd4b5-a536-4849-8cb6-8a658407c2c9 | \n", - "aaaa | \n", - "gpt-3.5-turbo-0613 | \n", - "aaaa | \n", - "aaaa | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
915bd4b5-a536-4849-8cb6-8a658407c2c9 | \n", - "aaaa | \n", - "gpt-3.5-turbo-1106 | \n", - "aaaa | \n", - "aaaa | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
915bd4b5-a536-4849-8cb6-8a658407c2c9 | \n", - "aaaa | \n", - "gpt-4-0613 | \n", - "aaaa | \n", - "aaaa | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
1cb7a14d-cc7d-44f1-ab47-394f8221abee | \n", - "cat | \n", - "gpt-3.5-turbo-0613 | \n", - "cat | \n", - "cat | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
1cb7a14d-cc7d-44f1-ab47-394f8221abee | \n", - "cat | \n", - "gpt-3.5-turbo-1106 | \n", - "cat | \n", - "cat | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
1cb7a14d-cc7d-44f1-ab47-394f8221abee | \n", - "cat | \n", - "gpt-4-0613 | \n", - "cat | \n", - "cat | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
5b409366-ee6a-4bdb-b842-5e71d3407a05 | \n", - "church | \n", - "gpt-3.5-turbo-0613 | \n", - "church | \n", - "church | \n", - "1.0 | \n", - "6 | \n", - "6 | \n", - "
5b409366-ee6a-4bdb-b842-5e71d3407a05 | \n", - "church | \n", - "gpt-3.5-turbo-1106 | \n", - "church | \n", - "church | \n", - "1.0 | \n", - "6 | \n", - "7 | \n", - "
5b409366-ee6a-4bdb-b842-5e71d3407a05 | \n", - "church | \n", - "gpt-4-0613 | \n", - "church | \n", - "church | \n", - "1.0 | \n", - "6 | \n", - "6 | \n", - "
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a | \n", - "communication | \n", - "gpt-3.5-turbo-0613 | \n", - "communication | \n", - "communication | \n", - "1.0 | \n", - "13 | \n", - "13 | \n", - "
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a | \n", - "communication | \n", - "gpt-3.5-turbo-1106 | \n", - "communication | \n", - "communication | \n", - "1.0 | \n", - "13 | \n", - "13 | \n", - "
c17e9d5a-b9f8-43dc-b5a9-6e45d21c9a2a | \n", - "communication | \n", - "gpt-4-0613 | \n", - "communication | \n", - "communication | \n", - "1.0 | \n", - "13 | \n", - "13 | \n", - "
5cf28d08-a49f-4a69-8759-b1b774ef74b1 | \n", - "computer | \n", - "gpt-3.5-turbo-0613 | \n", - "computer | \n", - "computer | \n", - "1.0 | \n", - "8 | \n", - "8 | \n", - "
5cf28d08-a49f-4a69-8759-b1b774ef74b1 | \n", - "computer | \n", - "gpt-3.5-turbo-1106 | \n", - "computer | \n", - "computer | \n", - "1.0 | \n", - "8 | \n", - "9 | \n", - "
5cf28d08-a49f-4a69-8759-b1b774ef74b1 | \n", - "computer | \n", - "gpt-4-0613 | \n", - "computer | \n", - "computer | \n", - "1.0 | \n", - "8 | \n", - "8 | \n", - "
9017ddcc-d3bd-45a8-88dd-70906964586b | \n", - "dictionary | \n", - "gpt-3.5-turbo-0613 | \n", - "dictionary | \n", - "dictionary | \n", - "1.0 | \n", - "10 | \n", - "10 | \n", - "
9017ddcc-d3bd-45a8-88dd-70906964586b | \n", - "dictionary | \n", - "gpt-3.5-turbo-1106 | \n", - "diction | \n", - "dictionary | \n", - "0.0 | \n", - "10 | \n", - "7 | \n", - "
9017ddcc-d3bd-45a8-88dd-70906964586b | \n", - "dictionary | \n", - "gpt-4-0613 | \n", - "dictionary | \n", - "dictionary | \n", - "1.0 | \n", - "10 | \n", - "10 | \n", - "
b1ac4715-a0ad-48f2-8741-949ca23b39eb | \n", - "dog | \n", - "gpt-3.5-turbo-0613 | \n", - "dog | \n", - "dog | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
b1ac4715-a0ad-48f2-8741-949ca23b39eb | \n", - "dog | \n", - "gpt-3.5-turbo-1106 | \n", - "dog | \n", - "dog | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
b1ac4715-a0ad-48f2-8741-949ca23b39eb | \n", - "dog | \n", - "gpt-4-0613 | \n", - "dog | \n", - "dog | \n", - "1.0 | \n", - "3 | \n", - "3 | \n", - "
10d42048-ac73-414f-9f50-dba79c3b74a7 | \n", - "hand | \n", - "gpt-3.5-turbo-0613 | \n", - "hand | \n", - "hand | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
10d42048-ac73-414f-9f50-dba79c3b74a7 | \n", - "hand | \n", - "gpt-3.5-turbo-1106 | \n", - "hand | \n", - "hand | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
10d42048-ac73-414f-9f50-dba79c3b74a7 | \n", - "hand | \n", - "gpt-4-0613 | \n", - "hand | \n", - "hand | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
daf06d4f-9b1d-4f5a-8aa9-09f885a79adb | \n", - "head | \n", - "gpt-3.5-turbo-0613 | \n", - "head | \n", - "head | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
daf06d4f-9b1d-4f5a-8aa9-09f885a79adb | \n", - "head | \n", - "gpt-3.5-turbo-1106 | \n", - "head | \n", - "head | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
daf06d4f-9b1d-4f5a-8aa9-09f885a79adb | \n", - "head | \n", - "gpt-4-0613 | \n", - "head | \n", - "head | \n", - "1.0 | \n", - "4 | \n", - "4 | \n", - "
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f | \n", - "horse | \n", - "gpt-3.5-turbo-0613 | \n", - "horse | \n", - "horse | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f | \n", - "horse | \n", - "gpt-3.5-turbo-1106 | \n", - "horse | \n", - "horse | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
fb5a07d7-8ee1-4ee7-b120-3e1a2e167f3f | \n", - "horse | \n", - "gpt-4-0613 | \n", - "horse | \n", - "horse | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
5daad87c-a008-49ab-841c-76916b150f4d | \n", - "house | \n", - "gpt-3.5-turbo-0613 | \n", - "house | \n", - "house | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
5daad87c-a008-49ab-841c-76916b150f4d | \n", - "house | \n", - "gpt-3.5-turbo-1106 | \n", - "house | \n", - "house | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
5daad87c-a008-49ab-841c-76916b150f4d | \n", - "house | \n", - "gpt-4-0613 | \n", - "house | \n", - "house | \n", - "1.0 | \n", - "5 | \n", - "5 | \n", - "
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 | \n", - "information | \n", - "gpt-3.5-turbo-0613 | \n", - "information | \n", - "information | \n", - "1.0 | \n", - "11 | \n", - "11 | \n", - "
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 | \n", - "information | \n", - "gpt-3.5-turbo-1106 | \n", - "information! | \n", - "information | \n", - "0.0 | \n", - "11 | \n", - "12 | \n", - "
57e29316-e258-4ed9-bbeb-b23c8bcb4bd2 | \n", - "information | \n", - "gpt-4-0613 | \n", - "information | \n", - "information | \n", - "1.0 | \n", - "11 | \n", - "11 | \n", - "
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 | \n", - "keyboard | \n", - "gpt-3.5-turbo-0613 | \n", - "keyboard | \n", - "keyboard | \n", - "1.0 | \n", - "8 | \n", - "8 | \n", - "
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 | \n", - "keyboard | \n", - "gpt-3.5-turbo-1106 | \n", - "keyboard | \n", - "keyboard | \n", - "1.0 | \n", - "8 | \n", - "8 | \n", - "
5ad3a4b3-5207-4a1c-9423-e6cdc3658e49 | \n", - "keyboard | \n", - "gpt-4-0613 | \n", - "eyboard | \n", - "keyboard | \n", - "0.0 | \n", - "8 | \n", - "7 | \n", - "
a4ffa874-b03b-40ed-b360-d17c963ef27e | \n", - "school | \n", - "gpt-3.5-turbo-0613 | \n", - "school | \n", - "school | \n", - "1.0 | \n", - "6 | \n", - "6 | \n", - "
a4ffa874-b03b-40ed-b360-d17c963ef27e | \n", - "school | \n", - "gpt-3.5-turbo-1106 | \n", - "school | \n", - "school | \n", - "1.0 | \n", - "6 | \n", - "7 | \n", - "
a4ffa874-b03b-40ed-b360-d17c963ef27e | \n", - "school | \n", - "gpt-4-0613 | \n", - "school | \n", - "school | \n", - "1.0 | \n", - "6 | \n", - "6 | \n", - "
8a2b5450-dd16-4213-8b70-cb2583d6c7eb | \n", - "student | \n", - "gpt-3.5-turbo-0613 | \n", - "student | \n", - "student | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
8a2b5450-dd16-4213-8b70-cb2583d6c7eb | \n", - "student | \n", - "gpt-3.5-turbo-1106 | \n", - "student | \n", - "student | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
8a2b5450-dd16-4213-8b70-cb2583d6c7eb | \n", - "student | \n", - "gpt-4-0613 | \n", - "student | \n", - "student | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
223f250b-9c33-4aed-adfd-791547b44d3d | \n", - "teacher | \n", - "gpt-3.5-turbo-0613 | \n", - "teacher | \n", - "teacher | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
223f250b-9c33-4aed-adfd-791547b44d3d | \n", - "teacher | \n", - "gpt-3.5-turbo-1106 | \n", - "teacher | \n", - "teacher | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
223f250b-9c33-4aed-adfd-791547b44d3d | \n", - "teacher | \n", - "gpt-4-0613 | \n", - "teacher | \n", - "teacher | \n", - "1.0 | \n", - "7 | \n", - "7 | \n", - "
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a | \n", - "university | \n", - "gpt-3.5-turbo-0613 | \n", - "university | \n", - "university | \n", - "1.0 | \n", - "10 | \n", - "10 | \n", - "
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a | \n", - "university | \n", - "gpt-3.5-turbo-1106 | \n", - "university | \n", - "university | \n", - "1.0 | \n", - "10 | \n", - "10 | \n", - "
08a17b26-0b33-4d4f-b7f6-ee44d51f4a4a | \n", - "university | \n", - "gpt-4-0613 | \n", - "university | \n", - "university | \n", - "1.0 | \n", - "10 | \n", - "10 | \n", - "