diff --git a/.github/workflows/evaluate.yaml b/.github/workflows/evaluate.yaml
index a12cd2d..381b7d3 100644
--- a/.github/workflows/evaluate.yaml
+++ b/.github/workflows/evaluate.yaml
@@ -170,20 +170,26 @@ jobs:
       - name: Summarize results
         if: ${{ success() }}
         run: |
-          echo "📊 Evaluation Results" >> $GITHUB_STEP_SUMMARY
-          python -m evaltools summary evals/results --output=markdown >> eval-results.md
-          cat eval-results.md >> $GITHUB_STEP_SUMMARY
+          echo "## Evaluation results" >> eval-summary.md
+          python -m evaltools summary evals/results --output=markdown >> eval-summary.md
+          echo "## Answer differences across runs" >> run-diff.md
+          python -m evaltools diff evals/results/baseline evals/results/pr${{ github.event.issue.number }} --output=markdown >> run-diff.md
+          cat eval-summary.md >> $GITHUB_STEP_SUMMARY
+          cat run-diff.md >> $GITHUB_STEP_SUMMARY
       - name: Comment on pull request
         uses: actions/github-script@v7
         with:
           script: |
             const fs = require('fs');
-            const summaryPath = "eval-results.md";
+            const summaryPath = "eval-summary.md";
             const summary = fs.readFileSync(summaryPath, 'utf8');
+            const runId = process.env.GITHUB_RUN_ID;
+            const repo = process.env.GITHUB_REPOSITORY;
+            const actionsUrl = `https://github.com/${repo}/actions/runs/${runId}`;
             github.rest.issues.createComment({
               issue_number: context.issue.number,
               owner: context.repo.owner,
               repo: context.repo.repo,
-              body: summary
+              body: `${summary}\n\n[Check the Actions tab for more details](${actionsUrl}).`
             })
diff --git a/docs/evaluation.md b/docs/evaluation.md
index ba63843..de62cee 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -2,6 +2,13 @@
 
 Follow these steps to evaluate the quality of the answers generated by the RAG flow.
 
+* [Deploy a GPT-4 model](#deploy-a-gpt-4-model)
+* [Setup the evaluation environment](#setup-the-evaluation-environment)
+* [Generate ground truth data](#generate-ground-truth-data)
+* [Run bulk evaluation](#run-bulk-evaluation)
+* [Review the evaluation results](#review-the-evaluation-results)
+* [Run bulk evaluation on a PR](#run-bulk-evaluation-on-a-pr)
+
 ## Deploy a GPT-4 model
 
@@ -45,7 +52,7 @@ python evals/generate_ground_truth_data.py
 
 Review the generated data after running that script, removing any question/answer pairs that don't seem like realistic user input.
 
-## Evaluate the RAG answer quality
+## Run bulk evaluation
 
 Review the configuration in `evals/eval_config.json` to ensure that everything is correctly setup. You may want to adjust the metrics used. See [the ai-rag-chat-evaluator README](https://github.com/Azure-Samples/ai-rag-chat-evaluator) for more information on the available metrics.
 
@@ -72,6 +79,6 @@ Compare answers across runs by running the following command:
 python -m evaltools diff evals/results/baseline/
 ```
 
-## Run the evaluation on a PR
+## Run bulk evaluation on a PR
 
 To run the evaluation on the changes in a PR, you can add a `/evaluate` comment to the PR. This will trigger the evaluation workflow to run the evaluation on the PR changes and will post the results to the PR.