-
-
Notifications
You must be signed in to change notification settings - Fork 841
157 lines (142 loc) · 5.4 KB
/
run_evals.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Workflow that evaluates Khoj against benchmark datasets.
name: eval

on:
  # Run on every release (a tag push matching the pattern below)
  push:
    tags:
      - "*"
  # Allow manual triggers from GitHub UI
  workflow_dispatch:
    inputs:
      # Khoj mode to evaluate; exposed to the eval step as KHOJ_MODE
      khoj_mode:
        description: 'Khoj Mode (general/default/research)'
        required: true
        default: 'default'
        type: choice
        options:
          - general
          - default
          - research
      # Benchmark dataset to run; keep the description in sync with the options
      dataset:
        description: 'Dataset to evaluate (frames/simpleqa/gpqa/math500)'
        required: true
        default: 'frames'
        type: choice
        options:
          - frames
          - simpleqa
          - gpqa
          - math500
      # Optional sample count; exposed to the eval step as SAMPLE_SIZE
      sample_size:
        description: 'Number of samples to evaluate'
        required: false
        default: 200
        type: number
jobs:
  # Starts a Khoj server and a code sandbox on the runner, runs the eval
  # script against them, then publishes the result files and a run summary.
  eval:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Use input from manual trigger if available. Otherwise (tag push) run
        # every Khoj mode against the frames and simpleqa datasets.
        khoj_mode: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.khoj_mode)) || fromJSON('["general", "default", "research"]') }}
        dataset: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.dataset)) || fromJSON('["frames", "simpleqa"]') }}

    services:
      # Postgres service container, published on the runner's port 5432.
      # Credentials match the POSTGRES_* variables passed to the eval step.
      postgres:
        image: ankane/pgvector
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - 5432:5432
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5

    steps:
      - uses: actions/checkout@v3
        with:
          # Fetch full history and tags
          fetch-depth: 0

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Get App Version
        id: hatch
        run: echo "version=$(pipx run hatch version)" >> "$GITHUB_OUTPUT"

      - name: ⏬️ Install Dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          # install postgres and other dependencies
          sudo apt update && sudo apt install -y git python3-pip libegl1 sqlite3 libsqlite3-dev libsqlite3-0 ffmpeg libsm6 libxext6
          sudo apt install -y postgresql postgresql-client && sudo apt install -y postgresql-server-dev-14
          # upgrade pip
          python -m ensurepip --upgrade && python -m pip install --upgrade pip
          # install terrarium for code sandbox (cloned into ./terrarium)
          git clone https://github.com/khoj-ai/terrarium.git && cd terrarium && npm install --legacy-peer-deps && mkdir pyodide_cache

      - name: ⬇️ Install Application
        run: |
          # Replace the dynamic version in pyproject.toml with the resolved version before installing
          sed -i 's/dynamic = \["version"\]/version = "${{ steps.hatch.outputs.version }}"/' pyproject.toml
          pip install --upgrade .[dev]

      - name: 📝 Run Eval
        env:
          KHOJ_MODE: ${{ matrix.khoj_mode }}
          SAMPLE_SIZE: ${{ github.event_name == 'workflow_dispatch' && inputs.sample_size || 200 }}
          BATCH_SIZE: "20"
          RANDOMIZE: "True"
          KHOJ_URL: "http://localhost:42110"
          KHOJ_LLM_SEED: "42"
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          # These two keys are withheld for the math500 dataset.
          # NOTE(review): presumably so that eval runs without online lookups - confirm.
          # The `|| ''` fallback matters: without it the expression evaluates to
          # boolean false, which reaches the process as the non-empty string "false".
          SERPER_DEV_API_KEY: ${{ matrix.dataset != 'math500' && secrets.SERPER_DEV_API_KEY || '' }}
          OLOSTEP_API_KEY: ${{ matrix.dataset != 'math500' && secrets.OLOSTEP_API_KEY || '' }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          KHOJ_ADMIN_EMAIL: khoj
          KHOJ_ADMIN_PASSWORD: khoj
          POSTGRES_HOST: localhost
          POSTGRES_PORT: 5432
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
          POSTGRES_DB: postgres
          KHOJ_TELEMETRY_DISABLE: "True" # To disable telemetry for tests
        run: |
          # Start Khoj server in background
          khoj --anonymous-mode --non-interactive &
          # Start code sandbox from the directory cloned in the install step
          npm run dev --prefix terrarium &
          # Wait for server to be ready (give up after roughly 120 seconds)
          timeout=120
          while ! curl -s http://localhost:42110/api/health > /dev/null; do
            if [ "$timeout" -le 0 ]; then
              echo "Timed out waiting for Khoj server"
              exit 1
            fi
            echo "Waiting for Khoj server..."
            sleep 2
            timeout=$((timeout-2))
          done
          # Run evals
          python tests/evals/eval.py -d ${{ matrix.dataset }}

      - name: Upload Results
        if: always() # Upload results even if tests fail
        # v3 of this action is deprecated and no longer runs on github.com.
        # The artifact name below is unique per matrix cell, as v4 requires.
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ steps.hatch.outputs.version }}-${{ matrix.khoj_mode }}-${{ matrix.dataset }}
          path: |
            *_evaluation_results_*.csv
            *_evaluation_summary_*.txt

      - name: Display Results
        if: always()
        run: |
          # Read and display summary
          echo "## Evaluation Summary of Khoj on ${{ matrix.dataset }} in ${{ matrix.khoj_mode }} mode" >> "$GITHUB_STEP_SUMMARY"
          # First line of the summary file becomes the bold headline
          echo "**$(head -n 1 *_evaluation_summary_*.txt)**" >> "$GITHUB_STEP_SUMMARY"
          echo "- Khoj Version: ${{ steps.hatch.outputs.version }}" >> "$GITHUB_STEP_SUMMARY"
          echo "- Chat Model: Gemini 1.5 Flash 002" >> "$GITHUB_STEP_SUMMARY"
          # Remaining lines go inside a fenced code block
          echo "\`\`\`" >> "$GITHUB_STEP_SUMMARY"
          tail -n +2 *_evaluation_summary_*.txt >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "\`\`\`" >> "$GITHUB_STEP_SUMMARY"
          # Display in logs too
          echo "===== EVALUATION RESULTS ====="
          cat *_evaluation_summary_*.txt