-
Notifications
You must be signed in to change notification settings - Fork 2
/
run_pairwise_eval_llms.sh
33 lines (27 loc) · 1.07 KB
/
run_pairwise_eval_llms.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env bash
##### This script compares a pair of LLMs' outputs ######
# For each challenger model and each domain, runs the pairwise judge
# (evaluate_pair_responses.py) comparing model1's answers against model2's,
# using ColBERT-retrieved passages and an OpenAI judge model.
set -euo pipefail

model1=gpt-4o
# you need to update your access key here
openai_key=''
eval_model=gpt-4-0125-preview
n_passages=5
domains=(bioasq fiqa recreation technology science writing lifestyle)

# Fail fast if the placeholder key was never filled in — otherwise every
# domain/model iteration would fail at the API call instead.
if [[ -z "${openai_key}" ]]; then
  echo "error: openai_key is empty; edit this script and set your API key" >&2
  exit 1
fi

for model2 in gpt-4-turbo mistralai/Mixtral-8x22B-Instruct-v0.1; do
  for i in "${!domains[@]}"; do
    echo "evaluating ${model1} and ${model2} using ${eval_model} for ${domains[i]}"
    # All expansions quoted: model names contain '/' and keys may contain
    # shell-significant characters (SC2086).
    python code/evaluate_pair_responses.py \
      --eval_dir "data/pairwise_eval" \
      --model "${eval_model}" \
      --eval_model1 "${model1}" \
      --eval_model2 "${model2}" \
      --model1_pred_file "${model1}/${domains[i]}_from_colbert_${n_passages}_psgs" \
      --model2_pred_file "${model2}/${domains[i]}_from_colbert_${n_passages}_psgs" \
      --reference_file references \
      --template_config pairwise_lfrqa.cfg \
      --domain "${domains[i]}" \
      --temperature 0.0 \
      --eval_input_save_dir eval_inputs/from_colbert \
      --api_key "${openai_key}"
  done
done