#!/usr/bin/env bash
# test.sh — run BigCodeBench generation, sanitization, and evaluation for
# every split/subset combination.
#
# Usage: test.sh MODEL_DIR TP OUTPUT_DIR
#   MODEL_DIR  - path to the model checkpoint handed to generate.py
#   TP         - tensor-parallel degree for the vllm backend
#   OUTPUT_DIR - root directory for completions and evaluation artifacts
set -euo pipefail

export PATH="./bin:${PATH}"
export HF_ENDPOINT=http://hf-mirror.com
export HF_HOME=""
export HF_DATASETS_OFFLINE=1
export HF_EVALUATE_OFFLINE=1

# Fail fast with a usage message if any positional argument is missing.
export MODEL_DIR=${1:?usage: test.sh MODEL_DIR TP OUTPUT_DIR}
export TP=${2:?usage: test.sh MODEL_DIR TP OUTPUT_DIR}
export OUTPUT_DIR=${3:?usage: test.sh MODEL_DIR TP OUTPUT_DIR}
mkdir -p "${OUTPUT_DIR}"

#######################################
# Generate, sanitize, and evaluate completions for one split/subset pair.
# Globals:   MODEL_DIR, TP, OUTPUT_DIR (read)
# Arguments: $1 - split  (complete | instruct)
#            $2 - subset (hard | full)
# Outputs:   writes jsonl artifacts under
#            ${OUTPUT_DIR}/bigcodebench_<split>_<subset>/
#######################################
run_benchmark() {
  local split=$1
  local subset=$2
  local out_dir="${OUTPUT_DIR}/bigcodebench_${split}_${subset}"

  # Generate code completions (greedy decoding, single sample per task).
  # NOTE: the original left a trailing '\' before the commented-out
  # --chat_mode flag, which only worked by accident; the continuation
  # is removed here and the optional flag kept as a plain comment.
  python generate.py \
    --model "${MODEL_DIR}" \
    --split "${split}" \
    --subset "${subset}" \
    --greedy \
    --bs 1 \
    --temperature 0 \
    --n_samples 1 \
    --resume \
    --backend vllm \
    --tp "${TP}" \
    --save_path "${out_dir}/completion.jsonl"
  # Add --chat_mode above to generate with chat-formatted prompts.

  # Sanitize and calibrate the generated samples.
  python sanitize.py \
    --samples "${out_dir}/completion.jsonl" \
    --calibrate

  # Evaluate the sanitized and calibrated completions.
  python evaluate.py \
    --split "${split}" \
    --subset "${subset}" \
    --no-gt \
    --samples "${out_dir}/completion-sanitized-calibrated.jsonl"

  # Clean up stray benchmark worker processes after evaluation.
  # BUG FIXED: the original tested [ -n \"$pids\" ] with escaped quotes,
  # which compares a literal quoted string and is therefore always true.
  # pgrep replaces the ps|grep|awk chain; '|| true' keeps set -e happy
  # when no matching process exists.
  local pids
  pids=$(pgrep -u "$(id -u)" bigcodebench || true)
  if [ -n "${pids}" ]; then
    # shellcheck disable=SC2086 — word-splitting of the PID list is intended
    kill ${pids} || true
  fi
  # Intentional scratch cleanup between runs (evaluation litters /tmp).
  rm -rf /tmp/*
}

# Run benchmarks for every split/subset configuration.
run_benchmark complete hard
run_benchmark complete full
run_benchmark instruct hard
run_benchmark instruct full