#!/bin/bash
# test.sh — run BigCodeBench generation, sanitization, and evaluation.
 1  export PATH=./bin:$PATH
 2  export HF_ENDPOINT=http://hf-mirror.com
 3  export HF_HOME=""
 4  export HF_DATASETS_OFFLINE=1
 5  export HF_EVALUATE_OFFLINE=1
 6  
 7  export MODEL_DIR=${1}
 8  export TP=${2}
 9  export OUTPUT_DIR=${3}
10  mkdir -p ${OUTPUT_DIR}
11  
#######################################
# Run one BigCodeBench configuration end-to-end:
# generate completions, sanitize/calibrate them, then evaluate.
# Globals:   MODEL_DIR, TP, OUTPUT_DIR (read)
# Arguments: $1 - split (complete|instruct), $2 - subset (hard|full)
#######################################
run_benchmark() {
  local SPLIT=$1
  local SUBSET=$2
  # All artifacts for this configuration live under one directory.
  local SAVE_DIR="${OUTPUT_DIR}/bigcodebench_${SPLIT}_${SUBSET}"

  # Generate code completions (greedy decoding, one sample, vLLM backend).
  # Add --chat_mode below for chat-formatted models.
  python generate.py \
    --model "${MODEL_DIR}" \
    --split "${SPLIT}" \
    --subset "${SUBSET}" \
    --greedy \
    --bs 1 \
    --temperature 0 \
    --n_samples 1 \
    --resume \
    --backend vllm \
    --tp "${TP}" \
    --save_path "${SAVE_DIR}/completion.jsonl"

  # Sanitize and calibrate the generated samples.
  python sanitize.py \
    --samples "${SAVE_DIR}/completion.jsonl" \
    --calibrate

  # Evaluate the sanitized and calibrated completions.
  python evaluate.py \
    --split "${SPLIT}" \
    --subset "${SUBSET}" \
    --no-gt \
    --samples "${SAVE_DIR}/completion-sanitized-calibrated.jsonl"

  # Clean up stray bigcodebench worker processes left behind by evaluation.
  # BUGFIX: the original tested [ -n \"$pids\" ] with literal escaped quotes,
  # which is always true; use a properly quoted test instead.
  local pids
  pids=$(ps -u "$(id -u)" -o pid,comm | grep 'bigcodebench' | awk '{print $1}')
  if [ -n "$pids" ]; then
    echo "$pids" | xargs -r kill
  fi
  # WARNING: wipes ALL of /tmp — only safe on a dedicated/ephemeral machine.
  rm -rf /tmp/*
}
47  
# Run every benchmark configuration: both splits crossed with both subsets.
# Order matches the original: complete/hard, complete/full, instruct/hard,
# instruct/full.
for split in complete instruct; do
  for subset in hard full; do
    run_benchmark "$split" "$subset"
  done
done