eval_instruct.py
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm

# Benchmark problem files (humaneval-<lang>.jsonl) live in ./data next to this script.
data_abs_dir = Path(__file__).parent / "data"

from utils.utils import extract_generation_code, languge_settings
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness


def build_deepseekcoder_instruction(language: str, question: str):
    return """
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed functions in a codeblock. Here is the given code to do completion:
```{}
{}
```
""".strip().format(
        language.lower(), question.strip()
    )


def generate_one(example, lang, tokenizer, model):
    """Generate a completion for a single problem and extract the code block from it."""
    prompt = build_deepseekcoder_instruction(languge_settings[lang]["full_name"], example["prompt"])
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}], return_tensors="pt"
    ).to(model.device)

    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, <|EOT|> id not found"

    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,  # greedy decoding for reproducible evaluation
        # top_p=0.95,
        # temperature=temperature,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )

    # Decode only the newly generated tokens, i.e. drop the prompt prefix.
    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example["output"] = output

    return extract_generation_code(example, lang_code=lang)


def generate_main(args):
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("Model:", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("Loaded tokenizer {} from {}.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # use_flash_attention_2=True
    )
    model.eval()
    examples = [json.loads(x) for x in open(problem_file) if x.strip()]
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, args.language, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generation finished.")
    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
    print("Saved {} processed examples to {}.".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result, model_name_or_path)


def evaluation_only(args):
    """Re-extract code from an existing generation file and score it, skipping generation."""
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]

    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, "w", encoding="utf-8") as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + "\n")
    print("Saved {} processed examples to {}.".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model name or path")
    parser.add_argument("--output_path", type=str, help="output path of your generation")
    parser.add_argument("--language", type=str, help="language")
    parser.add_argument("--temp_dir", type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    # Call evaluation_only(args) instead to re-score an existing output file.
    generate_main(args)
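
A typical invocation looks like the following; the checkpoint and paths here are illustrative (any HuggingFace causal LM whose tokenizer defines an `<|EOT|>` token and a chat template should work, e.g. deepseek-ai/deepseek-coder-6.7b-instruct):

    python eval_instruct.py \
        --model deepseek-ai/deepseek-coder-6.7b-instruct \
        --output_path output/humaneval-python.jsonl \
        --language python \
        --temp_dir tmp

The --language value must match a key in languge_settings and a corresponding data/humaneval-<lang>.jsonl problem file.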