eval_instruct.py
import argparse
import json
import os
from pathlib import Path

import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

from human_eval.evaluation import evaluate_functional_correctness
from utils.utils import extract_generation_code, languge_settings

# Problem files (humaneval-<lang>.jsonl) live next to this script.
data_abs_dir = Path(__file__).parent / "data"


def build_deepseekcoder_instruction(language: str, question: str):
    return """
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
```{}
{}
```
""".strip().format(
        language.lower(), question.strip()
    )

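# Illustrative example: build_deepseekcoder_instruction("Python", "def add(a, b):")
# yields the instruction sentence followed by a ```python fence wrapping the
# unfinished function; generate_one() below sends this as a single user turn.
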
def generate_one(example, lang, tokenizer, model):
    prompt = build_deepseekcoder_instruction(
        languge_settings[lang]["full_name"], example["prompt"]
    )
    # add_generation_prompt=True appends the assistant-turn marker so the
    # model completes an answer instead of continuing the user turn.
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    # DeepSeek-Coder marks end of turn with the special <|EOT|> token.
    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, <|EOT|> id not found"

    # Greedy decoding; set do_sample=True and uncomment top_p/temperature to sample.
    outputs = model.generate(
        inputs,
        max_new_tokens=1024,
        do_sample=False,
        # top_p=0.95,
        # temperature=temperature,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )

    # Decode only the newly generated tokens, then pull out the code block.
    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example["output"] = output

    return extract_generation_code(example, lang_code=lang)

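# Illustrative call (hypothetical example dict; extract_generation_code may
# require additional fields and is assumed to store the cleaned snippet on
# the returned example, e.g. under a "generation"-style key):
#   ex = {"task_id": "python/0", "prompt": "def add(a, b):\n"}
#   gen = generate_one(ex, "python", tokenizer, model)
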
def generate_main(args):
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("Model:", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("Loaded tokenizer {} from {}.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # use_flash_attention_2=True
    )
    model.eval()

    with open(problem_file, encoding="utf-8") as fr:
        examples = [json.loads(x) for x in fr if x.strip()]
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, args.language, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generation finished.")
    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
    print("Saved {} generated examples to {}.".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result, model_name_or_path)


def evaluation_only(args):
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    with open(args.output_path, encoding="utf-8") as fr:
        output_examples = [json.loads(x) for x in fr if x.strip()]

    # Re-extract the code block from each saved generation before scoring.
    processed_examples = [
        extract_generation_code(ex, lang_code=lang) for ex in tqdm(output_examples, desc="Processing")
    ]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, "w", encoding="utf-8") as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + "\n")
    print("Saved {} processed examples to {}.".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result)


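# Example invocation (model name and paths are illustrative):
#   python eval_instruct.py \
#       --model deepseek-ai/deepseek-coder-6.7b-instruct \
#       --output_path output/humaneval-python.jsonl \
#       --language python --temp_dir tmp
# To re-score an existing generation file without regenerating, call
# evaluation_only(args) below instead of generate_main(args).
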
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="model name or path")
    parser.add_argument("--output_path", type=str, help="output path of your generation")
    parser.add_argument("--language", type=str, help="language")
    parser.add_argument("--temp_dir", type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)