# examples/Qwen2.5-Coder-Instruct-stream.py
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextIteratorStreamer
from threading import Thread

device = "cuda"  # the device to move the tokenized inputs onto (the model itself is placed by device_map="auto")

# The architecture is supported natively by transformers, so "trust_remote_code=True" is no longer needed
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct", torch_dtype="auto", device_map="auto").eval()
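# Note: with torch_dtype="auto" the checkpoint loads in bfloat16, so the 32B weights
# alone take roughly 65 GB and device_map="auto" will shard them across all visible
# GPUs. If that does not fit, the same script should work unchanged with a smaller
# checkpoint such as "Qwen/Qwen2.5-Coder-7B-Instruct".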

# Instead of using model.chat(), we directly use model.generate();
# the inputs must first be formatted with tokenizer.apply_chat_template(), as shown below
prompt = "Write a quick sort algorithm."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
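# The system message above matches Qwen2.5's default; if it is omitted, the chat
# template is expected to insert the same text automatically.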
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
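# `text` is now a ChatML-formatted string, roughly:
#   <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
# and model_inputs holds the corresponding input_ids and attention_mask tensors on the GPU.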

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

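# skip_prompt=True keeps the echoed prompt out of the stream, and skip_special_tokens=True
# strips markers like <|im_end|> from the decoded text. Because generate() blocks until
# generation finishes, it runs in a background thread below while the main thread
# consumes the streamer, which yields decoded text fragments as they are produced.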
# Passing model_inputs as a dict forwards the attention_mask together with the input_ids
generation_kwargs = dict(model_inputs, streamer=streamer, max_new_tokens=2048)
thread = Thread(target=model.generate, kwargs=generation_kwargs)

thread.start()
generated_text = ""
for new_text in streamer:
    generated_text += new_text
    print(new_text, end="", flush=True)  # flush so each fragment appears immediately
print()  # end the streamed line; generated_text already holds the full response
thread.join()  # wait for the generation thread to finish cleanly
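# generate() also accepts the usual sampling controls as extra kwargs, e.g.
#   generation_kwargs.update(do_sample=True, temperature=0.7, top_p=0.8, repetition_penalty=1.05)
# (illustrative values; without them, generation falls back to the checkpoint's generation_config)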