# kitt_g_voice.py
# Voice/text assistant front-end: records microphone audio, transcribes it
# with Whisper, sends it to a local LM Studio model through the OpenAI API,
# and speaks the replies via Google Cloud Text-to-Speech.

# --- Standard library -------------------------------------------------------
import warnings                   # silence selected library warnings
import wave                       # write recorded PCM frames to a WAV file
import os                         # temp-file cleanup
import tkinter as tk              # hidden root window for the input dialog
from tkinter import simpledialog  # modal text-entry dialog

# --- Third party ------------------------------------------------------------
import pyaudio                    # microphone capture
import whisper                    # local speech-to-text
import openai                     # chat-completion client (LM Studio backend)
import keyboard                   # hotkey polling ('b', 'm', 'n')
from google.cloud import texttospeech  # Google Cloud TTS
from playsound import playsound        # audio playback helper (unused below)
from pydub import AudioSegment         # MP3 decoding
from pydub.playback import play        # MP3 playback

# Google Cloud Text-to-Speech client used by speak().
client = texttospeech.TextToSpeechClient()

# ANSI escape sequences for coloured terminal output.
colors = {
    "blue": "\033[94m",     # VTM (user) messages
    "orange": "\033[93m",   # unused (same code as yellow)
    "yellow": "\033[93m",   # "ready" prompt
    "white": "\033[97m",    # unused
    "red": "\033[91m",      # "stopping recording" notice
    "magenta": "\033[35m",  # KITT (assistant) messages
    "green": "\033[32m",    # "start recording" notice
    "reset": "\033[0m"      # restore default colour
}

# Whisper always warns about FP16 on CPU; that is expected here, so hide it.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")

# Point the OpenAI client at the local LM Studio server (no real key needed).
openai.api_base = "http://localhost:1234/v1"
openai.api_key = "not-needed"

# Smallest Whisper model — fastest transcription on CPU.
whisper_model = whisper.load_model("tiny")
# --- Audio capture parameters -----------------------------------------------
FORMAT = pyaudio.paInt16  # 16-bit PCM samples
CHANNELS = 1              # mono capture
RATE = 8000               # sample rate in Hz (originally 16000 Hz)
CHUNK = 1024              # frames per read from the input stream

# Shared PyAudio instance; terminated at the bottom of the script.
audio = pyaudio.PyAudio()


def speak(text, sample_rate_hertz=16000):
    """Synthesize `text` with Google Cloud TTS and play it aloud.

    The MP3 response is written to a temporary file, played, then removed
    (fix: the temp file used to be left on disk after every call).

    Args:
        text: plain text to speak.
        sample_rate_hertz: TTS output sample rate (default 16000).
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-GB",
        name="en-GB-Neural2-A",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    )
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3,
        sample_rate_hertz=sample_rate_hertz,
    )
    response = client.synthesize_speech(
        input=synthesis_input,
        voice=voice,
        audio_config=audio_config,
    )
    with open("temp_output.mp3", "wb") as out:
        out.write(response.audio_content)
    try:
        # Fix: a distinct local name — the original bound this to `audio`,
        # shadowing the module-level PyAudio instance inside this function.
        segment = AudioSegment.from_mp3("temp_output.mp3")
        play(segment)
    finally:
        os.remove("temp_output.mp3")  # always clean up the temp file


def read_file_content(file_path):
    """Return the stripped text of `file_path`, or None if it is missing."""
    try:
        with open(file_path, "r") as file:
            return file.read().strip()
    except FileNotFoundError:
        print("File not found.")
        return None


# System prompt for the assistant; the script cannot run without it.
system_message = read_file_content("system_message.txt")
if system_message is None:
    exit()  # exit if system message file not found

# Spoken greeting played once at startup.
initial_message = "Welcome to a new episode of Videotronic Maker, This is KITT, his personal A.I. assistant. I exist in the home PC of Videotronic Maker and I am locally run via LM Studio. LM Studio is a software company located in Brooklyn, New York, so it's fair to say that Brooklyn is in the house! Learn with Videotronic Maker as he learns!"
speak(initial_message, sample_rate_hertz=16000)


def record_audio():
    """Record microphone audio until 'N' is pressed; return the WAV path.

    Returns:
        Path of the temporary WAV file ("temp_audio.wav").
    """
    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )
    print(f"{colors['green']}Start speaking... (Press 'N' to stop){colors['reset']}")
    frames = []
    try:
        while True:
            frames.append(stream.read(CHUNK))
            if keyboard.is_pressed('n'):
                print(f"{colors['red']}Stopping recording.{colors['reset']}")
                break
    finally:
        # Fix: release the input stream even if stream.read() raises.
        stream.stop_stream()
        stream.close()

    # Write the captured frames out as a 16-bit mono WAV file.
    with wave.open("temp_audio.wav", 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(audio.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    return "temp_audio.wav"


def get_user_input():
    """Show a modal text-entry dialog and return the typed string (or None)."""
    root = tk.Tk()
    root.withdraw()  # hide the empty root window behind the dialog
    user_input = simpledialog.askstring(title="Text Input", prompt="Type your input:")
    root.destroy()  # fix: the hidden Tk root used to leak on every call
    return user_input


def process_input(input_text):
    """Send `input_text` to the local model, then print and speak the reply.

    Recognised farewell words ('exit', 'bye', 'end') short-circuit: KITT says
    goodbye and the process exits.
    """
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": input_text},
    ]

    # Upper bound on the assistant's reply length.
    max_response_tokens = 150

    # Farewell handling — exits the whole program.
    if input_text.lower() in ['exit', 'bye', 'end']:
        farewell_response = "Goodbye sir!"
        print(f"{colors['magenta']}KITT:{colors['reset']} {farewell_response}")
        speak(farewell_response)
        exit()

    # Fix: max_response_tokens was defined but never sent to the API, so the
    # reply length was effectively unlimited. (top_k is forwarded to the
    # LM Studio backend, which accepts it.)
    completion = openai.ChatCompletion.create(
        model="local-model",
        messages=conversation,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        max_tokens=max_response_tokens,
    )

    assistant_reply = completion.choices[0].message.content
    print(f"{colors['magenta']}KITT:{colors['reset']} {assistant_reply}")
    speak(assistant_reply)
    # NOTE: the original repeated the farewell check here; it was unreachable
    # because the check above already calls exit(). Removed.


# --- Main loop: poll hotkeys for voice ('B') or typed ('M') input -----------
print(f"{colors['yellow']}Ready to record. (Press 'B' to start, 'M' to type){colors['reset']}")
while True:
    try:
        if keyboard.is_pressed('b'):  # start recording when 'B' is pressed
            audio_file = record_audio()
            transcribe_result = whisper_model.transcribe(audio_file)
            transcribed_text = transcribe_result["text"]
            # Fix: delete the temp WAV right after transcription, so it is
            # not leaked when process_input() exits on a farewell word.
            os.remove(audio_file)
            print(f"{colors['blue']}VTM:{colors['reset']} {transcribed_text}")
            process_input(transcribed_text)

        elif keyboard.is_pressed('m'):  # GUI text input when 'M' is pressed
            typed_input = get_user_input()
            if typed_input:  # ignore cancel / empty input
                print(f"{colors['blue']}VTM typed:{colors['reset']} {typed_input}")
                process_input(typed_input)

    except KeyboardInterrupt:
        print("\nExiting...")
        break  # leave the loop on Ctrl-C

# Release the shared PyAudio instance.
audio.terminate()