# kitt_g_voice.py
# Import necessary libraries
import os                # system operations (temp-file cleanup)
import time              # idle sleep in the main polling loop
import warnings          # suppress whisper's FP16-on-CPU warning
import wave              # read and write WAV files
import tkinter as tk     # GUI dialogs
from tkinter import simpledialog  # text-input dialog

import keyboard          # key-press detection
import openai            # chat-completion client (pointed at LM Studio)
import pyaudio           # microphone capture / audio I/O
import whisper           # speech-to-text transcription
from google.cloud import texttospeech  # Google Cloud Text-to-Speech client
from playsound import playsound        # audio playback helper (currently unused)
from pydub import AudioSegment         # MP3 decoding
from pydub.playback import play        # audio playback
 15  
# Initialize Google Cloud Text-to-Speech client.
# NOTE(review): instantiated at import time; presumably relies on
# GOOGLE_APPLICATION_CREDENTIALS being set — confirm deployment setup.
client = texttospeech.TextToSpeechClient()

# ANSI escape sequences for colouring terminal output.
colors = {
    "blue": "\033[94m",     # user (VTM) messages
    "orange": "\033[93m",   # NOTE(review): same code as "yellow"; unused
    "yellow": "\033[93m",   # "ready to record" prompt
    "white": "\033[97m",    # unused
    "red": "\033[91m",      # "stopping recording" message
    "magenta": "\033[35m",  # assistant (KITT) messages
    "green": "\033[32m",    # "start speaking" message
    "reset": "\033[0m"      # reset to default colour
}

# Silence whisper's warning when it falls back to FP32 on CPU.
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU")

# Point the (pre-1.0) openai client at a local LM Studio server.
openai.api_base = "http://localhost:1234/v1"  # local OpenAI-compatible endpoint
openai.api_key = "not-needed"                  # placeholder; local server ignores it

# Load the smallest whisper model — fastest, lowest-accuracy option.
whisper_model = whisper.load_model("tiny")

# Microphone capture parameters for PyAudio.
FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
CHANNELS = 1               # mono
RATE = 8000                # 8 kHz capture (was 16000) — TODO confirm transcription quality is acceptable at this rate
CHUNK = 1024               # frames per buffer read

# Single module-wide PyAudio instance; terminated at the end of the script.
audio = pyaudio.PyAudio()
 49  
 50  # Define a function to synthesize and speak text
 51  def speak(text, sample_rate_hertz=16000):
 52      # Set up the text request
 53      synthesis_input = texttospeech.SynthesisInput(text=text)
 54      # Configure the voice parameters
 55      voice = texttospeech.VoiceSelectionParams(
 56          language_code="en-GB",
 57          name="en-GB-Neural2-A",
 58          ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
 59      )
 60      # Configure the audio output format
 61      audio_config = texttospeech.AudioConfig(
 62          audio_encoding=texttospeech.AudioEncoding.MP3,
 63          sample_rate_hertz=sample_rate_hertz
 64      )
 65      # Perform the text-to-speech request
 66      response = client.synthesize_speech(
 67          input=synthesis_input,
 68          voice=voice,
 69          audio_config=audio_config
 70      )
 71      # Save the output to an MP3 file and play it
 72      with open("temp_output.mp3", "wb") as out:
 73          out.write(response.audio_content)
 74      audio = AudioSegment.from_mp3("temp_output.mp3")
 75      play(audio)
 76  
 77  # Define a function to read content from a file
 78  def read_file_content(file_path):
 79      try:
 80          with open(file_path, "r") as file:
 81              return file.read().strip()
 82      except FileNotFoundError:
 83          print("File not found.")
 84          return None
 85  
# Load the system prompt from disk; the assistant cannot run without it.
system_message = read_file_content("system_message.txt")
if system_message is None:
    exit()  # abort at import time when the prompt file is missing

# Greeting spoken once at startup.
initial_message = "Welcome to a new episode of Videotronic Maker, This is KITT, his personal A.I. assistant. I exist in the home PC of Videotronic Maker and I am locally run via LM Studio. LM Studio is a software company located in Brooklyn, New York, so it's fair to say that Brooklyn is in the house! Learn with Videotronic Maker as he learns!"
speak(initial_message, sample_rate_hertz=16000)  # blocks until playback finishes
 94  
 95  # Define a function to record audio
 96  def record_audio():
 97  
 98  
 99      # Open the audio stream for recording
100      stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
101      print(f"{colors['green']}Start speaking... (Press 'N' to stop){colors['reset']}")
102      frames = []
103  
104      while True:
105          data = stream.read(CHUNK)
106          frames.append(data)
107          if keyboard.is_pressed('n'):
108              print(f"{colors['red']}Stopping recording.{colors['reset']}")
109              break
110  
111      stream.stop_stream()
112      stream.close()
113  
114      # Write the recorded audio to a WAV file
115      wf = wave.open("temp_audio.wav", 'wb')
116      wf.setnchannels(CHANNELS)
117      wf.setsampwidth(audio.get_sample_size(FORMAT))
118      wf.setframerate(RATE)
119      wf.writeframes(b''.join(frames))
120      wf.close()
121  
122      return "temp_audio.wav"
123  
# Define a function to get user input via GUI dialog
def get_user_input():
    """Prompt for a line of text via a Tk dialog; return it (None if cancelled)."""
    root = tk.Tk()
    root.withdraw()  # hide the empty main window behind the dialog
    try:
        return simpledialog.askstring(title="Text Input", prompt="Type your input:")
    finally:
        # BUG FIX: the original never destroyed the Tk root, leaking one Tk
        # interpreter instance per call.
        root.destroy()
130  
# Define a function to process user input
def process_input(input_text):
    """Send *input_text* to the local chat model, then print and speak the reply.

    Exits the whole program (after a spoken farewell) when the user says
    'exit', 'bye' or 'end'.

    Args:
        input_text: Transcribed or typed user utterance.
    """
    # Handle the exit request before spending a round-trip on the model.
    if input_text.lower() in ['exit', 'bye', 'end']:
        farewell_response = "Goodbye sir!"
        print(f"{colors['magenta']}KITT:{colors['reset']} {farewell_response}")
        speak(farewell_response)
        exit()

    # Construct the conversation for the chat-completion request.
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": input_text}
    ]

    # Generate the response from the local LM Studio server.
    # NOTE(review): top_k is not a standard OpenAI parameter; it is passed
    # through to the local server — confirm LM Studio honours it.
    completion = openai.ChatCompletion.create(
        model="local-model",
        messages=conversation,
        temperature=0.7,
        top_p=0.9,
        top_k=40
    )

    assistant_reply = completion.choices[0].message.content
    print(f"{colors['magenta']}KITT:{colors['reset']} {assistant_reply}")
    speak(assistant_reply)
    # BUG FIX: removed a duplicated exit-check here — it was unreachable,
    # since the identical check at the top already exits for those inputs.
    # Also removed the unused local `max_response_tokens = 150` (it was never
    # passed to the API; add max_tokens= to the create() call if a length
    # cap is actually wanted).
167  
# Main loop: poll for 'B' (record speech) or 'M' (typed input) until Ctrl-C.
print(f"{colors['yellow']}Ready to record. (Press 'B' to start, 'M' to type){colors['reset']}")
while True:
    try:
        if keyboard.is_pressed('b'):  # start recording when 'B' is pressed
            audio_file = record_audio()
            transcribe_result = whisper_model.transcribe(audio_file)
            transcribed_text = transcribe_result["text"]
            print(f"{colors['blue']}VTM:{colors['reset']} {transcribed_text}")
            process_input(transcribed_text)
            os.remove(audio_file)  # clean up the temporary WAV

        elif keyboard.is_pressed('m'):  # open the GUI dialog when 'M' is pressed
            typed_input = get_user_input()
            if typed_input:  # ignore None (cancelled) and empty input
                print(f"{colors['blue']}VTM typed:{colors['reset']} {typed_input}")  # Print the typed input in the terminal
                process_input(typed_input)

        # BUG FIX: the original loop busy-waited, pinning a CPU core at 100%
        # while idle; a short sleep keeps key polling responsive but cheap.
        time.sleep(0.05)

    except KeyboardInterrupt:
        print("\nExiting...")
        break  # exit the loop on Ctrl-C

# Release the PyAudio instance before the interpreter exits.
audio.terminate()