Cradicle Explorer

/ scripts / compare-tts-voices.sh
compare-tts-voices.sh
  1  #!/bin/bash
  2  # Compare TTS voices from multiple engines
  3  # Generates samples from each available engine for A/B comparison
  4  
  5  set -e
  6  
  7  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  8  OUTPUT_DIR="$SCRIPT_DIR/../docs/assets/voice-samples"
  9  mkdir -p "$OUTPUT_DIR"
 10  
 11  # Test text (shorter for quick comparison)
 12  TEST_TEXT="This is code action quick. It shows language server fixes in the modeline as a lightbulb. Press Control-c Control-a to apply the first fix instantly."
 13  
 14  # Full narration for final generation
 15  FULL_NARRATION=(
 16      "This is code action quick. It shows language server fixes in the modeline as a lightbulb. Press Control-c Control-a to apply the first fix instantly."
 17      "First, let's convert this comment to a doc comment. The lightbulb shows the available action. Pressing Control-c Control-a applies it."
 18      "Next, HashMap needs an import. The modeline shows import suggestion. Control-c Control-a adds the use statement."
 19      "Finally, an unused variable warning. Control-c Control-a prefixes it with underscore to silence the warning."
 20      "Three fixes applied with just a keybinding."
 21  )
 22  
 23  echo "=== TTS Voice Comparison ==="
 24  echo "Output: $OUTPUT_DIR"
 25  echo ""
 26  
 27  # Track available engines
 28  declare -a AVAILABLE_ENGINES
 29  
 30  #######################################
 31  # 1. edge-tts (Microsoft Azure)
 32  #######################################
 33  generate_edge_tts() {
 34      local voice="$1"
 35      local rate="${2:--10%}"
 36      local output="$OUTPUT_DIR/edge-tts-${voice}.mp3"
 37      
 38      if command -v edge-tts &>/dev/null || [[ -x "$HOME/.local/bin/edge-tts" ]]; then
 39          echo "  Generating: edge-tts ($voice)..."
 40          EDGE_TTS="${HOME}/.local/bin/edge-tts"
 41          [[ ! -x "$EDGE_TTS" ]] && EDGE_TTS="edge-tts"
 42          
 43          $EDGE_TTS --voice "en-US-${voice}" --rate="$rate" \
 44              --text "$TEST_TEXT" \
 45              --write-media "$output" 2>/dev/null
 46          echo "    ✓ $output"
 47          return 0
 48      fi
 49      return 1
 50  }
 51  
 52  #######################################
 53  # 2. Piper (fast, local, many voices)
 54  #######################################
 55  generate_piper() {
 56      local voice="$1"
 57      local output="$OUTPUT_DIR/piper-${voice}.wav"
 58      
 59      if command -v piper &>/dev/null; then
 60          # Check if voice model exists
 61          local model_path="$HOME/.local/share/piper/voices/${voice}.onnx"
 62          if [[ -f "$model_path" ]]; then
 63              echo "  Generating: piper ($voice)..."
 64              echo "$TEST_TEXT" | piper --model "$model_path" --output_file "$output" 2>/dev/null
 65              echo "    ✓ $output"
 66              return 0
 67          else
 68              echo "  Piper voice $voice not found at $model_path"
 69          fi
 70      fi
 71      return 1
 72  }
 73  
 74  #######################################
 75  # 3. Coqui TTS / XTTS
 76  #######################################
 77  generate_coqui() {
 78      local model="$1"
 79      local output="$OUTPUT_DIR/coqui-${model//\//-}.wav"
 80      
 81      if python3 -c "import TTS" 2>/dev/null; then
 82          echo "  Generating: coqui ($model)..."
 83          python3 -c "
 84  from TTS.api import TTS
 85  tts = TTS('$model')
 86  tts.tts_to_file(text='''$TEST_TEXT''', file_path='$output')
 87  " 2>/dev/null
 88          echo "    ✓ $output"
 89          return 0
 90      fi
 91      return 1
 92  }
 93  
 94  #######################################
 95  # 4. Bark (Suno) - very natural, slow
 96  #######################################
 97  generate_bark() {
 98      local voice="$1"
 99      local output="$OUTPUT_DIR/bark-${voice}.wav"
100      
101      if python3 -c "from bark import SAMPLE_RATE, generate_audio, preload_models" 2>/dev/null; then
102          echo "  Generating: bark ($voice)... (this is slow)"
103          python3 << EOF
104  from bark import SAMPLE_RATE, generate_audio, preload_models
105  from scipy.io.wavfile import write as write_wav
106  import numpy as np
107  
108  preload_models()
109  audio_array = generate_audio("$TEST_TEXT", history_prompt="$voice")
110  write_wav("$output", SAMPLE_RATE, audio_array)
111  EOF
112          echo "    ✓ $output"
113          return 0
114      fi
115      return 1
116  }
117  
118  #######################################
119  # 5. Fish Speech (very natural, fast)
120  #######################################
# Synthesize TEST_TEXT with Fish Speech and save it as a WAV file.
# Globals:   OUTPUT_DIR (read), TEST_TEXT (read)
# Arguments: none
# Outputs:   progress lines on stdout, failure notice on stderr
# Returns:   0 if synthesis succeeded, 1 if fish_speech is missing or the
#            Python run exited non-zero
# NOTE(review): the `fish_speech.inference.inference(text=, output_path=)`
# entry point is kept exactly as the original used it — confirm it matches the
# installed fish_speech version.
generate_fish_speech() {
    local output="$OUTPUT_DIR/fish-speech.wav"

    python3 -c "import fish_speech" 2>/dev/null || return 1

    echo "  Generating: fish-speech..."

    # Text and path are passed as argv instead of being spliced into the
    # Python source, so quotes in them cannot break the program.
    if ! python3 -c '
import sys
from fish_speech.inference import inference

text, out_path = sys.argv[1:3]
inference(text=text, output_path=out_path)
' "$TEST_TEXT" "$output" 2>/dev/null; then
        echo "    ✗ fish-speech failed" >&2
        return 1
    fi

    echo "    ✓ $output"
    return 0
}
135  
136  #######################################
137  # 6. Silero (lightweight, Russian company)
138  #######################################
# Synthesize TEST_TEXT with the Silero v3 English model and save it as a WAV.
# The model is fetched through torch.hub from snakers4/silero-models.
# Globals:   OUTPUT_DIR (read), TEST_TEXT (read)
# Arguments: none
# Outputs:   progress lines on stdout, failure notice on stderr
# Returns:   0 if synthesis succeeded, 1 if torch/torchaudio are missing or
#            the Python run exited non-zero
generate_silero() {
    local output="$OUTPUT_DIR/silero-v3.wav"
    local rc=0

    # The Python program below imports torchaudio as well as torch.
    python3 -c "import torch; import torchaudio" 2>/dev/null || return 1

    echo "  Generating: silero..."

    # The heredoc is quoted, so the shell expands nothing inside it. The text
    # and the real output path are therefore passed as argv; a "$OUTPUT_DIR"
    # written inside the heredoc would reach Python as a literal string.
    python3 - "$TEST_TEXT" "$output" << 'EOF' || rc=$?
import sys
import torch
import torchaudio

text, out_path = sys.argv[1:3]

device = torch.device('cpu')
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-models',
                          model='silero_tts',
                          language='en',
                          speaker='v3_en')
model.to(device)

audio = model.apply_tts(text=text, speaker='en_0', sample_rate=48000)
torchaudio.save(out_path, audio.unsqueeze(0), 48000)
EOF

    # Callers use `|| true`, so `set -e` does not apply inside this function.
    if (( rc != 0 )); then
        echo "    ✗ silero failed" >&2
        return 1
    fi

    echo "    ✓ $output"
    return 0
}
164  
165  #######################################
166  # Check what's available and generate
167  #######################################
168  
printf '%s\n' "[1/3] Checking available TTS engines..." ""

# One probe per engine; each succeeds when that engine looks usable.
probe_edge_tts() { command -v edge-tts &>/dev/null || [[ -x "$HOME/.local/bin/edge-tts" ]]; }
probe_piper()    { command -v piper &>/dev/null; }
probe_coqui()    { python3 -c "import TTS" 2>/dev/null; }
probe_bark()     { python3 -c "from bark import generate_audio" 2>/dev/null; }
# Silero itself is fetched at run time; only torch and torchaudio are needed.
probe_silero()   { python3 -c "import torch; import torchaudio" 2>/dev/null; }

# Run a probe, report the outcome, and record the engine when it is present.
# Arguments: $1 - engine key appended to AVAILABLE_ENGINES
#            $2 - probe function name
#            $3 - text printed after "✓" when the probe succeeds
#            $4 - text printed after "✗" when the probe fails
register_engine() {
    local key="$1" probe="$2" found_text="$3" missing_text="$4"

    if "$probe"; then
        echo "  ✓ $found_text"
        AVAILABLE_ENGINES+=("$key")
    else
        echo "  ✗ $missing_text"
    fi
}

register_engine "edge-tts" probe_edge_tts \
    "edge-tts available" \
    "edge-tts not found (install: pipx install edge-tts)"
register_engine "piper" probe_piper \
    "piper available" \
    "piper not found (install: pipx install piper-tts)"
register_engine "coqui" probe_coqui \
    "coqui TTS available" \
    "coqui TTS not found (install: pip install TTS)"
register_engine "bark" probe_bark \
    "bark available" \
    "bark not found (install: pip install git+https://github.com/suno-ai/bark.git)"
register_engine "silero" probe_silero \
    "silero available (via torch)" \
    "silero not found (install: pip install torch torchaudio)"
211  
printf '\n%s\n\n' "[2/3] Generating samples..."

# Voices / models sampled per engine.
edge_voices=("EmmaNeural" "AriaNeural" "JennyNeural")
# Piper only works for voices whose model the user has already downloaded.
piper_voices=("en_US-amy-medium" "en_US-lessac-medium")
# Original author's note: XTTS v2 is the best quality of the two.
coqui_models=(
    "tts_models/en/ljspeech/tacotron2-DDC"
    "tts_models/multilingual/multi-dataset/xtts_v2"
)

# Walk the detected engines in detection order. A failing generator must not
# abort the run, hence `|| true` on every call.
for engine in "${AVAILABLE_ENGINES[@]}"; do
    if [[ "$engine" == "edge-tts" ]]; then
        for sample_voice in "${edge_voices[@]}"; do
            generate_edge_tts "$sample_voice" "-10%" || true
        done
    elif [[ "$engine" == "piper" ]]; then
        for sample_voice in "${piper_voices[@]}"; do
            generate_piper "$sample_voice" || true
        done
    elif [[ "$engine" == "coqui" ]]; then
        for sample_model in "${coqui_models[@]}"; do
            generate_coqui "$sample_model" || true
        done
    elif [[ "$engine" == "bark" ]]; then
        generate_bark "v2/en_speaker_6" || true
    elif [[ "$engine" == "silero" ]]; then
        generate_silero || true
    fi
done
242  
# Print the list of generated samples followed by install, post-processing
# and playback hints.
# Globals:   OUTPUT_DIR (read)
# Arguments: none
# Outputs:   everything goes to stdout
print_summary() {
    local listing_line quoted_dir

    echo ""
    echo "[3/3] Summary"
    echo ""

    # List generated files. ls is used purely for its human-readable listing;
    # its stderr is dropped because a glob with no match is passed through
    # literally. IFS= and -r keep each line exactly as ls printed it.
    echo "Generated samples:"
    ls -lh "$OUTPUT_DIR"/*.{mp3,wav} 2>/dev/null | while IFS= read -r listing_line; do
        printf '  %s\n' "$listing_line"
    done

    echo ""
    echo "=== Installation Commands ==="
    echo ""
    echo "# edge-tts (Microsoft Azure, fast, good quality)"
    echo "pipx install edge-tts"
    echo ""
    echo "# piper (fast, local, many voices)"
    echo "pipx install piper-tts"
    echo "# Download voices from: https://github.com/rhasspy/piper/releases"
    echo ""
    echo "# Coqui TTS (good quality, XTTS v2 is excellent)"
    echo "pip install TTS"
    echo ""
    echo "# Bark (very natural, slow, needs GPU)"
    echo "pip install git+https://github.com/suno-ai/bark.git"
    echo ""
    echo "# Silero (lightweight, decent quality)"
    echo "pip install torch torchaudio"
    echo ""
    echo "=== Post-Processing ==="
    echo ""
    echo "# Adobe Podcast Enhance (free, browser-based, excellent results)"
    echo "# Upload your audio to: https://podcast.adobe.com/enhance"
    echo ""
    echo "# Local alternative: RNNoise (noise removal)"
    echo "# pip install rnnoise"
    echo ""
    echo "=== Compare samples ==="
    # Shell-quote the directory so the printed command can be pasted as-is
    # even when the path contains spaces or other special characters.
    printf -v quoted_dir '%q' "$OUTPUT_DIR"
    echo "cd $quoted_dir && for f in *.mp3 *.wav; do echo \"Playing: \$f\"; mpv \"\$f\"; done"
}

print_summary