391 lines
16 KiB
Python
391 lines
16 KiB
Python
"""
|
||
Voice Clone TTS
===============
Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).

Usage:
    1. Run this script:  python voice_clone_tts.py
    2. Record ~15 seconds of your voice
    3. Type any text and click Generate
|
||
"""
|
||
|
||
import tkinter as tk
|
||
from tkinter import ttk, messagebox, filedialog
|
||
import threading
|
||
import os
|
||
import tempfile
|
||
import numpy as np
|
||
|
||
try:
|
||
import sounddevice as sd
|
||
import scipy.io.wavfile as wav
|
||
except ImportError:
|
||
print("Missing dependencies. Please run: pip install sounddevice scipy")
|
||
raise
|
||
|
||
# Sample rate (Hz) used when opening the microphone stream and when writing
# the recorded voice sample to disk.
SAMPLE_RATE: int = 22050

# Default file (relative to the working directory) that holds the recorded
# voice sample between runs.
VOICE_SAMPLE_FILE: str = "my_voice_sample.wav"
|
||
|
||
|
||
# ─── Colour palette ──────────────────────────────────────────────────────────
# Hex colours shared by every widget and ttk style in the application.
BG = "#0f0f13"        # window background
SURFACE = "#1a1a24"   # card background
BORDER = "#2a2a3a"    # card outlines, separators, disabled button fill
ACCENT = "#7c6af7"    # primary buttons, progress bar, text selection
ACCENT2 = "#a89cf7"   # titles and hovered primary buttons
TEXT = "#e8e6f0"      # default foreground text
MUTED = "#666680"     # secondary / disabled text
SUCCESS = "#4ade80"   # positive status messages
ERROR = "#f87171"     # failure status messages
WARNING = "#fbbf24"   # in-progress recording status
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
class VoiceCloneTTS:
    """Tkinter front end for cloning a voice with the XTTS v2 model.

    The window is organised as three cards: record (or load) a voice sample,
    enter text, and generate speech that is played back and/or saved as WAV.
    Model loading, microphone capture and speech generation each run on a
    daemon thread; results are handed back to the UI through ``root.after``.
    """

    def __init__(self, root: tk.Tk):
        """Configure the root window, build the UI and start loading the model.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        self._elapsed = 0
        # Identifier of the pending ``after`` call that drives the recording
        # timer, so it can be cancelled when recording stops or restarts.
        self._tick_job: str | None = None

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────

    def _apply_styles(self):
        """Register the ttk styles (cards, labels, buttons, progress bar)."""
        style = ttk.Style()
        style.theme_use("clam")

        style.configure(".",
                        background=BG, foreground=TEXT,
                        font=("Segoe UI", 10),
                        borderwidth=0, relief="flat")

        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))

        style.configure("Accent.TButton",
                        background=ACCENT, foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])

        style.configure("Ghost.TButton",
                        background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])

        style.configure("TProgressbar",
                        troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))
        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")
        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED, font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        # The bar animates from start-up until the model load finishes.
        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # ── Step 1 – Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)

        # ── Step 2 – Text input
        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)

        # ── Step 3 – Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Add a titled, bordered card and let ``builder`` fill its content.

        Args:
            title: Heading shown at the top of the card.
            builder: Callable invoked with the card's inner frame as parent.
            expand: Whether the card absorbs extra vertical space.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)

        frame = tk.Frame(outer, bg=SURFACE, bd=0,
                         highlightthickness=1, highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)

        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))

        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))

        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Build the status line, record/load buttons and timer of step 1."""
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                          anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))

        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")

        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var,
                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))

        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Build the multi-line text box of step 2, pre-filled with a sample."""
        self._text_input = tk.Text(parent, height=6, wrap="word",
                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
                                   font=("Segoe UI", 11), relief="flat", bd=0,
                                   padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")

        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Build the three generate buttons and the status line of step 3.

        The buttons start disabled; ``_on_tts_ready`` enables them.
        """
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
                                    command=lambda: self._generate(play=True, save=False),
                                    style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))

        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
                                    command=lambda: self._generate(play=False, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))

        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
                                    command=lambda: self._generate(play=True, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")

        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                        anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────

    def _load_tts_async(self):
        """Import and instantiate the XTTS v2 model on a daemon thread.

        On success ``_on_tts_ready`` is scheduled on the Tk loop, on any
        failure ``_on_tts_error`` is scheduled with the exception.
        """
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
                self.root.after(0, self._on_tts_ready)
            except Exception as exc:
                # ``exc`` is unbound once the except clause exits, so keep a
                # separate reference for the deferred callback.
                _exc = exc
                self.root.after(0, lambda: self._on_tts_error(_exc))

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Stop the loading animation and enable the generate buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Report a model-loading failure in the header, status line and a dialog."""
        self._progress.stop()
        # Reset the bar as ``_on_tts_ready`` does, so it does not stay frozen
        # part-way through its animation.
        self._progress.config(value=0)
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
                             "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────

    def _toggle_recording(self):
        """Start recording when idle, stop (and save) when already recording."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a daemon thread."""
        self.recording = True
        # Each recording gets a fresh list; the worker holds its own reference
        # so a straggling worker can never append to a later recording.
        chunks: list = []
        self.recorded_chunks = chunks
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status("🔴 Recording… speak naturally for 10–30 seconds.", WARNING)

        # Restart the timer from zero. The first tick fires after one second
        # so the display does not claim a second of audio at t=0, and any
        # timer left over from a previous recording is cancelled first.
        self._cancel_tick()
        self._elapsed = 0
        self._rec_timer_var.set("0 s")
        self._tick_job = self.root.after(1000, self._tick)

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # Surface device/stream failures in the UI instead of letting
                # the thread die silently while the UI still shows "recording".
                worker = threading.current_thread()
                self.root.after(0, lambda err=exc: self._on_record_error(err, worker))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _on_record_error(self, exc, worker=None):
        """Reset the recording UI after the capture thread failed.

        Args:
            exc: The exception raised by the capture thread.
            worker: The thread that failed; errors from a thread that is no
                longer the active recorder are ignored.
        """
        if worker is not None and worker is not self._record_thread:
            return
        self.recording = False
        self._record_thread = None
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")
        self._set_voice_status(f"Recording failed: {exc}", ERROR)

    def _cancel_tick(self):
        """Cancel the pending recording-timer callback, if there is one."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the on-screen recording timer by one second."""
        self._tick_job = None
        if not self.recording:
            self._rec_timer_var.set("")
            return
        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self._tick_job = self.root.after(1000, self._tick)

    def _stop_recording(self):
        """Stop capturing and save the recording as a 16-bit WAV file."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")

        # Wait for the worker to leave its read loop so the chunk list is
        # complete and no longer being appended to. The timeout keeps the UI
        # from hanging if the audio backend stalls.
        worker = self._record_thread
        self._record_thread = None
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=2.0)

        if not self.recorded_chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(self.recorded_chunks, axis=0)
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)

        # Always write to the application's own sample file. The current
        # ``voice_sample_path`` may point at a file the user loaded, which
        # must not be overwritten by a new recording.
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE

        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Generation ────────────────────────────────────────────────────────────

    @staticmethod
    def _pcm_to_float32(data):
        """Convert samples from ``scipy.io.wavfile.read`` to mono float32.

        Integer PCM is scaled by its dtype's range (unsigned 8-bit is first
        re-centred around zero); floating-point data is only cast.
        Multi-channel audio is averaged down to one channel.
        """
        if data.dtype == np.uint8:
            audio = (data.astype(np.float32) - 128.0) / 128.0
        elif np.issubdtype(data.dtype, np.integer):
            audio = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
        else:
            audio = data.astype(np.float32)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        return audio

    def _generate(self, play: bool, save: bool):
        """Synthesize the entered text in the sampled voice.

        Validates the preconditions on the UI thread, optionally asks for a
        destination file, then runs synthesis (and playback) on a daemon
        thread so the window stays responsive.

        Args:
            play: Play the generated audio once synthesis finishes.
            save: Ask for a destination and keep the generated WAV there;
                otherwise the audio goes to a temporary file.
        """
        if not self.tts:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return

        # Generating disables the record button, which would leave an active
        # recording impossible to stop until synthesis finished.
        if self.recording:
            messagebox.showerror("Recording in progress",
                                 "Please stop the recording before generating speech.")
            return

        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                                 "Please record your voice or load an audio file first.")
            return

        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                return

        # Snapshot the sample path on the UI thread so the worker is not
        # affected by the user loading another file mid-generation.
        speaker_wav = self.voice_sample_path

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker_wav,
                    language="en",
                    file_path=out,
                )

                if play:
                    sr, data = wav.read(out)
                    sd.play(self._pcm_to_float32(data), samplerate=sr)
                    sd.wait()

                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # Format the message now: ``exc`` is unbound as soon as this
                # except clause exits, so a callback that referenced it
                # lazily would raise NameError when Tk finally runs it.
                err_msg = f"Error: {exc}"
                self.root.after(0, lambda: self._set_gen_status(err_msg, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Stop the progress animation and re-enable the buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Show ``msg`` in the step 1 status line using ``color``."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Show ``msg`` in the step 3 status line using ``color``."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Update the header status text and tint the status dot to match."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Apply ``state`` to the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
            btn.config(state=state)
|
||
|
||
|
||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||
|
||
def main() -> None:
    """Create the Tk root window, attach the application and run the event loop."""
    window = tk.Tk()
    # Keep a named reference to the application for the lifetime of the loop.
    app = VoiceCloneTTS(window)  # noqa: F841
    window.mainloop()


if __name__ == "__main__":
    main()
|