"""
Voice Clone TTS
===============
Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).

Usage:
    1. Run this script:  python voice_clone_tts.py
    2. Record ~15 seconds of your voice
    3. Type any text and click Generate
"""

import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import threading
import os
import tempfile

import numpy as np

try:
    import sounddevice as sd
    import scipy.io.wavfile as wav
except ImportError:
    print("Missing dependencies. Please run: pip install sounddevice scipy")
    raise

SAMPLE_RATE = 22050
VOICE_SAMPLE_FILE = "my_voice_sample.wav"

# ─── Colour palette ──────────────────────────────────────────────────────────
BG = "#0f0f13"
SURFACE = "#1a1a24"
BORDER = "#2a2a3a"
ACCENT = "#7c6af7"
ACCENT2 = "#a89cf7"
TEXT = "#e8e6f0"
MUTED = "#666680"
SUCCESS = "#4ade80"
ERROR = "#f87171"
WARNING = "#fbbf24"
# ─────────────────────────────────────────────────────────────────────────────


class VoiceCloneTTS:
    """Tkinter front-end for recording a voice sample and cloning it with XTTS v2.

    The window has three cards: record/load a voice sample, enter text, and
    generate speech. Model loading, microphone capture and speech generation
    each run on a daemon thread; UI updates from those threads are posted back
    with ``root.after``.
    """

    def __init__(self, root: tk.Tk):
        """Configure the window, build the UI and start loading the model.

        Args:
            root: The Tk root window this application draws into.
        """
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        self._elapsed = 0
        # Id of the pending 1-second timer callback, or None when no tick
        # is scheduled; kept so a stale tick can be cancelled.
        self._tick_job = None

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(
                f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────
    def _apply_styles(self):
        """Register the ttk styles (clam theme + dark palette) used by the UI."""
        style = ttk.Style()
        style.theme_use("clam")

        style.configure(".", background=BG, foreground=TEXT,
                        font=("Segoe UI", 10), borderwidth=0, relief="flat")
        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED,
                        font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))

        style.configure("Accent.TButton", background=ACCENT,
                        foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])

        style.configure("Ghost.TButton", background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])

        style.configure("TProgressbar", troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────
    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))

        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")

        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED,
                                    font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)

        self._header_status = tk.Label(header, text="Loading model…", bg=BG,
                                       fg=MUTED, font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        self._progress = ttk.Progressbar(self.root, mode="indeterminate",
                                         style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # ── Step 1 – Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)
        # ── Step 2 – Text input
        self._card("Step 2 — Enter Text", self._build_text_section,
                   expand=True)
        # ── Step 3 – Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Add a titled, bordered card and let ``builder`` fill its body.

        Args:
            title: Heading shown at the top of the card.
            builder: Callable invoked with the card's inner frame as parent.
            expand: Whether the card should grow with the window.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)

        frame = tk.Frame(outer, bg=SURFACE, bd=0, highlightthickness=1,
                         highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)

        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(
                     anchor="w", padx=14, pady=(10, 0))

        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))

        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Build the status line, record/load buttons, timer and tip."""
        self._voice_status_var = tk.StringVar(
            value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(
            parent, textvariable=self._voice_status_var, bg=SURFACE, fg=MUTED,
            font=("Segoe UI", 9), anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording,
                                   style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))

        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file,
                   style="Ghost.TButton").pack(side="left")

        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var, bg=SURFACE,
                 fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(
                     anchor="w", pady=(6, 0))

        tk.Label(parent,
                 text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(
                     anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Build the multi-line text box pre-filled with a sample sentence."""
        self._text_input = tk.Text(
            parent, height=6, wrap="word", bg="#12121a", fg=TEXT,
            insertbackground=ACCENT2, font=("Segoe UI", 11), relief="flat",
            bd=0, padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert(
            "1.0", "Hello! This is my voice, speaking through a computer.")

        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent,
                 text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(
                     anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Build the three generate buttons (disabled until the model loads)."""
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._play_btn = ttk.Button(
            btn_row, text="▶ Generate & Play",
            command=lambda: self._generate(play=True, save=False),
            style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))

        self._save_btn = ttk.Button(
            btn_row, text="💾 Generate & Save",
            command=lambda: self._generate(play=False, save=True),
            style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))

        self._both_btn = ttk.Button(
            btn_row, text="▶💾 Play & Save",
            command=lambda: self._generate(play=True, save=True),
            style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")

        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(
            parent, textvariable=self._gen_status_var, bg=SURFACE, fg=MUTED,
            font=("Segoe UI", 9), anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────
    def _load_tts_async(self):
        """Load the XTTS v2 model on a daemon thread and report the outcome."""
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS(
                    "tts_models/multilingual/multi-dataset/xtts_v2",
                    gpu=False)
                self.root.after(0, self._on_tts_ready)
            except Exception as exc:
                # `exc` is unbound when the except clause ends, so bind it to
                # a name the deferred callback can still see.
                _exc = exc
                self.root.after(0, lambda: self._on_tts_error(_exc))

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Stop the spinner and enable the generate buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Show a model-load failure in the header, status line and a dialog."""
        self._progress.stop()
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror(
            "TTS Load Error",
            f"Could not load the XTTS v2 model:\n{exc}\n\n"
            "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────
    def _toggle_recording(self):
        """Start recording if idle, otherwise stop and save."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a daemon thread."""
        self.recording = True
        # Each recording gets its own list so a straggling recorder thread
        # from a previous take can never append into the new one.
        chunks: list = []
        self.recorded_chunks = chunks
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status(
            "🔴 Recording… speak naturally for 10–30 seconds.", WARNING)

        # Show "0 s" now and count the first second only after it has passed.
        self._cancel_tick()
        self._elapsed = 0
        self._rec_timer_var.set("0 s")
        self._tick_job = self.root.after(1000, self._tick)

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1,
                                    dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # Thread boundary: surface device/stream failures in the UI
                # instead of letting the thread die silently.
                err = str(exc)
                self.root.after(0, lambda: self._on_record_error(err))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _on_record_error(self, message: str):
        """Reset the recording UI after the capture thread failed."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")
        self._set_voice_status(f"Recording failed: {message}", ERROR)

    def _cancel_tick(self):
        """Cancel the pending timer callback, if any."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the on-screen recording timer by one second."""
        self._tick_job = None
        if not self.recording:
            self._rec_timer_var.set("")
            return
        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self._tick_job = self.root.after(1000, self._tick)

    def _stop_recording(self):
        """Stop capturing and save the take to ``VOICE_SAMPLE_FILE``.

        The recording is always written to ``VOICE_SAMPLE_FILE`` rather than
        to ``self.voice_sample_path``, because the latter may point at a file
        the user loaded from disk and must not be overwritten.
        """
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")

        # Let the recorder finish its in-flight read before touching the
        # chunk list; the timeout keeps the UI from hanging on a stuck stream.
        recorder = self._record_thread
        if recorder is not None:
            recorder.join(timeout=1.0)
            self._record_thread = None

        chunks = list(self.recorded_chunks)
        if not chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(chunks, axis=0)
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return

        # Only switch the active sample once the file is actually on disk.
        self.voice_sample_path = VOICE_SAMPLE_FILE
        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}",
            SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"),
                       ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(
                f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Generation ────────────────────────────────────────────────────────────
    @staticmethod
    def _to_float_mono(data):
        """Convert WAV sample data to mono float32 for playback.

        Integer samples are scaled by their dtype's range (unsigned 8-bit is
        re-centred first); float samples are passed through unscaled.
        Multi-channel audio is averaged down to one channel.
        """
        if data.dtype == np.uint8:
            audio = (data.astype(np.float32) - 128.0) / 128.0
        elif np.issubdtype(data.dtype, np.integer):
            full_scale = float(np.iinfo(data.dtype).max) + 1.0
            audio = data.astype(np.float32) / full_scale
        else:
            audio = data.astype(np.float32)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        return audio

    def _generate(self, play: bool, save: bool):
        """Synthesize the entered text in the sampled voice.

        Args:
            play: Play the generated audio when synthesis finishes.
            save: Ask for a destination and write the audio there; otherwise
                a file in the system temp directory is used.
        """
        if self.tts is None:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return
        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror(
                "No voice sample",
                "Please record your voice or load an audio file first.")
            return

        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                return

        # Snapshot the sample path so the worker uses the file that was
        # validated above.
        speaker_wav = self.voice_sample_path

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status(
            "⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(),
                                                "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker_wav,
                    language="en",
                    file_path=out,
                )
                if play:
                    sr, data = wav.read(out)
                    sd.play(self._to_float_mono(data), samplerate=sr)
                    sd.wait()

                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(
                    0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # Format the message now: `exc` is unbound once this clause
                # ends, so a lambda referencing it later would raise
                # NameError and the error would never be shown.
                err_msg = f"Error: {exc}"
                self.root.after(
                    0, lambda: self._set_gen_status(err_msg, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Stop the spinner and re-enable the buttons after generation."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────
    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Update the voice-sample status line text and colour."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Update the generation status line text and colour."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Update the header status text and the colour of its dot."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Set the state of the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn,
                    self._rec_btn):
            btn.config(state=state)


# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    root = tk.Tk()
    app = VoiceCloneTTS(root)
    root.mainloop()