voiceclone-tts/voice_clone_tts.py
2026-03-28 22:04:45 +09:00

391 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Voice Clone TTS
===============
Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
Usage:
1. Run this script: python voice_clone_tts.py
2. Record ~15 seconds of your voice
3. Type any text and click Generate
"""
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import threading
import os
import tempfile
import numpy as np
try:
import sounddevice as sd
import scipy.io.wavfile as wav
except ImportError:
print("Missing dependencies. Please run: pip install sounddevice scipy")
raise
# Sample rate in Hz used for microphone capture and for the saved voice-sample WAV.
SAMPLE_RATE = 22050
# Default path (relative to the working directory) of the recorded voice sample.
VOICE_SAMPLE_FILE = "my_voice_sample.wav"
# ─── Colour palette ──────────────────────────────────────────────────────────
# Hex colours shared by the ttk styles and the plain tk widgets.
BG = "#0f0f13"  # root window / header background
SURFACE = "#1a1a24"  # card background
BORDER = "#2a2a3a"  # card outline, separators, disabled-button background
ACCENT = "#7c6af7"  # primary button, progress bar, recording timer
ACCENT2 = "#a89cf7"  # titles, primary-button hover, text cursor
TEXT = "#e8e6f0"  # default foreground
MUTED = "#666680"  # hints and secondary/disabled text
SUCCESS = "#4ade80"  # status colour: success
ERROR = "#f87171"  # status colour: failure
WARNING = "#fbbf24"  # status colour: recording in progress
# ─────────────────────────────────────────────────────────────────────────────
class VoiceCloneTTS:
    """Tkinter GUI that records a voice sample and speaks text with XTTS v2.

    The window has three cards: record/load a voice sample, enter text, and
    generate speech (play, save, or both). The TTS model is loaded on a
    background thread; audio capture and synthesis also run on worker threads
    so the Tk event loop stays responsive.
    """

    def __init__(self, root: tk.Tk):
        """Configure the window, build the widgets and start loading the model.

        Args:
            root: The Tk root window this application attaches to.
        """
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        # Per-session stop flag for the capture thread (see _start_recording).
        self._stop_event: threading.Event | None = None
        # Exception raised by the capture thread, reported on the Tk thread.
        self._record_error: Exception | None = None
        # Identifier of the pending `after` job that drives the timer label.
        self._tick_job: str | None = None
        self._elapsed = 0

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────
    def _apply_styles(self):
        """Register the ttk styles used by the buttons, labels and progress bar."""
        style = ttk.Style()
        style.theme_use("clam")
        style.configure(".",
                        background=BG, foreground=TEXT,
                        font=("Segoe UI", 10),
                        borderwidth=0, relief="flat")
        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))
        style.configure("Accent.TButton",
                        background=ACCENT, foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])
        style.configure("Ghost.TButton",
                        background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])
        style.configure("TProgressbar",
                        troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────
    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))
        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")
        # NOTE(review): this label has empty text, so recolouring it in
        # _set_header_status shows nothing — a glyph may have been lost; confirm.
        self._status_dot = tk.Label(header, text="", bg=BG, fg=MUTED, font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)
        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)
        # ── Step 1 Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)
        # ── Step 2 Text input
        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)
        # ── Step 3 Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Add a titled, bordered card and let ``builder`` populate it.

        Args:
            title: Heading shown at the top of the card.
            builder: Callable that receives the card's inner frame and adds
                widgets to it.
            expand: Whether the card should grow with the window.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)
        frame = tk.Frame(outer, bg=SURFACE, bd=0,
                         highlightthickness=1, highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)
        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))
        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))
        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Populate the Step 1 card: status line, record/load buttons, timer, tip."""
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                          anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")
        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))
        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")
        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var,
                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))
        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Populate the Step 2 card with the text box and a language hint."""
        self._text_input = tk.Text(parent, height=6, wrap="word",
                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
                                   font=("Segoe UI", 11), relief="flat", bd=0,
                                   padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")
        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Populate the Step 3 card with the three generate buttons and a status line.

        The buttons start disabled; they are enabled once the model has loaded.
        """
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")
        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
                                    command=lambda: self._generate(play=True, save=False),
                                    style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))
        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
                                    command=lambda: self._generate(play=False, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))
        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
                                    command=lambda: self._generate(play=True, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")
        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                        anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────
    def _load_tts_async(self):
        """Import and construct the XTTS v2 model on a daemon thread.

        The outcome is reported back through ``root.after`` to
        ``_on_tts_ready`` or ``_on_tts_error``.
        """
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
                self.root.after(0, self._on_tts_ready)
            except Exception as exc:
                # Rebind: `exc` is unbound when the except clause ends, but the
                # callback runs later.
                _exc = exc
                self.root.after(0, lambda: self._on_tts_error(_exc))

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Stop the progress bar and enable the generate buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Show the model-loading failure in the header, status line and a dialog."""
        self._progress.stop()
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
                             "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────
    def _toggle_recording(self):
        """Start recording if idle, otherwise stop and save."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a daemon thread and start the timer."""
        self.recording = True
        self.recorded_chunks = []
        self._record_error = None
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status("🔴 Recording… speak naturally for 10–30 seconds.", WARNING)
        self._elapsed = 0
        self._rec_timer_var.set("0 s")
        # Drop any tick left over from a previous session so the counter is not
        # advanced twice per second, and count the first second only after it
        # has actually elapsed.
        self._cancel_tick()
        self._tick_job = self.root.after(1000, self._tick)

        # Each session gets its own chunk list and stop flag, so a worker that
        # is slow to exit can neither resume nor write into a later recording.
        chunks = self.recorded_chunks
        stop_event = threading.Event()
        self._stop_event = stop_event

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
                    while not stop_event.is_set():
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # Only stored here; the Tk thread reports it from _tick or
                # _stop_recording, so no widget is touched off-thread.
                self._record_error = exc

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _cancel_tick(self):
        """Cancel the pending timer callback, if there is one."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the elapsed-seconds label once per second while recording."""
        self._tick_job = None
        if not self.recording:
            self._rec_timer_var.set("")
            return
        if self._record_error is not None:
            # The capture thread died; finish the session and show the error.
            self._stop_recording()
            return
        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self._tick_job = self.root.after(1000, self._tick)

    def _stop_recording(self):
        """Stop capturing and write the audio to ``VOICE_SAMPLE_FILE`` as 16-bit WAV.

        The recording always goes to ``VOICE_SAMPLE_FILE`` so that a file
        chosen earlier through "Load Audio File" is never overwritten.
        """
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")

        if self._stop_event is not None:
            self._stop_event.set()
        worker = self._record_thread
        self._record_thread = None
        if worker is not None:
            # Wait for the in-flight read to finish so every chunk is present.
            # The timeout keeps the UI from hanging if the audio backend stalls.
            worker.join(timeout=2.0)

        error = self._record_error
        self._record_error = None
        if error is not None:
            self._set_voice_status(f"Recording failed: {error}", ERROR)
            return

        chunks = list(self.recorded_chunks)
        if not chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(chunks, axis=0)
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE
        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Generation ────────────────────────────────────────────────────────────
    @staticmethod
    def _to_float_mono(data: np.ndarray) -> np.ndarray:
        """Convert samples from ``scipy.io.wavfile.read`` to mono float32.

        Unsigned 8-bit data is re-centred around zero and signed integer data
        is divided by its full-scale value; float data is passed through.
        Multi-channel audio is averaged down to one channel.
        """
        audio = data.astype(np.float32)
        if data.dtype == np.uint8:
            audio = (audio - 128.0) / 128.0
        elif np.issubdtype(data.dtype, np.integer):
            audio = audio / float(-int(np.iinfo(data.dtype).min))
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        return audio

    def _generate(self, play: bool, save: bool, language: str = "en"):
        """Synthesise the entered text in the sampled voice.

        Args:
            play: Play the generated audio once synthesis finishes.
            save: Ask for a destination and write the audio there; otherwise a
                file in the system temp directory is used.
            language: Language code passed to ``tts_to_file``.
        """
        if not self.tts:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return
        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                                 "Please record your voice or load an audio file first.")
            return
        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                return

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)
        # Snapshot the path so changing the sample mid-run cannot affect this job.
        speaker_wav = self.voice_sample_path

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker_wav,
                    language=language,
                    file_path=out,
                )
                if play:
                    sr, data = wav.read(out)
                    sd.play(self._to_float_mono(data), samplerate=sr)
                    sd.wait()
                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # Build the text now: `exc` is unbound as soon as this clause
                # ends, but the callback only runs later on the Tk thread.
                err = f"Error: {exc}"
                self.root.after(0, lambda: self._set_gen_status(err, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Stop the progress bar and re-enable the buttons after a generation run."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────
    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Update the Step 1 status line text and colour."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Update the Step 3 status line text and colour."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Update the header status text and recolour it and the status dot."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Set the state of the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
            btn.config(state=state)
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Create the Tk root, attach the application to it and run the event loop
    # until the window is closed.
    root = tk.Tk()
    app = VoiceCloneTTS(root)
    root.mainloop()