This commit is contained in:
Jay 2026-03-28 22:04:45 +09:00
commit 34d55dc95f
8 changed files with 553 additions and 0 deletions

2
.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
.venv/

1
.python-version Normal file
View file

@ -0,0 +1 @@
3.10.14

78
README.md Normal file
View file

@ -0,0 +1,78 @@
# Voice Clone TTS
Type any text, hear it in your own voice. Runs fully offline.
![Screenshot](docs/assets/img/preview.png)
---
## Setup (first time only)
**1. Install system packages:**
```bash
sudo apt install portaudio19-dev python3-tk espeak-ng -y
```
**2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug):
```bash
curl https://pyenv.run | bash
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc
source ~/.bashrc
pyenv install 3.10.14
pyenv local 3.10.14
```
**3. Create a virtual environment:**
```bash
~/.pyenv/versions/3.10.14/bin/python -m venv .venv
source .venv/bin/activate
```
**4. Install Python packages** (takes 15-30 min, downloads ~2GB):
```bash
pip install --upgrade pip
pip install --no-cache-dir "numpy==1.22.0"
pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy
pip install --no-cache-dir "transformers==4.40.0"
pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0"
```
---
## Running the app
Every time you want to use it:
```bash
source .venv/bin/activate
python voice_clone_tts.py
```
---
## How to use
1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5-15 min)*
2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording**
3. Type any text
4. Click **Generate & Play**
Your voice sample saves as `my_voice_sample.wav` and is reused automatically on future runs.
---
## Best text to record (Rainbow Passage)
> *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."*
Read it **twice through** at your normal pace.
---
## Tips
- Record in a quiet room with no background noise
- Speak naturally — don't put on a "reading voice"
- 30 seconds of clean audio is the sweet spot
- Generation takes 10-30 seconds per sentence on CPU

BIN
docs/assets/img/preview.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 KiB

41
requirements.txt Normal file
View file

@ -0,0 +1,41 @@
# NOTE(review): this list does not include sounddevice, scipy or TTS, which
# voice_clone_tts.py imports, and it pins numpy==2.2.6 while the README setup
# steps pin numpy==1.22.0. It looks like a `pip freeze` of an unrelated
# environment — TODO confirm and regenerate from the project's own .venv.
asgiref==3.7.2
bcrypt==5.0.0
black==24.2.0
blinker==1.9.0
Brotli==1.1.0
certifi==2023.7.22
charset-normalizer==3.3.2
click==8.1.7
Django==5.0.2
django-appconf==1.0.6
evdev==1.7.1
Flask==3.1.3
gallery_dl==1.30.6
idna==3.4
instaloader==4.14
itsdangerous==2.2.0
Jinja2==3.1.2
libsass==0.23.0
MarkupSafe==2.1.3
mutagen==1.47.0
mypy-extensions==1.0.0
numpy==2.2.6
packaging==23.2
pathspec==0.12.1
platformdirs==4.2.0
pycryptodomex==3.19.0
python-dateutil==2.9.0.post0
python-xlib==0.33
pytz==2026.1.post1
rcssmin==1.1.1
requests==2.31.0
rjsmin==1.2.1
six==1.17.0
sqlparse==0.4.4
tomli==2.0.1
typing_extensions==4.9.0
tzdata==2025.3
urllib3==2.1.0
websockets==12.0
Werkzeug==3.1.3
yt-dlp==2025.1.26

18
run_mac_linux.sh Normal file
View file

@ -0,0 +1,18 @@
#!/bin/bash
# Voice Clone TTS launcher for macOS / Linux.
# Checks that python3 exists, installs requirements.txt, then starts the app.

# Run from the script's own directory so requirements.txt and
# voice_clone_tts.py resolve no matter where the script was invoked from.
cd "$(dirname "$0")" || exit 1

echo "============================================"
echo " Voice Clone TTS - Setup & Launch"
echo "============================================"
echo
# Check Python
if ! command -v python3 &>/dev/null; then
    echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/"
    exit 1
fi
echo "Installing dependencies (first time only)..."
# Use "python3 -m pip" so packages land in the same interpreter that runs the
# app, and stop instead of launching when the install fails.
if ! python3 -m pip install -r requirements.txt; then
    echo "[ERROR] Dependency installation failed." >&2
    exit 1
fi
echo
echo "Launching app..."
python3 voice_clone_tts.py

22
run_windows.bat Normal file
View file

@ -0,0 +1,22 @@
@echo off
:: Voice Clone TTS launcher for Windows.
:: Checks that Python exists, installs requirements.txt, then starts the app.

:: Run from the script's own folder so requirements.txt and
:: voice_clone_tts.py resolve no matter where the script was started from.
cd /d "%~dp0"

echo ============================================
echo Voice Clone TTS - Setup ^& Launch
echo ============================================
echo.
:: Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Download it from https://www.python.org/downloads/
    pause
    exit /b 1
)
:: Install dependencies
echo Installing dependencies (first time only, ~5 min)...
:: "python -m pip" installs into the same interpreter that runs the app.
python -m pip install -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Dependency installation failed.
    pause
    exit /b 1
)
echo.
echo Launching app...
python voice_clone_tts.py
pause

391
voice_clone_tts.py Normal file
View file

@ -0,0 +1,391 @@
"""
Voice Clone TTS
===============
Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
Usage:
1. Run this script: python voice_clone_tts.py
2. Record ~15 seconds of your voice
3. Type any text and click Generate
"""
import tkinter as tk
from tkinter import ttk, messagebox, filedialog
import threading
import os
import tempfile
import numpy as np
# sounddevice and scipy are third-party packages; print an install hint
# before re-raising so a missing package is easy to diagnose.
try:
    import sounddevice as sd
    import scipy.io.wavfile as wav
except ImportError:
    print("Missing dependencies. Please run: pip install sounddevice scipy")
    raise
# Sample rate passed to the microphone input stream and written into the
# recorded WAV file.
SAMPLE_RATE = 22050
# Default path of the voice sample; checked at start-up so an earlier
# recording is picked up again.
VOICE_SAMPLE_FILE = "my_voice_sample.wav"
# ─── Colour palette ──────────────────────────────────────────────────────────
BG = "#0f0f13"       # window / header background
SURFACE = "#1a1a24"  # card background
BORDER = "#2a2a3a"   # card outline, separators, disabled button background
ACCENT = "#7c6af7"   # primary button, progress bar, text selection
ACCENT2 = "#a89cf7"  # titles and the hover colour of the primary button
TEXT = "#e8e6f0"     # default foreground
MUTED = "#666680"    # secondary / hint text
SUCCESS = "#4ade80"  # status colour: success
ERROR = "#f87171"    # status colour: failure
WARNING = "#fbbf24"  # status colour: recording in progress
# ─────────────────────────────────────────────────────────────────────────────
class VoiceCloneTTS:
    """Tkinter front-end for recording a voice sample and speaking text with it.

    The window has three cards: record/load a voice sample, enter text, and
    generate speech. The XTTS v2 model is loaded on a background thread and
    the generate buttons stay disabled until it is ready. Recording and
    generation also run on background threads; they hand results back to
    the Tk event loop through ``root.after``.
    """

    def __init__(self, root: tk.Tk):
        """Configure the window, build the widgets and start loading the model.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        self._elapsed = 0
        # Identifier of the pending timer callback, so it can be cancelled.
        self._tick_job: str | None = None

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────
    def _apply_styles(self):
        """Register the ttk styles used by the widgets."""
        style = ttk.Style()
        style.theme_use("clam")
        style.configure(".",
                        background=BG, foreground=TEXT,
                        font=("Segoe UI", 10),
                        borderwidth=0, relief="flat")
        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))
        style.configure("Accent.TButton",
                        background=ACCENT, foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])
        style.configure("Ghost.TButton",
                        background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])
        style.configure("TProgressbar",
                        troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────
    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))
        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")
        # NOTE(review): the label text is empty, so recolouring this "dot"
        # has no visible effect — confirm whether a glyph was intended.
        self._status_dot = tk.Label(header, text="", bg=BG, fg=MUTED, font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        # The bar animates until the model has loaded (or failed to load).
        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # ── Step 1 Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)
        # ── Step 2 Text input
        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)
        # ── Step 3 Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Create a titled, bordered card and let ``builder`` fill it.

        Args:
            title: Heading shown at the top of the card.
            builder: Callable that receives the card's inner frame and adds
                the card's widgets to it.
            expand: Whether the card takes up spare vertical space.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)
        frame = tk.Frame(outer, bg=SURFACE, bd=0,
                         highlightthickness=1, highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)
        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))
        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))
        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Add the status line, record/load buttons, timer and tip to ``parent``."""
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                          anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")
        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))
        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")

        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var,
                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))
        tk.Label(parent, text="Tip: Speak naturally for 10-30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Add the text box (pre-filled with a sample sentence) to ``parent``."""
        self._text_input = tk.Text(parent, height=6, wrap="word",
                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
                                   font=("Segoe UI", 11), relief="flat", bd=0,
                                   padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")
        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Add the three generate buttons (initially disabled) and a status line."""
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")
        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
                                    command=lambda: self._generate(play=True, save=False),
                                    style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))
        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
                                    command=lambda: self._generate(play=False, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))
        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
                                    command=lambda: self._generate(play=True, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")

        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                        anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────
    def _load_tts_async(self):
        """Load the XTTS v2 model on a daemon thread and report back via ``after``."""
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
                self.root.after(0, self._on_tts_ready)
            except Exception as exc:
                # ``exc`` is unbound when the except block ends, so bind it
                # as a default argument for the deferred callback.
                self.root.after(0, lambda err=exc: self._on_tts_error(err))

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Stop the progress bar and enable the generate buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Show a model-load failure in the header, status line and a dialog."""
        self._progress.stop()
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
                             "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────
    def _toggle_recording(self):
        """Start recording when idle, stop it when already recording."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a background thread."""
        self.recording = True
        self.recorded_chunks = []
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status("🔴 Recording… speak naturally for 10-30 seconds.", WARNING)

        # Show "0 s" now and schedule the first tick one second later;
        # cancelling first guarantees a single timer chain per recording.
        self._elapsed = 0
        self._rec_timer_var.set("0 s")
        self._cancel_tick()
        self._tick_job = self.root.after(1000, self._tick)

        # Bind this session's list so a late chunk can never end up in the
        # buffer of a later recording.
        chunks = self.recorded_chunks

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1,
                                    dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # Surface the failure instead of leaving the UI stuck in
                # the "recording" state.
                self.root.after(0, lambda err=exc: self._on_record_error(err))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _on_record_error(self, exc):
        """Reset the recording UI after the capture thread failed."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")
        self._set_voice_status(f"Recording failed: {exc}", ERROR)

    def _cancel_tick(self):
        """Cancel the pending timer callback, if any."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the on-screen recording timer by one second."""
        self._tick_job = None
        if not self.recording:
            self._rec_timer_var.set("")
            return
        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self._tick_job = self.root.after(1000, self._tick)

    def _stop_recording(self):
        """Stop capturing, save the audio as 16-bit WAV and update the status."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")

        # Wait for the capture thread so it is no longer appending while the
        # chunks are concatenated. The timeout bounds how long the UI blocks.
        worker = self._record_thread
        if worker is not None and worker.is_alive():
            worker.join(timeout=2.0)
        self._record_thread = None

        if not self.recorded_chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(self.recorded_chunks, axis=0)
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)

        # Always write to the app's own sample file: ``voice_sample_path`` may
        # point at a file the user loaded, which must not be overwritten.
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE

        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Generation ────────────────────────────────────────────────────────────
    def _generate(self, play: bool, save: bool):
        """Synthesize the entered text in the sampled voice.

        Args:
            play: Play the generated audio once synthesis finishes.
            save: Ask for a destination and write the audio there; when
                False the audio goes to a file in the temp directory.
        """
        if not self.tts:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return
        if self.recording:
            # The buttons are disabled below; doing that mid-recording would
            # leave no way to stop it.
            messagebox.showerror("Recording in progress",
                                 "Please stop the recording before generating speech.")
            return
        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                                 "Please record your voice or load an audio file first.")
            return
        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                return  # dialog cancelled

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳ Generating speech (this may take ~10-30 s)…", ACCENT2)

        # Snapshot the sample path so the worker uses the one selected now.
        speaker = self.voice_sample_path

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker,
                    language="en",
                    file_path=out,
                )
                if play:
                    sr, data = wav.read(out)
                    # int16 PCM is scaled to [-1, 1); other dtypes pass through.
                    audio = data.astype(np.float32) / (32768.0 if data.dtype == np.int16 else 1.0)
                    if audio.ndim > 1:
                        audio = audio.mean(axis=1)  # down-mix to mono
                    sd.play(audio, samplerate=sr)
                    sd.wait()
                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # ``exc`` is unbound when the except block ends, so format the
                # message now rather than inside the deferred callback.
                err_msg = f"Error: {exc}"
                self.root.after(0, lambda: self._set_gen_status(err_msg, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Stop the progress bar and re-enable the buttons after a run."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────
    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Set the text and colour of the voice-sample status line."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Set the text and colour of the generation status line."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Set the header status text and recolour it and the status dot."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Apply ``state`` to the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
            btn.config(state=state)
# ── Entry point ───────────────────────────────────────────────────────────────
def main() -> None:
    """Create the Tk root window, attach the app and run the event loop."""
    window = tk.Tk()
    # Hold a reference so the app object lives as long as the event loop.
    _app = VoiceCloneTTS(window)
    window.mainloop()


if __name__ == "__main__":
    main()