Upload
This commit is contained in:
commit
34d55dc95f
8 changed files with 553 additions and 0 deletions
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
.venv/
|
||||||
|
|
||||||
1
.python-version
Normal file
1
.python-version
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
3.10.14
|
||||||
78
README.md
Normal file
78
README.md
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
# Voice Clone TTS
|
||||||
|
|
||||||
|
Type any text, hear it in your own voice. Runs fully offline.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Setup (first time only)
|
||||||
|
|
||||||
|
**1. Install system packages:**
|
||||||
|
```bash
|
||||||
|
sudo apt install portaudio19-dev python3-tk espeak-ng -y
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug):
|
||||||
|
```bash
|
||||||
|
curl https://pyenv.run | bash
|
||||||
|
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
|
||||||
|
echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
|
||||||
|
echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc
|
||||||
|
source ~/.bashrc
|
||||||
|
pyenv install 3.10.14
|
||||||
|
pyenv local 3.10.14
|
||||||
|
```
|
||||||
|
|
||||||
|
**3. Create a virtual environment:**
|
||||||
|
```bash
|
||||||
|
~/.pyenv/versions/3.10.14/bin/python -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
```
|
||||||
|
|
||||||
|
**4. Install Python packages** (takes 15–30 min, downloads ~2GB):
|
||||||
|
```bash
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install --no-cache-dir "numpy==1.22.0"
|
||||||
|
pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy
|
||||||
|
pip install --no-cache-dir "transformers==4.40.0"
|
||||||
|
pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Running the app
|
||||||
|
|
||||||
|
Every time you want to use it:
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
python voice_clone_tts.py
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How to use
|
||||||
|
|
||||||
|
1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5–15 min)*
|
||||||
|
2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording**
|
||||||
|
3. Type any text
|
||||||
|
4. Click **Generate & Play**
|
||||||
|
|
||||||
|
Your voice sample saves as `my_voice_sample.wav` and is reused automatically on future runs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Best text to record (Rainbow Passage)
|
||||||
|
|
||||||
|
> *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."*
|
||||||
|
|
||||||
|
Read it **twice through** at your normal pace.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tips
|
||||||
|
|
||||||
|
- Record in a quiet room with no background noise
|
||||||
|
- Speak naturally — don't put on a "reading voice"
|
||||||
|
- 30 seconds of clean audio is the sweet spot
|
||||||
|
- Generation takes 10–30 seconds per sentence on CPU
|
||||||
BIN
docs/assets/img/preview.png
Normal file
BIN
docs/assets/img/preview.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 8.6 KiB |
41
requirements.txt
Normal file
41
requirements.txt
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
asgiref==3.7.2
|
||||||
|
bcrypt==5.0.0
|
||||||
|
black==24.2.0
|
||||||
|
blinker==1.9.0
|
||||||
|
Brotli==1.1.0
|
||||||
|
certifi==2023.7.22
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
click==8.1.7
|
||||||
|
Django==5.0.2
|
||||||
|
django-appconf==1.0.6
|
||||||
|
evdev==1.7.1
|
||||||
|
Flask==3.1.3
|
||||||
|
gallery_dl==1.30.6
|
||||||
|
idna==3.4
|
||||||
|
instaloader==4.14
|
||||||
|
itsdangerous==2.2.0
|
||||||
|
Jinja2==3.1.2
|
||||||
|
libsass==0.23.0
|
||||||
|
MarkupSafe==2.1.3
|
||||||
|
mutagen==1.47.0
|
||||||
|
mypy-extensions==1.0.0
|
||||||
|
numpy==2.2.6
|
||||||
|
packaging==23.2
|
||||||
|
pathspec==0.12.1
|
||||||
|
platformdirs==4.2.0
|
||||||
|
pycryptodomex==3.19.0
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
python-xlib==0.33
|
||||||
|
pytz==2026.1.post1
|
||||||
|
rcssmin==1.1.1
|
||||||
|
requests==2.31.0
|
||||||
|
rjsmin==1.2.1
|
||||||
|
six==1.17.0
|
||||||
|
sqlparse==0.4.4
|
||||||
|
tomli==2.0.1
|
||||||
|
typing_extensions==4.9.0
|
||||||
|
tzdata==2025.3
|
||||||
|
urllib3==2.1.0
|
||||||
|
websockets==12.0
|
||||||
|
Werkzeug==3.1.3
|
||||||
|
yt-dlp==2025.1.26
|
||||||
18
run_mac_linux.sh
Normal file
18
run_mac_linux.sh
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/bin/bash
# Voice Clone TTS launcher for macOS/Linux.
# Installs the Python dependencies listed in requirements.txt, then starts
# the GUI (voice_clone_tts.py).

# Run from the script's own directory so the relative paths below resolve
# no matter where the script was invoked from.
cd "$(dirname "$0")" || exit 1

echo "============================================"
echo " Voice Clone TTS - Setup & Launch"
echo "============================================"
echo

# Check Python
if ! command -v python3 &>/dev/null; then
    echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/"
    exit 1
fi

echo "Installing dependencies (first time only)..."
# Use "python3 -m pip" so packages land in the same interpreter that runs the
# app, and stop here if installation fails instead of launching a broken app.
if ! python3 -m pip install -r requirements.txt; then
    echo "[ERROR] Dependency installation failed. See the pip output above."
    exit 1
fi

echo
echo "Launching app..."
python3 voice_clone_tts.py
|
||||||
22
run_windows.bat
Normal file
22
run_windows.bat
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
@echo off
:: Voice Clone TTS launcher for Windows.
:: Installs the Python dependencies listed in requirements.txt, then starts
:: the GUI (voice_clone_tts.py).

:: Run from the script's own directory so the relative paths below resolve
:: even when the file is started from another folder or via a shortcut.
cd /d "%~dp0"

echo ============================================
echo Voice Clone TTS - Setup ^& Launch
echo ============================================
echo.

:: Check Python
python --version >nul 2>&1
if errorlevel 1 (
    echo [ERROR] Python not found. Download it from https://www.python.org/downloads/
    pause
    exit /b 1
)

:: Install dependencies
:: "python -m pip" guarantees the packages go to the interpreter that runs the app.
echo Installing dependencies (first time only, ~5 min)...
python -m pip install -r requirements.txt
if errorlevel 1 (
    echo [ERROR] Dependency installation failed. See the pip output above.
    pause
    exit /b 1
)

echo.
echo Launching app...
python voice_clone_tts.py
pause
|
||||||
391
voice_clone_tts.py
Normal file
391
voice_clone_tts.py
Normal file
|
|
@ -0,0 +1,391 @@
|
||||||
|
"""
|
||||||
|
Voice Clone TTS
|
||||||
|
===============
|
||||||
|
Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Run this script: python voice_clone_tts.py
|
||||||
|
2. Record 10–30 seconds of your voice
|
||||||
|
3. Type any text and click Generate
|
||||||
|
"""
|
||||||
|
|
||||||
|
import tkinter as tk
|
||||||
|
from tkinter import ttk, messagebox, filedialog
|
||||||
|
import threading
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
try:
|
||||||
|
import sounddevice as sd
|
||||||
|
import scipy.io.wavfile as wav
|
||||||
|
except ImportError:
|
||||||
|
print("Missing dependencies. Please run: pip install sounddevice scipy")
|
||||||
|
raise
|
||||||
|
|
||||||
|
SAMPLE_RATE = 22050
|
||||||
|
VOICE_SAMPLE_FILE = "my_voice_sample.wav"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Colour palette ──────────────────────────────────────────────────────────
|
||||||
|
BG = "#0f0f13"
|
||||||
|
SURFACE = "#1a1a24"
|
||||||
|
BORDER = "#2a2a3a"
|
||||||
|
ACCENT = "#7c6af7"
|
||||||
|
ACCENT2 = "#a89cf7"
|
||||||
|
TEXT = "#e8e6f0"
|
||||||
|
MUTED = "#666680"
|
||||||
|
SUCCESS = "#4ade80"
|
||||||
|
ERROR = "#f87171"
|
||||||
|
WARNING = "#fbbf24"
|
||||||
|
# ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class VoiceCloneTTS:
    """Tkinter GUI that records a voice sample and speaks text with XTTS v2.

    The window has three cards: record/load a voice sample, enter text, and
    generate speech (play it, save it, or both). The TTS model is loaded on a
    background thread; recording and generation also run on worker threads and
    report back to the UI through ``root.after``.
    """

    def __init__(self, root: tk.Tk):
        """Configure the root window, build the UI and start loading the model.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        # Recording timer state: seconds elapsed and the pending `after` job id.
        self._elapsed = 0
        self._tick_job: str | None = None

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────

    def _apply_styles(self):
        """Register the ttk styles (dark palette) used by the widgets."""
        style = ttk.Style()
        style.theme_use("clam")

        style.configure(".",
                        background=BG, foreground=TEXT,
                        font=("Segoe UI", 10),
                        borderwidth=0, relief="flat")

        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))

        style.configure("Accent.TButton",
                        background=ACCENT, foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])

        style.configure("Ghost.TButton",
                        background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])

        style.configure("TProgressbar",
                        troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))
        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")
        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED, font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        # Indeterminate bar animates while the model loads / speech is generated.
        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # ── Step 1 – Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)

        # ── Step 2 – Text input
        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)

        # ── Step 3 – Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Add a titled, bordered card to the window and populate it.

        Args:
            title: Heading shown at the top of the card.
            builder: Callable that receives the card's inner frame and adds
                the card's widgets to it.
            expand: Whether the card should grow with the window.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)

        frame = tk.Frame(outer, bg=SURFACE, bd=0,
                         highlightthickness=1, highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)

        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))

        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))

        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Build the voice-sample card: status line, record/load buttons, timer."""
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                          anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))

        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")

        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var,
                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))

        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Build the text-entry card with a pre-filled example sentence."""
        self._text_input = tk.Text(parent, height=6, wrap="word",
                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
                                   font=("Segoe UI", 11), relief="flat", bd=0,
                                   padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")

        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Build the generate card: three action buttons and a status line.

        The buttons start disabled and are enabled once the model has loaded.
        """
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
                                    command=lambda: self._generate(play=True, save=False),
                                    style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))

        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
                                    command=lambda: self._generate(play=False, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))

        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
                                    command=lambda: self._generate(play=True, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")

        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                        anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────

    def _load_tts_async(self):
        """Load the XTTS v2 model on a daemon thread and notify the UI when done."""
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
                self.root.after(0, self._on_tts_ready)
            except Exception as exc:
                # Bind to a separate name: `exc` is unbound when the except
                # clause ends, before the scheduled callback runs.
                _exc = exc
                self.root.after(0, lambda: self._on_tts_error(_exc))

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Stop the loading animation and enable the generate buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Show a model-load failure in the header, status line and a dialog."""
        self._progress.stop()
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
                             "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────

    def _toggle_recording(self):
        """Start recording if idle, otherwise stop and save the recording."""
        if not self.recording:
            self._start_recording()
        else:
            self._stop_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a daemon thread."""
        self.recording = True
        self.recorded_chunks = []
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status("🔴 Recording… speak naturally for 10–30 seconds.", WARNING)
        self._rec_timer_var.set("0 s")
        self._elapsed = 0
        # Schedule the first tick one second from now (calling _tick directly
        # would show "1 s recorded" at time zero) and drop any tick left over
        # from a previous recording so only one timer chain is ever running.
        self._cancel_tick()
        self._tick_job = self.root.after(1000, self._tick)

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        self.recorded_chunks.append(data.copy())
            except Exception as exc:
                # Surface device/stream failures in the UI instead of letting
                # the thread die silently with the app stuck in "recording".
                message = str(exc)
                self.root.after(0, lambda: self._on_record_error(message))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _on_record_error(self, message: str):
        """Reset the recording UI after the capture thread failed."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")
        self._set_voice_status(f"Recording failed: {message}", ERROR)

    def _cancel_tick(self):
        """Cancel the pending recording-timer callback, if there is one."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the on-screen recording timer once per second."""
        if self.recording:
            self._elapsed += 1
            self._rec_timer_var.set(f"{self._elapsed} s recorded")
            self._tick_job = self.root.after(1000, self._tick)
        else:
            self._tick_job = None
            self._rec_timer_var.set("")

    def _stop_recording(self):
        """Stop capturing and save the recording as the default voice sample."""
        self.recording = False
        self._rec_btn.config(text="⏺ Start Recording")
        self._cancel_tick()
        self._rec_timer_var.set("")

        # Wait for the capture thread to leave its loop so no chunk is appended
        # while the buffer is being concatenated. It checks `self.recording`
        # after every 1024-frame read, so this returns almost immediately.
        worker = self._record_thread
        if worker is not None and worker.is_alive():
            worker.join(timeout=2.0)
        self._record_thread = None

        if not self.recorded_chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(self.recorded_chunks, axis=0)
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)
        # Always write to the app's own sample file. `voice_sample_path` may
        # point at a file the user loaded, which must never be overwritten.
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE
        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file to use as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Generation ────────────────────────────────────────────────────────────

    def _generate(self, play: bool, save: bool):
        """Synthesize the entered text in the sampled voice.

        Validates that the model, a voice sample and some text are available,
        then runs synthesis on a daemon thread so the window stays responsive.

        Args:
            play: Play the generated audio when synthesis finishes.
            save: Ask for a destination and write the audio there; otherwise
                a file in the system temp directory is used.
        """
        if not self.tts:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return

        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                                 "Please record your voice or load an audio file first.")
            return

        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                # User cancelled the save dialog.
                return

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=self.voice_sample_path,
                    language="en",
                    file_path=out,
                )

                if play:
                    sr, data = wav.read(out)
                    # Scale 16-bit PCM into [-1, 1]; other dtypes are passed through.
                    audio = data.astype(np.float32) / (32768.0 if data.dtype == np.int16 else 1.0)
                    if audio.ndim > 1:
                        # Down-mix multi-channel audio to mono.
                        audio = audio.mean(axis=1)
                    sd.play(audio, samplerate=sr)
                    sd.wait()

                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # Format the message now: `exc` is unbound as soon as this
                # except clause ends, so a lambda referring to it directly
                # would raise NameError when Tk finally runs it.
                error_msg = f"Error: {exc}"
                self.root.after(0, lambda: self._set_gen_status(error_msg, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Stop the progress animation and re-enable the buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Update the voice-sample status line text and colour."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Update the generation status line text and colour."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Update the header status text and the colour of the status dot."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Set the state of the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
            btn.config(state=state)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Create the Tk root window, attach the application and run the event loop."""
    window = tk.Tk()
    # Keep a reference to the application object for the lifetime of the loop.
    _app = VoiceCloneTTS(window)
    window.mainloop()


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue