commit 34d55dc95f89fa3335316996469fa1a3cce8c217 Author: juyung Date: Sat Mar 28 22:04:45 2026 +0900 Upload diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..def4cc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv/ + diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..1445aee --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.14 diff --git a/README.md b/README.md new file mode 100644 index 0000000..f849621 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ +# Voice Clone TTS + +Type any text, hear it in your own voice. Runs fully offline. + +![Screenshot](docs/assets/img/preview.png) + +--- + +## Setup (first time only) + +**1. Install system packages:** +```bash +sudo apt install portaudio19-dev python3-tk espeak-ng -y +``` + +**2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug): +```bash +curl https://pyenv.run | bash +echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc +echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc +echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc +source ~/.bashrc +pyenv install 3.10.14 +pyenv local 3.10.14 +``` + +**3. Create a virtual environment:** +```bash +~/.pyenv/versions/3.10.14/bin/python -m venv .venv +source .venv/bin/activate +``` + +**4. Install Python packages** (takes 15–30 min, downloads ~2GB): +```bash +pip install --upgrade pip +pip install --no-cache-dir "numpy==1.22.0" +pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy +pip install --no-cache-dir "transformers==4.40.0" +pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0" +``` + +--- + +## Running the app + +Every time you want to use it: +```bash +source .venv/bin/activate +python voice_clone_tts.py +``` + +--- + +## How to use + +1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5–15 min)* +2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording** +3. 
Type any text +4. Click **Generate & Play** + +Your voice sample saves as `my_voice_sample.wav` and is reused automatically on future runs. + +--- + +## Best text to record (Rainbow Passage) + +> *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."* + +Read it **twice through** at your normal pace. + +--- + +## Tips + +- Record in a quiet room with no background noise +- Speak naturally — don't put on a "reading voice" +- 30 seconds of clean audio is the sweet spot +- Generation takes 10–30 seconds per sentence on CPU diff --git a/docs/assets/img/preview.png b/docs/assets/img/preview.png new file mode 100644 index 0000000..52d0de4 Binary files /dev/null and b/docs/assets/img/preview.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1003ad0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,41 @@ +asgiref==3.7.2 +bcrypt==5.0.0 +black==24.2.0 +blinker==1.9.0 +Brotli==1.1.0 +certifi==2023.7.22 +charset-normalizer==3.3.2 +click==8.1.7 +Django==5.0.2 +django-appconf==1.0.6 +evdev==1.7.1 +Flask==3.1.3 +gallery_dl==1.30.6 +idna==3.4 +instaloader==4.14 +itsdangerous==2.2.0 +Jinja2==3.1.2 +libsass==0.23.0 +MarkupSafe==2.1.3 +mutagen==1.47.0 +mypy-extensions==1.0.0 +numpy==2.2.6 +packaging==23.2 +pathspec==0.12.1 +platformdirs==4.2.0 +pycryptodomex==3.19.0 +python-dateutil==2.9.0.post0 +python-xlib==0.33 +pytz==2026.1.post1 +rcssmin==1.1.1 +requests==2.31.0 +rjsmin==1.2.1 +six==1.17.0 +sqlparse==0.4.4 +tomli==2.0.1 +typing_extensions==4.9.0 +tzdata==2025.3 +urllib3==2.1.0 +websockets==12.0 +Werkzeug==3.1.3 +yt-dlp==2025.1.26 
diff --git a/run_mac_linux.sh b/run_mac_linux.sh
new file mode 100644
index 0000000..5834f08
--- /dev/null
+++ b/run_mac_linux.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+echo "============================================"
+echo " Voice Clone TTS - Setup & Launch"
+echo "============================================"
+echo
+
+# Check Python
+if ! command -v python3 &>/dev/null; then
+    echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/"
+    exit 1
+fi
+
+echo "Installing dependencies (first time only)..."
+pip3 install -r requirements.txt
+
+echo
+echo "Launching app..."
+python3 voice_clone_tts.py
diff --git a/run_windows.bat b/run_windows.bat
new file mode 100644
index 0000000..9e3e7eb
--- /dev/null
+++ b/run_windows.bat
@@ -0,0 +1,22 @@
+@echo off
+echo ============================================
+echo Voice Clone TTS - Setup ^& Launch
+echo ============================================
+echo.
+
+:: Check Python
+python --version >nul 2>&1
+if errorlevel 1 (
+    echo [ERROR] Python not found. Download it from https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+:: Install dependencies
+echo Installing dependencies (first time only, ~5 min)...
+pip install -r requirements.txt
+
+echo.
+echo Launching app...
+python voice_clone_tts.py
+pause
diff --git a/voice_clone_tts.py b/voice_clone_tts.py
new file mode 100644
index 0000000..a0d8ac4
--- /dev/null
+++ b/voice_clone_tts.py
@@ -0,0 +1,440 @@
+"""
+Voice Clone TTS
+===============
+Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
+
+Usage:
+    1. Run this script: python voice_clone_tts.py
+    2. Record ~15 seconds of your voice
+    3. Type any text and click Generate
+"""
+
+import tkinter as tk
+from tkinter import ttk, messagebox, filedialog
+import threading
+import os
+import tempfile
+import numpy as np
+
+try:
+    import sounddevice as sd
+    import scipy.io.wavfile as wav
+except ImportError:
+    print("Missing dependencies. Please run: pip install sounddevice scipy")
+    raise
+
+SAMPLE_RATE = 22050
+VOICE_SAMPLE_FILE = "my_voice_sample.wav"
+
+
+# ─── Colour palette ──────────────────────────────────────────────────────────
+BG = "#0f0f13"
+SURFACE = "#1a1a24"
+BORDER = "#2a2a3a"
+ACCENT = "#7c6af7"
+ACCENT2 = "#a89cf7"
+TEXT = "#e8e6f0"
+MUTED = "#666680"
+SUCCESS = "#4ade80"
+ERROR = "#f87171"
+WARNING = "#fbbf24"
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+class VoiceCloneTTS:
+    """Tkinter app that records a voice sample and speaks typed text with XTTS v2."""
+
+    def __init__(self, root: tk.Tk):
+        """Configure the window, build the UI and start loading the model."""
+        self.root = root
+        self.root.title("Voice Clone TTS")
+        self.root.geometry("640x580")
+        self.root.configure(bg=BG)
+        self.root.resizable(True, True)
+        self.root.minsize(500, 480)
+
+        self.recording = False
+        self.recorded_chunks: list = []
+        self.voice_sample_path: str = VOICE_SAMPLE_FILE
+        self.tts = None
+        self._record_thread: threading.Thread | None = None
+        self._elapsed = 0
+        # Pending ``after`` id of the recording timer, so it can be cancelled.
+        self._tick_job: str | None = None
+
+        self._apply_styles()
+        self._build_ui()
+        self._load_tts_async()
+
+        # Restore saved voice sample if it exists
+        if os.path.exists(self.voice_sample_path):
+            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)
+
+    # ── Styles ────────────────────────────────────────────────────────────────
+
+    def _apply_styles(self):
+        """Set up the dark ttk theme for buttons and the progress bar."""
+        style = ttk.Style()
+        style.theme_use("clam")
+
+        style.configure(".",
+                        background=BG, foreground=TEXT,
+                        font=("Segoe UI", 10),
+                        borderwidth=0, relief="flat")
+
+        style.configure("Card.TFrame", background=SURFACE, relief="flat")
+        style.configure("TLabel", background=BG, foreground=TEXT)
+        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
+        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
+        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
+                        font=("Segoe UI Semibold", 13))
+        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
+                        font=("Segoe UI Semibold", 10))
+
+        style.configure("Accent.TButton",
+                        background=ACCENT, foreground="#ffffff",
+                        font=("Segoe UI Semibold", 10), padding=(14, 7),
+                        relief="flat", borderwidth=0)
+        style.map("Accent.TButton",
+                  background=[("active", ACCENT2), ("disabled", BORDER)],
+                  foreground=[("disabled", MUTED)])
+
+        style.configure("Ghost.TButton",
+                        background=SURFACE, foreground=TEXT,
+                        font=("Segoe UI", 10), padding=(12, 6),
+                        relief="flat", borderwidth=0)
+        style.map("Ghost.TButton",
+                  background=[("active", BORDER), ("disabled", SURFACE)],
+                  foreground=[("disabled", MUTED)])
+
+        style.configure("TProgressbar",
+                        troughcolor=BORDER, background=ACCENT,
+                        thickness=4, relief="flat")
+
+    # ── UI build ──────────────────────────────────────────────────────────────
+
+    def _build_ui(self):
+        """Lay out the header, progress bar and the three step cards."""
+        # ── Header
+        header = tk.Frame(self.root, bg=BG)
+        header.pack(fill="x", padx=20, pady=(18, 4))
+        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
+                 font=("Segoe UI Semibold", 17)).pack(side="left")
+        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED, font=("Segoe UI", 10))
+        self._status_dot.pack(side="right", pady=2)
+        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
+                                       font=("Segoe UI", 9))
+        self._header_status.pack(side="right", padx=(0, 4), pady=2)
+
+        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
+        self._progress.pack(fill="x", padx=20, pady=(0, 10))
+        self._progress.start(12)
+
+        # ── Step 1 – Voice sample
+        self._card("Step 1 — Record Your Voice", self._build_record_section)
+
+        # ── Step 2 – Text input
+        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)
+
+        # ── Step 3 – Generate
+        self._card("Step 3 — Generate", self._build_generate_section)
+
+    def _card(self, title: str, builder, expand=False):
+        """Create a titled, bordered panel and let ``builder`` fill its body."""
+        outer = tk.Frame(self.root, bg=BG)
+        outer.pack(fill="both", expand=expand, padx=16, pady=4)
+
+        frame = tk.Frame(outer, bg=SURFACE, bd=0,
+                         highlightthickness=1, highlightbackground=BORDER)
+        frame.pack(fill="both", expand=expand, padx=0, pady=0)
+
+        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
+                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))
+
+        sep = tk.Frame(frame, bg=BORDER, height=1)
+        sep.pack(fill="x", padx=14, pady=(6, 0))
+
+        inner = tk.Frame(frame, bg=SURFACE)
+        inner.pack(fill="both", expand=expand, padx=14, pady=10)
+        builder(inner)
+
+    def _build_record_section(self, parent):
+        """Populate the Step 1 card: status line, record/load buttons, timer and tip."""
+        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
+        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
+                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
+                                          anchor="w", wraplength=560)
+        self._voice_status_lbl.pack(fill="x", pady=(0, 8))
+
+        btn_row = tk.Frame(parent, bg=SURFACE)
+        btn_row.pack(fill="x")
+
+        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
+                                   command=self._toggle_recording, style="Accent.TButton")
+        self._rec_btn.pack(side="left", padx=(0, 8))
+
+        ttk.Button(btn_row, text="📂 Load Audio File",
+                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")
+
+        self._rec_timer_var = tk.StringVar(value="")
+        tk.Label(parent, textvariable=self._rec_timer_var,
+                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))
+
+        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
+                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))
+
+    def _build_text_section(self, parent):
+        """Populate the Step 2 card with the text box and a language hint."""
+        self._text_input = tk.Text(parent, height=6, wrap="word",
+                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
+                                   font=("Segoe UI", 11), relief="flat", bd=0,
+                                   padx=10, pady=8, selectbackground=ACCENT)
+        self._text_input.pack(fill="both", expand=True)
+        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")
+
+        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
+        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
+                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))
+
+    def _build_generate_section(self, parent):
+        """Populate the Step 3 card with the three generate buttons and a status line."""
+        btn_row = tk.Frame(parent, bg=SURFACE)
+        btn_row.pack(fill="x")
+
+        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
+                                    command=lambda: self._generate(play=True, save=False),
+                                    style="Accent.TButton", state="disabled")
+        self._play_btn.pack(side="left", padx=(0, 8))
+
+        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
+                                    command=lambda: self._generate(play=False, save=True),
+                                    style="Ghost.TButton", state="disabled")
+        self._save_btn.pack(side="left", padx=(0, 8))
+
+        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
+                                    command=lambda: self._generate(play=True, save=True),
+                                    style="Ghost.TButton", state="disabled")
+        self._both_btn.pack(side="left")
+
+        self._gen_status_var = tk.StringVar(value="")
+        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
+                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
+                                        anchor="w", wraplength=560)
+        self._gen_status_lbl.pack(fill="x", pady=(8, 0))
+
+    # ── TTS loading ───────────────────────────────────────────────────────────
+
+    def _load_tts_async(self):
+        """Load the XTTS v2 model on a daemon thread and report back on the Tk loop."""
+        def _load():
+            try:
+                from TTS.api import TTS  # type: ignore
+                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
+                self.root.after(0, self._on_tts_ready)
+            except Exception as exc:
+                _exc = exc
+                self.root.after(0, lambda: self._on_tts_error(_exc))
+
+        threading.Thread(target=_load, daemon=True).start()
+
+    def _on_tts_ready(self):
+        """Stop the progress bar and enable the generate buttons."""
+        self._progress.stop()
+        self._progress.config(value=0)
+        self._set_header_status("Model ready", SUCCESS)
+        self._enable_generate_btns()
+        self._set_gen_status("Ready — enter text and click Generate.", MUTED)
+
+    def _on_tts_error(self, exc):
+        """Show the model-load failure in the header, status line and a dialog."""
+        self._progress.stop()
+        self._set_header_status("Model failed to load", ERROR)
+        self._set_gen_status(f"Error: {exc}", ERROR)
+        messagebox.showerror("TTS Load Error",
+                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
+                             "Make sure you've run: pip install TTS")
+
+    # ── Recording ─────────────────────────────────────────────────────────────
+
+    def _toggle_recording(self):
+        """Start recording if idle, otherwise stop and save."""
+        if not self.recording:
+            self._start_recording()
+        else:
+            self._stop_recording()
+
+    def _start_recording(self):
+        """Begin capturing microphone audio on a daemon thread and start the timer."""
+        self.recording = True
+        self.recorded_chunks = []
+        self._rec_btn.config(text="⏹ Stop Recording")
+        self._set_voice_status("🔴 Recording… speak naturally for 10–30 seconds.", WARNING)
+        self._rec_timer_var.set("0 s")
+        self._elapsed = 0
+        # Schedule the first tick one second out; ticking immediately would
+        # show "1 s recorded" before any audio has been captured.
+        self._tick_job = self.root.after(1000, self._tick)
+
+        def _record():
+            with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
+                while self.recording:
+                    data, _ = stream.read(1024)
+                    self.recorded_chunks.append(data.copy())
+
+        self._record_thread = threading.Thread(target=_record, daemon=True)
+        self._record_thread.start()
+
+    def _tick(self):
+        """Advance the recording timer by one second and reschedule while recording."""
+        if self.recording:
+            self._elapsed += 1
+            self._rec_timer_var.set(f"{self._elapsed} s recorded")
+            self._tick_job = self.root.after(1000, self._tick)
+        else:
+            self._tick_job = None
+            self._rec_timer_var.set("")
+
+    def _stop_recording(self):
+        """Stop capturing, then write the recorded audio to the voice sample file."""
+        self.recording = False
+        self._rec_btn.config(text="⏺ Start Recording")
+
+        # Cancel the pending tick so a quick stop/start cannot leave two timers running.
+        if self._tick_job is not None:
+            self.root.after_cancel(self._tick_job)
+            self._tick_job = None
+        self._rec_timer_var.set("")
+
+        # Let the capture thread finish its last read before touching the chunk
+        # list; the timeout keeps the UI from hanging if the audio device stalls.
+        if self._record_thread is not None:
+            self._record_thread.join(timeout=2.0)
+            self._record_thread = None
+
+        if not self.recorded_chunks:
+            self._set_voice_status("Nothing recorded — try again.", MUTED)
+            return
+
+        audio = np.concatenate(self.recorded_chunks, axis=0)
+        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)
+        wav.write(self.voice_sample_path, SAMPLE_RATE, audio_i16)
+        dur = len(audio_i16) / SAMPLE_RATE
+        self._set_voice_status(
+            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)
+
+    def _load_voice_file(self):
+        """Let the user pick an existing audio file to use as the voice sample."""
+        path = filedialog.askopenfilename(
+            title="Select voice sample",
+            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
+        if path:
+            self.voice_sample_path = path
+            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)
+
+    # ── Generation ────────────────────────────────────────────────────────────
+
+    def _generate(self, play: bool, save: bool):
+        """Synthesize the entered text with the voice sample on a daemon thread.
+
+        With ``save`` the user is asked for a destination file first; otherwise the
+        audio goes to a temp file. With ``play`` the result is played back.
+        """
+        if not self.tts:
+            messagebox.showerror("Not ready", "TTS model is still loading.")
+            return
+
+        if not os.path.exists(self.voice_sample_path):
+            messagebox.showerror("No voice sample",
+                                 "Please record your voice or load an audio file first.")
+            return
+
+        text = self._text_input.get("1.0", "end").strip()
+        if not text:
+            messagebox.showerror("No text", "Please enter some text to speak.")
+            return
+
+        save_path: str | None = None
+        if save:
+            save_path = filedialog.asksaveasfilename(
+                defaultextension=".wav",
+                filetypes=[("WAV audio", "*.wav")],
+                title="Save generated speech")
+            if not save_path:
+                return
+
+        self._set_btns_state("disabled")
+        self._progress.start(12)
+        self._set_gen_status("⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)
+
+        def _run():
+            try:
+                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
+                self.tts.tts_to_file(
+                    text=text,
+                    speaker_wav=self.voice_sample_path,
+                    language="en",
+                    file_path=out,
+                )
+
+                if play:
+                    sr, data = wav.read(out)
+                    audio = data.astype(np.float32) / (32768.0 if data.dtype == np.int16 else 1.0)
+                    if audio.ndim > 1:
+                        audio = audio.mean(axis=1)
+                    sd.play(audio, samplerate=sr)
+                    sd.wait()
+
+                msg = "✓ Done!"
+                if save_path:
+                    msg += f" Saved → {save_path}"
+                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
+            except Exception as exc:
+                # Format the message now: Python unbinds ``exc`` when the except
+                # block ends, so a lambda reading it later would raise NameError.
+                err = f"Error: {exc}"
+                self.root.after(0, lambda: self._set_gen_status(err, ERROR))
+            finally:
+                self.root.after(0, self._on_generate_done)
+
+        threading.Thread(target=_run, daemon=True).start()
+
+    def _on_generate_done(self):
+        """Stop the progress bar and re-enable the buttons after a generation run."""
+        self._progress.stop()
+        self._progress.config(value=0)
+        self._set_btns_state("normal")
+
+    # ── Helpers ───────────────────────────────────────────────────────────────
+
+    def _set_voice_status(self, msg: str, color: str = TEXT):
+        """Update the Step 1 status line text and colour."""
+        self._voice_status_var.set(msg)
+        self._voice_status_lbl.config(fg=color)
+
+    def _set_gen_status(self, msg: str, color: str = TEXT):
+        """Update the Step 3 status line text and colour."""
+        self._gen_status_var.set(msg)
+        self._gen_status_lbl.config(fg=color)
+
+    def _set_header_status(self, msg: str, color: str = TEXT):
+        """Update the header status text and the colour of the status dot."""
+        self._header_status.config(text=msg, fg=color)
+        self._status_dot.config(fg=color)
+
+    def _enable_generate_btns(self):
+        """Enable the three generate buttons."""
+        for btn in (self._play_btn, self._save_btn, self._both_btn):
+            btn.config(state="normal")
+
+    def _set_btns_state(self, state: str):
+        """Set the state of the generate buttons and the record button together."""
+        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
+            btn.config(state=state)
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    root = tk.Tk()
+    app = VoiceCloneTTS(root)
+    root.mainloop()