Upload

2026-03-28 22:04:45 +09:00 · 2026-03-28 22:04:45 +09:00 · 34d55dc95f
commit 34d55dc95f
8 changed files with 553 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
 .venv/
 # Voice sample recorded by the app into the working directory (personal data)
 my_voice_sample.wav
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
 3.10.14
--- a/README.md
+++ b/README.md
@ -0,0 +1,78 @@
 # Voice Clone TTS
 Type any text and hear it in your own voice. Runs fully offline after the first-time setup and model download.
 ![Screenshot](docs/assets/img/preview.png)
 ---
 ## Setup (first time only)
 **1. Install system packages:**
 ```bash
 sudo apt install portaudio19-dev python3-tk espeak-ng -y
 ```
 **2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug):
 ```bash
 curl https://pyenv.run | bash
 echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
 echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
 echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc
 source ~/.bashrc
 pyenv install 3.10.14
 pyenv local 3.10.14
 ```
 **3. Create a virtual environment:**
 ```bash
 ~/.pyenv/versions/3.10.14/bin/python -m venv .venv
 source .venv/bin/activate
 ```
 **4. Install Python packages** (takes 15–30 min, downloads ~2GB):
 ```bash
 pip install --upgrade pip
 pip install --no-cache-dir "numpy==1.22.0"
 pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy
 pip install --no-cache-dir "transformers==4.40.0"
 pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0"
 ```
 ---
 ## Running the app
 Every time you want to use it:
 ```bash
 source .venv/bin/activate
 python voice_clone_tts.py
 ```
 ---
 ## How to use
 1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5–15 min)*
 2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording**
 3. Type any text
 4. Click **Generate & Play**
 Your voice sample is saved as `my_voice_sample.wav` and is reused automatically on future runs.
 ---
 ## Best text to record (Rainbow Passage)
 > *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."*
 Read it **twice through** at your normal pace.
 ---
 ## Tips
 - Record in a quiet room with no background noise
 - Speak naturally — don't put on a "reading voice"
 - 30 seconds of clean audio is the sweet spot
 - Generation takes 10–30 seconds per sentence on CPU
--- a/docs/assets/img/preview.png
+++ b/docs/assets/img/preview.png
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,41 @@
 # Runtime dependencies of voice_clone_tts.py.
 # Versions mirror the pins given in README.md ("Setup", step 4); the launcher
 # scripts install from this file before starting the app.
 numpy==1.22.0
 TTS
 sounddevice
 scipy
 transformers==4.40.0
 torch==2.1.0
 torchaudio==2.1.0
--- a/run_mac_linux.sh
+++ b/run_mac_linux.sh
@ -0,0 +1,18 @@
 #!/bin/bash
 # Voice Clone TTS launcher: installs the Python dependencies, then starts the GUI.
 echo "============================================"
 echo " Voice Clone TTS  -  Setup & Launch"
 echo "============================================"
 echo
 # Work from the script's own directory so requirements.txt and
 # voice_clone_tts.py resolve no matter where the script is invoked from.
 cd "$(dirname "$0")" || exit 1
 # Check Python
 if ! command -v python3 &>/dev/null; then
    echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/"
    exit 1
 fi
 # Use the project's virtual environment when it has been created (see README).
 if [ -f .venv/bin/activate ]; then
    source .venv/bin/activate
 fi
 echo "Installing dependencies (first time only)..."
 # "python3 -m pip" installs into the same interpreter that runs the app below.
 # Stop here on failure instead of launching an app that cannot import its deps.
 if ! python3 -m pip install -r requirements.txt; then
    echo "[ERROR] Dependency installation failed. See the messages above."
    exit 1
 fi
 echo
 echo "Launching app..."
 python3 voice_clone_tts.py
--- a/run_windows.bat
+++ b/run_windows.bat
@ -0,0 +1,22 @@
@echo off
 echo ============================================
 echo  Voice Clone TTS  -  Setup ^& Launch
 echo ============================================
 echo.
 :: Work from the script's own folder so requirements.txt and
 :: voice_clone_tts.py resolve no matter where the script is started from.
 cd /d "%~dp0"
 :: Check Python
 python --version >nul 2>&1
 if errorlevel 1 (
    echo [ERROR] Python not found. Download it from https://www.python.org/downloads/
    pause
    exit /b 1
 )
 :: Install dependencies into the same interpreter that runs the app below
 echo Installing dependencies (first time only, ~5 min)...
 python -m pip install -r requirements.txt
 :: Stop on failure instead of launching an app that cannot import its deps
 if errorlevel 1 (
    echo [ERROR] Dependency installation failed. See the messages above.
    pause
    exit /b 1
 )
 echo.
 echo Launching app...
 python voice_clone_tts.py
 pause
--- a/voice_clone_tts.py
+++ b/voice_clone_tts.py
@ -0,0 +1,391 @@
 """
 Voice Clone TTS
 ===============
 Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
 Usage:
  1. Run this script: python voice_clone_tts.py
  2. Record 10–30 seconds of your voice
  3. Type any text and click Generate
 """
 import tkinter as tk
 from tkinter import ttk, messagebox, filedialog
 import threading
 import os
 import tempfile
 import numpy as np
 try:
    import sounddevice as sd
    import scipy.io.wavfile as wav
 except ImportError:
    print("Missing dependencies. Please run: pip install sounddevice scipy")
    raise
 # Sample rate (Hz) used for microphone capture and for the WAV file written
 # when a recording is saved.
 SAMPLE_RATE = 22050
 # Default path of the voice sample; relative, so it resolves against the
 # current working directory.
 VOICE_SAMPLE_FILE = "my_voice_sample.wav"
 # ─── Colour palette ──────────────────────────────────────────────────────────
 # Hex colours shared by the tk/ttk widgets built in VoiceCloneTTS.
 BG       = "#0f0f13"  # window background
 SURFACE  = "#1a1a24"  # card background
 BORDER   = "#2a2a3a"  # card outline, separators, disabled accent button
 ACCENT   = "#7c6af7"  # primary button, progress bar, recording timer
 ACCENT2  = "#a89cf7"  # headings and hovered primary button
 TEXT     = "#e8e6f0"  # default foreground
 MUTED    = "#666680"  # hints, idle status text, disabled foreground
 SUCCESS  = "#4ade80"  # status colour: success
 ERROR    = "#f87171"  # status colour: failure
 WARNING  = "#fbbf24"  # status colour: recording in progress
 # ─────────────────────────────────────────────────────────────────────────────
 class VoiceCloneTTS:
    def __init__(self, root: tk.Tk):
        """Configure the main window, build the UI and start loading the model.

        Args:
            root: The Tk root window that hosts the application.
        """
        # Application state.
        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None

        # Main window.
        self.root = root
        root.title("Voice Clone TTS")
        root.geometry("640x580")
        root.configure(bg=BG)
        root.resizable(True, True)
        root.minsize(500, 480)

        # Widgets must exist before the model loader posts status updates.
        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # A sample left behind by an earlier run is picked up again.
        sample = self.voice_sample_path
        if os.path.exists(sample):
            self._set_voice_status(f"✓  Saved sample found: {sample}", SUCCESS)
    # ── Styles ────────────────────────────────────────────────────────────────
    def _apply_styles(self):
        """Register the dark ttk styles used by the buttons and progress bar."""
        regular = ("Segoe UI", 10)
        semibold = ("Segoe UI Semibold", 10)

        style = ttk.Style()
        style.theme_use("clam")

        # Root style: defaults applied to every ttk widget.
        style.configure(
            ".",
            background=BG,
            foreground=TEXT,
            font=regular,
            borderwidth=0,
            relief="flat",
        )

        # Frames and labels.
        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure(
            "Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9)
        )
        style.configure(
            "Title.TLabel",
            background=BG,
            foreground=ACCENT2,
            font=("Segoe UI Semibold", 13),
        )
        style.configure(
            "Step.TLabel", background=SURFACE, foreground=ACCENT2, font=semibold
        )

        # Primary (filled) button.
        style.configure(
            "Accent.TButton",
            background=ACCENT,
            foreground="#ffffff",
            font=semibold,
            padding=(14, 7),
            relief="flat",
            borderwidth=0,
        )
        style.map(
            "Accent.TButton",
            background=[("active", ACCENT2), ("disabled", BORDER)],
            foreground=[("disabled", MUTED)],
        )

        # Secondary (ghost) button.
        style.configure(
            "Ghost.TButton",
            background=SURFACE,
            foreground=TEXT,
            font=regular,
            padding=(12, 6),
            relief="flat",
            borderwidth=0,
        )
        style.map(
            "Ghost.TButton",
            background=[("active", BORDER), ("disabled", SURFACE)],
            foreground=[("disabled", MUTED)],
        )

        # Thin progress bar under the header.
        style.configure(
            "TProgressbar",
            troughcolor=BORDER,
            background=ACCENT,
            thickness=4,
            relief="flat",
        )
    # ── UI build ──────────────────────────────────────────────────────────────
    def _build_ui(self):
        """Build the header, the progress bar and the three step cards."""
        # Header: app title on the left, model status on the right.
        top_bar = tk.Frame(self.root, bg=BG)
        top_bar.pack(fill="x", padx=20, pady=(18, 4))

        title_lbl = tk.Label(top_bar, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                             font=("Segoe UI Semibold", 17))
        title_lbl.pack(side="left")

        # The dot is packed first so it ends up right of the status text.
        self._status_dot = tk.Label(top_bar, text="●", bg=BG, fg=MUTED,
                                    font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(top_bar, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        # Indeterminate bar that animates from the moment the UI is built.
        self._progress = ttk.Progressbar(self.root, mode="indeterminate",
                                         style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # One card per step; only the text card grows with the window.
        steps = (
            ("Step 1 — Record Your Voice", self._build_record_section, False),
            ("Step 2 — Enter Text", self._build_text_section, True),
            ("Step 3 — Generate", self._build_generate_section, False),
        )
        for heading, builder, grows in steps:
            self._card(heading, builder, expand=grows)
    def _card(self, title: str, builder, expand=False):
        """Create a bordered card with a heading and let *builder* fill it.

        Args:
            title: Heading text shown at the top of the card.
            builder: Callable that receives the card's inner frame and adds
                widgets to it.
            expand: Whether the card takes up extra vertical space.
        """
        wrapper = tk.Frame(self.root, bg=BG)
        wrapper.pack(fill="both", expand=expand, padx=16, pady=4)

        # The 1px highlight ring serves as the card border.
        card = tk.Frame(wrapper, bg=SURFACE, bd=0,
                        highlightthickness=1, highlightbackground=BORDER)
        card.pack(fill="both", expand=expand, padx=0, pady=0)

        heading = tk.Label(card, text=title, bg=SURFACE, fg=ACCENT2,
                           font=("Segoe UI Semibold", 10))
        heading.pack(anchor="w", padx=14, pady=(10, 0))

        divider = tk.Frame(card, bg=BORDER, height=1)
        divider.pack(fill="x", padx=14, pady=(6, 0))

        body = tk.Frame(card, bg=SURFACE)
        body.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(body)
    def _build_record_section(self, parent):
        """Fill the Step 1 card: status line, record/load buttons, timer, tip.

        Args:
            parent: Frame that receives the widgets.
        """
        # Status line; its text and colour are changed via _set_voice_status.
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(
            parent, textvariable=self._voice_status_var,
            bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
            anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        # Button row.
        buttons = tk.Frame(parent, bg=SURFACE)
        buttons.pack(fill="x")

        self._rec_btn = ttk.Button(
            buttons, text="⏺  Start Recording",
            command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))

        load_btn = ttk.Button(
            buttons, text="📂  Load Audio File",
            command=self._load_voice_file, style="Ghost.TButton")
        load_btn.pack(side="left")

        # Elapsed-time readout, empty while idle.
        self._rec_timer_var = tk.StringVar(value="")
        timer_lbl = tk.Label(parent, textvariable=self._rec_timer_var,
                             bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9))
        timer_lbl.pack(anchor="w", pady=(6, 0))

        tip_lbl = tk.Label(
            parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
            bg=SURFACE, fg=MUTED, font=("Segoe UI", 8))
        tip_lbl.pack(anchor="w", pady=(2, 0))
    def _build_text_section(self, parent):
        """Fill the Step 2 card: the text editor plus a hint line.

        Args:
            parent: Frame that receives the widgets.
        """
        editor = tk.Text(
            parent, height=6, wrap="word",
            bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
            font=("Segoe UI", 11), relief="flat", bd=0,
            padx=10, pady=8, selectbackground=ACCENT)
        editor.pack(fill="both", expand=True)
        # Pre-fill a sample sentence so Generate works right away.
        editor.insert("1.0", "Hello! This is my voice, speaking through a computer.")
        self._text_input = editor

        divider = tk.Frame(parent, bg=BORDER, height=1)
        divider.pack(fill="x", pady=(6, 0))

        hint = tk.Label(
            parent, text="Supports English, Spanish, French, German, Italian, and more.",
            bg=SURFACE, fg=MUTED, font=("Segoe UI", 8))
        hint.pack(anchor="w", pady=(4, 0))
    def _build_generate_section(self, parent):
        """Fill the Step 3 card: three generate buttons and a status line.

        The buttons are created disabled.

        Args:
            parent: Frame that receives the widgets.
        """
        buttons = tk.Frame(parent, bg=SURFACE)
        buttons.pack(fill="x")

        def make_button(label, style_name, play, save):
            # Each button calls _generate with its own play/save combination.
            return ttk.Button(
                buttons, text=label,
                command=lambda: self._generate(play=play, save=save),
                style=style_name, state="disabled")

        self._play_btn = make_button("▶  Generate & Play", "Accent.TButton", True, False)
        self._play_btn.pack(side="left", padx=(0, 8))

        self._save_btn = make_button("💾  Generate & Save", "Ghost.TButton", False, True)
        self._save_btn.pack(side="left", padx=(0, 8))

        self._both_btn = make_button("▶💾  Play & Save", "Ghost.TButton", True, True)
        self._both_btn.pack(side="left")

        # Status line; its text and colour are changed via _set_gen_status.
        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(
            parent, textvariable=self._gen_status_var,
            bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
            anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))
    # ── TTS loading ───────────────────────────────────────────────────────────
    def _load_tts_async(self):
        """Load the XTTS v2 model on a daemon thread.

        The outcome is handed back through ``root.after``: ``_on_tts_ready``
        on success, ``_on_tts_error`` with the exception on failure.
        """
        def worker():
            try:
                # Imported here, so a missing TTS package is reported through
                # the same error path as any other load failure.
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
            except Exception as exc:
                # Bind the exception now; ``exc`` is unbound once the except
                # clause ends, before the scheduled callback runs.
                self.root.after(0, lambda err=exc: self._on_tts_error(err))
            else:
                self.root.after(0, self._on_tts_ready)

        loader = threading.Thread(target=worker, daemon=True)
        loader.start()
    def _on_tts_ready(self):
        """Update the UI once the model has loaded: stop the bar, enable Generate."""
        progress = self._progress
        progress.stop()
        progress.config(value=0)

        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)
    def _on_tts_error(self, exc):
        """Report a model-load failure in the header, status line and a dialog.

        Args:
            exc: The exception raised while loading the model.
        """
        self._progress.stop()
        # Reset the bar as the ready/done paths do; otherwise the indicator
        # stays frozen wherever the animation stopped.
        self._progress.config(value=0)
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
            f"Could not load the XTTS v2 model:\n{exc}\n\n"
            "Make sure you've run:  pip install TTS")
    # ── Recording ─────────────────────────────────────────────────────────────
    def _toggle_recording(self):
        """Start recording when idle, stop it when a recording is running."""
        action = self._stop_recording if self.recording else self._start_recording
        action()
    def _start_recording(self):
        """Begin capturing microphone audio on a background thread.

        Resets the chunk buffer and the elapsed-time counter, switches the
        button to "Stop", and starts a daemon thread that appends 1024-frame
        blocks to ``self.recorded_chunks`` while ``self.recording`` is true.
        """
        self.recording = True
        self.recorded_chunks = []
        self._rec_btn.config(text="⏹  Stop Recording")
        self._set_voice_status("🔴  Recording… speak naturally for 10–30 seconds.", WARNING)
        self._rec_timer_var.set("0 s")
        self._elapsed = 0
        # Schedule the first tick one second from now. Calling _tick() directly
        # would bump the counter to "1 s recorded" before any time has passed.
        self.root.after(1000, self._tick)

        # Bind this session's buffer so the thread never appends to a list
        # created by a later recording.
        chunks = self.recorded_chunks

        def _report_failure(err):
            # Runs on the Tk thread: put the UI back into its idle state.
            self._rec_btn.config(text="⏺  Start Recording")
            self._set_voice_status(f"Recording failed: {err}", ERROR)

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # E.g. no input device: without this the thread would die
                # silently while the UI kept claiming to record.
                self.recording = False
                self.root.after(0, lambda err=exc: _report_failure(err))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()
    def _tick(self):
        """Advance the timer by one second and re-arm it while recording.

        Once recording has stopped, the readout is cleared and no further
        tick is scheduled.
        """
        if not self.recording:
            self._rec_timer_var.set("")
            return

        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self.root.after(1000, self._tick)
    def _stop_recording(self):
        """Stop capturing and save the recording as a 16-bit WAV file.

        The recording is always written to ``VOICE_SAMPLE_FILE``, and
        ``self.voice_sample_path`` is pointed at it afterwards. If nothing was
        captured, or the file cannot be written, the status line says so and
        the current voice sample stays selected.
        """
        self.recording = False
        self._rec_btn.config(text="⏺  Start Recording")

        # Let the capture thread leave its loop first so the buffer is no
        # longer being appended to. The timeout keeps the UI from hanging if
        # the audio backend stalls.
        worker = self._record_thread
        if worker is not None and worker.is_alive():
            worker.join(timeout=1.0)
        self._record_thread = None

        chunks = list(self.recorded_chunks)
        if not chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio = np.concatenate(chunks, axis=0)
        # float32 samples in [-1, 1] -> 16-bit PCM.
        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)

        # Never write to self.voice_sample_path: after "Load Audio File" it
        # points at the user's own file, which a recording must not overwrite.
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE

        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓  Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)
    def _load_voice_file(self):
        """Let the user choose an existing audio file as the voice sample.

        Cancelling the dialog leaves the current sample untouched.
        """
        audio_types = [
            ("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"),
            ("All files", "*.*"),
        ]
        chosen = filedialog.askopenfilename(title="Select voice sample",
                                            filetypes=audio_types)
        if not chosen:
            return

        self.voice_sample_path = chosen
        self._set_voice_status(f"✓  Loaded: {os.path.basename(chosen)}", SUCCESS)
    # ── Generation ────────────────────────────────────────────────────────────
    def _generate(self, play: bool, save: bool):
        """Synthesize the entered text in the sampled voice.

        Validates that the model, a voice sample and some text are available,
        optionally asks for an output path, then runs synthesis (and playback)
        on a daemon thread so the UI stays responsive. Results and errors are
        reported on the generate status line.

        Args:
            play: Play the generated audio when synthesis finishes.
            save: Ask for a destination file and write the audio there;
                otherwise a temporary file is used.
        """
        if not self.tts:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return
        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                "Please record your voice or load an audio file first.")
            return
        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return
        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                # Dialog cancelled: do nothing.
                return
        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳  Generating speech (this may take ~10–30 s)…", ACCENT2)

        # Snapshot the sample path so a file loaded mid-generation cannot
        # change the speaker of the run already in progress.
        speaker_wav = self.voice_sample_path

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                # NOTE(review): language is fixed to "en" although the text
                # card hint mentions other languages — confirm intent.
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker_wav,
                    language="en",
                    file_path=out,
                )
                if play:
                    sr, data = wav.read(out)
                    # Scale signed-integer PCM (int16, int32) to [-1, 1);
                    # any other dtype is passed through unscaled.
                    if np.issubdtype(data.dtype, np.signedinteger):
                        scale = float(np.iinfo(data.dtype).max) + 1.0
                    else:
                        scale = 1.0
                    audio = data.astype(np.float32) / scale
                    if audio.ndim > 1:
                        # Down-mix multi-channel audio to mono.
                        audio = audio.mean(axis=1)
                    sd.play(audio, samplerate=sr)
                    sd.wait()
                msg = "✓  Done!"
                if save_path:
                    msg += f"  Saved → {save_path}"
                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
            except Exception as exc:
                # Format the message now: ``exc`` is unbound when the except
                # clause ends, so a lambda that reads it later raises
                # NameError and the error never reaches the UI.
                error_msg = f"Error: {exc}"
                self.root.after(0, lambda: self._set_gen_status(error_msg, ERROR))
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()
    def _on_generate_done(self):
        """Stop and reset the progress bar, then re-enable the buttons."""
        bar = self._progress
        bar.stop()
        bar.config(value=0)
        self._set_btns_state("normal")
    # ── Helpers ───────────────────────────────────────────────────────────────
    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Show *msg* on the voice-sample status line in the given colour."""
        label, text_var = self._voice_status_lbl, self._voice_status_var
        text_var.set(msg)
        label.config(fg=color)
    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Show *msg* on the generate status line in the given colour."""
        label, text_var = self._gen_status_lbl, self._gen_status_var
        text_var.set(msg)
        label.config(fg=color)
    def _set_header_status(self, msg: str, color: str = TEXT):
        """Set the header status text and tint both it and the status dot."""
        status_lbl = self._header_status
        status_lbl.config(text=msg, fg=color)
        dot = self._status_dot
        dot.config(fg=color)
    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        generate_buttons = (self._play_btn, self._save_btn, self._both_btn)
        for button in generate_buttons:
            button.config(state="normal")
    def _set_btns_state(self, state: str):
        """Apply *state* to the generate buttons and the record button.

        Args:
            state: The widget state passed to each button's ``config``.
        """
        affected = (self._play_btn, self._save_btn, self._both_btn, self._rec_btn)
        for button in affected:
            button.config(state=state)
 # ── Entry point ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
    # Create the Tk root window, attach the application to it, and block in
    # the Tk event loop until the window is closed.
    root = tk.Tk()
    app = VoiceCloneTTS(root)
    root.mainloop()