Upload

2026-03-28 22:04:45 +09:00 · 2026-03-28 22:04:45 +09:00 · 34d55dc95f
commit 34d55dc95f
8 changed files with 553 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+.venv/
+
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
+3.10.14
--- a/README.md
+++ b/README.md
@ -0,0 +1,78 @@
+# Voice Clone TTS
+
+Type any text, hear it in your own voice. Runs fully offline.
+
+![Screenshot](docs/assets/img/preview.png)
+
+---
+
+## Setup (first time only)
+
+**1. Install system packages:**
+```bash
+sudo apt install portaudio19-dev python3-tk espeak-ng -y
+```
+
+**2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug):
+```bash
+curl https://pyenv.run | bash
+echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc
+echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc
+echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc
+source ~/.bashrc
+pyenv install 3.10.14
+pyenv local 3.10.14
+```
+
+**3. Create a virtual environment:**
+```bash
+~/.pyenv/versions/3.10.14/bin/python -m venv .venv
+source .venv/bin/activate
+```
+
+**4. Install Python packages** (takes 15–30 min, downloads ~2GB):
+```bash
+pip install --upgrade pip
+pip install --no-cache-dir "numpy==1.22.0"
+pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy
+pip install --no-cache-dir "transformers==4.40.0"
+pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0"
+```
+
+---
+
+## Running the app
+
+Every time you want to use it:
+```bash
+source .venv/bin/activate
+python voice_clone_tts.py
+```
+
+---
+
+## How to use
+
+1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5–15 min)*
+2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording**
+3. Type any text
+4. Click **Generate & Play**
+
+Your voice sample is saved as `my_voice_sample.wav` and is reused automatically on future runs.
+
+---
+
+## Best text to record (Rainbow Passage)
+
+> *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."*
+
+Read it **twice through** at your normal pace.
+
+---
+
+## Tips
+
+- Record in a quiet room with no background noise
+- Speak naturally — don't put on a "reading voice"
+- 30 seconds of clean audio is the sweet spot
+- Generation takes 10–30 seconds per sentence on CPU
--- a/docs/assets/img/preview.png
+++ b/docs/assets/img/preview.png
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,41 @@
+asgiref==3.7.2
+bcrypt==5.0.0
+black==24.2.0
+blinker==1.9.0
+Brotli==1.1.0
+certifi==2023.7.22
+charset-normalizer==3.3.2
+click==8.1.7
+Django==5.0.2
+django-appconf==1.0.6
+evdev==1.7.1
+Flask==3.1.3
+gallery_dl==1.30.6
+idna==3.4
+instaloader==4.14
+itsdangerous==2.2.0
+Jinja2==3.1.2
+libsass==0.23.0
+MarkupSafe==2.1.3
+mutagen==1.47.0
+mypy-extensions==1.0.0
+numpy==2.2.6
+packaging==23.2
+pathspec==0.12.1
+platformdirs==4.2.0
+pycryptodomex==3.19.0
+python-dateutil==2.9.0.post0
+python-xlib==0.33
+pytz==2026.1.post1
+rcssmin==1.1.1
+requests==2.31.0
+rjsmin==1.2.1
+six==1.17.0
+sqlparse==0.4.4
+tomli==2.0.1
+typing_extensions==4.9.0
+tzdata==2025.3
+urllib3==2.1.0
+websockets==12.0
+Werkzeug==3.1.3
+yt-dlp==2025.1.26
--- a/run_mac_linux.sh
+++ b/run_mac_linux.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+echo "============================================"
+echo " Voice Clone TTS  -  Setup & Launch"
+echo "============================================"
+echo
+
+# Check Python
+if ! command -v python3 &>/dev/null; then
+    echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/"
+    exit 1
+fi
+
+echo "Installing dependencies (first time only)..."
+pip3 install -r requirements.txt
+
+echo
+echo "Launching app..."
+python3 voice_clone_tts.py
--- a/run_windows.bat
+++ b/run_windows.bat
@ -0,0 +1,22 @@
+@echo off
+:: Voice Clone TTS launcher for Windows.
+:: Run from the script's own folder so the relative paths below resolve
+:: no matter where the script is started from.
+cd /d "%~dp0"
+
+echo ============================================
+echo  Voice Clone TTS  -  Setup ^& Launch
+echo ============================================
+echo.
+
+:: Check Python
+python --version >nul 2>&1
+if errorlevel 1 (
+    echo [ERROR] Python not found. Download it from https://www.python.org/downloads/
+    pause
+    exit /b 1
+)
+
+:: Install dependencies
+:: "python -m pip" installs into the same interpreter that runs the app below.
+echo Installing dependencies (first time only, ~5 min)...
+python -m pip install -r requirements.txt
+if errorlevel 1 (
+    echo [ERROR] Dependency installation failed.
+    pause
+    exit /b 1
+)
+
+echo.
+echo Launching app...
+python voice_clone_tts.py
+pause
--- a/voice_clone_tts.py
+++ b/voice_clone_tts.py
@ -0,0 +1,391 @@
+"""
+Voice Clone TTS
+===============
+Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup).
+
+Usage:
+  1. Run this script: python voice_clone_tts.py
+  2. Record 10–30 seconds of your voice (or load an existing audio file)
+  3. Type any text and click Generate
+"""
+
+import tkinter as tk
+from tkinter import ttk, messagebox, filedialog
+import threading
+import os
+import tempfile
+import numpy as np
+
+try:
+    import sounddevice as sd
+    import scipy.io.wavfile as wav
+except ImportError:
+    print("Missing dependencies. Please run: pip install sounddevice scipy")
+    raise
+
+# Sample rate (Hz) used for microphone capture and for the saved voice-sample WAV.
+SAMPLE_RATE = 22050
+# Default path of the recorded voice sample (relative to the working directory).
+VOICE_SAMPLE_FILE = "my_voice_sample.wav"
+
+
+# ─── Colour palette ──────────────────────────────────────────────────────────
+BG       = "#0f0f13"  # window / page background
+SURFACE  = "#1a1a24"  # card background
+BORDER   = "#2a2a3a"  # card outlines, separators, disabled button background
+ACCENT   = "#7c6af7"  # primary buttons, progress bar, text selection
+ACCENT2  = "#a89cf7"  # headings and active (hover) button background
+TEXT     = "#e8e6f0"  # default foreground
+MUTED    = "#666680"  # secondary / disabled text
+SUCCESS  = "#4ade80"  # status colour: success
+ERROR    = "#f87171"  # status colour: error
+WARNING  = "#fbbf24"  # status colour: recording in progress
+# ─────────────────────────────────────────────────────────────────────────────
+
+
+class VoiceCloneTTS:
+    def __init__(self, root: tk.Tk):
+        self.root = root
+        self.root.title("Voice Clone TTS")
+        self.root.geometry("640x580")
+        self.root.configure(bg=BG)
+        self.root.resizable(True, True)
+        self.root.minsize(500, 480)
+
+        self.recording = False
+        self.recorded_chunks: list = []
+        self.voice_sample_path: str = VOICE_SAMPLE_FILE
+        self.tts = None
+        self._record_thread: threading.Thread | None = None
+
+        self._apply_styles()
+        self._build_ui()
+        self._load_tts_async()
+
+        # Restore saved voice sample if it exists
+        if os.path.exists(self.voice_sample_path):
+            self._set_voice_status(f"✓  Saved sample found: {self.voice_sample_path}", SUCCESS)
+
+    # ── Styles ────────────────────────────────────────────────────────────────
+
+    def _apply_styles(self):
+        """Select the ttk theme and configure the named widget styles used by the app."""
+        style = ttk.Style()
+        # The custom styles below are layered on top of the "clam" theme.
+        style.theme_use("clam")
+
+        # Defaults for every ttk widget (the "." root style).
+        style.configure(".",
+            background=BG, foreground=TEXT,
+            font=("Segoe UI", 10),
+            borderwidth=0, relief="flat")
+
+        # Frame and label variants for the page background and the cards.
+        style.configure("Card.TFrame", background=SURFACE, relief="flat")
+        style.configure("TLabel", background=BG, foreground=TEXT)
+        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
+        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
+        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
+                        font=("Segoe UI Semibold", 13))
+        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
+                        font=("Segoe UI Semibold", 10))
+
+        # Filled primary button, with colours for the active and disabled states.
+        style.configure("Accent.TButton",
+            background=ACCENT, foreground="#ffffff",
+            font=("Segoe UI Semibold", 10), padding=(14, 7),
+            relief="flat", borderwidth=0)
+        style.map("Accent.TButton",
+            background=[("active", ACCENT2), ("disabled", BORDER)],
+            foreground=[("disabled", MUTED)])
+
+        # Secondary button drawn on the card surface.
+        style.configure("Ghost.TButton",
+            background=SURFACE, foreground=TEXT,
+            font=("Segoe UI", 10), padding=(12, 6),
+            relief="flat", borderwidth=0)
+        style.map("Ghost.TButton",
+            background=[("active", BORDER), ("disabled", SURFACE)],
+            foreground=[("disabled", MUTED)])
+
+        # Thin progress bar shown under the header.
+        style.configure("TProgressbar",
+            troughcolor=BORDER, background=ACCENT,
+            thickness=4, relief="flat")
+
+    # ── UI build ──────────────────────────────────────────────────────────────
+
+    def _build_ui(self):
+        """Build the header, the progress bar and the three step cards, top to bottom."""
+        # ── Header
+        header = tk.Frame(self.root, bg=BG)
+        header.pack(fill="x", padx=20, pady=(18, 4))
+        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
+                 font=("Segoe UI Semibold", 17)).pack(side="left")
+        # Both status widgets are packed on the right; the dot is packed first,
+        # so it sits at the far right with the status text to its left.
+        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED, font=("Segoe UI", 10))
+        self._status_dot.pack(side="right", pady=2)
+        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
+                                       font=("Segoe UI", 9))
+        self._header_status.pack(side="right", padx=(0, 4), pady=2)
+
+        # Indeterminate bar; started here and stopped by the model-load callbacks.
+        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
+        self._progress.pack(fill="x", padx=20, pady=(0, 10))
+        self._progress.start(12)
+
+        # ── Step 1 – Voice sample
+        self._card("Step 1 — Record Your Voice", self._build_record_section)
+
+        # ── Step 2 – Text input
+        # Only this card is created with expand=True.
+        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)
+
+        # ── Step 3 – Generate
+        self._card("Step 3 — Generate", self._build_generate_section)
+
+    def _card(self, title: str, builder, expand=False):
+        outer = tk.Frame(self.root, bg=BG)
+        outer.pack(fill="both", expand=expand, padx=16, pady=4)
+
+        frame = tk.Frame(outer, bg=SURFACE, bd=0,
+                         highlightthickness=1, highlightbackground=BORDER)
+        frame.pack(fill="both", expand=expand, padx=0, pady=0)
+
+        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
+                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))
+
+        sep = tk.Frame(frame, bg=BORDER, height=1)
+        sep.pack(fill="x", padx=14, pady=(6, 0))
+
+        inner = tk.Frame(frame, bg=SURFACE)
+        inner.pack(fill="both", expand=expand, padx=14, pady=10)
+        builder(inner)
+
+    def _build_record_section(self, parent):
+        """Populate the "Record Your Voice" card.
+
+        Args:
+            parent: Frame that receives the widgets.
+        """
+        # Status line; its text and colour are updated through _set_voice_status().
+        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
+        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
+                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
+                                          anchor="w", wraplength=560)
+        self._voice_status_lbl.pack(fill="x", pady=(0, 8))
+
+        btn_row = tk.Frame(parent, bg=SURFACE)
+        btn_row.pack(fill="x")
+
+        # The same button starts and stops a recording (see _toggle_recording).
+        self._rec_btn = ttk.Button(btn_row, text="⏺  Start Recording",
+                                   command=self._toggle_recording, style="Accent.TButton")
+        self._rec_btn.pack(side="left", padx=(0, 8))
+
+        ttk.Button(btn_row, text="📂  Load Audio File",
+                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")
+
+        # Elapsed-time readout; written by _start_recording() and _tick().
+        self._rec_timer_var = tk.StringVar(value="")
+        tk.Label(parent, textvariable=self._rec_timer_var,
+                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))
+
+        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
+                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))
+
+    def _build_text_section(self, parent):
+        self._text_input = tk.Text(parent, height=6, wrap="word",
+                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
+                                   font=("Segoe UI", 11), relief="flat", bd=0,
+                                   padx=10, pady=8, selectbackground=ACCENT)
+        self._text_input.pack(fill="both", expand=True)
+        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")
+
+        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
+        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
+                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))
+
+    def _build_generate_section(self, parent):
+        """Populate the "Generate" card with its three action buttons and a status line.
+
+        Args:
+            parent: Frame that receives the widgets.
+        """
+        btn_row = tk.Frame(parent, bg=SURFACE)
+        btn_row.pack(fill="x")
+
+        # All three buttons start disabled; _on_tts_ready() enables them.
+        self._play_btn = ttk.Button(btn_row, text="▶  Generate & Play",
+                                    command=lambda: self._generate(play=True, save=False),
+                                    style="Accent.TButton", state="disabled")
+        self._play_btn.pack(side="left", padx=(0, 8))
+
+        self._save_btn = ttk.Button(btn_row, text="💾  Generate & Save",
+                                    command=lambda: self._generate(play=False, save=True),
+                                    style="Ghost.TButton", state="disabled")
+        self._save_btn.pack(side="left", padx=(0, 8))
+
+        self._both_btn = ttk.Button(btn_row, text="▶💾  Play & Save",
+                                    command=lambda: self._generate(play=True, save=True),
+                                    style="Ghost.TButton", state="disabled")
+        self._both_btn.pack(side="left")
+
+        # Status line; its text and colour are updated through _set_gen_status().
+        self._gen_status_var = tk.StringVar(value="")
+        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
+                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
+                                        anchor="w", wraplength=560)
+        self._gen_status_lbl.pack(fill="x", pady=(8, 0))
+
+    # ── TTS loading ───────────────────────────────────────────────────────────
+
+    def _load_tts_async(self):
+        def _load():
+            try:
+                from TTS.api import TTS  # type: ignore
+                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
+                self.root.after(0, self._on_tts_ready)
+            except Exception as exc:
+                _exc = exc
+                self.root.after(0, lambda: self._on_tts_error(_exc))
+
+        threading.Thread(target=_load, daemon=True).start()
+
+    def _on_tts_ready(self):
+        self._progress.stop()
+        self._progress.config(value=0)
+        self._set_header_status("Model ready", SUCCESS)
+        self._enable_generate_btns()
+        self._set_gen_status("Ready — enter text and click Generate.", MUTED)
+
+    def _on_tts_error(self, exc):
+        self._progress.stop()
+        self._set_header_status("Model failed to load", ERROR)
+        self._set_gen_status(f"Error: {exc}", ERROR)
+        messagebox.showerror("TTS Load Error",
+            f"Could not load the XTTS v2 model:\n{exc}\n\n"
+            "Make sure you've run:  pip install TTS")
+
+    # ── Recording ─────────────────────────────────────────────────────────────
+
+    def _toggle_recording(self):
+        if not self.recording:
+            self._start_recording()
+        else:
+            self._stop_recording()
+
+    def _start_recording(self):
+        self.recording = True
+        self.recorded_chunks = []
+        self._rec_btn.config(text="⏹  Stop Recording")
+        self._set_voice_status("🔴  Recording… speak naturally for 10–30 seconds.", WARNING)
+        self._rec_timer_var.set("0 s")
+        self._elapsed = 0
+        self._tick()
+
+        def _record():
+            with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
+                while self.recording:
+                    data, _ = stream.read(1024)
+                    self.recorded_chunks.append(data.copy())
+
+        self._record_thread = threading.Thread(target=_record, daemon=True)
+        self._record_thread.start()
+
+    def _tick(self):
+        if self.recording:
+            self._elapsed += 1
+            self._rec_timer_var.set(f"{self._elapsed} s recorded")
+            self.root.after(1000, self._tick)
+        else:
+            self._rec_timer_var.set("")
+
+    def _stop_recording(self):
+        self.recording = False
+        self._rec_btn.config(text="⏺  Start Recording")
+
+        if not self.recorded_chunks:
+            self._set_voice_status("Nothing recorded — try again.", MUTED)
+            return
+
+        audio = np.concatenate(self.recorded_chunks, axis=0)
+        audio_i16 = (audio * 32767).clip(-32768, 32767).astype(np.int16)
+        wav.write(self.voice_sample_path, SAMPLE_RATE, audio_i16)
+        dur = len(audio_i16) / SAMPLE_RATE
+        self._set_voice_status(
+            f"✓  Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)
+
+    def _load_voice_file(self):
+        path = filedialog.askopenfilename(
+            title="Select voice sample",
+            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
+        if path:
+            self.voice_sample_path = path
+            self._set_voice_status(f"✓  Loaded: {os.path.basename(path)}", SUCCESS)
+
+    # ── Generation ────────────────────────────────────────────────────────────
+
+    def _generate(self, play: bool, save: bool):
+        if not self.tts:
+            messagebox.showerror("Not ready", "TTS model is still loading.")
+            return
+
+        if not os.path.exists(self.voice_sample_path):
+            messagebox.showerror("No voice sample",
+                "Please record your voice or load an audio file first.")
+            return
+
+        text = self._text_input.get("1.0", "end").strip()
+        if not text:
+            messagebox.showerror("No text", "Please enter some text to speak.")
+            return
+
+        save_path: str | None = None
+        if save:
+            save_path = filedialog.asksaveasfilename(
+                defaultextension=".wav",
+                filetypes=[("WAV audio", "*.wav")],
+                title="Save generated speech")
+            if not save_path:
+                return
+
+        self._set_btns_state("disabled")
+        self._progress.start(12)
+        self._set_gen_status("⏳  Generating speech (this may take ~10–30 s)…", ACCENT2)
+
+        def _run():
+            try:
+                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
+                self.tts.tts_to_file(
+                    text=text,
+                    speaker_wav=self.voice_sample_path,
+                    language="en",
+                    file_path=out,
+                )
+
+                if play:
+                    sr, data = wav.read(out)
+                    audio = data.astype(np.float32) / (32768.0 if data.dtype == np.int16 else 1.0)
+                    if audio.ndim > 1:
+                        audio = audio.mean(axis=1)
+                    sd.play(audio, samplerate=sr)
+                    sd.wait()
+
+                msg = "✓  Done!"
+                if save_path:
+                    msg += f"  Saved → {save_path}"
+                self.root.after(0, lambda: self._set_gen_status(msg, SUCCESS))
+            except Exception as exc:
+                self.root.after(0, lambda: self._set_gen_status(f"Error: {exc}", ERROR))
+            finally:
+                self.root.after(0, self._on_generate_done)
+
+        threading.Thread(target=_run, daemon=True).start()
+
+    def _on_generate_done(self):
+        self._progress.stop()
+        self._progress.config(value=0)
+        self._set_btns_state("normal")
+
+    # ── Helpers ───────────────────────────────────────────────────────────────
+
+    def _set_voice_status(self, msg: str, color: str = TEXT):
+        self._voice_status_var.set(msg)
+        self._voice_status_lbl.config(fg=color)
+
+    def _set_gen_status(self, msg: str, color: str = TEXT):
+        self._gen_status_var.set(msg)
+        self._gen_status_lbl.config(fg=color)
+
+    def _set_header_status(self, msg: str, color: str = TEXT):
+        self._header_status.config(text=msg, fg=color)
+        self._status_dot.config(fg=color)
+
+    def _enable_generate_btns(self):
+        for btn in (self._play_btn, self._save_btn, self._both_btn):
+            btn.config(state="normal")
+
+    def _set_btns_state(self, state: str):
+        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
+            btn.config(state=state)
+
+
+# ── Entry point ───────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+    root = tk.Tk()
+    app = VoiceCloneTTS(root)
+    root.mainloop()