From 34d55dc95f89fa3335316996469fa1a3cce8c217 Mon Sep 17 00:00:00 2001 From: juyung Date: Sat, 28 Mar 2026 22:04:45 +0900 Subject: [PATCH] Upload --- .gitignore | 2 + .python-version | 1 + README.md | 78 +++++++ docs/assets/img/preview.png | Bin 0 -> 8838 bytes requirements.txt | 41 ++++ run_mac_linux.sh | 18 ++ run_windows.bat | 22 ++ voice_clone_tts.py | 391 ++++++++++++++++++++++++++++++++++++ 8 files changed, 553 insertions(+) create mode 100644 .gitignore create mode 100644 .python-version create mode 100644 README.md create mode 100644 docs/assets/img/preview.png create mode 100644 requirements.txt create mode 100644 run_mac_linux.sh create mode 100644 run_windows.bat create mode 100644 voice_clone_tts.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..def4cc9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.venv/ + diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..1445aee --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.14 diff --git a/README.md b/README.md new file mode 100644 index 0000000..f849621 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ +# Voice Clone TTS + +Type any text, hear it in your own voice. Runs fully offline after the one-time model download on first launch. + +![Screenshot](docs/assets/img/preview.png) + +--- + +## Setup (first time only) + +**1. Install system packages:** +```bash +sudo apt install portaudio19-dev python3-tk espeak-ng -y +``` + +**2. Install Python 3.10 via pyenv** (required on Debian to avoid lzma bug): +```bash +curl https://pyenv.run | bash +echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc +echo '[[ -d $PYENV_ROOT/bin ]] && export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc +echo 'eval "$(pyenv init - bash)"' >> ~/.bashrc +source ~/.bashrc +pyenv install 3.10.14 +pyenv local 3.10.14 +``` + +**3. Create a virtual environment:** +```bash +~/.pyenv/versions/3.10.14/bin/python -m venv .venv +source .venv/bin/activate +``` + +**4. 
Install Python packages** (takes 15–30 min, downloads ~2GB): +```bash +pip install --upgrade pip +pip install --no-cache-dir "numpy==1.22.0" +pip install --no-cache-dir --resume-retries 20 TTS sounddevice scipy +pip install --no-cache-dir "transformers==4.40.0" +pip install --no-cache-dir "torch==2.1.0" "torchaudio==2.1.0" +``` + +--- + +## Running the app + +Every time you want to use it: +```bash +source .venv/bin/activate +python voice_clone_tts.py +``` + +--- + +## How to use + +1. Wait for **"Model ready"** in the top right *(first launch only: downloads ~2GB, takes 5–15 min)* +2. Click **Start Recording** → read the passage below for 30 seconds → **Stop Recording** +3. Type any text +4. Click **Generate & Play** + +Your voice sample saves as `my_voice_sample.wav` and is reused automatically on future runs. + +--- + +## Best text to record (Rainbow Passage) + +> *"When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow. The rainbow is a division of white light into many beautiful colors. These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon. There is, according to legend, a pot of gold at the end of the rainbow. The shape of a rainbow reminds me of a bridge. Like a bridge, a rainbow is wide in the middle and narrow at its ends."* + +Read it **twice through** at your normal pace. 
+ +--- + +## Tips + +- Record in a quiet room with no background noise +- Speak naturally — don't put on a "reading voice" +- 30 seconds of clean audio is the sweet spot +- Generation takes 10–30 seconds per sentence on CPU diff --git a/docs/assets/img/preview.png b/docs/assets/img/preview.png new file mode 100644 index 0000000000000000000000000000000000000000..52d0de4b077b84df94571faee0c4278567493731 GIT binary patch literal 8838 zcma)ibySpH_ckDcfPeymln9ERY>^1v3d!0Y-*HBGma#99TJUl#dRTTvo9v%UJ z`}thEg0t9N*)qgUK~QxaMcn-72Ol5*Z}?v`1H*kPDh5hQYEDiu4h{}ZPEKFDLogWp zfc?Sv#AIeFpaC_Ld@RZgGBQ>YCnhE?O8HI8sLUbmpum35#N3Nrvt;@A_S^P;@05Ad zfCacPTlx#-&W?(Vz*uft#sQR*>c&bLZbLFRn5OnU#XhtspCBU@=a8e=+jKlU)*@8} zSskw*YspdYT*!?u)u~q>e(?(h29ic9-9=mwOH2>{%)RdgGZbt*H{($k*f%g8Omsx=KkKDjlaP9#K9=2l;kS4o>R8_T={#$S1^}SX^ze>oE^M=2^}0Ov-e=Mx;F@; ziU&L+>i8fG@qS(-aqO(uV9hJXt3LY)slAdOtq`oUhqn?;~@o$m1o+Z@s2ChV>a=W{{||0_3$E@Az@yG=M_%(ZF}uk?OB=YwvwfWCl6rEFHzz{=%;{>45F!96Jv%;+aQ&{Ai~qh^5M#7wa< zODDzmC-|1{Zr&`TRIInIH)t&NvcRgZ4b!w$xll{hBCE2{{B?+EnoiyUqpLGPJj4Ys zBMaQ$zINiGd*~T4wq~qlR|GWrqjr08h70Ow-M%v+_y)V&`Cv?|NU@!*G`CoN-<-Lwh|QAE*#O*iWXD2l zGs^*sM;5@&y}#D(=8fdnaU!?$S9KVN`G*Q2eyN;t+6w?*YfzH6ns99q8_H=r3<={h zQr^7EaUyXZ47{_URRNm~YY@97ed<9wQwJ>yQY>#vD%KSBuw6@)kh4mY>GrugZmlBf z?y@fJyH+1~d<~4BM~DX_P$9v~BaEX8%(`cZ_lZ0Rz?tyCpIccmq=AgvIRG!72;qO6 z@Ki|dvg5^3$w2-C@b0n)0(deIJTN|PhROX6a3p{8{|SNv%+PDl)`p9;=?4FAEg{lu zm_^Hs7Ul50EuHG+cq`2+Kf?-2_v2$lFtgFv0GX?2wfMkCjjfwA)j*7fJc)>*ziQdE zQDG}D#=Libuxuj~A$L<-{P223O!eU-nAoqN&Vwr6X}-IfPOhFGI-h&)34!%raT^ZG z)3wAUFZScVQSW#zdYYHcJ(LH$o9aOW^>MlELt;w%oK(ltby{``CV9!LEe>m#U zZ!gd~o?>A((yZt2w!Bt#f~DMzrE}v0?!h1*!UVM!Zi5pr{12p)cITi?y&^>j_1*=3 zPYig0P&HBT&5jZ}Cqdy<1Kh~>nBwRY9J=NJ@=%~jabcc)oZRR%yl(le$b1R=oxMbT zb)PO4(q%u;P*4H*x~5xd@_`Yb4-Ni*Jcl z6OSBJ)p=02KZ_{$xoK?uvQ066TwJG6GXEZySrKc>-ggq!k@w5Zbzk8_dIQYrQB+DE znMfJtw%~%OGv$_bbt|T=yq@-0DJErRcta+7-G3 z#*)qH!uZIy+{4FWGY`4EEt>M4lFuYoRVvc5ut>f_WcGoiwZm{i z$6DbM;214-y%NOFm$j^&-=nv{jauUn%zfpQSlA0?@6-*ykFBcm)AnPrwgT; z+Ejgxn2Sb(@36soUAc| 
zjgN|P6s=TQ30)>NcC-?rZvx!igc=I_QcZqOX zB5pS!4Lj}70rJ>1QqRJCEY^eS`ruAP%S(U8p`|X$Iz{g9NOutjR4r$va8A za<$$NjKVyGpvAHs7Bvi2ig*XHnM0v>g!t6?2TOlt=eZSqR)?V5G{?wTAFci*Tc=L!(lvW-+MQwYA$D z;Dd@8Z8T4e!PK?YRcE&+^>OgkwcZv}NPR{4V2`{Ia?w)5KLNJ-2s6;$tk=?(tT#)S zr}I2RNy6AKOVGKl2Uv|>Zv()ilZ@3Lj>{2Hw-T7uL@}>9)vDRs6197aAFUoa2);xA zC=wVBOtI2UUc}tr5x(cVj}Ie&ZG&0=XW_r2etoC^baGdlAj$bZlDd?~A4>Ue%jq+) z>pAi#15DWDXI-t3J;eSTuoAC%=*;T5j+xM19GMLSgp+{HoSI4gm_?rP{k;jbye52b zSxGKMpAn{K4p#Ve$Lu(`L9XzFBfang2QCrh^JQ;>?`48yKO@r%G8x2KBjhMg;2X5K@PADLq+>^B=FNGmNO^t#4#Yrqid!g(`n8^&rd zg#rkN(Wvtjhsr3%X!jm~zUsD5B$ga`0(L5#j@TbkKML^6b`$+9zAi5Ct z!|P|X&|5_zw5WDpjw?{DTRiT2PrF!KyI`=W#-_`J5zQdjUMrEPUASX5c5oU_Hme2E z_Apwu5uM@vPQ&TdU!&5+*oue;H$t5qP=$g)oPD&$PdwAb=*XCbe%R*RTS+43o!3-` z7kgMV1z|3*OUxNaOXJi4!lMMz*u%Hm%uhguK6f!l)RbgWJV&@n7coDsll`lI;d%}J zrJNlpGICp=@}XL?RH3BDGN);lkg=?RG4!A8HJqB#Zrsc&~NMu;g_w-)S!Sf|Cx=x0?hA|og?3EM@&i<;kjp?#MoZdzQIL*AN_g_ z#TD7zD@G<+l2mG8n;Tw31@AtP#b=OKg>jOx^_{7p zo}Y<>((2S871FYZ{a4!)A}<^VrDv7Lx7}@Sq3_8~Sa9n;lag_(x8IL=@M4B2NK5D0 zocg_%96Fl)_eI5QPLu%i?JKfHh{*|X8;`p28CO0R&=EBBiHp(riLW|o-(g$ByHBBm zcr(cv*44b)?maft`0siea{0I+;uPiaa5m&hny|JmmxXj zptt_CPLCpKGUCxrAjHo-zaW^xqIf`VjeKUjD|HN`eYH$1%1$VwaiM^*PUq*=yt`c z8&Cpv+?N27tswI{kP37Y%9c$CAp~SdfYT=}*FZN3a1Yn;aSpEnsqooxs|vWK{~!B5 zggAnKc+dyIRZdPmd738mhzra4!!vC=e~Mi-Im5)ss+)(oRZNs*=5>+C6IA+;0y{gREVxKpXoShdB1PoM%=_aNE|lEF zE4k__>mzMrIf%)M*EHdc>lEuYYWAJq-q*6GXqDH%jth&^8?K=?rp7#OQ-iQCG#Iz% zJSr!5NF86>p=yp+vz->#6V8#RjN3rZpRN2^D>Pl$%C`Dk})G z*m#5^>P6Tkyoi@8Dk;TVCeuNeWr5Wm)gUw5tEy6SM&6WiC&BgCYTog1A-bZGx@dK= zfUl6EoEr13W)u7C0upjhwSSc$HLS3w#mxT-$ho+rsv^6bAZzmqYArX*lqc)j*0jiw&D{3@F=f(0|KR4yWaK%Kkw!S~CH*mJ_A_5&S<$(FZ~B?-k$MxLSFJ z(Q_XWwsY@ID_H!#TABGo5cz*50$m6ICaq;?rQ3E@_U;zL%W#KtEjoQg4xI^uAAHmY+vSX zbWEV=9suYkGAl{L@0f{eznPzk5PjO?6wiQdHvE2YWl{u}!%!s?N1GjU9@IOiCbCGr z{Bu7*ner10ZJ$WdhisBNVedf+FpeM#)g2@2H26E@P|8RPCSg8>KZrlWuc;6iTfXX- zO^Ei%UPBjaKMlh`ux6s63G?O7H1BhYi$~SI=@KQ9W;@Qh>?@*bW^R}n=u6*WhRy`? 
z`>KmNeKDDpRpmsS>})p!cL1N zXj{WfU`%rK`hT^W|J3b|HdG>@R?K-R}{I zjubO51oHdzQ^Ic6!Y%E1229 zZT(kO8&SiL#OSXn4VeUG_arrS*`@PR?b zYBS2bDs;5$v9nwIPo~70?nCw|0iI_gQcR{0gg1QH;Z@8C38Dn2kcY#(nG=r zZb`71;4?`V!f1;vvR&!MUp@>kO=vwgL^0)iOGog$Icm#08>|H{sou-rI7-kIShNW$ z^G{Bs+<9J)Cg}O>+W+q9hqH-=?q0e&6#`5yOFYn*=5VKY+RXTlaKAtlsETQdwd+Sl zhP~nEoxxk#TOnMQo~d7YbFOO?X&GkeK$>?$#6;>>zw}l6$1&tTTp3X;YiHu|;)=v= z03kSZkbO)apGZ@fP$t25C+rq&V>&0fv;>omr+noNB1&mNL-IXR6a%8;$`r>ldAEPusw^9 zlC_PBQYNrW9ZIi~7ZpzpIW|^M9-ba99c>95+QvsyRsd0mq^I2Qa9C=N&99O*6a;LN zF}zkwiLLpz(M0BF7k#HUktCybyp+36KB%vTb#DMNu@+tfgIsVS;g9%WQR`&v$f0|E zV+lT~AC|INX&_IKv5Dspl*0jRLyrQ*T#@JLTM4Vx%>>gIWsEV|ru!~hMAMpXp9UO0 zHwxzoWcsnDZY%#q&T8{z!}ACvf96e;ix$|uQ*Zh7Is+FJQ3&v(RW$w(2nI%wRe>MMHj3SJ!uIU{tP?fBW_`J>ZMX2|O>+c)pp@O5{ z{niRBT41^;GqQSg0z~YK8Lj%P`LwLbdVYw0Ehn{nNwkXc*+#Vq+@t#Ki-6rdU&q!X zuIsyZ#G{&BDtk2C-H0=6;0Haf+Xk5-;VL|o1aHmu8#gDdABsC`dOmr{BVvB<+(fx0 z;j|Ab=DbPqRnuc^;p`3W=CD$GaNgi^#j@b)20@IQEP5Et`z_#Xhr1o>y@ zONfj4JNyz{QsVahf72+kRs>59@#O&ikN(W!((gda6crAKp#vLOVqrNrAEjM#J?2@6a~1m@0g`@u$nPJyLkqfifZkJok&qY# z_lC`63{NMDpH!Z$c|HB2deo|wC;?xL7+>wSbS$xLM%pWMb@(_W9+Rsn*e1a>-+&y2 zX5y9hD^DWoEDUCy%UwNxh~(cU_8;tvMY8yeGtZQrRFxNOc@{5hA&dakxerp)-h`t` zz9;pkV)#juLD{Wq#_ldXUE18W-hNH&;l8{FDLa5Gywzj|Q!_o4tj03_1nuu=@L`Nc zc`43!GT^aPeum<0$=;n_vN%gT%j;h%U{h|4@SOUPB`8zlEqhHHjd8zy5B zW6?wGuoasvL4bq{2uV(PE?UgzsGQM|ns=Hqznd-L12l8%L9!GqfvxVJH_-?ODbsgr zidp^@pzb&Zl>6ulldE>oAgsCNm#dIz>*A{U;2xSjiBXm0EAgzwXe7X1~QbyW`TYL%IS z*jy>jy^+!Tn0}gqC9368EjtlAt4ib6(TSysZ_)=mirA0i=aNvn$nCu*LxyW5CZm_AQB+^K9njb{-5WLL$X^d)bBNrDyzJQ#VV$3JFHOv|@U#18W=)M@t5p)2~F zH}_t;CxqK+t1oFt50+Xjejf>{BC)g{rV?3uYD%-dKcnQ*=+{n1$=iKT1iN?svgtK8 z?6vbmddE8DUXO%*wu4NHA!mj=cMg;A3hLLuiHQz9r7h(SL^OHDV^pIdqUs0JJo|`$ zsTC1B$QZiXP=Ga*de>M38?&~XWZ|j+bA0rsRx}hY$l5HpDJD?h1QmX_``p(E%yNv( zXxW8F@ZAu(X|-hH>?yq?=rG1Qz2TK`;&#lNCa`yA(BD%{gr)_Kn9^Eimqb`7F{?g1 zJtu%S$Zbc%4aJ>y^v!NOcC;s((2^RRmWJ_gSKLl#lXNrCtej%Hv%aKy1!pgCNm2i+ zyd-~B`?8FeS_-Ekz;Y2*YtMP9sfMiv(?-Q_ZJz7|qm!^dV$IFmF{v2^7r@~d7 
zfc>)KgqP!!s?UK`Kr0?9=|(V?aobAvA=Aw;FoFfb60CmJ&^;A-mvMh3{V4buFou}1#6-P_SNv3(J zieFc^YSu$+mFTAymReEDZKc`_Gwz!X*FHBLW;baA{Zj%3n+Lh2hP0v{Ke$+P1*Iqk zdgs(7oTKt?HkjSg7t!QGz1d@Bb2J3&OXa!Tx~8AeK5ch{gkO(w;jYVj%Fmb#L}6#y zxzWr=4oWJ4UhD1mM1Z!{QFbB-`Lngl}P;8Nspamp&E;|6pa12n&9TNaEq z9a?Ermk^#205R&Jzijs@D=V>~lU2)ueQGAuk|EtW8&7th9xIz^=Q}@$jVbqg7{i63 zLiL#W-MC4|tbUOgQ;uzZ^@@K}-OAeqv4ZZ^a$!Q+_g}3^eCes=~P- zs<4~Iiw0)R91GVuw>z!8K5eaOaxeuO&b$-2gTCQ z#%|Mu&Ydr1`T2BM&z_pe{nHqOsV+P0Ut|1XnlB0P_ZhrAlYbm3He9=WE%V2@`{Nmx z6kSqw-f(VH69~kC{Qez>-!&1jX9%I4v-1RG=rw}l`=~LL&>_*BQx^qt6!7i*?r|_xZTI0tDW-%;v0jJX zkRYLj;KhS_!6J~Z<%^W3RDN(y{&<}PGQbd3l^*=S@3~=ey!r!spcqEi{fi%c{}>pt zpfXH*nyZ8CvP@LneQmPT0#+3&0wGZi(fqHa9_<+<;1&}n*a!99QTORUN!*pHX7gIc bcTXudNj9itgCdqLUmj6a)Kn;!dlvkE&HS3L literal 0 HcmV?d00001 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1003ad0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,41 @@ +asgiref==3.7.2 +bcrypt==5.0.0 +black==24.2.0 +blinker==1.9.0 +Brotli==1.1.0 +certifi==2023.7.22 +charset-normalizer==3.3.2 +click==8.1.7 +Django==5.0.2 +django-appconf==1.0.6 +evdev==1.7.1 +Flask==3.1.3 +gallery_dl==1.30.6 +idna==3.4 +instaloader==4.14 +itsdangerous==2.2.0 +Jinja2==3.1.2 +libsass==0.23.0 +MarkupSafe==2.1.3 +mutagen==1.47.0 +mypy-extensions==1.0.0 +numpy==2.2.6 +packaging==23.2 +pathspec==0.12.1 +platformdirs==4.2.0 +pycryptodomex==3.19.0 +python-dateutil==2.9.0.post0 +python-xlib==0.33 +pytz==2026.1.post1 +rcssmin==1.1.1 +requests==2.31.0 +rjsmin==1.2.1 +six==1.17.0 +sqlparse==0.4.4 +tomli==2.0.1 +typing_extensions==4.9.0 +tzdata==2025.3 +urllib3==2.1.0 +websockets==12.0 +Werkzeug==3.1.3 +yt-dlp==2025.1.26 diff --git a/run_mac_linux.sh b/run_mac_linux.sh new file mode 100644 index 0000000..5834f08 --- /dev/null +++ b/run_mac_linux.sh @@ -0,0 +1,18 @@ +#!/bin/bash +echo "============================================" +echo " Voice Clone TTS - Setup & Launch" +echo "============================================" +echo + +# Check Python +if ! 
command -v python3 &>/dev/null; then + echo "[ERROR] Python 3 not found. Install from https://www.python.org/downloads/" + exit 1 +fi + +echo "Installing dependencies (first time only)..." +pip3 install -r requirements.txt + +echo +echo "Launching app..." +python3 voice_clone_tts.py diff --git a/run_windows.bat b/run_windows.bat new file mode 100644 index 0000000..9e3e7eb --- /dev/null +++ b/run_windows.bat @@ -0,0 +1,22 @@ +@echo off +echo ============================================ +echo Voice Clone TTS - Setup ^& Launch +echo ============================================ +echo. + +:: Check Python +python --version >nul 2>&1 +if errorlevel 1 ( + echo [ERROR] Python not found. Download it from https://www.python.org/downloads/ + pause + exit /b 1 +) + +:: Install dependencies +echo Installing dependencies (first time only, ~5 min)... +pip install -r requirements.txt + +echo. +echo Launching app... +python voice_clone_tts.py +pause diff --git a/voice_clone_tts.py b/voice_clone_tts.py new file mode 100644 index 0000000..a0d8ac4 --- /dev/null +++ b/voice_clone_tts.py @@ -0,0 +1,391 @@ +""" +Voice Clone TTS +=============== +Speak any text in YOUR voice using XTTS v2 (runs fully offline after first setup). + +Usage: + 1. Run this script: python voice_clone_tts.py + 2. Record ~15 seconds of your voice + 3. Type any text and click Generate +""" + +import tkinter as tk +from tkinter import ttk, messagebox, filedialog +import threading +import os +import tempfile +import numpy as np + +try: + import sounddevice as sd + import scipy.io.wavfile as wav +except ImportError: + print("Missing dependencies. 
SAMPLE_RATE = 22050
VOICE_SAMPLE_FILE = "my_voice_sample.wav"


# ─── Colour palette ──────────────────────────────────────────────────────────
BG = "#0f0f13"
SURFACE = "#1a1a24"
BORDER = "#2a2a3a"
ACCENT = "#7c6af7"
ACCENT2 = "#a89cf7"
TEXT = "#e8e6f0"
MUTED = "#666680"
SUCCESS = "#4ade80"
ERROR = "#f87171"
WARNING = "#fbbf24"
# ─────────────────────────────────────────────────────────────────────────────


class VoiceCloneTTS:
    """Tkinter front-end that records a voice sample and speaks text with it.

    The window has three cards: record/load a reference voice sample, enter
    text, and generate speech through the XTTS v2 model (loaded on a
    background thread at start-up).  Worker threads never touch widgets
    directly; they hand results back to the Tk main loop via ``root.after``.
    """

    def __init__(self, root: "tk.Tk"):
        """Build the window on *root* and start loading the TTS model."""
        self.root = root
        self.root.title("Voice Clone TTS")
        self.root.geometry("640x580")
        self.root.configure(bg=BG)
        self.root.resizable(True, True)
        self.root.minsize(500, 480)

        self.recording = False
        self.recorded_chunks: list = []
        self.voice_sample_path: str = VOICE_SAMPLE_FILE
        self.tts = None
        self._record_thread: threading.Thread | None = None
        # Seconds shown by the recording timer and the id of its pending
        # ``after`` callback (None when no tick is scheduled).
        self._elapsed = 0
        self._tick_job = None

        self._apply_styles()
        self._build_ui()
        self._load_tts_async()

        # Restore saved voice sample if it exists
        if os.path.exists(self.voice_sample_path):
            self._set_voice_status(f"✓ Saved sample found: {self.voice_sample_path}", SUCCESS)

    # ── Styles ────────────────────────────────────────────────────────────────

    def _apply_styles(self):
        """Configure the ttk theme, label, button and progress-bar styles."""
        style = ttk.Style()
        style.theme_use("clam")

        style.configure(".",
                        background=BG, foreground=TEXT,
                        font=("Segoe UI", 10),
                        borderwidth=0, relief="flat")

        style.configure("Card.TFrame", background=SURFACE, relief="flat")
        style.configure("TLabel", background=BG, foreground=TEXT)
        style.configure("Card.TLabel", background=SURFACE, foreground=TEXT)
        style.configure("Muted.TLabel", background=SURFACE, foreground=MUTED, font=("Segoe UI", 9))
        style.configure("Title.TLabel", background=BG, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 13))
        style.configure("Step.TLabel", background=SURFACE, foreground=ACCENT2,
                        font=("Segoe UI Semibold", 10))

        style.configure("Accent.TButton",
                        background=ACCENT, foreground="#ffffff",
                        font=("Segoe UI Semibold", 10), padding=(14, 7),
                        relief="flat", borderwidth=0)
        style.map("Accent.TButton",
                  background=[("active", ACCENT2), ("disabled", BORDER)],
                  foreground=[("disabled", MUTED)])

        style.configure("Ghost.TButton",
                        background=SURFACE, foreground=TEXT,
                        font=("Segoe UI", 10), padding=(12, 6),
                        relief="flat", borderwidth=0)
        style.map("Ghost.TButton",
                  background=[("active", BORDER), ("disabled", SURFACE)],
                  foreground=[("disabled", MUTED)])

        style.configure("TProgressbar",
                        troughcolor=BORDER, background=ACCENT,
                        thickness=4, relief="flat")

    # ── UI build ──────────────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the header, the progress bar and the three step cards."""
        # ── Header
        header = tk.Frame(self.root, bg=BG)
        header.pack(fill="x", padx=20, pady=(18, 4))
        tk.Label(header, text="Voice Clone TTS", bg=BG, fg=ACCENT2,
                 font=("Segoe UI Semibold", 17)).pack(side="left")
        self._status_dot = tk.Label(header, text="●", bg=BG, fg=MUTED, font=("Segoe UI", 10))
        self._status_dot.pack(side="right", pady=2)
        self._header_status = tk.Label(header, text="Loading model…", bg=BG, fg=MUTED,
                                       font=("Segoe UI", 9))
        self._header_status.pack(side="right", padx=(0, 4), pady=2)

        # Indeterminate bar runs until the model load (or a generation) ends.
        self._progress = ttk.Progressbar(self.root, mode="indeterminate", style="TProgressbar")
        self._progress.pack(fill="x", padx=20, pady=(0, 10))
        self._progress.start(12)

        # ── Step 1 – Voice sample
        self._card("Step 1 — Record Your Voice", self._build_record_section)

        # ── Step 2 – Text input
        self._card("Step 2 — Enter Text", self._build_text_section, expand=True)

        # ── Step 3 – Generate
        self._card("Step 3 — Generate", self._build_generate_section)

    def _card(self, title: str, builder, expand=False):
        """Add a titled, bordered card and let *builder(inner_frame)* fill it.

        *expand* controls whether the card grows with the window.
        """
        outer = tk.Frame(self.root, bg=BG)
        outer.pack(fill="both", expand=expand, padx=16, pady=4)

        frame = tk.Frame(outer, bg=SURFACE, bd=0,
                         highlightthickness=1, highlightbackground=BORDER)
        frame.pack(fill="both", expand=expand, padx=0, pady=0)

        tk.Label(frame, text=title, bg=SURFACE, fg=ACCENT2,
                 font=("Segoe UI Semibold", 10)).pack(anchor="w", padx=14, pady=(10, 0))

        sep = tk.Frame(frame, bg=BORDER, height=1)
        sep.pack(fill="x", padx=14, pady=(6, 0))

        inner = tk.Frame(frame, bg=SURFACE)
        inner.pack(fill="both", expand=expand, padx=14, pady=10)
        builder(inner)

    def _build_record_section(self, parent):
        """Populate the Step 1 card: status line, record/load buttons, timer."""
        self._voice_status_var = tk.StringVar(value="No voice sample yet — record one below.")
        self._voice_status_lbl = tk.Label(parent, textvariable=self._voice_status_var,
                                          bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                          anchor="w", wraplength=560)
        self._voice_status_lbl.pack(fill="x", pady=(0, 8))

        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._rec_btn = ttk.Button(btn_row, text="⏺ Start Recording",
                                   command=self._toggle_recording, style="Accent.TButton")
        self._rec_btn.pack(side="left", padx=(0, 8))

        ttk.Button(btn_row, text="📂 Load Audio File",
                   command=self._load_voice_file, style="Ghost.TButton").pack(side="left")

        self._rec_timer_var = tk.StringVar(value="")
        tk.Label(parent, textvariable=self._rec_timer_var,
                 bg=SURFACE, fg=ACCENT, font=("Segoe UI Semibold", 9)).pack(anchor="w", pady=(6, 0))

        tk.Label(parent, text="Tip: Speak naturally for 10–30 s. Clear audio = better cloning.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(2, 0))

    def _build_text_section(self, parent):
        """Populate the Step 2 card with the multi-line text input."""
        self._text_input = tk.Text(parent, height=6, wrap="word",
                                   bg="#12121a", fg=TEXT, insertbackground=ACCENT2,
                                   font=("Segoe UI", 11), relief="flat", bd=0,
                                   padx=10, pady=8, selectbackground=ACCENT)
        self._text_input.pack(fill="both", expand=True)
        self._text_input.insert("1.0", "Hello! This is my voice, speaking through a computer.")

        tk.Frame(parent, bg=BORDER, height=1).pack(fill="x", pady=(6, 0))
        tk.Label(parent, text="Supports English, Spanish, French, German, Italian, and more.",
                 bg=SURFACE, fg=MUTED, font=("Segoe UI", 8)).pack(anchor="w", pady=(4, 0))

    def _build_generate_section(self, parent):
        """Populate the Step 3 card: three generate buttons and a status line.

        The buttons start disabled; ``_on_tts_ready`` enables them.
        """
        btn_row = tk.Frame(parent, bg=SURFACE)
        btn_row.pack(fill="x")

        self._play_btn = ttk.Button(btn_row, text="▶ Generate & Play",
                                    command=lambda: self._generate(play=True, save=False),
                                    style="Accent.TButton", state="disabled")
        self._play_btn.pack(side="left", padx=(0, 8))

        self._save_btn = ttk.Button(btn_row, text="💾 Generate & Save",
                                    command=lambda: self._generate(play=False, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._save_btn.pack(side="left", padx=(0, 8))

        self._both_btn = ttk.Button(btn_row, text="▶💾 Play & Save",
                                    command=lambda: self._generate(play=True, save=True),
                                    style="Ghost.TButton", state="disabled")
        self._both_btn.pack(side="left")

        self._gen_status_var = tk.StringVar(value="")
        self._gen_status_lbl = tk.Label(parent, textvariable=self._gen_status_var,
                                        bg=SURFACE, fg=MUTED, font=("Segoe UI", 9),
                                        anchor="w", wraplength=560)
        self._gen_status_lbl.pack(fill="x", pady=(8, 0))

    # ── TTS loading ───────────────────────────────────────────────────────────

    def _load_tts_async(self):
        """Import and instantiate the XTTS v2 model on a daemon thread.

        The outcome is reported on the Tk main loop through ``_on_tts_ready``
        or ``_on_tts_error``.
        """
        def _load():
            try:
                from TTS.api import TTS  # type: ignore
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
            except Exception as exc:
                # Pass the exception as an argument now: the name ``exc`` is
                # unbound as soon as this handler exits.
                self.root.after(0, self._on_tts_error, exc)
            else:
                self.root.after(0, self._on_tts_ready)

        threading.Thread(target=_load, daemon=True).start()

    def _on_tts_ready(self):
        """Main-loop callback: the model loaded, so enable generation."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_header_status("Model ready", SUCCESS)
        self._enable_generate_btns()
        self._set_gen_status("Ready — enter text and click Generate.", MUTED)

    def _on_tts_error(self, exc):
        """Main-loop callback: show why the model could not be loaded."""
        self._progress.stop()
        self._set_header_status("Model failed to load", ERROR)
        self._set_gen_status(f"Error: {exc}", ERROR)
        messagebox.showerror("TTS Load Error",
                             f"Could not load the XTTS v2 model:\n{exc}\n\n"
                             "Make sure you've run: pip install TTS")

    # ── Recording ─────────────────────────────────────────────────────────────

    def _toggle_recording(self):
        """Start recording if idle, otherwise stop and save."""
        if self.recording:
            self._stop_recording()
        else:
            self._start_recording()

    def _start_recording(self):
        """Begin capturing microphone audio on a daemon thread."""
        self.recording = True
        # Fresh list per take; the worker appends to the list it captured, so
        # a straggling previous worker can never pollute the new take.
        chunks: list = []
        self.recorded_chunks = chunks
        self._rec_btn.config(text="⏹ Stop Recording")
        self._set_voice_status("🔴 Recording… speak naturally for 10–30 seconds.", WARNING)

        # Restart the timer from zero and make sure only one tick chain runs,
        # even if the user stops and restarts within the same second.
        self._cancel_tick()
        self._elapsed = 0
        self._rec_timer_var.set("0 s")
        self._tick_job = self.root.after(1000, self._tick)

        def _record():
            try:
                with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, dtype="float32") as stream:
                    while self.recording:
                        data, _ = stream.read(1024)
                        chunks.append(data.copy())
            except Exception as exc:
                # Top-level boundary of the worker thread: report the failure
                # (e.g. no input device) instead of dying silently.
                self.root.after(0, self._on_record_error, str(exc))

        self._record_thread = threading.Thread(target=_record, daemon=True)
        self._record_thread.start()

    def _on_record_error(self, message: str):
        """Main-loop callback: the recorder thread failed; reset the UI."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")
        self._set_voice_status(f"Recording failed: {message}", ERROR)

    def _cancel_tick(self):
        """Cancel the pending timer callback, if any."""
        if self._tick_job is not None:
            self.root.after_cancel(self._tick_job)
            self._tick_job = None

    def _tick(self):
        """Advance the on-screen recording timer by one second."""
        self._tick_job = None
        if not self.recording:
            self._rec_timer_var.set("")
            return
        self._elapsed += 1
        self._rec_timer_var.set(f"{self._elapsed} s recorded")
        self._tick_job = self.root.after(1000, self._tick)

    def _stop_recording(self):
        """Stop the recorder, then save the take as the active voice sample."""
        self.recording = False
        self._cancel_tick()
        self._rec_timer_var.set("")
        self._rec_btn.config(text="⏺ Start Recording")

        # Wait for the worker to leave its read loop so no chunk is appended
        # while we concatenate.  The timeout keeps the UI from hanging if the
        # audio backend stalls.
        worker = self._record_thread
        self._record_thread = None
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=2.0)

        chunks = list(self.recorded_chunks)
        if not chunks:
            self._set_voice_status("Nothing recorded — try again.", MUTED)
            return

        audio_i16 = self._float_to_int16(np.concatenate(chunks, axis=0))
        # Always write to the app's own sample file.  ``voice_sample_path``
        # may point at a file the user loaded, which must not be overwritten.
        try:
            wav.write(VOICE_SAMPLE_FILE, SAMPLE_RATE, audio_i16)
        except OSError as exc:
            self._set_voice_status(f"Could not save recording: {exc}", ERROR)
            return
        self.voice_sample_path = VOICE_SAMPLE_FILE

        dur = len(audio_i16) / SAMPLE_RATE
        self._set_voice_status(
            f"✓ Saved {dur:.1f}s sample → {os.path.abspath(self.voice_sample_path)}", SUCCESS)

    def _load_voice_file(self):
        """Let the user pick an existing audio file as the voice sample."""
        path = filedialog.askopenfilename(
            title="Select voice sample",
            filetypes=[("Audio files", "*.wav *.mp3 *.flac *.ogg *.m4a"), ("All files", "*.*")])
        if path:
            self.voice_sample_path = path
            self._set_voice_status(f"✓ Loaded: {os.path.basename(path)}", SUCCESS)

    # ── Audio helpers ─────────────────────────────────────────────────────────

    @staticmethod
    def _float_to_int16(audio: np.ndarray) -> np.ndarray:
        """Scale float samples by 32767 and clip into the int16 range."""
        return (audio * 32767).clip(-32768, 32767).astype(np.int16)

    @staticmethod
    def _to_mono_float32(data: np.ndarray) -> np.ndarray:
        """Return *data* as mono float32, scaling integer PCM to about [-1, 1).

        Unsigned 8-bit input is treated as offset-binary (silence at 128),
        other integer dtypes are divided by their full-scale value, and
        floating-point input is passed through unscaled.  Multi-channel input
        (frames, channels) is averaged down to one channel.
        """
        if data.dtype == np.uint8:
            audio = (data.astype(np.float32) - 128.0) / 128.0
        elif np.issubdtype(data.dtype, np.integer):
            audio = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
        else:
            audio = data.astype(np.float32)
        if audio.ndim > 1:
            audio = audio.mean(axis=1)
        return audio

    def _play_file(self, path: str):
        """Play the WAV file at *path* and block until playback finishes."""
        sr, data = wav.read(path)
        sd.play(self._to_mono_float32(data), samplerate=sr)
        sd.wait()

    # ── Generation ────────────────────────────────────────────────────────────

    def _generate(self, play: bool, save: bool, language: str = "en"):
        """Synthesize the entered text in the sampled voice.

        Args:
            play: Play the result through the default output device.
            save: Ask for a destination and write the WAV there; otherwise a
                scratch file in the system temp directory is used.
            language: Language code passed to the model (default ``"en"``).

        Input problems are reported with message boxes.  Synthesis runs on a
        daemon thread; its result or error appears in the Step 3 status line.
        """
        if self.tts is None:
            messagebox.showerror("Not ready", "TTS model is still loading.")
            return

        if self.recording:
            # Generation disables the record button, which would leave an
            # active recording impossible to stop until synthesis finishes.
            messagebox.showerror("Recording in progress",
                                 "Please stop the recording before generating speech.")
            return

        if not os.path.exists(self.voice_sample_path):
            messagebox.showerror("No voice sample",
                                 "Please record your voice or load an audio file first.")
            return

        text = self._text_input.get("1.0", "end").strip()
        if not text:
            messagebox.showerror("No text", "Please enter some text to speak.")
            return

        save_path: str | None = None
        if save:
            save_path = filedialog.asksaveasfilename(
                defaultextension=".wav",
                filetypes=[("WAV audio", "*.wav")],
                title="Save generated speech")
            if not save_path:
                return  # user cancelled the dialog

        self._set_btns_state("disabled")
        self._progress.start(12)
        self._set_gen_status("⏳ Generating speech (this may take ~10–30 s)…", ACCENT2)

        # Snapshot the sample path so the worker is unaffected if the user
        # loads a different file while synthesis is running.
        speaker_wav = self.voice_sample_path

        def _run():
            try:
                out = save_path or os.path.join(tempfile.gettempdir(), "vcl_output.wav")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=speaker_wav,
                    language=language,
                    file_path=out,
                )

                if play:
                    self._play_file(out)

                msg = "✓ Done!"
                if save_path:
                    msg += f" Saved → {save_path}"
                self.root.after(0, self._set_gen_status, msg, SUCCESS)
            except Exception as exc:
                # Build the message here: ``exc`` is unbound once the handler
                # exits, so a deferred lambda reading it would raise NameError.
                self.root.after(0, self._set_gen_status, f"Error: {exc}", ERROR)
            finally:
                self.root.after(0, self._on_generate_done)

        threading.Thread(target=_run, daemon=True).start()

    def _on_generate_done(self):
        """Main-loop callback: stop the progress bar and re-enable buttons."""
        self._progress.stop()
        self._progress.config(value=0)
        self._set_btns_state("normal")

    # ── Helpers ───────────────────────────────────────────────────────────────

    def _set_voice_status(self, msg: str, color: str = TEXT):
        """Show *msg* in the Step 1 status line using *color*."""
        self._voice_status_var.set(msg)
        self._voice_status_lbl.config(fg=color)

    def _set_gen_status(self, msg: str, color: str = TEXT):
        """Show *msg* in the Step 3 status line using *color*."""
        self._gen_status_var.set(msg)
        self._gen_status_lbl.config(fg=color)

    def _set_header_status(self, msg: str, color: str = TEXT):
        """Update the header status text and tint the status dot to match."""
        self._header_status.config(text=msg, fg=color)
        self._status_dot.config(fg=color)

    def _enable_generate_btns(self):
        """Enable the three generate buttons."""
        for btn in (self._play_btn, self._save_btn, self._both_btn):
            btn.config(state="normal")

    def _set_btns_state(self, state: str):
        """Apply *state* to the generate buttons and the record button."""
        for btn in (self._play_btn, self._save_btn, self._both_btn, self._rec_btn):
            btn.config(state=state)


# ── Entry point ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    root = tk.Tk()
    app = VoiceCloneTTS(root)
    root.mainloop()