speaches-ai · fedirz · Jan 26, 2025 · Jan 26, 2025
diff --git a/docs/usage/text-to-speech.md b/docs/usage/text-to-speech.md
@@ -14,8 +14,8 @@ Download the Kokoro model and voices.
 export KOKORO_REVISION=c97b7bbc3e60f447383c79b2f94fee861ff156ac
 # Download the ONNX model (~346 MBs)
 docker exec -it speaches huggingface-cli download hexgrad/Kokoro-82M --include 'kokoro-v0_19.onnx' --revision $KOKORO_REVISION
-# Download the voices.json (~54 MBs) file
-docker exec -it speaches curl --location --output /home/ubuntu/.cache/huggingface/hub/models--hexgrad--Kokoro-82M/snapshots/$KOKORO_REVISION/voices.json https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.json
+# Download the voices.bin file (~5.5 MB)
+docker exec -it speaches curl --location --output /home/ubuntu/.cache/huggingface/hub/models--hexgrad--Kokoro-82M/snapshots/$KOKORO_REVISION/voices.bin https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin
 ```
 
 !!! note

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,9 +8,8 @@ dependencies = [
     "fastapi>=0.115.6",
     "faster-whisper>=1.1.1",
     "huggingface-hub[hf-transfer]>=0.25.1",
-    "kokoro-onnx>=0.2.2",
+    "kokoro-onnx[gpu]>=0.3.6,<0.4.0",
     "numpy>=2.1.1",
-    "onnxruntime-gpu>=1.20.1 ; platform_machine == 'x86_64'",
     "piper-phonemize ; platform_machine == 'x86_64'",
     "piper-tts>=1.2.0 ; platform_machine == 'x86_64'",
     "pydantic-settings>=2.5.2",

diff --git a/src/speaches/hf_utils.py b/src/speaches/hf_utils.py
@@ -232,9 +232,9 @@ def download_kokoro_model() -> None:
     )
     # HACK
     res = httpx.get(
-        "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.json", follow_redirects=True
+        "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin", follow_redirects=True
     ).raise_for_status()
-    voices_path = model_repo_path / "voices.json"
+    voices_path = model_repo_path / "voices.bin"
     voices_path.touch(exist_ok=True)
     voices_path.write_bytes(res.content)
 

diff --git a/src/speaches/model_manager.py b/src/speaches/model_manager.py
@@ -201,7 +201,7 @@ def __init__(self, ttl: int) -> None:
     # TODO
     def _load_fn(self, _model_id: str) -> Kokoro:
         model_path = get_kokoro_model_path()
-        voices_path = model_path.parent / "voices.json"
+        voices_path = model_path.parent / "voices.bin"
         inf_sess = InferenceSession(model_path, providers=ONNX_PROVIDERS)
         return Kokoro.from_session(inf_sess, str(voices_path))
 

diff --git a/src/speaches/routers/speech.py b/src/speaches/routers/speech.py
@@ -134,7 +134,7 @@ async def synthesize(
 ) -> StreamingResponse:
     match body.model:
         case "hexgrad/Kokoro-82M":
-            # TODO: download the `voices.json` file
+            # TODO: download the `voices.bin` file
             with kokoro_model_manager.load_model(body.voice) as tts:
                 audio_generator = kokoro_utils.generate_audio(
                     tts,