Merge 6ddfcd9745 into fc674574ca

2026-04-21 03:34:38 +02:00 · 2026-04-21 03:34:38 +02:00 · c9cd97a5a3
parent fc674574ca 6ddfcd9745
commit c9cd97a5a3
2 changed files with 172 additions and 2 deletions
--- a/examples/talk-llama/minimax-tts.py
+++ b/examples/talk-llama/minimax-tts.py
@ -0,0 +1,159 @
+"""MiniMax text-to-speech client for the whisper.cpp talk-llama example.
+
+Reads text from a file or stdin, requests streamed MP3 audio from the
+MiniMax /v1/t2a_v2 endpoint, then saves it to a file or plays it with ffplay.
+"""
+import sys
+import os
+import json
+import argparse
+import subprocess
+import tempfile
+import urllib.request
+
+MINIMAX_VOICES = [
+    'English_Graceful_Lady',
+    'English_Insightful_Speaker',
+    'English_radiant_girl',
+    'English_Persuasive_Man',
+    'English_Lucky_Robot',
+    'English_expressive_narrator',
+]
+
+
+def parse_args():
+    """Build the command-line parser and return the parsed arguments."""
+    parser = argparse.ArgumentParser(add_help=False,
+        formatter_class=argparse.RawTextHelpFormatter,
+        description='MiniMax TTS client for whisper.cpp talk-llama example')
+
+    modes = parser.add_argument_group("action")
+    modes.add_argument("inputfile", metavar="TEXTFILE",
+        nargs='?', type=argparse.FileType(), default=sys.stdin,
+        help="read the text file (default: stdin)")
+    modes.add_argument("-l", "--list", action="store_true",
+        help="show the list of voices and exit")
+    modes.add_argument("-h", "--help", action="help",
+        help="show this help and exit")
+
+    selopts = parser.add_argument_group("voice selection")
+    selmodes = selopts.add_mutually_exclusive_group()
+    selmodes.add_argument("-n", "--name", default="English_Graceful_Lady",
+        help="voice ID to use (default: English_Graceful_Lady)")
+    selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
+        help="voice by index number (see --list)")
+
+    outmodes = parser.add_argument_group("output")
+    outgroup = outmodes.add_mutually_exclusive_group()
+    outgroup.add_argument("-s", "--save", metavar="FILE", default="audio.mp3",
+        help="save the TTS to a file (default: audio.mp3)")
+    outgroup.add_argument("-p", "--play", action="store_true",
+        help="play the TTS with ffplay")
+
+    apiopts = parser.add_argument_group("API options")
+    apiopts.add_argument("-k", "--api-key", metavar="KEY",
+        default=os.environ.get("MINIMAX_API_KEY", ""),
+        help="MiniMax API key (default: $MINIMAX_API_KEY)")
+    apiopts.add_argument("-m", "--model", default="speech-2.8-hd",
+        help="TTS model to use (default: speech-2.8-hd)")
+    apiopts.add_argument("-b", "--base-url",
+        default=os.environ.get("MINIMAX_BASE_URL", "https://api.minimax.io"),
+        help="MiniMax base URL (default: https://api.minimax.io)")
+    return parser.parse_args()
+
+
+def synthesize(args, voice_id, text):
+    """Request speech for *text* and return the decoded MP3 bytes.
+
+    Posts a streaming request built from args.base_url, args.model and
+    args.api_key; the result is empty when no audio event was received.
+    """
+    url = args.base_url.rstrip("/") + "/v1/t2a_v2"
+    payload = json.dumps({
+        "model": args.model,
+        "text": text,
+        "stream": True,
+        "voice_setting": {"voice_id": voice_id, "speed": 1, "vol": 1, "pitch": 0},
+        "audio_setting": {"sample_rate": 32000, "bitrate": 128000,
+                          "format": "mp3", "channel": 1},
+    }).encode("utf-8")
+    req = urllib.request.Request(url, data=payload, method="POST")
+    req.add_header("Content-Type", "application/json")
+    req.add_header("Authorization", "Bearer " + args.api_key)
+
+    chunks = []
+    with urllib.request.urlopen(req) as resp:
+        # Only "data: {json}" lines are consumed; everything else is ignored.
+        for raw_line in resp:
+            line = raw_line.decode("utf-8", errors="replace").rstrip("\n\r")
+            if not line.startswith("data:"):
+                continue
+            json_str = line[5:].strip()
+            if not json_str or json_str == "[DONE]":
+                continue
+            try:
+                event = json.loads(json_str)
+            except json.JSONDecodeError:
+                continue  # best effort: ignore malformed events
+            if not isinstance(event, dict) or "extra_info" in event:
+                # NOTE(review): the event carrying "extra_info" appears to be
+                # the closing summary that repeats the whole audio; MiniMax's
+                # own streaming sample skips it -- confirm against the docs.
+                continue
+            data = event.get("data")
+            # "data" may be missing or null, so guard before calling .get().
+            audio_hex = data.get("audio") if isinstance(data, dict) else None
+            if not isinstance(audio_hex, str):
+                continue
+            try:
+                chunks.append(bytes.fromhex(audio_hex))
+            except ValueError:
+                pass  # not valid hex: skip this chunk
+    return b"".join(chunks)
+
+
+def main():
+    """Run the command-line client and return the process exit status."""
+    args = parse_args()
+    if args.list:
+        for i, v in enumerate(MINIMAX_VOICES):
+            print(str(i) + ": " + v)
+        return 0
+    if not args.api_key:
+        print("MiniMax API key is required. Set MINIMAX_API_KEY environment variable or use -k.",
+              file=sys.stderr)
+        return 1
+    if args.voice is not None:
+        # Out-of-range indexes wrap around instead of raising IndexError.
+        voice_id = MINIMAX_VOICES[args.voice % len(MINIMAX_VOICES)]
+    else:
+        voice_id = args.name
+    text = args.inputfile.read()
+    try:
+        audio = synthesize(args, voice_id, text)
+    except OSError as err:  # covers urllib's URLError/HTTPError and socket errors
+        print("MiniMax TTS request failed: " + str(err), file=sys.stderr)
+        return 1
+    if not audio:
+        print("MiniMax TTS returned no audio.", file=sys.stderr)
+        return 1
+    if args.play:
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
+            tmp.write(audio)
+            tmp_path = tmp.name
+        try:
+            subprocess.run(
+                ["ffplay", "-autoexit", "-nodisp", "-loglevel", "quiet",
+                 "-hide_banner", "-i", tmp_path],
+                check=False,
+            )
+        finally:
+            os.unlink(tmp_path)
+    else:
+        with open(args.save, "wb") as f:
+            f.write(audio)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/examples/talk-llama/speak
+++ b/examples/talk-llama/speak
@ -32,9 +32,20 @@ elif installed python3 && \
    #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
    #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

+# MiniMax TTS
+elif [ -n "$MINIMAX_API_KEY" ] && installed python3 && installed ffplay; then
+    wd=$(dirname $0)
+    script=$wd/minimax-tts.py
+    python3 $script -p -v $1 $2 >/dev/null 2>&1
+
+    # Uncomment to keep the audio file
+    #python3 $script -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
+    #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+
 else
  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
-  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
-  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
+  echo 'or elevenlabs ("pip install elevenlabs") with ffplay,'
+  echo 'or set MINIMAX_API_KEY and install ffplay for MiniMax TTS.'
+  echo '(See https://platform.minimax.io for a MiniMax API key)'
 fi