diff --git a/examples/talk-llama/minimax-tts.py b/examples/talk-llama/minimax-tts.py
new file mode 100644
index 00000000..ff110323
--- /dev/null
+++ b/examples/talk-llama/minimax-tts.py
@@ -0,0 +1,227 @@
+"""MiniMax text-to-speech client for the whisper.cpp talk-llama example.
+
+Reads text from a file (or stdin), requests streamed MP3 audio from the
+MiniMax ``/v1/t2a_v2`` endpoint, then plays it with ffplay or saves it.
+"""
+
+import sys
+import os
+import json
+import argparse
+import subprocess
+import tempfile
+import urllib.error
+import urllib.request
+
+MINIMAX_VOICES = [
+    'English_Graceful_Lady',
+    'English_Insightful_Speaker',
+    'English_radiant_girl',
+    'English_Persuasive_Man',
+    'English_Lucky_Robot',
+    'English_expressive_narrator',
+]
+
+# Seconds a connect or read may stall before the request is abandoned.
+REQUEST_TIMEOUT = 60
+
+
+def build_parser():
+    """Return the argument parser describing the command-line interface."""
+    parser = argparse.ArgumentParser(
+        add_help=False,
+        formatter_class=argparse.RawTextHelpFormatter,
+        description='MiniMax TTS client for whisper.cpp talk-llama example')
+
+    modes = parser.add_argument_group("action")
+    modes.add_argument(
+        "inputfile", metavar="TEXTFILE",
+        nargs='?', type=argparse.FileType(), default=sys.stdin,
+        help="read the text file (default: stdin)")
+    modes.add_argument(
+        "-l", "--list", action="store_true",
+        help="show the list of voices and exit")
+    modes.add_argument(
+        "-h", "--help", action="help",
+        help="show this help and exit")
+
+    selopts = parser.add_argument_group("voice selection")
+    selmodes = selopts.add_mutually_exclusive_group()
+    selmodes.add_argument(
+        "-n", "--name",
+        default="English_Graceful_Lady",
+        help="voice ID to use (default: English_Graceful_Lady)")
+    selmodes.add_argument(
+        "-v", "--voice", type=int, metavar="NUMBER",
+        help="voice by index number (see --list)")
+
+    outmodes = parser.add_argument_group("output")
+    outgroup = outmodes.add_mutually_exclusive_group()
+    outgroup.add_argument(
+        "-s", "--save", metavar="FILE",
+        default="audio.mp3",
+        help="save the TTS to a file (default: audio.mp3)")
+    outgroup.add_argument(
+        "-p", "--play", action="store_true",
+        help="play the TTS with ffplay")
+
+    apiopts = parser.add_argument_group("API options")
+    apiopts.add_argument(
+        "-k", "--api-key", metavar="KEY",
+        default=os.environ.get("MINIMAX_API_KEY", ""),
+        help="MiniMax API key (default: $MINIMAX_API_KEY)")
+    apiopts.add_argument(
+        "-m", "--model",
+        default="speech-2.8-hd",
+        help="TTS model to use (default: speech-2.8-hd)")
+    apiopts.add_argument(
+        "-b", "--base-url",
+        default=os.environ.get("MINIMAX_BASE_URL", "https://api.minimax.io"),
+        help="MiniMax base URL (default: https://api.minimax.io)")
+
+    return parser
+
+
+def parse_event(line):
+    """Return the JSON object carried by a ``data:`` stream line, else None.
+
+    Blank payloads, the ``[DONE]`` marker, malformed JSON and non-object
+    JSON values are all reported as None so the caller can skip them.
+    """
+    if not line.startswith("data:"):
+        return None
+    json_str = line[5:].strip()
+    if not json_str or json_str == "[DONE]":
+        return None
+    try:
+        event = json.loads(json_str)
+    except json.JSONDecodeError:
+        return None
+    return event if isinstance(event, dict) else None
+
+
+def collect_audio(lines):
+    """Assemble the audio bytes from an iterable of raw response lines.
+
+    Each event's hex-encoded ``data.audio`` field is decoded and the pieces
+    are joined in arrival order. Events that cannot be decoded are skipped.
+    """
+    chunks = []
+    summary = b""
+    for raw_line in lines:
+        line = raw_line.decode("utf-8", errors="replace").rstrip("\n\r")
+        event = parse_event(line)
+        if event is None:
+            continue
+        # "data" may be present but null, so .get()'s default is not enough.
+        data = event.get("data") or {}
+        audio_hex = data.get("audio") if isinstance(data, dict) else None
+        if not audio_hex:
+            continue
+        try:
+            chunk = bytes.fromhex(audio_hex)
+        except (TypeError, ValueError):
+            continue
+        # NOTE(review): MiniMax's streaming sample skips events that carry
+        # "extra_info" (the closing event), which suggests it repeats audio
+        # already streamed; confirm against the T2A v2 reference.
+        if "extra_info" in event:
+            summary = chunk
+        else:
+            chunks.append(chunk)
+    # Fall back to the closing event only when nothing was streamed.
+    return b"".join(chunks) or summary
+
+
+def synthesize(args, voice_id, text):
+    """Request speech for *text* and return the audio bytes.
+
+    Exits with an error message on stderr if the request fails.
+    """
+    url = args.base_url.rstrip("/") + "/v1/t2a_v2"
+    payload = json.dumps({
+        "model": args.model,
+        "text": text,
+        "stream": True,
+        "voice_setting": {
+            "voice_id": voice_id,
+            "speed": 1,
+            "vol": 1,
+            "pitch": 0,
+        },
+        "audio_setting": {
+            "sample_rate": 32000,
+            "bitrate": 128000,
+            "format": "mp3",
+            "channel": 1,
+        },
+    }).encode("utf-8")
+
+    req = urllib.request.Request(url, data=payload, method="POST")
+    req.add_header("Content-Type", "application/json")
+    req.add_header("Authorization", "Bearer " + args.api_key)
+
+    try:
+        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
+            return collect_audio(resp)
+    except urllib.error.HTTPError as err:
+        sys.exit("MiniMax request failed: HTTP {} {}".format(
+            err.code, err.reason))
+    except OSError as err:
+        # Covers URLError (DNS, refused connection) and socket timeouts.
+        sys.exit("MiniMax request failed: {}".format(err))
+
+
+def play(audio):
+    """Play *audio* with ffplay via a temporary file that is always removed."""
+    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+    try:
+        # Close (and flush) the file before handing its path to ffplay.
+        with tmp:
+            tmp.write(audio)
+        subprocess.run(
+            ["ffplay", "-autoexit", "-nodisp", "-loglevel", "quiet",
+             "-hide_banner", "-i", tmp.name],
+            check=False,
+        )
+    finally:
+        os.unlink(tmp.name)
+
+
+def main():
+    """Parse the command line, synthesize the text, then play or save it."""
+    args = build_parser().parse_args()
+
+    if args.list:
+        for i, v in enumerate(MINIMAX_VOICES):
+            print(str(i) + ": " + v)
+        sys.exit()
+
+    if not args.api_key:
+        sys.exit("MiniMax API key is required. "
+                 "Set MINIMAX_API_KEY environment variable or use -k.")
+
+    if args.voice is not None:
+        # Wrap so any number the caller passes maps onto a known voice.
+        voice_id = MINIMAX_VOICES[args.voice % len(MINIMAX_VOICES)]
+    else:
+        voice_id = args.name
+
+    text = args.inputfile.read()
+    if not text.strip():
+        sys.exit("No text to synthesize.")
+
+    audio = synthesize(args, voice_id, text)
+    if not audio:
+        # Do not play or save an empty file as if synthesis had succeeded.
+        sys.exit("MiniMax returned no audio.")
+
+    if args.play:
+        play(audio)
+    else:
+        with open(args.save, "wb") as f:
+            f.write(audio)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/talk-llama/speak b/examples/talk-llama/speak
index 31ea417a..09d60ee9 100755
--- a/examples/talk-llama/speak
+++ b/examples/talk-llama/speak
@@ -32,9 +32,21 @@ elif installed python3 && \
   #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
   #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
 
+# MiniMax TTS
+elif [ -n "$MINIMAX_API_KEY" ] && installed python3 && installed ffplay; then
+  wd=$(dirname "$0")
+  script=$wd/minimax-tts.py
+  python3 "$script" -p -v "$1" "$2" >/dev/null 2>&1
+
+  # Uncomment to keep the audio file
+  #python3 "$script" -s ./audio.mp3 -v "$1" "$2" >/dev/null 2>&1
+  #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+
 else
   echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
   echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
-  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
+  echo 'or elevenlabs ("pip install elevenlabs") with ffplay,'
+  echo 'or set MINIMAX_API_KEY and install ffplay for MiniMax TTS.'
   echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
+  echo '(See https://platform.minimax.io for a MiniMax API key)'
 fi