diff --git a/.gitignore b/.gitignore index f930361..c49a349 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +.benchmarks/ # Translations *.mo @@ -168,9 +169,18 @@ cython_debug/ # Ruff stuff: .ruff_cache/ +**/output.wav # PyPI configuration file .pypirc -# Examples -**/output.wav \ No newline at end of file +# Temporary files / directories +.tmp/ +tmp/ + +# Model caches +.models/ +.cache/ +*.onnx +*.pkf +.claude \ No newline at end of file diff --git a/examples/tts/tts_autoplay/README.md b/examples/tts/tts_autoplay/README.md new file mode 100644 index 0000000..a1544b5 --- /dev/null +++ b/examples/tts/tts_autoplay/README.md @@ -0,0 +1,43 @@ +# Speechmatics TTS Async Streaming API Client + +This example shows how to use the Speechmatics TTS API to generate audio from text and autoplay it using sounddevice through the system's default audio output device. +You must have an audio output device configured on your system for this example to work. ## How it Works + +There are two main components in this example, an audio generator and an audio player. These components are run concurrently using asyncio as tasks, orchestrated by the main() function, to generate and play audio in real-time. ### audio_generator() + +This producer function connects to the Speechmatics TTS API using the AsyncClient. It calls client.generate() with your text, the voice you want to use, and the output format - RAW_PCM_16000 in this example. +The code iterates over the audio data as it is streamed in chunks (iter_chunked), and accumulates in a bytearray buffer. +The while len(buffer) >= 2 loop reads each audio sample containing 2 bytes, from the buffer, and converts it to a signed 16-bit integer, which is then put into the audio_queue. +The processed 2 byte sample is then removed from the front of the buffer. +END_OF_STREAM is used as a sentinel value to signal the end of the audio stream, with no more audio data to process.
+If an error occurs during audio generation, the END_OF_STREAM sentinel value is still put into the queue to signal the end of the audio stream to prevent the consumer, audio_player(), from getting stuck in an infinite loop, and the exception is then re-raised. +### audio_player() + +This consumer function initialises a sounddevice OutputStream, which is responsible for streaming the audio data to the default audio output device. Within the OutputStream, the while True loop means there is continuous processing of the incoming audio data. +sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) fetches the next sample from the queue, or waits up to 0.1 seconds if the queue is empty. +If the sample is END_OF_STREAM, the while loop breaks and the audio player exits. +If the sample is not END_OF_STREAM, it is converted to a numpy array of int-16 values and written to the audio output device using the sounddevice OutputStream. +play_queue.task_done() is called to signal that the sample has been processed. +If an error occurs during audio playback, the error is printed and the exception is re-raised.
+ +## Installation + +```bash +pip install speechmatics-tts +``` + +## Usage + +To run the example, use the following command: + +```bash +python tts_stream_example.py +``` + +## Environment Variables + +The client supports the following environment variables: + +- `SPEECHMATICS_API_KEY`: Your Speechmatics API key diff --git a/examples/tts/tts_autoplay/requirements.txt b/examples/tts/tts_autoplay/requirements.txt new file mode 100644 index 0000000..d4ef65f --- /dev/null +++ b/examples/tts/tts_autoplay/requirements.txt @@ -0,0 +1,3 @@ +sounddevice>=0.4.6 +numpy>=1.24.3 +speechmatics-tts>=0.1.0 diff --git a/examples/tts/tts_autoplay/tts_stream_example.py b/examples/tts/tts_autoplay/tts_stream_example.py new file mode 100644 index 0000000..fc97db0 --- /dev/null +++ b/examples/tts/tts_autoplay/tts_stream_example.py @@ -0,0 +1,121 @@ +import asyncio +import sounddevice as sd +import numpy as np +from speechmatics.tts import AsyncClient, Voice, OutputFormat + +# Configuration +TEXT = "Welcome to the future of audio generation from text! This audio is a demo of the async streaming Speechmatics Text-to-Speech (TTS) API." +VOICE = Voice.JACK +OUTPUT_FORMAT = OutputFormat.RAW_PCM_16000 + +# Audio Parameters +SAMPLE_RATE = 16000 #Hz +SAMPLE_WIDTH = 2 # 16-bit audio +CHANNELS = 1 # Mono audio +CHUNK_SIZE = 2048 # Size of audio chunks +BUFFER_SIZE = 4096 # Size of buffer + +# Sentinel value to signal end of stream +END_OF_STREAM = None + + +# Core Async Functions + +# 1. Producer: Generates audio and puts chunks into the queue: + +async def audio_generator(audio_queue: asyncio.Queue, text: str, voice: str, output_format: str) -> None: +# Generate speech and stream audio chunks into the queue. 
+ + try: + async with AsyncClient() as client, await client.generate( + text=text, + voice=voice, + output_format=output_format + ) as response: + buffer=bytearray() + async for chunk in response.content.iter_chunked(BUFFER_SIZE): + if not chunk: + continue + buffer.extend(chunk) + + # Process complete frames (2 bytes per sample for 16-bit audio) + # Convert little-endian 16-bit signed int to np.int-16 + while len(buffer) >= 2: + sample = int.from_bytes(buffer[:2], byteorder='little', signed=True) + await audio_queue.put(sample) + buffer = buffer[2:] + + await audio_queue.put(END_OF_STREAM) + print("Audio generated and put into queue.") + + except Exception as e: + print(f"[{'Generator'}] An error occurred in the audio generator: {e}") + await audio_queue.put(END_OF_STREAM) + raise + +# 2. Consumer: Read audio data from queue and play it in real-time using sounddevice. +async def audio_player(play_queue: asyncio.Queue) -> None: + try: + with sd.OutputStream( + samplerate=SAMPLE_RATE, + channels=CHANNELS, + dtype='int16', # 16-bit PCM + blocksize=CHUNK_SIZE, + latency='high', + ) as stream: + buffer=[] + while True: + try: + sample = await asyncio.wait_for(play_queue.get(), timeout=0.1) + if sample is END_OF_STREAM: + if buffer: + audio_data=np.array(buffer, dtype=np.int16) + stream.write(audio_data) + buffer=[] + break + + buffer.append(sample) + if len(buffer) >= CHUNK_SIZE: + audio_data=np.array(buffer[:CHUNK_SIZE], dtype=np.int16) + stream.write(audio_data) + buffer=buffer[CHUNK_SIZE:] + + play_queue.task_done() + + except asyncio.TimeoutError: + if buffer: + audio_data=np.array(buffer, dtype=np.int16) + stream.write(audio_data) + buffer=[] + continue + + except Exception as e: + print(f"[{'Player'}] An error occurred playing audio chunk {e}") + raise + + except Exception as e: + print(f"[{'Player'}] An error occurred in the audio player: {e}") + raise + finally: + sd.stop() + +# 3. 
Main Function: Orchestrate audio generation and audio stream +async def main() -> None: + play_queue = asyncio.Queue() + + # Create tasks + tasks = [ + asyncio.create_task(audio_generator(play_queue, TEXT, VOICE, OUTPUT_FORMAT)), + asyncio.create_task(audio_player(play_queue)) + ] + + try: + await asyncio.gather(*tasks) + + except Exception as e: + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py index ffd8af2..5e581e1 100644 --- a/sdk/rt/speechmatics/rt/_async_client.py +++ b/sdk/rt/speechmatics/rt/_async_client.py @@ -163,7 +163,7 @@ async def stop_session(self) -> None: async def force_end_of_utterance(self) -> None: """ - This method sends a ForceEndOfUtterance message to the server to signal + This method sends a ForceEndOfUtterance message to the server to signal the end of an utterance. Forcing end of utterance will cause the final transcript to be sent to the client early. 
diff --git a/sdk/tts/README.md b/sdk/tts/README.md index 07514a6..648d210 100644 --- a/sdk/tts/README.md +++ b/sdk/tts/README.md @@ -24,28 +24,19 @@ pip install speechmatics-tts ```python import asyncio -import wave -from pathlib import Path - from speechmatics.tts import AsyncClient, Voice, OutputFormat -async def save_audio(audio_data: bytes, filename: str) -> None: - with wave.open(filename, "wb") as wav: - wav.setnchannels(1) # Mono - wav.setsampwidth(2) # 16-bit - wav.setframerate(16000) # 16kHz - wav.writeframes(audio_data) - # Generate speech data from text and save to WAV file async def main(): async with AsyncClient() as client: async with await client.generate( - text="Welcome to the future of audio generation from text!", + text="Welcome to the future of voice AI!", voice=Voice.SARAH, - output_format=OutputFormat.RAW_PCM_16000 + output_format=OutputFormat.WAV_16000 ) as response: audio = b''.join([chunk async for chunk in response.content.iter_chunked(1024)]) - await save_audio(audio, "output.wav") + with open("output.wav", "wb") as f: + f.write(audio) # Run the async main function diff --git a/sdk/tts/speechmatics/tts/__init__.py b/sdk/tts/speechmatics/tts/__init__.py index 68e44e5..e3b4a39 100644 --- a/sdk/tts/speechmatics/tts/__init__.py +++ b/sdk/tts/speechmatics/tts/__init__.py @@ -26,4 +26,4 @@ "ConnectionConfig", "Voice", "OutputFormat", -] \ No newline at end of file +] diff --git a/sdk/tts/speechmatics/tts/_models.py b/sdk/tts/speechmatics/tts/_models.py index fdbca0e..572e598 100644 --- a/sdk/tts/speechmatics/tts/_models.py +++ b/sdk/tts/speechmatics/tts/_models.py @@ -50,8 +50,10 @@ class Voice(str, Enum): sarah: English (UK) female voice. theo: English (UK) male voice. megan: English (UK) female voice. + jack: English (US) male voice. """ SARAH = "sarah" THEO = "theo" MEGAN = "megan" + JACK = "jack"