From f168fd0de9d281657d26463d0e026749660b1ba8 Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Wed, 15 Oct 2025 15:49:07 +0100 Subject: [PATCH 1/4] add speaker ID function --- examples/rt/async/speaker_id/README.md | 11 ++++++ examples/rt/async/speaker_id/main.py | 50 +++++++++++++++++++++++++ sdk/batch/README.md | 2 + sdk/rt/README.md | 2 + sdk/rt/speechmatics/rt/_async_client.py | 5 +++ sdk/rt/speechmatics/rt/_base_client.py | 32 ++++++++++++++++ 6 files changed, 102 insertions(+) create mode 100644 examples/rt/async/speaker_id/README.md create mode 100644 examples/rt/async/speaker_id/main.py diff --git a/examples/rt/async/speaker_id/README.md b/examples/rt/async/speaker_id/README.md new file mode 100644 index 00000000..86301429 --- /dev/null +++ b/examples/rt/async/speaker_id/README.md @@ -0,0 +1,11 @@ +# Live Real-Time Example + +This example demonstrates how to use the Speechmatics Python SDK to transcribe audio from a microphone in real-time. It requires `pyaudio` to be installed for the example to work correctly. + +The SDK requires an API key to be set as an environment variable before it can be used. 
You can obtain an API key by signing up for a Speechmatics account at https://portal.speechmatics.com/dashboard + +## Prerequisites + +- Install Speechmatics RT SDK: `pip install speechmatics-rt` +- Export Speechmatics API key: `export SPEECHMATICS_API_KEY=YOUR-API-KEY` +- Install `pyaudio`: `pip install pyaudio` diff --git a/examples/rt/async/speaker_id/main.py b/examples/rt/async/speaker_id/main.py new file mode 100644 index 00000000..f31d085e --- /dev/null +++ b/examples/rt/async/speaker_id/main.py @@ -0,0 +1,50 @@ +import asyncio +import logging + +from speechmatics.rt import ServerMessageType +from speechmatics.rt import ( + AsyncClient, + AudioEncoding, + AudioFormat, + OperatingPoint, + TranscriptionConfig, +) + + +logging.basicConfig(level=logging.INFO) + + +async def main() -> None: + """Run async transcription example.""" + + transcription_config = TranscriptionConfig( + max_delay=0.8, + enable_partials=True, + operating_point=OperatingPoint.ENHANCED, + diarization="speaker", + ) + + # Initialize client with API key from environment + async with AsyncClient() as client: + try: + @client.on(ServerMessageType.ADD_TRANSCRIPT) + def handle_finals(msg): + print(f"Final: {msg['metadata']['transcript']}") + + @client.on(ServerMessageType.SPEAKERS_RESULT) + def handle_speakers_result(msg): + print(msg) + + # Transcribe audio file + with open("./examples/example.wav", "rb") as audio_file: + await client.transcribe( + audio_file, + transcription_config=transcription_config, + get_speakers=True, + ) + except Exception as e: + print(f"Transcription error: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/batch/README.md b/sdk/batch/README.md index a3160020..ae7193db 100644 --- a/sdk/batch/README.md +++ b/sdk/batch/README.md @@ -25,6 +25,8 @@ pip install speechmatics-batch ### Quick Start +To run transcription, you'll need an audio file. 
You can find an example file [here](https://github.com/speechmatics/speechmatics-python-sdk/blob/main/examples/example.wav). + ```python import asyncio from speechmatics.batch import AsyncClient diff --git a/sdk/rt/README.md b/sdk/rt/README.md index c509c3a6..93976895 100644 --- a/sdk/rt/README.md +++ b/sdk/rt/README.md @@ -24,6 +24,8 @@ pip install speechmatics-rt ``` ## Quick Start +To run transcription, you'll need an audio file. You can find an example file [here](https://github.com/speechmatics/speechmatics-python-sdk/blob/main/examples/example.wav). + ```python import asyncio from speechmatics.rt import AsyncClient, ServerMessageType diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py index 3566ce31..1d453111 100644 --- a/sdk/rt/speechmatics/rt/_async_client.py +++ b/sdk/rt/speechmatics/rt/_async_client.py @@ -171,6 +171,7 @@ async def transcribe( audio_events_config: Optional[AudioEventsConfig] = None, ws_headers: Optional[dict] = None, timeout: Optional[float] = None, + get_speakers: Optional[bool] = False, ) -> None: """ Transcribe a single audio stream in real-time. @@ -193,6 +194,7 @@ async def transcribe( ws_headers: Additional headers to include in the WebSocket handshake. timeout: Maximum time in seconds to wait for transcription completion. Default None. + get_speakers: Send a speaker identifier event at the end of the session. Raises: AudioError: If source is invalid or cannot be read. 
@@ -233,6 +235,9 @@ async def transcribe( ws_headers=ws_headers, ) + if get_speakers: + await self.send_message({"message": "GetSpeakers", "final": True}) + try: await asyncio.wait_for( self._audio_producer(source, audio_format.chunk_size), diff --git a/sdk/rt/speechmatics/rt/_base_client.py b/sdk/rt/speechmatics/rt/_base_client.py index 0ac6d085..dd93a962 100644 --- a/sdk/rt/speechmatics/rt/_base_client.py +++ b/sdk/rt/speechmatics/rt/_base_client.py @@ -18,7 +18,9 @@ from ._models import AudioEventsConfig from ._models import AudioFormat from ._models import ConnectionConfig +from ._models import ServerMessageType from ._models import SessionInfo +from ._models import SpeakerIdentifier from ._models import TranscriptionConfig from ._models import TranslationConfig from ._transport import Transport @@ -149,6 +151,36 @@ async def send_message(self, message: dict[str, Any]) -> None: self._closed_evt.set() raise + async def get_speakers(self, final=False) -> list[SpeakerIdentifier]: + """ + Get the list of speakers in the current session. + This method returns as soon as a SPEAKERS_RESULT message is received. + Multiple requests to the method may therefore cause a race condition in which the same + SPEAKERS_RESULT message is received by multiple requests. This should not cause any issues, + but will result in redundant SPEAKERS_RESULT events. 
+
+        Args:
+            final: Whether to wait until the end of the session to return speaker IDs (default: False)
+
+        Returns:
+            List of SpeakerIdentifier objects
+        """
+        try:
+            await self.send_message({"message": "GetSpeakers", "final": final})
+            speaker_evt = asyncio.Event()
+            speaker_identifiers: list[SpeakerIdentifier] = []
+            self.once(
+                ServerMessageType.SPEAKERS_RESULT,
+                lambda msg: (speaker_identifiers.extend(msg.get("speakers", [])), speaker_evt.set()),
+            )
+            await speaker_evt.wait()
+            return speaker_identifiers
+        except asyncio.TimeoutError:
+            raise TransportError("Timeout waiting for SPEAKERS_RESULT")
+        except Exception:
+            self._closed_evt.set()
+            raise
+
     async def _recv_loop(self) -> None:
         """
         Background task that continuously receives and dispatches server messages.

From ebb5fed01edb5d7130c59eda1e347c01605613c1 Mon Sep 17 00:00:00 2001
From: Tudor Evans
Date: Wed, 15 Oct 2025 15:50:40 +0100
Subject: [PATCH 2/4] update README

---
 examples/rt/async/speaker_id/README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/rt/async/speaker_id/README.md b/examples/rt/async/speaker_id/README.md
index 86301429..1b070c5c 100644
--- a/examples/rt/async/speaker_id/README.md
+++ b/examples/rt/async/speaker_id/README.md
@@ -1,6 +1,6 @@
-# Live Real-Time Example
+# Live Real-Time Speaker ID Example
 
-This example demonstrates how to use the Speechmatics Python SDK to transcribe audio from a microphone in real-time. It requires `pyaudio` to be installed for the example to work correctly.
+This example demonstrates how to use the Speechmatics Python SDK to perform speaker ID in real-time.
 
 The SDK requires an API key to be set as an environment variable before it can be used. 
You can obtain an API key by signing up for a Speechmatics account at https://portal.speechmatics.com/dashboard @@ -8,4 +8,3 @@ The SDK requires an API key to be set as an environment variable before it can b - Install Speechmatics RT SDK: `pip install speechmatics-rt` - Export Speechmatics API key: `export SPEECHMATICS_API_KEY=YOUR-API-KEY` -- Install `pyaudio`: `pip install pyaudio` From fcdabed852a1c338bb7452f868e420a40c317c0e Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Fri, 17 Oct 2025 11:09:34 +0100 Subject: [PATCH 3/4] remove partials --- examples/rt/async/speaker_id/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/rt/async/speaker_id/main.py b/examples/rt/async/speaker_id/main.py index f31d085e..13dfeba9 100644 --- a/examples/rt/async/speaker_id/main.py +++ b/examples/rt/async/speaker_id/main.py @@ -19,7 +19,6 @@ async def main() -> None: transcription_config = TranscriptionConfig( max_delay=0.8, - enable_partials=True, operating_point=OperatingPoint.ENHANCED, diarization="speaker", ) From f5131c8b0c5a7f08e1d9e383869dd7047786b374 Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Mon, 20 Oct 2025 15:59:40 +0100 Subject: [PATCH 4/4] update eg --- examples/rt/async/speaker_id/README.md | 5 ++ examples/rt/async/speaker_id/generate.py | 56 +++++++++++++++++++ .../speaker_id/{main.py => transcribe.py} | 24 ++++---- 3 files changed, 74 insertions(+), 11 deletions(-) create mode 100644 examples/rt/async/speaker_id/generate.py rename examples/rt/async/speaker_id/{main.py => transcribe.py} (61%) diff --git a/examples/rt/async/speaker_id/README.md b/examples/rt/async/speaker_id/README.md index 1b070c5c..0ae32fed 100644 --- a/examples/rt/async/speaker_id/README.md +++ b/examples/rt/async/speaker_id/README.md @@ -8,3 +8,8 @@ The SDK requires an API key to be set as an environment variable before it can b - Install Speechmatics RT SDK: `pip install speechmatics-rt` - Export Speechmatics API key: `export SPEECHMATICS_API_KEY=YOUR-API-KEY` + +## Usage + 
+- Generate speaker IDs: `python generate.py` - this will generate a `speakers.json` file +- Transcribe audio: `python transcribe.py` - this will use the `speakers.json` file to perform speaker ID on a conversation diff --git a/examples/rt/async/speaker_id/generate.py b/examples/rt/async/speaker_id/generate.py new file mode 100644 index 00000000..a7604494 --- /dev/null +++ b/examples/rt/async/speaker_id/generate.py @@ -0,0 +1,56 @@ +import asyncio +import logging +import os +import json + +from speechmatics.rt import ( + AsyncClient, + OperatingPoint, + TranscriptionConfig, + ServerMessageType, +) + + +logging.basicConfig(level=logging.INFO) + + +speakers: list[dict] = [] + + +async def generate_ids(voice_file: str) -> None: + """Run async transcription example.""" + + transcription_config = TranscriptionConfig( + operating_point=OperatingPoint.ENHANCED, + diarization="speaker", + ) + + # Initialize client with API key from environment + async with AsyncClient() as client: + try: + @client.on(ServerMessageType.SPEAKERS_RESULT) + def handle_speakers_result(msg): + new_speakers = msg.get('speakers', []) + new_speakers[0]["label"] = voice_file + speakers.append(new_speakers[0]) + + # Transcribe audio file + with open(os.path.join(voices_folder, voice_file), "rb") as audio_file: + await client.transcribe( + audio_file, + transcription_config=transcription_config, + get_speakers=True, + ) + except Exception as e: + print(f"Transcription error: {e}") + + +if __name__ == "__main__": + voices_folder = "./examples/rt/async/speaker_id/voices" + voice_files = [f for f in os.listdir(voices_folder) if os.path.isfile(os.path.join(voices_folder, f))] + + for voice_file in voice_files: + asyncio.run(generate_ids(voice_file)) + + with open('./speakers.json', 'w') as f: + json.dump(speakers, f) diff --git a/examples/rt/async/speaker_id/main.py b/examples/rt/async/speaker_id/transcribe.py similarity index 61% rename from examples/rt/async/speaker_id/main.py rename to 
examples/rt/async/speaker_id/transcribe.py index 13dfeba9..179db3f9 100644 --- a/examples/rt/async/speaker_id/main.py +++ b/examples/rt/async/speaker_id/transcribe.py @@ -1,13 +1,14 @@ import asyncio import logging +import json -from speechmatics.rt import ServerMessageType +from speechmatics.rt import SpeakerIdentifier +from speechmatics.rt import SpeakerDiarizationConfig from speechmatics.rt import ( AsyncClient, - AudioEncoding, - AudioFormat, OperatingPoint, TranscriptionConfig, + ServerMessageType ) @@ -17,10 +18,16 @@ async def main() -> None: """Run async transcription example.""" + with open('./speakers.json') as f: + speaker_identifiers = [SpeakerIdentifier(**s) for s in json.load(f)] + transcription_config = TranscriptionConfig( - max_delay=0.8, operating_point=OperatingPoint.ENHANCED, diarization="speaker", + max_delay=4, + speaker_diarization_config=SpeakerDiarizationConfig( + speakers=speaker_identifiers, + ) ) # Initialize client with API key from environment @@ -28,18 +35,13 @@ async def main() -> None: try: @client.on(ServerMessageType.ADD_TRANSCRIPT) def handle_finals(msg): - print(f"Final: {msg['metadata']['transcript']}") - - @client.on(ServerMessageType.SPEAKERS_RESULT) - def handle_speakers_result(msg): - print(msg) + print(f"Final: {msg['metadata']['speaker']} {msg['metadata']['transcript']}") # Transcribe audio file - with open("./examples/example.wav", "rb") as audio_file: + with open("./examples/conversation.wav", "rb") as audio_file: await client.transcribe( audio_file, transcription_config=transcription_config, - get_speakers=True, ) except Exception as e: print(f"Transcription error: {e}")