diff --git a/examples/rt/async/speaker_id/README.md b/examples/rt/async/speaker_id/README.md
new file mode 100644
index 00000000..0ae32fed
--- /dev/null
+++ b/examples/rt/async/speaker_id/README.md
@@ -0,0 +1,15 @@
+# Live Real-Time Speaker ID Example
+
+This example demonstrates how to use the Speechmatics Python SDK to perform speaker ID in real-time.
+
+The SDK requires an API key to be set as an environment variable before it can be used. You can obtain an API key by signing up for a Speechmatics account at https://portal.speechmatics.com/dashboard
+
+## Prerequisites
+
+- Install Speechmatics RT SDK: `pip install speechmatics-rt`
+- Export Speechmatics API key: `export SPEECHMATICS_API_KEY=YOUR-API-KEY`
+
+## Usage
+
+- Generate speaker IDs: `python generate.py` - this will generate a `speakers.json` file
+- Transcribe audio: `python transcribe.py` - this will use the `speakers.json` file to perform speaker ID on a conversation
diff --git a/examples/rt/async/speaker_id/generate.py b/examples/rt/async/speaker_id/generate.py
new file mode 100644
index 00000000..a7604494
--- /dev/null
+++ b/examples/rt/async/speaker_id/generate.py
@@ -0,0 +1,57 @@
+import asyncio
+import json
+import logging
+import os
+
+from speechmatics.rt import (
+    AsyncClient,
+    OperatingPoint,
+    TranscriptionConfig,
+    ServerMessageType,
+)
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+speakers: list[dict] = []
+
+
+async def generate_ids(voice_file: str) -> None:
+    """Transcribe one voice sample and record its speaker identifier."""
+
+    transcription_config = TranscriptionConfig(
+        operating_point=OperatingPoint.ENHANCED,
+        diarization="speaker",
+    )
+
+    # Initialize client with API key from environment
+    async with AsyncClient() as client:
+        try:
+            @client.on(ServerMessageType.SPEAKERS_RESULT)
+            def handle_speakers_result(msg):
+                new_speakers = msg.get('speakers', [])
+                # Guard: a SPEAKERS_RESULT payload may contain no speakers.
+                if new_speakers:
+                    new_speakers[0]["label"] = voice_file
+                    speakers.append(new_speakers[0])
+
+            # Transcribe audio file
+            with open(os.path.join(voices_folder, voice_file), "rb") as audio_file:
+                await client.transcribe(
+                    audio_file,
+                    transcription_config=transcription_config,
+                    get_speakers=True,
+                )
+        except Exception as e:
+            print(f"Transcription error: {e}")
+
+
+if __name__ == "__main__":
+    voices_folder = "./examples/rt/async/speaker_id/voices"
+    voice_files = [f for f in os.listdir(voices_folder) if os.path.isfile(os.path.join(voices_folder, f))]
+
+    for voice_file in voice_files:
+        asyncio.run(generate_ids(voice_file))
+
+    with open('./speakers.json', 'w') as f:
+        json.dump(speakers, f)
diff --git a/examples/rt/async/speaker_id/transcribe.py b/examples/rt/async/speaker_id/transcribe.py
new file mode 100644
index 00000000..179db3f9
--- /dev/null
+++ b/examples/rt/async/speaker_id/transcribe.py
@@ -0,0 +1,51 @@
+import asyncio
+import logging
+import json
+
+from speechmatics.rt import SpeakerIdentifier
+from speechmatics.rt import SpeakerDiarizationConfig
+from speechmatics.rt import (
+    AsyncClient,
+    OperatingPoint,
+    TranscriptionConfig,
+    ServerMessageType
+)
+
+
+logging.basicConfig(level=logging.INFO)
+
+
+async def main() -> None:
+    """Run async transcription example."""
+
+    with open('./speakers.json') as f:
+        speaker_identifiers = [SpeakerIdentifier(**s) for s in json.load(f)]
+
+    transcription_config = TranscriptionConfig(
+        operating_point=OperatingPoint.ENHANCED,
+        diarization="speaker",
+        max_delay=4,
+        speaker_diarization_config=SpeakerDiarizationConfig(
+            speakers=speaker_identifiers,
+        )
+    )
+
+    # Initialize client with API key from environment
+    async with AsyncClient() as client:
+        try:
+            @client.on(ServerMessageType.ADD_TRANSCRIPT)
+            def handle_finals(msg):
+                print(f"Final: {msg['metadata']['speaker']} {msg['metadata']['transcript']}")
+
+            # Transcribe audio file
+            with open("./examples/conversation.wav", "rb") as audio_file:
+                await client.transcribe(
+                    audio_file,
+                    transcription_config=transcription_config,
+                )
+        except Exception as e:
+            print(f"Transcription error: {e}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/sdk/batch/README.md b/sdk/batch/README.md
index a3160020..ae7193db 100644
--- a/sdk/batch/README.md
+++ b/sdk/batch/README.md
@@ -25,6 +25,8 @@ pip install speechmatics-batch
 
 ### Quick Start
 
+To run transcription, you'll need an audio file. You can find an example file [here](https://github.com/speechmatics/speechmatics-python-sdk/blob/main/examples/example.wav).
+
 ```python
 import asyncio
 from speechmatics.batch import AsyncClient
diff --git a/sdk/rt/README.md b/sdk/rt/README.md
index c509c3a6..93976895 100644
--- a/sdk/rt/README.md
+++ b/sdk/rt/README.md
@@ -24,6 +24,8 @@ pip install speechmatics-rt
 ```
 
 ## Quick Start
+To run transcription, you'll need an audio file. You can find an example file [here](https://github.com/speechmatics/speechmatics-python-sdk/blob/main/examples/example.wav).
+
 ```python
 import asyncio
 from speechmatics.rt import AsyncClient, ServerMessageType
diff --git a/sdk/rt/speechmatics/rt/_async_client.py b/sdk/rt/speechmatics/rt/_async_client.py
index 3566ce31..1d453111 100644
--- a/sdk/rt/speechmatics/rt/_async_client.py
+++ b/sdk/rt/speechmatics/rt/_async_client.py
@@ -171,6 +171,7 @@ async def transcribe(
         audio_events_config: Optional[AudioEventsConfig] = None,
         ws_headers: Optional[dict] = None,
         timeout: Optional[float] = None,
+        get_speakers: bool = False,
     ) -> None:
         """
         Transcribe a single audio stream in real-time.
@@ -193,6 +194,7 @@
             ws_headers: Additional headers to include in the WebSocket handshake.
             timeout: Maximum time in seconds to wait for transcription completion.
                 Default None.
+            get_speakers: Send a speaker identifier event at the end of the session.
 
         Raises:
             AudioError: If source is invalid or cannot be read.
@@ -233,6 +235,9 @@ async def transcribe(
             ws_headers=ws_headers,
         )
 
+        if get_speakers:
+            await self.send_message({"message": "GetSpeakers", "final": True})
+
         try:
             await asyncio.wait_for(
                 self._audio_producer(source, audio_format.chunk_size),
diff --git a/sdk/rt/speechmatics/rt/_base_client.py b/sdk/rt/speechmatics/rt/_base_client.py
index 0ac6d085..dd93a962 100644
--- a/sdk/rt/speechmatics/rt/_base_client.py
+++ b/sdk/rt/speechmatics/rt/_base_client.py
@@ -18,7 +18,9 @@
 from ._models import AudioEventsConfig
 from ._models import AudioFormat
 from ._models import ConnectionConfig
+from ._models import ServerMessageType
 from ._models import SessionInfo
+from ._models import SpeakerIdentifier
 from ._models import TranscriptionConfig
 from ._models import TranslationConfig
 from ._transport import Transport
@@ -149,6 +151,41 @@ async def send_message(self, message: dict[str, Any]) -> None:
             self._closed_evt.set()
             raise
 
+    async def get_speakers(self, final: bool = False) -> list[SpeakerIdentifier]:
+        """
+        Get the list of speakers in the current session.
+
+        This method returns as soon as a SPEAKERS_RESULT message is received.
+        Multiple concurrent calls may each receive the same SPEAKERS_RESULT
+        message; this is harmless but results in redundant events.
+
+        Args:
+            final: Whether to wait until the end of the session to return
+                speaker IDs (default: False)
+
+        Returns:
+            List of SpeakerIdentifier objects
+        """
+        speaker_evt = asyncio.Event()
+        speaker_identifiers: list[SpeakerIdentifier] = []
+
+        def _on_speakers_result(msg: dict) -> None:
+            # Convert raw payload dicts into typed SpeakerIdentifier objects.
+            speaker_identifiers.extend(SpeakerIdentifier(**s) for s in msg.get("speakers", []))
+            speaker_evt.set()
+
+        # Register the handler before sending the request so a fast
+        # SPEAKERS_RESULT cannot arrive before we start listening for it.
+        self.once(ServerMessageType.SPEAKERS_RESULT, _on_speakers_result)
+        try:
+            await self.send_message({"message": "GetSpeakers", "final": final})
+            await speaker_evt.wait()
+            return speaker_identifiers
+        except Exception:
+            self._closed_evt.set()
+            raise
+
     async def _recv_loop(self) -> None:
         """
         Background task that continuously receives and dispatches server messages.