Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions examples/nodejs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,10 @@ pnpm run:batch

```
pnpm run:real-time-file
```

### Get speakers (real-time)

```
pnpm run:speaker-id
```
Binary file modified examples/nodejs/example.wav
Binary file not shown.
3 changes: 2 additions & 1 deletion examples/nodejs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
"type": "module",
"scripts": {
"run:batch": "node --import tsx/esm batch-example.ts",
"run:real-time-file": "node --import tsx/esm real-time-file-example.ts"
"run:real-time-file": "node --import tsx/esm real-time-file-example.ts",
"run:speaker-id": "node --import tsx/esm speaker-id-example.ts"
},
"keywords": [],
"author": "",
Expand Down
58 changes: 58 additions & 0 deletions examples/nodejs/speaker-id-example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/**
 * Demonstrates the speaker ID feature of the real-time-client package in NodeJS.
 *
 * Connects to the real-time API, streams a local file as if in real time, and
 * prints the speakers detected during the session.
 *
 * A Speechmatics API key is required; one can be generated from the
 * Speechmatics Portal: https://portal.speechmatics.com/api-keys
 *
 * NOTE: This script runs as an ES Module via tsx, which allows top-level await.
 * The library also works with CommonJS, but there this code would need to be
 * wrapped in an async function.
 */
import { RealtimeClient } from '@speechmatics/real-time-client';
import fs from 'node:fs';
import dotenv from 'dotenv';
import { createSpeechmaticsJWT } from '@speechmatics/auth';

dotenv.config();

const apiKey = process.env.API_KEY;
if (!apiKey) {
  throw new Error('Please set the API_KEY environment variable');
}

const client = new RealtimeClient();

// Short-lived JWT used to authenticate the real-time session.
const jwt = await createSpeechmaticsJWT({
  type: 'rt',
  apiKey,
  ttl: 60, // 1 minute
});

// Read the file in small chunks so the server is not flooded with audio.
const audioStream = fs.createReadStream('./example.wav', {
  highWaterMark: 4096, // avoid sending too much data at once
});

await client.start(jwt, {
  transcription_config: {
    language: 'en',
    operating_point: 'enhanced',
  },
});

// Forward each chunk of the file to the recognition session.
audioStream.on('data', (chunk) => {
  client.sendAudio(chunk);
});

// Close out the session once the whole file has been read.
audioStream.on('end', () => {
  // Send a stop message to the server when we're done sending audio.
  // `noTimeout` is set because we stream faster than real-time, so we should
  // wait for all the data to be processed before closing the connection.
  client.stopRecognition({ noTimeout: true });
});

// Wait for the speakers to become available.
// With final = true, the speakers are only returned when the session finishes.
const speakers = await client.getSpeakers(true);
console.log(speakers);
2 changes: 1 addition & 1 deletion packages/real-time-client-react/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@speechmatics/real-time-client-react",
"version": "2.0.2",
"version": "3.0.0",
"description": "React hooks for interacting with the Speechmatics Real-Time API",
"main": "./dist/index.cjs",
"module": "./dist/index.js",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,40 @@ export function useRealtimeTranscription() {
[client],
);

const setRecognitionConfig = useCallback<
RealtimeClient['setRecognitionConfig']
>(
(config) => {
client.setRecognitionConfig(config);
},
[client],
);

const getSpeakers = useCallback<RealtimeClient['getSpeakers']>(
(final?: boolean) => {
return client.getSpeakers(final);
},
[client],
);

return useMemo(
() => ({
sessionId,
socketState,
startTranscription,
stopTranscription,
sendAudio,
setRecognitionConfig,
getSpeakers,
}),
[sessionId, socketState, startTranscription, stopTranscription, sendAudio],
[
sessionId,
socketState,
startTranscription,
stopTranscription,
sendAudio,
setRecognitionConfig,
getSpeakers,
],
);
}
12 changes: 12 additions & 0 deletions packages/real-time-client/models/AddChannelAudio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/**
 * Message carrying a chunk of base64-encoded audio for a named channel.
 * The `channel` field ties the audio to one input of a multichannel session
 * (see the `channel` fields on transcript messages).
 */
interface AddChannelAudio {
  message: 'AddChannelAudio';
  /**
   * The channel identifier to which the audio belongs.
   */
  channel: string;
  /**
   * The audio data in base64 format.
   */
  data: string;
}
export type { AddChannelAudio };
10 changes: 10 additions & 0 deletions packages/real-time-client/models/AddPartialTranscript.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,15 @@ interface AddPartialTranscript {
format?: string;
metadata: RecognitionMetadata;
results: RecognitionResult[];
/**
* The channel identifier to which the audio belongs. This field is only seen in multichannel.
*
* :::note
*
* This field is only available in [preview mode](https://docs.speechmatics.com/private/preview-mode).
*
* :::
*/
channel?: string;
}
export type { AddPartialTranscript };
3 changes: 3 additions & 0 deletions packages/real-time-client/models/AddPartialTranslation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ interface AddPartialTranslation {
* Speechmatics JSON output format version number.
*/
format?: string;
/**
* Language translation relates to given as an ISO language code.
*/
language: string;
results: TranslatedSentence[];
}
Expand Down
10 changes: 10 additions & 0 deletions packages/real-time-client/models/AddTranscript.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,15 @@ interface AddTranscript {
format?: string;
metadata: RecognitionMetadata;
results: RecognitionResult[];
/**
* The channel identifier to which the audio belongs. This field is only seen in multichannel.
*
* :::note
*
* This field is only available in [preview mode](https://docs.speechmatics.com/private/preview-mode).
*
* :::
*/
channel?: string;
}
export type { AddTranscript };
3 changes: 3 additions & 0 deletions packages/real-time-client/models/AddTranslation.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ interface AddTranslation {
* Speechmatics JSON output format version number.
*/
format?: string;
/**
* Language translation relates to given as an ISO language code.
*/
language: string;
results: TranslatedSentence[];
}
Expand Down
3 changes: 3 additions & 0 deletions packages/real-time-client/models/AudioEventEndData.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
/**
 * Payload describing the end of a detected audio event.
 */
interface AudioEventEndData {
  /**
   * The type of audio event that has started or ended. See our list of [supported Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events).
   */
  type: string;
  /**
   * The time (in seconds) of the audio corresponding to the end of the audio event.
   */
  end_time: number;
}
Expand Down
9 changes: 9 additions & 0 deletions packages/real-time-client/models/AudioEventStartData.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
/**
 * Payload describing the start of a detected audio event.
 */
interface AudioEventStartData {
  /**
   * The type of audio event that has started or ended. See our list of [supported Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events).
   */
  type: string;
  /**
   * The time (in seconds) of the audio corresponding to the beginning of the audio event.
   */
  start_time: number;
  /**
   * A confidence score assigned to the audio event. Ranges from 0.0 (least confident) to 1.0 (most confident).
   */
  confidence: number;
}
export type { AudioEventStartData };
6 changes: 6 additions & 0 deletions packages/real-time-client/models/AudioEventsConfig.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
/**
 * Contains configuration for [Audio Events](https://docs.speechmatics.com/speech-to-text/features/audio-events)
 */
interface AudioEventsConfig {
  /**
   * List of [Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events) to enable.
   * NOTE(review): behavior when this list is omitted is not shown here — presumably a default set applies; confirm against the API docs.
   */
  types?: string[];
}
export type { AudioEventsConfig };
3 changes: 3 additions & 0 deletions packages/real-time-client/models/AudioFilteringConfig.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
/**
 * Puts a lower limit on the volume of processed audio by using the `volume_threshold` setting. See [Audio Filtering](https://docs.speechmatics.com/speech-to-text/features/audio-filtering).
 */
interface AudioFilteringConfig {
  /**
   * Minimum volume for audio to be processed; audio below this threshold is filtered out.
   */
  volume_threshold?: number;
}
Expand Down
4 changes: 0 additions & 4 deletions packages/real-time-client/models/AudioFormatFile.ts

This file was deleted.

7 changes: 0 additions & 7 deletions packages/real-time-client/models/AudioFormatRaw.ts

This file was deleted.

6 changes: 6 additions & 0 deletions packages/real-time-client/models/ChannelAudioAdded.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
/**
 * Server acknowledgement for channel audio. Pairs a sequence number with the
 * channel the audio was added to (counterpart of `AddChannelAudio`).
 */
interface ChannelAudioAdded {
  message: 'ChannelAudioAdded';
  // Sequence number of the acknowledged audio message — presumably a running
  // count per channel (cf. `EndOfChannel.last_seq_no`); confirm against API docs.
  seq_no: number;
  // Identifier of the channel the audio was added to.
  channel: string;
}
export type { ChannelAudioAdded };
2 changes: 1 addition & 1 deletion packages/real-time-client/models/ConversationConfig.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
 * This mode will detect when a speaker has stopped talking. The `end_of_utterance_silence_trigger` is the time in seconds after which the server will assume that the speaker has finished speaking, and will emit an `EndOfUtterance` message. A value of 0 disables the feature.
 */
interface ConversationConfig {
  /**
   * Silence duration (in seconds) after which the server emits an `EndOfUtterance` message. A value of 0 disables the feature.
   */
  end_of_utterance_silence_trigger?: number;
}
Expand Down
3 changes: 3 additions & 0 deletions packages/real-time-client/models/DiarizationConfig.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
/**
 * Set to `speaker` to apply [Speaker Diarization](https://docs.speechmatics.com/speech-to-text/features/diarization) to the audio; `none` leaves diarization off.
 */
type DiarizationConfig = 'none' | 'speaker';
export type { DiarizationConfig };
3 changes: 3 additions & 0 deletions packages/real-time-client/models/DirectionEnum.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
/**
 * Text direction for display: `ltr` for words that should be displayed left-to-right, `rtl` for right-to-left.
 */
type DirectionEnum = 'ltr' | 'rtl';
export type { DirectionEnum };
9 changes: 9 additions & 0 deletions packages/real-time-client/models/EndOfChannel.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
/**
 * Signals that no more audio will be sent for a given channel — presumably the
 * per-channel counterpart of an end-of-stream message; confirm against API docs.
 */
interface EndOfChannel {
  message: 'EndOfChannel';
  /**
   * The channel identifier to which the audio belongs.
   */
  channel: string;
  // Sequence number of the final audio message sent on this channel
  // (cf. `ChannelAudioAdded.seq_no`).
  last_seq_no: number;
}
export type { EndOfChannel };
4 changes: 4 additions & 0 deletions packages/real-time-client/models/EndOfUtterance.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,9 @@ import type { EndOfUtteranceMetadata } from './EndOfUtteranceMetadata';
/**
 * Server message emitted when the end of an utterance is detected
 * (configured via `ConversationConfig.end_of_utterance_silence_trigger`).
 */
interface EndOfUtterance {
  message: 'EndOfUtterance';
  metadata: EndOfUtteranceMetadata;
  /**
   * The channel identifier to which the EndOfUtterance message belongs. This field is only seen in multichannel.
   */
  channel?: string;
}
export type { EndOfUtterance };
6 changes: 6 additions & 0 deletions packages/real-time-client/models/EndOfUtteranceMetadata.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
/**
 * Timing information attached to an `EndOfUtterance` message.
 * NOTE(review): the original docs carried the identical comment on both fields;
 * the start/end wording below is inferred from the field names — confirm
 * against the API docs.
 */
interface EndOfUtteranceMetadata {
  /**
   * The time (in seconds) at which the detected end of utterance begins.
   */
  start_time?: number;
  /**
   * The time (in seconds) at which the detected end of utterance ends.
   */
  end_time?: number;
}
export type { EndOfUtteranceMetadata };
30 changes: 30 additions & 0 deletions packages/real-time-client/models/ErrorType.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,36 @@
import type { ErrorTypeEnum } from './ErrorTypeEnum';
interface ErrorType {
message: 'Error';
/**
* The following are the possible error types:
*
* | Error Type | Description |
* | --- | --- |
* | `invalid_message` | The message received was not understood. |
* | `invalid_model` | Unable to use the model for the recognition. This can happen if the language is not supported at all, or is not available for the user. |
* | `invalid_config` | The config received contains some wrong or unsupported fields, or too many translation target languages were requested. |
* | `invalid_audio_type` | Audio type is not supported, is deprecated, or the `audio_type` is malformed. |
* | `invalid_output_format` | Output format is not supported, is deprecated, or the `output_format` is malformed. |
* | `not_authorised` | User was not recognised, or the API key provided is not valid. |
* | `insufficient_funds` | User doesn't have enough credits or any other reason preventing the user to be charged for the job properly. |
* | `not_allowed` | User is not allowed to use this message (is not allowed to perform the action the message would invoke). |
* | `job_error` | Unable to do any work on this job, the server might have timed out etc. |
* | `data_error` | Unable to accept the data specified - usually because there is too much data being sent at once |
* | `buffer_error` | Unable to fit the data in a corresponding buffer. This can happen for clients sending the input data faster than real-time. |
* | `protocol_error` | Message received was syntactically correct, but could not be accepted due to protocol limitations. This is usually caused by messages sent in the wrong order. |
* | `quota_exceeded` | Maximum number of concurrent connections allowed for the contract has been reached |
* | `timelimit_exceeded` | Usage quota for the contract has been reached |
* | `idle_timeout` | Idle duration limit was reached (no audio data sent within the last hour), a closing handshake with code 1008 follows this in-band error. |
* | `session_timeout` | Max session duration was reached (maximum session duration of 48 hours), a closing handshake with code 1008 follows this in-band error. |
* | `session_transfer` | An error while transferring session to another backend with the reason: Session transfer failed. This may occur when moving sessions due to backend maintenance operations or migration from a faulty backend. |
* | `unknown_error` | An error that did not fit any of the types above. |
*
* :::info
*
* `invalid_message`, `protocol_error` and `unknown_error` can be triggered as a response to any type of messages.
*
* :::
*/
type: ErrorTypeEnum;
reason: string;
code?: number;
Expand Down
36 changes: 35 additions & 1 deletion packages/real-time-client/models/ErrorTypeEnum.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,50 @@
/**
 * The following are the possible error types:
 *
 * | Error Type | Description |
 * | --- | --- |
 * | `invalid_message` | The message received was not understood. |
 * | `invalid_model` | Unable to use the model for the recognition. This can happen if the language is not supported at all, or is not available for the user. |
 * | `invalid_config` | The config received contains some wrong or unsupported fields, or too many translation target languages were requested. |
 * | `invalid_audio_type` | Audio type is not supported, is deprecated, or the `audio_type` is malformed. |
 * | `invalid_output_format` | Output format is not supported, is deprecated, or the `output_format` is malformed. |
 * | `not_authorised` | User was not recognised, or the API key provided is not valid. |
 * | `insufficient_funds` | User doesn't have enough credits or any other reason preventing the user to be charged for the job properly. |
 * | `not_allowed` | User is not allowed to use this message (is not allowed to perform the action the message would invoke). |
 * | `job_error` | Unable to do any work on this job, the server might have timed out etc. |
 * | `data_error` | Unable to accept the data specified - usually because there is too much data being sent at once |
 * | `buffer_error` | Unable to fit the data in a corresponding buffer. This can happen for clients sending the input data faster than real-time. |
 * | `protocol_error` | Message received was syntactically correct, but could not be accepted due to protocol limitations. This is usually caused by messages sent in the wrong order. |
 * | `quota_exceeded` | Maximum number of concurrent connections allowed for the contract has been reached |
 * | `timelimit_exceeded` | Usage quota for the contract has been reached |
 * | `idle_timeout` | Idle duration limit was reached (no audio data sent within the last hour), a closing handshake with code 1008 follows this in-band error. |
 * | `session_timeout` | Max session duration was reached (maximum session duration of 48 hours), a closing handshake with code 1008 follows this in-band error. |
 * | `session_transfer` | An error while transferring session to another backend with the reason: Session transfer failed. This may occur when moving sessions due to backend maintenance operations or migration from a faulty backend. |
 * | `unknown_error` | An error that did not fit any of the types above. |
 *
 * :::info
 *
 * `invalid_message`, `protocol_error` and `unknown_error` can be triggered as a response to any type of messages.
 *
 * :::
 */
// Bug fix: the union previously listed 'timelimit_exceeded' twice; the
// duplicate member has been removed. Member order now matches the table above.
type ErrorTypeEnum =
  | 'invalid_message'
  | 'invalid_model'
  | 'invalid_config'
  | 'invalid_audio_type'
  | 'invalid_output_format'
  | 'not_authorised'
  | 'insufficient_funds'
  | 'not_allowed'
  | 'job_error'
  | 'data_error'
  | 'buffer_error'
  | 'protocol_error'
  | 'quota_exceeded'
  | 'timelimit_exceeded'
  | 'idle_timeout'
  | 'session_timeout'
  | 'session_transfer'
  | 'unknown_error';
export type { ErrorTypeEnum };
Loading