diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4482bf2..d81d76f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,18 +1,21 @@ --- name: Bug report about: Create a report to help us improve -title: '[BUG] ' -labels: 'bug' -assignees: '' +title: "[BUG] " +labels: "bug" +assignees: "" --- **Which SDK are you using?** + - [ ] speechmatics-rt (Real-Time SDK) - [ ] speechmatics-batch (Batch SDK) -- [ ] Both +- [ ] speechmatics-voice (Voice SDK) +- [ ] All **Package Information** -- **Package Name**: (e.g., speechmatics-rt, speechmatics-batch) + +- **Package Name**: (e.g., speechmatics-rt, speechmatics-batch, speechmatics-voice) - **Package Version**: (e.g., 1.0.0) - **Python Version**: (e.g., 3.9, 3.10, 3.11, 3.12, 3.13) - **Operating System**: (e.g., Windows 10, macOS 12, Ubuntu 20.04) @@ -22,6 +25,7 @@ A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: + 1. 2. diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index b3e00c5..99c47f3 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -1,21 +1,24 @@ --- name: Question about: Ask a question or request support -title: '[QUESTION] ' -labels: 'question' -assignees: '' +title: "[QUESTION] " +labels: "question" +assignees: "" --- **Which SDK is your question about?** + - [ ] speechmatics-rt (Real-Time SDK) - [ ] speechmatics-batch (Batch SDK) -- [ ] Both SDKs +- [ ] speechmatics-voice (Voice SDK) +- [ ] All - [ ] General usage question - [ ] Setup/Installation - [ ] Not sure **Package Information** (if applicable) -- **Package Name**: (e.g., speechmatics-rt, speechmatics-batch) + +- **Package Name**: (e.g., speechmatics-rt, speechmatics-batch, speechmatics-voice) - **Package Version**: (e.g., 1.0.0) - **Python Version**: (e.g., 3.9, 3.10, 3.11, 3.12, 3.13) - **Operating System**: (e.g., Windows 10, macOS 12, Ubuntu 20.04) @@ -35,4 +38,5 @@ What result or behavior are you hoping to achieve? **Related issues** Link any related issues or discussions: + - Related to # diff --git a/.github/RELEASE.md b/.github/RELEASE.md index adccaec..9b6ef51 100644 --- a/.github/RELEASE.md +++ b/.github/RELEASE.md @@ -5,9 +5,11 @@ This document outlines the release process for the Speechmatics Python SDK packa ## Overview The Speechmatics Python SDK repository contains two separate packages: + - `speechmatics-rt` - Real-Time API Client - `speechmatics-batch` - Batch API Client - `speechmatics-flow` - Flow API Client +- `speechmatics-voice` - Voice Agent API Client Each package is released independently with its own versioning and release workflow. 
@@ -16,6 +18,7 @@ Each package is released independently with its own versioning and release workf Before creating a release, ensure the following steps are completed: ### Code Quality + - [ ] All tests pass locally (`make test-all`) - [ ] Linting passes (`make lint-all`) - [ ] Type checking passes (`make type-check-all`) @@ -23,10 +26,12 @@ Before creating a release, ensure the following steps are completed: - [ ] Documentation is up to date ### Version Management + - [ ] Review and update README files if needed - [ ] Verify dependencies are correct in `pyproject.toml` ### Testing + - [ ] Test examples with fresh installations - [ ] Verify environment variables work correctly - [ ] Test error handling scenarios @@ -39,6 +44,7 @@ Before creating a release, ensure the following steps are completed: To release a new version of the RT SDK: 1. **Create a Release Tag** + ```bash git tag rt/v1.0.0 git push origin rt/v1.0.0 @@ -46,6 +52,7 @@ To release a new version of the RT SDK: 2. **Automated Workflow** The `release-rt.yaml` workflow will automatically: + - Extract version from tag (e.g., `rt/v1.0.0` → `1.0.0`) - Run comprehensive tests across Python versions - Update version in `sdk/rt/speechmatics/rt/__init__.py` @@ -63,6 +70,7 @@ To release a new version of the RT SDK: To release a new version of the Batch SDK: 1. **Create a Release Tag** + ```bash git tag batch/v1.0.0 git push origin batch/v1.0.0 @@ -70,6 +78,7 @@ To release a new version of the Batch SDK: 2. **Automated Workflow** The `release-batch.yaml` workflow will automatically: + - Extract version from tag (e.g., `batch/v1.0.0` → `1.0.0`) - Run comprehensive tests across Python versions - Update version in `sdk/batch/speechmatics/batch/__init__.py` @@ -87,6 +96,7 @@ To release a new version of the Batch SDK: To release a new version of the Flow SDK: 1. **Create a Release Tag** + ```bash git tag flow/v1.0.0 git push origin flow/v1.0.0 @@ -94,6 +104,7 @@ To release a new version of the Flow SDK: 2. **Automated Workflow** The `release-flow.yaml` workflow will automatically: + - Extract version from tag (e.g., `flow/v1.0.0` → `1.0.0`) - Run comprehensive tests across Python versions - Update version in `sdk/flow/speechmatics/flow/__init__.py` @@ -106,36 +117,72 @@ To release a new version of the Flow SDK: - Update GitHub release notes - Announce the release +### 4. Voice Agent SDK Release + +To release a new version of the Voice Agent SDK: + +1. **Create a Release Tag** + + ```bash + git tag voice/v1.0.0 + git push origin voice/v1.0.0 + ``` + +2. **Automated Workflow** + The `release-voice.yaml` workflow will automatically: + + - Extract version from tag (e.g., `voice/v1.0.0` → `1.0.0`) + - Run comprehensive tests across Python versions + - Update version in `sdk/voice/speechmatics/voice/__init__.py` + - Build the package + - Publish to PyPI + +3. **Manual Steps After Release** + - Verify the package is available on PyPI + - Test installation: `pip install speechmatics-voice==1.0.0` + - Update GitHub release notes + - Announce the release + ## Version Management ### Version Format + Both packages follow semantic versioning (SemVer): + - `MAJOR.MINOR.PATCH` (e.g., `1.2.3`) - `MAJOR.MINOR.PATCH-beta.N` for beta releases (e.g., `1.2.3-beta.1`) ### Version Update Process + 1. **Development**: Versions remain as `0.0.0` in `__init__.py` files 2. **Release**: GitHub Actions automatically updates the version during release 3. 
**Post-Release**: The updated version remains in the repository ### Tag Naming Convention + - RT SDK: `rt/v{version}` (e.g., `rt/v1.0.0`) - Batch SDK: `batch/v{version}` (e.g., `batch/v1.0.0`) - Flow SDK: `flow/v{version}` (e.g., `flow/v1.0.0`) +- Voice Agent SDK: `voice/v{version}` (e.g., `voice/v1.0.0`) ## Environment Setup ### PyPI Configuration + Both packages are published to PyPI using GitHub Actions with OpenID Connect (OIDC): + - RT SDK: Uses `pypi-rt` environment - Batch SDK: Uses `pypi-batch` environment - Flow SDK: Uses `pypi-flow` environment +- Voice Agent SDK: Uses `pypi-voice` environment ### Required Secrets + No manual secrets are required as the workflows use OIDC for PyPI authentication. ## Testing Matrix Both packages are tested against: + - Python versions: 3.9, 3.10, 3.11, 3.12, 3.13 - Operating system: Ubuntu (latest) diff --git a/.github/workflows/release-voice.yaml b/.github/workflows/release-voice.yaml new file mode 100644 index 0000000..6cd92ac --- /dev/null +++ b/.github/workflows/release-voice.yaml @@ -0,0 +1,90 @@ +name: Release Voice Agent SDK + +on: + push: + tags: + - "voice/v*" + +permissions: + contents: read + id-token: write + +jobs: + extract-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.extract.outputs.version }} + steps: + - name: Extract version from tag + id: extract + run: | + # Extract version from tag (voice/v1.0.0 -> 1.0.0) + VERSION=${GITHUB_REF#refs/tags/voice/v} + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Extracted version: $VERSION" + + test-voice: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Test Voice Agent SDK + run: | + make install-dev + make lint-voice + make test-voice + + release-build: + runs-on: ubuntu-latest + needs: [extract-version, test-voice] + outputs: + version: ${{ needs.extract-version.outputs.version }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Update package version in sdk/voice/speechmatics/voice/__init__.py + run: | + VERSION="${{ needs.extract-version.outputs.version }}" + sed -i "s/0\.0\.0/$VERSION/g" ./sdk/voice/speechmatics/voice/__init__.py + echo "Updated version to: $VERSION" + cat ./sdk/voice/speechmatics/voice/__init__.py | grep __version__ + + - name: Build Voice Agent SDK + run: | + make install-dev + make build-voice + + - name: Upload dist + uses: actions/upload-artifact@v4 + with: + name: voice-release-dist + path: sdk/voice/dist/ + + pypi-publish: + runs-on: ubuntu-latest + needs: [release-build] + environment: + name: pypi-voice + url: https://pypi.org/project/speechmatics-voice/${{ needs.release-build.outputs.version }} + + steps: + - name: Retrieve release dist + uses: actions/download-artifact@v4 + with: + name: voice-release-dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + packages-dir: dist/ + password: ${{ secrets.PYPI_ORG_TOKEN }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b0ca76c..49d61fe 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -69,3 +69,23 @@ jobs: run: make test-flow - name: Build Flow SDK run: make build-flow + + test-voice: + name: Test Voice Agent SDK + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + steps: + - 
uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: make install-dev + - name: Lint Voice Agent SDK + run: make lint-voice + - name: Test Voice Agent SDK + run: make test-voice + - name: Build Voice Agent SDK + run: make build-voice diff --git a/.gitignore b/.gitignore index f930361..27b6686 100644 --- a/.gitignore +++ b/.gitignore @@ -50,27 +50,12 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +.benchmarks/ # Translations *.mo *.pot -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - # PyBuilder .pybuilder/ target/ @@ -172,5 +157,16 @@ cython_debug/ # PyPI configuration file .pypirc +# Temporary files / directories +.tmp/ +tmp/ + +# Model caches +.models/ +.cache/ +*.onnx +*.pkf +.claude + # Examples -**/output.wav \ No newline at end of file +**/output.wav diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e98d541..b5277c6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,6 +50,7 @@ repos: - id: bandit files: ^sdk/.*/speechmatics/ args: [-r, -f, json, -ll] + additional_dependencies: ["pbr"] - repo: https://github.com/pre-commit/pygrep-hooks rev: v1.10.0 diff --git a/Makefile b/Makefile index 7f69ce2..a34fd4d 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,13 @@ # Makefile for Speechmatics Python SDKs .PHONY: help -.PHONY: test-all test-rt test-batch test-flow test-tts -.PHONY: format-all format-rt format-batch format-flow format-tts -.PHONY: lint-all lint-rt lint-batch lint-flow lint-tts -.PHONY: type-check-all type-check-rt type-check-batch type-check-flow type-check-tts -.PHONY: build-all build-rt build-batch build-flow build-tts -.PHONY: clean-all clean-rt clean-batch clean-flow clean-tts +.PHONY: test-all test-rt test-batch test-flow test-tts test-voice +.PHONY: format-all format-rt format-batch format-flow format-tts format-voice +.PHONY: lint-all lint-rt lint-batch lint-flow lint-tts lint-voice +.PHONY: type-check-all type-check-rt type-check-batch type-check-flow type-check-tts type-check-voice +.PHONY: build-all build-rt build-batch build-flow build-tts build-voice +.PHONY: clean-all clean-rt clean-batch clean-flow clean-tts clean-voice + help: @echo "Available commands:" @@ -16,24 +17,32 @@ help: @echo " test-rt Run tests for RT SDK" @echo " test-batch Run tests for Batch SDK" @echo " test-flow Run tests for Flow SDK" + @echo " test-tts Run tests for TTS SDK" + @echo " test-voice Run tests for Voice Agent SDK" @echo "" @echo "Code formatting:" @echo " format-all Auto-fix formatting for all SDKs" @echo " format-rt Auto-fix formatting for RT SDK" @echo " format-batch Auto-fix formatting for Batch SDK" @echo " format-flow Auto-fix formatting for Flow SDK" + @echo " format-tts Auto-fix formatting for TTS SDK" + @echo " format-voice Auto-fix formatting for Voice Agent SDK" @echo "" @echo "Linting:" @echo " lint-all Run linting for all SDKs" @echo " lint-rt Run linting for RT SDK" @echo " lint-batch Run linting for Batch SDK" @echo " lint-flow Run linting for Flow SDK" + @echo " lint-tts Run linting for TTS SDK" + @echo " lint-voice Run linting for Voice Agent SDK" @echo "" @echo "Type checking:" @echo " type-check-all Run type checking for all SDKs" @echo " type-check-rt Run type checking for RT SDK" @echo " type-check-batch Run type checking for Batch SDK" @echo " type-check-flow Run type checking 
for Flow SDK" + @echo " type-check-tts Run type checking for TTS SDK" + @echo " type-check-voice Run type checking for Voice Agent SDK" @echo "" @echo "Building:" @echo " build-all Build all SDKs" @@ -41,6 +50,7 @@ help: @echo " build-batch Build Batch SDK" @echo " build-flow Build Flow SDK" @echo " build-tts Build TTS SDK" + @echo " build-voice Build Voice Agent SDK" @echo "" @echo "Cleaning:" @echo " clean-all Clean all SDKs" @@ -48,22 +58,28 @@ help: @echo " clean-batch Clean Batch SDK build artifacts" @echo " clean-flow Clean Flow SDK build artifacts" @echo " clean-tts Clean TTS SDK build artifacts" + @echo " clean-voice Clean Voice Agent SDK build artifacts" @echo "" # Testing targets -test-all: test-rt test-batch test-flow test-tts - +test-all: test-rt test-batch test-flow test-tts test-voice test-rt: - pytest tests/rt/ -v + pytest tests/rt/ -v -s test-batch: - pytest tests/batch/ -v + pytest tests/batch/ -v -s test-flow: - pytest tests/flow/ -v + pytest tests/flow/ -v -s + +test-tts: + pytest tests/tts/ -v -s + +test-voice: + pytest tests/voice/ -v -s # Formatting targets -format-all: format-rt format-batch format-flow format-tts +format-all: format-rt format-batch format-flow format-tts format-voice format-tests format-examples format-rt: cd sdk/rt/speechmatics && black . @@ -81,8 +97,20 @@ format-tts: cd sdk/tts/speechmatics && black . cd sdk/tts/speechmatics && ruff check --fix . +format-voice: + cd sdk/voice/speechmatics && black . + cd sdk/voice/speechmatics && ruff check --fix . + +format-tests: + cd tests && black . + cd tests && ruff check --fix . + +format-examples: + cd examples && black . + cd examples && ruff check --fix . + # Linting targets -lint-all: lint-rt lint-batch lint-flow lint-tts +lint-all: lint-rt lint-batch lint-flow lint-tts lint-voice lint-rt: cd sdk/rt/speechmatics && ruff check . @@ -96,9 +124,11 @@ lint-flow: lint-tts: cd sdk/tts/speechmatics && ruff check . -# Type checking targets -type-check-all: type-check-rt type-check-batch type-check-flow type-check-tts +lint-voice: + cd sdk/voice/speechmatics && ruff check . +# Type checking targets +type-check-all: type-check-rt type-check-batch type-check-flow type-check-tts type-check-voice type-check-rt: cd sdk/rt/speechmatics && mypy . @@ -111,6 +141,9 @@ type-check-flow: type-check-tts: cd sdk/tts/speechmatics && mypy . +type-check-voice: + cd sdk/voice/speechmatics && mypy . 
+ # Installation targets install-dev: python -m pip install --upgrade pip @@ -118,12 +151,13 @@ install-dev: python -m pip install -e sdk/batch[dev] python -m pip install -e sdk/flow[dev] python -m pip install -e sdk/tts[dev] + python -m pip install -e sdk/voice[dev,smart] install-build: python -m pip install --upgrade build # Building targets -build-all: build-rt build-batch build-flow build-tts +build-all: build-rt build-batch build-flow build-tts build-voice build-rt: install-build cd sdk/rt && python -m build @@ -137,9 +171,11 @@ build-flow: install-build build-tts: install-build cd sdk/tts && python -m build -# Cleaning targets -clean-all: clean-rt clean-batch clean-flow clean-tts +build-voice: install-build + cd sdk/voice && python -m build +# Cleaning targets +clean-all: clean-rt clean-batch clean-flow clean-tts clean-voice clean-test clean-examples clean-rt: rm -rf sdk/rt/dist sdk/rt/build sdk/rt/*.egg-info find sdk/rt -name __pycache__ -exec rm -rf {} + 2>/dev/null || true @@ -154,4 +190,16 @@ clean-flow: clean-tts: rm -rf sdk/tts/dist sdk/tts/build sdk/tts/*.egg-info - find sdk/tts -name __pycache__ -exec rm -rf {} + 2>/dev/null || true \ No newline at end of file + find sdk/tts -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + +clean-voice: + rm -rf sdk/voice/dist sdk/voice/build sdk/voice/*.egg-info + find sdk/voice -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + +clean-test: + find tests -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + rm -rf .pytest_cache + rm -rf .mypy_cache + +clean-examples: + find examples -name __pycache__ -exec rm -rf {} + 2>/dev/null || true diff --git a/README.md b/README.md index 4d08531..5b866f5 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,26 @@ An async Python client for Speechmatics Flow API. pip install speechmatics-flow ``` +### (Beta) Voice Agent Client (`speechmatics-voice`) + +A Voice Agent Python client for Speechmatics Real-Time API. + +```bash +# Standard installation +pip install speechmatics-voice + +# With SMART_TURN (ML-based turn detection) +pip install speechmatics-voice[smart] +``` + +### (Beta) TTS Client (`speechmatics-tts`) + +An async Python client for Speechmatics TTS API. 
+ +```bash +pip install speechmatics-tts +``` + ## Development ### Repository Structure @@ -52,11 +72,21 @@ speechmatics-python-sdk/ │ ├── flow/ │ │ ├── pyproject.toml │ │ └── README.md +│ │ +│ ├── voice/ +│ │ ├── pyproject.toml +│ │ └── README.md +│ │ +│ ├── tts/ +│ │ ├── pyproject.toml +│ │ └── README.md │ ├── tests/ │ ├── batch/ -│ └── rt/ -│ └── flow/ +│ ├── rt/ +│ ├── flow/ +│ ├── voice/ +│ └── tts/ │ ├── examples/ ├── Makefile @@ -97,6 +127,8 @@ Each package can be installed separately: pip install speechmatics-rt pip install speechmatics-batch pip install speechmatics-flow +pip install speechmatics-voice[smart] +pip install speechmatics-tts ``` ## Docs diff --git a/examples/batch/transcribe_file_async.py b/examples/batch/transcribe_file_async.py index 2450886..7217e7f 100644 --- a/examples/batch/transcribe_file_async.py +++ b/examples/batch/transcribe_file_async.py @@ -5,17 +5,13 @@ import asyncio import os -from speechmatics.batch import ( - AsyncClient, - JobConfig, - JobType, - Transcript, - TranscriptionConfig, -) +from speechmatics.batch import AsyncClient +from speechmatics.batch import JobConfig +from speechmatics.batch import JobType +from speechmatics.batch import Transcript +from speechmatics.batch import TranscriptionConfig -audio_file = os.getenv( - "AUDIO_FILE_PATH", os.path.join(os.path.dirname(__file__), "../example.wav") -) +audio_file = os.getenv("AUDIO_FILE_PATH", os.path.join(os.path.dirname(__file__), "../example1.wav")) async def main() -> None: @@ -29,9 +25,7 @@ async def main() -> None: # Submit transcription job config = JobConfig( type=JobType.TRANSCRIPTION, - transcription_config=TranscriptionConfig( - language="en", enable_entities=True - ), + transcription_config=TranscriptionConfig(language="en", enable_entities=True), ) job = await client.submit_job(audio_file, config=config) diff --git a/examples/example.wav b/examples/example1.wav similarity index 100% rename from examples/example.wav rename to examples/example1.wav diff --git a/examples/example2.wav b/examples/example2.wav new file mode 100644 index 0000000..6720939 --- /dev/null +++ b/examples/example2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c744f7e3f3d45fc1d64000670da99c223bcdec348cbf014385eaf98b0913016f +size 901352 diff --git a/examples/example3.wav b/examples/example3.wav new file mode 100644 index 0000000..6a3536e --- /dev/null +++ b/examples/example3.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d214377882dd8a0472d44df3f9853f7cfb488ef429427efa03b7f2b5b6b7c37 +size 184964 diff --git a/examples/example4.wav b/examples/example4.wav new file mode 100644 index 0000000..21c5cff --- /dev/null +++ b/examples/example4.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96425f4f5f1d4db7b6dceadd99df889143e0ce00f0cbfd4b48a05335e4b43c62 +size 472300 diff --git a/examples/rt/async/file/main.py b/examples/rt/async/file/main.py index b117777..d3c41f0 100644 --- a/examples/rt/async/file/main.py +++ b/examples/rt/async/file/main.py @@ -1,6 +1,7 @@ - import asyncio -from speechmatics.rt import AsyncClient, ServerMessageType + +from speechmatics.rt import AsyncClient +from speechmatics.rt import ServerMessageType async def main(): @@ -12,8 +13,9 @@ def handle_final_transcript(msg): print(f"Final: {msg['metadata']['transcript']}") # Transcribe audio file - with open("./examples/example.wav", "rb") as audio_file: + with open("./examples/example1.wav", "rb") as audio_file: await client.transcribe(audio_file) + # Run the async 
function asyncio.run(main()) diff --git a/examples/voice/README.md b/examples/voice/README.md new file mode 100644 index 0000000..3c9c43e --- /dev/null +++ b/examples/voice/README.md @@ -0,0 +1,48 @@ +# Speechmatics Voice Agent SDK Examples + +This directory contains practical demonstrations of the Speechmatics Voice Agent SDK for real-time speech-to-text applications. + +### Setup + +Install the Voice SDK to use the examples. Using a virtual environment is recommended. + +```shell +# Voice examples +cd + +# Create a virtual environment +python -m venv .venv + +# Activate : macOS / Ubuntu / Debian +source .venv/bin/activate + +# Activate : Windows +.venv\Scripts\activate + +# Update pip +python -m pip install --upgrade pip +``` + +**PyAudio installation:** + +For macOS and Linux you need to install the `portaudio` package. + +```shell +# macOS +brew install portaudio + +# Ubuntu/Debian +sudo apt-get install portaudio19-dev +``` + +**Package dependencies:** + +Install dependencies from the project root. + +```shell +# Voice SDK +python -m pip install -e 'sdk/voice[dev,smart]' + +# Required audio package for examples +python -m pip install pyaudio +``` diff --git a/examples/voice/cli/README.md b/examples/voice/cli/README.md new file mode 100644 index 0000000..0a036a2 --- /dev/null +++ b/examples/voice/cli/README.md @@ -0,0 +1,136 @@ +# Transcription CLI with Speaker Diarization + +Real-time transcription tool using the Speechmatics Voice SDK. Supports microphone input and audio file streaming with speaker diarization. + +## Quick Start + +**Microphone:** +```bash +python cli.py -p -k YOUR_API_KEY +``` + +**Audio file:** +```bash +python cli.py -p -k YOUR_API_KEY -i audio.wav +``` + +Press `CTRL+C` to stop. + +## Requirements + +- Speechmatics API key from the [portal](https://portal.speechmatics.com/) +- Install dependencies: see [examples README](../README.md) + +## Options + +### Core + +- `-k, --api-key` - API key (defaults to `SPEECHMATICS_API_KEY` env var) +- `-u, --url` - Server URL (defaults to `SPEECHMATICS_RT_URL` env var) +- `-i, --input-file` - Audio file path (WAV, mono 16-bit). 
Uses microphone if not specified +- `-c, --config` - JSON config string or file path (overrides other Voice Agent options) + +### Output + +- `-p, --pretty` - Formatted console output with colors +- `-o, --output-file` - Save output to JSONL file +- `-v, --verbose` - Increase verbosity (can repeat: `-v`, `-vv`, `-vvv`, `-vvvv`, `-vvvvv`) + - `-v` - Add speaker VAD events + - `-vv` - Add turn predictions + - `-vvv` - Add segment annotations + - `-vvvv` - Add metrics + - `-vvvvv` - Add STT events +- `-L, --legacy` - Show only legacy transcript messages +- `--results` - Include word-level results in segments + +### Audio + +- `--sample-rate` - Sample rate in Hz (default: 16000) +- `--chunk-size` - Chunk size in bytes (default: 320) +- `-M, --mute` - Mute audio playback for file input +- `-D, --default-device` - Use default audio device (skip selection) + +### Voice Agent Config + +- `-l, --language` - Language code (default: en) +- `-d, --max-delay` - Max transcription delay in seconds (default: 0.7) +- `-t, --end-of-utterance-silence-trigger` - Silence duration for turn end (default: 0.5) +- `-m, --end-of-utterance-mode` - Turn detection mode: `FIXED`, `ADAPTIVE`, `SMART_TURN`, or `EXTERNAL` +- `-e, --emit-sentences` - Emit sentence-level segments +- `--forced-eou` - Enable forced end of utterance + +### Speaker Management + +- `-f, --focus-speakers` - Speakers to focus on (e.g., `S1 S2`) +- `-I, --ignore-speakers` - Speakers to ignore (e.g., `S1 S2`) +- `-x, --ignore-mode` - Use ignore mode (instead of retain) for focus speakers + +### Speaker Identification + +- `-E, --enrol` - Enrol speakers and output identifiers at end +- `-s, --speakers` - Known speakers JSON string or file path + +## Examples + +**Basic microphone:** +```bash +python cli.py -k YOUR_KEY -p +``` + +**Audio file:** +```bash +python cli.py -k YOUR_KEY -i audio.wav -p +``` + +**Audio file (muted):** +```bash +python cli.py -k YOUR_KEY -i audio.wav -Mp +``` + +**Save output:** +```bash +python cli.py -k YOUR_KEY -o output.jsonl -p +``` + +**Verbose logging:** +```bash +python cli.py -k YOUR_KEY -vv -p +``` + +**Focus on speakers:** +```bash +python cli.py -k YOUR_KEY -f S1 S2 -p +``` + +**Enrol speakers:** +```bash +python cli.py -k YOUR_KEY -Ep +``` +Press `CTRL+C` when done to see speaker identifiers. + +**Use known speakers:** +```bash +python cli.py -k YOUR_KEY -s speakers.json -p +``` + +Example `speakers.json`: +```json +[ + {"label": "Alice", "speaker_identifiers": ["XX...XX"]}, + {"label": "Bob", "speaker_identifiers": ["YY...YY"]} +] +``` + +**Custom config:** +```bash +python cli.py -k YOUR_KEY -c config.json -p +``` + +## Notes + +- Speaker identifiers are encrypted and unique to your API key +- Allow speakers to say at least 20 words before enrolling +- Avoid labels `S1`, `S2` (reserved by engine) +- Labels like `__XXX__` are automatically ignored + +See the [Speechmatics documentation](https://docs.speechmatics.com/speech-to-text/realtime/realtime-speaker-identification) for more details. diff --git a/examples/voice/cli/cli.py b/examples/voice/cli/cli.py new file mode 100644 index 0000000..a581c9c --- /dev/null +++ b/examples/voice/cli/cli.py @@ -0,0 +1,771 @@ +"""Transcription CLI with Speaker Diarization. + +Command-line tool for real-time transcription using the Speechmatics Voice SDK. +Supports both microphone input and audio file streaming with speaker diarization. 
+""" + +import argparse +import asyncio +import datetime +import json +import os +import wave +from pathlib import Path +from typing import Any + +from utils import AudioPlayer +from utils import select_audio_device +from utils import select_audio_output_device + +from speechmatics.rt import ClientMessageType +from speechmatics.rt import Microphone +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SpeakerFocusConfig +from speechmatics.voice import SpeakerFocusMode +from speechmatics.voice import SpeakerIdentifier +from speechmatics.voice import SpeechSegmentConfig +from speechmatics.voice import VoiceAgentClient +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._models import TranscriptionUpdatePreset + +# ============================================================================== +# CONSTANTS +# ============================================================================== + +COLORS = { + # Segments + "AddPartialSegment": "\033[93m", + "AddSegment": "\033[1;92m", + # Speaker events + "SpeakerStarted": "\033[94m", + "SpeakerEnded": "\033[94m", + "SpeakersResult": "\033[95m", + "SpeakerMetrics": "\033[96m", + # End of turn + "StartOfTurn": "\033[91m", + "EndOfTurnPrediction": "\033[95m", + "EndOfTurn": "\033[1;91m", + # Transcript events + "AddPartialTranscript": "\033[90m", + "AddTranscript": "\033[90m", + "EndOfUtterance": "\033[90m", +} + + +# ============================================================================== +# MAIN ENTRY POINT +# ============================================================================== + + +async def main() -> None: + """Run the transcription CLI.""" + + # Parse the command line arguments + args = parse_args() + + # Setup audio source (microphone or file) + audio_source = setup_audio_source(args) + if not audio_source: + return + + # Setup audio output (for file playback) + audio_player = setup_audio_output(audio_source, args) + + # Remove JSONL output file if it already exists + if args.output_file and os.path.exists(args.output_file): + os.remove(args.output_file) + + # Create speaker configuration + speaker_config = create_speaker_config(args) + + # Known speakers + known_speakers: list[SpeakerIdentifier] = [SpeakerIdentifier(**s) for s in args.speakers] if args.speakers else [] + + # Use JSON config + if args.config is not None: + try: + config = VoiceAgentConfig.model_validate(args.config) + except Exception as e: + print(f"Error validating config: {e}") + return + + # Create Voice Agent configuration + else: + config = VoiceAgentConfig( + language=args.language or "en", + end_of_utterance_silence_trigger=args.end_of_utterance_silence_trigger or 0.5, + max_delay=args.max_delay or 0.7, + end_of_utterance_mode=( + args.end_of_utterance_mode.lower() if args.end_of_utterance_mode else EndOfUtteranceMode.ADAPTIVE + ), + speaker_config=speaker_config, + use_forced_eou_message=args.forced_eou, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), + ], + known_speakers=known_speakers, + speech_segment_config=SpeechSegmentConfig( + emit_sentences=args.emit_sentences, + ), + transcription_update_preset=TranscriptionUpdatePreset.COMPLETE_PLUS_TIMING, + include_results=args.results, + ) + + # Display instructions + if audio_source["type"] == "file": + print("\nStreaming audio file... 
(Press CTRL+C to stop)\n") + else: + print("\nMicrophone ready - speak now... (Press CTRL+C to stop)\n") + + # Set common items + config.enable_diarization = True + config.sample_rate = audio_source["sample_rate"] + + # Create Voice Agent client + client = VoiceAgentClient(api_key=args.api_key, url=args.url, config=config) + + # Setup event handlers + start_time = datetime.datetime.now() + register_event_handlers(client, args, start_time) + + # Connect to the Voice Agent service + try: + await client.connect() + except Exception: + print("Error connecting to Voice Agent service") + return + + # Request speaker IDs at the end of the session (if enrolling) + if args.enrol: + await client.send_message({"message": ClientMessageType.GET_SPEAKERS, "final": True}) + + # Stream audio + try: + await stream_audio(audio_source, audio_player, client, args.chunk_size) + except asyncio.CancelledError: + pass + finally: + if audio_player: + audio_player.stop() + await client.disconnect() + + +# ============================================================================== +# AUDIO SOURCE SETUP +# ============================================================================== + + +def setup_audio_source(args) -> dict | None: + """Setup audio source (microphone or file). + + Returns: + Dictionary with audio source information or None on error. + """ + if args.input_file: + return setup_file_source(args) + else: + return setup_microphone_source(args) + + +def setup_file_source(args) -> dict | None: + """Setup audio file source. + + Returns: + Dictionary with file information or None on error. + """ + audio_file_path = Path(args.input_file) + if not audio_file_path.exists(): + print(f"Error: Audio file not found: {audio_file_path}") + return None + + # Load and validate the audio file + try: + with wave.open(str(audio_file_path), "rb") as wav_file: + sample_rate = wav_file.getframerate() + channels = wav_file.getnchannels() + sample_width = wav_file.getsampwidth() + frames = wav_file.getnframes() + duration = frames / sample_rate + + if channels != 1: + print("Error: Only mono audio files are supported") + return None + if sample_width != 2: + print("Error: Only 16-bit audio files are supported") + return None + + print(f"Loading: {audio_file_path.name} ({duration:.1f}s, {sample_rate}Hz)") + + return { + "type": "file", + "path": audio_file_path, + "sample_rate": sample_rate, + "channels": channels, + "sample_width": sample_width, + } + + except (wave.Error, ValueError) as e: + print(f"Error loading audio file: {e}") + return None + + +def setup_microphone_source(args) -> dict | None: + """Setup microphone source. + + Returns: + Dictionary with microphone information or None on error. + """ + + if not args.default_device: + print("\nSelect microphone input device:") + selected_device = select_audio_device() + else: + selected_device = None + + mic = Microphone( + sample_rate=args.sample_rate or None, + chunk_size=args.chunk_size, + device_index=selected_device, + ) + + if not mic.start(): + print("Error: PyAudio not available - install with: pip install pyaudio") + return None + + return { + "type": "microphone", + "mic": mic, + "sample_rate": args.sample_rate, + } + + +# ============================================================================== +# AUDIO OUTPUT SETUP +# ============================================================================== + + +def setup_audio_output(audio_source: dict, args) -> AudioPlayer | None: + """Setup audio output for file playback. 
+ + Args: + audio_source: Audio source information + args: Command-line arguments + + Returns: + AudioPlayer instance or None if not needed/available. + """ + # Only setup audio output for file sources + if audio_source["type"] != "file": + return None + + # Skip audio output if muted + if args.mute: + print("\nAudio playback muted - transcription only") + return None + + if not args.default_device: + print("\nSelect audio output device for playback:") + output_device = select_audio_output_device() + else: + output_device = None + + audio_player = AudioPlayer( + sample_rate=audio_source["sample_rate"], + channels=audio_source["channels"], + sample_width=audio_source["sample_width"], + device_index=output_device, + ) + + if not audio_player.start(): + print("Warning: Audio playback unavailable - continuing with transcription only") + return None + + return audio_player + + +# ============================================================================== +# SPEAKER CONFIGURATION +# ============================================================================== + + +def create_speaker_config(args) -> SpeakerFocusConfig: + """Create speaker diarization configuration from arguments. + + Args: + args: Command-line arguments + + Returns: + SpeakerFocusConfig instance. + """ + if args.focus_speakers or args.ignore_speakers: + focus_mode = SpeakerFocusMode.IGNORE if args.ignore_mode else SpeakerFocusMode.RETAIN + return SpeakerFocusConfig( + focus_speakers=args.focus_speakers or [], + ignore_speakers=args.ignore_speakers or [], + focus_mode=focus_mode, + ) + else: + return SpeakerFocusConfig() + + +# ============================================================================== +# EVENT HANDLERS +# ============================================================================== + + +def register_event_handlers(client: VoiceAgentClient, args, start_time: datetime.datetime) -> None: + """Register event handlers for transcription events. 
+ + Args: + client: Voice Agent client + args: Command-line arguments + start_time: Start time for timestamp calculation + """ + + def console_print(ts: datetime.datetime, message: dict) -> None: + """Print message to console with optional formatting.""" + if not args.pretty: + print(json.dumps(message)) + return + + # Extract common data + ts_str = ts.strftime("%H:%M:%S") + f".{ts.microsecond // 1000:03d}" + msg_type = message["message"] + color = COLORS.get(msg_type, "") + payload = message + + # Handle segment messages + if msg_type in ("AddPartialSegment", "AddSegment"): + _segs = [] + for segment in message["segments"]: + suffix = "" if segment["is_active"] else " (background)" + if args.verbose >= 3: + _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}` {segment['annotation']}") + else: + _segs.append(f"@{segment['speaker_id']}{suffix}: `{segment['text']}`") + payload = {"segments": _segs} + + # Print to console + print(f"{color}{ts_str} {msg_type:<24} {json.dumps(payload)}\033[0m") + + def log_message(message: dict[str, Any]) -> None: + """Log message to console and optional JSONL file.""" + now = datetime.datetime.now() + console_print(now, message) + if args.output_file: + ts_str = now.strftime("%Y-%m-%d %H:%M:%S") + f".{now.microsecond // 1000:03d}" + with open(args.output_file, "a") as f: + f.write(json.dumps({"ts": ts_str, **message}) + "\n") + + # Register standard handlers + client.on(AgentServerMessageType.INFO, log_message) + client.on(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.on(AgentServerMessageType.END_OF_TRANSCRIPT, log_message) + + # Voice SDK messages + if not args.legacy: + # Segment messages + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + client.on(AgentServerMessageType.START_OF_TURN, log_message) + client.on(AgentServerMessageType.END_OF_TURN, log_message) + client.on(AgentServerMessageType.SPEAKERS_RESULT, log_message) + + # Verbose VAD events + if args.verbose >= 1: + client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) + client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) + + # Verbose turn prediction + if args.verbose >= 2: + client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message) + + # Metrics + if args.verbose >= 4: + client.on(AgentServerMessageType.SESSION_METRICS, log_message) + client.on(AgentServerMessageType.SPEAKER_METRICS, log_message) + + # Verbose STT events + if args.verbose >= 5: + client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) + client.on("ForcedEndOfUtterance", log_message) + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) + client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message) + + # Legacy messages + else: + client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) + client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message) + + # Log the config + if args.verbose >= 1: + log_message( + { + "message": "VoiceAgentClientConfig", + "config": client._config.model_dump(exclude_none=True, exclude_unset=True), + } + ) + + +# ============================================================================== +# AUDIO STREAMING +# ============================================================================== + + +async def stream_audio( + audio_source: dict, + audio_player: AudioPlayer | None, + client: VoiceAgentClient, + chunk_size: int, +) -> None: + """Stream audio from 
source to client. + + Args: + audio_source: Audio source information + audio_player: Audio player for file playback (optional) + client: Voice Agent client + chunk_size: Audio chunk size in bytes + """ + if audio_source["type"] == "file": + await stream_file(audio_source, audio_player, client, chunk_size) + else: + await stream_microphone(audio_source, client, chunk_size) + + +async def stream_file( + audio_source: dict, + audio_player: AudioPlayer | None, + client: VoiceAgentClient, + chunk_size: int, +) -> None: + """Stream audio file with real-time pacing. + + Uses absolute timing to prevent audio crackling when processing takes longer + than expected. This ensures consistent playback timing regardless of + transcription processing delays. + + Args: + audio_source: Audio source information + audio_player: Audio player for playback (optional) + client: Voice Agent client + chunk_size: Audio chunk size in bytes + """ + file_path = audio_source["path"] + sample_rate = audio_source["sample_rate"] + chunk_duration = chunk_size / sample_rate + + # Use absolute timing to prevent drift + start_time = asyncio.get_event_loop().time() + chunk_count = 0 + + with wave.open(str(file_path), "rb") as wav_file: + while True: + audio_data = wav_file.readframes(chunk_size) + if not audio_data: + break + + # Send to transcription (non-blocking) + asyncio.create_task(client.send_audio(audio_data)) + + # Play audio (blocking to maintain timing) + if audio_player: + audio_player.play(audio_data) + + # Calculate next chunk time based on absolute timing + chunk_count += 1 + next_chunk_time = start_time + (chunk_count * chunk_duration) + current_time = asyncio.get_event_loop().time() + sleep_duration = next_chunk_time - current_time + + # Only sleep if we're ahead of schedule + if sleep_duration > 0: + await asyncio.sleep(sleep_duration) + + +async def stream_microphone( + audio_source: dict, + client: VoiceAgentClient, + chunk_size: int, +) -> None: + """Stream microphone audio to client. + + Args: + audio_source: Audio source information + client: Voice Agent client + chunk_size: Audio chunk size in bytes + """ + mic = audio_source["mic"] + while True: + frame = await mic.read(chunk_size) + await client.send_audio(frame) + + +# ============================================================================== +# COMMAND-LINE ARGUMENT PARSING +# ============================================================================== + + +def load_json(value: str): + """Load JSON string or file path. + + Args: + value: Either a JSON string or path to a JSON file + + Returns: + Parsed json object + + Raises: + argparse.ArgumentTypeError: If the value cannot be parsed + """ + # First, try to parse as JSON string + try: + return json.loads(value) + except json.JSONDecodeError: + pass + + # If that fails, try to load as a file path + try: + file_path = Path(value) + if file_path.exists() and file_path.is_file(): + with open(file_path) as f: + return json.load(f) + else: + raise argparse.ArgumentTypeError(f"File not found: {value}") + except Exception as e: + raise argparse.ArgumentTypeError(f"Could not parse as JSON or load from file: {value}. Error: {e}") + + +def parse_args(): + """Parse command-line arguments. + + Returns: + Parsed arguments namespace. 
+ """ + parser = argparse.ArgumentParser( + description="Transcription CLI with speaker diarization - supports microphone or audio file input", + epilog="Example: python main.py -k YOUR_KEY -i audio.wav -p", + ) + + # ============================================================================== + # Core parameters + # ============================================================================== + + parser.add_argument( + "-k", + "--api-key", + default=os.getenv("SPEECHMATICS_API_KEY"), + help="Speechmatics API key (defaults to SPEECHMATICS_API_KEY environment variable)", + ) + parser.add_argument( + "-u", + "--url", + default=os.getenv("SPEECHMATICS_RT_URL"), + help="Speechmatics server URL (optional)", + ) + + # ============================================================================== + # Audio source + # ============================================================================== + + parser.add_argument( + "-i", + "--input-file", + type=str, + help="Path to input audio file (WAV format, mono 16-bit). If not provided, uses microphone", + ) + + # ============================================================================== + # Audio configuration + # ============================================================================== + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="Audio sample rate in Hz (default: 16000)", + ) + parser.add_argument( + "--chunk-size", + type=int, + default=320, + help="Audio chunk size in bytes (default: 320)", + ) + parser.add_argument( + "-M", + "--mute", + action="store_true", + help="Mute audio playback for file input (default: False)", + ) + + # ============================================================================== + # Output configuration + # ============================================================================== + + parser.add_argument( + "-o", + "--output-file", + type=str, + help="Output to a JSONL file", + ) + parser.add_argument( + "-p", + "--pretty", + action="store_true", + help="Pretty print console output (default: False)", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase logging verbosity (-v: add speaker VAD events, -vv: add END_OF_TURN_PREDICTION, -vvv: add additional payloads)", + ) + parser.add_argument( + "-L", + "--legacy", + action="store_true", + help="Only show payloads from AsyncClient (AddPartialTranscript | AddTranscript) (default: False)", + ) + parser.add_argument( + "-D", + "--default-device", + action="store_true", + help="Use default device (default: False)", + ) + parser.add_argument( + "--results", + action="store_true", + help="Include word transcription payload results in output (default: False)", + ) + + # ============================================================================== + # Voice Agent configuration + # ============================================================================== + + parser.add_argument( + "-c", + "--config", + type=load_json, + help="Config JSON string or path to JSON file (default: None)", + ) + parser.add_argument( + "-l", + "--language", + type=str, + help="Language code (default: en)", + ) + parser.add_argument( + "-d", + "--max-delay", + type=float, + help="Maximum delay for transcription results in seconds (default: 0.7)", + ) + parser.add_argument( + "-t", + "--end-of-utterance-silence-trigger", + type=float, + help="Silence duration to trigger end of utterance in seconds (default: 0.5)", + ) + parser.add_argument( + "-m", + "--end-of-utterance-mode", + type=lambda s: s.upper(), + 
choices=["FIXED", "ADAPTIVE", "EXTERNAL", "SMART_TURN"], + help="End of utterance detection mode (default: ADAPTIVE)", + ) + parser.add_argument( + "-e", + "--emit-sentences", + action="store_true", + help="Emit sentences (default: False)", + ) + + # ============================================================================== + # Speaker configuration + # ============================================================================== + + parser.add_argument( + "-f", + "--focus-speakers", + nargs="*", + help="Speakers to focus on (e.g., S1 S2). Use with --ignore-mode to ignore these speakers instead", + ) + parser.add_argument( + "-I", + "--ignore-speakers", + nargs="*", + help="Specific speakers to ignore (e.g., S1 S2)", + ) + parser.add_argument( + "-x", + "--ignore-mode", + action="store_true", + help="Use IGNORE mode instead of RETAIN mode for non-focus speakers", + ) + + # ============================================================================== + # Speaker identification + # ============================================================================== + + parser.add_argument( + "-E", + "--enrol", + action="store_true", + help="Enrol a speaker (default: False)", + ) + parser.add_argument( + "-s", + "--speakers", + type=load_json, + help="Known speakers as JSON string or path to JSON file (default: None)", + ) + parser.add_argument( + "--forced-eou", + action="store_true", + help="Use forced end of utterance (default: False)", + ) + + # ============================================================================== + # Check for mutually exclusive options + # ============================================================================== + + args = parser.parse_args() + + mutually_excludive = [ + "emit-sentences", + "end-of-utterance-mode", + "end-of-utterance-silence-trigger", + "focus-speakers", + "ignore-mode", + "ignore-speakers", + "language", + "max-delay", + "forced-eou", + "speakers", + ] + + if args.config is not None: + conflicts: list[str] = [] + for arg in mutually_excludive: + if getattr(args, arg.replace("-", "_")): + conflicts.append(arg) + if conflicts: + print(f"**ERROR** -> You cannot use {[f'--{arg}' for arg in conflicts]} in combination with -c/--config") + exit(1) + + # Return the parsed arguments + return args + + +# ============================================================================== +# ENTRY POINT +# ============================================================================== + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\nCLI utility stopped by user") diff --git a/examples/voice/cli/utils.py b/examples/voice/cli/utils.py new file mode 100644 index 0000000..a22a61d --- /dev/null +++ b/examples/voice/cli/utils.py @@ -0,0 +1,371 @@ +"""Utility functions and classes for the Speechmatics Voice CLI. + +This module provides: +- Audio device selection (input/output) +- Audio playback functionality +- Custom logging with colour support +- Helper functions for async operations +""" + +import asyncio +import logging +import sys + +import pyaudio + +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient + +if sys.platform == "win32": + pass +else: + import termios + import tty + + +# ============================================================================== +# ASYNC UTILITIES +# ============================================================================== + + +async def wait_for_keypress() -> None: + """Wait for any key press in a non-blocking way. 
+ + Note: Unix/Mac only. + """ + loop = asyncio.get_event_loop() + + def _read_key(): + """Read a single key press.""" + fd = sys.stdin.fileno() + old_settings = termios.tcgetattr(fd) + try: + tty.setraw(fd) + ch = sys.stdin.read(1) + return ch + finally: + termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + + # Run the blocking key read in a thread pool + await loop.run_in_executor(None, _read_key) + + +# ============================================================================== +# AUDIO DEVICE SELECTION +# ============================================================================== + + +def select_audio_device() -> int | None: + """Interactive microphone device selection. + + Displays available input devices and prompts user to select one. + + Returns: + Device index or None for default device. + """ + devices = Microphone.list_devices() + if not devices: + return None + + print("Available microphones:") + for device in devices: + print(f" [{device['index']}] {device['name']} ({device['channels']} channels)") + print() + + return get_device_choice( + [d["index"] for d in devices], + "Enter device index (or press Enter for default): ", + ) + + +def select_audio_output_device() -> int | None: + """Interactive audio output device selection. + + Displays available output devices and prompts user to select one. + + Returns: + Device index or None for default device. + """ + try: + output_devices = get_output_devices() + if not output_devices: + print("No audio output devices found.") + return None + + print("Available audio output devices:") + for device in output_devices: + print(f" [{device['index']}] {device['name']} ({device['channels']} channels, {device['sample_rate']}Hz)") + print() + + return get_device_choice( + [d["index"] for d in output_devices], + "Enter output device index (or press Enter for default): ", + ) + + except Exception as e: + print(f"Error listing audio devices: {e}") + return None + + +def get_output_devices() -> list[dict]: + """Get list of available output devices. + + Returns: + List of dictionaries containing device information. + """ + p = pyaudio.PyAudio() + try: + devices = [] + for i in range(p.get_device_count()): + info = p.get_device_info_by_index(i) + if info["maxOutputChannels"] > 0: + devices.append( + { + "index": i, + "name": info["name"], + "channels": info["maxOutputChannels"], + "sample_rate": int(info["defaultSampleRate"]), + } + ) + return devices + finally: + p.terminate() + + +def get_device_choice(valid_indices: list[int], prompt: str) -> int | None: + """Get user device choice with validation. + + Args: + valid_indices: List of valid device indices. + prompt: Prompt message to display. + + Returns: + Selected device index or None for default. + """ + while True: + try: + choice = input(prompt).strip() + if not choice: + return None + + device_index = int(choice) + if device_index in valid_indices: + return device_index + + print(f"Invalid device index. Choose from: {valid_indices}") + + except ValueError: + print("Please enter a valid number.") + except KeyboardInterrupt: + return None + + +# ============================================================================== +# MICROPHONE UTILITIES +# ============================================================================== + + +def setup_microphone(sample_rate: int, chunk_size: int) -> Microphone | None: + """Setup microphone with device selection. + + Args: + sample_rate: Audio sample rate in Hz. + chunk_size: Audio chunk size in bytes. 
+ + Returns: + Microphone instance or None on error. + """ + selected_device = select_audio_device() + + mic = Microphone( + sample_rate=sample_rate, + chunk_size=chunk_size, + device_index=selected_device, + ) + + if not mic.start(): + print("Error: PyAudio not available - install with: pip install pyaudio") + return None + return mic + + +async def stream_microphone(mic: Microphone, client: VoiceAgentClient, chunk_size: int) -> None: + """Stream microphone audio to client. + + Args: + mic: Microphone instance. + client: Voice Agent client. + chunk_size: Audio chunk size in bytes. + """ + while True: + frame = await mic.read(chunk_size) + await client.send_audio(frame) + + +# ============================================================================== +# AUDIO PLAYBACK +# ============================================================================== + + +class AudioPlayer: + """Real-time audio player using PyAudio. + + Provides synchronized audio playback for file streaming with transcription. + """ + + def __init__( + self, + sample_rate: int, + channels: int = 1, + sample_width: int = 2, + device_index: int | None = None, + ): + """Initialize audio player. + + Args: + sample_rate: Audio sample rate in Hz. + channels: Number of audio channels (1 for mono). + sample_width: Sample width in bytes (2 for 16-bit). + device_index: Output device index (None for default). + """ + self.sample_rate = sample_rate + self.channels = channels + self.sample_width = sample_width + self.device_index = device_index + self.p = None + self.stream = None + + def start(self) -> bool: + """Start audio playback stream. + + Returns: + True if successful, False otherwise. + """ + try: + self.p = pyaudio.PyAudio() + if not self.p: + return False + + audio_format = self._get_audio_format() + if not audio_format: + return False + + self.stream = self.p.open( + format=audio_format, + channels=self.channels, + rate=self.sample_rate, + output=True, + output_device_index=self.device_index, + ) + return True + + except Exception as e: + print(f"Error starting audio player: {e}") + return False + + def play(self, audio_data: bytes) -> None: + """Play audio data chunk. + + Args: + audio_data: Raw audio bytes to play. + """ + if self.stream: + self.stream.write(audio_data) + + def stop(self) -> None: + """Stop and cleanup audio player resources.""" + if self.stream: + self.stream.stop_stream() + self.stream.close() + self.stream = None + + if self.p: + self.p.terminate() + self.p = None + + def _get_audio_format(self) -> int | None: + """Get PyAudio format from sample width. + + Returns: + PyAudio format constant or None if unsupported. + """ + format_map = {1: pyaudio.paInt8, 2: pyaudio.paInt16, 4: pyaudio.paInt32} + if self.sample_width not in format_map: + print(f"Unsupported sample width: {self.sample_width}") + return None + return format_map[self.sample_width] + + +# ============================================================================== +# CUSTOM LOGGING +# ============================================================================== + + +class CustomLevels: + """Custom logging levels for transcription events. + + Defines numeric levels for different types of transcription events. + """ + + PARTIAL = 11 # Partial transcription results + FINAL = 12 # Final transcription results + SPEAKER = 15 # Speech activity events + + +class CustomTextFormatter(logging.Formatter): + """Coloured logging formatter for transcription events. + + Applies ANSI colour codes to log messages based on their level. 
+ """ + + FORMAT = "%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s" + + # ANSI colour codes + COLOURS = { + logging.DEBUG: "\033[90m", # Grey + CustomLevels.PARTIAL: "\033[32m", # Green + CustomLevels.FINAL: "\033[33m", # Yellow + CustomLevels.SPEAKER: "\033[36m", # Cyan + } + RESET = "\033[0m" + + def format(self, record): + """Format log record with colour. + + Args: + record: Log record to format. + + Returns: + Formatted and coloured log message. + """ + colour = self.COLOURS.get(record.levelno, self.RESET) + message = super().format(record) + return f"{colour}{message}{self.RESET}\r" + + +def get_logger(name: str) -> logging.Logger: + """Setup coloured logger for transcription events. + + Args: + name: Logger name. + + Returns: + Configured logger instance. + """ + logging.basicConfig(level=logging.INFO, stream=sys.stdout) + + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logging.getLogger("speechmatics.voice").setLevel(logging.DEBUG) + + # Register custom level names + logging.addLevelName(CustomLevels.PARTIAL, "PARTIAL") + logging.addLevelName(CustomLevels.FINAL, "FINAL") + logging.addLevelName(CustomLevels.SPEAKER, "SPEAKER") + + # Apply custom formatter to all handlers + formatter = CustomTextFormatter(CustomTextFormatter.FORMAT, datefmt="%Y-%m-%d %H:%M:%S") + for handler in logging.root.handlers: + handler.setFormatter(formatter) + + return logger diff --git a/examples/voice/requirements-examples.txt b/examples/voice/requirements-examples.txt new file mode 100644 index 0000000..d21a996 --- /dev/null +++ b/examples/voice/requirements-examples.txt @@ -0,0 +1,2 @@ +pyaudio +speechmatics-voice[smart] diff --git a/examples/voice/scribe/README.md b/examples/voice/scribe/README.md new file mode 100644 index 0000000..2b0468f --- /dev/null +++ b/examples/voice/scribe/README.md @@ -0,0 +1,55 @@ +# Ambient Scribe + +Real-time transcription for note-taking and documentation. Uses the default microphone and the SCRIBE preset. + +A custom dictionary can be used to improve accuracy for domain-specific terms. The example `vocab.json` is loaded automatically if present. + +## Quick Start + +```bash +export SPEECHMATICS_API_KEY=your_api_key +python scribe.py +``` + +Press `CTRL+C` to stop. + +## Features + +- Real-time transcription with speaker diarization +- Partial results (yellow) update as speech continues +- Final results (green) shown with timestamps +- Automatically loads custom vocabulary from `vocab.json` if present +- Uses SCRIBE preset (fixed EOU, 1s max delay, sentence emission) + +## Requirements + +- Speechmatics API key from the [portal](https://portal.speechmatics.com/) +- PyAudio: `pip install pyaudio` +- See [examples README](../README.md) for SDK dependencies + +## Output Example + +``` +Microphone ready - speak now... (Press CTRL+C to stop) + +00:00:03 - S1: Hello, how are you today? +00:00:07 - S2: I'm doing great, thanks for asking. +00:00:12 - S1: That's wonderful to hear. + listening ... +``` + +## Custom Vocabulary + +Create `vocab.json` to improve accuracy for domain-specific terms: + +```json +[ + { + "content": "Speechmatics", + "sounds_like": ["speech matics"] + }, + { + "content": "API" + } +] +``` diff --git a/examples/voice/scribe/scribe.py b/examples/voice/scribe/scribe.py new file mode 100644 index 0000000..2b6822c --- /dev/null +++ b/examples/voice/scribe/scribe.py @@ -0,0 +1,162 @@ +"""Simple microphone transcription example. 
+ +This example demonstrates basic real-time transcription with speaker diarization +using the default microphone. It prints partial segments, final segments, and +end-of-turn events. +""" + +import asyncio +import json +import os +from enum import Enum +from pathlib import Path + +from speechmatics.rt import Microphone +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import VoiceAgentClient +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice import VoiceAgentConfigPreset + + +class Color(Enum): + PARTIAL = "\033[93m" + FINAL = "\033[1;92m" + WAITING = "\033[95m" + RESET = "\033[0m" + + +async def main() -> None: + """Run simple microphone transcription.""" + + # Get API key from environment + api_key = os.getenv("SPEECHMATICS_API_KEY") + if not api_key: + print("Error: SPEECHMATICS_API_KEY environment variable not set") + return + + # Setup microphone with default device + mic = Microphone(sample_rate=16000, chunk_size=320) + if not mic.start(): + print("Error: PyAudio not available - install with: pip install pyaudio") + return + + # Load additional vocabulary from vocab.json if it exists + vocab_file = Path(__file__).parent / "vocab.json" + additional_vocab = [] + if vocab_file.exists(): + with open(vocab_file) as f: + vocab_data = json.load(f) + additional_vocab = [ + AdditionalVocabEntry(content=entry["content"], sounds_like=entry.get("sounds_like", [])) + for entry in vocab_data + ] + + # Use the SCRIBE preset with additional vocabulary + config = VoiceAgentConfigPreset.SCRIBE(VoiceAgentConfig(language="en", additional_vocab=additional_vocab)) + + # Create client + client = VoiceAgentClient(api_key=api_key, config=config) + + # Track waiting state + waiting_displayed = False + + # Show listening message + def show_listening(): + """Show listening for audio.""" + nonlocal waiting_displayed + if not waiting_displayed: + print(f"\r\033[K{Color.WAITING.value} listening ... 
{Color.RESET.value}", end="", flush=True) + waiting_displayed = True + + # Format timestamp from start_time (seconds since session start) + def format_time(seconds: float) -> str: + """Format seconds as HH:MM:SS timestamp.""" + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + secs = int(seconds % 60) + return f"{hours:02d}:{minutes:02d}:{secs:02d}" + + # Handle partial segments (interim results) + def on_partial_segment(message): + """Print partial segment(s) as yellow.""" + + # Clear waiting message + nonlocal waiting_displayed + waiting_displayed = False + + # Segments + segments = message.get("segments", []) + if not segments: + return + + # Get metadata start_time + metadata = message.get("metadata", {}) + start_time = metadata.get("start_time", 0) + timestamp = format_time(start_time) + + # Move to beginning of line, clear it, and print yellow partial with timestamp + for segment in segments: + print( + f"\r\033[K{Color.PARTIAL.value}{timestamp} - {segment['speaker_id']}: {segment['text']}{Color.RESET.value}", + end="", + flush=True, + ) + + # Handle final segments + def on_segment(message): + """Print final segment(s) as green.""" + + # Segments + segments = message.get("segments", []) + if not segments: + return + + # Get metadata start_time + metadata = message.get("metadata", {}) + start_time = metadata.get("start_time", 0) + timestamp = format_time(start_time) + + # Clear line, print green final with timestamp, then newline + for segment in segments: + print( + f"\r\033[K{Color.FINAL.value}{timestamp} - {segment['speaker_id']}: {segment['text']}{Color.RESET.value}", + flush=True, + ) + + # Show listening message + show_listening() + + # Register event handlers + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, on_partial_segment) + client.on(AgentServerMessageType.ADD_SEGMENT, on_segment) + + # Instructions + print("\nMicrophone ready - speak now... (Press CTRL+C to stop)\n") + + # Connect to the service + await client.connect() + + # Show initial listening message + show_listening() + + # Stream audio from microphone + async def stream_audio(): + while True: + audio_chunk = await mic.read(320) + await client.send_audio(audio_chunk) + + # Run until interrupted + try: + await stream_audio() + except KeyboardInterrupt: + print("\n\nStopping...") + except asyncio.CancelledError: + pass + + # Disconnect + await client.disconnect() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/voice/scribe/vocab.json b/examples/voice/scribe/vocab.json new file mode 100644 index 0000000..aca0da4 --- /dev/null +++ b/examples/voice/scribe/vocab.json @@ -0,0 +1,16 @@ +[ + { + "content": "Speechmatics", + "sounds_like": ["speech matics", "speech mattics"] + }, + { + "content": "API" + }, + { + "content": "WebSocket" + }, + { + "content": "OAuth", + "sounds_like": ["oh auth", "o auth"] + } +] diff --git a/examples/voice/simple/README.md b/examples/voice/simple/README.md new file mode 100644 index 0000000..3c1fee3 --- /dev/null +++ b/examples/voice/simple/README.md @@ -0,0 +1,68 @@ +# Simple Microphone Transcription + +Basic real-time transcription using the default microphone with speaker diarization. + +## Quick Start + +```bash +export SPEECHMATICS_API_KEY=your_api_key +python simple.py +``` + +Press `CTRL+C` to stop. 
+ +## Features + +- Uses default microphone +- Real-time transcription with speaker diarization +- Shows partial and final results +- Detects end of turn +- Uses "scribe" preset + +## Requirements + +- Speechmatics API key from the [portal](https://portal.speechmatics.com/) +- PyAudio: `pip install pyaudio` +- See [examples README](../README.md) for SDK dependencies + +## Code Example + +```python +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +# Create client with preset +client = VoiceAgentClient(api_key="YOUR_KEY", preset="scribe") + +# Register event handlers +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + segments = message.get("segments", []) + for segment in segments: + print(f"{segment['speaker_id']}: {segment['text']}") + +# Connect and stream +await client.connect() +mic = Microphone(sample_rate=16000, chunk_size=320) +mic.start() + +while True: + audio_chunk = await mic.read(320) + await client.send_audio(audio_chunk) +``` + +## Output Example + +``` +Microphone ready - speak now... (Press CTRL+C to stop) + +[PARTIAL] S1: Hello +[PARTIAL] S1: Hello how +[PARTIAL] S1: Hello how are +[FINAL] S1: Hello, how are you? +[END OF TURN] +[PARTIAL] S2: I'm +[PARTIAL] S2: I'm good +[FINAL] S2: I'm good, thanks! +[END OF TURN] +``` diff --git a/examples/voice/simple/simple.py b/examples/voice/simple/simple.py new file mode 100644 index 0000000..aa0afe9 --- /dev/null +++ b/examples/voice/simple/simple.py @@ -0,0 +1,78 @@ +"""Simple microphone transcription example. + +This example demonstrates basic real-time transcription with speaker diarization +using the default microphone. It prints partial segments, final segments, and +end-of-turn events. +""" + +import asyncio +import os + +from speechmatics.rt import Microphone +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import VoiceAgentClient + + +async def main() -> None: + """Run simple microphone transcription.""" + + # Get API key from environment + api_key = os.getenv("SPEECHMATICS_API_KEY") + if not api_key: + print("Error: SPEECHMATICS_API_KEY environment variable not set") + return + + # Setup microphone with default device + mic = Microphone(sample_rate=16000, chunk_size=320) + if not mic.start(): + print("Error: PyAudio not available - install with: pip install pyaudio") + return + + # Create client + client = VoiceAgentClient(api_key=api_key, preset="scribe") + + # Handle partial segments (interim results) + @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) + def on_partial_segment(message): + segments = message.get("segments", []) + for segment in segments: + print(f"[PARTIAL] {segment['speaker_id']}: {segment['text']}") + + # Handle final segments + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + segments = message.get("segments", []) + for segment in segments: + print(f"[FINAL] {segment['speaker_id']}: {segment['text']}") + + # Handle end of turn + @client.on(AgentServerMessageType.END_OF_TURN) + def on_end_of_turn(message): + print("[END OF TURN]") + + # Instructions + print("\nMicrophone ready - speak now... 
(Press CTRL+C to stop)\n") + + # Connect to the service + await client.connect() + + # Stream audio from microphone + async def stream_audio(): + while True: + audio_chunk = await mic.read(320) + await client.send_audio(audio_chunk) + + # Run until interrupted + try: + await stream_audio() + except KeyboardInterrupt: + print("\n\nStopping...") + except asyncio.CancelledError: + pass + + # Disconnect + await client.disconnect() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/batch/pyproject.toml b/sdk/batch/pyproject.toml index 7c10a8f..7d5f9ea 100644 --- a/sdk/batch/pyproject.toml +++ b/sdk/batch/pyproject.toml @@ -50,3 +50,7 @@ version = { attr = "speechmatics.batch.__version__" } [tool.setuptools.packages.find] where = ["."] + +[[tool.mypy.overrides]] +module = ["aiofiles.*"] +ignore_missing_imports = true diff --git a/sdk/rt/pyproject.toml b/sdk/rt/pyproject.toml index eca9c4c..ab70780 100644 --- a/sdk/rt/pyproject.toml +++ b/sdk/rt/pyproject.toml @@ -56,3 +56,7 @@ version = { attr = "speechmatics.rt.__version__" } [tool.setuptools.packages.find] where = ["."] + +[[tool.mypy.overrides]] +module = ["pyaudio.*"] +ignore_missing_imports = true diff --git a/sdk/tts/speechmatics/tts/__init__.py b/sdk/tts/speechmatics/tts/__init__.py index 68e44e5..e3b4a39 100644 --- a/sdk/tts/speechmatics/tts/__init__.py +++ b/sdk/tts/speechmatics/tts/__init__.py @@ -26,4 +26,4 @@ "ConnectionConfig", "Voice", "OutputFormat", -] \ No newline at end of file +] diff --git a/sdk/voice/README.md b/sdk/voice/README.md new file mode 100644 index 0000000..4c8085e --- /dev/null +++ b/sdk/voice/README.md @@ -0,0 +1,779 @@ +# Speechmatics Voice SDK + +[![PyPI](https://img.shields.io/pypi/v/speechmatics-voice)](https://pypi.org/project/speechmatics-voice/) +![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green) + +Python SDK for building voice-enabled applications with the Speechmatics Real-Time API. Optimized for conversational AI, voice agents, transcription services, and real-time captioning. + +## What is the Voice SDK? + +The Voice SDK is a higher-level abstraction built on top of the Speechmatics Real-Time API (`speechmatics-rt`). While the Real-Time SDK provides raw transcription events (words and utterances), the Voice SDK adds: + +- **Intelligent Segmentation** - Groups words into meaningful speech segments per speaker +- **Turn Detection** - Automatically detects when speakers finish their turns using adaptive or ML-based methods +- **Speaker Management** - Focus on or ignore specific speakers in multi-speaker scenarios +- **Preset Configurations** - Ready-to-use configs for common use cases (conversation, note-taking, captions) +- **Simplified Event Handling** - Receive clean, structured segments instead of raw word-level events + +### When to Use Voice SDK vs Real-Time SDK + +**Use Voice SDK when:** + +- Building conversational AI or voice agents +- You need automatic turn detection +- You want speaker-focused transcription +- You need ready-to-use presets for common scenarios + +**Use Real-Time SDK when:** + +- You need raw word-level events +- Building custom segmentation logic +- You want fine-grained control over every event +- Processing batch files or custom workflows + +## Installation + +```bash +# Standard installation +pip install speechmatics-voice + +# With SMART_TURN (ML-based turn detection) +pip install speechmatics-voice[smart] +``` + +> **Note:** `SMART_TURN` requires additional ML dependencies (ONNX runtime, transformers). 
If not installed, it automatically falls back to `ADAPTIVE` mode. + +## Quick Start + +### Basic Example + +```python +import asyncio +import os +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +async def main(): + # Create client with preset + client = VoiceAgentClient( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + preset="scribe" + ) + + # Handle final segments + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + print(f"{speaker}: {text}") + + # Setup microphone + mic = Microphone(sample_rate=16000, chunk_size=320) + if not mic.start(): + print("Error: Microphone not available") + return + + # Connect and stream + await client.connect() + + try: + while True: + audio_chunk = await mic.read(320) + await client.send_audio(audio_chunk) + except KeyboardInterrupt: + pass + finally: + await client.disconnect() + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Using Presets + +Presets provide optimized configurations for common use cases: + +```python +# Scribe preset - for note-taking +client = VoiceAgentClient(api_key=api_key, preset="scribe") + +# Low latency preset - for fast responses +client = VoiceAgentClient(api_key=api_key, preset="low_latency") + +# Conversation preset - for natural dialogue +client = VoiceAgentClient(api_key=api_key, preset="conversation_adaptive") + +# Advanced conversation with ML turn detection +client = VoiceAgentClient(api_key=api_key, preset="conversation_smart_turn") + +# Captions preset - for live captioning +client = VoiceAgentClient(api_key=api_key, preset="captions") +``` + +### Custom Configuration + +```python +from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig, EndOfUtteranceMode + +config = VoiceAgentConfig( + language="en", + enable_diarization=True, + max_delay=0.7, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, +) + +client = VoiceAgentClient(api_key=api_key, config=config) +``` + +## Configuration + +### Basic Parameters + +**`language`** (str, default: `"en"`) +Language code for transcription (e.g., `"en"`, `"es"`, `"fr"`). See [supported languages](https://docs.speechmatics.com/speech-to-text/languages). + +**`operating_point`** (OperatingPoint, default: `ENHANCED`) +Balance accuracy vs latency. Options: `STANDARD` or `ENHANCED`. + +**`domain`** (str, default: `None`) +Domain-specific model (e.g., `"finance"`, `"medical"`). See [supported languages and domains](https://docs.speechmatics.com/speech-to-text/languages). + +**`output_locale`** (str, default: `None`) +Output locale for formatting (e.g., `"en-GB"`, `"en-US"`). See [supported languages and locales](https://docs.speechmatics.com/speech-to-text/languages). + +**`enable_diarization`** (bool, default: `False`) +Enable speaker diarization to identify and label different speakers. + +### Turn Detection Parameters + +**`end_of_utterance_mode`** (EndOfUtteranceMode, default: `FIXED`) +Controls how turn endings are detected: + +- **`FIXED`** - Uses fixed silence threshold. Fast but may split slow speech. +- **`ADAPTIVE`** - Adjusts delay based on speech rate, pauses, and disfluencies. Best for natural conversation. +- **`SMART_TURN`** - Uses ML model to detect acoustic turn-taking cues. Requires `[smart]` extras. +- **`EXTERNAL`** - Manual control via `client.finalize()`. For custom turn logic. 
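+
+As a minimal sketch, `SMART_TURN` can also be requested on a custom config rather than via the `conversation_smart_turn` preset. If the `[smart]` extras or the ML model are unavailable, the client logs a warning and falls back to `ADAPTIVE`:
+
+```python
+from speechmatics.voice import EndOfUtteranceMode, VoiceAgentClient, VoiceAgentConfig
+
+# ML-based turn detection; requires: pip install speechmatics-voice[smart]
+config = VoiceAgentConfig(
+    language="en",
+    end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+)
+client = VoiceAgentClient(api_key="YOUR_KEY", config=config)
+```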
+ +**`end_of_utterance_silence_trigger`** (float, default: `0.2`) +Silence duration in seconds to trigger turn end. + +**`end_of_utterance_max_delay`** (float, default: `10.0`) +Maximum delay before forcing turn end. + +**`max_delay`** (float, default: `0.7`) +Maximum transcription delay for word emission. + +### Speaker Configuration + +**`speaker_sensitivity`** (float, default: `0.5`) +Diarization sensitivity between 0.0 and 1.0. Higher values detect more speakers. + +**`max_speakers`** (int, default: `None`) +Limit maximum number of speakers to detect. + +**`prefer_current_speaker`** (bool, default: `False`) +Give extra weight to current speaker for word grouping. + +**`speaker_config`** (SpeakerFocusConfig, default: `SpeakerFocusConfig()`) +Configure speaker focus/ignore rules. + +```python +from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode + +# Focus only on specific speakers +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + focus_speakers=["S1", "S2"], + focus_mode=SpeakerFocusMode.RETAIN + ) +) + +# Ignore specific speakers +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + ignore_speakers=["S3"], + focus_mode=SpeakerFocusMode.IGNORE + ) +) +``` + +**`known_speakers`** (list[SpeakerIdentifier], default: `[]`) +Pre-enrolled speaker identifiers for speaker identification. + +```python +from speechmatics.voice import SpeakerIdentifier + +config = VoiceAgentConfig( + enable_diarization=True, + known_speakers=[ + SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]), + SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"]) + ] +) +``` + +### Language & Vocabulary + +**`additional_vocab`** (list[AdditionalVocabEntry], default: `[]`) +Custom vocabulary for domain-specific terms. + +```python +from speechmatics.voice import AdditionalVocabEntry + +config = VoiceAgentConfig( + language="en", + additional_vocab=[ + AdditionalVocabEntry( + content="Speechmatics", + sounds_like=["speech matters", "speech matics"] + ), + AdditionalVocabEntry(content="API"), + ] +) +``` + +**`punctuation_overrides`** (dict, default: `None`) +Custom punctuation rules. + +### Audio Parameters + +**`sample_rate`** (int, default: `16000`) +Audio sample rate in Hz. + +**`audio_encoding`** (AudioEncoding, default: `PCM_S16LE`) +Audio encoding format. + +### Advanced Parameters + +**`transcription_update_preset`** (TranscriptionUpdatePreset, default: `COMPLETE`) +Controls when to emit updates: `COMPLETE`, `COMPLETE_PLUS_TIMING`, `WORDS`, `WORDS_PLUS_TIMING`, or `TIMING`. + +**`speech_segment_config`** (SpeechSegmentConfig, default: `SpeechSegmentConfig()`) +Fine-tune segment generation and post-processing. + +**`smart_turn_config`** (SmartTurnConfig, default: `None`) +Configure SMART_TURN behavior (buffer length, threshold). + +**`include_results`** (bool, default: `False`) +Include word-level timing data in segments. + +**`include_partials`** (bool, default: `True`) +Emit partial segments. Set to `False` for final-only output. 
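+
+As an illustrative sketch (the values are placeholders, not recommended defaults), several of the parameters above can be combined on a single `VoiceAgentConfig`, for example to receive final-only segments that carry word-level timing:
+
+```python
+from speechmatics.voice import VoiceAgentConfig
+
+config = VoiceAgentConfig(
+    language="en",
+    enable_diarization=True,
+    max_delay=0.7,                         # word emission latency
+    end_of_utterance_silence_trigger=0.5,  # end a turn after 0.5s of silence (default FIXED mode)
+    include_partials=False,                # emit final segments only
+    include_results=True,                  # include word-level timing data in segments
+)
+```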
+ +### Configuration with Overlays + +Use presets as a starting point and customize with overlays: + +```python +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Use preset with custom overrides +config = VoiceAgentConfigPreset.SCRIBE( + VoiceAgentConfig( + language="es", + max_delay=0.8 + ) +) + +# Available presets +presets = VoiceAgentConfigPreset.list_presets() +# ['low_latency', 'conversation_adaptive', 'conversation_smart_turn', 'scribe', 'captions'] +``` + +### Configuration Serialization + +Export and import configurations as JSON: + +```python +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Export preset to JSON +config_json = VoiceAgentConfigPreset.SCRIBE().to_json() + +# Load from JSON +config = VoiceAgentConfig.from_json(config_json) + +# Or create from JSON string +config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}') +``` + +## Event Messages + +The Voice SDK emits structured events via `AgentServerMessageType`. Register handlers using the `@client.on()` decorator or `client.on()` method. + +> **Note:** The payloads shown below are the actual message payloads from the Voice SDK. When using the CLI example with `--output-file`, messages also include a `ts` timestamp field (e.g., `"ts": "2025-11-11 23:18:35.909"`), which is added by the CLI for logging purposes and is not part of the SDK payload. + +### Core Events + +#### RECOGNITION_STARTED + +Emitted when transcription session starts. Contains session ID and language pack info. + +```python +@client.on(AgentServerMessageType.RECOGNITION_STARTED) +def on_started(message): + session_id = message["id"] + language = message["language_pack_info"]["language_description"] + print(f"Session {session_id} started - Language: {language}") +``` + +**Payload:** + +```json +{ + "message": "RecognitionStarted", + "id": "a8779b0b-a238-43de-8211-c70f5fcbe191", + "orchestrator_version": "2025.08.29127+289170c022.HEAD", + "language_pack_info": { + "language_description": "English", + "word_delimiter": " ", + "writing_direction": "left-to-right", + "itn": true, + "adapted": false + } +} +``` + +#### ADD_PARTIAL_SEGMENT + +Emitted continuously as speech is being processed. Contains interim text that updates in real-time. + +```python +@client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) +def on_partial(message): + for segment in message["segments"]: + print(f"[INTERIM] {segment['speaker_id']}: {segment['text']}") +``` + +**Payload:** + +```json +{ + "message": "AddPartialSegment", + "segments": [ + { + "speaker_id": "S1", + "is_active": true, + "timestamp": "2025-11-11T23:18:37.189+00:00", + "language": "en", + "text": "Welcome to", + "annotation": ["has_partial"], + "metadata": { + "start_time": 1.28, + "end_time": 1.6 + } + } + ], + "metadata": { + "start_time": 1.28, + "end_time": 1.6, + "processing_time": 0.307 + } +} +``` + +**Fields:** + +- `speaker_id` - Speaker label (e.g., `"S1"`, `"S2"`) +- `is_active` - `true` if speaker is in focus (based on `speaker_config`) +- `text` - Current partial transcription text +- `annotation` - Status flags (see annotation section below) +- `metadata.start_time` - Segment start time (seconds since session start) +- `metadata.end_time` - Segment end time (seconds since session start) + +Top-level `metadata` contains the same timing plus `processing_time`. + +#### ADD_SEGMENT + +Emitted when a segment is finalized. Contains stable, final transcription text. 
+ +```python +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + start = message["metadata"]["start_time"] + print(f"[{start:.2f}s] {speaker}: {text}") +``` + +**Payload:** + +```json +{ + "message": "AddSegment", + "segments": [ + { + "speaker_id": "S1", + "is_active": true, + "timestamp": "2025-11-11T23:18:37.189+00:00", + "language": "en", + "text": "Welcome to Speechmatics.", + "annotation": [ + "has_final", + "starts_with_final", + "ends_with_final", + "ends_with_eos", + "ends_with_punctuation" + ], + "metadata": { + "start_time": 1.28, + "end_time": 8.04 + } + } + ], + "metadata": { + "start_time": 1.28, + "end_time": 8.04, + "processing_time": 0.187 + } +} +``` + +**Annotation Flags:** + +- `has_final` - Contains finalized words +- `has_partial` - Contains partial (interim) words +- `starts_with_final` - First word is finalized +- `ends_with_final` - Last word is finalized +- `ends_with_eos` - Ends with end-of-sentence +- `ends_with_punctuation` - Ends with punctuation +- `fast_speaker` - Speaker is speaking quickly (may appear in some segments) +- `has_disfluency` - Contains disfluencies like "um", "er" (may appear in some segments) + +#### END_OF_TURN + +Emitted when a speaker's turn is complete. Timing depends on `end_of_utterance_mode`. + +```python +@client.on(AgentServerMessageType.END_OF_TURN) +def on_turn_end(message): + duration = message["metadata"]["end_time"] - message["metadata"]["start_time"] + print(f"Turn ended (duration: {duration:.2f}s)") +``` + +**Payload:** + +```json +{ + "message": "EndOfTurn", + "turn_id": 0, + "metadata": { + "start_time": 1.28, + "end_time": 8.04 + } +} +``` + +### Speaker Events + +#### SPEAKER_STARTED + +Emitted when a speaker starts speaking (voice activity detected). + +```python +@client.on(AgentServerMessageType.SPEAKER_STARTED) +def on_speaker_start(message): + speaker = message["speaker_id"] + time = message["time"] + print(f"{speaker} started speaking at {time}s") +``` + +**Payload:** + +```json +{ + "message": "SpeakerStarted", + "is_active": true, + "speaker_id": "S1", + "time": 1.28 +} +``` + +#### SPEAKER_ENDED + +Emitted when a speaker stops speaking (silence detected). + +```python +@client.on(AgentServerMessageType.SPEAKER_ENDED) +def on_speaker_end(message): + speaker = message["speaker_id"] + time = message["time"] + print(f"{speaker} stopped speaking at {time}s") +``` + +**Payload:** + +```json +{ + "message": "SpeakerEnded", + "is_active": false, + "speaker_id": "S1", + "time": 2.64 +} +``` + +#### SPEAKERS_RESULT + +Emitted when speaker enrollment completes. + +```python +# Request speaker IDs at end of session +await client.send_message({"message": "GetSpeakers", "final": True}) + +@client.on(AgentServerMessageType.SPEAKERS_RESULT) +def on_speakers(message): + for speaker in message["speakers"]: + print(f"Speaker {speaker['label']}: {speaker['speaker_identifiers']}") +``` + +### Additional Events + +**`START_OF_TURN`** - Emitted at the beginning of a new turn. + +**`END_OF_TURN_PREDICTION`** - Emitted during `ADAPTIVE` or `SMART_TURN` mode to predict turn completion (fires before `END_OF_TURN`). + +**`END_OF_UTTERANCE`** - Low-level STT engine event (fires when silence threshold is reached). + +**`ADD_PARTIAL_TRANSCRIPT` / `ADD_TRANSCRIPT`** - Legacy word-level events from underlying Real-Time API (not typically needed with Voice SDK). 
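+
+These additional events can be observed with the same registration pattern as the core events, using the constants listed above. The sketch below simply logs their raw payloads (the exact fields are not documented here) and assumes a connected `client` as in the examples above:
+
+```python
+from speechmatics.voice import AgentServerMessageType
+
+def log_event(name):
+    """Return a handler that prints the raw payload of the named event."""
+    def handler(message):
+        print(f"[{name}] {message}")
+    return handler
+
+client.on(AgentServerMessageType.START_OF_TURN, log_event("START_OF_TURN"))
+client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_event("END_OF_TURN_PREDICTION"))
+client.on(AgentServerMessageType.END_OF_UTTERANCE, log_event("END_OF_UTTERANCE"))
+```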
+ +## Common Usage Patterns + +### Simple Transcription + +```python +client = VoiceAgentClient(api_key=api_key, preset="scribe") + +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + for segment in message["segments"]: + print(f"{segment['speaker_id']}: {segment['text']}") +``` + +### Conversational AI with Turn Detection + +```python +config = VoiceAgentConfig( + language="en", + enable_diarization=True, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, +) + +client = VoiceAgentClient(api_key=api_key, config=config) + +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + user_text = message["segments"][0]["text"] + # Process user input + +@client.on(AgentServerMessageType.END_OF_TURN) +def on_turn_end(message): + # User finished speaking - generate AI response + pass +``` + +### Live Captions with Timestamps + +```python +client = VoiceAgentClient(api_key=api_key, preset="captions") + +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + start_time = message["metadata"]["start_time"] + for segment in message["segments"]: + print(f"[{start_time:.1f}s] {segment['text']}") +``` + +### Speaker Identification + +```python +from speechmatics.voice import SpeakerIdentifier + +# Use known speakers from previous session +known_speakers = [ + SpeakerIdentifier(label="Alice", speaker_identifiers=["XX...XX"]), + SpeakerIdentifier(label="Bob", speaker_identifiers=["YY...YY"]) +] + +config = VoiceAgentConfig( + enable_diarization=True, + known_speakers=known_speakers +) + +client = VoiceAgentClient(api_key=api_key, config=config) + +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + for segment in message["segments"]: + # Will show "Alice" or "Bob" instead of "S1", "S2" + print(f"{segment['speaker_id']}: {segment['text']}") +``` + +### Manual Turn Control + +```python +config = VoiceAgentConfig( + end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL +) + +client = VoiceAgentClient(api_key=api_key, config=config) + +# Manually trigger turn end +await client.finalize(end_of_turn=True) +``` + +### Focus on Specific Speaker + +```python +from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode + +config = VoiceAgentConfig( + enable_diarization=True, + speaker_config=SpeakerFocusConfig( + focus_speakers=["S1"], # Only emit S1's speech + focus_mode=SpeakerFocusMode.RETAIN + ) +) + +client = VoiceAgentClient(api_key=api_key, config=config) + +@client.on(AgentServerMessageType.ADD_SEGMENT) +def on_segment(message): + # Only S1's segments will appear here + for segment in message["segments"]: + if segment["is_active"]: + print(f"{segment['text']}") + +# Dynamically change focused speaker during session +await client.update_diarization_config( + SpeakerFocusConfig( + focus_speakers=["S2"], # Switch focus to S2 + focus_mode=SpeakerFocusMode.RETAIN + ) +) +``` + +## Environment Variables + +- `SPEECHMATICS_API_KEY` - Your Speechmatics API key (required) +- `SPEECHMATICS_RT_URL` - Custom WebSocket endpoint (optional) +- `SMART_TURN_MODEL_PATH` - Path for SMART_TURN ONNX model cache (optional) +- `SMART_TURN_HF_URL` - Override SMART_TURN model download URL (optional) + +## Examples + +See the `examples/voice/` directory for complete working examples: + +- **`simple/`** - Basic microphone transcription +- **`scribe/`** - Note-taking with custom vocabulary +- **`cli/`** - Full-featured CLI with all options + +## API Reference + +### VoiceAgentClient + +```python +class VoiceAgentClient: + def __init__( + self, + 
auth: Optional[AuthBase] = None, + api_key: Optional[str] = None, + url: Optional[str] = None, + app: Optional[str] = None, + config: Optional[VoiceAgentConfig] = None, + preset: Optional[str] = None + ): + """Create Voice Agent client. + + Args: + auth: Authentication instance (optional) + api_key: Speechmatics API key (defaults to SPEECHMATICS_API_KEY env var) + url: Custom WebSocket URL (defaults to SPEECHMATICS_RT_URL env var) + app: Optional application name for endpoint URL + config: Voice Agent configuration (optional) + preset: Preset name ("scribe", "low_latency", etc.) (optional) + """ + + async def connect(self) -> None: + """Connect to Speechmatics service. + + Establishes WebSocket connection and starts transcription session. + Must be called before sending audio. + """ + + async def disconnect(self) -> None: + """Disconnect from service. + + Closes WebSocket connection and cleans up resources. + """ + + async def send_audio(self, payload: bytes) -> None: + """Send audio data for transcription. + + Args: + payload: Audio data as bytes + """ + + def update_diarization_config(self, config: SpeakerFocusConfig) -> None: + """Update diarization configuration during session. + + Args: + config: New speaker focus configuration + """ + + def finalize(self, end_of_turn: bool = False) -> None: + """Finalize segments and optionally trigger end of turn. + + Args: + end_of_turn: Whether to emit end of turn message (default: False) + """ + + async def send_message(self, message: dict) -> None: + """Send control message to service. + + Args: + message: Control message dictionary + """ + + def on(self, event: AgentServerMessageType, callback: Callable) -> None: + """Register event handler. + + Args: + event: Event type to listen for + callback: Function to call when event occurs + """ + + def once(self, event: AgentServerMessageType, callback: Callable) -> None: + """Register one-time event handler. + + Args: + event: Event type to listen for + callback: Function to call once when event occurs + """ + + def off(self, event: AgentServerMessageType, callback: Callable) -> None: + """Unregister event handler. 
+ + Args: + event: Event type + callback: Function to remove + """ +``` + +## Requirements + +- Python 3.9+ +- Speechmatics API key ([Get one here](https://portal.speechmatics.com/)) + +## Documentation + +- [Speechmatics Documentation](https://docs.speechmatics.com/) +- [Real-Time Quickstart](https://docs.speechmatics.com/speech-to-text/realtime/quickstart) +- [Authentication](https://docs.speechmatics.com/get-started/authentication) + +## License + +[MIT](LICENSE) diff --git a/sdk/voice/pyproject.toml b/sdk/voice/pyproject.toml new file mode 100644 index 0000000..239d395 --- /dev/null +++ b/sdk/voice/pyproject.toml @@ -0,0 +1,76 @@ +[build-system] +requires = ["setuptools>=61.0.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "speechmatics-voice" +dynamic = ["version"] +description = "Speechmatics Voice Agent Python client for Real-Time API" +readme = "README.md" +authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }] +license = "MIT" +requires-python = ">=3.9" +dependencies = [ + "speechmatics-rt>=0.5.1", + "pydantic>=2.10.6,<3", + "numpy>=1.26.4,<3" +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + "Topic :: Multimedia :: Sound/Audio :: Speech", + "Topic :: Software Development :: Libraries :: Python Modules", +] +keywords = [ + "speechmatics", + "conversational-ai", + "voice", + "agents", + "websocket", + "real-time", + "pipecat", + "livekit" +] + +[project.optional-dependencies] +smart = [ + "certifi>=2025.10.5", + "onnxruntime>=1.19.0,<2", + "transformers>=4.57.0,<5", +] +dev = [ + "black", + "ruff", + "mypy", + "pre-commit", + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-mock", + "build", +] + +[project.urls] +homepage = "https://github.com/speechmatics/speechmatics-python-sdk" +documentation = "https://docs.speechmatics.com/" +repository = "https://github.com/speechmatics/speechmatics-python-sdk" +issues = "https://github.com/speechmatics/speechmatics-python-sdk/issues" + +[tool.setuptools.dynamic] +version = { attr = "speechmatics.voice.__version__" } + +[tool.setuptools.package-data] +"speechmatics.voice" = ["py.typed"] + +[tool.setuptools.packages.find] +where = ["."] + +[[tool.mypy.overrides]] +module = ["speechmatics.rt.*", "onnxruntime.*"] +ignore_missing_imports = true diff --git a/sdk/voice/speechmatics/__init__.py b/sdk/voice/speechmatics/__init__.py new file mode 100644 index 0000000..8db66d3 --- /dev/null +++ b/sdk/voice/speechmatics/__init__.py @@ -0,0 +1 @@ +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/voice/speechmatics/voice/__init__.py b/sdk/voice/speechmatics/voice/__init__.py new file mode 100644 index 0000000..6cb66fd --- /dev/null +++ b/sdk/voice/speechmatics/voice/__init__.py @@ -0,0 +1,72 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + + +"""Voice Agents SDK. + +A comprehensive set of utility classes tailored for Voice Agents and +using the Speechmatics Python Real-Time SDK, including the processing of +partial and final transcription from the STT engine into accumulated +transcriptions with flags to indicate changes between messages, etc. 
+""" + +__version__ = "0.0.0" + +from speechmatics.rt import AudioEncoding +from speechmatics.rt import AudioFormat +from speechmatics.rt import ClientMessageType as AgentClientMessageType +from speechmatics.rt import OperatingPoint +from speechmatics.rt import SpeakerDiarizationConfig +from speechmatics.rt import SpeakerIdentifier + +from ._client import VoiceAgentClient +from ._models import AdditionalVocabEntry +from ._models import AgentServerMessageType +from ._models import EndOfTurnConfig +from ._models import EndOfTurnPenaltyItem +from ._models import EndOfUtteranceMode +from ._models import SegmentMessage +from ._models import SessionMetricsMessage +from ._models import SmartTurnConfig +from ._models import SpeakerFocusConfig +from ._models import SpeakerFocusMode +from ._models import SpeakerMetricsMessage +from ._models import SpeechSegmentConfig +from ._models import TurnPredictionMessage +from ._models import TurnStartEndResetMessage +from ._models import VADStatusMessage +from ._models import VoiceAgentConfig +from ._presets import VoiceAgentConfigPreset + +__all__ = [ + "__version__", + # Client + "VoiceAgentClient", + # Config + "AdditionalVocabEntry", + "AudioEncoding", + "AudioFormat", + "EndOfTurnConfig", + "EndOfTurnPenaltyItem", + "EndOfUtteranceMode", + "OperatingPoint", + "SpeakerDiarizationConfig", + "SpeakerFocusConfig", + "SpeakerFocusMode", + "SpeakerIdentifier", + "SmartTurnConfig", + "SpeechSegmentConfig", + "VoiceAgentConfig", + "VoiceAgentConfigPreset", + # Client messages + "AgentClientMessageType", + # Server messages + "AgentServerMessageType", + "SegmentMessage", + "SessionMetricsMessage", + "SpeakerMetricsMessage", + "TurnPredictionMessage", + "TurnStartEndResetMessage", + "VADStatusMessage", +] diff --git a/sdk/voice/speechmatics/voice/_audio.py b/sdk/voice/speechmatics/voice/_audio.py new file mode 100644 index 0000000..486c18e --- /dev/null +++ b/sdk/voice/speechmatics/voice/_audio.py @@ -0,0 +1,238 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import asyncio + +import numpy as np + + +class AudioBuffer: + """Rolling audio buffer. + + A rolling audio buffer that has a set sample_rate, sample_size, + frame_size and total_seconds. As the buffer fills, the oldest + data is removed and the start_time is updated. + + The function get_slice(start_time, end_time) will return a snapshot + of the data between the start_time and end_time. If the start_time is + before the start of the buffer, then the start_time will be set to the + start of the buffer. If the end_time is after the end of the buffer, + then the end_time will be set to the end of the buffer. + + Timing is based on the number of bytes added to the buffer. + + The buffer is thread-safe and can be used from multiple threads, using + asyncio locks to ensure thread safety. + """ + + def __init__(self, sample_rate: int, frame_size: int, sample_width: int = 2, total_seconds: float = 20.0): + """Initialise the audio buffer. + + Args: + sample_rate: The sample rate of the audio. + frame_size: The frame size of the audio. + sample_width: The sample width in bytes (1 or 2). + total_seconds: The total number of seconds to keep in the buffer. 
+ """ + # Store audio format info + self._sample_rate: int = sample_rate + self._sample_width: int = sample_width + self._frame_size: int = frame_size + self._frame_bytes: int = frame_size * sample_width + + # Queue + self._frames: list[bytes] = [] + self._max_frames: int = int(total_seconds * (sample_rate / frame_size)) + self._lock = asyncio.Lock() + + # Under / overflow + self._buffer: bytes = b"" + + # Timing info + self._total_frames: int = 0 + + def _get_time_from_frame(self, frame_index: int) -> float: + """Get the time from a frame index. + + Args: + frame_index: The frame index. + + Returns: + The time in seconds. + """ + return frame_index / (self._sample_rate / self._frame_size) + + def _get_frame_from_time(self, time: float) -> int: + """Get the frame index from a time. + + Uses int() with a small epsilon to handle floating-point precision issues + while maintaining consistent truncation behaviour. + + Args: + time: The time in seconds. + + Returns: + The frame index. + """ + return int(time * (self._sample_rate / self._frame_size) + 1e-9) + + async def put_bytes(self, data: bytes) -> None: + """Add data to the buffer. + + Arbitrary length of bytes to save to buffer. Accumulates until there is + a frame size worth of data, then puts a frame into the buffer. + + Args: + data: The data frame to add to the buffer. + """ + + # If the right length and buffer zero + if len(data) // self._sample_width == self._frame_size and len(self._buffer) == 0: + return await self.put_frame(data) + + # Add to the buffer + self._buffer += data + + # While the buffer is greater than or equal to the frame size + while len(self._buffer) >= self._frame_bytes: + # Get the frame + frame = self._buffer[: self._frame_bytes] + + # Remove the frame from the buffer + self._buffer = self._buffer[self._frame_bytes :] + + # Put the frame into the queue + await self.put_frame(frame) + + async def put_frame(self, data: bytes) -> None: + """Add data to the buffer. + + New data added to the end of the buffer. The oldest data is removed + to maintain the total number of seconds in the buffer. + + Args: + data: The data frame to add to the buffer. + """ + + # Add data to the buffer + async with self._lock: + self._frames.append(data) + self._total_frames += 1 + if len(self._frames) > self._max_frames: + self._frames = self._frames[-self._max_frames :] + + async def get_frames(self, start_time: float, end_time: float, fade_out: float = 0) -> bytes: + """Get a slice of the buffer. + + Get a slice of the buffer between the start_time and end_time. + If the start_time is before the start of the buffer, then the + start_time will be set to the start of the buffer. If the end_time + is after the end of the buffer, then the end_time will be set to + the end of the buffer. + + If a fade out time is specified, then the end of the slice will be + faded out by the specified amount of seconds. + + Args: + start_time: The start time of the slice. + end_time: The end time of the slice. + fade_out: The fade out time in seconds. + + Returns: + The slice of the buffer between the start_time and end_time. 
+ """ + + # Get the slice of the buffer + async with self._lock: + # Get the start and end frame indices (absolute frame numbers) + start_index = self._get_frame_from_time(start_time) + end_index = self._get_frame_from_time(end_time) + + # Calculate the range of frames currently in the buffer + buffer_start_frame = self._total_frames - len(self._frames) + buffer_end_frame = self._total_frames + + # Check if the requested range is entirely outside the buffer + if end_index <= buffer_start_frame or start_index >= buffer_end_frame: + return b"" + + # Clamp the requested range to what's available in the buffer + clamped_start = max(start_index, buffer_start_frame) + clamped_end = min(end_index, buffer_end_frame) + + # Convert absolute frame indices to buffer indices + actual_start_index = clamped_start - buffer_start_frame + actual_end_index = clamped_end - buffer_start_frame + + # Get what frames are available + frames = self._frames[actual_start_index:actual_end_index] + + # Bytes + data = b"".join(frames) + + # Fade out + if fade_out > 0: + data = self._fade_out_audio(data, fade_out=fade_out) + + # Return the joined frames + return data + + def _fade_out_audio(self, data: bytes, fade_out: float = 0.01) -> bytes: + """Apply a fade-out over the final `fade_out` seconds of PCM audio data. + + Args: + data: Raw PCM audio data as bytes. + fade_out: Duration of fade-out in seconds (e.g., 0.01 = 10 ms). + + Returns: + Bytes with fade-out applied. + """ + # Choose dtype + dtype: type[np.signedinteger] + if self._sample_width == 1: + dtype = np.int8 + elif self._sample_width == 2: + dtype = np.int16 + else: + raise ValueError(f"Unsupported sample_width {self._sample_width}: must be 1 or 2") + + # Convert bytes to NumPy array + samples = np.frombuffer(data, dtype=dtype) + + # Number of samples to fade + fade_samples = int(self._sample_rate * fade_out) + if fade_samples <= 0 or fade_samples > len(samples): + return data + + # Linear fade envelope + envelope = np.linspace(1.0, 0.0, fade_samples, endpoint=True) + + # Apply fade + faded = samples.astype(np.float32) + faded[-fade_samples:] *= envelope + + # Convert back to original dtype and bytes + return bytes(faded.astype(dtype).tobytes()) + + async def reset(self) -> None: + """Reset the buffer.""" + async with self._lock: + self._frames = [] + + @property + def total_frames(self) -> int: + """Get the total number of frames added to the buffer.""" + return self._total_frames + + @property + def total_time(self) -> float: + """Get the total time added to the buffer.""" + return self._get_time_from_frame(self._total_frames) + + @property + def size(self) -> int: + """Get the size of the buffer.""" + return len(self._frames) diff --git a/sdk/voice/speechmatics/voice/_client.py b/sdk/voice/speechmatics/voice/_client.py new file mode 100644 index 0000000..e1575b3 --- /dev/null +++ b/sdk/voice/speechmatics/voice/_client.py @@ -0,0 +1,1694 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import asyncio +import datetime +import os +import re +import time +from collections.abc import Awaitable +from typing import Any +from typing import Callable +from typing import Optional +from typing import Union +from urllib.parse import urlencode + +from speechmatics.rt import AsyncClient +from speechmatics.rt import AudioEncoding +from speechmatics.rt import AudioFormat +from speechmatics.rt import AuthBase +from speechmatics.rt import ConversationConfig +from speechmatics.rt import ServerMessageType +from 
speechmatics.rt import SpeakerDiarizationConfig +from speechmatics.rt import SpeakerIdentifier +from speechmatics.rt import TranscriptionConfig +from speechmatics.rt._exceptions import TransportError + +from . import __version__ +from ._audio import AudioBuffer +from ._logging import get_logger +from ._models import AgentServerMessageType +from ._models import AnnotationFlags +from ._models import AnnotationResult +from ._models import BaseMessage +from ._models import ClientSessionInfo +from ._models import EndOfUtteranceMode +from ._models import ErrorMessage +from ._models import LanguagePackInfo +from ._models import MessageTimeMetadata +from ._models import SegmentMessage +from ._models import SegmentMessageSegment +from ._models import SegmentMessageSegmentFragment +from ._models import SessionMetricsMessage +from ._models import SessionSpeaker +from ._models import SpeakerFocusConfig +from ._models import SpeakerFocusMode +from ._models import SpeakerMetricsMessage +from ._models import SpeakerSegment +from ._models import SpeakerSegmentView +from ._models import SpeechFragment +from ._models import TranscriptionUpdatePreset +from ._models import TurnPredictionMessage +from ._models import TurnPredictionMetadata +from ._models import TurnStartEndResetMessage +from ._models import VADStatusMessage +from ._models import VoiceAgentConfig +from ._presets import VoiceAgentConfigPreset +from ._smart_turn import SMART_TURN_INSTALL_HINT +from ._smart_turn import SmartTurnDetector +from ._smart_turn import SmartTurnPredictionResult +from ._turn import TurnTaskProcessor +from ._utils import FragmentUtils + + +class VoiceAgentClient(AsyncClient): + """Voice Agent client. + + This class extends the AsyncClient class from the Speechmatics Real-Time SDK + and provides additional functionality for processing partial and final + transcription from the STT engine into accumulated transcriptions with + flags to indicate changes between messages, etc. + """ + + # ============================================================================ + # INITIALISATION & CONFIGURATION + # ============================================================================ + + def __init__( + self, + auth: Optional[AuthBase] = None, + api_key: Optional[str] = None, + url: Optional[str] = None, + app: Optional[str] = None, + config: Optional[VoiceAgentConfig] = None, + preset: Optional[str] = None, + ): + """Initialize the Voice Agent client. + + Args: + auth: Authentication instance. If not provided, uses StaticKeyAuth + with api_key parameter or SPEECHMATICS_API_KEY environment variable. + api_key: Speechmatics API key. If None, uses SPEECHMATICS_API_KEY env var. + url: REST API endpoint URL. If None, uses SPEECHMATICS_RT_URL env var + or defaults to production endpoint. + app: Optional application name to use in the endpoint URL. + config: Optional voice agent configuration. + preset: Optional voice agent preset. + + Examples: + Recommended - using context manager: + >>> from speechmatics.voice import VoiceAgentClient, VoiceAgentConfig + >>> config = VoiceAgentConfig(language="en") + >>> async with VoiceAgentClient(api_key="your_api_key", config=config) as client: + ... # Client automatically connects and disconnects + ... await client.send_audio(audio_data) + + Using a preset (named): + >>> from speechmatics.voice import VoiceAgentClient + >>> client = VoiceAgentClient( + ... api_key="your_api_key", + ... url="wss://custom.endpoint.com/v2", + ... preset="conversation_adaptive" + ... 
) + + Using a preset (utility class): + >>> from speechmatics.voice import VoiceAgentClient, VoiceAgentConfigPreset + >>> config=VoiceAgentConfigPreset.CONVERSATION_ADAPTIVE() + >>> client = VoiceAgentClient( + ... api_key="your_api_key", + ... url="wss://custom.endpoint.com/v2", + ... config=config + ... ) + + Manual connection management: + >>> client = VoiceAgentClient(api_key="your_api_key", config=config) + >>> await client.connect() + >>> # ... use client ... + >>> await client.disconnect() + + Using environment variables: + >>> import os + >>> os.environ["SPEECHMATICS_API_KEY"] = "your_api_key" + >>> async with VoiceAgentClient(config=VoiceAgentConfig(language="en")) as client: + ... await client.send_audio(audio_data) + + With custom endpoint: + >>> client = VoiceAgentClient( + ... api_key="your_api_key", + ... url="wss://custom.endpoint.com/v2", + ... config=VoiceAgentConfig(language="en") + ... ) + """ + + # Default URL + if not url: + url = os.getenv("SPEECHMATICS_RT_URL") or "wss://eu2.rt.speechmatics.com/v2" + + # Initialize the client + super().__init__(auth=auth, api_key=api_key, url=self._get_endpoint_url(url, app)) + + # Logger + self._logger = get_logger(__name__) + + # ------------------------------------- + # Client Configuration + # ------------------------------------- + + # Check for preset + if preset: + preset_config = VoiceAgentConfigPreset.load(preset) + config = VoiceAgentConfigPreset._merge_configs(preset_config, config) + + # Process the config + self._config, self._transcription_config, self._audio_format = self._prepare_config(config) + + # Connection status + self._is_connected: bool = False + self._is_ready_for_audio: bool = False + self._closing_session: bool = False + + # Session info (updated on session created) + self._client_session: ClientSessionInfo = ClientSessionInfo( + config=self._config, + session_id="NOT_SET", + base_time=datetime.datetime.now(datetime.timezone.utc), + language_pack_info=LanguagePackInfo.model_validate({}), + ) + + # ------------------------------------- + # Transcription Change Filter + # ------------------------------------- + + # Change filter to emit segments + self._change_filter: list[AnnotationFlags] = [ + AnnotationFlags.NEW, + # AnnotationFlags.UPDATED_PARTIALS, + # AnnotationFlags.UPDATED_FINALS, + ] + + # Full text has changed + if self._config.transcription_update_preset == TranscriptionUpdatePreset.COMPLETE: + self._change_filter.append(AnnotationFlags.UPDATED_FULL) + # Full text and timing have changed + elif self._config.transcription_update_preset == TranscriptionUpdatePreset.COMPLETE_PLUS_TIMING: + self._change_filter.append(AnnotationFlags.UPDATED_FULL) + self._change_filter.append(AnnotationFlags.UPDATED_WORD_TIMINGS) + # Word content only has changed + elif self._config.transcription_update_preset == TranscriptionUpdatePreset.WORDS: + self._change_filter.append(AnnotationFlags.UPDATED_STRIPPED) + # Word content and timing have changed + elif self._config.transcription_update_preset == TranscriptionUpdatePreset.WORDS_PLUS_TIMING: + self._change_filter.append(AnnotationFlags.UPDATED_STRIPPED) + self._change_filter.append(AnnotationFlags.UPDATED_WORD_TIMINGS) + # Timing only has changed + elif self._config.transcription_update_preset == TranscriptionUpdatePreset.TIMING: + self._change_filter.append(AnnotationFlags.UPDATED_WORD_TIMINGS) + + # STT message received queue + self._stt_message_queue: asyncio.Queue[Callable[[], Awaitable[None]]] = asyncio.Queue() + self._stt_queue_task: Optional[asyncio.Task] = None 
+ + # ------------------------------------- + # Session Timing + # ------------------------------------- + + self._total_time: float = 0 + self._total_bytes: int = 0 + self._last_ttfb: float = 0 + + # ------------------------------------- + # Segment Tracking + # ------------------------------------- + + self._trim_before_time: float = 0 + self._fragment_idx: int = 0 + self._last_fragment_end_time: float = 0 + self._speech_fragments: list[SpeechFragment] = [] + self._speech_fragments_lock: asyncio.Lock = asyncio.Lock() + self._current_view: Optional[SpeakerSegmentView] = None + self._previous_view: Optional[SpeakerSegmentView] = None + + # ------------------------------------- + # EOU / EOT + # ------------------------------------- + + # Handlers + self._turn_handler: TurnTaskProcessor = TurnTaskProcessor(name="turn_handler", done_callback=self.finalize) + self._smart_turn_detector: Optional[SmartTurnDetector] = None + + # Current turn + self._turn_start_time: Optional[float] = None + self._turn_active: bool = False + + # Start turn detector if SMART_TURN requested + if self._config.end_of_utterance_mode == EndOfUtteranceMode.SMART_TURN: + eou_mode_ok: bool = False + if not SmartTurnDetector.dependencies_available(): + self._logger.warning(SMART_TURN_INSTALL_HINT) + else: + detector = SmartTurnDetector( + auto_init=True, + threshold=self._config.smart_turn_config.smart_turn_threshold, + ) + if detector.model_exists(): + self._smart_turn_detector = detector + self._config.smart_turn_config.audio_buffer_length = 10.0 + eou_mode_ok = True + if not eou_mode_ok: + self._logger.warning("Smart Turn model not available. Falling back to ADAPTIVE.") + self._config.end_of_utterance_mode = EndOfUtteranceMode.ADAPTIVE + + # EOU mode + self._eou_mode: EndOfUtteranceMode = self._config.end_of_utterance_mode + + # Uses fixed EndOfUtterance message + self._uses_fixed_eou: bool = self._eou_mode == EndOfUtteranceMode.FIXED + + # Uses ForceEndOfUtterance message + self._uses_forced_eou: bool = self._eou_mode in [ + EndOfUtteranceMode.ADAPTIVE, + EndOfUtteranceMode.SMART_TURN, + ] + self._forced_eou_active: bool = False + + # ------------------------------------- + # Diarization / Speakers + # ------------------------------------- + + self._session_speakers: dict[str, SessionSpeaker] = {} + self._is_speaking: bool = False + self._current_speaker: Optional[str] = None + self._dz_enabled: bool = self._config.enable_diarization + self._dz_config = self._config.speaker_config + + # ------------------------------------- + # Metrics + # ------------------------------------- + + self._metrics_emitter_interval: float = 5.0 + self._metrics_emitter_task: Optional[asyncio.Task] = None + + # ------------------------------------- + # Audio + # ------------------------------------- + + # Audio sampling info + self._audio_sample_rate: int = self._audio_format.sample_rate + self._audio_sample_width: int = { + AudioEncoding.PCM_F32LE: 4, + AudioEncoding.PCM_S16LE: 2, + }.get(self._audio_format.encoding, 1) + + # Audio buffer + if self._config.smart_turn_config.audio_buffer_length > 0: + self._audio_buffer: AudioBuffer = AudioBuffer( + sample_rate=self._audio_format.sample_rate, + frame_size=self._audio_format.chunk_size, + total_seconds=self._config.smart_turn_config.audio_buffer_length, + ) + + # Register handlers + self._register_event_handlers() + + def _prepare_config( + self, config: Optional[VoiceAgentConfig] = None + ) -> tuple[VoiceAgentConfig, TranscriptionConfig, AudioFormat]: + """Create a formatted STT 
transcription and audio config. + + Creates a transcription config object based on the service parameters. Aligns + with the Speechmatics RT API transcription config. + + Args: + config: Optional VoiceAgentConfig object to process. + + Returns: + A tuple of (VoiceAgentConfig, TranscriptionConfig, AudioFormat). + """ + + # Default config + if config is None: + config = VoiceAgentConfig() + + # Transcription config + transcription_config = TranscriptionConfig( + language=config.language, + domain=config.domain, + output_locale=config.output_locale, + operating_point=config.operating_point, + diarization="speaker" if config.enable_diarization else None, + enable_partials=True, + max_delay=config.max_delay, + max_delay_mode="fixed", + audio_filtering_config={ + "volume_threshold": 0.0, + }, + ) + + # Merge in overrides + if config.advanced_engine_control: + for key, value in config.advanced_engine_control.items(): + setattr(transcription_config, key, value) + + # Additional vocab + if config.additional_vocab: + transcription_config.additional_vocab = [ + { + "content": e.content, + **({"sounds_like": e.sounds_like} if e.sounds_like else {}), + } + for e in config.additional_vocab + ] + + # Diarization + if config.enable_diarization: + # List of known speakers + dz_speakers: list[SpeakerIdentifier] = [] + if config.known_speakers: + dz_speakers.extend( + [ + SpeakerIdentifier(label=s.label, speaker_identifiers=s.speaker_identifiers) + for s in config.known_speakers + ] + ) + + # Diarization config + transcription_config.speaker_diarization_config = SpeakerDiarizationConfig( + speaker_sensitivity=config.speaker_sensitivity, + prefer_current_speaker=config.prefer_current_speaker, + max_speakers=config.max_speakers, + speakers=dz_speakers or None, + ) + + # End of Utterance (for fixed) + if config.end_of_utterance_silence_trigger and config.end_of_utterance_mode == EndOfUtteranceMode.FIXED: + transcription_config.conversation_config = ConversationConfig( + end_of_utterance_silence_trigger=config.end_of_utterance_silence_trigger, + ) + + # Punctuation overrides + if config.punctuation_overrides: + transcription_config.punctuation_overrides = config.punctuation_overrides + + # Configure the audio + audio_format = AudioFormat( + encoding=config.audio_encoding, + sample_rate=config.sample_rate, + chunk_size=320, + ) + + # Return the config objects + return config, transcription_config, audio_format + + # ============================================================================ + # LIFECYCLE METHODS + # ============================================================================ + + async def connect(self) -> None: + """Connect to the Speechmatics API. + + Establishes WebSocket connection and starts the transcription session. + This must be called before sending audio. + + Raises: + Exception: If connection fails. + + Examples: + Manual connection: + >>> client = VoiceAgentClient(api_key="your_api_key", config=config) + >>> await client.connect() + + With event handlers: + >>> @client.on("AddSegment") + ... async def on_segment(message): + ... segments = message["segments"] + ... print(f"Received {len(segments)} segments") + >>> + >>> await client.connect() + + Using context manager (recommended): + >>> async with VoiceAgentClient(api_key="key", config=config) as client: + ... # Client is automatically connected here + ... await client.send_audio(audio_data) + ... 
# Automatically disconnected and cleaned up + """ + + # Check if we are already connected + if self._is_connected: + self._emit_message( + ErrorMessage( + reason="Already connected", + ) + ) + return + + # Update the closing session flag + self._closing_session = False + + # Start the processor task + self._stt_queue_task = asyncio.create_task(self._run_stt_queue()) + + # Connect to API + try: + await self.start_session( + transcription_config=self._transcription_config, + audio_format=self._audio_format, + ) + self._is_connected = True + self._start_metrics_task() + except Exception as e: + self._logger.error(f"Exception: {e}") + raise + + async def __aenter__(self) -> VoiceAgentClient: + """Enter async context manager. + + Automatically connects to the Speechmatics API when entering the context. + + Returns: + The connected VoiceAgentClient instance. + + Examples: + >>> async with VoiceAgentClient(api_key="key", config=config) as client: + ... # Client is already connected here + ... await client.send_audio(audio_data) + """ + await self.connect() + return self + + async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + """Exit async context manager. + + Automatically disconnects and cleans up resources when exiting the context. + + Args: + exc_type: Exception type if an exception occurred. + exc_val: Exception value if an exception occurred. + exc_tb: Exception traceback if an exception occurred. + """ + await self.disconnect() + + async def disconnect(self) -> None: + """Disconnect from the Speechmatics API. + + Closes the WebSocket connection and cleans up resources. + + Examples: + Manual disconnect: + >>> await client.connect() + >>> # ... send audio ... + >>> await client.disconnect() + + Using context manager (automatic): + >>> async with VoiceAgentClient(api_key="key", config=config) as client: + ... # No need to call disconnect() - handled automatically + ... await client.send_audio(audio_data) + """ + + # Check if we are already connected + if not self._is_connected: + return + + # Update the closing session flag + self._closing_session = True + + # Emit final segments + await self._emit_segments(finalize=True) + + # Emit final metrics + self._emit_speaker_metrics() + self._emit_metrics() + + # Stop audio and metrics tasks + self._is_ready_for_audio = False + self._stop_metrics_task() + + # end session + try: + await asyncio.wait_for(self.stop_session(), timeout=5.0) + except Exception as e: + self._logger.error(f"Error closing session: {e}") + finally: + self._is_connected = False + + # Stop end of turn-related tasks + self._turn_handler.cancel_tasks() + + # Stop the STT queue task + if self._stt_queue_task: + self._stt_queue_task.cancel() + try: + await self._stt_queue_task + except asyncio.CancelledError: + pass + self._stt_queue_task = None + + # ============================================================================ + # PUBLIC API METHODS + # ============================================================================ + + async def send_audio(self, payload: bytes) -> None: + """Send an audio frame through the WebSocket. + + Args: + payload: Audio data as bytes. + + Examples: + Sending audio from a file: + >>> import wave + >>> with wave.open("audio.wav", "rb") as wav_file: + ... while True: + ... audio_chunk = wav_file.readframes(320) + ... if not audio_chunk: + ... break + ... 
await client.send_audio(audio_chunk) + + Sending audio from microphone: + >>> import pyaudio + >>> p = pyaudio.PyAudio() + >>> stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True) + >>> while True: + ... audio_data = stream.read(320) + ... await client.send_audio(audio_data) + + With async generator: + >>> async for audio_chunk in audio_stream(): + ... await client.send_audio(audio_chunk) + """ + # Skip if not ready for audio + if not self._is_ready_for_audio: + return + + # Send to the AsyncClient + try: + await super().send_audio(payload) + except TransportError as e: + self._logger.warning(f"Error sending audio: {e}") + self._emit_message( + ErrorMessage( + reason="Transport error - connection being closed", + ) + ) + await self.disconnect() + return + + # Add to audio buffer (use put_bytes to handle variable chunk sizes) + if self._config.smart_turn_config.audio_buffer_length > 0: + await self._audio_buffer.put_bytes(payload) + + # Calculate the time (in seconds) for the payload + if self._audio_format is not None: + self._total_bytes += len(payload) + self._total_time += len(payload) / self._audio_sample_rate / self._audio_sample_width + + def update_diarization_config(self, config: SpeakerFocusConfig) -> None: + """Update the diarization configuration. + + You can update the speakers that needs to be focussed on or ignored during + a session. The new config will overwrite the existing configuration and become + active immediately. + + Args: + config: The new diarization configuration. + + Examples: + Focus on specific speakers: + >>> from speechmatics.voice import SpeakerFocusConfig, SpeakerFocusMode + >>> config = SpeakerFocusConfig( + ... focus_speakers=["speaker_1", "speaker_2"], + ... focus_mode=SpeakerFocusMode.RETAIN + ... ) + >>> client.update_diarization_config(config) + + Ignore specific speakers: + >>> config = SpeakerFocusConfig( + ... ignore_speakers=["speaker_3"], + ... focus_mode=SpeakerFocusMode.IGNORE + ... ) + >>> client.update_diarization_config(config) + + Dynamic speaker management: + >>> # Start with all speakers + >>> await client.connect() + >>> # Later, focus on main speaker + >>> client.update_diarization_config( + ... SpeakerFocusConfig(focus_speakers=["main_speaker"]) + ... ) + """ + self._dz_config = config + + # ============================================================================ + # PUBLIC UTTERANCE / TURN MANAGEMENT + # ============================================================================ + + def finalize(self, end_of_turn: bool = False) -> None: + """Finalize segments. + + This function will emit segments in the buffer without any further checks + on the contents of the segments. + + Args: + end_of_turn: Whether to emit an end of turn message. 
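`finalize()` pairs naturally with `EndOfUtteranceMode.EXTERNAL`, where the application rather than the engine decides when a turn has ended. The sketch below shows a push-to-talk style loop; `audio_chunks_while_button_held` is a hypothetical application helper, and the imported names are assumed to be exported from `speechmatics.voice`.

```python
import asyncio

from speechmatics.voice import EndOfUtteranceMode, VoiceAgentClient, VoiceAgentConfig


async def push_to_talk(audio_chunks_while_button_held) -> None:
    # The engine performs no end-of-turn detection in EXTERNAL mode.
    config = VoiceAgentConfig(
        language="en",
        end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
    )
    async with VoiceAgentClient(config=config) as client:
        # Stream microphone audio for as long as the talk button is held.
        async for chunk in audio_chunks_while_button_held():
            await client.send_audio(chunk)
        # The application decides the turn is over and flushes the buffer.
        client.finalize()
        await asyncio.sleep(1.0)  # leave time for the final segments to arrive
```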
+ """ + + # Current turn + _turn_id = self._turn_handler.handler_id + + # Emit the finalize or use EndOfTurn on demand preview + async def emit() -> None: + """Wait for EndOfUtterance if needed, then emit segments.""" + + # Forced end of utterance message (only when no speaker is detected) + if ( + self._config.use_forced_eou_message + and self._current_view + and (self._eou_mode == EndOfUtteranceMode.EXTERNAL or not self._is_speaking) + ) and not (self._current_view.fragments[-1].is_eos and self._current_view.fragments[-1].is_final): + await self._await_forced_eou() + + # Check if the turn has changed + if self._turn_handler.handler_id != _turn_id: + return + + # Emit the segments + self._stt_message_queue.put_nowait(lambda: self._emit_segments(finalize=True)) + + # Call async task + asyncio.create_task(emit()) + + # ============================================================================ + # EVENT REGISTRATION & HANDLERS + # ============================================================================ + + def _register_event_handlers(self) -> None: + """Register event handlers. + + Specific event handlers that we need to deal with. All other events + from the STT API will be available to clients to use themselves. + """ + + # Recognition started event + @self.once(ServerMessageType.RECOGNITION_STARTED) # type: ignore[misc] + def _evt_on_recognition_started(message: dict[str, Any]) -> None: + self._is_ready_for_audio = True + self._client_session = ClientSessionInfo( + config=self._config, + session_id=message.get("id", "UNKNOWN"), + base_time=datetime.datetime.now(datetime.timezone.utc), + language_pack_info=LanguagePackInfo.model_validate(message.get("language_pack_info", {})), + ) + + # Partial transcript event + @self.on(ServerMessageType.ADD_PARTIAL_TRANSCRIPT) # type: ignore[misc] + def _evt_on_partial_transcript(message: dict[str, Any]) -> None: + if self._closing_session: + return + self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=False)) + + # Final transcript event + @self.on(ServerMessageType.ADD_TRANSCRIPT) # type: ignore[misc] + def _evt_on_final_transcript(message: dict[str, Any]) -> None: + if self._closing_session: + return + self._stt_message_queue.put_nowait(lambda: self._handle_transcript(message, is_final=True)) + + # End of Utterance (FIXED mode only) + if self._uses_fixed_eou: + + @self.on(ServerMessageType.END_OF_UTTERANCE) # type: ignore[misc] + def _evt_on_end_of_utterance(message: dict[str, Any]) -> None: + if self._closing_session: + return + + async def _trigger_end_of_turn() -> None: + self.finalize() + + self._stt_message_queue.put_nowait(_trigger_end_of_turn) + + def _emit_message(self, message: BaseMessage) -> None: + """Emit a message to the client. + + This takes a BaseMessage class and emits it as a dictionary to the + client. + + Args: + message: The BaseMessage class message to emit. 
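Only the handlers registered above are consumed internally; every other RT API message remains available for clients to subscribe to directly. A small sketch follows; it assumes `ServerMessageType` is importable from `speechmatics.rt`, that `client` is a `VoiceAgentClient` constructed elsewhere, and that raw payload shapes follow the RT API.

```python
from speechmatics.rt import ServerMessageType
from speechmatics.voice import AgentServerMessageType, VoiceAgentClient


def register_handlers(client: VoiceAgentClient) -> None:
    @client.on(ServerMessageType.ADD_TRANSCRIPT)
    def on_raw_final(message: dict) -> None:
        # Raw AddTranscript payloads from the RT engine pass straight through.
        print("raw final:", message.get("metadata", {}).get("transcript", ""))

    @client.on(AgentServerMessageType.ADD_SEGMENT)
    def on_segment(message: dict) -> None:
        # SDK-level messages arrive as plain dicts (BaseMessage.model_dump()).
        for segment in message["segments"]:
            print("segment:", segment["text"])
```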
+ """ + + # Forward to the emit() method + self.emit(message.message, message.model_dump()) + + def _emit_info_message(self, message: Union[str, dict[str, Any]]) -> None: + """Emit an info message to the client.""" + if isinstance(message, str): + message = {"msg": message} + self.emit(AgentServerMessageType.INFO, {"message": AgentServerMessageType.INFO.value, **message}) + + # ============================================================================ + # QUEUE PROCESSING + # ============================================================================ + + def _start_stt_queue(self) -> None: + """Start the STT message queue.""" + self._stt_queue_task = asyncio.create_task(self._run_stt_queue()) + + async def _run_stt_queue(self) -> None: + """Run the STT message queue.""" + while True: + try: + callback = await self._stt_message_queue.get() + + if asyncio.iscoroutine(callback): + await callback + elif asyncio.iscoroutinefunction(callback): + await callback() + elif callable(callback): + result = callback() + if asyncio.iscoroutine(result): + await result + + except asyncio.CancelledError: + self._logger.debug("STT queue task cancelled") + return + except RuntimeError: + self._logger.debug("STT queue event loop closed") + return + except Exception: + self._logger.warning("Exception in STT message queue", exc_info=True) + + def _stop_stt_queue(self) -> None: + """Stop the STT message queue.""" + if self._stt_queue_task: + self._stt_queue_task.cancel() + + # ============================================================================ + # METRICS + # ============================================================================ + + def _start_metrics_task(self) -> None: + """Start the metrics task.""" + + # Task to send metrics + async def emit_metrics() -> None: + # Tracker + last_emission_time = self._total_time + + # Emit metrics + while True: + # Calculate when the next emission should occur + next_emission_time = ( + last_emission_time // self._metrics_emitter_interval + 1 + ) * self._metrics_emitter_interval + + # Check if there are any listeners for AgentServerMessageType.METRICS + if not self.listeners(AgentServerMessageType.SESSION_METRICS): + await asyncio.sleep(self._metrics_emitter_interval) + last_emission_time = self._total_time + continue + + # Wait until we've actually reached that time + while self._total_time < next_emission_time: + time_to_wait = next_emission_time - self._total_time + await asyncio.sleep(min(0.25, time_to_wait)) + + # Update tracker + last_emission_time = self._total_time + + # Emit metrics + self._emit_metrics() + + # Trigger the task + self._metrics_emitter_task = asyncio.create_task(emit_metrics()) + + def _emit_metrics(self) -> None: + """Emit metrics.""" + self._emit_message( + SessionMetricsMessage( + total_time=round(self._total_time, 1), + total_time_str=time.strftime("%H:%M:%S", time.gmtime(self._total_time)), + total_bytes=self._total_bytes, + processing_time=round(self._last_ttfb, 3), + ) + ) + + def _stop_metrics_task(self) -> None: + """Stop the metrics task.""" + if self._metrics_emitter_task: + self._metrics_emitter_task.cancel() + self._metrics_emitter_task = None + + def _calculate_ttfb(self, end_time: float) -> None: + """Calculate the time to first text. + + The TTFB is calculated by taking the end time of the payload from the STT + engine and then calculating the difference between the total time of bytes + sent to the engine from the client. + + Args: + end_time: The end time of the payload from the STT engine. 
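The session clock that feeds these metrics is derived purely from the bytes handed to `send_audio()`. A self-contained worked example of that arithmetic for 16 kHz PCM_S16LE audio (no SDK required):

```python
# Byte-based session clock: seconds = bytes / sample_rate / sample_width.
sample_rate = 16_000
sample_width = 2            # bytes per sample for PCM_S16LE

chunk = bytes(320)          # one 320-byte chunk, as passed to send_audio()
total_time = 0.0
for _ in range(200):        # pretend we streamed 200 chunks
    total_time += len(chunk) / sample_rate / sample_width

print(round(total_time, 3))  # 2.0 seconds of audio sent so far

# TTFB: how far the engine's results lag behind the audio already sent.
engine_end_time = 1.85       # end_time reported in the latest partial
ttfb = total_time - engine_end_time
print(round(ttfb, 3))        # 0.15 seconds
```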
+ """ + + # Calculate the time difference (convert to ms) + ttfb = self._total_time - end_time + + # Skip if zero or less + if ttfb <= 0: + return + + # Save TTFB and end time + self._last_ttfb = ttfb + + def _calculate_speaker_metrics( + self, partial_segments: list[SpeakerSegment], final_segments: list[SpeakerSegment] + ) -> None: + """Calculate the speaker metrics. + + Used to track the number of words per speaker. Only valid speakers are + considered. Ignored speakers will be excluded. Total is past finals + + new partials. The number _may_ go down if partials are removed or + re-attribute to a different speaker. + + Args: + partial_segments: The partial segments to calculate the speaker metrics for. + final_segments: The final segments to calculate the speaker metrics for. + """ + + # Skip if not enabled + if not self.listeners(AgentServerMessageType.SPEAKER_METRICS): + return + + changes_detected = False + + # Process finalized words + for seg in final_segments: + for frag in seg.fragments: + if frag.type_ == "word" and frag.speaker is not None: + # Initialize speaker if not exists + if frag.speaker not in self._session_speakers: + self._session_speakers[frag.speaker] = SessionSpeaker(speaker_id=frag.speaker) + + speaker = self._session_speakers[frag.speaker] + + # Update final word count + speaker.final_word_count += 1 + speaker.last_heard = frag.end_time + + # Update volume + if frag.volume is not None: + speaker.update_volume(frag.volume) + + changes_detected = True + + # Reset word count to final count for all speakers before reprocessing partials + for speaker in self._session_speakers.values(): + speaker.word_count = speaker.final_word_count + + # Process partial words (adds to the base final count) + for seg in partial_segments: + for frag in seg.fragments: + if frag.type_ == "word" and frag.speaker is not None: + # Initialize speaker if not exists + if frag.speaker not in self._session_speakers: + self._session_speakers[frag.speaker] = SessionSpeaker(speaker_id=frag.speaker) + # Set baseline for new speaker from partials + self._session_speakers[frag.speaker].word_count = 0 + + speaker = self._session_speakers[frag.speaker] + + # Increment total word count + speaker.word_count += 1 + speaker.last_heard = frag.end_time + + # Update volume + if frag.volume is not None: + speaker.update_volume(frag.volume) + + changes_detected = True + + # Emit metrics if any changes occurred + if changes_detected: + self._emit_speaker_metrics() + + def _emit_speaker_metrics(self) -> None: + """Emit speaker metrics.""" + self._emit_message( + SpeakerMetricsMessage( + speakers=list(self._session_speakers.values()), + ), + ) + + # ============================================================================ + # TRANSCRIPT PROCESSING + # ============================================================================ + + async def _handle_transcript(self, message: dict[str, Any], is_final: bool) -> None: + """Handle the partial and final transcript events (async). + + As `AddTranscript` messages are _always_ followed by `AddPartialTranscript` messages, + we can skip processing. Also skip if there are no fragments in the buffer. + + Args: + message: The new Partial or Final from the STT engine. + is_final: Whether the data is final or partial. 
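Speaker metrics are only computed when at least one listener is registered for `SPEAKER_METRICS`; the method returns early otherwise. A sketch of such a listener follows; it assumes `client` is a connected `VoiceAgentClient` with diarization enabled and that the payload keys mirror `SessionSpeaker` / `SpeakerMetricsMessage` as defined in this module.

```python
from speechmatics.voice import AgentServerMessageType


def watch_speakers(client) -> None:
    @client.on(AgentServerMessageType.SPEAKER_METRICS)
    def on_speaker_metrics(message: dict) -> None:
        # One entry per speaker seen so far: finalized words plus live partials.
        for speaker in message["speakers"]:
            print(speaker["speaker_id"], speaker["word_count"], "words")
```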
+ """ + + # Add the speech fragments + fragments_available = await self._add_speech_fragments( + message=message, + is_final=is_final, + ) + + # Skip if no fragments + if not fragments_available: + return + + # Process (only done with AddPartialTranscript, as they always immediately follow AddTranscript + if not is_final: + await self._process_speech_fragments(self._change_filter) + + async def _add_speech_fragments(self, message: dict[str, Any], is_final: bool = False) -> bool: + """Takes a new Partial or Final from the STT engine. + + Accumulates it into the _speech_data list. As new final data is added, all + partials are removed from the list. + + Note: If a known speaker is `__[A-Z0-9_]{2,}__`, then the words are skipped, + as this is used to protect against self-interruption by the assistant or to + block out specific known voices. + + Args: + message: The new Partial or Final from the STT engine. + is_final: Whether the data is final or partial. + + Returns: + True if the speech fragments were updated, False otherwise. + """ + + async with self._speech_fragments_lock: + # Parsed new speech data from the STT engine + fragments: list[SpeechFragment] = [] + + # Metadata + metadata = message.get("metadata", {}) + payload_end_time = metadata.get("end_time", 0) + + # Iterate over the results in the payload + for result in message.get("results", []): + alt = result.get("alternatives", [{}])[0] + if alt.get("content", None): + # Create the new fragment + fragment = SpeechFragment( + idx=self._next_fragment_id(), + start_time=result.get("start_time", 0), + end_time=result.get("end_time", 0), + language=alt.get("language", "en"), + direction=alt.get("direction", "ltr"), + type_=result.get("type", "word"), + is_eos=result.get("is_eos", False), + is_disfluency="disfluency" in alt.get("tags", []), + is_punctuation=result.get("type", "") == "punctuation", + is_final=is_final, + attaches_to=result.get("attaches_to", ""), + content=alt.get("content", ""), + speaker=alt.get("speaker", None), + confidence=alt.get("confidence", 1.0), + volume=result.get("volume", None), + result={"final": is_final, **result}, + ) + + # Check fragment is after trim time + if fragment.start_time < self._trim_before_time: + continue + + # Speaker filtering + if fragment.speaker: + # Drop `__XX__` speakers + if re.match(r"^__[A-Z0-9_]{2,}__$", fragment.speaker): + continue + + # Drop speakers not focussed on + if ( + self._dz_config.focus_mode == SpeakerFocusMode.IGNORE + and self._dz_config.focus_speakers + and fragment.speaker not in self._dz_config.focus_speakers + ): + continue + + # Drop ignored speakers + if self._dz_config.ignore_speakers and fragment.speaker in self._dz_config.ignore_speakers: + continue + + # Add the fragment + fragments.append(fragment) + + # Track the last fragment end time + self._last_fragment_end_time = max(self._last_fragment_end_time, fragment.end_time) + + # Evaluate for VAD (only done on partials) + if not is_final: + await self._vad_evaluation(fragments) + + # Fragments to retain + retained_fragments = [ + frag for frag in self._speech_fragments if frag.is_final and frag.start_time >= self._trim_before_time + ] + + # Re-structure the speech fragments + self._speech_fragments = retained_fragments.copy() + self._speech_fragments.extend(fragments) + self._speech_fragments.sort(key=lambda x: x.idx) + + # Remove fragment at head that is for previous + if ( + self._speech_fragments + and self._speech_fragments[0].is_punctuation + and self._speech_fragments[0].attaches_to == "previous" + ): + 
self._speech_fragments.pop(0) + + # Update TTFB (only if there are listeners) + if not is_final: + self._calculate_ttfb(end_time=payload_end_time) + + # Fragments available + return len(self._speech_fragments) > 0 + + # ============================================================================ + # SEGMENT PROCESSING & EMISSION + # ============================================================================ + + def _update_current_view(self) -> None: + """Load the current view of the speech fragments.""" + self._current_view = SpeakerSegmentView( + session=self._client_session, + fragments=self._speech_fragments.copy(), + focus_speakers=self._dz_config.focus_speakers, + ) + + async def _process_speech_fragments(self, change_filter: Optional[list[AnnotationFlags]] = None) -> None: + """Process the speech fragments. + + Compares the current speech fragments against the last set of speech fragments. + When segments are emitted, they are then removed from the buffer of fragments + so the next comparison is based on the remaining + new fragments. + + Args: + change_filter: Optional list of annotation flags to filter changes. + """ + + # Lock the speech fragments + async with self._speech_fragments_lock: + """Creates a new view of the fragments and compares against the last view.""" + + # Create a view of the current segments + self._update_current_view() + + # Check view exists + if not self._current_view: + return + + # Check we have at least one segment + if self._current_view.segment_count == 0 or self._current_view.last_active_segment_index == -1: + return + + # Create a view of segments to emit + last_segment = self._current_view.segments[self._current_view.last_active_segment_index] + + # Trim the view + self._current_view.trim(start_time=self._current_view.start_time, end_time=last_segment.end_time) + + # Compare previous view to this view + if self._previous_view: + changes = FragmentUtils.compare_views(self._client_session, self._previous_view, self._current_view) + else: + changes = AnnotationResult.from_flags(AnnotationFlags.NEW) + + # Update the previous view + self._previous_view = self._current_view + + # Catch no changes + if change_filter and not changes.any(*change_filter): + return + + # Turn prediction + if self._uses_forced_eou: + ttl = await self._calculate_finalize_delay() + if ttl: + self._turn_handler.update_timer(ttl) + + # Check for gaps + # FragmentUtils.find_segment_pauses(self._client_session, self._current_view) + + # Emit the segments + await self._emit_segments() + + async def _emit_segments(self, finalize: bool = False) -> None: + """Emit segments to listeners. + + This function will emit segments in the view without any further checks + on the contents of the segments. Any segments that end with a final / EOS + will be emitted as finals and removed from the fragment buffer. + + Args: + finalize: Whether to finalize all segments. 
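What `_emit_segments` delivers to listeners is a dict-serialized `SegmentMessage`. The sketch below shows one way to consume the partial and final streams; the imported names are assumed to be exported from `speechmatics.voice`, and the payload keys follow `SegmentMessage` / `SegmentMessageSegment` as defined in this module.

```python
import asyncio

from speechmatics.voice import (
    AgentServerMessageType,
    VoiceAgentClient,
    VoiceAgentConfig,
)


async def main() -> None:
    client = VoiceAgentClient(config=VoiceAgentConfig(language="en"))

    @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT)
    def on_partial(message: dict) -> None:
        for seg in message["segments"]:
            print("partial:", seg["speaker_id"], seg["text"])

    @client.on(AgentServerMessageType.ADD_SEGMENT)
    def on_final(message: dict) -> None:
        window = message["metadata"]
        print(f"final [{window['start_time']:.2f}s - {window['end_time']:.2f}s]:")
        for seg in message["segments"]:
            print("  ", seg["speaker_id"], seg["text"])

    await client.connect()
    # ... await client.send_audio(chunk) for each captured audio chunk ...
    await client.disconnect()


if __name__ == "__main__":
    asyncio.run(main())
```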
+ """ + + # Only process if we have segments in the buffer + if self._current_view and self._current_view.segment_count == 0: + if finalize: + await self._emit_end_of_turn() + return + + # Lock the speech fragments + async with self._speech_fragments_lock: + # Segments to emit + final_segments: list[SpeakerSegment] = [] + partial_segments: list[SpeakerSegment] = [] + + # Keep until end of turn (`ON_END_OF_TURN`) + if not finalize and not self._config.speech_segment_config.emit_sentences: + partial_segments = self._current_view.segments if self._current_view else [] + + # Force finalize + elif finalize: + final_segments = self._current_view.segments if self._current_view else [] + + # Split between finals and interim segments (`ON_FINALIZED_SENTENCE`) + else: + final_segments = [ + s + for s in (self._current_view.segments if self._current_view else []) + if s.annotation.has(AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS) + ] + partial_segments = [ + s for s in (self._current_view.segments if self._current_view else []) if s not in final_segments + ] + + # Remove partial segments that have no final fragments + if not self._config.include_partials: + partial_segments = [s for s in partial_segments if s.annotation.has(AnnotationFlags.HAS_FINAL)] + + # Emit finals first + if final_segments: + """Final segments are checked for end of sentence.""" + + # Metadata for final segments uses actual start/end times of the segments being emitted + final_metadata = MessageTimeMetadata( + start_time=final_segments[0].start_time, + end_time=final_segments[-1].end_time, + processing_time=round(self._last_ttfb, 3), + ) + + # Ensure final segment ends with EOS + if self._config.speech_segment_config.add_trailing_eos: + last_segment = final_segments[-1] + last_fragment = last_segment.fragments[-1] + if not last_fragment.is_eos: + # Add new fragment + last_segment.fragments.append( + SpeechFragment( + idx=self._next_fragment_id(), + start_time=last_fragment.end_time, + end_time=last_fragment.end_time, + content=".", + attaches_to="previous", + is_eos=True, + ) + ) + # Update text + FragmentUtils.update_segment_text( + session=self._client_session, + segment=last_segment, + ) + + # Emit segments + self._emit_message( + SegmentMessage( + message=AgentServerMessageType.ADD_SEGMENT, + segments=[ + SegmentMessageSegment( + speaker_id=s.speaker_id, + is_active=s.is_active, + timestamp=s.timestamp, + language=s.language, + text=s.text, + annotation=s.annotation, + fragments=( + [SegmentMessageSegmentFragment(**f.__dict__) for f in s.fragments] + if self._config.include_results + else None + ), + metadata=MessageTimeMetadata(start_time=s.start_time, end_time=s.end_time), + ) + for s in final_segments + ], + metadata=final_metadata, + ), + ) + self._trim_before_time = final_segments[-1].end_time + self._speech_fragments = [f for f in self._speech_fragments if f.start_time >= self._trim_before_time] + + # Emit interim segments (suppress when forced EOU is active) + if partial_segments and not self._forced_eou_active: + """Partial segments are emitted as is.""" + + # Metadata for partial segments uses actual start/end times of the segments being emitted + partial_metadata = MessageTimeMetadata( + start_time=partial_segments[0].start_time, + end_time=partial_segments[-1].end_time, + processing_time=round(self._last_ttfb, 3), + ) + + # Emit segments + self._emit_message( + SegmentMessage( + message=AgentServerMessageType.ADD_PARTIAL_SEGMENT, + segments=[ + SegmentMessageSegment( + speaker_id=s.speaker_id, + 
is_active=s.is_active, + timestamp=s.timestamp, + language=s.language, + text=s.text, + annotation=s.annotation, + fragments=( + [SegmentMessageSegmentFragment(**f.__dict__) for f in s.fragments] + if self._config.include_results + else None + ), + metadata=MessageTimeMetadata(start_time=s.start_time, end_time=s.end_time), + ) + for s in partial_segments + ], + metadata=partial_metadata, + ), + ) + + # Update the current view + self._update_current_view() + + # Reset the turn start time + if not self._turn_start_time and self._current_view: + self._turn_start_time = self._current_view.start_time + + # Send updated speaker metrics + if self._dz_enabled: + self._calculate_speaker_metrics(partial_segments, final_segments) + + # Emit end of turn + if finalize: + await self._emit_end_of_turn() + + async def _emit_start_of_turn(self, event_time: float) -> None: + """Emit the start of turn message.""" + + # Flag as turn active + self._turn_active = True + + # Emit + self._emit_message( + TurnStartEndResetMessage( + message=AgentServerMessageType.START_OF_TURN, + turn_id=self._turn_handler.handler_id, + metadata=MessageTimeMetadata( + start_time=event_time, + ), + ), + ) + + async def _emit_end_of_turn(self) -> None: + """Emit the end of turn message.""" + + # Check if we have a previous view + if not self._previous_view or not self._turn_active: + return + + # Flag as turn active + self._turn_active = False + + # Metadata (for LAST view) + metadata = MessageTimeMetadata(start_time=self._turn_start_time, end_time=self._previous_view.end_time) + + # Emit + self._emit_message( + TurnStartEndResetMessage( + message=AgentServerMessageType.END_OF_TURN, + turn_id=self._turn_handler.handler_id, + metadata=metadata, + ), + ) + + # Stop the EOT handler + self._turn_handler.complete_handler() + + # Reset the previous view + self._previous_view = None + self._turn_start_time = None + + # ============================================================================ + # TURN DETECTION & FINALIZATION + # ============================================================================ + + async def _calculate_finalize_delay( + self, + smart_turn_prediction: Optional[SmartTurnPredictionResult] = None, + ) -> Optional[float]: + """Calculate the delay before finalizing / end of turn. + + Process the most recent segment and view to determine how long to delay before finalizing + the segments to the client. Checks for disfluencies, speech speed, end of sentence markers, + and smart turn predictions to calculate appropriate delay. + + Args: + smart_turn_prediction: The smart turn prediction result to use for evaluation. + + Returns: + Optional[float]: The delay before finalizing / end of turn. 
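Before the implementation that follows, here is a self-contained worked example of the compound-penalty arithmetic, using the default `EndOfTurnConfig` penalties and the documented config defaults (`end_of_utterance_silence_trigger=0.2`, `end_of_utterance_max_delay=10.0`); the TTFB value is illustrative.

```python
silence_trigger = 0.2    # end_of_utterance_silence_trigger
max_delay = 10.0         # end_of_utterance_max_delay
min_delay = 0.015        # min_end_of_turn_delay
last_ttfb = 0.02         # illustrative engine lag in seconds


def finalize_delay(penalties: list[float]) -> float:
    multiplier = 1.0 * 1.0          # base_multiplier * end_of_turn_adjustment_factor
    for penalty in penalties:
        multiplier *= penalty       # penalties compound by multiplication
    delay = round(silence_trigger * multiplier, 3)
    return max(min(delay, max_delay) - last_ttfb, min_delay)


# Trailing disfluency with no end of sentence yet:
# ENDS_WITH_DISFLUENCY (2.5) * HAS_DISFLUENCY (1.2) * not ENDS_WITH_EOS (2.0)
print(round(finalize_delay([2.5, 1.2, 2.0]), 3))  # 1.18 -> wait longer

# Finalized sentence ending with a full stop: ENDS_WITH_FINAL + ENDS_WITH_EOS (0.25)
print(round(finalize_delay([0.25]), 3))           # 0.03 -> finalize quickly
```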
+ """ + + # Get the current view or previous view with active segments + view = ( + self._current_view + if self._current_view and self._current_view.last_active_segment_index > -1 + else self._previous_view + ) + + # Skip if view doesn't exist + if not view: + return None + + # Get last active segment + last_active_segment_index = view.last_active_segment_index + last_active_segment = view.segments[last_active_segment_index] if last_active_segment_index > -1 else None + + # Track penalty multipliers and reasons + reasons: list[tuple[float, str]] = [] + + # Apply penalties based on last active segment annotations + if last_active_segment: + for p in self._config.end_of_turn_config.penalties: + description = "__".join(p.annotation) + has_annotation = last_active_segment.annotation.has(*p.annotation) + + if (not p.is_not and has_annotation) or (p.is_not and not has_annotation): + reason = f"not__{description}" if p.is_not else description + reasons.append((p.penalty, reason)) + + # Apply smart turn prediction penalty + if smart_turn_prediction: + if smart_turn_prediction.prediction: + reasons.append((self._config.smart_turn_config.positive_penalty, "smart_turn_true")) + else: + reasons.append((self._config.smart_turn_config.negative_penalty, "smart_turn_false")) + + # Calculate final multiplier (compound multiplication) + multiplier = ( + self._config.end_of_turn_config.base_multiplier + * self._config.end_of_turn_config.end_of_turn_adjustment_factor + ) + for penalty, _ in reasons: + multiplier *= penalty + + # Calculate delay with minimum of 25ms + delay = round(self._config.end_of_utterance_silence_trigger * multiplier, 3) + + # Clamp to max delay and adjust for TTFB + clamped_delay = min(delay, self._config.end_of_utterance_max_delay) + finalize_delay = max(clamped_delay - self._last_ttfb, self._config.end_of_turn_config.min_end_of_turn_delay) + + # Emit prediction message + self._emit_message( + TurnPredictionMessage( + turn_id=self._turn_handler.handler_id, + metadata=TurnPredictionMetadata( + ttl=round(finalize_delay, 2), + reasons=[reason for _, reason in reasons], + ), + ), + ) + + return finalize_delay + + async def _eot_prediction(self, end_time: Optional[float] = None) -> float: + """Handle end of turn prediction.""" + + # Wait for Smart Turn result + if self._eou_mode == EndOfUtteranceMode.SMART_TURN and end_time is not None: + result = await self._smart_turn_prediction(end_time, self._config.language) + else: + result = None + + # Create a new task to evaluate the finalize delay + delay = await self._calculate_finalize_delay(smart_turn_prediction=result) + + # Return the result + return delay or 0.005 + + async def _smart_turn_prediction(self, end_time: float, language: str) -> SmartTurnPredictionResult: + """Predict when to emit the end of turn. + + This will give an acoustic prediction of when the turn has completed using + the ONNX model to look for vocal intonation and hints. + + Args: + end_time: The end time of the last active segment. + language: The language of the audio. + + Returns: + bool: Whether the turn has completed. 
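A minimal sketch of opting in to acoustic end-of-turn detection; it assumes these names are exported from `speechmatics.voice`. If the optional Smart Turn dependencies or model are missing, the constructor above logs a warning and falls back to `ADAPTIVE` mode.

```python
from speechmatics.voice import (
    EndOfUtteranceMode,
    SmartTurnConfig,
    VoiceAgentConfig,
)

config = VoiceAgentConfig(
    language="en",
    end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
    smart_turn_config=SmartTurnConfig(
        smart_turn_threshold=0.6,   # stricter than the 0.5 default
        slice_margin=0.05,          # 50 ms of audio margin after the last word
    ),
)
```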
+ """ + + # Check we have smart turn enabled + if not self._smart_turn_detector: + return SmartTurnPredictionResult(error="Smart turn is not enabled") + + # Get audio slice (add small margin of 100ms to the end of the audio) + segment_audio = await self._audio_buffer.get_frames( + start_time=end_time - self._config.smart_turn_config.audio_buffer_length, + end_time=end_time + self._config.smart_turn_config.slice_margin, + ) + + # TODO - Output audio (for client to use) + + # Evaluate + prediction = await self._smart_turn_detector.predict( + segment_audio, + language=language, + sample_rate=self._audio_sample_rate, + sample_width=self._audio_sample_width, + ) + + # Return the prediction + return prediction + + async def _await_forced_eou(self, timeout: float = 2.0) -> None: + """Await the forced end of utterance.""" + + # Received EOU + eou_received: asyncio.Event = asyncio.Event() + + # Add listener + self.once(AgentServerMessageType.END_OF_UTTERANCE, lambda message: eou_received.set()) + + # Trigger EOU message + self._emit_info_message("ForceEndOfUtterance sent") + await self.force_end_of_utterance() + + # Wait for EOU + try: + self._forced_eou_active = True + await asyncio.wait_for(eou_received.wait(), timeout=timeout) + except asyncio.TimeoutError: + pass + finally: + self._forced_eou_active = False + + # ============================================================================ + # VAD (VOICE ACTIVITY DETECTION) / SPEAKER DETECTION + # ============================================================================ + + async def _vad_evaluation(self, fragments: list[SpeechFragment]) -> None: + """Emit a VAD event. + + This will emit `SPEAKER_STARTED` and `SPEAKER_ENDED` events to the client and is + based on valid transcription for active speakers. Ignored or speakers not in + focus will not be considered an active participant. + + This should only run on partial / non-final words. + + Args: + fragments: The list of fragments to use for evaluation. 
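A sketch of reacting to the VAD events this method emits; it assumes `client` is a `VoiceAgentClient` created elsewhere and that payload keys follow `VADStatusMessage` (`speaker_id`, `is_active`, `time`).

```python
from speechmatics.voice import AgentServerMessageType


def track_speech(client) -> None:
    @client.on(AgentServerMessageType.SPEAKER_STARTED)
    def on_started(message: dict) -> None:
        print(f"speaker {message['speaker_id']} started at {message['time']}s")

    @client.on(AgentServerMessageType.SPEAKER_ENDED)
    def on_ended(message: dict) -> None:
        # A good point to pause TTS playback or begin end-of-turn handling.
        print(f"speaker {message['speaker_id']} stopped at {message['time']}s")
```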
+ """ + + # Find the valid list of partial words + if self._dz_enabled and self._dz_config.focus_speakers: + new_partials = [ + frag + for frag in fragments + if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final + ] + pre_partials = [ + frag + for frag in self._speech_fragments + if frag.speaker in self._dz_config.focus_speakers and frag.type_ == "word" and not frag.is_final + ] + else: + new_partials = [frag for frag in fragments if frag.type_ == "word" and not frag.is_final] + pre_partials = [frag for frag in self._speech_fragments if frag.type_ == "word" and not frag.is_final] + + # Check if last new partial matches the last pre partial + if len(pre_partials) > 0 and len(new_partials) > 0: + has_valid_partial = not all( + [ + pre_partials[-1].speaker == new_partials[-1].speaker, + pre_partials[-1].start_time == new_partials[-1].start_time, + pre_partials[-1].end_time == new_partials[-1].end_time, + pre_partials[-1].content == new_partials[-1].content, + ] + ) + + # Evaluate if any valid partial words exist + else: + has_valid_partial = len(new_partials) > 0 + + # Current states + current_is_speaking = self._is_speaking + current_speaker = self._current_speaker + + # Establish the speaker from latest partials + latest_speaker = new_partials[-1].speaker if has_valid_partial else current_speaker + + # Determine if the speaker has changed (and we have a speaker) + speaker_changed = latest_speaker != current_speaker and current_speaker is not None + + # Start / end times (earliest and latest) + speaker_start_time = new_partials[0].start_time if has_valid_partial else None + speaker_end_time = self._last_fragment_end_time + + # If diarization is enabled, indicate speaker switching + if self._dz_enabled and latest_speaker is not None: + """When enabled, we send a speech events if the speaker has changed. + + This + will emit a SPEAKER_ENDED for the previous speaker and a SPEAKER_STARTED + for the new speaker. + + For any client that wishes to show _which_ speaker is speaking, this will + emit events to indicate when speakers switch. 
+ """ + + # Check if speaker is different to the current speaker + if current_is_speaking and speaker_changed: + self._emit_message( + VADStatusMessage( + message=AgentServerMessageType.SPEAKER_ENDED, + speaker_id=current_speaker, + is_active=False, + time=speaker_end_time, + ), + ) + self._emit_message( + VADStatusMessage( + message=AgentServerMessageType.SPEAKER_STARTED, + speaker_id=latest_speaker, + is_active=True, + time=speaker_end_time, + ), + ) + + # Update current speaker + self._current_speaker = latest_speaker + + # No further processing if we have no new fragments and we are not speaking + if has_valid_partial == current_is_speaking: + return + + # Update speaking state + self._is_speaking = not current_is_speaking + + # Event time + event_time = speaker_start_time if self._is_speaking else speaker_end_time + + # Skip if no event time + if event_time is None: + return + + # Speaker events + if self._is_speaking: + await self._handle_speaker_started(latest_speaker, event_time) + else: + await self._handle_speaker_stopped(latest_speaker, speaker_end_time) + + async def _handle_speaker_started(self, speaker: Optional[str], event_time: float) -> None: + """Reset timers when a new speaker starts speaking after silence.""" + + # Emit start of turn (not when using EXTERNAL) + if self._is_speaking and not self._turn_active: + await self._emit_start_of_turn(event_time) + + # Update the turn handler + if self._uses_forced_eou: + self._turn_handler.reset() + + # Emit the event + self._emit_message( + VADStatusMessage( + message=AgentServerMessageType.SPEAKER_STARTED, + speaker_id=speaker, + is_active=True, + time=event_time, + ), + ) + + # Reset the handlers + self._turn_handler.reset() + + async def _handle_speaker_stopped(self, speaker: Optional[str], event_time: float) -> None: + """Reset the current speaker and do smart turn detection (if enabled).""" + + # Turn prediction + if self._uses_forced_eou: + ttl = await self._eot_prediction(event_time) + self._turn_handler.update_timer(ttl) + + # Emit the event + self._emit_message( + VADStatusMessage( + message=AgentServerMessageType.SPEAKER_ENDED, + speaker_id=speaker, + is_active=False, + time=event_time, + ), + ) + + # Reset current speaker + self._current_speaker = None + + # ============================================================================ + # HELPER METHODS + # ============================================================================ + + def _next_fragment_id(self) -> int: + """Return the next fragment ID.""" + self._fragment_idx += 10 + return self._fragment_idx + + def _get_endpoint_url(self, url: str, app: Optional[str] = None) -> str: + """Format the endpoint URL with the SDK and app versions. + + Args: + url: The base URL for the endpoint. + app: The application name to use in the endpoint URL. + + Returns: + str: The formatted endpoint URL. 
+ """ + + query_params = {} + query_params["sm-app"] = app or f"voice-sdk/{__version__}" + query_params["sm-voice-sdk"] = f"{__version__}" + query = urlencode(query_params) + + return f"{url}?{query}" diff --git a/sdk/voice/speechmatics/voice/_logging.py b/sdk/voice/speechmatics/voice/_logging.py new file mode 100644 index 0000000..5df400e --- /dev/null +++ b/sdk/voice/speechmatics/voice/_logging.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import logging + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +def get_logger(name: str) -> logging.Logger: + """Get a logger that stays silent by default. + + The logger uses Python's standard logging module and includes NullHandler + by default to avoid unwanted output. Users can configure logging levels + and handlers as needed. + + Args: + name: Logger name, typically __name__ from the calling module. + + Returns: + Configured logger instance. + + Examples: + Basic usage in SDK modules: + logger = get_logger(__name__) + logger.debug("HTTP request sent %s %s", method, url) + logger.info("Job submitted (job_id=%s)", job_id) + logger.warning("Job failed (job_id=%s): %s", job_id, error) + logger.error("Connection failed: %s", e) + + Enable debug logging in user code: + import logging + logging.basicConfig(level=logging.DEBUG) + # Now all SDK debug messages will be visible + + Custom logging configuration: + import logging + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Or for specific components: + logging.getLogger('speechmatics.batch').setLevel(logging.DEBUG) + """ + module_logger = logging.getLogger(name) + module_logger.addHandler(logging.NullHandler()) + return module_logger + + +__all__ = ["get_logger"] diff --git a/sdk/voice/speechmatics/voice/_models.py b/sdk/voice/speechmatics/voice/_models.py new file mode 100644 index 0000000..2b00230 --- /dev/null +++ b/sdk/voice/speechmatics/voice/_models.py @@ -0,0 +1,1184 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import datetime +from enum import Enum +from typing import Any +from typing import Literal +from typing import Optional + +from pydantic import BaseModel +from pydantic import ConfigDict +from pydantic import Field + +from speechmatics.rt import AudioEncoding +from speechmatics.rt import OperatingPoint +from speechmatics.rt import SpeakerIdentifier + +# ============================================================================== +# ENUMS +# ============================================================================== + + +class EndOfUtteranceMode(str, Enum): + """End of turn delay options for transcription. + + - `EXTERNAL`: External end of turn detection. The engine will not perform any + end of turn detection and will use an external trigger via `finalize()`. + + - `FIXED`: Fixed end of turn delay. The STT engine will use silence detection + to determine the end of turn. For slow speakers, this may result in + sentences being split up into smaller segments. + + - `ADAPTIVE`: Adaptive end of turn delay. The STT engine will use silence detection + to determine the end of turn. The delay is adaptive and will be adjusted + based on the content of what the most recent speaker has said, such as + rate of speech and whether they have any pauses or disfluencies. + + - `SMART_TURN`: Smart turn end of turn delay. 
The STT engine will use a combination + of silence detection, adaptive delay and smart turn detection using machine learning + to determine the end of turn. + + Examples: + Using fixed mode (default): + >>> config = VoiceAgentConfig( + ... language="en", + ... end_of_utterance_mode=EndOfUtteranceMode.FIXED + ... ) + + Using adaptive mode for natural conversations: + >>> config = VoiceAgentConfig( + ... language="en", + ... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE + ... ) + + Using smart turn detection: + >>> config = VoiceAgentConfig( + ... language="en", + ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN, + ... ) + + External control (manual finalization): + >>> config = VoiceAgentConfig( + ... language="en", + ... end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL + ... ) + >>> # Later in code: + >>> client.finalize() + """ + + EXTERNAL = "external" + FIXED = "fixed" + ADAPTIVE = "adaptive" + SMART_TURN = "smart_turn" + + +class TranscriptionUpdatePreset(str, Enum): + """Filter options for when to emit changes to transcription. + + - `COMPLETE`: Emit complete transcription. + - `COMPLETE_PLUS_TIMING`: Emit complete transcription with timing changes. + - `WORDS`: Emit when word context has changed. + - `WORDS_PLUS_TIMING`: Emit when word context or timing has changed. + - `TIMING`: Emit when timing has changed. + """ + + COMPLETE = "complete" + COMPLETE_PLUS_TIMING = "complete_plus_timing" + WORDS = "words" + WORDS_PLUS_TIMING = "words_plus_timing" + TIMING = "timing" + + +class SpeakerFocusMode(str, Enum): + """Speaker focus mode for diarization. + + - `RETAIN`: Retain words spoken by other speakers (not listed in `ignore_speakers`) + and process them as passive speaker frames. + - `IGNORE`: Ignore words spoken by other speakers and they will not be processed. + + Examples: + Retain all speakers but mark focus: + >>> config = SpeakerFocusConfig( + ... focus_speakers=["S1"], + ... focus_mode=SpeakerFocusMode.RETAIN + ... ) + + Ignore non-focus speakers completely: + >>> config = SpeakerFocusConfig( + ... focus_speakers=["S1", "S2"], + ... focus_mode=SpeakerFocusMode.IGNORE + ... ) + """ + + RETAIN = "retain" + IGNORE = "ignore" + + +class AgentServerMessageType(str, Enum): + """Message types that can be received from the server / agent. + + These enum values represent the different types of messages that the + Speechmatics RT API / Voice Agent SDK can send to the client. + + Attributes: + RecognitionStarted: The recognition session has started. + EndOfTranscript: The recognition session has ended. + Info: Informational message. + Warning: Warning message. + Error: Error message. + AddPartialTranscript: Partial transcript has been added. + AddTranscript: Transcript has been added. + EndOfUtterance: End of utterance has been detected (from STT engine). + SpeakerStarted: Speech has started. + SpeakerEnded: Speech has ended. + AddPartialSegment: A partial / interim segment has been detected. + AddSegment: A final segment has been detected. + StartOfTurn: Start of turn has been detected. + EndOfTurnPrediction: End of turn prediction timing. + EndOfTurn: End of turn has been detected. + SpeakersResult: Speakers result has been detected. + Metrics: Metrics for the STT engine. + SpeakerMetrics: Metrics relating to speakers. + + Examples: + >>> # Register event handlers for different message types + >>> @client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT) + >>> def handle_interim(message): + ... segments: list[SpeakerSegment] = message['segments'] + ... 
print(f"Interim: {segments}") + >>> + >>> @client.on(AgentServerMessageType.ADD_SEGMENT) + >>> def handle_final(message): + ... segments: list[SpeakerSegment] = message['segments'] + ... print(f"Final: {segments}") + >>> + >>> @client.on(AgentServerMessageType.END_OF_TURN) + >>> def handle_end_of_turn(message): + ... print(f"End of turn") + >>> + >>> @client.on(AgentServerMessageType.ERROR) + >>> def handle_error(message): + ... print(f"Error: {message['reason']}") + """ + + # API messages + RECOGNITION_STARTED = "RecognitionStarted" + END_OF_TRANSCRIPT = "EndOfTranscript" + INFO = "Info" + WARNING = "Warning" + ERROR = "Error" + + # Raw transcription messages + ADD_PARTIAL_TRANSCRIPT = "AddPartialTranscript" + ADD_TRANSCRIPT = "AddTranscript" + END_OF_UTTERANCE = "EndOfUtterance" + + # VAD messages + SPEAKER_STARTED = "SpeakerStarted" + SPEAKER_ENDED = "SpeakerEnded" + + # Segment messages + ADD_PARTIAL_SEGMENT = "AddPartialSegment" + ADD_SEGMENT = "AddSegment" + + # Turn messages + START_OF_TURN = "StartOfTurn" + END_OF_TURN_PREDICTION = "EndOfTurnPrediction" + END_OF_TURN = "EndOfTurn" + SMART_TURN_AUDIO = "SmartTurnAudio" + + # Speaker messages + SPEAKERS_RESULT = "SpeakersResult" + + # Metrics + SESSION_METRICS = "SessionMetrics" + SPEAKER_METRICS = "SpeakerMetrics" + + +class AnnotationFlags(str, Enum): + """Flags to apply when processing speech / objects.""" + + # High-level segment updates + NEW = "new" + UPDATED_FULL = "updated_full" + UPDATED_FULL_LCASE = "updated_full_lcase" + UPDATED_STRIPPED = "updated_stripped" + UPDATED_STRIPPED_LCASE = "updated_stripped_lcase" + UPDATED_FINALS = "updated_finals" + UPDATED_PARTIALS = "updated_partials" + UPDATED_WORD_TIMINGS = "updated_word_timings" + FINALIZED = "finalized" + + # Annotations changed + UPDATED_ANNOTATIONS = "updated_annotations" + + # Content of segments + ONLY_ACTIVE_SPEAKERS = "only_active_speakers" + CONTAINS_INACTIVE_SPEAKERS = "contains_inactive_speakers" + + # More granular details on the word content + HAS_PARTIAL = "has_partial" + HAS_FINAL = "has_final" + STARTS_WITH_FINAL = "starts_with_final" + ENDS_WITH_FINAL = "ends_with_final" + HAS_EOS = "has_eos" + ENDS_WITH_EOS = "ends_with_eos" + HAS_DISFLUENCY = "has_disfluency" + STARTS_WITH_DISFLUENCY = "starts_with_disfluency" + ENDS_WITH_DISFLUENCY = "ends_with_disfluency" + HIGH_DISFLUENCY_COUNT = "high_disfluency_count" + ENDS_WITH_PUNCTUATION = "ends_with_punctuation" + VERY_SLOW_SPEAKER = "very_slow_speaker" + SLOW_SPEAKER = "slow_speaker" + FAST_SPEAKER = "fast_speaker" + ONLY_PUNCTUATION = "only_punctuation" + MULTIPLE_SPEAKERS = "multiple_speakers" + NO_TEXT = "no_text" + + # End of utterance detection + END_OF_UTTERANCE = "end_of_utterance" + + +# ============================================================================== +# CONFIGURATION MODELS +# ============================================================================== + + +class BaseConfigModel(BaseModel): + """Base configuration model.""" + + model_config = ConfigDict(extra="forbid") + + +class AdditionalVocabEntry(BaseConfigModel): + """Additional vocabulary entry. + + Parameters: + content: The word to add to the dictionary. + sounds_like: Similar words to the word. + + Examples: + Adding a brand name: + >>> vocab = AdditionalVocabEntry( + ... content="Speechmatics", + ... sounds_like=["speech mattics", "speech matics"] + ... ) + + Adding technical terms: + >>> vocab_list = [ + ... AdditionalVocabEntry(content="API", sounds_like=["A P I"]), + ... 
AdditionalVocabEntry(content="WebSocket", sounds_like=["web socket"]) + ... ] + >>> config = VoiceAgentConfig( + ... language="en", + ... additional_vocab=vocab_list + ... ) + """ + + content: str + sounds_like: list[str] = Field(default_factory=list) + + +class SpeakerFocusConfig(BaseConfigModel): + """Speaker Focus Config. + + List of speakers to focus on, ignore and how to deal with speakers that are not + in focus. These settings can be changed during a session. Other changes may require + a new session. + + Parameters: + focus_speakers: List of speaker IDs to focus on. When enabled, only these speakers are + emitted as finalized frames and other speakers are considered passive. Words from + other speakers are still processed, but only emitted when a focussed speaker has + also said new words. A list of labels (e.g. `S1`, `S2`) or identifiers of known + speakers (e.g. `speaker_1`, `speaker_2`) can be used. + Defaults to []. + + ignore_speakers: List of speaker IDs to ignore. When enabled, these speakers are + excluded from the transcription and their words are not processed. Their speech + will not trigger any VAD or end of utterance detection. By default, any speaker + with a label starting and ending with double underscores will be excluded (e.g. + `__ASSISTANT__`). + Defaults to []. + + focus_mode: Speaker focus mode for diarization. When set to `SpeakerFocusMode.RETAIN`, + the STT engine will retain words spoken by other speakers (not listed in `ignore_speakers`) + and process them as passive speaker frames. When set to `SpeakerFocusMode.IGNORE`, + the STT engine will ignore words spoken by other speakers and they will not be processed. + Defaults to `SpeakerFocusMode.RETAIN`. + """ + + focus_speakers: list[str] = Field(default_factory=list) + ignore_speakers: list[str] = Field(default_factory=list) + focus_mode: SpeakerFocusMode = SpeakerFocusMode.RETAIN + + +class SpeechSegmentConfig(BaseConfigModel): + """Configuration on how segments are emitted. + + Parameters: + add_trailing_eos: Add trailing end of sentence to segments. When enabled, segments are + emitted with missing trailing end of sentence added. Defaults to False. + + emit_sentences: Emit segments when a sentence has ended. A finalized segment is emitted + as soon as a finalized end of sentence is detected. If a speaker continues to speak during + a turn, then multiple finalized segments may be emitted during the turn. + + pause_mark: Add pause mark to segments. When set, a pause fragment will be added to the segment + when a pause is detected using the string provided. For example, `...` would add this text + into the formatted output for a segment as `Hello ... how are you?`. + Defaults to None. + """ + + add_trailing_eos: bool = False + emit_sentences: bool = True + pause_mark: Optional[str] = None + + +class EndOfTurnPenaltyItem(BaseConfigModel): + """End of turn penalty item. + + Parameters: + penalty: Penalty value. + annotation: List of annotations to apply the penalty to. + is_not: Whether the penalty should be applied when the annotation is not present. + """ + + penalty: float + annotation: list[AnnotationFlags] + is_not: bool = False + + +class EndOfTurnConfig(BaseConfigModel): + """Configuration for end of turn. + + Parameters: + base_multiplier: Base multiplier for end of turn delay. + min_end_of_turn_delay: Minimum end of turn delay. + end_of_turn_adjustment_factor: End of turn adjustment factor. + penalties: List of end of turn penalty items. 
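A hedged sketch of overriding these defaults from application code; the imported names are assumed to be exported from `speechmatics.voice`, and the penalty values are illustrative rather than recommended.

```python
from speechmatics.voice import (
    AnnotationFlags,
    EndOfTurnConfig,
    EndOfTurnPenaltyItem,
    EndOfUtteranceMode,
    VoiceAgentConfig,
)

config = VoiceAgentConfig(
    language="en",
    end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
    end_of_turn_config=EndOfTurnConfig(
        base_multiplier=1.2,   # be slightly more patient overall
        penalties=[
            # Wait much longer when the speaker trails off on a disfluency.
            EndOfTurnPenaltyItem(
                penalty=3.0,
                annotation=[AnnotationFlags.ENDS_WITH_DISFLUENCY],
            ),
            # Finalize quickly once a finalized end of sentence has landed.
            EndOfTurnPenaltyItem(
                penalty=0.25,
                annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS],
            ),
        ],
    ),
)
```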
+ """ + + base_multiplier: float = 1.0 + min_end_of_turn_delay: float = 0.015 + end_of_turn_adjustment_factor: float = 1.0 + penalties: list[EndOfTurnPenaltyItem] = Field( + default_factory=lambda: [ + # Increase delay + EndOfTurnPenaltyItem(penalty=3.0, annotation=[AnnotationFlags.VERY_SLOW_SPEAKER]), + EndOfTurnPenaltyItem(penalty=2.0, annotation=[AnnotationFlags.SLOW_SPEAKER]), + EndOfTurnPenaltyItem(penalty=2.5, annotation=[AnnotationFlags.ENDS_WITH_DISFLUENCY]), + EndOfTurnPenaltyItem(penalty=1.2, annotation=[AnnotationFlags.HAS_DISFLUENCY]), + EndOfTurnPenaltyItem( + penalty=2.0, + annotation=[AnnotationFlags.ENDS_WITH_EOS], + is_not=True, + ), + # Decrease delay + EndOfTurnPenaltyItem( + penalty=0.25, annotation=[AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS] + ), + ] + ) + + +class SmartTurnConfig(BaseConfigModel): + """Smart turn configuration for the Speechmatics Voice Agent. + + This configuration is used to determine when a turn has completed. It is used to + extract slices of recent audio for post-processing by end of thought models. + + Parameters: + audio_buffer_length: Length of audio buffer to extract slices of recent audio for post-processing + by end of thought models. Defaults to 0.0 seconds. + + smart_turn_threshold: Smart turn threshold. This is used to determine when a turn has completed. + Only used when `end_of_utterance_mode` is `EndOfUtteranceMode.SMART_TURN`. Defaults to 0.5. + + slice_margin: Margin to add to the audio buffer to ensure that the end of thought models have + enough audio to work with. Defaults to 0.05 seconds. + + positive_penalty: Positive penalty for smart turn. Defaults to -1.0. + + negative_penalty: Negative penalty for smart turn. Defaults to 2.5. + + Examples: + >>> config = SmartTurnConfig( + ... audio_buffer_length=0.5, + ... smart_turn_threshold=0.5, + ... slice_margin=0.05 + ... ) + """ + + audio_buffer_length: float = 0.0 + smart_turn_threshold: float = 0.5 + slice_margin: float = 0.05 + positive_penalty: float = 0.2 + negative_penalty: float = 2.5 + + +class VoiceAgentConfig(BaseConfigModel): + """Voice Agent configuration. + + A framework-independent configuration object for the Speechmatics Voice Agent. This uses + utility functions to create `TranscriptionConfig` and `AudioConfig` objects and also retain + agent configuration for the `VoiceAgentClient`. + + Parameters: + operating_point: Operating point for transcription accuracy vs. latency tradeoff. It is + recommended to use `OperatingPoint.ENHANCED` for most use cases. Defaults to + `OperatingPoint.ENHANCED`. + + domain: Domain for Speechmatics API. Defaults to `None`. + + language: Language code for transcription. Defaults to `en`. + + output_locale: Output locale for transcription, e.g. `en-GB`. Defaults to `None`. + + max_delay: Maximum delay in seconds for transcription. This forces the STT engine to + speed up the processing of transcribed words and reduces the interval between partial + and final results. Lower values can have an impact on accuracy. Defaults to `0.7`. + + end_of_utterance_silence_trigger: Maximum delay in seconds for end of utterance trigger. + The delay is used to wait for any further transcribed words before emitting the final + word frames. The value must be lower than max_delay. + Defaults to `0.2`. + + end_of_utterance_max_delay: Maximum delay in seconds for end of utterance delay. + The delay is used to wait for any further transcribed words before emitting the final + word frames. 
The value must be greater than end_of_utterance_silence_trigger.
+            Defaults to `10.0`.
+
+        end_of_utterance_mode: End of utterance delay mode. When ADAPTIVE is used, the delay
+            can be adjusted based on the content of what the most recent speaker has said, such as
+            rate of speech and whether they have any pauses or disfluencies. When FIXED is used,
+            the delay is fixed to the value of `end_of_utterance_silence_trigger`. Use of NONE disables
+            end of utterance detection and uses a fallback timer.
+            Defaults to `EndOfUtteranceMode.FIXED`.
+
+        additional_vocab: List of additional vocabulary entries. If you supply a list of
+            additional vocabulary entries, this will increase the weight of the words in the
+            vocabulary and help the STT engine to better transcribe the words.
+            Defaults to [].
+
+        punctuation_overrides: Punctuation overrides. This allows you to override the punctuation
+            in the STT engine. This is useful for languages that use different punctuation
+            than English. See documentation for more information.
+            Defaults to `None`.
+
+        enable_diarization: Enable speaker diarization. When enabled, the STT engine will
+            determine and attribute words to unique speakers. The speaker_sensitivity
+            parameter can be used to adjust the sensitivity of diarization.
+            Defaults to `False`.
+
+        include_partials: Include partial segment fragments (words) in the output of
+            AddPartialSegment messages. Partial fragments from the STT will always be used for
+            speaker activity detection. If `include_results` is enabled, then partials will
+            always be included in the segment fragment list. This setting is used only for
+            the formatted text output of individual segments.
+            Defaults to `True`.
+
+        speaker_sensitivity: Diarization sensitivity. A higher value increases the sensitivity
+            of diarization and helps when two or more speakers have similar voices.
+            Defaults to `0.5`.
+
+        max_speakers: Maximum number of speakers to detect. This forces the STT engine to cluster
+            words into a fixed number of speakers. It should not be used to limit the number of
+            speakers, unless it is clear that there will only be a known number of speakers.
+            Defaults to `None`.
+
+        prefer_current_speaker: Prefer current speaker ID. When set to true, groups of words close
+            together are given extra weight to be identified as the same speaker.
+            Defaults to False.
+
+        speaker_config: SpeakerFocusConfig to configure the speakers to focus on, ignore and
+            how to deal with speakers that are not in focus.
+
+        known_speakers: List of known speaker labels and identifiers. If you supply a list of
+            labels and identifiers for speakers, then the STT engine will use them to attribute
+            any spoken words to that speaker. This is useful when you want to attribute words
+            to a specific speaker, such as the assistant or a specific user. Labels and identifiers
+            can be obtained from a running STT session and then used in subsequent sessions.
+            Identifiers are unique to each Speechmatics account and cannot be used across accounts.
+            Refer to our examples on the format of the known_speakers parameter.
+            Defaults to [].
+
+        include_results: Include word data in the response. This is useful for debugging and
+            understanding the STT engine's behavior. Defaults to False.
+
+        use_forced_eou_message: Use forced end of utterance message. This will force the STT engine to emit
+            end of utterance messages. Defaults to False.
+
+        transcription_update_preset: Emit segments when the text content or word timings change.
+ Options are: `COMPLETE` (emit on changes to text content), `COMPLETE_PLUS_TIMING` + (emit on changes to text content and word timings), `WORDS` (emit on changes to word + content, without punctuation), `WORDS_PLUS_TIMING` (emit on changes to word content + and word timings), and `TIMING` (emit on changes to word timings, not recommended). + Defaults to `TranscriptionUpdatePreset.COMPLETE`. + + end_of_turn_config: End of turn configuration for the Speechmatics Voice Agent. + + smart_turn_config: Smart turn configuration for the Speechmatics Voice Agent. + + speech_segment_config: Speech segment configuration for the Speechmatics Voice Agent. + + advanced_engine_control: Internal use only. + + sample_rate: Audio sample rate for streaming. Defaults to `16000`. + audio_encoding: Audio encoding format. Defaults to `AudioEncoding.PCM_S16LE`. + + Examples: + Basic configuration: + >>> config = VoiceAgentConfig(language="en") + + With diarization enabled: + >>> config = VoiceAgentConfig( + ... language="en", + ... enable_diarization=True, + ... speaker_sensitivity=0.7 + ... ) + + With custom vocabulary: + >>> config = VoiceAgentConfig( + ... language="en", + ... additional_vocab=[ + ... AdditionalVocabEntry( + ... content="Speechmatics", + ... sounds_like=["speech mattics"] + ... ) + ... ] + ... ) + + Advanced configuration with speaker focus: + >>> config = VoiceAgentConfig( + ... language="en", + ... enable_diarization=True, + ... speaker_config=SpeakerFocusConfig( + ... focus_speakers=["S1"], + ... focus_mode=SpeakerFocusMode.RETAIN + ... ), + ... end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE + ... ) + + With known speakers: + >>> config = VoiceAgentConfig( + ... language="en", + ... enable_diarization=True, + ... known_speakers=[ + ... SpeakerIdentifier( + ... label="Alice", + ... speaker_identifiers=["speaker_abc123"] + ... ) + ... ] + ... ) + + Complete example with multiple features: + >>> config = VoiceAgentConfig( + ... language="en", + ... operating_point=OperatingPoint.ENHANCED, + ... enable_diarization=True, + ... speaker_sensitivity=0.7, + ... max_speakers=3, + ... end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN, + ... smart_turn_config=SmartTurnConfig( + ... smart_turn_threshold=0.5 + ... ), + ... additional_vocab=[ + ... AdditionalVocabEntry(content="API"), + ... AdditionalVocabEntry(content="WebSocket") + ... ], + ... speaker_config=SpeakerFocusConfig( + ... focus_speakers=["S1", "S2"] + ... ) + ... 
) + """ + + # Service configuration + operating_point: OperatingPoint = OperatingPoint.ENHANCED + domain: Optional[str] = None + language: str = "en" + output_locale: Optional[str] = None + + # Features + max_delay: float = 0.7 + end_of_utterance_silence_trigger: float = 0.2 + end_of_utterance_max_delay: float = 10.0 + end_of_utterance_mode: EndOfUtteranceMode = EndOfUtteranceMode.FIXED + additional_vocab: list[AdditionalVocabEntry] = Field(default_factory=list) + punctuation_overrides: Optional[dict] = None + + # Diarization + enable_diarization: bool = False + include_partials: bool = True + speaker_sensitivity: float = 0.5 + max_speakers: Optional[int] = None + prefer_current_speaker: bool = False + speaker_config: SpeakerFocusConfig = Field(default_factory=SpeakerFocusConfig) + known_speakers: list[SpeakerIdentifier] = Field(default_factory=list) + + # Advanced features + include_results: bool = False + use_forced_eou_message: bool = False + transcription_update_preset: TranscriptionUpdatePreset = TranscriptionUpdatePreset.COMPLETE + end_of_turn_config: EndOfTurnConfig = Field(default_factory=EndOfTurnConfig) + smart_turn_config: SmartTurnConfig = Field(default_factory=SmartTurnConfig) + speech_segment_config: SpeechSegmentConfig = Field(default_factory=SpeechSegmentConfig) + + # Advanced engine configuration + advanced_engine_control: Optional[dict[str, Any]] = None + + # Audio + sample_rate: int = 16000 + audio_encoding: AudioEncoding = AudioEncoding.PCM_S16LE + + # Parse JSON + @classmethod + def from_json(cls, json_data: str) -> VoiceAgentConfig: + """Convert a JSON string to a VoiceAgentConfig object.""" + cfg: VoiceAgentConfig = cls.model_validate_json(json_data) + return cfg + + # To JSON + def to_json(self) -> str: + """Convert the model to a JSON string.""" + config_str: str = self.model_dump_json(exclude_none=True, exclude_defaults=True, exclude_unset=True) + return config_str + + +# ============================================================================== +# SESSION & INFO MODELS +# ============================================================================== + + +class LanguagePackInfo(BaseModel): + """Information about the language pack used in a session. + + Attributes: + adapted (bool): Whether the language pack is adapted. + itn (bool): Whether the language pack has ITN enabled. + language_description (str): The language description. + word_delimiter (str): The word delimiter. + writing_direction (str): The writing direction ('ltr' or 'rtl'). + """ + + adapted: bool = False + itn: bool = True + language_description: str = "English" + word_delimiter: str = " " + writing_direction: str = "ltr" + + +class ClientSessionInfo(BaseModel): + """Information about the session. + + Attributes: + config (VoiceAgentConfig): The configuration for the session. + session_id (str): The session ID. + base_time (datetime.datetime): The base time for the session. + language_pack_info (LanguagePackInfo): The language pack info for the session. + """ + + config: VoiceAgentConfig + session_id: str + base_time: datetime.datetime + language_pack_info: LanguagePackInfo + + +class SessionSpeaker(BaseModel): + """Info on a speaker in a session. + + Attributes: + speaker_id (str): The speaker ID. + word_count (int): The word count for the speaker. + last_heard (float): The last time the speaker was heard. + volume (Optional[float]): The average volume of the speaker (mean of last 50 values). 
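Returning to the `from_json`/`to_json` helpers above: because `to_json()` serializes with `exclude_defaults` and `exclude_unset`, only fields that were explicitly changed appear in the output. A minimal sketch of the round trip:

```python
# Only the two explicitly set, non-default fields end up in the JSON string.
config = VoiceAgentConfig(enable_diarization=True, speaker_sensitivity=0.7)
payload = config.to_json()
restored = VoiceAgentConfig.from_json(payload)
assert restored.enable_diarization is True
assert restored.speaker_sensitivity == 0.7
```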
+ """ + + speaker_id: str + word_count: int = 0 + last_heard: float = 0 + volume: Optional[float] = None + final_word_count: int = Field(default=0, exclude=True) + volume_history: list[float] = Field(default_factory=list, exclude=True) + + def update_volume(self, new_volume: float) -> None: + """Update volume with average from last N values. + + Args: + new_volume: The new volume value to add. + """ + # Track volume history (last N values) + self.volume_history.append(new_volume) + while len(self.volume_history) > 10: + self.volume_history.pop(0) + + # Calculate average from history + self.volume = round(sum(self.volume_history) / len(self.volume_history), 1) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, SessionSpeaker): + return False + return ( + self.speaker_id == other.speaker_id + and self.word_count == other.word_count + and self.last_heard == other.last_heard + and self.volume == other.volume + ) + + +class AnnotationResult(list): + """Processing result.""" + + @staticmethod + def from_flags(*flags: AnnotationFlags) -> AnnotationResult: + """Create an AnnotationResult from a list of flags.""" + r = AnnotationResult() + r.add(*flags) + return r + + def add(self, *flags: AnnotationFlags) -> None: + """Add a flag(s) to the object.""" + for flag in flags: + if flag not in self: + self.append(flag.value) + + def remove(self, *flags: AnnotationFlags) -> None: + """Remove a flag(s) from the object.""" + for flag in flags: + if flag in self: + super().remove(flag.value) + + def has(self, *flags: AnnotationFlags) -> bool: + """Check if the object has all given flags.""" + return all(f.value in set(self) for f in flags) + + def any(self, *flags: AnnotationFlags) -> bool: + """Check if the object has any of the given flags.""" + return any(f.value in set(self) for f in flags) + + def __eq__(self, other: object) -> bool: + """Check if the object is equal to another.""" + if isinstance(other, AnnotationResult): + return set(self) == set(other) + return False + + +# ============================================================================== +# FRAGMENT & SEGMENT MODELS +# ============================================================================== + + +class SpeechFragment(BaseModel): + """Fragment of a speech event. + + As the transcript is processed (partials and finals), a list of SpeechFragments + objects are accumulated and then used to form SpeechSegments objects. + + Parameters: + idx: Index of the fragment in the list (used for sorting). + start_time: Start time of the fragment in seconds (from session start). + end_time: End time of the fragment in seconds (from session start). + language: Language of the fragment. Defaults to `en`. + direction: Direction of the fragment. Defaults to `ltr`. + type_: Type of the fragment. Defaults to `word`. + is_eos: Whether the fragment is the end of a sentence. Defaults to `False`. + is_final: Whether the fragment is the final fragment. Defaults to `False`. + is_disfluency: Whether the fragment is a disfluency. Defaults to `False`. + is_punctuation: Whether the fragment is a punctuation. Defaults to `False`. + attaches_to: Whether the fragment attaches to the previous or next fragment (punctuation). Defaults to empty string. + content: Content of the fragment. Defaults to empty string. + speaker: Speaker of the fragment (if diarization is enabled). Defaults to `None`. + confidence: Confidence of the fragment (0.0 to 1.0). Defaults to `1.0`. + volume: Volume of the fragment (0.0 to 100.0). Defaults to `None`. 
+        result: Raw result of the fragment from the STT engine.
+        annotation: Annotation for the fragment.
+    """
+
+    idx: int
+    start_time: float
+    end_time: float
+    language: str = "en"
+    direction: str = "ltr"
+    type_: str = "word"
+    is_eos: bool = False
+    is_final: bool = False
+    is_disfluency: bool = False
+    is_punctuation: bool = False
+    attaches_to: str = ""
+    content: str = ""
+    speaker: Optional[str] = None
+    confidence: float = 1.0
+    volume: Optional[float] = None
+    result: Optional[Any] = None
+    annotation: Optional[AnnotationResult] = None
+
+    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)
+
+
+class SpeakerSegment(BaseModel):
+    """SpeechFragment items grouped by speaker_id and whether the speaker is active.
+
+    Parameters:
+        speaker_id: The ID of the speaker.
+        is_active: Whether the speaker is active (emits frame).
+        timestamp: The timestamp of the frame.
+        language: The language of the frame.
+        fragments: The list of SpeechFragment items.
+        text: The text of the segment.
+        annotation: The annotation associated with the segment.
+    """
+
+    speaker_id: Optional[str] = None
+    is_active: bool = False
+    timestamp: Optional[str] = None
+    language: Optional[str] = None
+    fragments: list[SpeechFragment] = Field(default_factory=list)
+    text: Optional[str] = None
+    annotation: AnnotationResult = Field(default_factory=AnnotationResult)
+
+    model_config = ConfigDict(use_enum_values=True, arbitrary_types_allowed=True)
+
+    @property
+    def start_time(self) -> float:
+        """Return the start time of the segment."""
+        return self.fragments[0].start_time if self.fragments else 0.0
+
+    @property
+    def end_time(self) -> float:
+        """Return the end time of the segment."""
+        return self.fragments[-1].end_time if self.fragments else 0.0
+
+    def model_dump(self, include_results: bool = False, **kwargs: Any) -> dict[str, Any]:
+        """Override model_dump to control fragments/results inclusion."""
+
+        # Always exclude fragments from the base dump
+        kwargs["exclude"] = {"fragments"}
+        data: dict[str, Any] = super().model_dump(**kwargs)
+
+        # Add timing information
+        data["start_time"] = self.start_time
+        data["end_time"] = self.end_time
+
+        # Add results if requested
+        if include_results:
+            data["results"] = [f.result for f in self.fragments]
+
+        # Return the dump
+        return data
+
+
+class SpeakerSegmentView(BaseModel):
+    """View for speaker fragments.
+
+    Parameters:
+        session: ClientSessionInfo object.
+        fragments: List of fragments.
+        focus_speakers: List of speakers to focus on or None.
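A small sketch of the `model_dump` override above (assuming `segment` is a populated `SpeakerSegment`): the raw fragment list is always dropped, segment timing is always added, and per-word results appear only on request.

```python
data = segment.model_dump()
assert "fragments" not in data                 # fragments are always excluded
print(data["start_time"], data["end_time"])    # taken from the first/last fragment

detailed = segment.model_dump(include_results=True)
print(detailed["results"])                     # raw STT results, one entry per fragment
```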
+ """ + + session: ClientSessionInfo + fragments: list[SpeechFragment] + segments: list[SpeakerSegment] = Field(default_factory=list) + focus_speakers: Optional[list[str]] = None + + def __init__( + self, + session: ClientSessionInfo, + fragments: list[SpeechFragment], + focus_speakers: Optional[list[str]] = None, + annotate_segments: bool = True, + **data: Any, + ) -> None: + # Lazy import to avoid circular dependency + from ._utils import FragmentUtils + + # Process fragments into a list of segments + segments = FragmentUtils.segment_list_from_fragments( + session=session, + fragments=fragments, + focus_speakers=focus_speakers, + annotate_segments=annotate_segments, + ) + + super().__init__(session=session, fragments=fragments, segments=segments, focus_speakers=focus_speakers, **data) + + @property + def start_time(self) -> float: + return self.fragments[0].start_time if self.fragments else 0.0 + + @property + def end_time(self) -> float: + return self.fragments[-1].end_time if self.fragments else 0.0 + + @property + def final_count(self) -> int: + return sum(1 for frag in self.fragments if frag.is_final) + + @property + def partial_count(self) -> int: + return sum(1 for frag in self.fragments if not frag.is_final) + + @property + def segment_count(self) -> int: + return len(self.segments) + + @property + def last_active_segment_index(self) -> int: + idx = next( + (i for i, segment in enumerate(reversed(self.segments)) if segment.is_active), + None, + ) + if idx is None: + return -1 + return len(self.segments) - idx - 1 + + def has_no_active_segments_remaining(self) -> bool: + return self.last_active_segment_index == -1 + + def format_view_text( + self, + format: str = "|{speaker_id}|{text}|", + separator: str = "", + words_only: bool = False, + include_partials: bool = True, + ) -> str: + """Format each segment into a single string. + + Args: + format: Format string. + separator: Separator string. + words_only: Whether to include only word fragments. + include_partials: Whether to include partial fragments in the output. + + Returns: + str: The formatted text. + """ + # Lazy import to avoid circular dependency + from ._utils import FragmentUtils + + return separator.join( + FragmentUtils.format_segment_text( + session=self.session, + segment=segment, + format=format, + words_only=words_only, + include_partials=include_partials, + ) + for segment in self.segments + ) + + def trim(self, start_time: float, end_time: float, annotate_segments: bool = True) -> None: + """Trim a segment view to a specific time range. + + Args: + start_time: Start time in seconds. + end_time: End time in seconds. + annotate_segments: Whether to annotate segments. 
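A sketch of `format_view_text` (assuming `view` is an existing `SpeakerSegmentView`); the available placeholders come from `FragmentUtils.format_segment_text` in `_utils.py`:

```python
# One line per segment, finalized words only.
transcript = view.format_view_text(
    format="[{start_time:.2f}-{end_time:.2f}] {speaker_id}: {text}",
    separator="\n",
    include_partials=False,
)
```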
+ """ + # Lazy import to avoid circular dependency + from ._utils import FragmentUtils + + self.fragments = [ + frag for frag in self.fragments if frag.start_time >= start_time and frag.end_time <= end_time + ] + self.segments = FragmentUtils.segment_list_from_fragments( + session=self.session, + fragments=self.fragments, + focus_speakers=self.focus_speakers, + annotate_segments=annotate_segments, + ) + + +# ============================================================================== +# MESSAGES / PAYLOADS +# ============================================================================== + + +class BaseMessageModel(BaseModel): + """Base model for all messages.""" + + def model_dump(self, *args: Any, **kwargs: Any) -> dict[str, Any]: + """Default to excluding None values.""" + return super().model_dump(*args, **kwargs, exclude_none=True, mode="json") # type: ignore[no-any-return] + + def model_dump_json(self, *args: Any, **kwargs: Any) -> str: + """Default to excluding None values.""" + return super().model_dump_json(*args, **kwargs, exclude_none=True) # type: ignore[no-any-return] + + +class BaseMessage(BaseMessageModel): + """Base model for all messages.""" + + message: AgentServerMessageType + + +class ErrorMessage(BaseMessage): + """Emitted when an error occurs. + + Parameters: + message: The message type. + reason: The reason for the error. + """ + + message: AgentServerMessageType = AgentServerMessageType.ERROR + reason: str + + +class SessionMetricsMessage(BaseMessage): + """Emitted when metrics are calculated. + + Parameters: + message: The message type. + total_time: The total time in seconds. + total_time_str: The total time in HH:MM:SS format. + total_bytes: The total bytes sent to the STT engine. + processing_time: The latest processing time in seconds. + """ + + message: AgentServerMessageType = AgentServerMessageType.SESSION_METRICS + total_time: float + total_time_str: str + total_bytes: int + processing_time: float + + +class VADStatusMessage(BaseMessage): + """Emitted when a speaker starts or ends speaking. + + The speaker id is taken from the last word in the segment when + the event is emitted. + + Parameters: + message: The message type. + is_active: Whether the speaker is active. + speaker_id: The ID of the speaker. + time: The time of the event (start for STARTED, end for ENDED). + """ + + message: Literal[AgentServerMessageType.SPEAKER_STARTED, AgentServerMessageType.SPEAKER_ENDED] + is_active: bool + speaker_id: Optional[str] = None + time: Optional[float] = None + + +class MessageTimeMetadata(BaseMessageModel): + """Metadata for segment messages. + + Parameters: + time: The time of the event. + start_time: The start time of the segment. + end_time: The end time of the segment. + processing_time: The processing time of the segment. + """ + + time: Optional[float] = None + start_time: Optional[float] = None + end_time: Optional[float] = None + processing_time: Optional[float] = None + + +class TurnStartEndResetMessage(BaseMessage): + """Emitted when a turn starts, ends or is reset. + + Parameters: + turn_id: The ID of the turn. + is_active: Whether the turn is active. + """ + + message: Literal[ + AgentServerMessageType.START_OF_TURN, + AgentServerMessageType.END_OF_TURN, + ] + turn_id: int + metadata: MessageTimeMetadata + + +class TurnPredictionMetadata(BaseMessageModel): + """Metadata for turn prediction messages. + + Parameters: + ttl: The time to live of the prediction in seconds. + reasons: The reasons for the prediction. 
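Because `BaseMessageModel.model_dump()` always excludes `None` values and serializes in JSON mode, unset optional fields simply disappear from emitted payloads. A sketch using `VADStatusMessage`:

```python
msg = VADStatusMessage(
    message=AgentServerMessageType.SPEAKER_STARTED,
    is_active=True,
    speaker_id="S1",
)
payload = msg.model_dump()
# payload contains "message", "is_active" and "speaker_id"; the unset "time" is omitted
```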
+ """ + + ttl: float + reasons: list[str] + + +class TurnPredictionMessage(BaseMessage): + """Emitted when a turn prediction is made.""" + + message: AgentServerMessageType = AgentServerMessageType.END_OF_TURN_PREDICTION + turn_id: int + metadata: TurnPredictionMetadata + + +class SpeakerMetricsMessage(BaseMessage): + """Emitted when the speaker metrics are updated. + + Parameters: + speakers: List of speakers. + """ + + message: AgentServerMessageType = AgentServerMessageType.SPEAKER_METRICS + speakers: list[SessionSpeaker] + + +class SegmentMessageSegmentFragment(BaseMessageModel): + """Speech fragment for segment messages. + + Parameters: + start_time: The start time of the fragment. + end_time: The end time of the fragment. + language: The language of the fragment. + direction: The direction of the fragment. + type_: The type of the fragment. + content: The content of the fragment. + attaches_to: The ID of the fragment that this fragment attaches to. + """ + + start_time: float + end_time: float + language: str = "en" + direction: str = "ltr" + type: str = Field(default="word", alias="type_") + content: str = "" + attaches_to: str = "" + + model_config = ConfigDict(extra="ignore") + + +class SegmentMessageSegment(BaseMessageModel): + """Partial or final segment. + + Parameters: + speaker_id: The ID of the speaker. + is_active: Whether the speaker is active (emits frame). + timestamp: The timestamp of the frame. + language: The language of the frame. + text: The text of the segment. + fragments: The fragments associated with the segment. + annotation: The annotation associated with the segment. + metadata: The metadata associated with the segment. + """ + + speaker_id: Optional[str] = None + is_active: bool = False + timestamp: Optional[str] = None + language: Optional[str] = None + text: Optional[str] = None + fragments: Optional[list[SegmentMessageSegmentFragment]] = None + annotation: list[AnnotationFlags] = Field(default_factory=list) + metadata: MessageTimeMetadata + + +class SegmentMessage(BaseMessage): + """Emitted when a segment is added to the session.""" + + message: Literal[AgentServerMessageType.ADD_PARTIAL_SEGMENT, AgentServerMessageType.ADD_SEGMENT] + segments: list[SegmentMessageSegment] + metadata: MessageTimeMetadata diff --git a/sdk/voice/speechmatics/voice/_presets.py b/sdk/voice/speechmatics/voice/_presets.py new file mode 100644 index 0000000..2452d14 --- /dev/null +++ b/sdk/voice/speechmatics/voice/_presets.py @@ -0,0 +1,178 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +from typing import Optional + +from ._models import EndOfUtteranceMode +from ._models import OperatingPoint +from ._models import SpeechSegmentConfig +from ._models import VoiceAgentConfig + + +class VoiceAgentConfigPreset: + """Set of preset configurations for the Voice Agent SDK.""" + + @staticmethod + def LOW_LATENCY(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig: # noqa: N802 + """Best suited for low latency situations. + + This configuration will emit the end of turn as soon as possible, with minimal + delay to finalizing the spoken sentences. It is not recommended for + conversation, as it will not account for pauses, slow speech or disfluencies. 
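Stepping back to the segment messages defined above in `_models.py`, a hypothetical handler might read them like this (a sketch only; how messages reach the handler depends on the client wiring):

```python
def handle_segment_message(msg: SegmentMessage) -> None:
    # Print the formatted text of active (focused) speakers only.
    for seg in msg.segments:
        if seg.is_active and seg.text:
            print(f"{seg.speaker_id}: {seg.text}")
```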
+        """
+        return VoiceAgentConfigPreset._merge_configs(
+            VoiceAgentConfig(
+                operating_point=OperatingPoint.STANDARD,
+                enable_diarization=True,
+                max_delay=0.7,
+                end_of_utterance_silence_trigger=0.5,
+                end_of_utterance_mode=EndOfUtteranceMode.FIXED,
+                speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
+            ),
+            overlay,
+        )
+
+    @staticmethod
+    def CONVERSATION_ADAPTIVE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
+        """Best suited for general conversational use cases.
+
+        For conversation, there is a balance between accuracy, speed and the rate at
+        which the end of turn is emitted. The use of ADAPTIVE means that the delay to
+        finalizing the spoken sentences will be adjusted based on the words and whether
+        there are any pauses, slow speech or disfluencies.
+        """
+        return VoiceAgentConfigPreset._merge_configs(
+            VoiceAgentConfig(
+                operating_point=OperatingPoint.ENHANCED,
+                enable_diarization=True,
+                max_delay=0.7,
+                end_of_utterance_silence_trigger=1.0,
+                end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE,
+                speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+            ),
+            overlay,
+        )
+
+    @staticmethod
+    def CONVERSATION_SMART_TURN(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
+        """Best suited for complex conversational use cases.
+
+        For conversation, there is a balance between accuracy, speed and the rate at
+        which the end of turn is emitted. The use of SMART_TURN means that the delay to
+        finalizing the spoken sentences will be adjusted based on the words and whether
+        there are any pauses, slow speech or disfluencies.
+
+        This preset will use a model to detect acoustic indicators from the
+        speaker to determine when a turn has ended.
+
+        Use of this preset will require `pip install speechmatics-voice[smart]` and may not
+        be suited to low-power devices.
+        """
+        return VoiceAgentConfigPreset._merge_configs(
+            VoiceAgentConfig(
+                operating_point=OperatingPoint.ENHANCED,
+                enable_diarization=True,
+                max_delay=0.7,
+                end_of_utterance_silence_trigger=1.0,
+                end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+                speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+            ),
+            overlay,
+        )
+
+    @staticmethod
+    def SCRIBE(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
+        """Best suited for note-taking and scribes.
+
+        This mode will emit partial and final segments as they become available. The end of
+        utterance is set to fixed. End of turn is not required for note-taking.
+        """
+        return VoiceAgentConfigPreset._merge_configs(
+            VoiceAgentConfig(
+                operating_point=OperatingPoint.ENHANCED,
+                enable_diarization=True,
+                max_delay=1.0,
+                end_of_utterance_silence_trigger=1.2,
+                end_of_utterance_mode=EndOfUtteranceMode.FIXED,
+                speech_segment_config=SpeechSegmentConfig(emit_sentences=True),
+            ),
+            overlay,
+        )
+
+    @staticmethod
+    def CAPTIONS(overlay: Optional[VoiceAgentConfig] = None) -> VoiceAgentConfig:  # noqa: N802
+        """Best suited for captions.
+
+        This mode will emit partial and final segments as they become available. The end of
+        utterance is set to fixed. End of turn is not required for captions. The segments
+        will only include finalized words.
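A sketch of starting from a preset and overlaying a couple of tweaks; `_merge_configs` (defined further down) keeps the preset's values and applies only the fields explicitly set on the overlay:

```python
# Conversational preset, but in German and with a slightly lower max_delay.
config = VoiceAgentConfigPreset.CONVERSATION_ADAPTIVE(
    overlay=VoiceAgentConfig(language="de", max_delay=0.6)
)
```

The `load()` helper below does the same from a preset name and an optional JSON overlay string.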
+ """ + return VoiceAgentConfigPreset._merge_configs( + VoiceAgentConfig( + operating_point=OperatingPoint.ENHANCED, + enable_diarization=True, + max_delay=0.9, + end_of_utterance_silence_trigger=1.2, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + speech_segment_config=SpeechSegmentConfig(emit_sentences=True), + include_partials=False, + ), + overlay, + ) + + @staticmethod + def list_presets() -> list[str]: + """List available presets.""" + return [attr.lower() for attr in dir(VoiceAgentConfigPreset) if not attr.startswith("_") and attr.isupper()] + + @staticmethod + def load(preset: str, overlay_json: Optional[str] = None) -> VoiceAgentConfig: + """Get a preset configuration. + + Args: + preset: Preset to use. + overlay_json: Optional overlay JSON to apply to the preset. + + Returns: + VoiceAgentConfig: Preset configuration. + """ + try: + config: VoiceAgentConfig = getattr(VoiceAgentConfigPreset, preset.upper())() + if overlay_json is not None: + overlay = VoiceAgentConfig.model_validate_json(overlay_json) + config = VoiceAgentConfigPreset._merge_configs(config, overlay) + return config + except ValueError: + raise ValueError(f"Invalid overlay JSON: {overlay_json}") + except AttributeError: + raise ValueError(f"Invalid preset: {preset}") + + @staticmethod + def _merge_configs(base: VoiceAgentConfig, overlay: Optional[VoiceAgentConfig]) -> VoiceAgentConfig: + """Merge two VoiceAgentConfig objects. + + Simply merge any overrides from the overlay into the base config. This makes creating + custom configs from presets easier. + + Args: + base: Base config to merge into. + overlay: Overlay config to merge from. + + Returns: + Merged config. + + """ + + # No overlay required + if overlay is None: + return base + + # Merge overlay into base - use model_validate to properly reconstruct nested models + merged_dict = { + **base.model_dump(exclude_unset=True, exclude_none=True), + **overlay.model_dump(exclude_unset=True, exclude_none=True), + } + return VoiceAgentConfig.model_validate(merged_dict) # type: ignore[no-any-return] diff --git a/sdk/voice/speechmatics/voice/_smart_turn.py b/sdk/voice/speechmatics/voice/_smart_turn.py new file mode 100644 index 0000000..011318d --- /dev/null +++ b/sdk/voice/speechmatics/voice/_smart_turn.py @@ -0,0 +1,326 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import datetime +import logging +import os +import ssl +import urllib.request +from typing import Any +from typing import Optional +from urllib.parse import urlparse + +import numpy as np +from pydantic import BaseModel + +ort: Any +WhisperFeatureExtractor: Any +logger = logging.getLogger(__name__) + +try: + os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1" + import certifi + import onnxruntime as _ort + from transformers import WhisperFeatureExtractor as _WhisperFeatureExtractor + + ort = _ort + WhisperFeatureExtractor = _WhisperFeatureExtractor + + def _create_ssl_context(*args: Any, **kwargs: Any) -> ssl.SSLContext: + """Create SSL context with certifi certificates.""" + if "cafile" not in kwargs: + kwargs["cafile"] = certifi.where() + return ssl.create_default_context(*args, **kwargs) + + ssl._create_default_https_context = _create_ssl_context + +except ModuleNotFoundError: + WhisperFeatureExtractor = None + ort = None + + +# Base model from HuggingFace +SMART_TURN_MODEL_URL = os.getenv( + "SMART_TURN_HF_URL", "https://huggingface.co/pipecat-ai/smart-turn-v3/resolve/main/smart-turn-v3.0.onnx" +) +SMART_TURN_MODEL_LOCAL_PATH = 
os.getenv("SMART_TURN_MODEL_PATH", ".models/smart-turn-v3.0.onnx") + +# Hint for when dependencies are not available +SMART_TURN_INSTALL_HINT = "SMART_TURN mode unavailable. Install `speechmatics-voice[smart]` to enable SMART_TURN mode." + + +class SmartTurnPredictionResult(BaseModel): + """Prediction result from the smart turn detector. + + Attributes: + prediction: True for complete, False for incomplete + probability: Probability of completion (sigmoid output) + processing_time: Time taken to process the audio (in seconds) + error: Error message if an error occurred + """ + + prediction: bool = False + probability: float = 0.0 + processing_time: Optional[float] = None + error: Optional[str] = None + + +class SmartTurnDetector: + """Smart Turn Detector. + + Uses Pipecat's opensource acoustic model for determining if an audio sample + is predicted to be complete or incomplete. + + Further information at https://github.com/pipecat-ai/smart-turn + """ + + def __init__(self, auto_init: bool = True, threshold: float = 0.8): + """Create the new SmartTurnDetector. + + Args: + auto_init: Whether to automatically initialise the detector. + threshold: Probability threshold for turn completion (0.0-1.0). + """ + + # Has initialized + self._is_initialized: bool = False + + # Threshold + self._threshold: float = threshold + + # If auto_init is True, setup the detector + if auto_init: + self.setup() + + @staticmethod + def dependencies_available() -> bool: + """Return whether optional Smart Turn dependencies are installed.""" + return ort is not None and WhisperFeatureExtractor is not None + + def setup(self) -> None: + """Setup the detector. + + Initialises the ONNX model and feature extractor. + """ + + # Show warning if dependencies are not available + if not self.dependencies_available(): + logger.warning(SMART_TURN_INSTALL_HINT) + return + + try: + # Check / download the model + self.download_model() + + # Check the model downloaded + if not self.model_exists(): + logger.warning("Smart Turn model not found. Please download the model first.") + return + + # Build the session + self.session = self.build_session(SMART_TURN_MODEL_LOCAL_PATH) + + # Load the feature extractor + self.feature_extractor = WhisperFeatureExtractor(chunk_length=8) + + # Set initialized + self._is_initialized = True + + except Exception as e: + logger.error(f"Failed to setup SmartTurnDetector: {e}") + + def build_session(self, onnx_path: str) -> ort.InferenceSession: + """Build the ONNX session and load resources. + + Args: + onnx_path: Path to the ONNX model. + + Returns: + ONNX inference session. + """ + + # Show warning if dependencies are not available + if ort is None: + raise RuntimeError("onnxruntime is not available") + + # Build the session + so = ort.SessionOptions() + so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + so.inter_op_num_threads = 1 + so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + + # Return the new session + return ort.InferenceSession(onnx_path, sess_options=so) + + async def predict( + self, audio_array: bytes, language: str, sample_rate: int = 16000, sample_width: int = 2 + ) -> SmartTurnPredictionResult: + """Predict whether an audio segment is complete (turn ended) or incomplete. + + Args: + audio_array: Numpy array containing audio samples at 16kHz. The function + will convert the audio into float32 and truncate to 8 seconds (keeping the end) + or pad to 8 seconds. + language: Language of the audio. + sample_rate: Sample rate of the audio. 
+ sample_width: Sample width of the audio. + + Returns: + Prediction result containing completion status and probability. + """ + + # Check if initialized + if not self._is_initialized: + return SmartTurnPredictionResult(error="SmartTurnDetector is not initialized") + + # Check a valid language + if not self.valid_language(language): + logger.warning(f"Invalid language: {language}. Results may be unreliable.") + + # Record start time + start_time = datetime.datetime.now() + + # Convert into numpy array + dtype = np.int16 if sample_width == 2 else np.int8 + int16_array: np.ndarray = np.frombuffer(audio_array, dtype=dtype).astype(np.int16) + + # Process audio using Whisper's feature extractor + inputs = self.feature_extractor( + int16_array, + sampling_rate=sample_rate, + return_tensors="np", + padding="max_length", + max_length=8 * sample_rate, + truncation=True, + do_normalize=True, + ) + + # Extract features and ensure correct shape for ONNX + input_features = inputs.input_features.squeeze(0).astype(np.float32) + input_features = np.expand_dims(input_features, axis=0) + + # Run ONNX inference + outputs = self.session.run(None, {"input_features": input_features}) + + # Extract probability (ONNX model returns sigmoid probabilities) + probability = outputs[0][0].item() + + # Make prediction (True for Complete, False for Incomplete) + prediction = probability >= self._threshold + + # Record end time + end_time = datetime.datetime.now() + + # Return the result + return SmartTurnPredictionResult( + prediction=prediction, + probability=probability, + processing_time=float((end_time - start_time).total_seconds()), + ) + + @staticmethod + def truncate_audio_to_last_n_seconds( + audio_array: np.ndarray, n_seconds: float = 8.0, sample_rate: int = 16000 + ) -> np.ndarray: + """Truncate audio to last n seconds or pad with zeros to meet n seconds. + + Args: + audio_array: Numpy array containing audio samples at 16kHz. + n_seconds: Number of seconds to truncate to. + sample_rate: Sample rate of the audio. + + Returns: + Numpy array truncated to last n seconds or padded with zeros. + """ + + # Calculate the max samples we should have + max_samples = int(n_seconds * sample_rate) + + # Truncate if longer + if len(audio_array) > max_samples: + return audio_array[-max_samples:] + + # Pad if shorter + elif len(audio_array) < max_samples: + padding = max_samples - len(audio_array) + return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0) + + # Otherwise return the array + return audio_array + + @staticmethod + def download_model() -> None: + """Download the ONNX model. + + This will check if the model has been downloaded and is available in the + location specified by the SMART_TURN_MODEL_PATH environment variable. + + If not, it will download the model from HuggingFace. + """ + + # Check if model file exists + if SmartTurnDetector.model_exists(): + return + + # Check the URL for valid schemes + parsed_url = urlparse(SMART_TURN_MODEL_URL) + if parsed_url.scheme not in ("http", "https"): + logger.error(f"Invalid URL scheme: {parsed_url.scheme}") + return + + # Report to the user + logger.warning("Smart Turn model not found. Downloading from HuggingFace...") + + # Create the directory + os.makedirs(os.path.dirname(SMART_TURN_MODEL_LOCAL_PATH), exist_ok=True) + + # Download + urllib.request.urlretrieve(SMART_TURN_MODEL_URL, SMART_TURN_MODEL_LOCAL_PATH) # nosec B310 + + @staticmethod + def model_exists() -> bool: + """Check the model has been downloaded. 
+ + Returns: + True if the model file exists, False otherwise. + """ + return os.path.exists(SMART_TURN_MODEL_LOCAL_PATH) + + @staticmethod + def valid_language(language: str) -> bool: + """Check if the language is valid. + + Args: + language: Language code to validate. + + Returns: + True if the language is supported, False otherwise. + """ + return language in [ + "ar", + "bn", + "zh", + "da", + "nl", + "de", + "en", + "fi", + "fr", + "hi", + "id", + "it", + "ja", + "ko", + "mr", + "no", + "pl", + "pt", + "ru", + "es", + "tr", + "uk", + "vi", + ] diff --git a/sdk/voice/speechmatics/voice/_turn.py b/sdk/voice/speechmatics/voice/_turn.py new file mode 100644 index 0000000..09af054 --- /dev/null +++ b/sdk/voice/speechmatics/voice/_turn.py @@ -0,0 +1,161 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import asyncio +from typing import Callable +from typing import Optional + + +class TurnTaskProcessor: + """Container for turn task processing. + + This utility is used to make sure that all processing is completed within a turn. When a + process is added, once it completes and all other tasks have also completed, then it will + make a call to the `done_callback` function (sync or async). + """ + + def __init__(self, name: str, handler_id: int = 0, done_callback: Optional[Callable] = None): + """Create new handler. + + Args: + name: The name of the processor. + handler_id: The base handler id (used to validate tasks). + done_callback: The callback to call when all tasks are completed. + """ + + # Processor name + self._name = name + + # Handler id (used to validate tasks) + self._handler_id = handler_id + self._handler_active = False + + # Tasks + events + self._tasks: dict[str, asyncio.Task] = {} + self._listener_tasks: list[asyncio.Task] = [] + + # Done callback (can be async) + self._done_callback: Optional[Callable] = done_callback + + @property + def has_pending_tasks(self) -> bool: + """Check for any pending tasks. + + Returns: + True if there are pending tasks, False otherwise. + """ + return any(not task.done() for task in self._tasks.values()) + + @property + def handler_id(self) -> int: + """Get the handler id. + + Returns: + The current handler ID. + """ + return self._handler_id + + @property + def handler_active(self) -> bool: + """Get the handler active state. + + Returns: + The current handler active state. + """ + return self._handler_active + + def update_timer(self, delay: float) -> None: + """Set a new done trigger. + + Args: + delay: Delay in seconds before triggering done callback. + """ + + if delay < 0: + return + self.add_task( + asyncio.create_task(asyncio.sleep(delay)), + "done_task", + ) + + def add_task(self, task: asyncio.Task, task_name: str) -> None: + """Add a task to the end of turn. + + Args: + task: The asyncio task to add. + task_name: Name identifier for the task. 
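A sketch of how the processor is intended to be driven (the callback and task names here are hypothetical): the done callback fires only once every registered task, including the delay timer, has completed.

```python
import asyncio


async def example() -> None:
    async def finalize_turn() -> None:
        print("turn complete")

    processor = TurnTaskProcessor(name="eot", done_callback=finalize_turn)
    processor.start_handler()

    # A minimum wait plus an extra check; both must finish before the callback fires.
    processor.update_timer(0.8)
    processor.add_task(asyncio.create_task(asyncio.sleep(0.3)), "smart_turn_check")

    await asyncio.sleep(1.0)
    processor.complete_handler()


# asyncio.run(example())
```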
+ """ + + # Cancel any same-named tasks + if task_name in self._tasks and not self._tasks[task_name].done(): + self._tasks[task_name].cancel() + + # Add the task to the list + self._tasks[task_name] = task + + # Wait for the task + async def wait_for_task(task: asyncio.Task) -> None: + try: + _handler_id = self._handler_id + await task + if _handler_id != self._handler_id: + return + if not self.has_pending_tasks: + asyncio.create_task(self._do_done_callback()) + except asyncio.CancelledError: + pass + + # Start the task + asyncio.create_task(wait_for_task(task)) + + async def _do_done_callback(self) -> None: + """Do the done callback.""" + + # Do the callback + if self._done_callback: + try: + if asyncio.iscoroutinefunction(self._done_callback): + await self._done_callback() + else: + self._done_callback() + except Exception: + pass + + # Complete the task + # self.complete_handler() + + def cancel_tasks(self) -> None: + """Cancel any pending tasks.""" + for task in self._tasks.values(): + if not task.done(): + task.cancel() + self._tasks.clear() + + def reset(self) -> None: + """Reset the end of turn.""" + self.cancel_tasks() + + def start_handler(self) -> None: + """Start the end of turn.""" + self._handler_active = True + + def complete_handler(self) -> None: + """Complete the end of turn.""" + self.next() + self._handler_active = False + + def next(self) -> None: + """Increment the handler. id""" + self.reset() + self._handler_id += 1 + + def __str__(self) -> str: + """Get the string representation of the end of turn. + + Returns: + String representation of the processor state. + """ + return f"TurnTaskProcessor(name={self._name}, handler_id={self._handler_id}, tasks={self._tasks.keys()}, pending={self.has_pending_tasks})" diff --git a/sdk/voice/speechmatics/voice/_utils.py b/sdk/voice/speechmatics/voice/_utils.py new file mode 100644 index 0000000..31e7e24 --- /dev/null +++ b/sdk/voice/speechmatics/voice/_utils.py @@ -0,0 +1,492 @@ +# +# Copyright (c) 2025, Speechmatics / Cantab Research Ltd +# + +from __future__ import annotations + +import datetime +import re +import unicodedata +from typing import Optional + +from ._models import AnnotationFlags +from ._models import AnnotationResult +from ._models import ClientSessionInfo +from ._models import SpeakerSegment +from ._models import SpeakerSegmentView +from ._models import SpeechFragment + + +class FragmentUtils: + """Set of utility functions for working with SpeechFragment and SpeakerSegment objects.""" + + @staticmethod + def format_segment_text( + session: ClientSessionInfo, + segment: SpeakerSegment, + format: str = "{text}", + words_only: bool = False, + include_partials: bool = True, + ) -> str: + """Format a segment's text based on the language pack info. + + Args: + session: ClientSessionInfo object. + segment: SpeakerSegment object. + format: Format string. + words_only: Whether to include only word fragments. + include_partials: Whether to include partial fragments. + + Returns: + str: The formatted text. 
+ """ + + # Cumulative contents + content = "" + + # Select fragments to include + if words_only: + fragments = [frag for frag in segment.fragments if frag.type_ == "word"] + else: + fragments = segment.fragments + + # Filter out partials if requested + if not include_partials: + fragments = [frag for frag in fragments if frag.is_final] + + # Assemble the text + previous_frag: Optional[SpeechFragment] = None + for frag in fragments: + if not previous_frag: + content = frag.content + elif frag.attaches_to == "previous" or previous_frag.attaches_to == "next": + content += frag.content + else: + content += session.language_pack_info.word_delimiter + frag.content + previous_frag = frag + + # Return the formatted text + return format.format( + **{ + "speaker_id": segment.speaker_id, + "text": content, + "ts": segment.timestamp, + "lang": segment.language, + "start_time": fragments[0].start_time if fragments else 0, + "end_time": fragments[-1].end_time if fragments else 0, + "annotation": segment.annotation or [], + } + ) + + @staticmethod + def segment_list_from_fragments( + session: ClientSessionInfo, + fragments: list[SpeechFragment], + focus_speakers: Optional[list[str]] = None, + annotate_segments: bool = True, + ) -> list[SpeakerSegment]: + """Create SpeakerSegment objects from a list of SpeechFragment objects. + + Args: + session: ClientSessionInfo object. + fragments: List of SpeechFragment objects. + focus_speakers: List of speakers to focus on or None. + annotate_segments: Whether to annotate segments. + + Returns: + List of SpeakerSegment objects. + """ + + # Speaker groups + current_speaker: Optional[str] = None + speaker_groups: list[list[SpeechFragment]] = [[]] + + # Group by speakers + for frag in fragments: + if frag.speaker != current_speaker: + current_speaker = frag.speaker + if speaker_groups[-1]: + speaker_groups.append([]) + speaker_groups[-1].append(frag) + + # Create SpeakerFragments objects + segments: list[SpeakerSegment] = [] + for group in speaker_groups: + # Skip if the group is empty + if not group: + continue + + # Split group into sub-groups by end-of-sentence markers (finals only) + if session.config.speech_segment_config.emit_sentences: + subgroup: list[SpeechFragment] = [] + subgroups: list[list[SpeechFragment]] = [] + for frag in group: + subgroup.append(frag) + if frag.is_eos and frag.is_final: + subgroups.append(subgroup) + subgroup = [] + if subgroup: + subgroups.append(subgroup) + else: + subgroups = [group] + + # Process each of the sub-groups + for fragments_subset in subgroups: + segment = FragmentUtils.segment_from_fragments( + session=session, + fragments=fragments_subset, + focus_speakers=focus_speakers, + annotate=annotate_segments, + ) + if segment: + FragmentUtils.update_segment_text(session=session, segment=segment) + segments.append(segment) + + # Return the grouped SpeakerFragments objects + return segments + + @staticmethod + def update_segment_text(session: ClientSessionInfo, segment: SpeakerSegment) -> None: + """Update the text of a segment based on the language pack info. + + Args: + session: ClientSessionInfo object. + segment: SpeakerSegment object. 
+ """ + segment.text = FragmentUtils.format_segment_text( + session=session, segment=segment, include_partials=session.config.include_partials + ) + + @staticmethod + def segment_from_fragments( + session: ClientSessionInfo, + fragments: list[SpeechFragment], + focus_speakers: Optional[list[str]] = None, + annotate: bool = True, + ) -> Optional[SpeakerSegment]: + """Take a group of fragments and piece together into SpeakerSegment. + + Each fragment for a given speaker is assembled into a string, + taking into consideration whether words are attached to the + previous or next word (notably punctuation). This ensures that + the text does not have extra spaces. This will also check for + any straggling punctuation from earlier utterances that should + be removed. + + Args: + session: ClientSessionInfo object. + fragments: List of SpeechFragment objects. + focus_speakers: List of speakers to focus on. + annotate: Whether to annotate the segment. + + Returns: + The SpeakerSegment object for the group, or None if no valid fragments. + """ + # Check for starting fragments that are attached to previous + if fragments and fragments[0].attaches_to == "previous": + fragments = fragments[1:] + + # Check for trailing fragments that are attached to next + if fragments and fragments[-1].attaches_to == "next": + fragments = fragments[:-1] + + # Check there are results + if not fragments: + return None + + # Get the timing extremes + start_time = min(frag.start_time for frag in fragments) + + # Timestamp + ts = (session.base_time + datetime.timedelta(seconds=start_time)).isoformat(timespec="milliseconds") + + # Determine if the speaker is considered active + is_active = True + if focus_speakers: + is_active = fragments[0].speaker in focus_speakers + + # New SpeakerSegment + segment = SpeakerSegment( + speaker_id=fragments[0].speaker, + timestamp=ts, + language=fragments[0].language, + fragments=fragments, + is_active=is_active, + ) + + # Annotate + if annotate: + segment.annotation = FragmentUtils._annotate_segment(segment) + + # Return the SpeakerSegment object + return segment + + @staticmethod + def _annotate_segment(segment: SpeakerSegment) -> AnnotationResult: + """Annotate the segment with any additional information. + + Args: + segment: SpeakerSegment object. + + Returns: + AnnotationResult: The annotation result. 
+ """ + # Annotation result + result = AnnotationResult() + + # References + segment_length: int = len(segment.fragments) + first_fragment: SpeechFragment = segment.fragments[0] + last_fragment: SpeechFragment = segment.fragments[-1] + penultimate_fragment: Optional[SpeechFragment] = segment.fragments[-2] if segment_length > 1 else None + + # Count of words + words = [frag for frag in segment.fragments if frag.type_ == "word"] + word_count = len(words) + if word_count == 0: + result.add(AnnotationFlags.NO_TEXT) + + # Only punctuation + if all(frag.is_punctuation for frag in segment.fragments): + result.add(AnnotationFlags.ONLY_PUNCTUATION) + + # Partials and finals + if any(not frag.is_final for frag in segment.fragments): + result.add(AnnotationFlags.HAS_PARTIAL) + + # Finals + if any(frag.is_final for frag in segment.fragments): + result.add(AnnotationFlags.HAS_FINAL) + if first_fragment.is_final: + result.add(AnnotationFlags.STARTS_WITH_FINAL) + if last_fragment.is_final: + result.add(AnnotationFlags.ENDS_WITH_FINAL) + + # End of sentence + if last_fragment.is_eos: + result.add(AnnotationFlags.ENDS_WITH_EOS) + + # Punctuation + if last_fragment.is_punctuation: + result.add(AnnotationFlags.ENDS_WITH_PUNCTUATION) + + # Disfluency + if any(frag.is_disfluency for frag in segment.fragments): + result.add(AnnotationFlags.HAS_DISFLUENCY) + if first_fragment.is_disfluency: + result.add(AnnotationFlags.STARTS_WITH_DISFLUENCY) + if last_fragment.is_disfluency: + result.add(AnnotationFlags.ENDS_WITH_DISFLUENCY) + if ( + penultimate_fragment + and result.any(AnnotationFlags.ENDS_WITH_EOS, AnnotationFlags.ENDS_WITH_PUNCTUATION) + and penultimate_fragment.is_disfluency + ): + result.add(AnnotationFlags.ENDS_WITH_DISFLUENCY) + + # Rate of speech + if len(words) > 1: + # Calculate the approximate words-per-minute (for last few words) + recent_words = words[-10:] + word_time_span = recent_words[-1].end_time - recent_words[0].start_time + wpm = (len(recent_words) / word_time_span) * 60 + + # Categorize the speaker + if wpm < 80: + result.add(AnnotationFlags.VERY_SLOW_SPEAKER) + elif wpm < 120: + result.add(AnnotationFlags.SLOW_SPEAKER) + elif wpm > 250: + result.add(AnnotationFlags.FAST_SPEAKER) + + # Return the annotation result + return result + + @staticmethod + def compare_views( + session: ClientSessionInfo, view1: SpeakerSegmentView, view2: Optional[SpeakerSegmentView] + ) -> AnnotationResult: + """Compare two SpeakerSegmentView objects and return the differences. + + View 1 (new) is compared to view 2 (old). + + Args: + session: ClientSessionInfo object. + view1: The first SpeakerSegmentView object to compare. + view2: The second SpeakerSegmentView object to compare to or None. + + Returns: + AnnotationResult: The annotation result. 
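The flags produced above are easiest to consume through the `AnnotationResult` helpers defined earlier in `_models.py`. A sketch (`_annotate_segment` is a private helper, shown here purely to illustrate the flags):

```python
flags = FragmentUtils._annotate_segment(segment)  # segment: a populated SpeakerSegment
if flags.has(AnnotationFlags.ENDS_WITH_FINAL, AnnotationFlags.ENDS_WITH_EOS):
    ...  # the utterance looks complete
if flags.any(AnnotationFlags.SLOW_SPEAKER, AnnotationFlags.VERY_SLOW_SPEAKER):
    ...  # give the speaker more time before ending the turn
```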
+ """ + # Result + result = AnnotationResult() + + # Flag to include partials + include_partials = session.config.include_partials + + # If we have a previous view, compare it + if view2 and view2.segment_count > 0: + # Compare full string + view1_full_str: str = view1.format_view_text(include_partials=include_partials) + view2_full_str: str = view2.format_view_text(include_partials=include_partials) + if view1_full_str != view2_full_str: + result.add(AnnotationFlags.UPDATED_FULL) + if view1_full_str.lower() != view2_full_str.lower(): + result.add(AnnotationFlags.UPDATED_FULL_LCASE) + + # Stripped string (without punctuation) + view1_stripped_str: str = view1.format_view_text(include_partials=include_partials, words_only=True) + view2_stripped_str: str = view2.format_view_text(include_partials=include_partials, words_only=True) + if view1_stripped_str != view2_stripped_str: + result.add(AnnotationFlags.UPDATED_STRIPPED) + if view1_stripped_str.lower() != view2_stripped_str.lower(): + result.add(AnnotationFlags.UPDATED_STRIPPED_LCASE) + + # Word timings + view1_timings_str: str = view1.format_view_text( + format="|{start_time}-{end_time}|", words_only=True, include_partials=include_partials + ) + view2_timings_str: str = view2.format_view_text( + format="|{start_time}-{end_time}|", words_only=True, include_partials=include_partials + ) + if view1_timings_str != view2_timings_str: + result.add(AnnotationFlags.UPDATED_WORD_TIMINGS) + + # Annotations + view1_annotation_str: str = view1.format_view_text(format="|{annotation}|") + view2_annotation_str: str = view2.format_view_text(format="|{annotation}|") + if set(view1_annotation_str) != set(view2_annotation_str): + result.add(AnnotationFlags.UPDATED_ANNOTATIONS) + + # Partials, finals and speakers + if view1.final_count != view2.final_count: + result.add(AnnotationFlags.UPDATED_FINALS) + if view1.partial_count != view2.partial_count: + result.add(AnnotationFlags.UPDATED_PARTIALS) + + # Assume this is new + elif view1.segment_count > 0: + result.add(AnnotationFlags.NEW) + + # Finalized (last segment only has finals) + if view1.segment_count > 0 and view1.partial_count == 0: + result.add(AnnotationFlags.FINALIZED) + + # Return the result + return result + + @staticmethod + def find_segment_pauses(session: ClientSessionInfo, view: SpeakerSegmentView) -> None: + """Find gaps in the segments. + + LLMs may find knowledge of when someone pauses between words of use when constructing + their reply. This utility adds in pause markers to the segments which can then be used + during the text formatting of the segment. + + Args: + session: ClientSessionInfo object. + view: The SpeakerSegmentView object to process. 
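A sketch of `compare_views` in use, assuming two views built from successive transcript updates; the returned flags tell the caller what kind of change occurred:

```python
changes = FragmentUtils.compare_views(session, new_view, previous_view)
if changes.has(AnnotationFlags.UPDATED_STRIPPED):
    ...  # the spoken words changed, not just punctuation or casing
if changes.has(AnnotationFlags.FINALIZED):
    ...  # only finalized fragments remain in the new view
```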
+ """ + + # Find gaps in the view + for segment in view.segments: + # Strip out existing pauses + words = [f for f in segment.fragments if f.type_ != "pause"] + + # Find gaps between the end of one word and the start of the next + for i in range(len(words) - 1): + word = words[i] + next_word = words[i + 1] + gap_start = word.end_time + gap_end = next_word.start_time + if gap_end - gap_start > 0.1: + segment.fragments.append( + SpeechFragment( + idx=word.idx + 1, + type_="pause", + start_time=gap_start, + end_time=gap_end, + is_final=word.is_final, + content=session.config.speech_segment_config.pause_mark or "...", + ) + ) + + # Resort the fragments + segment.fragments.sort(key=lambda f: f.idx) + + # Re-process the text + FragmentUtils.update_segment_text(session, segment) + + +class TextUtils: + """Set of string / text utilities.""" + + @staticmethod + def cer(ref: str, hyp: str) -> float: + """ + Compute Character Error Rate (CER) between reference and hypothesis. + + CER = (S + D + I) / N + where + S = substitutions + D = deletions + I = insertions + N = number of characters in reference + + Args: + ref (str): Reference text. + hyp (str): Hypothesis text. + + Returns: + float: Character Error Rate (CER). + """ + + # Initialise DP matrix + n, m = len(ref), len(hyp) + dp = [[0] * (m + 1) for _ in range(n + 1)] + + # Base cases + for i in range(n + 1): + dp[i][0] = i + for j in range(m + 1): + dp[0][j] = j + + # Fill DP matrix + for i in range(1, n + 1): + for j in range(1, m + 1): + cost = 0 if ref[i - 1] == hyp[j - 1] else 1 + dp[i][j] = min( + dp[i - 1][j] + 1, # deletion + dp[i][j - 1] + 1, # insertion + dp[i - 1][j - 1] + cost, # substitution + ) + + # Return CER + distance = dp[n][m] + return distance / n if n > 0 else float("inf") + + @staticmethod + def normalize(text: str) -> str: + """Normalise text. + + When comparing text, it is often useful to normalise it first. This will strip out + all non-letter characters and collapse whitespace. + + Args: + text (str): Text to normalise. + + Returns: + str: Normalised text. + """ + + # Lowercase + text = text.lower() + + # Remove punctuation (Unicode category "P") + text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "P") + + # Collapse whitespace + text = re.sub(r"\s+", " ", text).strip() + + # Return cleaned text + return text diff --git a/tests/voice/.gitignore b/tests/voice/.gitignore new file mode 100644 index 0000000..d36977d --- /dev/null +++ b/tests/voice/.gitignore @@ -0,0 +1 @@ +.tmp diff --git a/tests/voice/README.md b/tests/voice/README.md new file mode 100644 index 0000000..bd5c452 --- /dev/null +++ b/tests/voice/README.md @@ -0,0 +1,40 @@ +# Voice SDK Tests + +You will need a `SPEECHMATICS_API_KEY` to run most of the tests, as they will use live transcription. + +You need to have git LFS support installed to run audio file tests. 
+
+```bash
+# Windows (select Git LFS when installing Git)
+
+# Linux
+sudo apt install git-lfs
+
+# macOS
+brew install git-lfs
+
+# Download / update LFS files
+git lfs pull
+```
+
+To run tests:
+
+```bash
+# Install dependencies
+make install-dev
+
+# Run tests without an API key (those needing live transcription will be skipped)
+make test-voice
+
+# Run all tests
+SPEECHMATICS_API_KEY=your_api_key make test-voice
+
+# Run a specific test
+SPEECHMATICS_API_KEY=your_api_key pytest -v -s tests/voice/test_03_conversation.py
+
+# Run a specific sub-test
+SPEECHMATICS_API_KEY=your_api_key pytest -v -s tests/voice/test_03_conversation.py::test_log_messages
+
+# Run a specific test with logging
+SPEECHMATICS_API_KEY=your_api_key SPEECHMATICS_SHOW_LOG=1 pytest -v -s tests/voice/test_03_conversation.py
+```
diff --git a/tests/voice/_utils.py b/tests/voice/_utils.py
new file mode 100644
index 0000000..c663e99
--- /dev/null
+++ b/tests/voice/_utils.py
@@ -0,0 +1,211 @@
+import asyncio
+import json
+import os
+import time
+import wave
+from typing import Any
+from typing import Callable
+from typing import Optional
+
+import aiofiles
+
+from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import VoiceAgentClient
+from speechmatics.voice import VoiceAgentConfig
+
+
+async def get_client(
+    api_key: Optional[str] = None,
+    url: Optional[str] = None,
+    app: Optional[str] = None,
+    config: Optional[VoiceAgentConfig] = None,
+    connect: bool = True,
+) -> VoiceAgentClient:
+    """Get a client."""
+
+    # Create client
+    client = VoiceAgentClient(api_key=api_key, url=url, app=app, config=config)
+
+    # Connect
+    if connect:
+        """Connect to the client and wait for the RECOGNITION_STARTED event."""
+
+        # Create an event to track when the callback is called
+        event_received = asyncio.Event()
+        received_message = None
+
+        # Callback function for connection
+        def on_recognition_started(message):
+            nonlocal received_message
+            received_message = message
+            event_received.set()
+
+        # Add listener for when recognition starts
+        client.once(AgentServerMessageType.RECOGNITION_STARTED, on_recognition_started)
+
+        # Connect
+        await client.connect()
+
+        # Wait for the callback with a 5-second timeout
+        try:
+            await asyncio.wait_for(event_received.wait(), timeout=5.0)
+            assert received_message is not None
+        except asyncio.TimeoutError:
+            raise TimeoutError("RECOGNITION_STARTED event was not received within 5 seconds")
+
+    # Return client
+    return client
+
+
+async def send_audio_file(
+    client: VoiceAgentClient,
+    audio_file: str,
+    terminate_event: Optional[asyncio.Event] = None,
+    chunk_size: int = 320,
+    sample_rate: int = 16000,
+    sample_size: int = 2,
+    progress_callback: Optional[Callable[[int], None]] = None,
+) -> None:
+    """Send audio data to the API server."""
+
+    # Make sure client is connected
+    assert client._is_connected
+
+    # Make sure file ends with .wav
+    assert audio_file.lower().endswith(".wav")
+
+    # Check file exists
+    file = os.path.join(os.path.dirname(__file__), audio_file)
+    assert os.path.exists(file)
+
+    # Make sure progress callback is callable
+    if progress_callback:
+        assert callable(progress_callback)
+
+    # Delay is based off 16kHz int16 and chunk size
+    delay = chunk_size / sample_rate / sample_size
+
+    # Load the file
+    async with aiofiles.open(file, "rb") as wav_file:
+        # Trim off the WAV file header
+        await wav_file.seek(44)
+
+        # Send audio data
+        next_time = time.perf_counter() + delay
+        while not terminate_event.is_set() if terminate_event else True:
+            """Reads 
all chunks until the end of the file with precision delay.""" + + # Read chunk + chunk = await wav_file.read(chunk_size) + + # End of file + if not chunk: + break + + # Send audio to client + await client.send_audio(chunk) + + # Do any callbacks + if progress_callback: + progress_callback(len(chunk)) + + # Precision delay + sleep_time = next_time - time.perf_counter() + if sleep_time > 0: + await asyncio.sleep(sleep_time) + next_time += delay + + +async def load_audio_file(audio_file: str) -> bytes: + """Load an audio file.""" + + # Make sure file ends with .wav + assert audio_file.lower().endswith(".wav") + + # Check file exists + file = os.path.join(os.path.dirname(__file__), audio_file) + assert os.path.exists(file) + + # Load the file + with wave.open(file, "rb") as wav_file: + return wav_file.readframes(wav_file.getnframes()) + + +async def send_silence( + client: VoiceAgentClient, + duration: float, + terminate_event: Optional[asyncio.Event] = None, + chunk_size: int = 320, + sample_rate: int = 16000, + sample_size: int = 2, + progress_callback: Optional[Callable[[int], None]] = None, +): + """Send silence to the client (creates a chunk of silence and sends it to the client)""" + + # Make sure client is connected + assert client._is_connected + + # Make sure duration is positive + assert duration > 0 + + # Make sure chunk size is positive + assert chunk_size > 0 + + # Make sure progress callback is callable + if progress_callback: + assert callable(progress_callback) + + # Send silence + silence = b"\x00" * chunk_size + + # Timing + delay = chunk_size / sample_rate / sample_size + next_time = time.perf_counter() + delay + + # Iterations required + iterations = int(duration / delay) + + # Keep sending + while (not terminate_event.is_set() if terminate_event else True) and iterations > 0: + # Send audio to client + await client.send_audio(silence) + + # Do any callbacks + if progress_callback: + progress_callback(len(silence)) + + # Precision delay + sleep_time = next_time - time.perf_counter() + if sleep_time > 0: + await asyncio.sleep(sleep_time) + next_time += delay + + # Reduce iterations + iterations -= 1 + + +class ConversationLog: + """Load a JSONL past conversation.""" + + def __init__(self, file: str): + """Load a JSONL past conversation. + + Args: + file (str): Path to the JSONL file. 
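+
+        Example (illustrative; path shown relative to the repository root):
+            >>> log = ConversationLog("tests/voice/assets/chat1.jsonl")
+            >>> segments = log.get_conversation(filter=["AddSegment", "AddPartialSegment"])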
+ """ + self.file = file + self.conversation = self._load_conversation() + + def _load_conversation(self): + """Load a JSONL past conversation.""" + with open(self.file) as f: + return [json.loads(line) for line in f] + + def get_conversation(self, filter: Optional[list[str]] = None) -> list[dict[str, Any]]: + """Get the conversation.""" + try: + if filter: + return [line for line in self.conversation if line["payload"]["message"] in filter] + return list(self.conversation) + except KeyError: + return [] diff --git a/tests/voice/assets/audio_01_16kHz.wav b/tests/voice/assets/audio_01_16kHz.wav new file mode 100644 index 0000000..4bc7b28 --- /dev/null +++ b/tests/voice/assets/audio_01_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93dbce77fcc7457c87e0559dd7501a2f24861239466f90664decdb6d465818e2 +size 902240 diff --git a/tests/voice/assets/audio_02_8kHz.wav b/tests/voice/assets/audio_02_8kHz.wav new file mode 100644 index 0000000..45ae225 --- /dev/null +++ b/tests/voice/assets/audio_02_8kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e93958ef0a3bf812e176a4003d942cdeceb1340583a79d120fa54c430293bb9 +size 579016 diff --git a/tests/voice/assets/audio_03_16kHz.wav b/tests/voice/assets/audio_03_16kHz.wav new file mode 100644 index 0000000..0c9b179 --- /dev/null +++ b/tests/voice/assets/audio_03_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eadc1c0609e13027d0bb7bfe6bf7868c123cbd17421898e6023b1889f103cf17 +size 58460 diff --git a/tests/voice/assets/audio_04_16kHz.wav b/tests/voice/assets/audio_04_16kHz.wav new file mode 100644 index 0000000..ebb9fda --- /dev/null +++ b/tests/voice/assets/audio_04_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4d8e833dec170233f9889874d529c06074be6bd0291dd923ec463315230fab5 +size 324902 diff --git a/tests/voice/assets/audio_05_16kHz.wav b/tests/voice/assets/audio_05_16kHz.wav new file mode 100644 index 0000000..d52c888 --- /dev/null +++ b/tests/voice/assets/audio_05_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5218b4caa8d3f853e79a9f6dbf341d0ab9a0771b9a45f996d4a3b07fd7607bdb +size 887270 diff --git a/tests/voice/assets/audio_06_16kHz.wav b/tests/voice/assets/audio_06_16kHz.wav new file mode 100644 index 0000000..658847c --- /dev/null +++ b/tests/voice/assets/audio_06_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cb9da6ed4aa3e6c2fb16f94840466d4ce33c180eb715724d11c85cb266f30b +size 1161364 diff --git a/tests/voice/assets/chat1.jsonl b/tests/voice/assets/chat1.jsonl new file mode 100644 index 0000000..e017e5a --- /dev/null +++ b/tests/voice/assets/chat1.jsonl @@ -0,0 +1,271 @@ +{"ts": 2.9e-05, "audio_ts": 0.0, "payload": {"message": "AudioFile", "path": "./assets/audio_01_16kHz.wav"}} +{"ts": 8.8e-05, "audio_ts": 0.0, "payload": {"message": "VoiceAgentConfig", "operating_point": "enhanced", "domain": null, "language": "en", "output_locale": null, "max_delay": 0.7, "end_of_utterance_silence_trigger": 0.2, "end_of_utterance_max_delay": 10.0, "end_of_utterance_mode": "fixed", "additional_vocab": [], "punctuation_overrides": null, "enable_diarization": true, "speaker_sensitivity": 0.5, "max_speakers": null, "prefer_current_speaker": false, "speaker_config": {"focus_speakers": [], "ignore_speakers": [], "focus_mode": "retain"}, "known_speakers": [], "include_results": false, "enable_preview_features": false, "sample_rate": 16000, "audio_encoding": "pcm_s16le"}} 
+{"ts": 0.000128, "audio_ts": 0.0, "payload": {"message": "TranscriptionConfig", "language": "en", "operating_point": "enhanced", "output_locale": null, "diarization": "speaker", "additional_vocab": null, "punctuation_overrides": null, "domain": null, "enable_entities": null, "audio_filtering_config": null, "transcript_filtering_config": null, "max_delay": 0.7, "max_delay_mode": null, "enable_partials": true, "speaker_diarization_config": {"speaker_sensitivity": 0.5, "prefer_current_speaker": false}, "streaming_mode": null, "conversation_config": {"end_of_utterance_silence_trigger": 0.2}, "ctrl": null, "channel_diarization_labels": null}} +{"ts": 0.000148, "audio_ts": 0.0, "payload": {"message": "AudioFormat", "encoding": "pcm_s16le", "sample_rate": 16000, "chunk_size": 320}} +{"ts": 0.091146, "audio_ts": 0.0, "payload": {"message": "Info", "type": "concurrent_session_usage", "reason": "1 concurrent sessions active out of quota 150", "usage": 1, "quota": 150, "last_updated": "2025-09-22T16:51:01Z"}} +{"ts": 0.132706, "audio_ts": 0.0, "payload": {"message": "RecognitionStarted", "orchestrator_version": "2025.08.29127+289170c022.HEAD", "id": "9d8a5d05-7a62-4266-8cfc-9ffd9cb3cdf1", "language_pack_info": {"adapted": false, "itn": true, "language_description": "English", "word_delimiter": " ", "writing_direction": "left-to-right"}}} +{"ts": 0.67894, "audio_ts": 0.55, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.28, "start_time": 0.0, "transcript": ""}}} +{"ts": 0.67918, "audio_ts": 0.55, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.28, "start_time": 0.28, "transcript": ""}}} +{"ts": 1.10585, "audio_ts": 0.98, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.64, "start_time": 0.28, "transcript": ""}}} +{"ts": 1.435181, "audio_ts": 1.31, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Welcome", "language": "en", "speaker": "S1"}], "end_time": 0.92, "start_time": 0.36, "type": "word"}], "metadata": {"end_time": 0.92, "start_time": 0.2, "transcript": "Welcome"}}} +{"ts": 1.436055, "audio_ts": 1.31, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 1.436582, "audio_ts": 1.31, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome", "annotation": ["has_partial"]}], "metadata": {"start_time": 0.36, "end_time": 0.92}}} +{"ts": 1.792961, "audio_ts": 1.66, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Welcome", "language": "en", "speaker": "S1"}], "end_time": 0.92, "start_time": 0.36, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 1.0, "start_time": 0.92, "type": "word"}], "metadata": {"end_time": 1.0, "start_time": 0.2, "transcript": "Welcome to "}}} +{"ts": 1.793181, "audio_ts": 1.66, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 0.36, "end_time": 1.0}}} +{"ts": 1.793294, "audio_ts": 1.66, "payload": {"message": 
"AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "speech", "language": "en", "speaker": "S1"}], "end_time": 1.28, "start_time": 1.04, "type": "word"}], "metadata": {"end_time": 1.28, "start_time": 1.0, "transcript": "speech"}}} +{"ts": 1.793391, "audio_ts": 1.66, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to speech", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 0.36, "end_time": 1.28}}} +{"ts": 2.196876, "audio_ts": 2.07, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "speech", "language": "en", "speaker": "S1"}], "end_time": 1.32, "start_time": 1.04, "type": "word"}], "metadata": {"end_time": 1.32, "start_time": 1.0, "transcript": "speech "}}} +{"ts": 2.197238, "audio_ts": 2.07, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to speech", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 0.36, "end_time": 1.32}}} +{"ts": 2.197393, "audio_ts": 2.07, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 1.72, "start_time": 1.4, "transcript": ""}}} +{"ts": 2.19746, "audio_ts": 2.07, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 2.197555, "audio_ts": 2.07, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to speech", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 0.36, "end_time": 1.32}}} +{"ts": 2.495748, "audio_ts": 2.37, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.08, "start_time": 1.4, "transcript": ""}}} +{"ts": 2.496584, "audio_ts": 2.37, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to speech", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 0.36, "end_time": 1.32}}} +{"ts": 2.828358, "audio_ts": 2.7, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 1.32, "is_eos": true, "start_time": 1.32, "type": "punctuation"}], "metadata": {"end_time": 2.36, "start_time": 1.32, "transcript": ". 
"}}} +{"ts": 2.828587, "audio_ts": 2.7, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.44, "start_time": 2.44, "transcript": ""}}} +{"ts": 2.828685, "audio_ts": 2.7, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 2.44, "start_time": 2.44}}} +{"ts": 2.829411, "audio_ts": 2.7, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:01.873+00:00", "language": "en", "text": "Welcome to speech.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 0.36, "end_time": 1.32}}} +{"ts": 2.829598, "audio_ts": 2.7, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 1.32, "end_time": 1.32}}} +{"ts": 3.168606, "audio_ts": 3.04, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.8, "start_time": 2.44, "transcript": ""}}} +{"ts": 3.168855, "audio_ts": 3.04, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.8, "start_time": 2.8, "transcript": ""}}} +{"ts": 3.541596, "audio_ts": 3.41, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 3.16, "start_time": 2.8, "transcript": ""}}} +{"ts": 3.542262, "audio_ts": 3.41, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 3.16, "start_time": 3.16, "transcript": ""}}} +{"ts": 3.894022, "audio_ts": 3.77, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 3.52, "start_time": 3.16, "transcript": ""}}} +{"ts": 3.894126, "audio_ts": 3.77, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 3.52, "start_time": 3.52, "transcript": ""}}} +{"ts": 4.29732, "audio_ts": 4.17, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "We're", "language": "en", "speaker": "S1"}], "end_time": 3.72, "start_time": 3.48, "type": "word"}], "metadata": {"end_time": 3.8, "start_time": 3.44, "transcript": "We're"}}} +{"ts": 4.297519, "audio_ts": 4.17, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 4.297701, "audio_ts": 4.17, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're", "annotation": ["has_partial"]}], "metadata": {"start_time": 3.48, "end_time": 3.72}}} +{"ts": 4.683379, "audio_ts": 4.55, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "We're", "language": "en", "speaker": "S1"}], "end_time": 3.72, "start_time": 3.48, "type": "word"}], "metadata": {"end_time": 3.72, "start_time": 3.44, "transcript": "We're "}}} +{"ts": 4.683496, "audio_ts": 4.55, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "delighted", "language": "en", "speaker": "S1"}], "end_time": 4.08, "start_time": 3.72, "type": "word"}, {"alternatives": [{"confidence": 0.99, "content": "that", "language": "en", "speaker": "S1"}], "end_time": 4.16, "start_time": 4.08, "type": "word"}], "metadata": {"end_time": 4.16, "start_time": 3.72, "transcript": "delighted that"}}} +{"ts": 4.683796, 
"audio_ts": 4.55, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 3.72}}} +{"ts": 4.684178, "audio_ts": 4.55, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.16}}} +{"ts": 5.017098, "audio_ts": 4.89, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "delighted", "language": "en", "speaker": "S1"}], "end_time": 4.08, "start_time": 3.72, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "that", "language": "en", "speaker": "S1"}], "end_time": 4.24, "start_time": 4.08, "type": "word"}], "metadata": {"end_time": 4.24, "start_time": 3.72, "transcript": "delighted that "}}} +{"ts": 5.017376, "audio_ts": 4.89, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.7, "content": "you've", "language": "en", "speaker": "S1"}], "end_time": 4.44, "start_time": 4.24, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "done", "language": "en", "speaker": "S1"}], "end_time": 4.52, "start_time": 4.44, "type": "word"}], "metadata": {"end_time": 4.52, "start_time": 4.24, "transcript": "you've done"}}} +{"ts": 5.018423, "audio_ts": 4.89, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.24}}} +{"ts": 5.018884, "audio_ts": 4.89, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've done", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.52}}} +{"ts": 5.331675, "audio_ts": 5.2, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "you've", "language": "en", "speaker": "S1"}], "end_time": 4.44, "start_time": 4.24, "type": "word"}], "metadata": {"end_time": 4.44, "start_time": 4.24, "transcript": "you've "}}} +{"ts": 5.332795, "audio_ts": 5.2, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "decided", "language": "en", "speaker": "S1"}], "end_time": 4.84, "start_time": 4.44, "type": "word"}], "metadata": {"end_time": 4.88, "start_time": 4.44, "transcript": "decided"}}} +{"ts": 5.335569, "audio_ts": 5.2, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.44}}} +{"ts": 5.335927, "audio_ts": 5.2, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": 
"2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.84}}} +{"ts": 5.727246, "audio_ts": 5.6, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "decided", "language": "en", "speaker": "S1"}], "end_time": 4.84, "start_time": 4.44, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 4.96, "start_time": 4.84, "type": "word"}], "metadata": {"end_time": 4.96, "start_time": 4.44, "transcript": "decided to "}}} +{"ts": 5.727506, "audio_ts": 5.6, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "try", "language": "en", "speaker": "S1"}], "end_time": 5.24, "start_time": 4.96, "type": "word"}], "metadata": {"end_time": 5.24, "start_time": 4.96, "transcript": "try"}}} +{"ts": 5.728125, "audio_ts": 5.6, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 4.96}}} +{"ts": 5.728571, "audio_ts": 5.6, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 5.24}}} +{"ts": 6.124554, "audio_ts": 6.0, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "try", "language": "en", "speaker": "S1"}], "end_time": 5.36, "start_time": 4.96, "type": "word"}, {"alternatives": [{"confidence": 0.93, "content": "our", "language": "en", "speaker": "S1"}], "end_time": 5.6, "start_time": 5.36, "type": "word"}], "metadata": {"end_time": 5.6, "start_time": 4.96, "transcript": "try our"}}} +{"ts": 6.125347, "audio_ts": 6.0, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 5.6}}} +{"ts": 6.438556, "audio_ts": 6.31, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "try", "language": "en", "speaker": "S1"}], "end_time": 5.36, "start_time": 4.96, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "our", "language": "en", "speaker": "S1"}], "end_time": 5.56, "start_time": 5.36, "type": "word"}], "metadata": {"end_time": 5.56, "start_time": 4.96, "transcript": "try our "}}} +{"ts": 6.438908, "audio_ts": 6.31, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "speech", "language": "en", "speaker": "S1"}], "end_time": 5.88, "start_time": 5.56, "type": "word"}], "metadata": {"end_time": 5.96, "start_time": 5.56, "transcript": "speech"}}} +{"ts": 6.439773, "audio_ts": 6.31, "payload": {"message": "AddPartialSegment", "segments": 
[{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 5.56}}} +{"ts": 6.440425, "audio_ts": 6.31, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 5.88}}} +{"ts": 6.765347, "audio_ts": 6.64, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "speech", "language": "en", "speaker": "S1"}], "end_time": 5.88, "start_time": 5.56, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 6.0, "start_time": 5.88, "type": "word"}], "metadata": {"end_time": 6.0, "start_time": 5.56, "transcript": "speech to "}}} +{"ts": 6.765656, "audio_ts": 6.64, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "text", "language": "en", "speaker": "S1"}], "end_time": 6.32, "start_time": 6.04, "type": "word"}], "metadata": {"end_time": 6.32, "start_time": 6.0, "transcript": "text"}}} +{"ts": 6.766616, "audio_ts": 6.64, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech to", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 6.0}}} +{"ts": 6.767199, "audio_ts": 6.64, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech to text", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 6.32}}} +{"ts": 7.20699, "audio_ts": 7.08, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "text", "language": "en", "speaker": "S1"}], "end_time": 6.36, "start_time": 6.04, "type": "word"}], "metadata": {"end_time": 6.36, "start_time": 6.0, "transcript": "text "}}} +{"ts": 7.207173, "audio_ts": 7.08, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 6.76, "start_time": 6.44, "transcript": ""}}} +{"ts": 7.207509, "audio_ts": 7.08, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech to text", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 6.36}}} +{"ts": 7.207602, "audio_ts": 7.08, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 7.207709, "audio_ts": 7.08, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try 
our speech to text", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 6.36}}} +{"ts": 7.455293, "audio_ts": 7.33, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "software", "language": "en", "speaker": "S1"}], "end_time": 7.0, "start_time": 6.36, "type": "word"}], "metadata": {"end_time": 7.04, "start_time": 6.36, "transcript": "software"}}} +{"ts": 7.455772, "audio_ts": 7.33, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 7.456316, "audio_ts": 7.33, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech to text software", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 3.48, "end_time": 7.0}}} +{"ts": 7.900214, "audio_ts": 7.77, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "software", "language": "en", "speaker": "S1"}], "end_time": 7.0, "start_time": 6.36, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 7.0, "is_eos": true, "start_time": 7.0, "type": "punctuation"}], "metadata": {"end_time": 7.0, "start_time": 6.36, "transcript": "software. "}}} +{"ts": 7.900544, "audio_ts": 7.77, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.99, "content": "Hello", "language": "en", "speaker": "S1"}], "end_time": 7.4, "start_time": 7.08, "type": "word"}], "metadata": {"end_time": 7.4, "start_time": 7.0, "transcript": "Hello"}}} +{"ts": 7.901426, "audio_ts": 7.77, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:04.993+00:00", "language": "en", "text": "We're delighted that you've decided to try our speech to text software.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 3.48, "end_time": 7.0}}} +{"ts": 7.90212, "audio_ts": 7.77, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello", "annotation": ["has_partial"]}], "metadata": {"start_time": 7.08, "end_time": 7.4}}} +{"ts": 8.281933, "audio_ts": 8.15, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Hello", "language": "en", "speaker": "S2"}], "end_time": 7.6, "start_time": 7.08, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 7.6, "is_eos": true, "start_time": 7.6, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "This", "language": "en", "speaker": "S2"}], "end_time": 7.76, "start_time": 7.64, "type": "word"}], "metadata": {"end_time": 7.76, "start_time": 7.0, "transcript": "Hello. 
This"}}} +{"ts": 8.282326, "audio_ts": 8.15, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 8.282391, "audio_ts": 8.15, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S2"}}} +{"ts": 8.282652, "audio_ts": 8.15, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This", "annotation": ["has_partial"]}], "metadata": {"start_time": 7.08, "end_time": 7.76}}} +{"ts": 8.621385, "audio_ts": 8.49, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Hello", "language": "en", "speaker": "S2"}], "end_time": 7.6, "start_time": 7.08, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 7.6, "is_eos": true, "start_time": 7.6, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "This", "language": "en", "speaker": "S2"}], "end_time": 7.84, "start_time": 7.64, "type": "word"}], "metadata": {"end_time": 7.84, "start_time": 7.0, "transcript": "Hello. This "}}} +{"ts": 8.6219, "audio_ts": 8.49, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "is", "language": "en", "speaker": "S2"}], "end_time": 8.0, "start_time": 7.84, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "a", "language": "en", "speaker": "S2"}], "end_time": 8.12, "start_time": 8.0, "type": "word"}], "metadata": {"end_time": 8.12, "start_time": 7.84, "transcript": "is a"}}} +{"ts": 8.622674, "audio_ts": 8.49, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 7.84}}} +{"ts": 8.623622, "audio_ts": 8.49, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 8.12}}} +{"ts": 8.987266, "audio_ts": 8.86, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "is", "language": "en", "speaker": "S2"}], "end_time": 8.0, "start_time": 7.84, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "a", "language": "en", "speaker": "S2"}], "end_time": 8.16, "start_time": 8.0, "type": "word"}], "metadata": {"end_time": 8.16, "start_time": 7.84, "transcript": "is a "}}} +{"ts": 8.988643, "audio_ts": 8.86, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. 
This is a", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 8.16}}} +{"ts": 8.989081, "audio_ts": 8.86, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "test", "language": "en", "speaker": "S2"}], "end_time": 8.48, "start_time": 8.16, "type": "word"}], "metadata": {"end_time": 8.48, "start_time": 8.16, "transcript": "test"}}} +{"ts": 8.989517, "audio_ts": 8.86, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 8.48}}} +{"ts": 9.350349, "audio_ts": 9.22, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "test", "language": "en", "speaker": "S2"}], "end_time": 8.44, "start_time": 8.16, "type": "word"}], "metadata": {"end_time": 8.44, "start_time": 8.16, "transcript": "test "}}} +{"ts": 9.350627, "audio_ts": 9.22, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "one", "language": "en", "speaker": "S2"}], "end_time": 8.72, "start_time": 8.44, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ",", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 8.72, "is_eos": false, "start_time": 8.72, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "two", "language": "en", "speaker": "S2"}], "end_time": 8.84, "start_time": 8.76, "type": "word"}], "metadata": {"end_time": 8.84, "start_time": 8.44, "transcript": "one, two"}}} +{"ts": 9.351462, "audio_ts": 9.22, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 8.44}}} +{"ts": 9.351991, "audio_ts": 9.22, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. 
This is a test one, two", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 8.84}}} +{"ts": 9.723448, "audio_ts": 9.59, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "one", "language": "en", "speaker": "S2"}], "end_time": 8.72, "start_time": 8.44, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ",", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 8.72, "is_eos": false, "start_time": 8.72, "type": "punctuation"}], "metadata": {"end_time": 8.72, "start_time": 8.44, "transcript": "one, "}}} +{"ts": 9.723706, "audio_ts": 9.59, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "two", "language": "en", "speaker": "S2"}], "end_time": 9.0, "start_time": 8.76, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ",", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 9.0, "is_eos": false, "start_time": 9.0, "type": "punctuation"}, {"alternatives": [{"confidence": 0.8, "content": "three", "language": "en", "speaker": "S2"}], "end_time": 9.2, "start_time": 9.04, "type": "word"}], "metadata": {"end_time": 9.2, "start_time": 8.72, "transcript": "two, three"}}} +{"ts": 9.724594, "audio_ts": 9.59, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one,", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_punctuation"]}], "metadata": {"start_time": 7.08, "end_time": 8.72}}} +{"ts": 9.72497, "audio_ts": 9.59, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one, two, three", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 9.2}}} +{"ts": 10.007078, "audio_ts": 9.88, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "two", "language": "en", "speaker": "S2"}], "end_time": 9.0, "start_time": 8.76, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ",", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 9.0, "is_eos": false, "start_time": 9.0, "type": "punctuation"}], "metadata": {"end_time": 9.0, "start_time": 8.72, "transcript": "two, "}}} +{"ts": 10.007402, "audio_ts": 9.88, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "three", "language": "en", "speaker": "S2"}], "end_time": 9.52, "start_time": 9.04, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 9.52, "is_eos": true, "start_time": 9.52, "type": "punctuation"}], "metadata": {"end_time": 9.56, "start_time": 9.0, "transcript": "three."}}} +{"ts": 10.008074, "audio_ts": 9.88, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. 
This is a test one, two,", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_punctuation"]}], "metadata": {"start_time": 7.08, "end_time": 9.0}}} +{"ts": 10.008464, "audio_ts": 9.88, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one, two, three.", "annotation": ["has_partial", "has_final", "starts_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 7.08, "end_time": 9.52}}} +{"ts": 10.395931, "audio_ts": 10.27, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "three", "language": "en", "speaker": "S2"}], "end_time": 9.52, "start_time": 9.04, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "one", "language": "en", "speaker": "S2"}], "end_time": 9.88, "start_time": 9.6, "type": "word"}], "metadata": {"end_time": 9.92, "start_time": 9.0, "transcript": "three one"}}} +{"ts": 10.39653, "audio_ts": 10.27, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one, two, three one", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 9.88}}} +{"ts": 10.74807, "audio_ts": 10.62, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "312", "language": "en", "speaker": "S2"}], "end_time": 10.16, "start_time": 9.04, "type": "word"}], "metadata": {"end_time": 10.28, "start_time": 9.0, "transcript": "312"}}} +{"ts": 10.748675, "audio_ts": 10.62, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one, two, 312", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 10.16}}} +{"ts": 11.092076, "audio_ts": 10.96, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "3123", "language": "en", "speaker": "S2"}], "end_time": 10.64, "start_time": 9.04, "type": "word"}], "metadata": {"end_time": 10.64, "start_time": 9.0, "transcript": "3123"}}} +{"ts": 11.092738, "audio_ts": 10.96, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. 
This is a test one, two, 3123", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 7.08, "end_time": 10.64}}} +{"ts": 11.442641, "audio_ts": 11.31, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "three", "language": "en", "speaker": "S2"}], "end_time": 9.52, "start_time": 9.04, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 9.52, "is_eos": true, "start_time": 9.52, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "One", "language": "en", "speaker": "S2"}], "end_time": 9.88, "start_time": 9.6, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 9.88, "is_eos": true, "start_time": 9.88, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "Two", "language": "en", "speaker": "S2"}], "end_time": 10.16, "start_time": 9.92, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "three", "language": "en", "speaker": "S2"}], "end_time": 10.64, "start_time": 10.2, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 10.64, "is_eos": true, "start_time": 10.64, "type": "punctuation"}], "metadata": {"end_time": 11.0, "start_time": 9.0, "transcript": "three. One. Two three. "}}} +{"ts": 11.444378, "audio_ts": 11.31, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:08.593+00:00", "language": "en", "text": "Hello. This is a test one, two, three. One. 
Two three.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 7.08, "end_time": 10.64}}} +{"ts": 11.444561, "audio_ts": 11.31, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 11.08, "start_time": 11.08, "transcript": ""}}} +{"ts": 11.444646, "audio_ts": 11.31, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 11.08, "start_time": 11.08}}} +{"ts": 11.444827, "audio_ts": 11.31, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S2"}}} +{"ts": 11.444928, "audio_ts": 11.31, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 10.64, "end_time": 10.64}}} +{"ts": 11.853717, "audio_ts": 11.72, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 11.44, "start_time": 11.08, "transcript": ""}}} +{"ts": 11.85388, "audio_ts": 11.72, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 11.44, "start_time": 11.44, "transcript": ""}}} +{"ts": 12.12025, "audio_ts": 11.99, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 11.8, "start_time": 11.44, "transcript": ""}}} +{"ts": 12.120426, "audio_ts": 11.99, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 11.8, "start_time": 11.8, "transcript": ""}}} +{"ts": 12.555445, "audio_ts": 12.43, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "To", "language": "en", "speaker": "S2"}], "end_time": 12.08, "start_time": 11.88, "type": "word"}], "metadata": {"end_time": 12.08, "start_time": 11.72, "transcript": "To"}}} +{"ts": 12.555683, "audio_ts": 12.43, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S2"}}} +{"ts": 12.556105, "audio_ts": 12.43, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To", "annotation": ["has_partial"]}], "metadata": {"start_time": 11.88, "end_time": 12.08}}} +{"ts": 12.920393, "audio_ts": 12.79, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "To", "language": "en", "speaker": "S1"}], "end_time": 12.04, "start_time": 11.88, "type": "word"}], "metadata": {"end_time": 12.04, "start_time": 11.72, "transcript": "To "}}} +{"ts": 12.922719, "audio_ts": 12.79, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 11.88, "end_time": 12.04}}} +{"ts": 12.923296, "audio_ts": 12.79, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "get", "language": "en", "speaker": "S1"}], "end_time": 12.28, "start_time": 12.04, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "going", "language": "en", "speaker": "S1"}], "end_time": 12.44, "start_time": 12.28, "type": "word"}], "metadata": {"end_time": 12.44, "start_time": 12.04, "transcript": "get going"}}} +{"ts": 12.924095, "audio_ts": 12.79, "payload": {"message": "SpeakerEnded", "status": 
{"is_active": false, "speaker_id": "S2"}}} +{"ts": 12.92418, "audio_ts": 12.79, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 12.924401, "audio_ts": 12.79, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To get going", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 11.88, "end_time": 12.44}}} +{"ts": 13.225297, "audio_ts": 13.1, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "get", "language": "en", "speaker": "S1"}], "end_time": 12.28, "start_time": 12.04, "type": "word"}], "metadata": {"end_time": 12.28, "start_time": 12.04, "transcript": "get "}}} +{"ts": 13.226234, "audio_ts": 13.1, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To get", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 11.88, "end_time": 12.28}}} +{"ts": 13.226538, "audio_ts": 13.1, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "going", "language": "en", "speaker": "S1"}], "end_time": 12.76, "start_time": 12.28, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 12.76, "is_eos": true, "start_time": 12.76, "type": "punctuation"}], "metadata": {"end_time": 12.8, "start_time": 12.28, "transcript": "going."}}} +{"ts": 13.226926, "audio_ts": 13.1, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To get going.", "annotation": ["has_partial", "has_final", "starts_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 11.88, "end_time": 12.76}}} +{"ts": 13.653997, "audio_ts": 13.53, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "going", "language": "en", "speaker": "S1"}], "end_time": 12.76, "start_time": 12.28, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 12.76, "is_eos": true, "start_time": 12.76, "type": "punctuation"}], "metadata": {"end_time": 12.76, "start_time": 12.28, "transcript": "going. 
"}}} +{"ts": 13.654176, "audio_ts": 13.53, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Just", "language": "en", "speaker": "S1"}], "end_time": 13.0, "start_time": 12.8, "type": "word"}], "metadata": {"end_time": 13.16, "start_time": 12.76, "transcript": "Just"}}} +{"ts": 13.654529, "audio_ts": 13.53, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:13.393+00:00", "language": "en", "text": "To get going.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 11.88, "end_time": 12.76}}} +{"ts": 13.654706, "audio_ts": 13.53, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just", "annotation": ["has_partial"]}], "metadata": {"start_time": 12.8, "end_time": 13.0}}} +{"ts": 14.010459, "audio_ts": 13.88, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Just", "language": "en", "speaker": "S1"}], "end_time": 13.0, "start_time": 12.8, "type": "word"}], "metadata": {"end_time": 13.0, "start_time": 12.76, "transcript": "Just "}}} +{"ts": 14.010949, "audio_ts": 13.88, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 13.0}}} +{"ts": 14.011338, "audio_ts": 13.88, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "create", "language": "en", "speaker": "S1"}], "end_time": 13.32, "start_time": 13.0, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 13.32, "is_eos": true, "start_time": 13.32, "type": "punctuation"}, {"alternatives": [{"confidence": 1.0, "content": "An", "language": "en", "speaker": "S1"}], "end_time": 13.52, "start_time": 13.36, "type": "word"}], "metadata": {"end_time": 13.52, "start_time": 13.0, "transcript": "create. An"}}} +{"ts": 14.011865, "audio_ts": 13.88, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create. 
An", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 13.52}}} +{"ts": 14.368019, "audio_ts": 14.24, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "create", "language": "en", "speaker": "S1"}], "end_time": 13.32, "start_time": 13.0, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "an", "language": "en", "speaker": "S1"}], "end_time": 13.48, "start_time": 13.36, "type": "word"}], "metadata": {"end_time": 13.48, "start_time": 13.0, "transcript": "create an "}}} +{"ts": 14.368301, "audio_ts": 14.24, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "API", "language": "en", "speaker": "S1"}], "end_time": 13.88, "start_time": 13.48, "type": "word"}], "metadata": {"end_time": 13.88, "start_time": 13.48, "transcript": "API"}}} +{"ts": 14.369016, "audio_ts": 14.24, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 13.48}}} +{"ts": 14.369555, "audio_ts": 14.24, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 13.88}}} +{"ts": 14.735598, "audio_ts": 14.61, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "API", "language": "en", "speaker": "S1"}], "end_time": 13.88, "start_time": 13.48, "type": "word"}], "metadata": {"end_time": 13.88, "start_time": 13.48, "transcript": "API "}}} +{"ts": 14.73573, "audio_ts": 14.61, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "key", "language": "en", "speaker": "S1"}], "end_time": 14.24, "start_time": 13.88, "type": "word"}], "metadata": {"end_time": 14.24, "start_time": 13.88, "transcript": "key"}}} +{"ts": 14.736022, "audio_ts": 14.61, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 13.88}}} +{"ts": 14.736213, "audio_ts": 14.61, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.24}}} +{"ts": 15.157364, "audio_ts": 15.03, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "key", "language": "en", "speaker": "S1"}], "end_time": 14.24, "start_time": 13.88, "type": "word"}], "metadata": {"end_time": 14.24, "start_time": 13.88, "transcript": "key "}}} +{"ts": 15.157645, "audio_ts": 15.03, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "and", 
"language": "en", "speaker": "S1"}], "end_time": 14.44, "start_time": 14.24, "type": "word"}], "metadata": {"end_time": 14.6, "start_time": 14.24, "transcript": "and"}}} +{"ts": 15.159178, "audio_ts": 15.03, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.24}}} +{"ts": 15.159648, "audio_ts": 15.03, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.44}}} +{"ts": 15.479561, "audio_ts": 15.35, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "and", "language": "en", "speaker": "S1"}], "end_time": 14.44, "start_time": 14.24, "type": "word"}], "metadata": {"end_time": 14.44, "start_time": 14.24, "transcript": "and "}}} +{"ts": 15.479801, "audio_ts": 15.35, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "submit", "language": "en", "speaker": "S1"}], "end_time": 14.76, "start_time": 14.44, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "a", "language": "en", "speaker": "S1"}], "end_time": 14.84, "start_time": 14.76, "type": "word"}], "metadata": {"end_time": 14.96, "start_time": 14.44, "transcript": "submit a"}}} +{"ts": 15.480428, "audio_ts": 15.35, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.44}}} +{"ts": 15.480797, "audio_ts": 15.35, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.84}}} +{"ts": 15.75834, "audio_ts": 15.63, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "submit", "language": "en", "speaker": "S1"}], "end_time": 14.76, "start_time": 14.44, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "a", "language": "en", "speaker": "S1"}], "end_time": 14.84, "start_time": 14.76, "type": "word"}], "metadata": {"end_time": 14.84, "start_time": 14.44, "transcript": "submit a "}}} +{"ts": 15.758403, "audio_ts": 15.63, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 15.4, "start_time": 14.92, "transcript": ""}}} +{"ts": 15.758553, "audio_ts": 15.63, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.84}}} +{"ts": 15.758605, "audio_ts": 15.63, "payload": 
{"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 15.758659, "audio_ts": 15.63, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 14.84}}} +{"ts": 16.154197, "audio_ts": 16.03, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.99, "content": "transcription", "language": "en", "speaker": "S1"}], "end_time": 15.48, "start_time": 14.84, "type": "word"}], "metadata": {"end_time": 15.68, "start_time": 14.84, "transcript": "transcription"}}} +{"ts": 16.154598, "audio_ts": 16.03, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 16.155114, "audio_ts": 16.03, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 15.48}}} +{"ts": 16.502616, "audio_ts": 16.37, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "transcription", "language": "en", "speaker": "S1"}], "end_time": 15.48, "start_time": 14.84, "type": "word"}], "metadata": {"end_time": 15.48, "start_time": 14.84, "transcript": "transcription "}}} +{"ts": 16.502891, "audio_ts": 16.37, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "request", "language": "en", "speaker": "S1"}], "end_time": 15.88, "start_time": 15.48, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 15.96, "start_time": 15.88, "type": "word"}, {"alternatives": [{"confidence": 0.96, "content": "our", "language": "en", "speaker": "S1"}], "end_time": 16.04, "start_time": 15.96, "type": "word"}], "metadata": {"end_time": 16.04, "start_time": 15.48, "transcript": "request to our"}}} +{"ts": 16.503957, "audio_ts": 16.37, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 15.48}}} +{"ts": 16.504535, "audio_ts": 16.37, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to our", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 16.04}}} +{"ts": 16.895782, "audio_ts": 16.77, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "request", "language": "en", "speaker": "S1"}], "end_time": 15.88, "start_time": 15.48, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 16.0, "start_time": 15.88, "type": "word"}], "metadata": {"end_time": 
16.0, "start_time": 15.48, "transcript": "request to "}}} +{"ts": 16.896121, "audio_ts": 16.77, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "our", "language": "en", "speaker": "S1"}], "end_time": 16.2, "start_time": 16.0, "type": "word"}], "metadata": {"end_time": 16.4, "start_time": 16.0, "transcript": "our"}}} +{"ts": 16.896995, "audio_ts": 16.77, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 16.0}}} +{"ts": 16.897653, "audio_ts": 16.77, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to our", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 16.2}}} +{"ts": 17.231649, "audio_ts": 17.1, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "our", "language": "en", "speaker": "S1"}], "end_time": 16.2, "start_time": 16.0, "type": "word"}], "metadata": {"end_time": 16.2, "start_time": 16.0, "transcript": "our "}}} +{"ts": 17.232464, "audio_ts": 17.1, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to our", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 12.8, "end_time": 16.2}}} +{"ts": 17.233052, "audio_ts": 17.1, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.75, "content": "API", "language": "en", "speaker": "S1"}], "end_time": 16.72, "start_time": 16.2, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 16.72, "is_eos": true, "start_time": 16.72, "type": "punctuation"}], "metadata": {"end_time": 16.76, "start_time": 16.2, "transcript": "API."}}} +{"ts": 17.233623, "audio_ts": 17.1, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to our API.", "annotation": ["has_partial", "has_final", "starts_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 12.8, "end_time": 16.72}}} +{"ts": 17.558128, "audio_ts": 17.43, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "API", "language": "en", "speaker": "S1"}], "end_time": 16.72, "start_time": 16.2, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 16.72, "is_eos": true, "start_time": 16.72, "type": "punctuation"}], "metadata": {"end_time": 17.12, "start_time": 16.2, "transcript": "API. 
"}}} +{"ts": 17.558414, "audio_ts": 17.43, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 17.2, "start_time": 17.2, "transcript": ""}}} +{"ts": 17.55851, "audio_ts": 17.43, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 17.2, "start_time": 17.2}}} +{"ts": 17.559607, "audio_ts": 17.43, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:14.313+00:00", "language": "en", "text": "Just create an API key and submit a transcription request to our API.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 12.8, "end_time": 16.72}}} +{"ts": 17.559849, "audio_ts": 17.43, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 17.559943, "audio_ts": 17.43, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 16.72, "end_time": 16.72}}} +{"ts": 17.955868, "audio_ts": 17.83, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 17.56, "start_time": 17.2, "transcript": ""}}} +{"ts": 18.341727, "audio_ts": 18.21, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "We", "language": "en", "speaker": "S1"}], "end_time": 17.56, "start_time": 17.4, "type": "word"}], "metadata": {"end_time": 17.56, "start_time": 17.12, "transcript": "We "}}} +{"ts": 18.342021, "audio_ts": 18.21, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "hope", "language": "en", "speaker": "S1"}], "end_time": 17.76, "start_time": 17.6, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "you", "language": "en", "speaker": "S1"}], "end_time": 17.84, "start_time": 17.76, "type": "word"}], "metadata": {"end_time": 17.84, "start_time": 17.56, "transcript": "hope you"}}} +{"ts": 18.342854, "audio_ts": 18.21, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 17.56}}} +{"ts": 18.343158, "audio_ts": 18.21, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 18.343466, "audio_ts": 18.21, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you", "annotation": ["has_partial", "has_final", "starts_with_final", "fast_speaker"]}], "metadata": {"start_time": 17.4, "end_time": 17.84}}} +{"ts": 18.665109, "audio_ts": 18.54, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "hope", "language": "en", "speaker": "S1"}], "end_time": 17.76, "start_time": 17.6, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "you'll", "language": "en", "speaker": "S1"}], "end_time": 17.92, "start_time": 17.76, "type": "word"}], "metadata": {"end_time": 17.92, "start_time": 17.56, "transcript": "hope you'll "}}} +{"ts": 18.665393, "audio_ts": 18.54, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "be", 
"language": "en", "speaker": "S1"}], "end_time": 18.04, "start_time": 17.92, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "very", "language": "en", "speaker": "S1"}], "end_time": 18.2, "start_time": 18.04, "type": "word"}], "metadata": {"end_time": 18.2, "start_time": 17.92, "transcript": "be very"}}} +{"ts": 18.666072, "audio_ts": 18.54, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 17.92}}} +{"ts": 18.666542, "audio_ts": 18.54, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very", "annotation": ["has_partial", "has_final", "starts_with_final", "fast_speaker"]}], "metadata": {"start_time": 17.4, "end_time": 18.2}}} +{"ts": 19.016404, "audio_ts": 18.89, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "be", "language": "en", "speaker": "S1"}], "end_time": 18.04, "start_time": 17.92, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "very", "language": "en", "speaker": "S1"}], "end_time": 18.2, "start_time": 18.04, "type": "word"}], "metadata": {"end_time": 18.2, "start_time": 17.92, "transcript": "be very "}}} +{"ts": 19.016696, "audio_ts": 18.89, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.83, "content": "impressed", "language": "en", "speaker": "S1"}], "end_time": 18.56, "start_time": 18.2, "type": "word"}], "metadata": {"end_time": 18.56, "start_time": 18.2, "transcript": "impressed"}}} +{"ts": 19.017383, "audio_ts": 18.89, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very", "annotation": ["has_final", "starts_with_final", "ends_with_final", "fast_speaker"]}], "metadata": {"start_time": 17.4, "end_time": 18.2}}} +{"ts": 19.017847, "audio_ts": 18.89, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 18.56}}} +{"ts": 19.391891, "audio_ts": 19.26, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "impressed", "language": "en", "speaker": "S1"}], "end_time": 18.56, "start_time": 18.2, "type": "word"}], "metadata": {"end_time": 18.56, "start_time": 18.2, "transcript": "impressed "}}} +{"ts": 19.392182, "audio_ts": 19.26, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "by", "language": "en", "speaker": "S1"}], "end_time": 18.68, "start_time": 18.56, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "the", "language": "en", "speaker": "S1"}], "end_time": 18.76, "start_time": 18.68, "type": "word"}, {"alternatives": [{"confidence": 0.86, "content": "results", "language": "en", "speaker": "S1"}], "end_time": 18.92, "start_time": 18.76, "type": "word"}], "metadata": 
{"end_time": 18.92, "start_time": 18.56, "transcript": "by the results"}}} +{"ts": 19.392969, "audio_ts": 19.26, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 18.56}}} +{"ts": 19.393431, "audio_ts": 19.26, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed by the results", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 18.92}}} +{"ts": 19.744803, "audio_ts": 19.62, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "by", "language": "en", "speaker": "S1"}], "end_time": 18.68, "start_time": 18.56, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "the", "language": "en", "speaker": "S1"}], "end_time": 18.8, "start_time": 18.68, "type": "word"}], "metadata": {"end_time": 18.8, "start_time": 18.56, "transcript": "by the "}}} +{"ts": 19.745101, "audio_ts": 19.62, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "results", "language": "en", "speaker": "S1"}], "end_time": 19.28, "start_time": 18.8, "type": "word"}], "metadata": {"end_time": 19.28, "start_time": 18.8, "transcript": "results"}}} +{"ts": 19.745824, "audio_ts": 19.62, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed by the", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 18.8}}} +{"ts": 19.746324, "audio_ts": 19.62, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed by the results", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 19.28}}} +{"ts": 20.081443, "audio_ts": 19.95, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "results", "language": "en", "speaker": "S1"}], "end_time": 19.52, "start_time": 18.8, "type": "word"}], "metadata": {"end_time": 19.64, "start_time": 18.8, "transcript": "results"}}} +{"ts": 20.082731, "audio_ts": 19.95, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed by the results", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 17.4, "end_time": 19.52}}} +{"ts": 20.460972, "audio_ts": 20.33, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "results", "language": "en", "speaker": "S1"}], "end_time": 19.52, "start_time": 18.8, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 19.52, "is_eos": true, 
"start_time": 19.52, "type": "punctuation"}], "metadata": {"end_time": 20.0, "start_time": 18.8, "transcript": "results. "}}} +{"ts": 20.461283, "audio_ts": 20.33, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 20.08, "start_time": 20.08, "transcript": ""}}} +{"ts": 20.46141, "audio_ts": 20.33, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 20.08, "start_time": 20.08}}} +{"ts": 20.46271, "audio_ts": 20.33, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:18.913+00:00", "language": "en", "text": "We hope you'll be very impressed by the results.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 17.4, "end_time": 19.52}}} +{"ts": 20.462907, "audio_ts": 20.33, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 20.462993, "audio_ts": 20.33, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 19.52, "end_time": 19.52}}} +{"ts": 20.833766, "audio_ts": 20.7, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 20.44, "start_time": 20.08, "transcript": ""}}} +{"ts": 20.833995, "audio_ts": 20.7, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 20.44, "start_time": 20.44, "transcript": ""}}} +{"ts": 21.194138, "audio_ts": 21.07, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Results", "language": "en", "speaker": "S2"}], "end_time": 20.72, "start_time": 20.36, "type": "word"}], "metadata": {"end_time": 20.72, "start_time": 20.36, "transcript": "Results"}}} +{"ts": 21.194512, "audio_ts": 21.07, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S2"}}} +{"ts": 21.194982, "audio_ts": 21.07, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:21.873+00:00", "language": "en", "text": "Results", "annotation": ["has_partial"]}], "metadata": {"start_time": 20.36, "end_time": 20.72}}} +{"ts": 21.634402, "audio_ts": 21.5, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 0.99, "content": "Results", "language": "en", "speaker": "S2"}], "end_time": 20.76, "start_time": 20.36, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 20.76, "is_eos": true, "start_time": 20.76, "type": "punctuation"}], "metadata": {"end_time": 20.76, "start_time": 20.36, "transcript": "Results. 
"}}} +{"ts": 21.634775, "audio_ts": 21.5, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 21.16, "start_time": 20.84, "transcript": ""}}} +{"ts": 21.635731, "audio_ts": 21.51, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:21.873+00:00", "language": "en", "text": "Results.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 20.36, "end_time": 20.76}}} +{"ts": 21.635884, "audio_ts": 21.51, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S2"}}} +{"ts": 21.912866, "audio_ts": 21.78, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Really", "language": "en", "speaker": "S2"}], "end_time": 21.12, "start_time": 20.8, "type": "word"}], "metadata": {"end_time": 21.12, "start_time": 20.76, "transcript": "Really "}}} +{"ts": 21.913124, "audio_ts": 21.78, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "are", "language": "en", "speaker": "S2"}], "end_time": 21.24, "start_time": 21.12, "type": "word"}], "metadata": {"end_time": 21.44, "start_time": 21.12, "transcript": "are"}}} +{"ts": 21.913787, "audio_ts": 21.78, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 20.8, "end_time": 21.12}}} +{"ts": 21.914069, "audio_ts": 21.78, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S2"}}} +{"ts": 21.914382, "audio_ts": 21.78, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 20.8, "end_time": 21.24}}} +{"ts": 22.272704, "audio_ts": 22.14, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "are", "language": "en", "speaker": "S2"}], "end_time": 21.24, "start_time": 21.12, "type": "word"}], "metadata": {"end_time": 21.24, "start_time": 21.12, "transcript": "are "}}} +{"ts": 22.272905, "audio_ts": 22.14, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "pretty", "language": "en", "speaker": "S2"}], "end_time": 21.56, "start_time": 21.24, "type": "word"}], "metadata": {"end_time": 21.8, "start_time": 21.24, "transcript": "pretty"}}} +{"ts": 22.273503, "audio_ts": 22.14, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 20.8, "end_time": 21.24}}} +{"ts": 22.273949, "audio_ts": 22.14, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are pretty", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 20.8, 
"end_time": 21.56}}} +{"ts": 22.592997, "audio_ts": 22.46, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "pretty", "language": "en", "speaker": "S2"}], "end_time": 21.56, "start_time": 21.24, "type": "word"}], "metadata": {"end_time": 21.56, "start_time": 21.24, "transcript": "pretty "}}} +{"ts": 22.593262, "audio_ts": 22.46, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "amazing", "language": "en", "speaker": "S2"}], "end_time": 22.16, "start_time": 21.56, "type": "word"}], "metadata": {"end_time": 22.16, "start_time": 21.56, "transcript": "amazing"}}} +{"ts": 22.594044, "audio_ts": 22.46, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are pretty", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 20.8, "end_time": 21.56}}} +{"ts": 22.594571, "audio_ts": 22.46, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are pretty amazing", "annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 20.8, "end_time": 22.16}}} +{"ts": 22.940643, "audio_ts": 22.81, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "amazing", "language": "en", "speaker": "S2"}], "end_time": 22.24, "start_time": 21.56, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S2"}], "attaches_to": "previous", "end_time": 22.24, "is_eos": true, "start_time": 22.24, "type": "punctuation"}], "metadata": {"end_time": 22.52, "start_time": 21.56, "transcript": "amazing. 
"}}} +{"ts": 22.940937, "audio_ts": 22.81, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 22.6, "start_time": 22.6, "transcript": ""}}} +{"ts": 22.941234, "audio_ts": 22.81, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 22.6, "start_time": 22.6}}} +{"ts": 22.941971, "audio_ts": 22.81, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S2", "is_active": true, "timestamp": "2025-09-22T16:51:22.313+00:00", "language": "en", "text": "Really are pretty amazing.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 20.8, "end_time": 22.24}}} +{"ts": 22.942169, "audio_ts": 22.81, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S2"}}} +{"ts": 22.94226, "audio_ts": 22.81, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 22.24, "end_time": 22.24}}} +{"ts": 23.365603, "audio_ts": 23.24, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 22.96, "start_time": 22.6, "transcript": ""}}} +{"ts": 23.365829, "audio_ts": 23.24, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 22.96, "start_time": 22.96, "transcript": ""}}} +{"ts": 23.734221, "audio_ts": 23.61, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Thank", "language": "en", "speaker": "S1"}], "end_time": 23.12, "start_time": 22.88, "type": "word"}, {"alternatives": [{"confidence": 0.95, "content": "you", "language": "en", "speaker": "S1"}], "end_time": 23.24, "start_time": 23.12, "type": "word"}], "metadata": {"end_time": 23.24, "start_time": 22.88, "transcript": "Thank you"}}} +{"ts": 23.734553, "audio_ts": 23.61, "payload": {"message": "SpeakerStarted", "status": {"is_active": true, "speaker_id": "S1"}}} +{"ts": 23.734919, "audio_ts": 23.61, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:24.393+00:00", "language": "en", "text": "Thank you", "annotation": ["has_partial"]}], "metadata": {"start_time": 22.88, "end_time": 23.24}}} +{"ts": 24.070199, "audio_ts": 23.94, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Thank", "language": "en", "speaker": "S1"}], "end_time": 23.12, "start_time": 22.88, "type": "word"}], "metadata": {"end_time": 23.12, "start_time": 22.88, "transcript": "Thank "}}} +{"ts": 24.07058, "audio_ts": 23.94, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "you", "language": "en", "speaker": "S1"}], "end_time": 23.44, "start_time": 23.12, "type": "word"}], "metadata": {"end_time": 23.6, "start_time": 23.12, "transcript": "you"}}} +{"ts": 24.071296, "audio_ts": 23.94, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:24.393+00:00", "language": "en", "text": "Thank", "annotation": ["has_final", "starts_with_final", "ends_with_final"]}], "metadata": {"start_time": 22.88, "end_time": 23.12}}} +{"ts": 24.071869, "audio_ts": 23.94, "payload": {"message": "AddPartialSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:24.393+00:00", "language": "en", "text": "Thank you", 
"annotation": ["has_partial", "has_final", "starts_with_final"]}], "metadata": {"start_time": 22.88, "end_time": 23.44}}} +{"ts": 24.433138, "audio_ts": 24.3, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "you", "language": "en", "speaker": "S1"}], "end_time": 23.44, "start_time": 23.12, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 23.44, "is_eos": true, "start_time": 23.44, "type": "punctuation"}], "metadata": {"end_time": 23.96, "start_time": 23.12, "transcript": "you. "}}} +{"ts": 24.434497, "audio_ts": 24.3, "payload": {"message": "AddSegment", "segments": [{"speaker_id": "S1", "is_active": true, "timestamp": "2025-09-22T16:51:24.393+00:00", "language": "en", "text": "Thank you.", "annotation": ["has_final", "starts_with_final", "ends_with_final", "ends_with_eos", "ends_with_punctuation"]}], "metadata": {"start_time": 22.88, "end_time": 23.44}}} +{"ts": 24.435224, "audio_ts": 24.3, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 24.04, "start_time": 24.04, "transcript": ""}}} +{"ts": 24.435351, "audio_ts": 24.3, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 24.04, "start_time": 24.04}}} +{"ts": 24.435635, "audio_ts": 24.3, "payload": {"message": "SpeakerEnded", "status": {"is_active": false, "speaker_id": "S1"}}} +{"ts": 24.435728, "audio_ts": 24.3, "payload": {"message": "EndOfTurn", "metadata": {"start_time": 23.44, "end_time": 23.44}}} +{"ts": 24.768097, "audio_ts": 24.64, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 24.4, "start_time": 24.04, "transcript": ""}}} +{"ts": 24.768313, "audio_ts": 24.64, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 24.4, "start_time": 24.4, "transcript": ""}}} +{"ts": 25.137276, "audio_ts": 25.01, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 24.76, "start_time": 24.4, "transcript": ""}}} +{"ts": 25.137542, "audio_ts": 25.01, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 24.76, "start_time": 24.76, "transcript": ""}}} +{"ts": 25.469681, "audio_ts": 25.34, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.12, "start_time": 24.76, "transcript": ""}}} +{"ts": 25.470109, "audio_ts": 25.34, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.12, "start_time": 25.12, "transcript": ""}}} +{"ts": 25.904285, "audio_ts": 25.78, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.48, "start_time": 25.12, "transcript": ""}}} +{"ts": 25.904505, "audio_ts": 25.78, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.48, "start_time": 25.48, "transcript": ""}}} +{"ts": 26.273945, "audio_ts": 26.14, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.84, "start_time": 25.48, "transcript": ""}}} +{"ts": 26.274097, "audio_ts": 26.14, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 25.84, "start_time": 25.84, "transcript": ""}}} +{"ts": 26.538952, "audio_ts": 26.41, "payload": {"message": 
"AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.2, "start_time": 25.84, "transcript": ""}}} +{"ts": 26.539192, "audio_ts": 26.41, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.2, "start_time": 26.2, "transcript": ""}}} +{"ts": 26.943639, "audio_ts": 26.81, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.56, "start_time": 26.2, "transcript": ""}}} +{"ts": 26.943883, "audio_ts": 26.81, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.56, "start_time": 26.56, "transcript": ""}}} +{"ts": 27.291209, "audio_ts": 27.16, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.92, "start_time": 26.56, "transcript": ""}}} +{"ts": 27.291421, "audio_ts": 27.16, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 26.92, "start_time": 26.92, "transcript": ""}}} +{"ts": 27.628956, "audio_ts": 27.5, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 27.28, "start_time": 26.92, "transcript": ""}}} +{"ts": 27.629113, "audio_ts": 27.5, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 27.28, "start_time": 27.28, "transcript": ""}}} +{"ts": 28.028535, "audio_ts": 27.9, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 27.64, "start_time": 27.28, "transcript": ""}}} +{"ts": 28.028766, "audio_ts": 27.9, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 27.64, "start_time": 27.64, "transcript": ""}}} diff --git a/tests/voice/assets/chat2.jsonl b/tests/voice/assets/chat2.jsonl new file mode 100644 index 0000000..e5a114a --- /dev/null +++ b/tests/voice/assets/chat2.jsonl @@ -0,0 +1,14 @@ +{"ts": 0.091146, "audio_ts": 0.0, "payload": {"message": "Info", "type": "concurrent_session_usage", "reason": "1 concurrent sessions active out of quota 150", "usage": 1, "quota": 150, "last_updated": "2025-09-22T16:51:01Z"}} +{"ts": 0.132706, "audio_ts": 0.0, "payload": {"message": "RecognitionStarted", "orchestrator_version": "2025.08.29127+289170c022.HEAD", "id": "9d8a5d05-7a62-4266-8cfc-9ffd9cb3cdf1", "language_pack_info": {"adapted": false, "itn": true, "language_description": "English", "word_delimiter": " ", "writing_direction": "left-to-right"}}} +{"ts": 0.67894, "audio_ts": 0.55, "payload": {"message": "AddTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.28, "start_time": 0.0, "transcript": ""}}} +{"ts": 0.67918, "audio_ts": 0.55, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.28, "start_time": 0.28, "transcript": ""}}} +{"ts": 1.10585, "audio_ts": 0.98, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 0.64, "start_time": 0.28, "transcript": ""}}} +{"ts": 1.435181, "audio_ts": 1.31, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Welcome", "language": "en", "speaker": "S1"}], "end_time": 0.92, "start_time": 0.36, "type": "word"}], "metadata": {"end_time": 0.92, "start_time": 0.2, "transcript": "Welcome"}}} +{"ts": 1.792961, "audio_ts": 1.66, "payload": {"message": "AddTranscript", "format": "2.9", "results": 
[{"alternatives": [{"confidence": 1.0, "content": "Welcome", "language": "en", "speaker": "S1"}], "end_time": 0.92, "start_time": 0.36, "type": "word"}, {"alternatives": [{"confidence": 1.0, "content": "to", "language": "en", "speaker": "S1"}], "end_time": 1.0, "start_time": 0.92, "type": "word"}], "metadata": {"end_time": 1.0, "start_time": 0.2, "transcript": "Welcome to "}}} +{"ts": 1.793294, "audio_ts": 1.66, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Speechmatics", "language": "en", "speaker": "S1"}], "end_time": 1.28, "start_time": 1.04, "type": "word"}], "metadata": {"end_time": 1.28, "start_time": 1.0, "transcript": "speech"}}} +{"ts": 2.196876, "audio_ts": 2.07, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": "Speechmatics", "language": "en", "speaker": "S1"}], "end_time": 1.32, "start_time": 1.04, "type": "word"}], "metadata": {"end_time": 1.32, "start_time": 1.0, "transcript": "speech "}}} +{"ts": 2.197393, "audio_ts": 2.07, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 1.72, "start_time": 1.4, "transcript": ""}}} +{"ts": 2.495748, "audio_ts": 2.37, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.08, "start_time": 1.4, "transcript": ""}}} +{"ts": 2.828358, "audio_ts": 2.7, "payload": {"message": "AddTranscript", "format": "2.9", "results": [{"alternatives": [{"confidence": 1.0, "content": ".", "language": "en", "speaker": "S1"}], "attaches_to": "previous", "end_time": 1.32, "is_eos": true, "start_time": 1.32, "type": "punctuation"}], "metadata": {"end_time": 2.36, "start_time": 1.32, "transcript": ". 
"}}} +{"ts": 2.828587, "audio_ts": 2.7, "payload": {"message": "AddPartialTranscript", "format": "2.9", "results": [], "metadata": {"end_time": 2.44, "start_time": 2.44, "transcript": ""}}} +{"ts": 2.828685, "audio_ts": 2.7, "payload": {"message": "EndOfUtterance", "format": "2.9", "metadata": {"end_time": 2.44, "start_time": 2.44}}} diff --git a/tests/voice/assets/languages/cmn_hans_cn_000328.wav b/tests/voice/assets/languages/cmn_hans_cn_000328.wav new file mode 100644 index 0000000..e0cb525 --- /dev/null +++ b/tests/voice/assets/languages/cmn_hans_cn_000328.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f04d0429ccbdf4b69f6d5bb4893b91fc04a65809d863224218b3ef8c955064a1 +size 378284 diff --git a/tests/voice/assets/languages/de_de_000675.wav b/tests/voice/assets/languages/de_de_000675.wav new file mode 100644 index 0000000..5792054 --- /dev/null +++ b/tests/voice/assets/languages/de_de_000675.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe2b28fa4bf8123fa5494cb4cfdbed535899048c892fe14033424f443bdd7330 +size 355244 diff --git a/tests/voice/assets/languages/es_419_000896.wav b/tests/voice/assets/languages/es_419_000896.wav new file mode 100644 index 0000000..9618a4e --- /dev/null +++ b/tests/voice/assets/languages/es_419_000896.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56eab3b040e2a9de740d0e70ea51ebd37637cb30b20709c9b8e454ef3d326782 +size 460844 diff --git a/tests/voice/assets/languages/fr_fr_000378.wav b/tests/voice/assets/languages/fr_fr_000378.wav new file mode 100644 index 0000000..b620f87 --- /dev/null +++ b/tests/voice/assets/languages/fr_fr_000378.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27e79762c5577bfd21fb874bd78b4c8c8659b1413ae892c6ee72ed08c5b4f20f +size 313004 diff --git a/tests/voice/assets/languages/he_il_000432.wav b/tests/voice/assets/languages/he_il_000432.wav new file mode 100644 index 0000000..16af9d2 --- /dev/null +++ b/tests/voice/assets/languages/he_il_000432.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4f64da0b3ffa3a979e97a0b354a884f6a77029f8ab3546d95537a37cf460059 +size 268844 diff --git a/tests/voice/assets/languages/ja_jp_000595.wav b/tests/voice/assets/languages/ja_jp_000595.wav new file mode 100644 index 0000000..e40a0e5 --- /dev/null +++ b/tests/voice/assets/languages/ja_jp_000595.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939884e5d7c23ce68bb310ac3df01073554a9da28470ed976519dc0b241a20db +size 351404 diff --git a/tests/voice/assets/languages/th_th_000208.wav b/tests/voice/assets/languages/th_th_000208.wav new file mode 100644 index 0000000..096a0cd --- /dev/null +++ b/tests/voice/assets/languages/th_th_000208.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ca5846de9f877921737ad8a6f01f7c39e8df8f9ef67ca22596b39b045d1f1a3 +size 276524 diff --git a/tests/voice/assets/smart_turn/01_false_16kHz.wav b/tests/voice/assets/smart_turn/01_false_16kHz.wav new file mode 100644 index 0000000..2df00a8 --- /dev/null +++ b/tests/voice/assets/smart_turn/01_false_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2f716f2b806a7f4a6e10860a5547885d69b26dd000468d221f57fa5b1aa086 +size 82018 diff --git a/tests/voice/assets/smart_turn/02_false_16kHz.wav b/tests/voice/assets/smart_turn/02_false_16kHz.wav new file mode 100644 index 0000000..e9cfe7f --- /dev/null +++ b/tests/voice/assets/smart_turn/02_false_16kHz.wav @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:442f8ae1d0a1f714df4610d8765106ec602f5c77a26169650b33e7b73293cbb8 +size 150280 diff --git a/tests/voice/assets/smart_turn/03_true_16kHz.wav b/tests/voice/assets/smart_turn/03_true_16kHz.wav new file mode 100644 index 0000000..243b359 --- /dev/null +++ b/tests/voice/assets/smart_turn/03_true_16kHz.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3721e260f59673de77eb795d683a591f0df23919afb9072f32eebabee9ecc4 +size 236004 diff --git a/tests/voice/test_01_client.py b/tests/voice/test_01_client.py new file mode 100644 index 0000000..8463524 --- /dev/null +++ b/tests/voice/test_01_client.py @@ -0,0 +1,104 @@ +import os + +import pytest +from _utils import get_client + +from speechmatics.voice import VoiceAgentClient +from speechmatics.voice import VoiceAgentConfig + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") + + +@pytest.mark.asyncio +async def test_client(): + """Tests that a client can be created. + + - Checks for a valid session + - Checks that 'English' is the language pack info + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Create client + client = await get_client( + api_key=API_KEY, + connect=False, + ) + + # Check we are connected OK + await client.connect() + + # Check we are connected + assert client._is_connected + + # Disconnect + await client.disconnect() + + # Check we are disconnected + assert not client._is_connected + + # Check session info + assert client._client_session.session_id != "NOT_SET" + assert client._client_session.language_pack_info is not None + assert client._client_session.language_pack_info.language_description == "English" + + +@pytest.mark.asyncio +async def test_client_context_manager(): + """Tests that a client can be used as an async context manager. + + - Checks that connection is established automatically on enter + - Checks that disconnection happens automatically on exit + - Verifies session info is set correctly + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Create config + config = VoiceAgentConfig(language="en") + + # Use client as context manager + async with VoiceAgentClient(api_key=API_KEY, config=config) as client: + # Check we are connected automatically + assert client._is_connected + + # Check session info is set + assert client._client_session.session_id != "NOT_SET" + assert client._client_session.language_pack_info is not None + assert client._client_session.language_pack_info.language_description == "English" + + # After exiting context, client should be disconnected + assert not client._is_connected + + +@pytest.mark.asyncio +async def test_client_context_manager_with_exception(): + """Tests that context manager properly cleans up even when an exception occurs. 
+
+    - Checks that disconnection happens even if an exception is raised
+    - Verifies exception is propagated correctly
+    """
+
+    # API key
+    if not API_KEY:
+        pytest.skip("Valid API key required for test")
+
+    # Create config
+    config = VoiceAgentConfig(language="en")
+
+    # Use client as context manager and raise an exception
+    with pytest.raises(ValueError, match="Test exception"):
+        async with VoiceAgentClient(api_key=API_KEY, config=config) as client:
+            # Check we are connected
+            assert client._is_connected
+
+            # Raise an exception
+            raise ValueError("Test exception")
+
+    # After exiting context (even with exception), client should be disconnected
+    assert not client._is_connected
diff --git a/tests/voice/test_02_transcriber.py b/tests/voice/test_02_transcriber.py
new file mode 100644
index 0000000..35ea170
--- /dev/null
+++ b/tests/voice/test_02_transcriber.py
@@ -0,0 +1,176 @@
+import asyncio
+import os
+
+import pytest
+from _utils import get_client
+from _utils import send_audio_file
+
+from speechmatics.voice import AdditionalVocabEntry
+from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import VoiceAgentConfig
+
+# Skip for CI testing
+pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping transcriber tests in CI")
+
+# Constants
+API_KEY = os.getenv("SPEECHMATICS_API_KEY")
+
+
+@pytest.mark.asyncio
+async def test_transcribe_partial():
+    """Test transcription.
+
+    This test will:
+    - send audio data to the API server
+    - wait for the first partial transcript (within 5 seconds of the audio finishing)
+    """
+
+    # API key
+    if not API_KEY:
+        pytest.skip("Valid API key required for test")
+
+    # Client
+    client = await get_client(
+        api_key=API_KEY,
+        connect=True,
+        config=VoiceAgentConfig(
+            additional_vocab=[
+                AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]),
+            ]
+        ),
+    )
+
+    # Check we are connected
+    assert client._is_connected
+
+    # Create an event to track when the callback is called
+    event_received = asyncio.Event()
+    received_message = None
+
+    # Callback for partial transcript messages
+    def on_partial_received(message):
+        nonlocal received_message
+        received_message = message
+        event_received.set()
+
+    # Add listener for PARTIALS
+    client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, on_partial_received)
+
+    # Load the audio file `./assets/audio_01_16kHz.wav`
+    await send_audio_file(client, "./assets/audio_01_16kHz.wav", event_received)
+
+    # Wait for the callback with timeout
+    try:
+        await asyncio.wait_for(event_received.wait(), timeout=5.0)
+        assert received_message is not None
+    except asyncio.TimeoutError:
+        pytest.fail("ADD_PARTIAL_TRANSCRIPT event was not received within 5 seconds of audio finish")
+
+    # Close session
+    await client.disconnect()
+    assert not client._is_connected
+
+
+@pytest.mark.asyncio
+async def test_transcribe_final():
+    """Test transcription.
+
+    This test will:
+    - send audio data to the API server
+    - wait for the first final transcript (within 5 seconds of the audio finishing)
+    """
+
+    # API key
+    if not API_KEY:
+        pytest.skip("Valid API key required for test")
+
+    # Client
+    client = await get_client(api_key=API_KEY, connect=True)
+
+    # Check we are connected
+    assert client._is_connected
+
+    # Create an event to track when the callback is called
+    event_received = asyncio.Event()
+    received_message = None
+
+    # Callback for final transcript messages
+    def on_final_received(message):
+        nonlocal received_message
+        received_message = message
+        event_received.set()
+
+    # Add listener for FINALS
+    client.on(AgentServerMessageType.ADD_TRANSCRIPT, on_final_received)
+
+    # Load the audio file `./assets/audio_01_16kHz.wav`
+    await send_audio_file(client, "./assets/audio_01_16kHz.wav", event_received)
+
+    # Wait for the callback with timeout
+    try:
+        await asyncio.wait_for(event_received.wait(), timeout=5.0)
+        assert received_message is not None
+    except asyncio.TimeoutError:
+        pytest.fail("ADD_TRANSCRIPT event was not received within 5 seconds of audio finish")
+
+    # Close session
+    await client.disconnect()
+    assert not client._is_connected
+
+
+@pytest.mark.asyncio
+async def test_partial_segment():
+    """Test transcription.
+
+    This test will:
+    - send audio data to the API server
+    - wait for the first partial segment (within 5 seconds of the audio finishing)
+    """
+
+    # API key
+    if not API_KEY:
+        pytest.skip("Valid API key required for test")
+
+    # Client
+    client = await get_client(api_key=API_KEY, connect=True, config=VoiceAgentConfig())
+
+    # Check we are connected
+    assert client._is_connected
+
+    # Create an event to track when the callback is called
+    event_received = asyncio.Event()
+
+    # Callback for partial segment messages
+    def on_segment_received(message):
+        # Segments from the message
+        segments = message.get("segments", [])
+
+        # We need at least one segment
+        if not segments:
+            return
+
+        # Get the first segment's text
+        transcription = segments[0]["text"]
+
+        # Check transcription starts with `Welcome to Speechmatics`
+        if not transcription.lower().startswith("welcome to speech"):
+            return
+
+        # Set the event
+        event_received.set()
+
+    # Add listener for PARTIAL SEGMENTS
+    client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, on_segment_received)
+
+    # Load the audio file `./assets/audio_01_16kHz.wav`
+    await send_audio_file(client=client, audio_file="./assets/audio_01_16kHz.wav", terminate_event=event_received)
+
+    # Wait for the callback with timeout
+    try:
+        await asyncio.wait_for(event_received.wait(), timeout=5.0)
+    except asyncio.TimeoutError:
+        pytest.fail("ADD_PARTIAL_SEGMENT event was not received within 5 seconds of audio finish")
+
+    # Close session
+    await client.disconnect()
+    assert not client._is_connected
diff --git a/tests/voice/test_03_conversation.py b/tests/voice/test_03_conversation.py
new file mode 100644
index 0000000..aa2398b
--- /dev/null
+++ b/tests/voice/test_03_conversation.py
@@ -0,0 +1,114 @@
+import datetime
+import json
+import os
+
+import pytest
+from _utils import get_client
+from _utils import send_audio_file
+
+from speechmatics.voice import AdditionalVocabEntry
+from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfUtteranceMode
+from speechmatics.voice import VoiceAgentConfig
+
+# Skip for CI testing
+pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping transcription tests in CI")
+
+# Constants
+API_KEY = os.getenv("SPEECHMATICS_API_KEY")
+SHOW_LOG = 
os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] + + +@pytest.mark.asyncio +async def test_log_messages(): + """Test transcription. + + This test will: + - log messages + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Client + client = await get_client( + api_key=API_KEY, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=0.2, + max_delay=0.7, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + enable_diarization=True, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), + ], + ), + ) + + # Create an event to track when the callback is called + messages: list[str] = [] + bytes_sent: int = 0 + + # Start time + start_time = datetime.datetime.now() + + # Bytes logger + def log_bytes_sent(bytes): + nonlocal bytes_sent + bytes_sent += bytes + + # Callback for each message + def log_message(message): + ts = (datetime.datetime.now() - start_time).total_seconds() + audio_ts = bytes_sent / 16000 / 2 + log = json.dumps({"ts": round(ts, 3), "audio_ts": round(audio_ts, 2), "payload": message}) + messages.append(log) + if SHOW_LOG: + print(log) + + # Add listeners + client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.once(AgentServerMessageType.INFO, log_message) + client.on(AgentServerMessageType.WARNING, log_message) + client.on(AgentServerMessageType.ERROR, log_message) + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, log_message) + client.on(AgentServerMessageType.ADD_TRANSCRIPT, log_message) + client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message) + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) + client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) + client.on(AgentServerMessageType.END_OF_TURN, log_message) + + # Load the audio file `./assets/audio_01_16kHz.wav` + audio_file = "./assets/audio_01_16kHz.wav" + + # HEADER + if SHOW_LOG: + print() + print() + print("---") + log_message({"message": "AudioFile", "path": audio_file}) + log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) + log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) + + # Connect + await client.connect() + + # Check we are connected + assert client._is_connected + + # Individual payloads + await send_audio_file(client, audio_file, progress_callback=log_bytes_sent) + + # FOOTER + if SHOW_LOG: + print("---") + print() + print() + + # Close session + await client.disconnect() + assert not client._is_connected diff --git a/tests/voice/test_04_models.py b/tests/voice/test_04_models.py new file mode 100644 index 0000000..6e3af3d --- /dev/null +++ b/tests/voice/test_04_models.py @@ -0,0 +1,255 @@ +import json + +import pytest + +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._models import AdditionalVocabEntry +from speechmatics.voice._models import AnnotationFlags +from speechmatics.voice._models import AnnotationResult +from speechmatics.voice._models import OperatingPoint +from speechmatics.voice._models import SpeakerFocusConfig +from speechmatics.voice._models import SpeakerFocusMode +from speechmatics.voice._models import SpeakerIdentifier +from speechmatics.voice._models import SpeakerSegment +from speechmatics.voice._models import 
SpeechFragment + + +@pytest.mark.asyncio +async def test_voice_agent_config(): + """Test VoiceAgentConfig Pydantic serialisation and deserialisation.""" + # Create instance with custom values + config = VoiceAgentConfig( + language="en", + max_delay=1.5, + enable_diarization=True, + speaker_sensitivity=0.7, + additional_vocab=[AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"])], + known_speakers=[SpeakerIdentifier(label="John", speaker_identifiers=["78673523465237xx"])], + ) + + # Test JSON serialisation + config_dict = config.model_dump() + assert config_dict["language"] == "en" + assert config_dict["max_delay"] == 1.5 + assert config_dict["enable_diarization"] is True + assert config_dict["speaker_sensitivity"] == 0.7 + assert len(config_dict["additional_vocab"]) == 1 + assert config_dict["additional_vocab"][0]["content"] == "Speechmatics" + assert len(config_dict["known_speakers"]) == 1 + assert config_dict["known_speakers"][0]["label"] == "John" + + # Get JSON from the model + config_json = config.to_json() + + # Test JSON deserialisation + config_from_json = VoiceAgentConfig.from_json(config_json) + assert config_from_json.language == config.language + assert config_from_json.max_delay == config.max_delay + assert config_from_json.enable_diarization == config.enable_diarization + assert config_from_json.speaker_sensitivity == config.speaker_sensitivity + assert len(config_from_json.additional_vocab) == 1 + assert config_from_json.additional_vocab[0].content == "Speechmatics" + assert len(config_from_json.known_speakers) == 1 + assert config_from_json.known_speakers[0].label == "John" + + # From JSON + preset: VoiceAgentConfig = VoiceAgentConfig.from_json('{"operating_point": "enhanced"}') + assert preset.operating_point == OperatingPoint.ENHANCED + + +@pytest.mark.asyncio +async def test_annotation_result(): + """Test AnnotationResult. 
+ + - create new annotation + - add, remove, check for flags + - serialize to JSON + """ + + # Create a new annotation + annotation = AnnotationResult.from_flags(AnnotationFlags.NO_TEXT, AnnotationFlags.HAS_DISFLUENCY) + assert annotation is not None + + # Add extra flag + annotation.add(AnnotationFlags.MULTIPLE_SPEAKERS) + + # Has a flag + assert annotation.has(AnnotationFlags.NO_TEXT) + assert annotation.has(AnnotationFlags.HAS_DISFLUENCY) + assert annotation.has(AnnotationFlags.MULTIPLE_SPEAKERS) + + # Remove a flag + annotation.remove(AnnotationFlags.MULTIPLE_SPEAKERS) + assert not annotation.has(AnnotationFlags.MULTIPLE_SPEAKERS) + + # Add existing flag + annotation.add(AnnotationFlags.NO_TEXT) + assert annotation.has(AnnotationFlags.NO_TEXT) + assert str(annotation) == "['no_text', 'has_disfluency']" + + # Add multiple flags + annotation.add(AnnotationFlags.MULTIPLE_SPEAKERS, AnnotationFlags.STARTS_WITH_DISFLUENCY) + assert annotation.has(AnnotationFlags.MULTIPLE_SPEAKERS, AnnotationFlags.STARTS_WITH_DISFLUENCY) + + # Remove multiple flags + annotation.remove(AnnotationFlags.MULTIPLE_SPEAKERS, AnnotationFlags.STARTS_WITH_DISFLUENCY) + assert not annotation.has(AnnotationFlags.MULTIPLE_SPEAKERS, AnnotationFlags.STARTS_WITH_DISFLUENCY) + + # Compare + assert annotation == AnnotationResult([AnnotationFlags.HAS_DISFLUENCY, AnnotationFlags.NO_TEXT]) + + # Compare with non AnnotationResult + assert annotation != "string" + assert annotation != 123 + + # String representation + assert str(annotation) == "['no_text', 'has_disfluency']" + assert str({"annotation": annotation}) == "{'annotation': ['no_text', 'has_disfluency']}" + assert json.dumps({"annotation": annotation}) == '{"annotation": ["no_text", "has_disfluency"]}' + + +@pytest.mark.asyncio +async def test_additional_vocab_entry(): + """Test AdditionalVocabEntry serialisation and deserialisation. + + - create instance + - serialize to JSON + - deserialize from JSON + """ + + # Create instance + entry = AdditionalVocabEntry(content="hello", sounds_like=["helo", "hallo"]) + + # Test JSON serialisation + json_data = entry.model_dump() + assert json_data["content"] == "hello" + assert json_data["sounds_like"] == ["helo", "hallo"] + + # Test JSON deserialisation + entry_from_json = AdditionalVocabEntry.model_validate(json_data) + assert entry_from_json.content == entry.content + assert entry_from_json.sounds_like == entry.sounds_like + + # Test with defaults + entry_minimal = AdditionalVocabEntry(content="test") + json_minimal = entry_minimal.model_dump() + assert json_minimal["sounds_like"] == [] + + +@pytest.mark.asyncio +async def test_speaker_focus_config(): + """Test SpeakerFocusConfig serialisation and deserialisation. 
+ + - create instance with custom values + - serialize to JSON + - deserialize from JSON + """ + + # Create instance with custom values + config = SpeakerFocusConfig( + focus_speakers=["S1", "S2"], + ignore_speakers=["__ASSISTANT__", "__SYSTEM__"], + focus_mode=SpeakerFocusMode.IGNORE, + ) + + # Test JSON serialisation + json_data = config.model_dump() + assert json_data["focus_speakers"] == ["S1", "S2"] + assert json_data["ignore_speakers"] == ["__ASSISTANT__", "__SYSTEM__"] + assert json_data["focus_mode"] == SpeakerFocusMode.IGNORE + + # Test JSON deserialisation + config_from_json = SpeakerFocusConfig.model_validate(json_data) + assert config_from_json.focus_speakers == config.focus_speakers + assert config_from_json.ignore_speakers == config.ignore_speakers + assert config_from_json.focus_mode == config.focus_mode + + # Test with defaults + config_default = SpeakerFocusConfig() + json_default = config_default.model_dump() + assert json_default["focus_speakers"] == [] + assert json_default["ignore_speakers"] == [] + assert json_default["focus_mode"] == SpeakerFocusMode.RETAIN + + +@pytest.mark.asyncio +async def test_speech_fragment(): + """Test SpeechFragment serialisation and deserialisation. + + - create instance with annotation + - serialize to JSON + - deserialize from JSON + """ + + # Create instance with annotation + annotation = AnnotationResult.from_flags(AnnotationFlags.HAS_FINAL, AnnotationFlags.ENDS_WITH_EOS) + + # Create fragment + fragment = SpeechFragment( + idx=1, + start_time=0.5, + end_time=1.2, + language="en", + content="Hello", + speaker="S1", + is_final=True, + confidence=0.95, + annotation=annotation, + ) + + # Test JSON serialisation + json_data = fragment.model_dump() + assert json_data["idx"] == 1 + assert json_data["start_time"] == 0.5 + assert json_data["end_time"] == 1.2 + assert json_data["content"] == "Hello" + assert json_data["speaker"] == "S1" + assert json_data["is_final"] is True + assert json_data["confidence"] == 0.95 + assert isinstance(json_data["annotation"], list) + + +@pytest.mark.asyncio +async def test_speaker_segment(): + """Test SpeakerSegment serialisation and deserialisation. 
+ + - create instance with annotation + - serialize to JSON + - deserialize from JSON + """ + + # Create fragments + fragment1 = SpeechFragment(idx=1, start_time=0.5, end_time=1.0, content="Hello", speaker="S1") + fragment2 = SpeechFragment(idx=2, start_time=1.0, end_time=1.5, content="world", speaker="S1") + + # Create annotation + annotation = AnnotationResult.from_flags(AnnotationFlags.HAS_FINAL, AnnotationFlags.MULTIPLE_SPEAKERS) + + # Create instance + segment = SpeakerSegment( + speaker_id="S1", + is_active=True, + timestamp="2025-01-01T12:00:00.500", + language="en", + fragments=[fragment1, fragment2], + text="Hello world", + annotation=annotation, + ) + + # Test model_dump() default behavior (should exclude fragments by default) + json_data = segment.model_dump() + assert json_data["speaker_id"] == "S1" + assert json_data["is_active"] is True + assert json_data["timestamp"] == "2025-01-01T12:00:00.500" + assert json_data["text"] == "Hello world" + assert "fragments" not in json_data + assert "results" not in json_data + assert isinstance(json_data["annotation"], list) + + # Test model_dump with include_results=True + dict_data_results = segment.model_dump(include_results=True) + assert dict_data_results["speaker_id"] == "S1" + assert dict_data_results["text"] == "Hello world" + assert "results" in dict_data_results + assert "fragments" not in dict_data_results + assert len(dict_data_results["results"]) == 2 diff --git a/tests/voice/test_05_utterance.py b/tests/voice/test_05_utterance.py new file mode 100644 index 0000000..cf30453 --- /dev/null +++ b/tests/voice/test_05_utterance.py @@ -0,0 +1,422 @@ +import asyncio +import datetime +import os +from typing import Any +from typing import Optional + +import pytest +from _utils import ConversationLog +from _utils import get_client + +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SpeechSegmentConfig +from speechmatics.voice import VoiceAgentConfig + +SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] + + +@pytest.mark.asyncio +async def test_speech_fragments(): + """Test SpeechFragment. 
+ + - create fragment(s) + - check output from processing conversation + - serialize to JSON + """ + + # Test conversation + log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl")) + chat = log.get_conversation( + ["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript", "EndOfUtterance"] + ) + + # Start time + start_time = datetime.datetime.now() + + # Create a client + client = await get_client(api_key="NONE", connect=False) + assert client is not None + + # Start the queue + client._start_stt_queue() + + # Event to wait + event_rx: asyncio.Event = asyncio.Event() + last_message: Optional[dict[str, Any]] = None + + # Reset message + def message_reset(): + nonlocal last_message + last_message = None + event_rx.clear() + + # Message receiver + def message_rx(message: dict[str, Any]): + nonlocal last_message + last_message = message + event_rx.set() + + # Send a message from the conversation + async def send_message(idx: int, count: int = 1, use_ttl: bool = True): + for i in range(count): + # Get the message from the chat + message = chat[idx + i] + + # Wait for TTL to expire + if use_ttl: + ttl = (start_time + datetime.timedelta(seconds=message["ts"])) - datetime.datetime.now() + if ttl.total_seconds() > 0: + await asyncio.sleep(ttl.total_seconds()) + else: + await asyncio.sleep(0.05) + + # Emit the message + client.emit(message["payload"]["message"], message["payload"]) + + # Add listener for first interim segment + message_reset() + client.once(AgentServerMessageType.ADD_PARTIAL_SEGMENT, message_rx) + + # Inject first partial + await send_message(0, count=6, use_ttl=False) + + # Wait for first segment + try: + await asyncio.wait_for(event_rx.wait(), timeout=5.0) + assert last_message is not None + except asyncio.TimeoutError: + pytest.fail("ADD_PARTIAL_SEGMENT event was not received within 5 seconds") + + # Check the right message was received + assert last_message.get("message") == AgentServerMessageType.ADD_PARTIAL_SEGMENT + + # Check the segment + segments = last_message.get("segments", []) + assert len(segments) == 1 + seg0 = segments[0] + assert seg0["speaker_id"] == "S1" + assert seg0["text"] == "Welcome" + assert f"{seg0['speaker_id']}: {seg0['text']}" == "S1: Welcome" + + # Add listener for final segment + message_reset() + client.once(AgentServerMessageType.ADD_SEGMENT, message_rx) + + # Send a more partials and finals + await send_message(5, count=8, use_ttl=False) + + # Wait for final segment + try: + await asyncio.wait_for(event_rx.wait(), timeout=5.0) + assert last_message is not None + except asyncio.TimeoutError: + pytest.fail("ADD_SEGMENT event was not received within 5 seconds") + + # Check the right message was received + assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + + # Check the segment + segments = last_message.get("segments", []) + assert len(segments) == 1 + seg0 = segments[0] + assert seg0["speaker_id"] == "S1" + assert seg0["text"] == "Welcome to Speechmatics." + assert f"{seg0['speaker_id']}: {seg0['text']}" == "S1: Welcome to Speechmatics." + + # Stop the queue + client._stop_stt_queue() + + +@pytest.mark.asyncio +async def test_end_of_utterance_fixed(): + """Test EndOfUtterance from STT engine. 
+ + - send converstaion messages (fast) + - wait for `EndOfUtterance` message + """ + + # Test conversation + log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl")) + chat = log.get_conversation( + ["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript", "EndOfUtterance"] + ) + + # Start time + start_time = datetime.datetime.now() + + # Create a client + client = await get_client( + api_key="NONE", + connect=False, + config=VoiceAgentConfig(end_of_utterance_silence_trigger=0.5, end_of_utterance_mode=EndOfUtteranceMode.FIXED), + ) + assert client is not None + + # Debug + if SHOW_LOG: + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) + client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_UTTERANCE, lambda message: print(message)) + + # Start the queue + client._start_stt_queue() + + # Event to wait + event_rx: asyncio.Event = asyncio.Event() + last_message: Optional[dict[str, Any]] = None + + # Message receiver + def message_rx(message: dict[str, Any]): + nonlocal last_message + last_message = message + event_rx.set() + + # Send a message from the conversation + async def send_message(idx: int, count: int = 1, use_ttl: bool = True): + for i in range(count): + # Get the message from the chat + message = chat[idx + i] + + # Wait for TTL to expire + if use_ttl: + ttl = (start_time + datetime.timedelta(seconds=message["ts"])) - datetime.datetime.now() + if ttl.total_seconds() > 0: + await asyncio.sleep(ttl.total_seconds()) + else: + await asyncio.sleep(0.005) + + # Emit the message + client.emit(message["payload"]["message"], message["payload"]) + + # Add listener for first interim segment + client.once(AgentServerMessageType.ADD_SEGMENT, message_rx) + + # Inject conversation + await send_message(0, count=14, use_ttl=False) + + # Wait for EndOfTurn + try: + await asyncio.wait_for(event_rx.wait(), timeout=5.0) + assert last_message is not None + except asyncio.TimeoutError: + pytest.fail("ADD_SEGMENT event was not received within 5 seconds") + + # Check the right message was received + assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + + +@pytest.mark.asyncio +async def test_external_vad(): + """Test EndOfUtterance from STT engine. + + - send converstaion messages (realtime) + - finalizes based on external VAD (e.g. 
Pipecat's `UserStoppedSpeakingFrame` frame) + """ + + # Test conversation + log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl")) + chat = log.get_conversation( + ["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript", "EndOfUtterance"] + ) + + # Start time + start_time = datetime.datetime.now() + + # Adaptive timeout + adaptive_timeout = 1.0 + + # Create a client + client = await get_client( + api_key="NONE", + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=adaptive_timeout, end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL + ), + ) + assert client is not None + + # Start the queue + client._start_stt_queue() + + # Event to wait + event_rx: asyncio.Event = asyncio.Event() + last_message: Optional[dict[str, Any]] = None + + # Message receiver + def message_rx(message: dict[str, Any]): + nonlocal last_message + last_message = message + event_rx.set() + + # Send a message from the conversation + async def send_message(idx: int, count: int = 1, use_ttl: bool = True): + for i in range(count): + # Get the message from the chat + message = chat[idx + i] + + # Wait for TTL to expire + if use_ttl: + ttl = (start_time + datetime.timedelta(seconds=message["ts"])) - datetime.datetime.now() + if ttl.total_seconds() > 0: + await asyncio.sleep(ttl.total_seconds()) + else: + await asyncio.sleep(0.005) + + # Emit the message + client.emit(message["payload"]["message"], message["payload"]) + + # Inject conversation + await send_message(0, count=12, use_ttl=False) + + # Momentary pause + await asyncio.sleep(0.5) + + # Add listener for first interim segment + client.once(AgentServerMessageType.ADD_SEGMENT, message_rx) + + # Pause for a moment + await asyncio.sleep(0.5) + + # Send finalize + client.finalize() + + # Wait for AddSegments + try: + await asyncio.wait_for(event_rx.wait(), timeout=4) + assert last_message is not None + except asyncio.TimeoutError: + pytest.fail("ADD_SEGMENT event was not received within 4 seconds") + + # Check the right message was received + assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + + # Check the segment + segments = last_message.get("segments", []) + assert len(segments) == 1 + seg0 = segments[0] + assert seg0["speaker_id"] == "S1" + assert seg0["text"] == "Welcome to Speechmatics" + assert f"{seg0['speaker_id']}: {seg0['text']}" == "S1: Welcome to Speechmatics" + + # Stop the queue + client._stop_stt_queue() + + +@pytest.mark.asyncio +async def test_end_of_utterance_adaptive_vad(): + """Test EndOfUtterance from STT engine. 
+ + - send converstaion messages (realtime) + - wait for `EndOfUtterance` message from SDK (adaptive) + - check the interval to receive the EndOfUtterance message is within 25% of expected + """ + + # Test conversation + log = ConversationLog(os.path.join(os.path.dirname(__file__), "./assets/chat2.jsonl")) + chat = log.get_conversation(["Info", "RecognitionStarted", "AddPartialTranscript", "AddTranscript"]) + + # Start time + start_time = datetime.datetime.now() + + # Adaptive timeout + adaptive_timeout = 0.5 + + # Create a client + client = await get_client( + api_key="NONE", + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=adaptive_timeout, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + ), + ) + assert client is not None + + # Start the queue + client._start_stt_queue() + + # Event to wait + eot_received: asyncio.Event = asyncio.Event() + last_message: Optional[dict[str, Any]] = None + + # Time for last final (used to calculate the interval) + last_final_time: Optional[float] = None + receive_interval: Optional[float] = None + + # Transcript receiver + def transcript_rx(message: dict[str, Any]): + if not eot_received.is_set(): + nonlocal last_final_time + last_final_time = datetime.datetime.now() + + # End of turn receiver + def eot_rx(message: dict[str, Any]): + nonlocal last_message + nonlocal receive_interval + last_message = message + receive_interval = (datetime.datetime.now() - last_final_time).total_seconds() + eot_received.set() + + # Send a message from the conversation + async def send_message(idx: int, count: int = 1, use_ttl: bool = True): + for i in range(count): + # Get the message from the chat + message = chat[idx + i] + + # Wait for TTL to expire + if use_ttl: + ttl = (start_time + datetime.timedelta(seconds=message["ts"])) - datetime.datetime.now() + if ttl.total_seconds() > 0: + await asyncio.sleep(ttl.total_seconds()) + else: + await asyncio.sleep(0.005) + + # Emit the message + client.emit(message["payload"]["message"], message["payload"]) + + # Add listener for partials (as these will trigger the adaptive timer)) + client.on(AgentServerMessageType.ADD_PARTIAL_TRANSCRIPT, transcript_rx) + + # Add listener for end of turn + client.once(AgentServerMessageType.END_OF_TURN, eot_rx) + + # Debug + if SHOW_LOG: + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.ADD_SEGMENT, lambda message: print(message)) + client.on(AgentServerMessageType.START_OF_TURN, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, lambda message: print(message)) + client.on(AgentServerMessageType.END_OF_TURN, lambda message: print(message)) + + # Inject conversation up to the penultimate final from the STT + await send_message(0, count=12, use_ttl=True) + + # Check we have had a final + assert last_final_time is not None + + # Timing info + timeout = adaptive_timeout * 2.0 + + # Wait for EndOfUtterance + try: + await asyncio.wait_for(eot_received.wait(), timeout=timeout) + assert last_message is not None + except asyncio.TimeoutError: + pytest.fail(f"END_OF_TURN event was not received within {timeout} seconds") + + # Check the right message was received + assert last_message.get("message") == AgentServerMessageType.END_OF_TURN + + # Check the interval was within +/- 25% of the adaptive trigger of 0.5 the timeout (see client code) + # expected_min_interval = adaptive_timeout * 0.75 + # 
expected_max_interval = adaptive_timeout * 1.25 + # assert receive_interval >= expected_min_interval + # assert receive_interval <= expected_max_interval + + # Stop the queue + client._stop_stt_queue() diff --git a/tests/voice/test_06_stt_config.py b/tests/voice/test_06_stt_config.py new file mode 100644 index 0000000..cc7b3ca --- /dev/null +++ b/tests/voice/test_06_stt_config.py @@ -0,0 +1,8 @@ +import pytest + + +@pytest.mark.asyncio +async def test_no_partials(): + """Tests for STT config (no partials).""" + + pass diff --git a/tests/voice/test_07_languages.py b/tests/voice/test_07_languages.py new file mode 100644 index 0000000..759e67b --- /dev/null +++ b/tests/voice/test_07_languages.py @@ -0,0 +1,211 @@ +import asyncio +import datetime +import json +import os +from dataclasses import dataclass +from dataclasses import field +from typing import Any +from typing import Optional + +import pytest +from _utils import get_client +from _utils import send_audio_file + +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._utils import TextUtils + +# Skip for CI testing +pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping language tests in CI") + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") +URL = "wss://eu2.rt.speechmatics.com/v2" + + +@dataclass +class AudioSample: + language: str + path: str + transcript: str + sentence_break: str = " " + use_cer: bool = False + cer_pass: float = 0.05 + vocab: list[str] = field(default_factory=list) + + +SAMPLES: list[AudioSample] = [ + AudioSample( + language="fr", + path="./assets/languages/fr_fr_000378.wav", + transcript=( + "la partie extérieure que nous voyons lorsque nous regardons le soleil " + "s’appelle la photosphère ce qui signifie « boule de lumière »" + ), + ), + AudioSample( + language="de", + path="./assets/languages/de_de_000675.wav", + transcript=( + "Die Einreise in das südliche Afrika mit dem Auto ist eine erstaunliche Möglichkeit, " + "die ganze Schönheit der Region zu sehen und an Orte abseits der normalen Touristenrouten zu gelangen." + ), + ), + AudioSample( + language="es", + path="./assets/languages/es_419_000896.wav", + transcript=( + "Es esencial que cuente, cuando menos, con calzado con suelas apropiadas. " + "Los zapatos de verano por lo general resbalan mucho en el hielo y la nieve, " + "incluso hay botas de invierno que no son adecuadas." 
+ ), + ), + AudioSample( + language="he", + path="./assets/languages/he_il_000432.wav", + transcript="טורקיה מוקפת ים משלושה כיוונים: הים האגאי ממערב, הים השחור מצפון והים התיכון מדרום.", + sentence_break="", + ), + AudioSample( + language="cmn", + path="./assets/languages/cmn_hans_cn_000328.wav", + transcript="博贝克出生于克罗地亚首都萨格勒布,在为贝尔格莱德游击队足球俱乐部效力时成名。", + sentence_break="", + use_cer=True, + ), + AudioSample( + language="ja", + path="./assets/languages/ja_jp_000595.wav", + transcript="動物は地球上のいたるところに生息しています。地面を掘ったり、海を泳ぎ回ったり、空を飛んだりしています。", + sentence_break="", + use_cer=True, + cer_pass=0.07, + ), + AudioSample( + language="th", + path="./assets/languages/th_th_000208.wav", + transcript="ข้สภาพอากาศเลวร้ายที่เป็นสาเหตุของการยกเลิกการลงจอดทำให้การค้นหายากลำบาก", + sentence_break="", + use_cer=True, + cer_pass=0.03, + ), +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("sample", SAMPLES, ids=lambda s: f"{s.language}:{s.path}") +async def test_transcribe_languages(sample: AudioSample): + """Test foreign language transcription. + + This test will: + - use samples from the FLEURS dataset + - use different languages + - compare the normalized transcriptions with the reference transcription + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Client + client = await get_client( + api_key=API_KEY, + url=URL, + connect=False, + config=VoiceAgentConfig( + max_delay=1.2, + end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL, + language=sample.language, + additional_vocab=[AdditionalVocabEntry(content=vocab) for vocab in sample.vocab], + ), + ) + assert client is not None + + # Create an event to track when the callback is called + messages: list[str] = [] + bytes_sent: int = 0 + last_message: Optional[dict[str, Any]] = None + + # Segments + segments: list[dict[str, Any]] = [] + + # Start time + start_time = datetime.datetime.now() + + # Bytes logger + def log_bytes_sent(bytes): + nonlocal bytes_sent + bytes_sent += bytes + + # Callback for each message + def log_message(message): + nonlocal last_message + last_message = message + ts = (datetime.datetime.now() - start_time).total_seconds() + audio_ts = bytes_sent / 16000 / 2 + log = json.dumps({"ts": round(ts, 3), "audio_ts": round(audio_ts, 2), "payload": message}) + messages.append(log) + + # Log a segment + def log_segment(message): + segments.extend(message["segments"]) + + # Add listeners + client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_segment) + + # Load the audio file + audio_file = sample.path + + # Connect + await client.connect() + + # Check we are connected + assert client._is_connected + + # Individual payloads + await send_audio_file(client, audio_file, progress_callback=log_bytes_sent) + + # Send finalize + await asyncio.sleep(1.5) + client.finalize() + await asyncio.sleep(1.5) + + # Extract the last message + assert last_message.get("message") == AgentServerMessageType.ADD_SEGMENT + + # Check the segment + assert len(segments) >= 1 + seg0 = segments[0] + + # Check language + assert seg0.get("language") == sample.language + + # Concatenate text from segments + transcribed = sample.sentence_break.join([seg["text"] for seg in segments]) + + # Get normalized versions of the transcription and reference + str_original = TextUtils.normalize(sample.transcript) + str_transcribed = TextUtils.normalize(transcribed) + str_cer = TextUtils.cer(str_original, str_transcribed) + + 
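+    # Scoring note (CER definition assumed): the character error rate is the
+    # character-level edit distance between the normalized reference and the
+    # normalized transcription, divided by the length of the reference, so one
+    # substitution in a 100-character reference gives a CER of 0.01. Samples with
+    # `use_cer=True` (cmn, ja, th) are scored this way because their scripts have
+    # no word boundaries; all other samples must match the reference exactly after
+    # normalization.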
# Assert the CER + if sample.use_cer: + ok = str_cer < sample.cer_pass # < 5% CER acceptable (default) + else: + ok = str_original == str_transcribed # Exact match required + + # Compare transcriptions + if not ok: + print("\n".join(messages)) + print(f"Original: [{str_original}]") + print(f"Transcribed: [{str_transcribed}]") + print(f"CER: {str_cer}") + raise AssertionError("Transcription does not match original") + + # Close session + await client.disconnect() + assert not client._is_connected diff --git a/tests/voice/test_08_multiple_speakers.py b/tests/voice/test_08_multiple_speakers.py new file mode 100644 index 0000000..adbebd5 --- /dev/null +++ b/tests/voice/test_08_multiple_speakers.py @@ -0,0 +1,208 @@ +import datetime +import json +import os +import re +from dataclasses import field +from typing import Optional + +import pytest +from _utils import get_client +from _utils import send_audio_file +from pydantic import BaseModel + +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SpeakerFocusConfig +from speechmatics.voice import SpeakerFocusMode +from speechmatics.voice import SpeechSegmentConfig +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._models import SpeakerSegment + +# Skip for CI testing +pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping diarization tests in CI") + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") +SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] + + +class SpeakerTest(BaseModel): + id: str + path: str + sample_rate: int = 16000 + sample_size: int = 2 + segment_regex: list[str] = field(default_factory=list) + config: Optional[VoiceAgentConfig] = None + speaker_config: Optional[SpeakerFocusConfig] = None + speakers_present: list[str] = field(default_factory=list) + + +SAMPLES: list[SpeakerTest] = [ + SpeakerTest( + id="multiple_speakers", + path="./assets/audio_02_8kHz.wav", + sample_rate=8000, + segment_regex=["^Welcome to GeoRouter", "Buckingham", "clarify", "Notting Hill", "Rickmansworth"], + speakers_present=["S1", "S2"], + ), + SpeakerTest( + id="focus_s2", + path="./assets/audio_02_8kHz.wav", + sample_rate=8000, + segment_regex=["^Welcome to GeoRouter", "Buckingham", "clarify", "Notting Hill"], + speaker_config=SpeakerFocusConfig( + focus_speakers=["S2"], + ), + speakers_present=["S1", "S2"], + ), + SpeakerTest( + id="only_s2", + path="./assets/audio_02_8kHz.wav", + sample_rate=8000, + segment_regex=["Buckingham", "Notting Hill"], + speaker_config=SpeakerFocusConfig( + focus_speakers=["S2"], + focus_mode=SpeakerFocusMode.IGNORE, + ), + speakers_present=["S2"], + ), + SpeakerTest( + id="ignore_s2", + path="./assets/audio_02_8kHz.wav", + sample_rate=8000, + segment_regex=["^Welcome to GeoRouter", "clarify", "Rickmansworth"], + speaker_config=SpeakerFocusConfig( + ignore_speakers=["S2"], + ), + speakers_present=["S1"], + ), +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("sample", SAMPLES, ids=lambda s: f"{s.id}:{s.path}") +async def test_multiple_speakers(sample: SpeakerTest): + """Test transcription. 
+ + This test will: + - log messages + - transcribe audio with diarization config + - validate the segments received + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Config + config = sample.config or VoiceAgentConfig( + end_of_utterance_silence_trigger=1.0, + max_delay=2.0, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + additional_vocab=[ + AdditionalVocabEntry(content="GeoRouter"), + ], + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + ) + + # Diarization options + if sample.speaker_config: + config.speaker_config = sample.speaker_config + + # Standard features + config.enable_diarization = True + config.sample_rate = sample.sample_rate + + # Client + client = await get_client( + api_key=API_KEY, + connect=False, + config=config, + ) + + # Create an event to track when the callback is called + messages: list[str] = [] + bytes_sent: int = 0 + final_segments: list[dict] = [] + + # Start time + start_time = datetime.datetime.now() + + # Bytes logger + def log_bytes_sent(bytes): + nonlocal bytes_sent + bytes_sent += bytes + + # Callback for each message + def log_message(message): + ts = (datetime.datetime.now() - start_time).total_seconds() + audio_ts = bytes_sent / sample.sample_rate / sample.sample_size + log = json.dumps({"ts": round(ts, 3), "audio_ts": round(audio_ts, 2), "payload": message}) + messages.append(log) + if SHOW_LOG: + print(log) + + # Log final segments + def log_final_segment(message): + segments: list[SpeakerSegment] = message["segments"] + final_segments.extend(segments) + + # Add listeners + client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.once(AgentServerMessageType.INFO, log_message) + client.on(AgentServerMessageType.WARNING, log_message) + client.on(AgentServerMessageType.ERROR, log_message) + client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + client.on(AgentServerMessageType.SPEAKER_STARTED, log_message) + client.on(AgentServerMessageType.SPEAKER_ENDED, log_message) + client.on(AgentServerMessageType.END_OF_TURN, log_message) + + # Log ADD_SEGMENT + client.on(AgentServerMessageType.ADD_SEGMENT, log_final_segment) + + # HEADER + if SHOW_LOG: + print() + print() + print("---") + log_message({"message": "Sample", **sample.model_dump()}) + log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) + log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) + + # Connect + await client.connect() + + # Check we are connected + assert client._is_connected + + # Individual payloads + await send_audio_file( + client, + sample.path, + sample_rate=sample.sample_rate, + sample_size=sample.sample_size, + progress_callback=log_bytes_sent, + ) + + # FOOTER + if SHOW_LOG: + print("---") + print() + print() + + # Check final segments against regex + for idx, _test in enumerate(sample.segment_regex): + if SHOW_LOG: + print(f"`{_test}` -> `{final_segments[idx].get('text')}`") + assert re.search(_test, final_segments[idx].get("text"), flags=re.IGNORECASE | re.MULTILINE) + + # Check only speakers present + speakers = [segment.get("speaker_id") for segment in final_segments] + assert set(speakers) == set(sample.speakers_present) + + # Close session + await client.disconnect() + assert not client._is_connected diff --git a/tests/voice/test_09_speaker_id.py b/tests/voice/test_09_speaker_id.py 
new file mode 100644 index 0000000..9592984 --- /dev/null +++ b/tests/voice/test_09_speaker_id.py @@ -0,0 +1,313 @@ +import asyncio +import datetime +import json +import os +from typing import Optional + +import pytest +from _utils import get_client +from _utils import send_audio_file + +from speechmatics.rt import ClientMessageType +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SpeakerIdentifier +from speechmatics.voice import SpeechSegmentConfig +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._models import SpeakerSegment + +# Skip for CI testing +pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping speaker id tests in CI") + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") +URL: Optional[str] = "wss://eu2.rt.speechmatics.com/v2" +SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] + +# List of know speakers during tests +speaker_ids: list[SpeakerIdentifier] = [] + + +@pytest.mark.asyncio +async def test_extract_speaker_ids(): + """Test speaker id extraction. + + This test will: + - transcribe audio with diarization config + - get speaker ids for the two speakers + - uses legacy format until out of preview! + - this MUST use a preview endpoint + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Client + client = await get_client( + api_key=API_KEY, + url=URL, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=1.0, + max_delay=2.0, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + enable_diarization=True, + sample_rate=8000, + additional_vocab=[ + AdditionalVocabEntry(content="GeoRouter"), + ], + ), + ) + + # Create an event to track when the callback is called + messages: list[str] = [] + bytes_sent: int = 0 + + # Flag + speakers_event_received = asyncio.Event() + + # Start time + start_time = datetime.datetime.now() + + # Bytes logger + def log_bytes_sent(bytes): + nonlocal bytes_sent + bytes_sent += bytes + + # Callback for each message + def log_message(message): + ts = (datetime.datetime.now() - start_time).total_seconds() + audio_ts = bytes_sent / 8000 + log = json.dumps({"ts": round(ts, 3), "audio_ts": round(audio_ts, 2), "payload": message}) + messages.append(log) + if SHOW_LOG: + print(log) + + # Log speakers result + def save_speakers_result(message): + for speaker in message.get("speakers", []): + label: str = speaker.get("label") + speaker_identifiers: list[str] = speaker.get("speaker_identifiers", []) + + if not label or not speaker_identifiers: + continue + + speaker_ids.append( + SpeakerIdentifier( + label=label, + speaker_identifiers=speaker_identifiers, + ) + ) + + speakers_event_received.set() + + # Add listeners + client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message) + client.on(AgentServerMessageType.ADD_SEGMENT, log_message) + + # Log SPEAKERS_RESULT + client.once(AgentServerMessageType.SPEAKERS_RESULT, save_speakers_result) + + # HEADER + if SHOW_LOG: + print() + print() + print("---") + log_message({"message": "VoiceAgentConfig", **client._config.model_dump()}) + log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()}) + log_message({"message": "AudioFormat", **client._audio_format.to_dict()}) + + # Connect + try: + await client.connect() + except Exception: + pytest.skip(f"Failed to connect to server: {URL}") + + # Check we 
are connected + assert client._is_connected + + # Individual payloads + await send_audio_file(client, "./assets/audio_02_8kHz.wav", sample_rate=8000, progress_callback=log_bytes_sent) + + # Request the speakers result + await client.send_message({"message": ClientMessageType.GET_SPEAKERS}) + + # Wait for the callback with timeout + try: + await asyncio.wait_for(speakers_event_received.wait(), timeout=5.0) + except asyncio.TimeoutError: + pytest.fail("SPEAKERS_RESULT event was not received within 5 seconds of audio finish") + + # FOOTER + if SHOW_LOG: + print("---") + print() + print() + + # Check speaker IDs + assert speaker_ids + assert len(speaker_ids) == 2 + + # Close session + await client.disconnect() + assert not client._is_connected + + +@pytest.mark.asyncio +async def test_known_speakers(): + """Test using known speakers. + + This test will: + - use known speakers + - check names for speakers + - this MUST use a preview endpoint + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Copy known speakers + known_speakers = speaker_ids.copy() + known_speakers[0].label = "Assistant" + known_speakers[1].label = "John Doe" + + # Client + client = await get_client( + api_key=API_KEY, + url=URL, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=1.0, + max_delay=2.0, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + enable_diarization=True, + sample_rate=8000, + known_speakers=known_speakers, + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + additional_vocab=[ + AdditionalVocabEntry(content="GeoRouter"), + ], + ), + ) + + # Finalised segments + final_segments: list[dict] = [] + + # Log final segments + def log_final_segment(message): + segments: list[SpeakerSegment] = message["segments"] + final_segments.extend(segments) + + # Add listeners + client.on(AgentServerMessageType.ADD_SEGMENT, log_final_segment) + + # Connect + try: + await client.connect() + except Exception: + pytest.skip(f"Failed to connect to server: {URL}") + + # Check we are connected + assert client._is_connected + + # Individual payloads + await send_audio_file( + client, + "./assets/audio_02_8kHz.wav", + sample_rate=8000, + ) + + # Check only speakers present + speakers = [segment.get("speaker_id") for segment in final_segments] + assert set(speakers) == set({"Assistant", "John Doe"}) + + # Should be 5 segments + assert len(final_segments) == 5 + + # Close session + await client.disconnect() + assert not client._is_connected + + +@pytest.mark.asyncio +async def test_ignoring_assistant(): + """Test ignoring the assistant. 
+ + This test will: + - use known speakers + - set assistant to `__ASSISTANT__` + - this MUST use a preview endpoint + """ + + # API key + if not API_KEY: + pytest.skip("Valid API key required for test") + + # Copy known speakers + known_speakers = speaker_ids.copy() + known_speakers[0].label = "__ASSISTANT__" + known_speakers[1].label = "John Doe" + + # Client + client = await get_client( + api_key=API_KEY, + url=URL, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=1.0, + max_delay=2.0, + end_of_utterance_mode=EndOfUtteranceMode.ADAPTIVE, + enable_diarization=True, + sample_rate=8000, + known_speakers=known_speakers, + speech_segment_config=SpeechSegmentConfig(emit_sentences=False), + additional_vocab=[ + AdditionalVocabEntry(content="GeoRouter"), + ], + ), + ) + + # Finalised segments + final_segments: list[dict] = [] + + # Log final segments + def log_final_segment(message): + segments: list[SpeakerSegment] = message["segments"] + final_segments.extend(segments) + + # Add listeners + client.on(AgentServerMessageType.ADD_SEGMENT, log_final_segment) + + # Connect + try: + await client.connect() + except Exception: + pytest.skip(f"Failed to connect to server: {URL}") + + # Check we are connected + assert client._is_connected + + # Individual payloads + await send_audio_file( + client, + "./assets/audio_02_8kHz.wav", + sample_rate=8000, + ) + + # Check only speakers present + speakers = [segment.get("speaker_id") for segment in final_segments] + assert set(speakers) == set({"John Doe"}) + + # Should be only 2 segments + assert len(final_segments) == 2 + + # No segment should contain `Rickmansworth` + for segment in final_segments: + assert "Rickmansworth" not in segment.get("text", "") + + # Close session + await client.disconnect() + assert not client._is_connected diff --git a/tests/voice/test_10_finalize.py b/tests/voice/test_10_finalize.py new file mode 100644 index 0000000..247d46f --- /dev/null +++ b/tests/voice/test_10_finalize.py @@ -0,0 +1,139 @@ +import asyncio +import datetime +import json +import os + +import pytest +from _utils import get_client +from _utils import send_audio_file + +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import VoiceAgentConfig + +# Skip for CI testing +pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping finalization tests in CI") + +# Constants +API_KEY = os.getenv("SPEECHMATICS_API_KEY") +SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"] +AUDIO_FILE = "./assets/audio_05_16kHz.wav" + + +@pytest.mark.asyncio +async def test_finalize(): + """Test finalization. 
+
+    This test will:
+    - play a short audio clip
+    - finalize the segment
+    - this MUST use a preview / dev endpoint
+    """
+
+    # API key
+    api_key = os.getenv("SPEECHMATICS_API_KEY")
+    if not api_key:
+        pytest.skip("Valid API key required for test")
+
+    # Client
+    client = await get_client(
+        api_key=api_key,
+        connect=False,
+        config=VoiceAgentConfig(
+            end_of_utterance_silence_trigger=0.7,
+            max_delay=1.2,
+            end_of_utterance_mode=EndOfUtteranceMode.EXTERNAL,
+            use_forced_eou_message=True,
+        ),
+    )
+
+    # Accumulators for logged messages and bytes sent
+    messages: list[str] = []
+    bytes_sent: int = 0
+
+    # Flag
+    eot_received = asyncio.Event()
+
+    # Start time
+    start_time = datetime.datetime.now()
+
+    # Bytes logger
+    def log_bytes_sent(bytes):
+        nonlocal bytes_sent
+        bytes_sent += bytes
+
+    # Callback for each message
+    def log_message(message):
+        ts = (datetime.datetime.now() - start_time).total_seconds()
+        audio_ts = bytes_sent / 16000 / 2
+        log = json.dumps({"ts": round(ts, 3), "audio_ts": round(audio_ts, 2), "payload": message})
+        messages.append(log)
+        if SHOW_LOG:
+            print(log)
+
+    # EOT received
+    def eot_received_callback(message):
+        eot_received.set()
+
+    # Add listeners
+    client.once(AgentServerMessageType.RECOGNITION_STARTED, log_message)
+    client.on(AgentServerMessageType.INFO, log_message)
+    client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message)
+    client.on(AgentServerMessageType.ADD_SEGMENT, log_message)
+    client.on(AgentServerMessageType.END_OF_UTTERANCE, log_message)
+    client.on(AgentServerMessageType.START_OF_TURN, log_message)
+    client.on(AgentServerMessageType.END_OF_TURN, log_message)
+    client.on(AgentServerMessageType.END_OF_TRANSCRIPT, log_message)
+
+    # End of Turn
+    client.once(AgentServerMessageType.END_OF_TURN, eot_received_callback)
+
+    # HEADER
+    if SHOW_LOG:
+        print()
+        print()
+        print("---")
+        log_message({"message": "VoiceAgentConfig", **client._config.model_dump()})
+        log_message({"message": "TranscriptionConfig", **client._transcription_config.to_dict()})
+        log_message({"message": "AudioFormat", **client._audio_format.to_dict()})
+
+    # Connect
+    try:
+        await client.connect()
+    except Exception:
+        pytest.skip("Failed to connect to server")
+
+    # Check we are connected
+    assert client._is_connected
+
+    # Set chunk size
+    chunk_size = 160
+
+    asyncio.create_task(send_audio_file(client, AUDIO_FILE, chunk_size=chunk_size, progress_callback=log_bytes_sent))
+
+    # Stream roughly two seconds of audio before triggering finalization
+    await asyncio.sleep(2)
+
+    # Trigger finalization and record when it was requested
+    finalize_trigger_time = datetime.datetime.now()
+    client.finalize()
+
+    # Wait for the callback with timeout
+    try:
+        await asyncio.wait_for(eot_received.wait(), timeout=5.0)
+        finalize_latency = (datetime.datetime.now() - finalize_trigger_time).total_seconds() * 1000
+    except asyncio.TimeoutError:
+        pytest.fail("END_OF_TURN event was not received within 5 seconds of audio finish")
+
+    # FOOTER
+    if SHOW_LOG:
+        print(f"--- latency {finalize_latency:.2f} ms")
+        print()
+        print()
+
+    # Make sure latency is within bounds
+    assert finalize_latency < 500
+
+    # Close session
+    await client.disconnect()
+    assert not client._is_connected
diff --git a/tests/voice/test_11_audio_buffer.py b/tests/voice/test_11_audio_buffer.py
new file mode 100644
index 0000000..63f2fdc
--- /dev/null
+++ b/tests/voice/test_11_audio_buffer.py
@@ -0,0 +1,398 @@
+import asyncio
+import json
+import os
+import random
+import shutil
+import wave
+from typing import Optional
+
+import aiofiles
+import pytest
+from _utils import get_client
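+
+# Note on the buffer arithmetic used throughout this module: with sample_rate=16000,
+# sample_width=2 and frame_size=160, one frame is 10 ms of audio (320 bytes).
+# The tests below exercise AudioBuffer as a rolling window that keeps only the most
+# recent `total_seconds` of audio, so requests for frames older than that window are
+# expected to return no data.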
+from _utils import send_audio_file +from _utils import send_silence + +from speechmatics.voice import AdditionalVocabEntry +from speechmatics.voice import AgentServerMessageType +from speechmatics.voice import EndOfUtteranceMode +from speechmatics.voice import SmartTurnConfig +from speechmatics.voice import VoiceAgentConfig +from speechmatics.voice._audio import AudioBuffer + + +@pytest.mark.asyncio +async def test_clean_tmp(): + """Clear tmp directory""" + + # Output directory + tmp_dir = os.path.join(os.path.dirname(__file__), "./.tmp/buffer") + + # Clean tmp + if os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir, ignore_errors=True) + + # Re-create + os.makedirs(tmp_dir, exist_ok=True) + assert os.path.exists(tmp_dir) + + +@pytest.mark.asyncio +async def test_buffer(): + """Test AudioBuffer""" + + # Audio info + sample_rate = 16000 + sample_width = 2 + frame_size = 160 + frame_bytes = frame_size * sample_width + + # Create buffer + buffer = AudioBuffer(sample_rate=sample_rate, frame_size=frame_size, sample_width=sample_width, total_seconds=10.0) + + # Check zeros + assert buffer.total_frames == 0 + assert buffer.total_time == 0.0 + assert buffer.size == 0 + + # Add in 20 seconds of data + for _ in range(int(20.0 * sample_rate / frame_size)): + await buffer.put_frame(b"\x00" * frame_bytes) + + # Check values + assert buffer.total_frames == int(20.0 * sample_rate / frame_size) + assert buffer.total_time == 20.0 + assert buffer.size == int(10.0 * sample_rate / frame_size) + + # Check frame >< time conversion + assert buffer._get_frame_from_time(buffer._get_time_from_frame(1234)) == 1234 + + # Get data from more than 10 seconds ago + data = await buffer.get_frames(2.5, 7.5) + assert len(data) == 0 + + # Get a 5 second slice from 12.5 seconds in + data = await buffer.get_frames(12.5, 17.5) + assert len(data) == int(5.0 * sample_rate / frame_size) * frame_bytes + + +@pytest.mark.asyncio +async def test_buffer_bytes(): + """Test AudioBuffer with byte payloads""" + + # Audio info + sample_rate = 16000 + sample_width = 2 + frame_size = 160 + frame_bytes = frame_size * sample_width + + # Create buffer + buffer = AudioBuffer(sample_rate=sample_rate, frame_size=frame_size, sample_width=sample_width, total_seconds=10.0) + + # Check zeros + assert buffer.total_frames == 0 + assert buffer.total_time == 0.0 + assert buffer.size == 0 + + # 20 seconds of frames + twenty_second_frame_count = int(20.0 * sample_rate / frame_size) + + # Fill with random payloads of data + while buffer.total_frames < twenty_second_frame_count - 1: + await buffer.put_bytes(b"\x00" * random.randint(1, frame_bytes)) + + # Add one last frame of zeros + await buffer.put_frame(b"\xff" * frame_bytes) + + # Check values + assert buffer.total_frames == int(20.0 * sample_rate / frame_size) + assert buffer.total_time == 20.0 + assert buffer.size == int(10.0 * sample_rate / frame_size) + + # Check frame >< time conversion + assert buffer._get_frame_from_time(buffer._get_time_from_frame(1234)) == 1234 + + # Get data from more than 10 seconds ago + data = await buffer.get_frames(2.5, 7.5) + assert len(data) == 0 + + # Get a 5 second slice from 12.5 seconds in + data = await buffer.get_frames(12.5, 17.5) + assert len(data) == int(5.0 * sample_rate / frame_size) * frame_bytes + + # Get most recent frame + start_time = buffer.total_time - (frame_size / sample_rate) + end_time = buffer.total_time + + # Extract data + data = await buffer.get_frames(start_time, end_time) + + # Test + assert len(data) == int((end_time - start_time) * sample_rate 
/ frame_size) * frame_bytes + + +@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI") +@pytest.mark.asyncio +async def test_load_audio_file(): + """Test loading audio file into buffer""" + + # API key + api_key = os.getenv("SPEECHMATICS_API_KEY") + if not api_key: + pytest.skip("Valid API key required for test") + + # File + file = "audio_01_16kHz" + + # Check file exists + input_file = os.path.join(os.path.dirname(__file__), f"./assets/{file}.wav") + assert os.path.exists(input_file) + + # Output file + output_file = os.path.join(os.path.dirname(__file__), f"./.tmp/buffer/{file}_slice.wav") + output_folder = os.path.dirname(output_file) + os.makedirs(output_folder, exist_ok=True) + assert os.path.exists(output_folder) + + # Audio info + sample_rate = 16000 + sample_width = 2 + frame_size = 160 + frame_bytes = frame_size * sample_width + + # Create buffer + buffer = AudioBuffer(sample_rate=sample_rate, frame_size=frame_size, sample_width=sample_width, total_seconds=35.0) + + # Load the file + async with aiofiles.open(input_file, "rb") as wav_file: + await wav_file.seek(44) + while True: + chunk = await wav_file.read(frame_bytes) + if not chunk: + break + await buffer.put_frame(chunk) + + # Slice + slice_start = 3.52 + slice_end = 6.96 + + # Get a 5 second slice + data = await buffer.get_frames(slice_start, slice_end) + + # Lengths - calculate expected using same logic as buffer + start_frame = buffer._get_frame_from_time(slice_start) + end_frame = buffer._get_frame_from_time(slice_end) + + # Check length + assert len(data) == (end_frame - start_frame) * frame_bytes + + # Write bytes to a temporary WAV file + with wave.open(output_file, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(sample_rate) + wav_file.writeframes(data) + + +@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI") +@pytest.mark.asyncio +async def test_transcribe_and_slice(): + """Load, transcribe and slice an audio file""" + + # API key + api_key = os.getenv("SPEECHMATICS_API_KEY") + if not api_key: + pytest.skip("Valid API key required for test") + + # Input file + file = "audio_01_16kHz" + + # Check file exists + input_file = os.path.join(os.path.dirname(__file__), f"./assets/{file}.wav") + assert os.path.exists(input_file) + + # Output directory + output_folder = os.path.join(os.path.dirname(__file__), "./.tmp/buffer") + os.makedirs(output_folder, exist_ok=True) + assert os.path.exists(output_folder) + + # Exceptions + exceptions: Exception = [] + + # Save a slice + async def save_slice( + start_time: float, end_time: float, prefix: str = "slice", json_data: Optional[str] = None + ) -> None: + try: + output_file = os.path.join(output_folder, f"{file}_{prefix}_{start_time:.2f}_{end_time:.2f}") + data = await client._audio_buffer.get_frames(start_time, end_time) + with wave.open(f"{output_file}.wav", "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(client._audio_buffer._sample_rate) + wav_file.writeframes(data) + if json_data: + with open(f"{output_file}.json", "w") as json_file: + json_file.write(json_data) + except Exception as e: + exceptions.append(e) + + # Client + client = await get_client( + api_key=api_key, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=0.35, + max_delay=0.7, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + enable_diarization=True, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), 
+ ], + smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0), + ), + ) + + # Check audio buffer is enabled + assert client._audio_buffer + + # Bytes logger + def final_segment(message): + try: + segments = message.get("segments", []) + assert segments + + for segment in segments: + start_time = segment["metadata"]["start_time"] + end_time = segment["metadata"]["end_time"] + speaker_id = segment["speaker_id"] + asyncio.create_task( + save_slice( + start_time=start_time, + end_time=end_time, + prefix=speaker_id, + json_data=json.dumps(segment, indent=2), + ) + ) + + except Exception as e: + exceptions.append(e) + + # Add listeners + client.on(AgentServerMessageType.ADD_SEGMENT, final_segment) + + # Connect + await client.connect() + + # Check we are connected + assert client._is_connected + + # Send audio + await send_audio_file(client, input_file) + await send_silence(client, 2.0) + + # Close session + await client.disconnect() + assert not client._is_connected + + # Check exceptions + assert not exceptions + + +@pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping in CI") +@pytest.mark.asyncio +async def x_test_transcribe_and_slice_vad(): + """Load, transcribe and slice an audio file using VAD""" + + # API key + api_key = os.getenv("SPEECHMATICS_API_KEY") + if not api_key: + pytest.skip("Valid API key required for test") + + # Input file + file = "audio_06_16kHz" + + # Check file exists + input_file = os.path.join(os.path.dirname(__file__), f"./assets/{file}.wav") + assert os.path.exists(input_file) + + # Output directory + output_folder = os.path.join(os.path.dirname(__file__), "./.tmp/buffer") + os.makedirs(output_folder, exist_ok=True) + assert os.path.exists(output_folder) + + # Exceptions + exceptions: Exception = [] + + # Save a slice + async def save_slice( + start_time: float, end_time: float, prefix: str = "slice", json_data: Optional[str] = None + ) -> None: + try: + output_file = os.path.join(output_folder, f"{file}_{prefix}_{start_time:.2f}_{end_time:.2f}") + data = await client._audio_buffer.get_frames(start_time, end_time) + with wave.open(f"{output_file}.wav", "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(client._audio_buffer._sample_rate) + wav_file.writeframes(data) + if json_data: + with open(f"{output_file}.json", "w") as json_file: + json_file.write(json_data) + except Exception as e: + exceptions.append(e) + + # Client + client = await get_client( + api_key=api_key, + connect=False, + config=VoiceAgentConfig( + end_of_utterance_silence_trigger=0.35, + max_delay=0.7, + end_of_utterance_mode=EndOfUtteranceMode.FIXED, + enable_diarization=True, + additional_vocab=[ + AdditionalVocabEntry(content="Speechmatics", sounds_like=["speech matics"]), + ], + smart_turn_config=SmartTurnConfig(audio_buffer_length=20.0), + ), + ) + + # Check audio buffer is enabled + assert client._audio_buffer + + # Bytes logger + def speaker_ended(message): + try: + end_time = message["time"] + speaker_id = message["speaker_id"] + asyncio.create_task( + save_slice( + start_time=end_time - 0.2, + end_time=end_time + 0.4, + prefix=speaker_id, + json_data=json.dumps(message, indent=2), + ) + ) + + except Exception as e: + exceptions.append(e) + + # Add listeners + client.on(AgentServerMessageType.SPEAKER_ENDED, speaker_ended) + + # Connect + await client.connect() + + # Check we are connected + assert client._is_connected + + # Send audio + await send_audio_file(client, input_file) + await send_silence(client, 2.0) + + # Close session 
+    await client.disconnect()
+    assert not client._is_connected
+
+    # Check exceptions
+    assert not exceptions
diff --git a/tests/voice/test_12_smart_turn_with_files.py b/tests/voice/test_12_smart_turn_with_files.py
new file mode 100644
index 0000000..553b6ab
--- /dev/null
+++ b/tests/voice/test_12_smart_turn_with_files.py
@@ -0,0 +1,87 @@
+import os
+
+import pytest
+from _utils import load_audio_file
+from pydantic import BaseModel
+
+from speechmatics.voice._smart_turn import SmartTurnDetector
+from speechmatics.voice._smart_turn import SmartTurnPredictionResult
+
+# Skip for CI testing
+pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping smart turn tests in CI")
+
+# Detector
+detector = SmartTurnDetector(auto_init=False, threshold=0.75)
+
+
+class PredictionTest(BaseModel):
+    id: str
+    path: str
+    language: str
+    expected: SmartTurnPredictionResult
+
+
+SAMPLES: list[PredictionTest] = [
+    PredictionTest(
+        id="01",
+        path="./assets/smart_turn/01_false_16kHz.wav",
+        language="en",
+        expected=SmartTurnPredictionResult(
+            prediction=False,
+            probability=0.095,
+        ),
+    ),
+    PredictionTest(
+        id="02",
+        path="./assets/smart_turn/02_false_16kHz.wav",
+        language="en",
+        expected=SmartTurnPredictionResult(
+            prediction=False,
+            probability=0.011,
+        ),
+    ),
+    PredictionTest(
+        id="03",
+        path="./assets/smart_turn/03_true_16kHz.wav",
+        language="en",
+        expected=SmartTurnPredictionResult(
+            prediction=True,
+            probability=0.892,
+        ),
+    ),
+]
+
+
+@pytest.mark.asyncio
+async def test_onnx_model():
+    """Download ONNX model"""
+
+    # Initialize
+    detector.setup()
+
+    # Check exists
+    assert detector.model_exists()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("sample", SAMPLES, ids=lambda s: f"{s.id}:{s.path}")
+async def test_prediction(sample: PredictionTest):
+    """Test prediction"""
+
+    # Load an audio snippet
+    bytes_array = await load_audio_file(sample.path)
+
+    # Run an inference
+    result = await detector.predict(bytes_array, language=sample.language, sample_rate=16000, sample_width=2)
+
+    # Processing time < 100ms
+    assert result.processing_time < 0.1
+
+    # Check result
+    assert result.prediction == sample.expected.prediction
+
+    # Probability within ±0.05 of expected
+    assert (
+        result.probability >= sample.expected.probability - 0.05
+        and result.probability <= sample.expected.probability + 0.05
+    )
diff --git a/tests/voice/test_13_smart_turn_transcribe.py b/tests/voice/test_13_smart_turn_transcribe.py
new file mode 100644
index 0000000..9283c33
--- /dev/null
+++ b/tests/voice/test_13_smart_turn_transcribe.py
@@ -0,0 +1,166 @@
+import datetime
+import json
+import os
+import shutil
+
+import pytest
+from _utils import get_client
+from _utils import send_audio_file
+from pydantic import BaseModel
+from pydantic import Field
+
+from speechmatics.voice import AdditionalVocabEntry
+from speechmatics.voice import AgentServerMessageType
+from speechmatics.voice import EndOfUtteranceMode
+from speechmatics.voice import SpeechSegmentConfig
+from speechmatics.voice import VoiceAgentConfig
+from speechmatics.voice._smart_turn import SmartTurnDetector
+
+# Skip for CI testing
+pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skipping smart turn tests in CI")
+
+
+# Constants
+API_KEY = os.getenv("SPEECHMATICS_API_KEY")
+SHOW_LOG = os.getenv("SPEECHMATICS_SHOW_LOG", "0").lower() in ["1", "true"]
+
+
+# Detector
+detector = SmartTurnDetector(auto_init=False, threshold=0.75)
+
+
+class TranscriptionTest(BaseModel):
+    id: str
+    path: str
+    sample_rate: int
+    language: str
+    eot_count: int
+    additional_vocab: list[AdditionalVocabEntry] = Field(default_factory=list)
+
+
+SAMPLES: list[TranscriptionTest] = [
+    TranscriptionTest(id="01", path="./assets/audio_04_16kHz.wav", sample_rate=16000, language="en", eot_count=2),
+    TranscriptionTest(id="02", path="./assets/audio_05_16kHz.wav", sample_rate=16000, language="en", eot_count=1),
+    TranscriptionTest(id="03", path="./assets/audio_06_16kHz.wav", sample_rate=16000, language="en", eot_count=1),
+]
+
+
+@pytest.mark.asyncio
+async def test_clean_tmp():
+    """Clear tmp directory"""
+
+    # Output directory
+    tmp_dir = os.path.join(os.path.dirname(__file__), "./.tmp/turn")
+
+    # Clean tmp
+    if os.path.exists(tmp_dir):
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+    # Re-create
+    os.makedirs(tmp_dir, exist_ok=True)
+    assert os.path.exists(tmp_dir)
+
+
+@pytest.mark.asyncio
+async def test_onnx_model():
+    """Download ONNX model"""
+
+    # Initialize
+    detector.setup()
+
+    # Check exists
+    assert detector.model_exists()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("sample", SAMPLES, ids=lambda s: f"{s.id}:{s.path}")
+async def test_prediction(sample: TranscriptionTest):
+    """Test transcription and prediction"""
+
+    # API key
+    api_key = os.getenv("SPEECHMATICS_API_KEY")
+    if not api_key:
+        pytest.skip("Valid API key required for test")
+
+    # Start time
+    start_time = datetime.datetime.now()
+
+    # Results
+    eot_count: int = 0
+
+    # Client
+    client = await get_client(
+        api_key=api_key,
+        connect=False,
+        config=VoiceAgentConfig(
+            max_delay=0.7,
+            end_of_utterance_mode=EndOfUtteranceMode.SMART_TURN,
+            end_of_utterance_silence_trigger=0.5,
+            enable_diarization=True,
+            sample_rate=sample.sample_rate,
+            additional_vocab=sample.additional_vocab,
+            use_forced_eou_message=True,
+            speech_segment_config=SpeechSegmentConfig(emit_sentences=False),
+        ),
+    )
+
+    # EOT detected
+    def eot_detected(message):
+        nonlocal eot_count
+        eot_count += 1
+
+    # Callback for each message
+    def log_message(message):
+        ts = (datetime.datetime.now() - start_time).total_seconds()
+        log = json.dumps({"ts": round(ts, 3), "payload": message})
+        if SHOW_LOG:
+            print(log)
+
+    # Add listeners
+    # client.on(AgentServerMessageType.RECOGNITION_STARTED, log_message)
+    # client.on(AgentServerMessageType.END_OF_TRANSCRIPT, log_message)
+    # client.on(AgentServerMessageType.ADD_PARTIAL_SEGMENT, log_message)
+    client.on(AgentServerMessageType.ADD_SEGMENT, log_message)
+    # client.on(AgentServerMessageType.SPEAKER_STARTED, log_message)
+    client.on(AgentServerMessageType.SPEAKER_ENDED, log_message)
+    # client.on(AgentServerMessageType.SPEAKER_METRICS, log_message)
+    client.on(AgentServerMessageType.END_OF_TURN_PREDICTION, log_message)
+    client.on(AgentServerMessageType.END_OF_TURN, log_message)
+
+    # Calculated end of turn count
+    client.on(AgentServerMessageType.END_OF_TURN, eot_detected)
+
+    # HEADER
+    if SHOW_LOG:
+        print()
+        print()
+        print("---")
+
+    # Connect
+    try:
+        await client.connect()
+    except Exception:
+        pytest.skip("Failed to connect to server")
+
+    # Check we are connected
+    assert client._is_connected
+
+    # Individual payloads
+    await send_audio_file(client, sample.path)
+
+    # FOOTER
+    if SHOW_LOG:
+        print("---")
+        print()
+        print()
+
+    # Close session
+    await client.disconnect()
+    assert not client._is_connected
+
+    # Debug count
+    # print(eot_count)
+
+    # Validate (if we have expected results)
+    # if sample.eot_count:
+    #     assert eot_count == sample.eot_count
diff --git a/tests/voice/test_14_presets.py b/tests/voice/test_14_presets.py
new file mode 100644
index 0000000..1669392
--- /dev/null
+++ b/tests/voice/test_14_presets.py
@@ -0,0 +1,58 @@
+import pytest
+
+from speechmatics.voice import VoiceAgentConfig
+from speechmatics.voice._models import OperatingPoint
+from speechmatics.voice._models import SpeechSegmentConfig
+from speechmatics.voice._presets import VoiceAgentConfigPreset
+
+
+@pytest.mark.asyncio
+async def test_presets():
+    """Test VoiceAgentConfigPreset presets."""
+
+    # Create a preset
+    preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY()
+    assert preset is not None
+    assert preset.speech_segment_config.emit_sentences is True
+
+    # Overlay #1
+    preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY(
+        VoiceAgentConfig(max_delay=12.34, enable_diarization=False)
+    )
+    assert preset is not None
+    assert preset.max_delay == 12.34
+    assert preset.enable_diarization is False
+
+    # Overlay #2
+    preset: VoiceAgentConfig = VoiceAgentConfigPreset.LOW_LATENCY(
+        VoiceAgentConfig(speech_segment_config=SpeechSegmentConfig(emit_sentences=False))
+    )
+    assert preset is not None
+    assert preset.enable_diarization is True
+    assert preset.speech_segment_config.emit_sentences is False
+
+    # Preset names
+    presets = VoiceAgentConfigPreset.list_presets()
+    assert "low_latency" in presets
+
+    # Get a preset by name
+    preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency")
+    assert preset is not None
+
+
+@pytest.mark.asyncio
+async def test_json_presets():
+    """Test VoiceAgentConfigPreset JSON presets."""
+
+    # With a JSON string overlay
+    preset: VoiceAgentConfig = VoiceAgentConfigPreset.load("low_latency", '{"operating_point": "enhanced"}')
+    assert preset is not None
+    assert preset.operating_point == OperatingPoint.ENHANCED
+
+    # Check using incorrect preset name
+    with pytest.raises(ValueError):
+        VoiceAgentConfigPreset.load("invalid_preset")
+
+    # Check with invalid overlay
+    with pytest.raises(ValueError):
+        VoiceAgentConfigPreset.load("low_latency", '{"invalid": "value"}')