diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8edc9657..7e3cb47a 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 10.0.0 +current_version = 11.0.0 commit = True message = Bump version: {current_version} → {new_version} [skip ci] diff --git a/ibm_watson/speech_to_text_v1.py b/ibm_watson/speech_to_text_v1.py index 1f413afb..64e60064 100644 --- a/ibm_watson/speech_to_text_v1.py +++ b/ibm_watson/speech_to_text_v1.py @@ -218,6 +218,7 @@ def recognize( end_of_phrase_silence_time: Optional[float] = None, split_transcript_at_phrase_end: Optional[bool] = None, speech_detector_sensitivity: Optional[float] = None, + sad_module: Optional[int] = None, background_audio_suppression: Optional[float] = None, low_latency: Optional[bool] = None, character_insertion_bias: Optional[float] = None, @@ -351,8 +352,9 @@ def recognize( activity is detected in the stream. This can be used both in standard and low latency mode. This feature enables client applications to know that some words/speech has been detected and the service is in the process of - decoding. This can be used in lieu of interim results in standard mode. See - [Using speech recognition + decoding. This can be used in lieu of interim results in standard mode. Use + `sad_module: 2` to increase accuracy and performance in detecting speech + boundaries within the audio stream. See [Using speech recognition parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters). :param str language_customization_id: (optional) The customization ID (GUID) of a custom language model that is to be used with the recognition @@ -555,6 +557,12 @@ def recognize( sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity) and [Language model support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). 
+ :param int sad_module: (optional) Detects speech boundaries within the + audio stream with better performance, improved noise suppression, faster + responsiveness, and increased accuracy. + Specify `sad_module: 2`. + See [Speech Activity Detection + (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). :param float background_audio_suppression: (optional) The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side @@ -647,6 +655,7 @@ def recognize( 'end_of_phrase_silence_time': end_of_phrase_silence_time, 'split_transcript_at_phrase_end': split_transcript_at_phrase_end, 'speech_detector_sensitivity': speech_detector_sensitivity, + 'sad_module': sad_module, 'background_audio_suppression': background_audio_suppression, 'low_latency': low_latency, 'character_insertion_bias': character_insertion_bias, @@ -845,6 +854,7 @@ def create_job( end_of_phrase_silence_time: Optional[float] = None, split_transcript_at_phrase_end: Optional[bool] = None, speech_detector_sensitivity: Optional[float] = None, + sad_module: Optional[int] = None, background_audio_suppression: Optional[float] = None, low_latency: Optional[bool] = None, character_insertion_bias: Optional[float] = None, @@ -1244,6 +1254,12 @@ def create_job( sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity) and [Language model support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support). + :param int sad_module: (optional) Detects speech boundaries within the + audio stream with better performance, improved noise suppression, faster + responsiveness, and increased accuracy. + Specify `sad_module: 2`. + See [Speech Activity Detection + (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). 
:param float background_audio_suppression: (optional) The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side @@ -1341,6 +1357,7 @@ def create_job( 'end_of_phrase_silence_time': end_of_phrase_silence_time, 'split_transcript_at_phrase_end': split_transcript_at_phrase_end, 'speech_detector_sensitivity': speech_detector_sensitivity, + 'sad_module': sad_module, 'background_audio_suppression': background_audio_suppression, 'low_latency': low_latency, 'character_insertion_bias': character_insertion_bias, diff --git a/ibm_watson/speech_to_text_v1_adapter.py b/ibm_watson/speech_to_text_v1_adapter.py index dabe6526..5f3b3969 100644 --- a/ibm_watson/speech_to_text_v1_adapter.py +++ b/ibm_watson/speech_to_text_v1_adapter.py @@ -57,6 +57,7 @@ def recognize_using_websocket(self, background_audio_suppression=None, low_latency=None, character_insertion_bias=None, + sad_module=None, **kwargs): """ Sends audio for speech recognition using web sockets. @@ -309,6 +310,12 @@ def recognize_using_websocket(self, `Narrowband` models. See [Character insertion bias](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#insertion-bias). + :param int sad_module: (optional) Detects speech boundaries within the + audio stream with better performance, improved noise suppression, faster + responsiveness, and increased accuracy. + Specify `sad_module: 2`. + See [Speech Activity Detection + (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad). :param dict headers: A `dict` containing the request headers :return: A `dict` containing the `SpeechRecognitionResults` response. 
:rtype: dict @@ -377,6 +384,7 @@ def recognize_using_websocket(self, 'background_audio_suppression': background_audio_suppression, 'character_insertion_bias': character_insertion_bias, 'low_latency': low_latency, + 'sad_module': sad_module, } options = {k: v for k, v in options.items() if v is not None} request['options'] = options diff --git a/ibm_watson/text_to_speech_v1.py b/ibm_watson/text_to_speech_v1.py index 5cdac13f..7748b75b 100644 --- a/ibm_watson/text_to_speech_v1.py +++ b/ibm_watson/text_to_speech_v1.py @@ -1808,22 +1808,29 @@ class Voice(str, Enum): DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice' EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive' EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive' + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural' EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice' + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural' EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive' EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice' + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural' EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice' EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive' EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice' EN_US_ELLIENATURAL = 'en-US_EllieNatural' EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice' EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive' + EN_US_EMMANATURAL = 'en-US_EmmaNatural' + EN_US_ETHANNATURAL = 'en-US_EthanNatural' EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice' + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural' EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice' EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive' EN_US_LISAV3VOICE = 'en-US_LisaV3Voice' EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive' EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice' EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice' + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural' ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice' ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice' ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive' @@ -1836,8 +1843,10 @@ class Voice(str, Enum): JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice' KO_KR_JINV3VOICE = 
'ko-KR_JinV3Voice' NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice' + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural' PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice' PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive' + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural' class SynthesizeEnums: @@ -1887,22 +1896,29 @@ class Voice(str, Enum): DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice' EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive' EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive' + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural' EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice' + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural' EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive' EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice' + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural' EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice' EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive' EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice' EN_US_ELLIENATURAL = 'en-US_EllieNatural' EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice' EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive' + EN_US_EMMANATURAL = 'en-US_EmmaNatural' + EN_US_ETHANNATURAL = 'en-US_EthanNatural' EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice' + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural' EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice' EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive' EN_US_LISAV3VOICE = 'en-US_LisaV3Voice' EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive' EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice' EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice' + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural' ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice' ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice' ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive' @@ -1915,8 +1931,10 @@ class Voice(str, Enum): JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice' KO_KR_JINV3VOICE = 'ko-KR_JinV3Voice' NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice' + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural' PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice' PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive' + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural' class 
SpellOutMode(str, Enum): """ @@ -1965,22 +1983,29 @@ class Voice(str, Enum): DE_DE_ERIKAV3VOICE = 'de-DE_ErikaV3Voice' EN_AU_HEIDIEXPRESSIVE = 'en-AU_HeidiExpressive' EN_AU_JACKEXPRESSIVE = 'en-AU_JackExpressive' + EN_CA_HANNAHNATURAL = 'en-CA_HannahNatural' EN_GB_CHARLOTTEV3VOICE = 'en-GB_CharlotteV3Voice' + EN_GB_CHLOENATURAL = 'en-GB_ChloeNatural' EN_GB_GEORGEEXPRESSIVE = 'en-GB_GeorgeExpressive' EN_GB_JAMESV3VOICE = 'en-GB_JamesV3Voice' + EN_GB_GEORGENATURAL = 'en-GB_GeorgeNatural' EN_GB_KATEV3VOICE = 'en-GB_KateV3Voice' EN_US_ALLISONEXPRESSIVE = 'en-US_AllisonExpressive' EN_US_ALLISONV3VOICE = 'en-US_AllisonV3Voice' EN_US_ELLIENATURAL = 'en-US_EllieNatural' EN_US_EMILYV3VOICE = 'en-US_EmilyV3Voice' EN_US_EMMAEXPRESSIVE = 'en-US_EmmaExpressive' + EN_US_EMMANATURAL = 'en-US_EmmaNatural' + EN_US_ETHANNATURAL = 'en-US_EthanNatural' EN_US_HENRYV3VOICE = 'en-US_HenryV3Voice' + EN_US_JACKSONNATURAL = 'en-US_JacksonNatural' EN_US_KEVINV3VOICE = 'en-US_KevinV3Voice' EN_US_LISAEXPRESSIVE = 'en-US_LisaExpressive' EN_US_LISAV3VOICE = 'en-US_LisaV3Voice' EN_US_MICHAELEXPRESSIVE = 'en-US_MichaelExpressive' EN_US_MICHAELV3VOICE = 'en-US_MichaelV3Voice' EN_US_OLIVIAV3VOICE = 'en-US_OliviaV3Voice' + EN_US_VICTORIANATURAL = 'en-US_VictoriaNatural' ES_ES_ENRIQUEV3VOICE = 'es-ES_EnriqueV3Voice' ES_ES_LAURAV3VOICE = 'es-ES_LauraV3Voice' ES_LA_DANIELAEXPRESSIVE = 'es-LA_DanielaExpressive' @@ -1993,8 +2018,10 @@ class Voice(str, Enum): JA_JP_EMIV3VOICE = 'ja-JP_EmiV3Voice' KO_KR_JINV3VOICE = 'ko-KR_JinV3Voice' NL_NL_MERELV3VOICE = 'nl-NL_MerelV3Voice' + PT_BR_CAMILANATURAL = 'pt-BR_CamilaNatural' PT_BR_ISABELAV3VOICE = 'pt-BR_IsabelaV3Voice' PT_BR_LUCASEXPRESSIVE = 'pt-BR_LucasExpressive' + PT_BR_LUCASNATURAL = 'pt-BR_LucasNatural' class Format(str, Enum): """ diff --git a/ibm_watson/version.py b/ibm_watson/version.py index 2e568bf4..344ed4c9 100644 --- a/ibm_watson/version.py +++ b/ibm_watson/version.py @@ -1 +1 @@ -__version__ = '10.0.0' +__version__ = '11.0.0' diff --git 
a/setup.py b/setup.py index 566a960d..c0dd4303 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# (C) Copyright IBM Corp. 2015, 2020. +# (C) Copyright IBM Corp. 2015, 2025. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ from setuptools import setup from os import path -__version__ = '10.0.0' +__version__ = '11.0.0' # read contents of README file this_directory = path.abspath(path.dirname(__file__)) diff --git a/test/unit/test_speech_to_text_v1.py b/test/unit/test_speech_to_text_v1.py index 658ae899..348bbb6c 100644 --- a/test/unit/test_speech_to_text_v1.py +++ b/test/unit/test_speech_to_text_v1.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# (C) Copyright IBM Corp. 2024. +# (C) Copyright IBM Corp. 2025. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -239,6 +239,7 @@ def test_recognize_all_params(self): end_of_phrase_silence_time = 0.8 split_transcript_at_phrase_end = False speech_detector_sensitivity = 0.5 + sad_module = 1 background_audio_suppression = 0.0 low_latency = False character_insertion_bias = 0.0 @@ -270,6 +271,7 @@ def test_recognize_all_params(self): end_of_phrase_silence_time=end_of_phrase_silence_time, split_transcript_at_phrase_end=split_transcript_at_phrase_end, speech_detector_sensitivity=speech_detector_sensitivity, + sad_module=sad_module, background_audio_suppression=background_audio_suppression, low_latency=low_latency, character_insertion_bias=character_insertion_bias, @@ -302,6 +304,7 @@ def test_recognize_all_params(self): assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string + 
assert 'sad_module={}'.format(sad_module) in query_string assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string # Validate body params @@ -663,6 +666,7 @@ def test_create_job_all_params(self): end_of_phrase_silence_time = 0.8 split_transcript_at_phrase_end = False speech_detector_sensitivity = 0.5 + sad_module = 1 background_audio_suppression = 0.0 low_latency = False character_insertion_bias = 0.0 @@ -699,6 +703,7 @@ def test_create_job_all_params(self): end_of_phrase_silence_time=end_of_phrase_silence_time, split_transcript_at_phrase_end=split_transcript_at_phrase_end, speech_detector_sensitivity=speech_detector_sensitivity, + sad_module=sad_module, background_audio_suppression=background_audio_suppression, low_latency=low_latency, character_insertion_bias=character_insertion_bias, @@ -735,6 +740,7 @@ def test_create_job_all_params(self): assert 'audio_metrics={}'.format('true' if audio_metrics else 'false') in query_string assert 'end_of_phrase_silence_time={}'.format(end_of_phrase_silence_time) in query_string assert 'split_transcript_at_phrase_end={}'.format('true' if split_transcript_at_phrase_end else 'false') in query_string + assert 'sad_module={}'.format(sad_module) in query_string assert 'low_latency={}'.format('true' if low_latency else 'false') in query_string # Validate body params