Skip to content

Commit 1ba9257

Browse files
committed
Fixed VAD and added VAD interruptions
1 parent 723542d commit 1ba9257

File tree

5 files changed

+95
-147
lines changed

5 files changed

+95
-147
lines changed

Assets/Prefabs/DemoIntegration.prefab

+11-11
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,7 @@ RectTransform:
526526
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
527527
m_AnchorMin: {x: 0, y: 1}
528528
m_AnchorMax: {x: 1, y: 1}
529-
m_AnchoredPosition: {x: 0, y: -308}
529+
m_AnchoredPosition: {x: 0, y: -272}
530530
m_SizeDelta: {x: -40, y: 1}
531531
m_Pivot: {x: 0.5, y: 0.5}
532532
--- !u!222 &4512565803369859169
@@ -676,7 +676,7 @@ RectTransform:
676676
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
677677
m_AnchorMin: {x: 0, y: 1}
678678
m_AnchorMax: {x: 1, y: 1}
679-
m_AnchoredPosition: {x: 0, y: -48}
679+
m_AnchoredPosition: {x: 0, y: -40}
680680
m_SizeDelta: {x: -40, y: 1}
681681
m_Pivot: {x: 0.5, y: 0.5}
682682
--- !u!222 &7929203979481334323
@@ -947,7 +947,7 @@ RectTransform:
947947
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
948948
m_AnchorMin: {x: 0, y: 1}
949949
m_AnchorMax: {x: 1, y: 1}
950-
m_AnchoredPosition: {x: 10, y: -108}
950+
m_AnchoredPosition: {x: 10, y: -100}
951951
m_SizeDelta: {x: -20, y: 50}
952952
m_Pivot: {x: 0.5, y: 0.5}
953953
--- !u!222 &1307506363146839489
@@ -1290,7 +1290,7 @@ RectTransform:
12901290
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
12911291
m_AnchorMin: {x: 0, y: 1}
12921292
m_AnchorMax: {x: 1, y: 1}
1293-
m_AnchoredPosition: {x: 10, y: -368}
1293+
m_AnchoredPosition: {x: 10, y: -332}
12941294
m_SizeDelta: {x: -20, y: 50}
12951295
m_Pivot: {x: 0.5, y: 0.5}
12961296
--- !u!222 &2909591103344910339
@@ -1422,9 +1422,9 @@ RectTransform:
14221422
m_Children: []
14231423
m_Father: {fileID: 5337751238952555557}
14241424
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
1425-
m_AnchorMin: {x: 0, y: 0}
1426-
m_AnchorMax: {x: 1, y: 0}
1427-
m_AnchoredPosition: {x: 0, y: 72}
1425+
m_AnchorMin: {x: 0, y: 1}
1426+
m_AnchorMax: {x: 1, y: 1}
1427+
m_AnchoredPosition: {x: 0, y: -504}
14281428
m_SizeDelta: {x: -40, y: 1}
14291429
m_Pivot: {x: 0.5, y: 0.5}
14301430
--- !u!222 &5717813167708040783
@@ -1792,7 +1792,7 @@ RectTransform:
17921792
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
17931793
m_AnchorMin: {x: 0, y: 1}
17941794
m_AnchorMax: {x: 1, y: 1}
1795-
m_AnchoredPosition: {x: 444, y: -285}
1795+
m_AnchoredPosition: {x: 444, y: -244}
17961796
m_SizeDelta: {x: -964, y: 100}
17971797
m_Pivot: {x: 0.5, y: 0.5}
17981798
--- !u!114 &8894532348555155453
@@ -1932,7 +1932,7 @@ RectTransform:
19321932
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
19331933
m_AnchorMin: {x: 0, y: 1}
19341934
m_AnchorMax: {x: 1, y: 1}
1935-
m_AnchoredPosition: {x: 9.999943, y: -88}
1935+
m_AnchoredPosition: {x: 10, y: -80}
19361936
m_SizeDelta: {x: -20.000116, y: 50}
19371937
m_Pivot: {x: 0.5, y: 0.5}
19381938
--- !u!222 &6919660927735059422
@@ -2412,7 +2412,7 @@ RectTransform:
24122412
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
24132413
m_AnchorMin: {x: 0, y: 1}
24142414
m_AnchorMax: {x: 1, y: 1}
2415-
m_AnchoredPosition: {x: 10, y: -348}
2415+
m_AnchoredPosition: {x: 10, y: -312}
24162416
m_SizeDelta: {x: -20, y: 50}
24172417
m_Pivot: {x: 0.5, y: 0.5}
24182418
--- !u!222 &3075031599492574065
@@ -3232,7 +3232,7 @@ RectTransform:
32323232
m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
32333233
m_AnchorMin: {x: 0, y: 1}
32343234
m_AnchorMax: {x: 1, y: 1}
3235-
m_AnchoredPosition: {x: 356, y: -285}
3235+
m_AnchoredPosition: {x: 356, y: -244}
32363236
m_SizeDelta: {x: -964, y: 100}
32373237
m_Pivot: {x: 0.5, y: 0.5}
32383238
--- !u!114 &2783153125252807574

Assets/Prefabs/RealtimeAPI.prefab

+4-2
Original file line numberDiff line numberDiff line change
@@ -144,9 +144,11 @@ MonoBehaviour:
144144
listeningMode: 0
145145
sampleRate: 24000
146146
interruptResponseOnNewRecording: 1
147-
vadThreshold: 0.01
147+
vadThreshold: 0.1
148148
vadSilenceDuration: 2
149-
ignoreInitialMicrophoneFramesOnVAD: 1
149+
ignoreInitialMicrophoneFramesOnVAD: 0
150+
currentVolumeLevel: 0
151+
fftSampleSize: 1024
150152
--- !u!1 &8086852912749008722
151153
GameObject:
152154
m_ObjectHideFlags: 0

Assets/Scenes/DemoScene.unity

-4
Original file line numberDiff line numberDiff line change
@@ -513,10 +513,6 @@ PrefabInstance:
513513
serializedVersion: 3
514514
m_TransformParent: {fileID: 0}
515515
m_Modifications:
516-
- target: {fileID: 74292311301300547, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3}
517-
propertyPath: apiKey
518-
value:
519-
objectReference: {fileID: 0}
520516
- target: {fileID: 4132844161054319781, guid: 5ca84f8d7b82048e4ba92610cbf110e9, type: 3}
521517
propertyPath: m_LocalPosition.x
522518
value: 0

Assets/Scripts/AudioController.cs

+37-124
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,22 @@ public class AudioController : MonoBehaviour
1515
private bool isVADRecording = false;
1616
private float silenceTimer = 0f;
1717
private int lastSamplePosition = 0;
18+
private int lastVADSamplePosition = 0;
1819
private List<float> vadAudioData = new List<float>();
1920
private AudioClip microphoneClip;
2021
private AudioSource audioSource;
2122
private bool isPlayingAudio = false;
2223
private bool cancelPending = false;
2324
private Queue<byte[]> audioBuffer = new Queue<byte[]>();
24-
public static event Action<string> OnAudioRecorded;
2525
private string microphoneDevice;
2626
private bool ignoreInitialSpike = false;
27-
public float currentVolumeLevel { get; private set; } = 0f;
27+
public float currentVolumeLevel = 0f;
2828
public float[] frequencyData { get; private set; }
2929
public int fftSampleSize = 1024;
3030
public float[] aiFrequencyData { get; private set; }
31+
public static event Action<string> OnAudioRecorded;
32+
public static event Action OnVADRecordingStarted;
33+
public static event Action OnVADRecordingEnded;
3134

3235
private void Start()
3336
{
@@ -64,10 +67,8 @@ private void Update()
6467

6568
public void StartRecording()
6669
{
67-
if (interruptResponseOnNewRecording)
68-
CancelAudioPlayback();
69-
if (Microphone.devices.Length == 0)
70-
return;
70+
if (interruptResponseOnNewRecording) CancelAudioPlayback();
71+
if (Microphone.devices.Length == 0) return;
7172
ResetCancelPending();
7273
microphoneDevice = Microphone.devices[0];
7374
microphoneClip = Microphone.Start(microphoneDevice, false, 10, sampleRate);
@@ -94,19 +95,15 @@ public void StopRecording()
9495

9596
public void StartMicrophone()
9697
{
97-
if (Microphone.devices.Length == 0)
98-
return;
98+
if (Microphone.devices.Length == 0) return;
9999
microphoneDevice = Microphone.devices[0];
100100
microphoneClip = Microphone.Start(microphoneDevice, true, 10, sampleRate);
101101
lastSamplePosition = 0;
102102
}
103103

104104
public void StopMicrophone()
105105
{
106-
if (Microphone.IsRecording(microphoneDevice))
107-
{
108-
Microphone.End(microphoneDevice);
109-
}
106+
if (Microphone.IsRecording(microphoneDevice)) Microphone.End(microphoneDevice);
110107
frequencyData = null;
111108
}
112109

@@ -158,143 +155,59 @@ private void UpdateCurrentVolumeAndFrequency()
158155
lastSamplePosition = micPosition;
159156
}
160157

158+
161159
private void PerformVAD()
162160
{
163-
if (!Microphone.IsRecording(microphoneDevice))
164-
{
165-
lastSamplePosition = Microphone.GetPosition(microphoneDevice);
166-
return;
167-
}
168-
int micPosition = Microphone.GetPosition(microphoneDevice);
169-
int sampleDiff = micPosition - lastSamplePosition;
170-
if (sampleDiff < 0)
171-
{
172-
sampleDiff += microphoneClip.samples;
173-
}
174-
if (sampleDiff == 0)
175-
{
176-
return;
177-
}
178-
float[] samples = new float[sampleDiff];
179-
int startPosition = lastSamplePosition;
180-
if (startPosition + sampleDiff <= microphoneClip.samples)
181-
{
182-
microphoneClip.GetData(samples, startPosition);
183-
}
184-
else
185-
{
186-
int samplesToEnd = microphoneClip.samples - startPosition;
187-
int samplesFromStart = sampleDiff - samplesToEnd;
188-
float[] samplesPart1 = new float[samplesToEnd];
189-
float[] samplesPart2 = new float[samplesFromStart];
190-
microphoneClip.GetData(samplesPart1, startPosition);
191-
microphoneClip.GetData(samplesPart2, 0);
192-
Array.Copy(samplesPart1, 0, samples, 0, samplesToEnd);
193-
Array.Copy(samplesPart2, 0, samples, samplesToEnd, samplesFromStart);
194-
}
195-
float maxVolume = 0f;
196-
foreach (var sample in samples)
197-
{
198-
float absSample = Mathf.Abs(sample);
199-
if (absSample > maxVolume)
200-
{
201-
maxVolume = absSample;
202-
}
203-
}
204-
currentVolumeLevel = maxVolume;
205-
if (maxVolume > vadThreshold)
161+
if (!Microphone.IsRecording(microphoneDevice)) return;
162+
163+
if (currentVolumeLevel > vadThreshold && !isVADRecording)
206164
{
207165
silenceTimer = 0f;
208-
if (ignoreInitialMicrophoneFramesOnVAD)
209-
{
210-
if (ignoreInitialSpike)
211-
{
212-
ignoreFrameCount++;
213-
if (ignoreFrameCount >= framesToIgnore)
214-
{
215-
ignoreInitialSpike = false;
216-
ignoreFrameCount = 0;
217-
}
218-
}
219-
else
220-
{
221-
if (!isVADRecording)
222-
{
223-
StartVADRecording();
224-
}
225-
AppendVADData(samples);
226-
}
227-
}
228-
else
229-
{
230-
if (!isVADRecording)
231-
{
232-
StartVADRecording();
233-
}
234-
AppendVADData(samples);
235-
}
166+
StartVADRecording();
236167
}
237-
else
168+
else if (isVADRecording)
238169
{
239-
if (isVADRecording)
240-
{
241-
silenceTimer += sampleDiff / (float)sampleRate;
242-
if (silenceTimer >= vadSilenceDuration)
243-
{
244-
StopVADRecording();
245-
}
246-
else
247-
{
248-
AppendVADData(samples);
249-
}
250-
}
251-
else
252-
{
253-
if (ignoreInitialMicrophoneFramesOnVAD)
254-
{
255-
ignoreInitialSpike = true;
256-
ignoreFrameCount = 0;
257-
}
258-
}
170+
silenceTimer += Time.deltaTime;
171+
if (silenceTimer >= vadSilenceDuration) StopVADRecording();
259172
}
260-
lastSamplePosition = micPosition;
261173
}
262174

263175
private void StartVADRecording()
264176
{
177+
if (interruptResponseOnNewRecording && !isVADRecording) CancelAudioPlayback();
178+
ResetCancelPending();
265179
isVADRecording = true;
266-
vadAudioData.Clear();
267-
}
268-
269-
private void AppendVADData(float[] samples)
270-
{
271-
vadAudioData.AddRange(samples);
180+
silenceTimer = 0f;
181+
microphoneClip = Microphone.Start(microphoneDevice, false, 10, sampleRate);
182+
OnVADRecordingStarted?.Invoke();
272183
}
273184

274185
private void StopVADRecording()
275186
{
276-
isVADRecording = false;
277-
silenceTimer = 0f;
278-
if (vadAudioData.Count > 0)
187+
if (Microphone.IsRecording(microphoneDevice))
279188
{
280-
float[] audioData = vadAudioData.ToArray();
189+
int micPosition = Microphone.GetPosition(microphoneDevice);
190+
float[] audioData = new float[micPosition];
191+
microphoneClip.GetData(audioData, 0);
281192
string base64AudioData = ConvertFloatToPCM16AndBase64(audioData);
282193
OnAudioRecorded?.Invoke(base64AudioData);
283194
}
284-
vadAudioData.Clear();
195+
196+
isVADRecording = false;
197+
silenceTimer = 0f;
198+
OnVADRecordingEnded?.Invoke();
199+
200+
Microphone.End(microphoneDevice);
201+
StartMicrophone();
202+
203+
ignoreInitialSpike = true;
285204
}
286205

287206
public void EnqueueAudioData(byte[] pcmAudioData)
288207
{
289-
if (cancelPending)
290-
{
291-
return;
292-
}
208+
if (cancelPending) return;
293209
audioBuffer.Enqueue(pcmAudioData);
294-
if (!isPlayingAudio)
295-
{
296-
PlayBufferedAudio();
297-
}
210+
if (!isPlayingAudio) PlayBufferedAudio();
298211
}
299212

300213
private void PlayBufferedAudio()

0 commit comments

Comments
 (0)