Skip to content

Commit fd88d40

Browse files
authored
Merge pull request #48 from mutablelogic/avfilter
Added Segment Silence Detection
2 parents 47dcd33 + 8191abd commit fd88d40

File tree

2 files changed

+157
-70
lines changed

2 files changed

+157
-70
lines changed

pkg/segmenter/opt.go

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package segmenter
2+
3+
import (
4+
"time"
5+
6+
// Packages
7+
media "github.com/mutablelogic/go-media"
8+
)
9+
10+
///////////////////////////////////////////////////////////////////////////////////
11+
// TYPES
12+
13+
// Opt is a functional option: it receives the options being built, may
// modify them, and returns an error to reject the configuration.
type Opt func(*opts) error
14+
15+
// opts holds the settings collected from the Opt functions passed by the caller.
type opts struct {
16+
SilenceThreshold float64 // Silence threshold: frame energy below this value counts as silence; zero leaves silence detection disabled
17+
SilenceDuration time.Duration // Duration of silence to consider a segment boundary
18+
}
19+
20+
///////////////////////////////////////////////////////////////////////////////////
21+
// GLOBALS
22+
23+
// Default values installed by WithDefaultSilenceThreshold.
const (
24+
DefaultSilenceThreshold = 0.0005 // Default silence threshold, compared against the per-frame energy
25+
DefaultSilenceDuration = time.Second * 2 // Default silence duration (two seconds)
26+
)
27+
28+
///////////////////////////////////////////////////////////////////////////////////
29+
// LIFECYCLE
30+
31+
func applyOpts(opt ...Opt) (*opts, error) {
32+
var o opts
33+
for _, fn := range opt {
34+
if err := fn(&o); err != nil {
35+
return nil, err
36+
}
37+
}
38+
return &o, nil
39+
}
40+
41+
///////////////////////////////////////////////////////////////////////////////////
42+
// OPTIONS
43+
44+
func WithDefaultSilenceThreshold() Opt {
45+
return func(o *opts) error {
46+
o.SilenceThreshold = DefaultSilenceThreshold
47+
o.SilenceDuration = DefaultSilenceDuration
48+
return nil
49+
}
50+
}
51+
52+
func WithSilenceDuration(v time.Duration) Opt {
53+
return func(o *opts) error {
54+
if v < time.Millisecond*100 {
55+
return media.ErrBadParameter.Withf("silence duration %s is too short, must be at least 100ms", v)
56+
} else {
57+
o.SilenceDuration = v
58+
}
59+
return nil
60+
}
61+
}

pkg/segmenter/segmenter.go

Lines changed: 96 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"io"
7+
"math"
78
"time"
89

910
// Packages
@@ -17,7 +18,9 @@ import (
1718
// A segmenter reads audio samples from a reader and segments them into
1819
// fixed-size chunks. The segmenter can be used to process audio samples
1920
type Segmenter struct {
21+
opts
2022
ts time.Duration
23+
sts float64 // silence timestamps
2124
sample_rate int
2225
n int
2326
buf_flt []float32
@@ -33,6 +36,13 @@ type SegmentFuncFloat32 func(time.Duration, []float32) error
3336
// segment of audio samples. The first argument is the timestamp of the segment.
3437
type SegmentFuncInt16 func(time.Duration, []int16) error
3538

39+
//////////////////////////////////////////////////////////////////////////////
40+
// GLOBALS
41+
42+
// Int16Gain is math.MaxInt16 expressed as a float64.
// NOTE(review): the energy calculations visible in this diff spell out
// float64(math.MaxInt16) directly instead of using this constant — confirm
// where it is meant to be used.
const (
43+
Int16Gain = float64(math.MaxInt16) // Gain for converting int16 to float32
44+
)
45+
3646
//////////////////////////////////////////////////////////////////////////////
3747
// LIFECYCLE
3848

@@ -43,9 +53,16 @@ type SegmentFuncInt16 func(time.Duration, []int16) error
4353
//
4454
// At the moment, the audio format is auto-detected, but there should be
4555
// a way to specify the audio format.
46-
func NewReader(r io.Reader, dur time.Duration, sample_rate int) (*Segmenter, error) {
56+
func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*Segmenter, error) {
4757
segmenter := new(Segmenter)
4858

59+
// Apply options
60+
if o, err := applyOpts(opts...); err != nil {
61+
return nil, err
62+
} else {
63+
segmenter.opts = *o
64+
}
65+
4966
// Check arguments
5067
if dur < 0 || sample_rate <= 0 {
5168
return nil, media.ErrBadParameter.With("invalid duration or sample rate arguments")
@@ -108,22 +125,53 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
108125
}
109126

110127
// Allocate the buffer
111-
if s.n > 0 {
112-
s.buf_flt = make([]float32, 0, s.n)
113-
}
128+
s.buf_flt = make([]float32, 0, s.n)
129+
130+
// Reset the silence timestamp
131+
s.sts = -1
114132

115133
// Decode samples and segment
116134
if err := s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
117-
// We get null frames sometimes, ignore them
135+
// Ignore null frames
118136
if frame == nil {
119137
return nil
120138
}
121139

140+
// Return if the frame is empty
141+
data := frame.Float32(0)
142+
if len(data) == 0 {
143+
return nil
144+
}
145+
146+
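// Calculate the energy of the frame - root mean square, divided by math.MaxInt16
// NOTE(review): float32 samples are presumably already in [-1, 1]; if so, dividing by math.MaxInt16 pushes every frame below the silence threshold — confirm the intended scaling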
147+
var sum float32
148+
var energy float64
149+
for _, sample := range data {
150+
sum += float32(sample) * float32(sample)
151+
}
152+
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
153+
154+
// If silence detection is enabled, check if the energy is below the threshold
155+
var cut bool
156+
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
157+
// If the energy is below the threshold, we consider it silence
158+
if s.sts == -1 {
159+
// If this is the first silence, set the timestamp
160+
s.sts = frame.Ts()
161+
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
162+
// Only cut once at least 10 seconds of samples have been buffered
163+
if len(s.buf_flt) >= s.sample_rate*10 {
164+
cut = true
165+
}
166+
s.sts = -1 // Reset the silence timestamp
167+
}
168+
}
169+
122170
// Append float32 samples from plane 0 to buffer
123171
s.buf_flt = append(s.buf_flt, frame.Float32(0)...)
124172

125173
// n != 0 and len(buf) >= n we have a segment to process
126-
if s.n != 0 && len(s.buf_flt) >= s.n {
174+
if (s.n != 0 && len(s.buf_flt) >= s.n) || cut {
127175
if err := s.segment_flt(fn); err != nil {
128176
return err
129177
}
@@ -173,25 +221,57 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
173221
}
174222

175223
// Allocate the buffer
176-
if s.n > 0 {
177-
s.buf_s16 = make([]int16, 0, s.n)
178-
}
224+
s.buf_s16 = make([]int16, 0, s.n)
225+
226+
// Reset the silence timestamp
227+
s.sts = -1
179228

180229
// Decode samples and segment
181230
if err := s.reader.Decode(ctx, mapFunc, func(stream int, frame *ffmpeg.Frame) error {
182-
// We get null frames sometimes, ignore them
231+
// Ignore null frames
183232
if frame == nil {
184233
return nil
185234
}
186235

236+
// Return if the frame is empty
237+
data := frame.Int16(0)
238+
if len(data) == 0 {
239+
return nil
240+
}
241+
242+
// Calculate the energy of the frame - root mean squared and normalize between 0 and 1
243+
var sum float32
244+
var energy float64
245+
for _, sample := range data {
246+
sum += float32(sample) * float32(sample)
247+
}
248+
energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16)
249+
250+
// If silence detection is enabled, check if the energy is below the threshold
251+
var cut bool
252+
if s.SilenceThreshold > 0 && energy < s.SilenceThreshold {
253+
// If the energy is below the threshold, we consider it silence
254+
if s.sts == -1 {
255+
// If this is the first silence, set the timestamp
256+
s.sts = frame.Ts()
257+
} else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() {
258+
// Only cut once at least 10 seconds of samples have been buffered
259+
if len(s.buf_s16) >= s.sample_rate*10 {
260+
cut = true
261+
}
262+
s.sts = -1 // Reset the silence timestamp
263+
}
264+
}
265+
187266
// Append int16 samples from plane 0 to buffer
188-
s.buf_s16 = append(s.buf_s16, frame.Int16(0)...)
267+
s.buf_s16 = append(s.buf_s16, data...)
189268

190269
// n != 0 and len(buf) >= n we have a segment to process
191-
if s.n != 0 && len(s.buf_s16) >= s.n {
270+
if (s.n != 0 && len(s.buf_s16) >= s.n) || cut {
192271
if err := s.segment_s16(fn); err != nil {
193272
return err
194273
}
274+
195275
// Increment the timestamp
196276
s.ts += time.Duration(len(s.buf_s16)) * time.Second / time.Duration(s.sample_rate)
197277

@@ -223,65 +303,11 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
223303
// PRIVATE METHODS
224304

225305
func (s *Segmenter) segment_flt(fn SegmentFuncFloat32) error {
226-
// Not segmenting
227-
if s.n == 0 {
228-
return fn(s.ts, s.buf_flt)
229-
}
230-
231-
// Split into n-sized segments
232-
bufLength := len(s.buf_flt)
233-
ts := s.ts
234-
tsinc := time.Duration(s.n) * time.Second / time.Duration(s.sample_rate)
235-
for i := 0; i < bufLength; i += s.n {
236-
end := i + s.n
237-
var segment []float32
238-
if end <= bufLength {
239-
// If the segment fits exactly or there are enough items
240-
segment = s.buf_flt[i:end]
241-
} else {
242-
// If the segment is smaller than segmentSize, pad with zeros
243-
segment = make([]float32, s.n)
244-
copy(segment, s.buf_flt[i:bufLength])
245-
}
246-
if err := fn(ts, segment); err != nil {
247-
return err
248-
} else {
249-
ts += tsinc
250-
}
251-
}
252-
253-
// Return success
254-
return nil
306+
// TODO: Pad any remaining samples with zeros if the buffer is not full
307+
return fn(s.ts, s.buf_flt)
255308
}
256309

257310
func (s *Segmenter) segment_s16(fn SegmentFuncInt16) error {
258-
// Not segmenting
259-
if s.n == 0 {
260-
return fn(s.ts, s.buf_s16)
261-
}
262-
263-
// Split into n-sized segments
264-
bufLength := len(s.buf_s16)
265-
ts := s.ts
266-
tsinc := time.Duration(s.n) * time.Second / time.Duration(s.sample_rate)
267-
for i := 0; i < bufLength; i += s.n {
268-
end := i + s.n
269-
var segment []int16
270-
if end <= bufLength {
271-
// If the segment fits exactly or there are enough items
272-
segment = s.buf_s16[i:end]
273-
} else {
274-
// If the segment is smaller than segmentSize, pad with zeros
275-
segment = make([]int16, s.n)
276-
copy(segment, s.buf_s16[i:bufLength])
277-
}
278-
if err := fn(ts, segment); err != nil {
279-
return err
280-
} else {
281-
ts += tsinc
282-
}
283-
}
284-
285-
// Return success
286-
return nil
311+
// TODO: Pad any remaining samples with zeros if the buffer is not full
312+
return fn(s.ts, s.buf_s16)
287313
}

0 commit comments

Comments
 (0)