44 "context"
55 "errors"
66 "io"
7+ "math"
78 "time"
89
910 // Packages
@@ -17,7 +18,9 @@ import (
1718// Segmenter reads audio samples from a reader and segments them into
1819// fixed-size chunks, passing each chunk to a callback with its timestamp.
1920type Segmenter struct {
21+ opts
2022 ts time.Duration
23+ sts float64 // silence timestamps
2124 sample_rate int
2225 n int
2326 buf_flt []float32
@@ -33,6 +36,13 @@ type SegmentFuncFloat32 func(time.Duration, []float32) error
3336// segment of audio samples. The first argument is the timestamp of the segment.
3437type SegmentFuncInt16 func (time.Duration , []int16 ) error
3538
39+ //////////////////////////////////////////////////////////////////////////////
40+ // GLOBALS
41+
// Int16Gain is math.MaxInt16 expressed as a float64. It is the gain for
// converting int16 to float32.
const Int16Gain = float64(math.MaxInt16)
45+
3646//////////////////////////////////////////////////////////////////////////////
3747// LIFECYCLE
3848
@@ -43,9 +53,16 @@ type SegmentFuncInt16 func(time.Duration, []int16) error
4353//
4454// At the moment, the audio format is auto-detected, but there should be
4555// a way to specify the audio format.
46- func NewReader (r io.Reader , dur time.Duration , sample_rate int ) (* Segmenter , error ) {
56+ func NewReader (r io.Reader , dur time.Duration , sample_rate int , opts ... Opt ) (* Segmenter , error ) {
4757 segmenter := new (Segmenter )
4858
59+ // Apply options
60+ if o , err := applyOpts (opts ... ); err != nil {
61+ return nil , err
62+ } else {
63+ segmenter .opts = * o
64+ }
65+
4966 // Check arguments
5067 if dur < 0 || sample_rate <= 0 {
5168 return nil , media .ErrBadParameter .With ("invalid duration or sample rate arguments" )
@@ -108,22 +125,53 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er
108125 }
109126
110127 // Allocate the buffer
111- if s .n > 0 {
112- s .buf_flt = make ([]float32 , 0 , s .n )
113- }
128+ s .buf_flt = make ([]float32 , 0 , s .n )
129+
130+ // Reset the silence timestamp
131+ s .sts = - 1
114132
115133 // Decode samples and segment
116134 if err := s .reader .Decode (ctx , mapFunc , func (stream int , frame * ffmpeg.Frame ) error {
117- // We get null frames sometimes, ignore them
135+ // Ignore null frames
118136 if frame == nil {
119137 return nil
120138 }
121139
140+ // Return if the frame is empty
141+ data := frame .Float32 (0 )
142+ if len (data ) == 0 {
143+ return nil
144+ }
145+
146+ // Calculate the energy of the frame - root mean squared and normalize between 0 and 1
147+ var sum float32
148+ var energy float64
149+ for _ , sample := range data {
150+ sum += float32 (sample ) * float32 (sample )
151+ }
152+ energy = math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
153+
154+ // If silence detection is enabled, check if the energy is below the threshold
155+ var cut bool
156+ if s .SilenceThreshold > 0 && energy < s .SilenceThreshold {
157+ // If the energy is below the threshold, we consider it silence
158+ if s .sts == - 1 {
159+ // If this is the first silence, set the timestamp
160+ s .sts = frame .Ts ()
161+ } else if frame .Ts ()- s .sts >= s .SilenceDuration .Seconds () {
162+ // Cut when the buffer size is greater than 10 seconds
163+ if len (s .buf_flt ) >= s .sample_rate * 10 {
164+ cut = true
165+ }
166+ s .sts = - 1 // Reset the silence timestamp
167+ }
168+ }
169+
122170 // Append float32 samples from plane 0 to buffer
123171 s .buf_flt = append (s .buf_flt , frame .Float32 (0 )... )
124172
125173 // n != 0 and len(buf) >= n we have a segment to process
126- if s .n != 0 && len (s .buf_flt ) >= s .n {
174+ if ( s .n != 0 && len (s .buf_flt ) >= s .n ) || cut {
127175 if err := s .segment_flt (fn ); err != nil {
128176 return err
129177 }
@@ -173,25 +221,57 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
173221 }
174222
175223 // Allocate the buffer
176- if s .n > 0 {
177- s .buf_s16 = make ([]int16 , 0 , s .n )
178- }
224+ s .buf_s16 = make ([]int16 , 0 , s .n )
225+
226+ // Reset the silence timestamp
227+ s .sts = - 1
179228
180229 // Decode samples and segment
181230 if err := s .reader .Decode (ctx , mapFunc , func (stream int , frame * ffmpeg.Frame ) error {
182- // We get null frames sometimes, ignore them
231+ // Ignore null frames
183232 if frame == nil {
184233 return nil
185234 }
186235
236+ // Return if the frame is empty
237+ data := frame .Int16 (0 )
238+ if len (data ) == 0 {
239+ return nil
240+ }
241+
242+ // Calculate the energy of the frame - root mean squared and normalize between 0 and 1
243+ var sum float32
244+ var energy float64
245+ for _ , sample := range data {
246+ sum += float32 (sample ) * float32 (sample )
247+ }
248+ energy = math .Sqrt (float64 (sum )/ float64 (len (data ))) / float64 (math .MaxInt16 )
249+
250+ // If silence detection is enabled, check if the energy is below the threshold
251+ var cut bool
252+ if s .SilenceThreshold > 0 && energy < s .SilenceThreshold {
253+ // If the energy is below the threshold, we consider it silence
254+ if s .sts == - 1 {
255+ // If this is the first silence, set the timestamp
256+ s .sts = frame .Ts ()
257+ } else if frame .Ts ()- s .sts >= s .SilenceDuration .Seconds () {
258+ // Cut when the buffer size is greater than 10 seconds
259+ if len (s .buf_s16 ) >= s .sample_rate * 10 {
260+ cut = true
261+ }
262+ s .sts = - 1 // Reset the silence timestamp
263+ }
264+ }
265+
187266 // Append int16 samples from plane 0 to buffer
188- s .buf_s16 = append (s .buf_s16 , frame . Int16 ( 0 ) ... )
267+ s .buf_s16 = append (s .buf_s16 , data ... )
189268
190269 // n != 0 and len(buf) >= n we have a segment to process
191- if s .n != 0 && len (s .buf_s16 ) >= s .n {
270+ if ( s .n != 0 && len (s .buf_s16 ) >= s .n ) || cut {
192271 if err := s .segment_s16 (fn ); err != nil {
193272 return err
194273 }
274+
195275 // Increment the timestamp
196276 s .ts += time .Duration (len (s .buf_s16 )) * time .Second / time .Duration (s .sample_rate )
197277
@@ -223,65 +303,11 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error
223303// PRIVATE METHODS
224304
225305func (s * Segmenter ) segment_flt (fn SegmentFuncFloat32 ) error {
226- // Not segmenting
227- if s .n == 0 {
228- return fn (s .ts , s .buf_flt )
229- }
230-
231- // Split into n-sized segments
232- bufLength := len (s .buf_flt )
233- ts := s .ts
234- tsinc := time .Duration (s .n ) * time .Second / time .Duration (s .sample_rate )
235- for i := 0 ; i < bufLength ; i += s .n {
236- end := i + s .n
237- var segment []float32
238- if end <= bufLength {
239- // If the segment fits exactly or there are enough items
240- segment = s .buf_flt [i :end ]
241- } else {
242- // If the segment is smaller than segmentSize, pad with zeros
243- segment = make ([]float32 , s .n )
244- copy (segment , s .buf_flt [i :bufLength ])
245- }
246- if err := fn (ts , segment ); err != nil {
247- return err
248- } else {
249- ts += tsinc
250- }
251- }
252-
253- // Return success
254- return nil
306+ // TODO: Pad any remaining samples with zeros if the buffer is not full
307+ return fn (s .ts , s .buf_flt )
255308}
256309
257310func (s * Segmenter ) segment_s16 (fn SegmentFuncInt16 ) error {
258- // Not segmenting
259- if s .n == 0 {
260- return fn (s .ts , s .buf_s16 )
261- }
262-
263- // Split into n-sized segments
264- bufLength := len (s .buf_s16 )
265- ts := s .ts
266- tsinc := time .Duration (s .n ) * time .Second / time .Duration (s .sample_rate )
267- for i := 0 ; i < bufLength ; i += s .n {
268- end := i + s .n
269- var segment []int16
270- if end <= bufLength {
271- // If the segment fits exactly or there are enough items
272- segment = s .buf_s16 [i :end ]
273- } else {
274- // If the segment is smaller than segmentSize, pad with zeros
275- segment = make ([]int16 , s .n )
276- copy (segment , s .buf_s16 [i :bufLength ])
277- }
278- if err := fn (ts , segment ); err != nil {
279- return err
280- } else {
281- ts += tsinc
282- }
283- }
284-
285- // Return success
286- return nil
311+ // TODO: Pad any remaining samples with zeros if the buffer is not full
312+ return fn (s .ts , s .buf_s16 )
287313}
0 commit comments