From ee4ce12b71178201d684421c2f790aaadd0e04de Mon Sep 17 00:00:00 2001 From: David Thorpe Date: Fri, 20 Jun 2025 10:44:33 +0200 Subject: [PATCH] Updated segmenter --- pkg/segmenter/opt.go | 33 +++++++---- pkg/segmenter/segmenter.go | 113 ++++++++++++++++++++----------------- 2 files changed, 83 insertions(+), 63 deletions(-) diff --git a/pkg/segmenter/opt.go b/pkg/segmenter/opt.go index 7bda125..b0d9d74 100644 --- a/pkg/segmenter/opt.go +++ b/pkg/segmenter/opt.go @@ -13,16 +13,18 @@ import ( type Opt func(*opts) error type opts struct { + SegmentSize time.Duration // Segment size, zero means no segmenting + SilenceSize time.Duration // Size of silence to consider a segment boundary SilenceThreshold float64 // Silence threshold - SilenceDuration time.Duration // Duration of silence to consider a segment boundary } /////////////////////////////////////////////////////////////////////////////////// // GLOBALS const ( - DefaultSilenceThreshold = 0.0005 // Default silence threshold - DefaultSilenceDuration = time.Second * 2 // Default silence duration + DefaultSilenceThreshold = 0.01 // Default silence threshold + DefaultSilenceDuration = time.Millisecond * 500 // Default silence duration + MinDuration = time.Millisecond * 250 // Minimum duration ) /////////////////////////////////////////////////////////////////////////////////// @@ -41,21 +43,32 @@ func applyOpts(opt ...Opt) (*opts, error) { /////////////////////////////////////////////////////////////////////////////////// // TYPES -func WithDefaultSilenceThreshold() Opt { +func WithSegmentSize(v time.Duration) Opt { return func(o *opts) error { - o.SilenceThreshold = DefaultSilenceThreshold - o.SilenceDuration = DefaultSilenceDuration + if v < MinDuration { + return media.ErrBadParameter.Withf("segment duration is too short, must be at least %v", MinDuration) + } else { + o.SegmentSize = v + } return nil } } -func WithSilenceDuration(v time.Duration) Opt { +func WithSilenceSize(v time.Duration) Opt { return func(o 
*opts) error { - if v < time.Millisecond*100 { - return media.ErrBadParameter.Withf("silence duration %s is too short, must be at least 100ms", v) + if v < MinDuration { + return media.ErrBadParameter.Withf("silence duration is too short, must be at least %v", MinDuration) } else { - o.SilenceDuration = v + o.SilenceSize = v } return nil } } + +func WithDefaultSilenceThreshold() Opt { + return func(o *opts) error { + o.SilenceThreshold = DefaultSilenceThreshold + o.SilenceSize = DefaultSilenceDuration + return nil + } +} diff --git a/pkg/segmenter/segmenter.go b/pkg/segmenter/segmenter.go index 8f0ec77..5034083 100644 --- a/pkg/segmenter/segmenter.go +++ b/pkg/segmenter/segmenter.go @@ -46,14 +46,20 @@ const ( ////////////////////////////////////////////////////////////////////////////// // LIFECYCLE -// Create a new segmenter with a reader r which segments into raw audio of 'dur' -// length. If dur is zero then no segmenting is performed, the whole -// audio file is read and output in one go, which could cause some memory issues. +// Create a new segmenter with a reader r which segments into raw audio. // The sample rate is the number of samples per second. // +// Setting option WithSegmentSize will cause the segmenter to segment the audio +// into fixed-size chunks approximately of the specified duration. +// +// Setting option WithDefaultSilenceThreshold will cause the segmenter to break +// into smaller chunks, if silence is detected. The length of the silence is +// specified by the WithSilenceSize option, which defaults to 500 milliseconds. +// // At the moment, the audio format is auto-detected, but there should be -// a way to specify the audio format. -func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*Segmenter, error) { +// a way to specify the audio format. The output samples are always single-channel +// (mono). 
+func NewReader(r io.Reader, sample_rate int, opts ...Opt) (*Segmenter, error) { segmenter := new(Segmenter) // Apply options @@ -64,14 +70,14 @@ func NewReader(r io.Reader, dur time.Duration, sample_rate int, opts ...Opt) (*S } // Check arguments - if dur < 0 || sample_rate <= 0 { + if sample_rate <= 0 { return nil, media.ErrBadParameter.With("invalid duration or sample rate arguments") } else { segmenter.sample_rate = sample_rate } // Sample buffer is duration * sample rate, assuming mono - segmenter.n = int(dur.Seconds() * float64(sample_rate)) + segmenter.n = int(segmenter.opts.SegmentSize.Seconds() * float64(sample_rate)) // Open the file media, err := ffmpeg.NewReader(r) @@ -143,33 +149,20 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er return nil } - // Calculate the energy of the frame - root mean squared and normalize between 0 and 1 - var sum float32 - var energy float64 - for _, sample := range data { - sum += float32(sample) * float32(sample) - } - energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16) - - // If silence detection is enabled, check if the energy is below the threshold - var cut bool - if s.SilenceThreshold > 0 && energy < s.SilenceThreshold { - // If the energy is below the threshold, we consider it silence - if s.sts == -1 { - // If this is the first silence, set the timestamp - s.sts = frame.Ts() - } else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() { - // Cut when the buffer size is greater than 10 seconds - if len(s.buf_flt) >= s.sample_rate*10 { - cut = true - } - s.sts = -1 // Reset the silence timestamp + // Calculate the energy of the frame and determine if we should "cut" the segment + _, cut := s.detect_silence(frame.Ts(), func() float64 { + var sum float32 + for _, sample := range data { + sum += float32(sample) * float32(sample) } - } + return math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16) + }) // Append float32 samples from plane 0 to buffer 
s.buf_flt = append(s.buf_flt, frame.Float32(0)...) + // TODO: Do not cut on silence while the buffer holds fewer than half a segment's worth of samples + // n != 0 and len(buf) >= n we have a segment to process if (s.n != 0 && len(s.buf_flt) >= s.n) || cut { if err := s.segment_flt(fn); err != nil { @@ -203,6 +196,32 @@ func (s *Segmenter) DecodeFloat32(ctx context.Context, fn SegmentFuncFloat32) er return nil } +func (s *Segmenter) detect_silence(ts float64, energy_fn func() float64) (float64, bool) { + energy := energy_fn() + + // Segmenting or Silence detection is not enabled + if s.SegmentSize == 0 || s.SilenceThreshold == 0 { + return energy, false + } + + // If energy is at or above the threshold, reset the silence timestamp + if energy >= s.SilenceThreshold { + s.sts = -1 + return energy, false + } + + // Set the first frame of silence + if s.sts == -1 { + s.sts = ts + return energy, false + } + + // Calculate the silence duration, and decide whether it is long enough to + // mark a segment boundary. + silence_duration := ts - s.sts + return energy, silence_duration >= s.SilenceSize.Seconds() +} + // Segments are output through a callback, with the samples and a timestamp // At the moment the "best" audio stream is used, based on ffmpeg heuristic. 
func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error { @@ -239,33 +258,23 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error return nil } - // Calculate the energy of the frame - root mean squared and normalize between 0 and 1 - var sum float32 - var energy float64 - for _, sample := range data { - sum += float32(sample) * float32(sample) - } - energy = math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16) - - // If silence detection is enabled, check if the energy is below the threshold - var cut bool - if s.SilenceThreshold > 0 && energy < s.SilenceThreshold { - // If the energy is below the threshold, we consider it silence - if s.sts == -1 { - // If this is the first silence, set the timestamp - s.sts = frame.Ts() - } else if frame.Ts()-s.sts >= s.SilenceDuration.Seconds() { - // Cut when the buffer size is greater than 10 seconds - if len(s.buf_s16) >= s.sample_rate*10 { - cut = true - } - s.sts = -1 // Reset the silence timestamp + // Calculate the energy of the frame and determine if we should "cut" the segment + _, cut := s.detect_silence(frame.Ts(), func() float64 { + var sum float32 + for _, sample := range data { + sum += float32(sample) * float32(sample) } - } + return math.Sqrt(float64(sum)/float64(len(data))) / float64(math.MaxInt16) + }) // Append int16 samples from plane 0 to buffer s.buf_s16 = append(s.buf_s16, data...) 
+ // Do not cut on silence while the buffer holds fewer than half a segment's worth of samples + if cut && len(s.buf_s16) < (s.n>>1) { + cut = false + } + // n != 0 and len(buf) >= n we have a segment to process if (s.n != 0 && len(s.buf_s16) >= s.n) || cut { if err := s.segment_s16(fn); err != nil { @@ -303,11 +312,9 @@ func (s *Segmenter) DecodeInt16(ctx context.Context, fn SegmentFuncInt16) error // PRIVATE METHODS func (s *Segmenter) segment_flt(fn SegmentFuncFloat32) error { - // TODO: Pad any remaining samples with zeros if the buffer is not full return fn(s.ts, s.buf_flt) } func (s *Segmenter) segment_s16(fn SegmentFuncInt16) error { - // TODO: Pad any remaining samples with zeros if the buffer is not full return fn(s.ts, s.buf_s16) }