diff --git a/README b/README
index 4158a9be..957b3fff 100644
--- a/README
+++ b/README
@@ -1,4 +1,12 @@
 RNNoise is a noise suppression library based on a recurrent neural network.
+A description of the algorithm is provided in the following paper:
+
+J.-M. Valin, A Hybrid DSP/Deep Learning Approach to Real-Time Full-Band Speech
+Enhancement, Proceedings of IEEE Multimedia Signal Processing (MMSP) Workshop,
+arXiv:1709.08243, 2018.
+https://arxiv.org/pdf/1709.08243.pdf
+
+An interactive demo is available at: https://jmvalin.ca/demo/rnnoise/
 
 To compile, just type:
 % ./autogen.sh
diff --git a/configure.ac b/configure.ac
index 5ffc7c2d..95c2d790 100644
--- a/configure.ac
+++ b/configure.ac
@@ -47,7 +47,7 @@ AC_SUBST(OP_LT_REVISION)
 AC_SUBST(OP_LT_AGE)
 
 CC_CHECK_CFLAGS_APPEND(
-  [-pedantic -Wall -Wextra -Wno-sign-compare -Wno-parentheses -Wno-long-long])
+  [-O3 -march=native -pedantic -Wall -Wextra -Wno-sign-compare -Wno-parentheses -Wno-long-long])
 
 # Platform-specific tweaks
 case $host in
diff --git a/src/compile.sh b/src/compile.sh
index 4b2ea538..f9c7cfc2 100755
--- a/src/compile.sh
+++ b/src/compile.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-gcc -DTRAINING=1 -Wall -W -O3 -g -I../include denoise.c kiss_fft.c pitch.c celt_lpc.c rnn.c rnn_data.c -o denoise_training -lm
+gcc -DTRAINING=1 -march=native -Wall -W -O3 -g -I../include denoise.c kiss_fft.c pitch.c celt_lpc.c rnn.c rnn_data.c -o denoise_training -lm
diff --git a/src/denoise.c b/src/denoise.c
index 5a628440..0a54914b 100644
--- a/src/denoise.c
+++ b/src/denoise.c
@@ -270,6 +270,14 @@ int rnnoise_init(DenoiseState *st, RNNModel *model) {
   st->rnn.vad_gru_state = calloc(sizeof(float), st->rnn.model->vad_gru_size);
   st->rnn.noise_gru_state = calloc(sizeof(float), st->rnn.model->noise_gru_size);
   st->rnn.denoise_gru_state = calloc(sizeof(float), st->rnn.model->denoise_gru_size);
+  st->rnn.compute_gru_fct = &compute_gru;
+
+#if defined(__AVX2__)
+  if (is_avx2_supported() == 1) {
+    st->rnn.compute_gru_fct = &compute_gru_avx2;
+  }
+#endif
+
   return 0;
 }
 
@@ -408,13 +416,11 @@ static void frame_synthesis(DenoiseState *st, float *out, const kiss_fft_cpx *y)
 }
 
 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
-  int i;
-  for (i=0;i<N;i++) {
-    float xi, yi;
-    xi = x[i];
-    yi = x[i] + mem[0];
+  for (int i = 0; i < N; i++) {
+    float xi = x[i];
+    float yi = x[i] + mem[0];
     mem[0] = mem[1] + (b[0]*(double)xi - a[0]*(double)yi);
     mem[1] = (b[1]*(double)xi - a[1]*(double)yi);
     y[i] = yi;
diff --git a/src/rnn.c b/src/rnn.c
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -38,7 +38,97 @@
 #include "rnn.h"
 #include "rnn_data.h"
 #include <stdio.h>
+// SIMD
+#include <immintrin.h>
+#if defined(__GNUC__)
+#include <cpuid.h>
+#elif defined(_WIN32)
+#include <intrin.h>
+#endif
+
+/**************************************
+ * CPU feature detection
+ *************************************/
+
+int is_avx2_supported() {
+#if defined(__AVX2__)
+  int cpuInfo[4];
+  int max_function_id;
+  int os_enables_XSAVE_XRSTORE = 0;
+  int os_enables_avx = 0;
+  int os_enables_avx2 = 0;
+#ifdef __FMA__
+  int os_enables_fma = 0;
+#endif
+
+  // Check for GCC or WIN32, other compilers not supported
+#if !defined(__GNUC__) && !defined(_WIN32)
+  return 0;
+#endif
+
+  // WIN32 must support CPUID
+#if defined(_WIN32) && !defined(HAS_CPUID)
+  return 0;
+#endif
+
+
+  // Check CPU support
+  // See: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/cpuid.h
+#if defined(__GNUC__)
+  __cpuid_count(0, 0, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
+#else // _WIN32
+  __cpuid(cpuInfo, 0);
+#endif
+  max_function_id = cpuInfo[0];
+  if (max_function_id < 1) {
+    return 0;
+  }
+
+#if defined(__GNUC__)
+  __cpuid_count(1, 0, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
+#else // _WIN32
+  __cpuid(cpuInfo, 1);
+#endif
+  os_enables_XSAVE_XRSTORE = cpuInfo[2] & 0x08000000;
+  if (!os_enables_XSAVE_XRSTORE) {
+    return 0;
+  }
+
+#ifdef __FMA__
+  os_enables_fma = cpuInfo[2] & 0x00001000;
+#endif
+  os_enables_avx = cpuInfo[2] & 0x10000000;
+
+  if (max_function_id >= 7) {
+#if defined(__GNUC__)
+    __cpuid_count(7, 0, cpuInfo[0], cpuInfo[1], cpuInfo[2], cpuInfo[3]);
+#else // _WIN32
+    __cpuid(cpuInfo, 7);
+#endif
+    os_enables_avx2 = cpuInfo[1] & 0x00000020;
+  }
+
+
+  // Check OS support
+  // See: https://stackoverflow.com/a/22521619/2750093
+  // AVX2 and FMA: no check available, checking AVX only is your best bet
+  if (os_enables_avx) {
+    unsigned long long xcrFeatureMask = _xgetbv(0); // _XCR_XFEATURE_ENABLED_MASK
+    os_enables_avx = (xcrFeatureMask & 0x6) == 0x6;
+  }
+
+#ifdef __FMA__
+  return os_enables_avx && os_enables_avx2 && os_enables_fma;
+#else
+  return os_enables_avx && os_enables_avx2;
+#endif
+
+#else
+  return 0;
+#endif
+}
+
 
 static OPUS_INLINE float tansig_approx(float x)
 {
    int i;
@@ -84,28 +174,196 @@ void compute_dense(const DenseLayer *layer, float *output, const float *input)
   M = layer->nb_inputs;
   N = layer->nb_neurons;
   stride = N;
-  for (i=0;i<N;i++)
-  {
-    /* Compute update gate. */
-    float sum = layer->bias[i];
-    for (j=0;j<M;j++)
-      sum += layer->input_weights[j*stride + i]*input[j];
+  for (i = 0; i < N; i++) {
+    /* Compute update gate. */
+    float sum = layer->bias[i];
+    for (j = 0; j < M; j++)
+      sum += layer->input_weights[j*stride + i]*input[j];
     output[i] = WEIGHTS_SCALE*sum;
   }
   if (layer->activation == ACTIVATION_SIGMOID) {
-    for (i=0;i<N;i++)
+    for (i = 0; i < N; i++)
       output[i] = sigmoid_approx(output[i]);
   } else if (layer->activation == ACTIVATION_TANH) {
-    for (i=0;i<N;i++)
+    for (i = 0; i < N; i++)
       output[i] = tansig_approx(output[i]);
   } else if (layer->activation == ACTIVATION_RELU) {
-    for (i=0;i<N;i++)
+    for (i = 0; i < N; i++)
       output[i] = relu(output[i]);
   } else {
     *(int*)0=0;
   }
 }
 
+#if defined(__AVX2__)
+
+// Use native FMA if available, otherwise fall back to multiply + add
+#ifdef __FMA__
+#define _MM256_FMADD_PS(a, b, c) _mm256_fmadd_ps(a, b, c)
+#else
+static OPUS_INLINE __m256 _mm256_fmadd_ps_fallback(__m256 a, __m256 b, __m256 c) {
+  __m256 multiplied = _mm256_mul_ps(a, b);
+  return _mm256_add_ps(c, multiplied);
+}
+
+#define _MM256_FMADD_PS(a, b, c) _mm256_fmadd_ps_fallback(a, b, c)
+#endif
+
+void compute_gru_avx2(const GRULayer *gru, float *state, const float *input)
+{
+  int i, j;
+  int N, M;
+  int stride;
+  float z[MAX_NEURONS];
+  float r[MAX_NEURONS];
+  float h[MAX_NEURONS];
+  M = gru->nb_inputs;
+  N = gru->nb_neurons;
+  stride = 3 * N;
+
+  int chunk_size = 8;
+  int n_remainder = N % chunk_size;
+  int n_chunk_count = (N - n_remainder) / chunk_size;
+
+  for (int i_chunk = 0; i_chunk < n_chunk_count; i_chunk++) {
+    // Load i8s (16 bytes; only the low 8 are used after sign-extension)
+    __m128i i8_z_sum = _mm_loadu_si128((__m128i*) &gru->bias[i_chunk * chunk_size]);
+    __m128i i8_r_sum = _mm_loadu_si128((__m128i*) &gru->bias[N + (i_chunk * chunk_size)]);
+    // Sign-extend to i32s
+    __m256i i32_z_sum = _mm256_cvtepi8_epi32(i8_z_sum);
+    __m256i i32_r_sum = _mm256_cvtepi8_epi32(i8_r_sum);
+    // Convert to f32s
+    __m256 z_sum = _mm256_cvtepi32_ps(i32_z_sum);
+    __m256 r_sum = _mm256_cvtepi32_ps(i32_r_sum);
+
+    for (j = 0; j < M; j++) {
+      // Load i8s
+      __m128i z_input_weights_i8 = _mm_loadu_si128((__m128i*) &gru->input_weights[j*stride + (i_chunk * chunk_size)]);
+      __m128i r_input_weights_i8 = _mm_loadu_si128((__m128i*) &gru->input_weights[N + j*stride + (i_chunk * chunk_size)]);
+      // Sign-extend to i32s
+      __m256i z_input_weights_i32 = _mm256_cvtepi8_epi32(z_input_weights_i8);
+      __m256i r_input_weights_i32 = _mm256_cvtepi8_epi32(r_input_weights_i8);
+      // Convert to f32s
+      __m256 z_input_weights = _mm256_cvtepi32_ps(z_input_weights_i32);
+      __m256 r_input_weights = _mm256_cvtepi32_ps(r_input_weights_i32);
+
+      __m256 input_v = _mm256_broadcast_ss(&input[j]);
+
+      z_sum = _MM256_FMADD_PS(z_input_weights, input_v, z_sum);
+      r_sum = _MM256_FMADD_PS(r_input_weights, input_v, r_sum);
+    }
+    for (j = 0; j < N; j++) {
+      // Load i8s
+      __m128i z_recurrent_weights_i8 = _mm_loadu_si128((__m128i*) &gru->recurrent_weights[j*stride + (i_chunk * chunk_size)]);
+      __m128i r_recurrent_weights_i8 = _mm_loadu_si128((__m128i*) &gru->recurrent_weights[N + j*stride + (i_chunk * chunk_size)]);
+      // Sign-extend to i32s
+      __m256i z_recurrent_weights_i32 = _mm256_cvtepi8_epi32(z_recurrent_weights_i8);
+      __m256i r_recurrent_weights_i32 = _mm256_cvtepi8_epi32(r_recurrent_weights_i8);
+      // Convert to f32s
+      __m256 z_recurrent_weights = _mm256_cvtepi32_ps(z_recurrent_weights_i32);
+      __m256 r_recurrent_weights = _mm256_cvtepi32_ps(r_recurrent_weights_i32);
+
+      __m256 state_v = _mm256_broadcast_ss(&state[j]);
+
+      z_sum = _MM256_FMADD_PS(z_recurrent_weights, state_v, z_sum);
+      r_sum = _MM256_FMADD_PS(r_recurrent_weights, state_v, r_sum);
+    }
+
+    // Store sums
+    _mm256_storeu_ps(&z[i_chunk * chunk_size], z_sum);
+    _mm256_storeu_ps(&r[i_chunk * chunk_size], r_sum);
+  }
+  // Remainders
+  for (int i = n_chunk_count * chunk_size; i < N; i++) {
+    float z_sum = gru->bias[i];
+    float r_sum = gru->bias[N + i];
+
+    for (j = 0; j < M; j++) {
+      /* Compute update gate. */
+      z_sum += gru->input_weights[j*stride + i]*input[j];
+      /* Compute reset gate. */
+      r_sum += gru->input_weights[N + j*stride + i]*input[j];
+    }
+    for (j = 0; j < N; j++) {
+      /* Compute update gate. */
+      z_sum += gru->recurrent_weights[j*stride + i]*state[j];
+      /* Compute reset gate. */
+      r_sum += gru->recurrent_weights[N + j*stride + i]*state[j];
+    }
+
+    z[i] = z_sum;
+    r[i] = r_sum;
+  }
+  // Apply sigmoid to sums
+  for (i = 0; i < N; i++) {
+    z[i] = sigmoid_approx(WEIGHTS_SCALE * z[i]);
+    r[i] = sigmoid_approx(WEIGHTS_SCALE * r[i]);
+  }
+
+  /* Compute output. */
+  for (int i_chunk = 0; i_chunk < n_chunk_count; i_chunk++) {
+    // Load i8s
+    __m128i i8_sum = _mm_loadu_si128((__m128i*) &gru->bias[2*N + (i_chunk * chunk_size)]);
+    // Sign-extend to i32s
+    __m256i i32_sum = _mm256_cvtepi8_epi32(i8_sum);
+    // Convert to f32s
+    __m256 sum = _mm256_cvtepi32_ps(i32_sum);
+
+    for (j = 0; j < M; j++) {
+      // Load i8s
+      __m128i input_weights_i8 = _mm_loadu_si128((__m128i*) &gru->input_weights[2*N + j*stride + (i_chunk * chunk_size)]);
+      // Sign-extend to i32s
+      __m256i input_weights_i32 = _mm256_cvtepi8_epi32(input_weights_i8);
+      // Convert to f32s
+      __m256 input_weights = _mm256_cvtepi32_ps(input_weights_i32);
+
+      __m256 input_v = _mm256_broadcast_ss(&input[j]);
+
+      sum = _MM256_FMADD_PS(input_weights, input_v, sum);
+    }
+
+    for (j = 0; j < N; j++) {
+      // Load i8s
+      __m128i recurrent_weights_i8 = _mm_loadu_si128((__m128i*) &gru->recurrent_weights[2*N + j*stride + (i_chunk * chunk_size)]);
+      // Sign-extend to i32s
+      __m256i recurrent_weights_i32 = _mm256_cvtepi8_epi32(recurrent_weights_i8);
+      // Convert to f32s
+      __m256 recurrent_weights = _mm256_cvtepi32_ps(recurrent_weights_i32);
+
+      float state_times_r = state[j] * r[j];
+      __m256 state_times_r_v = _mm256_set1_ps(state_times_r);
+
+      sum = _MM256_FMADD_PS(recurrent_weights, state_times_r_v, sum);
+    }
+
+    // Store sums
+    _mm256_storeu_ps(&h[i_chunk * chunk_size], sum);
+  }
+  // Remainders
+  for (int i = n_chunk_count * chunk_size; i < N; i++) {
+    float sum = gru->bias[2*N + i];
+    for (j = 0; j < M; j++)
+      sum += gru->input_weights[2*N + j*stride + i] * input[j];
+    for (j = 0; j < N; j++)
+      sum += gru->recurrent_weights[2*N + j*stride + i] * state[j] * r[j];
+
+    h[i] = sum;
+  }
+
+  for (i = 0; i < N; i++) {
+    float sum = h[i];
+
+    if (gru->activation == ACTIVATION_SIGMOID) sum = sigmoid_approx(WEIGHTS_SCALE*sum);
+    else if (gru->activation == ACTIVATION_TANH) sum = tansig_approx(WEIGHTS_SCALE*sum);
+    else if (gru->activation == ACTIVATION_RELU) sum = relu(WEIGHTS_SCALE*sum);
+    else *(int*)0=0;
+    state[i] = z[i]*state[i] + (1-z[i])*sum;
+  }
+}
+#endif
+
 void compute_gru(const GRULayer *gru, float *state, const float *input)
 {
   int i, j;
@@ -117,42 +375,42 @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
   M = gru->nb_inputs;
   N = gru->nb_neurons;
   stride = 3*N;
-  for (i=0;i<N;i++)
-  {
-    /* Compute update gate. */
-    float sum = gru->bias[i];
-    for (j=0;j<M;j++)
-      sum += gru->input_weights[j*stride + i]*input[j];
-    for (j=0;j<N;j++)
-      sum += gru->recurrent_weights[j*stride + i]*state[j];
-    z[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
-  }
-  for (i=0;i<N;i++)
-  {
-    /* Compute reset gate. */
-    float sum = gru->bias[N + i];
-    for (j=0;j<M;j++)
-      sum += gru->input_weights[N + j*stride + i]*input[j];
-    for (j=0;j<N;j++)
-      sum += gru->recurrent_weights[N + j*stride + i]*state[j];
-    r[i] = sigmoid_approx(WEIGHTS_SCALE*sum);
+  for (i = 0; i < N; i++) {
+    float z_sum = gru->bias[i];
+    float r_sum = gru->bias[N + i];
+
+    for (j = 0; j < M; j++) {
+      /* Compute update gate. */
+      z_sum += gru->input_weights[j*stride + i]*input[j];
+      /* Compute reset gate. */
+      r_sum += gru->input_weights[N + j*stride + i]*input[j];
+    }
+    for (j = 0; j < N; j++) {
+      /* Compute update gate. */
+      z_sum += gru->recurrent_weights[j*stride + i]*state[j];
+      /* Compute reset gate. */
+      r_sum += gru->recurrent_weights[N + j*stride + i]*state[j];
+    }
+
+    z[i] = sigmoid_approx(WEIGHTS_SCALE*z_sum);
+    r[i] = sigmoid_approx(WEIGHTS_SCALE*r_sum);
   }
-  for (i=0;i<N;i++)
-  {
+  for (i = 0; i < N; i++) {
     /* Compute output. */
     float sum = gru->bias[2*N + i];
-    for (j=0;j<M;j++)
+    for (j = 0; j < M; j++)
       sum += gru->input_weights[2*N + j*stride + i]*input[j];
-    for (j=0;j<N;j++)
+    for (j = 0; j < N; j++)
       sum += gru->recurrent_weights[2*N + j*stride + i]*state[j]*r[j];
     if (gru->activation == ACTIVATION_SIGMOID) sum = sigmoid_approx(WEIGHTS_SCALE*sum);
     else if (gru->activation == ACTIVATION_TANH) sum = tansig_approx(WEIGHTS_SCALE*sum);
     else if (gru->activation == ACTIVATION_RELU) sum = relu(WEIGHTS_SCALE*sum);
     else *(int*)0=0;
-    h[i] = z[i]*state[i] + (1-z[i])*sum;
+    h[i] = z[i] * state[i] + (1 - z[i]) * sum;
   }
-  for (i=0;i<N;i++)
+  for (i = 0; i < N; i++)
     state[i] = h[i];
 }
 
 #define INPUT_SIZE 42
@@ -163,16 +421,16 @@ void compute_rnn(RNNState *rnn, float *gains, float *vad, const float *input) {
   float noise_input[MAX_NEURONS*3];
   float denoise_input[MAX_NEURONS*3];
   compute_dense(rnn->model->input_dense, dense_out, input);
-  compute_gru(rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
+  rnn->compute_gru_fct(rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
   compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
-  for (i=0;i<rnn->model->input_dense_size;i++) noise_input[i] = dense_out[i];
-  for (i=0;i<rnn->model->vad_gru_size;i++) noise_input[i+rnn->model->input_dense_size] = rnn->vad_gru_state[i];
-  for (i=0;i<INPUT_SIZE;i++) noise_input[i+rnn->model->input_dense_size+rnn->model->vad_gru_size] = input[i];
-  compute_gru(rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
-
-  for (i=0;i<rnn->model->vad_gru_size;i++) denoise_input[i] = rnn->vad_gru_state[i];
-  for (i=0;i<rnn->model->noise_gru_size;i++) denoise_input[i+rnn->model->vad_gru_size] = rnn->noise_gru_state[i];
-  for (i=0;i<INPUT_SIZE;i++) denoise_input[i+rnn->model->vad_gru_size+rnn->model->noise_gru_size] = input[i];
-  compute_gru(rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
+  for (i = 0; i < rnn->model->input_dense_size; i++) noise_input[i] = dense_out[i];
+  for (i = 0; i < rnn->model->vad_gru_size; i++) noise_input[i+rnn->model->input_dense_size] = rnn->vad_gru_state[i];
+  for (i = 0; i < INPUT_SIZE; i++) noise_input[i+rnn->model->input_dense_size+rnn->model->vad_gru_size] = input[i];
+  rnn->compute_gru_fct(rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
+
+  for (i = 0; i < rnn->model->vad_gru_size; i++) denoise_input[i] = rnn->vad_gru_state[i];
+  for (i = 0; i < rnn->model->noise_gru_size; i++) denoise_input[i+rnn->model->vad_gru_size] = rnn->noise_gru_state[i];
+  for (i = 0; i < INPUT_SIZE; i++) denoise_input[i+rnn->model->vad_gru_size+rnn->model->noise_gru_size] = input[i];
+  rnn->compute_gru_fct(rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
   compute_dense(rnn->model->denoise_output, gains, rnn->denoise_gru_state);
 }
diff --git a/src/rnn.h b/src/rnn.h
index 31b962fc..8c711f86 100644
--- a/src/rnn.h
+++ b/src/rnn.h
@@ -60,10 +60,16 @@ typedef struct {
 
 typedef struct RNNState RNNState;
 
+int is_avx2_supported();
+
 void compute_dense(const DenseLayer *layer, float *output, const float *input);
 
 void compute_gru(const GRULayer *gru, float *state, const float *input);
 
+#if defined(__AVX2__)
+void compute_gru_avx2(const GRULayer *gru, float *state, const float *input);
+#endif
+
 void compute_rnn(RNNState *rnn, float *gains, float *vad, const float *input);
 
 #endif /* RNN_H_ */
diff --git a/src/rnn_data.h b/src/rnn_data.h
index f2186fe0..b74798ac 100644
--- a/src/rnn_data.h
+++ b/src/rnn_data.h
@@ -28,6 +28,7 @@ struct RNNState {
   float *vad_gru_state;
   float *noise_gru_state;
   float *denoise_gru_state;
+  void (*compute_gru_fct)(const GRULayer *gru, float *state, const float *input);
 };
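
A note on the dispatch pattern above: the patch queries CPUID once in rnnoise_init() and stores the chosen kernel in the compute_gru_fct function pointer, so the per-frame path never re-checks CPU features. The sketch below shows the same init-time dispatch idea in isolation. It is illustrative only and not part of the patch: the kernel names are invented, and it uses GCC's __builtin_cpu_supports() in place of the patch's hand-rolled CPUID/XGETBV query. As in the patch, the AVX2 kernel must be compiled with AVX2 enabled (e.g. -mavx2 or -march=native) for the #if defined(__AVX2__) branch to exist at all.

    /* Illustrative sketch only -- not part of the patch. Kernel names are
     * hypothetical; the runtime check uses GCC's __builtin_cpu_supports(). */
    #include <stddef.h>

    typedef void (*scale_fn)(const float *in, float *out, size_t n);

    /* Portable fallback: works on any CPU. */
    static void scale2_scalar(const float *in, float *out, size_t n) {
      for (size_t i = 0; i < n; i++) out[i] = 2.f * in[i];
    }

    #if defined(__AVX2__)
    #include <immintrin.h>
    /* AVX2 kernel: 8 floats per iteration, scalar loop for the remainder. */
    static void scale2_avx2(const float *in, float *out, size_t n) {
      size_t i = 0;
      for (; i + 8 <= n; i += 8) {
        __m256 v = _mm256_loadu_ps(&in[i]);
        _mm256_storeu_ps(&out[i], _mm256_add_ps(v, v));
      }
      for (; i < n; i++) out[i] = 2.f * in[i];
    }
    #endif

    /* Called once at init time, like the compute_gru_fct assignment above. */
    static scale_fn select_scale_kernel(void) {
      scale_fn fn = &scale2_scalar;            /* safe default */
    #if defined(__AVX2__) && defined(__GNUC__)
      if (__builtin_cpu_supports("avx2"))      /* runtime CPU check */
        fn = &scale2_avx2;
    #endif
      return fn;
    }

Selecting the kernel once keeps feature detection out of the hot loop; the indirect call costs one pointer load per invocation, which is negligible next to the O(N*M) work inside each GRU evaluation.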