diff --git a/CMakeLists.txt b/CMakeLists.txt index fc45352..c92d5c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -407,6 +407,8 @@ set(audio_processing__aec3 "audio_processing/aec3/erl_estimator.h" "audio_processing/aec3/erle_estimator.cc" "audio_processing/aec3/erle_estimator.h" + "audio_processing/aec3/false_comfort_noise_generator.cc" + "audio_processing/aec3/false_comfort_noise_generator.h" "audio_processing/aec3/fft_buffer.cc" "audio_processing/aec3/fft_buffer.h" "audio_processing/aec3/fft_data.h" @@ -1134,32 +1136,6 @@ if("${CMAKE_VS_PLATFORM_NAME}" STREQUAL "x86") ) endif() - - - - - - - - - - - - - - - - - - - - - - - - - - set(PROJECT_NAME demo) ################################################################################ diff --git a/android/app/src/main/cpp/audio_processing/aec3/false_comfort_noise_generator.cc b/android/app/src/main/cpp/audio_processing/aec3/false_comfort_noise_generator.cc index 26ed834..1dabb0b 100644 --- a/android/app/src/main/cpp/audio_processing/aec3/false_comfort_noise_generator.cc +++ b/android/app/src/main/cpp/audio_processing/aec3/false_comfort_noise_generator.cc @@ -29,152 +29,165 @@ namespace webrtc { -namespace { + namespace { // Table of sqrt(2) * sin(2*pi*i/32). -constexpr float kSqrt2Sin[32] = { - +0.0000000f, +0.2758994f, +0.5411961f, +0.7856950f, +1.0000000f, - +1.1758756f, +1.3065630f, +1.3870398f, +1.4142136f, +1.3870398f, - +1.3065630f, +1.1758756f, +1.0000000f, +0.7856950f, +0.5411961f, - +0.2758994f, +0.0000000f, -0.2758994f, -0.5411961f, -0.7856950f, - -1.0000000f, -1.1758756f, -1.3065630f, -1.3870398f, -1.4142136f, - -1.3870398f, -1.3065630f, -1.1758756f, -1.0000000f, -0.7856950f, - -0.5411961f, -0.2758994f}; - -void GenerateComfortNoise(Aec3Optimization optimization, - const std::array& N2, - uint32_t* seed, - FftData* lower_band_noise, - FftData* upper_band_noise) { - FftData* N_low = lower_band_noise; - FftData* N_high = upper_band_noise; - - // Compute square root spectrum. - std::array N; - std::copy(N2.begin(), N2.end(), N.begin()); - aec3::VectorMath(optimization).Sqrt(N); - - // Compute the noise level for the upper bands. - constexpr float kOneByNumBands = 1.f / (kFftLengthBy2Plus1 / 2 + 1); - constexpr int kFftLengthBy2Plus1By2 = kFftLengthBy2Plus1 / 2; - const float high_band_noise_level = - std::accumulate(N.begin() + kFftLengthBy2Plus1By2, N.end(), 0.f) * - kOneByNumBands; - - // The analysis and synthesis windowing cause loss of power when - // cross-fading the noise where frames are completely uncorrelated - // (generated with random phase), hence the factor sqrt(2). - // This is not the case for the speech signal where the input is overlapping - // (strong correlation). - N_low->re[0] = N_low->re[kFftLengthBy2] = N_high->re[0] = - N_high->re[kFftLengthBy2] = 0.f; - for (size_t k = 1; k < kFftLengthBy2; k++) { - constexpr int kIndexMask = 32 - 1; - // Generate a random 31-bit integer. - seed[0] = (seed[0] * 69069 + 1) & (0x80000000 - 1); - // Convert to a 5-bit index. - int i = seed[0] >> 26; - - // y = sqrt(2) * sin(a) - const float x = kSqrt2Sin[i]; - // x = sqrt(2) * cos(a) = sqrt(2) * sin(a + pi/2) - const float y = kSqrt2Sin[(i + 8) & kIndexMask]; - - // Form low-frequency noise via spectral shaping. - N_low->re[k] = N[k] * x; - N_low->im[k] = N[k] * y; - - // Form the high-frequency noise via simple levelling. - N_high->re[k] = high_band_noise_level * x; - N_high->im[k] = high_band_noise_level * y; - } -} - -} // namespace - -FalseComfortNoiseGenerator::FalseComfortNoiseGenerator(Aec3Optimization optimization, - size_t num_capture_channels) - : optimization_(optimization), - seed_(42), - num_capture_channels_(num_capture_channels), - N2_initial_( - std::make_unique>>( - num_capture_channels_)), - Y2_smoothed_(num_capture_channels_), - N2_(num_capture_channels_) { - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - (*N2_initial_)[ch].fill(0.f); - Y2_smoothed_[ch].fill(0.f); - N2_[ch].fill(1.0e6f); - } -} - -FalseComfortNoiseGenerator::~FalseComfortNoiseGenerator() = default; - -void FalseComfortNoiseGenerator::Compute( - bool saturated_capture, - rtc::ArrayView> - capture_spectrum, - rtc::ArrayView lower_band_noise, - rtc::ArrayView upper_band_noise) { - const auto& Y2 = capture_spectrum; - - if (!saturated_capture) { - // Smooth Y2. - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - std::transform(Y2_smoothed_[ch].begin(), Y2_smoothed_[ch].end(), - Y2[ch].begin(), Y2_smoothed_[ch].begin(), - [](float a, float b) { return a + 0.1f * (b - a); }); - } + constexpr float kSqrt2Sin[32] = { + +0.0000000f, +0.2758994f, +0.5411961f, +0.7856950f, +1.0000000f, + +1.1758756f, +1.3065630f, +1.3870398f, +1.4142136f, +1.3870398f, + +1.3065630f, +1.1758756f, +1.0000000f, +0.7856950f, +0.5411961f, + +0.2758994f, +0.0000000f, -0.2758994f, -0.5411961f, -0.7856950f, + -1.0000000f, -1.1758756f, -1.3065630f, -1.3870398f, -1.4142136f, + -1.3870398f, -1.3065630f, -1.1758756f, -1.0000000f, -0.7856950f, + -0.5411961f, -0.2758994f}; + + void GenerateComfortNoise(Aec3Optimization optimization, + const std::array& N2, + uint32_t* seed, + FftData* lower_band_noise, + FftData* upper_band_noise) { + FftData* N_low = lower_band_noise; + FftData* N_high = upper_band_noise; + + // Compute square root spectrum. + std::array N; + std::copy(N2.begin(), N2.end(), N.begin()); + aec3::VectorMath(optimization).Sqrt(N); + + // Compute the noise level for the upper bands. + constexpr float kOneByNumBands = 1.f / (kFftLengthBy2Plus1 / 2 + 1); + constexpr int kFftLengthBy2Plus1By2 = kFftLengthBy2Plus1 / 2; + const float high_band_noise_level = + std::accumulate(N.begin() + kFftLengthBy2Plus1By2, N.end(), 0.f) * + kOneByNumBands; + + // The analysis and synthesis windowing cause loss of power when + // cross-fading the noise where frames are completely uncorrelated + // (generated with random phase), hence the factor sqrt(2). + // This is not the case for the speech signal where the input is overlapping + // (strong correlation). + N_low->re[0] = N_low->re[kFftLengthBy2] = N_high->re[0] = + N_high->re[kFftLengthBy2] = 0.f; + for (size_t k = 1; k < kFftLengthBy2; k++) { + constexpr int kIndexMask = 32 - 1; + // Generate a random 31-bit integer. + seed[0] = (seed[0] * 69069 + 1) & (0x80000000 - 1); + // Convert to a 5-bit index. + int i = seed[0] >> 26; + + // y = sqrt(2) * sin(a) + const float x = kSqrt2Sin[i]; + // x = sqrt(2) * cos(a) = sqrt(2) * sin(a + pi/2) + const float y = kSqrt2Sin[(i + 8) & kIndexMask]; + + // Form low-frequency noise via spectral shaping. + N_low->re[k] = N[k] * x; + N_low->im[k] = N[k] * y; + + // Form the high-frequency noise via simple levelling. + N_high->re[k] = high_band_noise_level * x; + N_high->im[k] = high_band_noise_level * y; + } + } - if (N2_counter_ > 50) { - // Update N2 from Y2_smoothed. + } // namespace + + FalseComfortNoiseGenerator::FalseComfortNoiseGenerator(Aec3Optimization optimization, + size_t num_capture_channels) + : optimization_(optimization), + seed_(42), + num_capture_channels_(num_capture_channels), + N2_initial_( + std::make_unique>>( + num_capture_channels_)), + Y2_smoothed_(num_capture_channels_), + N2_(num_capture_channels_) { for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - std::transform(N2_[ch].begin(), N2_[ch].end(), Y2_smoothed_[ch].begin(), - N2_[ch].begin(), [](float a, float b) { - return b < a ? (0.9f * b + 0.1f * a) * 1.0002f - : a * 1.0002f; - }); + (*N2_initial_)[ch].fill(0.f); + Y2_smoothed_[ch].fill(0.f); + N2_[ch].fill(1.0e6f); } } - if (N2_initial_) { - if (++N2_counter_ == 1000) { - N2_initial_.reset(); - } else { - // Compute the N2_initial from N2. + FalseComfortNoiseGenerator::~FalseComfortNoiseGenerator() = default; + + void FalseComfortNoiseGenerator::Compute( + bool saturated_capture, + rtc::ArrayView> + capture_spectrum, + rtc::ArrayView lower_band_noise, + rtc::ArrayView upper_band_noise) { + const auto& Y2 = capture_spectrum; + + if (!saturated_capture) { + // Smooth Y2. for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - std::transform(N2_[ch].begin(), N2_[ch].end(), - (*N2_initial_)[ch].begin(), (*N2_initial_)[ch].begin(), - [](float a, float b) { - return a > b ? b + 0.001f * (a - b) : a; - }); + std::transform(Y2_smoothed_[ch].begin(), Y2_smoothed_[ch].end(), + Y2[ch].begin(), Y2_smoothed_[ch].begin(), + [](float a, float b) { return a + 0.1f * (b - a); }); } - } - } - // Limit the noise to a floor matching a WGN input of -96 dBFS. - constexpr float kNoiseFloor = 17.1267f; + if (N2_counter_ > 50) { + // Update N2 from Y2_smoothed. + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + std::transform(N2_[ch].begin(), N2_[ch].end(), Y2_smoothed_[ch].begin(), + N2_[ch].begin(), [](float a, float b) { + return b < a ? (0.9f * b + 0.1f * a) * 1.0002f + : a * 1.0002f; + }); + } + } - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - for (auto& n : N2_[ch]) { - n = std::max(n, kNoiseFloor); - } - if (N2_initial_) { - for (auto& n : (*N2_initial_)[ch]) { - n = std::max(n, kNoiseFloor); + if (N2_initial_) { + if (++N2_counter_ == 1000) { + N2_initial_.reset(); + } else { + // Compute the N2_initial from N2. + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + std::transform(N2_[ch].begin(), N2_[ch].end(), + (*N2_initial_)[ch].begin(), (*N2_initial_)[ch].begin(), + [](float a, float b) { + return a > b ? b + 0.001f * (a - b) : a; + }); + } + } + } + + // Limit the noise to a floor matching a WGN input of -96 dBFS. + constexpr float kNoiseFloor = 17.1267f; + + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + for (auto& n : N2_[ch]) { + n = kNoiseFloor; + } + if (N2_initial_) { + for (auto& n : (*N2_initial_)[ch]) { + n = kNoiseFloor; + } + } + } + //Original problematic code kept here for posterity + /* + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + for (auto& n : N2_[ch]) { + n = std::max(n, kNoiseFloor); + } + if (N2_initial_) { + for (auto& n : (*N2_initial_)[ch]) { + n = std::max(n, kNoiseFloor); + } + } } + */ } - } - } - // Choose N2 estimate to use. - const auto& N2 = N2_initial_ ? (*N2_initial_) : N2_; + // Choose N2 estimate to use. + const auto& N2 = N2_initial_ ? (*N2_initial_) : N2_; - for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - //GenerateComfortNoise(optimization_, N2[ch], &seed_, &lower_band_noise[ch], - // &upper_band_noise[ch]); - } -} + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + GenerateComfortNoise(optimization_, N2[ch], &seed_, &lower_band_noise[ch], + &upper_band_noise[ch]); + } + } } // namespace webrtc diff --git a/audio_processing/aec3/echo_remover.cc b/audio_processing/aec3/echo_remover.cc index b37587f..21304b1 100644 --- a/audio_processing/aec3/echo_remover.cc +++ b/audio_processing/aec3/echo_remover.cc @@ -21,7 +21,8 @@ #include "audio_processing/aec3/aec3_common.h" #include "audio_processing/aec3/aec3_fft.h" #include "audio_processing/aec3/aec_state.h" -#include "audio_processing/aec3/comfort_noise_generator.h" +//#include "audio_processing/aec3/comfort_noise_generator.h" +#include "audio_processing/aec3/false_comfort_noise_generator.h" #include "audio_processing/aec3/echo_path_variability.h" #include "audio_processing/aec3/echo_remover_metrics.h" #include "audio_processing/aec3/fft_data.h" @@ -150,7 +151,7 @@ class EchoRemoverImpl final : public EchoRemover { const bool use_shadow_filter_output_; Subtractor subtractor_; SuppressionGain suppression_gain_; - ComfortNoiseGenerator cng_; + FalseComfortNoiseGenerator fcng_; SuppressionFilter suppression_filter_; RenderSignalAnalyzer render_signal_analyzer_; ResidualEchoEstimator residual_echo_estimator_; @@ -200,7 +201,7 @@ EchoRemoverImpl::EchoRemoverImpl(const EchoCanceller3Config& config, optimization_, sample_rate_hz, num_capture_channels), - cng_(optimization_, num_capture_channels_), + fcng_(optimization_, num_capture_channels_), suppression_filter_(optimization_, sample_rate_hz_, num_capture_channels_), @@ -394,7 +395,7 @@ void EchoRemoverImpl::ProcessCapture( residual_echo_estimator_.Estimate(aec_state_, *render_buffer, S2_linear, Y2, R2); - cng_.Compute(aec_state_.SaturatedCapture(), Y2, comfort_noise, + fcng_.Compute(aec_state_.SaturatedCapture(), Y2, comfort_noise, high_band_comfort_noise); if (aec_state_.UsableLinearEstimate()) { @@ -414,14 +415,14 @@ void EchoRemoverImpl::ProcessCapture( float high_bands_gain; std::array G; suppression_gain_.GetGain(nearend_spectrum, echo_spectrum, R2, - cng_.NoiseSpectrum(), render_signal_analyzer_, + fcng_.NoiseSpectrum(), render_signal_analyzer_, aec_state_, x, &high_bands_gain, &G); suppression_filter_.ApplyGain(comfort_noise, high_band_comfort_noise, G, high_bands_gain, Y_fft, y); // Update the metrics. - metrics_.Update(aec_state_, cng_.NoiseSpectrum()[0], G); + metrics_.Update(aec_state_, fcng_.NoiseSpectrum()[0], G); // Debug outputs for the purpose of development and analysis. data_dumper_->DumpWav("aec3_echo_estimate", kBlockSize, @@ -429,7 +430,7 @@ void EchoRemoverImpl::ProcessCapture( data_dumper_->DumpRaw("aec3_output", (*y)[0][0]); data_dumper_->DumpRaw("aec3_narrow_render", render_signal_analyzer_.NarrowPeakBand() ? 1 : 0); - data_dumper_->DumpRaw("aec3_N2", cng_.NoiseSpectrum()[0]); + data_dumper_->DumpRaw("aec3_N2", fcng_.NoiseSpectrum()[0]); data_dumper_->DumpRaw("aec3_suppressor_gain", G); data_dumper_->DumpWav("aec3_output", rtc::ArrayView(&(*y)[0][0][0], kBlockSize), diff --git a/audio_processing/aec3/false_comfort_noise_generator.cc b/audio_processing/aec3/false_comfort_noise_generator.cc index 26ed834..416ba8f 100644 --- a/audio_processing/aec3/false_comfort_noise_generator.cc +++ b/audio_processing/aec3/false_comfort_noise_generator.cc @@ -156,6 +156,18 @@ void FalseComfortNoiseGenerator::Compute( // Limit the noise to a floor matching a WGN input of -96 dBFS. constexpr float kNoiseFloor = 17.1267f; + for (size_t ch = 0; ch < num_capture_channels_; ++ch) { + for (auto& n : N2_[ch]) { + n = kNoiseFloor; + } + if (N2_initial_) { + for (auto& n : (*N2_initial_)[ch]) { + n = kNoiseFloor; + } + } + } + //Original problematic code kept here for posterity + /* for (size_t ch = 0; ch < num_capture_channels_; ++ch) { for (auto& n : N2_[ch]) { n = std::max(n, kNoiseFloor); @@ -166,14 +178,15 @@ void FalseComfortNoiseGenerator::Compute( } } } + */ } // Choose N2 estimate to use. const auto& N2 = N2_initial_ ? (*N2_initial_) : N2_; for (size_t ch = 0; ch < num_capture_channels_; ++ch) { - //GenerateComfortNoise(optimization_, N2[ch], &seed_, &lower_band_noise[ch], - // &upper_band_noise[ch]); + GenerateComfortNoise(optimization_, N2[ch], &seed_, &lower_band_noise[ch], + &upper_band_noise[ch]); } } diff --git a/demo/demo.cc b/demo/demo.cc index 8959fd1..c8d3f2f 100644 --- a/demo/demo.cc +++ b/demo/demo.cc @@ -81,7 +81,7 @@ int main(int argc, char* argv[]) if (ref_format != rec_format || ref_channels != rec_channels || ref_sample_rate != rec_sample_rate || - ref_bits_per_sample != ref_bits_per_sample) + ref_bits_per_sample != rec_bits_per_sample) { cerr << "ref file format != rec file format" << endl; return -1;