| // Copyright 2016 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "third_party/blink/renderer/platform/audio/audio_delay_dsp_kernel.h" |
| |
| #include <arm_neon.h> |
| |
| #include <algorithm> |
| |
| namespace blink { |
| |
| #if defined(CPU_ARM_NEON) |
// Wraps each lane of |v_write_index| back into the buffer. Per lane this
// implements
//
//   if (write_index >= buffer_length)
//     write_index -= buffer_length;
//
// Only a single subtraction is applied, so a lane ends up in range only if it
// was less than twice the buffer length to begin with.
static ALWAYS_INLINE int32x4_t WrapIndexVector(int32x4_t v_write_index,
                                               int32x4_t v_buffer_length) {
  // Lanes with write_index >= buffer_length become 0xffffffff, the rest 0.
  const uint32x4_t past_end = vcgeq_s32(v_write_index, v_buffer_length);

  // Bitwise-and the mask with the buffer length to get either buffer_length
  // or 0 per lane. vreinterpretq_s32_u32 views the unsigned comparison mask as
  // signed lanes without changing any bits; it is the intrinsic meant for this
  // conversion, rather than a reinterpret_cast between vector types.
  const int32x4_t adjustment =
      vandq_s32(vreinterpretq_s32_u32(past_end), v_buffer_length);

  // Subtracting the adjustment wraps exactly the lanes that ran past the end.
  return vsubq_s32(v_write_index, adjustment);
}
| |
// Wraps each lane of |v_position| back into the buffer. Per lane this
// implements
//
//   if (position >= buffer_length)
//     position -= buffer_length;
//
// Only a single subtraction is applied per lane.
static ALWAYS_INLINE float32x4_t
WrapPositionVector(float32x4_t v_position, float32x4_t v_buffer_length) {
  // Lanes with position >= buffer_length become 0xffffffff, the rest 0.
  const uint32x4_t past_end = vcgeq_f32(v_position, v_buffer_length);

  // Bitwise-and the raw bits of buffer_length with the mask to get either
  // buffer_length or +0.0 per lane. The vreinterpretq intrinsics move between
  // the float and integer views of the same bits; they are the intrinsics
  // meant for this conversion, rather than a reinterpret_cast between vector
  // types.
  const float32x4_t adjustment = vreinterpretq_f32_u32(
      vandq_u32(vreinterpretq_u32_f32(v_buffer_length), past_end));

  // Subtracting the adjustment wraps exactly the lanes that ran past the end.
  return vsubq_f32(v_position, adjustment);
}
| |
// NEON implementation of a-rate delay processing. Frames are handled four at
// a time: for every frame the read position is computed from the write index
// and that frame's delay time, and the output is the linear interpolation of
// the two buffer samples on either side of that position.
//
// Only the largest multiple of four frames that fits in |frames_to_process|
// is handled here. Returns a tuple holding the number of frames written to
// |destination| and the write index after those frames (wrapped once around
// the buffer length).
std::tuple<unsigned, int> AudioDelayDSPKernel::ProcessARateVector(
    float* destination,
    uint32_t frames_to_process) const {
  const int delay_line_length = buffer_.size();
  const float* const delay_line = buffer_.Data();

  const float rate = this->SampleRate();
  const float* const delay_times = delay_times_.Data();

  const int first_write_index = write_index_;

  // Loop-invariant vectors. The buffer length is kept both as floats and as
  // ints so we don't have to constantly convert from one to the other.
  const float32x4_t v_rate = vdupq_n_f32(rate);
  const float32x4_t v_length_f32 =
      vdupq_n_f32(static_cast<float>(delay_line_length));
  const int32x4_t v_length_s32 = vdupq_n_s32(delay_line_length);
  const int32x4_t v_one = vdupq_n_s32(1);
  // Each iteration consumes four frames, so the write indices advance by 4.
  const int32x4_t v_step = vdupq_n_s32(4);

  // Scratch arrays used to move individual lanes between vector registers and
  // scalar code: the buffer indices to read, and the samples read from them.
  alignas(16) int lower_indices[4];
  alignas(16) int upper_indices[4];
  alignas(16) float lower_samples[4];
  alignas(16) float upper_samples[4];

  // Write indices for the first four frames, wrapped if they run off the end
  // of the buffer.
  const int32x4_t v_lane_offsets = {0, 1, 2, 3};
  int32x4_t v_write_index = WrapIndexVector(
      vaddq_s32(vdupq_n_s32(first_write_index), v_lane_offsets), v_length_s32);

  const int vector_iterations = frames_to_process / 4;
  const int frames_to_vectorize = vector_iterations * 4;

  int frame = 0;
  while (frame < frames_to_vectorize) {
    // Delay for each of the four frames, scaled by the sample rate to get the
    // delay in frames.
    const float32x4_t v_delay_frames =
        vmulq_f32(vld1q_f32(delay_times + frame), v_rate);

    // read_position = write_index + buffer_length - delay_frames, wrapped
    // back into the buffer if needed.
    const float32x4_t v_unwrapped_position =
        vaddq_f32(vcvtq_f32_s32(v_write_index),
                  vsubq_f32(v_length_f32, v_delay_frames));
    const float32x4_t v_position =
        WrapPositionVector(v_unwrapped_position, v_length_f32);

    // Buffer indices of the two samples used for interpolation: the integer
    // part of the position, and the next index (wrapped).
    const int32x4_t v_lower_index = vcvtq_s32_f32(v_position);
    const int32x4_t v_upper_index =
        WrapIndexVector(vaddq_s32(v_lower_index, v_one), v_length_s32);

    // Fractional part of the position; this is the interpolation weight.
    const float32x4_t v_fraction =
        vsubq_f32(v_position, vcvtq_f32_s32(v_lower_index));

    // Spill the indices to memory so each lane can be used to fetch its
    // samples from the buffer individually.
    vst1q_s32(lower_indices, v_lower_index);
    vst1q_s32(upper_indices, v_upper_index);
    for (int lane = 0; lane < 4; ++lane) {
      lower_samples[lane] = delay_line[lower_indices[lane]];
      upper_samples[lane] = delay_line[upper_indices[lane]];
    }
    const float32x4_t v_lower = vld1q_f32(lower_samples);
    const float32x4_t v_upper = vld1q_f32(upper_samples);

    // Linear interpolation: lower + fraction * (upper - lower).
    const float32x4_t v_output =
        vaddq_f32(v_lower, vmulq_f32(v_fraction, vsubq_f32(v_upper, v_lower)));
    vst1q_f32(destination + frame, v_output);

    // Advance the write indices to the next group of four frames.
    v_write_index =
        WrapIndexVector(vaddq_s32(v_write_index, v_step), v_length_s32);
    frame += 4;
  }

  // Write index after the frames processed above, wrapping around if needed.
  int next_write_index = write_index_ + frame;
  if (next_write_index >= delay_line_length)
    next_write_index -= delay_line_length;

  return std::make_tuple(frame, next_write_index);
}
| |
// Replaces, in place, every NaN among the first |frames_to_process| entries
// of |delay_times| with |max_time|. Entries that are not NaN are left
// untouched.
void AudioDelayDSPKernel::HandleNaN(float* delay_times,
                                    uint32_t frames_to_process,
                                    float max_time) {
  unsigned k = 0;
  const int number_of_loops = frames_to_process / 4;

  const float32x4_t v_max_time = vdupq_n_f32(max_time);

  // Four frames per iteration. This is approximately 4 times faster than the
  // scalar version.
  for (int loop = 0; loop < number_of_loops; ++loop, k += 4) {
    const float32x4_t x = vld1q_f32(delay_times + k);

    // x == x only fails when x is NaN, so the mask is 0 for NaN lanes and
    // 0xffffffff for every other lane.
    const uint32x4_t is_not_nan = vceqq_f32(x, x);

    // Bitwise select: take the bits of x where the mask is set and the bits of
    // max_time where it is clear. That is (mask & x) | (~mask & max_time) in a
    // single intrinsic, with no type punning of the float vectors needed.
    const float32x4_t result = vbslq_f32(is_not_nan, x, v_max_time);

    vst1q_f32(delay_times + k, result);
  }

  // Handle any frames not done in the loop above.
  for (; k < frames_to_process; ++k) {
    if (std::isnan(delay_times[k]))
      delay_times[k] = max_time;
  }
}
| #endif |
| |
| } // namespace blink |