| // Copyright 2020 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #include "third_party/blink/renderer/modules/webaudio/oscillator_node.h" |
| |
| #include "third_party/blink/renderer/modules/webaudio/periodic_wave.h" |
| |
| #if defined(CPU_ARM_NEON) |
| #include <arm_neon.h> |
| #endif |
| |
| namespace blink { |
| |
| #if defined(CPU_ARM_NEON) |
| static float32x4_t WrapVirtualIndexVector(float32x4_t x, |
| float32x4_t wave_size, |
| float32x4_t inv_wave_size) { |
| // r = x/wave_size, f = truncate(r), truncating towards 0 |
| const float32x4_t r = vmulq_f32(x, inv_wave_size); |
| int32x4_t f = vcvtq_s32_f32(r); |
| |
| // vcltq_f32 returns returns all 0xfffffff (-1) if a < b and if if not. |
| const uint32x4_t cmp = vcltq_f32(r, vcvtq_f32_s32(f)); |
| f = vaddq_s32(f, vreinterpretq_s32_u32(cmp)); |
| |
| return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(f), wave_size)); |
| } |
| |
| std::tuple<int, double> OscillatorHandler::ProcessKRateVector( |
| int n, |
| float* dest_p, |
| double virtual_read_index, |
| float frequency, |
| float rate_scale) const { |
| auto periodic_wave = periodic_wave_.Lock(); |
| const unsigned periodic_wave_size = periodic_wave->PeriodicWaveSize(); |
| const double inv_periodic_wave_size = 1.0 / periodic_wave_size; |
| |
| float* higher_wave_data = nullptr; |
| float* lower_wave_data = nullptr; |
| float table_interpolation_factor = 0; |
| const float incr = frequency * rate_scale; |
| DCHECK_GE(incr, kInterpolate2Point); |
| |
| periodic_wave->WaveDataForFundamentalFrequency( |
| frequency, lower_wave_data, higher_wave_data, table_interpolation_factor); |
| |
| const float32x4_t v_wave_size = vdupq_n_f32(periodic_wave_size); |
| const float32x4_t v_inv_wave_size = vdupq_n_f32(1.0f / periodic_wave_size); |
| |
| const uint32x4_t v_read_mask = vdupq_n_u32(periodic_wave_size - 1); |
| const uint32x4_t v_one = vdupq_n_u32(1); |
| |
| const float32x4_t v_table_factor = vdupq_n_f32(table_interpolation_factor); |
| |
| const float32x4_t v_incr = vdupq_n_f32(4 * incr); |
| |
| float32x4_t v_virt_index = { |
| virtual_read_index + 0 * incr, virtual_read_index + 1 * incr, |
| virtual_read_index + 2 * incr, virtual_read_index + 3 * incr}; |
| |
| // Temporary arrsys to hold the read indices so we can access them |
| // individually to get the samples needed for interpolation. |
| uint32_t r0[4] __attribute__((aligned(16))); |
| uint32_t r1[4] __attribute__((aligned(16))); |
| |
| // Temporary arrays where we can gather up the wave data we need for |
| // interpolation. Align these for best efficiency on older CPUs where aligned |
| // access is much faster than unaliged. TODO(rtoy): Is there a faster way to |
| // do this? |
| float sample1_lower[4] __attribute__((aligned(16))); |
| float sample2_lower[4] __attribute__((aligned(16))); |
| float sample1_higher[4] __attribute__((aligned(16))); |
| float sample2_higher[4] __attribute__((aligned(16))); |
| |
| // It's possible that adding the incr above exceeded the bounds, so wrap them |
| // if needed. |
| v_virt_index = |
| WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size); |
| |
| int k = 0; |
| int n_loops = n / 4; |
| |
| for (int loop = 0; loop < n_loops; ++loop, k += 4) { |
| // Compute indices for the samples and contain within the valid range. |
| const uint32x4_t read_index_0 = |
| vandq_u32(vcvtq_u32_f32(v_virt_index), v_read_mask); |
| const uint32x4_t read_index_1 = |
| vandq_u32(vaddq_u32(read_index_0, v_one), v_read_mask); |
| |
| // Extract the components of the indices so we can get the samples |
| // associated with the lower and higher wave data. |
| vst1q_u32(r0, read_index_0); |
| vst1q_u32(r1, read_index_1); |
| |
| for (int m = 0; m < 4; ++m) { |
| sample1_lower[m] = lower_wave_data[r0[m]]; |
| sample2_lower[m] = lower_wave_data[r1[m]]; |
| sample1_higher[m] = higher_wave_data[r0[m]]; |
| sample2_higher[m] = higher_wave_data[r1[m]]; |
| } |
| |
| const float32x4_t s1_low = vld1q_f32(sample1_lower); |
| const float32x4_t s2_low = vld1q_f32(sample2_lower); |
| const float32x4_t s1_high = vld1q_f32(sample1_higher); |
| const float32x4_t s2_high = vld1q_f32(sample2_higher); |
| |
| const float32x4_t interpolation_factor = |
| vsubq_f32(v_virt_index, vcvtq_f32_u32(read_index_0)); |
| const float32x4_t sample_higher = vaddq_f32( |
| s1_high, vmulq_f32(interpolation_factor, vsubq_f32(s2_high, s1_high))); |
| const float32x4_t sample_lower = vaddq_f32( |
| s1_low, vmulq_f32(interpolation_factor, vsubq_f32(s2_low, s1_low))); |
| const float32x4_t sample = vaddq_f32( |
| sample_higher, |
| vmulq_f32(v_table_factor, vsubq_f32(sample_lower, sample_higher))); |
| |
| vst1q_f32(dest_p + k, sample); |
| |
| // Increment virtual read index and wrap virtualReadIndex into the range |
| // 0 -> periodicWaveSize. |
| v_virt_index = vaddq_f32(v_virt_index, v_incr); |
| v_virt_index = |
| WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size); |
| } |
| |
| // There's a bit of round-off above, so update the index more accurately so at |
| // least the next render starts over with a more accurate value. |
| virtual_read_index += k * incr; |
| virtual_read_index -= |
| std::floor(virtual_read_index * inv_periodic_wave_size) * |
| periodic_wave_size; |
| |
| return std::make_tuple(k, virtual_read_index); |
| } |
| |
// Scalar wrap of a virtual read index: returns
// virtual_index - floor(virtual_index * inv_periodic_wave_size) *
// periodic_wave_size.  Callers in this file pass 1.0 / periodic_wave_size as
// inv_periodic_wave_size.
static ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
                                             unsigned periodic_wave_size,
                                             double inv_periodic_wave_size) {
  // Whole number of wave periods contained in the index (negative for
  // negative indices, because floor rounds down).
  const double whole_periods =
      std::floor(virtual_index * inv_periodic_wave_size);
  return virtual_index - whole_periods * periodic_wave_size;
}
| |
| double OscillatorHandler::ProcessARateVectorKernel( |
| float* destination, |
| double virtual_read_index, |
| const float* phase_increments, |
| unsigned periodic_wave_size, |
| const float* const lower_wave_data[4], |
| const float* const higher_wave_data[4], |
| const float table_interpolation_factor[4]) const { |
| // See the scalar version in oscillator_node.cc for the basic algorithm. |
| double inv_periodic_wave_size = 1.0 / periodic_wave_size; |
| unsigned read_index_mask = periodic_wave_size - 1; |
| |
| // Accumulate the phase increments so we can set up the virtual read index |
| // vector appropriately. This must be a double to preserve accuracy and |
| // to match the scalar version. |
| double incr_sum[4]; |
| incr_sum[0] = phase_increments[0]; |
| for (int m = 1; m < 4; ++m) { |
| incr_sum[m] = incr_sum[m - 1] + phase_increments[m]; |
| } |
| |
| // It's really important for accuracy that we use doubles instead of |
| // floats for the virtual_read_index. Without this, we can only get some |
| // 30-50 dB in the sweep tests instead of 100+ dB. |
| // |
| // Arm NEON doesn't have float64x2_t so we have to do this. (Aarch64 has |
| // float64x2_t.) |
| double virt_index[4]; |
| virt_index[0] = virtual_read_index; |
| virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0], |
| periodic_wave_size, inv_periodic_wave_size); |
| virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1], |
| periodic_wave_size, inv_periodic_wave_size); |
| virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2], |
| periodic_wave_size, inv_periodic_wave_size); |
| |
| // The virtual indices we're working with now. |
| const float32x4_t v_virt_index = { |
| static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]), |
| static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])}; |
| |
| // Convert virtual index to actual index into wave data. |
| const uint32x4_t v_read0 = vcvtq_u32_f32(v_virt_index); |
| |
| // v_read1 = v_read0 + 1, but wrap the index around, if needed. |
| const uint32x4_t v_read1 = vandq_u32(vaddq_u32(v_read0, vdupq_n_u32(1)), |
| vdupq_n_u32(read_index_mask)); |
| |
| float sample1_lower[4] __attribute__((aligned(16))); |
| float sample2_lower[4] __attribute__((aligned(16))); |
| float sample1_higher[4] __attribute__((aligned(16))); |
| float sample2_higher[4] __attribute__((aligned(16))); |
| |
| uint32_t read0[4] __attribute__((aligned(16))); |
| uint32_t read1[4] __attribute__((aligned(16))); |
| |
| vst1q_u32(read0, v_read0); |
| vst1q_u32(read1, v_read1); |
| |
| // Read the samples from the wave tables |
| for (int m = 0; m < 4; ++m) { |
| DCHECK_LT(read0[m], periodic_wave_size); |
| DCHECK_LT(read1[m], periodic_wave_size); |
| |
| sample1_lower[m] = lower_wave_data[m][read0[m]]; |
| sample2_lower[m] = lower_wave_data[m][read1[m]]; |
| sample1_higher[m] = higher_wave_data[m][read0[m]]; |
| sample2_higher[m] = higher_wave_data[m][read1[m]]; |
| } |
| |
| // Compute factor for linear interpolation within a wave table. |
| const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0)); |
| |
| // Linearly interpolate between samples from the higher wave table. |
| const float32x4_t sample_higher = vmlaq_f32( |
| vld1q_f32(sample1_higher), v_factor, |
| vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher))); |
| |
| // Linearly interpolate between samples from the lower wave table. |
| const float32x4_t sample_lower = |
| vmlaq_f32(vld1q_f32(sample1_lower), v_factor, |
| vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower))); |
| |
| // Linearly interpolate between wave tables to get the desired |
| // output samples. |
| const float32x4_t sample = |
| vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor), |
| vsubq_f32(sample_lower, sample_higher)); |
| |
| vst1q_f32(destination, sample); |
| |
| // Update the virtual_read_index appropriately and return it for the |
| // next call. |
| virtual_read_index = |
| WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size, |
| inv_periodic_wave_size); |
| |
| return virtual_read_index; |
| } |
| #endif |
| |
| } // namespace blink |