chromium/src/third_party/blink/renderer/modules/webaudio/cpu/arm/oscillator_kernel_neon.cc - manifest_repos/chromium_src - Git at Google

 // Copyright 2020 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"

 #include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"

 #if defined(CPU_ARM_NEON)
 #include <arm_neon.h>
 #endif

 namespace blink {

 #if defined(CPU_ARM_NEON)
 static float32x4_t WrapVirtualIndexVector(float32x4_t x,
                                           float32x4_t wave_size,
                                           float32x4_t inv_wave_size) {
   // r = x/wave_size, f = truncate(r), truncating towards 0
   const float32x4_t r = vmulq_f32(x, inv_wave_size);
   int32x4_t f = vcvtq_s32_f32(r);

   // vcltq_f32 returns returns all 0xfffffff (-1) if a < b and if if not.
   const uint32x4_t cmp = vcltq_f32(r, vcvtq_f32_s32(f));
   f = vaddq_s32(f, vreinterpretq_s32_u32(cmp));

   return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(f), wave_size));
 }

 std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
     int n,
     float* dest_p,
     double virtual_read_index,
     float frequency,
     float rate_scale) const {
   auto periodic_wave = periodic_wave_.Lock();
   const unsigned periodic_wave_size = periodic_wave->PeriodicWaveSize();
   const double inv_periodic_wave_size = 1.0 / periodic_wave_size;

   float* higher_wave_data = nullptr;
   float* lower_wave_data = nullptr;
   float table_interpolation_factor = 0;
   const float incr = frequency * rate_scale;
   DCHECK_GE(incr, kInterpolate2Point);

   periodic_wave->WaveDataForFundamentalFrequency(
       frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);

   const float32x4_t v_wave_size = vdupq_n_f32(periodic_wave_size);
   const float32x4_t v_inv_wave_size = vdupq_n_f32(1.0f / periodic_wave_size);

   const uint32x4_t v_read_mask = vdupq_n_u32(periodic_wave_size - 1);
   const uint32x4_t v_one = vdupq_n_u32(1);

   const float32x4_t v_table_factor = vdupq_n_f32(table_interpolation_factor);

   const float32x4_t v_incr = vdupq_n_f32(4 * incr);

   float32x4_t v_virt_index = {
       virtual_read_index + 0 * incr, virtual_read_index + 1 * incr,
       virtual_read_index + 2 * incr, virtual_read_index + 3 * incr};

   // Temporary arrsys to hold the read indices so we can access them
   // individually to get the samples needed for interpolation.
   uint32_t r0[4] __attribute__((aligned(16)));
   uint32_t r1[4] __attribute__((aligned(16)));

   // Temporary arrays where we can gather up the wave data we need for
   // interpolation.  Align these for best efficiency on older CPUs where aligned
   // access is much faster than unaliged.  TODO(rtoy): Is there a faster way to
   // do this?
   float sample1_lower[4] __attribute__((aligned(16)));
   float sample2_lower[4] __attribute__((aligned(16)));
   float sample1_higher[4] __attribute__((aligned(16)));
   float sample2_higher[4] __attribute__((aligned(16)));

   // It's possible that adding the incr above exceeded the bounds, so wrap them
   // if needed.
   v_virt_index =
       WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);

   int k = 0;
   int n_loops = n / 4;

   for (int loop = 0; loop < n_loops; ++loop, k += 4) {
     // Compute indices for the samples and contain within the valid range.
     const uint32x4_t read_index_0 =
         vandq_u32(vcvtq_u32_f32(v_virt_index), v_read_mask);
     const uint32x4_t read_index_1 =
         vandq_u32(vaddq_u32(read_index_0, v_one), v_read_mask);

     // Extract the components of the indices so we can get the samples
     // associated with the lower and higher wave data.
     vst1q_u32(r0, read_index_0);
     vst1q_u32(r1, read_index_1);

     for (int m = 0; m < 4; ++m) {
       sample1_lower[m] = lower_wave_data[r0[m]];
       sample2_lower[m] = lower_wave_data[r1[m]];
       sample1_higher[m] = higher_wave_data[r0[m]];
       sample2_higher[m] = higher_wave_data[r1[m]];
     }

     const float32x4_t s1_low = vld1q_f32(sample1_lower);
     const float32x4_t s2_low = vld1q_f32(sample2_lower);
     const float32x4_t s1_high = vld1q_f32(sample1_higher);
     const float32x4_t s2_high = vld1q_f32(sample2_higher);

     const float32x4_t interpolation_factor =
         vsubq_f32(v_virt_index, vcvtq_f32_u32(read_index_0));
     const float32x4_t sample_higher = vaddq_f32(
         s1_high, vmulq_f32(interpolation_factor, vsubq_f32(s2_high, s1_high)));
     const float32x4_t sample_lower = vaddq_f32(
         s1_low, vmulq_f32(interpolation_factor, vsubq_f32(s2_low, s1_low)));
     const float32x4_t sample = vaddq_f32(
         sample_higher,
         vmulq_f32(v_table_factor, vsubq_f32(sample_lower, sample_higher)));

     vst1q_f32(dest_p + k, sample);

     // Increment virtual read index and wrap virtualReadIndex into the range
     // 0 -> periodicWaveSize.
     v_virt_index = vaddq_f32(v_virt_index, v_incr);
     v_virt_index =
         WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
   }

   // There's a bit of round-off above, so update the index more accurately so at
   // least the next render starts over with a more accurate value.
   virtual_read_index += k * incr;
   virtual_read_index -=
       std::floor(virtual_read_index * inv_periodic_wave_size) *
       periodic_wave_size;

   return std::make_tuple(k, virtual_read_index);
 }

 static ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
                                              unsigned periodic_wave_size,
                                              double inv_periodic_wave_size) {
   return virtual_index -
          floor(virtual_index * inv_periodic_wave_size) * periodic_wave_size;
 }

 double OscillatorHandler::ProcessARateVectorKernel(
     float* destination,
     double virtual_read_index,
     const float* phase_increments,
     unsigned periodic_wave_size,
     const float* const lower_wave_data[4],
     const float* const higher_wave_data[4],
     const float table_interpolation_factor[4]) const {
   // See the scalar version in oscillator_node.cc for the basic algorithm.
   double inv_periodic_wave_size = 1.0 / periodic_wave_size;
   unsigned read_index_mask = periodic_wave_size - 1;

   // Accumulate the phase increments so we can set up the virtual read index
   // vector appropriately.  This must be a double to preserve accuracy and
   // to match the scalar version.
   double incr_sum[4];
   incr_sum[0] = phase_increments[0];
   for (int m = 1; m < 4; ++m) {
     incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
   }

   // It's really important for accuracy that we use doubles instead of
   // floats for the virtual_read_index.  Without this, we can only get some
   // 30-50 dB in the sweep tests instead of 100+ dB.
   //
   // Arm NEON doesn't have float64x2_t so we have to do this.  (Aarch64 has
   // float64x2_t.)
   double virt_index[4];
   virt_index[0] = virtual_read_index;
   virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0],
                                    periodic_wave_size, inv_periodic_wave_size);
   virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1],
                                    periodic_wave_size, inv_periodic_wave_size);
   virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2],
                                    periodic_wave_size, inv_periodic_wave_size);

   // The virtual indices we're working with now.
   const float32x4_t v_virt_index = {
       static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]),
       static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])};

   // Convert virtual index to actual index into wave data.
   const uint32x4_t v_read0 = vcvtq_u32_f32(v_virt_index);

   // v_read1 = v_read0 + 1, but wrap the index around, if needed.
   const uint32x4_t v_read1 = vandq_u32(vaddq_u32(v_read0, vdupq_n_u32(1)),
                                        vdupq_n_u32(read_index_mask));

   float sample1_lower[4] __attribute__((aligned(16)));
   float sample2_lower[4] __attribute__((aligned(16)));
   float sample1_higher[4] __attribute__((aligned(16)));
   float sample2_higher[4] __attribute__((aligned(16)));

   uint32_t read0[4] __attribute__((aligned(16)));
   uint32_t read1[4] __attribute__((aligned(16)));

   vst1q_u32(read0, v_read0);
   vst1q_u32(read1, v_read1);

   // Read the samples from the wave tables
   for (int m = 0; m < 4; ++m) {
     DCHECK_LT(read0[m], periodic_wave_size);
     DCHECK_LT(read1[m], periodic_wave_size);

     sample1_lower[m] = lower_wave_data[m][read0[m]];
     sample2_lower[m] = lower_wave_data[m][read1[m]];
     sample1_higher[m] = higher_wave_data[m][read0[m]];
     sample2_higher[m] = higher_wave_data[m][read1[m]];
   }

   // Compute factor for linear interpolation within a wave table.
   const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0));

   // Linearly interpolate between samples from the higher wave table.
   const float32x4_t sample_higher = vmlaq_f32(
       vld1q_f32(sample1_higher), v_factor,
       vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher)));

   // Linearly interpolate between samples from the lower wave table.
   const float32x4_t sample_lower =
       vmlaq_f32(vld1q_f32(sample1_lower), v_factor,
                 vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower)));

   // Linearly interpolate between wave tables to get the desired
   // output samples.
   const float32x4_t sample =
       vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor),
                 vsubq_f32(sample_lower, sample_higher));

   vst1q_f32(destination, sample);

   // Update the virtual_read_index appropriately and return it for the
   // next call.
   virtual_read_index =
       WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size,
                        inv_periodic_wave_size);

   return virtual_read_index;
 }
 #endif

 }  // namespace blink
	// Copyright 2020 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "third_party/blink/renderer/modules/webaudio/oscillator_node.h"

	#include "third_party/blink/renderer/modules/webaudio/periodic_wave.h"

	#if defined(CPU_ARM_NEON)
	#include <arm_neon.h>
	#endif

	namespace blink {

	#if defined(CPU_ARM_NEON)
	static float32x4_t WrapVirtualIndexVector(float32x4_t x,
	float32x4_t wave_size,
	float32x4_t inv_wave_size) {
	// r = x/wave_size, f = truncate(r), truncating towards 0
	const float32x4_t r = vmulq_f32(x, inv_wave_size);
	int32x4_t f = vcvtq_s32_f32(r);

	// vcltq_f32 returns returns all 0xfffffff (-1) if a < b and if if not.
	const uint32x4_t cmp = vcltq_f32(r, vcvtq_f32_s32(f));
	f = vaddq_s32(f, vreinterpretq_s32_u32(cmp));

	return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(f), wave_size));
	}

	std::tuple<int, double> OscillatorHandler::ProcessKRateVector(
	int n,
	float* dest_p,
	double virtual_read_index,
	float frequency,
	float rate_scale) const {
	auto periodic_wave = periodic_wave_.Lock();
	const unsigned periodic_wave_size = periodic_wave->PeriodicWaveSize();
	const double inv_periodic_wave_size = 1.0 / periodic_wave_size;

	float* higher_wave_data = nullptr;
	float* lower_wave_data = nullptr;
	float table_interpolation_factor = 0;
	const float incr = frequency * rate_scale;
	DCHECK_GE(incr, kInterpolate2Point);

	periodic_wave->WaveDataForFundamentalFrequency(
	frequency, lower_wave_data, higher_wave_data, table_interpolation_factor);

	const float32x4_t v_wave_size = vdupq_n_f32(periodic_wave_size);
	const float32x4_t v_inv_wave_size = vdupq_n_f32(1.0f / periodic_wave_size);

	const uint32x4_t v_read_mask = vdupq_n_u32(periodic_wave_size - 1);
	const uint32x4_t v_one = vdupq_n_u32(1);

	const float32x4_t v_table_factor = vdupq_n_f32(table_interpolation_factor);

	const float32x4_t v_incr = vdupq_n_f32(4 * incr);

	float32x4_t v_virt_index = {
	virtual_read_index + 0 * incr, virtual_read_index + 1 * incr,
	virtual_read_index + 2 * incr, virtual_read_index + 3 * incr};

	// Temporary arrsys to hold the read indices so we can access them
	// individually to get the samples needed for interpolation.
	uint32_t r0[4] __attribute__((aligned(16)));
	uint32_t r1[4] __attribute__((aligned(16)));

	// Temporary arrays where we can gather up the wave data we need for
	// interpolation. Align these for best efficiency on older CPUs where aligned
	// access is much faster than unaliged. TODO(rtoy): Is there a faster way to
	// do this?
	float sample1_lower[4] __attribute__((aligned(16)));
	float sample2_lower[4] __attribute__((aligned(16)));
	float sample1_higher[4] __attribute__((aligned(16)));
	float sample2_higher[4] __attribute__((aligned(16)));

	// It's possible that adding the incr above exceeded the bounds, so wrap them
	// if needed.
	v_virt_index =
	WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);

	int k = 0;
	int n_loops = n / 4;

	for (int loop = 0; loop < n_loops; ++loop, k += 4) {
	// Compute indices for the samples and contain within the valid range.
	const uint32x4_t read_index_0 =
	vandq_u32(vcvtq_u32_f32(v_virt_index), v_read_mask);
	const uint32x4_t read_index_1 =
	vandq_u32(vaddq_u32(read_index_0, v_one), v_read_mask);

	// Extract the components of the indices so we can get the samples
	// associated with the lower and higher wave data.
	vst1q_u32(r0, read_index_0);
	vst1q_u32(r1, read_index_1);

	for (int m = 0; m < 4; ++m) {
	sample1_lower[m] = lower_wave_data[r0[m]];
	sample2_lower[m] = lower_wave_data[r1[m]];
	sample1_higher[m] = higher_wave_data[r0[m]];
	sample2_higher[m] = higher_wave_data[r1[m]];
	}

	const float32x4_t s1_low = vld1q_f32(sample1_lower);
	const float32x4_t s2_low = vld1q_f32(sample2_lower);
	const float32x4_t s1_high = vld1q_f32(sample1_higher);
	const float32x4_t s2_high = vld1q_f32(sample2_higher);

	const float32x4_t interpolation_factor =
	vsubq_f32(v_virt_index, vcvtq_f32_u32(read_index_0));
	const float32x4_t sample_higher = vaddq_f32(
	s1_high, vmulq_f32(interpolation_factor, vsubq_f32(s2_high, s1_high)));
	const float32x4_t sample_lower = vaddq_f32(
	s1_low, vmulq_f32(interpolation_factor, vsubq_f32(s2_low, s1_low)));
	const float32x4_t sample = vaddq_f32(
	sample_higher,
	vmulq_f32(v_table_factor, vsubq_f32(sample_lower, sample_higher)));

	vst1q_f32(dest_p + k, sample);

	// Increment virtual read index and wrap virtualReadIndex into the range
	// 0 -> periodicWaveSize.
	v_virt_index = vaddq_f32(v_virt_index, v_incr);
	v_virt_index =
	WrapVirtualIndexVector(v_virt_index, v_wave_size, v_inv_wave_size);
	}

	// There's a bit of round-off above, so update the index more accurately so at
	// least the next render starts over with a more accurate value.
	virtual_read_index += k * incr;
	virtual_read_index -=
	std::floor(virtual_read_index * inv_periodic_wave_size) *
	periodic_wave_size;

	return std::make_tuple(k, virtual_read_index);
	}

	static ALWAYS_INLINE double WrapVirtualIndex(double virtual_index,
	unsigned periodic_wave_size,
	double inv_periodic_wave_size) {
	return virtual_index -
	floor(virtual_index * inv_periodic_wave_size) * periodic_wave_size;
	}

	double OscillatorHandler::ProcessARateVectorKernel(
	float* destination,
	double virtual_read_index,
	const float* phase_increments,
	unsigned periodic_wave_size,
	const float* const lower_wave_data[4],
	const float* const higher_wave_data[4],
	const float table_interpolation_factor[4]) const {
	// See the scalar version in oscillator_node.cc for the basic algorithm.
	double inv_periodic_wave_size = 1.0 / periodic_wave_size;
	unsigned read_index_mask = periodic_wave_size - 1;

	// Accumulate the phase increments so we can set up the virtual read index
	// vector appropriately. This must be a double to preserve accuracy and
	// to match the scalar version.
	double incr_sum[4];
	incr_sum[0] = phase_increments[0];
	for (int m = 1; m < 4; ++m) {
	incr_sum[m] = incr_sum[m - 1] + phase_increments[m];
	}

	// It's really important for accuracy that we use doubles instead of
	// floats for the virtual_read_index. Without this, we can only get some
	// 30-50 dB in the sweep tests instead of 100+ dB.
	//
	// Arm NEON doesn't have float64x2_t so we have to do this. (Aarch64 has
	// float64x2_t.)
	double virt_index[4];
	virt_index[0] = virtual_read_index;
	virt_index[1] = WrapVirtualIndex(virtual_read_index + incr_sum[0],
	periodic_wave_size, inv_periodic_wave_size);
	virt_index[2] = WrapVirtualIndex(virtual_read_index + incr_sum[1],
	periodic_wave_size, inv_periodic_wave_size);
	virt_index[3] = WrapVirtualIndex(virtual_read_index + incr_sum[2],
	periodic_wave_size, inv_periodic_wave_size);

	// The virtual indices we're working with now.
	const float32x4_t v_virt_index = {
	static_cast<float>(virt_index[0]), static_cast<float>(virt_index[1]),
	static_cast<float>(virt_index[2]), static_cast<float>(virt_index[3])};

	// Convert virtual index to actual index into wave data.
	const uint32x4_t v_read0 = vcvtq_u32_f32(v_virt_index);

	// v_read1 = v_read0 + 1, but wrap the index around, if needed.
	const uint32x4_t v_read1 = vandq_u32(vaddq_u32(v_read0, vdupq_n_u32(1)),
	vdupq_n_u32(read_index_mask));

	float sample1_lower[4] __attribute__((aligned(16)));
	float sample2_lower[4] __attribute__((aligned(16)));
	float sample1_higher[4] __attribute__((aligned(16)));
	float sample2_higher[4] __attribute__((aligned(16)));

	uint32_t read0[4] __attribute__((aligned(16)));
	uint32_t read1[4] __attribute__((aligned(16)));

	vst1q_u32(read0, v_read0);
	vst1q_u32(read1, v_read1);

	// Read the samples from the wave tables
	for (int m = 0; m < 4; ++m) {
	DCHECK_LT(read0[m], periodic_wave_size);
	DCHECK_LT(read1[m], periodic_wave_size);

	sample1_lower[m] = lower_wave_data[m][read0[m]];
	sample2_lower[m] = lower_wave_data[m][read1[m]];
	sample1_higher[m] = higher_wave_data[m][read0[m]];
	sample2_higher[m] = higher_wave_data[m][read1[m]];
	}

	// Compute factor for linear interpolation within a wave table.
	const float32x4_t v_factor = vsubq_f32(v_virt_index, vcvtq_f32_u32(v_read0));

	// Linearly interpolate between samples from the higher wave table.
	const float32x4_t sample_higher = vmlaq_f32(
	vld1q_f32(sample1_higher), v_factor,
	vsubq_f32(vld1q_f32(sample2_higher), vld1q_f32(sample1_higher)));

	// Linearly interpolate between samples from the lower wave table.
	const float32x4_t sample_lower =
	vmlaq_f32(vld1q_f32(sample1_lower), v_factor,
	vsubq_f32(vld1q_f32(sample2_lower), vld1q_f32(sample1_lower)));

	// Linearly interpolate between wave tables to get the desired
	// output samples.
	const float32x4_t sample =
	vmlaq_f32(sample_higher, vld1q_f32(table_interpolation_factor),
	vsubq_f32(sample_lower, sample_higher));

	vst1q_f32(destination, sample);

	// Update the virtual_read_index appropriately and return it for the
	// next call.
	virtual_read_index =
	WrapVirtualIndex(virtual_read_index + incr_sum[3], periodic_wave_size,
	inv_periodic_wave_size);

	return virtual_read_index;
	}
	#endif

	} // namespace blink