chromium/src/third_party/blink/renderer/platform/audio/cpu/arm/audio_delay_dsp_kernel_neon.cc - manifest_repos/chromium_src - Git at Google

 // Copyright 2016 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "third_party/blink/renderer/platform/audio/audio_delay_dsp_kernel.h"

 #include <arm_neon.h>

 #include <algorithm>

 namespace blink {

 #if defined(CPU_ARM_NEON)
 // Per-lane, branch-free form of
 //
 //   if (index >= buffer_length)
 //     index -= buffer_length;
 //
 // At most one buffer length is subtracted from each lane, so a lane is only
 // brought back into range when it is less than one buffer length past the
 // end.
 static ALWAYS_INLINE int32x4_t WrapIndexVector(int32x4_t v_write_index,
                                                int32x4_t v_buffer_length) {
   // All ones in every lane whose index has reached or passed the buffer
   // length, all zeros in the remaining lanes.
   const int32x4_t past_end_mask =
       reinterpret_cast<int32x4_t>(vcgeq_s32(v_write_index, v_buffer_length));

   // The mask keeps the buffer length in lanes that need wrapping and turns it
   // into 0 everywhere else.
   const int32x4_t correction = vandq_s32(past_end_mask, v_buffer_length);

   return vsubq_s32(v_write_index, correction);
 }

 // Per-lane, branch-free form of
 //
 //   if (position >= buffer_length)
 //     position -= buffer_length;
 //
 // for fractional (float) read positions.  At most one buffer length is
 // subtracted from each lane.
 static ALWAYS_INLINE float32x4_t
 WrapPositionVector(float32x4_t v_position, float32x4_t v_buffer_length) {
   // All ones in every lane whose position has reached or passed the buffer
   // length, all zeros in the remaining lanes.
   const uint32x4_t past_end_mask = vcgeq_f32(v_position, v_buffer_length);

   // Apply the mask to the raw bits of the buffer length.  Lanes that must
   // wrap keep the buffer length; the others become the all-zero bit pattern,
   // which is +0.0f.
   const uint32x4_t length_bits =
       reinterpret_cast<uint32x4_t>(v_buffer_length);
   const float32x4_t correction =
       reinterpret_cast<float32x4_t>(vandq_u32(length_bits, past_end_mask));

   return vsubq_f32(v_position, correction);
 }

 std::tuple<unsigned, int> AudioDelayDSPKernel::ProcessARateVector(
     float* destination,
     uint32_t frames_to_process) const {
   const int buffer_length = buffer_.size();
   const float* buffer = buffer_.Data();

   const float sample_rate = this->SampleRate();
   const float* delay_times = delay_times_.Data();

   int w_index = write_index_;

   const float32x4_t v_sample_rate = vdupq_n_f32(sample_rate);

   // The buffer length as a float and as an int so we don't need to constant
   // convert from one to the other.
   const float32x4_t v_buffer_length_float = vdupq_n_f32(buffer_length);
   const int32x4_t v_buffer_length_int = vdupq_n_s32(buffer_length);

   // How much to increment the write index each time through the loop.
   const int32x4_t v_incr = vdupq_n_s32(4);

   // Temp arrays for storing the samples needed for interpolation
   float sample1[4] __attribute((aligned(16)));
   float sample2[4] __attribute((aligned(16)));

   // Temp array for holding the indices so we can access them
   // individually.
   int read_index1[4] __attribute((aligned(16)));
   int read_index2[4] __attribute((aligned(16)));

   // Initialize the write index vector, and  wrap the values if needed.
   int32x4_t v_write_index = {w_index + 0, w_index + 1, w_index + 2,
                              w_index + 3};
   v_write_index = WrapIndexVector(v_write_index, v_buffer_length_int);

   int number_of_loops = frames_to_process / 4;
   int k = 0;

   for (int n = 0; n < number_of_loops; ++n, k += 4) {
     const float32x4_t v_delay_time = vld1q_f32(delay_times + k);
     const float32x4_t v_desired_delay_frames =
         vmulq_f32(v_delay_time, v_sample_rate);

     // read_position = write_index + buffer_length - desired_delay_frames.  Wrap
     // the position if needed.
     float32x4_t v_read_position =
         vaddq_f32(vcvtq_f32_s32(v_write_index),
                   vsubq_f32(v_buffer_length_float, v_desired_delay_frames));
     v_read_position =
         WrapPositionVector(v_read_position, v_buffer_length_float);

     // Get indices into the buffer for the samples we need for interpolation.
     const int32x4_t v_read_index1 = vcvtq_s32_f32(v_read_position);
     const int32x4_t v_read_index2 = WrapIndexVector(
         vaddq_s32(v_read_index1, vdupq_n_s32(1)), v_buffer_length_int);

     const float32x4_t interpolation_factor =
         vsubq_f32(v_read_position, vcvtq_f32_s32(v_read_index1));

     // Save indices so we can access the components individually for
     // getting the aamples from the buffer.
     vst1q_s32(read_index1, v_read_index1);
     vst1q_s32(read_index2, v_read_index2);

     for (int m = 0; m < 4; ++m) {
       sample1[m] = buffer[read_index1[m]];
       sample2[m] = buffer[read_index2[m]];
     }

     const float32x4_t v_sample1 = vld1q_f32(sample1);
     const float32x4_t v_sample2 = vld1q_f32(sample2);

     v_write_index = vaddq_s32(v_write_index, v_incr);
     v_write_index = WrapIndexVector(v_write_index, v_buffer_length_int);

     // Linear interpolation between samples.
     const float32x4_t sample = vaddq_f32(
         v_sample1,
         vmulq_f32(interpolation_factor, vsubq_f32(v_sample2, v_sample1)));
     vst1q_f32(destination + k, sample);
   }

   // Update |w_index| based on how many frames we processed here, wrapping
   // around if needed.
   w_index = write_index_ + k;
   if (w_index >= buffer_length)
     w_index -= buffer_length;

   return std::make_tuple(k, w_index);
 }

 void AudioDelayDSPKernel::HandleNaN(float* delay_times,
                                     uint32_t frames_to_process,
                                     float max_time) {
   unsigned k = 0;
   int number_of_loops = frames_to_process / 4;

   float32x4_t v_max_time = vdupq_n_f32(max_time);

   // This is approximately 4 times faster than the scalar version.
   for (int loop = 0; loop < number_of_loops; ++loop, k += 4) {
     float32x4_t x = vld1q_f32(delay_times + k);
     // x == x only fails when x is NaN.  Then cmp is set to 0. Otherwise
     // 0xffffffff
     uint32x4_t cmp = vceqq_f32(x, x);

     // Use cmp as a mask to set a component of x to 0 if x is NaN.
     // Otherwise, preserve x.  We pun the types here so we can apply
     // the  mask to the floating point numbers.  A integer value of
     // 0 corresponds to a floating-point +0.0, which is what we want.
     uint32x4_t xint = vandq_u32(cmp, reinterpret_cast<uint32x4_t>(x));

     // Invert the mask.
     cmp = vmvnq_u32(cmp);

     // More punning of the types so we can apply the complement mask
     // to set cmp to either max_time (if NaN) or 0 (otherwise)
     cmp = vandq_u32(cmp, reinterpret_cast<uint32x4_t>(v_max_time));

     // Merge i (bitwise or) x and cmp.  This makes x = max_time if x was NaN and
     // preserves x if not.  More type punning to do bitwise or the results
     // together.
     xint = vorrq_u32(xint, cmp);

     // Finally, save the float result.
     vst1q_f32(delay_times + k, reinterpret_cast<float32x4_t>(xint));
   }

   // Handle any frames not done in the loop above.
   for (; k < frames_to_process; ++k) {
     if (std::isnan(delay_times[k]))
       delay_times[k] = max_time;
   }
 }
 #endif

 }  // namespace blink
	// Copyright 2016 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "third_party/blink/renderer/platform/audio/audio_delay_dsp_kernel.h"

	#include <arm_neon.h>

	#include <algorithm>

	namespace blink {

	#if defined(CPU_ARM_NEON)
	// Per-lane, branch-free form of
	//
	//   if (index >= buffer_length)
	//     index -= buffer_length;
	//
	// At most one buffer length is subtracted from each lane, so a lane is only
	// brought back into range when it is less than one buffer length past the
	// end.
	static ALWAYS_INLINE int32x4_t WrapIndexVector(int32x4_t v_write_index,
	                                               int32x4_t v_buffer_length) {
	  // All ones in every lane whose index has reached or passed the buffer
	  // length, all zeros in the remaining lanes.
	  const int32x4_t past_end_mask =
	      reinterpret_cast<int32x4_t>(vcgeq_s32(v_write_index, v_buffer_length));

	  // The mask keeps the buffer length in lanes that need wrapping and turns it
	  // into 0 everywhere else.
	  const int32x4_t correction = vandq_s32(past_end_mask, v_buffer_length);

	  return vsubq_s32(v_write_index, correction);
	}

	// Per-lane, branch-free form of
	//
	//   if (position >= buffer_length)
	//     position -= buffer_length;
	//
	// for fractional (float) read positions.  At most one buffer length is
	// subtracted from each lane.
	static ALWAYS_INLINE float32x4_t
	WrapPositionVector(float32x4_t v_position, float32x4_t v_buffer_length) {
	  // All ones in every lane whose position has reached or passed the buffer
	  // length, all zeros in the remaining lanes.
	  const uint32x4_t past_end_mask = vcgeq_f32(v_position, v_buffer_length);

	  // Apply the mask to the raw bits of the buffer length.  Lanes that must
	  // wrap keep the buffer length; the others become the all-zero bit pattern,
	  // which is +0.0f.
	  const uint32x4_t length_bits =
	      reinterpret_cast<uint32x4_t>(v_buffer_length);
	  const float32x4_t correction =
	      reinterpret_cast<float32x4_t>(vandq_u32(length_bits, past_end_mask));

	  return vsubq_f32(v_position, correction);
	}

	std::tuple<unsigned, int> AudioDelayDSPKernel::ProcessARateVector(
	float* destination,
	uint32_t frames_to_process) const {
	const int buffer_length = buffer_.size();
	const float* buffer = buffer_.Data();

	const float sample_rate = this->SampleRate();
	const float* delay_times = delay_times_.Data();

	int w_index = write_index_;

	const float32x4_t v_sample_rate = vdupq_n_f32(sample_rate);

	// The buffer length as a float and as an int so we don't need to constant
	// convert from one to the other.
	const float32x4_t v_buffer_length_float = vdupq_n_f32(buffer_length);
	const int32x4_t v_buffer_length_int = vdupq_n_s32(buffer_length);

	// How much to increment the write index each time through the loop.
	const int32x4_t v_incr = vdupq_n_s32(4);

	// Temp arrays for storing the samples needed for interpolation
	float sample1[4] __attribute((aligned(16)));
	float sample2[4] __attribute((aligned(16)));

	// Temp array for holding the indices so we can access them
	// individually.
	int read_index1[4] __attribute((aligned(16)));
	int read_index2[4] __attribute((aligned(16)));

	// Initialize the write index vector, and wrap the values if needed.
	int32x4_t v_write_index = {w_index + 0, w_index + 1, w_index + 2,
	w_index + 3};
	v_write_index = WrapIndexVector(v_write_index, v_buffer_length_int);

	int number_of_loops = frames_to_process / 4;
	int k = 0;

	for (int n = 0; n < number_of_loops; ++n, k += 4) {
	const float32x4_t v_delay_time = vld1q_f32(delay_times + k);
	const float32x4_t v_desired_delay_frames =
	vmulq_f32(v_delay_time, v_sample_rate);

	// read_position = write_index + buffer_length - desired_delay_frames. Wrap
	// the position if needed.
	float32x4_t v_read_position =
	vaddq_f32(vcvtq_f32_s32(v_write_index),
	vsubq_f32(v_buffer_length_float, v_desired_delay_frames));
	v_read_position =
	WrapPositionVector(v_read_position, v_buffer_length_float);

	// Get indices into the buffer for the samples we need for interpolation.
	const int32x4_t v_read_index1 = vcvtq_s32_f32(v_read_position);
	const int32x4_t v_read_index2 = WrapIndexVector(
	vaddq_s32(v_read_index1, vdupq_n_s32(1)), v_buffer_length_int);

	const float32x4_t interpolation_factor =
	vsubq_f32(v_read_position, vcvtq_f32_s32(v_read_index1));

	// Save indices so we can access the components individually for
	// getting the aamples from the buffer.
	vst1q_s32(read_index1, v_read_index1);
	vst1q_s32(read_index2, v_read_index2);

	for (int m = 0; m < 4; ++m) {
	sample1[m] = buffer[read_index1[m]];
	sample2[m] = buffer[read_index2[m]];
	}

	const float32x4_t v_sample1 = vld1q_f32(sample1);
	const float32x4_t v_sample2 = vld1q_f32(sample2);

	v_write_index = vaddq_s32(v_write_index, v_incr);
	v_write_index = WrapIndexVector(v_write_index, v_buffer_length_int);

	// Linear interpolation between samples.
	const float32x4_t sample = vaddq_f32(
	v_sample1,
	vmulq_f32(interpolation_factor, vsubq_f32(v_sample2, v_sample1)));
	vst1q_f32(destination + k, sample);
	}

	// Update \|w_index\| based on how many frames we processed here, wrapping
	// around if needed.
	w_index = write_index_ + k;
	if (w_index >= buffer_length)
	w_index -= buffer_length;

	return std::make_tuple(k, w_index);
	}

	void AudioDelayDSPKernel::HandleNaN(float* delay_times,
	uint32_t frames_to_process,
	float max_time) {
	unsigned k = 0;
	int number_of_loops = frames_to_process / 4;

	float32x4_t v_max_time = vdupq_n_f32(max_time);

	// This is approximately 4 times faster than the scalar version.
	for (int loop = 0; loop < number_of_loops; ++loop, k += 4) {
	float32x4_t x = vld1q_f32(delay_times + k);
	// x == x only fails when x is NaN. Then cmp is set to 0. Otherwise
	// 0xffffffff
	uint32x4_t cmp = vceqq_f32(x, x);

	// Use cmp as a mask to set a component of x to 0 if x is NaN.
	// Otherwise, preserve x. We pun the types here so we can apply
	// the mask to the floating point numbers. A integer value of
	// 0 corresponds to a floating-point +0.0, which is what we want.
	uint32x4_t xint = vandq_u32(cmp, reinterpret_cast<uint32x4_t>(x));

	// Invert the mask.
	cmp = vmvnq_u32(cmp);

	// More punning of the types so we can apply the complement mask
	// to set cmp to either max_time (if NaN) or 0 (otherwise)
	cmp = vandq_u32(cmp, reinterpret_cast<uint32x4_t>(v_max_time));

	// Merge i (bitwise or) x and cmp. This makes x = max_time if x was NaN and
	// preserves x if not. More type punning to do bitwise or the results
	// together.
	xint = vorrq_u32(xint, cmp);

	// Finally, save the float result.
	vst1q_f32(delay_times + k, reinterpret_cast<float32x4_t>(xint));
	}

	// Handle any frames not done in the loop above.
	for (; k < frames_to_process; ++k) {
	if (std::isnan(delay_times[k]))
	delay_times[k] = max_time;
	}
	}
	#endif

	} // namespace blink