// chromium/src/third_party/blink/renderer/platform/audio/cpu/arm/vector_math_neon.h - manifest_repos/chromium_src - Git at Google

 // Copyright 2017 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_
 #define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_

 #include <arm_neon.h>

 #include <algorithm>

 #include "third_party/blink/renderer/platform/audio/vector_math_scalar.h"

 namespace blink {
 namespace vector_math {
 namespace neon {

 // TODO: Consider optimizing this.
 // This header defines no NEON version of Conv; the scalar implementation is
 // re-exported under neon:: as-is.
 using scalar::Conv;

 // Adds two float arrays frame by frame and stores the sums at |dest_p|.
 //
 // When every stride is 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vadd. With any other stride the whole request is forwarded to
 // scalar::Vadd untouched.
 static ALWAYS_INLINE void Vadd(const float* source1p,
                                int source_stride1,
                                const float* source2p,
                                int source_stride2,
                                float* dest_p,
                                int dest_stride,
                                uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
       const float32x4_t lhs = vld1q_f32(source1p);
       const float32x4_t rhs = vld1q_f32(source2p);
       vst1q_f32(dest_p, vaddq_f32(lhs, rhs));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
                dest_stride, remaining);
 }

 // Subtracts |source2p| from |source1p| frame by frame and stores the
 // differences at |dest_p|.
 //
 // When every stride is 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vsub. With any other stride the whole request is forwarded to
 // scalar::Vsub untouched.
 static ALWAYS_INLINE void Vsub(const float* source1p,
                                int source_stride1,
                                const float* source2p,
                                int source_stride2,
                                float* dest_p,
                                int dest_stride,
                                uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
       const float32x4_t minuend = vld1q_f32(source1p);
       const float32x4_t subtrahend = vld1q_f32(source2p);
       vst1q_f32(dest_p, vsubq_f32(minuend, subtrahend));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vsub(source1p, source_stride1, source2p, source_stride2, dest_p,
                dest_stride, remaining);
 }

 // Clamps each source frame into [*low_threshold_p, *high_threshold_p] and
 // stores the result at |dest_p|.
 //
 // When both strides are 1, frames are handled four at a time with NEON
 // (min against the high bound, then max against the low bound) and only the
 // leftover (frames_to_process % 4) frames are passed on to scalar::Vclip.
 // With any other stride the whole request is forwarded to scalar::Vclip.
 static ALWAYS_INLINE void Vclip(const float* source_p,
                                 int source_stride,
                                 const float* low_threshold_p,
                                 const float* high_threshold_p,
                                 float* dest_p,
                                 int dest_stride,
                                 uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1 && dest_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     // Broadcast each bound into all four lanes once, outside the loop.
     const float32x4_t low = vdupq_n_f32(*low_threshold_p);
     const float32x4_t high = vdupq_n_f32(*high_threshold_p);
     for (; dest_p < end_p; source_p += 4, dest_p += 4) {
       const float32x4_t capped = vminq_f32(vld1q_f32(source_p), high);
       vst1q_f32(dest_p, vmaxq_f32(capped, low));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vclip(source_p, source_stride, low_threshold_p, high_threshold_p,
                 dest_p, dest_stride, remaining);
 }

 // Raises *max_p to the largest absolute value found in the source frames;
 // the incoming *max_p takes part in the comparison as the starting value.
 //
 // When the stride is 1, frames are handled four at a time with NEON and the
 // four per-lane maxima are reduced into *max_p; only the leftover
 // (frames_to_process % 4) frames are then passed on to scalar::Vmaxmgv. With
 // any other stride the whole request is forwarded to scalar::Vmaxmgv.
 static ALWAYS_INLINE void Vmaxmgv(const float* source_p,
                                   int source_stride,
                                   float* max_p,
                                   uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = source_p + (remaining - tail_frames);

     // Seed every lane with the caller's running maximum.
     float32x4_t four_max = vdupq_n_f32(*max_p);
     for (; source_p < end_p; source_p += 4) {
       const float32x4_t magnitudes = vabsq_f32(vld1q_f32(source_p));
       four_max = vmaxq_f32(four_max, magnitudes);
     }

     // Horizontal reduction: 4 lanes -> 2 lanes -> 1 scalar.
     const float32x2_t two_max =
         vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
     float group_max[2];
     vst1_f32(group_max, two_max);
     *max_p = std::max(group_max[0], group_max[1]);

     remaining = tail_frames;
   }

   // |source_p| now references the first frame the NEON loop did not handle.
   scalar::Vmaxmgv(source_p, source_stride, max_p, remaining);
 }

 // Multiplies two float arrays frame by frame and stores the products at
 // |dest_p|.
 //
 // When every stride is 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vmul. With any other stride the whole request is forwarded to
 // scalar::Vmul untouched.
 static ALWAYS_INLINE void Vmul(const float* source1p,
                                int source_stride1,
                                const float* source2p,
                                int source_stride2,
                                float* dest_p,
                                int dest_stride,
                                uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
       const float32x4_t lhs = vld1q_f32(source1p);
       const float32x4_t rhs = vld1q_f32(source2p);
       vst1q_f32(dest_p, vmulq_f32(lhs, rhs));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
                dest_stride, remaining);
 }

 // Scalar multiply-accumulate: adds source * (*scale) onto the frames already
 // stored at |dest_p|.
 //
 // When both strides are 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vsma. With any other stride the whole request is forwarded to
 // scalar::Vsma untouched.
 static ALWAYS_INLINE void Vsma(const float* source_p,
                                int source_stride,
                                const float* scale,
                                float* dest_p,
                                int dest_stride,
                                uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1 && dest_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     // Broadcast the scale factor into all four lanes once.
     const float32x4_t k = vdupq_n_f32(*scale);
     for (; dest_p < end_p; source_p += 4, dest_p += 4) {
       const float32x4_t source = vld1q_f32(source_p);
       const float32x4_t accumulated = vld1q_f32(dest_p);
       // accumulated + source * k
       vst1q_f32(dest_p, vmlaq_f32(accumulated, source, k));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vsma(source_p, source_stride, scale, dest_p, dest_stride, remaining);
 }

 // Multiplies every source frame by *scale and stores the products at
 // |dest_p|.
 //
 // When both strides are 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vsmul. With any other stride the whole request is forwarded to
 // scalar::Vsmul untouched.
 static ALWAYS_INLINE void Vsmul(const float* source_p,
                                 int source_stride,
                                 const float* scale,
                                 float* dest_p,
                                 int dest_stride,
                                 uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1 && dest_stride == 1) {
     // Read the factor once; vmulq_n_f32 applies it to all four lanes.
     const float k = *scale;
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     for (; dest_p < end_p; source_p += 4, dest_p += 4) {
       vst1q_f32(dest_p, vmulq_n_f32(vld1q_f32(source_p), k));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride,
                 remaining);
 }

 // Adds *addend to every source frame and stores the sums at |dest_p|.
 //
 // When both strides are 1, frames are handled four at a time with NEON and
 // only the leftover (frames_to_process % 4) frames are passed on to
 // scalar::Vsadd. With any other stride the whole request is forwarded to
 // scalar::Vsadd untouched.
 static ALWAYS_INLINE void Vsadd(const float* source_p,
                                 int source_stride,
                                 const float* addend,
                                 float* dest_p,
                                 int dest_stride,
                                 uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1 && dest_stride == 1) {
     // Load the addend once, replicated into all four lanes.
     const float32x4_t k = vld1q_dup_f32(addend);
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = dest_p + (remaining - tail_frames);

     for (; dest_p < end_p; source_p += 4, dest_p += 4) {
       vst1q_f32(dest_p, vaddq_f32(vld1q_f32(source_p), k));
     }
     remaining = tail_frames;
   }

   // The pointers now reference the first frame the NEON loop did not handle.
   scalar::Vsadd(source_p, source_stride, addend, dest_p, dest_stride,
                 remaining);
 }

 // Accumulates the sum of squares of the source frames into *sum_p (the
 // incoming value of *sum_p is added to, not overwritten).
 //
 // When the stride is 1, frames are handled four at a time with NEON and the
 // four per-lane partial sums are folded into *sum_p; only the leftover
 // (frames_to_process % 4) frames are then passed on to scalar::Vsvesq. With
 // any other stride the whole request is forwarded to scalar::Vsvesq.
 static ALWAYS_INLINE void Vsvesq(const float* source_p,
                                  int source_stride,
                                  float* sum_p,
                                  uint32_t frames_to_process) {
   // Stay unsigned: narrowing the count to int would make values above
   // INT_MAX negative, which skips the vector loop and passes a negative tail
   // count on to the scalar fallback.
   uint32_t remaining = frames_to_process;

   if (source_stride == 1) {
     const uint32_t tail_frames = remaining % 4;
     // One past the last frame that belongs to a complete group of four.
     const float* const end_p = source_p + (remaining - tail_frames);

     float32x4_t four_sum = vdupq_n_f32(0);
     for (; source_p < end_p; source_p += 4) {
       const float32x4_t source = vld1q_f32(source_p);
       // four_sum + source * source, lane by lane.
       four_sum = vmlaq_f32(four_sum, source, source);
     }

     // Horizontal reduction: 4 lanes -> 2 lanes -> 1 scalar.
     const float32x2_t two_sum =
         vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
     float group_sum[2];
     vst1_f32(group_sum, two_sum);
     *sum_p += group_sum[0] + group_sum[1];

     remaining = tail_frames;
   }

   // |source_p| now references the first frame the NEON loop did not handle.
   scalar::Vsvesq(source_p, source_stride, sum_p, remaining);
 }

 // Complex multiply on split real/imaginary arrays:
 //   real_dest = real1 * real2 - imag1 * imag2
 //   imag_dest = real1 * imag2 + imag1 * real2
 //
 // Complete groups of four frames are computed with NEON; the remaining
 // (frames_to_process % 4) frames are delegated to scalar::Zvmul.
 static ALWAYS_INLINE void Zvmul(const float* real1p,
                                 const float* imag1p,
                                 const float* real2p,
                                 const float* imag2p,
                                 float* real_dest_p,
                                 float* imag_dest_p,
                                 uint32_t frames_to_process) {
   // Number of frames covered by complete groups of four.
   const unsigned vector_frames = frames_to_process - frames_to_process % 4;

   unsigned offset = 0;
   for (; offset < vector_frames; offset += 4) {
     const float32x4_t re_a = vld1q_f32(real1p + offset);
     const float32x4_t re_b = vld1q_f32(real2p + offset);
     const float32x4_t im_a = vld1q_f32(imag1p + offset);
     const float32x4_t im_b = vld1q_f32(imag2p + offset);

     // Start from the real1-based product, then fold in the imag1-based one.
     const float32x4_t re_product = vmulq_f32(re_a, re_b);
     const float32x4_t im_product = vmulq_f32(re_a, im_b);
     const float32x4_t re_out = vmlsq_f32(re_product, im_a, im_b);
     const float32x4_t im_out = vmlaq_f32(im_product, im_a, re_b);

     vst1q_f32(real_dest_p + offset, re_out);
     vst1q_f32(imag_dest_p + offset, im_out);
   }

   // |offset| is the index of the first frame the NEON loop did not handle.
   scalar::Zvmul(real1p + offset, imag1p + offset, real2p + offset,
                 imag2p + offset, real_dest_p + offset, imag_dest_p + offset,
                 frames_to_process - offset);
 }

 }  // namespace neon
 }  // namespace vector_math
 }  // namespace blink

 #endif  // THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_
	// Copyright 2017 The Chromium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_
	#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_

	#include <arm_neon.h>

	#include <algorithm>

	#include "third_party/blink/renderer/platform/audio/vector_math_scalar.h"

	namespace blink {
	namespace vector_math {
	namespace neon {

	// TODO: Consider optimizing this.
	// This header defines no NEON version of Conv; the scalar implementation is
	// re-exported under neon:: as-is.
	using scalar::Conv;

	// Adds two float arrays frame by frame and stores the sums at |dest_p|.
	//
	// When every stride is 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vadd. With any other stride the whole request is forwarded to
	// scalar::Vadd untouched.
	static ALWAYS_INLINE void Vadd(const float* source1p,
	                               int source_stride1,
	                               const float* source2p,
	                               int source_stride2,
	                               float* dest_p,
	                               int dest_stride,
	                               uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
	      const float32x4_t lhs = vld1q_f32(source1p);
	      const float32x4_t rhs = vld1q_f32(source2p);
	      vst1q_f32(dest_p, vaddq_f32(lhs, rhs));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p,
	               dest_stride, remaining);
	}

	// Subtracts |source2p| from |source1p| frame by frame and stores the
	// differences at |dest_p|.
	//
	// When every stride is 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vsub. With any other stride the whole request is forwarded to
	// scalar::Vsub untouched.
	static ALWAYS_INLINE void Vsub(const float* source1p,
	                               int source_stride1,
	                               const float* source2p,
	                               int source_stride2,
	                               float* dest_p,
	                               int dest_stride,
	                               uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
	      const float32x4_t minuend = vld1q_f32(source1p);
	      const float32x4_t subtrahend = vld1q_f32(source2p);
	      vst1q_f32(dest_p, vsubq_f32(minuend, subtrahend));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vsub(source1p, source_stride1, source2p, source_stride2, dest_p,
	               dest_stride, remaining);
	}

	// Clamps each source frame into [*low_threshold_p, *high_threshold_p] and
	// stores the result at |dest_p|.
	//
	// When both strides are 1, frames are handled four at a time with NEON
	// (min against the high bound, then max against the low bound) and only the
	// leftover (frames_to_process % 4) frames are passed on to scalar::Vclip.
	// With any other stride the whole request is forwarded to scalar::Vclip.
	static ALWAYS_INLINE void Vclip(const float* source_p,
	                                int source_stride,
	                                const float* low_threshold_p,
	                                const float* high_threshold_p,
	                                float* dest_p,
	                                int dest_stride,
	                                uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1 && dest_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    // Broadcast each bound into all four lanes once, outside the loop.
	    const float32x4_t low = vdupq_n_f32(*low_threshold_p);
	    const float32x4_t high = vdupq_n_f32(*high_threshold_p);
	    for (; dest_p < end_p; source_p += 4, dest_p += 4) {
	      const float32x4_t capped = vminq_f32(vld1q_f32(source_p), high);
	      vst1q_f32(dest_p, vmaxq_f32(capped, low));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vclip(source_p, source_stride, low_threshold_p, high_threshold_p,
	                dest_p, dest_stride, remaining);
	}

	// Raises *max_p to the largest absolute value found in the source frames;
	// the incoming *max_p takes part in the comparison as the starting value.
	//
	// When the stride is 1, frames are handled four at a time with NEON and the
	// four per-lane maxima are reduced into *max_p; only the leftover
	// (frames_to_process % 4) frames are then passed on to scalar::Vmaxmgv. With
	// any other stride the whole request is forwarded to scalar::Vmaxmgv.
	static ALWAYS_INLINE void Vmaxmgv(const float* source_p,
	                                  int source_stride,
	                                  float* max_p,
	                                  uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = source_p + (remaining - tail_frames);

	    // Seed every lane with the caller's running maximum.
	    float32x4_t four_max = vdupq_n_f32(*max_p);
	    for (; source_p < end_p; source_p += 4) {
	      const float32x4_t magnitudes = vabsq_f32(vld1q_f32(source_p));
	      four_max = vmaxq_f32(four_max, magnitudes);
	    }

	    // Horizontal reduction: 4 lanes -> 2 lanes -> 1 scalar.
	    const float32x2_t two_max =
	        vmax_f32(vget_low_f32(four_max), vget_high_f32(four_max));
	    float group_max[2];
	    vst1_f32(group_max, two_max);
	    *max_p = std::max(group_max[0], group_max[1]);

	    remaining = tail_frames;
	  }

	  // |source_p| now references the first frame the NEON loop did not handle.
	  scalar::Vmaxmgv(source_p, source_stride, max_p, remaining);
	}

	// Multiplies two float arrays frame by frame and stores the products at
	// |dest_p|.
	//
	// When every stride is 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vmul. With any other stride the whole request is forwarded to
	// scalar::Vmul untouched.
	static ALWAYS_INLINE void Vmul(const float* source1p,
	                               int source_stride1,
	                               const float* source2p,
	                               int source_stride2,
	                               float* dest_p,
	                               int dest_stride,
	                               uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    for (; dest_p < end_p; source1p += 4, source2p += 4, dest_p += 4) {
	      const float32x4_t lhs = vld1q_f32(source1p);
	      const float32x4_t rhs = vld1q_f32(source2p);
	      vst1q_f32(dest_p, vmulq_f32(lhs, rhs));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p,
	               dest_stride, remaining);
	}

	// Scalar multiply-accumulate: adds source * (*scale) onto the frames already
	// stored at |dest_p|.
	//
	// When both strides are 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vsma. With any other stride the whole request is forwarded to
	// scalar::Vsma untouched.
	static ALWAYS_INLINE void Vsma(const float* source_p,
	                               int source_stride,
	                               const float* scale,
	                               float* dest_p,
	                               int dest_stride,
	                               uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1 && dest_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    // Broadcast the scale factor into all four lanes once.
	    const float32x4_t k = vdupq_n_f32(*scale);
	    for (; dest_p < end_p; source_p += 4, dest_p += 4) {
	      const float32x4_t source = vld1q_f32(source_p);
	      const float32x4_t accumulated = vld1q_f32(dest_p);
	      // accumulated + source * k
	      vst1q_f32(dest_p, vmlaq_f32(accumulated, source, k));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vsma(source_p, source_stride, scale, dest_p, dest_stride, remaining);
	}

	// Multiplies every source frame by *scale and stores the products at
	// |dest_p|.
	//
	// When both strides are 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vsmul. With any other stride the whole request is forwarded to
	// scalar::Vsmul untouched.
	static ALWAYS_INLINE void Vsmul(const float* source_p,
	                                int source_stride,
	                                const float* scale,
	                                float* dest_p,
	                                int dest_stride,
	                                uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1 && dest_stride == 1) {
	    // Read the factor once; vmulq_n_f32 applies it to all four lanes.
	    const float k = *scale;
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    for (; dest_p < end_p; source_p += 4, dest_p += 4) {
	      vst1q_f32(dest_p, vmulq_n_f32(vld1q_f32(source_p), k));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride,
	                remaining);
	}

	// Adds *addend to every source frame and stores the sums at |dest_p|.
	//
	// When both strides are 1, frames are handled four at a time with NEON and
	// only the leftover (frames_to_process % 4) frames are passed on to
	// scalar::Vsadd. With any other stride the whole request is forwarded to
	// scalar::Vsadd untouched.
	static ALWAYS_INLINE void Vsadd(const float* source_p,
	                                int source_stride,
	                                const float* addend,
	                                float* dest_p,
	                                int dest_stride,
	                                uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1 && dest_stride == 1) {
	    // Load the addend once, replicated into all four lanes.
	    const float32x4_t k = vld1q_dup_f32(addend);
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = dest_p + (remaining - tail_frames);

	    for (; dest_p < end_p; source_p += 4, dest_p += 4) {
	      vst1q_f32(dest_p, vaddq_f32(vld1q_f32(source_p), k));
	    }
	    remaining = tail_frames;
	  }

	  // The pointers now reference the first frame the NEON loop did not handle.
	  scalar::Vsadd(source_p, source_stride, addend, dest_p, dest_stride,
	                remaining);
	}

	// Accumulates the sum of squares of the source frames into *sum_p (the
	// incoming value of *sum_p is added to, not overwritten).
	//
	// When the stride is 1, frames are handled four at a time with NEON and the
	// four per-lane partial sums are folded into *sum_p; only the leftover
	// (frames_to_process % 4) frames are then passed on to scalar::Vsvesq. With
	// any other stride the whole request is forwarded to scalar::Vsvesq.
	static ALWAYS_INLINE void Vsvesq(const float* source_p,
	                                 int source_stride,
	                                 float* sum_p,
	                                 uint32_t frames_to_process) {
	  // Stay unsigned: narrowing the count to int would make values above
	  // INT_MAX negative, which skips the vector loop and passes a negative tail
	  // count on to the scalar fallback.
	  uint32_t remaining = frames_to_process;

	  if (source_stride == 1) {
	    const uint32_t tail_frames = remaining % 4;
	    // One past the last frame that belongs to a complete group of four.
	    const float* const end_p = source_p + (remaining - tail_frames);

	    float32x4_t four_sum = vdupq_n_f32(0);
	    for (; source_p < end_p; source_p += 4) {
	      const float32x4_t source = vld1q_f32(source_p);
	      // four_sum + source * source, lane by lane.
	      four_sum = vmlaq_f32(four_sum, source, source);
	    }

	    // Horizontal reduction: 4 lanes -> 2 lanes -> 1 scalar.
	    const float32x2_t two_sum =
	        vadd_f32(vget_low_f32(four_sum), vget_high_f32(four_sum));
	    float group_sum[2];
	    vst1_f32(group_sum, two_sum);
	    *sum_p += group_sum[0] + group_sum[1];

	    remaining = tail_frames;
	  }

	  // |source_p| now references the first frame the NEON loop did not handle.
	  scalar::Vsvesq(source_p, source_stride, sum_p, remaining);
	}

	// Complex multiply on split real/imaginary arrays:
	//   real_dest = real1 * real2 - imag1 * imag2
	//   imag_dest = real1 * imag2 + imag1 * real2
	//
	// Complete groups of four frames are computed with NEON; the remaining
	// (frames_to_process % 4) frames are delegated to scalar::Zvmul.
	static ALWAYS_INLINE void Zvmul(const float* real1p,
	                                const float* imag1p,
	                                const float* real2p,
	                                const float* imag2p,
	                                float* real_dest_p,
	                                float* imag_dest_p,
	                                uint32_t frames_to_process) {
	  // Number of frames covered by complete groups of four.
	  const unsigned vector_frames = frames_to_process - frames_to_process % 4;

	  unsigned offset = 0;
	  for (; offset < vector_frames; offset += 4) {
	    const float32x4_t re_a = vld1q_f32(real1p + offset);
	    const float32x4_t re_b = vld1q_f32(real2p + offset);
	    const float32x4_t im_a = vld1q_f32(imag1p + offset);
	    const float32x4_t im_b = vld1q_f32(imag2p + offset);

	    // Start from the real1-based product, then fold in the imag1-based one.
	    const float32x4_t re_product = vmulq_f32(re_a, re_b);
	    const float32x4_t im_product = vmulq_f32(re_a, im_b);
	    const float32x4_t re_out = vmlsq_f32(re_product, im_a, im_b);
	    const float32x4_t im_out = vmlaq_f32(im_product, im_a, re_b);

	    vst1q_f32(real_dest_p + offset, re_out);
	    vst1q_f32(imag_dest_p + offset, im_out);
	  }

	  // |offset| is the index of the first frame the NEON loop did not handle.
	  scalar::Zvmul(real1p + offset, imag1p + offset, real2p + offset,
	                imag2p + offset, real_dest_p + offset, imag_dest_p + offset,
	                frames_to_process - offset);
	}

	} // namespace neon
	} // namespace vector_math
	} // namespace blink

	#endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_ARM_VECTOR_MATH_NEON_H_