// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file intentionally does not have header guards; it is included from
// vector_math_avx.h and from vector_math_sse.h with different macro
// definitions. The following line silences a presubmit warning that would
// otherwise be triggered by this: no-include-guard-because-multiply-included

#include "build/build_config.h"

#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MAC)

#include <algorithm>
#include <cmath>

#include "third_party/blink/renderer/platform/audio/audio_array.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"

namespace blink {
namespace vector_math {
namespace VECTOR_MATH_SIMD_NAMESPACE_NAME {

// This stride is chosen so that the same prepared filter created by
// avx::PrepareFilterForConv can be used by both avx::Conv and sse::Conv.
// A prepared filter created by sse::PrepareFilterForConv can only be used
// by sse::Conv.
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
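// With AVX (kPackedFloatsPerRegister == 8) the stride is 1 register per
// filter coefficient; with SSE (kPackedFloatsPerRegister == 4) it is 2, so
// both variants address the same 32 bytes per coefficient.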

// Returns true when |p| satisfies the alignment required by the aligned SIMD
// load/store instructions (16 bytes for SSE, 32 bytes for AVX).
bool IsAligned(const float* p) {
  constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
  constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
  return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
}

void PrepareFilterForConv(const float* filter_p,
                          int filter_stride,
                          size_t filter_size,
                          AudioFloatArray* prepared_filter) {
  // Only contiguous convolution is implemented. Correlation (positive
  // |filter_stride|) and support for non-contiguous vectors are not
  // implemented.
  DCHECK_EQ(-1, filter_stride);
  DCHECK(prepared_filter);

  // Reverse the filter and replicate each coefficient across a full SIMD
  // register.
  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
                            filter_size);
  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
  for (size_t i = 0; i < filter_size; ++i) {
    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
  }
}
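// Illustrative layout (hypothetical two-tap filter): if *filter_p == a and
// *(filter_p - 1) == b, then with SSE (4 packed floats, stride 2) the
// prepared buffer holds, one register slot at a time,
//   [a a a a] [unused] [b b b b] [unused]
// while AVX writes one 8-wide register per coefficient over the same bytes.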

// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
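// Because every coefficient is pre-broadcast across a register by
// PrepareFilterForConv, the inner loop needs no shuffles: each tap costs one
// unaligned source load, one multiply, and one add.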
void Conv(const float* source_p,
          const float* prepared_filter_p,
          float* dest_p,
          uint32_t frames_to_process,
          size_t filter_size) {
  const float* const dest_end_p = dest_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);

  const MType* reversed_filter =
      reinterpret_cast<const MType*>(prepared_filter_p);

  // Do convolution with kPackedFloatsPerRegister inputs at a time.
  while (dest_p < dest_end_p) {
    MType m_convolution_sum = MM_PS(setzero)();

    // |filter_size| is a multiple of kPackedFloatsPerRegister, so the inner
    // loop can be unrolled manually by that factor.
    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
        size_t k = i + j;
        MType m_product;
        MType m_source;

        m_source = MM_PS(loadu)(source_p + k);
        m_product =
            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
      }
    }
    MM_PS(storeu)(dest_p, m_convolution_sum);

    source_p += kPackedFloatsPerRegister;
    dest_p += kPackedFloatsPerRegister;
  }
}
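// Illustrative caller (hypothetical names), convolving |frames| samples:
//   AudioFloatArray prepared;
//   PrepareFilterForConv(filter_p, -1, filter_size, &prepared);
//   Conv(source, prepared.Data(), dest, frames, filter_size);
// where |filter_p| addresses the filter coefficients downward, matching the
// *(filter_p - i) reads in PrepareFilterForConv above.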

// dest[k] = source1[k] + source2[k]
void Vadd(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

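  // The loop body is expanded once per alignment combination so that aligned
  // loads and stores are used whenever the operands allow it.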
#define ADD_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(add)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      ADD_ALL(load, store);
    } else {
      ADD_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      ADD_ALL(loadu, store);
    } else {
      ADD_ALL(loadu, storeu);
    }
  }

#undef ADD_ALL
}

// dest[k] = source1[k] - source2[k]
void Vsub(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define SUB_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(sub)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      SUB_ALL(load, store);
    } else {
      SUB_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      SUB_ALL(loadu, store);
    } else {
      SUB_ALL(loadu, storeu);
    }
  }

#undef SUB_ALL
}

// dest[k] = clip(source[k], low_threshold, high_threshold)
//         = max(low_threshold, min(high_threshold, source[k]))
void Vclip(const float* source_p,
           const float* low_threshold_p,
           const float* high_threshold_p,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
  MType m_high_threshold = MM_PS(set1)(*high_threshold_p);

#define CLIP_ALL(storeDest)                                                   \
  while (source_p < source_end_p) {                                          \
    MType m_source = MM_PS(load)(source_p);                                  \
    MType m_dest =                                                           \
        MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                                        \
    source_p += kPackedFloatsPerRegister;                                    \
    dest_p += kPackedFloatsPerRegister;                                      \
  }

  if (IsAligned(dest_p)) {
    CLIP_ALL(store);
  } else {
    CLIP_ALL(storeu);
  }

#undef CLIP_ALL
}
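// Illustrative call (hypothetical buffers), clamping a block to [-1, 1]:
//   const float low = -1.0f;
//   const float high = 1.0f;
//   Vclip(input, &low, &high, output, frames);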

// *max_p = max(*max_p, source_max) where
//   source_max = max(abs(source[k])) for all k
void Vmaxmgv(const float* source_p, float* max_p, uint32_t frames_to_process) {
  constexpr uint32_t kMask = 0x7FFFFFFFu;

  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_mask = MM_PS(set1)(*reinterpret_cast<const float*>(&kMask));
  MType m_max = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    // Calculate the absolute value by ANDing the source with the mask,
    // which will set the sign bit to 0.
    m_source = MM_PS(and)(m_source, m_mask);
    m_max = MM_PS(max)(m_source, m_max);
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* maxes = reinterpret_cast<const float*>(&m_max);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *max_p = std::max(*max_p, maxes[i]);
}
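// Typical use (hypothetical buffer): find the peak magnitude of a block.
//   float peak = 0.0f;
//   Vmaxmgv(samples, &peak, frames);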

// dest[k] = source1[k] * source2[k]
void Vmul(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK(IsAligned(source1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadSource2, storeDest)         \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(mul)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(load, store);
    } else {
      MULTIPLY_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(loadu, store);
    } else {
      MULTIPLY_ALL(loadu, storeu);
    }
  }

#undef MULTIPLY_ALL
}

// dest[k] += scale * source[k]
void Vsma(const float* source_p,
          const float* scale,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_AND_ADD_ALL(loadDest, storeDest)        \
  while (source_p < source_end_p) {                             \
    MType m_source = MM_PS(load)(source_p);                     \
    MType m_dest = MM_PS(loadDest)(dest_p);                     \
    m_dest = MM_PS(add)(m_dest, MM_PS(mul)(m_scale, m_source)); \
    MM_PS(storeDest)(dest_p, m_dest);                           \
    source_p += kPackedFloatsPerRegister;                       \
    dest_p += kPackedFloatsPerRegister;                         \
  }

  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_AND_ADD_ALL(load, store);
  } else {
    SCALAR_MULTIPLY_AND_ADD_ALL(loadu, storeu);
  }

#undef SCALAR_MULTIPLY_AND_ADD_ALL
}
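// Illustrative mix-with-gain call (hypothetical buffers):
//   const float gain = 0.5f;
//   Vsma(input, &gain, mix_bus, frames);  // mix_bus[k] += 0.5f * input[k]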

// dest[k] = scale * source[k]
void Vsmul(const float* source_p,
           const float* scale,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_scale = MM_PS(set1)(*scale);

#define SCALAR_MULTIPLY_ALL(storeDest)            \
  while (source_p < source_end_p) {               \
    MType m_source = MM_PS(load)(source_p);       \
    MType m_dest = MM_PS(mul)(m_scale, m_source); \
    MM_PS(storeDest)(dest_p, m_dest);             \
    source_p += kPackedFloatsPerRegister;         \
    dest_p += kPackedFloatsPerRegister;           \
  }

  if (IsAligned(dest_p)) {
    SCALAR_MULTIPLY_ALL(store);
  } else {
    SCALAR_MULTIPLY_ALL(storeu);
  }

#undef SCALAR_MULTIPLY_ALL
}

// dest[k] = addend + source[k]
void Vsadd(const float* source_p,
           const float* addend,
           float* dest_p,
           uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  const MType m_addend = MM_PS(set1)(*addend);

#define SCALAR_ADD_ALL(storeDest)                  \
  while (source_p < source_end_p) {                \
    MType m_source = MM_PS(load)(source_p);        \
    MType m_dest = MM_PS(add)(m_addend, m_source); \
    MM_PS(storeDest)(dest_p, m_dest);              \
    source_p += kPackedFloatsPerRegister;          \
    dest_p += kPackedFloatsPerRegister;            \
  }

  if (IsAligned(dest_p)) {
    SCALAR_ADD_ALL(store);
  } else {
    SCALAR_ADD_ALL(storeu);
  }

#undef SCALAR_ADD_ALL
}

// *sum_p += sum(source[k]^2) for all k
void Vsvesq(const float* source_p, float* sum_p, uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK(IsAligned(source_p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_sum = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    m_sum = MM_PS(add)(m_sum, MM_PS(mul)(m_source, m_source));
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* sums = reinterpret_cast<const float*>(&m_sum);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i)
    *sum_p += sums[i];
}
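// A caller can derive the RMS level of a block from the accumulated energy:
//   float energy = 0.0f;
//   Vsvesq(samples, &energy, frames);
//   float rms = std::sqrt(energy / frames);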

// real_dest[k] = real1[k] * real2[k] - imag1[k] * imag2[k]
// imag_dest[k] = real1[k] * imag2[k] + imag1[k] * real2[k]
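// Sanity check of the formulas on one element, (1 + 2i) * (3 + 4i):
//   real = 1*3 - 2*4 = -5,  imag = 1*4 + 2*3 = 10.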
void Zvmul(const float* real1p,
           const float* imag1p,
           const float* real2p,
           const float* imag2p,
           float* real_dest_p,
           float* imag_dest_p,
           uint32_t frames_to_process) {
  DCHECK(IsAligned(real1p));
  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadOtherThanReal1, storeDest)                           \
  for (size_t i = 0u; i < frames_to_process; i += kPackedFloatsPerRegister) { \
    MType real1 = MM_PS(load)(real1p + i);                                    \
    MType real2 = MM_PS(loadOtherThanReal1)(real2p + i);                      \
    MType imag1 = MM_PS(loadOtherThanReal1)(imag1p + i);                      \
    MType imag2 = MM_PS(loadOtherThanReal1)(imag2p + i);                      \
    MType real =                                                              \
        MM_PS(sub)(MM_PS(mul)(real1, real2), MM_PS(mul)(imag1, imag2));       \
    MType imag =                                                              \
        MM_PS(add)(MM_PS(mul)(real1, imag2), MM_PS(mul)(imag1, real2));       \
    MM_PS(storeDest)(real_dest_p + i, real);                                  \
    MM_PS(storeDest)(imag_dest_p + i, imag);                                  \
  }

  if (IsAligned(imag1p) && IsAligned(real2p) && IsAligned(imag2p) &&
      IsAligned(real_dest_p) && IsAligned(imag_dest_p)) {
    MULTIPLY_ALL(load, store);
  } else {
    MULTIPLY_ALL(loadu, storeu);
  }

#undef MULTIPLY_ALL
}

}  // namespace VECTOR_MATH_SIMD_NAMESPACE_NAME
}  // namespace vector_math
}  // namespace blink

#endif  // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MAC)