| // Copyright 2017 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_X86_VECTOR_MATH_X86_H_ |
| #define THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_X86_VECTOR_MATH_X86_H_ |
| |
| #include "base/cpu.h" |
| #include "third_party/blink/renderer/platform/audio/cpu/x86/vector_math_avx.h" |
| #include "third_party/blink/renderer/platform/audio/cpu/x86/vector_math_sse.h" |
| #include "third_party/blink/renderer/platform/audio/vector_math_scalar.h" |
| #include "third_party/blink/renderer/platform/wtf/assertions.h" |
| |
| namespace blink { |
| namespace vector_math { |
| namespace x86 { |
| |
| struct FrameCounts { |
| size_t scalar_for_alignment; |
| size_t sse_for_alignment; |
| size_t avx; |
| size_t sse; |
| size_t scalar; |
| }; |
| |
| static bool CPUSupportsAVX() { |
| static const bool supports = ::base::CPU().has_avx(); |
| return supports; |
| } |
| |
| static size_t GetAVXAlignmentOffsetInNumberOfFloats(const float* source_p) { |
| constexpr size_t kBytesPerRegister = avx::kBitsPerRegister / 8u; |
| constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u; |
| size_t offset = reinterpret_cast<size_t>(source_p) & kAlignmentOffsetMask; |
| DCHECK_EQ(0u, offset % sizeof(*source_p)); |
| return offset / sizeof(*source_p); |
| } |
| |
| static ALWAYS_INLINE FrameCounts |
| SplitFramesToProcess(const float* source_p, uint32_t frames_to_process) { |
| FrameCounts counts = {0u, 0u, 0u, 0u, 0u}; |
| |
| const size_t avx_alignment_offset = |
| GetAVXAlignmentOffsetInNumberOfFloats(source_p); |
| |
| // If the first frame is not AVX aligned, the first several frames (at most |
| // seven) must be processed separately for proper alignment. |
| const size_t total_for_alignment = |
| (avx::kPackedFloatsPerRegister - avx_alignment_offset) & |
| ~avx::kFramesToProcessMask; |
| const size_t scalar_for_alignment = |
| total_for_alignment & ~sse::kFramesToProcessMask; |
| const size_t sse_for_alignment = |
| total_for_alignment & sse::kFramesToProcessMask; |
| |
| // Check which CPU features can be used based on the number of frames to |
| // process and based on CPU support. |
| const bool use_at_least_avx = |
| CPUSupportsAVX() && |
| frames_to_process >= scalar_for_alignment + sse_for_alignment + |
| avx::kPackedFloatsPerRegister; |
| const bool use_at_least_sse = |
| use_at_least_avx || |
| frames_to_process >= scalar_for_alignment + sse::kPackedFloatsPerRegister; |
| |
| if (use_at_least_sse) { |
| counts.scalar_for_alignment = scalar_for_alignment; |
| frames_to_process -= counts.scalar_for_alignment; |
| // The remaining frames are SSE aligned. |
| DCHECK(sse::IsAligned(source_p + counts.scalar_for_alignment)); |
| |
| if (use_at_least_avx) { |
| counts.sse_for_alignment = sse_for_alignment; |
| frames_to_process -= counts.sse_for_alignment; |
| // The remaining frames are AVX aligned. |
| DCHECK(avx::IsAligned(source_p + counts.scalar_for_alignment + |
| counts.sse_for_alignment)); |
| |
| // Process as many as possible of the remaining frames using AVX. |
| counts.avx = frames_to_process & avx::kFramesToProcessMask; |
| frames_to_process -= counts.avx; |
| } |
| |
| // Process as many as possible of the remaining frames using SSE. |
| counts.sse = frames_to_process & sse::kFramesToProcessMask; |
| frames_to_process -= counts.sse; |
| } |
| |
| // Process the remaining frames separately. |
| counts.scalar = frames_to_process; |
| return counts; |
| } |
| |
| static ALWAYS_INLINE void PrepareFilterForConv( |
| const float* filter_p, |
| int filter_stride, |
| size_t filter_size, |
| AudioFloatArray* prepared_filter) { |
| if (CPUSupportsAVX()) { |
| avx::PrepareFilterForConv(filter_p, filter_stride, filter_size, |
| prepared_filter); |
| } else { |
| sse::PrepareFilterForConv(filter_p, filter_stride, filter_size, |
| prepared_filter); |
| } |
| } |
| |
| static ALWAYS_INLINE void Conv(const float* source_p, |
| int source_stride, |
| const float* filter_p, |
| int filter_stride, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process, |
| size_t filter_size, |
| const AudioFloatArray* prepared_filter) { |
| const float* prepared_filter_p = |
| prepared_filter ? prepared_filter->Data() : nullptr; |
| if (source_stride == 1 && dest_stride == 1 && prepared_filter_p) { |
| if (CPUSupportsAVX() && (filter_size & ~avx::kFramesToProcessMask) == 0u) { |
| // |frames_to_process| is always a multiply of render quantum and |
| // therefore the frames can always be processed using AVX. |
| CHECK_EQ(frames_to_process & ~avx::kFramesToProcessMask, 0u); |
| avx::Conv(source_p, prepared_filter_p, dest_p, frames_to_process, |
| filter_size); |
| return; |
| } |
| if ((filter_size & ~sse::kFramesToProcessMask) == 0u) { |
| // |frames_to_process| is always a multiply of render quantum and |
| // therefore the frames can always be processed using SSE. |
| CHECK_EQ(frames_to_process & ~sse::kFramesToProcessMask, 0u); |
| sse::Conv(source_p, prepared_filter_p, dest_p, frames_to_process, |
| filter_size); |
| return; |
| } |
| } |
| scalar::Conv(source_p, source_stride, filter_p, filter_stride, dest_p, |
| dest_stride, frames_to_process, filter_size, nullptr); |
| } |
| |
| static ALWAYS_INLINE void Vadd(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source1p, frames_to_process); |
| |
| scalar::Vadd(source1p, 1, source2p, 1, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vadd(source1p + i, source2p + i, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vadd(source1p + i, source2p + i, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vadd(source1p + i, source2p + i, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vadd(source1p + i, 1, source2p + i, 1, dest_p + i, 1, |
| frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vadd(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vsub(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source1p, frames_to_process); |
| |
| scalar::Vsub(source1p, 1, source2p, 1, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vsub(source1p + i, source2p + i, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vsub(source1p + i, source2p + i, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vsub(source1p + i, source2p + i, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vsub(source1p + i, 1, source2p + i, 1, dest_p + i, 1, |
| frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vsub(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vclip(const float* source_p, |
| int source_stride, |
| const float* low_threshold_p, |
| const float* high_threshold_p, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vclip(source_p, 1, low_threshold_p, high_threshold_p, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vclip(source_p + i, low_threshold_p, high_threshold_p, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vclip(source_p + i, low_threshold_p, high_threshold_p, dest_p + i, |
| frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vclip(source_p + i, low_threshold_p, high_threshold_p, dest_p + i, |
| frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vclip(source_p + i, 1, low_threshold_p, high_threshold_p, |
| dest_p + i, 1, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vclip(source_p, source_stride, low_threshold_p, high_threshold_p, |
| dest_p, dest_stride, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vmaxmgv(const float* source_p, |
| int source_stride, |
| float* max_p, |
| uint32_t frames_to_process) { |
| if (source_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vmaxmgv(source_p, 1, max_p, frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vmaxmgv(source_p + i, max_p, frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vmaxmgv(source_p + i, max_p, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vmaxmgv(source_p + i, max_p, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vmaxmgv(source_p + i, 1, max_p, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vmaxmgv(source_p, source_stride, max_p, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vmul(const float* source1p, |
| int source_stride1, |
| const float* source2p, |
| int source_stride2, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride1 == 1 && source_stride2 == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source1p, frames_to_process); |
| |
| scalar::Vmul(source1p, 1, source2p, 1, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vmul(source1p + i, source2p + i, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vmul(source1p + i, source2p + i, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vmul(source1p + i, source2p + i, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vmul(source1p + i, 1, source2p + i, 1, dest_p + i, 1, |
| frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vmul(source1p, source_stride1, source2p, source_stride2, dest_p, |
| dest_stride, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vsma(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vsma(source_p, 1, scale, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vsma(source_p + i, scale, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vsma(source_p + i, scale, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vsma(source_p + i, scale, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vsma(source_p + i, 1, scale, dest_p + i, 1, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vsma(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vsmul(const float* source_p, |
| int source_stride, |
| const float* scale, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vsmul(source_p, 1, scale, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vsmul(source_p + i, scale, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vsmul(source_p + i, scale, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vsmul(source_p + i, scale, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vsmul(source_p + i, 1, scale, dest_p + i, 1, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vsmul(source_p, source_stride, scale, dest_p, dest_stride, |
| frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vsadd(const float* source_p, |
| int source_stride, |
| const float* addend, |
| float* dest_p, |
| int dest_stride, |
| uint32_t frames_to_process) { |
| if (source_stride == 1 && dest_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vsadd(source_p, 1, addend, dest_p, 1, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vsadd(source_p + i, addend, dest_p + i, |
| frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vsadd(source_p + i, addend, dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vsadd(source_p + i, addend, dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vsadd(source_p + i, 1, addend, dest_p + i, 1, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vsadd(source_p, source_stride, addend, dest_p, dest_stride, |
| frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Vsvesq(const float* source_p, |
| int source_stride, |
| float* sum_p, |
| uint32_t frames_to_process) { |
| if (source_stride == 1) { |
| const FrameCounts frame_counts = |
| SplitFramesToProcess(source_p, frames_to_process); |
| |
| scalar::Vsvesq(source_p, 1, sum_p, frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Vsvesq(source_p + i, sum_p, frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Vsvesq(source_p + i, sum_p, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Vsvesq(source_p + i, sum_p, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Vsvesq(source_p + i, 1, sum_p, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } else { |
| scalar::Vsvesq(source_p, source_stride, sum_p, frames_to_process); |
| } |
| } |
| |
| static ALWAYS_INLINE void Zvmul(const float* real1p, |
| const float* imag1p, |
| const float* real2p, |
| const float* imag2p, |
| float* real_dest_p, |
| float* imag_dest_p, |
| uint32_t frames_to_process) { |
| FrameCounts frame_counts = SplitFramesToProcess(real1p, frames_to_process); |
| |
| scalar::Zvmul(real1p, imag1p, real2p, imag2p, real_dest_p, imag_dest_p, |
| frame_counts.scalar_for_alignment); |
| size_t i = frame_counts.scalar_for_alignment; |
| if (frame_counts.sse_for_alignment > 0u) { |
| sse::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i, |
| imag_dest_p + i, frame_counts.sse_for_alignment); |
| i += frame_counts.sse_for_alignment; |
| } |
| if (frame_counts.avx > 0u) { |
| avx::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i, |
| imag_dest_p + i, frame_counts.avx); |
| i += frame_counts.avx; |
| } |
| if (frame_counts.sse > 0u) { |
| sse::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i, |
| imag_dest_p + i, frame_counts.sse); |
| i += frame_counts.sse; |
| } |
| scalar::Zvmul(real1p + i, imag1p + i, real2p + i, imag2p + i, real_dest_p + i, |
| imag_dest_p + i, frame_counts.scalar); |
| DCHECK_EQ(frames_to_process, i + frame_counts.scalar); |
| } |
| |
| } // namespace x86 |
| } // namespace vector_math |
| } // namespace blink |
| |
| #endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_AUDIO_CPU_X86_VECTOR_MATH_X86_H_ |