// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file intentionally does not have header guards, it's included from
// vector_math_avx.h and from vector_math_sse.h with different macro
// definitions. The following line silences a presubmit warning that would
// otherwise be triggered by this: no-include-guard-because-multiply-included
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MAC)
#include <algorithm>
#include <cmath>
#include "third_party/blink/renderer/platform/audio/audio_array.h"
#include "third_party/blink/renderer/platform/wtf/assertions.h"
namespace blink {
namespace vector_math {
// Distance, counted in registers (MType elements), between consecutive taps
// of a prepared filter: tap i is stored at index kReversedFilterStride * i.
//
// This stride is chosen so that the same prepared filter created by
// AVX::PrepareFilterForConv can be used by both AVX::Conv and sse::Conv.
// A prepared filter created by sse::PrepareFilterForConv can only be used
// by sse::Conv.
//
// Dividing 8 by kPackedFloatsPerRegister gives every tap the same footprint
// of 8 floats whichever register width this file is compiled for (assuming
// kPackedFloatsPerRegister divides 8 evenly -- TODO confirm against the
// including headers).
constexpr size_t kReversedFilterStride = 8u / kPackedFloatsPerRegister;
bool IsAligned(const float* p) {
constexpr size_t kBytesPerRegister = kBitsPerRegister / 8u;
constexpr size_t kAlignmentOffsetMask = kBytesPerRegister - 1u;
return (reinterpret_cast<size_t>(p) & kAlignmentOffsetMask) == 0u;
// Builds the filter layout consumed by Conv().
//
// |filter_p|        points at the first tap; tap i is read from
//                   |filter_p - i|, i.e. the filter is walked backwards.
// |filter_stride|   must be -1 (checked in debug builds).
// |filter_size|     number of taps.
// |prepared_filter| is (re)allocated to
//                   kReversedFilterStride * kPackedFloatsPerRegister *
//                   filter_size floats and receives, at register index
//                   kReversedFilterStride * i, tap i broadcast to every lane.
void PrepareFilterForConv(const float* filter_p,
                          int filter_stride,
                          size_t filter_size,
                          AudioFloatArray* prepared_filter) {
  // Only contiguous convolution is implemented. Correlation (positive
  // |filter_stride|) and support for non-contiguous vectors are not
  // implemented.
  DCHECK_EQ(-1, filter_stride);

  // Reverse the filter and repeat each value across a vector.
  prepared_filter->Allocate(kReversedFilterStride * kPackedFloatsPerRegister *
                            filter_size);
  // NOTE(review): the register-typed stores below presume that Data() is
  // aligned for MType -- confirm against AudioFloatArray.
  MType* reversed_filter = reinterpret_cast<MType*>(prepared_filter->Data());
  for (size_t i = 0; i < filter_size; ++i) {
    reversed_filter[kReversedFilterStride * i] = MM_PS(set1)(*(filter_p - i));
  }
}
// Direct vector convolution:
// dest[k] = sum(source[k+m]*filter[m*filter_stride]) for all m
// provided that |prepared_filter_p| is |prepared_filter->Data()| and that
// |prepared_filter| is prepared with |PrepareFilterForConv|.
//
// |frames_to_process| and |filter_size| must both be multiples of
// kPackedFloatsPerRegister (checked in debug builds). |source_p| and |dest_p|
// are accessed with unaligned loads/stores. The loops read |source_p| up to
// index frames_to_process + filter_size - 2, so that many floats (plus one)
// must be readable.
void Conv(const float* source_p,
          const float* prepared_filter_p,
          float* dest_p,
          uint32_t frames_to_process,
          size_t filter_size) {
  const float* const dest_end_p = dest_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
  DCHECK_EQ(0u, filter_size % kPackedFloatsPerRegister);

  const MType* reversed_filter =
      reinterpret_cast<const MType*>(prepared_filter_p);

  // Do convolution with kPackedFloatsPerRegister inputs at a time.
  while (dest_p < dest_end_p) {
    MType m_convolution_sum = MM_PS(setzero)();

    // |filter_size| is a multiple of kPackedFloatsPerRegister so we can unroll
    // the loop by kPackedFloatsPerRegister, manually.
    for (size_t i = 0; i < filter_size; i += kPackedFloatsPerRegister) {
      for (size_t j = 0; j < kPackedFloatsPerRegister; ++j) {
        const size_t k = i + j;
        // Lane n of this load is source[k + n]; multiplying by tap k
        // (broadcast to all lanes) accumulates tap k's contribution to
        // kPackedFloatsPerRegister consecutive outputs at once.
        const MType m_source = MM_PS(loadu)(source_p + k);
        const MType m_product =
            MM_PS(mul)(reversed_filter[kReversedFilterStride * k], m_source);
        m_convolution_sum = MM_PS(add)(m_convolution_sum, m_product);
      }
    }
    MM_PS(storeu)(dest_p, m_convolution_sum);

    source_p += kPackedFloatsPerRegister;
    dest_p += kPackedFloatsPerRegister;
  }
}
// dest[k] = source1[k] + source2[k]
//
// |frames_to_process| must be a multiple of kPackedFloatsPerRegister (checked
// in debug builds). |source1p| is always read with the aligned load, so it
// must be register-aligned; |source2p| and |dest_p| may have any alignment and
// the matching load/store variant is selected once, outside the loop.
void Vadd(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define ADD_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(add)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      ADD_ALL(load, store);
    } else {
      ADD_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      ADD_ALL(loadu, store);
    } else {
      ADD_ALL(loadu, storeu);
    }
  }

#undef ADD_ALL
}
// dest[k] = source1[k] - source2[k]
//
// |frames_to_process| must be a multiple of kPackedFloatsPerRegister (checked
// in debug builds). |source1p| is always read with the aligned load, so it
// must be register-aligned; |source2p| and |dest_p| may have any alignment and
// the matching load/store variant is selected once, outside the loop.
void Vsub(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define SUB_ALL(loadSource2, storeDest)              \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(sub)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      SUB_ALL(load, store);
    } else {
      SUB_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      SUB_ALL(loadu, store);
    } else {
      SUB_ALL(loadu, storeu);
    }
  }

#undef SUB_ALL
}
// dest[k] = clip(source[k], low_threshold, high_threshold)
// = max(low_threshold, min(high_threshold, source[k]))
void Vclip(const float* source_p,
const float* low_threshold_p,
const float* high_threshold_p,
float* dest_p,
uint32_t frames_to_process) {
const float* const source_end_p = source_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
MType m_low_threshold = MM_PS(set1)(*low_threshold_p);
MType m_high_threshold = MM_PS(set1)(*high_threshold_p);
#define CLIP_ALL(storeDest) \
while (source_p < source_end_p) { \
MType m_source = MM_PS(load)(source_p); \
MType m_dest = \
MM_PS(max)(m_low_threshold, MM_PS(min)(m_high_threshold, m_source)); \
MM_PS(storeDest)(dest_p, m_dest); \
source_p += kPackedFloatsPerRegister; \
dest_p += kPackedFloatsPerRegister; \
if (IsAligned(dest_p)) {
} else {
#undef CLIP_ALL
// *max_p = max(*max_p, source_max) where
// source_max = max(abs(source[k])) for all k
//
// |frames_to_process| must be a multiple of kPackedFloatsPerRegister (checked
// in debug builds). |source_p| is read with the aligned load, so it must be
// register-aligned. |*max_p| is both an input and an output: it is only ever
// raised, never reset.
void Vmaxmgv(const float* source_p, float* max_p, uint32_t frames_to_process) {
  // Every bit set except the IEEE-754 single-precision sign bit.
  constexpr uint32_t kMask = 0x7FFFFFFFu;

  const float* const source_end_p = source_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  MType m_mask = MM_PS(set1)(*reinterpret_cast<const float*>(&kMask));
  MType m_max = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    // Calculate the absolute value by ANDing the source with the mask,
    // which will set the sign bit to 0.
    m_source = MM_PS(and)(m_source, m_mask);
    m_max = MM_PS(max)(m_source, m_max);
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* maxes = reinterpret_cast<const float*>(&m_max);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i) {
    *max_p = std::max(*max_p, maxes[i]);
  }
}
// dest[k] = source1[k] * source2[k]
//
// |frames_to_process| must be a multiple of kPackedFloatsPerRegister (checked
// in debug builds). |source1p| is always read with the aligned load, so it
// must be register-aligned; |source2p| and |dest_p| may have any alignment and
// the matching load/store variant is selected once, outside the loop.
void Vmul(const float* source1p,
          const float* source2p,
          float* dest_p,
          uint32_t frames_to_process) {
  const float* const source1_end_p = source1p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

#define MULTIPLY_ALL(loadSource2, storeDest)         \
  while (source1p < source1_end_p) {                 \
    MType m_source1 = MM_PS(load)(source1p);         \
    MType m_source2 = MM_PS(loadSource2)(source2p);  \
    MType m_dest = MM_PS(mul)(m_source1, m_source2); \
    MM_PS(storeDest)(dest_p, m_dest);                \
    source1p += kPackedFloatsPerRegister;            \
    source2p += kPackedFloatsPerRegister;            \
    dest_p += kPackedFloatsPerRegister;              \
  }

  if (IsAligned(source2p)) {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(load, store);
    } else {
      MULTIPLY_ALL(load, storeu);
    }
  } else {
    if (IsAligned(dest_p)) {
      MULTIPLY_ALL(loadu, store);
    } else {
      MULTIPLY_ALL(loadu, storeu);
    }
  }

// Keep the helper local to this function; the same macro name is defined with
// a different body further down in this file.
#undef MULTIPLY_ALL
}
// dest[k] += scale * source[k]
void Vsma(const float* source_p,
const float* scale,
float* dest_p,
uint32_t frames_to_process) {
const float* const source_end_p = source_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
const MType m_scale = MM_PS(set1)(*scale);
#define SCALAR_MULTIPLY_AND_ADD_ALL(loadDest, storeDest) \
while (source_p < source_end_p) { \
MType m_source = MM_PS(load)(source_p); \
MType m_dest = MM_PS(loadDest)(dest_p); \
m_dest = MM_PS(add)(m_dest, MM_PS(mul)(m_scale, m_source)); \
MM_PS(storeDest)(dest_p, m_dest); \
source_p += kPackedFloatsPerRegister; \
dest_p += kPackedFloatsPerRegister; \
if (IsAligned(dest_p)) {
} else {
// dest[k] = scale * source[k]
void Vsmul(const float* source_p,
const float* scale,
float* dest_p,
uint32_t frames_to_process) {
const float* const source_end_p = source_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
const MType m_scale = MM_PS(set1)(*scale);
#define SCALAR_MULTIPLY_ALL(storeDest) \
while (source_p < source_end_p) { \
MType m_source = MM_PS(load)(source_p); \
MType m_dest = MM_PS(mul)(m_scale, m_source); \
MM_PS(storeDest)(dest_p, m_dest); \
source_p += kPackedFloatsPerRegister; \
dest_p += kPackedFloatsPerRegister; \
if (IsAligned(dest_p)) {
} else {
// dest[k] = addend + source[k]
void Vsadd(const float* source_p,
const float* addend,
float* dest_p,
uint32_t frames_to_process) {
const float* const source_end_p = source_p + frames_to_process;
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
const MType m_addend = MM_PS(set1)(*addend);
#define SCALAR_ADD_ALL(storeDest) \
while (source_p < source_end_p) { \
MType m_source = MM_PS(load)(source_p); \
MType m_dest = MM_PS(add)(m_addend, m_source); \
MM_PS(storeDest)(dest_p, m_dest); \
source_p += kPackedFloatsPerRegister; \
dest_p += kPackedFloatsPerRegister; \
if (IsAligned(dest_p)) {
} else {
// sum += sum(source[k]^2) for all k
//
// |frames_to_process| must be a multiple of kPackedFloatsPerRegister (checked
// in debug builds). |source_p| is read with the aligned load, so it must be
// register-aligned. The result is accumulated into |*sum_p|; the caller's
// existing value is kept and added to.
void Vsvesq(const float* source_p, float* sum_p, uint32_t frames_to_process) {
  const float* const source_end_p = source_p + frames_to_process;

  DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);

  // One partial sum per lane; the lanes are folded together after the loop.
  MType m_sum = MM_PS(setzero)();

  while (source_p < source_end_p) {
    MType m_source = MM_PS(load)(source_p);
    m_sum = MM_PS(add)(m_sum, MM_PS(mul)(m_source, m_source));
    source_p += kPackedFloatsPerRegister;
  }

  // Combine the packed floats.
  const float* sums = reinterpret_cast<const float*>(&m_sum);
  for (unsigned i = 0u; i < kPackedFloatsPerRegister; ++i) {
    *sum_p += sums[i];
  }
}
// real_dest[k] = real1[k] * real2[k] - imag1[k] * imag2[k]
// imag_dest[k] = real1[k] * imag2[k] + imag1[k] * real2[k]
void Zvmul(const float* real1p,
const float* imag1p,
const float* real2p,
const float* imag2p,
float* real_dest_p,
float* imag_dest_p,
uint32_t frames_to_process) {
DCHECK_EQ(0u, frames_to_process % kPackedFloatsPerRegister);
#define MULTIPLY_ALL(loadOtherThanReal1, storeDest) \
for (size_t i = 0u; i < frames_to_process; i += kPackedFloatsPerRegister) { \
MType real1 = MM_PS(load)(real1p + i); \
MType real2 = MM_PS(loadOtherThanReal1)(real2p + i); \
MType imag1 = MM_PS(loadOtherThanReal1)(imag1p + i); \
MType imag2 = MM_PS(loadOtherThanReal1)(imag2p + i); \
MType real = \
MM_PS(sub)(MM_PS(mul)(real1, real2), MM_PS(mul)(imag1, imag2)); \
MType imag = \
MM_PS(add)(MM_PS(mul)(real1, imag2), MM_PS(mul)(imag1, real2)); \
MM_PS(storeDest)(real_dest_p + i, real); \
MM_PS(storeDest)(imag_dest_p + i, imag); \
if (IsAligned(imag1p) && IsAligned(real2p) && IsAligned(imag2p) &&
IsAligned(real_dest_p) && IsAligned(imag_dest_p)) {
MULTIPLY_ALL(load, store);
} else {
MULTIPLY_ALL(loadu, storeu);
} // namespace vector_math
} // namespace blink
#endif // defined(ARCH_CPU_X86_FAMILY) && !defined(OS_MAC)