blob: c33573df5fbd00864d8bed26344ee0736b2c8fa4 [file] [log] [blame]
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_X86_WEBGL_IMAGE_CONVERSION_SSE_H_
#define THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_X86_WEBGL_IMAGE_CONVERSION_SSE_H_
#include "build/build_config.h"
#if defined(ARCH_CPU_X86_FAMILY)
#include <emmintrin.h>
namespace blink {
namespace simd {
// Converts packed 16-bit RGBA4444 pixels into 8-bit-per-channel RGBA with
// SSE2, eight pixels per iteration.
//
// Within each 16-bit pixel, R is read from bits 15..12, G from bits 11..8,
// B from bits 7..4 and A from bits 3..0. Every 4-bit value v is widened to the
// byte (v << 4) | v, and the bytes are written in R, G, B, A order.
//
// Only whole groups of eight pixels are converted. On return |source| and
// |destination| have been advanced past the converted pixels and
// |pixels_per_row| holds the number of pixels (fewer than 8) left unconverted.
ALWAYS_INLINE void UnpackOneRowOfRGBA4444LittleToRGBA8(
    const uint16_t*& source,
    uint8_t*& destination,
    unsigned& pixels_per_row) {
  const __m128i nibble_mask = _mm_set1_epi16(0x0F);

  // Duplicates the 4-bit value held in each 16-bit lane into both nibbles of
  // that lane's low byte.
  const auto widen_nibble = [](__m128i lanes) {
    return _mm_or_si128(_mm_slli_epi16(lanes, 4), lanes);
  };

  const unsigned simd_pixels = pixels_per_row & ~7u;
  for (unsigned remaining = simd_pixels; remaining != 0; remaining -= 8) {
    const __m128i pixels =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));

    // The logical right shift by 12 already clears the upper bits, so red
    // needs no mask.
    const __m128i red = widen_nibble(_mm_srli_epi16(pixels, 12));
    const __m128i green =
        widen_nibble(_mm_and_si128(_mm_srli_epi16(pixels, 8), nibble_mask));
    const __m128i blue =
        widen_nibble(_mm_and_si128(_mm_srli_epi16(pixels, 4), nibble_mask));
    const __m128i alpha = widen_nibble(_mm_and_si128(pixels, nibble_mask));

    // Each 16-bit lane holds two output bytes: (R, G) and (B, A), low byte
    // first.
    const __m128i red_green = _mm_or_si128(_mm_slli_epi16(green, 8), red);
    const __m128i blue_alpha = _mm_or_si128(_mm_slli_epi16(alpha, 8), blue);

    // Interleaving the 16-bit lanes yields R, G, B, A byte order; the low
    // halves cover pixels 0..3 and the high halves pixels 4..7.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(destination),
                     _mm_unpacklo_epi16(red_green, blue_alpha));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(destination + 16),
                     _mm_unpackhi_epi16(red_green, blue_alpha));

    source += 8;
    destination += 32;
  }
  pixels_per_row -= simd_pixels;
}
// Converts packed 16-bit RGBA5551 pixels into 8-bit-per-channel RGBA with
// SSE2, eight pixels per iteration.
//
// Within each 16-bit pixel, R is read from bits 15..11, G from bits 10..6,
// B from bits 5..1 and A from bit 0. Every 5-bit value c is widened to the
// byte (c << 3) | (c & 0x7); alpha becomes 0xFF when bit 0 is set and 0x00
// otherwise. The bytes are written in R, G, B, A order.
//
// Only whole groups of eight pixels are converted. On return |source| and
// |destination| have been advanced past the converted pixels and
// |pixels_per_row| holds the number of pixels (fewer than 8) left unconverted.
ALWAYS_INLINE void UnpackOneRowOfRGBA5551LittleToRGBA8(
    const uint16_t*& source,
    uint8_t*& destination,
    unsigned& pixels_per_row) {
  const __m128i five_bit_mask = _mm_set1_epi16(0x1F);
  const __m128i low_three_mask = _mm_set1_epi16(0x7);
  const __m128i alpha_bit = _mm_set1_epi16(0x1);

  // Widens the 5-bit value in each 16-bit lane to 8 bits by shifting it up
  // three places and filling the vacated bits with its own low three bits.
  const auto widen_five_bits = [&low_three_mask](__m128i lanes) {
    return _mm_or_si128(_mm_slli_epi16(lanes, 3),
                        _mm_and_si128(lanes, low_three_mask));
  };

  const unsigned simd_pixels = pixels_per_row & ~7u;
  for (unsigned remaining = simd_pixels; remaining != 0; remaining -= 8) {
    const __m128i pixels =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));

    // The logical right shift by 11 leaves only the five red bits.
    const __m128i red = widen_five_bits(_mm_srli_epi16(pixels, 11));
    const __m128i green = widen_five_bits(
        _mm_and_si128(_mm_srli_epi16(pixels, 6), five_bit_mask));
    const __m128i blue = widen_five_bits(
        _mm_and_si128(_mm_srli_epi16(pixels, 1), five_bit_mask));

    // The comparison sets a lane to all ones when the alpha bit is set; the
    // shift by 8 below keeps just 0xFF in the high byte.
    const __m128i alpha =
        _mm_cmpeq_epi16(_mm_and_si128(pixels, alpha_bit), alpha_bit);

    // Each 16-bit lane holds two output bytes: (R, G) and (B, A), low byte
    // first.
    const __m128i red_green = _mm_or_si128(_mm_slli_epi16(green, 8), red);
    const __m128i blue_alpha = _mm_or_si128(_mm_slli_epi16(alpha, 8), blue);

    // Low halves cover pixels 0..3, high halves pixels 4..7.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(destination),
                     _mm_unpacklo_epi16(red_green, blue_alpha));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(destination + 16),
                     _mm_unpackhi_epi16(red_green, blue_alpha));

    source += 8;
    destination += 32;
  }
  pixels_per_row -= simd_pixels;
}
// Packs 8-bit RGBA pixels into two-byte (R, A) pairs, four pixels per
// iteration.
//
// For each pixel the red byte is multiplied by 255 / alpha in single-precision
// floating point and converted back to uint8_t; an alpha of zero is treated as
// 255 for the division, so red passes through unscaled. The alpha byte is
// copied unchanged. Green and blue are ignored.
//
// Only whole groups of four pixels are converted. On return |source| and
// |destination| have been advanced past the converted pixels and
// |pixels_per_row| holds the number of pixels (fewer than 4) left unconverted.
ALWAYS_INLINE void PackOneRowOfRGBA8LittleToRA8(const uint8_t*& source,
                                                uint8_t*& destination,
                                                unsigned& pixels_per_row) {
  const unsigned simd_pixels = pixels_per_row - pixels_per_row % 4;
  const __m128 full_scale = _mm_set1_ps(255.0f);

  // Value to divide by for a given alpha byte; zero is replaced by 255 so the
  // resulting factor is exactly 1.
  const auto divisor = [](uint8_t alpha) -> float {
    return alpha ? alpha : 255.0f;
  };

  float scaled_red[4];
  for (unsigned done = 0; done != simd_pixels; done += 4) {
    // _mm_set_ps takes its arguments from the highest lane to the lowest.
    const __m128 alphas = _mm_set_ps(divisor(source[15]), divisor(source[11]),
                                     divisor(source[7]), divisor(source[3]));
    const __m128 reds =
        _mm_set_ps(source[12], source[8], source[4], source[0]);
    _mm_storeu_ps(scaled_red,
                  _mm_mul_ps(reds, _mm_div_ps(full_scale, alphas)));

    for (unsigned lane = 0; lane < 4; ++lane) {
      destination[2 * lane] = static_cast<uint8_t>(scaled_red[lane]);
      destination[2 * lane + 1] = static_cast<uint8_t>(source[4 * lane + 3]);
    }

    source += 16;
    destination += 8;
  }
  pixels_per_row -= simd_pixels;
}
// Swaps the first and third byte of every 32-bit pixel (B, G, R, A byte order
// becomes R, G, B, A) with SSE2, four pixels per iteration. The second and
// fourth bytes (G and A) are copied through unchanged.
//
// Only whole groups of four pixels are converted. On return |source| and
// |destination| have been advanced past the converted pixels and
// |pixels_per_row| holds the number of pixels (fewer than 4) left unconverted.
ALWAYS_INLINE void UnpackOneRowOfBGRA8LittleToRGBA8(const uint32_t*& source,
                                                    uint32_t*& destination,
                                                    unsigned& pixels_per_row) {
  const __m128i blue_red_mask = _mm_set1_epi32(0x00ff00ff);
  const __m128i green_alpha_mask = _mm_set1_epi32(0xff00ff00);

  const unsigned simd_pixels = pixels_per_row - pixels_per_row % 4;
  for (unsigned remaining = simd_pixels; remaining != 0; remaining -= 4) {
    const __m128i original =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(source));

    // Exchange the two 16-bit halves of every pixel, which moves the R byte
    // into B's position and vice versa (G and A get exchanged too, but those
    // bytes are discarded by the mask below).
    const __m128i halves_swapped = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(original, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));

    // Take R and B from the swapped copy, G and A from the original.
    const __m128i converted =
        _mm_or_si128(_mm_and_si128(halves_swapped, blue_red_mask),
                     _mm_and_si128(original, green_alpha_mask));
    _mm_storeu_si128(reinterpret_cast<__m128i*>(destination), converted);

    source += 4;
    destination += 4;
  }
  pixels_per_row -= simd_pixels;
}
// Packs 8-bit RGBA pixels into single red bytes, four pixels per iteration.
//
// For each pixel the red byte is multiplied by 255 / alpha in single-precision
// floating point and converted back to uint8_t; an alpha of zero is treated as
// 255 for the division, so red passes through unscaled. Green, blue and alpha
// are not written to the output.
//
// Only whole groups of four pixels are converted. On return |source| and
// |destination| have been advanced past the converted pixels and
// |pixels_per_row| holds the number of pixels (fewer than 4) left unconverted.
ALWAYS_INLINE void PackOneRowOfRGBA8LittleToR8(const uint8_t*& source,
                                               uint8_t*& destination,
                                               unsigned& pixels_per_row) {
  const unsigned simd_pixels = pixels_per_row - pixels_per_row % 4;
  const __m128 full_scale = _mm_set1_ps(255);

  // Value to divide by for a given alpha byte; zero is replaced by 255 so the
  // resulting factor is exactly 1.
  const auto divisor = [](uint8_t alpha) -> float {
    return alpha ? alpha : 255;
  };

  float scaled_red[4];
  for (unsigned done = 0; done != simd_pixels; done += 4) {
    // _mm_set_ps takes its arguments from the highest lane to the lowest.
    const __m128 alphas = _mm_set_ps(divisor(source[15]), divisor(source[11]),
                                     divisor(source[7]), divisor(source[3]));
    const __m128 reds =
        _mm_set_ps(source[12], source[8], source[4], source[0]);
    _mm_storeu_ps(scaled_red,
                  _mm_mul_ps(reds, _mm_div_ps(full_scale, alphas)));

    for (unsigned lane = 0; lane < 4; ++lane) {
      destination[lane] = static_cast<uint8_t>(scaled_red[lane]);
    }

    source += 16;
    destination += 4;
  }
  pixels_per_row -= simd_pixels;
}
// Rewrites every 8-bit RGBA pixel of the row with its R, G and B bytes
// multiplied by 255 / alpha (computed in single-precision floating point and
// converted back to uint8_t); an alpha of zero is treated as 255 for the
// division, so the colour passes through unscaled. The alpha byte is copied
// unchanged.
//
// Unlike the other packing helpers in this file, this one takes |source|,
// |destination| and |pixels_per_row| by value and leaves the caller's copies
// untouched, because it processes the whole row one pixel at a time and never
// leaves a remainder.
ALWAYS_INLINE void PackOneRowOfRGBA8LittleToRGBA8(const uint8_t* source,
                                                  uint8_t* destination,
                                                  unsigned pixels_per_row) {
  const __m128 full_scale = _mm_set1_ps(255);
  float rgb[4];
  for (unsigned remaining = pixels_per_row; remaining != 0; --remaining) {
    const float divisor = source[3] ? source[3] : 255;

    // Lanes 0..2 carry R, G, B; the top lane is unused padding.
    const __m128 color = _mm_set_ps(0, source[2], source[1], source[0]);
    const __m128 factor = _mm_div_ps(full_scale, _mm_set1_ps(divisor));
    _mm_storeu_ps(rgb, _mm_mul_ps(color, factor));

    for (unsigned channel = 0; channel < 3; ++channel) {
      destination[channel] = static_cast<uint8_t>(rgb[channel]);
    }
    destination[3] = source[3];

    source += 4;
    destination += 4;
  }
}
} // namespace simd
} // namespace blink
#endif // ARCH_CPU_X86_FAMILY
#endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_X86_WEBGL_IMAGE_CONVERSION_SSE_H_