| /* |
| * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of |
| * Szeged |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY |
| * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL UNIVERSITY OF SZEGED OR |
| * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| */ |
| |
| #ifndef THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_ARM_WEBGL_IMAGE_CONVERSION_NEON_H_ |
| #define THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_ARM_WEBGL_IMAGE_CONVERSION_NEON_H_ |
| |
| #include "base/compiler_specific.h" |
| |
| #if defined(CPU_ARM_NEON) |
| |
| #include <arm_neon.h> |
| |
| namespace blink { |
| |
| namespace simd { |
| |
| ALWAYS_INLINE void UnpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 16; |
| unsigned components_size = components_per_row - tail_components; |
| const uint8_t* src = reinterpret_cast<const uint8_t*>(source); |
| |
| for (unsigned i = 0; i < components_size; i += 16) { |
| uint8x16x2_t components = vld2q_u8(src + i * 2); |
| vst1q_u8(destination + i, components.val[1]); |
| } |
| |
| source += components_size; |
| destination += components_size; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| ALWAYS_INLINE void UnpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 3; |
| unsigned tail_components = components_per_row % 24; |
| unsigned components_size = components_per_row - tail_components; |
| |
| uint8x8_t component_a = vdup_n_u8(0xFF); |
| for (unsigned i = 0; i < components_size; i += 24) { |
| uint16x8x3_t rgb16 = vld3q_u16(source + i); |
| uint8x8_t component_r = vqmovn_u16(vshrq_n_u16(rgb16.val[0], 8)); |
| uint8x8_t component_g = vqmovn_u16(vshrq_n_u16(rgb16.val[1], 8)); |
| uint8x8_t component_b = vqmovn_u16(vshrq_n_u16(rgb16.val[2], 8)); |
| uint8x8x4_t rgba8 = {{component_r, component_g, component_b, component_a}}; |
| vst4_u8(destination, rgba8); |
| destination += 32; |
| } |
| |
| source += components_size; |
| pixels_per_row = tail_components / 3; |
| } |
| |
| ALWAYS_INLINE void UnpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 32; |
| unsigned components_size = components_per_row - tail_components; |
| |
| for (unsigned i = 0; i < components_size; i += 32) { |
| uint16x8x4_t argb16 = vld4q_u16(source + i); |
| uint8x8_t component_a = vqmovn_u16(vshrq_n_u16(argb16.val[0], 8)); |
| uint8x8_t component_r = vqmovn_u16(vshrq_n_u16(argb16.val[1], 8)); |
| uint8x8_t component_g = vqmovn_u16(vshrq_n_u16(argb16.val[2], 8)); |
| uint8x8_t component_b = vqmovn_u16(vshrq_n_u16(argb16.val[3], 8)); |
| uint8x8x4_t rgba8 = {{component_r, component_g, component_b, component_a}}; |
| vst4_u8(destination + i, rgba8); |
| } |
| |
| source += components_size; |
| destination += components_size; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| ALWAYS_INLINE void UnpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 32; |
| unsigned components_size = components_per_row - tail_components; |
| |
| for (unsigned i = 0; i < components_size; i += 32) { |
| uint16x8x4_t argb16 = vld4q_u16(source + i); |
| uint8x8_t component_b = vqmovn_u16(vshrq_n_u16(argb16.val[0], 8)); |
| uint8x8_t component_g = vqmovn_u16(vshrq_n_u16(argb16.val[1], 8)); |
| uint8x8_t component_r = vqmovn_u16(vshrq_n_u16(argb16.val[2], 8)); |
| uint8x8_t component_a = vqmovn_u16(vshrq_n_u16(argb16.val[3], 8)); |
| uint8x8x4_t rgba8 = {{component_r, component_g, component_b, component_a}}; |
| vst4_u8(destination + i, rgba8); |
| } |
| |
| source += components_size; |
| destination += components_size; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| ALWAYS_INLINE void UnpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned tail_pixels = pixels_per_row % 8; |
| unsigned pixel_size = pixels_per_row - tail_pixels; |
| |
| uint16x8_t immediate0x0f = vdupq_n_u16(0x0F); |
| for (unsigned i = 0; i < pixel_size; i += 8) { |
| uint16x8_t eight_pixels = vld1q_u16(source + i); |
| |
| uint8x8_t component_r = vqmovn_u16(vshrq_n_u16(eight_pixels, 12)); |
| uint8x8_t component_g = |
| vqmovn_u16(vandq_u16(vshrq_n_u16(eight_pixels, 8), immediate0x0f)); |
| uint8x8_t component_b = |
| vqmovn_u16(vandq_u16(vshrq_n_u16(eight_pixels, 4), immediate0x0f)); |
| uint8x8_t component_a = vqmovn_u16(vandq_u16(eight_pixels, immediate0x0f)); |
| |
| component_r = vorr_u8(vshl_n_u8(component_r, 4), component_r); |
| component_g = vorr_u8(vshl_n_u8(component_g, 4), component_g); |
| component_b = vorr_u8(vshl_n_u8(component_b, 4), component_b); |
| component_a = vorr_u8(vshl_n_u8(component_a, 4), component_a); |
| |
| uint8x8x4_t dest_components = { |
| {component_r, component_g, component_b, component_a}}; |
| vst4_u8(destination, dest_components); |
| destination += 32; |
| } |
| |
| source += pixel_size; |
| pixels_per_row = tail_pixels; |
| } |
| |
| ALWAYS_INLINE void PackOneRowOfRGBA8ToUnsignedShort4444( |
| const uint8_t*& source, |
| uint16_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 32; |
| unsigned components_size = components_per_row - tail_components; |
| |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| uint8x8_t immediate0xf0 = vdup_n_u8(0xF0); |
| for (unsigned i = 0; i < components_size; i += 32) { |
| uint8x8x4_t rgba8 = vld4_u8(source + i); |
| |
| uint8x8_t component_r = vand_u8(rgba8.val[0], immediate0xf0); |
| uint8x8_t component_g = vshr_n_u8(vand_u8(rgba8.val[1], immediate0xf0), 4); |
| uint8x8_t component_b = vand_u8(rgba8.val[2], immediate0xf0); |
| uint8x8_t component_a = vshr_n_u8(vand_u8(rgba8.val[3], immediate0xf0), 4); |
| |
| uint8x8x2_t rgba4; |
| rgba4.val[0] = vorr_u8(component_b, component_a); |
| rgba4.val[1] = vorr_u8(component_r, component_g); |
| vst2_u8(dst, rgba4); |
| dst += 16; |
| } |
| |
| source += components_size; |
| destination += components_size / 4; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| ALWAYS_INLINE void UnpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, |
| uint8_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned tail_pixels = pixels_per_row % 8; |
| unsigned pixel_size = pixels_per_row - tail_pixels; |
| |
| uint8x8_t immediate0x7 = vdup_n_u8(0x7); |
| uint8x8_t immediate0xff = vdup_n_u8(0xFF); |
| uint16x8_t immediate0x1f = vdupq_n_u16(0x1F); |
| uint16x8_t immediate0x1 = vdupq_n_u16(0x1); |
| |
| for (unsigned i = 0; i < pixel_size; i += 8) { |
| uint16x8_t eight_pixels = vld1q_u16(source + i); |
| |
| uint8x8_t component_r = vqmovn_u16(vshrq_n_u16(eight_pixels, 11)); |
| uint8x8_t component_g = |
| vqmovn_u16(vandq_u16(vshrq_n_u16(eight_pixels, 6), immediate0x1f)); |
| uint8x8_t component_b = |
| vqmovn_u16(vandq_u16(vshrq_n_u16(eight_pixels, 1), immediate0x1f)); |
| uint8x8_t component_a = vqmovn_u16(vandq_u16(eight_pixels, immediate0x1)); |
| |
| component_r = |
| vorr_u8(vshl_n_u8(component_r, 3), vand_u8(component_r, immediate0x7)); |
| component_g = |
| vorr_u8(vshl_n_u8(component_g, 3), vand_u8(component_g, immediate0x7)); |
| component_b = |
| vorr_u8(vshl_n_u8(component_b, 3), vand_u8(component_b, immediate0x7)); |
| component_a = vmul_u8(component_a, immediate0xff); |
| |
| uint8x8x4_t dest_components = { |
| {component_r, component_g, component_b, component_a}}; |
| vst4_u8(destination, dest_components); |
| destination += 32; |
| } |
| |
| source += pixel_size; |
| pixels_per_row = tail_pixels; |
| } |
| |
| ALWAYS_INLINE void PackOneRowOfRGBA8ToUnsignedShort5551( |
| const uint8_t*& source, |
| uint16_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 32; |
| unsigned components_size = components_per_row - tail_components; |
| |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| |
| uint8x8_t immediate0xf8 = vdup_n_u8(0xF8); |
| uint8x8_t immediate0x18 = vdup_n_u8(0x18); |
| for (unsigned i = 0; i < components_size; i += 32) { |
| uint8x8x4_t rgba8 = vld4_u8(source + i); |
| |
| uint8x8_t component_r = vand_u8(rgba8.val[0], immediate0xf8); |
| uint8x8_t component_g3bit = vshr_n_u8(rgba8.val[1], 5); |
| |
| uint8x8_t component_g2bit = |
| vshl_n_u8(vand_u8(rgba8.val[1], immediate0x18), 3); |
| uint8x8_t component_b = vshr_n_u8(vand_u8(rgba8.val[2], immediate0xf8), 2); |
| uint8x8_t component_a = vshr_n_u8(rgba8.val[3], 7); |
| |
| uint8x8x2_t rgba5551; |
| rgba5551.val[0] = |
| vorr_u8(vorr_u8(component_g2bit, component_b), component_a); |
| rgba5551.val[1] = vorr_u8(component_r, component_g3bit); |
| vst2_u8(dst, rgba5551); |
| dst += 16; |
| } |
| |
| source += components_size; |
| destination += components_size / 4; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| ALWAYS_INLINE void PackOneRowOfRGBA8ToUnsignedShort565( |
| const uint8_t*& source, |
| uint16_t*& destination, |
| unsigned& pixels_per_row) { |
| unsigned components_per_row = pixels_per_row * 4; |
| unsigned tail_components = components_per_row % 32; |
| unsigned components_size = components_per_row - tail_components; |
| uint8_t* dst = reinterpret_cast<uint8_t*>(destination); |
| |
| uint8x8_t immediate0xf8 = vdup_n_u8(0xF8); |
| uint8x8_t immediate0x1c = vdup_n_u8(0x1C); |
| for (unsigned i = 0; i < components_size; i += 32) { |
| uint8x8x4_t rgba8 = vld4_u8(source + i); |
| |
| uint8x8_t component_r = vand_u8(rgba8.val[0], immediate0xf8); |
| uint8x8_t component_g_left = vshr_n_u8(rgba8.val[1], 5); |
| uint8x8_t component_g_right = |
| vshl_n_u8(vand_u8(rgba8.val[1], immediate0x1c), 3); |
| uint8x8_t component_b = vshr_n_u8(vand_u8(rgba8.val[2], immediate0xf8), 3); |
| |
| uint8x8x2_t rgb565; |
| rgb565.val[0] = vorr_u8(component_g_right, component_b); |
| rgb565.val[1] = vorr_u8(component_r, component_g_left); |
| vst2_u8(dst, rgb565); |
| dst += 16; |
| } |
| |
| source += components_size; |
| destination += components_size / 4; |
| pixels_per_row = tail_components / 4; |
| } |
| |
| } // namespace simd |
| |
| } // namespace blink |
| |
| #endif // CPU_ARM_NEON |
| |
| #endif // THIRD_PARTY_BLINK_RENDERER_PLATFORM_GRAPHICS_CPU_ARM_WEBGL_IMAGE_CONVERSION_NEON_H_ |