| // Copyright 2017 The Chromium Authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
// This file is automatically generated using tfNative from a neural network
// trained with TensorFlow. Please do not edit.
| |
| #include "darkmode_classifier.h" |
| #include <algorithm> |
| #include <cassert> |
| #include <cmath> |
| #include <cstdint> |
| #include <cstring> |
| #include <limits> |
| #include <tuple> |
| #if USE_EIGEN |
| #include "third_party/eigen3/Eigen/Core" |
| #endif |
| namespace darkmode_tfnative_model { |
| namespace { |
| |
| // ----------------------------------------------------------------------------- |
| // OP LIBRARY |
// Copied here to make sure that the inference code always stays in sync with
// the library that it was generated for.
| // ----------------------------------------------------------------------------- |
| |
// Default to using std::copy and std::fill over memcpy and memset, as they
// are usually faster thanks to the stricter alignment guarantees available to
// the compiler.
| #ifndef USE_TYPED_MEMSETMEMCPY |
| #define USE_TYPED_MEMSETMEMCPY 1 |
| #endif |
| #define USE_EIGEN 0 |
| #ifndef USE_EIGEN |
| #error Please define USE_EIGEN to either 0 or 1 |
| #endif |
| |
| // Helper to reinterpret memory as Eigen matrices. |
| #if USE_EIGEN |
| template <typename Scalar> |
| using ConstMatrixMap = typename Eigen::Map< |
| const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>; |
| template <typename Scalar> |
| using ConstRowVectorMap = |
| typename Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>; |
| template <typename Scalar> |
| using RowVectorMap = |
| typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>; |
| template <typename Scalar> |
| using MatrixMap = |
| typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>; |
| #endif |
| |
| #define BENCHMARK_TIMER(...) |
| |
// Returns the size of a shape in terms of the number of coefficients.
| inline int ShapeSize(const int32_t rank, const int32_t* shape) { |
| int size = 1; |
| for (int i = 0; i < rank; ++i) |
| size *= shape[i]; |
| return size; |
| } |
| |
| // Helper to compute the size of the inner loop for an op that uses indices to |
| // specify which axes are reduced. |
| template <typename Tidx> |
| int32_t GetReduceInnerSize(int32_t input_tensor_rank, |
| const int32_t* __restrict input_shape, |
| int32_t index_tensor_rank, |
| const int32_t* __restrict index_shape, |
| const Tidx* __restrict index_values) { |
| assert(index_tensor_rank <= 1); |
| const int32_t num_indices = index_tensor_rank > 0 ? index_shape[0] : 1; |
| int32_t inner_size = 1; |
| for (int32_t i = 0; i < num_indices; ++i) { |
| inner_size *= input_shape[index_values[i]]; |
| } |
| return inner_size; |
| } |
| |
| template <typename T> |
| void ConcatV2Args2(int32_t arg0_rank, |
| const int32_t* __restrict arg0_shape, |
| const T* __restrict arg0_values, |
| int32_t arg1_rank, |
| const int32_t* __restrict arg1_shape, |
| const T* __restrict arg1_values, |
| const int32_t* __restrict axis_value, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("ConcatV2Args2"); |
| const int axis = axis_value[0]; |
| const int num_lines = ShapeSize(axis, arg0_shape); |
| const int arg0_line_size = ShapeSize(arg0_rank - axis, arg0_shape + axis); |
| const int arg1_line_size = ShapeSize(arg1_rank - axis, arg1_shape + axis); |
| for (int line = 0; line < num_lines; ++line) { |
| std::copy(arg0_values, arg0_values + arg0_line_size, output_values); |
| arg0_values += arg0_line_size; |
| output_values += arg0_line_size; |
| std::copy(arg1_values, arg1_values + arg1_line_size, output_values); |
| arg1_values += arg1_line_size; |
| output_values += arg1_line_size; |
| } |
| } |
| |
| template <typename T> |
| void Conv2DAsGemm(const int32_t* __restrict in_shape, |
| const T* __restrict in_values, |
| const int32_t* __restrict filter_shape, |
| const T* __restrict filter_values, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("Conv2DAsGemm"); |
| #if USE_EIGEN |
| const auto in = ConstMatrixMap<T>(in_values, in_shape[0], in_shape[1]); |
| const auto filter = |
| ConstMatrixMap<T>(filter_values, filter_shape[3], |
| filter_shape[0] * filter_shape[1] * filter_shape[2]); |
| auto result = MatrixMap<T>(output_values, filter_shape[3], in_shape[1]); |
| result.noalias() = filter * in; |
| #else |
| const int32_t out_rows = in_shape[1]; |
| const int32_t out_cols = filter_shape[3]; |
| const int32_t dot_len = in_shape[0]; |
| for (int row = 0; row < out_rows; ++row) { |
| for (int col = 0; col < out_cols; ++col) { |
| T value = 0; |
| for (int i = 0; i < dot_len; ++i) { |
| value += |
| in_values[row * dot_len + i] * filter_values[i * out_cols + col]; |
| } |
| *output_values++ = value; |
| } |
| } |
| #endif |
| } |
| |
| template <typename T> |
| void DepthwiseConv2dNative(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t* __restrict kernel_shape, |
| const T* __restrict kernel_values, |
| int32_t stride_y, |
| int32_t stride_x, |
| int32_t out_height, |
| int32_t out_width, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("DepthwiseConv2dNative"); |
| // Give the shape values nicer names. |
| assert(input_shape[3] == kernel_shape[2]); |
| const int batch_size = input_shape[0]; |
| const int kernel_height = kernel_shape[0]; |
| const int kernel_width = kernel_shape[1]; |
| const int in_depth = kernel_shape[2]; |
| const int depth_mul = kernel_shape[3]; |
| const int in_height = input_shape[1]; |
| const int in_width = input_shape[2]; |
| |
| // Compute the amount of padding needed to get the desired output size. |
| const int pad_height = |
| ((out_height - 1) * stride_y + kernel_height - in_height) / 2; |
| const int pad_width = |
| ((out_width - 1) * stride_x + kernel_width - in_width) / 2; |
| |
| // Cache the strides for address computations. |
| const int in_strides[4] = { |
| input_shape[1] * input_shape[2] * input_shape[3], // batch |
| input_shape[2] * input_shape[3], // y |
| input_shape[3], // x |
| 1, // channel |
| }; |
| const int kernel_strides[4] = { |
| kernel_shape[1] * kernel_shape[2] * kernel_shape[3], // y |
| kernel_shape[2] * kernel_shape[3], // x |
| kernel_shape[3], // in channels |
| 1, // channel mult |
| }; |
| |
| T* out_write_ptr = output_values; |
| for (int batch = 0; batch < batch_size; ++batch) { |
| for (int out_y = 0; out_y < out_height; ++out_y) { |
| for (int out_x = 0; out_x < out_width; ++out_x) { |
| // Compute the input read offsets. |
| const int in_y_origin = (out_y * stride_y) - pad_height; |
| const int in_x_origin = (out_x * stride_x) - pad_width; |
| |
| // Compute the range of the kernel to be applied (we may need to clip |
| // when we'd read outside of the valid input region - for SAME). |
| const int kernel_y_start = std::max(0, -in_y_origin); |
| const int kernel_y_end = |
| std::min(kernel_height, in_height - in_y_origin); |
| const int kernel_x_start = std::max(0, -in_x_origin); |
| const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin); |
| |
| for (int in_c = 0; in_c < in_depth; ++in_c) { |
| for (int mul_c = 0; mul_c < depth_mul; ++mul_c, ++out_write_ptr) { |
| // Convolve. |
| T sum = 0; |
| for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) { |
| const int in_y = in_y_origin + k_y; |
| assert(in_y >= 0 && in_y < in_height); |
| for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) { |
| const int in_x = in_x_origin + k_x; |
| assert(in_x >= 0 && in_x < in_width); |
| const T input_value = |
| input_values[batch * in_strides[0] + // batch |
| in_y * in_strides[1] + // y |
| in_x * in_strides[2] + // x |
| in_c]; // in chan |
| const T kernel_value = |
| kernel_values[k_y * kernel_strides[0] + // y |
| k_x * kernel_strides[1] + // x |
| in_c * kernel_strides[2] + // in chan |
| mul_c]; // chan mult |
| sum += input_value * kernel_value; |
| } |
| } |
| *out_write_ptr = sum; |
| } // mul_c |
| } // in_c |
| } // out_x |
| } // out_y |
| } // batch |
| } |
| |
| template <typename T> |
| void FullyConnected(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t* __restrict weight_shape, |
| const T* __restrict weight_values, |
| const int32_t* __restrict bias_shape, |
| const T* __restrict bias_values, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("FullyConnected"); |
| #if USE_EIGEN |
| const auto in = |
| ConstMatrixMap<T>(input_values, input_shape[1], input_shape[0]); |
| const auto weight = |
| ConstMatrixMap<T>(weight_values, weight_shape[1], weight_shape[0]); |
| const auto bias = ConstRowVectorMap<T>(bias_values, bias_shape[0]); |
| auto result = MatrixMap<T>(output_values, weight_shape[1], input_shape[0]); |
| result.noalias() = (weight * in).colwise() + bias; |
| #else |
| const int batch_size = input_shape[0]; |
| const int num_inputs = weight_shape[0]; |
| const int num_outputs = weight_shape[1]; |
| assert(input_shape[1] == num_inputs); |
| assert(bias_shape[0] == num_outputs); |
| for (int batch = 0; batch < batch_size; ++batch) { |
| for (int out_i = 0; out_i < num_outputs; ++out_i) { |
| T value = 0; |
| for (int in_i = 0; in_i < num_inputs; ++in_i) { |
| value += input_values[batch * num_inputs + in_i] * |
| weight_values[in_i * num_outputs + out_i]; |
| } |
| value += bias_values[out_i]; |
| output_values[batch * num_outputs + out_i] = value; |
| } |
| } |
| #endif |
| } |
| |
| template <typename T, typename TIndex> |
| void Gather(int params_rank, |
| const int32_t* __restrict params_shape, |
| const T* __restrict params_values, |
| int indices_rank, |
| const int32_t* __restrict indices_shape, |
| const TIndex* __restrict indices_values, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("Gather"); |
| const int num_indices = ShapeSize(indices_rank, indices_shape); |
| const int num_params = params_shape[0]; |
| const int slice_size = ShapeSize(params_rank - 1, params_shape + 1); |
| for (int i = 0; i < num_indices; ++i) { |
| const int index = indices_values[i]; |
| if (index < 0 || index >= num_params) { |
| std::fill(output_values, output_values + slice_size, 0); |
| } else { |
| std::copy(params_values + index * slice_size, |
| params_values + index * slice_size + slice_size, output_values); |
| } |
| output_values += slice_size; |
| } |
| } |
| |
| template <typename T, typename TIndex> |
| void Im2Col(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t* __restrict kernel_shape, |
| int32_t stride_y, |
| int32_t stride_x, |
| int32_t out_height, |
| int32_t out_width, |
| TIndex* output_shape, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("Im2Col"); |
| // Give the shape values nicer names. |
| assert(input_shape[3] == kernel_shape[2]); |
| const int batch_size = input_shape[0]; |
| const int kernel_height = kernel_shape[0]; |
| const int kernel_width = kernel_shape[1]; |
| const int in_depth = kernel_shape[2]; |
| const int in_height = input_shape[1]; |
| const int in_width = input_shape[2]; |
| |
| // Compute the amount of padding needed to get the desired output size. |
| const int pad_height = |
| ((out_height - 1) * stride_y + kernel_height - in_height) / 2; |
| const int pad_width = |
| ((out_width - 1) * stride_x + kernel_width - in_width) / 2; |
| |
| // Cache the strides for address computations. |
| const int x_stride = input_shape[3]; |
| const int y_stride = input_shape[2] * x_stride; |
| const int batch_stride = input_shape[1] * y_stride; |
| |
| // Write the output shape. |
| output_shape[0] = kernel_height * kernel_width * in_depth; |
| output_shape[1] = input_shape[0] * out_width * out_height; |
| |
| for (int batch = 0; batch < batch_size; ++batch) { |
| for (int out_y = 0; out_y < out_height; ++out_y) { |
| for (int out_x = 0; out_x < out_width; ++out_x) { |
| // Compute the input read offsets. |
| const int in_y_origin = (out_y * stride_y) - pad_height; |
| const int in_x_origin = (out_x * stride_x) - pad_width; |
| |
| // Compute the range of the kernel to be applied (we may need to clip |
| // when we'd read outside of the valid input region - for SAME). |
| const int kernel_y_start = std::max(0, -in_y_origin); |
| const int kernel_y_end = |
| std::min(kernel_height, in_height - in_y_origin); |
| const int kernel_x_start = std::max(0, -in_x_origin); |
| const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin); |
| |
| // Padding top. |
| if (kernel_y_start != 0) { |
| const int num_lines = kernel_y_start; |
| const int num_coeffs = num_lines * kernel_width * in_depth; |
| #if USE_TYPED_MEMSETMEMCPY |
| std::fill(output_values, output_values + num_coeffs, 0); |
| #else |
| std::memset(output_values, 0, num_coeffs * sizeof(T)); |
| #endif |
| output_values += num_coeffs; |
| } |
| for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) { |
| // Padding left. |
| if (kernel_x_start != 0) { |
| const int num_coeffs = kernel_x_start * in_depth; |
| #if USE_TYPED_MEMSETMEMCPY |
| std::fill(output_values, output_values + num_coeffs, 0); |
| #else |
| std::memset(output_values, 0, num_coeffs * sizeof(T)); |
| #endif |
| output_values += num_coeffs; |
| } |
| // Valid values. |
| { |
| const int in_y = in_y_origin + k_y; |
| const int in_x = in_x_origin + kernel_x_start; |
| const int num_coeffs = (kernel_x_end - kernel_x_start) * in_depth; |
| #if USE_TYPED_MEMSETMEMCPY |
| const int offset = |
| batch * batch_stride + in_y * y_stride + in_x * x_stride; |
| std::copy(input_values + offset, input_values + offset + num_coeffs, |
| output_values); |
| #else |
| std::memcpy(output_values, |
| input_values // Reusing the restricted pointer. |
| + batch * batch_stride // batch |
| + in_y * y_stride // y |
| + in_x * x_stride, // x |
| num_coeffs * sizeof(T)); |
| #endif |
| output_values += num_coeffs; |
| } |
| // Padding right. |
| if (kernel_x_end != kernel_width) { |
| const int num_coeffs = (kernel_width - kernel_x_end) * in_depth; |
| #if USE_TYPED_MEMSETMEMCPY |
| std::fill(output_values, output_values + num_coeffs, 0); |
| #else |
| std::memset(output_values, 0, num_coeffs * sizeof(T)); |
| #endif |
| output_values += num_coeffs; |
| } |
| } |
| // Padding bottom. |
| if (kernel_y_end != kernel_height) { |
| const int num_lines = kernel_height - kernel_y_end; |
| const int num_coeffs = num_lines * kernel_width * in_depth; |
| #if USE_TYPED_MEMSETMEMCPY |
| std::fill(output_values, output_values + num_coeffs, 0); |
| #else |
| std::memset(output_values, 0, num_coeffs * sizeof(T)); |
| #endif |
| output_values += num_coeffs; |
| } |
| } |
| } |
| } |
| } |
| |
| template <typename T> |
| void MaxPool(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| int32_t stride_y, |
| int32_t stride_x, |
| int32_t kernel_height, |
| int32_t kernel_width, |
| int32_t out_height, |
| int32_t out_width, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("MaxPool"); |
| // Give the shape values nicer names. |
| const int batch_size = input_shape[0]; |
| const int in_height = input_shape[1]; |
| const int in_width = input_shape[2]; |
| const int depth = input_shape[3]; |
| |
| // Compute the amount of padding needed to get the desired output size. |
| const int pad_height = |
| ((out_height - 1) * stride_y + kernel_height - in_height) / 2; |
| const int pad_width = |
| ((out_width - 1) * stride_x + kernel_width - in_width) / 2; |
| |
| // Cache the strides for address computations. |
| const int in_strides[4] = { |
| input_shape[1] * input_shape[2] * input_shape[3], // batch |
| input_shape[2] * input_shape[3], // y |
| input_shape[3], // x |
| 1, // channel |
| }; |
| |
| T* out_write_ptr = output_values; |
| for (int batch = 0; batch < batch_size; ++batch) { |
| for (int out_y = 0; out_y < out_height; ++out_y) { |
| for (int out_x = 0; out_x < out_width; ++out_x) { |
| // Compute the input read offsets. |
| const int in_y_origin = (out_y * stride_y) - pad_height; |
| const int in_x_origin = (out_x * stride_x) - pad_width; |
| |
| // Compute the range of the kernel to be applied (we may need to clip |
| // when we'd read outside of the valid input region - for SAME). |
| const int kernel_y_start = std::max(0, -in_y_origin); |
| const int kernel_y_end = |
| std::min(kernel_height, in_height - in_y_origin); |
| const int kernel_x_start = std::max(0, -in_x_origin); |
| const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin); |
| |
| for (int chan = 0; chan < depth; ++chan, ++out_write_ptr) { |
          // Pool: take the maximum over the kernel window.
| T max_value = std::numeric_limits<T>::lowest(); |
| for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) { |
| const int in_y = in_y_origin + k_y; |
| assert(in_y >= 0 && in_y < in_height); |
| for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) { |
| const int in_x = in_x_origin + k_x; |
| assert(in_x >= 0 && in_x < in_width); |
| const T input_value = |
| input_values[batch * in_strides[0] + // batch |
| in_y * in_strides[1] + // y |
| in_x * in_strides[2] + // x |
| chan]; // channel |
| max_value = std::max(max_value, input_value); |
| } // kernel_x |
| } // kernel_y |
| *out_write_ptr = max_value; |
| } // chan |
| } // out_x |
| } // out_y |
| } // batch |
| } |
| |
| template <typename T> |
| void Memcpy(const int32_t rank, |
| const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("Memcpy"); |
| const int size = ShapeSize(rank, input_shape); |
| for (int i = 0; i < size; ++i) { |
| output_values[i] = input_values[i]; |
| } |
| } |
| |
| template <typename T> |
| void Softmax(const int32_t rank, |
| const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t reduce_dim, |
| T* __restrict output_values, |
| T* __restrict scratch_values) { |
| BENCHMARK_TIMER("Softmax"); |
| const int size = ShapeSize(rank, input_shape); |
| if (rank == 2 && reduce_dim == 1) { |
| T logits_max = std::numeric_limits<T>::lowest(); |
| |
| // Max. |
| for (int i = 0; i < size; ++i) { |
| logits_max = std::max(logits_max, input_values[i]); |
| } |
| |
| // Pre-compute exp. |
| for (int i = 0; i < size; ++i) { |
| scratch_values[i] = std::exp(input_values[i] - logits_max); |
| } |
| |
| // Sum over the last dimension, then divide the exps and write out. |
| for (int offset = 0; offset < size; offset += input_shape[1]) { |
| T sum = 0; |
| const int end_offset = offset + input_shape[1]; |
| for (int i = offset; i < end_offset; ++i) |
| sum += scratch_values[i]; |
| const T rcp_denom = static_cast<T>(1) / sum; |
| for (int i = 0; i < input_shape[1]; ++i) { |
| output_values[offset + i] = scratch_values[offset + i] * rcp_denom; |
| } |
| } |
| } else { |
| assert(false && "Generic Softmax not yet supported."); |
| } |
| } |
| |
| // Returns the start position for a slice in a single dimension. |
| template <typename T> |
| int StridedSliceBegin(int range_mask, |
| const T* __restrict range_values, |
| const T* __restrict strides, |
| const int32_t* __restrict input_shape, |
| int dim) { |
| const bool is_explicit = 0 == (range_mask & (1 << dim)); |
| if (is_explicit) { |
| return range_values[dim]; |
| } else { |
| const bool is_reverse = strides[dim] < 0; |
| return is_reverse ? input_shape[dim] - 1 : 0; |
| } |
| } |
| |
| // Returns the end position for a slice in a single dimension. |
| template <typename T> |
| int StridedSliceEnd(int range_mask, |
| const T* __restrict range_values, |
| const T* __restrict strides, |
| const int32_t* __restrict input_shape, |
| int dim) { |
| const bool is_explicit = 0 == (range_mask & (1 << dim)); |
| if (is_explicit) { |
| return range_values[dim]; |
| } else { |
| const bool is_reverse = strides[dim] < 0; |
| return is_reverse ? -1 : input_shape[dim]; |
| } |
| } |
| |
| template <typename T, typename TIdx> |
| void StridedSlice(const int32_t input_rank, |
| const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const TIdx* __restrict begin, |
| const TIdx* __restrict end, |
| const TIdx* __restrict strides, |
| int32_t begin_mask, |
| int32_t end_mask, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("StridedSlice"); |
| const int MAX_RANK = 8; |
| assert(input_rank < MAX_RANK); |
| |
| // Compute the address strides for each dimension. |
| int dim_addr_strides[MAX_RANK] = {0}; |
| dim_addr_strides[input_rank - 1] = 1; |
| for (int dim = input_rank - 2; dim >= 0; --dim) { |
| dim_addr_strides[dim] = dim_addr_strides[dim + 1] * input_shape[dim + 1]; |
| } |
| |
| // Resolve the masks and get explicit ranges for each dimension. |
| int dim_begin[MAX_RANK]; |
| int dim_end[MAX_RANK]; |
| bool dim_is_full_range[MAX_RANK]; |
| for (int dim = 0; dim < input_rank; ++dim) { |
| const int stride = strides[dim]; |
| dim_begin[dim] = |
| StridedSliceBegin(begin_mask, begin, strides, input_shape, dim); |
| dim_end[dim] = StridedSliceEnd(end_mask, end, strides, input_shape, dim); |
| dim_is_full_range[dim] = |
| dim_begin[dim] == 0 && dim_end[dim] == input_shape[dim] && stride == 1; |
| |
    // Our termination criterion for loops is that we hit the end exactly, so
    // we need to ensure that we don't step over the end with stride != 1.
| const int length_mod = (dim_end[dim] - dim_begin[dim]) % stride; |
| if (length_mod != 0) { |
| dim_end[dim] += stride - length_mod; |
| } |
| } |
| |
  // Find out how large the blocks are that we can copy contiguously (all
  // dimensions on the right for which we fetch the full range).
| int last_sliced_dim = input_rank - 1; |
| int block_size = 1; |
| for (int dim = input_rank - 1; dim >= 0 && dim_is_full_range[dim]; --dim) { |
| block_size *= input_shape[dim]; |
| last_sliced_dim--; |
| } |
| |
| // Initialize the read pos for each dimension according to the begin offsets. |
| int read_pos[MAX_RANK] = {0}; |
| for (int dim = 0; dim < input_rank; ++dim) { |
| read_pos[dim] = dim_begin[dim]; |
| } |
| |
| while (read_pos[0] != dim_end[0]) { |
| // Compute the read offset for the current position. |
| int32_t read_offset = 0; |
| for (int dim = 0; dim <= last_sliced_dim; ++dim) { |
| const int addr_stride = dim_addr_strides[dim]; |
| if (read_pos[dim] < 0) { |
| read_offset += (input_shape[dim] + read_pos[dim]) * addr_stride; |
| } else { |
| read_offset += read_pos[dim] * addr_stride; |
| } |
| } |
| |
| #if USE_TYPED_MEMSETMEMCPY |
| std::copy(input_values + read_offset, |
| input_values + read_offset + block_size, output_values); |
| #else |
| std::memcpy(output_values, input_values + read_offset, |
| block_size * sizeof(T)); |
| #endif |
| output_values += block_size; |
| |
| // Advance the read position. |
| for (int dim = last_sliced_dim; dim >= 0; --dim) { |
| read_pos[dim] += strides[dim]; |
| if (dim == 0 || read_pos[dim] != dim_end[dim]) |
| break; |
| read_pos[dim] = dim_begin[dim]; |
| } |
| } |
| } |
| |
| template <typename T> |
| void TransposeRank3(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t* __restrict perm, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("TransposeRank3"); |
| const int32_t in_strides[3] = { |
| input_shape[1] * input_shape[2], |
| input_shape[2], |
| 1, |
| }; |
| const int32_t out_strides[3] = {in_strides[perm[0]], in_strides[perm[1]], |
| in_strides[perm[2]]}; |
| const int32_t out_shape[3] = {input_shape[perm[0]], input_shape[perm[1]], |
| input_shape[perm[2]]}; |
| |
| int32_t write_offset = 0; |
| for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) { |
| const int32_t read_offset0 = it0 * out_strides[0]; |
| for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) { |
| const int32_t read_offset01 = read_offset0 + it1 * out_strides[1]; |
| for (int32_t it2 = 0; it2 < out_shape[2]; ++it2, ++write_offset) { |
| const int32_t read_offset = read_offset01 + it2 * out_strides[2]; |
| output_values[write_offset] = input_values[read_offset]; |
| } |
| } |
| } |
| } |
| |
| template <typename T> |
| void TransposeRank4(const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const int32_t* __restrict perm, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("TransposeRank4"); |
| const int32_t in_strides[4] = { |
| input_shape[1] * input_shape[2] * input_shape[3], |
| input_shape[2] * input_shape[3], |
| input_shape[3], |
| 1, |
| }; |
| const int32_t out_strides[4] = {in_strides[perm[0]], in_strides[perm[1]], |
| in_strides[perm[2]], in_strides[perm[3]]}; |
| const int32_t out_shape[4] = {input_shape[perm[0]], input_shape[perm[1]], |
| input_shape[perm[2]], input_shape[perm[3]]}; |
| |
| int32_t write_offset = 0; |
| for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) { |
| const int32_t read_offset0 = it0 * out_strides[0]; |
| for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) { |
| const int32_t read_offset01 = read_offset0 + it1 * out_strides[1]; |
| for (int32_t it2 = 0; it2 < out_shape[2]; ++it2) { |
| const int32_t read_offset012 = read_offset01 + it2 * out_strides[2]; |
| for (int32_t it3 = 0; it3 < out_shape[3]; ++it3, ++write_offset) { |
| const int32_t read_offset = read_offset012 + it3 * out_strides[3]; |
| output_values[write_offset] = input_values[read_offset]; |
| } |
| } |
| } |
| } |
| } |
| |
| template <typename T, typename TIdx, typename TDepth> |
| void OneHot(const int32_t input_rank, |
| const int32_t* __restrict input_shape, |
| const TIdx* __restrict input_values, |
| const TDepth* __restrict depth, |
| const T* __restrict on_value, |
| const T* __restrict off_value, |
| const int32_t axis, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("OneHot"); |
| const int32_t num_elements = ShapeSize(input_rank, input_shape); |
| // We can assume axis >= 0 in this implementation. |
| const int32_t prefix_dim_size = ShapeSize(axis, input_shape); |
| const int32_t suffix_dim_size = num_elements / prefix_dim_size; |
| int32_t write_offset = 0; |
| for (int32_t i = 0; i < prefix_dim_size; i++) { |
| int32_t read_offset_pre = i * suffix_dim_size; |
| for (TDepth d = 0; d < *depth; d++) { |
| for (int32_t j = 0; j < suffix_dim_size; j++, write_offset++) { |
| const int32_t read_offset = read_offset_pre + j; |
| output_values[write_offset] = |
| (input_values[read_offset] == d) ? *on_value : *off_value; |
| } |
| } |
| } |
| } |
| |
| template <typename T, typename TIdx, typename TDepth> |
| void OneHotLastDim(const int32_t input_rank, |
| const int32_t* __restrict input_shape, |
| const TIdx* __restrict input_values, |
| const TDepth* __restrict depth, |
| const T* __restrict on_value, |
| const T* __restrict off_value, |
| T* __restrict output_values) { |
| BENCHMARK_TIMER("OneHotLastDim"); |
| const int32_t num_elements = ShapeSize(input_rank, input_shape); |
| int32_t write_offset = 0; |
| for (int32_t i = 0; i < num_elements; i++) { |
| for (TDepth d = 0; d < *depth; d++, write_offset++) { |
| output_values[write_offset] = |
| (input_values[i] == d) ? *on_value : *off_value; |
| } |
| } |
| } |
| |
| // ----------------------------------------------------------------------------- |
| // Simple unary ops |
| // ----------------------------------------------------------------------------- |
| |
| // We use macros instead of template functions with templated functors here |
| // because it's a lot less verbose and easier for the compiler to optimize. |
| |
| #if USE_EIGEN |
| |
| #define SIMPLE_UNARY_OP(OP_NAME, _, EXPR_EIGEN) \ |
| template <typename T> \ |
| void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \ |
| const T* __restrict input_values, \ |
| T* __restrict output_values) { \ |
| BENCHMARK_TIMER(#OP_NAME); \ |
| const int size = ShapeSize(rank, input_shape); \ |
| auto values = ConstRowVectorMap<T>(input_values, size).array(); \ |
| auto output = RowVectorMap<T>(output_values, size).array(); \ |
| output = EXPR_EIGEN; \ |
| } |
| |
| #else |
| |
| #define SIMPLE_UNARY_OP(OP_NAME, EXPR, _) \ |
| template <typename T> \ |
| void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \ |
| const T* __restrict input_values, \ |
| T* __restrict output_values) { \ |
| BENCHMARK_TIMER(#OP_NAME); \ |
| const int size = ShapeSize(rank, input_shape); \ |
| for (int i = 0; i < size; ++i) { \ |
| const T value = input_values[i]; \ |
| output_values[i] = EXPR; \ |
| } \ |
| } |
| |
| #endif |
| |
// The second macro parameter is the scalar value expression; the third is the
// corresponding Eigen vector expression.
| SIMPLE_UNARY_OP(Abs, std::abs(value), values.abs()) |
| SIMPLE_UNARY_OP(Acos, std::acos(value), values.acos()) |
| SIMPLE_UNARY_OP(Asin, std::asin(value), values.asin()) |
| SIMPLE_UNARY_OP(Atan, std::atan(value), values.atan()) |
| SIMPLE_UNARY_OP(Cos, std::cos(value), values.cos()) |
| SIMPLE_UNARY_OP(Cosh, std::cosh(value), values.cosh()) |
| SIMPLE_UNARY_OP(Exp, std::exp(value), values.exp()) |
| SIMPLE_UNARY_OP(Elu, |
| value < 0 ? std::expm1(value) : value, |
| // Use branchless version of Elu: min(ReLU, e^x - 1) |
| values.max(0).min(values.exp() - 1)) |
| SIMPLE_UNARY_OP(Log, std::log(value), values.log()) |
| SIMPLE_UNARY_OP(Log1p, std::log1p(value), values.log1p()) |
| SIMPLE_UNARY_OP(Neg, -value, -values) |
| SIMPLE_UNARY_OP(Reciprocal, static_cast<T>(1) / value, values.cwiseInverse()) |
| SIMPLE_UNARY_OP(Relu, std::max(value, static_cast<T>(0)), values.max(0)) |
| SIMPLE_UNARY_OP(Relu6, |
| std::min(std::max(value, static_cast<T>(0)), static_cast<T>(6)), |
| values.max(0).min(6)) |
| SIMPLE_UNARY_OP(Rsqrt, static_cast<T>(1) / std::sqrt(value), values.rsqrt()) |
| SIMPLE_UNARY_OP(Sigmoid, |
| static_cast<T>(1) / (1 + std::exp(-value)), |
| ((-values).exp() + 1).cwiseInverse()) |
| SIMPLE_UNARY_OP(Sin, std::sin(value), values.sin()) |
| SIMPLE_UNARY_OP(Sinh, std::sinh(value), values.sinh()) |
| SIMPLE_UNARY_OP(Sqrt, std::sqrt(value), values.sqrt()) |
| SIMPLE_UNARY_OP(Square, value* value, values.square()) |
| SIMPLE_UNARY_OP(Tan, std::tan(value), values.tan()) |
| SIMPLE_UNARY_OP(Tanh, std::tanh(value), values.tanh()) |
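// Each macro invocation above expands to an element-wise function with the
// same signature. Illustrative usage (hypothetical data):
//   const int32_t shape[2] = {1, 10};
//   float in[10], out[10];  // filled by the caller
//   Relu<float>(2, shape, in, out);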
| |
| // ----------------------------------------------------------------------------- |
| // Broadcasting binary ops |
| // ----------------------------------------------------------------------------- |
| |
| template <typename T, typename OP> |
| void OpNoBroadcast(const int32_t left_rank, |
| const int32_t* __restrict left_shape, |
| const T* __restrict left_values, |
| const int32_t right_rank, |
| const int32_t* __restrict right_shape, |
| const T* __restrict right_values, |
| T* __restrict output_values, |
| OP op) { |
| BENCHMARK_TIMER(op.name, "NoBroadcast"); |
| const int32_t size = ShapeSize(left_rank, left_shape); |
| #if USE_EIGEN |
| auto lhs = ConstRowVectorMap<T>(left_values, size).array(); |
| auto rhs = ConstRowVectorMap<T>(right_values, size).array(); |
| auto output = RowVectorMap<T>(output_values, size).array(); |
| op.apply(lhs, rhs, output); |
| #else |
| for (int32_t i = 0; i < size; ++i) { |
| output_values[i] = op(left_values[i], right_values[i]); |
| } |
| #endif |
| } |
| |
| template <typename T, typename OP> |
| void OpInnerBroadcast(int32_t left_rank, |
| const int32_t* __restrict left_shape, |
| const T* __restrict left_values, |
| int32_t right_rank, |
| const int32_t* __restrict right_shape, |
| const T* __restrict right_values, |
| T* __restrict output_values, |
| OP op) { |
| BENCHMARK_TIMER(op.name, "InnerBroadcast"); |
| const int32_t output_size = ShapeSize(left_rank, left_shape); |
| const int32_t inner_size = ShapeSize(right_rank, right_shape); |
| const int32_t outer_size = output_size / inner_size; |
| #if USE_EIGEN |
| if (inner_size == 1) { |
| // Apply the same value to all elements. |
| auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size); |
| auto output = MatrixMap<T>(output_values, inner_size, outer_size); |
| op.apply(left.array(), right_values[0], output.array()); |
| } else { |
| auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size); |
| auto right = ConstRowVectorMap<T>(right_values, inner_size); |
| auto output = MatrixMap<T>(output_values, inner_size, outer_size); |
| for (int32_t col = 0; col < outer_size; col++) { |
| op.apply(left.col(col).array(), right.array(), output.col(col).array()); |
| } |
| } |
| #else |
| for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) { |
| for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) { |
| const int32_t offset = idx_out * inner_size + idx_in; |
| output_values[offset] = op(left_values[offset], right_values[idx_in]); |
| } |
| } |
| #endif |
| } |
| |
| #define BROADCAST_BINARY_OP(OP_NAME, EXPR, EXPR_EIGEN) \ |
| template <typename T> \ |
| struct Op##OP_NAME { \ |
| const char* name = #OP_NAME; \ |
| T operator()(const T lhs, const T rhs) { return EXPR; } \ |
| template <typename X, typename Y, typename Z> \ |
| void apply(const X& lhs, const Y& rhs, Z out) { \ |
| out = EXPR_EIGEN; \ |
| } \ |
| }; \ |
| template <typename T> \ |
| void OP_NAME##NoBroadcast( \ |
| const int32_t left_rank, const int32_t* __restrict left_shape, \ |
| const T* __restrict left_values, const int32_t right_rank, \ |
| const int32_t* __restrict right_shape, const T* __restrict right_values, \ |
| T* __restrict output_values) { \ |
| OpNoBroadcast(left_rank, left_shape, left_values, right_rank, right_shape, \ |
| right_values, output_values, Op##OP_NAME<T>()); \ |
| } \ |
| template <typename T> \ |
| void OP_NAME##InnerBroadcast( \ |
| const int32_t left_rank, const int32_t* __restrict left_shape, \ |
| const T* __restrict left_values, const int32_t right_rank, \ |
| const int32_t* __restrict right_shape, const T* __restrict right_values, \ |
| T* __restrict output_values) { \ |
| OpInnerBroadcast(left_rank, left_shape, left_values, right_rank, \ |
| right_shape, right_values, output_values, \ |
| Op##OP_NAME<T>()); \ |
| } |
| |
// The second macro parameter is the scalar value expression; the third is the
// corresponding Eigen vector expression.
| BROADCAST_BINARY_OP(Add, lhs + rhs, lhs + rhs) |
| BROADCAST_BINARY_OP(Maximum, std::max(lhs, rhs), lhs.max(rhs)) |
| BROADCAST_BINARY_OP(Minimum, std::min(lhs, rhs), lhs.min(rhs)) |
| BROADCAST_BINARY_OP(Mul, lhs* rhs, lhs* rhs) |
| BROADCAST_BINARY_OP(Sub, lhs - rhs, lhs - rhs) |
| BROADCAST_BINARY_OP(SquaredDifference, |
| (lhs - rhs) * (lhs - rhs), |
| (lhs - rhs).square()) |
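// Each macro invocation above expands to a NoBroadcast variant (both operands
// have the same shape) and an InnerBroadcast variant (the right operand is
// broadcast over the innermost dimensions of the left). Illustrative usage
// (hypothetical data), adding a {10} bias to every row of a {2, 10} matrix:
//   const int32_t lhs_shape[2] = {2, 10}, rhs_shape[1] = {10};
//   float lhs[20], rhs[10], out[20];  // filled by the caller
//   AddInnerBroadcast(2, lhs_shape, lhs, 1, rhs_shape, rhs, out);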
| |
| // ----------------------------------------------------------------------------- |
| // Reduce ops |
| // ----------------------------------------------------------------------------- |
| |
| // We use macros instead of template functions with templated functors here |
| // because it's a lot less verbose and easier for the compiler to optimize. |
| #define REDUCE_OP(OP_NAME, DEFAULT_VALUE, UPDATE_EXPR, RESULT_EXPR) \ |
| template <typename T, typename Tidx> \ |
| void OP_NAME##InnerReduce( \ |
| int32_t input_rank, const int32_t* __restrict input_shape, \ |
| const T* __restrict input_values, int32_t index_tensor_rank, \ |
| const int32_t* __restrict index_shape, \ |
| const Tidx* __restrict index_values, T* __restrict output_values) { \ |
| BENCHMARK_TIMER(#OP_NAME, "InnerReduce"); \ |
| const int32_t inner_size = \ |
| GetReduceInnerSize(input_rank, input_shape, index_tensor_rank, \ |
| index_shape, index_values); \ |
| const int32_t input_size = ShapeSize(input_rank, input_shape); \ |
| const int32_t outer_size = input_size / inner_size; \ |
| for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) { \ |
| T value = DEFAULT_VALUE; \ |
| for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) { \ |
| const T prev = value; \ |
| const T next = input_values[idx_out * inner_size + idx_in]; \ |
| value = UPDATE_EXPR; \ |
| } \ |
| const T count = inner_size; \ |
| (void)sizeof(count); \ |
| output_values[idx_out] = RESULT_EXPR; \ |
| } \ |
| } \ |
| template <typename T, typename Tidx> \ |
| void OP_NAME##GenericReduceRank4( \ |
| int32_t input_rank, const int32_t* __restrict input_shape, \ |
| const T* __restrict input_values, int32_t index_tensor_rank, \ |
| const int32_t* __restrict index_shape, \ |
| const Tidx* __restrict index_values, T* __restrict output_values) { \ |
| assert(input_rank == 4); \ |
| assert(index_tensor_rank <= 1); \ |
| BENCHMARK_TIMER(#OP_NAME, "GenericReduceRank4"); \ |
| int out_shape[4] = {input_shape[0], input_shape[1], input_shape[2], \ |
| input_shape[3]}; \ |
| bool reduce_mask[4] = {false, false, false, false}; \ |
| const int num_indices = index_tensor_rank > 0 ? index_shape[0] : 1; \ |
| for (int i = 0; i < num_indices; ++i) { \ |
| reduce_mask[index_values[i]] = true; \ |
| out_shape[index_values[i]] = 1; \ |
| } \ |
| const int out_strides[4] = { \ |
| reduce_mask[0] ? 0 : out_shape[1] * out_shape[2] * out_shape[3], \ |
| reduce_mask[1] ? 0 : out_shape[2] * out_shape[3], \ |
| reduce_mask[2] ? 0 : out_shape[3], \ |
| reduce_mask[3] ? 0 : 1, \ |
| }; \ |
| const int output_size = ShapeSize(input_rank, out_shape); \ |
| std::fill_n(output_values, output_size, DEFAULT_VALUE); \ |
| for (int dim0 = 0; dim0 < input_shape[0]; ++dim0) { \ |
| for (int dim1 = 0; dim1 < input_shape[1]; ++dim1) { \ |
| for (int dim2 = 0; dim2 < input_shape[2]; ++dim2) { \ |
| for (int dim3 = 0; dim3 < input_shape[3]; ++dim3, ++input_values) { \ |
| T* out_ptr = output_values + out_strides[0] * dim0 + \ |
| out_strides[1] * dim1 + out_strides[2] * dim2 + \ |
| out_strides[3] * dim3; \ |
| const T prev = *out_ptr; \ |
| const T next = *input_values; \ |
| *out_ptr = UPDATE_EXPR; \ |
| } \ |
| } \ |
| } \ |
| } \ |
| const T count = (reduce_mask[0] ? input_shape[0] : 1) * \ |
| (reduce_mask[1] ? input_shape[1] : 1) * \ |
| (reduce_mask[2] ? input_shape[2] : 1) * \ |
| (reduce_mask[3] ? input_shape[3] : 1); \ |
| (void)sizeof(count); \ |
| for (int i = 0; i < output_size; ++i) { \ |
| const T value = output_values[i]; \ |
| output_values[i] = RESULT_EXPR; \ |
| } \ |
| } |
| |
| REDUCE_OP(Max, std::numeric_limits<T>::lowest(), std::max(prev, next), value) |
| REDUCE_OP(Sum, 0, prev + next, value) |
| REDUCE_OP(Mean, 0, prev + next, value / count) |
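// Illustrative usage (hypothetical data): MeanInnerReduce averages over the
// innermost axes listed in the index tensor, e.g. reducing a {2, 5} input over
// axis 1 into 2 per-row means:
//   const int32_t in_shape[2] = {2, 5}, idx_shape[1] = {1};
//   const int32_t idx[1] = {1};
//   float in[10], out[2];  // filled by the caller
//   MeanInnerReduce(2, in_shape, in, 1, idx_shape, idx, out);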
| |
| #undef REDUCE_OP |
| |
| // ----------------------------------------------------------------------------- |
| // Dequantize ops |
| // ----------------------------------------------------------------------------- |
| |
| template <typename T> |
| void DequantizeMinCombined(const int32_t rank, |
| const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const float* __restrict min_range, |
| const float* __restrict max_range, |
| float* __restrict output_values) { |
| BENCHMARK_TIMER("DequantizeMinCombined"); |
| const int size = ShapeSize(rank, input_shape); |
| const float offset = |
| std::is_signed<T>::value |
| ? (static_cast<float>(std::numeric_limits<T>::max()) - |
| std::numeric_limits<T>::min() + 1) / |
| 2.0f |
| : 0.0f; |
| const float range_scale = (max_range[0] - min_range[0]) / |
| (static_cast<float>(std::numeric_limits<T>::max()) - |
| std::numeric_limits<T>::min()); |
| for (int i = 0; i < size; i++) { |
| output_values[i] = |
| ((static_cast<int32_t>(input_values[i]) + offset) * range_scale) + |
| min_range[0]; |
| } |
| } |
| |
| template <typename T> |
| void DequantizeMinFirst(const int32_t rank, |
| const int32_t* __restrict input_shape, |
| const T* __restrict input_values, |
| const float* __restrict min_range, |
| const float* __restrict max_range, |
| float* __restrict output_values) { |
| BENCHMARK_TIMER("DequantizeMinFirst"); |
| const int size = ShapeSize(rank, input_shape); |
| const float range_scale = (max_range[0] - min_range[0]) / |
| (static_cast<float>(std::numeric_limits<T>::max()) - |
| std::numeric_limits<T>::min()); |
| const float range_min_rounded = |
| (max_range[0] == min_range[0] |
| ? min_range[0] |
| : round(min_range[0] / range_scale) * range_scale); |
| for (int i = 0; i < size; i++) { |
| output_values[i] = ((static_cast<int32_t>(input_values[i]) - |
| std::numeric_limits<T>::min()) * |
| range_scale) + |
| range_min_rounded; |
| } |
| } |
| |
| // ----------------------------------------------------------------------------- |
| // CONSTANTS |
// Note that, for now, the endianness of the target machine needs to match
// that of the machine the training was performed on.
| // ----------------------------------------------------------------------------- |
| const int32_t dnn_hiddenlayer_0_weights_part_0_shape[2] = {4, 10}; |
| const union { |
| uint8_t bytes[160]; |
| float values[40]; |
| } dnn_hiddenlayer_0_weights_part_0 = {{ |
| 0xbc, 0x22, 0x0a, 0xbf, 0xb4, 0x46, 0x8c, 0x3f, 0xba, 0x31, 0x34, 0xbe, |
| 0x4c, 0x65, 0xdb, 0xbe, 0xf0, 0x54, 0x5e, 0xbe, 0xc1, 0x5d, 0xb3, 0x3f, |
| 0xf4, 0xe6, 0x15, 0xbf, 0x05, 0xc6, 0x34, 0xbf, 0xc0, 0x37, 0x7e, 0xbd, |
| 0x6c, 0x35, 0x0b, 0xbf, 0xca, 0x53, 0x26, 0xbf, 0x58, 0xb4, 0x87, 0x3f, |
| 0x37, 0xee, 0x39, 0xbf, 0xda, 0xfa, 0xf9, 0xbe, 0x97, 0xc1, 0x06, 0xbf, |
| 0xf9, 0x4e, 0x81, 0x3f, 0xb2, 0x44, 0x85, 0xbf, 0x7f, 0x98, 0x7c, 0x3d, |
| 0x15, 0x26, 0xbc, 0xbe, 0x5c, 0x48, 0x05, 0x3f, 0xc8, 0xaa, 0xa1, 0xbd, |
| 0x35, 0xb3, 0x43, 0xbe, 0xeb, 0x46, 0x91, 0x3f, 0x80, 0x71, 0xe3, 0x3c, |
| 0xd1, 0x98, 0x79, 0x3f, 0x3c, 0xd0, 0x0d, 0xbf, 0x1e, 0x02, 0xd3, 0x3e, |
| 0x5d, 0x4b, 0xa2, 0xbf, 0x68, 0xac, 0xaa, 0xbd, 0xf8, 0xe1, 0x75, 0x3e, |
| 0x4a, 0x9c, 0x27, 0xbe, 0xf8, 0xae, 0xb2, 0xbe, 0x7f, 0x9d, 0x91, 0x3f, |
| 0x1e, 0x8b, 0xa8, 0xbe, 0x35, 0x7e, 0xb2, 0x3f, 0xbe, 0x8c, 0xd3, 0xbe, |
| 0xf9, 0xcd, 0xb5, 0x3f, 0xa1, 0x50, 0xaa, 0x3f, 0xe4, 0x6d, 0xdd, 0xbe, |
| 0x0d, 0xce, 0xd3, 0xbe, |
| }}; |
| const int32_t dnn_hiddenlayer_0_biases_part_0_shape[1] = {10}; |
| const union { |
| uint8_t bytes[40]; |
| float values[10]; |
| } dnn_hiddenlayer_0_biases_part_0 = {{ |
| 0x00, 0x00, 0x00, 0x00, 0xbf, 0x6a, 0x53, 0x3e, 0xd3, 0xc1, |
| 0xd0, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd8, 0xc0, 0x3e, |
| 0xca, 0xe7, 0x35, 0x3e, 0x23, 0xa5, 0x44, 0x3f, 0x61, 0xfd, |
| 0xd2, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xe0, 0x43, 0x3c, |
| }}; |
| const int32_t dnn_logits_biases_part_0_shape[1] = {1}; |
| const union { |
| uint8_t bytes[4]; |
| float values[1]; |
| } dnn_logits_biases_part_0 = {{ |
| 0x75, |
| 0xca, |
| 0xd7, |
| 0xbe, |
| }}; |
| const int32_t dnn_logits_weights_part_0_shape[2] = {10, 1}; |
| const union { |
| uint8_t bytes[40]; |
| float values[10]; |
| } dnn_logits_weights_part_0 = {{ |
| 0x13, 0x12, 0x39, 0x3f, 0xf3, 0xa5, 0xc2, 0xbf, 0x81, 0x7f, |
| 0xbe, 0x3f, 0xf8, 0x17, 0x26, 0x3e, 0xa4, 0x19, 0xa6, 0x3f, |
| 0xf0, 0xc9, 0xb7, 0xbf, 0x6a, 0x99, 0xd2, 0x3f, 0x8a, 0x7d, |
| 0xe9, 0x3f, 0x83, 0x9a, 0x3a, 0xbf, 0xf1, 0x6c, 0x08, 0x3e, |
| }}; |
| |
| } // anonymous namespace |
| |
| // ----------------------------------------------------------------------------- |
| // INFERENCE |
| // ----------------------------------------------------------------------------- |
| |
| int32_t input0Shape[2] = {1, 4}; |
| int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0Shape[2] = {1, 1}; |
| |
| void Inference( |
| const float* __restrict input0 /* shape: 1,4 */, |
    float* __restrict
        logits_MatMul_merged_with_dnn_logits_BiasAdd0 /* shape: 1,1 */,
| FixedAllocations* __restrict fixed) { |
| const int32_t input0_shape[] = {1, 4}; |
| int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[2]; |
| |
| // dnn/hiddenlayer_0/MatMul_merged_with_dnn/hiddenlayer_0/BiasAdd |
| FullyConnected<float>(input0_shape, input0, |
| dnn_hiddenlayer_0_weights_part_0_shape, |
| dnn_hiddenlayer_0_weights_part_0.values, |
| dnn_hiddenlayer_0_biases_part_0_shape, |
| dnn_hiddenlayer_0_biases_part_0.values, fixed->alloc0); |
| fixed->alloc0_shape[0] = 1; |
| fixed->alloc0_shape[1] = 10; |
| |
| // dnn/hiddenlayer_0/hiddenlayer_0/Relu |
| Relu<float>(2, // rank |
| fixed->alloc0_shape, fixed->alloc0, fixed->alloc1); |
| fixed->alloc1_shape[0] = 1; |
| fixed->alloc1_shape[1] = 10; |
| |
| // dnn/logits/MatMul_merged_with_dnn/logits/BiasAdd |
| FullyConnected<float>( |
| fixed->alloc1_shape, fixed->alloc1, dnn_logits_weights_part_0_shape, |
| dnn_logits_weights_part_0.values, dnn_logits_biases_part_0_shape, |
| dnn_logits_biases_part_0.values, |
| logits_MatMul_merged_with_dnn_logits_BiasAdd0); |
| logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[0] = 1; |
| logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[1] = 1; |
| } |
| |
| } // namespace darkmode_tfnative_model |