// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated using tfNative from a neural network
// trained by TensorFlow. Please do not edit.
#include "darkmode_classifier.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <tuple>
#if USE_EIGEN
#include "third_party/eigen3/Eigen/Core"
#endif
namespace darkmode_tfnative_model {
namespace {
// -----------------------------------------------------------------------------
// OP LIBRARY
// Copied here to make sure that the inference code always stays in sync with
// the lib that it was generated for.
// -----------------------------------------------------------------------------
// Default to using std::copy and std::fill over memcpy and memset as they
// are usually faster, thanks to the compiler getting stricter alignment
// guarantees.
#ifndef USE_TYPED_MEMSETMEMCPY
#define USE_TYPED_MEMSETMEMCPY 1
#endif
// Eigen is not used in this build; the portable scalar fallback paths below
// are compiled instead.
#define USE_EIGEN 0
#ifndef USE_EIGEN
#error Please define USE_EIGEN to either 0 or 1
#endif
// Helper to reinterpret memory as Eigen matrices.
#if USE_EIGEN
template <typename Scalar>
using ConstMatrixMap = typename Eigen::Map<
const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename Scalar>
using ConstRowVectorMap =
typename Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
template <typename Scalar>
using RowVectorMap =
typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
template <typename Scalar>
using MatrixMap =
typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
#endif
// Benchmarking is disabled in this build; BENCHMARK_TIMER expands to nothing.
#define BENCHMARK_TIMER(...)
// The size of a shape in terms of number of coefficients.
inline int ShapeSize(const int32_t rank, const int32_t* shape) {
int size = 1;
for (int i = 0; i < rank; ++i)
size *= shape[i];
return size;
}
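// Worked example (for illustration): with rank == 3 and shape == {2, 3, 4},
// ShapeSize returns 2 * 3 * 4 == 24 coefficients.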
// Helper to compute the size of the inner loop for an op that uses indices to
// specify which axes are reduced.
template <typename Tidx>
int32_t GetReduceInnerSize(int32_t input_tensor_rank,
const int32_t* __restrict input_shape,
int32_t index_tensor_rank,
const int32_t* __restrict index_shape,
const Tidx* __restrict index_values) {
assert(index_tensor_rank <= 1);
const int32_t num_indices = index_tensor_rank > 0 ? index_shape[0] : 1;
int32_t inner_size = 1;
for (int32_t i = 0; i < num_indices; ++i) {
inner_size *= input_shape[index_values[i]];
}
return inner_size;
}
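// Worked example (for illustration): for an input of shape {2, 3, 4} with
// index_values == {1, 2}, the reduced inner size is 3 * 4 == 12, leaving an
// outer size of 2.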
template <typename T>
void ConcatV2Args2(int32_t arg0_rank,
const int32_t* __restrict arg0_shape,
const T* __restrict arg0_values,
int32_t arg1_rank,
const int32_t* __restrict arg1_shape,
const T* __restrict arg1_values,
const int32_t* __restrict axis_value,
T* __restrict output_values) {
BENCHMARK_TIMER("ConcatV2Args2");
const int axis = axis_value[0];
const int num_lines = ShapeSize(axis, arg0_shape);
const int arg0_line_size = ShapeSize(arg0_rank - axis, arg0_shape + axis);
const int arg1_line_size = ShapeSize(arg1_rank - axis, arg1_shape + axis);
for (int line = 0; line < num_lines; ++line) {
std::copy(arg0_values, arg0_values + arg0_line_size, output_values);
arg0_values += arg0_line_size;
output_values += arg0_line_size;
std::copy(arg1_values, arg1_values + arg1_line_size, output_values);
arg1_values += arg1_line_size;
output_values += arg1_line_size;
}
}
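// Worked example (for illustration): concatenating a {2, 3} tensor with a
// {2, 5} tensor along axis 1 copies one 3-element line and one 5-element line
// per row, yielding a {2, 8} output.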
template <typename T>
void Conv2DAsGemm(const int32_t* __restrict in_shape,
const T* __restrict in_values,
const int32_t* __restrict filter_shape,
const T* __restrict filter_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Conv2DAsGemm");
#if USE_EIGEN
const auto in = ConstMatrixMap<T>(in_values, in_shape[0], in_shape[1]);
const auto filter =
ConstMatrixMap<T>(filter_values, filter_shape[3],
filter_shape[0] * filter_shape[1] * filter_shape[2]);
auto result = MatrixMap<T>(output_values, filter_shape[3], in_shape[1]);
result.noalias() = filter * in;
#else
const int32_t out_rows = in_shape[1];
const int32_t out_cols = filter_shape[3];
const int32_t dot_len = in_shape[0];
for (int row = 0; row < out_rows; ++row) {
for (int col = 0; col < out_cols; ++col) {
T value = 0;
for (int i = 0; i < dot_len; ++i) {
value +=
in_values[row * dot_len + i] * filter_values[i * out_cols + col];
}
*output_values++ = value;
}
}
#endif
}
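// Note (for illustration): Conv2DAsGemm expects the input already rearranged
// by Im2Col (defined below); for each output pixel it dot-products the patch
// of in_shape[0] values against each of the filter_shape[3] filters, emitting
// filter_shape[3] output channels per pixel.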
template <typename T>
void DepthwiseConv2dNative(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict kernel_shape,
const T* __restrict kernel_values,
int32_t stride_y,
int32_t stride_x,
int32_t out_height,
int32_t out_width,
T* __restrict output_values) {
BENCHMARK_TIMER("DepthwiseConv2dNative");
// Give the shape values nicer names.
assert(input_shape[3] == kernel_shape[2]);
const int batch_size = input_shape[0];
const int kernel_height = kernel_shape[0];
const int kernel_width = kernel_shape[1];
const int in_depth = kernel_shape[2];
const int depth_mul = kernel_shape[3];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
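// Worked example (for illustration): for in_height == 5, kernel_height == 3,
// stride_y == 1 and out_height == 5 ("SAME" padding), pad_height is
// ((5 - 1) * 1 + 3 - 5) / 2 == 1.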
// Cache the strides for address computations.
const int in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3], // batch
input_shape[2] * input_shape[3], // y
input_shape[3], // x
1, // channel
};
const int kernel_strides[4] = {
kernel_shape[1] * kernel_shape[2] * kernel_shape[3], // y
kernel_shape[2] * kernel_shape[3], // x
kernel_shape[3], // in channels
1, // channel mult
};
T* out_write_ptr = output_values;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
for (int in_c = 0; in_c < in_depth; ++in_c) {
for (int mul_c = 0; mul_c < depth_mul; ++mul_c, ++out_write_ptr) {
// Convolve.
T sum = 0;
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
const int in_y = in_y_origin + k_y;
assert(in_y >= 0 && in_y < in_height);
for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) {
const int in_x = in_x_origin + k_x;
assert(in_x >= 0 && in_x < in_width);
const T input_value =
input_values[batch * in_strides[0] + // batch
in_y * in_strides[1] + // y
in_x * in_strides[2] + // x
in_c]; // in chan
const T kernel_value =
kernel_values[k_y * kernel_strides[0] + // y
k_x * kernel_strides[1] + // x
in_c * kernel_strides[2] + // in chan
mul_c]; // chan mult
sum += input_value * kernel_value;
}
}
*out_write_ptr = sum;
} // mul_c
} // in_c
} // out_x
} // out_y
} // batch
}
template <typename T>
void FullyConnected(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict weight_shape,
const T* __restrict weight_values,
const int32_t* __restrict bias_shape,
const T* __restrict bias_values,
T* __restrict output_values) {
BENCHMARK_TIMER("FullyConnected");
#if USE_EIGEN
const auto in =
ConstMatrixMap<T>(input_values, input_shape[1], input_shape[0]);
const auto weight =
ConstMatrixMap<T>(weight_values, weight_shape[1], weight_shape[0]);
const auto bias = ConstRowVectorMap<T>(bias_values, bias_shape[0]);
auto result = MatrixMap<T>(output_values, weight_shape[1], input_shape[0]);
result.noalias() = (weight * in).colwise() + bias;
#else
const int batch_size = input_shape[0];
const int num_inputs = weight_shape[0];
const int num_outputs = weight_shape[1];
assert(input_shape[1] == num_inputs);
assert(bias_shape[0] == num_outputs);
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_i = 0; out_i < num_outputs; ++out_i) {
T value = 0;
for (int in_i = 0; in_i < num_inputs; ++in_i) {
value += input_values[batch * num_inputs + in_i] *
weight_values[in_i * num_outputs + out_i];
}
value += bias_values[out_i];
output_values[batch * num_outputs + out_i] = value;
}
}
#endif
}
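// Note (for illustration): FullyConnected computes
// output[batch][out] = sum_in input[batch][in] * weight[in][out] + bias[out].
// The model below uses it twice: a {1, 4} x {4, 10} + {10} hidden layer and a
// {1, 10} x {10, 1} + {1} logits layer.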
template <typename T, typename TIndex>
void Gather(int params_rank,
const int32_t* __restrict params_shape,
const T* __restrict params_values,
int indices_rank,
const int32_t* __restrict indices_shape,
const TIndex* __restrict indices_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Gather");
const int num_indices = ShapeSize(indices_rank, indices_shape);
const int num_params = params_shape[0];
const int slice_size = ShapeSize(params_rank - 1, params_shape + 1);
for (int i = 0; i < num_indices; ++i) {
const int index = indices_values[i];
if (index < 0 || index >= num_params) {
std::fill(output_values, output_values + slice_size, 0);
} else {
std::copy(params_values + index * slice_size,
params_values + index * slice_size + slice_size, output_values);
}
output_values += slice_size;
}
}
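// Worked example (for illustration): with params of shape {4, 2} and indices
// {3, 0}, the output is a {2, 2} tensor holding row 3 followed by row 0 of
// params; out-of-range indices produce zero-filled slices.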
template <typename T, typename TIndex>
void Im2Col(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict kernel_shape,
int32_t stride_y,
int32_t stride_x,
int32_t out_height,
int32_t out_width,
TIndex* output_shape,
T* __restrict output_values) {
BENCHMARK_TIMER("Im2Col");
// Give the shape values nicer names.
assert(input_shape[3] == kernel_shape[2]);
const int batch_size = input_shape[0];
const int kernel_height = kernel_shape[0];
const int kernel_width = kernel_shape[1];
const int in_depth = kernel_shape[2];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
// Cache the strides for address computations.
const int x_stride = input_shape[3];
const int y_stride = input_shape[2] * x_stride;
const int batch_stride = input_shape[1] * y_stride;
// Write the output shape.
output_shape[0] = kernel_height * kernel_width * in_depth;
output_shape[1] = input_shape[0] * out_width * out_height;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
// Padding top.
if (kernel_y_start != 0) {
const int num_lines = kernel_y_start;
const int num_coeffs = num_lines * kernel_width * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
// Padding left.
if (kernel_x_start != 0) {
const int num_coeffs = kernel_x_start * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
// Valid values.
{
const int in_y = in_y_origin + k_y;
const int in_x = in_x_origin + kernel_x_start;
const int num_coeffs = (kernel_x_end - kernel_x_start) * in_depth;
#if USE_TYPED_MEMSETMEMCPY
const int offset =
batch * batch_stride + in_y * y_stride + in_x * x_stride;
std::copy(input_values + offset, input_values + offset + num_coeffs,
output_values);
#else
std::memcpy(output_values,
input_values // Reusing the restricted pointer.
+ batch * batch_stride // batch
+ in_y * y_stride // y
+ in_x * x_stride, // x
num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
// Padding right.
if (kernel_x_end != kernel_width) {
const int num_coeffs = (kernel_width - kernel_x_end) * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
}
// Padding bottom.
if (kernel_y_end != kernel_height) {
const int num_lines = kernel_height - kernel_y_end;
const int num_coeffs = num_lines * kernel_width * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
}
}
}
}
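// Note (for illustration): Im2Col writes, for every output pixel, the
// zero-padded kernel_height * kernel_width * in_depth input patch as one
// contiguous block, producing a matrix of shape
// {kernel_height * kernel_width * in_depth, batch * out_height * out_width}
// suitable for Conv2DAsGemm above.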
template <typename T>
void MaxPool(const int32_t* __restrict input_shape,
const T* __restrict input_values,
int32_t stride_y,
int32_t stride_x,
int32_t kernel_height,
int32_t kernel_width,
int32_t out_height,
int32_t out_width,
T* __restrict output_values) {
BENCHMARK_TIMER("MaxPool");
// Give the shape values nicer names.
const int batch_size = input_shape[0];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
const int depth = input_shape[3];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
// Cache the strides for address computations.
const int in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3], // batch
input_shape[2] * input_shape[3], // y
input_shape[3], // x
1, // channel
};
T* out_write_ptr = output_values;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
for (int chan = 0; chan < depth; ++chan, ++out_write_ptr) {
// Pool over the kernel window.
T max_value = std::numeric_limits<T>::lowest();
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
const int in_y = in_y_origin + k_y;
assert(in_y >= 0 && in_y < in_height);
for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) {
const int in_x = in_x_origin + k_x;
assert(in_x >= 0 && in_x < in_width);
const T input_value =
input_values[batch * in_strides[0] + // batch
in_y * in_strides[1] + // y
in_x * in_strides[2] + // x
chan]; // channel
max_value = std::max(max_value, input_value);
} // kernel_x
} // kernel_y
*out_write_ptr = max_value;
} // chan
} // out_x
} // out_y
} // batch
}
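// Worked example (for illustration): MaxPool with a 2x2 kernel, stride 2 and
// output dimensions of half the input keeps, for each channel, the maximum of
// every non-overlapping 2x2 input window.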
template <typename T>
void Memcpy(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Memcpy");
const int size = ShapeSize(rank, input_shape);
for (int i = 0; i < size; ++i) {
output_values[i] = input_values[i];
}
}
template <typename T>
void Softmax(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t reduce_dim,
T* __restrict output_values,
T* __restrict scratch_values) {
BENCHMARK_TIMER("Softmax");
const int size = ShapeSize(rank, input_shape);
if (rank == 2 && reduce_dim == 1) {
T logits_max = std::numeric_limits<T>::lowest();
// Max.
for (int i = 0; i < size; ++i) {
logits_max = std::max(logits_max, input_values[i]);
}
// Pre-compute exp.
for (int i = 0; i < size; ++i) {
scratch_values[i] = std::exp(input_values[i] - logits_max);
}
// Sum over the last dimension, then divide the exps and write out.
for (int offset = 0; offset < size; offset += input_shape[1]) {
T sum = 0;
const int end_offset = offset + input_shape[1];
for (int i = offset; i < end_offset; ++i)
sum += scratch_values[i];
const T rcp_denom = static_cast<T>(1) / sum;
for (int i = 0; i < input_shape[1]; ++i) {
output_values[offset + i] = scratch_values[offset + i] * rcp_denom;
}
}
} else {
assert(false && "Generic Softmax not yet supported.");
}
}
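// Worked example (for illustration): for a {1, 2} input of {1.0f, 2.0f}, the
// max is 2.0, the shifted exponentials are e^-1 ~= 0.368 and e^0 == 1.0, and
// the output is approximately {0.269, 0.731}.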
// Returns the start position for a slice in a single dimension.
template <typename T>
int StridedSliceBegin(int range_mask,
const T* __restrict range_values,
const T* __restrict strides,
const int32_t* __restrict input_shape,
int dim) {
const bool is_explicit = 0 == (range_mask & (1 << dim));
if (is_explicit) {
return range_values[dim];
} else {
const bool is_reverse = strides[dim] < 0;
return is_reverse ? input_shape[dim] - 1 : 0;
}
}
// Returns the end position for a slice in a single dimension.
template <typename T>
int StridedSliceEnd(int range_mask,
const T* __restrict range_values,
const T* __restrict strides,
const int32_t* __restrict input_shape,
int dim) {
const bool is_explicit = 0 == (range_mask & (1 << dim));
if (is_explicit) {
return range_values[dim];
} else {
const bool is_reverse = strides[dim] < 0;
return is_reverse ? -1 : input_shape[dim];
}
}
template <typename T, typename TIdx>
void StridedSlice(const int32_t input_rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const TIdx* __restrict begin,
const TIdx* __restrict end,
const TIdx* __restrict strides,
int32_t begin_mask,
int32_t end_mask,
T* __restrict output_values) {
BENCHMARK_TIMER("StridedSlice");
const int MAX_RANK = 8;
assert(input_rank < MAX_RANK);
// Compute the address strides for each dimension.
int dim_addr_strides[MAX_RANK] = {0};
dim_addr_strides[input_rank - 1] = 1;
for (int dim = input_rank - 2; dim >= 0; --dim) {
dim_addr_strides[dim] = dim_addr_strides[dim + 1] * input_shape[dim + 1];
}
// Resolve the masks and get explicit ranges for each dimension.
int dim_begin[MAX_RANK];
int dim_end[MAX_RANK];
bool dim_is_full_range[MAX_RANK];
for (int dim = 0; dim < input_rank; ++dim) {
const int stride = strides[dim];
dim_begin[dim] =
StridedSliceBegin(begin_mask, begin, strides, input_shape, dim);
dim_end[dim] = StridedSliceEnd(end_mask, end, strides, input_shape, dim);
dim_is_full_range[dim] =
dim_begin[dim] == 0 && dim_end[dim] == input_shape[dim] && stride == 1;
// Our termination criteria for loops is that we hit the end exactly, so
// we need to ensure that we don't step over the end with stride != 1.
const int length_mod = (dim_end[dim] - dim_begin[dim]) % stride;
if (length_mod != 0) {
dim_end[dim] += stride - length_mod;
}
}
// Find out how large the blocks are that we can copy contiguously. (All
// dimensions on the right for which we fetch the full range)
int last_sliced_dim = input_rank - 1;
int block_size = 1;
for (int dim = input_rank - 1; dim >= 0 && dim_is_full_range[dim]; --dim) {
block_size *= input_shape[dim];
last_sliced_dim--;
}
// Initialize the read pos for each dimension according to the begin offsets.
int read_pos[MAX_RANK] = {0};
for (int dim = 0; dim < input_rank; ++dim) {
read_pos[dim] = dim_begin[dim];
}
while (read_pos[0] != dim_end[0]) {
// Compute the read offset for the current position.
int32_t read_offset = 0;
for (int dim = 0; dim <= last_sliced_dim; ++dim) {
const int addr_stride = dim_addr_strides[dim];
if (read_pos[dim] < 0) {
read_offset += (input_shape[dim] + read_pos[dim]) * addr_stride;
} else {
read_offset += read_pos[dim] * addr_stride;
}
}
#if USE_TYPED_MEMSETMEMCPY
std::copy(input_values + read_offset,
input_values + read_offset + block_size, output_values);
#else
std::memcpy(output_values, input_values + read_offset,
block_size * sizeof(T));
#endif
output_values += block_size;
// Advance the read position.
for (int dim = last_sliced_dim; dim >= 0; --dim) {
read_pos[dim] += strides[dim];
if (dim == 0 || read_pos[dim] != dim_end[dim])
break;
read_pos[dim] = dim_begin[dim];
}
}
}
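// Worked example (for illustration): for an input of shape {4, 5} with
// begin == {0, 1}, end == {0, 3}, strides == {1, 1}, begin_mask == 1 and
// end_mask == 1, bit 0 of both masks marks dimension 0 as implicit, so all 4
// rows are kept while columns are sliced to [1, 3), giving a {4, 2} output.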
template <typename T>
void TransposeRank3(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict perm,
T* __restrict output_values) {
BENCHMARK_TIMER("TransposeRank3");
const int32_t in_strides[3] = {
input_shape[1] * input_shape[2],
input_shape[2],
1,
};
const int32_t out_strides[3] = {in_strides[perm[0]], in_strides[perm[1]],
in_strides[perm[2]]};
const int32_t out_shape[3] = {input_shape[perm[0]], input_shape[perm[1]],
input_shape[perm[2]]};
int32_t write_offset = 0;
for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) {
const int32_t read_offset0 = it0 * out_strides[0];
for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) {
const int32_t read_offset01 = read_offset0 + it1 * out_strides[1];
for (int32_t it2 = 0; it2 < out_shape[2]; ++it2, ++write_offset) {
const int32_t read_offset = read_offset01 + it2 * out_strides[2];
output_values[write_offset] = input_values[read_offset];
}
}
}
}
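// Worked example (for illustration): with an input of shape {2, 3, 4} and
// perm == {0, 2, 1}, the last two dimensions are swapped and the output has
// shape {2, 4, 3}. TransposeRank4 below generalizes the same scheme to four
// dimensions.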
template <typename T>
void TransposeRank4(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict perm,
T* __restrict output_values) {
BENCHMARK_TIMER("TransposeRank4");
const int32_t in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3],
input_shape[3],
1,
};
const int32_t out_strides[4] = {in_strides[perm[0]], in_strides[perm[1]],
in_strides[perm[2]], in_strides[perm[3]]};
const int32_t out_shape[4] = {input_shape[perm[0]], input_shape[perm[1]],
input_shape[perm[2]], input_shape[perm[3]]};
int32_t write_offset = 0;
for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) {
const int32_t read_offset0 = it0 * out_strides[0];
for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) {
const int32_t read_offset01 = read_offset0 + it1 * out_strides[1];
for (int32_t it2 = 0; it2 < out_shape[2]; ++it2) {
const int32_t read_offset012 = read_offset01 + it2 * out_strides[2];
for (int32_t it3 = 0; it3 < out_shape[3]; ++it3, ++write_offset) {
const int32_t read_offset = read_offset012 + it3 * out_strides[3];
output_values[write_offset] = input_values[read_offset];
}
}
}
}
}
template <typename T, typename TIdx, typename TDepth>
void OneHot(const int32_t input_rank,
const int32_t* __restrict input_shape,
const TIdx* __restrict input_values,
const TDepth* __restrict depth,
const T* __restrict on_value,
const T* __restrict off_value,
const int32_t axis,
T* __restrict output_values) {
BENCHMARK_TIMER("OneHot");
const int32_t num_elements = ShapeSize(input_rank, input_shape);
// We can assume axis >= 0 in this implementation.
const int32_t prefix_dim_size = ShapeSize(axis, input_shape);
const int32_t suffix_dim_size = num_elements / prefix_dim_size;
int32_t write_offset = 0;
for (int32_t i = 0; i < prefix_dim_size; i++) {
int32_t read_offset_pre = i * suffix_dim_size;
for (TDepth d = 0; d < *depth; d++) {
for (int32_t j = 0; j < suffix_dim_size; j++, write_offset++) {
const int32_t read_offset = read_offset_pre + j;
output_values[write_offset] =
(input_values[read_offset] == d) ? *on_value : *off_value;
}
}
}
}
template <typename T, typename TIdx, typename TDepth>
void OneHotLastDim(const int32_t input_rank,
const int32_t* __restrict input_shape,
const TIdx* __restrict input_values,
const TDepth* __restrict depth,
const T* __restrict on_value,
const T* __restrict off_value,
T* __restrict output_values) {
BENCHMARK_TIMER("OneHotLastDim");
const int32_t num_elements = ShapeSize(input_rank, input_shape);
int32_t write_offset = 0;
for (int32_t i = 0; i < num_elements; i++) {
for (TDepth d = 0; d < *depth; d++, write_offset++) {
output_values[write_offset] =
(input_values[i] == d) ? *on_value : *off_value;
}
}
}
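// Worked example (for illustration): OneHotLastDim with input values {0, 2},
// *depth == 3, *on_value == 1 and *off_value == 0 produces
// {1, 0, 0, 0, 0, 1}, i.e. one 3-wide one-hot row per input element; OneHot
// instead places the depth dimension at the given axis.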
// -----------------------------------------------------------------------------
// Simple unary ops
// -----------------------------------------------------------------------------
// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.
#if USE_EIGEN
#define SIMPLE_UNARY_OP(OP_NAME, _, EXPR_EIGEN) \
template <typename T> \
void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, \
T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME); \
const int size = ShapeSize(rank, input_shape); \
auto values = ConstRowVectorMap<T>(input_values, size).array(); \
auto output = RowVectorMap<T>(output_values, size).array(); \
output = EXPR_EIGEN; \
}
#else
#define SIMPLE_UNARY_OP(OP_NAME, EXPR, _) \
template <typename T> \
void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, \
T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME); \
const int size = ShapeSize(rank, input_shape); \
for (int i = 0; i < size; ++i) { \
const T value = input_values[i]; \
output_values[i] = EXPR; \
} \
}
#endif
// Second macro param is value expression, third entry is Eigen vector
// expression.
SIMPLE_UNARY_OP(Abs, std::abs(value), values.abs())
SIMPLE_UNARY_OP(Acos, std::acos(value), values.acos())
SIMPLE_UNARY_OP(Asin, std::asin(value), values.asin())
SIMPLE_UNARY_OP(Atan, std::atan(value), values.atan())
SIMPLE_UNARY_OP(Cos, std::cos(value), values.cos())
SIMPLE_UNARY_OP(Cosh, std::cosh(value), values.cosh())
SIMPLE_UNARY_OP(Exp, std::exp(value), values.exp())
SIMPLE_UNARY_OP(Elu,
value < 0 ? std::expm1(value) : value,
// Use branchless version of Elu: min(ReLU, e^x - 1)
values.max(0).min(values.exp() - 1))
SIMPLE_UNARY_OP(Log, std::log(value), values.log())
SIMPLE_UNARY_OP(Log1p, std::log1p(value), values.log1p())
SIMPLE_UNARY_OP(Neg, -value, -values)
SIMPLE_UNARY_OP(Reciprocal, static_cast<T>(1) / value, values.cwiseInverse())
SIMPLE_UNARY_OP(Relu, std::max(value, static_cast<T>(0)), values.max(0))
SIMPLE_UNARY_OP(Relu6,
std::min(std::max(value, static_cast<T>(0)), static_cast<T>(6)),
values.max(0).min(6))
SIMPLE_UNARY_OP(Rsqrt, static_cast<T>(1) / std::sqrt(value), values.rsqrt())
SIMPLE_UNARY_OP(Sigmoid,
static_cast<T>(1) / (1 + std::exp(-value)),
((-values).exp() + 1).cwiseInverse())
SIMPLE_UNARY_OP(Sin, std::sin(value), values.sin())
SIMPLE_UNARY_OP(Sinh, std::sinh(value), values.sinh())
SIMPLE_UNARY_OP(Sqrt, std::sqrt(value), values.sqrt())
SIMPLE_UNARY_OP(Square, value* value, values.square())
SIMPLE_UNARY_OP(Tan, std::tan(value), values.tan())
SIMPLE_UNARY_OP(Tanh, std::tanh(value), values.tanh())
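// Note (for illustration): each instantiation above expands to an elementwise
// loop (or Eigen array expression); e.g. Relu<float> writes
// std::max(value, 0.0f) for every coefficient and is the activation used in
// Inference() below.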
// -----------------------------------------------------------------------------
// Broadcasting binary ops
// -----------------------------------------------------------------------------
template <typename T, typename OP>
void OpNoBroadcast(const int32_t left_rank,
const int32_t* __restrict left_shape,
const T* __restrict left_values,
const int32_t right_rank,
const int32_t* __restrict right_shape,
const T* __restrict right_values,
T* __restrict output_values,
OP op) {
BENCHMARK_TIMER(op.name, "NoBroadcast");
const int32_t size = ShapeSize(left_rank, left_shape);
#if USE_EIGEN
auto lhs = ConstRowVectorMap<T>(left_values, size).array();
auto rhs = ConstRowVectorMap<T>(right_values, size).array();
auto output = RowVectorMap<T>(output_values, size).array();
op.apply(lhs, rhs, output);
#else
for (int32_t i = 0; i < size; ++i) {
output_values[i] = op(left_values[i], right_values[i]);
}
#endif
}
template <typename T, typename OP>
void OpInnerBroadcast(int32_t left_rank,
const int32_t* __restrict left_shape,
const T* __restrict left_values,
int32_t right_rank,
const int32_t* __restrict right_shape,
const T* __restrict right_values,
T* __restrict output_values,
OP op) {
BENCHMARK_TIMER(op.name, "InnerBroadcast");
const int32_t output_size = ShapeSize(left_rank, left_shape);
const int32_t inner_size = ShapeSize(right_rank, right_shape);
const int32_t outer_size = output_size / inner_size;
#if USE_EIGEN
if (inner_size == 1) {
// Apply the same value to all elements.
auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
auto output = MatrixMap<T>(output_values, inner_size, outer_size);
op.apply(left.array(), right_values[0], output.array());
} else {
auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
auto right = ConstRowVectorMap<T>(right_values, inner_size);
auto output = MatrixMap<T>(output_values, inner_size, outer_size);
for (int32_t col = 0; col < outer_size; col++) {
op.apply(left.col(col).array(), right.array(), output.col(col).array());
}
}
#else
for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) {
for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) {
const int32_t offset = idx_out * inner_size + idx_in;
output_values[offset] = op(left_values[offset], right_values[idx_in]);
}
}
#endif
}
#define BROADCAST_BINARY_OP(OP_NAME, EXPR, EXPR_EIGEN) \
template <typename T> \
struct Op##OP_NAME { \
const char* name = #OP_NAME; \
T operator()(const T lhs, const T rhs) { return EXPR; } \
template <typename X, typename Y, typename Z> \
void apply(const X& lhs, const Y& rhs, Z out) { \
out = EXPR_EIGEN; \
} \
}; \
template <typename T> \
void OP_NAME##NoBroadcast( \
const int32_t left_rank, const int32_t* __restrict left_shape, \
const T* __restrict left_values, const int32_t right_rank, \
const int32_t* __restrict right_shape, const T* __restrict right_values, \
T* __restrict output_values) { \
OpNoBroadcast(left_rank, left_shape, left_values, right_rank, right_shape, \
right_values, output_values, Op##OP_NAME<T>()); \
} \
template <typename T> \
void OP_NAME##InnerBroadcast( \
const int32_t left_rank, const int32_t* __restrict left_shape, \
const T* __restrict left_values, const int32_t right_rank, \
const int32_t* __restrict right_shape, const T* __restrict right_values, \
T* __restrict output_values) { \
OpInnerBroadcast(left_rank, left_shape, left_values, right_rank, \
right_shape, right_values, output_values, \
Op##OP_NAME<T>()); \
}
// Second macro param is value expression, third entry is Eigen vector
// expression.
BROADCAST_BINARY_OP(Add, lhs + rhs, lhs + rhs)
BROADCAST_BINARY_OP(Maximum, std::max(lhs, rhs), lhs.max(rhs))
BROADCAST_BINARY_OP(Minimum, std::min(lhs, rhs), lhs.min(rhs))
BROADCAST_BINARY_OP(Mul, lhs* rhs, lhs* rhs)
BROADCAST_BINARY_OP(Sub, lhs - rhs, lhs - rhs)
BROADCAST_BINARY_OP(SquaredDifference,
(lhs - rhs) * (lhs - rhs),
(lhs - rhs).square())
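// Worked example (for illustration): AddInnerBroadcast with a {2, 3} left
// tensor and a {3} right tensor adds the 3-element right vector to each of
// the two rows of the left tensor.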
// -----------------------------------------------------------------------------
// Reduce ops
// -----------------------------------------------------------------------------
// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.
#define REDUCE_OP(OP_NAME, DEFAULT_VALUE, UPDATE_EXPR, RESULT_EXPR) \
template <typename T, typename Tidx> \
void OP_NAME##InnerReduce( \
int32_t input_rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, int32_t index_tensor_rank, \
const int32_t* __restrict index_shape, \
const Tidx* __restrict index_values, T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME, "InnerReduce"); \
const int32_t inner_size = \
GetReduceInnerSize(input_rank, input_shape, index_tensor_rank, \
index_shape, index_values); \
const int32_t input_size = ShapeSize(input_rank, input_shape); \
const int32_t outer_size = input_size / inner_size; \
for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) { \
T value = DEFAULT_VALUE; \
for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) { \
const T prev = value; \
const T next = input_values[idx_out * inner_size + idx_in]; \
value = UPDATE_EXPR; \
} \
const T count = inner_size; \
(void)sizeof(count); \
output_values[idx_out] = RESULT_EXPR; \
} \
} \
template <typename T, typename Tidx> \
void OP_NAME##GenericReduceRank4( \
int32_t input_rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, int32_t index_tensor_rank, \
const int32_t* __restrict index_shape, \
const Tidx* __restrict index_values, T* __restrict output_values) { \
assert(input_rank == 4); \
assert(index_tensor_rank <= 1); \
BENCHMARK_TIMER(#OP_NAME, "GenericReduceRank4"); \
int out_shape[4] = {input_shape[0], input_shape[1], input_shape[2], \
input_shape[3]}; \
bool reduce_mask[4] = {false, false, false, false}; \
const int num_indices = index_tensor_rank > 0 ? index_shape[0] : 1; \
for (int i = 0; i < num_indices; ++i) { \
reduce_mask[index_values[i]] = true; \
out_shape[index_values[i]] = 1; \
} \
const int out_strides[4] = { \
reduce_mask[0] ? 0 : out_shape[1] * out_shape[2] * out_shape[3], \
reduce_mask[1] ? 0 : out_shape[2] * out_shape[3], \
reduce_mask[2] ? 0 : out_shape[3], \
reduce_mask[3] ? 0 : 1, \
}; \
const int output_size = ShapeSize(input_rank, out_shape); \
std::fill_n(output_values, output_size, DEFAULT_VALUE); \
for (int dim0 = 0; dim0 < input_shape[0]; ++dim0) { \
for (int dim1 = 0; dim1 < input_shape[1]; ++dim1) { \
for (int dim2 = 0; dim2 < input_shape[2]; ++dim2) { \
for (int dim3 = 0; dim3 < input_shape[3]; ++dim3, ++input_values) { \
T* out_ptr = output_values + out_strides[0] * dim0 + \
out_strides[1] * dim1 + out_strides[2] * dim2 + \
out_strides[3] * dim3; \
const T prev = *out_ptr; \
const T next = *input_values; \
*out_ptr = UPDATE_EXPR; \
} \
} \
} \
} \
const T count = (reduce_mask[0] ? input_shape[0] : 1) * \
(reduce_mask[1] ? input_shape[1] : 1) * \
(reduce_mask[2] ? input_shape[2] : 1) * \
(reduce_mask[3] ? input_shape[3] : 1); \
(void)sizeof(count); \
for (int i = 0; i < output_size; ++i) { \
const T value = output_values[i]; \
output_values[i] = RESULT_EXPR; \
} \
}
REDUCE_OP(Max, std::numeric_limits<T>::lowest(), std::max(prev, next), value)
REDUCE_OP(Sum, 0, prev + next, value)
REDUCE_OP(Mean, 0, prev + next, value / count)
#undef REDUCE_OP
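// Worked example (for illustration): MeanInnerReduce on a {2, 3} input with
// index_values == {1} averages each row of 3 values into a {2} output;
// MaxInnerReduce and SumInnerReduce follow the same pattern with their
// respective update expressions.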
// -----------------------------------------------------------------------------
// Dequantize ops
// -----------------------------------------------------------------------------
template <typename T>
void DequantizeMinCombined(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const float* __restrict min_range,
const float* __restrict max_range,
float* __restrict output_values) {
BENCHMARK_TIMER("DequantizeMinCombined");
const int size = ShapeSize(rank, input_shape);
const float offset =
std::is_signed<T>::value
? (static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min() + 1) /
2.0f
: 0.0f;
const float range_scale = (max_range[0] - min_range[0]) /
(static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min());
for (int i = 0; i < size; i++) {
output_values[i] =
((static_cast<int32_t>(input_values[i]) + offset) * range_scale) +
min_range[0];
}
}
template <typename T>
void DequantizeMinFirst(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const float* __restrict min_range,
const float* __restrict max_range,
float* __restrict output_values) {
BENCHMARK_TIMER("DequantizeMinFirst");
const int size = ShapeSize(rank, input_shape);
const float range_scale = (max_range[0] - min_range[0]) /
(static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min());
const float range_min_rounded =
(max_range[0] == min_range[0]
? min_range[0]
: round(min_range[0] / range_scale) * range_scale);
for (int i = 0; i < size; i++) {
output_values[i] = ((static_cast<int32_t>(input_values[i]) -
std::numeric_limits<T>::min()) *
range_scale) +
range_min_rounded;
}
}
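// Worked example (for illustration): for T == uint8_t with min_range[0] ==
// -1.0f and max_range[0] == 1.0f, range_scale is 2 / 255, so
// DequantizeMinCombined maps the quantized value 0 to -1.0f and 255 to 1.0f,
// interpolating linearly in between.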
// -----------------------------------------------------------------------------
// CONSTANTS
// Note that for now, endianness of the target machine needs to match that of
// the one training was performed on.
// -----------------------------------------------------------------------------
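// Note (for illustration): each union below aliases the raw little-endian
// IEEE-754 byte encoding of the weights onto a float array, which is why the
// endianness caveat above applies; e.g. the first four weight bytes
// 0xbc 0x22 0x0a 0xbf decode to approximately -0.54f.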
const int32_t dnn_hiddenlayer_0_weights_part_0_shape[2] = {4, 10};
const union {
uint8_t bytes[160];
float values[40];
} dnn_hiddenlayer_0_weights_part_0 = {{
0xbc, 0x22, 0x0a, 0xbf, 0xb4, 0x46, 0x8c, 0x3f, 0xba, 0x31, 0x34, 0xbe,
0x4c, 0x65, 0xdb, 0xbe, 0xf0, 0x54, 0x5e, 0xbe, 0xc1, 0x5d, 0xb3, 0x3f,
0xf4, 0xe6, 0x15, 0xbf, 0x05, 0xc6, 0x34, 0xbf, 0xc0, 0x37, 0x7e, 0xbd,
0x6c, 0x35, 0x0b, 0xbf, 0xca, 0x53, 0x26, 0xbf, 0x58, 0xb4, 0x87, 0x3f,
0x37, 0xee, 0x39, 0xbf, 0xda, 0xfa, 0xf9, 0xbe, 0x97, 0xc1, 0x06, 0xbf,
0xf9, 0x4e, 0x81, 0x3f, 0xb2, 0x44, 0x85, 0xbf, 0x7f, 0x98, 0x7c, 0x3d,
0x15, 0x26, 0xbc, 0xbe, 0x5c, 0x48, 0x05, 0x3f, 0xc8, 0xaa, 0xa1, 0xbd,
0x35, 0xb3, 0x43, 0xbe, 0xeb, 0x46, 0x91, 0x3f, 0x80, 0x71, 0xe3, 0x3c,
0xd1, 0x98, 0x79, 0x3f, 0x3c, 0xd0, 0x0d, 0xbf, 0x1e, 0x02, 0xd3, 0x3e,
0x5d, 0x4b, 0xa2, 0xbf, 0x68, 0xac, 0xaa, 0xbd, 0xf8, 0xe1, 0x75, 0x3e,
0x4a, 0x9c, 0x27, 0xbe, 0xf8, 0xae, 0xb2, 0xbe, 0x7f, 0x9d, 0x91, 0x3f,
0x1e, 0x8b, 0xa8, 0xbe, 0x35, 0x7e, 0xb2, 0x3f, 0xbe, 0x8c, 0xd3, 0xbe,
0xf9, 0xcd, 0xb5, 0x3f, 0xa1, 0x50, 0xaa, 0x3f, 0xe4, 0x6d, 0xdd, 0xbe,
0x0d, 0xce, 0xd3, 0xbe,
}};
const int32_t dnn_hiddenlayer_0_biases_part_0_shape[1] = {10};
const union {
uint8_t bytes[40];
float values[10];
} dnn_hiddenlayer_0_biases_part_0 = {{
0x00, 0x00, 0x00, 0x00, 0xbf, 0x6a, 0x53, 0x3e, 0xd3, 0xc1,
0xd0, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd8, 0xc0, 0x3e,
0xca, 0xe7, 0x35, 0x3e, 0x23, 0xa5, 0x44, 0x3f, 0x61, 0xfd,
0xd2, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xe0, 0x43, 0x3c,
}};
const int32_t dnn_logits_biases_part_0_shape[1] = {1};
const union {
uint8_t bytes[4];
float values[1];
} dnn_logits_biases_part_0 = {{
0x75,
0xca,
0xd7,
0xbe,
}};
const int32_t dnn_logits_weights_part_0_shape[2] = {10, 1};
const union {
uint8_t bytes[40];
float values[10];
} dnn_logits_weights_part_0 = {{
0x13, 0x12, 0x39, 0x3f, 0xf3, 0xa5, 0xc2, 0xbf, 0x81, 0x7f,
0xbe, 0x3f, 0xf8, 0x17, 0x26, 0x3e, 0xa4, 0x19, 0xa6, 0x3f,
0xf0, 0xc9, 0xb7, 0xbf, 0x6a, 0x99, 0xd2, 0x3f, 0x8a, 0x7d,
0xe9, 0x3f, 0x83, 0x9a, 0x3a, 0xbf, 0xf1, 0x6c, 0x08, 0x3e,
}};
} // anonymous namespace
// -----------------------------------------------------------------------------
// INFERENCE
// -----------------------------------------------------------------------------
int32_t input0Shape[2] = {1, 4};
int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0Shape[2] = {1, 1};
void Inference(
    const float* __restrict input0 /* shape: 1,4 */,
    float* __restrict
        logits_MatMul_merged_with_dnn_logits_BiasAdd0 /* shape: 1,1 */,
    FixedAllocations* __restrict fixed) {
const int32_t input0_shape[] = {1, 4};
int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[2];
// dnn/hiddenlayer_0/MatMul_merged_with_dnn/hiddenlayer_0/BiasAdd
FullyConnected<float>(input0_shape, input0,
dnn_hiddenlayer_0_weights_part_0_shape,
dnn_hiddenlayer_0_weights_part_0.values,
dnn_hiddenlayer_0_biases_part_0_shape,
dnn_hiddenlayer_0_biases_part_0.values, fixed->alloc0);
fixed->alloc0_shape[0] = 1;
fixed->alloc0_shape[1] = 10;
// dnn/hiddenlayer_0/hiddenlayer_0/Relu
Relu<float>(2, // rank
fixed->alloc0_shape, fixed->alloc0, fixed->alloc1);
fixed->alloc1_shape[0] = 1;
fixed->alloc1_shape[1] = 10;
// dnn/logits/MatMul_merged_with_dnn/logits/BiasAdd
FullyConnected<float>(
fixed->alloc1_shape, fixed->alloc1, dnn_logits_weights_part_0_shape,
dnn_logits_weights_part_0.values, dnn_logits_biases_part_0_shape,
dnn_logits_biases_part_0.values,
logits_MatMul_merged_with_dnn_logits_BiasAdd0);
logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[0] = 1;
logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[1] = 1;
}
} // namespace darkmode_tfnative_model
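// Illustrative usage sketch (not part of the generated model), assuming the
// FixedAllocations struct and the Inference() declaration come from
// "darkmode_classifier.h":
//
//   darkmode_tfnative_model::FixedAllocations fixed;
//   const float features[4] = {0.0f, 0.0f, 0.0f, 0.0f};  // shape {1, 4}
//   float logit = 0.0f;                                   // shape {1, 1}
//   darkmode_tfnative_model::Inference(features, &logit, &fixed);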