// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This file is automatically generated using tfNative from a neural network
// trained by TensorFlow. Please do not edit.
#include "darkmode_classifier.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <tuple>
#if USE_EIGEN
#include "third_party/eigen3/Eigen/Core"
#endif
namespace darkmode_tfnative_model {
namespace {
// -----------------------------------------------------------------------------
// OP LIBRARY
// Copied here to make sure that the inference code always stays in sync with
// the lib that it was generated for.
// -----------------------------------------------------------------------------
// Default to using std::copy and std::fill over memcpy and memset as they
// are usually faster, thanks to the compiler getting stricter alignment
// guarantees.
#ifndef USE_TYPED_MEMSETMEMCPY
#define USE_TYPED_MEMSETMEMCPY 1
#endif
// Eigen is not used in this build; the portable scalar fallback paths below
// are compiled instead.
#define USE_EIGEN 0
#ifndef USE_EIGEN
#error Please define USE_EIGEN to either 0 or 1
#endif
// Helper to reinterpret memory as Eigen matrices.
#if USE_EIGEN
template <typename Scalar>
using ConstMatrixMap = typename Eigen::Map<
const Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename Scalar>
using ConstRowVectorMap =
typename Eigen::Map<const Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
template <typename Scalar>
using RowVectorMap =
typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>;
template <typename Scalar>
using MatrixMap =
typename Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>;
#endif
// Benchmarking is disabled in this build; BENCHMARK_TIMER expands to nothing.
#define BENCHMARK_TIMER(...)
// The size of a shape in terms of number of coefficients.
inline int ShapeSize(const int32_t rank, const int32_t* shape) {
int size = 1;
for (int i = 0; i < rank; ++i)
size *= shape[i];
return size;
}
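// Worked example (for illustration): with rank == 3 and shape == {2, 3, 4},
// ShapeSize returns 2 * 3 * 4 == 24 coefficients.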
// Helper to compute the size of the inner loop for an op that uses indices to
// specify which axes are reduced.
template <typename Tidx>
int32_t GetReduceInnerSize(int32_t input_tensor_rank,
const int32_t* __restrict input_shape,
int32_t index_tensor_rank,
const int32_t* __restrict index_shape,
const Tidx* __restrict index_values) {
assert(index_tensor_rank <= 1);
const int32_t num_indices = index_tensor_rank > 0 ? index_shape[0] : 1;
int32_t inner_size = 1;
for (int32_t i = 0; i < num_indices; ++i) {
inner_size *= input_shape[index_values[i]];
}
return inner_size;
}
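// Worked example (for illustration): for an input of shape {2, 3, 4} with
// index_values == {1, 2}, the reduced inner size is 3 * 4 == 12, leaving an
// outer size of 2.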
template <typename T>
void ConcatV2Args2(int32_t arg0_rank,
const int32_t* __restrict arg0_shape,
const T* __restrict arg0_values,
int32_t arg1_rank,
const int32_t* __restrict arg1_shape,
const T* __restrict arg1_values,
const int32_t* __restrict axis_value,
T* __restrict output_values) {
BENCHMARK_TIMER("ConcatV2Args2");
const int axis = axis_value[0];
const int num_lines = ShapeSize(axis, arg0_shape);
const int arg0_line_size = ShapeSize(arg0_rank - axis, arg0_shape + axis);
const int arg1_line_size = ShapeSize(arg1_rank - axis, arg1_shape + axis);
for (int line = 0; line < num_lines; ++line) {
std::copy(arg0_values, arg0_values + arg0_line_size, output_values);
arg0_values += arg0_line_size;
output_values += arg0_line_size;
std::copy(arg1_values, arg1_values + arg1_line_size, output_values);
arg1_values += arg1_line_size;
output_values += arg1_line_size;
}
}
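// Worked example (for illustration): concatenating a {2, 3} tensor with a
// {2, 5} tensor along axis 1 copies one 3-element line and one 5-element line
// per row, yielding a {2, 8} output.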
template <typename T>
void Conv2DAsGemm(const int32_t* __restrict in_shape,
const T* __restrict in_values,
const int32_t* __restrict filter_shape,
const T* __restrict filter_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Conv2DAsGemm");
#if USE_EIGEN
const auto in = ConstMatrixMap<T>(in_values, in_shape[0], in_shape[1]);
const auto filter =
ConstMatrixMap<T>(filter_values, filter_shape[3],
filter_shape[0] * filter_shape[1] * filter_shape[2]);
auto result = MatrixMap<T>(output_values, filter_shape[3], in_shape[1]);
result.noalias() = filter * in;
#else
const int32_t out_rows = in_shape[1];
const int32_t out_cols = filter_shape[3];
const int32_t dot_len = in_shape[0];
for (int row = 0; row < out_rows; ++row) {
for (int col = 0; col < out_cols; ++col) {
T value = 0;
for (int i = 0; i < dot_len; ++i) {
value +=
in_values[row * dot_len + i] * filter_values[i * out_cols + col];
}
*output_values++ = value;
}
}
#endif
}
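// Note (for illustration): Conv2DAsGemm expects the input already rearranged
// by Im2Col (defined below); for each output pixel it dot-products the patch
// of in_shape[0] values against each of the filter_shape[3] filters, emitting
// filter_shape[3] output channels per pixel.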
template <typename T>
void DepthwiseConv2dNative(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict kernel_shape,
const T* __restrict kernel_values,
int32_t stride_y,
int32_t stride_x,
int32_t out_height,
int32_t out_width,
T* __restrict output_values) {
BENCHMARK_TIMER("DepthwiseConv2dNative");
// Give the shape values nicer names.
assert(input_shape[3] == kernel_shape[2]);
const int batch_size = input_shape[0];
const int kernel_height = kernel_shape[0];
const int kernel_width = kernel_shape[1];
const int in_depth = kernel_shape[2];
const int depth_mul = kernel_shape[3];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
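// Worked example (for illustration): for in_height == 5, kernel_height == 3,
// stride_y == 1 and out_height == 5 ("SAME" padding), pad_height is
// ((5 - 1) * 1 + 3 - 5) / 2 == 1.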
// Cache the strides for address computations.
const int in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3], // batch
input_shape[2] * input_shape[3], // y
input_shape[3], // x
1, // channel
};
const int kernel_strides[4] = {
kernel_shape[1] * kernel_shape[2] * kernel_shape[3], // y
kernel_shape[2] * kernel_shape[3], // x
kernel_shape[3], // in channels
1, // channel mult
};
T* out_write_ptr = output_values;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
for (int in_c = 0; in_c < in_depth; ++in_c) {
for (int mul_c = 0; mul_c < depth_mul; ++mul_c, ++out_write_ptr) {
// Convolve.
T sum = 0;
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
const int in_y = in_y_origin + k_y;
assert(in_y >= 0 && in_y < in_height);
for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) {
const int in_x = in_x_origin + k_x;
assert(in_x >= 0 && in_x < in_width);
const T input_value =
input_values[batch * in_strides[0] + // batch
in_y * in_strides[1] + // y
in_x * in_strides[2] + // x
in_c]; // in chan
const T kernel_value =
kernel_values[k_y * kernel_strides[0] + // y
k_x * kernel_strides[1] + // x
in_c * kernel_strides[2] + // in chan
mul_c]; // chan mult
sum += input_value * kernel_value;
}
}
*out_write_ptr = sum;
} // mul_c
} // in_c
} // out_x
} // out_y
} // batch
}
template <typename T>
void FullyConnected(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict weight_shape,
const T* __restrict weight_values,
const int32_t* __restrict bias_shape,
const T* __restrict bias_values,
T* __restrict output_values) {
BENCHMARK_TIMER("FullyConnected");
#if USE_EIGEN
const auto in =
ConstMatrixMap<T>(input_values, input_shape[1], input_shape[0]);
const auto weight =
ConstMatrixMap<T>(weight_values, weight_shape[1], weight_shape[0]);
const auto bias = ConstRowVectorMap<T>(bias_values, bias_shape[0]);
auto result = MatrixMap<T>(output_values, weight_shape[1], input_shape[0]);
result.noalias() = (weight * in).colwise() + bias;
#else
const int batch_size = input_shape[0];
const int num_inputs = weight_shape[0];
const int num_outputs = weight_shape[1];
assert(input_shape[1] == num_inputs);
assert(bias_shape[0] == num_outputs);
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_i = 0; out_i < num_outputs; ++out_i) {
T value = 0;
for (int in_i = 0; in_i < num_inputs; ++in_i) {
value += input_values[batch * num_inputs + in_i] *
weight_values[in_i * num_outputs + out_i];
}
value += bias_values[out_i];
output_values[batch * num_outputs + out_i] = value;
}
}
#endif
}
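// Note (for illustration): FullyConnected computes
// output[batch][out] = sum_in input[batch][in] * weight[in][out] + bias[out].
// The model below uses it twice: a {1, 4} x {4, 10} + {10} hidden layer and a
// {1, 10} x {10, 1} + {1} logits layer.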
template <typename T, typename TIndex>
void Gather(int params_rank,
const int32_t* __restrict params_shape,
const T* __restrict params_values,
int indices_rank,
const int32_t* __restrict indices_shape,
const TIndex* __restrict indices_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Gather");
const int num_indices = ShapeSize(indices_rank, indices_shape);
const int num_params = params_shape[0];
const int slice_size = ShapeSize(params_rank - 1, params_shape + 1);
for (int i = 0; i < num_indices; ++i) {
const int index = indices_values[i];
if (index < 0 || index >= num_params) {
std::fill(output_values, output_values + slice_size, 0);
} else {
std::copy(params_values + index * slice_size,
params_values + index * slice_size + slice_size, output_values);
}
output_values += slice_size;
}
}
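// Worked example (for illustration): with params of shape {4, 2} and indices
// {3, 0}, the output is a {2, 2} tensor holding row 3 followed by row 0 of
// params; out-of-range indices produce zero-filled slices.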
template <typename T, typename TIndex>
void Im2Col(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict kernel_shape,
int32_t stride_y,
int32_t stride_x,
int32_t out_height,
int32_t out_width,
TIndex* output_shape,
T* __restrict output_values) {
BENCHMARK_TIMER("Im2Col");
// Give the shape values nicer names.
assert(input_shape[3] == kernel_shape[2]);
const int batch_size = input_shape[0];
const int kernel_height = kernel_shape[0];
const int kernel_width = kernel_shape[1];
const int in_depth = kernel_shape[2];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
// Cache the strides for address computations.
const int x_stride = input_shape[3];
const int y_stride = input_shape[2] * x_stride;
const int batch_stride = input_shape[1] * y_stride;
// Write the output shape.
output_shape[0] = kernel_height * kernel_width * in_depth;
output_shape[1] = input_shape[0] * out_width * out_height;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
// Padding top.
if (kernel_y_start != 0) {
const int num_lines = kernel_y_start;
const int num_coeffs = num_lines * kernel_width * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
// Padding left.
if (kernel_x_start != 0) {
const int num_coeffs = kernel_x_start * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
// Valid values.
{
const int in_y = in_y_origin + k_y;
const int in_x = in_x_origin + kernel_x_start;
const int num_coeffs = (kernel_x_end - kernel_x_start) * in_depth;
#if USE_TYPED_MEMSETMEMCPY
const int offset =
batch * batch_stride + in_y * y_stride + in_x * x_stride;
std::copy(input_values + offset, input_values + offset + num_coeffs,
output_values);
#else
std::memcpy(output_values,
input_values // Reusing the restricted pointer.
+ batch * batch_stride // batch
+ in_y * y_stride // y
+ in_x * x_stride, // x
num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
// Padding right.
if (kernel_x_end != kernel_width) {
const int num_coeffs = (kernel_width - kernel_x_end) * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
}
// Padding bottom.
if (kernel_y_end != kernel_height) {
const int num_lines = kernel_height - kernel_y_end;
const int num_coeffs = num_lines * kernel_width * in_depth;
#if USE_TYPED_MEMSETMEMCPY
std::fill(output_values, output_values + num_coeffs, 0);
#else
std::memset(output_values, 0, num_coeffs * sizeof(T));
#endif
output_values += num_coeffs;
}
}
}
}
}
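// Note (for illustration): Im2Col writes, for every output pixel, the
// zero-padded kernel_height * kernel_width * in_depth input patch as one
// contiguous block, producing a matrix of shape
// {kernel_height * kernel_width * in_depth, batch * out_height * out_width}
// suitable for Conv2DAsGemm above.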
template <typename T>
void MaxPool(const int32_t* __restrict input_shape,
const T* __restrict input_values,
int32_t stride_y,
int32_t stride_x,
int32_t kernel_height,
int32_t kernel_width,
int32_t out_height,
int32_t out_width,
T* __restrict output_values) {
BENCHMARK_TIMER("MaxPool");
// Give the shape values nicer names.
const int batch_size = input_shape[0];
const int in_height = input_shape[1];
const int in_width = input_shape[2];
const int depth = input_shape[3];
// Compute the amount of padding needed to get the desired output size.
const int pad_height =
((out_height - 1) * stride_y + kernel_height - in_height) / 2;
const int pad_width =
((out_width - 1) * stride_x + kernel_width - in_width) / 2;
// Cache the strides for address computations.
const int in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3], // batch
input_shape[2] * input_shape[3], // y
input_shape[3], // x
1, // channel
};
T* out_write_ptr = output_values;
for (int batch = 0; batch < batch_size; ++batch) {
for (int out_y = 0; out_y < out_height; ++out_y) {
for (int out_x = 0; out_x < out_width; ++out_x) {
// Compute the input read offsets.
const int in_y_origin = (out_y * stride_y) - pad_height;
const int in_x_origin = (out_x * stride_x) - pad_width;
// Compute the range of the kernel to be applied (we may need to clip
// when we'd read outside of the valid input region - for SAME).
const int kernel_y_start = std::max(0, -in_y_origin);
const int kernel_y_end =
std::min(kernel_height, in_height - in_y_origin);
const int kernel_x_start = std::max(0, -in_x_origin);
const int kernel_x_end = std::min(kernel_width, in_width - in_x_origin);
for (int chan = 0; chan < depth; ++chan, ++out_write_ptr) {
// Pool over the kernel window.
T max_value = std::numeric_limits<T>::lowest();
for (int k_y = kernel_y_start; k_y < kernel_y_end; ++k_y) {
const int in_y = in_y_origin + k_y;
assert(in_y >= 0 && in_y < in_height);
for (int k_x = kernel_x_start; k_x < kernel_x_end; ++k_x) {
const int in_x = in_x_origin + k_x;
assert(in_x >= 0 && in_x < in_width);
const T input_value =
input_values[batch * in_strides[0] + // batch
in_y * in_strides[1] + // y
in_x * in_strides[2] + // x
chan]; // channel
max_value = std::max(max_value, input_value);
} // kernel_x
} // kernel_y
*out_write_ptr = max_value;
} // chan
} // out_x
} // out_y
} // batch
}
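// Worked example (for illustration): MaxPool with a 2x2 kernel, stride 2 and
// output dimensions of half the input keeps, for each channel, the maximum of
// every non-overlapping 2x2 input window.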
template <typename T>
void Memcpy(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
T* __restrict output_values) {
BENCHMARK_TIMER("Memcpy");
const int size = ShapeSize(rank, input_shape);
for (int i = 0; i < size; ++i) {
output_values[i] = input_values[i];
}
}
template <typename T>
void Softmax(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t reduce_dim,
T* __restrict output_values,
T* __restrict scratch_values) {
BENCHMARK_TIMER("Softmax");
const int size = ShapeSize(rank, input_shape);
if (rank == 2 && reduce_dim == 1) {
T logits_max = std::numeric_limits<T>::lowest();
// Max.
for (int i = 0; i < size; ++i) {
logits_max = std::max(logits_max, input_values[i]);
}
// Pre-compute exp.
for (int i = 0; i < size; ++i) {
scratch_values[i] = std::exp(input_values[i] - logits_max);
}
// Sum over the last dimension, then divide the exps and write out.
for (int offset = 0; offset < size; offset += input_shape[1]) {
T sum = 0;
const int end_offset = offset + input_shape[1];
for (int i = offset; i < end_offset; ++i)
sum += scratch_values[i];
const T rcp_denom = static_cast<T>(1) / sum;
for (int i = 0; i < input_shape[1]; ++i) {
output_values[offset + i] = scratch_values[offset + i] * rcp_denom;
}
}
} else {
assert(false && "Generic Softmax not yet supported.");
}
}
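// Worked example (for illustration): for a {1, 2} input of {1.0f, 2.0f}, the
// max is 2.0, the shifted exponentials are e^-1 ~= 0.368 and e^0 == 1.0, and
// the output is approximately {0.269, 0.731}.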
// Returns the start position for a slice in a single dimension.
template <typename T>
int StridedSliceBegin(int range_mask,
const T* __restrict range_values,
const T* __restrict strides,
const int32_t* __restrict input_shape,
int dim) {
const bool is_explicit = 0 == (range_mask & (1 << dim));
if (is_explicit) {
return range_values[dim];
} else {
const bool is_reverse = strides[dim] < 0;
return is_reverse ? input_shape[dim] - 1 : 0;
}
}
// Returns the end position for a slice in a single dimension.
template <typename T>
int StridedSliceEnd(int range_mask,
const T* __restrict range_values,
const T* __restrict strides,
const int32_t* __restrict input_shape,
int dim) {
const bool is_explicit = 0 == (range_mask & (1 << dim));
if (is_explicit) {
return range_values[dim];
} else {
const bool is_reverse = strides[dim] < 0;
return is_reverse ? -1 : input_shape[dim];
}
}
template <typename T, typename TIdx>
void StridedSlice(const int32_t input_rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const TIdx* __restrict begin,
const TIdx* __restrict end,
const TIdx* __restrict strides,
int32_t begin_mask,
int32_t end_mask,
T* __restrict output_values) {
BENCHMARK_TIMER("StridedSlice");
const int MAX_RANK = 8;
assert(input_rank < MAX_RANK);
// Compute the address strides for each dimension.
int dim_addr_strides[MAX_RANK] = {0};
dim_addr_strides[input_rank - 1] = 1;
for (int dim = input_rank - 2; dim >= 0; --dim) {
dim_addr_strides[dim] = dim_addr_strides[dim + 1] * input_shape[dim + 1];
}
// Resolve the masks and get explicit ranges for each dimension.
int dim_begin[MAX_RANK];
int dim_end[MAX_RANK];
bool dim_is_full_range[MAX_RANK];
for (int dim = 0; dim < input_rank; ++dim) {
const int stride = strides[dim];
dim_begin[dim] =
StridedSliceBegin(begin_mask, begin, strides, input_shape, dim);
dim_end[dim] = StridedSliceEnd(end_mask, end, strides, input_shape, dim);
dim_is_full_range[dim] =
dim_begin[dim] == 0 && dim_end[dim] == input_shape[dim] && stride == 1;
// Our termination criteria for loops is that we hit the end exactly, so
// we need to ensure that we don't step over the end with stride != 1.
const int length_mod = (dim_end[dim] - dim_begin[dim]) % stride;
if (length_mod != 0) {
dim_end[dim] += stride - length_mod;
}
}
// Find out how large the blocks are that we can copy contiguously. (All
// dimensions on the right for which we fetch the full range)
int last_sliced_dim = input_rank - 1;
int block_size = 1;
for (int dim = input_rank - 1; dim >= 0 && dim_is_full_range[dim]; --dim) {
block_size *= input_shape[dim];
last_sliced_dim--;
}
// Initialize the read pos for each dimension according to the begin offsets.
int read_pos[MAX_RANK] = {0};
for (int dim = 0; dim < input_rank; ++dim) {
read_pos[dim] = dim_begin[dim];
}
while (read_pos[0] != dim_end[0]) {
// Compute the read offset for the current position.
int32_t read_offset = 0;
for (int dim = 0; dim <= last_sliced_dim; ++dim) {
const int addr_stride = dim_addr_strides[dim];
if (read_pos[dim] < 0) {
read_offset += (input_shape[dim] + read_pos[dim]) * addr_stride;
} else {
read_offset += read_pos[dim] * addr_stride;
}
}
#if USE_TYPED_MEMSETMEMCPY
std::copy(input_values + read_offset,
input_values + read_offset + block_size, output_values);
#else
std::memcpy(output_values, input_values + read_offset,
block_size * sizeof(T));
#endif
output_values += block_size;
// Advance the read position.
for (int dim = last_sliced_dim; dim >= 0; --dim) {
read_pos[dim] += strides[dim];
if (dim == 0 || read_pos[dim] != dim_end[dim])
break;
read_pos[dim] = dim_begin[dim];
}
}
}
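// Worked example (for illustration): for an input of shape {4, 5} with
// begin == {0, 1}, end == {0, 3}, strides == {1, 1}, begin_mask == 1 and
// end_mask == 1, bit 0 of both masks marks dimension 0 as implicit, so all 4
// rows are kept while columns are sliced to [1, 3), giving a {4, 2} output.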
template <typename T>
void TransposeRank3(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict perm,
T* __restrict output_values) {
BENCHMARK_TIMER("TransposeRank3");
const int32_t in_strides[3] = {
input_shape[1] * input_shape[2],
input_shape[2],
1,
};
const int32_t out_strides[3] = {in_strides[perm[0]], in_strides[perm[1]],
in_strides[perm[2]]};
const int32_t out_shape[3] = {input_shape[perm[0]], input_shape[perm[1]],
input_shape[perm[2]]};
int32_t write_offset = 0;
for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) {
const int32_t read_offset0 = it0 * out_strides[0];
for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) {
const int32_t read_offset01 = read_offset0 + it1 * out_strides[1];
for (int32_t it2 = 0; it2 < out_shape[2]; ++it2, ++write_offset) {
const int32_t read_offset = read_offset01 + it2 * out_strides[2];
output_values[write_offset] = input_values[read_offset];
}
}
}
}
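// Worked example (for illustration): with an input of shape {2, 3, 4} and
// perm == {0, 2, 1}, the last two dimensions are swapped and the output has
// shape {2, 4, 3}. TransposeRank4 below generalizes the same scheme to four
// dimensions.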
template <typename T>
void TransposeRank4(const int32_t* __restrict input_shape,
const T* __restrict input_values,
const int32_t* __restrict perm,
T* __restrict output_values) {
BENCHMARK_TIMER("TransposeRank4");
const int32_t in_strides[4] = {
input_shape[1] * input_shape[2] * input_shape[3],
input_shape[2] * input_shape[3],
input_shape[3],
1,
};
const int32_t out_strides[4] = {in_strides[perm[0]], in_strides[perm[1]],
in_strides[perm[2]], in_strides[perm[3]]};
const int32_t out_shape[4] = {input_shape[perm[0]], input_shape[perm[1]],
input_shape[perm[2]], input_shape[perm[3]]};
int32_t write_offset = 0;
for (int32_t it0 = 0; it0 < out_shape[0]; ++it0) {
const int32_t read_offset0 = it0 * out_strides[0];
for (int32_t it1 = 0; it1 < out_shape[1]; ++it1) {
const int32_t read_offset01 = read_offset0 + it1 * out_strides[1];
for (int32_t it2 = 0; it2 < out_shape[2]; ++it2) {
const int32_t read_offset012 = read_offset01 + it2 * out_strides[2];
for (int32_t it3 = 0; it3 < out_shape[3]; ++it3, ++write_offset) {
const int32_t read_offset = read_offset012 + it3 * out_strides[3];
output_values[write_offset] = input_values[read_offset];
}
}
}
}
}
template <typename T, typename TIdx, typename TDepth>
void OneHot(const int32_t input_rank,
const int32_t* __restrict input_shape,
const TIdx* __restrict input_values,
const TDepth* __restrict depth,
const T* __restrict on_value,
const T* __restrict off_value,
const int32_t axis,
T* __restrict output_values) {
BENCHMARK_TIMER("OneHot");
const int32_t num_elements = ShapeSize(input_rank, input_shape);
// We can assume axis >= 0 in this implementation.
const int32_t prefix_dim_size = ShapeSize(axis, input_shape);
const int32_t suffix_dim_size = num_elements / prefix_dim_size;
int32_t write_offset = 0;
for (int32_t i = 0; i < prefix_dim_size; i++) {
int32_t read_offset_pre = i * suffix_dim_size;
for (TDepth d = 0; d < *depth; d++) {
for (int32_t j = 0; j < suffix_dim_size; j++, write_offset++) {
const int32_t read_offset = read_offset_pre + j;
output_values[write_offset] =
(input_values[read_offset] == d) ? *on_value : *off_value;
}
}
}
}
template <typename T, typename TIdx, typename TDepth>
void OneHotLastDim(const int32_t input_rank,
const int32_t* __restrict input_shape,
const TIdx* __restrict input_values,
const TDepth* __restrict depth,
const T* __restrict on_value,
const T* __restrict off_value,
T* __restrict output_values) {
BENCHMARK_TIMER("OneHotLastDim");
const int32_t num_elements = ShapeSize(input_rank, input_shape);
int32_t write_offset = 0;
for (int32_t i = 0; i < num_elements; i++) {
for (TDepth d = 0; d < *depth; d++, write_offset++) {
output_values[write_offset] =
(input_values[i] == d) ? *on_value : *off_value;
}
}
}
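// Worked example (for illustration): OneHotLastDim with input values {0, 2},
// *depth == 3, *on_value == 1 and *off_value == 0 produces
// {1, 0, 0, 0, 0, 1}, i.e. one 3-wide one-hot row per input element; OneHot
// instead places the depth dimension at the given axis.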
// -----------------------------------------------------------------------------
// Simple unary ops
// -----------------------------------------------------------------------------
// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.
#if USE_EIGEN
#define SIMPLE_UNARY_OP(OP_NAME, _, EXPR_EIGEN) \
template <typename T> \
void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, \
T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME); \
const int size = ShapeSize(rank, input_shape); \
auto values = ConstRowVectorMap<T>(input_values, size).array(); \
auto output = RowVectorMap<T>(output_values, size).array(); \
output = EXPR_EIGEN; \
}
#else
#define SIMPLE_UNARY_OP(OP_NAME, EXPR, _) \
template <typename T> \
void OP_NAME(const int32_t rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, \
T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME); \
const int size = ShapeSize(rank, input_shape); \
for (int i = 0; i < size; ++i) { \
const T value = input_values[i]; \
output_values[i] = EXPR; \
} \
}
#endif
// Second macro param is value expression, third entry is Eigen vector
// expression.
SIMPLE_UNARY_OP(Abs, std::abs(value), values.abs())
SIMPLE_UNARY_OP(Acos, std::acos(value), values.acos())
SIMPLE_UNARY_OP(Asin, std::asin(value), values.asin())
SIMPLE_UNARY_OP(Atan, std::atan(value), values.atan())
SIMPLE_UNARY_OP(Cos, std::cos(value), values.cos())
SIMPLE_UNARY_OP(Cosh, std::cosh(value), values.cosh())
SIMPLE_UNARY_OP(Exp, std::exp(value), values.exp())
SIMPLE_UNARY_OP(Elu,
value < 0 ? std::expm1(value) : value,
// Use branchless version of Elu: min(ReLU, e^x - 1)
values.max(0).min(values.exp() - 1))
SIMPLE_UNARY_OP(Log, std::log(value), values.log())
SIMPLE_UNARY_OP(Log1p, std::log1p(value), values.log1p())
SIMPLE_UNARY_OP(Neg, -value, -values)
SIMPLE_UNARY_OP(Reciprocal, static_cast<T>(1) / value, values.cwiseInverse())
SIMPLE_UNARY_OP(Relu, std::max(value, static_cast<T>(0)), values.max(0))
SIMPLE_UNARY_OP(Relu6,
std::min(std::max(value, static_cast<T>(0)), static_cast<T>(6)),
values.max(0).min(6))
SIMPLE_UNARY_OP(Rsqrt, static_cast<T>(1) / std::sqrt(value), values.rsqrt())
SIMPLE_UNARY_OP(Sigmoid,
static_cast<T>(1) / (1 + std::exp(-value)),
((-values).exp() + 1).cwiseInverse())
SIMPLE_UNARY_OP(Sin, std::sin(value), values.sin())
SIMPLE_UNARY_OP(Sinh, std::sinh(value), values.sinh())
SIMPLE_UNARY_OP(Sqrt, std::sqrt(value), values.sqrt())
SIMPLE_UNARY_OP(Square, value* value, values.square())
SIMPLE_UNARY_OP(Tan, std::tan(value), values.tan())
SIMPLE_UNARY_OP(Tanh, std::tanh(value), values.tanh())
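// Note (for illustration): each instantiation above expands to an elementwise
// loop (or Eigen array expression); e.g. Relu<float> writes
// std::max(value, 0.0f) for every coefficient and is the activation used in
// Inference() below.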
// -----------------------------------------------------------------------------
// Broadcasting binary ops
// -----------------------------------------------------------------------------
template <typename T, typename OP>
void OpNoBroadcast(const int32_t left_rank,
const int32_t* __restrict left_shape,
const T* __restrict left_values,
const int32_t right_rank,
const int32_t* __restrict right_shape,
const T* __restrict right_values,
T* __restrict output_values,
OP op) {
BENCHMARK_TIMER(op.name, "NoBroadcast");
const int32_t size = ShapeSize(left_rank, left_shape);
#if USE_EIGEN
auto lhs = ConstRowVectorMap<T>(left_values, size).array();
auto rhs = ConstRowVectorMap<T>(right_values, size).array();
auto output = RowVectorMap<T>(output_values, size).array();
op.apply(lhs, rhs, output);
#else
for (int32_t i = 0; i < size; ++i) {
output_values[i] = op(left_values[i], right_values[i]);
}
#endif
}
template <typename T, typename OP>
void OpInnerBroadcast(int32_t left_rank,
const int32_t* __restrict left_shape,
const T* __restrict left_values,
int32_t right_rank,
const int32_t* __restrict right_shape,
const T* __restrict right_values,
T* __restrict output_values,
OP op) {
BENCHMARK_TIMER(op.name, "InnerBroadcast");
const int32_t output_size = ShapeSize(left_rank, left_shape);
const int32_t inner_size = ShapeSize(right_rank, right_shape);
const int32_t outer_size = output_size / inner_size;
#if USE_EIGEN
if (inner_size == 1) {
// Apply the same value to all elements.
auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
auto output = MatrixMap<T>(output_values, inner_size, outer_size);
op.apply(left.array(), right_values[0], output.array());
} else {
auto left = ConstMatrixMap<T>(left_values, inner_size, outer_size);
auto right = ConstRowVectorMap<T>(right_values, inner_size);
auto output = MatrixMap<T>(output_values, inner_size, outer_size);
for (int32_t col = 0; col < outer_size; col++) {
op.apply(left.col(col).array(), right.array(), output.col(col).array());
}
}
#else
for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) {
for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) {
const int32_t offset = idx_out * inner_size + idx_in;
output_values[offset] = op(left_values[offset], right_values[idx_in]);
}
}
#endif
}
#define BROADCAST_BINARY_OP(OP_NAME, EXPR, EXPR_EIGEN) \
template <typename T> \
struct Op##OP_NAME { \
const char* name = #OP_NAME; \
T operator()(const T lhs, const T rhs) { return EXPR; } \
template <typename X, typename Y, typename Z> \
void apply(const X& lhs, const Y& rhs, Z out) { \
out = EXPR_EIGEN; \
} \
}; \
template <typename T> \
void OP_NAME##NoBroadcast( \
const int32_t left_rank, const int32_t* __restrict left_shape, \
const T* __restrict left_values, const int32_t right_rank, \
const int32_t* __restrict right_shape, const T* __restrict right_values, \
T* __restrict output_values) { \
OpNoBroadcast(left_rank, left_shape, left_values, right_rank, right_shape, \
right_values, output_values, Op##OP_NAME<T>()); \
} \
template <typename T> \
void OP_NAME##InnerBroadcast( \
const int32_t left_rank, const int32_t* __restrict left_shape, \
const T* __restrict left_values, const int32_t right_rank, \
const int32_t* __restrict right_shape, const T* __restrict right_values, \
T* __restrict output_values) { \
OpInnerBroadcast(left_rank, left_shape, left_values, right_rank, \
right_shape, right_values, output_values, \
Op##OP_NAME<T>()); \
}
// Second macro param is value expression, third entry is Eigen vector
// expression.
BROADCAST_BINARY_OP(Add, lhs + rhs, lhs + rhs)
BROADCAST_BINARY_OP(Maximum, std::max(lhs, rhs), lhs.max(rhs))
BROADCAST_BINARY_OP(Minimum, std::min(lhs, rhs), lhs.min(rhs))
BROADCAST_BINARY_OP(Mul, lhs* rhs, lhs* rhs)
BROADCAST_BINARY_OP(Sub, lhs - rhs, lhs - rhs)
BROADCAST_BINARY_OP(SquaredDifference,
(lhs - rhs) * (lhs - rhs),
(lhs - rhs).square())
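// Worked example (for illustration): AddInnerBroadcast with a {2, 3} left
// tensor and a {3} right tensor adds the 3-element right vector to each of
// the two rows of the left tensor.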
// -----------------------------------------------------------------------------
// Reduce ops
// -----------------------------------------------------------------------------
// We use macros instead of template functions with templated functors here
// because it's a lot less verbose and easier for the compiler to optimize.
#define REDUCE_OP(OP_NAME, DEFAULT_VALUE, UPDATE_EXPR, RESULT_EXPR) \
template <typename T, typename Tidx> \
void OP_NAME##InnerReduce( \
int32_t input_rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, int32_t index_tensor_rank, \
const int32_t* __restrict index_shape, \
const Tidx* __restrict index_values, T* __restrict output_values) { \
BENCHMARK_TIMER(#OP_NAME, "InnerReduce"); \
const int32_t inner_size = \
GetReduceInnerSize(input_rank, input_shape, index_tensor_rank, \
index_shape, index_values); \
const int32_t input_size = ShapeSize(input_rank, input_shape); \
const int32_t outer_size = input_size / inner_size; \
for (int32_t idx_out = 0; idx_out < outer_size; ++idx_out) { \
T value = DEFAULT_VALUE; \
for (int32_t idx_in = 0; idx_in < inner_size; ++idx_in) { \
const T prev = value; \
const T next = input_values[idx_out * inner_size + idx_in]; \
value = UPDATE_EXPR; \
} \
const T count = inner_size; \
(void)sizeof(count); \
output_values[idx_out] = RESULT_EXPR; \
} \
} \
template <typename T, typename Tidx> \
void OP_NAME##GenericReduceRank4( \
int32_t input_rank, const int32_t* __restrict input_shape, \
const T* __restrict input_values, int32_t index_tensor_rank, \
const int32_t* __restrict index_shape, \
const Tidx* __restrict index_values, T* __restrict output_values) { \
assert(input_rank == 4); \
assert(index_tensor_rank <= 1); \
BENCHMARK_TIMER(#OP_NAME, "GenericReduceRank4"); \
int out_shape[4] = {input_shape[0], input_shape[1], input_shape[2], \
input_shape[3]}; \
bool reduce_mask[4] = {false, false, false, false}; \
const int num_indices = index_tensor_rank > 0 ? index_shape[0] : 1; \
for (int i = 0; i < num_indices; ++i) { \
reduce_mask[index_values[i]] = true; \
out_shape[index_values[i]] = 1; \
} \
const int out_strides[4] = { \
reduce_mask[0] ? 0 : out_shape[1] * out_shape[2] * out_shape[3], \
reduce_mask[1] ? 0 : out_shape[2] * out_shape[3], \
reduce_mask[2] ? 0 : out_shape[3], \
reduce_mask[3] ? 0 : 1, \
}; \
const int output_size = ShapeSize(input_rank, out_shape); \
std::fill_n(output_values, output_size, DEFAULT_VALUE); \
for (int dim0 = 0; dim0 < input_shape[0]; ++dim0) { \
for (int dim1 = 0; dim1 < input_shape[1]; ++dim1) { \
for (int dim2 = 0; dim2 < input_shape[2]; ++dim2) { \
for (int dim3 = 0; dim3 < input_shape[3]; ++dim3, ++input_values) { \
T* out_ptr = output_values + out_strides[0] * dim0 + \
out_strides[1] * dim1 + out_strides[2] * dim2 + \
out_strides[3] * dim3; \
const T prev = *out_ptr; \
const T next = *input_values; \
*out_ptr = UPDATE_EXPR; \
} \
} \
} \
} \
const T count = (reduce_mask[0] ? input_shape[0] : 1) * \
(reduce_mask[1] ? input_shape[1] : 1) * \
(reduce_mask[2] ? input_shape[2] : 1) * \
(reduce_mask[3] ? input_shape[3] : 1); \
(void)sizeof(count); \
for (int i = 0; i < output_size; ++i) { \
const T value = output_values[i]; \
output_values[i] = RESULT_EXPR; \
} \
}
REDUCE_OP(Max, std::numeric_limits<T>::lowest(), std::max(prev, next), value)
REDUCE_OP(Sum, 0, prev + next, value)
REDUCE_OP(Mean, 0, prev + next, value / count)
#undef REDUCE_OP
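// Worked example (for illustration): MeanInnerReduce on a {2, 3} input with
// index_values == {1} averages each row of 3 values into a {2} output;
// MaxInnerReduce and SumInnerReduce follow the same pattern with their
// respective update expressions.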
// -----------------------------------------------------------------------------
// Dequantize ops
// -----------------------------------------------------------------------------
template <typename T>
void DequantizeMinCombined(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const float* __restrict min_range,
const float* __restrict max_range,
float* __restrict output_values) {
BENCHMARK_TIMER("DequantizeMinCombined");
const int size = ShapeSize(rank, input_shape);
const float offset =
std::is_signed<T>::value
? (static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min() + 1) /
2.0f
: 0.0f;
const float range_scale = (max_range[0] - min_range[0]) /
(static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min());
for (int i = 0; i < size; i++) {
output_values[i] =
((static_cast<int32_t>(input_values[i]) + offset) * range_scale) +
min_range[0];
}
}
template <typename T>
void DequantizeMinFirst(const int32_t rank,
const int32_t* __restrict input_shape,
const T* __restrict input_values,
const float* __restrict min_range,
const float* __restrict max_range,
float* __restrict output_values) {
BENCHMARK_TIMER("DequantizeMinFirst");
const int size = ShapeSize(rank, input_shape);
const float range_scale = (max_range[0] - min_range[0]) /
(static_cast<float>(std::numeric_limits<T>::max()) -
std::numeric_limits<T>::min());
const float range_min_rounded =
(max_range[0] == min_range[0]
? min_range[0]
: round(min_range[0] / range_scale) * range_scale);
for (int i = 0; i < size; i++) {
output_values[i] = ((static_cast<int32_t>(input_values[i]) -
std::numeric_limits<T>::min()) *
range_scale) +
range_min_rounded;
}
}
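// Worked example (for illustration): for T == uint8_t with min_range[0] ==
// -1.0f and max_range[0] == 1.0f, range_scale is 2 / 255, so
// DequantizeMinCombined maps the quantized value 0 to -1.0f and 255 to 1.0f,
// interpolating linearly in between.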
// -----------------------------------------------------------------------------
// CONSTANTS
// Note that for now, endianness of the target machine needs to match that of
// the one training was performed on.
// -----------------------------------------------------------------------------
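// Note (for illustration): each union below aliases the raw little-endian
// IEEE-754 byte encoding of the weights onto a float array, which is why the
// endianness caveat above applies; e.g. the first four weight bytes
// 0xbc 0x22 0x0a 0xbf decode to approximately -0.54f.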
const int32_t dnn_hiddenlayer_0_weights_part_0_shape[2] = {4, 10};
const union {
uint8_t bytes[160];
float values[40];
} dnn_hiddenlayer_0_weights_part_0 = {{
0xbc, 0x22, 0x0a, 0xbf, 0xb4, 0x46, 0x8c, 0x3f, 0xba, 0x31, 0x34, 0xbe,
0x4c, 0x65, 0xdb, 0xbe, 0xf0, 0x54, 0x5e, 0xbe, 0xc1, 0x5d, 0xb3, 0x3f,
0xf4, 0xe6, 0x15, 0xbf, 0x05, 0xc6, 0x34, 0xbf, 0xc0, 0x37, 0x7e, 0xbd,
0x6c, 0x35, 0x0b, 0xbf, 0xca, 0x53, 0x26, 0xbf, 0x58, 0xb4, 0x87, 0x3f,
0x37, 0xee, 0x39, 0xbf, 0xda, 0xfa, 0xf9, 0xbe, 0x97, 0xc1, 0x06, 0xbf,
0xf9, 0x4e, 0x81, 0x3f, 0xb2, 0x44, 0x85, 0xbf, 0x7f, 0x98, 0x7c, 0x3d,
0x15, 0x26, 0xbc, 0xbe, 0x5c, 0x48, 0x05, 0x3f, 0xc8, 0xaa, 0xa1, 0xbd,
0x35, 0xb3, 0x43, 0xbe, 0xeb, 0x46, 0x91, 0x3f, 0x80, 0x71, 0xe3, 0x3c,
0xd1, 0x98, 0x79, 0x3f, 0x3c, 0xd0, 0x0d, 0xbf, 0x1e, 0x02, 0xd3, 0x3e,
0x5d, 0x4b, 0xa2, 0xbf, 0x68, 0xac, 0xaa, 0xbd, 0xf8, 0xe1, 0x75, 0x3e,
0x4a, 0x9c, 0x27, 0xbe, 0xf8, 0xae, 0xb2, 0xbe, 0x7f, 0x9d, 0x91, 0x3f,
0x1e, 0x8b, 0xa8, 0xbe, 0x35, 0x7e, 0xb2, 0x3f, 0xbe, 0x8c, 0xd3, 0xbe,
0xf9, 0xcd, 0xb5, 0x3f, 0xa1, 0x50, 0xaa, 0x3f, 0xe4, 0x6d, 0xdd, 0xbe,
0x0d, 0xce, 0xd3, 0xbe,
}};
const int32_t dnn_hiddenlayer_0_biases_part_0_shape[1] = {10};
const union {
uint8_t bytes[40];
float values[10];
} dnn_hiddenlayer_0_biases_part_0 = {{
0x00, 0x00, 0x00, 0x00, 0xbf, 0x6a, 0x53, 0x3e, 0xd3, 0xc1,
0xd0, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd8, 0xc0, 0x3e,
0xca, 0xe7, 0x35, 0x3e, 0x23, 0xa5, 0x44, 0x3f, 0x61, 0xfd,
0xd2, 0x3e, 0x00, 0x00, 0x00, 0x00, 0xb6, 0xe0, 0x43, 0x3c,
}};
const int32_t dnn_logits_biases_part_0_shape[1] = {1};
const union {
uint8_t bytes[4];
float values[1];
} dnn_logits_biases_part_0 = {{
0x75,
0xca,
0xd7,
0xbe,
}};
const int32_t dnn_logits_weights_part_0_shape[2] = {10, 1};
const union {
uint8_t bytes[40];
float values[10];
} dnn_logits_weights_part_0 = {{
0x13, 0x12, 0x39, 0x3f, 0xf3, 0xa5, 0xc2, 0xbf, 0x81, 0x7f,
0xbe, 0x3f, 0xf8, 0x17, 0x26, 0x3e, 0xa4, 0x19, 0xa6, 0x3f,
0xf0, 0xc9, 0xb7, 0xbf, 0x6a, 0x99, 0xd2, 0x3f, 0x8a, 0x7d,
0xe9, 0x3f, 0x83, 0x9a, 0x3a, 0xbf, 0xf1, 0x6c, 0x08, 0x3e,
}};
} // anonymous namespace
// -----------------------------------------------------------------------------
// INFERENCE
// -----------------------------------------------------------------------------
int32_t input0Shape[2] = {1, 4};
int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0Shape[2] = {1, 1};
void Inference(
    const float* __restrict input0 /* shape: 1,4 */,
    float* __restrict
        logits_MatMul_merged_with_dnn_logits_BiasAdd0 /* shape: 1,1 */,
    FixedAllocations* __restrict fixed) {
const int32_t input0_shape[] = {1, 4};
int32_t logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[2];
// dnn/hiddenlayer_0/MatMul_merged_with_dnn/hiddenlayer_0/BiasAdd
FullyConnected<float>(input0_shape, input0,
dnn_hiddenlayer_0_weights_part_0_shape,
dnn_hiddenlayer_0_weights_part_0.values,
dnn_hiddenlayer_0_biases_part_0_shape,
dnn_hiddenlayer_0_biases_part_0.values, fixed->alloc0);
fixed->alloc0_shape[0] = 1;
fixed->alloc0_shape[1] = 10;
// dnn/hiddenlayer_0/hiddenlayer_0/Relu
Relu<float>(2, // rank
fixed->alloc0_shape, fixed->alloc0, fixed->alloc1);
fixed->alloc1_shape[0] = 1;
fixed->alloc1_shape[1] = 10;
// dnn/logits/MatMul_merged_with_dnn/logits/BiasAdd
FullyConnected<float>(
fixed->alloc1_shape, fixed->alloc1, dnn_logits_weights_part_0_shape,
dnn_logits_weights_part_0.values, dnn_logits_biases_part_0_shape,
dnn_logits_biases_part_0.values,
logits_MatMul_merged_with_dnn_logits_BiasAdd0);
logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[0] = 1;
logits_MatMul_merged_with_dnn_logits_BiasAdd0_shape[1] = 1;
}
} // namespace darkmode_tfnative_model
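// Illustrative usage sketch (not part of the generated model), assuming the
// FixedAllocations struct and the Inference() declaration come from
// "darkmode_classifier.h":
//
//   darkmode_tfnative_model::FixedAllocations fixed;
//   const float features[4] = {0.0f, 0.0f, 0.0f, 0.0f};  // shape {1, 4}
//   float logit = 0.0f;                                   // shape {1, 1}
//   darkmode_tfnative_model::Inference(features, &logit, &fixed);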