| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Mehdi Goli Codeplay Software Ltd. |
| // Ralph Potter Codeplay Software Ltd. |
| // Luke Iwanski Codeplay Software Ltd. |
| // Contact: <eigen@codeplay.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| /***************************************************************** |
| * TensorSyclPlaceHolderExpr.h |
| * |
| * \brief: |
| * This is the specialisation of the placeholder expression based on the |
| * operation type |
| * |
| *****************************************************************/ |
| |
| #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP |
| #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP |
| |
| namespace Eigen { |
| namespace internal { |
| |
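/// syclGenericBufferReducer implements a two-level tree reduction: each pass
/// reduces one work-group-sized chunk in local memory and writes one partial
/// result per work-group back to the front of the input buffer, and the host
/// loop re-launches with length /= local until a single element remains.
/// Worked example (illustrative numbers): length = 4096, local = 256:
///   pass 1: 16 work-groups leave 16 partial sums at the front of bufI;
///   pass 2: length (16) <= local, so work-group 0 reduces those 16 values and
///           work-item 0 writes the final result to bufOut.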
| template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{ |
| template<typename BufferTOut, typename BufferTIn> |
| static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){ |
| do { |
| auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable { |
| cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, |
| cl::sycl::range<1>{std::min(length, local)}}; |
| /* Two accessors are used: one to the buffer that is being reduced, |
| * and a second to local memory, used to store intermediate data. */ |
| auto aI = |
| bufI.template get_access<cl::sycl::access::mode::read_write>(h); |
| auto aOut = |
| bufOut->template get_access<cl::sycl::access::mode::discard_write>(h); |
| cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, |
| cl::sycl::access::target::local> |
| scratch(cl::sycl::range<1>(local), h); |
| |
| /* The parallel_for invocation chosen is the variant with an nd_item |
| * parameter, since the code requires barriers for correctness. */ |
| h.parallel_for<KernelName>( |
| r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) { |
| size_t globalid = id.get_global(0); |
| size_t localid = id.get_local(0); |
              /* All work-items collectively read from global memory into local
               * memory. The barrier ensures that every work-item's load has
               * completed before execution continues. Strictly speaking this
               * only synchronises the work-items within a single work-group;
               * there is no coordination between work-groups. */
| if (globalid < length) { |
| scratch[localid] = aI[globalid]; |
| } |
| id.barrier(cl::sycl::access::fence_space::local_space); |
| |
              /* Apply the reduction operation between the element at the
               * current local id and the element in the other half of the
               * scratch vector. */
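              /* Illustrative schedule: with min = 8 the offsets are 4, 2, 1,
               * so after three halving steps scratch[0] holds the reduction
               * of all eight elements. */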
| if (globalid < length) { |
                size_t min = (length < local) ? length : local;
| for (size_t offset = min / 2; offset > 0; offset /= 2) { |
| if (localid < offset) { |
| scratch[localid] += scratch[localid + offset]; |
| } |
| id.barrier(cl::sycl::access::fence_space::local_space); |
| } |
| /* The final result will be stored in local id 0. */ |
| if (localid == 0) { |
| aI[id.get_group(0)] = scratch[localid]; |
                  if ((length <= local) && globalid == 0) {
                    aOut[globalid] = scratch[localid];
                  }
| } |
| } |
| }); |
| }; |
| dev.m_queue.submit(f); |
| dev.m_queue.throw_asynchronous(); |
| |
| /* At this point, you could queue::wait_and_throw() to ensure that |
| * errors are caught quickly. However, this would likely impact |
| * performance negatively. */ |
| length = length / local; |
| |
    } while (length > 1);
  }
| |
| }; |
| |
/// For now let's start with a full reducer.
/// Self is not used here because during expression construction the reduction is
/// treated as a leaf node. We take the child of the reduction, build the
/// expression for it, and apply the full reducer function to that child: the
/// full reducer applies the reduction operation to the child of the reduction.
/// Once that is done, the reduction node itself is an empty shell and can be
/// thrown away and treated as a leaf node.
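/// A minimal usage sketch of how this path is reached (illustrative only: the
/// SYCL tests drive it through TensorMap over device-allocated pointers, and
/// the queue-based SyclDevice construction shown here may differ in your tree):
///
///   cl::sycl::queue q{cl::sycl::default_selector()};
///   Eigen::SyclDevice sycl_device(q);
///   // With gpu_in/gpu_out mapping device memory, a full reduction such as
///   //   gpu_out.device(sycl_device) = gpu_in.sum();
///   // dispatches to FullReducer::run below.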
| template <typename Self, typename Op, bool Vectorizable> |
| struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> { |
| |
| typedef typename Self::CoeffReturnType CoeffReturnType; |
| static const bool HasOptimizedImplementation = false; |
| |
  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor. If the input is smaller than red_factor we only create one thread.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // the initial number of threads: each one reduces red_factor consecutive inputs
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
    size_t GRange = std::max(static_cast<size_t>(1), rng);
| |
    // round the global range up to the next power of two for the tree reduction
| GRange--; |
| GRange |= GRange >> 1; |
| GRange |= GRange >> 2; |
| GRange |= GRange >> 4; |
| GRange |= GRange >> 8; |
| GRange |= GRange >> 16; |
#if defined(__x86_64__) || defined(__ppc64__) || defined(_WIN64)
| GRange |= GRange >> 32; |
| #endif |
| GRange++; |
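    // Worked example (illustrative): GRange = 5 becomes 4 after the decrement,
    // the or-shift cascade fills the low bits to give 7, and the final
    // increment yields 8, the next power of two.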
| size_t outTileSize = tileSize; |
    /// If the global range is smaller than the tile size, clamp the tile size to the
    /// global range; in that case a single work-group suffices and the recursive
    /// reduction finishes in one step.
| if (GRange < outTileSize) outTileSize=GRange; |
    // Get the final output buffer; the user-provided output pointer is wrapped
    // in a SYCL buffer directly, since no assignment expression is needed.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// Create a temporary global buffer to collect the per-thread partial
    /// reduction results, since there is no global barrier on the GPU. Once the
    /// partial values are saved we can recursively apply the reduction to this
    /// buffer in order to reduce the whole input.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
| typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; |
    Dims dims = self.xprDims();
| Op functor = reducer; |
| dev.m_queue.submit([&](cl::sycl::handler &cgh) { |
| // create a tuple of accessors from Evaluator |
| auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); |
| auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh); |
| |
| cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) { |
| typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; |
| auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); |
        /// The reduction cannot be captured automatically through our device conversion
        /// recursion, because a reduction has two behaviours: the first is when it is used
        /// as a root to launch the sub-kernel, the second is when it is treated as a leaf
        /// node that passes its calculated result to its parent kernel. The latter is
        /// detected automatically by our device expression generator; the former is
        /// created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self object
        /// passed to the run function, except that device_self_evaluator can be constructed
        /// and used on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// The const_cast below is a naive workaround for the qualifier-drop error.
        auto globalid = itemID.get_global_linear_id();
| |
        if (globalid < rng)
          tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);
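        // Note: padding the unused slots with 0 (together with the operator+=
        // hard-coded in syclGenericBufferReducer) assumes a reduction whose
        // identity is 0, such as a sum.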
| |
        if (remaining != 0 && globalid == 0)
          // this adds the tail of the input buffer when the input size is not divisible by red_factor.
          tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * rng, remaining, const_cast<Op&>(functor));
| }); |
| }); |
| dev.m_queue.throw_asynchronous(); |
| |
    /// Recursively reduce the temporary buffer down to a single element, which is
    /// written to out_buffer.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
| } |
| |
| }; |
| |
| template <typename Self, typename Op> |
| struct InnerReducer<Self, Op, const Eigen::SyclDevice> { |
| |
| typedef typename Self::CoeffReturnType CoeffReturnType; |
| static const bool HasOptimizedImplementation = false; |
| |
  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index /*num_coeffs_to_reduce*/, typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
| typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr; |
| auto functors = TensorSycl::internal::extractFunctors(self.impl()); |
| |
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
| |
    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      // round GRange up to the next multiple of tileSize
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
| typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims; |
    Dims dims = self.xprDims();
| Op functor = reducer; |
| |
| dev.m_queue.submit([&](cl::sycl::handler &cgh) { |
| // create a tuple of accessors from Evaluator |
| auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl()); |
| auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output); |
| |
| cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) { |
| typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr; |
| auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors); |
        /// The reduction cannot be captured automatically through our device conversion
        /// recursion, because a reduction has two behaviours: the first is when it is used
        /// as a root to launch the sub-kernel, the second is when it is treated as a leaf
        /// node that passes its calculated result to its parent kernel. The latter is
        /// detected automatically by our device expression generator; the former is
        /// created here.
        const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self object
        /// passed to the run function, except that device_self_evaluator can be constructed
        /// and used on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// The const_cast below is a naive workaround for the qualifier-drop error.
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
| }); |
| }); |
| dev.m_queue.throw_asynchronous(); |
| return false; |
| } |
| }; |
| |
| } // end namespace internal |
} // end namespace Eigen
| |
| #endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP |