Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h - nest-cam/v350/eigen - Git at Google

 #include "../../InternalHeaderCheck.h"

 namespace Eigen {
 namespace internal {

 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
 // Here we specialize gebp_traits to eliminate these register spills.
 // See #2138.
 template<>
 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
 {
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     // This volatile inline ASM both acts as a barrier to prevent reordering,
     // as well as enforces strict register use.
     asm volatile(
       "vmla.f32 %q[r], %q[c], %q[alpha]"
       : [r] "+w" (r)
       : [c] "w" (c),
         [alpha] "w" (alpha)
       : );
   }

   template <typename LaneIdType>
   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
                                 Packet4f& c, Packet4f& tmp,
                                 const LaneIdType&) const {
     acc(a, b, c);
   }

   template <typename LaneIdType>
   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
                                 Packet4f& c, Packet4f& tmp,
                                 const LaneIdType& lane) const {
     madd(a, b.get(lane), c, tmp, lane);
   }
 };

 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

 #if EIGEN_ARCH_ARM64

 #ifndef EIGEN_NEON_GEBP_NR
 #define EIGEN_NEON_GEBP_NR 8
 #endif

 template<>
 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
 {
   typedef float RhsPacket;
   typedef float32x4_t RhsPacketx4;
   enum { nr = EIGEN_NEON_GEBP_NR };
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const {
     dest = *b;
   }

   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
     dest = vld1q_f32(b);
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = *b;
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
   {}

   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     loadRhs(b,dest);
   }

   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   {
     c = vfmaq_n_f32(c, a, b);
   }
   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   { madd_helper<0>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
   { madd_helper<1>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
   { madd_helper<2>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
   { madd_helper<3>(a, b, c); }

  private:
   template<int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
     #if EIGEN_COMP_GNUC_STRICT
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
          if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     #else
     c = vfmaq_laneq_f32(c, a, b, LaneID);
     #endif
   }
 };


 template<>
 struct gebp_traits <double,double,false,false,Architecture::NEON>
  : gebp_traits<double,double,false,false,Architecture::Generic>
 {
   typedef double RhsPacket;
   enum { nr = EIGEN_NEON_GEBP_NR };
   struct RhsPacketx4 {
     float64x2_t B_0, B_1;
   };

   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = *b;
   }

   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
     dest.B_0 = vld1q_f64(b);
     dest.B_1 = vld1q_f64(b+2);
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
   {
     loadRhs(b,dest);
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
   {}

   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     loadRhs(b,dest);
   }

   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   {
     c = vfmaq_n_f64(c, a, b);
   }

   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   { madd_helper<0>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
   { madd_helper<1>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
   { madd_helper<2>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
   { madd_helper<3>(a, b, c); }

  private:
   template <int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
     #if EIGEN_COMP_GNUC_STRICT
     // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
     //    vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
     // 2. workaround the gcc register split problem on arm64-neon
          if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
     else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
     else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
     else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
     #else
          if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
     else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
     else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
     else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
     #endif
   }
 };

 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC

 template<>
 struct gebp_traits <half,half,false,false,Architecture::NEON>
  : gebp_traits<half,half,false,false,Architecture::Generic>
 {
   typedef half RhsPacket;
   typedef float16x4_t RhsPacketx4;
   typedef float16x4_t PacketHalf;
   enum { nr = EIGEN_NEON_GEBP_NR };

   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = *b;
   }

   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
   {
     dest = vld1_f16((const __fp16 *)b);
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = *b;
   }

   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
   {}

   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     loadRhs(b,dest);
   }

   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   {
     c = vfmaq_n_f16(c, a, b);
   }
   EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   {
     c = vfma_n_f16(c, a, b);
   }

   // NOTE: Template parameter inference failed when compiled with Android NDK:
   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
   { madd_helper<0>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
   { madd_helper<1>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
   { madd_helper<2>(a, b, c); }
   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
   { madd_helper<3>(a, b, c); }
  private:
   template<int LaneID>
   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
   {
     #if EIGEN_COMP_GNUC_STRICT
     // 1. vfmaq_lane_f16 is implemented through a costly dup
     // 2. workaround the gcc register split problem on arm64-neon
          if(LaneID==0)  asm("fmla %0.8h, %1.8h, %2.h[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==1)  asm("fmla %0.8h, %1.8h, %2.h[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==2)  asm("fmla %0.8h, %1.8h, %2.h[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     else if(LaneID==3)  asm("fmla %0.8h, %1.8h, %2.h[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
     #else
     c = vfmaq_lane_f16(c, a, b, LaneID);
     #endif
   }
 };
 #endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 #endif // EIGEN_ARCH_ARM64

 }  // namespace internal
 }  // namespace Eigen
	#include "../../InternalHeaderCheck.h"

	namespace Eigen {
	namespace internal {

	#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

	// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
	// Here we specialize gebp_traits to eliminate these register spills.
	// See #2138.
	template<>
	struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
	: gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
	{
	EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
	{
	// This volatile inline ASM both acts as a barrier to prevent reordering,
	// as well as enforces strict register use.
	asm volatile(
	"vmla.f32 %q[r], %q[c], %q[alpha]"
	: [r] "+w" (r)
	: [c] "w" (c),
	[alpha] "w" (alpha)
	: );
	}

	template <typename LaneIdType>
	EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
	Packet4f& c, Packet4f& tmp,
	const LaneIdType&) const {
	acc(a, b, c);
	}

	template <typename LaneIdType>
	EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
	Packet4f& c, Packet4f& tmp,
	const LaneIdType& lane) const {
	madd(a, b.get(lane), c, tmp, lane);
	}
	};

	#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG

	#if EIGEN_ARCH_ARM64

	#ifndef EIGEN_NEON_GEBP_NR
	#define EIGEN_NEON_GEBP_NR 8
	#endif

	template<>
	struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
	: gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
	{
	typedef float RhsPacket;
	typedef float32x4_t RhsPacketx4;
	enum { nr = EIGEN_NEON_GEBP_NR };
	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const {
	dest = *b;
	}

	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
	{
	dest = vld1q_f32(b);
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
	{
	dest = *b;
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
	{}

	EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
	{
	loadRhs(b,dest);
	}

	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{
	c = vfmaq_n_f32(c, a, b);
	}
	// NOTE: Template parameter inference failed when compiled with Android NDK:
	// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{ madd_helper<0>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<1>&) const
	{ madd_helper<1>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<2>&) const
	{ madd_helper<2>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<3>&) const
	{ madd_helper<3>(a, b, c); }

	private:
	template<int LaneID>
	EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
	{
	#if EIGEN_COMP_GNUC_STRICT
	// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
	// vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
	// 2. workaround the gcc register split problem on arm64-neon
	if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
	#else
	c = vfmaq_laneq_f32(c, a, b, LaneID);
	#endif
	}
	};


	template<>
	struct gebp_traits <double,double,false,false,Architecture::NEON>
	: gebp_traits<double,double,false,false,Architecture::Generic>
	{
	typedef double RhsPacket;
	enum { nr = EIGEN_NEON_GEBP_NR };
	struct RhsPacketx4 {
	float64x2_t B_0, B_1;
	};

	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
	{
	dest = *b;
	}

	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
	{
	dest.B_0 = vld1q_f64(b);
	dest.B_1 = vld1q_f64(b+2);
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
	{
	loadRhs(b,dest);
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
	{}

	EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
	{
	loadRhs(b,dest);
	}

	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{
	c = vfmaq_n_f64(c, a, b);
	}

	// NOTE: Template parameter inference failed when compiled with Android NDK:
	// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".

	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{ madd_helper<0>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<1>&) const
	{ madd_helper<1>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<2>&) const
	{ madd_helper<2>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<3>&) const
	{ madd_helper<3>(a, b, c); }

	private:
	template <int LaneID>
	EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
	{
	#if EIGEN_COMP_GNUC_STRICT
	// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
	// vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
	// 2. workaround the gcc register split problem on arm64-neon
	if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
	else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
	else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
	else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
	#else
	if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
	else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
	else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
	else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
	#endif
	}
	};

	#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC

	template<>
	struct gebp_traits <half,half,false,false,Architecture::NEON>
	: gebp_traits<half,half,false,false,Architecture::Generic>
	{
	typedef half RhsPacket;
	typedef float16x4_t RhsPacketx4;
	typedef float16x4_t PacketHalf;
	enum { nr = EIGEN_NEON_GEBP_NR };

	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
	{
	dest = *b;
	}

	EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
	{
	dest = vld1_f16((const __fp16 *)b);
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
	{
	dest = *b;
	}

	EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
	{}

	EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
	{
	loadRhs(b,dest);
	}

	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{
	c = vfmaq_n_f16(c, a, b);
	}
	EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{
	c = vfma_n_f16(c, a, b);
	}

	// NOTE: Template parameter inference failed when compiled with Android NDK:
	// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<0>&) const
	{ madd_helper<0>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<1>&) const
	{ madd_helper<1>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<2>&) const
	{ madd_helper<2>(a, b, c); }
	EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /tmp/, const FixedInt<3>&) const
	{ madd_helper<3>(a, b, c); }
	private:
	template<int LaneID>
	EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
	{
	#if EIGEN_COMP_GNUC_STRICT
	// 1. vfmaq_lane_f16 is implemented through a costly dup
	// 2. workaround the gcc register split problem on arm64-neon
	if(LaneID==0) asm("fmla %0.8h, %1.8h, %2.h[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==1) asm("fmla %0.8h, %1.8h, %2.h[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==2) asm("fmla %0.8h, %1.8h, %2.h[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
	else if(LaneID==3) asm("fmla %0.8h, %1.8h, %2.h[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
	#else
	c = vfmaq_lane_f16(c, a, b, LaneID);
	#endif
	}
	};
	#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
	#endif // EIGEN_ARCH_ARM64

	} // namespace internal
	} // namespace Eigen