#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H
enum GEBPPacketSizeType {
  GEBPPacketFull = 0,
  GEBPPacketHalf,
  GEBPPacketQuarter
};

template<typename _LhsScalar, typename _RhsScalar,
         bool _ConjLhs=false, bool _ConjRhs=false,
         int Arch=Architecture::Target,
         int _PacketSize=GEBPPacketFull>
class gebp_traits;
// Prefer the runtime-queried cache size when it is valid (>0), otherwise fall
// back to the compile-time default.
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
{
  return a<=0 ? b : a;
}
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
#elif EIGEN_ARCH_PPC
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
#else
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
#endif

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
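// Usage sketch (hypothetical build line, standard compiler -D flags assumed):
// the defaults above can be overridden at compile time, e.g.
//
//   g++ -DEIGEN_DEFAULT_L1_CACHE_SIZE=49152 -DEIGEN_DEFAULT_L2_CACHE_SIZE=1048576 ...
//
// in which case EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024) expands to 49152
// instead of the hard-coded 32*1024.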
struct CacheSizes {
  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
  }

  std::ptrdiff_t m_l1;
  std::ptrdiff_t m_l2;
  std::ptrdiff_t m_l3;
};
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
{
  static CacheSizes m_cacheSizes;

  if(action==SetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  }
  else if(action==GetAction)
  {
    eigen_internal_assert(l1!=0 && l2!=0);
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  }
  else
  {
    eigen_internal_assert(false);
  }
}
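// Usage sketch: fetch the cache sizes that the blocking heuristics below will
// use. The GetAction branch copies the cached values into the output pointers:
//
//   std::ptrdiff_t l1, l2, l3;
//   manage_caching_sizes(GetAction, &l1, &l2, &l3);
//
// SetAction goes the other way and overwrites the cached values; both actions
// assert that l1 and l2 are non-null.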
/* Computes the blocking parameters for a m x k times k x n matrix product. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  typedef gebp_traits<LhsScalar,RhsScalar> Traits;

  // Query the cache sizes as seen by this thread/process.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
#ifdef EIGEN_VECTORIZE_AVX512
  // With the large AVX512 packets, keeping one lhs panel within L1 would force
  // kc to become very small, hurting the reuse of the accumulation registers,
  // so the L1 budget is inflated here.
  l1 *= 4;
#endif
  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However once the latency is hidden there is no point in
    // increasing the value of k, so we cap it at 320 (value determined
    // experimentally).
    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }
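    // Worked example (values assumed for illustration: AVX floats, KcFactor==1,
    // l1==32*1024, Traits::mr==24, Traits::nr==4, all scalars 4 bytes):
    //   kdiv = 1*(24*4 + 4*4) = 112,   ksub = 24*4*4 = 384,
    //   k_cache = max(8, min((32768-384)/112, 320)) = max(8, min(289, 320)) = 289,
    // and rounding down to a multiple of kr==8 gives k = 288.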
    // The rhs block must fit in the part of L2 above L1.
    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  }
  else {
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check the blocking strategy is not flawed.
    l1 = 9*1024;
    l2 = 32*1024;
    l3 = 512*1024;
#endif

    // Early return for small problems because the computations below are time
    // consuming for small problems. Very tiny problems should bypass this
    // function anyway, because the coarse-grained method is used otherwise.
    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
      return;
    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
    };

    // ---- 1st level of blocking on L1, yields kc ----
    // k is chosen so that an mr x kc horizontal panel of the lhs plus a
    // kc x nr vertical panel of the rhs both fit within L1.
    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
    const Index old_k = k;
    if(k>max_kc)
    {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as
      //    possible while keeping the same number of sweeps over the result.
      k = (k%max_kc)==0 ? max_kc
                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));

      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
    }
    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864; // == 1.5 MB
#endif

    // Here, nc is chosen so that a block of kc x nc of the rhs fits within half
    // of L2; the second half is implicitly reserved for the result and the lhs.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1- k_sub - lhs_bytes;
    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
    {
      // L1 blocking
      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
    }
    else
    {
      // L2 blocking
      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
    }
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
    if(n>nc)
    {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as
      //    possible while keeping the same number of sweeps over the packed lhs.
      n = (n%nc)==0 ? nc
                    : (nc - Traits::nr * ((nc-(n%nc))/(Traits::nr*(n/nc+1))));
    }
    // ---- 3rd level of blocking, yields mc ----
    // For small problems, shrink the working set so that the lhs block stays
    // resident in a faster cache level.
    Index problem_size = k*n*sizeof(LhsScalar);
    Index actual_lm = actual_l2;
    Index max_mc = m;
    if(problem_size<=1024)
    {
      // problem is small enough to keep in L1
      actual_lm = l1;
    }
    else if(l3!=0 && problem_size<=32768)
    {
      // we have both L2 and L3, and the problem is small enough to be kept in L2
      actual_lm = l2;
      max_mc = (numext::mini<Index>)(576,max_mc);
    }
    Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
    if (mc > Traits::mr) mc -= mc % Traits::mr;
    else if (mc==0) return;
    m = (m%mc)==0 ? mc
                  : (mc - Traits::mr * ((mc-(m%mc))/(Traits::mr*(m/mc+1))));
  }
}
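// Worked example for the mc computation above (values assumed): for a double
// product with k==256 and n==1024, problem_size = 256*1024*8 bytes is far above
// 32768, so actual_lm stays at actual_l2 (1.5 MB) and
//   mc = min(1572864/(3*256*8), max_mc) = min(256, max_mc).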
template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
{
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}
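// Sketch of the testing hook above (macro names as used in this file): a test
// can force given blocking sizes by compiling with, e.g.,
//
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZES=1
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K=64
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M=64
//   -DEIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N=64
//
// which caps k, m, n at 64 and bypasses the heuristic entirely.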
/* Computes the blocking sizes for a m x k times k x n product, honoring an
 * explicit test override if one is set, and the cache heuristic otherwise. */
template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template<typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
{
  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
}
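// Usage sketch (hypothetical caller): given the problem dimensions, shrink them
// in place to cache-friendly block sizes (KcFactor defaults to 1 via the
// overload above):
//
//   Index kc = depth, mc = rows, nc = cols;
//   computeProductBlockingSizes<float,float>(kc, mc, nc, /*num_threads=*/1);
//   // kc, mc, nc now hold the panel sizes used by the packing and gebp kernels.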
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
  #define CJMADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
#else

  // FIXME (a bit overkill maybe?)
  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
    {
      c = cj.pmadd(a,b,c);
    }
  };

  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
    {
      t = b; t = cj.pmul(a,t); c = padd(c,t);
    }
  };

  template<typename CJ, typename A, typename B, typename C, typename T>
  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
  {
    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
  }

  #define CJMADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
#endif
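// Expansion example: with the fallback definition just above,
//   CJMADD(cj, A0, B_0, C0, T0);
// becomes gebp_madd(cj, A0, B_0, C0, T0), which dispatches through the
// selector: distinct types use c = cj.pmadd(a,b,c), while four identical
// packet types use the explicit tmp sequence to help register allocation.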
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper {
 private:
  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
 public:
  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
};

template <typename Packet>
struct QuadPacket
{
  Packet B_0, B1, B2, B3;
  const Packet& get(const FixedInt<0>&) const { return B_0; }
  const Packet& get(const FixedInt<1>&) const { return B1; }
  const Packet& get(const FixedInt<2>&) const { return B2; }
  const Packet& get(const FixedInt<3>&) const { return B3; }
};

template <int N, typename T1, typename T2, typename T3>
struct packet_conditional { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                 \
  typedef typename packet_conditional<packet_size,                         \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

#define PACKET_DECL_COND(name, packet_size)                                \
  typedef typename packet_conditional<packet_size,                         \
                                      typename packet_traits<name ## Scalar>::type, \
                                      typename packet_traits<name ## Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  name ## Packet

#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)                \
  typedef typename packet_conditional<packet_size,                         \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  prefix ## ScalarPacket

#define PACKET_DECL_COND_SCALAR(packet_size)                               \
  typedef typename packet_conditional<packet_size,                         \
                                      typename packet_traits<Scalar>::type, \
                                      typename packet_traits<Scalar>::half, \
                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
  ScalarPacket
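// Expansion example: PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize) declares
// _LhsPacket as packet_traits<LhsScalar>::type when _PacketSize is
// GEBPPacketFull, as its half for GEBPPacketHalf, and as the half of the half
// (a quarter packet) otherwise.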
template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits
{
public:
  typedef _LhsScalar LhsScalar;
  typedef _RhsScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction (currently, this one cannot be modified)
    default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
    // we assume 16 registers or more; MSVC prior to v19.14 yields register spilling
    mr = Vectorizable ? 3*LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
  typedef LhsPacket LhsPacket4Packing;

  typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = pload<LhsPacketType>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }
  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling registers.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a,b,c);
#else
    tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
#endif
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
  {
    r = pmadd(c,alpha,r);
  }

  template<typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
  {
    r = pmadd(c,alpha,r);
  }
};
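// Usage sketch (hypothetical standalone loop; names like blA/blB/alphav are
// illustrative only): a kernel drives these traits roughly as
//
//   gebp_traits<float,float> traits;
//   AccPacket C0;            traits.initAcc(C0);
//   LhsPacket A0; RhsPacket B_0, T0;
//   traits.loadLhs(blA, A0); traits.loadRhs(blB, B_0);
//   traits.madd(A0, B_0, C0, T0, fix<0>);   // C0 += A0 * B_0
//   traits.acc(C0, alphav, R0);             // R0 += alpha * C0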
template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
{
public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = false,
    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = 3*LhsPacketSize,
#else
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
  typedef LhsPacket LhsPacket4Packing;

  typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
  }
  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
  {
    // FIXME we can do better!
    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
    dest = ploadquad<RhsPacket>(tmp);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
  {
    eigen_internal_assert(RhsPacketSize<=8);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>(a);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>(a);
  }
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v,b,c.v);
#else
    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
    r = cj.pmadd(c,alpha,r);
  }
};
template<typename Packet>
struct DoublePacket
{
  Packet first;
  Packet second;
};

template<typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
{
  DoublePacket<Packet> res;
  res.first  = padd(a.first, b.first);
  res.second = padd(a.second,b.second);
  return res;
}

// Note that for DoublePacket<RealPacket> the "4" in "downto4" corresponds to
// the number of complexes, so it means "8" in terms of real coefficients.
template<typename Packet>
const DoublePacket<Packet>&
predux_half_dowto4(const DoublePacket<Packet> &a,
                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
{
  return a;
}

template<typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half>
predux_half_dowto4(const DoublePacket<Packet> &a,
                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
{
  // yes, that's pretty hackish :(
  DoublePacket<typename unpacket_traits<Packet>::half> res;
  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
  typedef typename packet_traits<Cplx>::type CplxPacket;
  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
  return res;
}
// Same here: "quad" actually means "8" in terms of real coefficients.
template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
{
  dest.first  = pset1<RealPacket>(numext::real(*b));
  dest.second = pset1<RealPacket>(numext::imag(*b));
}

template<typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
{
  // yes, that's pretty hackish too :(
  typedef typename NumTraits<Scalar>::Real RealScalar;
  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
  dest.first  = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
}
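// Layout sketch: for a complex rhs coefficient b = x + i*y and a 4-wide real
// packet, the loads above yield
//   dest.first  = { x, x, x, x }
//   dest.second = { y, y, y, y }
// (or {re(b0),re(b0),re(b1),re(b1)} and the matching imaginary lanes in the
// 16-wide variant), so the real and imaginary parts can be multiplied against
// the packed lhs independently and recombined later in gebp_traits::acc.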
template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
  PACKET_DECL_COND(Real, _PacketSize);
  PACKET_DECL_COND_SCALAR(_PacketSize);

  enum {
    ConjLhs = _ConjLhs,
    ConjRhs = _ConjRhs,
    Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
    ResPacketSize  = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
    LhsPacketSize  = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize  = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,

    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef DoublePacket<RealPacket> DoublePacketType;

  typedef typename conditional<Vectorizable,RealPacket,      Scalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,    Scalar>::type ResPacket;
  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
  typedef typename conditional<Vectorizable,ScalarPacket,    Scalar>::type LhsPacket4Packing;

  typedef QuadPacket<RhsPacket> RhsPacketx4;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
  {
    p.first  = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }
  // Scalar path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
  {
    dest = pset1<ScalarPacket>(*b);
  }

  // Vectorized path
  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    dest.first  = pset1<RealPacketType>(numext::real(*b));
    dest.second = pset1<RealPacketType>(numext::imag(*b));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    loadRhs(b, dest.B_0);
    loadRhs(b + 1, dest.B1);
    loadRhs(b + 2, dest.B2);
    loadRhs(b + 3, dest.B3);
  }
  // Scalar path
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
  {
    loadRhs(b, dest);
  }

  // Vectorized path
  template<typename RealPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
  {
    loadRhs(b,dest);
  }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
  {
    loadQuadToDoublePacket(b,dest);
  }
  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
  }
  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
  EIGEN_STRONG_INLINE
  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
  {
    c.first  = padd(pmul(a,b.first), c.first);
    c.second = padd(pmul(a,b.second),c.second);
  }

  template<typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
  {
    c = cj.pmadd(a,b,c);
  }

  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }
  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  template<typename RealPacketType, typename ResPacketType>
  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    // assemble c
    ResPacketType tmp;
    if((!ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(pconj(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((!ConjLhs)&&(ConjRhs))
    {
      tmp = pconj(pcplxflip(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first),tmp);
    }
    else if((ConjLhs)&&(!ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = padd(pconj(ResPacketType(c.first)),tmp);
    }
    else if((ConjLhs)&&(ConjRhs))
    {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = psub(pconj(ResPacketType(c.first)),tmp);
    }

    r = pmadd(tmp,alpha,r);
  }

protected:
  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
};
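// Math sketch of the recombination above: with a complex lhs packet a (lanes
// interleaved as ar,ai,...) and rhs b = br + i*bi broadcast as
// (first = br..., second = bi...), the kernel accumulates c.first = a*br and
// c.second = a*bi lane-wise. For the no-conjugation case the true product is
//   (ar + i*ai)(br + i*bi) = (ar*br - ai*bi) + i*(ai*br + ar*bi),
// which is exactly c.first + pcplxflip(pconj(c.second)): pconj negates the
// imaginary lanes of c.second, and pcplxflip swaps each real/imag lane pair.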
template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
{
public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar     RhsScalar;
  typedef Scalar     ResScalar;

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);

#undef PACKET_DECL_COND_SCALAR_PREFIX
#undef PACKET_DECL_COND_PREFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND

public:
  enum {
    ConjLhs = false,
    ConjRhs = _ConjRhs,
    Vectorizable = unpacket_traits<_RealPacket>::vectorizable && unpacket_traits<_ScalarPacket>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
  typedef LhsPacket LhsPacket4Packing;
  typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
  {
    p = pset1<ResPacket>(ResScalar(0));
  }

  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }
  template<typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
  {
    dest = ploaddup<LhsPacket>(a);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = ploadquad<RhsPacket>(b);
  }

  template<typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
  {
    dest = ploaddup<LhsPacketType>(a);
  }
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
  {
    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
  {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a,b.v,c.v);
#else
    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
  {
    c += a * b;
  }
  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
  {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
  {
    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
    r = cj.pmadd(alpha,c,r);
  }
};
#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON

template<>
struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
 : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
{
  typedef float RhsPacket;

  typedef float32x4_t RhsPacketx4;

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    dest = vld1q_f32(b);
  }
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  {
    c = vfmaq_n_f32(c, a, b);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  { madd_helper<0>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
  { madd_helper<1>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
  { madd_helper<2>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
  { madd_helper<3>(a, b, c); }

 private:
  template<int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
  {
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
    // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
    //    (vfmaq_laneq_f32 is implemented through a costly dup in gcc < 9)
    // 2. workaround the gcc register allocation issue on arm64-neon
         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
#else
    c = vfmaq_laneq_f32(c, a, b, LaneID);
#endif
  }
};
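// Instruction sketch: "fmla v0.4s, v1.4s, v2.s[L]" multiplies every float lane
// of v1 by lane L of v2 and accumulates into v0, i.e. c[i] += a[i] * b[L] for
// i = 0..3, which is exactly what vfmaq_laneq_f32(c, a, b, L) expresses.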
template<>
struct gebp_traits <double, double, false, false,Architecture::NEON>
 : gebp_traits<double,double,false,false,Architecture::Generic>
{
  typedef double RhsPacket;

  struct RhsPacketx4 {
    float64x2_t B_0, B_1;
  };

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    dest = *b;
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
  {
    dest.B_0 = vld1q_f64(b);
    dest.B_1 = vld1q_f64(b+2);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
  {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
  {
    loadRhs(b, dest);
  }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  {
    c = vfmaq_n_f64(c, a, b);
  }

  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
  { madd_helper<0>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
  { madd_helper<1>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
  { madd_helper<2>(a, b, c); }
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
  { madd_helper<3>(a, b, c); }

 private:
  template <int LaneID>
  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
  {
#if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
    // (vfmaq_laneq_f64 is implemented through a costly dup in gcc < 9)
         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
#else
         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
#endif
  }
};

#endif // EIGEN_ARCH_ARM64 && EIGEN_VECTORIZE_NEON
/* optimized general packed block * packed panel product kernel */
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;
  typedef typename Traits::RhsPacketx4 RhsPacketx4;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;
  typedef typename HalfTraits::AccPacket AccPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
  typedef typename QuarterTraits::AccPacket AccPacketQuarter;

  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;

  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable = Traits::Vectorizable,
    LhsProgress = Traits::LhsProgress,
    LhsProgressHalf = HalfTraits::LhsProgress,
    LhsProgressQuarter = QuarterTraits::LhsProgress,
    RhsProgress = Traits::RhsProgress,
    RhsProgressHalf = HalfTraits::RhsProgress,
    RhsProgressQuarter = QuarterTraits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE
  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                  Index rows, Index depth, Index cols, ResScalar alpha,
                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
         bool ConjugateLhs, bool ConjugateRhs,
         int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
struct last_row_process_16_packets
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
                                      ResScalar alpha, SAccPacket &C0)
  {
    EIGEN_UNUSED_VARIABLE(res);
    EIGEN_UNUSED_VARIABLE(straits);
    EIGEN_UNUSED_VARIABLE(blA);
    EIGEN_UNUSED_VARIABLE(blB);
    EIGEN_UNUSED_VARIABLE(depth);
    EIGEN_UNUSED_VARIABLE(endk);
    EIGEN_UNUSED_VARIABLE(i);
    EIGEN_UNUSED_VARIABLE(j2);
    EIGEN_UNUSED_VARIABLE(alpha);
    EIGEN_UNUSED_VARIABLE(C0);
  }
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16>
{
  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;
  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
  typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
                                      ResScalar alpha, SAccPacket &C0)
  {
    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);

    if (depth - endk > 0)
    {
      // We have to handle the last row(s) of the rhs, which correspond
      // to a half-packet
      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));

      for (Index kk = endk; kk < depth; kk++)
      {
        SLhsPacketQuarter a0;
        SRhsPacketQuarter b0;
        straits.loadLhsUnaligned(blB, a0);
        straits.loadRhs(blA, b0);
        straits.madd(a0,b0,c0,b0, fix<0>);
        blB += SwappedTraits::LhsProgress/4;
        blA += 1;
      }
      straits.acc(c0, alphav, R);
    }
    else
    {
      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
    }
    res.scatterPacket(i, j2, R);
  }
};
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet
{
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;

  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
  {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
    __asm__  ("" : "+x,m" (*A0));
#endif
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
  }
  EIGEN_STRONG_INLINE void operator()(
    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
  {
    GEBPTraits traits;

    // loop on each largest micro horizontal panel of the lhs (LhsProgress x depth)
    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
    {
      // loop on each largest micro vertical panel of the rhs (depth * nr)
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        // We select an LhsProgress x nr micro block of res
        // which is entirely stored into 1 x nr registers.

        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        traits.initAcc(C2);
        traits.initAcc(C3);
        // To improve instruction pipelining, double the accumulation registers:
        // we accumulate alternately into C* and D* and fold them at the end.
        AccPacket D0, D1, D2, D3;
        traits.initAcc(D0);
        traits.initAcc(D1);
        traits.initAcc(D2);
        traits.initAcc(D3);
        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
        prefetch(&blB[0]);
        LhsPacket A0, A1;
        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
          RhsPacketx4 rhs_panel;
          RhsPacket T0;

          internal::prefetch(blB+(48+0));
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          internal::prefetch(blB+(48+16));
          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

          blB += pk*4*RhsProgress;
          blA += pk*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
        }
        C0 = padd(C0,D0);
        C1 = padd(C1,D1);
        C2 = padd(C2,D2);
        C3 = padd(C3,D3);

        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacketx4 rhs_panel;
          RhsPacket T0;
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          blB += 4*RhsProgress;
          blA += LhsProgress;
        }
        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        // One column at a time
        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0;
        traits.initAcc(C0);

        LinearMapper r0 = res.getLinearMapper(i, j2);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        LhsPacket A0;

        for(Index k=0; k<peeled_kc; k+=pk)
        {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
          RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K) \
          do { \
            EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
            EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
            traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
            traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
            traits.madd(A0, B_0, C0, B_0, fix<0>); \
            EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
          } while(false)
          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk*RhsProgress;
          blA += pk*LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
        }
        // process remaining peeled loop
        for(Index k=peeled_kc; k<depth; k++)
        {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP
        ResPacket R0;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
      }
    }
  }
};
template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
  : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
{
  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
  {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0);
    traits.madd(*A0, *B1, *C1, *B1);
    traits.madd(*A0, *B2, *C2, *B2);
    traits.madd(*A0, *B3, *C3, *B3);
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
  }
};
template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE
void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
               Index rows, Index depth, Index cols, ResScalar alpha,
               Index strideA, Index strideB, Index offsetA, Index offsetB)
{
  Traits traits;
  SwappedTraits straits;

  if(strideA==-1) strideA = depth;
  if(strideB==-1) strideB = depth;
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
  enum { pk = 8 }; // NOTE: such a large peeling factor is supported by AVX512
  const Index peeled_kc  = depth & ~(pk-1);
  const int prefetch_res_offset = 32/sizeof(ResScalar);
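  // Worked example (sizes assumed for illustration): with rows==100, depth==100,
  // LhsProgress==8 (AVX floats) and mr==24, the peeling bounds come out as
  //   peeled_mc3 = (100/24)*24 = 96,  peeled_mc2 = peeled_mc1 = 96,
  //   peeled_mc_half = 96 + (4/4)*4 = 100 (half packets of 4 floats),
  //   peeled_kc = 100 & ~7 = 96,
  // so rows [0,96) go through the 3p kernels, rows [96,100) through the
  // half-packet path, and the last 4 depth iterations run unpeeled.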
  //---------- Process 3 * LhsProgress rows at once ----------
  if(mr>=3*Traits::LhsProgress)
  {
    // The panel height is chosen so that one lhs panel, one rhs panel and the
    // res block all fit in L1; if depth is small the panels are extended.
    const Index l1 = defaultL1CacheSize; // in practice we should use queryL1CacheSize();
    const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
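    // Arithmetic sketch (values assumed): for floats with l1==32768, mr==24,
    // nr==4 and depth==200, the numerator is 32768 - 4*24*4 - 200*4*4 = 29184
    // and the denominator 200*4*24 = 19200, so the quotient is 1 and the panel
    // stays at 24 rows; with depth==64 the quotient is (32768-384-1024)/6144 = 5,
    // i.e. panels of 5*24 = 120 rows.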
    for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
    {
      const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // We select a 3*Traits::LhsProgress x nr micro block of res
          // which is entirely stored into 3 x nr registers.

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2,  C3,
                    C4, C5, C6,  C7,
                    C8, C9, C10, C11;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2);  traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6);  traits.initAcc(C7);
          traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(0);
          r1.prefetch(0);
          r2.prefetch(0);
          r3.prefetch(0);
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
            // 15 registers are taken (12 for acc, 2 for lhs).
            RhsPanel15 rhs_panel;
            RhsPacket T0;
            LhsPacket A2;
#if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
            // see comment in the float NEON madd_helper above: keep gcc < 9
            // from splitting the A* registers
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
#else
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
#endif
#define EIGEN_GEBP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
                internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
              } \
              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
              traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
              traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
              traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
              traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
              traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
              traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
              traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
              traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
              traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
            } while (false)

            internal::prefetch(blB);
            EIGEN_GEBP_ONESTEP(0);
            EIGEN_GEBP_ONESTEP(1);
            EIGEN_GEBP_ONESTEP(2);
            EIGEN_GEBP_ONESTEP(3);
            EIGEN_GEBP_ONESTEP(4);
            EIGEN_GEBP_ONESTEP(5);
            EIGEN_GEBP_ONESTEP(6);
            EIGEN_GEBP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
          }
          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPanel15 rhs_panel;
            RhsPacket T0;
            LhsPacket A2;
            EIGEN_GEBP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 3*Traits::LhsProgress;
          }

#undef EIGEN_GEBP_ONESTEP
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C1, alphav, R0);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C4, C8;
          traits.initAcc(C0);
          traits.initAcc(C4);
          traits.initAcc(C8);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(0);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1, A2;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
            RhsPacket B_0;
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B_0, fix<0>); \
              traits.madd(A1, B_0, C4, B_0, fix<0>); \
              traits.madd(A2, B_0, C8, B_0, fix<0>); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
            } while (false)
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*3*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
          }

          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 3*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP
          ResPacket R0, R1, R2;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
        }
      }
    }
  }
  //---------- Process 2 * LhsProgress rows at once ----------
  if(mr>=2*Traits::LhsProgress)
  {
    const Index l1 = defaultL1CacheSize; // in practice we should use queryL1CacheSize();
    Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));

    for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
    {
      Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
      for(Index j2=0; j2<packet_cols4; j2+=nr)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // We select a 2*Traits::LhsProgress x nr micro block of res
          // which is entirely stored into 2 x nr registers.

          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2, C3,
                    C4, C5, C6, C7;
          traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
          traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);
          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
          prefetch(&blB[0]);
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
            RhsPacketx4 rhs_panel;
            RhsPacket T0;

            // NOTE: the begin/end asm comments work around bug 935, but they
            // are not enough for gcc>=6 without FMA (bug 1637)
#if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__  ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
              traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
              traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
              traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
              traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
              traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
              traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
              traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
              traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
            } while (false)
            internal::prefetch(blB+(48+0));
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            internal::prefetch(blB+(48+16));
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*4*RhsProgress;
            blA += pk*(2*Traits::LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
          }

          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacketx4 rhs_panel;
            RhsPacket T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 4*RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP
          ResPacket R0, R1, R2, R3;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C1, alphav, R2);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C2, alphav, R0);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
        }
      }
      // Deal with remaining columns of the rhs
      for(Index j2=packet_cols4; j2<cols; j2++)
      {
        for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
        {
          // One column at a time
          const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C4;
          traits.initAcc(C0);
          traits.initAcc(C4);

          LinearMapper r0 = res.getLinearMapper(i, j2);
          r0.prefetch(prefetch_res_offset);

          // performs "inner" products
          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
          LhsPacket A0, A1;

          for(Index k=0; k<peeled_kc; k+=pk)
          {
            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
            RhsPacket B_0, B1;

#define EIGEN_GEBGP_ONESTEP(K) \
            do { \
              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
              traits.madd(A0, B_0, C0, B1, fix<0>); \
              traits.madd(A1, B_0, C4, B_0, fix<0>); \
              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
            } while(false)
            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk*RhsProgress;
            blA += pk*2*Traits::LhsProgress;

            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
          }

          // process remaining peeled loop
          for(Index k=peeled_kc; k<depth; k++)
          {
            RhsPacket B_0, B1;
            EIGEN_GEBGP_ONESTEP(0);
            blB += RhsProgress;
            blA += 2*Traits::LhsProgress;
          }
#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
        }
      }
    }
  }
  //---------- Process 1 * LhsProgress rows at once ----------
  if(mr>=1*Traits::LhsProgress)
  {
    lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressHalf rows at once ----------
  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process LhsProgressQuarter rows at once ----------
  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
  {
    lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
  }
  //---------- Process remaining rows, 1 at a time ----------
  if(peeled_mc_quarter<rows)
  {
    // loop on each panel of the rhs
    for(Index j2=0; j2<packet_cols4; j2+=nr)
    {
      // loop on each row of the lhs
      for(Index i=peeled_mc_quarter; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];

        // If LhsProgress is 8 or 16, this assumes that there is a half or
        // quarter packet, respectively, of the same size as nr (currently 4)
        // for the return type.
        const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
        const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
        if ((SwappedTraits::LhsProgress % 4) == 0 &&
            (SwappedTraits::LhsProgress<=16) &&
            (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
            (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
        {
          SAccPacket C0, C1, C2, C3;
          straits.initAcc(C0);
          straits.initAcc(C1);
          straits.initAcc(C2);
          straits.initAcc(C3);

          const Index spk   = (std::max)(1,SwappedTraits::LhsProgress/4);
          const Index endk  = (depth/spk)*spk;
          const Index endk4 = (depth/(spk*4))*(spk*4);

          Index k=0;
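          // Example (assuming SwappedTraits::LhsProgress==8 and depth==30):
          // spk==2, endk==30 and endk4==24, so iterations [0,24) run the 4-way
          // unrolled loop below, [24,30) the single-step loop, and nothing is
          // left for the trailing half-packet handling in this case.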
          for(; k<endk4; k+=4*spk)
          {
            SLhsPacket A0, A1;
            SRhsPacket B_0, B_1;

            straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);

            straits.loadRhsQuad(blA+0*spk, B_0);
            straits.loadRhsQuad(blA+1*spk, B_1);
            straits.madd(A0,B_0,C0,B_0, fix<0>);
            straits.madd(A1,B_1,C1,B_1, fix<0>);

            straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
            straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
            straits.loadRhsQuad(blA+2*spk, B_0);
            straits.loadRhsQuad(blA+3*spk, B_1);
            straits.madd(A0,B_0,C2,B_0, fix<0>);
            straits.madd(A1,B_1,C3,B_1, fix<0>);

            blB += 4*SwappedTraits::LhsProgress;
            blA += 4*spk;
          }
          C0 = padd(padd(C0,C1),padd(C2,C3));

          for(; k<endk; k+=spk)
          {
            SLhsPacket A0;
            SRhsPacket B_0;

            straits.loadLhsUnaligned(blB, A0);
            straits.loadRhsQuad(blA, B_0);
            straits.madd(A0,B_0,C0,B_0, fix<0>);

            blB += SwappedTraits::LhsProgress;
            blA += spk;
          }
          if(SwappedTraits::LhsProgress==8)
          {
            // Special case where we have to first reduce the accumulation register C0
            typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
            typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;

            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

            if(depth-endk>0)
            {
              // We have to handle the last row of the rhs which corresponds to a half-packet
              SLhsPacketHalf a0;
              SRhsPacketHalf b0;
              straits.loadLhsUnaligned(blB, a0);
              straits.loadRhs(blA, b0);
              SAccPacketHalf c0 = predux_half_dowto4(C0);
              straits.madd(a0,b0,c0,b0, fix<0>);
              straits.acc(c0, alphav, R);
            }
            else
            {
              straits.acc(predux_half_dowto4(C0), alphav, R);
            }
            res.scatterPacket(i, j2, R);
          }
          else if (SwappedTraits::LhsProgress==16)
          {
            // Special case where we have to first reduce the accumulation
            // register C0; done in a template specialization so that
            // LhsProgress < 16 paths don't fail to compile.
            last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
            p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
          }
          else
          {
            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
            SResPacket alphav = pset1<SResPacket>(alpha);
            straits.acc(C0, alphav, R);
            res.scatterPacket(i, j2, R);
          }
        }
        else // scalar path
        {
          // get a 1 x 4 res block as registers
          ResScalar C0(0), C1(0), C2(0), C3(0);

          for(Index k=0; k<depth; k++)
          {
            LhsScalar A0;
            RhsScalar B_0, B_1;

            A0 = blA[k];

            B_0 = blB[0];
            B_1 = blB[1];
            CJMADD(cj,A0,B_0,C0,  B_0);
            CJMADD(cj,A0,B_1,C1,  B_1);

            B_0 = blB[2];
            B_1 = blB[3];
            CJMADD(cj,A0,B_0,C2,  B_0);
            CJMADD(cj,A0,B_1,C3,  B_1);

            blB += 4;
          }
          res(i, j2 + 0) += alpha * C0;
          res(i, j2 + 1) += alpha * C1;
          res(i, j2 + 2) += alpha * C2;
          res(i, j2 + 3) += alpha * C3;
        }
      }
    }
    // remaining columns
    for(Index j2=packet_cols4; j2<cols; j2++)
    {
      // loop on each row of the lhs
      for(Index i=peeled_mc_quarter; i<rows; i+=1)
      {
        const LhsScalar* blA = &blockA[i*strideA+offsetA];
        prefetch(&blA[0]);
        // gets a 1 x 1 res block as registers
        ResScalar C0(0);
        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
        for(Index k=0; k<depth; k++)
        {
          LhsScalar A0 = blA[k];
          RhsScalar B_0 = blB[k];
          CJMADD(cj, A0, B_0, C0, B_0);
        }
        res(i, j2) += alpha * C0;
      }
    }
  }
}
// pack a block of the lhs
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};

template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2281 EIGEN_ASM_COMMENT(
"EIGEN PRODUCT PACK LHS");
2282 EIGEN_UNUSED_VARIABLE(stride);
2283 EIGEN_UNUSED_VARIABLE(offset);
2284 eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2285 eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2286 conj_if<NumTraits<Scalar>::IsComplex &&
Conjugate> cj;
  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
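  // The packing below proceeds at decreasing granularity: panels of 3, 2 and
  // 1 full packets, then half and quarter packets, then a last even-sized
  // chunk, and finally single rows.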
  Index i = 0;

  // Pack 3 packets
  if(Pack1>=3*PacketSize)
  {
    for(; i<peeled_mc3; i+=3*PacketSize)
    {
      if(PanelMode) count += (3*PacketSize) * offset;
      for(Index k=0; k<depth; k++)
      {
        Packet A, B, C;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
      }
      if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 2 packets
  if(Pack1>=2*PacketSize)
  {
    for(; i<peeled_mc2; i+=2*PacketSize)
    {
      if(PanelMode) count += (2*PacketSize) * offset;
      for(Index k=0; k<depth; k++)
      {
        Packet A, B;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
        pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
        pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
      }
      if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack 1 packet
  if(Pack1>=1*PacketSize)
  {
    for(; i<peeled_mc1; i+=1*PacketSize)
    {
      if(PanelMode) count += (1*PacketSize) * offset;
      for(Index k=0; k<depth; k++)
      {
        Packet A;
        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
        pstore(blockA+count, cj.pconj(A));
        count+=PacketSize;
      }
      if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
    }
  }
  // Pack half packets
  if(HasHalf && Pack1>=HalfPacketSize)
  {
    for(; i<peeled_mc_half; i+=HalfPacketSize)
    {
      if(PanelMode) count += (HalfPacketSize) * offset;
      for(Index k=0; k<depth; k++)
      {
        HalfPacket A;
        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=HalfPacketSize;
      }
      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
    }
  }
  // Pack quarter packets
  if(HasQuarter && Pack1>=QuarterPacketSize)
  {
    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
    {
      if(PanelMode) count += (QuarterPacketSize) * offset;
      for(Index k=0; k<depth; k++)
      {
        QuarterPacket A;
        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
        pstoreu(blockA+count, cj.pconj(A));
        count+=QuarterPacketSize;
      }
      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
    }
  }
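  // Pack2 can be smaller than a full packet; in that case the remaining rows
  // are packed as one even-sized chunk of last_lhs_progress rows.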
  if(Pack2<PacketSize && Pack2>1)
  {
    for(; i<peeled_mc0; i+=last_lhs_progress)
    {
      if(PanelMode) count += last_lhs_progress * offset;
      for(Index k=0; k<depth; k++)
        for(Index w=0; w<last_lhs_progress; w++)
          blockA[count++] = cj(lhs(i+w, k));
      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
    }
  }
  // Pack scalars, one row at a time
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
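// Same packing for a row-major lhs: the blockA layout is identical, but
// psize x psize tiles are transposed on the fly (ptranspose) since the
// source is traversed row by row.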
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
  typedef typename DataMapper::LinearMapper LinearMapper;
  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  enum { PacketSize = unpacket_traits<Packet>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
         HasHalf = (int)HalfPacketSize < (int)PacketSize,
         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize };

  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index count = 0;
  bool gone_half = false, gone_quarter = false, gone_last = false;

  Index i = 0;
  int pack = Pack1;
  int psize = PacketSize;
  while(pack>0)
  {
    Index remaining_rows = rows-i;
    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
    Index starting_pos = i;
    for(; i<peeled_mc; i+=pack)
    {
      if(PanelMode) count += pack * offset;

      Index k=0;
      if(pack>=psize && psize >= QuarterPacketSize)
      {
        const Index peeled_k = (depth/psize)*psize;
        for(; k<peeled_k; k+=psize)
        {
          for (Index m = 0; m < pack; m += psize)
          {
            if (psize == PacketSize) {
              PacketBlock<Packet> kernel;
              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
              ptranspose(kernel);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
            } else if (HasHalf && psize == HalfPacketSize) {
              gone_half = true;
              PacketBlock<HalfPacket> kernel_half;
              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
              ptranspose(kernel_half);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
            } else if (HasQuarter && psize == QuarterPacketSize) {
              gone_quarter = true;
              PacketBlock<QuarterPacket> kernel_quarter;
              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
              ptranspose(kernel_quarter);
              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
            }
          }
          count += psize*pack;
        }
      }

      // scalar tail of the depth loop
      for(; k<depth; k++)
      {
        Index w=0;
        for(; w<pack-3; w+=4)
        {
          Scalar a(cj(lhs(i+w+0, k))),
                 b(cj(lhs(i+w+1, k))),
                 c(cj(lhs(i+w+2, k))),
                 d(cj(lhs(i+w+3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        }
        if(pack%4)
          for(; w<pack; ++w)
            blockA[count++] = cj(lhs(i+w, k));
      }
      if(PanelMode) count += pack * (stride-offset-depth);
    }
    pack -= psize;
    Index left = rows - i;
    if (pack <= 0) {
      if (!gone_last &&
          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter)))
      { psize /= 2; pack = psize; continue; }
      // the last chunk is packed two rows at a time, hence the even size
      if (Pack2 < PacketSize && !gone_last) { gone_last = true; psize = pack = left & ~1; }
    }
  }
  // pack the remaining rows one at a time
  for(; i<rows; i++)
  {
    if(PanelMode) count += offset;
    for(Index k=0; k<depth; k++)
      blockA[count++] = cj(lhs(i, k));
    if(PanelMode) count += (stride-offset-depth);
  }
}
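// Copy a complete panel of the rhs into blockB, version optimized for
// column-major matrices: nr columns are interleaved so that each depth index
// yields nr consecutive coefficients.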
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
{
  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
  EIGEN_UNUSED_VARIABLE(stride);
  EIGEN_UNUSED_VARIABLE(offset);
  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
  Index count = 0;
  const Index peeled_k = (depth/PacketSize)*PacketSize;
  // loop on each 4-column panel
  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
  {
    // skip what we have before
    if(PanelMode) count += 4 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);

    Index k=0;
    if((PacketSize%4)==0)
    {
      for(; k<peeled_k; k+=PacketSize) {
        PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
        kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
        kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
        kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
        kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
        ptranspose(kernel);
        pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
        pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
        pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
        pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
        count+=4*PacketSize;
      }
    }
    for(; k<depth; k++)
    {
      blockB[count+0] = cj(dm0(k));
      blockB[count+1] = cj(dm1(k));
      blockB[count+2] = cj(dm2(k));
      blockB[count+3] = cj(dm3(k));
      count += 4;
    }
    // skip what we have after
    if(PanelMode) count += 4 * (stride-offset-depth);
  }
  // copy the remaining columns one at a time (nr==1)
  for(Index j2=packet_cols4; j2<cols; ++j2)
  {
    if(PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for(Index k=0; k<depth; k++)
    {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if(PanelMode) count += (stride-offset-depth);
  }
}
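// Same packing for a row-major rhs: here the 4 coefficients of a depth-slice
// are already contiguous in memory, so they can be grabbed with a single
// packet load whenever a packet of size 4 is available.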
template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
{
  typedef typename packet_traits<Scalar>::type Packet;
  typedef typename unpacket_traits<Packet>::half HalfPacket;
  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
  typedef typename DataMapper::LinearMapper LinearMapper;
  enum { PacketSize = packet_traits<Scalar>::size,
         HalfPacketSize = unpacket_traits<HalfPacket>::size,
         QuarterPacketSize = unpacket_traits<QuarterPacket>::size };
  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
  {
    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
    EIGEN_UNUSED_VARIABLE(stride);
    EIGEN_UNUSED_VARIABLE(offset);
    eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
    Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
    Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
    Index count = 0;
    // loop on each 4-column panel
    for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
    {
      // skip what we have before
      if(PanelMode) count += 4 * offset;
      for(Index k=0; k<depth; k++)
      {
        if (PacketSize==4) {
          Packet A = rhs.template loadPacket<Packet>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += PacketSize;
        } else if (HasHalf && HalfPacketSize==4) {
          HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += HalfPacketSize;
        } else if (HasQuarter && QuarterPacketSize==4) {
          QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
          pstoreu(blockB+count, cj.pconj(A));
          count += QuarterPacketSize;
        } else {
          const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
          blockB[count+0] = cj(dm0(0));
          blockB[count+1] = cj(dm0(1));
          blockB[count+2] = cj(dm0(2));
          blockB[count+3] = cj(dm0(3));
          count += 4;
        }
      }
      // skip what we have after
      if(PanelMode) count += 4 * (stride-offset-depth);
    }
    // copy the remaining columns one at a time (nr==1)
    for(Index j2=packet_cols4; j2<cols; ++j2)
    {
      if(PanelMode) count += offset;
      for(Index k=0; k<depth; k++)
      {
        blockB[count] = cj(rhs(k, j2));
        count += 1;
      }
      if(PanelMode) count += stride-offset-depth;
    }
  }
};
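/** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
  * \sa setCpuCacheSizes */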
inline std::ptrdiff_t l1CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l1;
}

/** \returns the currently set level 2 cpu cache size (in bytes). \sa setCpuCacheSizes */
inline std::ptrdiff_t l2CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l2;
}

/** \returns the currently set level 3 cpu cache size (in bytes). \sa setCpuCacheSizes */
inline std::ptrdiff_t l3CacheSize()
{
  std::ptrdiff_t l1, l2, l3;
  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
  return l3;
}

/** Set the cpu L1, L2, and L3 cache sizes (in bytes), used to adjust the
  * blocking size parameters of the matrix products. */
inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
{
  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
}

#endif // EIGEN_GENERAL_BLOCK_PANEL_H