GeneralBlockPanelKernel.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
11 #define EIGEN_GENERAL_BLOCK_PANEL_H
12 
13 
14 namespace Eigen {
15 
16 namespace internal {
17 
18 enum GEBPPacketSizeType {
19  GEBPPacketFull = 0,
20  GEBPPacketHalf,
21  GEBPPacketQuarter
22 };
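// GEBPPacketFull/GEBPPacketHalf/GEBPPacketQuarter select the full SIMD packet type or its
// half/quarter width variants; the narrower variants are used by gebp_kernel below
// (see HalfTraits/QuarterTraits) to handle tail rows that do not fill a whole packet.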
23 
24 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
25 class gebp_traits;
26 
27 
29 inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
30 {
31  return a<=0 ? b : a;
32 }
33 
34 #if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
35 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
36 #else
37 #define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
38 #endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
39 
40 #if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
41 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
42 #else
43 #define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
44 #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
45 
46 #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
47 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
48 #else
49 #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
50 #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
51 
52 #if EIGEN_ARCH_i386_OR_x86_64
53 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
54 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
55 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
56 #elif EIGEN_ARCH_PPC
57 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
58 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
59 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
60 #else
61 const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
62 const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
63 const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
64 #endif
65 
66 #undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
67 #undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
68 #undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
69 
71 struct CacheSizes {
72  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
73  int l1CacheSize, l2CacheSize, l3CacheSize;
74  queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
75  m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
76  m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
77  m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
78  }
79 
80  std::ptrdiff_t m_l1;
81  std::ptrdiff_t m_l2;
82  std::ptrdiff_t m_l3;
83 };
84 
86 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
87 {
88  static CacheSizes m_cacheSizes;
89 
90  if(action==SetAction)
91  {
92  // set the CPU cache sizes; all block sizes are derived from these global cache sizes (in bytes)
93  eigen_internal_assert(l1!=0 && l2!=0);
94  m_cacheSizes.m_l1 = *l1;
95  m_cacheSizes.m_l2 = *l2;
96  m_cacheSizes.m_l3 = *l3;
97  }
98  else if(action==GetAction)
99  {
100  eigen_internal_assert(l1!=0 && l2!=0);
101  *l1 = m_cacheSizes.m_l1;
102  *l2 = m_cacheSizes.m_l2;
103  *l3 = m_cacheSizes.m_l3;
104  }
105  else
106  {
107  eigen_internal_assert(false);
108  }
109 }
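// Illustrative usage (sketch, mirroring how the blocking heuristic below retrieves the sizes):
//   std::ptrdiff_t l1, l2, l3;
//   internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
// With SetAction, l1/l2/l3 must instead point to the values to be cached.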
110 
111 /* Helper for computeProductBlockingSizes.
112  *
113  * Given an m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
114  * this function computes the blocking size parameters along the respective dimensions
115  * for matrix products and related algorithms. The blocking sizes depend on various
116  * parameters:
117  * - the L1 and L2 cache sizes,
118  * - the register level blocking sizes defined by gebp_traits,
119  * - the number of scalars that fit into a packet (when vectorization is enabled).
120  *
121  * \sa setCpuCacheSizes */
122 
123 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
124 void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
125 {
126  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
127 
128  // Explanations:
129  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
130  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
131  // in small horizontal panels of size mr x kc, where mr is the blocking size along the m dimension
132  // at the register level. This small horizontal panel has to stay within L1 cache.
133  std::ptrdiff_t l1, l2, l3;
134  manage_caching_sizes(GetAction, &l1, &l2, &l3);
135  #ifdef EIGEN_VECTORIZE_AVX512
136  // We need to find a rationale for that, but without this adjustment,
137  // performance with AVX512 is pretty bad, like -20% slower.
138  // One reason is that with increasing packet-size, the blocking size k
139  // has to become pretty small if we want that 1 lhs panel fit within L1.
140  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
141  // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
142  // This is quite small for a good reuse of the accumulation registers.
143  l1 *= 4;
144  #endif
145 
146  if (num_threads > 1) {
147  typedef typename Traits::ResScalar ResScalar;
148  enum {
149  kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
150  ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
151  kr = 8,
152  mr = Traits::mr,
153  nr = Traits::nr
154  };
155  // Increasing k gives us more time to prefetch the content of the "C"
156  // registers. However once the latency is hidden there is no point in
157  // increasing the value of k, so we'll cap it at 320 (value determined
158  // experimentally).
159  // To avoid that k vanishes, we make k_cache at least as big as kr
160  const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
161  if (k_cache < k) {
162  k = k_cache - (k_cache % kr);
163  eigen_internal_assert(k > 0);
164  }
165 
166  const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
167  const Index n_per_thread = numext::div_ceil(n, num_threads);
168  if (n_cache <= n_per_thread) {
169  // Don't exceed the capacity of the l2 cache.
170  eigen_internal_assert(n_cache >= static_cast<Index>(nr));
171  n = n_cache - (n_cache % nr);
172  eigen_internal_assert(n > 0);
173  } else {
174  n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
175  }
176 
177  if (l3 > l2) {
178  // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
179  const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
180  const Index m_per_thread = numext::div_ceil(m, num_threads);
181  if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
182  m = m_cache - (m_cache % mr);
183  eigen_internal_assert(m > 0);
184  } else {
185  m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
186  }
187  }
188  }
189  else {
190  // In unit tests we do not want to use extra large matrices,
191  // so we reduce the cache size to check the blocking strategy is not flawed
192 #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
193  l1 = 9*1024;
194  l2 = 32*1024;
195  l3 = 512*1024;
196 #endif
197 
198  // Early return for small problems because the computations below are time consuming.
199  // Perhaps it would make more sense to consider k*n*m??
200  // Note that for very tiny problems, this function should be bypassed anyway
201  // because we use the coefficient-based implementation for them.
202  if((numext::maxi)(k,(numext::maxi)(m,n))<48)
203  return;
204 
205  typedef typename Traits::ResScalar ResScalar;
206  enum {
207  k_peeling = 8,
208  k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
209  k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
210  };
211 
212  // ---- 1st level of blocking on L1, yields kc ----
213 
214  // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
215  // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
216  // We also include a register-level block of the result (mr x nr).
217  // (In an ideal world only the lhs panel would stay in L1)
218  // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
219  const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
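 // Worked example (illustrative numbers only, assuming double with AVX+FMA, i.e. mr=12, nr=4,
 // KcFactor=1 and l1=32*1024): k_div = 12*8+4*8 = 128, k_sub = 12*4*8 = 384, hence
 // max_kc = ((32768-384)/128) & ~7 = 248.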
220  const Index old_k = k;
221  if(k>max_kc)
222  {
223  // We are really blocking on the third dimension:
224  // -> reduce blocking size to make sure the last block is as large as possible
225  // while keeping the same number of sweeps over the result.
226  k = (k%max_kc)==0 ? max_kc
227  : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
228 
229  eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
230  }
231 
232  // ---- 2nd level of blocking on max(L2,L3), yields nc ----
233 
234  // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
235  // actual_l2 = max(l2, l3/nb_core_sharing_l3)
236  // The number below is quite conservative: it is better to underestimate the cache size rather than to overestimate it.
237  // For instance, it corresponds to 6MB of L3 shared among 4 cores.
238  #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
239  const Index actual_l2 = l3;
240  #else
241  const Index actual_l2 = 1572864; // == 1.5 MB
242  #endif
243 
244  // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
245  // The second half is implicitly reserved to access the result and lhs coefficients.
246  // When k<max_kc, then nc can grow arbitrarily. In practice, it seems to be fruitful
247  // to limit this growth: we bound the growth of nc to a factor of 1.5.
248  // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
249  // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
250  Index max_nc;
251  const Index lhs_bytes = m * k * sizeof(LhsScalar);
252  const Index remaining_l1 = l1- k_sub - lhs_bytes;
253  if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
254  {
255  // L1 blocking
256  max_nc = remaining_l1 / (k*sizeof(RhsScalar));
257  }
258  else
259  {
260  // L2 blocking
261  max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
262  }
263  // WARNING Below, we assume that Traits::nr is a power of two.
264  Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
265  if(n>nc)
266  {
267  // We are really blocking over the columns:
268  // -> reduce blocking size to make sure the last block is as large as possible
269  // while keeping the same number of sweeps over the packed lhs.
270  // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
271  n = (n%nc)==0 ? nc
272  : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
273  }
274  else if(old_k==k)
275  {
276  // So far, no blocking at all, i.e., kc==k, and nc==n.
277  // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
278  // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
279  Index problem_size = k*n*sizeof(LhsScalar);
280  Index actual_lm = actual_l2;
281  Index max_mc = m;
282  if(problem_size<=1024)
283  {
284  // problem is small enough to keep in L1
285  // Let's choose m such that lhs's block fit in 1/3 of L1
286  actual_lm = l1;
287  }
288  else if(l3!=0 && problem_size<=32768)
289  {
290  // we have both L2 and L3, and problem is small enough to be kept in L2
291  // Let's choose m such that lhs's block fit in 1/3 of L2
292  actual_lm = l2;
293  max_mc = (numext::mini<Index>)(576,max_mc);
294  }
295  Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
296  if (mc > Traits::mr) mc -= mc % Traits::mr;
297  else if (mc==0) return;
298  m = (m%mc)==0 ? mc
299  : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
300  }
301  }
302 }
303 
304 template <typename Index>
305 inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n)
306 {
307 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
308  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
309  k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
310  m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
311  n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
312  return true;
313  }
314 #else
315  EIGEN_UNUSED_VARIABLE(k)
316  EIGEN_UNUSED_VARIABLE(m)
317  EIGEN_UNUSED_VARIABLE(n)
318 #endif
319  return false;
320 }
321 
338 template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
339 void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
340 {
341  if (!useSpecificBlockingSizes(k, m, n)) {
342  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
343  }
344 }
345 
346 template<typename LhsScalar, typename RhsScalar, typename Index>
347 inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1)
348 {
349  computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads);
350 }
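// Illustrative usage (sketch, not part of the original file): query the blocking sizes that
// would be used for a hypothetical 1024 x 1024 x 1024 double product on the current CPU.
//   Eigen::Index kc = 1024, mc = 1024, nc = 1024;
//   Eigen::internal::computeProductBlockingSizes<double,double>(kc, mc, nc);
//   // kc, mc and nc now hold the depth/row/column blocking sizes used by the GEBP kernel.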
351 
352 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
353  #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C);
354 #else
355 
356  // FIXME (a bit overkill maybe ?)
357 
358  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
359  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
360  {
361  c = cj.pmadd(a,b,c);
362  }
363  };
364 
365  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
366  EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
367  {
368  t = b; t = cj.pmul(a,t); c = padd(c,t);
369  }
370  };
371 
372  template<typename CJ, typename A, typename B, typename C, typename T>
373  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
374  {
374  {
375  gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
376  }
377 
378  #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T);
379 // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T);
380 #endif
381 
382 template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
383 struct RhsPanelHelper {
384  private:
385  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
386  public:
387  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
388 };
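// RhsPanelHelper picks RhsPacketx4 (four broadcast rhs packets) when at least 4 registers
// remain after the "registers_taken" ones, and falls back to a single RhsPacket otherwise
// (see e.g. RhsPanel15 in gebp_kernel, which reserves 15 registers).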
389 
390 template <typename Packet>
391 struct QuadPacket
392 {
393  Packet B_0, B1, B2, B3;
394  const Packet& get(const FixedInt<0>&) const { return B_0; }
395  const Packet& get(const FixedInt<1>&) const { return B1; }
396  const Packet& get(const FixedInt<2>&) const { return B2; }
397  const Packet& get(const FixedInt<3>&) const { return B3; }
398 };
399 
400 template <int N, typename T1, typename T2, typename T3>
401 struct packet_conditional { typedef T3 type; };
402 
403 template <typename T1, typename T2, typename T3>
404 struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
405 
406 template <typename T1, typename T2, typename T3>
407 struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
408 
409 #define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \
410  typedef typename packet_conditional<packet_size, \
411  typename packet_traits<name ## Scalar>::type, \
412  typename packet_traits<name ## Scalar>::half, \
413  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
414  prefix ## name ## Packet
415 
416 #define PACKET_DECL_COND(name, packet_size) \
417  typedef typename packet_conditional<packet_size, \
418  typename packet_traits<name ## Scalar>::type, \
419  typename packet_traits<name ## Scalar>::half, \
420  typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
421  name ## Packet
422 
423 #define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \
424  typedef typename packet_conditional<packet_size, \
425  typename packet_traits<Scalar>::type, \
426  typename packet_traits<Scalar>::half, \
427  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
428  prefix ## ScalarPacket
429 
430 #define PACKET_DECL_COND_SCALAR(packet_size) \
431  typedef typename packet_conditional<packet_size, \
432  typename packet_traits<Scalar>::type, \
433  typename packet_traits<Scalar>::half, \
434  typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
435  ScalarPacket
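// For instance, PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize) expands to a typedef named
// _LhsPacket whose type is the full, half, or quarter packet associated with LhsScalar,
// depending on whether _PacketSize is GEBPPacketFull, GEBPPacketHalf or GEBPPacketQuarter.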
436 
437 /* Vectorization logic
438  * real*real: unpack rhs to constant packets, ...
439  *
440  * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
441  * storing each res packet into two packets (2x2),
442  * at the end combine them: swap the second and addsub them
443  * cf*cf : same but with 2x4 blocks
444  * cplx*real : unpack rhs to constant packets, ...
445  * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
446  */
447 template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
448 class gebp_traits
449 {
450 public:
451  typedef _LhsScalar LhsScalar;
452  typedef _RhsScalar RhsScalar;
453  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
454 
455  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
456  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
457  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
458 
459  enum {
460  ConjLhs = _ConjLhs,
461  ConjRhs = _ConjRhs,
462  Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
463  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
464  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
465  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
466 
467  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
468 
469  // register block size along the N direction must be 1 or 4
470  nr = 4,
471 
472  // register block size along the M direction (currently, this one cannot be modified)
473  default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
474 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
475  && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
476  // we assume 16 registers or more
477  // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
478  // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
479  // Bug 1515: MSVC prior to v19.14 yields to register spilling.
480  mr = Vectorizable ? 3*LhsPacketSize : default_mr,
481 #else
482  mr = default_mr,
483 #endif
484 
485  LhsProgress = LhsPacketSize,
486  RhsProgress = 1
487  };
488 
489 
490  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
491  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
492  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
493  typedef LhsPacket LhsPacket4Packing;
494 
495  typedef QuadPacket<RhsPacket> RhsPacketx4;
496  typedef ResPacket AccPacket;
497 
498  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
499  {
500  p = pset1<ResPacket>(ResScalar(0));
501  }
502 
503  template<typename RhsPacketType>
504  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
505  {
506  dest = pset1<RhsPacketType>(*b);
507  }
508 
509  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
510  {
511  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
512  }
513 
514  template<typename RhsPacketType>
515  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
516  {
517  loadRhs(b, dest);
518  }
519 
520  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
521  {
522  }
523 
524  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
525  {
526  dest = ploadquad<RhsPacket>(b);
527  }
528 
529  template<typename LhsPacketType>
530  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const
531  {
532  dest = pload<LhsPacketType>(a);
533  }
534 
535  template<typename LhsPacketType>
536  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
537  {
538  dest = ploadu<LhsPacketType>(a);
539  }
540 
541  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
542  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
543  {
544  conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
545  // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
546  // let gcc allocate the register in which to store the result of the pmul
547  // (in the case where there is no FMA) gcc fails to figure out how to avoid
548  // spilling register.
549 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
550  EIGEN_UNUSED_VARIABLE(tmp);
551  c = cj.pmadd(a,b,c);
552 #else
553  tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp);
554 #endif
555  }
556 
557  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
558  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
559  {
560  madd(a, b.get(lane), c, tmp, lane);
561  }
562 
563  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
564  {
565  r = pmadd(c,alpha,r);
566  }
567 
568  template<typename ResPacketHalf>
569  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const
570  {
571  r = pmadd(c,alpha,r);
572  }
573 
574 };
575 
576 template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
577 class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
578 {
579 public:
580  typedef std::complex<RealScalar> LhsScalar;
581  typedef RealScalar RhsScalar;
582  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
583 
584  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
585  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
586  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
587 
588  enum {
589  ConjLhs = _ConjLhs,
590  ConjRhs = false,
591  Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
592  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
593  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
594  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
595 
596  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
597  nr = 4,
598 #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
599  // we assume 16 registers
600  mr = 3*LhsPacketSize,
601 #else
602  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
603 #endif
604 
605  LhsProgress = LhsPacketSize,
606  RhsProgress = 1
607  };
608 
609  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
610  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
611  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
612  typedef LhsPacket LhsPacket4Packing;
613 
614  typedef QuadPacket<RhsPacket> RhsPacketx4;
615 
616  typedef ResPacket AccPacket;
617 
618  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
619  {
620  p = pset1<ResPacket>(ResScalar(0));
621  }
622 
623  template<typename RhsPacketType>
624  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
625  {
626  dest = pset1<RhsPacketType>(*b);
627  }
628 
629  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
630  {
631  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
632  }
633 
634  template<typename RhsPacketType>
635  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
636  {
637  loadRhs(b, dest);
638  }
639 
640  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
641  {}
642 
643  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
644  {
645  loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
646  }
647 
648  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
649  {
650  // FIXME we can do better!
651  // what we want here is a ploadheight
652  RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
653  dest = ploadquad<RhsPacket>(tmp);
654  }
655 
656  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
657  {
658  eigen_internal_assert(RhsPacketSize<=8);
659  dest = pset1<RhsPacket>(*b);
660  }
661 
662  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
663  {
664  dest = pload<LhsPacket>(a);
665  }
666 
667  template<typename LhsPacketType>
668  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
669  {
670  dest = ploadu<LhsPacketType>(a);
671  }
672 
673  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
674  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
675  {
676  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
677  }
678 
679  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
680  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
681  {
682 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
683  EIGEN_UNUSED_VARIABLE(tmp);
684  c.v = pmadd(a.v,b,c.v);
685 #else
686  tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
687 #endif
688  }
689 
690  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
691  {
692  c += a * b;
693  }
694 
695  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
696  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
697  {
698  madd(a, b.get(lane), c, tmp, lane);
699  }
700 
701  template <typename ResPacketType, typename AccPacketType>
702  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
703  {
704  conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
705  r = cj.pmadd(c,alpha,r);
706  }
707 
708 protected:
709 };
710 
711 template<typename Packet>
712 struct DoublePacket
713 {
714  Packet first;
715  Packet second;
716 };
717 
718 template<typename Packet>
719 DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
720 {
721  DoublePacket<Packet> res;
722  res.first = padd(a.first, b.first);
723  res.second = padd(a.second,b.second);
724  return res;
725 }
726 
727 // note that for DoublePacket<RealPacket> the "4" in "downto4"
728 // corresponds to the number of complexes, so it means "8"
729 // in terms of real coefficients.
730 
731 template<typename Packet>
732 const DoublePacket<Packet>&
733 predux_half_dowto4(const DoublePacket<Packet> &a,
734  typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
735 {
736  return a;
737 }
738 
739 template<typename Packet>
740 DoublePacket<typename unpacket_traits<Packet>::half>
741 predux_half_dowto4(const DoublePacket<Packet> &a,
742  typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
743 {
744  // yes, that's pretty hackish :(
745  DoublePacket<typename unpacket_traits<Packet>::half> res;
746  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
747  typedef typename packet_traits<Cplx>::type CplxPacket;
748  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
749  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
750  return res;
751 }
752 
753 // same here, "quad" actually means "8" in terms of real coefficients
754 template<typename Scalar, typename RealPacket>
755 void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
756  typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
757 {
758  dest.first = pset1<RealPacket>(numext::real(*b));
759  dest.second = pset1<RealPacket>(numext::imag(*b));
760 }
761 
762 template<typename Scalar, typename RealPacket>
763 void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
764  typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
765 {
766  // yes, that's pretty hackish too :(
767  typedef typename NumTraits<Scalar>::Real RealScalar;
768  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
769  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
770  dest.first = ploadquad<RealPacket>(r);
771  dest.second = ploadquad<RealPacket>(i);
772 }
773 
774 
775 template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
776  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
777 };
778 // template<typename Packet>
779 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
780 // {
781 // DoublePacket<Packet> res;
782 // res.first = padd(a.first, b.first);
783 // res.second = padd(a.second,b.second);
784 // return res;
785 // }
786 
787 template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
788 class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
789 {
790 public:
791  typedef std::complex<RealScalar> Scalar;
792  typedef std::complex<RealScalar> LhsScalar;
793  typedef std::complex<RealScalar> RhsScalar;
794  typedef std::complex<RealScalar> ResScalar;
795 
796  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
797  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
798  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
799  PACKET_DECL_COND(Real, _PacketSize);
800  PACKET_DECL_COND_SCALAR(_PacketSize);
801 
802  enum {
803  ConjLhs = _ConjLhs,
804  ConjRhs = _ConjRhs,
805  Vectorizable = unpacket_traits<RealPacket>::vectorizable
806             && unpacket_traits<ScalarPacket>::vectorizable,
807  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
808  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
809  RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
810  RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
811 
812  // FIXME: should depend on NumberOfRegisters
813  nr = 4,
814  mr = ResPacketSize,
815 
816  LhsProgress = ResPacketSize,
817  RhsProgress = 1
818  };
819 
820  typedef DoublePacket<RealPacket> DoublePacketType;
821 
822  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
823  typedef typename conditional<Vectorizable,RealPacket,Scalar>::type LhsPacket;
824  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
825  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
826  typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
827 
828  // this actually holds 8 packets!
829  typedef QuadPacket<RhsPacket> RhsPacketx4;
830 
831  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
832 
833  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p)
834  {
835  p.first = pset1<RealPacket>(RealScalar(0));
836  p.second = pset1<RealPacket>(RealScalar(0));
837  }
838 
839  // Scalar path
840  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
841  {
842  dest = pset1<ScalarPacket>(*b);
843  }
844 
845  // Vectorized path
846  template<typename RealPacketType>
847  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
848  {
849  dest.first = pset1<RealPacketType>(numext::real(*b));
850  dest.second = pset1<RealPacketType>(numext::imag(*b));
851  }
852 
853  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
854  {
855  loadRhs(b, dest.B_0);
856  loadRhs(b + 1, dest.B1);
857  loadRhs(b + 2, dest.B2);
858  loadRhs(b + 3, dest.B3);
859  }
860 
861  // Scalar path
862  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
863  {
864  loadRhs(b, dest);
865  }
866 
867  // Vectorized path
868  template<typename RealPacketType>
869  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
870  {
871  loadRhs(b, dest);
872  }
873 
874  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
875 
876  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
877  {
878  loadRhs(b,dest);
879  }
880  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
881  {
882  loadQuadToDoublePacket(b,dest);
883  }
884 
885  // nothing special here
886  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
887  {
888  dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
889  }
890 
891  template<typename LhsPacketType>
892  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
893  {
894  dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
895  }
896 
897  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
898  EIGEN_STRONG_INLINE
900  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
901  {
902  c.first = padd(pmul(a,b.first), c.first);
903  c.second = padd(pmul(a,b.second),c.second);
904  }
905 
906  template<typename LaneIdType>
907  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
908  {
909  c = cj.pmadd(a,b,c);
910  }
911 
912  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
913  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
914  {
915  madd(a, b.get(lane), c, tmp, lane);
916  }
917 
918  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
919 
920  template<typename RealPacketType, typename ResPacketType>
921  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
922  {
923  // assemble c
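 // With the packing used by madd above, each complex lane of c.first holds (a_r*b_r, a_i*b_r)
 // and the corresponding lane of c.second holds (a_r*b_i, a_i*b_i). Conjugating and/or
 // swapping (pcplxflip) c.second before combining it with c.first therefore yields the
 // complex product (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i), with signs adjusted below
 // according to ConjLhs/ConjRhs.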
924  ResPacketType tmp;
925  if((!ConjLhs)&&(!ConjRhs))
926  {
927  tmp = pcplxflip(pconj(ResPacketType(c.second)));
928  tmp = padd(ResPacketType(c.first),tmp);
929  }
930  else if((!ConjLhs)&&(ConjRhs))
931  {
932  tmp = pconj(pcplxflip(ResPacketType(c.second)));
933  tmp = padd(ResPacketType(c.first),tmp);
934  }
935  else if((ConjLhs)&&(!ConjRhs))
936  {
937  tmp = pcplxflip(ResPacketType(c.second));
938  tmp = padd(pconj(ResPacketType(c.first)),tmp);
939  }
940  else if((ConjLhs)&&(ConjRhs))
941  {
942  tmp = pcplxflip(ResPacketType(c.second));
943  tmp = psub(pconj(ResPacketType(c.first)),tmp);
944  }
945 
946  r = pmadd(tmp,alpha,r);
947  }
948 
949 protected:
950  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
951 };
952 
953 template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
954 class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
955 {
956 public:
957  typedef std::complex<RealScalar> Scalar;
958  typedef RealScalar LhsScalar;
959  typedef Scalar RhsScalar;
960  typedef Scalar ResScalar;
961 
962  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
963  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
964  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
965  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
966  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
967 
968 #undef PACKET_DECL_COND_SCALAR_PREFIX
969 #undef PACKET_DECL_COND_PREFIX
970 #undef PACKET_DECL_COND_SCALAR
971 #undef PACKET_DECL_COND
972 
973  enum {
974  ConjLhs = false,
975  ConjRhs = _ConjRhs,
976  Vectorizable = unpacket_traits<_RealPacket>::vectorizable
977             && unpacket_traits<_ScalarPacket>::vectorizable,
978  LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
979  RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
980  ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
981 
982  NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
983  // FIXME: should depend on NumberOfRegisters
984  nr = 4,
985  mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize,
986 
987  LhsProgress = ResPacketSize,
988  RhsProgress = 1
989  };
990 
991  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
992  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
993  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
994  typedef LhsPacket LhsPacket4Packing;
995  typedef QuadPacket<RhsPacket> RhsPacketx4;
996  typedef ResPacket AccPacket;
997 
998  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
999  {
1000  p = pset1<ResPacket>(ResScalar(0));
1001  }
1002 
1003  template<typename RhsPacketType>
1004  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
1005  {
1006  dest = pset1<RhsPacketType>(*b);
1007  }
1008 
1009  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
1010  {
1011  pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
1012  }
1013 
1014  template<typename RhsPacketType>
1015  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
1016  {
1017  loadRhs(b, dest);
1018  }
1019 
1020  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
1021  {}
1022 
1023  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
1024  {
1025  dest = ploaddup<LhsPacket>(a);
1026  }
1027 
1028  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
1029  {
1030  dest = ploadquad<RhsPacket>(b);
1031  }
1032 
1033  template<typename LhsPacketType>
1034  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
1035  {
1036  dest = ploaddup<LhsPacketType>(a);
1037  }
1038 
1039  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
1040  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
1041  {
1042  madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
1043  }
1044 
1045  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
1046  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
1047  {
1048 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
1049  EIGEN_UNUSED_VARIABLE(tmp);
1050  c.v = pmadd(a,b.v,c.v);
1051 #else
1052  tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
1053 #endif
1054 
1055  }
1056 
1057  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
1058  {
1059  c += a * b;
1060  }
1061 
1062  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
1063  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
1064  {
1065  madd(a, b.get(lane), c, tmp, lane);
1066  }
1067 
1068  template <typename ResPacketType, typename AccPacketType>
1069  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
1070  {
1071  conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
1072  r = cj.pmadd(alpha,c,r);
1073  }
1074 
1075 protected:
1076 
1077 };
1078 
1079 
1080 #if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
1081 
1082 template<>
1083 struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
1084  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
1085 {
1086  typedef float RhsPacket;
1087 
1088  typedef float32x4_t RhsPacketx4;
1089 
1090  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
1091  {
1092  dest = *b;
1093  }
1094 
1095  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
1096  {
1097  dest = vld1q_f32(b);
1098  }
1099 
1100  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
1101  {
1102  dest = *b;
1103  }
1104 
1105  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
1106  {}
1107 
1108  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
1109  {
1110  loadRhs(b,dest);
1111  }
1112 
1113  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
1114  {
1115  c = vfmaq_n_f32(c, a, b);
1116  }
1117 
1118  // NOTE: Template parameter inference failed when compiled with Android NDK:
1119  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>'".
1120 
1121  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
1122  { madd_helper<0>(a, b, c); }
1123  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
1124  { madd_helper<1>(a, b, c); }
1125  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
1126  { madd_helper<2>(a, b, c); }
1127  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
1128  { madd_helper<3>(a, b, c); }
1129 
1130  private:
1131  template<int LaneID>
1132  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
1133  {
1134  #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
1135  // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
1136  // vfmaq_laneq_f32 is implemented through a costly dup
1137  if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
1138  else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
1139  else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
1140  else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
1141  #else
1142  c = vfmaq_laneq_f32(c, a, b, LaneID);
1143  #endif
1144  }
1145 };
1146 
1147 
1148 template<>
1149 struct gebp_traits <double, double, false, false,Architecture::NEON>
1150  : gebp_traits<double,double,false,false,Architecture::Generic>
1151 {
1152  typedef double RhsPacket;
1153 
1154  struct RhsPacketx4 {
1155  float64x2_t B_0, B_1;
1156  };
1157 
1158  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
1159  {
1160  dest = *b;
1161  }
1162 
1163  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
1164  {
1165  dest.B_0 = vld1q_f64(b);
1166  dest.B_1 = vld1q_f64(b+2);
1167  }
1168 
1169  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
1170  {
1171  loadRhs(b,dest);
1172  }
1173 
1174  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
1175  {}
1176 
1177  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
1178  {
1179  loadRhs(b,dest);
1180  }
1181 
1182  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
1183  {
1184  c = vfmaq_n_f64(c, a, b);
1185  }
1186 
1187  // NOTE: Template parameter inference failed when compiled with Android NDK:
1188  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>'".
1189 
1190  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
1191  { madd_helper<0>(a, b, c); }
1192  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
1193  { madd_helper<1>(a, b, c); }
1194  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
1195  { madd_helper<2>(a, b, c); }
1196  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
1197  { madd_helper<3>(a, b, c); }
1198 
1199  private:
1200  template <int LaneID>
1201  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
1202  {
1203  #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
1204  // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
1205  // vfmaq_laneq_f64 is implemented through a costly dup
1206  if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
1207  else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
1208  else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
1209  else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
1210  #else
1211  if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
1212  else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
1213  else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
1214  else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
1215  #endif
1216  }
1217 };
1218 
1219 #endif
1220 
1221 /* optimized General packed Block * packed Panel product kernel
1222  *
1223  * Mixing type logic: C += A * B
1224  * | A | B | comments
1225  * |real |cplx | no vectorization yet, would require packing A with duplication
1226  * |cplx |real | easy vectorization
1227  */
1228 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1229 struct gebp_kernel
1230 {
1231  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1232  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
1233  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
1234 
1235  typedef typename Traits::ResScalar ResScalar;
1236  typedef typename Traits::LhsPacket LhsPacket;
1237  typedef typename Traits::RhsPacket RhsPacket;
1238  typedef typename Traits::ResPacket ResPacket;
1239  typedef typename Traits::AccPacket AccPacket;
1240  typedef typename Traits::RhsPacketx4 RhsPacketx4;
1241 
1242  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
1243 
1244  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1245 
1246  typedef typename SwappedTraits::ResScalar SResScalar;
1247  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1248  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1249  typedef typename SwappedTraits::ResPacket SResPacket;
1250  typedef typename SwappedTraits::AccPacket SAccPacket;
1251 
1252  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
1253  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
1254  typedef typename HalfTraits::ResPacket ResPacketHalf;
1255  typedef typename HalfTraits::AccPacket AccPacketHalf;
1256 
1257  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
1258  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
1259  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
1260  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
1261 
1262  typedef typename DataMapper::LinearMapper LinearMapper;
1263 
1264  enum {
1265  Vectorizable = Traits::Vectorizable,
1266  LhsProgress = Traits::LhsProgress,
1267  LhsProgressHalf = HalfTraits::LhsProgress,
1268  LhsProgressQuarter = QuarterTraits::LhsProgress,
1269  RhsProgress = Traits::RhsProgress,
1270  RhsProgressHalf = HalfTraits::RhsProgress,
1271  RhsProgressQuarter = QuarterTraits::RhsProgress,
1272  ResPacketSize = Traits::ResPacketSize
1273  };
1274 
1275  EIGEN_DONT_INLINE
1276  void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1277  Index rows, Index depth, Index cols, ResScalar alpha,
1278  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
1279 };
1280 
1281 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
1282          int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
1283 struct last_row_process_16_packets
1284 {
1285  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1286  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1287 
1288  typedef typename Traits::ResScalar ResScalar;
1289  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1290  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1291  typedef typename SwappedTraits::ResPacket SResPacket;
1292  typedef typename SwappedTraits::AccPacket SAccPacket;
1293 
1294  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1295  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1296  ResScalar alpha, SAccPacket &C0)
1297  {
1298  EIGEN_UNUSED_VARIABLE(res);
1299  EIGEN_UNUSED_VARIABLE(straits);
1300  EIGEN_UNUSED_VARIABLE(blA);
1301  EIGEN_UNUSED_VARIABLE(blB);
1302  EIGEN_UNUSED_VARIABLE(depth);
1303  EIGEN_UNUSED_VARIABLE(endk);
1304  EIGEN_UNUSED_VARIABLE(i);
1305  EIGEN_UNUSED_VARIABLE(j2);
1306  EIGEN_UNUSED_VARIABLE(alpha);
1307  EIGEN_UNUSED_VARIABLE(C0);
1308  }
1309 };
1310 
1311 
1312 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1313 struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
1314  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
1315  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
1316 
1317  typedef typename Traits::ResScalar ResScalar;
1318  typedef typename SwappedTraits::LhsPacket SLhsPacket;
1319  typedef typename SwappedTraits::RhsPacket SRhsPacket;
1320  typedef typename SwappedTraits::ResPacket SResPacket;
1321  typedef typename SwappedTraits::AccPacket SAccPacket;
1322 
1323  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
1324  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
1325  ResScalar alpha, SAccPacket &C0)
1326  {
1327  typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
1328  typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
1329  typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
1330  typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
1331 
1332  SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
1333  SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
1334 
1335  if (depth - endk > 0)
1336  {
1337  // We have to handle the last row(s) of the rhs, which
1338  // correspond to a half-packet
1339  SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
1340 
1341  for (Index kk = endk; kk < depth; kk++)
1342  {
1343  SLhsPacketQuarter a0;
1344  SRhsPacketQuarter b0;
1345  straits.loadLhsUnaligned(blB, a0);
1346  straits.loadRhs(blA, b0);
1347  straits.madd(a0,b0,c0,b0, fix<0>);
1348  blB += SwappedTraits::LhsProgress/4;
1349  blA += 1;
1350  }
1351  straits.acc(c0, alphav, R);
1352  }
1353  else
1354  {
1355  straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
1356  }
1357  res.scatterPacket(i, j2, R);
1358  }
1359 };
1360 
1361 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1362 struct lhs_process_one_packet
1363 {
1364  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
1365 
1366  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1367  {
1368  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1369  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1370  traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
1371  traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
1372  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1373  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1374  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1375  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1376  #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1377  __asm__ ("" : "+x,m" (*A0));
1378  #endif
1379  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1380  }
1381 
1382  EIGEN_STRONG_INLINE void operator()(
1383  const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
1384  Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
1385  int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
1386  {
1387  GEBPTraits traits;
1388 
1389  // loops on each largest micro horizontal panel of lhs
1390  // (LhsProgress x depth)
1391  for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
1392  {
1393  // loops on each largest micro vertical panel of rhs (depth * nr)
1394  for(Index j2=0; j2<packet_cols4; j2+=nr)
1395  {
1396  // We select a LhsProgress x nr micro block of res
1397  // which is entirely stored into 1 x nr registers.
1398 
1399  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1400  prefetch(&blA[0]);
1401 
1402  // gets res block as register
1403  AccPacket C0, C1, C2, C3;
1404  traits.initAcc(C0);
1405  traits.initAcc(C1);
1406  traits.initAcc(C2);
1407  traits.initAcc(C3);
1408  // To improve instruction pipelining, let's double the accumulation registers:
1409  // even k will accumulate in C*, while odd k will accumulate in D*.
1410  // This trick is crucial to get good performance with FMA, otherwise it is
1411  // actually faster to perform separate MUL+ADD because of the naturally
1412  // better instruction-level parallelism.
1413  AccPacket D0, D1, D2, D3;
1414  traits.initAcc(D0);
1415  traits.initAcc(D1);
1416  traits.initAcc(D2);
1417  traits.initAcc(D3);
1418 
1419  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1420  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1421  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1422  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1423 
1424  r0.prefetch(prefetch_res_offset);
1425  r1.prefetch(prefetch_res_offset);
1426  r2.prefetch(prefetch_res_offset);
1427  r3.prefetch(prefetch_res_offset);
1428 
1429  // performs "inner" products
1430  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1431  prefetch(&blB[0]);
1432  LhsPacket A0, A1;
1433 
1434  for(Index k=0; k<peeled_kc; k+=pk)
1435  {
1436  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1437  RhsPacketx4 rhs_panel;
1438  RhsPacket T0;
1439 
1440  internal::prefetch(blB+(48+0));
1441  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1442  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1443  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1444  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1445  internal::prefetch(blB+(48+16));
1446  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1447  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1448  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1449  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1450 
1451  blB += pk*4*RhsProgress;
1452  blA += pk*LhsProgress;
1453 
1454  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1455  }
1456  C0 = padd(C0,D0);
1457  C1 = padd(C1,D1);
1458  C2 = padd(C2,D2);
1459  C3 = padd(C3,D3);
1460 
1461  // process remaining peeled loop
1462  for(Index k=peeled_kc; k<depth; k++)
1463  {
1464  RhsPacketx4 rhs_panel;
1465  RhsPacket T0;
1466  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1467  blB += 4*RhsProgress;
1468  blA += LhsProgress;
1469  }
1470 
1471  ResPacket R0, R1;
1472  ResPacket alphav = pset1<ResPacket>(alpha);
1473 
1474  R0 = r0.template loadPacket<ResPacket>(0);
1475  R1 = r1.template loadPacket<ResPacket>(0);
1476  traits.acc(C0, alphav, R0);
1477  traits.acc(C1, alphav, R1);
1478  r0.storePacket(0, R0);
1479  r1.storePacket(0, R1);
1480 
1481  R0 = r2.template loadPacket<ResPacket>(0);
1482  R1 = r3.template loadPacket<ResPacket>(0);
1483  traits.acc(C2, alphav, R0);
1484  traits.acc(C3, alphav, R1);
1485  r2.storePacket(0, R0);
1486  r3.storePacket(0, R1);
1487  }
1488 
1489  // Deal with remaining columns of the rhs
1490  for(Index j2=packet_cols4; j2<cols; j2++)
1491  {
1492  // One column at a time
1493  const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
1494  prefetch(&blA[0]);
1495 
1496  // gets res block as register
1497  AccPacket C0;
1498  traits.initAcc(C0);
1499 
1500  LinearMapper r0 = res.getLinearMapper(i, j2);
1501 
1502  // performs "inner" products
1503  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1504  LhsPacket A0;
1505 
1506  for(Index k= 0; k<peeled_kc; k+=pk)
1507  {
1508  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1509  RhsPacket B_0;
1510 
1511 #define EIGEN_GEBGP_ONESTEP(K) \
1512  do { \
1513  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1514  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1515  /* FIXME: why unaligned???? */ \
1516  traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
1517  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
1518  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1519  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1520  } while(false);
1521 
1522  EIGEN_GEBGP_ONESTEP(0);
1523  EIGEN_GEBGP_ONESTEP(1);
1524  EIGEN_GEBGP_ONESTEP(2);
1525  EIGEN_GEBGP_ONESTEP(3);
1526  EIGEN_GEBGP_ONESTEP(4);
1527  EIGEN_GEBGP_ONESTEP(5);
1528  EIGEN_GEBGP_ONESTEP(6);
1529  EIGEN_GEBGP_ONESTEP(7);
1530 
1531  blB += pk*RhsProgress;
1532  blA += pk*LhsProgress;
1533 
1534  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1535  }
1536 
1537  // process remaining peeled loop
1538  for(Index k=peeled_kc; k<depth; k++)
1539  {
1540  RhsPacket B_0;
1541  EIGEN_GEBGP_ONESTEP(0);
1542  blB += RhsProgress;
1543  blA += LhsProgress;
1544  }
1545 #undef EIGEN_GEBGP_ONESTEP
1546  ResPacket R0;
1547  ResPacket alphav = pset1<ResPacket>(alpha);
1548  R0 = r0.template loadPacket<ResPacket>(0);
1549  traits.acc(C0, alphav, R0);
1550  r0.storePacket(0, R0);
1551  }
1552  }
1553  }
1554 };
1555 
1556 template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
1557 struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
1558 {
1559 
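 // Note on this override (descriptive, added for clarity): it replaces the per-k step of the
 // generic one-packet kernel above. A half/quarter-width fraction of an lhs packet is loaded
 // with an unaligned load (the packed block is presumably only guaranteed to be aligned to the
 // full packet size), and the four rhs coefficients of the current column block are broadcast
 // before the four madds below are issued.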
1560 EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
1561  {
1562  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1563  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
1564  traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
1565  traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
1566  traits.madd(*A0, *B_0, *C0, *B_0);
1567  traits.madd(*A0, *B1, *C1, *B1);
1568  traits.madd(*A0, *B2, *C2, *B2);
1569  traits.madd(*A0, *B3, *C3, *B3);
1570  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1571  }
1572 };
1573 
1574 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
1575 EIGEN_DONT_INLINE
1576 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
1577  ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
1578  Index rows, Index depth, Index cols, ResScalar alpha,
1579  Index strideA, Index strideB, Index offsetA, Index offsetB)
1580  {
1581  Traits traits;
1582  SwappedTraits straits;
1583 
1584  if(strideA==-1) strideA = depth;
1585  if(strideB==-1) strideB = depth;
1587  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
1588  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
1589  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
1590  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
1591  const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
1592  const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
1593  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1594  const Index peeled_kc = depth & ~(pk-1);
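 // For example (illustrative): with depth==100 and pk==8, peeled_kc == 100 & ~7 == 96, so the
 // unrolled loops below consume k in chunks of 8 and the tail loops handle the remaining 4
 // iterations one at a time.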
1595  const int prefetch_res_offset = 32/sizeof(ResScalar);
1596 // const Index depth2 = depth & ~1;
1597 
1598  //---------- Process 3 * LhsProgress rows at once ----------
1599  // This corresponds to 3*LhsProgress x nr register blocks.
1600  // Usually, this makes sense only with FMA
1601  if(mr>=3*Traits::LhsProgress)
1602  {
1603  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
1604  // and on each largest micro vertical panel of the rhs (depth * nr).
1605  // The blocking size, i.e., 'depth', has been computed so that the micro horizontal panel of the lhs fits in L1.
1606  // However, if depth is too small, we can extend the number of rows of these horizontal panels.
1607  // The actual number of rows is computed as follows:
1608  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1609  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1610  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1611  // or because we are testing specific blocking sizes.
1612  const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) ));
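 // Worked example (illustrative numbers only): for single-precision AVX, LhsProgress==8 so a
 // panel is 24 rows; with mr==24, nr==4, l1==32*1024 and depth==64 the formula gives
 // (32768 - 4*24*4 - 64*4*4) / (64*4*24) == 5, hence actual_panel_rows == 120, i.e. five
 // 24-row panels are swept against the packed rhs before moving further down the lhs.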
1613  for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows)
1614  {
1615  const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3);
1616  for(Index j2=0; j2<packet_cols4; j2+=nr)
1617  {
1618  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1619  {
1620 
1621  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1622  // stored into 3 x nr registers.
1623 
1624  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)];
1625  prefetch(&blA[0]);
1626 
1627  // gets res block as register
1628  AccPacket C0, C1, C2, C3,
1629  C4, C5, C6, C7,
1630  C8, C9, C10, C11;
1631  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1632  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1633  traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11);
1634 
1635  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1636  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1637  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1638  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1639 
1640  r0.prefetch(0);
1641  r1.prefetch(0);
1642  r2.prefetch(0);
1643  r3.prefetch(0);
1644 
1645  // performs "inner" products
1646  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1647  prefetch(&blB[0]);
1648  LhsPacket A0, A1;
1649 
1650  for(Index k=0; k<peeled_kc; k+=pk)
1651  {
1652  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1653  // 15 registers are taken (12 for acc, 2 for lhs).
1654  RhsPanel15 rhs_panel;
1655  RhsPacket T0;
1656  LhsPacket A2;
1657  #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
1658  // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1659  // without this workaround A0, A1, and A2 are loaded in the same register,
1660  // which is not good for pipelining
1661  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
1662  #else
1663  #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1664  #endif
1665 #define EIGEN_GEBP_ONESTEP(K) \
1666  do { \
1667  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1668  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1669  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1670  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1671  internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1672  } /* Bug 953 */ \
1673  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1674  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1675  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1676  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1677  traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \
1678  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1679  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1680  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1681  traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \
1682  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1683  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1684  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1685  traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \
1686  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1687  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1688  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1689  traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \
1690  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1691  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1692  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1693  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1694  } while (false)
1695 
1696  internal::prefetch(blB);
1697  EIGEN_GEBP_ONESTEP(0);
1698  EIGEN_GEBP_ONESTEP(1);
1699  EIGEN_GEBP_ONESTEP(2);
1700  EIGEN_GEBP_ONESTEP(3);
1701  EIGEN_GEBP_ONESTEP(4);
1702  EIGEN_GEBP_ONESTEP(5);
1703  EIGEN_GEBP_ONESTEP(6);
1704  EIGEN_GEBP_ONESTEP(7);
1705 
1706  blB += pk*4*RhsProgress;
1707  blA += pk*3*Traits::LhsProgress;
1708 
1709  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1710  }
1711  // process remaining peeled loop
1712  for(Index k=peeled_kc; k<depth; k++)
1713  {
1714  RhsPanel15 rhs_panel;
1715  RhsPacket T0;
1716  LhsPacket A2;
1717  EIGEN_GEBP_ONESTEP(0);
1718  blB += 4*RhsProgress;
1719  blA += 3*Traits::LhsProgress;
1720  }
1721 
1722 #undef EIGEN_GEBP_ONESTEP
1723 
1724  ResPacket R0, R1, R2;
1725  ResPacket alphav = pset1<ResPacket>(alpha);
1726 
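  // Scale the twelve accumulators by alpha and add them into the result: for each of the four
  // columns r0..r3, three ResPackets are loaded, updated via traits.acc (R += alpha*C), and
  // stored back.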
1727  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1728  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1729  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1730  traits.acc(C0, alphav, R0);
1731  traits.acc(C4, alphav, R1);
1732  traits.acc(C8, alphav, R2);
1733  r0.storePacket(0 * Traits::ResPacketSize, R0);
1734  r0.storePacket(1 * Traits::ResPacketSize, R1);
1735  r0.storePacket(2 * Traits::ResPacketSize, R2);
1736 
1737  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1738  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1739  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1740  traits.acc(C1, alphav, R0);
1741  traits.acc(C5, alphav, R1);
1742  traits.acc(C9, alphav, R2);
1743  r1.storePacket(0 * Traits::ResPacketSize, R0);
1744  r1.storePacket(1 * Traits::ResPacketSize, R1);
1745  r1.storePacket(2 * Traits::ResPacketSize, R2);
1746 
1747  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1748  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1749  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1750  traits.acc(C2, alphav, R0);
1751  traits.acc(C6, alphav, R1);
1752  traits.acc(C10, alphav, R2);
1753  r2.storePacket(0 * Traits::ResPacketSize, R0);
1754  r2.storePacket(1 * Traits::ResPacketSize, R1);
1755  r2.storePacket(2 * Traits::ResPacketSize, R2);
1756 
1757  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1758  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1759  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1760  traits.acc(C3, alphav, R0);
1761  traits.acc(C7, alphav, R1);
1762  traits.acc(C11, alphav, R2);
1763  r3.storePacket(0 * Traits::ResPacketSize, R0);
1764  r3.storePacket(1 * Traits::ResPacketSize, R1);
1765  r3.storePacket(2 * Traits::ResPacketSize, R2);
1766  }
1767  }
1768 
1769  // Deal with remaining columns of the rhs
1770  for(Index j2=packet_cols4; j2<cols; j2++)
1771  {
1772  for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress)
1773  {
1774  // One column at a time
1775  const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)];
1776  prefetch(&blA[0]);
1777 
1778  // gets res block as register
1779  AccPacket C0, C4, C8;
1780  traits.initAcc(C0);
1781  traits.initAcc(C4);
1782  traits.initAcc(C8);
1783 
1784  LinearMapper r0 = res.getLinearMapper(i, j2);
1785  r0.prefetch(0);
1786 
1787  // performs "inner" products
1788  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
1789  LhsPacket A0, A1, A2;
1790 
1791  for(Index k=0; k<peeled_kc; k+=pk)
1792  {
1793  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1794  RhsPacket B_0;
1795 #define EIGEN_GEBGP_ONESTEP(K) \
1796  do { \
1797  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1798  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1799  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1800  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1801  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1802  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1803  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1804  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1805  traits.madd(A2, B_0, C8, B_0, fix<0>); \
1806  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1807  } while (false)
1808 
1809  EIGEN_GEBGP_ONESTEP(0);
1810  EIGEN_GEBGP_ONESTEP(1);
1811  EIGEN_GEBGP_ONESTEP(2);
1812  EIGEN_GEBGP_ONESTEP(3);
1813  EIGEN_GEBGP_ONESTEP(4);
1814  EIGEN_GEBGP_ONESTEP(5);
1815  EIGEN_GEBGP_ONESTEP(6);
1816  EIGEN_GEBGP_ONESTEP(7);
1817 
1818  blB += pk*RhsProgress;
1819  blA += pk*3*Traits::LhsProgress;
1820 
1821  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1822  }
1823 
1824  // process remaining peeled loop
1825  for(Index k=peeled_kc; k<depth; k++)
1826  {
1827  RhsPacket B_0;
1828  EIGEN_GEBGP_ONESTEP(0);
1829  blB += RhsProgress;
1830  blA += 3*Traits::LhsProgress;
1831  }
1832 #undef EIGEN_GEBGP_ONESTEP
1833  ResPacket R0, R1, R2;
1834  ResPacket alphav = pset1<ResPacket>(alpha);
1835 
1836  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1837  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1838  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1839  traits.acc(C0, alphav, R0);
1840  traits.acc(C4, alphav, R1);
1841  traits.acc(C8, alphav, R2);
1842  r0.storePacket(0 * Traits::ResPacketSize, R0);
1843  r0.storePacket(1 * Traits::ResPacketSize, R1);
1844  r0.storePacket(2 * Traits::ResPacketSize, R2);
1845  }
1846  }
1847  }
1848  }
1849 
1850  //---------- Process 2 * LhsProgress rows at once ----------
1851  if(mr>=2*Traits::LhsProgress)
1852  {
1853  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1854  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1855  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
1856  // or because we are testing specific blocking sizes.
1857  Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) ));
1858 
1859  for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows)
1860  {
1861  Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2);
1862  for(Index j2=0; j2<packet_cols4; j2+=nr)
1863  {
1864  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1865  {
1866 
1867  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
1868  // stored into 2 x nr registers.
1869 
1870  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1871  prefetch(&blA[0]);
1872 
1873  // gets res block as register
1874  AccPacket C0, C1, C2, C3,
1875  C4, C5, C6, C7;
1876  traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3);
1877  traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7);
1878 
1879  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1880  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1881  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1882  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1883 
1884  r0.prefetch(prefetch_res_offset);
1885  r1.prefetch(prefetch_res_offset);
1886  r2.prefetch(prefetch_res_offset);
1887  r3.prefetch(prefetch_res_offset);
1888 
1889  // performs "inner" products
1890  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
1891  prefetch(&blB[0]);
1892  LhsPacket A0, A1;
1893 
1894  for(Index k=0; k<peeled_kc; k+=pk)
1895  {
1896  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
1897  RhsPacketx4 rhs_panel;
1898  RhsPacket T0;
1899 
1900  // NOTE: the begin/end asm comments below work around bug 935!
1901  // but they are not enough for gcc>=6 without FMA (bug 1637)
1902  #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
1903  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1));
1904  #else
1905  #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
1906  #endif
1907 #define EIGEN_GEBGP_ONESTEP(K) \
1908  do { \
1909  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
1910  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1911  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1912  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
1913  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1914  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1915  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1916  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1917  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1918  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1919  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1920  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1921  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
1922  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
1923  } while (false)
1924 
1925  internal::prefetch(blB+(48+0));
1926  EIGEN_GEBGP_ONESTEP(0);
1927  EIGEN_GEBGP_ONESTEP(1);
1928  EIGEN_GEBGP_ONESTEP(2);
1929  EIGEN_GEBGP_ONESTEP(3);
1930  internal::prefetch(blB+(48+16));
1931  EIGEN_GEBGP_ONESTEP(4);
1932  EIGEN_GEBGP_ONESTEP(5);
1933  EIGEN_GEBGP_ONESTEP(6);
1934  EIGEN_GEBGP_ONESTEP(7);
1935 
1936  blB += pk*4*RhsProgress;
1937  blA += pk*(2*Traits::LhsProgress);
1938 
1939  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
1940  }
1941  // process remaining peeled loop
1942  for(Index k=peeled_kc; k<depth; k++)
1943  {
1944  RhsPacketx4 rhs_panel;
1945  RhsPacket T0;
1946  EIGEN_GEBGP_ONESTEP(0);
1947  blB += 4*RhsProgress;
1948  blA += 2*Traits::LhsProgress;
1949  }
1950 #undef EIGEN_GEBGP_ONESTEP
1951 
1952  ResPacket R0, R1, R2, R3;
1953  ResPacket alphav = pset1<ResPacket>(alpha);
1954 
1955  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1956  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1957  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1958  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1959  traits.acc(C0, alphav, R0);
1960  traits.acc(C4, alphav, R1);
1961  traits.acc(C1, alphav, R2);
1962  traits.acc(C5, alphav, R3);
1963  r0.storePacket(0 * Traits::ResPacketSize, R0);
1964  r0.storePacket(1 * Traits::ResPacketSize, R1);
1965  r1.storePacket(0 * Traits::ResPacketSize, R2);
1966  r1.storePacket(1 * Traits::ResPacketSize, R3);
1967 
1968  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1969  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1970  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1971  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1972  traits.acc(C2, alphav, R0);
1973  traits.acc(C6, alphav, R1);
1974  traits.acc(C3, alphav, R2);
1975  traits.acc(C7, alphav, R3);
1976  r2.storePacket(0 * Traits::ResPacketSize, R0);
1977  r2.storePacket(1 * Traits::ResPacketSize, R1);
1978  r3.storePacket(0 * Traits::ResPacketSize, R2);
1979  r3.storePacket(1 * Traits::ResPacketSize, R3);
1980  }
1981  }
1982 
1983  // Deal with remaining columns of the rhs
1984  for(Index j2=packet_cols4; j2<cols; j2++)
1985  {
1986  for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress)
1987  {
1988  // One column at a time
1989  const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)];
1990  prefetch(&blA[0]);
1991 
1992  // gets res block as register
1993  AccPacket C0, C4;
1994  traits.initAcc(C0);
1995  traits.initAcc(C4);
1996 
1997  LinearMapper r0 = res.getLinearMapper(i, j2);
1998  r0.prefetch(prefetch_res_offset);
1999 
2000  // performs "inner" products
2001  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2002  LhsPacket A0, A1;
2003 
2004  for(Index k=0; k<peeled_kc; k+=pk)
2005  {
2006  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
2007  RhsPacket B_0, B1;
2008 
2009 #define EIGEN_GEBGP_ONESTEP(K) \
2010  do { \
2011  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2012  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2013  traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \
2014  traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \
2015  traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \
2016  traits.madd(A0, B_0, C0, B1, fix<0>); \
2017  traits.madd(A1, B_0, C4, B_0, fix<0>); \
2018  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2019  } while(false)
2020 
2021  EIGEN_GEBGP_ONESTEP(0);
2022  EIGEN_GEBGP_ONESTEP(1);
2023  EIGEN_GEBGP_ONESTEP(2);
2024  EIGEN_GEBGP_ONESTEP(3);
2025  EIGEN_GEBGP_ONESTEP(4);
2026  EIGEN_GEBGP_ONESTEP(5);
2027  EIGEN_GEBGP_ONESTEP(6);
2028  EIGEN_GEBGP_ONESTEP(7);
2029 
2030  blB += pk*RhsProgress;
2031  blA += pk*2*Traits::LhsProgress;
2032 
2033  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
2034  }
2035 
2036  // process remaining peeled loop
2037  for(Index k=peeled_kc; k<depth; k++)
2038  {
2039  RhsPacket B_0, B1;
2040  EIGEN_GEBGP_ONESTEP(0);
2041  blB += RhsProgress;
2042  blA += 2*Traits::LhsProgress;
2043  }
2044 #undef EIGEN_GEBGP_ONESTEP
2045  ResPacket R0, R1;
2046  ResPacket alphav = pset1<ResPacket>(alpha);
2047 
2048  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2049  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2050  traits.acc(C0, alphav, R0);
2051  traits.acc(C4, alphav, R1);
2052  r0.storePacket(0 * Traits::ResPacketSize, R0);
2053  r0.storePacket(1 * Traits::ResPacketSize, R1);
2054  }
2055  }
2056  }
2057  }
2058  //---------- Process 1 * LhsProgress rows at once ----------
2059  if(mr>=1*Traits::LhsProgress)
2060  {
2061  lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
2062  p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2063  }
2064  //---------- Process LhsProgressHalf rows at once ----------
2065  if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
2066  {
2067  lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
2068  p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2069  }
2070  //---------- Process LhsProgressQuarter rows at once ----------
2071  if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
2072  {
2073  lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
2074  p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2075  }
2076  //---------- Process remaining rows, 1 at once ----------
2077  if(peeled_mc_quarter<rows)
2078  {
2079  // loop on each panel of the rhs
2080  for(Index j2=0; j2<packet_cols4; j2+=nr)
2081  {
2082  // loop on each row of the lhs (1*LhsProgress x depth)
2083  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2084  {
2085  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2086  prefetch(&blA[0]);
2087  const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
2088 
2089  // If LhsProgress is 8 or 16, it assumes that there is a
2090  // half or quarter packet, respectively, of the same size as
2091  // nr (which is currently 4) for the return type.
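 // For instance (illustrative): with AVX512 single precision, SwappedTraits::LhsProgress is 16
 // and the quarter-size packet holds 4 floats, matching nr==4, so the vectorized path below is
 // taken; on targets where these sizes do not line up, the scalar fallback further down is used.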
2092  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
2093  const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
2094  if ((SwappedTraits::LhsProgress % 4) == 0 &&
2095  (SwappedTraits::LhsProgress<=16) &&
2096  (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) &&
2097  (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
2098  {
2099  SAccPacket C0, C1, C2, C3;
2100  straits.initAcc(C0);
2101  straits.initAcc(C1);
2102  straits.initAcc(C2);
2103  straits.initAcc(C3);
2104 
2105  const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4);
2106  const Index endk = (depth/spk)*spk;
2107  const Index endk4 = (depth/(spk*4))*(spk*4);
2108 
2109  Index k=0;
2110  for(; k<endk4; k+=4*spk)
2111  {
2112  SLhsPacket A0,A1;
2113  SRhsPacket B_0,B_1;
2114 
2115  straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0);
2116  straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1);
2117 
2118  straits.loadRhsQuad(blA+0*spk, B_0);
2119  straits.loadRhsQuad(blA+1*spk, B_1);
2120  straits.madd(A0,B_0,C0,B_0, fix<0>);
2121  straits.madd(A1,B_1,C1,B_1, fix<0>);
2122 
2123  straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
2124  straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
2125  straits.loadRhsQuad(blA+2*spk, B_0);
2126  straits.loadRhsQuad(blA+3*spk, B_1);
2127  straits.madd(A0,B_0,C2,B_0, fix<0>);
2128  straits.madd(A1,B_1,C3,B_1, fix<0>);
2129 
2130  blB += 4*SwappedTraits::LhsProgress;
2131  blA += 4*spk;
2132  }
2133  C0 = padd(padd(C0,C1),padd(C2,C3));
2134  for(; k<endk; k+=spk)
2135  {
2136  SLhsPacket A0;
2137  SRhsPacket B_0;
2138 
2139  straits.loadLhsUnaligned(blB, A0);
2140  straits.loadRhsQuad(blA, B_0);
2141  straits.madd(A0,B_0,C0,B_0, fix<0>);
2142 
2143  blB += SwappedTraits::LhsProgress;
2144  blA += spk;
2145  }
2146  if(SwappedTraits::LhsProgress==8)
2147  {
2148  // Special case where we have to first reduce the accumulation register C0
2149  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
2150  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
2151  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
2152  typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
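 // Here predux_half_dowto4 adds the upper and lower halves of the 8-lane accumulator C0,
 // yielding a 4-lane packet whose entries are the per-column partial sums (one per rhs column,
 // matching nr==4) before the final alpha-scaled accumulation into the result.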
2153 
2154  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2155  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2156 
2157  if(depth-endk>0)
2158  {
2159  // We have to handle the last row of the rhs which corresponds to a half-packet
2160  SLhsPacketHalf a0;
2161  SRhsPacketHalf b0;
2162  straits.loadLhsUnaligned(blB, a0);
2163  straits.loadRhs(blA, b0);
2164  SAccPacketHalf c0 = predux_half_dowto4(C0);
2165  straits.madd(a0,b0,c0,b0, fix<0>);
2166  straits.acc(c0, alphav, R);
2167  }
2168  else
2169  {
2170  straits.acc(predux_half_dowto4(C0), alphav, R);
2171  }
2172  res.scatterPacket(i, j2, R);
2173  }
2174  else if (SwappedTraits::LhsProgress==16)
2175  {
2176  // Special case where we have to first reduce the
2177  // accumulation register C0. We specialize the block in
2178  // template form, so that LhsProgress < 16 paths don't
2179  // fail to compile
2180  last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2181  p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
2182  }
2183  else
2184  {
2185  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2186  SResPacket alphav = pset1<SResPacket>(alpha);
2187  straits.acc(C0, alphav, R);
2188  res.scatterPacket(i, j2, R);
2189  }
2190  }
2191  else // scalar path
2192  {
2193  // get a 1 x 4 res block as registers
2194  ResScalar C0(0), C1(0), C2(0), C3(0);
2195 
2196  for(Index k=0; k<depth; k++)
2197  {
2198  LhsScalar A0;
2199  RhsScalar B_0, B_1;
2200 
2201  A0 = blA[k];
2202 
2203  B_0 = blB[0];
2204  B_1 = blB[1];
2205  CJMADD(cj,A0,B_0,C0, B_0);
2206  CJMADD(cj,A0,B_1,C1, B_1);
2207 
2208  B_0 = blB[2];
2209  B_1 = blB[3];
2210  CJMADD(cj,A0,B_0,C2, B_0);
2211  CJMADD(cj,A0,B_1,C3, B_1);
2212 
2213  blB += 4;
2214  }
2215  res(i, j2 + 0) += alpha * C0;
2216  res(i, j2 + 1) += alpha * C1;
2217  res(i, j2 + 2) += alpha * C2;
2218  res(i, j2 + 3) += alpha * C3;
2219  }
2220  }
2221  }
2222  // remaining columns
2223  for(Index j2=packet_cols4; j2<cols; j2++)
2224  {
2225  // loop on each row of the lhs (1*LhsProgress x depth)
2226  for(Index i=peeled_mc_quarter; i<rows; i+=1)
2227  {
2228  const LhsScalar* blA = &blockA[i*strideA+offsetA];
2229  prefetch(&blA[0]);
2230  // gets a 1 x 1 res block as registers
2231  ResScalar C0(0);
2232  const RhsScalar* blB = &blockB[j2*strideB+offsetB];
2233  for(Index k=0; k<depth; k++)
2234  {
2235  LhsScalar A0 = blA[k];
2236  RhsScalar B_0 = blB[k];
2237  CJMADD(cj, A0, B_0, C0, B_0);
2238  }
2239  res(i, j2) += alpha * C0;
2240  }
2241  }
2242  }
2243  }
2244 
2245 
2246 #undef CJMADD
2247 
2248 // pack a block of the lhs
2249 // The traversal is as follows (mr==4):
2250 // 0 4 8 12 ...
2251 // 1 5 9 13 ...
2252 // 2 6 10 14 ...
2253 // 3 7 11 15 ...
2254 //
2255 // 16 20 24 28 ...
2256 // 17 21 25 29 ...
2257 // 18 22 26 30 ...
2258 // 19 23 27 31 ...
2259 //
2260 // 32 33 34 35 ...
2261 // 36 37 38 39 ...
2262 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2263 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2264 {
2265  typedef typename DataMapper::LinearMapper LinearMapper;
2266  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2267 };
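// Illustrative sketch (not part of Eigen): the traversal diagram above, written as plain scalar
// loops for a column-major lhs with leading dimension `lhsStride`. It assumes `mr` divides `rows`
// and ignores the PanelMode offsets; the function name and parameters below are hypothetical.
template<typename Scalar, typename Index>
void pack_lhs_colmajor_sketch(Scalar* blockA, const Scalar* lhs, Index lhsStride,
                              Index rows, Index depth, Index mr)
{
  Index count = 0;
  for(Index i=0; i<rows; i+=mr)      // one horizontal panel of mr rows at a time
    for(Index k=0; k<depth; k++)     // walk the panel column by column
      for(Index w=0; w<mr; w++)      // mr vertically contiguous coefficients per column
        blockA[count++] = lhs[(i+w) + k*lhsStride];
}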
2268 
2269 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2270 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2271  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2272 {
2273  typedef typename unpacket_traits<Packet>::half HalfPacket;
2274  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2275  enum { PacketSize = unpacket_traits<Packet>::size,
2276  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2277  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2278  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2279  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2280 
2281  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2282  EIGEN_UNUSED_VARIABLE(stride);
2283  EIGEN_UNUSED_VARIABLE(offset);
2284  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2285  eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) );
2286  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2287  Index count = 0;
2288 
2289  const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
2290  const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
2291  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
2292  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
2293  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
2294  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2295  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
2296  : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
2297 
2298  Index i=0;
2299 
2300  // Pack 3 packets
2301  if(Pack1>=3*PacketSize)
2302  {
2303  for(; i<peeled_mc3; i+=3*PacketSize)
2304  {
2305  if(PanelMode) count += (3*PacketSize) * offset;
2306 
2307  for(Index k=0; k<depth; k++)
2308  {
2309  Packet A, B, C;
2310  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2311  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2312  C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
2313  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2314  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2315  pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
2316  }
2317  if(PanelMode) count += (3*PacketSize) * (stride-offset-depth);
2318  }
2319  }
2320  // Pack 2 packets
2321  if(Pack1>=2*PacketSize)
2322  {
2323  for(; i<peeled_mc2; i+=2*PacketSize)
2324  {
2325  if(PanelMode) count += (2*PacketSize) * offset;
2326 
2327  for(Index k=0; k<depth; k++)
2328  {
2329  Packet A, B;
2330  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2331  B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
2332  pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
2333  pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
2334  }
2335  if(PanelMode) count += (2*PacketSize) * (stride-offset-depth);
2336  }
2337  }
2338  // Pack 1 packets
2339  if(Pack1>=1*PacketSize)
2340  {
2341  for(; i<peeled_mc1; i+=1*PacketSize)
2342  {
2343  if(PanelMode) count += (1*PacketSize) * offset;
2344 
2345  for(Index k=0; k<depth; k++)
2346  {
2347  Packet A;
2348  A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
2349  pstore(blockA+count, cj.pconj(A));
2350  count+=PacketSize;
2351  }
2352  if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
2353  }
2354  }
2355  // Pack half packets
2356  if(HasHalf && Pack1>=HalfPacketSize)
2357  {
2358  for(; i<peeled_mc_half; i+=HalfPacketSize)
2359  {
2360  if(PanelMode) count += (HalfPacketSize) * offset;
2361 
2362  for(Index k=0; k<depth; k++)
2363  {
2364  HalfPacket A;
2365  A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
2366  pstoreu(blockA+count, cj.pconj(A));
2367  count+=HalfPacketSize;
2368  }
2369  if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
2370  }
2371  }
2372  // Pack quarter packets
2373  if(HasQuarter && Pack1>=QuarterPacketSize)
2374  {
2375  for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
2376  {
2377  if(PanelMode) count += (QuarterPacketSize) * offset;
2378 
2379  for(Index k=0; k<depth; k++)
2380  {
2381  QuarterPacket A;
2382  A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
2383  pstoreu(blockA+count, cj.pconj(A));
2384  count+=QuarterPacketSize;
2385  }
2386  if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
2387  }
2388  }
2389  // Pack2 may be *smaller* than PacketSize—that happens for
2390  // products like real * complex, where we have to go half the
2391  // progress on the lhs in order to duplicate those operands to
2392  // address both real & imaginary parts on the rhs. This portion will
2393  // pack those half ones until they match the number expected on the
2394  // last peeling loop at this point (for the rhs).
2395  if(Pack2<PacketSize && Pack2>1)
2396  {
2397  for(; i<peeled_mc0; i+=last_lhs_progress)
2398  {
2399  if(PanelMode) count += last_lhs_progress * offset;
2400 
2401  for(Index k=0; k<depth; k++)
2402  for(Index w=0; w<last_lhs_progress; w++)
2403  blockA[count++] = cj(lhs(i+w, k));
2404 
2405  if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
2406  }
2407  }
2408  // Pack scalars
2409  for(; i<rows; i++)
2410  {
2411  if(PanelMode) count += offset;
2412  for(Index k=0; k<depth; k++)
2413  blockA[count++] = cj(lhs(i, k));
2414  if(PanelMode) count += (stride-offset-depth);
2415  }
2416 }
2417 
2418 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2419 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2420 {
2421  typedef typename DataMapper::LinearMapper LinearMapper;
2422  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2423 };
2424 
2425 template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
2426 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2427  ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2428 {
2429  typedef typename unpacket_traits<Packet>::half HalfPacket;
2430  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2431  enum { PacketSize = unpacket_traits<Packet>::size,
2432  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2433  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2434  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2435  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
2436 
2437  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2438  EIGEN_UNUSED_VARIABLE(stride);
2439  EIGEN_UNUSED_VARIABLE(offset);
2440  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2441  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2442  Index count = 0;
2443  bool gone_half = false, gone_quarter = false, gone_last = false;
2444 
2445  Index i = 0;
2446  int pack = Pack1;
2447  int psize = PacketSize;
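 // The loop below first packs panels of `pack` rows with full packets, shrinking `pack` by one
 // packet width per pass (e.g. 3*PacketSize, then 2*PacketSize, then PacketSize); once fewer
 // rows than a full packet remain, it retries with half- and quarter-size packets, and finally,
 // when Pack2 < PacketSize, with a 2-row-aligned "last progress" for the leftover rows.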
2448  while(pack>0)
2449  {
2450  Index remaining_rows = rows-i;
2451  Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
2452  Index starting_pos = i;
2453  for(; i<peeled_mc; i+=pack)
2454  {
2455  if(PanelMode) count += pack * offset;
2456 
2457  Index k=0;
2458  if(pack>=psize && psize >= QuarterPacketSize)
2459  {
2460  const Index peeled_k = (depth/psize)*psize;
2461  for(; k<peeled_k; k+=psize)
2462  {
2463  for (Index m = 0; m < pack; m += psize)
2464  {
2465  if (psize == PacketSize) {
2466  PacketBlock<Packet> kernel;
2467  for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
2468  ptranspose(kernel);
2469  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
2470  } else if (HasHalf && psize == HalfPacketSize) {
2471  gone_half = true;
2472  PacketBlock<HalfPacket> kernel_half;
2473  for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
2474  ptranspose(kernel_half);
2475  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
2476  } else if (HasQuarter && psize == QuarterPacketSize) {
2477  gone_quarter = true;
2478  PacketBlock<QuarterPacket> kernel_quarter;
2479  for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
2480  ptranspose(kernel_quarter);
2481  for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
2482  }
2483  }
2484  count += psize*pack;
2485  }
2486  }
2487 
2488  for(; k<depth; k++)
2489  {
2490  Index w=0;
2491  for(; w<pack-3; w+=4)
2492  {
2493  Scalar a(cj(lhs(i+w+0, k))),
2494  b(cj(lhs(i+w+1, k))),
2495  c(cj(lhs(i+w+2, k))),
2496  d(cj(lhs(i+w+3, k)));
2497  blockA[count++] = a;
2498  blockA[count++] = b;
2499  blockA[count++] = c;
2500  blockA[count++] = d;
2501  }
2502  if(pack%4)
2503  for(;w<pack;++w)
2504  blockA[count++] = cj(lhs(i+w, k));
2505  }
2506 
2507  if(PanelMode) count += pack * (stride-offset-depth);
2508  }
2509 
2510  pack -= psize;
2511  Index left = rows - i;
2512  if (pack <= 0) {
2513  if (!gone_last &&
2514  (starting_pos == i || left >= psize/2 || left >= psize/4) &&
2515  ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
2516  (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2517  psize /= 2;
2518  pack = psize;
2519  continue;
2520  }
2521  // Pack2 may be *smaller* than PacketSize—that happens for
2522  // products like real * complex, where we have to go half the
2523  // progress on the lhs in order to duplicate those operands to
2524  // address both real & imaginary parts on the rhs. This portion will
2525  // pack those half ones until they match the number expected on the
2526  // last peeling loop at this point (for the rhs).
2527  if (Pack2 < PacketSize && !gone_last) {
2528  gone_last = true;
2529  psize = pack = left & ~1;
2530  }
2531  }
2532  }
2533 
2534  for(; i<rows; i++)
2535  {
2536  if(PanelMode) count += offset;
2537  for(Index k=0; k<depth; k++)
2538  blockA[count++] = cj(lhs(i, k));
2539  if(PanelMode) count += (stride-offset-depth);
2540  }
2541 }
2542 
2543 // copy a complete panel of the rhs
2544 // this version is optimized for column major matrices
2545 // The traversal order is as follows (nr==4):
2546 // 0 1 2 3 12 13 14 15 24 27
2547 // 4 5 6 7 16 17 18 19 25 28
2548 // 8 9 10 11 20 21 22 23 26 29
2549 // . . . . . . . . . .
2550 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2551 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2552 {
2553  typedef typename packet_traits<Scalar>::type Packet;
2554  typedef typename DataMapper::LinearMapper LinearMapper;
2555  enum { PacketSize = packet_traits<Scalar>::size };
2556  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2557 };
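// Illustrative sketch (not part of Eigen): the traversal diagram above as plain scalar loops for
// a column-major rhs with leading dimension `rhsStride`. Panels of nr columns are interleaved row
// by row, then leftover columns are copied one at a time; PanelMode offsets are ignored and the
// function name and parameters below are hypothetical.
template<typename Scalar, typename Index>
void pack_rhs_colmajor_sketch(Scalar* blockB, const Scalar* rhs, Index rhsStride,
                              Index depth, Index cols, Index nr)
{
  Index count = 0;
  const Index packet_cols = (cols/nr)*nr;
  for(Index j=0; j<packet_cols; j+=nr)     // one vertical panel of nr columns at a time
    for(Index k=0; k<depth; k++)           // the nr coefficients of row k are stored contiguously
      for(Index w=0; w<nr; w++)
        blockB[count++] = rhs[k + (j+w)*rhsStride];
  for(Index j=packet_cols; j<cols; j++)    // remaining columns, one at a time (nr==1 case)
    for(Index k=0; k<depth; k++)
      blockB[count++] = rhs[k + j*rhsStride];
}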
2558 
2559 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2560 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2561  ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2562 {
2563  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2564  EIGEN_UNUSED_VARIABLE(stride);
2565  EIGEN_UNUSED_VARIABLE(offset);
2566  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2567  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2568  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2569  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2570  Index count = 0;
2571  const Index peeled_k = (depth/PacketSize)*PacketSize;
2572 // if(nr>=8)
2573 // {
2574 // for(Index j2=0; j2<packet_cols8; j2+=8)
2575 // {
2576 // // skip what we have before
2577 // if(PanelMode) count += 8 * offset;
2578 // const Scalar* b0 = &rhs[(j2+0)*rhsStride];
2579 // const Scalar* b1 = &rhs[(j2+1)*rhsStride];
2580 // const Scalar* b2 = &rhs[(j2+2)*rhsStride];
2581 // const Scalar* b3 = &rhs[(j2+3)*rhsStride];
2582 // const Scalar* b4 = &rhs[(j2+4)*rhsStride];
2583 // const Scalar* b5 = &rhs[(j2+5)*rhsStride];
2584 // const Scalar* b6 = &rhs[(j2+6)*rhsStride];
2585 // const Scalar* b7 = &rhs[(j2+7)*rhsStride];
2586 // Index k=0;
2587 // if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
2588 // {
2589 // for(; k<peeled_k; k+=PacketSize) {
2590 // PacketBlock<Packet> kernel;
2591 // for (int p = 0; p < PacketSize; ++p) {
2592 // kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]);
2593 // }
2594 // ptranspose(kernel);
2595 // for (int p = 0; p < PacketSize; ++p) {
2596 // pstoreu(blockB+count, cj.pconj(kernel.packet[p]));
2597 // count+=PacketSize;
2598 // }
2599 // }
2600 // }
2601 // for(; k<depth; k++)
2602 // {
2603 // blockB[count+0] = cj(b0[k]);
2604 // blockB[count+1] = cj(b1[k]);
2605 // blockB[count+2] = cj(b2[k]);
2606 // blockB[count+3] = cj(b3[k]);
2607 // blockB[count+4] = cj(b4[k]);
2608 // blockB[count+5] = cj(b5[k]);
2609 // blockB[count+6] = cj(b6[k]);
2610 // blockB[count+7] = cj(b7[k]);
2611 // count += 8;
2612 // }
2613 // // skip what we have after
2614 // if(PanelMode) count += 8 * (stride-offset-depth);
2615 // }
2616 // }
2617 
2618  if(nr>=4)
2619  {
2620  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2621  {
2622  // skip what we have before
2623  if(PanelMode) count += 4 * offset;
2624  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2625  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2626  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2627  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2628 
2629  Index k=0;
2630  if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
2631  {
2632  for(; k<peeled_k; k+=PacketSize) {
2633  PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
2634  kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
2635  kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
2636  kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
2637  kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
2638  ptranspose(kernel);
2639  pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
2640  pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
2641  pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize]));
2642  pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize]));
2643  count+=4*PacketSize;
2644  }
2645  }
2646  for(; k<depth; k++)
2647  {
2648  blockB[count+0] = cj(dm0(k));
2649  blockB[count+1] = cj(dm1(k));
2650  blockB[count+2] = cj(dm2(k));
2651  blockB[count+3] = cj(dm3(k));
2652  count += 4;
2653  }
2654  // skip what we have after
2655  if(PanelMode) count += 4 * (stride-offset-depth);
2656  }
2657  }
2658 
2659  // copy the remaining columns one at a time (nr==1)
2660  for(Index j2=packet_cols4; j2<cols; ++j2)
2661  {
2662  if(PanelMode) count += offset;
2663  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
2664  for(Index k=0; k<depth; k++)
2665  {
2666  blockB[count] = cj(dm0(k));
2667  count += 1;
2668  }
2669  if(PanelMode) count += (stride-offset-depth);
2670  }
2671 }
2672 
2673 // this version is optimized for row major matrices
2674 template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2675 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2676 {
2677  typedef typename packet_traits<Scalar>::type Packet;
2678  typedef typename unpacket_traits<Packet>::half HalfPacket;
2679  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2680  typedef typename DataMapper::LinearMapper LinearMapper;
2681  enum { PacketSize = packet_traits<Scalar>::size,
2682  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2683  QuarterPacketSize = unpacket_traits<QuarterPacket>::size};
2684  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0)
2685  {
2686  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
2687  EIGEN_UNUSED_VARIABLE(stride);
2688  EIGEN_UNUSED_VARIABLE(offset);
2689  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
2690  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
2691  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
2692  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2693  Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
2694  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
2695  Index count = 0;
2696 
2697  // if(nr>=8)
2698  // {
2699  // for(Index j2=0; j2<packet_cols8; j2+=8)
2700  // {
2701  // // skip what we have before
2702  // if(PanelMode) count += 8 * offset;
2703  // for(Index k=0; k<depth; k++)
2704  // {
2705  // if (PacketSize==8) {
2706  // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2707  // pstoreu(blockB+count, cj.pconj(A));
2708  // } else if (PacketSize==4) {
2709  // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]);
2710  // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]);
2711  // pstoreu(blockB+count, cj.pconj(A));
2712  // pstoreu(blockB+count+PacketSize, cj.pconj(B));
2713  // } else {
2714  // const Scalar* b0 = &rhs[k*rhsStride + j2];
2715  // blockB[count+0] = cj(b0[0]);
2716  // blockB[count+1] = cj(b0[1]);
2717  // blockB[count+2] = cj(b0[2]);
2718  // blockB[count+3] = cj(b0[3]);
2719  // blockB[count+4] = cj(b0[4]);
2720  // blockB[count+5] = cj(b0[5]);
2721  // blockB[count+6] = cj(b0[6]);
2722  // blockB[count+7] = cj(b0[7]);
2723  // }
2724  // count += 8;
2725  // }
2726  // // skip what we have after
2727  // if(PanelMode) count += 8 * (stride-offset-depth);
2728  // }
2729  // }
2730  if(nr>=4)
2731  {
2732  for(Index j2=packet_cols8; j2<packet_cols4; j2+=4)
2733  {
2734  // skip what we have before
2735  if(PanelMode) count += 4 * offset;
2736  for(Index k=0; k<depth; k++)
2737  {
2738  if (PacketSize==4) {
2739  Packet A = rhs.template loadPacket<Packet>(k, j2);
2740  pstoreu(blockB+count, cj.pconj(A));
2741  count += PacketSize;
2742  } else if (HasHalf && HalfPacketSize==4) {
2743  HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
2744  pstoreu(blockB+count, cj.pconj(A));
2745  count += HalfPacketSize;
2746  } else if (HasQuarter && QuarterPacketSize==4) {
2747  QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
2748  pstoreu(blockB+count, cj.pconj(A));
2749  count += QuarterPacketSize;
2750  } else {
2751  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
2752  blockB[count+0] = cj(dm0(0));
2753  blockB[count+1] = cj(dm0(1));
2754  blockB[count+2] = cj(dm0(2));
2755  blockB[count+3] = cj(dm0(3));
2756  count += 4;
2757  }
2758  }
2759  // skip what we have after
2760  if(PanelMode) count += 4 * (stride-offset-depth);
2761  }
2762  }
2763  // copy the remaining columns one at a time (nr==1)
2764  for(Index j2=packet_cols4; j2<cols; ++j2)
2765  {
2766  if(PanelMode) count += offset;
2767  for(Index k=0; k<depth; k++)
2768  {
2769  blockB[count] = cj(rhs(k, j2));
2770  count += 1;
2771  }
2772  if(PanelMode) count += stride-offset-depth;
2773  }
2774  }
2775 };
2776 
2777 } // end namespace internal
2778 
2781 inline std::ptrdiff_t l1CacheSize()
2782 {
2783  std::ptrdiff_t l1, l2, l3;
2784  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2785  return l1;
2786 }
2787 
2790 inline std::ptrdiff_t l2CacheSize()
2791 {
2792  std::ptrdiff_t l1, l2, l3;
2793  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2794  return l2;
2795 }
2796 
2800 inline std::ptrdiff_t l3CacheSize()
2801 {
2802  std::ptrdiff_t l1, l2, l3;
2803  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
2804  return l3;
2805 }
2806 
2812 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
2813 {
2814  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
2815 }
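/* Illustrative usage sketch (not part of Eigen): the accessors above can be used to inspect the
 * cache sizes GEBP will assume, or to override them when tuning blocking for a known target.
 * The function name and the byte values below are hypothetical. */
inline void exampleOverrideCpuCacheSizes()
{
  std::ptrdiff_t l1 = l1CacheSize();  // current estimates, in bytes (queried or defaulted)
  std::ptrdiff_t l2 = l2CacheSize();
  std::ptrdiff_t l3 = l3CacheSize();
  EIGEN_UNUSED_VARIABLE(l1); EIGEN_UNUSED_VARIABLE(l2); EIGEN_UNUSED_VARIABLE(l3);
  // Force a 48KB / 1MB / 32MB hierarchy; subsequent blocking-size computations will use these.
  setCpuCacheSizes(48*1024, 1024*1024, 32*1024*1024);
}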
2816 
2817 } // end namespace Eigen
2818 
2819 #endif // EIGEN_GENERAL_BLOCK_PANEL_H