Path Tracer
ZVector/PacketMath.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_PACKET_MATH_ZVECTOR_H
11 #define EIGEN_PACKET_MATH_ZVECTOR_H
12 
13 #include <stdint.h>
14 
15 namespace Eigen {
16 
17 namespace internal {
18 
// Architecture tuning defaults for this backend. Each macro is only defined
// here when the build has not already supplied its own definition.

// Default value (16) of the cache-friendly product threshold.
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
#endif

// Flag macro: presumably advertises that multiply-add is a single instruction
// on this target (pmadd<Packet2d> below maps to vec_madd) -- confirm in callers.
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif

// Flag macro: presumably the complex-conjugate counterpart of the flag above.
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
#endif

// Default register count (32) reported for this architecture.
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
34 
// Native vector types, named after their lane count and lane type
// (e.g. Packet4i = 4 x int, Packet2d = 2 x double).
typedef __vector int Packet4i;
typedef __vector unsigned int Packet4ui;
typedef __vector __bool int Packet4bi;
typedef __vector short int Packet8i;
typedef __vector unsigned char Packet16uc;
typedef __vector double Packet2d;
typedef __vector unsigned long long Packet2ul;
typedef __vector long long Packet2l;

// Z14 has builtin support for float vectors
// When __ARCH__ is defined and below 12, Packet4f is instead a struct holding
// two Packet2d values, i.e. the four lanes are stored as doubles.
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
typedef __vector float Packet4f;
#else
typedef struct {
 Packet2d v4f[2];
} Packet4f;
#endif
52 
// Overlay of one 16-byte value as scalar arrays and as each vector type.
// Used in this file to load/store vectors through a pointer cast and to read
// individual lanes (see pload/pstore and the operator<< overloads).
typedef union {
 int32_t i[4];
 uint32_t ui[4];
 int64_t l[2];
 uint64_t ul[2];
 double d[2];
 float f[4];
 Packet4i v4i;
 Packet4ui v4ui;
 Packet2l v2l;
 Packet2ul v2ul;
 Packet2d v2d;
 // The float-vector view only exists when Packet4f is a native vector type.
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 Packet4f v4f;
#endif
} Packet;
69 
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
//
// Each macro declares a packet variable named p4i_<NAME>, p2d_<NAME> or
// p2l_<NAME>. The FAST variants splat the integer X with vec_splat_s32 /
// vec_splat_s64 and reinterpret the resulting bits as the target type (no
// numeric conversion); the plain variants broadcast X through pset1.

#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
 Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))

// Note: X is an integer bit pattern here, not a floating-point value.
#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
 Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))

#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
 Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
 Packet4i p4i_##NAME = pset1<Packet4i>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
 Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
 Packet2l p2l_##NAME = pset1<Packet2l>(X)
90 
// These constants are endian-agnostic
// Declares p4i_ZERO and p4i_ONE.
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}

// Declares p2d_ZERO, p2l_ZERO and p2l_ONE. An all-zero bit pattern
// reinterpreted as double is +0.0.
static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);

static Packet2d p2d_ONE = { 1.0, 1.0 };
// Despite the name, both lanes hold negative zero (-0.0).
static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
101 
102 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
103 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
104  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
105 
106 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
107  Packet4f p4f_##NAME = pset1<Packet4f>(X)
108 
109 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
110  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
111 
112 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
113 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
114 static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
115 #endif
116 
// Per-lane ramps {0, 1, ...}; plset adds them to a broadcast value.
static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
// Built from the last 8 bytes of p2d_ZERO followed by the first 8 bytes of
// p2d_ONE, i.e. { 0.0, 1.0 }.
static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));

// Byte-selection masks for vec_perm:
// PSET64_HI repeats bytes 0-7 twice (used by ploaddup<Packet2d>);
// DUPLICATE32_HI repeats bytes 0-3 and then bytes 4-7 (used by ploaddup<Packet4i>).
static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
123 
// Mask alignment
// 64-bit mask with the low four bits cleared.
#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0

// Value of x (cast to std::ptrdiff_t) with its low four bits cleared, i.e.
// rounded down to a multiple of 16.
#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
128 
// Handle endianness properly while loading constants
// Define global static constants:
//
// All of these are byte-index masks. Several are derived from earlier ones, so
// the order of the definitions below matters for static initialisation.

// Identity order, 32-bit lanes reversed, and the two 64-bit halves swapped.
static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };

// Built with vec_splat/vec_sld from the masks above; the trailing comments give
// the resulting byte values.
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};

static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
// Two-input masks used by ptranspose(PacketBlock<Packet2d,2>&): indices 0-15
// address the first vec_perm operand and 16-31 the second.
static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};

// REVERSE32 rotated by 8 bytes: swaps the two 32-bit words inside each 64-bit half.
static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };

// FORWARD rotated by 8 bytes: swaps the two 64-bit halves.
static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
150 
151 
// Prefetch hint for ADDR. Uses __builtin_prefetch when the compiler provides
// it, otherwise falls back to inline assembly.
// Note: both expansions already end with ';'.
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
 #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
 // NOTE(review): the operand syntax of this fallback has not been verified here.
 #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
#endif
157 
158 template<> struct packet_traits<int> : default_packet_traits
159 {
160  typedef Packet4i type;
161  typedef Packet4i half;
162  enum {
163  Vectorizable = 1,
164  AlignedOnScalar = 1,
165  size = 4,
166  HasHalfPacket = 0,
167 
168  HasAdd = 1,
169  HasSub = 1,
170  HasMul = 1,
171  HasDiv = 1,
172  HasBlend = 1
173  };
174 };
175 
176 template <>
177 struct packet_traits<float> : default_packet_traits {
178  typedef Packet4f type;
179  typedef Packet4f half;
180  enum {
181  Vectorizable = 1,
182  AlignedOnScalar = 1,
183  size = 4,
184  HasHalfPacket = 0,
185 
186  HasAdd = 1,
187  HasSub = 1,
188  HasMul = 1,
189  HasDiv = 1,
190  HasMin = 1,
191  HasMax = 1,
192  HasAbs = 1,
193  HasSin = 0,
194  HasCos = 0,
195  HasLog = 0,
196  HasExp = 1,
197  HasSqrt = 1,
198  HasRsqrt = 1,
199  HasTanh = 1,
200  HasErf = 1,
201  HasRound = 1,
202  HasFloor = 1,
203  HasCeil = 1,
204  HasNegate = 1,
205  HasBlend = 1
206  };
207 };
208 
209 template<> struct packet_traits<double> : default_packet_traits
210 {
211  typedef Packet2d type;
212  typedef Packet2d half;
213  enum {
214  Vectorizable = 1,
215  AlignedOnScalar = 1,
216  size=2,
217  HasHalfPacket = 1,
218 
219  HasAdd = 1,
220  HasSub = 1,
221  HasMul = 1,
222  HasDiv = 1,
223  HasMin = 1,
224  HasMax = 1,
225  HasAbs = 1,
226  HasSin = 0,
227  HasCos = 0,
228  HasLog = 0,
229  HasExp = 1,
230  HasSqrt = 1,
231  HasRsqrt = 1,
232  HasRound = 1,
233  HasFloor = 1,
234  HasCeil = 1,
235  HasNegate = 1,
236  HasBlend = 1
237  };
238 };
239 
240 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
241 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
242 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
243 
/* Forward declaration */
// Declares the 4x4 float transpose before its definition further down the file.
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
246 
247 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
248 {
249  Packet vt;
250  vt.v4i = v;
251  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
252  return s;
253 }
254 
255 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
256 {
257  Packet vt;
258  vt.v4ui = v;
259  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
260  return s;
261 }
262 
263 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
264 {
265  Packet vt;
266  vt.v2l = v;
267  s << vt.l[0] << ", " << vt.l[1];
268  return s;
269 }
270 
271 inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
272 {
273  Packet vt;
274  vt.v2ul = v;
275  s << vt.ul[0] << ", " << vt.ul[1] ;
276  return s;
277 }
278 
279 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
280 {
281  Packet vt;
282  vt.v2d = v;
283  s << vt.d[0] << ", " << vt.d[1];
284  return s;
285 }
286 
287 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
288 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
289 {
290  Packet vt;
291  vt.v4f = v;
292  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
293  return s;
294 }
295 #endif
296 
297 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
298 {
299  // FIXME: No intrinsic yet
300  EIGEN_DEBUG_ALIGNED_LOAD
301  Packet *vfrom;
302  vfrom = (Packet *) from;
303  return vfrom->v4i;
304 }
305 
306 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
307 {
308  // FIXME: No intrinsic yet
309  EIGEN_DEBUG_ALIGNED_LOAD
310  Packet *vfrom;
311  vfrom = (Packet *) from;
312  return vfrom->v2d;
313 }
314 
315 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
316 {
317  // FIXME: No intrinsic yet
318  EIGEN_DEBUG_ALIGNED_STORE
319  Packet *vto;
320  vto = (Packet *) to;
321  vto->v4i = from;
322 }
323 
324 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
325 {
326  // FIXME: No intrinsic yet
327  EIGEN_DEBUG_ALIGNED_STORE
328  Packet *vto;
329  vto = (Packet *) to;
330  vto->v2d = from;
331 }
332 
333 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
334 {
335  return vec_splats(from);
336 }
337 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
338  return vec_splats(from);
339 }
340 
341 template<> EIGEN_STRONG_INLINE void
342 pbroadcast4<Packet4i>(const int *a,
343  Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
344 {
345  a3 = pload<Packet4i>(a);
346  a0 = vec_splat(a3, 0);
347  a1 = vec_splat(a3, 1);
348  a2 = vec_splat(a3, 2);
349  a3 = vec_splat(a3, 3);
350 }
351 
352 template<> EIGEN_STRONG_INLINE void
353 pbroadcast4<Packet2d>(const double *a,
354  Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
355 {
356  a1 = pload<Packet2d>(a);
357  a0 = vec_splat(a1, 0);
358  a1 = vec_splat(a1, 1);
359  a3 = pload<Packet2d>(a+2);
360  a2 = vec_splat(a3, 0);
361  a3 = vec_splat(a3, 1);
362 }
363 
364 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
365 {
366  int EIGEN_ALIGN16 ai[4];
367  ai[0] = from[0*stride];
368  ai[1] = from[1*stride];
369  ai[2] = from[2*stride];
370  ai[3] = from[3*stride];
371  return pload<Packet4i>(ai);
372 }
373 
374 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
375 {
376  double EIGEN_ALIGN16 af[2];
377  af[0] = from[0*stride];
378  af[1] = from[1*stride];
379  return pload<Packet2d>(af);
380 }
381 
382 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
383 {
384  int EIGEN_ALIGN16 ai[4];
385  pstore<int>((int *)ai, from);
386  to[0*stride] = ai[0];
387  to[1*stride] = ai[1];
388  to[2*stride] = ai[2];
389  to[3*stride] = ai[3];
390 }
391 
392 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
393 {
394  double EIGEN_ALIGN16 af[2];
395  pstore<double>(af, from);
396  to[0*stride] = af[0];
397  to[1*stride] = af[1];
398 }
399 
400 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
401 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
402 
403 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
404 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
405 
406 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
407 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
408 
409 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
410 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
411 
412 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
413 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
414 
415 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
416 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
417 
418 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
419 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
420 
421 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
422 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
423 
424 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
425 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
426 
427 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
428 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
429 
430 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
431 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
432 
433 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
434 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
435 
436 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
437 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
438 
439 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
440 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
441 
442 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
443 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
444 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
445 
446 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
447 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
448 
449 
450 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
451 {
452  Packet4i p = pload<Packet4i>(from);
453  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
454 }
455 
456 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
457 {
458  Packet2d p = pload<Packet2d>(from);
459  return vec_perm(p, p, p16uc_PSET64_HI);
460 }
461 
462 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
463 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
464 
465 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
466 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
467 
468 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
469 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
470 
471 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
472 {
473  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
474 }
475 
476 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
477 {
478  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
479 }
480 
481 template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
482 template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
483 
484 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
485 {
486  Packet4i b, sum;
487  b = vec_sld(a, a, 8);
488  sum = padd<Packet4i>(a, b);
489  b = vec_sld(sum, sum, 4);
490  sum = padd<Packet4i>(sum, b);
491  return pfirst(sum);
492 }
493 
494 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
495 {
496  Packet2d b, sum;
497  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
498  sum = padd<Packet2d>(a, b);
499  return pfirst(sum);
500 }
501 
502 // Other reduction functions:
503 // mul
504 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
505 {
506  EIGEN_ALIGN16 int aux[4];
507  pstore(aux, a);
508  return aux[0] * aux[1] * aux[2] * aux[3];
509 }
510 
511 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
512 {
513  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
514 }
515 
516 // min
517 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
518 {
519  Packet4i b, res;
520  b = pmin<Packet4i>(a, vec_sld(a, a, 8));
521  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
522  return pfirst(res);
523 }
524 
525 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
526 {
527  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
528 }
529 
530 // max
531 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
532 {
533  Packet4i b, res;
534  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
535  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
536  return pfirst(res);
537 }
538 
539 // max
540 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
541 {
542  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
543 }
544 
545 EIGEN_DEVICE_FUNC inline void
546 ptranspose(PacketBlock<Packet4i,4>& kernel) {
547  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
548  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
549  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
550  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
551  kernel.packet[0] = vec_mergeh(t0, t2);
552  kernel.packet[1] = vec_mergel(t0, t2);
553  kernel.packet[2] = vec_mergeh(t1, t3);
554  kernel.packet[3] = vec_mergel(t1, t3);
555 }
556 
557 EIGEN_DEVICE_FUNC inline void
558 ptranspose(PacketBlock<Packet2d,2>& kernel) {
559  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
560  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
561  kernel.packet[0] = t0;
562  kernel.packet[1] = t1;
563 }
564 
565 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
566  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
567  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
568  return vec_sel(elsePacket, thenPacket, mask);
569 }
570 
571 
572 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
573  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
574  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
575  return vec_sel(elsePacket, thenPacket, mask);
576 }
577 
578 /* z13 has no vector float support so we emulate that with double
579  z14 has proper vector float support.
580 */
581 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
582 /* Helper function to simulate a vec_splat_packet4f
583  */
584 template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
585 {
586  Packet4f splat;
587  switch (element) {
588  case 0:
589  splat.v4f[0] = vec_splat(from.v4f[0], 0);
590  splat.v4f[1] = splat.v4f[0];
591  break;
592  case 1:
593  splat.v4f[0] = vec_splat(from.v4f[0], 1);
594  splat.v4f[1] = splat.v4f[0];
595  break;
596  case 2:
597  splat.v4f[0] = vec_splat(from.v4f[1], 0);
598  splat.v4f[1] = splat.v4f[0];
599  break;
600  case 3:
601  splat.v4f[0] = vec_splat(from.v4f[1], 1);
602  splat.v4f[1] = splat.v4f[0];
603  break;
604  }
605  return splat;
606 }
607 
608 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
609 {
610  // FIXME: No intrinsic yet
611  EIGEN_DEBUG_ALIGNED_LOAD
612  Packet4f vfrom;
613  vfrom.v4f[0] = vec_ld2f(&from[0]);
614  vfrom.v4f[1] = vec_ld2f(&from[2]);
615  return vfrom;
616 }
617 
618 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
619 {
620  // FIXME: No intrinsic yet
621  EIGEN_DEBUG_ALIGNED_STORE
622  vec_st2f(from.v4f[0], &to[0]);
623  vec_st2f(from.v4f[1], &to[2]);
624 }
625 
626 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
627 {
628  Packet4f to;
629  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
630  to.v4f[1] = to.v4f[0];
631  return to;
632 }
633 
634 template<> EIGEN_STRONG_INLINE void
635 pbroadcast4<Packet4f>(const float *a,
636  Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
637 {
638  a3 = pload<Packet4f>(a);
639  a0 = vec_splat_packet4f<0>(a3);
640  a1 = vec_splat_packet4f<1>(a3);
641  a2 = vec_splat_packet4f<2>(a3);
642  a3 = vec_splat_packet4f<3>(a3);
643 }
644 
645 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
646 {
647  float EIGEN_ALIGN16 ai[4];
648  ai[0] = from[0*stride];
649  ai[1] = from[1*stride];
650  ai[2] = from[2*stride];
651  ai[3] = from[3*stride];
652  return pload<Packet4f>(ai);
653 }
654 
655 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
656 {
657  float EIGEN_ALIGN16 ai[4];
658  pstore<float>((float *)ai, from);
659  to[0*stride] = ai[0];
660  to[1*stride] = ai[1];
661  to[2*stride] = ai[2];
662  to[3*stride] = ai[3];
663 }
664 
665 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
666 {
667  Packet4f c;
668  c.v4f[0] = a.v4f[0] + b.v4f[0];
669  c.v4f[1] = a.v4f[1] + b.v4f[1];
670  return c;
671 }
672 
673 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
674 {
675  Packet4f c;
676  c.v4f[0] = a.v4f[0] - b.v4f[0];
677  c.v4f[1] = a.v4f[1] - b.v4f[1];
678  return c;
679 }
680 
681 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
682 {
683  Packet4f c;
684  c.v4f[0] = a.v4f[0] * b.v4f[0];
685  c.v4f[1] = a.v4f[1] * b.v4f[1];
686  return c;
687 }
688 
689 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
690 {
691  Packet4f c;
692  c.v4f[0] = a.v4f[0] / b.v4f[0];
693  c.v4f[1] = a.v4f[1] / b.v4f[1];
694  return c;
695 }
696 
697 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
698 {
699  Packet4f c;
700  c.v4f[0] = -a.v4f[0];
701  c.v4f[1] = -a.v4f[1];
702  return c;
703 }
704 
705 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
706 {
707  Packet4f res;
708  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
709  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
710  return res;
711 }
712 
713 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
714 {
715  Packet4f res;
716  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
717  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
718  return res;
719 }
720 
721 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
722 {
723  Packet4f res;
724  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
725  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
726  return res;
727 }
728 
729 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
730 {
731  Packet4f res;
732  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
733  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
734  return res;
735 }
736 
737 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
738 {
739  Packet4f res;
740  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
741  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
742  return res;
743 }
744 
745 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
746 {
747  Packet4f res;
748  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
749  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
750  return res;
751 }
752 
753 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
754 {
755  Packet4f res;
756  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
757  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
758  return res;
759 }
760 
761 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
762 {
763  Packet4f res;
764  res.v4f[0] = vec_round(a.v4f[0]);
765  res.v4f[1] = vec_round(a.v4f[1]);
766  return res;
767 }
768 
769 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
770 {
771  Packet4f res;
772  res.v4f[0] = vec_ceil(a.v4f[0]);
773  res.v4f[1] = vec_ceil(a.v4f[1]);
774  return res;
775 }
776 
777 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
778 {
779  Packet4f res;
780  res.v4f[0] = vec_floor(a.v4f[0]);
781  res.v4f[1] = vec_floor(a.v4f[1]);
782  return res;
783 }
784 
785 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
786 {
787  Packet4f p = pload<Packet4f>(from);
788  p.v4f[1] = vec_splat(p.v4f[0], 1);
789  p.v4f[0] = vec_splat(p.v4f[0], 0);
790  return p;
791 }
792 
793 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
794 
795 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
796 {
797  Packet4f rev;
798  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
799  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
800  return rev;
801 }
802 
803 template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
804 {
805  Packet4f res;
806  res.v4f[0] = pabs(a.v4f[0]);
807  res.v4f[1] = pabs(a.v4f[1]);
808  return res;
809 }
810 
811 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
812 {
813  Packet2d sum;
814  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
815  double first = predux<Packet2d>(sum);
816  return static_cast<float>(first);
817 }
818 
819 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
820 {
821  // Return predux_mul<Packet2d> of the subvectors product
822  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
823 }
824 
825 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
826 {
827  Packet2d b, res;
828  b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
829  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
830  return static_cast<float>(pfirst(res));
831 }
832 
833 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
834 {
835  Packet2d b, res;
836  b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
837  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
838  return static_cast<float>(pfirst(res));
839 }
840 
841 /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
842  */
843 EIGEN_DEVICE_FUNC inline void
844 ptranspose(PacketBlock<Packet4f,4>& kernel) {
845  PacketBlock<Packet2d,2> t0,t1,t2,t3;
846  // copy top-left 2x2 Packet2d block
847  t0.packet[0] = kernel.packet[0].v4f[0];
848  t0.packet[1] = kernel.packet[1].v4f[0];
849 
850  // copy top-right 2x2 Packet2d block
851  t1.packet[0] = kernel.packet[0].v4f[1];
852  t1.packet[1] = kernel.packet[1].v4f[1];
853 
854  // copy bottom-left 2x2 Packet2d block
855  t2.packet[0] = kernel.packet[2].v4f[0];
856  t2.packet[1] = kernel.packet[3].v4f[0];
857 
858  // copy bottom-right 2x2 Packet2d block
859  t3.packet[0] = kernel.packet[2].v4f[1];
860  t3.packet[1] = kernel.packet[3].v4f[1];
861 
862  // Transpose all 2x2 blocks
863  ptranspose(t0);
864  ptranspose(t1);
865  ptranspose(t2);
866  ptranspose(t3);
867 
868  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
869  kernel.packet[0].v4f[0] = t0.packet[0];
870  kernel.packet[0].v4f[1] = t2.packet[0];
871  kernel.packet[1].v4f[0] = t0.packet[1];
872  kernel.packet[1].v4f[1] = t2.packet[1];
873  kernel.packet[2].v4f[0] = t1.packet[0];
874  kernel.packet[2].v4f[1] = t3.packet[0];
875  kernel.packet[3].v4f[0] = t1.packet[1];
876  kernel.packet[3].v4f[1] = t3.packet[1];
877 }
878 
879 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
880  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
881  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
882  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
883  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
884  Packet4f result;
885  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
886  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
887  return result;
888 }
889 
890 template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
891 {
892  Packet4f res;
893  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
894  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
895  return res;
896 }
897 
898 template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
899 {
900  Packet4f res;
901  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
902  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
903  return res;
904 }
905 
906 template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
907 {
908  Packet4f res;
909  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
910  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
911  return res;
912 }
913 
914 #else
915 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
916 {
917  // FIXME: No intrinsic yet
918  EIGEN_DEBUG_ALIGNED_LOAD
919  Packet *vfrom;
920  vfrom = (Packet *) from;
921  return vfrom->v4f;
922 }
923 
924 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
925 {
926  // FIXME: No intrinsic yet
927  EIGEN_DEBUG_ALIGNED_STORE
928  Packet *vto;
929  vto = (Packet *) to;
930  vto->v4f = from;
931 }
932 
933 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
934 {
935  return vec_splats(from);
936 }
937 
938 template<> EIGEN_STRONG_INLINE void
939 pbroadcast4<Packet4f>(const float *a,
940  Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
941 {
942  a3 = pload<Packet4f>(a);
943  a0 = vec_splat(a3, 0);
944  a1 = vec_splat(a3, 1);
945  a2 = vec_splat(a3, 2);
946  a3 = vec_splat(a3, 3);
947 }
948 
949 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
950 {
951  float EIGEN_ALIGN16 af[4];
952  af[0] = from[0*stride];
953  af[1] = from[1*stride];
954  af[2] = from[2*stride];
955  af[3] = from[3*stride];
956  return pload<Packet4f>(af);
957 }
958 
959 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
960 {
961  float EIGEN_ALIGN16 af[4];
962  pstore<float>((float*)af, from);
963  to[0*stride] = af[0];
964  to[1*stride] = af[1];
965  to[2*stride] = af[2];
966  to[3*stride] = af[3];
967 }
968 
969 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
970 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
971 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
972 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
973 template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
974 template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f> (const Packet4f& a) { return a; }
975 template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f> (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
976 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
977 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
978 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
979 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
980 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
981 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
982 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
983 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
984 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
985 template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
986 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
987 
988 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
989 {
990  Packet4f p = pload<Packet4f>(from);
991  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
992 }
993 
994 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
995 {
996  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
997 }
998 
999 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
1000 {
1001  Packet4f b, sum;
1002  b = vec_sld(a, a, 8);
1003  sum = padd<Packet4f>(a, b);
1004  b = vec_sld(sum, sum, 4);
1005  sum = padd<Packet4f>(sum, b);
1006  return pfirst(sum);
1007 }
1008 
1009 // Other reduction functions:
1010 // mul
1011 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1012 {
1013  Packet4f prod;
1014  prod = pmul(a, vec_sld(a, a, 8));
1015  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
1016 }
1017 
1018 // min
1019 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1020 {
1021  Packet4f b, res;
1022  b = pmin<Packet4f>(a, vec_sld(a, a, 8));
1023  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
1024  return pfirst(res);
1025 }
1026 
1027 // max
1028 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1029 {
1030  Packet4f b, res;
1031  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
1032  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
1033  return pfirst(res);
1034 }
1035 
1036 EIGEN_DEVICE_FUNC inline void
1037 ptranspose(PacketBlock<Packet4f,4>& kernel) {
1038  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
1039  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
1040  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
1041  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
1042  kernel.packet[0] = vec_mergeh(t0, t2);
1043  kernel.packet[1] = vec_mergel(t0, t2);
1044  kernel.packet[2] = vec_mergeh(t1, t3);
1045  kernel.packet[3] = vec_mergel(t1, t3);
1046 }
1047 
1048 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
1049  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
1050  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
1051  return vec_sel(elsePacket, thenPacket, mask);
1052 }
1053 
1054 #endif
1055 
1056 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
1057 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
1058 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
1059 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f> (const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
1060 
1061 } // end namespace internal
1062 
1063 } // end namespace Eigen
1064 
1065 #endif // EIGEN_PACKET_MATH_ZVECTOR_H
Eigen
Namespace containing all symbols from the Eigen library.
Definition: LDLT.h:16
Eigen::internal::Packet
Definition: ZVector/PacketMath.h:53
Eigen::Aligned16
@ Aligned16
Definition: Constants.h:234
Eigen::Index
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:42