#ifndef dealii__vectorization_h
#define dealii__vectorization_h

#include <deal.II/base/config.h>
#include <deal.II/base/exceptions.h>
#include <deal.II/base/template_constraints.h>

#include <algorithm>
#include <cmath>

#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 // AVX, AVX-512
#include <immintrin.h>
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL == 1 // SSE2
#include <emmintrin.h>
#endif


DEAL_II_NAMESPACE_OPEN
// forward declaration
template <typename Number>
class VectorizedArray;



#ifndef DEAL_II_WITH_CXX11
// Specify the types of multiplications between scalars and vectorized
// arrays explicitly when C++11 is not available
template <typename Number>
struct ProductType<Number, VectorizedArray<Number> >
{
  typedef VectorizedArray<Number> type;
};

template <typename Number>
struct ProductType<VectorizedArray<Number>, Number>
{
  typedef VectorizedArray<Number> type;
};

// In analogy to the mixed scalar operators further down, the product of a
// double scalar and an array of floats is again an array of floats
template <>
struct ProductType<double, VectorizedArray<float> >
{
  typedef VectorizedArray<float> type;
};

template <>
struct ProductType<VectorizedArray<float>, double>
{
  typedef VectorizedArray<float> type;
};
#endif
/**
 * Generic implementation of VectorizedArray: holds a single element and is
 * used whenever no hardware specialization below matches.
 */
template <typename Number>
class VectorizedArray
{
public:
  // number of elements stored in one instance (one in the generic case)
  static const unsigned int n_array_elements = 1;

  // broadcast a scalar to the array
  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const Number scalar)
  {
    data = scalar;
    return *this;
  }

  // read-write access to the stored element
  DEAL_II_ALWAYS_INLINE
  Number &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 1);
    return data;
  }

  // read-only access to the stored element
  DEAL_II_ALWAYS_INLINE
  const Number &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 1);
    return data;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
    data += vec.data;
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
    data -= vec.data;
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
    data *= vec.data;
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
    data /= vec.data;
    return *this;
  }

  // load n_array_elements values from memory
  DEAL_II_ALWAYS_INLINE
  void load (const Number *ptr)
  {
    data = *ptr;
  }

  // store n_array_elements values to memory
  DEAL_II_ALWAYS_INLINE
  void store (Number *ptr) const
  {
    *ptr = data;
  }

  // load the elements base_ptr[offsets[v]] into the array
  DEAL_II_ALWAYS_INLINE
  void gather (const Number *base_ptr,
               const unsigned int *offsets)
  {
    data = base_ptr[offsets[0]];
  }

  // store the array elements to base_ptr[offsets[v]]
  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                Number *base_ptr) const
  {
    base_ptr[offsets[0]] = data;
  }

  // the actual data field
  Number data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = std::sqrt(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    VectorizedArray res;
    res.data = std::fabs(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = std::max (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = std::min (data, other.data);
    return res;
  }
};
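

// A minimal usage sketch of the interface above (illustration only, not part
// of the header; the variable names are hypothetical). The same lines compile
// unchanged against the SIMD specializations further down, which is the point
// of the common interface:
//
//   VectorizedArray<double> a, b;
//   a = 3.0;                                         // broadcast to all lanes
//   b = 4.0;
//   a += b;                                          // element-wise addition
//   const double first = a[0];                       // per-lane access
//   const VectorizedArray<double> r = a.get_sqrt();  // element-wise sqrt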
/**
 * Create a vectorized array with all entries set to the given scalar.
 */
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
make_vectorized_array (const Number &u)
{
  VectorizedArray<Number> result;
  result = u;
  return result;
}
/**
 * Load n_array_elements data streams from @p in, starting at the offset
 * given for each stream, and transpose them into the array of vectorized
 * data @p out. Generic fallback: one scalar copy per element.
 */
template <typename Number>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const Number *in,
                              const unsigned int *offsets,
                              VectorizedArray<Number> *out)
{
  for (unsigned int i=0; i<n_entries; ++i)
    for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
      out[i][v] = in[offsets[v]+i];
}
/**
 * The counterpart of vectorized_load_and_transpose: transpose the array of
 * vectorized data @p in back into n_array_elements data streams in @p out.
 * If @p add_into is true, the results are added to rather than stored in
 * the output array.
 */
template <typename Number>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<Number> *in,
                               const unsigned int *offsets,
                               Number *out)
{
  if (add_into)
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=0; i<n_entries; ++i)
      for (unsigned int v=0; v<VectorizedArray<Number>::n_array_elements; ++v)
        out[offsets[v]+i] = in[i][v];
}
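

// Worked example of the transpose semantics (a sketch assuming
// n_array_elements == 4; 'in', 'offsets', and 'out' are hypothetical names):
// with offsets = {0, 8, 16, 24} and n_entries = 2, the call
//
//   vectorized_load_and_transpose(2, in, offsets, out);
//
// fills out[0] = {in[0], in[8], in[16], in[24]} and
//       out[1] = {in[1], in[9], in[17], in[25]},
// i.e., entry i of lane v comes from in[offsets[v]+i].
// vectorized_transpose_and_store performs the inverse copy, and accumulates
// into 'out' instead of overwriting it when add_into is true.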
#if DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 3 && defined(__AVX512F__)

/**
 * Specialization of VectorizedArray for double and AVX-512.
 */
template <>
class VectorizedArray<double>
{
public:
  static const unsigned int n_array_elements = 8;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm512_set1_pd(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
    // if the compiler supports vector arithmetics, apply the operator
    // directly on the data field; otherwise, call the intrinsic
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm512_add_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm512_sub_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm512_mul_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm512_div_pd(data,vec.data);
#endif
    return *this;
  }

  // load 8 doubles from ptr (which need not be aligned)
  DEAL_II_ALWAYS_INLINE
  void load (const double *ptr)
  {
    data = _mm512_loadu_pd (ptr);
  }

  // store 8 doubles to ptr (which need not be aligned)
  DEAL_II_ALWAYS_INLINE
  void store (double *ptr) const
  {
    _mm512_storeu_pd (ptr, data);
  }

  // load the elements base_ptr[offsets[0..7]] into the array
  DEAL_II_ALWAYS_INLINE
  void gather (const double *base_ptr,
               const unsigned int *offsets)
  {
    // bring the 32-bit offsets into a 256-bit register via a float load
    // and reinterpret the register as integers for the gather
    const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
    const __m256i index = *((__m256i *)(&index_val));
    data = _mm512_i32gather_pd(index, base_ptr, 8);
  }

  // store the array elements to base_ptr[offsets[0..7]]
  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                double *base_ptr) const
  {
    for (unsigned int i=0; i<8; ++i)
      for (unsigned int j=i+1; j<8; ++j)
        Assert(offsets[i] != offsets[j],
               ExcMessage("Result of scatter undefined if two offset elements"
                          " point to the same position"));

    const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
    const __m256i index = *((__m256i *)(&index_val));
    _mm512_i32scatter_pd(base_ptr, index, data, 8);
  }

  // the actual data field
  __m512d data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm512_sqrt_pd(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    // compute the absolute value by a bitwise andnot with -0. (clearing
    // the sign bit); AVX-512 only provides the integer variant
    __m512d mask = _mm512_set1_pd (-0.);
    VectorizedArray res;
    res.data = (__m512d)_mm512_andnot_epi64 ((__m512i)mask, (__m512i)data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_max_pd (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_min_pd (data, other.data);
    return res;
  }
};
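

// Note on the index trick used in gather()/scatter() above: the eight 32-bit
// offsets are brought into a 256-bit register via a float load, and the
// register is then reinterpreted as integers for the gather/scatter
// instruction. A scalar sketch of what gather() computes (hypothetical
// names, for illustration only):
//
//   for (unsigned int v=0; v<8; ++v)
//     reinterpret_cast<double *>(&data)[v] = base_ptr[offsets[v]];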
/**
 * Specialization of vectorized_load_and_transpose for double and AVX-512.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const double *in,
                              const unsigned int *offsets,
                              VectorizedArray<double> *out)
{
  const unsigned int n_chunks = n_entries/4;
  // work on the two 256-bit halves of the 512-bit registers separately
  for (unsigned int outer=0; outer<8; outer += 4)
    {
      const double *in0 = in + offsets[0+outer];
      const double *in1 = in + offsets[1+outer];
      const double *in2 = in + offsets[2+outer];
      const double *in3 = in + offsets[3+outer];

      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m256d u0 = _mm256_loadu_pd(in0+4*i);
          __m256d u1 = _mm256_loadu_pd(in1+4*i);
          __m256d u2 = _mm256_loadu_pd(in2+4*i);
          __m256d u3 = _mm256_loadu_pd(in3+4*i);
          __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
          __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
          __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
          __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
          *(__m256d *)((double *)(&out[4*i+0].data)+outer) = _mm256_unpacklo_pd (t0, t1);
          *(__m256d *)((double *)(&out[4*i+1].data)+outer) = _mm256_unpackhi_pd (t0, t1);
          *(__m256d *)((double *)(&out[4*i+2].data)+outer) = _mm256_unpacklo_pd (t2, t3);
          *(__m256d *)((double *)(&out[4*i+3].data)+outer) = _mm256_unpackhi_pd (t2, t3);
        }
      for (unsigned int i=4*n_chunks; i<n_entries; ++i)
        for (unsigned int v=0; v<4; ++v)
          out[i][outer+v] = in[offsets[v+outer]+i];
    }
}
/**
 * Specialization of vectorized_transpose_and_store for double and AVX-512.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<double> *in,
                               const unsigned int *offsets,
                               double *out)
{
  const unsigned int n_chunks = n_entries/4;
  // as in the load, work on the two 256-bit halves separately
  for (unsigned int outer=0; outer<8; outer += 4)
    {
      double *out0 = out + offsets[0+outer];
      double *out1 = out + offsets[1+outer];
      double *out2 = out + offsets[2+outer];
      double *out3 = out + offsets[3+outer];
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m256d u0 = *(const __m256d *)((const double *)(&in[4*i+0].data)+outer);
          __m256d u1 = *(const __m256d *)((const double *)(&in[4*i+1].data)+outer);
          __m256d u2 = *(const __m256d *)((const double *)(&in[4*i+2].data)+outer);
          __m256d u3 = *(const __m256d *)((const double *)(&in[4*i+3].data)+outer);
          __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
          __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
          __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
          __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
          __m256d res0 = _mm256_unpacklo_pd (t0, t1);
          __m256d res1 = _mm256_unpackhi_pd (t0, t1);
          __m256d res2 = _mm256_unpacklo_pd (t2, t3);
          __m256d res3 = _mm256_unpackhi_pd (t2, t3);

          // the add_into path must read the current content of 'out'
          // first, so the two branches cannot share the stores
          if (add_into)
            {
              res0 = _mm256_add_pd(_mm256_loadu_pd(out0+4*i), res0);
              _mm256_storeu_pd(out0+4*i, res0);
              res1 = _mm256_add_pd(_mm256_loadu_pd(out1+4*i), res1);
              _mm256_storeu_pd(out1+4*i, res1);
              res2 = _mm256_add_pd(_mm256_loadu_pd(out2+4*i), res2);
              _mm256_storeu_pd(out2+4*i, res2);
              res3 = _mm256_add_pd(_mm256_loadu_pd(out3+4*i), res3);
              _mm256_storeu_pd(out3+4*i, res3);
            }
          else
            {
              _mm256_storeu_pd(out0+4*i, res0);
              _mm256_storeu_pd(out1+4*i, res1);
              _mm256_storeu_pd(out2+4*i, res2);
              _mm256_storeu_pd(out3+4*i, res3);
            }
        }
      if (add_into)
        for (unsigned int i=4*n_chunks; i<n_entries; ++i)
          for (unsigned int v=0; v<4; ++v)
            out[offsets[v+outer]+i] += in[i][v+outer];
      else
        for (unsigned int i=4*n_chunks; i<n_entries; ++i)
          for (unsigned int v=0; v<4; ++v)
            out[offsets[v+outer]+i] = in[i][v+outer];
    }
}
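

// The 4x4 double transpose kernel above works in two stages (illustration):
// _mm256_permute2f128_pd with masks 0x20/0x31 exchanges the 128-bit halves
// of two registers, and unpacklo/unpackhi then interleave the 64-bit
// entries. For rows a = (a0 a1 a2 a3) and c = (c0 c1 c2 c3):
//
//   t0 = permute2f128(a, c, 0x20) = (a0 a1 c0 c1)
//   t2 = permute2f128(a, c, 0x31) = (a2 a3 c2 c3)
//
// so that unpacklo(t0, t1) = (a0 b0 c0 d0) collects one transposed column,
// where t1 is built analogously from rows b and d.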
/**
 * Specialization of VectorizedArray for float and AVX-512.
 */
template <>
class VectorizedArray<float>
{
public:
  static const unsigned int n_array_elements = 16;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm512_set1_ps(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 16);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 16);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm512_add_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm512_sub_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm512_mul_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm512_div_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  void load (const float *ptr)
  {
    data = _mm512_loadu_ps (ptr);
  }

  DEAL_II_ALWAYS_INLINE
  void store (float *ptr) const
  {
    _mm512_storeu_ps (ptr, data);
  }

  DEAL_II_ALWAYS_INLINE
  void gather (const float *base_ptr,
               const unsigned int *offsets)
  {
    const __m512 index_val = _mm512_loadu_ps((const float *)offsets);
    const __m512i index = *((__m512i *)(&index_val));
    data = _mm512_i32gather_ps(index, base_ptr, 4);
  }

  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                float *base_ptr) const
  {
    for (unsigned int i=0; i<16; ++i)
      for (unsigned int j=i+1; j<16; ++j)
        Assert(offsets[i] != offsets[j],
               ExcMessage("Result of scatter undefined if two offset elements"
                          " point to the same position"));

    const __m512 index_val = _mm512_loadu_ps((const float *)offsets);
    const __m512i index = *((__m512i *)(&index_val));
    _mm512_i32scatter_ps(base_ptr, index, data, 4);
  }

  // the actual data field
  __m512 data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm512_sqrt_ps(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    __m512 mask = _mm512_set1_ps (-0.f);
    VectorizedArray res;
    res.data = (__m512)_mm512_andnot_epi32 ((__m512i)mask, (__m512i)data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_max_ps (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm512_min_ps (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for float and AVX-512.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const float *in,
                              const unsigned int *offsets,
                              VectorizedArray<float> *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int outer = 0; outer<16; outer += 8)
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0+outer]);
          __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1+outer]);
          __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2+outer]);
          __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3+outer]);
          __m128 u4 = _mm_loadu_ps(in+4*i+offsets[4+outer]);
          __m128 u5 = _mm_loadu_ps(in+4*i+offsets[5+outer]);
          __m128 u6 = _mm_loadu_ps(in+4*i+offsets[6+outer]);
          __m128 u7 = _mm_loadu_ps(in+4*i+offsets[7+outer]);
          // t3 is initialized first to avoid an uninitialized-variable
          // warning; the insertf128 calls then fill all registers
          __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
          t0 = _mm256_insertf128_ps (t3, u0, 0);
          t0 = _mm256_insertf128_ps (t0, u4, 1);
          t1 = _mm256_insertf128_ps (t3, u1, 0);
          t1 = _mm256_insertf128_ps (t1, u5, 1);
          t2 = _mm256_insertf128_ps (t3, u2, 0);
          t2 = _mm256_insertf128_ps (t2, u6, 1);
          t3 = _mm256_insertf128_ps (t3, u3, 0);
          t3 = _mm256_insertf128_ps (t3, u7, 1);
          __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
          __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
          __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
          __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
          *(__m256 *)((float *)(&out[4*i+0].data)+outer) = _mm256_shuffle_ps (v0, v2, 0x88);
          *(__m256 *)((float *)(&out[4*i+1].data)+outer) = _mm256_shuffle_ps (v0, v2, 0xdd);
          *(__m256 *)((float *)(&out[4*i+2].data)+outer) = _mm256_shuffle_ps (v1, v3, 0x88);
          *(__m256 *)((float *)(&out[4*i+3].data)+outer) = _mm256_shuffle_ps (v1, v3, 0xdd);
        }
      for (unsigned int i=4*n_chunks; i<n_entries; ++i)
        for (unsigned int v=0; v<8; ++v)
          out[i][v+outer] = in[offsets[v+outer]+i];
    }
}
/**
 * Specialization of vectorized_transpose_and_store for float and AVX-512.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<float> *in,
                               const unsigned int *offsets,
                               float *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int outer = 0; outer<16; outer += 8)
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m256 u0 = *(const __m256 *)((const float *)(&in[4*i+0].data)+outer);
          __m256 u1 = *(const __m256 *)((const float *)(&in[4*i+1].data)+outer);
          __m256 u2 = *(const __m256 *)((const float *)(&in[4*i+2].data)+outer);
          __m256 u3 = *(const __m256 *)((const float *)(&in[4*i+3].data)+outer);
          __m256 t0 = _mm256_shuffle_ps (u0, u1, 0x44);
          __m256 t1 = _mm256_shuffle_ps (u0, u1, 0xee);
          __m256 t2 = _mm256_shuffle_ps (u2, u3, 0x44);
          __m256 t3 = _mm256_shuffle_ps (u2, u3, 0xee);
          u0 = _mm256_shuffle_ps (t0, t2, 0x88);
          u1 = _mm256_shuffle_ps (t0, t2, 0xdd);
          u2 = _mm256_shuffle_ps (t1, t3, 0x88);
          u3 = _mm256_shuffle_ps (t1, t3, 0xdd);
          __m128 res0 = _mm256_extractf128_ps (u0, 0);
          __m128 res4 = _mm256_extractf128_ps (u0, 1);
          __m128 res1 = _mm256_extractf128_ps (u1, 0);
          __m128 res5 = _mm256_extractf128_ps (u1, 1);
          __m128 res2 = _mm256_extractf128_ps (u2, 0);
          __m128 res6 = _mm256_extractf128_ps (u2, 1);
          __m128 res3 = _mm256_extractf128_ps (u3, 0);
          __m128 res7 = _mm256_extractf128_ps (u3, 1);

          if (add_into)
            {
              res0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0+outer]), res0);
              _mm_storeu_ps(out+4*i+offsets[0+outer], res0);
              res1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1+outer]), res1);
              _mm_storeu_ps(out+4*i+offsets[1+outer], res1);
              res2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2+outer]), res2);
              _mm_storeu_ps(out+4*i+offsets[2+outer], res2);
              res3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3+outer]), res3);
              _mm_storeu_ps(out+4*i+offsets[3+outer], res3);
              res4 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[4+outer]), res4);
              _mm_storeu_ps(out+4*i+offsets[4+outer], res4);
              res5 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[5+outer]), res5);
              _mm_storeu_ps(out+4*i+offsets[5+outer], res5);
              res6 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[6+outer]), res6);
              _mm_storeu_ps(out+4*i+offsets[6+outer], res6);
              res7 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[7+outer]), res7);
              _mm_storeu_ps(out+4*i+offsets[7+outer], res7);
            }
          else
            {
              _mm_storeu_ps(out+4*i+offsets[0+outer], res0);
              _mm_storeu_ps(out+4*i+offsets[1+outer], res1);
              _mm_storeu_ps(out+4*i+offsets[2+outer], res2);
              _mm_storeu_ps(out+4*i+offsets[3+outer], res3);
              _mm_storeu_ps(out+4*i+offsets[4+outer], res4);
              _mm_storeu_ps(out+4*i+offsets[5+outer], res5);
              _mm_storeu_ps(out+4*i+offsets[6+outer], res6);
              _mm_storeu_ps(out+4*i+offsets[7+outer], res7);
            }
        }
      if (add_into)
        for (unsigned int i=4*n_chunks; i<n_entries; ++i)
          for (unsigned int v=0; v<8; ++v)
            out[offsets[v+outer]+i] += in[i][v+outer];
      else
        for (unsigned int i=4*n_chunks; i<n_entries; ++i)
          for (unsigned int v=0; v<8; ++v)
            out[offsets[v+outer]+i] = in[i][v+outer];
    }
}
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 2 && defined(__AVX__)

/**
 * Specialization of VectorizedArray for double and AVX.
 */
template <>
class VectorizedArray<double>
{
public:
  static const unsigned int n_array_elements = 4;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm256_set1_pd(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm256_add_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm256_sub_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm256_mul_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm256_div_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  void load (const double *ptr)
  {
    data = _mm256_loadu_pd (ptr);
  }

  DEAL_II_ALWAYS_INLINE
  void store (double *ptr) const
  {
    _mm256_storeu_pd (ptr, data);
  }

  DEAL_II_ALWAYS_INLINE
  void gather (const double *base_ptr,
               const unsigned int *offsets)
  {
#ifdef __AVX2__
    // bring the 32-bit offsets into a 128-bit register via a float load
    // and reinterpret the register as integers for the gather
    const __m128 index_val = _mm_loadu_ps((const float *)offsets);
    const __m128i index = *((__m128i *)(&index_val));
    data = _mm256_i32gather_pd(base_ptr, index, 8);
#else
    for (unsigned int i=0; i<4; ++i)
      *(reinterpret_cast<double *>(&data)+i) = base_ptr[offsets[i]];
#endif
  }

  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                double *base_ptr) const
  {
    for (unsigned int i=0; i<4; ++i)
      for (unsigned int j=i+1; j<4; ++j)
        Assert(offsets[i] != offsets[j],
               ExcMessage("Result of scatter undefined if two offset elements"
                          " point to the same position"));

    // no scatter instruction before AVX-512: store element by element
    for (unsigned int i=0; i<4; ++i)
      base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data)+i);
  }

  // the actual data field
  __m256d data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm256_sqrt_pd(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    __m256d mask = _mm256_set1_pd (-0.);
    VectorizedArray res;
    res.data = _mm256_andnot_pd(mask, data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_max_pd (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_min_pd (data, other.data);
    return res;
  }
};
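

// Without __AVX2__, gather() above falls back to the scalar loop, and
// scatter() is always element-by-element because no scatter instruction
// exists before AVX-512. Both paths compute the same thing (a sketch with
// hypothetical names, for illustration only):
//
//   for (unsigned int v=0; v<4; ++v)
//     lane[v] = base_ptr[offsets[v]];          // gather
//   for (unsigned int v=0; v<4; ++v)
//     base_ptr[offsets[v]] = lane[v];          // scatter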
/**
 * Specialization of vectorized_load_and_transpose for double and AVX.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const double *in,
                              const unsigned int *offsets,
                              VectorizedArray<double> *out)
{
  const unsigned int n_chunks = n_entries/4;
  const double *in0 = in + offsets[0];
  const double *in1 = in + offsets[1];
  const double *in2 = in + offsets[2];
  const double *in3 = in + offsets[3];

  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256d u0 = _mm256_loadu_pd(in0+4*i);
      __m256d u1 = _mm256_loadu_pd(in1+4*i);
      __m256d u2 = _mm256_loadu_pd(in2+4*i);
      __m256d u3 = _mm256_loadu_pd(in3+4*i);
      __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
      __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
      __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
      __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
      out[4*i+0].data = _mm256_unpacklo_pd (t0, t1);
      out[4*i+1].data = _mm256_unpackhi_pd (t0, t1);
      out[4*i+2].data = _mm256_unpacklo_pd (t2, t3);
      out[4*i+3].data = _mm256_unpackhi_pd (t2, t3);
    }
  for (unsigned int i=4*n_chunks; i<n_entries; ++i)
    for (unsigned int v=0; v<4; ++v)
      out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for double and AVX.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<double> *in,
                               const unsigned int *offsets,
                               double *out)
{
  const unsigned int n_chunks = n_entries/4;
  double *out0 = out + offsets[0];
  double *out1 = out + offsets[1];
  double *out2 = out + offsets[2];
  double *out3 = out + offsets[3];
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256d u0 = in[4*i+0].data;
      __m256d u1 = in[4*i+1].data;
      __m256d u2 = in[4*i+2].data;
      __m256d u3 = in[4*i+3].data;
      __m256d t0 = _mm256_permute2f128_pd (u0, u2, 0x20);
      __m256d t1 = _mm256_permute2f128_pd (u1, u3, 0x20);
      __m256d t2 = _mm256_permute2f128_pd (u0, u2, 0x31);
      __m256d t3 = _mm256_permute2f128_pd (u1, u3, 0x31);
      __m256d res0 = _mm256_unpacklo_pd (t0, t1);
      __m256d res1 = _mm256_unpackhi_pd (t0, t1);
      __m256d res2 = _mm256_unpacklo_pd (t2, t3);
      __m256d res3 = _mm256_unpackhi_pd (t2, t3);

      if (add_into)
        {
          res0 = _mm256_add_pd(_mm256_loadu_pd(out0+4*i), res0);
          _mm256_storeu_pd(out0+4*i, res0);
          res1 = _mm256_add_pd(_mm256_loadu_pd(out1+4*i), res1);
          _mm256_storeu_pd(out1+4*i, res1);
          res2 = _mm256_add_pd(_mm256_loadu_pd(out2+4*i), res2);
          _mm256_storeu_pd(out2+4*i, res2);
          res3 = _mm256_add_pd(_mm256_loadu_pd(out3+4*i), res3);
          _mm256_storeu_pd(out3+4*i, res3);
        }
      else
        {
          _mm256_storeu_pd(out0+4*i, res0);
          _mm256_storeu_pd(out1+4*i, res1);
          _mm256_storeu_pd(out2+4*i, res2);
          _mm256_storeu_pd(out3+4*i, res3);
        }
    }
  if (add_into)
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] = in[i][v];
}
/**
 * Specialization of VectorizedArray for float and AVX.
 */
template <>
class VectorizedArray<float>
{
public:
  static const unsigned int n_array_elements = 8;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm256_set1_ps(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 8);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm256_add_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm256_sub_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm256_mul_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm256_div_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  void load (const float *ptr)
  {
    data = _mm256_loadu_ps (ptr);
  }

  DEAL_II_ALWAYS_INLINE
  void store (float *ptr) const
  {
    _mm256_storeu_ps (ptr, data);
  }

  DEAL_II_ALWAYS_INLINE
  void gather (const float *base_ptr,
               const unsigned int *offsets)
  {
#ifdef __AVX2__
    const __m256 index_val = _mm256_loadu_ps((const float *)offsets);
    const __m256i index = *((__m256i *)(&index_val));
    data = _mm256_i32gather_ps(base_ptr, index, 4);
#else
    for (unsigned int i=0; i<8; ++i)
      *(reinterpret_cast<float *>(&data)+i) = base_ptr[offsets[i]];
#endif
  }

  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                float *base_ptr) const
  {
    for (unsigned int i=0; i<8; ++i)
      for (unsigned int j=i+1; j<8; ++j)
        Assert(offsets[i] != offsets[j],
               ExcMessage("Result of scatter undefined if two offset elements"
                          " point to the same position"));

    // no scatter instruction before AVX-512: store element by element
    for (unsigned int i=0; i<8; ++i)
      base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data)+i);
  }

  // the actual data field
  __m256 data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm256_sqrt_ps(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    __m256 mask = _mm256_set1_ps (-0.f);
    VectorizedArray res;
    res.data = _mm256_andnot_ps(mask, data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_max_ps (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm256_min_ps (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for float and AVX.
 */
template <>
inline
void
vectorized_load_and_transpose(const unsigned int n_entries,
                              const float *in,
                              const unsigned int *offsets,
                              VectorizedArray<float> *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
      __m128 u4 = _mm_loadu_ps(in+4*i+offsets[4]);
      __m128 u5 = _mm_loadu_ps(in+4*i+offsets[5]);
      __m128 u6 = _mm_loadu_ps(in+4*i+offsets[6]);
      __m128 u7 = _mm_loadu_ps(in+4*i+offsets[7]);
      // t3 is initialized first to avoid an uninitialized-variable
      // warning; the insertf128 calls then fill all registers
      __m256 t0, t1, t2, t3 = _mm256_set1_ps(0.F);
      t0 = _mm256_insertf128_ps (t3, u0, 0);
      t0 = _mm256_insertf128_ps (t0, u4, 1);
      t1 = _mm256_insertf128_ps (t3, u1, 0);
      t1 = _mm256_insertf128_ps (t1, u5, 1);
      t2 = _mm256_insertf128_ps (t3, u2, 0);
      t2 = _mm256_insertf128_ps (t2, u6, 1);
      t3 = _mm256_insertf128_ps (t3, u3, 0);
      t3 = _mm256_insertf128_ps (t3, u7, 1);
      __m256 v0 = _mm256_shuffle_ps (t0, t1, 0x44);
      __m256 v1 = _mm256_shuffle_ps (t0, t1, 0xee);
      __m256 v2 = _mm256_shuffle_ps (t2, t3, 0x44);
      __m256 v3 = _mm256_shuffle_ps (t2, t3, 0xee);
      out[4*i+0].data = _mm256_shuffle_ps (v0, v2, 0x88);
      out[4*i+1].data = _mm256_shuffle_ps (v0, v2, 0xdd);
      out[4*i+2].data = _mm256_shuffle_ps (v1, v3, 0x88);
      out[4*i+3].data = _mm256_shuffle_ps (v1, v3, 0xdd);
    }
  for (unsigned int i=4*n_chunks; i<n_entries; ++i)
    for (unsigned int v=0; v<8; ++v)
      out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for float and AVX.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<float> *in,
                               const unsigned int *offsets,
                               float *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m256 u0 = in[4*i+0].data;
      __m256 u1 = in[4*i+1].data;
      __m256 u2 = in[4*i+2].data;
      __m256 u3 = in[4*i+3].data;
      __m256 t0 = _mm256_shuffle_ps (u0, u1, 0x44);
      __m256 t1 = _mm256_shuffle_ps (u0, u1, 0xee);
      __m256 t2 = _mm256_shuffle_ps (u2, u3, 0x44);
      __m256 t3 = _mm256_shuffle_ps (u2, u3, 0xee);
      u0 = _mm256_shuffle_ps (t0, t2, 0x88);
      u1 = _mm256_shuffle_ps (t0, t2, 0xdd);
      u2 = _mm256_shuffle_ps (t1, t3, 0x88);
      u3 = _mm256_shuffle_ps (t1, t3, 0xdd);
      __m128 res0 = _mm256_extractf128_ps (u0, 0);
      __m128 res4 = _mm256_extractf128_ps (u0, 1);
      __m128 res1 = _mm256_extractf128_ps (u1, 0);
      __m128 res5 = _mm256_extractf128_ps (u1, 1);
      __m128 res2 = _mm256_extractf128_ps (u2, 0);
      __m128 res6 = _mm256_extractf128_ps (u2, 1);
      __m128 res3 = _mm256_extractf128_ps (u3, 0);
      __m128 res7 = _mm256_extractf128_ps (u3, 1);

      if (add_into)
        {
          res0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), res0);
          _mm_storeu_ps(out+4*i+offsets[0], res0);
          res1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), res1);
          _mm_storeu_ps(out+4*i+offsets[1], res1);
          res2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), res2);
          _mm_storeu_ps(out+4*i+offsets[2], res2);
          res3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), res3);
          _mm_storeu_ps(out+4*i+offsets[3], res3);
          res4 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[4]), res4);
          _mm_storeu_ps(out+4*i+offsets[4], res4);
          res5 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[5]), res5);
          _mm_storeu_ps(out+4*i+offsets[5], res5);
          res6 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[6]), res6);
          _mm_storeu_ps(out+4*i+offsets[6], res6);
          res7 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[7]), res7);
          _mm_storeu_ps(out+4*i+offsets[7], res7);
        }
      else
        {
          _mm_storeu_ps(out+4*i+offsets[0], res0);
          _mm_storeu_ps(out+4*i+offsets[1], res1);
          _mm_storeu_ps(out+4*i+offsets[2], res2);
          _mm_storeu_ps(out+4*i+offsets[3], res3);
          _mm_storeu_ps(out+4*i+offsets[4], res4);
          _mm_storeu_ps(out+4*i+offsets[5], res5);
          _mm_storeu_ps(out+4*i+offsets[6], res6);
          _mm_storeu_ps(out+4*i+offsets[7], res7);
        }
    }
  if (add_into)
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<8; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<8; ++v)
        out[offsets[v]+i] = in[i][v];
}
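

// The 8-lane float transpose above follows a different recipe than the
// double case: the 128-bit quarters of the source rows are merged into
// 256-bit registers with insertf128, after which two rounds of shuffle_ps
// (masks 0x44/0xee, then 0x88/0xdd) interleave first 64-bit pairs and then
// single 32-bit entries, transposing two 4x4 blocks at once. For one
// 128-bit lane with t0 = (a0 a1 b0 b1) and t2 = (c0 c1 d0 d1):
//
//   shuffle_ps(t0, t2, 0x88) = (a0 b0 c0 d0)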
#elif DEAL_II_COMPILER_VECTORIZATION_LEVEL >= 1 && defined(__SSE2__)

/**
 * Specialization of VectorizedArray for double and SSE2.
 */
template <>
class VectorizedArray<double>
{
public:
  static const unsigned int n_array_elements = 2;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const double x)
  {
    data = _mm_set1_pd(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  double &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 2);
    return *(reinterpret_cast<double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const double &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 2);
    return *(reinterpret_cast<const double *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm_add_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm_sub_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm_mul_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm_div_pd(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  void load (const double *ptr)
  {
    data = _mm_loadu_pd (ptr);
  }

  DEAL_II_ALWAYS_INLINE
  void store (double *ptr) const
  {
    _mm_storeu_pd (ptr, data);
  }

  DEAL_II_ALWAYS_INLINE
  void gather (const double *base_ptr,
               const unsigned int *offsets)
  {
    for (unsigned int i=0; i<2; ++i)
      *(reinterpret_cast<double *>(&data)+i) = base_ptr[offsets[i]];
  }

  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                double *base_ptr) const
  {
    Assert(offsets[0] != offsets[1],
           ExcMessage("Result of scatter undefined if two offset elements"
                      " point to the same position"));

    for (unsigned int i=0; i<2; ++i)
      base_ptr[offsets[i]] = *(reinterpret_cast<const double *>(&data)+i);
  }

  // the actual data field
  __m128d data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm_sqrt_pd(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    __m128d mask = _mm_set1_pd (-0.);
    VectorizedArray res;
    res.data = _mm_andnot_pd(mask, data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_max_pd (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_min_pd (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for double and SSE2.
 */
template <>
inline
void vectorized_load_and_transpose(const unsigned int n_entries,
                                   const double *in,
                                   const unsigned int *offsets,
                                   VectorizedArray<double> *out)
{
  const unsigned int n_chunks = n_entries/2;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128d u0 = _mm_loadu_pd(in+2*i+offsets[0]);
      __m128d u1 = _mm_loadu_pd(in+2*i+offsets[1]);
      out[2*i+0].data = _mm_unpacklo_pd (u0, u1);
      out[2*i+1].data = _mm_unpackhi_pd (u0, u1);
    }
  for (unsigned int i=2*n_chunks; i<n_entries; ++i)
    for (unsigned int v=0; v<2; ++v)
      out[i][v] = in[offsets[v]+i];
}
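

// With two lanes the transpose degenerates to a 2x2 interleave
// (illustration): for u0 = (a0 a1) and u1 = (b0 b1),
//
//   _mm_unpacklo_pd(u0, u1) = (a0 b0)
//   _mm_unpackhi_pd(u0, u1) = (a1 b1)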
/**
 * Specialization of vectorized_transpose_and_store for double and SSE2.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<double> *in,
                               const unsigned int *offsets,
                               double *out)
{
  const unsigned int n_chunks = n_entries/2;
  if (add_into)
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m128d u0 = in[2*i+0].data;
          __m128d u1 = in[2*i+1].data;
          __m128d res0 = _mm_unpacklo_pd (u0, u1);
          __m128d res1 = _mm_unpackhi_pd (u0, u1);
          _mm_storeu_pd(out+2*i+offsets[0], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[0]), res0));
          _mm_storeu_pd(out+2*i+offsets[1], _mm_add_pd(_mm_loadu_pd(out+2*i+offsets[1]), res1));
        }
      for (unsigned int i=2*n_chunks; i<n_entries; ++i)
        for (unsigned int v=0; v<2; ++v)
          out[offsets[v]+i] += in[i][v];
    }
  else
    {
      for (unsigned int i=0; i<n_chunks; ++i)
        {
          __m128d u0 = in[2*i+0].data;
          __m128d u1 = in[2*i+1].data;
          __m128d res0 = _mm_unpacklo_pd (u0, u1);
          __m128d res1 = _mm_unpackhi_pd (u0, u1);
          _mm_storeu_pd(out+2*i+offsets[0], res0);
          _mm_storeu_pd(out+2*i+offsets[1], res1);
        }
      for (unsigned int i=2*n_chunks; i<n_entries; ++i)
        for (unsigned int v=0; v<2; ++v)
          out[offsets[v]+i] = in[i][v];
    }
}
/**
 * Specialization of VectorizedArray for float and SSE2.
 */
template <>
class VectorizedArray<float>
{
public:
  static const unsigned int n_array_elements = 4;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator = (const float x)
  {
    data = _mm_set1_ps(x);
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  float &
  operator [] (const unsigned int comp)
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  const float &
  operator [] (const unsigned int comp) const
  {
    AssertIndexRange (comp, 4);
    return *(reinterpret_cast<const float *>(&data)+comp);
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator += (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data += vec.data;
#else
    data = _mm_add_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator -= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data -= vec.data;
#else
    data = _mm_sub_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator *= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data *= vec.data;
#else
    data = _mm_mul_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray &
  operator /= (const VectorizedArray &vec)
  {
#ifdef DEAL_II_COMPILER_USE_VECTOR_ARITHMETICS
    data /= vec.data;
#else
    data = _mm_div_ps(data,vec.data);
#endif
    return *this;
  }

  DEAL_II_ALWAYS_INLINE
  void load (const float *ptr)
  {
    data = _mm_loadu_ps (ptr);
  }

  DEAL_II_ALWAYS_INLINE
  void store (float *ptr) const
  {
    _mm_storeu_ps (ptr, data);
  }

  DEAL_II_ALWAYS_INLINE
  void gather (const float *base_ptr,
               const unsigned int *offsets)
  {
    for (unsigned int i=0; i<4; ++i)
      *(reinterpret_cast<float *>(&data)+i) = base_ptr[offsets[i]];
  }

  DEAL_II_ALWAYS_INLINE
  void scatter (const unsigned int *offsets,
                float *base_ptr) const
  {
    for (unsigned int i=0; i<4; ++i)
      for (unsigned int j=i+1; j<4; ++j)
        Assert(offsets[i] != offsets[j],
               ExcMessage("Result of scatter undefined if two offset elements"
                          " point to the same position"));

    for (unsigned int i=0; i<4; ++i)
      base_ptr[offsets[i]] = *(reinterpret_cast<const float *>(&data)+i);
  }

  // the actual data field
  __m128 data;

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_sqrt () const
  {
    VectorizedArray res;
    res.data = _mm_sqrt_ps(data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_abs () const
  {
    __m128 mask = _mm_set1_ps (-0.f);
    VectorizedArray res;
    res.data = _mm_andnot_ps(mask, data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_max (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_max_ps (data, other.data);
    return res;
  }

  DEAL_II_ALWAYS_INLINE
  VectorizedArray
  get_min (const VectorizedArray &other) const
  {
    VectorizedArray res;
    res.data = _mm_min_ps (data, other.data);
    return res;
  }
};
/**
 * Specialization of vectorized_load_and_transpose for float and SSE2.
 */
template <>
inline
void vectorized_load_and_transpose(const unsigned int n_entries,
                                   const float *in,
                                   const unsigned int *offsets,
                                   VectorizedArray<float> *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = _mm_loadu_ps(in+4*i+offsets[0]);
      __m128 u1 = _mm_loadu_ps(in+4*i+offsets[1]);
      __m128 u2 = _mm_loadu_ps(in+4*i+offsets[2]);
      __m128 u3 = _mm_loadu_ps(in+4*i+offsets[3]);
      __m128 v0 = _mm_shuffle_ps (u0, u1, 0x44);
      __m128 v1 = _mm_shuffle_ps (u0, u1, 0xee);
      __m128 v2 = _mm_shuffle_ps (u2, u3, 0x44);
      __m128 v3 = _mm_shuffle_ps (u2, u3, 0xee);
      out[4*i+0].data = _mm_shuffle_ps (v0, v2, 0x88);
      out[4*i+1].data = _mm_shuffle_ps (v0, v2, 0xdd);
      out[4*i+2].data = _mm_shuffle_ps (v1, v3, 0x88);
      out[4*i+3].data = _mm_shuffle_ps (v1, v3, 0xdd);
    }
  for (unsigned int i=4*n_chunks; i<n_entries; ++i)
    for (unsigned int v=0; v<4; ++v)
      out[i][v] = in[offsets[v]+i];
}
/**
 * Specialization of vectorized_transpose_and_store for float and SSE2.
 */
template <>
inline
void
vectorized_transpose_and_store(const bool add_into,
                               const unsigned int n_entries,
                               const VectorizedArray<float> *in,
                               const unsigned int *offsets,
                               float *out)
{
  const unsigned int n_chunks = n_entries/4;
  for (unsigned int i=0; i<n_chunks; ++i)
    {
      __m128 u0 = in[4*i+0].data;
      __m128 u1 = in[4*i+1].data;
      __m128 u2 = in[4*i+2].data;
      __m128 u3 = in[4*i+3].data;
      __m128 t0 = _mm_shuffle_ps (u0, u1, 0x44);
      __m128 t1 = _mm_shuffle_ps (u0, u1, 0xee);
      __m128 t2 = _mm_shuffle_ps (u2, u3, 0x44);
      __m128 t3 = _mm_shuffle_ps (u2, u3, 0xee);
      u0 = _mm_shuffle_ps (t0, t2, 0x88);
      u1 = _mm_shuffle_ps (t0, t2, 0xdd);
      u2 = _mm_shuffle_ps (t1, t3, 0x88);
      u3 = _mm_shuffle_ps (t1, t3, 0xdd);

      if (add_into)
        {
          u0 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[0]), u0);
          _mm_storeu_ps(out+4*i+offsets[0], u0);
          u1 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[1]), u1);
          _mm_storeu_ps(out+4*i+offsets[1], u1);
          u2 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[2]), u2);
          _mm_storeu_ps(out+4*i+offsets[2], u2);
          u3 = _mm_add_ps(_mm_loadu_ps(out+4*i+offsets[3]), u3);
          _mm_storeu_ps(out+4*i+offsets[3], u3);
        }
      else
        {
          _mm_storeu_ps(out+4*i+offsets[0], u0);
          _mm_storeu_ps(out+4*i+offsets[1], u1);
          _mm_storeu_ps(out+4*i+offsets[2], u2);
          _mm_storeu_ps(out+4*i+offsets[3], u3);
        }
    }
  if (add_into)
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] += in[i][v];
  else
    for (unsigned int i=4*n_chunks; i<n_entries; ++i)
      for (unsigned int v=0; v<4; ++v)
        out[offsets[v]+i] = in[i][v];
}
#endif // if DEAL_II_COMPILER_VECTORIZATION_LEVEL > 0


// Addition of two vectorized arrays.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp+=v;
}

// Subtraction of two vectorized arrays.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp-=v;
}

// Multiplication of two vectorized arrays.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator * (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp*=v;
}

// Division of two vectorized arrays.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator / (const VectorizedArray<Number> &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp = u;
  return tmp/=v;
}

// Addition of a scalar (expanded to an array with n_array_elements equal
// entries) and a vectorized array.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator + (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp+=v;
}

// Addition of a double scalar and a vectorized array of floats.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator + (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return tmp+=v;
}

// Addition of a vectorized array and a scalar.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &v,
            const Number &u)
{
  return u + v;
}

// Addition of a vectorized array of floats and a double scalar.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator + (const VectorizedArray<float> &v,
            const double &u)
{
  return u + v;
}

// Subtraction of a vectorized array from a scalar.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator - (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp-=v;
}

// Subtraction of a vectorized array of floats from a double scalar.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator - (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return tmp-=v;
}

// Subtraction of a scalar from a vectorized array.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &v,
            const Number &u)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return v-tmp;
}

// Subtraction of a double scalar from a vectorized array of floats.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator - (const VectorizedArray<float> &v,
            const double &u)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return v-tmp;
}

// Multiplication of a scalar and a vectorized array.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator * (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp*=v;
}

// Multiplication of a double scalar and a vectorized array of floats.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator * (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return tmp*=v;
}

// Multiplication of a vectorized array and a scalar.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator * (const VectorizedArray<Number> &v,
            const Number &u)
{
  return u * v;
}

// Multiplication of a vectorized array of floats and a double scalar.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator * (const VectorizedArray<float> &v,
            const double &u)
{
  return u * v;
}

// Division of a scalar by a vectorized array.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator / (const Number &u,
            const VectorizedArray<Number> &v)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return tmp/=v;
}

// Division of a double scalar by a vectorized array of floats.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator / (const double &u,
            const VectorizedArray<float> &v)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return tmp/=v;
}

// Division of a vectorized array by a scalar.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator / (const VectorizedArray<Number> &v,
            const Number &u)
{
  VectorizedArray<Number> tmp;
  tmp = u;
  return v/tmp;
}

// Division of a vectorized array of floats by a double scalar.
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<float>
operator / (const VectorizedArray<float> &v,
            const double &u)
{
  VectorizedArray<float> tmp;
  tmp = u;
  return v/tmp;
}

// Unary plus: return the input unchanged.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator + (const VectorizedArray<Number> &u)
{
  return u;
}

// Unary minus: subtract the input from a zero-initialized array.
template <typename Number>
inline DEAL_II_ALWAYS_INLINE
VectorizedArray<Number>
operator - (const VectorizedArray<Number> &u)
{
  return VectorizedArray<Number>()-u;
}
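

// Mixed scalar/vector arithmetic enabled by the overloads above (a usage
// sketch, illustration only; the names are hypothetical):
//
//   VectorizedArray<float> x = make_vectorized_array(2.0f);
//   VectorizedArray<float> y = 0.5 * x + 1.0f / x;
//
// The extra double/float overloads exist so that double literals such as
// 0.5 can be combined with float arrays without writing an explicit cast.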
DEAL_II_NAMESPACE_CLOSE
/**
 * Implementation of functions from cmath on VectorizedArray. These functions
 * reside in namespace std to provide the same interface as the respective
 * functions in cmath for scalar types.
 */
namespace std
{
  // compute the sine of a vectorized data field, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  sin (const ::dealii::VectorizedArray<Number> &x)
  {
    // evaluate the scalar function once per lane, collect the results in an
    // array, and read that array back with an unaligned load
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::sin(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // cosine, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  cos (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::cos(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // tangent, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  tan (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::tan(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // exponential, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  exp (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::exp(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // natural logarithm, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  log (const ::dealii::VectorizedArray<Number> &x)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::log(x[i]);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // square root, forwarded to the hardware instruction where available
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  sqrt (const ::dealii::VectorizedArray<Number> &x)
  {
    return x.get_sqrt();
  }
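
  // Usage sketch for these overloads (illustration only; the names are
  // hypothetical):
  //
  //   dealii::VectorizedArray<double> x = dealii::make_vectorized_array(2.0);
  //   dealii::VectorizedArray<double> y = std::sqrt(std::sin(x) + 1.5);
  //
  // sin, cos, tan, exp, log, and pow evaluate the scalar function once per
  // lane; sqrt above and abs, max, and min below forward to the member
  // functions that map to single vector instructions where available.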
  // raise each element of x to the power p
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  pow (const ::dealii::VectorizedArray<Number> &x,
       const Number p)
  {
    Number values[::dealii::VectorizedArray<Number>::n_array_elements];
    for (unsigned int i=0; i<dealii::VectorizedArray<Number>::n_array_elements; ++i)
      values[i] = std::pow(x[i], p);
    ::dealii::VectorizedArray<Number> out;
    out.load(&values[0]);
    return out;
  }

  // absolute value, element by element
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  abs (const ::dealii::VectorizedArray<Number> &x)
  {
    return x.get_abs();
  }

  // component-wise maximum of two vectorized data fields
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  max (const ::dealii::VectorizedArray<Number> &x,
       const ::dealii::VectorizedArray<Number> &y)
  {
    return x.get_max(y);
  }

  // component-wise minimum of two vectorized data fields
  template <typename Number>
  inline
  ::dealii::VectorizedArray<Number>
  min (const ::dealii::VectorizedArray<Number> &x,
       const ::dealii::VectorizedArray<Number> &y)
  {
    return x.get_min(y);
  }
}

#endif