7PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS
12#if ((PLATFORM_WINDOWS || PLATFORM_HOLOLENS) && PLATFORM_64BITS)
13#include <arm64_neon.h>
18#include "Math/Float16.h"
21
22
35template<
typename T,
typename BASE_TYPE>
36struct alignas(
alignof(T)) VectorRegisterWrapper
39 FORCEINLINE constexpr VectorRegisterWrapper(T vec) : m_vec(vec) {}
42 FORCEINLINE operator
const T&()
const {
return m_vec; }
50FORCEINLINE float VectorRegisterWrapper<float32x4_t,
float>::operator[](
int Index)
const
52 return m_vec.n128_f32[Index];
56FORCEINLINE double VectorRegisterWrapper<float64x2_t,
double>::operator[](
int Index)
const
58 return m_vec.n128_f64[Index];
62FORCEINLINE int VectorRegisterWrapper<int32x4_t,
int>::operator[](
int Index)
const
64 return m_vec.n128_i32[Index];
68FORCEINLINE int64 VectorRegisterWrapper<int64x2_t, int64>::operator[](
int Index)
const
70 return m_vec.n128_i64[Index];
74typedef VectorRegisterWrapper<float32x4_t,
float> VectorRegister4Float;
75typedef VectorRegisterWrapper<float64x2_t,
double> VectorRegister2Double;
76typedef VectorRegisterWrapper<int32x4_t,
int> VectorRegister4Int;
77typedef VectorRegisterWrapper<int64x2_t, int64> VectorRegister2Int64;
79FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
89FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(
float X,
float Y,
float Z,
float W)
99FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(
double X,
double Y)
101 float64x2_t Out = {};
110typedef float32x4_t GCC_ALIGN(16) VectorRegister4Float;
111typedef float64x2_t GCC_ALIGN(16) VectorRegister2Double;
112typedef int32x4_t GCC_ALIGN(16) VectorRegister4Int;
113typedef int64x2_t GCC_ALIGN(16) VectorRegister2Int64;
115FORCEINLINE
constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
117 return VectorRegister4Int { X, Y, Z, W };
120FORCEINLINE
constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(
float X,
float Y,
float Z,
float W)
122 return VectorRegister4Float { X, Y, Z, W };
125FORCEINLINE
constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(
double X,
double Y)
127 return VectorRegister2Double { X, Y };
132#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) MakeVectorRegister( X, Y, Z, W )
134struct alignas(16) VectorRegister4Double
138 VectorRegister2Double XY;
139 VectorRegister2Double ZW;
148 XY = vcvt_f64_f32(*(float32x2_t*)&From);
149 ZW = vcvt_high_f64_f32(From);
156 *
this = VectorRegister4Double(From);
162#define VectorZeroVectorRegister() VectorZeroDouble()
163#define VectorOneVectorRegister() VectorOneDouble()
173struct alignas(
alignof(VectorRegister4Float)) AlignedFloat4
179 VectorStoreAligned(Vec, V);
187 return VectorLoadAligned(V);
193struct alignas(
alignof(VectorRegister4Double)) AlignedDouble4
219
220
221
222
223
224
225
226
230 VectorRegister4Float V; uint32 F[4];
242 return MakeVectorRegister(X, Y, Z, W);
248 return MakeVectorRegisterFloat(X, Y, Z, W);
253
254
255
256
257
258
259
260
264 VectorRegister4Float V;
float F[4];
276 return MakeVectorRegister(X, Y, Z, W);
280
281
282
283
284
285
286
287
292 VectorRegister4Double V;
double D[4];
309 return VectorRegister4Double
(XY
, ZW
);
316 VectorRegister4Double V; uint64_t D[4];
336 VectorRegister2Double V;
double D[2];
348 VectorRegister2Double V; uint64_t D[2];
357
358
359
360
361
362
363
364
368 VectorRegister4Int V; int32 I[4];
382 VectorRegister4Int V; int64 I[2];
393 return VectorRegister4Double(From);
399 return vcvt_high_f32_f64(vcvt_f32_f64(Vec.XY), Vec.ZW);
403
404
405
406
407
408
409
410
414
415
417#include "Math/UnrealMathVectorConstants.h"
421
422
425
426
427
428
431 return vdupq_n_f32( 0.0f );
436 VectorRegister2Double Zero = vdupq_n_f64(0.0);
437 return VectorRegister4Double
(Zero
, Zero
);
442
443
444
445
448 return vdupq_n_f32( 1.0f );
453 VectorRegister4Double Result;
454 Result.XY = vdupq_n_f64(1.0f);
455 Result.ZW = Result.XY;
460
461
462
463
464
467 return vld1q_f32( (float32_t*)Ptr );
472 float64x2x2_t Vec = vld1q_f64_x2(Ptr);
473 VectorRegister4Double Result = *(VectorRegister4Double*)&Vec;
478
479
480
481
482
485 return MakeVectorRegister(Ptr[0], Ptr[1], Ptr[0], Ptr[1]);
489
490
491
492
493
498 VectorRegister4Double V;
double D[4];
502 Tmp.V.XY = vld1q_f64(Ptr);
509
510
511
512
513
520
521
522template <
int ElementIndex>
525 return vsetq_lane_f32(Scalar, Vec, ElementIndex);
528template <
int ElementIndex>
531 return vsetq_lane_f64(Scalar, Vec, ElementIndex);
534template<
int ElementIndex,
typename std::enable_if< (ElementIndex > 1),
bool >::type =
true >
537 VectorRegister4Double Result;
539 Result.ZW = VectorSetComponentImpl<ElementIndex - 2>(Vec.ZW, Scalar);
543template<
int ElementIndex,
typename std::enable_if < (ElementIndex <= 1),
bool >::type =
true >
544FORCEINLINE VectorRegister4Double VectorSetComponentImpl(
const VectorRegister4Double& Vec,
double Scalar)
546 VectorRegister4Double Result;
547 Result.XY = VectorSetComponentImpl<ElementIndex>(Vec.XY, Scalar);
552#define VectorSetComponent( Vec, ElementIndex, Scalar ) VectorSetComponentImpl<ElementIndex>(Vec, Scalar)
556
557
558
559
560
563 return VectorLoad(Ptr);
572
573
574
575
576
579 return vdupq_n_f32(Ptr[0]);
584 VectorRegister4Double Result;
585 Result.XY = vdupq_n_f64(Ptr[0]);
586 Result.ZW = Result.XY;
591
592
593
594
595
596
597
600 float32x2_t Lo = vld1_f32(Ptr1);
601 float32x2_t Hi = vld1_f32(Ptr2);
602 return vcombine_f32(Lo, Hi);
607 VectorRegister4Double Res;
608 Res.XY = vld1q_f64(Ptr1);
609 Res.ZW = vld1q_f64(Ptr2);
614
615
616
617
618
621 return vdupq_n_f32(X);
626 VectorRegister4Double Result;
627 Result.XY = vdupq_n_f64(X);
628 Result.ZW = Result.XY;
633
634
635
636
637
645 vst1q_f64_x2(Ptr, *(float64x2x2_t*)&Vec);
651 AlignedFloat4 Floats(Vec);
652 for (
int i = 0; i < 4; ++i)
659
660
661
662
663
664#define VectorStoreAlignedStreamed( Vec, Ptr ) VectorStoreAligned( Vec, Ptr )
667
668
669
670
671
679 vst1q_f64_x2(Ptr, *(float64x2x2_t*)&Vec);
683
684
685
686
687
690 vst1_f32(Ptr, *(float32x2_t*)&Vec);
691 vst1q_lane_f32(((float32_t*)Ptr) + 2, Vec, 2);
695
696
697
698
699
702 vst1q_f64(Ptr, Vec.XY);
703 vst1q_lane_f64(((float64_t*)Ptr) + 2, Vec.ZW, 0);
708
709
710
711
712
715 vst1q_lane_f32( Ptr, Vec, 0 );
720 vst1q_lane_f64(Ptr, Vec.XY, 0);
724
725
726
727
728
729
730
731template <
int ElementIndex>
734 return vdupq_n_f32(vgetq_lane_f32(Vec, ElementIndex));
737template <
int ElementIndex>
740 return vdupq_n_f64(vgetq_lane_f64(Vec, ElementIndex));
743template <
int ElementIndex,
typename std::enable_if < (ElementIndex <= 1),
bool >::type =
true >
746 VectorRegister4Double Result;
747 Result.XY = VectorReplicateImpl<ElementIndex>(Vec.XY);
748 Result.ZW = Result.XY;
752template <
int ElementIndex,
typename std::enable_if < (ElementIndex > 1),
bool >::type =
true >
753FORCEINLINE VectorRegister4Double VectorReplicateImpl(
const VectorRegister4Double& Vec)
755 VectorRegister4Double Result;
756 Result.ZW = VectorReplicateImpl<ElementIndex - 2>(Vec.ZW);
757 Result.XY = Result.ZW;
761#define VectorReplicate( Vec, ElementIndex ) VectorReplicateImpl<ElementIndex>(Vec)
765
766
767
768
769
772 return vabsq_f32( Vec );
777 VectorRegister4Double Result;
778 Result.XY = vabsq_f64(Vec.XY);
779 Result.ZW = vabsq_f64(Vec.ZW);
784
785
786
787
788
791 return vnegq_f32( Vec );
796 VectorRegister4Double Result;
797 Result.XY = vnegq_f64(Vec.XY);
798 Result.ZW = vnegq_f64(Vec.ZW);
803
804
805
806
807
808
811 return vaddq_f32( Vec1, Vec2 );
816 VectorRegister4Double Result;
817 Result.XY = vaddq_f64(Vec1.XY, Vec2.XY);
818 Result.ZW = vaddq_f64(Vec1.ZW, Vec2.ZW);
824
825
826
827
828
829
832 return vsubq_f32( Vec1, Vec2 );
837 VectorRegister4Double Res;
838 Res.XY = vsubq_f64(Vec1.XY, Vec2.XY);
839 Res.ZW = vsubq_f64(Vec1.ZW, Vec2.ZW);
845
846
847
848
849
850
853 return vmulq_f32( Vec1, Vec2 );
858 return vmulq_f64(Vec1, Vec2);
863 VectorRegister4Double Result;
864 Result.XY = vmulq_f64(Vec1.XY, Vec2.XY);
865 Result.ZW = vmulq_f64(Vec1.ZW, Vec2.ZW);
871
872
873
874
875
876
879 return vdivq_f32(Vec1, Vec2);
884 VectorRegister4Double Res;
885 Res.XY = vdivq_f64(Vec1.XY, Vec2.XY);
886 Res.ZW = vdivq_f64(Vec1.ZW, Vec2.ZW);
892
893
894
895
896
897
898
901 return vfmaq_f32(Acc, Vec1, Vec2 );
906 VectorRegister4Double Result;
907 Result.XY = vfmaq_f64(Acc.XY, Vec1.XY, Vec2.XY);
908 Result.ZW = vfmaq_f64(Acc.ZW, Vec1.ZW, Vec2.ZW);
913
914
915
916
917
918
919
922 return vfmsq_f32(Sub, Vec1, Vec2);
927 VectorRegister4Double Result;
928 Result.XY = vfmsq_f64(Sub.XY, Vec1.XY, Vec2.XY);
929 Result.ZW = vfmsq_f64(Sub.ZW, Vec1.ZW, Vec2.ZW);
935
936
937
938
939
940
941
944 VectorRegister4Float Temp = VectorMultiply( Vec1, Vec2 );
945 Temp = vsetq_lane_f32( 0.0f, Temp, 3 );
946 float32x2_t sum = vpadd_f32( vget_low_f32( Temp ), vget_high_f32( Temp ) );
947 sum = vpadd_f32( sum, sum );
948 return vdupq_lane_f32( sum, 0 );
953 VectorRegister2Double A, B;
954 A = vmulq_f64(Vec1.XY, Vec2.XY);
955 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
956 float64x1_t Sum = vadd_f64(vget_low_f64(B), vget_high_f64(A));
957 VectorRegister4Double Temp;
958 Temp.XY = vdupq_lane_f64(Sum, 0);
965 return vgetq_lane_f32(VectorDot3(Vec1, Vec2), 0);
970 VectorRegister2Double A, B;
971 A = vmulq_f64(Vec1.XY, Vec2.XY);
972 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
973 float64x1_t Sum = vadd_f64(vget_low_f64(B), vget_high_f64(A));
974 return *(
double*)∑
980
981
982
983
984
985
986
989 VectorRegister4Float Temp = VectorMultiply(Vec1, Vec2);
990 float32x2_t sum = vpadd_f32(vget_low_f32(Temp), vget_high_f32(Temp));
991 sum = vpadd_f32(sum, sum);
992 return vdupq_lane_f32(sum, 0);
997 VectorRegister2Double A, B;
998 A = vmulq_f64(Vec1.XY, Vec2.XY);
999 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
1000 A = vextq_f64(B, B, 1);
1001 VectorRegister4Double Temp;
1002 Temp.XY = vaddq_f64(A, B);
1008
1009
1010
1011
1012
1013
1017 return (VectorRegister4Float)vceqq_f32( Vec1, Vec2 );
1022 VectorRegister4Double Result;
1023 Result.XY = (VectorRegister2Double)vceqq_f64(Vec1.XY, Vec2.XY);
1024 Result.ZW = (VectorRegister2Double)vceqq_f64(Vec1.ZW, Vec2.ZW);
1031
1032
1033
1034
1035
1036
1040 return (VectorRegister4Float)vmvnq_u32( vceqq_f32( Vec1, Vec2 ) );
1045 VectorRegister4Double Result;
1046 Result.XY = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.XY, Vec2.XY));
1047 Result.ZW = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.ZW, Vec2.ZW));
1052
1053
1054
1055
1056
1057
1061 return (VectorRegister4Float)vcgtq_f32( Vec1, Vec2 );
1066 VectorRegister4Double Result;
1067 Result.XY = (VectorRegister2Double)vcgtq_f64(Vec1.XY, Vec2.XY);
1068 Result.ZW = (VectorRegister2Double)vcgtq_f64(Vec1.ZW, Vec2.ZW);
1073
1074
1075
1076
1077
1078
1082 return (VectorRegister4Float)vcgeq_f32( Vec1, Vec2 );
1087 VectorRegister4Double Result;
1088 Result.XY = (VectorRegister2Double)vcgeq_f64(Vec1.XY, Vec2.XY);
1089 Result.ZW = (VectorRegister2Double)vcgeq_f64(Vec1.ZW, Vec2.ZW);
1094
1095
1096
1097
1098
1099
1102 return (VectorRegister4Float)vcltq_f32(Vec1, Vec2);
1107 VectorRegister4Double Res;
1108 Res.XY = (VectorRegister2Double)vcltq_f64(Vec1.XY, Vec2.XY);
1109 Res.ZW = (VectorRegister2Double)vcltq_f64(Vec1.ZW, Vec2.ZW);
1114
1115
1116
1117
1118
1119
1122 return (VectorRegister4Float)vcleq_f32(Vec1, Vec2);
1127 VectorRegister4Double Res;
1128 Res.XY = (VectorRegister2Double)vcleq_f64(Vec1.XY, Vec2.XY);
1129 Res.ZW = (VectorRegister2Double)vcleq_f64(Vec1.ZW, Vec2.ZW);
1134
1135
1136
1137
1138
1139
1140
1141
1145 return vbslq_f32((VectorRegister4Int)Mask, Vec1, Vec2);
1148FORCEINLINE VectorRegister4Double
VectorSelect(
const VectorRegister4Double& Mask,
const VectorRegister4Double& Vec1,
const VectorRegister4Double& Vec2)
1150 VectorRegister4Double Result;
1151 Result.XY = vbslq_f64((VectorRegister2Int64)Mask.XY, Vec1.XY, Vec2.XY);
1152 Result.ZW = vbslq_f64((VectorRegister2Int64)Mask.ZW, Vec1.ZW, Vec2.ZW);
1157
1158
1159
1160
1161
1162
1165 return (VectorRegister4Float)vorrq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1170 VectorRegister4Double Result;
1171 Result.XY = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1172 Result.ZW = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1177
1178
1179
1180
1181
1182
1185 return (VectorRegister4Float)vandq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1190 VectorRegister4Double Result;
1191 Result.XY = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1192 Result.ZW = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1197
1198
1199
1200
1201
1202
1205 return (VectorRegister4Float)veorq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1210 VectorRegister4Double Result;
1211 Result.XY = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1212 Result.ZW = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1218
1219
1220
1221
1222
1223
1224
1225
1226
1228FORCEINLINE VectorRegister4Float VectorSwizzle
1230 VectorRegister4Float V,
1237 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1238 static constexpr uint32_t ControlElement[4] =
1247 tbl.val[0] = vget_low_f32(V);
1248 tbl.val[1] = vget_high_f32(V);
1250 uint32x2_t idx = vcreate_u32(
static_cast<uint64>(ControlElement[E0]) | (
static_cast<uint64>(ControlElement[E1]) << 32));
1251 const uint8x8_t rL = vtbl2_u8(tbl, idx);
1253 idx = vcreate_u32(
static_cast<uint64>(ControlElement[E2]) | (
static_cast<uint64>(ControlElement[E3]) << 32));
1254 const uint8x8_t rH = vtbl2_u8(tbl, idx);
1256 return vcombine_f32(rL, rH);
1259FORCEINLINE VectorRegister4Double VectorSwizzle
1261 VectorRegister4Double V,
1268 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1269 static constexpr uint64_t ControlElement[4] =
1271 0x0706050403020100ULL,
1272 0x0F0E0D0C0B0A0908ULL,
1273 0x1716151413121110ULL,
1274 0x1F1E1D1C1B1A1918ULL,
1281 VectorRegister4Double Result;
1282 uint32x4_t idx = vcombine_u64(vcreate_u64(ControlElement[E0]), vcreate_u64(ControlElement[E1]));
1283 Result.XY = vqtbl2q_u8(tbl, idx);
1285 idx = vcombine_u64(vcreate_u64(ControlElement[E2]), vcreate_u64(ControlElement[E3]));
1286 Result.ZW = vqtbl2q_u8(tbl, idx);
1291template <
int X,
int Y,
int Z,
int W>
1292FORCEINLINE VectorRegister4Float VectorSwizzleImpl(VectorRegister4Float Vec)
1294 return __builtin_shufflevector(Vec, Vec, X, Y, Z, W);
1297template <
int X,
int Y,
typename std::enable_if < (X <= 1) && (Y <= 1),
bool >::type =
true>
1298FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1300 return __builtin_shufflevector(Vec.XY, Vec.XY, X, Y);
1303template <
int X,
int Y,
typename std::enable_if < (X <= 1) && (Y > 1),
bool >::type =
true>
1304FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1306 return __builtin_shufflevector(Vec.XY, Vec.ZW, X, Y);
1309template <
int X,
int Y,
typename std::enable_if < (X > 1) && (Y <= 1),
bool >::type =
true>
1310FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1312 return __builtin_shufflevector(Vec.ZW, Vec.XY, X - 2, Y + 2);
1315template <
int X,
int Y,
typename std::enable_if < (X > 1) && (Y > 1),
bool >::type =
true>
1316FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1318 return __builtin_shufflevector(Vec.ZW, Vec.ZW, X - 2, Y);
1321template <
int X,
int Y,
int Z,
int W>
1322FORCEINLINE VectorRegister4Double VectorSwizzleImpl(VectorRegister4Double Vec)
1324 VectorRegister4Double Result;
1325 Result.XY = VectorSwizzleImpl2<X, Y>(Vec);
1326 Result.ZW = VectorSwizzleImpl2<Z, W>(Vec);
1330#define VectorSwizzle( Vec, X, Y, Z, W ) VectorSwizzleImpl<X, Y, Z, W>(Vec)
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1346FORCEINLINE VectorRegister4Float VectorShuffle
1348 VectorRegister4Float V1,
1349 VectorRegister4Float V2,
1356 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1358 static constexpr uint32 ControlElement[8] =
1371 tbl.val[0] = vget_low_f32(V1);
1372 tbl.val[1] = vget_high_f32(V1);
1373 tbl.val[2] = vget_low_f32(V2);
1374 tbl.val[3] = vget_high_f32(V2);
1376 uint32x2_t idx = vcreate_u32(
static_cast<uint64>(ControlElement[PermuteX]) | (
static_cast<uint64>(ControlElement[PermuteY]) << 32));
1377 const uint8x8_t rL = vtbl4_u8(tbl, idx);
1379 idx = vcreate_u32(
static_cast<uint64>(ControlElement[PermuteZ + 4]) | (
static_cast<uint64>(ControlElement[PermuteW + 4]) << 32));
1380 const uint8x8_t rH = vtbl4_u8(tbl, idx);
1382 return vcombine_f32(rL, rH);
1385FORCEINLINE VectorRegister4Double VectorShuffle
1387 VectorRegister4Double V1,
1388 VectorRegister4Double V2,
1395 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1397 static constexpr uint64 ControlElement[8] =
1399 0x0706050403020100ULL,
1400 0x0F0E0D0C0B0A0908ULL,
1401 0x1716151413121110ULL,
1402 0x1F1E1D1C1B1A1918ULL,
1404 0x2726252423222120ULL,
1405 0x2F2E2D2C2B2A2928ULL,
1406 0x3736353433323130ULL,
1407 0x3F3E3D3C3B3A3938ULL,
1416 VectorRegister4Double Result;
1417 uint32x4_t idx = vcombine_u64(vcreate_u64(ControlElement[PermuteX]), vcreate_u64(ControlElement[PermuteY]));
1418 Result.XY = vqtbl4q_u8(tbl, idx);
1420 idx = vcombine_u64(vcreate_u64(ControlElement[PermuteZ + 4]), vcreate_u64(ControlElement[PermuteW + 4]));
1421 Result.ZW = vqtbl4q_u8(tbl, idx);
1427template <
int X,
int Y,
int Z,
int W>
1428FORCEINLINE VectorRegister4Float VectorShuffleImpl(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
1430 return __builtin_shufflevector(Vec1, Vec2, X, Y, Z + 4, W + 4);
1433template <
int X,
int Y,
int Z,
int W>
1434FORCEINLINE VectorRegister4Double VectorShuffleImpl(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
1436 VectorRegister4Double Result;
1437 Result.XY = VectorSwizzleImpl2<X, Y>(Vec1);
1438 Result.ZW = VectorSwizzleImpl2<Z, W>(Vec2);
1442#define VectorShuffle( Vec1, Vec2, X, Y, Z, W ) VectorShuffleImpl<X, Y, Z, W>(Vec1, Vec2)
1446
1447
1448
1449
1450
1453 uint32x4_t mmA = vtstq_u32(vreinterpretq_u32_f32(VecMask), GlobalVectorConstants::SignBit());
1454 uint32x4_t mmB = vandq_u32(mmA, MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8));
1455 uint32x2_t mmC = vorr_u32(vget_low_u32(mmB), vget_high_u32(mmB));
1456 return vget_lane_u32(mmC, 0) | vget_lane_u32(mmC, 1);
1461 uint64x2_t mmA = vtstq_u64(vreinterpretq_u64_f64(VecMask.XY),
GlobalVectorConstants::DoubleSignBit().XY);
1462 uint64x2_t mmA1 = vtstq_u64(vreinterpretq_u64_f64(VecMask.ZW),
GlobalVectorConstants::DoubleSignBit().XY);
1463 uint64x2_t mmB = vandq_u64(mmA, MakeVectorRegisterInt64(0x1, 0x2));
1464 uint64x2_t mmB1 = vandq_u64(mmA1, MakeVectorRegisterInt64(0x4, 0x8));
1465 uint64x2_t mmC = vorrq_u64(mmB, mmB1);
1466 return (uint32)(vgetq_lane_u64(mmC, 0) | vgetq_lane_u64(mmC, 1));
1470
1471
1472
1473
1474
1475
1478 return vcombine_f32(vget_high_f32(Vec1), vget_high_f32(Vec2));
1483 VectorRegister4Double Result;
1484 Result.XY = Vec1.ZW;
1485 Result.ZW = Vec2.ZW;
1490
1491
1492
1493
1494
1495
1498 return vcombine_f32(vget_low_f32(Vec1), vget_low_f32(Vec2));
1503 VectorRegister4Double Result;
1504 Result.XY = Vec1.XY;
1505 Result.ZW = Vec2.XY;
1510
1511
1512
1513
1514
1515
1516
1517
1518FORCEINLINE void VectorDeinterleave(VectorRegister4Float& OutEvens, VectorRegister4Float& OutOdds,
const VectorRegister4Float& Lo,
const VectorRegister4Float& Hi)
1520 float32x4x2_t deinterleaved = vuzpq_f32(Lo, Hi);
1521 OutEvens = deinterleaved.val[0];
1522 OutOdds = deinterleaved.val[1];
1532
1533
1534
1535
1536
1537
1540 VectorRegister4Float C = VectorMultiply(Vec1,
VectorSwizzle(Vec2, 1, 2, 0, 3));
1541 C = VectorNegateMultiplyAdd(
VectorSwizzle(Vec1, 1, 2, 0, 3), Vec2, C);
1555
1556
1557
1558
1559
1560
1565 VectorRegister4Float V;
float F[4];
1570 return MakeVectorRegister( powf(B.F[0], E.F[0]), powf(B.F[1], E.F[1]), powf(B.F[2], E.F[2]), powf(B.F[3], E.F[3]) );
1573FORCEINLINE VectorRegister4Double
VectorPow(
const VectorRegister4Double& Base,
const VectorRegister4Double& Exponent)
1576 AlignedDouble4 Values
(Base
);
1577 AlignedDouble4 Exponents
(Exponent
);
1579 Values[0] = FMath::Pow(Values[0], Exponents[0]);
1580 Values[1] = FMath::Pow(Values[1], Exponents[1]);
1581 Values[2] = FMath::Pow(Values[2], Exponents[2]);
1582 Values[3] = FMath::Pow(Values[3], Exponents[3]);
1587
1588
1589
1590
1591
1594 return vrecpeq_f32(Vec);
1599 VectorRegister4Double Result;
1600 Result.XY = vrecpeq_f64(Vec.XY);
1601 Result.ZW = vrecpeq_f64(Vec.ZW);
1607
1608
1609
1610
1611
1618 VectorRegister4Float Reciprocal = VectorReciprocalEstimate(Vec);
1621 VectorRegister4Float Squared = VectorMultiply(Reciprocal, Reciprocal);
1622 VectorRegister4Float Double = VectorAdd(Reciprocal, Reciprocal);
1623 Reciprocal = VectorNegateMultiplyAdd(Vec, Squared, Double);
1626 Squared = VectorMultiply(Reciprocal, Reciprocal);
1627 Double = VectorAdd(Reciprocal, Reciprocal);
1628 return VectorNegateMultiplyAdd(Vec, Squared, Double);
1638
1639
1640
1641
1642
1645 return vsqrtq_f32(Vec);
1650 VectorRegister4Double Result;
1651 Result.XY = vsqrtq_f64(Vec.XY);
1652 Result.ZW = vsqrtq_f64(Vec.ZW);
1657
1658
1659
1660
1661
1664 return vrsqrteq_f32(Vec);
1669 VectorRegister4Double Result;
1670 Result.XY = vrsqrteq_f64(Vec.XY);
1671 Result.ZW = vrsqrteq_f64(Vec.ZW);
1676
1677
1678
1679
1680
1687 VectorRegister4Float RecipSqrt = VectorReciprocalSqrtEstimate(Vec);
1690 RecipSqrt = VectorMultiply(vrsqrtsq_f32(Vec, VectorMultiply(RecipSqrt, RecipSqrt)), RecipSqrt);
1691 return VectorMultiply(vrsqrtsq_f32(Vec, VectorMultiply(RecipSqrt, RecipSqrt)), RecipSqrt);
1703 VectorRegister4Double Tmp;
1704 Tmp.XY = vrsqrtsq_f64(Vec.XY, VectorMultiply(RecipSqrt.XY, RecipSqrt.XY));
1705 Tmp.ZW = vrsqrtsq_f64(Vec.ZW, VectorMultiply(RecipSqrt.ZW, RecipSqrt.ZW));
1708 Tmp.XY = vrsqrtsq_f64(Vec.XY, VectorMultiply(RecipSqrt.XY, RecipSqrt.XY));
1709 Tmp.ZW = vrsqrtsq_f64(Vec.ZW, VectorMultiply(RecipSqrt.ZW, RecipSqrt.ZW));
1714
1715
1716
1717
1718
1721 return VectorReciprocalSqrt(VectorDot4(Vector, Vector));
1730
1731
1732
1733
1734
1737 return VectorReciprocalSqrtEstimate(VectorDot4(Vector, Vector));
1747
1748
1749
1750
1751
1764
1765
1766
1767
1768
1782
1783
1784
1785
1786
1787
1788template <uint32 ElementIndex>
1791 return vgetq_lane_f32(Vec, ElementIndex);
1794template <
int ElementIndex>
1797 return vgetq_lane_f64(Vec, ElementIndex);
1800template<
int ElementIndex,
typename std::enable_if< (ElementIndex > 1),
bool >::type =
true >
1803 return VectorGetComponentImpl<ElementIndex - 2>(Vec.ZW);
1806template<
int ElementIndex,
typename std::enable_if < (ElementIndex <= 1),
bool >::type =
true >
1807FORCEINLINE double VectorGetComponentImpl(
const VectorRegister4Double& Vec)
1809 return VectorGetComponentImpl<ElementIndex>(Vec.XY);
1812#define VectorGetComponent(Vec, ElementIndex) VectorGetComponentImpl<ElementIndex>(Vec)
1816 AlignedFloat4 Floats(Vec);
1817 return Floats
[ElementIndex
];
1822 AlignedDouble4 Doubles
(Vec
);
1823 return Doubles
[ElementIndex
];
1827
1828
1829
1830
1831
1832
1835 float32x4x4_t A = vld1q_f32_x4((
const float*)Matrix1);
1836 float32x4x4_t B = vld1q_f32_x4((
const float*)Matrix2);
1840 R.val[0] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[0]), 0);
1841 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[1], vget_low_f32(A.val[0]), 1);
1842 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[2], vget_high_f32(A.val[0]), 0);
1843 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[3], vget_high_f32(A.val[0]), 1);
1846 R.val[1] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[1]), 0);
1847 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[1], vget_low_f32(A.val[1]), 1);
1848 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[2], vget_high_f32(A.val[1]), 0);
1849 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[3], vget_high_f32(A.val[1]), 1);
1852 R.val[2] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[2]), 0);
1853 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[1], vget_low_f32(A.val[2]), 1);
1854 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[2], vget_high_f32(A.val[2]), 0);
1855 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[3], vget_high_f32(A.val[2]), 1);
1858 R.val[3] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[3]), 0);
1859 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[1], vget_low_f32(A.val[3]), 1);
1860 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[2], vget_high_f32(A.val[3]), 0);
1861 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[3], vget_high_f32(A.val[3]), 1);
1863 vst1q_f32_x4((
float*)Result, R);
1868 float64x2x4_t A = vld1q_f64_x4((
const double*)Matrix1);
1869 float64x2x4_t B1 = vld1q_f64_x4((
const double*)Matrix2);
1870 float64x2x4_t B2 = vld1q_f64_x4((
const double*)Matrix2 + 8);
1871 float64_t* V = (float64_t*)&A;
1875 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1876 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1877 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1878 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1880 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1881 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1882 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1883 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1886 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1887 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1888 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1889 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1891 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1892 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1893 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1894 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1896 vst1q_f64_x4((
double*)Result, R);
1897 A = vld1q_f64_x4((
const double*)Matrix1 + 8);
1901 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1902 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1903 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1904 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1906 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1907 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1908 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1909 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1912 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1913 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1914 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1915 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1917 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1918 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1919 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1920 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1922 vst1q_f64_x4((
double*)Result + 8, R);
1926
1927
1928
1929
1930
1934 typedef float Float4x4[4][4];
1935 const Float4x4& M = *((
const Float4x4*) SrcMatrix);
1940 Tmp[0][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
1941 Tmp[0][1] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
1942 Tmp[0][2] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
1944 Tmp[1][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
1945 Tmp[1][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
1946 Tmp[1][2] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
1948 Tmp[2][0] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
1949 Tmp[2][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
1950 Tmp[2][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
1952 Tmp[3][0] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
1953 Tmp[3][1] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
1954 Tmp[3][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
1956 Det[0] = M[1][1]*Tmp[0][0] - M[2][1]*Tmp[0][1] + M[3][1]*Tmp[0][2];
1957 Det[1] = M[0][1]*Tmp[1][0] - M[2][1]*Tmp[1][1] + M[3][1]*Tmp[1][2];
1958 Det[2] = M[0][1]*Tmp[2][0] - M[1][1]*Tmp[2][1] + M[3][1]*Tmp[2][2];
1959 Det[3] = M[0][1]*Tmp[3][0] - M[1][1]*Tmp[3][1] + M[2][1]*Tmp[3][2];
1961 float Determinant = M[0][0]*Det[0] - M[1][0]*Det[1] + M[2][0]*Det[2] - M[3][0]*Det[3];
1962 const float RDet = 1.0f / Determinant;
1964 Result[0][0] = RDet * Det[0];
1965 Result[0][1] = -RDet * Det[1];
1966 Result[0][2] = RDet * Det[2];
1967 Result[0][3] = -RDet * Det[3];
1968 Result[1][0] = -RDet * (M[1][0]*Tmp[0][0] - M[2][0]*Tmp[0][1] + M[3][0]*Tmp[0][2]);
1969 Result[1][1] = RDet * (M[0][0]*Tmp[1][0] - M[2][0]*Tmp[1][1] + M[3][0]*Tmp[1][2]);
1970 Result[1][2] = -RDet * (M[0][0]*Tmp[2][0] - M[1][0]*Tmp[2][1] + M[3][0]*Tmp[2][2]);
1971 Result[1][3] = RDet * (M[0][0]*Tmp[3][0] - M[1][0]*Tmp[3][1] + M[2][0]*Tmp[3][2]);
1972 Result[2][0] = RDet * (
1973 M[1][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
1974 M[2][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) +
1975 M[3][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1])
1977 Result[2][1] = -RDet * (
1978 M[0][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
1979 M[2][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
1980 M[3][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1])
1982 Result[2][2] = RDet * (
1983 M[0][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) -
1984 M[1][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
1985 M[3][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
1987 Result[2][3] = -RDet * (
1988 M[0][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1]) -
1989 M[1][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1]) +
1990 M[2][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
1992 Result[3][0] = -RDet * (
1993 M[1][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
1994 M[2][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) +
1995 M[3][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1])
1997 Result[3][1] = RDet * (
1998 M[0][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
1999 M[2][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2000 M[3][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1])
2002 Result[3][2] = -RDet * (
2003 M[0][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) -
2004 M[1][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2005 M[3][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2007 Result[3][3] = RDet * (
2008 M[0][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1]) -
2009 M[1][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1]) +
2010 M[2][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2013 memcpy( DstMatrix, &Result,
sizeof(Result) );
2018 typedef double Double4x4[4][4];
2019 const Double4x4& M = *((
const Double4x4*)SrcMatrix);
2024 Tmp[0][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
2025 Tmp[0][1] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
2026 Tmp[0][2] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
2028 Tmp[1][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
2029 Tmp[1][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
2030 Tmp[1][2] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
2032 Tmp[2][0] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
2033 Tmp[2][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
2034 Tmp[2][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
2036 Tmp[3][0] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
2037 Tmp[3][1] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
2038 Tmp[3][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
2040 Det[0] = M[1][1] * Tmp[0][0] - M[2][1] * Tmp[0][1] + M[3][1] * Tmp[0][2];
2041 Det[1] = M[0][1] * Tmp[1][0] - M[2][1] * Tmp[1][1] + M[3][1] * Tmp[1][2];
2042 Det[2] = M[0][1] * Tmp[2][0] - M[1][1] * Tmp[2][1] + M[3][1] * Tmp[2][2];
2043 Det[3] = M[0][1] * Tmp[3][0] - M[1][1] * Tmp[3][1] + M[2][1] * Tmp[3][2];
2045 double Determinant = M[0][0] * Det[0] - M[1][0] * Det[1] + M[2][0] * Det[2] - M[3][0] * Det[3];
2046 const double RDet = 1.0 / Determinant;
2048 Result[0][0] = RDet * Det[0];
2049 Result[0][1] = -RDet * Det[1];
2050 Result[0][2] = RDet * Det[2];
2051 Result[0][3] = -RDet * Det[3];
2052 Result[1][0] = -RDet * (M[1][0] * Tmp[0][0] - M[2][0] * Tmp[0][1] + M[3][0] * Tmp[0][2]);
2053 Result[1][1] = RDet * (M[0][0] * Tmp[1][0] - M[2][0] * Tmp[1][1] + M[3][0] * Tmp[1][2]);
2054 Result[1][2] = -RDet * (M[0][0] * Tmp[2][0] - M[1][0] * Tmp[2][1] + M[3][0] * Tmp[2][2]);
2055 Result[1][3] = RDet * (M[0][0] * Tmp[3][0] - M[1][0] * Tmp[3][1] + M[2][0] * Tmp[3][2]);
2056 Result[2][0] = RDet * (
2057 M[1][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
2058 M[2][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) +
2059 M[3][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1])
2061 Result[2][1] = -RDet * (
2062 M[0][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
2063 M[2][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
2064 M[3][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1])
2066 Result[2][2] = RDet * (
2067 M[0][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) -
2068 M[1][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
2069 M[3][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
2071 Result[2][3] = -RDet * (
2072 M[0][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1]) -
2073 M[1][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1]) +
2074 M[2][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
2076 Result[3][0] = -RDet * (
2077 M[1][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
2078 M[2][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) +
2079 M[3][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1])
2081 Result[3][1] = RDet * (
2082 M[0][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
2083 M[2][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2084 M[3][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1])
2086 Result[3][2] = -RDet * (
2087 M[0][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) -
2088 M[1][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2089 M[3][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2091 Result[3][3] = RDet * (
2092 M[0][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1]) -
2093 M[1][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1]) +
2094 M[2][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2097 memcpy(DstMatrix, &Result,
sizeof(Result));
2101
2102
2103
2104
2105
2106
// Fragment: transforms the 4-component vector VecP by the 4x4 float matrix at MatrixM.
// (The function header and the final return are not visible in this listing.)
// Load the 16 consecutive floats of the matrix as four 4-lane vectors M.val[0..3].
2109 float32x4x4_t M = vld1q_f32_x4((
const float*)MatrixM);
2110 VectorRegister4Float Result;
// Result = M.val[0]*VecP[0] + M.val[1]*VecP[1] + M.val[2]*VecP[2] + M.val[3]*VecP[3],
// computed as one multiply followed by three fused multiply-adds.
2112 Result = vmulq_n_f32(M.val[0], VecP[0]);
2113 Result = vfmaq_n_f32(Result, M.val[1], VecP[1]);
2114 Result = vfmaq_n_f32(Result, M.val[2], VecP[2]);
2115 Result = vfmaq_n_f32(Result, M.val[3], VecP[3]);
// Fragment: transforms VecP by a 4x4 matrix of doubles and returns the result narrowed to floats.
// (The function header is not visible in this listing.)
// The 16 doubles are loaded as eight 2-lane vectors: M1 covers doubles 0..7, M2 covers doubles 8..15.
// Each group of 4 consecutive doubles is therefore split across two val[] entries
// (val[0]/val[1] and val[2]/val[3]).
2122 float64x2x4_t M1 = vld1q_f64_x4((
const double*)MatrixM);
2123 float64x2x4_t M2 = vld1q_f64_x4(((
const double*)MatrixM) + 8);
2124 VectorRegister4Double Result;
// Build a 4-double register (XY / ZW halves) from VecP so the math runs in double precision.
2125 VectorRegister4Double Vec(VecP);
// First two result lanes: the low halves of each 4-double group, scaled by X, Y, Z, W in turn.
2127 Result.XY = vmulq_n_f64(M1.val[0], Vec.XY[0]);
2128 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], Vec.XY[1]);
2129 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], Vec.ZW[0]);
2130 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], Vec.ZW[1]);
// Last two result lanes: the high halves of each 4-double group, same scaling.
2132 Result.ZW = vmulq_n_f64(M1.val[1], Vec.XY[0]);
2133 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], Vec.XY[1]);
2134 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], Vec.ZW[0]);
2135 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], Vec.ZW[1]);
// Convert the double-precision result back to a 4-float register.
2137 return MakeVectorRegisterFloatFromDouble(Result);
// Fragment: all-double variant of the matrix transform; VecP is used directly through its XY / ZW halves.
// (The function header and the final return are not visible in this listing.)
// M1 covers doubles 0..7 of the matrix, M2 covers doubles 8..15.
2142 float64x2x4_t M1 = vld1q_f64_x4((
const double*)MatrixM);
2143 float64x2x4_t M2 = vld1q_f64_x4(((
const double*)MatrixM) + 8);
2144 VectorRegister4Double Result;
// First two result lanes: low halves of each 4-double group, scaled by VecP's X, Y, Z, W.
2147 Result.XY = vmulq_n_f64(M1.val[0], VecP.XY[0]);
2148 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], VecP.XY[1]);
2149 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], VecP.ZW[0]);
2150 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], VecP.ZW[1]);
// Last two result lanes: high halves of each 4-double group, same scaling.
2152 Result.ZW = vmulq_n_f64(M1.val[1], VecP.XY[0]);
2153 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], VecP.XY[1]);
2154 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], VecP.ZW[0]);
2155 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], VecP.ZW[1]);
2161
2162
2163
2164
2165
2166
// Float variant: per-lane minimum of Vec1 and Vec2.
2169 return vminq_f32( Vec1, Vec2 );
// Double variant: the 4-double register is processed as two 2-lane halves (XY, then ZW).
// (The return of Result is not visible in this listing.)
2174 VectorRegister4Double Result;
2175 Result.XY = vminq_f64(Vec1.XY, Vec2.XY);
2176 Result.ZW = vminq_f64(Vec1.ZW, Vec2.ZW);
2181
2182
2183
2184
2185
2186
// Float variant: per-lane maximum of Vec1 and Vec2.
2189 return vmaxq_f32( Vec1, Vec2 );
// Double variant: the 4-double register is processed as two 2-lane halves (XY, then ZW).
// (The return of Result is not visible in this listing.)
2194 VectorRegister4Double Result;
2195 Result.XY = vmaxq_f64(Vec1.XY, Vec2.XY);
2196 Result.ZW = vmaxq_f64(Vec1.ZW, Vec2.ZW);
2201
2202
2203
2204
2205
2206
// Float variant: returns VecXYZ with its lane 3 replaced by lane 3 of VecW.
2209 return vsetq_lane_f32(vgetq_lane_f32(VecW, 3), VecXYZ, 3);
// Double variant: the ZW half keeps Z from VecXYZ and takes W (lane 1 of the ZW half) from VecW.
// NOTE(review): the assignment of Res.XY and the return are not visible in this listing.
2214 VectorRegister4Double Res;
2216 Res.ZW = vsetq_lane_f64(vgetq_lane_f64(VecW.ZW, 1), VecXYZ.ZW, 1);
2221
2222
2223
2224
2225
2226
// Reads four unsigned bytes from Ptr and converts each to float, keeping memory order
// (byte 0 -> first component, byte 3 -> last component).
2230 const uint8 *P = (
const uint8 *)Ptr;
2231 return MakeVectorRegister( (
float)P[0], (
float)P[1], (
float)P[2], (
float)P[3] );
2235
2236
2237
2238
2239
2240
// Reads four signed bytes from Ptr and converts each to float, keeping memory order
// (byte 0 -> first component, byte 3 -> last component).
2244 const int8 *P = (
const int8 *)Ptr;
2245 return MakeVectorRegister((
float)P[0], (
float)P[1], (
float)P[2], (
float)P[3]);
2249
2250
2251
2252
2253
2254
// Reads four unsigned bytes from Ptr and converts each to float in reversed order
// (byte 3 -> first component, byte 0 -> last component).
2258 const uint8 *P = (
const uint8 *)Ptr;
2259 return MakeVectorRegister( (
float)P[3], (
float)P[2], (
float)P[1], (
float)P[0] );
2263
2264
2265
2266
2267
2268
// Fragment: converts four floats to four unsigned bytes and writes them to Ptr as one 32-bit store.
// Cap each lane at Float255, convert to unsigned 32-bit integers, then view the register as 8 x u16.
2271 uint16x8_t u16x8 = (uint16x8_t)vcvtq_u32_f32(VectorMin(Vec, GlobalVectorConstants::Float255));
// De-interleave and keep the even 16-bit elements (the low half of every 32-bit lane).
2272 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16( vuzpq_u16( u16x8, u16x8 ).val[0] );
// De-interleave again at byte level, keeping the low byte of each 16-bit element.
2273 u8x8 = vuzp_u8( u8x8, u8x8 ).val[0];
// NOTE(review): the declaration of buf is not visible in this listing; buf[0] is written
// to Ptr as a 32-bit value, so it presumably holds the four packed bytes.
2275 vst1_u8( (uint8_t *)buf, u8x8 );
2276 *(uint32_t *)Ptr = buf[0];
2280
2281
2282
2283
2284
2285
// Fragment: converts four floats to four signed bytes and writes them to Ptr as one 32-bit store.
// Clamp each lane between FloatNeg127 and Float127, convert to signed 32-bit integers,
// then view the register as 8 x s16.
2288 int16x8_t s16x8 = (int16x8_t)vcvtq_s32_f32(VectorMax(VectorMin(Vec, GlobalVectorConstants::Float127), GlobalVectorConstants::FloatNeg127));
// Keep the even 16-bit elements (the low half of every 32-bit lane).
2289 int8x8_t s8x8 = (int8x8_t)vget_low_s16(vuzpq_s16(s16x8, s16x8).val[0]);
// Keep the low byte of each 16-bit element.
2290 s8x8 = vuzp_s8(s8x8, s8x8).val[0];
// NOTE(review): the declaration of buf is not visible in this listing; buf[0] is written
// to Ptr as a 32-bit value, so it presumably holds the four packed bytes.
2292 vst1_s8((int8_t *)buf, s8x8);
2293 *(int32_t *)Ptr = buf[0];
2297
2298
2299
2300
2301
// Fragment: stores four floats as four 16-bit half-precision values at Ptr.
// NOTE(review): the function header and the condition selecting between the two store
// paths below are not visible in this listing; presumably the choice depends on bAligned.
2302template <
bool bAligned>
// Narrow the four floats to four half-precision values.
2305 float16x4_t f16x4 = vcvt_f16_f32(Vec);
// Path 1: store the 8 bytes directly to Ptr.
2309 vst1_u8( (uint8_t *)Ptr, f16x4 );
// Path 2: store into a local aligned buffer, then copy to Ptr one 16-bit element at a time.
2313 alignas(16) uint16_t Buf[4];
2314 vst1_u8( (uint8_t *)Buf, f16x4 );
2315 for (
int i = 0; i < 4; ++i)
2317 ((uint16_t*)Ptr)[i] = Buf[i];
2323
2324
2325
2326
2327
2328
// Unpacks a 32-bit value holding three 10-bit fields and one 2-bit field into four
// floats normalized to [0, 1].
2331 alignas(16)
float V[4];
2332 const uint32 E = *(uint32*)Ptr;
// Bits 0-9, 10-19 and 20-29 are the three 10-bit fields; bits 30-31 are the 2-bit field.
2333 V[0] =
float((E >> 00) & 0x3FF);
2334 V[1] =
float((E >> 10) & 0x3FF);
2335 V[2] =
float((E >> 20) & 0x3FF);
2336 V[3] =
float((E >> 30) & 0x3);
// Divide by each field's maximum value (1023 for 10 bits, 3 for 2 bits) via reciprocal multiply.
2338 VectorRegister4Float Div = MakeVectorRegister(1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f);
2339 return VectorMultiply(MakeVectorRegister(V[0], V[1], V[2], V[3]), Div);
2343
2344
2345
2346
2347
2348
// Fragment: packs four floats into a 32-bit value with three 10-bit fields and one 2-bit field.
// V and F are accessed below as Tmp.V / Tmp.F, so they are presumably members of a
// union named Tmp whose declaration is not visible in this listing.
2352 VectorRegister4Float V;
float F[4];
// Clamp every lane to [0, 1].
2355 Tmp.V = VectorMax(Vec, VectorZeroFloat());
2356 Tmp.V = VectorMin(Tmp.V, VectorOneFloat());
// NOTE(review): no scaling to the 0..1023 / 0..3 integer ranges is visible between the clamp
// and the pack (a listing line is absent here). Without it the fields could only be 0 or 1;
// confirm against the full source.
2359 uint32* Out = (uint32*)Ptr;
// Truncate each lane to an integer, mask to its field width, and shift into position.
2360 *Out = (uint32(Tmp.F[0]) & 0x3FF) << 00 |
2361 (uint32(Tmp.F[1]) & 0x3FF) << 10 |
2362 (uint32(Tmp.F[2]) & 0x3FF) << 20 |
2363 (uint32(Tmp.F[3]) & 0x003) << 30;
2367
2368
2369
2370
2371
2372
// Float variant: returns non-zero when any lane of Vec1 is greater than the matching lane of Vec2.
// The compare yields an all-ones or all-zeros 32-bit mask per lane, viewed here as 8 x u16.
2375 uint16x8_t u16x8 = (uint16x8_t)vcgtq_f32( Vec1, Vec2 );
// Reduce each 32-bit mask to 16 bits, then to 8 bits, by keeping even elements.
2376 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16( vuzpq_u16( u16x8, u16x8 ).val[0] );
2377 u8x8 = vuzp_u8( u8x8, u8x8 ).val[0];
// NOTE(review): the declaration of buf is not visible in this listing; buf[0] presumably
// holds the four mask bytes as one 32-bit value.
2379 vst1_u8( (uint8_t *)buf, u8x8 );
2380 return (int32)buf[0];
// Double variant: returns non-zero when any lane of Vec1 is greater than the matching lane of Vec2.
// Each half (XY, ZW) is compared separately, giving two 64-bit masks per half.
2385 uint16x8_t u16x8_1 = (uint16x8_t)vcgtq_f64(Vec1.XY, Vec2.XY);
2386 uint16x8_t u16x8_2 = (uint16x8_t)vcgtq_f64(Vec1.ZW, Vec2.ZW);
// First de-interleave merges both halves, keeping the even 16-bit elements of each.
2387 uint16x8x2_t tmp = vuzpq_u16(u16x8_1, u16x8_2);
// Second de-interleave leaves one 16-bit element per original 64-bit lane in the low four slots.
2388 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16(vuzpq_u16(tmp.val[0], tmp.val[0]).val[0]);
// Final byte-level de-interleave leaves one mask byte per lane.
2389 u8x8 = vuzp_u8(u8x8, u8x8).val[0];
// NOTE(review): the declaration of buf is not visible in this listing; buf[0] presumably
// holds the four mask bytes as one 32-bit value.
2391 vst1_u8((uint8_t*)buf, u8x8);
2392 return (int32)buf[0];
2396
2397
2398
// Expands to nothing in this implementation.
2399#define VectorResetFloatRegisters()
2402
2403
2404
2405
// Always evaluates to the constant 0 in this implementation.
2406#define VectorGetControlRegister() 0
2409
2410
2411
2412
// Expands to nothing in this implementation; the ControlStatus argument is discarded.
2413#define VectorSetControlRegister(ControlStatus)
2416
2417
// Defined as 0 in this implementation.
2418#define VECTOR_ROUND_TOWARD_ZERO 0
2422
2423
2424
2425
2426
2427
2428
2429
2430
// Fragment: quaternion product built from four broadcast-multiply terms.
// (The function header and the final return are not visible in this listing.)
// Term 1: Quat1's component 3 broadcast to all lanes, times Quat2.
2433 VectorRegister4Float Result = VectorMultiply(
VectorReplicate(Quat1, 3), Quat2);
// Terms 2-4: each remaining component of Quat1 is broadcast, multiplied by a permutation
// of Quat2, multiplied by the matching QMULTI_SIGN_MASK constant, and accumulated into Result
// (assuming VectorMultiplyAdd(a, b, c) computes a*b + c; TODO confirm).
2434 Result = VectorMultiplyAdd( VectorMultiply(
VectorReplicate(Quat1, 0),
VectorSwizzle(Quat2, 3,2,1,0)), GlobalVectorConstants::QMULTI_SIGN_MASK0, Result);
2435 Result = VectorMultiplyAdd( VectorMultiply(
VectorReplicate(Quat1, 1),
VectorSwizzle(Quat2, 2,3,0,1)), GlobalVectorConstants::QMULTI_SIGN_MASK1, Result);
2436 Result = VectorMultiplyAdd( VectorMultiply(
VectorReplicate(Quat1, 2),
VectorSwizzle(Quat2, 1,0,3,2)), GlobalVectorConstants::QMULTI_SIGN_MASK2, Result);
2452
2453
2454
2455
2456
2457
2458
2459
2460
// Pointer-based wrapper: dereferences both inputs, multiplies them with
// VectorQuaternionMultiply2, and writes the product through Result.
2463 *Result = VectorQuaternionMultiply2(*Quat1, *Quat2);
2472
2473
2474
2475
2476
2477
// Fragment: computes sine and cosine of four angles at once, writing them through
// VSinAngles and VCosAngles. (The header and the polynomial evaluation lines that
// fill S and C are not visible in this listing.)
// Step 1: range-reduce. Quotient = round-to-nearest(angle / 2pi).
2485 VectorRegister4Float Quotient = VectorMultiply(*VAngles, GlobalVectorConstants::OneOverTwoPi);
2486 Quotient = vrndnq_f32(Quotient);
// X = angle - 2pi * Quotient (assuming VectorNegateMultiplyAdd(a, b, c) computes c - a*b; TODO confirm).
2489 VectorRegister4Float X = VectorNegateMultiplyAdd(GlobalVectorConstants::TwoPi, Quotient, *VAngles);
// Step 2: fold X into [-pi/2, pi/2]. c is pi carrying X's sign bit, so rflx = (+/-pi) - X
// is the reflected angle; it replaces X wherever |X| > pi/2.
2492 VectorRegister4Float sign = VectorBitwiseAnd(X, GlobalVectorConstants::SignBit());
2493 VectorRegister4Float c = VectorBitwiseOr(GlobalVectorConstants::Pi, sign);
2494 VectorRegister4Float absx = VectorAbs(X);
2495 VectorRegister4Float rflx = VectorSubtract(c, X);
2496 VectorRegister4Float comp = VectorCompareGT(absx, GlobalVectorConstants::PiByTwo);
2497 X = VectorSelect(comp, rflx, X);
// sign is reused as the cosine multiplier: -1 in reflected lanes, +1 otherwise.
2498 sign = VectorSelect(comp, GlobalVectorConstants::FloatMinusOne, GlobalVectorConstants::FloatOne);
2500 const VectorRegister4Float XSquared = VectorMultiply(X, X);
// Sine coefficient table (six non-zero values across two registers).
2504 const VectorRegister4Float SinCoeff0 = MakeVectorRegister(1.0f, -0.16666667f, 0.0083333310f, -0.00019840874f);
2505 const VectorRegister4Float SinCoeff1 = MakeVectorRegister(2.7525562e-06f, -2.3889859e-08f, 0.f, 0.f);
2507 VectorRegister4Float S;
// Sine result is S multiplied by the reduced angle.
2514 *VSinAngles = VectorMultiply(S, X);
// Cosine coefficient table (six non-zero values across two registers).
2518 const VectorRegister4Float CosCoeff0 = MakeVectorRegister(1.0f, -0.5f, 0.041666638f, -0.0013888378f);
2519 const VectorRegister4Float CosCoeff1 = MakeVectorRegister(2.4760495e-05f, -2.6051615e-07f, 0.f, 0.f);
2521 VectorRegister4Float C;
// Cosine result is C with the reflection sign applied.
2528 *VCosAngles = VectorMultiply(C, sign);
// Float variant: returns true when any lane of Vec has all exponent bits set (NaN or infinity).
// Build +infinity from its bit pattern: 0x7F800000 has exactly the exponent bits set.
2539 union {
float F; uint32 U; } InfUnion;
2540 InfUnion.U = 0x7F800000;
2541 const float Inf = InfUnion.F;
2542 const VectorRegister4Float FloatInfinity = MakeVectorRegister(Inf, Inf, Inf, Inf);
// Isolate the exponent bits of each lane.
2545 VectorRegister4Float ExpTest = VectorBitwiseAnd(Vec, FloatInfinity);
// Byte-index table 0x0C080400: selects bytes 0, 4, 8 and 12, i.e. the first byte of each 32-bit lane.
2548 const int32x4_t Table = MakeVectorRegisterIntConstant(0x0C080400, 0, 0, 0);
// Lanes whose exponent bits are all ones compare equal to infinity and become all-ones masks.
2550 uint8x16_t res = (uint8x16_t)VectorCompareEQ(ExpTest, FloatInfinity);
// Gather one mask byte per lane into the first 32 bits and test them together.
2552 return vgetq_lane_u32((uint32x4_t)vqtbx1q_u8(res, res, Table), 0) != 0;
// Double variant: returns true when any lane has all exponent bits set (NaN or infinity).
// NOTE(review): the lines that compute InfTestRes are not visible in this listing.
// Build +infinity from its bit pattern: 0x7FF0000000000000 has exactly the exponent bits set.
2562 union {
double F; uint64 U; } InfUnion;
2563 InfUnion.U = 0x7FF0000000000000ULL;
2564 const double Inf = InfUnion.F;
// Byte-index table 0x18100800: selects bytes 0, 8, 16 and 24, i.e. the first byte of each
// 64-bit lane across the two-register table used below.
2571 const int32x4_t Table = MakeVectorRegisterIntConstant(0x18100800, 0, 0, 0);
2576 uint8x16_t ZeroVec = vdupq_n_u8(0);
// InfTestRes is reinterpreted as a pair of byte registers; one byte per lane is gathered
// into the first 32 bits and tested together.
2578 return vgetq_lane_u32((uint32x4_t)vqtbx2q_u8(ZeroVec, *(uint8x16x2_t*)&InfTestRes, Table), 0) != 0;
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Exp to each, and repacks.
2584 AlignedFloat4 Val(X);
2585 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2590 AlignedDouble4 Val
(X
);
2591 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Exp2 to each, and repacks.
2597 AlignedFloat4 Val(X);
2598 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2603 AlignedDouble4 Val
(X
);
2604 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Loge to each, and repacks.
2610 AlignedFloat4 Val(X);
2611 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2616 AlignedDouble4 Val
(X
);
2617 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Log2 to each, and repacks.
2623 AlignedFloat4 Val(X);
2624 return MakeVectorRegister(FMath::Log2(Val[0]), FMath::Log2(Val[1]), FMath::Log2(Val[2]), FMath::Log2(Val[3]));
// Double variant: copies the lanes into an AlignedDouble4.
// NOTE(review): the return statement of this variant is not visible in this listing.
2629 AlignedDouble4 Val
(X
);
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Tan to each, and repacks.
2636 AlignedFloat4 Val(X);
2637 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2642 AlignedDouble4 Val
(X
);
2643 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Asin to each, and repacks.
2649 AlignedFloat4 Val(X);
2650 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2655 AlignedDouble4 Val
(X
);
2656 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Acos to each, and repacks.
2662 AlignedFloat4 Val(X);
2663 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2668 AlignedDouble4 Val
(X
);
2669 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
// Float variant: scalar fallback. Copies the lanes out, applies FMath::Atan to each, and repacks.
2675 AlignedFloat4 Val(X);
2676 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
// Double variant: same per-lane approach using AlignedDouble4.
2681 AlignedDouble4 Val
(X
);
2682 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
// Float variant: scalar fallback. Calls FMath::Atan2 once per lane and repacks the results.
// NOTE(review): the parameter named X is forwarded as the FIRST argument of FMath::Atan2 and
// Y as the second; confirm this matches FMath::Atan2's expected argument order.
2688 AlignedFloat4 ValX(X);
2689 AlignedFloat4 ValY(Y);
2691 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2692 FMath::Atan2(ValX[1], ValY[1]),
2693 FMath::Atan2(ValX[2], ValY[2]),
2694 FMath::Atan2(ValX[3], ValY[3]));
// Double variant: same per-lane approach (and same argument order) using AlignedDouble4.
2699 AlignedDouble4 ValX
(X
);
2700 AlignedDouble4 ValY
(Y
);
2702 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2703 FMath::Atan2(ValX[1], ValY[1]),
2704 FMath::Atan2(ValX[2], ValY[2]),
2705 FMath::Atan2(ValX[3], ValY[3]));
// Float variant: rounds each lane toward +infinity.
2710 return vrndpq_f32(X);
// Double variant: same rounding applied to the XY and ZW halves.
// (The return of Result is not visible in this listing.)
2715 VectorRegister4Double Result;
2716 Result.XY = vrndpq_f64(X.XY);
2717 Result.ZW = vrndpq_f64(X.ZW);
// Float variant: rounds each lane toward -infinity.
2723 return vrndmq_f32(X);
// Double variant: same rounding applied to the XY and ZW halves.
// (The return of Result is not visible in this listing.)
2728 VectorRegister4Double Result;
2729 Result.XY = vrndmq_f64(X.XY);
2730 Result.ZW = vrndmq_f64(X.ZW);
// Float variant: rounds each lane toward zero.
2736 return vrndq_f32(X);
// Double variant: same rounding applied to the XY and ZW halves.
// (The return of Result is not visible in this listing.)
2741 VectorRegister4Double Result;
2742 Result.XY = vrndq_f64(X.XY);
2743 Result.ZW = vrndq_f64(X.ZW);
// Float variant: per-lane floating-point remainder of X / Y, with a guard for tiny divisors.
// (The function header and the final return are not visible in this listing.)
// Flag lanes whose divisor magnitude is at or below SmallNumber.
2750 VectorRegister4Float InvalidDivisorMask = VectorCompareLE(VectorAbs(Y), GlobalVectorConstants::SmallNumber);
// Scalar fallback: copy the lanes out and call fmodf on each pair, overwriting XFloats in place.
2752 AlignedFloat4 XFloats(X), YFloats(Y);
2753 XFloats
[0
] = fmodf(XFloats
[0
], YFloats
[0
]);
2754 XFloats
[1
] = fmodf(XFloats
[1
], YFloats
[1
]);
2755 XFloats
[2
] = fmodf(XFloats
[2
], YFloats
[2
]);
2756 XFloats
[3
] = fmodf(XFloats
[3
], YFloats
[3
]);
2757 VectorRegister4Float Result = XFloats.ToVectorRegister();
// Lanes flagged above are replaced with FloatZero instead of the fmodf result.
2760 Result = VectorSelect(InvalidDivisorMask, GlobalVectorConstants::FloatZero, Result);
// Double variant: per-lane remainder of X / Y using the scalar fmod.
// NOTE(review): the lines that define DoubleResult (and any divisor guard like the one in
// the float variant) are not visible in this listing.
2769 AlignedDouble4 XDoubles
(X
), YDoubles
(Y
);
// Overwrite each lane of XDoubles with fmod(x, y).
2770 XDoubles
[0
] = fmod(XDoubles
[0
], YDoubles
[0
]);
2771 XDoubles
[1
] = fmod(XDoubles
[1
], YDoubles
[1
]);
2772 XDoubles
[2
] = fmod(XDoubles
[2
], YDoubles
[2
]);
2773 XDoubles
[3
] = fmod(XDoubles
[3
], YDoubles
[3
]);
2778 return DoubleResult;
// Per-lane sign: FloatOne where X >= 0 (so zero maps to +1), FloatMinusOne otherwise.
2783 VectorRegister4Float Mask = VectorCompareGE(X, GlobalVectorConstants::FloatZero);
2784 return VectorSelect(Mask, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatMinusOne);
// Per-lane step: FloatOne where X >= 0, FloatZero otherwise.
2795 VectorRegister4Float Mask = VectorCompareGE(X, GlobalVectorConstants::FloatZero);
2796 return VectorSelect(Mask, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero);
// Tuning constants; numerically a is approximately 16*sqrt(p) and b is approximately (1-p)/sqrt(p).
// NOTE(review): presumably these back the A and B values of VectorSinConstantsNEON used by
// the float sine approximation; the enclosing declaration is not visible in this listing.
2807 static const float p = 0.225f;
2808 static const float a = 7.58946609f;
2809 static const float b = 1.63384342f;
// Float variant: polynomial sine approximation evaluated on all four lanes.
// Express the angle in turns (X / 2pi).
2828 VectorRegister4Float Y = VectorMultiply(X, GlobalVectorConstants::OneOverTwoPi);
// Subtract the nearest whole turn, leaving a value in [-0.5, 0.5).
2829 Y = VectorSubtract(Y, VectorFloor(VectorAdd(Y, GlobalVectorConstants::FloatOneHalf)));
// First pass: Y = A * Y * (0.5 - |Y|).
2830 Y = VectorMultiply(VectorSinConstantsNEON::A, VectorMultiply(Y, VectorSubtract(GlobalVectorConstants::FloatOneHalf, VectorAbs(Y))));
// Refinement: Y * (B + |Y|).
2831 return VectorMultiply(Y, VectorAdd(VectorSinConstantsNEON::B, VectorAbs(Y)));
// Double variant: scalar fallback. Each lane is replaced in place by FMath::Sin of itself.
// (The final return is not visible in this listing.)
2836 AlignedDouble4 Doubles
(X
);
2837 Doubles[0] = FMath::Sin(Doubles[0]);
2838 Doubles[1] = FMath::Sin(Doubles[1]);
2839 Doubles[2] = FMath::Sin(Doubles[2]);
2840 Doubles[3] = FMath::Sin(Doubles[3]);
// Float variant: cosine computed as sine of (X + pi/2), reusing VectorSin.
2846 return VectorSin(VectorAdd(X, GlobalVectorConstants::PiByTwo));
// Double variant: scalar fallback. Each lane is replaced in place by FMath::Cos of itself.
// (The final return is not visible in this listing.)
2851 AlignedDouble4 Doubles
(X
);
2852 Doubles[0] = FMath::Cos(Doubles[0]);
2853 Doubles[1] = FMath::Cos(Doubles[1]);
2854 Doubles[2] = FMath::Cos(Doubles[2]);
2855 Doubles[3] = FMath::Cos(Doubles[3]);
2866
2867
2868
2869
2870
2871
// Fragment: builds four floats in an aligned scratch array and loads them into a register.
// NOTE(review): the lines that fill V from the 16-bit source values are not visible in this listing.
2874 alignas(16)
float V[4];
2880 return VectorLoad(V);
2884
2885
2886
2887
2888
2889
// Fragment: same shape as the unsigned loader, but the source is read through an int16 pointer.
// NOTE(review): the lines that fill V from E are not visible in this listing.
2892 alignas(16)
float V[4];
2893 int16* E = (int16*)Ptr;
2900 return VectorLoad(V);
2904
2905
2906
2907
2908
2909
// Converts four floats to four unsigned 16-bit values written to Out[0..3].
2912 VectorRegister4Float Tmp;
// Clamp every lane to [0, 1].
2913 Tmp = VectorMax(Vec, VectorZeroFloat());
2914 Tmp = VectorMin(Tmp, VectorOneFloat());
// Scale to the 16-bit range and add 0.5, so the truncation below rounds to nearest
// (assuming VectorMultiplyAdd(a, b, c) computes a*b + c; TODO confirm).
2915 Tmp = VectorMultiplyAdd(Tmp, vdupq_n_f32(65535.0f), vdupq_n_f32(0.5f));
2916 Tmp = VectorTruncate(Tmp);
// Spill to an aligned scratch array and narrow each lane to uint16.
2918 alignas(16)
float F[4];
2919 VectorStoreAligned(Tmp, F);
2921 Out[0] = (uint16)F[0];
2922 Out[1] = (uint16)F[1];
2923 Out[2] = (uint16)F[2];
2924 Out[3] = (uint16)F[3];
// Bitwise operations on 4 x int32 registers, each mapped to a NEON intrinsic.
// A & B
2932#define VectorIntAnd(A, B) vandq_s32
(A, B)
// A | B
2934#define VectorIntOr(A, B) vorrq_s32
(A, B)
// A ^ B
2936#define VectorIntXor(A, B) veorq_s32
(A, B)
// (~A) & B: note that the FIRST operand is the one inverted.
2938#define VectorIntAndNot(A, B) vandq_s32
(vmvnq_s32
(A), B)
// ~A
2940#define VectorIntNot(A) vmvnq_s32
(A)
// Per-lane signed 32-bit comparisons; each lane of the result is an all-ones or all-zeros mask.
// A == B
2943#define VectorIntCompareEQ(A, B) vceqq_s32
(A,B)
// A > B
2945#define VectorIntCompareGT(A, B) vcgtq_s32
(A,B)
// A < B
2946#define VectorIntCompareLT(A, B) vcltq_s32
(A,B)
// A >= B
2947#define VectorIntCompareGE(A, B) vcgeq_s32
(A,B)
// A <= B
2948#define VectorIntCompareLE(A, B) vcleq_s32
(A,B)
// Per-lane signed 32-bit arithmetic, each mapped to a NEON intrinsic.
2957#define VectorIntAdd(A, B) vaddq_s32
(A, B)
2958#define VectorIntSubtract(A, B) vsubq_s32
(A, B)
2959#define VectorIntMultiply(A, B) vmulq_s32
(A, B)
2960#define VectorIntNegate(A) vnegq_s32
(A)
2961#define VectorIntMin(A, B) vminq_s32
(A,B)
2962#define VectorIntMax(A, B) vmaxq_s32
(A,B)
// Absolute value expressed as the absolute difference between A and zero.
2964#define VectorIntAbs(A) vabdq_s32
(A, GlobalVectorConstants::IntZero)
// Per-lane sign: IntOne where A >= 0 (zero maps to +1), IntMinusOne otherwise.
2966#define VectorIntSign(A) VectorIntSelect( VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne )
// Converts each signed 32-bit lane to float.
2968#define VectorIntToFloat(A) vcvtq_f32_s32
(A)
// Float variant: converts each lane to a signed 32-bit integer (the intrinsic rounds toward zero).
2972 return vcvtq_s32_f32(A);
// Double variant: narrows the doubles to floats first, then reuses the float conversion.
2977 return VectorFloatToInt(MakeVectorRegisterFloatFromDouble(A));
2983
2984
2985
2986
2987
// Stores the four int32 lanes of Vec to the memory at Ptr.
2988#define VectorIntStore( Vec, Ptr ) vst1q_s32
( (int32*)(Ptr), Vec )
2991
2992
2993
2994
2995
// Loads four int32 values from Ptr; the pointer is cast through void* to int32*.
2996#define VectorIntLoad( Ptr ) vld1q_s32
( (int32*)((void*)(Ptr)) )
2999
3000
3001
3002
3003
// Stores the four int32 lanes of Vec to Ptr. Expands to the same intrinsic call as VectorIntStore.
3004#define VectorIntStoreAligned( Vec, Ptr ) vst1q_s32
( (int32*)(Ptr), Vec )
3007
3008
3009
3010
3011
// Loads four int32 values from Ptr. Expands to the same intrinsic call as VectorIntLoad.
3012#define VectorIntLoadAligned( Ptr ) vld1q_s32
( (int32*)((void*)(Ptr)) )
3015
3016
3017
3018
3019
// Loads one int32 from Ptr and replicates it to all four lanes.
3020#define VectorIntLoad1( Ptr ) vld1q_dup_s32
((int32*)(Ptr))
// Replicates the integer F to all four int32 lanes.
3022#define VectorIntSet1(F) vdupq_n_s32
(F)
// All-zero register; note this expands to an INTEGER vector (vdupq_n_s32).
3023#define VectorSetZero() vdupq_n_s32
(0
)
// Replicates the float F to all four float lanes.
3024#define VectorSet1(F) vdupq_n_f32
(F)
// Bit-pattern reinterpretation between int and float registers (no value conversion).
3025#define VectorCastIntToFloat(Vec) ((VectorRegister4f)vreinterpretq_f32_s32
(Vec))
3026#define VectorCastFloatToInt(Vec) ((VectorRegister4i)vreinterpretq_s32_f32
(Vec))
// Shifts by a compile-time immediate: left, arithmetic right (signed intrinsic),
// and logical right (unsigned intrinsic).
3027#define VectorShiftLeftImm(Vec, ImmAmt) vshlq_n_s32
(Vec, ImmAmt)
3028#define VectorShiftRightImmArithmetic(Vec, ImmAmt) vshrq_n_s32
(Vec, ImmAmt)
3029#define VectorShiftRightImmLogical(Vec, ImmAmt) vshrq_n_u32
(Vec, ImmAmt)
// Rounds each float lane to the nearest integral value, ties to even.
3030#define VectorRound(Vec) vrndnq_f32
(Vec)
// Converts each float lane to a signed 32-bit integer, rounding to nearest with ties to even.
3034 return vcvtnq_s32_f32(Vec);
// Interleaves the low four 16-bit elements of V with zeros, so each becomes the low half of a
// 32-bit lane with a zero high half (zero-extension of the low four 16-bit values).
// NOTE(review): unsigned intrinsics feed a signed zip/combine here, which relies on the
// compiler accepting implicit NEON vector conversions; confirm against build flags.
3038 int16x4x2_t res = vzip_s16(vget_low_u16(V), vdup_n_u16(0));
3039 return vcombine_s16(res.val[0], res.val[1]);
3044PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS
#define VectorShuffle(Vec1, Vec2, X, Y, Z, W)
FORCEINLINE VectorRegister VectorLoadByte4Reverse(const uint8 *Ptr)
#define VectorIntCompareEQ(A, B)
#define VectorIntXor(A, B)
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float &Vec)
#define VectorIntAnd(A, B)
FORCEINLINE void VectorMatrixMultiply(FMatrix *Result, const FMatrix *Matrix1, const FMatrix *Matrix2)
FORCEINLINE VectorRegister VectorLoadTwoPairsFloat(const float *Ptr1, const float *Ptr2)
#define VectorSwizzle(Vec, X, Y, Z, W)
#define VectorIntCompareGE(A, B)
FORCEINLINE void VectorMatrixInverse(FMatrix *DstMatrix, const FMatrix *SrcMatrix)
#define VectorReplicate(Vec, ElementIndex)
FORCEINLINE VectorRegister4Double VectorReciprocalEstimate(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
FORCEINLINE VectorRegister4Double VectorTan(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorATan2(const VectorRegister4Float &X, const VectorRegister4Float &Y)
FORCEINLINE void VectorStore(const VectorRegister4Double &Vec, double *Ptr)
FORCEINLINE VectorRegister4Double VectorSet_W1(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double *Ptr)
FORCEINLINE VectorRegister4Double VectorMultiply(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Double VectorACos(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorExp2(const VectorRegister4Float &X)
bool VectorContainsNaNOrInfinite(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Double VectorTransformVector(const VectorRegister4Double &VecP, const FMatrix44d *MatrixM)
FORCEINLINE VectorRegister4Double VectorReciprocalSqrt(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(uint64 X, uint64 Y)
FORCEINLINE VectorRegister4Float VectorLog2(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorCos(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorMin(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorNegate(VectorRegister4Double Vec)
FORCEINLINE VectorRegister4Double VectorBitwiseAnd(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorSqrt(const VectorRegister4Float &Vec)
void VectorStoreAligned(const VectorRegister4Double &Vec, double *Dst)
FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Float VectorSign(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorATan(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorDot3(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Double VectorLoad(const double *Ptr)
FORCEINLINE VectorRegister4Double VectorLoadTwoPairsFloat(const double *Ptr1, const double *Ptr2)
FORCEINLINE VectorRegister4Float VectorExp(const VectorRegister4Float &X)
FORCEINLINE void VectorStoreSignedByte4(VectorRegister4Float Vec, void *Ptr)
FORCEINLINE VectorRegister4Double VectorMax(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Double VectorCompareGT(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE float VectorGetComponentImpl(VectorRegister4Float Vec)
FORCEINLINE VectorRegister4Double VectorLog(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
FORCEINLINE VectorRegister4Float VectorFloor(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Float VectorDivide(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Double *RESTRICT Result, const VectorRegister4Double *RESTRICT Quat1, const VectorRegister4Double *RESTRICT Quat2)
FORCEINLINE VectorRegister4Double VectorCeil(const VectorRegister4Double &X)
FORCEINLINE void VectorSinCos(VectorRegister4Float *RESTRICT VSinAngles, VectorRegister4Float *RESTRICT VCosAngles, const VectorRegister4Float *RESTRICT VAngles)
#define VectorIntMin(A, B)
FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void *Ptr)
FORCEINLINE VectorRegister4Float VectorSubtract(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Float VectorSet_W1(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorStep(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Double VectorReciprocal(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Float VectorSetComponentImpl(const VectorRegister4Float &Vec, float Scalar)
FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float *RESTRICT Result, const VectorRegister4Float *RESTRICT Quat1, const VectorRegister4Float *RESTRICT Quat2)
FORCEINLINE VectorRegister4Float VectorLoadByte4(const void *Ptr)
FORCEINLINE VectorRegister4Float VectorReciprocalLen(const VectorRegister4Float &Vector)
FORCEINLINE double VectorGetComponentImpl(VectorRegister2Double Vec)
VectorRegister4Double VectorRegister4d
FORCEINLINE void VectorDeinterleave(VectorRegister4Double &RESTRICT OutEvens, VectorRegister4Double &RESTRICT OutOdds, const VectorRegister4Double &Lo, const VectorRegister4Double &Hi)
FORCEINLINE VectorRegister4Float VectorACos(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorCompareGE(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE void VectorStore(const VectorRegister4Float &Vec, float *Ptr)
FORCEINLINE VectorRegister4Float VectorLog(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorDot3(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
AlignedDouble4 AlignedRegister4
VectorRegister4Float VectorLoadAligned(const float *Ptr)
FORCEINLINE VectorRegister4Double VectorCross(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Double VectorReciprocalLenEstimate(const VectorRegister4Double &Vector)
FORCEINLINE VectorRegister4Float VectorBitwiseAnd(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float *Ptr)
FORCEINLINE VectorRegister4Float VectorTan(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorMod(const VectorRegister4Double &X, const VectorRegister4Double &Y)
FORCEINLINE VectorRegister4Double VectorExp(const VectorRegister4Double &X)
FORCEINLINE double VectorGetComponentDynamic(VectorRegister4Double Vec, uint32 ElementIndex)
FORCEINLINE VectorRegister4Double VectorAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float &VecP, const FMatrix44d *MatrixM)
FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float *Ptr)
FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int &Mask, const VectorRegister4Int &Vec1, const VectorRegister4Int &Vec2)
FORCEINLINE void VectorSinCos(VectorRegister4Double *RESTRICT VSinAngles, VectorRegister4Double *RESTRICT VCosAngles, const VectorRegister4Double *RESTRICT VAngles)
FORCEINLINE VectorRegister4Double VectorExp2(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorCombineLow(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE uint32 VectorMaskBits(VectorRegister4Double VecMask)
FORCEINLINE VectorRegister4Double VectorCompareLE(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE double VectorDot3Scalar(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Double VectorCombineLow(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorOneDouble()
FORCEINLINE void VectorStoreAligned(VectorRegister4Float Vec, FFloat16 *Ptr)
FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double *Ptr)
FORCEINLINE double VectorGetComponentImpl(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Double VectorSin(const VectorRegister4Double &X)
FORCEINLINE void VectorStoreFloat1(const VectorRegister4Double &Vec, double *Ptr)
VectorRegister4Int VectorRegister4i
FORCEINLINE VectorRegister4Double VectorSet_W0(const VectorRegister4Double &Vec)
FORCEINLINE void VectorStoreHalf4(VectorRegister4Float Vec, void *RESTRICT Ptr)
FORCEINLINE VectorRegister4Float VectorSetFloat1(float X)
FORCEINLINE VectorRegister4Float VectorSelect(const VectorRegister4Float &Mask, const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Acc)
FORCEINLINE VectorRegister4Float VectorASin(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double MakeVectorRegister(double X, double Y, double Z, double W)
FORCEINLINE VectorRegister4Float VectorCompareGT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
FORCEINLINE VectorRegister4Double VectorMin(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE int32 VectorAnyGreaterThan(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorTruncate(const VectorRegister4Double &X)
FORCEINLINE int32 VectorAnyGreaterThan(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Float VectorSin(const VectorRegister4Float &X)
VectorRegister4Double VectorLoadAligned(const double *Ptr)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(double X, double Y, double Z, double W)
FORCEINLINE VectorRegister4Float MakeVectorRegister(float X, float Y, float Z, float W)
FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(const VectorRegister4Float &Vector)
FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float &VecP, const FMatrix44f *MatrixM)
FORCEINLINE VectorRegister4Float VectorCompareGE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
VectorRegister4i VectorIntExpandLow16To32(VectorRegister4i V)
FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(const void *Ptr)
FORCEINLINE VectorRegister4Double VectorSetFloat1(double X)
FORCEINLINE VectorRegister4Float VectorMod(const VectorRegister4Float &X, const VectorRegister4Float &Y)
FORCEINLINE VectorRegister4Float VectorMultiply(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
FORCEINLINE VectorRegister4Float VectorCombineHigh(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorCompareLT(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorOneFloat()
FORCEINLINE VectorRegister4Double VectorBitwiseOr(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
FORCEINLINE VectorRegister4Double VectorSign(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Double VectorReplicateImpl(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Double VectorFloor(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorSet_W0(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Float VectorLoadSignedByte4(const void *Ptr)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(const VectorRegister2Double &XY, const VectorRegister2Double &ZW)
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Double &Vec, double *Ptr)
FORCEINLINE VectorRegister4Double VectorZeroDouble()
FORCEINLINE VectorRegister4Float VectorDot4(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Double &A)
FORCEINLINE VectorRegister4Float VectorReciprocal(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Float VectorStep(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorSelect(const VectorRegister4Double &Mask, const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
VectorRegister4Double VectorRegister
FORCEINLINE float VectorDot3Scalar(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorSubtract(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Float VectorCeil(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Float VectorZeroFloat()
FORCEINLINE float VectorGetComponentDynamic(VectorRegister4Float Vec, uint32 ElementIndex)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorReciprocalLen(const VectorRegister4Double &Vector)
FORCEINLINE VectorRegister4Double VectorBitwiseXor(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE void VectorMatrixInverse(FMatrix44f *DstMatrix, const FMatrix44f *SrcMatrix)
FORCEINLINE void VectorDeinterleave(VectorRegister4Float &OutEvens, VectorRegister4Float &OutOdds, const VectorRegister4Float &Lo, const VectorRegister4Float &Hi)
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double *Ptr)
FORCEINLINE VectorRegister4Double VectorCombineHigh(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float &Quat1, const VectorRegister4Float &Quat2)
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Float &Vec, float *Ptr)
FORCEINLINE VectorRegister4Double VectorSetComponentImpl(const VectorRegister4Double &Vec, double Scalar)
FORCEINLINE VectorRegister4Double VectorCompareNE(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Double VectorCompareEQ(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
void VectorStoreAligned(const VectorRegister4Float &Vec, float *Ptr)
FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float &Vec, uint16 *Out)
FORCEINLINE VectorRegister4Double VectorNegateMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Sub)
FORCEINLINE VectorRegister4Double VectorLog2(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorLoad(const float *Ptr)
FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE void VectorMatrixMultiply(FMatrix44f *Result, const FMatrix44f *Matrix1, const FMatrix44f *Matrix2)
FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorDivide(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float &A)
bool VectorContainsNaNOrInfinite(const VectorRegister4Float &Vec)
#define VectorSetComponent(Vec, ElementIndex, Scalar)
FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorCompareLT(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorMax(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorASin(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Double VectorPow(const VectorRegister4Double &Base, const VectorRegister4Double &Exponent)
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Acc)
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Sub)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(const VectorRegister4Float &From)
VectorRegister4Float VectorRegister4f
FORCEINLINE VectorRegister4Double VectorReciprocalSqrtEstimate(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister2Double VectorSetComponentImpl(const VectorRegister2Double &Vec, double Scalar)
FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float &Base, const VectorRegister4Float &Exponent)
FORCEINLINE VectorRegister4Float VectorAbs(VectorRegister4Float Vec)
FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(const uint16 *E)
FORCEINLINE void VectorStoreFloat1(VectorRegister4Float Vec, float *Ptr)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorAbs(VectorRegister4Double Vec)
#define VectorIntMax(A, B)
FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorReplicateImpl(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorQuaternionMultiply2(const VectorRegister4Double &Quat1, const VectorRegister4Double &Quat2)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
FORCEINLINE VectorRegister4Float VectorNegate(VectorRegister4Float Vec)
FORCEINLINE VectorRegister4Double VectorMergeVecXYZ_VecW(const VectorRegister4Double &VecXYZ, const VectorRegister4Double &VecW)
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
FORCEINLINE void VectorStoreByte4(VectorRegister4Float Vec, void *Ptr)
VectorRegister2Double VectorRegister2d
FORCEINLINE uint32 VectorMaskBits(VectorRegister4Float VecMask)
FORCEINLINE VectorRegister4Double VectorSqrt(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorATan2(const VectorRegister4Double &X, const VectorRegister4Double &Y)
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float &VecXYZ, const VectorRegister4Float &VecW)
FORCEINLINE VectorRegister4Double VectorDot4(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float &Vec, void *Ptr)
FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float &X)
FFloat16 & operator=(float FP32Value)
static const VectorRegister4Float A
static const VectorRegister4Float B
FORCEINLINE double operator[](int32 Index) const
FORCEINLINE AlignedDouble4(const VectorRegister4Double &Vec)
FORCEINLINE double & operator[](int32 Index)
FORCEINLINE VectorRegister4Double ToVectorRegister() const
FORCEINLINE AlignedFloat4(const VectorRegister4Float &Vec)
FORCEINLINE float & operator[](int32 Index)
FORCEINLINE float operator[](int32 Index) const
FORCEINLINE VectorRegister4Float ToVectorRegister() const
static UE_NODISCARD FORCEINLINE double Log2(double Value)
FORCEINLINE constexpr VectorRegister4Double(VectorRegister2Double xy, VectorRegister2Double zw, VectorRegisterConstInit)
FORCEINLINE VectorRegister4Double & operator=(const VectorRegister4Float &From)
FORCEINLINE VectorRegister4Double(const VectorRegister4Float &From)
FORCEINLINE VectorRegister4Double(const VectorRegister2Double &xy, const VectorRegister2Double &zw)
FORCEINLINE VectorRegister4Double()=default