Ark Server API (ASA) - Wiki
UnrealMathNeon.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
5// HEADER_UNIT_SKIP - Not included directly
6
7PRAGMA_DISABLE_SHADOW_VARIABLE_WARNINGS
8
9#include <type_traits>
10
11// Include the intrinsic functions header
12#if ((PLATFORM_WINDOWS || PLATFORM_HOLOLENS) && PLATFORM_64BITS)
13#include <arm64_neon.h>
14#else
15#include <arm_neon.h>
16#endif
17
18#include "Math/Float16.h"
19
20/*=============================================================================
21 * Helpers:
22 *============================================================================*/
23
24#ifdef _MSC_VER
25
26// MSVC NEON headers typedef float32x4_t and int32x4_t both to __n128
27// This wrapper type allows VectorRegister4Float and VectorRegister4Int to be
28// discriminated for template specialization (e.g. FConstantHandler)
29//
30// This comes at the cost of having to define constructors for some
31// anonymous unions, because VectorRegister4Float/VectorRegister4Int are no
32// longer trivially constructible. The optimizer should eliminate the
33// redundant zero initialization in these cases for non-MSVC (e.g. V()
34// is called now where it wasn't before)
35template<typename T, typename BASE_TYPE>
36struct alignas(alignof(T)) VectorRegisterWrapper
37{
38 FORCEINLINE VectorRegisterWrapper() = default;
39 FORCEINLINE constexpr VectorRegisterWrapper(T vec) : m_vec(vec) {}
40
41 FORCEINLINE operator T&() { return m_vec; }
42 FORCEINLINE operator const T&() const { return m_vec; }
43
44 FORCEINLINE BASE_TYPE operator[](int Index) const;
45
46 T m_vec;
47};
48
49template<>
50FORCEINLINE float VectorRegisterWrapper<float32x4_t, float>::operator[](int Index) const
51{
52 return m_vec.n128_f32[Index];
53}
54
55template<>
56FORCEINLINE double VectorRegisterWrapper<float64x2_t, double>::operator[](int Index) const
57{
58 return m_vec.n128_f64[Index];
59}
60
61template<>
62FORCEINLINE int VectorRegisterWrapper<int32x4_t, int>::operator[](int Index) const
63{
64 return m_vec.n128_i32[Index];
65}
66
67template<>
68FORCEINLINE int64 VectorRegisterWrapper<int64x2_t, int64>::operator[](int Index) const
69{
70 return m_vec.n128_i64[Index];
71}
72
73/** 16-byte vector register type */
74typedef VectorRegisterWrapper<float32x4_t, float> VectorRegister4Float;
75typedef VectorRegisterWrapper<float64x2_t, double> VectorRegister2Double;
76typedef VectorRegisterWrapper<int32x4_t, int> VectorRegister4Int;
77typedef VectorRegisterWrapper<int64x2_t, int64> VectorRegister2Int64;
78
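// Illustrative sketch: because the wrapper gives VectorRegister4Float and VectorRegister4Int distinct
// C++ types on MSVC (both wrap the same __n128), ordinary overload resolution and template
// specialization can tell them apart. DescribeRegister is a hypothetical helper, shown only as an example.
FORCEINLINE const char* DescribeRegister(const VectorRegister4Float&) { return "4 x float"; }
FORCEINLINE const char* DescribeRegister(const VectorRegister4Int&) { return "4 x int32"; }
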
79FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
80{
81 int32x4_t Out = {};
82 Out.n128_i32[0] = X;
83 Out.n128_i32[1] = Y;
84 Out.n128_i32[2] = Z;
85 Out.n128_i32[3] = W;
86 return Out;
87}
88
89FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
90{
91 float32x4_t Out = {};
92 Out.n128_f32[0] = X;
93 Out.n128_f32[1] = Y;
94 Out.n128_f32[2] = Z;
95 Out.n128_f32[3] = W;
96 return Out;
97}
98
99FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
100{
101 float64x2_t Out = {};
102 Out.n128_f64[0] = X;
103 Out.n128_f64[1] = Y;
104 return Out;
105}
106
107#else
108
109/** 16-byte vector register type */
110typedef float32x4_t GCC_ALIGN(16) VectorRegister4Float;
111typedef float64x2_t GCC_ALIGN(16) VectorRegister2Double;
112typedef int32x4_t GCC_ALIGN(16) VectorRegister4Int;
113typedef int64x2_t GCC_ALIGN(16) VectorRegister2Int64;
114
115FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
116{
117 return VectorRegister4Int { X, Y, Z, W };
118}
119
120FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
121{
122 return VectorRegister4Float { X, Y, Z, W };
123}
124
125FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
126{
127 return VectorRegister2Double { X, Y };
128}
129
130#endif
131
132#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) MakeVectorRegister( X, Y, Z, W )
133
134struct alignas(16) VectorRegister4Double
135{
136 struct
137 {
138 VectorRegister2Double XY;
139 VectorRegister2Double ZW;
140 };
141
142 FORCEINLINE VectorRegister4Double() = default;
143 FORCEINLINE VectorRegister4Double(const VectorRegister2Double& xy, const VectorRegister2Double& zw) : XY(xy), ZW(zw) {}
144 FORCEINLINE constexpr VectorRegister4Double(VectorRegister2Double xy, VectorRegister2Double zw, VectorRegisterConstInit) : XY(xy), ZW(zw) {}
145
146 FORCEINLINE VectorRegister4Double(const VectorRegister4Float& From)
147 {
148 XY = vcvt_f64_f32(*(float32x2_t*)&From);
149 ZW = vcvt_high_f64_f32(From);
150 }
151
152 VectorRegister4Double(const VectorRegister2Double& From) = delete;
153
154 FORCEINLINE VectorRegister4Double& operator=(const VectorRegister4Float& From)
155 {
156 *this = VectorRegister4Double(From);
157 return *this;
158 }
159};
160
161typedef VectorRegister4Double VectorRegister;
162#define VectorZeroVectorRegister() VectorZeroDouble()
163#define VectorOneVectorRegister() VectorOneDouble()
164
165// Forward declarations
166VectorRegister4Float VectorLoadAligned(const float* Ptr);
167VectorRegister4Double VectorLoadAligned(const double* Ptr);
168void VectorStoreAligned(const VectorRegister4Float& Vec, float* Ptr);
169void VectorStoreAligned(const VectorRegister4Double& Vec, double* Dst);
170
171
172// Helper for conveniently aligning a float array for extraction from VectorRegister4Float
173struct alignas(alignof(VectorRegister4Float)) AlignedFloat4
174{
175 float V[4];
176
177 FORCEINLINE AlignedFloat4(const VectorRegister4Float& Vec)
178 {
179 VectorStoreAligned(Vec, V);
180 }
181
182 FORCEINLINE float operator[](int32 Index) const { return V[Index]; }
183 FORCEINLINE float& operator[](int32 Index) { return V[Index]; }
184
185 FORCEINLINE VectorRegister4Float ToVectorRegister() const
186 {
187 return VectorLoadAligned(V);
188 }
189};
190
191
192// Helper for conveniently aligning a double array for extraction from VectorRegister4Double
193struct alignas(alignof(VectorRegister4Double)) AlignedDouble4
194{
195 double V[4];
196
197 FORCEINLINE AlignedDouble4(const VectorRegister4Double& Vec)
198 {
199 VectorStoreAligned(Vec, V);
200 }
201
202 FORCEINLINE double operator[](int32 Index) const { return V[Index]; }
203 FORCEINLINE double& operator[](int32 Index) { return V[Index]; }
204
205 FORCEINLINE VectorRegister4Double ToVectorRegister() const
206 {
207 return VectorLoadAligned(V);
208 }
209};
210
211typedef AlignedDouble4 AlignedRegister4;
212// Aliases
213typedef VectorRegister4Int VectorRegister4i;
214typedef VectorRegister4Float VectorRegister4f;
215typedef VectorRegister4Double VectorRegister4d;
216typedef VectorRegister2Double VectorRegister2d;
217
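// Illustrative sketch (Example_SumLanes is a hypothetical helper): AlignedFloat4 spills a
// VectorRegister4Float to an aligned float[4] so individual lanes can be read with scalar code.
FORCEINLINE float Example_SumLanes(const VectorRegister4Float& Vec)
{
    AlignedFloat4 Lanes(Vec);                          // stores the register via VectorStoreAligned
    return Lanes[0] + Lanes[1] + Lanes[2] + Lanes[3];  // scalar access to each lane
}
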
218/**
219 * Returns a bitwise equivalent vector based on 4 uint32s.
220 *
221 * @param X 1st uint32 component
222 * @param Y 2nd uint32 component
223 * @param Z 3rd uint32 component
224 * @param W 4th uint32 component
225 * @return Bitwise equivalent vector with 4 floats
226 */
227FORCEINLINE VectorRegister4Float MakeVectorRegister(uint32 X, uint32 Y, uint32 Z, uint32 W)
228{
229 union U {
230 VectorRegister4Float V; uint32 F[4];
231 FORCEINLINE U() : V() {}
232 } Tmp;
233 Tmp.F[0] = X;
234 Tmp.F[1] = Y;
235 Tmp.F[2] = Z;
236 Tmp.F[3] = W;
237 return Tmp.V;
238}
239
240FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
241{
242 return MakeVectorRegister(X, Y, Z, W);
243}
244
245// Nicer alias
246FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
247{
248 return MakeVectorRegisterFloat(X, Y, Z, W);
249}
250
251
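// Illustrative sketch: the uint32 overload above builds raw bit patterns rather than numeric values.
// Example_SignBitMask is a hypothetical helper producing 0x80000000 in every lane, i.e. only the
// sign bit of each float, the kind of mask typically fed to the bitwise ops defined further below.
FORCEINLINE VectorRegister4Float Example_SignBitMask()
{
    return MakeVectorRegister(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u);
}
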
252/**
253 * Returns a vector based on 4 floats.
254 *
255 * @param X 1st float component
256 * @param Y 2nd float component
257 * @param Z 3rd float component
258 * @param W 4th float component
259 * @return Vector of the 4 floats
260 */
261FORCEINLINE VectorRegister4Float MakeVectorRegister(float X, float Y, float Z, float W)
262{
263 union U {
264 VectorRegister4Float V; float F[4];
265 FORCEINLINE U() : V() {}
266 } Tmp;
267 Tmp.F[0] = X;
268 Tmp.F[1] = Y;
269 Tmp.F[2] = Z;
270 Tmp.F[3] = W;
271 return Tmp.V;
272}
273
274FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
275{
276 return MakeVectorRegister(X, Y, Z, W);
277}
278
279/**
280 * Returns a vector based on 4 doubles.
281 *
282 * @param X 1st double component
283 * @param Y 2nd double component
284 * @param Z 3rd double component
285 * @param W 4th double component
286 * @return Vector of the 4 doubles
287 */
288FORCEINLINE VectorRegister4Double MakeVectorRegister(double X, double Y, double Z, double W)
289{
290 union U
291 {
292 VectorRegister4Double V; double D[4];
293 FORCEINLINE U() : V() {}
294 } Tmp;
295 Tmp.D[0] = X;
296 Tmp.D[1] = Y;
297 Tmp.D[2] = Z;
298 Tmp.D[3] = W;
299 return Tmp.V;
300}
301
302FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(double X, double Y, double Z, double W)
303{
304 return MakeVectorRegister(X, Y, Z, W);
305}
306
307FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(const VectorRegister2Double& XY, const VectorRegister2Double& ZW)
308{
309 return VectorRegister4Double(XY, ZW);
310}
311
312FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(uint64 X, uint64 Y, uint64 Z, uint64 W)
313{
314 union U
315 {
316 VectorRegister4Double V; uint64_t D[4];
317 FORCEINLINE U() : V() {}
318 } Tmp;
319 Tmp.D[0] = X;
320 Tmp.D[1] = Y;
321 Tmp.D[2] = Z;
322 Tmp.D[3] = W;
323 return Tmp.V;
324}
325
326// Nicer alias
327FORCEINLINE VectorRegister4Double MakeVectorRegisterDoubleMask(uint64 X, uint64 Y, uint64 Z, uint64 W)
328{
329 return MakeVectorRegisterDouble(X, Y, Z, W);
330}
331
332FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
333{
334 union U
335 {
336 VectorRegister2Double V; double D[2];
337 FORCEINLINE U() : V() {}
338 } Tmp;
339 Tmp.D[0] = X;
340 Tmp.D[1] = Y;
341 return Tmp.V;
342}
343
344FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(uint64 X, uint64 Y)
345{
346 union U
347 {
348 VectorRegister2Double V; uint64_t D[2];
349 FORCEINLINE U() : V() {}
350 } Tmp;
351 Tmp.D[0] = X;
352 Tmp.D[1] = Y;
353 return Tmp.V;
354}
355
356/**
357* Returns a vector based on 4 int32.
358*
359* @param X 1st int32 component
360* @param Y 2nd int32 component
361* @param Z 3rd int32 component
362* @param W 4th int32 component
363* @return Vector of the 4 int32
364*/
365FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
366{
367 union U {
368 VectorRegister4Int V; int32 I[4];
369 FORCEINLINE U() : V() {}
370 } Tmp;
371 Tmp.I[0] = X;
372 Tmp.I[1] = Y;
373 Tmp.I[2] = Z;
374 Tmp.I[3] = W;
375 return Tmp.V;
376}
377
378FORCEINLINE VectorRegister4Int MakeVectorRegisterInt64(int64 X, int64 Y)
379{
380 union U
381 {
382 VectorRegister4Int V; int64 I[2];
383 FORCEINLINE U() : V() {}
384 } Tmp;
385 Tmp.I[0] = X;
386 Tmp.I[1] = Y;
387 return Tmp.V;
388}
389
390// Make double register from float register
391FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(const VectorRegister4Float& From)
392{
393 return VectorRegister4Double(From);
394}
395
396// Lossy conversion: double->float vector
397FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatFromDouble(const VectorRegister4Double& Vec)
398{
399 return vcvt_high_f32_f64(vcvt_f32_f64(Vec.XY), Vec.ZW);
400}
401
402/*
403#define VectorPermute(Vec1, Vec2, Mask) my_perm(Vec1, Vec2, Mask)
404
405/ ** Reads NumBytesMinusOne+1 bytes from the address pointed to by Ptr, always reading the aligned 16 bytes containing the start of Ptr, but only reading the next 16 bytes if the data straddles the boundary * /
406FORCEINLINE VectorRegister4Float VectorLoadNPlusOneUnalignedBytes(const void* Ptr, int NumBytesMinusOne)
407{
408 return VectorPermute( my_ld (0, (float*)Ptr), my_ld(NumBytesMinusOne, (float*)Ptr), my_lvsl(0, (float*)Ptr) );
409}
410*/
411
412
413/*=============================================================================
414 * Constants:
415 *============================================================================*/
416
417#include "Math/UnrealMathVectorConstants.h"
418
419
420/*=============================================================================
421 * Intrinsics:
422 *============================================================================*/
423
424/**
425 * Returns a vector with all zeros.
426 *
427 * @return VectorRegister4Float(0.0f, 0.0f, 0.0f, 0.0f)
428 */
429FORCEINLINE VectorRegister4Float VectorZeroFloat()
430{
431 return vdupq_n_f32( 0.0f );
432}
433
434FORCEINLINE VectorRegister4Double VectorZeroDouble()
435{
436 VectorRegister2Double Zero = vdupq_n_f64(0.0);
437 return VectorRegister4Double(Zero, Zero);
438}
439
440
441/**
442 * Returns a vector with all ones.
443 *
444 * @return VectorRegister4Float(1.0f, 1.0f, 1.0f, 1.0f)
445 */
446FORCEINLINE VectorRegister4Float VectorOneFloat()
447{
448 return vdupq_n_f32( 1.0f );
449}
450
451FORCEINLINE VectorRegister4Double VectorOneDouble()
452{
453 VectorRegister4Double Result;
454 Result.XY = vdupq_n_f64(1.0f);
455 Result.ZW = Result.XY;
456 return Result;
457}
458
459/**
460 * Loads 4 floats from unaligned memory.
461 *
462 * @param Ptr Unaligned memory pointer to the 4 floats
463 * @return VectorRegister4Float(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
464 */
465FORCEINLINE VectorRegister4Float VectorLoad(const float* Ptr)
466{
467 return vld1q_f32( (float32_t*)Ptr );
468}
469
470FORCEINLINE VectorRegister4Double VectorLoad(const double* Ptr)
471{
472 float64x2x2_t Vec = vld1q_f64_x2(Ptr);
473 VectorRegister4Double Result = *(VectorRegister4Double*)&Vec;
474 return Result;
475}
476
477/**
478 * Loads 2 floats from unaligned memory into X and Y and duplicates them in Z and W.
479 *
480 * @param Ptr Unaligned memory pointer to the floats
481 * @return VectorRegister4Float(Ptr[0], Ptr[1], Ptr[0], Ptr[1])
482 */
483FORCEINLINE VectorRegister4Float VectorLoadFloat2(const float* Ptr)
484{
485 return MakeVectorRegister(Ptr[0], Ptr[1], Ptr[0], Ptr[1]);
486}
487
488/**
489 * Loads 3 floats from unaligned memory and sets W=0.
490 *
491 * @param Ptr Unaligned memory pointer to the 3 floats
492 * @return VectorRegister4Float(Ptr[0], Ptr[1], Ptr[2], 0.0f)
493 */
494FORCEINLINE VectorRegister4Double VectorLoadFloat3(const double* Ptr)
495{
496 union U
497 {
498 VectorRegister4Double V; double D[4];
499 inline U() : V() {}
500 } Tmp;
501
502 Tmp.V.XY = vld1q_f64(Ptr);
503 Tmp.D[2] = Ptr[2];
504 Tmp.D[3] = 0.0;
505 return Tmp.V;
506}
507
508/**
509 * Loads 3 FLOATs from unaligned memory and sets W=1.
510 *
511 * @param Ptr Unaligned memory pointer to the 3 FLOATs
512 * @return VectorRegister4Float(Ptr[0], Ptr[1], Ptr[2], 1.0f)
513 */
514FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double* Ptr)
515{
516 return MakeVectorRegisterDouble(Ptr[0], Ptr[1], Ptr[2], 1.0f);
517}
518
519/**
520 * Sets a single component of a vector. Must be a define since ElementIndex needs to be a constant integer
521 */
522template <int ElementIndex>
523FORCEINLINE VectorRegister4Float VectorSetComponentImpl(const VectorRegister4Float& Vec, float Scalar)
524{
525 return vsetq_lane_f32(Scalar, Vec, ElementIndex);
526}
527
528template <int ElementIndex>
529FORCEINLINE VectorRegister2Double VectorSetComponentImpl(const VectorRegister2Double& Vec, double Scalar)
530{
531 return vsetq_lane_f64(Scalar, Vec, ElementIndex);
532}
533
534template<int ElementIndex, typename std::enable_if< (ElementIndex > 1), bool >::type = true >
535FORCEINLINE VectorRegister4Double VectorSetComponentImpl(const VectorRegister4Double& Vec, double Scalar)
536{
537 VectorRegister4Double Result;
538 Result.XY = Vec.XY;
539 Result.ZW = VectorSetComponentImpl<ElementIndex - 2>(Vec.ZW, Scalar);
540 return Result;
541}
542
543template<int ElementIndex, typename std::enable_if < (ElementIndex <= 1), bool >::type = true >
544FORCEINLINE VectorRegister4Double VectorSetComponentImpl(const VectorRegister4Double& Vec, double Scalar)
545{
546 VectorRegister4Double Result;
547 Result.XY = VectorSetComponentImpl<ElementIndex>(Vec.XY, Scalar);
548 Result.ZW = Vec.ZW;
549 return Result;
550}
551
552#define VectorSetComponent( Vec, ElementIndex, Scalar ) VectorSetComponentImpl<ElementIndex>(Vec, Scalar)
553
554
555/**
556 * Loads 4 floats from aligned memory.
557 *
558 * @param Ptr Aligned memory pointer to the 4 floats
559 * @return VectorRegister4Float(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
560 */
561FORCEINLINE VectorRegister4Float VectorLoadAligned(const float* Ptr)
562{
563 return VectorLoad(Ptr);
564}
565
566FORCEINLINE VectorRegister4Double VectorLoadAligned(const double* Ptr)
567{
568 return VectorLoad(Ptr);
569}
570
571/**
572 * Loads 1 float from unaligned memory and replicates it to all 4 elements.
573 *
574 * @param Ptr Unaligned memory pointer to the float
575 * @return VectorRegister4Float(Ptr[0], Ptr[0], Ptr[0], Ptr[0])
576 */
577FORCEINLINE VectorRegister4Float VectorLoadFloat1(const float* Ptr)
578{
579 return vdupq_n_f32(Ptr[0]);
580}
581
582FORCEINLINE VectorRegister4Double VectorLoadDouble1(const double* Ptr)
583{
584 VectorRegister4Double Result;
585 Result.XY = vdupq_n_f64(Ptr[0]);
586 Result.ZW = Result.XY;
587 return Result;
588}
589
590/**
591 * Loads 4 unaligned floats - 2 from the first pointer, 2 from the second, and packs
592 * them in to 1 vector.
593 *
594 * @param Ptr1 Unaligned memory pointer to the first 2 floats
595 * @param Ptr2 Unaligned memory pointer to the second 2 floats
596 * @return VectorRegister4Float(Ptr1[0], Ptr1[1], Ptr2[0], Ptr2[1])
597 */
598FORCEINLINE VectorRegister4Float VectorLoadTwoPairsFloat(const float* Ptr1, const float* Ptr2)
599{
600 float32x2_t Lo = vld1_f32(Ptr1);
601 float32x2_t Hi = vld1_f32(Ptr2);
602 return vcombine_f32(Lo, Hi);
603}
604
605FORCEINLINE VectorRegister4Double VectorLoadTwoPairsFloat(const double* Ptr1, const double* Ptr2)
606{
607 VectorRegister4Double Res;
608 Res.XY = vld1q_f64(Ptr1);
609 Res.ZW = vld1q_f64(Ptr2);
610 return Res;
611}
612
613/**
614* Propagates passed in float to all registers.
615*
616* @param X float component
617* @return VectorRegister4Float(X, X, X, X)
618*/
619FORCEINLINE VectorRegister4Float VectorSetFloat1(float X)
620{
621 return vdupq_n_f32(X);
622}
623
624FORCEINLINE VectorRegister4Double VectorSetFloat1(double X)
625{
626 VectorRegister4Double Result;
627 Result.XY = vdupq_n_f64(X);
628 Result.ZW = Result.XY;
629 return Result;
630}
631
632/**
633 * Stores a vector to aligned memory.
634 *
635 * @param Vec Vector to store
636 * @param Ptr Aligned memory pointer
637 */
638FORCEINLINE void VectorStoreAligned(const VectorRegister4Float& Vec, float* Ptr)
639{
640 vst1q_f32(Ptr, Vec);
641}
642
643FORCEINLINE void VectorStoreAligned(const VectorRegister4Double& Vec, double* Ptr)
644{
645 vst1q_f64_x2(Ptr, *(float64x2x2_t*)&Vec);
646}
647
648//TODO: LWC VectorVM.cpp calls it on a line 3294, case EVectorVMOp::outputdata_half: Context.WriteExecFunction(CopyConstantToOutput<float, FFloat16, 2>); break;
649FORCEINLINE void VectorStoreAligned(VectorRegister4Float Vec, FFloat16* Ptr)
650{
651 AlignedFloat4 Floats(Vec);
652 for (int i = 0; i < 4; ++i)
653 {
654 Ptr[i] = Floats[i];
655 }
656}
657
658/**
659* Same as VectorStoreAligned for Neon.
660*
661* @param Vec Vector to store
662* @param Ptr Aligned memory pointer
663*/
664#define VectorStoreAlignedStreamed( Vec, Ptr ) VectorStoreAligned( Vec, Ptr )
665
666/**
667 * Stores a vector to memory (aligned or unaligned).
668 *
669 * @param Vec Vector to store
670 * @param Ptr Memory pointer
671 */
672FORCEINLINE void VectorStore(const VectorRegister4Float& Vec, float* Ptr)
673{
674 vst1q_f32(Ptr, Vec);
675}
676
677FORCEINLINE void VectorStore(const VectorRegister4Double& Vec, double* Ptr)
678{
679 vst1q_f64_x2(Ptr, *(float64x2x2_t*)&Vec);
680}
681
682/**
683 * Stores the XYZ components of a vector to unaligned memory.
684 *
685 * @param Vec Vector to store XYZ
686 * @param Ptr Unaligned memory pointer
687 */
688FORCEINLINE void VectorStoreFloat3( const VectorRegister4Float& Vec, float* Ptr )
689{
690 vst1_f32(Ptr, *(float32x2_t*)&Vec);
691 vst1q_lane_f32(((float32_t*)Ptr) + 2, Vec, 2);
692}
693
694/**
695 * Stores the XYZ components of a double vector pair to unaligned memory.
696 *
697 * @param Vec Vector to store XYZ
698 * @param Ptr Unaligned memory pointer
699 */
700FORCEINLINE void VectorStoreFloat3(const VectorRegister4Double& Vec, double* Ptr)
701{
702 vst1q_f64(Ptr, Vec.XY);
703 vst1q_lane_f64(((float64_t*)Ptr) + 2, Vec.ZW, 0);
704}
705
706
707/**
708 * Stores the X component of a vector to unaligned memory.
709 *
710 * @param Vec Vector to store X
711 * @param Ptr Unaligned memory pointer
712 */
713FORCEINLINE void VectorStoreFloat1(VectorRegister4Float Vec, float* Ptr )
714{
715 vst1q_lane_f32( Ptr, Vec, 0 );
716}
717
718FORCEINLINE void VectorStoreFloat1(const VectorRegister4Double& Vec, double* Ptr)
719{
720 vst1q_lane_f64(Ptr, Vec.XY, 0);
721}
722
723/**
724 * Replicates one element into all four elements and returns the new vector. Must be a #define for ElementIndex
725 * to be a constant integer
726 *
727 * @param Vec Source vector
728 * @param ElementIndex Index (0-3) of the element to replicate
729 * @return VectorRegister4Float( Vec[ElementIndex], Vec[ElementIndex], Vec[ElementIndex], Vec[ElementIndex] )
730 */
731template <int ElementIndex>
732FORCEINLINE VectorRegister4Float VectorReplicateImpl(const VectorRegister4Float& Vec)
733{
734 return vdupq_n_f32(vgetq_lane_f32(Vec, ElementIndex));
735}
736
737template <int ElementIndex>
738FORCEINLINE VectorRegister2Double VectorReplicateImpl(const VectorRegister2Double& Vec)
739{
740 return vdupq_n_f64(vgetq_lane_f64(Vec, ElementIndex));
741}
742
743template <int ElementIndex, typename std::enable_if < (ElementIndex <= 1), bool >::type = true >
744FORCEINLINE VectorRegister4Double VectorReplicateImpl(const VectorRegister4Double& Vec)
745{
746 VectorRegister4Double Result;
747 Result.XY = VectorReplicateImpl<ElementIndex>(Vec.XY);
748 Result.ZW = Result.XY;
749 return Result;
750}
751
752template <int ElementIndex, typename std::enable_if < (ElementIndex > 1), bool >::type = true >
753FORCEINLINE VectorRegister4Double VectorReplicateImpl(const VectorRegister4Double& Vec)
754{
755 VectorRegister4Double Result;
756 Result.ZW = VectorReplicateImpl<ElementIndex - 2>(Vec.ZW);
757 Result.XY = Result.ZW;
758 return Result;
759}
760
761#define VectorReplicate( Vec, ElementIndex ) VectorReplicateImpl<ElementIndex>(Vec)
762
763
764/**
765 * Returns the absolute value (component-wise).
766 *
767 * @param Vec Source vector
768 * @return VectorRegister4Float( abs(Vec.x), abs(Vec.y), abs(Vec.z), abs(Vec.w) )
769 */
770FORCEINLINE VectorRegister4Float VectorAbs(VectorRegister4Float Vec)
771{
772 return vabsq_f32( Vec );
773}
774
775FORCEINLINE VectorRegister4Double VectorAbs(VectorRegister4Double Vec)
776{
777 VectorRegister4Double Result;
778 Result.XY = vabsq_f64(Vec.XY);
779 Result.ZW = vabsq_f64(Vec.ZW);
780 return Result;
781}
782
783/**
784 * Returns the negated value (component-wise).
785 *
786 * @param Vec Source vector
787 * @return VectorRegister4Float( -Vec.x, -Vec.y, -Vec.z, -Vec.w )
788 */
789FORCEINLINE VectorRegister4Float VectorNegate(VectorRegister4Float Vec)
790{
791 return vnegq_f32( Vec );
792}
793
794FORCEINLINE VectorRegister4Double VectorNegate(VectorRegister4Double Vec)
795{
796 VectorRegister4Double Result;
797 Result.XY = vnegq_f64(Vec.XY);
798 Result.ZW = vnegq_f64(Vec.ZW);
799 return Result;
800}
801
802/**
803 * Adds two vectors (component-wise) and returns the result.
804 *
805 * @param Vec1 1st vector
806 * @param Vec2 2nd vector
807 * @return VectorRegister4Float( Vec1.x+Vec2.x, Vec1.y+Vec2.y, Vec1.z+Vec2.z, Vec1.w+Vec2.w )
808 */
809FORCEINLINE VectorRegister4Float VectorAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
810{
811 return vaddq_f32( Vec1, Vec2 );
812}
813
814FORCEINLINE VectorRegister4Double VectorAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
815{
816 VectorRegister4Double Result;
817 Result.XY = vaddq_f64(Vec1.XY, Vec2.XY);
818 Result.ZW = vaddq_f64(Vec1.ZW, Vec2.ZW);
819 return Result;
820}
821
822
823/**
824 * Subtracts a vector from another (component-wise) and returns the result.
825 *
826 * @param Vec1 1st vector
827 * @param Vec2 2nd vector
828 * @return VectorRegister4Float( Vec1.x-Vec2.x, Vec1.y-Vec2.y, Vec1.z-Vec2.z, Vec1.w-Vec2.w )
829 */
830FORCEINLINE VectorRegister4Float VectorSubtract(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
831{
832 return vsubq_f32( Vec1, Vec2 );
833}
834
835FORCEINLINE VectorRegister4Double VectorSubtract(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
836{
837 VectorRegister4Double Res;
838 Res.XY = vsubq_f64(Vec1.XY, Vec2.XY);
839 Res.ZW = vsubq_f64(Vec1.ZW, Vec2.ZW);
840 return Res;
841}
842
843
844/**
845 * Multiplies two vectors (component-wise) and returns the result.
846 *
847 * @param Vec1 1st vector
848 * @param Vec2 2nd vector
849 * @return VectorRegister4Float( Vec1.x*Vec2.x, Vec1.y*Vec2.y, Vec1.z*Vec2.z, Vec1.w*Vec2.w )
850 */
851FORCEINLINE VectorRegister4Float VectorMultiply(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
852{
853 return vmulq_f32( Vec1, Vec2 );
854}
855
856FORCEINLINE VectorRegister2Double VectorMultiply(VectorRegister2Double Vec1, VectorRegister2Double Vec2)
857{
858 return vmulq_f64(Vec1, Vec2);
859}
860
861FORCEINLINE VectorRegister4Double VectorMultiply(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
862{
863 VectorRegister4Double Result;
864 Result.XY = vmulq_f64(Vec1.XY, Vec2.XY);
865 Result.ZW = vmulq_f64(Vec1.ZW, Vec2.ZW);
866 return Result;
867}
868
869
870/**
871* Divides two vectors (component-wise) and returns the result.
872*
873* @param Vec1 1st vector
874* @param Vec2 2nd vector
875* @return VectorRegister4Float( Vec1.x/Vec2.x, Vec1.y/Vec2.y, Vec1.z/Vec2.z, Vec1.w/Vec2.w )
876*/
877FORCEINLINE VectorRegister4Float VectorDivide(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
878{
879 return vdivq_f32(Vec1, Vec2);
880}
881
882FORCEINLINE VectorRegister4Double VectorDivide(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
883{
884 VectorRegister4Double Res;
885 Res.XY = vdivq_f64(Vec1.XY, Vec2.XY);
886 Res.ZW = vdivq_f64(Vec1.ZW, Vec2.ZW);
887 return Res;
888}
889
890
891/**
892 * Multiplies two vectors (component-wise), adds in the third vector and returns the result.
893 *
894 * @param Vec1 1st vector
895 * @param Vec2 2nd vector
896 * @param Acc 3rd vector
897 * @return VectorRegister4Float( Vec1.x*Vec2.x + Acc.x, Vec1.y*Vec2.y + Acc.y, Vec1.z*Vec2.z + Acc.z, Vec1.w*Vec2.w + Acc.w )
898 */
899FORCEINLINE VectorRegister4Float VectorMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Acc)
900{
901 return vfmaq_f32(Acc, Vec1, Vec2 );
902}
903
904FORCEINLINE VectorRegister4Double VectorMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Acc)
905{
906 VectorRegister4Double Result;
907 Result.XY = vfmaq_f64(Acc.XY, Vec1.XY, Vec2.XY);
908 Result.ZW = vfmaq_f64(Acc.ZW, Vec1.ZW, Vec2.ZW);
909 return Result;
910}
911
912/**
913 * Multiplies two vectors (component-wise) and subtracts the result from the third vector.
914 *
915 * @param Vec1 1st vector
916 * @param Vec2 2nd vector
917 * @param Sub 3rd vector
918 * @return VectorRegister4Float( Sub.x - Vec1.x*Vec2.x, Sub.y - Vec1.y*Vec2.y, Sub.z - Vec1.z*Vec2.z, Sub.w - Vec1.w*Vec2.w )
919 */
920FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Sub)
921{
922 return vfmsq_f32(Sub, Vec1, Vec2);
923}
924
925FORCEINLINE VectorRegister4Double VectorNegateMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Sub)
926{
927 VectorRegister4Double Result;
928 Result.XY = vfmsq_f64(Sub.XY, Vec1.XY, Vec2.XY);
929 Result.ZW = vfmsq_f64(Sub.ZW, Vec1.ZW, Vec2.ZW);
930 return Result;
931}
932
933
934/**
935 * Calculates the dot3 product of two vectors and returns a vector with the result in all 4 components.
936 * Only really efficient on Xbox 360.
937 *
938 * @param Vec1 1st vector
939 * @param Vec2 2nd vector
940 * @return d = dot3(Vec1.xyz, Vec2.xyz), VectorRegister4Float( d, d, d, d )
941 */
942FORCEINLINE VectorRegister4Float VectorDot3(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
943{
944 VectorRegister4Float Temp = VectorMultiply( Vec1, Vec2 );
945 Temp = vsetq_lane_f32( 0.0f, Temp, 3 );
946 float32x2_t sum = vpadd_f32( vget_low_f32( Temp ), vget_high_f32( Temp ) );
947 sum = vpadd_f32( sum, sum );
948 return vdupq_lane_f32( sum, 0 );
949}
950
951FORCEINLINE VectorRegister4Double VectorDot3(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
952{
953 VectorRegister2Double A, B;
954 A = vmulq_f64(Vec1.XY, Vec2.XY);
955 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
956 float64x1_t Sum = vadd_f64(vget_low_f64(B), vget_high_f64(A));
957 VectorRegister4Double Temp;
958 Temp.XY = vdupq_lane_f64(Sum, 0);
959 Temp.ZW = Temp.XY;
960 return Temp;
961}
962
963FORCEINLINE float VectorDot3Scalar(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
964{
965 return vgetq_lane_f32(VectorDot3(Vec1, Vec2), 0);
966}
967
968FORCEINLINE double VectorDot3Scalar(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
969{
970 VectorRegister2Double A, B;
971 A = vmulq_f64(Vec1.XY, Vec2.XY);
972 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
973 float64x1_t Sum = vadd_f64(vget_low_f64(B), vget_high_f64(A));
974 return *(double*)&Sum;
975}
976
977
978
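// Illustrative note on the float path above: zeroing lane W makes the product vector
// (x1*x2, y1*y2, z1*z2, 0); the two vpadd_f32 pairwise adds then reduce it to a single sum,
// which vdupq_lane_f32 broadcasts into all four lanes of the result.
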
979/**
980 * Calculates the dot4 product of two vectors and returns a vector with the result in all 4 components.
981 * Only really efficient on Xbox 360.
982 *
983 * @param Vec1 1st vector
984 * @param Vec2 2nd vector
985 * @return d = dot4(Vec1.xyzw, Vec2.xyzw), VectorRegister4Float( d, d, d, d )
986 */
987FORCEINLINE VectorRegister4Float VectorDot4(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
988{
989 VectorRegister4Float Temp = VectorMultiply(Vec1, Vec2);
990 float32x2_t sum = vpadd_f32(vget_low_f32(Temp), vget_high_f32(Temp));
991 sum = vpadd_f32(sum, sum);
992 return vdupq_lane_f32(sum, 0);
993}
994
995FORCEINLINE VectorRegister4Double VectorDot4(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
996{
997 VectorRegister2Double A, B;
998 A = vmulq_f64(Vec1.XY, Vec2.XY);
999 B = vfmaq_f64(A, Vec1.ZW, Vec2.ZW);
1000 A = vextq_f64(B, B, 1);
1001 VectorRegister4Double Temp;
1002 Temp.XY = vaddq_f64(A, B);
1003 Temp.ZW = Temp.XY;
1004 return Temp;
1005}
1006
1007/**
1008 * Creates a four-part mask based on component-wise == compares of the input vectors
1009 *
1010 * @param Vec1 1st vector
1011 * @param Vec2 2nd vector
1012 * @return VectorRegister4Float( Vec1.x == Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1013 */
1014
1015FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1016{
1017 return (VectorRegister4Float)vceqq_f32( Vec1, Vec2 );
1018}
1019
1020FORCEINLINE VectorRegister4Double VectorCompareEQ(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1021{
1022 VectorRegister4Double Result;
1023 Result.XY = (VectorRegister2Double)vceqq_f64(Vec1.XY, Vec2.XY);
1024 Result.ZW = (VectorRegister2Double)vceqq_f64(Vec1.ZW, Vec2.ZW);
1025 return Result;
1026}
1027
1028
1029
1030/**
1031 * Creates a four-part mask based on component-wise != compares of the input vectors
1032 *
1033 * @param Vec1 1st vector
1034 * @param Vec2 2nd vector
1035 * @return VectorRegister4Float( Vec1.x != Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1036 */
1037
1038FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1039{
1040 return (VectorRegister4Float)vmvnq_u32( vceqq_f32( Vec1, Vec2 ) );
1041}
1042
1043FORCEINLINE VectorRegister4Double VectorCompareNE(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1044{
1045 VectorRegister4Double Result;
1046 Result.XY = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.XY, Vec2.XY));
1047 Result.ZW = (VectorRegister2Double)vmvnq_u32(vceqq_f64(Vec1.ZW, Vec2.ZW));
1048 return Result;
1049}
1050
1051/**
1052 * Creates a four-part mask based on component-wise > compares of the input vectors
1053 *
1054 * @param Vec1 1st vector
1055 * @param Vec2 2nd vector
1056 * @return VectorRegister4Float( Vec1.x > Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1057 */
1058
1059FORCEINLINE VectorRegister4Float VectorCompareGT(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1060{
1061 return (VectorRegister4Float)vcgtq_f32( Vec1, Vec2 );
1062}
1063
1064FORCEINLINE VectorRegister4Double VectorCompareGT(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1065{
1066 VectorRegister4Double Result;
1067 Result.XY = (VectorRegister2Double)vcgtq_f64(Vec1.XY, Vec2.XY);
1068 Result.ZW = (VectorRegister2Double)vcgtq_f64(Vec1.ZW, Vec2.ZW);
1069 return Result;
1070}
1071
1072/**
1073 * Creates a four-part mask based on component-wise >= compares of the input vectors
1074 *
1075 * @param Vec1 1st vector
1076 * @param Vec2 2nd vector
1077 * @return VectorRegister4Float( Vec1.x >= Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1078 */
1079
1080FORCEINLINE VectorRegister4Float VectorCompareGE(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1081{
1082 return (VectorRegister4Float)vcgeq_f32( Vec1, Vec2 );
1083}
1084
1085FORCEINLINE VectorRegister4Double VectorCompareGE(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1086{
1087 VectorRegister4Double Result;
1088 Result.XY = (VectorRegister2Double)vcgeq_f64(Vec1.XY, Vec2.XY);
1089 Result.ZW = (VectorRegister2Double)vcgeq_f64(Vec1.ZW, Vec2.ZW);
1090 return Result;
1091}
1092
1093/**
1094* Creates a four-part mask based on component-wise < compares of the input vectors
1095*
1096* @param Vec1 1st vector
1097* @param Vec2 2nd vector
1098* @return VectorRegister4Float( Vec1.x < Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1099*/
1100FORCEINLINE VectorRegister4Float VectorCompareLT(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1101{
1102 return (VectorRegister4Float)vcltq_f32(Vec1, Vec2);
1103}
1104
1105FORCEINLINE VectorRegister4Double VectorCompareLT(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1106{
1107 VectorRegister4Double Res;
1108 Res.XY = (VectorRegister2Double)vcltq_f64(Vec1.XY, Vec2.XY);
1109 Res.ZW = (VectorRegister2Double)vcltq_f64(Vec1.ZW, Vec2.ZW);
1110 return Res;
1111}
1112
1113/**
1114* Creates a four-part mask based on component-wise <= compares of the input vectors
1115*
1116* @param Vec1 1st vector
1117* @param Vec2 2nd vector
1118* @return VectorRegister4Float( Vec1.x <= Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
1119*/
1120FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1121{
1122 return (VectorRegister4Float)vcleq_f32(Vec1, Vec2);
1123}
1124
1125FORCEINLINE VectorRegister4Double VectorCompareLE(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1126{
1127 VectorRegister4Double Res;
1128 Res.XY = (VectorRegister2Double)vcleq_f64(Vec1.XY, Vec2.XY);
1129 Res.ZW = (VectorRegister2Double)vcleq_f64(Vec1.ZW, Vec2.ZW);
1130 return Res;
1131}
1132
1133/**
1134 * Does a bitwise vector selection based on a mask (e.g., created from VectorCompareXX)
1135 *
1136 * @param Mask Mask (when 1: use the corresponding bit from Vec1 otherwise from Vec2)
1137 * @param Vec1 1st vector
1138 * @param Vec2 2nd vector
1139 * @return VectorRegister4Float( for each bit i: Mask[i] ? Vec1[i] : Vec2[i] )
1140 *
1141 */
1142
1143FORCEINLINE VectorRegister4Float VectorSelect(const VectorRegister4Float& Mask, const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1144{
1145 return vbslq_f32((VectorRegister4Int)Mask, Vec1, Vec2);
1146}
1147
1148FORCEINLINE VectorRegister4Double VectorSelect(const VectorRegister4Double& Mask, const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1149{
1150 VectorRegister4Double Result;
1151 Result.XY = vbslq_f64((VectorRegister2Int64)Mask.XY, Vec1.XY, Vec2.XY);
1152 Result.ZW = vbslq_f64((VectorRegister2Int64)Mask.ZW, Vec1.ZW, Vec2.ZW);
1153 return Result;
1154}
1155
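// Illustrative sketch (Example_ClampToZero is a hypothetical helper): the compare intrinsics produce
// all-ones/all-zeros lane masks that feed VectorSelect for branchless per-lane selection.
FORCEINLINE VectorRegister4Float Example_ClampToZero(const VectorRegister4Float& Vec)
{
    const VectorRegister4Float Zero = VectorZeroFloat();
    const VectorRegister4Float Mask = VectorCompareGT(Vec, Zero); // 0xFFFFFFFF where Vec > 0
    return VectorSelect(Mask, Vec, Zero);                         // Vec where the mask is set, else 0
}
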
1156/**
1157 * Combines two vectors using bitwise OR (treating each vector as a 128 bit field)
1158 *
1159 * @param Vec1 1st vector
1160 * @param Vec2 2nd vector
1161 * @return VectorRegister4Float( for each bit i: Vec1[i] | Vec2[i] )
1162 */
1163FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1164{
1165 return (VectorRegister4Float)vorrq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1166}
1167
1168FORCEINLINE VectorRegister4Double VectorBitwiseOr(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1169{
1170 VectorRegister4Double Result;
1171 Result.XY = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1172 Result.ZW = (VectorRegister2Double)vorrq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1173 return Result;
1174}
1175
1176/**
1177 * Combines two vectors using bitwise AND (treating each vector as a 128 bit field)
1178 *
1179 * @param Vec1 1st vector
1180 * @param Vec2 2nd vector
1181 * @return VectorRegister4Float( for each bit i: Vec1[i] & Vec2[i] )
1182 */
1183FORCEINLINE VectorRegister4Float VectorBitwiseAnd(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1184{
1185 return (VectorRegister4Float)vandq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1186}
1187
1188FORCEINLINE VectorRegister4Double VectorBitwiseAnd(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1189{
1190 VectorRegister4Double Result;
1191 Result.XY = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1192 Result.ZW = (VectorRegister2Double)vandq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1193 return Result;
1194}
1195
1196/**
1197 * Combines two vectors using bitwise XOR (treating each vector as a 128 bit field)
1198 *
1199 * @param Vec1 1st vector
1200 * @param Vec2 2nd vector
1201 * @return VectorRegister4Float( for each bit i: Vec1[i] ^ Vec2[i] )
1202 */
1203FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1204{
1205 return (VectorRegister4Float)veorq_u32( (VectorRegister4Int)Vec1, (VectorRegister4Int)Vec2 );
1206}
1207
1208FORCEINLINE VectorRegister4Double VectorBitwiseXor(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1209{
1210 VectorRegister4Double Result;
1211 Result.XY = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.XY, (VectorRegister2Int64)Vec2.XY);
1212 Result.ZW = (VectorRegister2Double)veorq_u64((VectorRegister2Int64)Vec1.ZW, (VectorRegister2Int64)Vec2.ZW);
1213 return Result;
1214}
1215
1216
1217/**
1218 * Swizzles the 4 components of a vector and returns the result.
1219 *
1220 * @param Vec Source vector
1221 * @param X Index for which component to use for X (literal 0-3)
1222 * @param Y Index for which component to use for Y (literal 0-3)
1223 * @param Z Index for which component to use for Z (literal 0-3)
1224 * @param W Index for which component to use for W (literal 0-3)
1225 * @return The swizzled vector
1226 */
1227#ifndef __clang__
1228FORCEINLINE VectorRegister4Float VectorSwizzle
1229(
1230 VectorRegister4Float V,
1231 uint32 E0,
1232 uint32 E1,
1233 uint32 E2,
1234 uint32 E3
1235)
1236{
1237 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1238 static constexpr uint32_t ControlElement[4] =
1239 {
1240 0x03020100, // XM_SWIZZLE_X
1241 0x07060504, // XM_SWIZZLE_Y
1242 0x0B0A0908, // XM_SWIZZLE_Z
1243 0x0F0E0D0C, // XM_SWIZZLE_W
1244 };
1245
1246 uint8x8x2_t tbl;
1247 tbl.val[0] = vget_low_f32(V);
1248 tbl.val[1] = vget_high_f32(V);
1249
1250 uint32x2_t idx = vcreate_u32(static_cast<uint64>(ControlElement[E0]) | (static_cast<uint64>(ControlElement[E1]) << 32));
1251 const uint8x8_t rL = vtbl2_u8(tbl, idx);
1252
1253 idx = vcreate_u32(static_cast<uint64>(ControlElement[E2]) | (static_cast<uint64>(ControlElement[E3]) << 32));
1254 const uint8x8_t rH = vtbl2_u8(tbl, idx);
1255
1256 return vcombine_f32(rL, rH);
1257}
1258
1259FORCEINLINE VectorRegister4Double VectorSwizzle
1260(
1261 VectorRegister4Double V,
1262 uint32 E0,
1263 uint32 E1,
1264 uint32 E2,
1265 uint32 E3
1266)
1267{
1268 check((E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4));
1269 static constexpr uint64_t ControlElement[4] =
1270 {
1271 0x0706050403020100ULL, // XM_SWIZZLE_X
1272 0x0F0E0D0C0B0A0908ULL, // XM_SWIZZLE_Y
1273 0x1716151413121110ULL, // XM_SWIZZLE_Z
1274 0x1F1E1D1C1B1A1918ULL, // XM_SWIZZLE_W
1275 };
1276
1277 uint8x16x2_t tbl;
1278 tbl.val[0] = V.XY;
1279 tbl.val[1] = V.ZW;
1280
1281 VectorRegister4Double Result;
1282 uint32x4_t idx = vcombine_u64(vcreate_u64(ControlElement[E0]), vcreate_u64(ControlElement[E1]));
1283 Result.XY = vqtbl2q_u8(tbl, idx);
1284
1285 idx = vcombine_u64(vcreate_u64(ControlElement[E2]), vcreate_u64(ControlElement[E3]));
1286 Result.ZW = vqtbl2q_u8(tbl, idx);
1287
1288 return Result;
1289}
1290#else
1291template <int X, int Y, int Z, int W>
1292FORCEINLINE VectorRegister4Float VectorSwizzleImpl(VectorRegister4Float Vec)
1293{
1294 return __builtin_shufflevector(Vec, Vec, X, Y, Z, W);
1295}
1296
1297template <int X, int Y, typename std::enable_if < (X <= 1) && (Y <= 1), bool >::type = true>
1298FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1299{
1300 return __builtin_shufflevector(Vec.XY, Vec.XY, X, Y);
1301}
1302
1303template <int X, int Y, typename std::enable_if < (X <= 1) && (Y > 1), bool >::type = true>
1304FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1305{
1306 return __builtin_shufflevector(Vec.XY, Vec.ZW, X, Y);
1307}
1308
1309template <int X, int Y, typename std::enable_if < (X > 1) && (Y <= 1), bool >::type = true>
1310FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1311{
1312 return __builtin_shufflevector(Vec.ZW, Vec.XY, X - 2, Y + 2);
1313}
1314
1315template <int X, int Y, typename std::enable_if < (X > 1) && (Y > 1), bool >::type = true>
1316FORCEINLINE VectorRegister2Double VectorSwizzleImpl2(VectorRegister4Double Vec)
1317{
1318 return __builtin_shufflevector(Vec.ZW, Vec.ZW, X - 2, Y);
1319}
1320
1321template <int X, int Y, int Z, int W>
1322FORCEINLINE VectorRegister4Double VectorSwizzleImpl(VectorRegister4Double Vec)
1323{
1324 VectorRegister4Double Result;
1325 Result.XY = VectorSwizzleImpl2<X, Y>(Vec);
1326 Result.ZW = VectorSwizzleImpl2<Z, W>(Vec);
1327 return Result;
1328}
1329
1330#define VectorSwizzle( Vec, X, Y, Z, W ) VectorSwizzleImpl<X, Y, Z, W>(Vec)
1331#endif // __clang__
1332
1333
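// Illustrative sketch (Example_RotateYZXW is a hypothetical helper): VectorSwizzle takes literal lane
// indices, e.g. rotating (X,Y,Z,W) to (Y,Z,X,W), the permutation used by VectorCross further below.
FORCEINLINE VectorRegister4Float Example_RotateYZXW(const VectorRegister4Float& Vec)
{
    return VectorSwizzle(Vec, 1, 2, 0, 3);
}
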
1334/**
1335* Creates a vector through selecting two components from each vector via a shuffle mask.
1336*
1337* @param Vec1 Source vector1
1338* @param Vec2 Source vector2
1339* @param X Index for which component of Vector1 to use for X (literal 0-3)
1340* @param Y Index for which component of Vector1 to use for Y (literal 0-3)
1341* @param Z Index for which component of Vector2 to use for Z (literal 0-3)
1342* @param W Index for which component of Vector2 to use for W (literal 0-3)
1343* @return The swizzled vector
1344*/
1345#ifndef __clang__
1346FORCEINLINE VectorRegister4Float VectorShuffle
1347(
1348 VectorRegister4Float V1,
1349 VectorRegister4Float V2,
1350 uint32 PermuteX,
1351 uint32 PermuteY,
1352 uint32 PermuteZ,
1353 uint32 PermuteW
1354)
1355{
1356 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1357
1358 static constexpr uint32 ControlElement[8] =
1359 {
1360 0x03020100, // XM_PERMUTE_0X
1361 0x07060504, // XM_PERMUTE_0Y
1362 0x0B0A0908, // XM_PERMUTE_0Z
1363 0x0F0E0D0C, // XM_PERMUTE_0W
1364 0x13121110, // XM_PERMUTE_1X
1365 0x17161514, // XM_PERMUTE_1Y
1366 0x1B1A1918, // XM_PERMUTE_1Z
1367 0x1F1E1D1C, // XM_PERMUTE_1W
1368 };
1369
1370 uint8x8x4_t tbl;
1371 tbl.val[0] = vget_low_f32(V1);
1372 tbl.val[1] = vget_high_f32(V1);
1373 tbl.val[2] = vget_low_f32(V2);
1374 tbl.val[3] = vget_high_f32(V2);
1375
1376 uint32x2_t idx = vcreate_u32(static_cast<uint64>(ControlElement[PermuteX]) | (static_cast<uint64>(ControlElement[PermuteY]) << 32));
1377 const uint8x8_t rL = vtbl4_u8(tbl, idx);
1378
1379 idx = vcreate_u32(static_cast<uint64>(ControlElement[PermuteZ + 4]) | (static_cast<uint64>(ControlElement[PermuteW + 4]) << 32));
1380 const uint8x8_t rH = vtbl4_u8(tbl, idx);
1381
1382 return vcombine_f32(rL, rH);
1383}
1384
1385FORCEINLINE VectorRegister4Double VectorShuffle
1386(
1387 VectorRegister4Double V1,
1388 VectorRegister4Double V2,
1389 uint32 PermuteX,
1390 uint32 PermuteY,
1391 uint32 PermuteZ,
1392 uint32 PermuteW
1393)
1394{
1395 check(PermuteX <= 3 && PermuteY <= 3 && PermuteZ <= 3 && PermuteW <= 3);
1396
1397 static constexpr uint64 ControlElement[8] =
1398 {
1399 0x0706050403020100ULL, // XM_PERMUTE_0X
1400 0x0F0E0D0C0B0A0908ULL, // XM_PERMUTE_0Y
1401 0x1716151413121110ULL, // XM_PERMUTE_0Z
1402 0x1F1E1D1C1B1A1918ULL, // XM_PERMUTE_0W
1403
1404 0x2726252423222120ULL, // XM_PERMUTE_1X
1405 0x2F2E2D2C2B2A2928ULL, // XM_PERMUTE_1Y
1406 0x3736353433323130ULL, // XM_PERMUTE_1Z
1407 0x3F3E3D3C3B3A3938ULL, // XM_PERMUTE_1W
1408 };
1409
1410 uint8x16x4_t tbl;
1411 tbl.val[0] = V1.XY;
1412 tbl.val[1] = V1.ZW;
1413 tbl.val[2] = V2.XY;
1414 tbl.val[3] = V2.ZW;
1415
1416 VectorRegister4Double Result;
1417 uint32x4_t idx = vcombine_u64(vcreate_u64(ControlElement[PermuteX]), vcreate_u64(ControlElement[PermuteY]));
1418 Result.XY = vqtbl4q_u8(tbl, idx);
1419
1420 idx = vcombine_u64(vcreate_u64(ControlElement[PermuteZ + 4]), vcreate_u64(ControlElement[PermuteW + 4]));
1421 Result.ZW = vqtbl4q_u8(tbl, idx);
1422
1423 return Result;
1424}
1425#else
1426
1427template <int X, int Y, int Z, int W>
1428FORCEINLINE VectorRegister4Float VectorShuffleImpl(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
1429{
1430 return __builtin_shufflevector(Vec1, Vec2, X, Y, Z + 4, W + 4);
1431}
1432
1433template <int X, int Y, int Z, int W>
1434FORCEINLINE VectorRegister4Double VectorShuffleImpl(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
1435{
1436 VectorRegister4Double Result;
1437 Result.XY = VectorSwizzleImpl2<X, Y>(Vec1);
1438 Result.ZW = VectorSwizzleImpl2<Z, W>(Vec2);
1439 return Result;
1440}
1441
1442#define VectorShuffle( Vec1, Vec2, X, Y, Z, W ) VectorShuffleImpl<X, Y, Z, W>(Vec1, Vec2)
1443#endif // __clang__
1444
1445/**
1446 * Returns an integer bit-mask (0x00 - 0x0f) based on the sign-bit for each component in a vector.
1447 *
1448 * @param VecMask Vector
1449 * @return Bit 0 = sign(VecMask.x), Bit 1 = sign(VecMask.y), Bit 2 = sign(VecMask.z), Bit 3 = sign(VecMask.w)
1450 */
1451FORCEINLINE uint32 VectorMaskBits(VectorRegister4Float VecMask)
1452{
1453 uint32x4_t mmA = vtstq_u32(vreinterpretq_u32_f32(VecMask), GlobalVectorConstants::SignBit()); // set all bits of a lane whose sign bit is negative
1454 uint32x4_t mmB = vandq_u32(mmA, MakeVectorRegisterInt(0x1, 0x2, 0x4, 0x8)); // keep only one bit at its corresponding position
1455 uint32x2_t mmC = vorr_u32(vget_low_u32(mmB), vget_high_u32(mmB)); // now combine the result
1456 return vget_lane_u32(mmC, 0) | vget_lane_u32(mmC, 1); // reduce the result from 2 elements to one
1457}
1458
1459FORCEINLINE uint32 VectorMaskBits(VectorRegister4Double VecMask)
1460{
1461 uint64x2_t mmA = vtstq_u64(vreinterpretq_u64_f64(VecMask.XY), GlobalVectorConstants::DoubleSignBit().XY); // set all bits of a lane whose sign bit is negative
1462 uint64x2_t mmA1 = vtstq_u64(vreinterpretq_u64_f64(VecMask.ZW), GlobalVectorConstants::DoubleSignBit().XY);
1463 uint64x2_t mmB = vandq_u64(mmA, MakeVectorRegisterInt64(0x1, 0x2)); // keep only one bit at its corresponding position
1464 uint64x2_t mmB1 = vandq_u64(mmA1, MakeVectorRegisterInt64(0x4, 0x8));
1465 uint64x2_t mmC = vorrq_u64(mmB, mmB1); // now combine the result
1466 return (uint32)(vgetq_lane_u64(mmC, 0) | vgetq_lane_u64(mmC, 1)); // reduce the result from 2 elements to one
1467}
1468
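// Illustrative sketch (Example_AllLanesPositive is a hypothetical helper): VectorMaskBits packs the
// four lane sign bits of a compare mask into the low 4 bits of an integer, so "all lanes" / "any lane"
// tests reduce to a single scalar comparison.
FORCEINLINE bool Example_AllLanesPositive(const VectorRegister4Float& Vec)
{
    return VectorMaskBits(VectorCompareGT(Vec, VectorZeroFloat())) == 0x0F;
}
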
1469/**
1470* Creates a vector by combining two high components from each vector
1471*
1472* @param Vec1 Source vector1
1473* @param Vec2 Source vector2
1474* @return The combined vector
1475*/
1476FORCEINLINE VectorRegister4Float VectorCombineHigh(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1477{
1478 return vcombine_f32(vget_high_f32(Vec1), vget_high_f32(Vec2));
1479}
1480
1481FORCEINLINE VectorRegister4Double VectorCombineHigh(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1482{
1483 VectorRegister4Double Result;
1484 Result.XY = Vec1.ZW;
1485 Result.ZW = Vec2.ZW;
1486 return Result;
1487}
1488
1489/**
1490* Creates a vector by combining two low components from each vector
1491*
1492* @param Vec1 Source vector1
1493* @param Vec2 Source vector2
1494* @return The combined vector
1495*/
1496FORCEINLINE VectorRegister4Float VectorCombineLow(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1497{
1498 return vcombine_f32(vget_low_f32(Vec1), vget_low_f32(Vec2));
1499}
1500
1501FORCEINLINE VectorRegister4Double VectorCombineLow(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1502{
1503 VectorRegister4Double Result;
1504 Result.XY = Vec1.XY;
1505 Result.ZW = Vec2.XY;
1506 return Result;
1507}
1508
1509/**
1510 * Deinterleaves the components of the two given vectors such that the even components
1511 * are in one vector and the odds in another.
1512 *
1513 * @param Lo [Even0, Odd0, Even1, Odd1]
1514 * @param Hi [Even2, Odd2, Even3, Odd3]
1515 * @param OutEvens [Even0, Even1, Even2, Even3]
1516 * @param OutOdds [Odd0, Odd1, Odd2, Odd3]
1517*/
1518FORCEINLINE void VectorDeinterleave(VectorRegister4Float& OutEvens, VectorRegister4Float& OutOdds, const VectorRegister4Float& Lo, const VectorRegister4Float& Hi)
1519{
1520 float32x4x2_t deinterleaved = vuzpq_f32(Lo, Hi);
1521 OutEvens = deinterleaved.val[0];
1522 OutOdds = deinterleaved.val[1];
1523}
1524
1525FORCEINLINE void VectorDeinterleave(VectorRegister4Double& RESTRICT OutEvens, VectorRegister4Double& RESTRICT OutOdds, const VectorRegister4Double& Lo, const VectorRegister4Double& Hi)
1526{
1527 OutEvens = VectorShuffle(Lo, Hi, 0, 2, 0, 2);
1528 OutOdds = VectorShuffle(Lo, Hi, 1, 3, 1, 3);
1529}
1530
1531/**
1532 * Calculates the cross product of two vectors (XYZ components). W of the input should be 0, and will remain 0.
1533 *
1534 * @param Vec1 1st vector
1535 * @param Vec2 2nd vector
1536 * @return cross(Vec1.xyz, Vec2.xyz). W of the input should be 0, and will remain 0.
1537 */
1538FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float& Vec1, const VectorRegister4Float& Vec2)
1539{
1540 VectorRegister4Float C = VectorMultiply(Vec1, VectorSwizzle(Vec2, 1, 2, 0, 3));
1541 C = VectorNegateMultiplyAdd(VectorSwizzle(Vec1, 1, 2, 0, 3), Vec2, C);
1542 C = VectorSwizzle(C, 1, 2, 0, 3);
1543 return C;
1544}
1545
1546FORCEINLINE VectorRegister4Double VectorCross(const VectorRegister4Double& Vec1, const VectorRegister4Double& Vec2)
1547{
1548 VectorRegister4Double C = VectorMultiply(Vec1, VectorSwizzle(Vec2, 1, 2, 0, 3));
1549 C = VectorNegateMultiplyAdd(VectorSwizzle(Vec1, 1, 2, 0, 3), Vec2, C);
1550 C = VectorSwizzle(C, 1, 2, 0, 3);
1551 return C;
1552}
1553
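// Illustrative note: with the swizzle s = (1,2,0,3), the code above computes T = Vec1*Vec2_s - Vec1_s*Vec2
// and returns T_s, which expands to the usual cross product
// (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x) with W staying 0.
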
1554/**
1555 * Calculates x raised to the power of y (component-wise).
1556 *
1557 * @param Base Base vector
1558 * @param Exponent Exponent vector
1559 * @return VectorRegister4Float( Base.x^Exponent.x, Base.y^Exponent.y, Base.z^Exponent.z, Base.w^Exponent.w )
1560 */
1561FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float& Base, const VectorRegister4Float& Exponent)
1562{
1563 //@TODO: Optimize this
1564 union U {
1565 VectorRegister4Float V; float F[4];
1566 FORCEINLINE U() : V() {}
1567 } B, E;
1568 B.V = Base;
1569 E.V = Exponent;
1570 return MakeVectorRegister( powf(B.F[0], E.F[0]), powf(B.F[1], E.F[1]), powf(B.F[2], E.F[2]), powf(B.F[3], E.F[3]) );
1571}
1572
1573FORCEINLINE VectorRegister4Double VectorPow(const VectorRegister4Double& Base, const VectorRegister4Double& Exponent)
1574{
1575 //@TODO: Optimize this
1576 AlignedDouble4 Values(Base);
1577 AlignedDouble4 Exponents(Exponent);
1578
1579 Values[0] = FMath::Pow(Values[0], Exponents[0]);
1580 Values[1] = FMath::Pow(Values[1], Exponents[1]);
1581 Values[2] = FMath::Pow(Values[2], Exponents[2]);
1582 Values[3] = FMath::Pow(Values[3], Exponents[3]);
1583 return Values.ToVectorRegister();
1584}
1585
1586/**
1587 * Computes an estimate of the reciprocal of a vector (component-wise) and returns the result.
1588 *
1589 * @param Vec 1st vector
1590 * @return VectorRegister4Float( (Estimate) 1.0f / Vec.x, (Estimate) 1.0f / Vec.y, (Estimate) 1.0f / Vec.z, (Estimate) 1.0f / Vec.w )
1591 */
1592FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float& Vec)
1593{
1594 return vrecpeq_f32(Vec);
1595}
1596
1597FORCEINLINE VectorRegister4Double VectorReciprocalEstimate(const VectorRegister4Double& Vec)
1598{
1599 VectorRegister4Double Result;
1600 Result.XY = vrecpeq_f64(Vec.XY);
1601 Result.ZW = vrecpeq_f64(Vec.ZW);
1602 return Result;
1603}
1604
1605
1606/**
1607 * Computes the reciprocal of a vector (component-wise) and returns the result.
1608 *
1609 * @param Vec 1st vector
1610 * @return VectorRegister4Float( 1.0f / Vec.x, 1.0f / Vec.y, 1.0f / Vec.z, 1.0f / Vec.w )
1611 */
1612FORCEINLINE VectorRegister4Float VectorReciprocal(const VectorRegister4Float& Vec)
1613{
1614 // Perform two passes of Newton-Raphson iteration on the hardware estimate
1615 // The built-in instruction (VRECPS) is not as accurate
1616
1617 // Initial estimate
1618 VectorRegister4Float Reciprocal = VectorReciprocalEstimate(Vec);
1619
1620 // First iteration
1621 VectorRegister4Float Squared = VectorMultiply(Reciprocal, Reciprocal);
1622 VectorRegister4Float Double = VectorAdd(Reciprocal, Reciprocal);
1623 Reciprocal = VectorNegateMultiplyAdd(Vec, Squared, Double);
1624
1625 // Second iteration
1626 Squared = VectorMultiply(Reciprocal, Reciprocal);
1627 Double = VectorAdd(Reciprocal, Reciprocal);
1628 return VectorNegateMultiplyAdd(Vec, Squared, Double);
1629}
1630
1631FORCEINLINE VectorRegister4Double VectorReciprocal(const VectorRegister4Double& Vec)
1632{
1633 return VectorDivide(GlobalVectorConstants::DoubleOne, Vec);
1634}
1635
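// Illustrative note: each Newton-Raphson step above refines x ~ 1/v as x' = x*(2 - v*x), written as
// x' = (x + x) - v*(x*x) so that it maps directly onto VectorNegateMultiplyAdd.
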
1636
1637/**
1638 * Return the square root of each component
1639 *
1640 * @param Vector Vector
1641 * @return VectorRegister4Float(sqrt(Vec.X), sqrt(Vec.Y), sqrt(Vec.Z), sqrt(Vec.W))
1642 */
1643FORCEINLINE VectorRegister4Float VectorSqrt(const VectorRegister4Float& Vec)
1644{
1645 return vsqrtq_f32(Vec);
1646}
1647
1648FORCEINLINE VectorRegister4Double VectorSqrt(const VectorRegister4Double& Vec)
1649{
1650 VectorRegister4Double Result;
1651 Result.XY = vsqrtq_f64(Vec.XY);
1652 Result.ZW = vsqrtq_f64(Vec.ZW);
1653 return Result;
1654}
1655
1656/**
1657 * Returns an estimate of 1/sqrt(c) for each component of the vector
1658 *
1659 * @param Vector Vector
1660 * @return VectorRegister4Float(1/sqrt(t), 1/sqrt(t), 1/sqrt(t), 1/sqrt(t))
1661 */
1662FORCEINLINE VectorRegister4Float VectorReciprocalSqrtEstimate(const VectorRegister4Float& Vec)
1663{
1664 return vrsqrteq_f32(Vec);
1665}
1666
1667FORCEINLINE VectorRegister4Double VectorReciprocalSqrtEstimate(const VectorRegister4Double& Vec)
1668{
1669 VectorRegister4Double Result;
1670 Result.XY = vrsqrteq_f64(Vec.XY);
1671 Result.ZW = vrsqrteq_f64(Vec.ZW);
1672 return Result;
1673}
1674
1675/**
1676 * Return the reciprocal of the square root of each component
1677 *
1678 * @param Vector Vector
1679 * @return VectorRegister4Float(1/sqrt(Vec.X), 1/sqrt(Vec.Y), 1/sqrt(Vec.Z), 1/sqrt(Vec.W))
1680 */
1681FORCEINLINE VectorRegister4Float VectorReciprocalSqrt(const VectorRegister4Float& Vec)
1682{
1683 // Perform two passes of Newton-Raphson iteration on the hardware estimate
1684 // This is a builtin instruction (VRSQRTS)
1685
1686 // Initial estimate
1687 VectorRegister4Float RecipSqrt = VectorReciprocalSqrtEstimate(Vec);
1688
1689 // Two refinement passes
1690 RecipSqrt = VectorMultiply(vrsqrtsq_f32(Vec, VectorMultiply(RecipSqrt, RecipSqrt)), RecipSqrt);
1691 return VectorMultiply(vrsqrtsq_f32(Vec, VectorMultiply(RecipSqrt, RecipSqrt)), RecipSqrt);
1692}
1693
1694FORCEINLINE VectorRegister4Double VectorReciprocalSqrt(const VectorRegister4Double& Vec)
1695{
1696 // Perform two passes of Newton-Raphson iteration on the hardware estimate
1697 // This is a builtin instruction (VRSQRTS)
1698
1699 // Initial estimate
1700 VectorRegister4Double RecipSqrt = VectorReciprocalSqrtEstimate(Vec);
1701
1702 // Two refinement passes
1703 VectorRegister4Double Tmp;
1704 Tmp.XY = vrsqrtsq_f64(Vec.XY, VectorMultiply(RecipSqrt.XY, RecipSqrt.XY));
1705 Tmp.ZW = vrsqrtsq_f64(Vec.ZW, VectorMultiply(RecipSqrt.ZW, RecipSqrt.ZW));
1706 RecipSqrt = VectorMultiply(Tmp, RecipSqrt);
1707
1708 Tmp.XY = vrsqrtsq_f64(Vec.XY, VectorMultiply(RecipSqrt.XY, RecipSqrt.XY));
1709 Tmp.ZW = vrsqrtsq_f64(Vec.ZW, VectorMultiply(RecipSqrt.ZW, RecipSqrt.ZW));
1710 return VectorMultiply(Tmp, RecipSqrt);
1711}
1712
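// Illustrative note: vrsqrtsq computes (3 - v*x*x)/2, so each refinement above is the Newton-Raphson
// step x' = x*(3 - v*x*x)/2 for x ~ 1/sqrt(v), applied twice to the hardware estimate.
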
1713/**
1714 * Return Reciprocal Length of the vector
1715 *
1716 * @param Vector Vector
1717 * @return VectorRegister4Float(rlen, rlen, rlen, rlen) when rlen = 1/sqrt(dot4(V))
1718 */
1719FORCEINLINE VectorRegister4Float VectorReciprocalLen(const VectorRegister4Float& Vector)
1720{
1721 return VectorReciprocalSqrt(VectorDot4(Vector, Vector));
1722}
1723
1724FORCEINLINE VectorRegister4Double VectorReciprocalLen(const VectorRegister4Double& Vector)
1725{
1726 return VectorReciprocalSqrt(VectorDot4(Vector, Vector));
1727}
1728
1729/**
1730 * Return Reciprocal Length of the vector (estimate)
1731 *
1732 * @param Vector Vector
1733 * @return VectorRegister4Float(rlen, rlen, rlen, rlen) when rlen = 1/sqrt(dot4(V)) (estimate)
1734 */
1735FORCEINLINE VectorRegister4Float VectorReciprocalLenEstimate(const VectorRegister4Float& Vector)
1736{
1737 return VectorReciprocalSqrtEstimate(VectorDot4(Vector, Vector));
1738}
1739
1740FORCEINLINE VectorRegister4Double VectorReciprocalLenEstimate(const VectorRegister4Double& Vector)
1741{
1742 return VectorReciprocalSqrtEstimate(VectorDot4(Vector, Vector));
1743}
1744
1745
1746/**
1747* Loads XYZ and sets W=0
1748*
1749* @param Vector VectorRegister4Float
1750* @return VectorRegister4Float(X, Y, Z, 0.0f)
1751*/
1752FORCEINLINE VectorRegister4Float VectorSet_W0(const VectorRegister4Float& Vec)
1753{
1754 return VectorSetComponent(Vec, 3, 0.0f);
1755}
1756
1757FORCEINLINE VectorRegister4Double VectorSet_W0(const VectorRegister4Double& Vec)
1758{
1759 return VectorSetComponent(Vec, 3, 0.0);
1760}
1761
1762
1763/**
1764* Loads XYZ and sets W=1.
1765*
1766* @param Vector VectorRegister4Float
1767* @return VectorRegister4Float(X, Y, Z, 1.0f)
1768*/
1769FORCEINLINE VectorRegister4Float VectorSet_W1(const VectorRegister4Float& Vec)
1770{
1771 return VectorSetComponent(Vec, 3, 1.0f);
1772}
1773
1774FORCEINLINE VectorRegister4Double VectorSet_W1(const VectorRegister4Double& Vec)
1775{
1776 return VectorSetComponent(Vec, 3, 1.0);
1777}
1778
1779
1780
1781/**
1782* Returns a component from a vector.
1783*
1784* @param Vec Vector register
1785* @param ComponentIndex Which component to get, X=0, Y=1, Z=2, W=3
1786* @return The component as a float
1787*/
1788template <uint32 ElementIndex>
1789FORCEINLINE float VectorGetComponentImpl(VectorRegister4Float Vec)
1790{
1791 return vgetq_lane_f32(Vec, ElementIndex);
1792}
1793
1794template <int ElementIndex>
1795FORCEINLINE double VectorGetComponentImpl(VectorRegister2Double Vec)
1796{
1797 return vgetq_lane_f64(Vec, ElementIndex);
1798}
1799
1800template<int ElementIndex, typename std::enable_if< (ElementIndex > 1), bool >::type = true >
1801FORCEINLINE double VectorGetComponentImpl(const VectorRegister4Double& Vec)
1802{
1803 return VectorGetComponentImpl<ElementIndex - 2>(Vec.ZW);
1804}
1805
1806template<int ElementIndex, typename std::enable_if < (ElementIndex <= 1), bool >::type = true >
1807FORCEINLINE double VectorGetComponentImpl(const VectorRegister4Double& Vec)
1808{
1809 return VectorGetComponentImpl<ElementIndex>(Vec.XY);
1810}
1811
1812#define VectorGetComponent(Vec, ElementIndex) VectorGetComponentImpl<ElementIndex>(Vec)
1813
1814FORCEINLINE float VectorGetComponentDynamic(VectorRegister4Float Vec, uint32 ElementIndex)
1815{
1816 AlignedFloat4 Floats(Vec);
1817 return Floats[ElementIndex];
1818}
1819
1820FORCEINLINE double VectorGetComponentDynamic(VectorRegister4Double Vec, uint32 ElementIndex)
1821{
1822 AlignedDouble4 Doubles(Vec);
1823 return Doubles[ElementIndex];
1824}
1825
1826/**
1827 * Multiplies two 4x4 matrices.
1828 *
1829 * @param Result Pointer to where the result should be stored
1830 * @param Matrix1 Pointer to the first matrix
1831 * @param Matrix2 Pointer to the second matrix
1832 */
1833FORCEINLINE void VectorMatrixMultiply( FMatrix44f* Result, const FMatrix44f* Matrix1, const FMatrix44f* Matrix2 )
1834{
1835 float32x4x4_t A = vld1q_f32_x4((const float*)Matrix1);
1836 float32x4x4_t B = vld1q_f32_x4((const float*)Matrix2);
1837 float32x4x4_t R;
1838
1839 // First row of result (Matrix1[0] * Matrix2).
1840 R.val[0] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[0]), 0);
1841 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[1], vget_low_f32(A.val[0]), 1);
1842 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[2], vget_high_f32(A.val[0]), 0);
1843 R.val[0] = vfmaq_lane_f32(R.val[0], B.val[3], vget_high_f32(A.val[0]), 1);
1844
1845 // Second row of result (Matrix1[1] * Matrix2).
1846 R.val[1] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[1]), 0);
1847 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[1], vget_low_f32(A.val[1]), 1);
1848 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[2], vget_high_f32(A.val[1]), 0);
1849 R.val[1] = vfmaq_lane_f32(R.val[1], B.val[3], vget_high_f32(A.val[1]), 1);
1850
1851 // Third row of result (Matrix1[2] * Matrix2).
1852 R.val[2] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[2]), 0);
1853 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[1], vget_low_f32(A.val[2]), 1);
1854 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[2], vget_high_f32(A.val[2]), 0);
1855 R.val[2] = vfmaq_lane_f32(R.val[2], B.val[3], vget_high_f32(A.val[2]), 1);
1856
1857 // Fourth row of result (Matrix1[3] * Matrix2).
1858 R.val[3] = vmulq_lane_f32(B.val[0], vget_low_f32(A.val[3]), 0);
1859 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[1], vget_low_f32(A.val[3]), 1);
1860 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[2], vget_high_f32(A.val[3]), 0);
1861 R.val[3] = vfmaq_lane_f32(R.val[3], B.val[3], vget_high_f32(A.val[3]), 1);
1862
1863 vst1q_f32_x4((float*)Result, R);
1864}
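// Illustrative usage sketch (hypothetical helper and matrix names): composing two float
// matrices with the NEON multiply above; the result follows the row-vector convention,
// i.e. OutLocalToClip = LocalToWorld * WorldToClip.
inline void ExampleComposeTransforms(const FMatrix44f& LocalToWorld, const FMatrix44f& WorldToClip, FMatrix44f& OutLocalToClip)
{
    VectorMatrixMultiply(&OutLocalToClip, &LocalToWorld, &WorldToClip);
}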
1865
1866FORCEINLINE void VectorMatrixMultiply(FMatrix44d* Result, const FMatrix44d* Matrix1, const FMatrix44d* Matrix2)
1867{
1868 float64x2x4_t A = vld1q_f64_x4((const double*)Matrix1);
1869 float64x2x4_t B1 = vld1q_f64_x4((const double*)Matrix2);
1870 float64x2x4_t B2 = vld1q_f64_x4((const double*)Matrix2 + 8);
1871 float64_t* V = (float64_t*)&A;
1872 float64x2x4_t R;
1873
1874 // First row of result (Matrix1[0] * Matrix2).
1875 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1876 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1877 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1878 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1879
1880 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1881 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1882 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1883 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1884
1885 // Second row of result (Matrix1[1] * Matrix2).
1886 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1887 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1888 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1889 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1890
1891 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1892 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1893 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1894 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1895
1896 vst1q_f64_x4((double*)Result, R);
1897 A = vld1q_f64_x4((const double*)Matrix1 + 8);
1898 V = (float64_t*)&A;
1899
1900 // Third row of result (Matrix1[2] * Matrix2).
1901 R.val[0] = vmulq_n_f64(B1.val[0], V[0]);
1902 R.val[0] = vfmaq_n_f64(R.val[0], B1.val[2], V[1]);
1903 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[0], V[2]);
1904 R.val[0] = vfmaq_n_f64(R.val[0], B2.val[2], V[3]);
1905
1906 R.val[1] = vmulq_n_f64(B1.val[1], V[0]);
1907 R.val[1] = vfmaq_n_f64(R.val[1], B1.val[3], V[1]);
1908 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[1], V[2]);
1909 R.val[1] = vfmaq_n_f64(R.val[1], B2.val[3], V[3]);
1910
1911 // Fourth row of result (Matrix1[3] * Matrix2).
1912 R.val[2] = vmulq_n_f64(B1.val[0], V[4]);
1913 R.val[2] = vfmaq_n_f64(R.val[2], B1.val[2], V[5]);
1914 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[0], V[6]);
1915 R.val[2] = vfmaq_n_f64(R.val[2], B2.val[2], V[7]);
1916
1917 R.val[3] = vmulq_n_f64(B1.val[1], V[4]);
1918 R.val[3] = vfmaq_n_f64(R.val[3], B1.val[3], V[5]);
1919 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[1], V[6]);
1920 R.val[3] = vfmaq_n_f64(R.val[3], B2.val[3], V[7]);
1921
1922 vst1q_f64_x4((double*)Result + 8, R);
1923}
1924
1925/**
1926 * Calculate the inverse of an FMatrix.
1927 *
1928 * @param DstMatrix FMatrix pointer to where the result should be stored
1929 * @param SrcMatrix FMatrix pointer to the Matrix to be inversed
1930 */
1931// OPTIMIZE ME: stolen from UnMathFpu.h
1932FORCEINLINE void VectorMatrixInverse(FMatrix44f* DstMatrix, const FMatrix44f* SrcMatrix )
1933{
1934 typedef float Float4x4[4][4];
1935 const Float4x4& M = *((const Float4x4*) SrcMatrix);
1936 Float4x4 Result;
1937 float Det[4];
1938 Float4x4 Tmp;
1939
1940 Tmp[0][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
1941 Tmp[0][1] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
1942 Tmp[0][2] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
1943
1944 Tmp[1][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
1945 Tmp[1][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
1946 Tmp[1][2] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
1947
1948 Tmp[2][0] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
1949 Tmp[2][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
1950 Tmp[2][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
1951
1952 Tmp[3][0] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
1953 Tmp[3][1] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
1954 Tmp[3][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
1955
1956 Det[0] = M[1][1]*Tmp[0][0] - M[2][1]*Tmp[0][1] + M[3][1]*Tmp[0][2];
1957 Det[1] = M[0][1]*Tmp[1][0] - M[2][1]*Tmp[1][1] + M[3][1]*Tmp[1][2];
1958 Det[2] = M[0][1]*Tmp[2][0] - M[1][1]*Tmp[2][1] + M[3][1]*Tmp[2][2];
1959 Det[3] = M[0][1]*Tmp[3][0] - M[1][1]*Tmp[3][1] + M[2][1]*Tmp[3][2];
1960
1961 float Determinant = M[0][0]*Det[0] - M[1][0]*Det[1] + M[2][0]*Det[2] - M[3][0]*Det[3];
1962 const float RDet = 1.0f / Determinant;
1963
1964 Result[0][0] = RDet * Det[0];
1965 Result[0][1] = -RDet * Det[1];
1966 Result[0][2] = RDet * Det[2];
1967 Result[0][3] = -RDet * Det[3];
1968 Result[1][0] = -RDet * (M[1][0]*Tmp[0][0] - M[2][0]*Tmp[0][1] + M[3][0]*Tmp[0][2]);
1969 Result[1][1] = RDet * (M[0][0]*Tmp[1][0] - M[2][0]*Tmp[1][1] + M[3][0]*Tmp[1][2]);
1970 Result[1][2] = -RDet * (M[0][0]*Tmp[2][0] - M[1][0]*Tmp[2][1] + M[3][0]*Tmp[2][2]);
1971 Result[1][3] = RDet * (M[0][0]*Tmp[3][0] - M[1][0]*Tmp[3][1] + M[2][0]*Tmp[3][2]);
1972 Result[2][0] = RDet * (
1973 M[1][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
1974 M[2][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) +
1975 M[3][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1])
1976 );
1977 Result[2][1] = -RDet * (
1978 M[0][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
1979 M[2][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
1980 M[3][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1])
1981 );
1982 Result[2][2] = RDet * (
1983 M[0][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) -
1984 M[1][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
1985 M[3][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
1986 );
1987 Result[2][3] = -RDet * (
1988 M[0][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1]) -
1989 M[1][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1]) +
1990 M[2][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
1991 );
1992 Result[3][0] = -RDet * (
1993 M[1][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
1994 M[2][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) +
1995 M[3][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1])
1996 );
1997 Result[3][1] = RDet * (
1998 M[0][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
1999 M[2][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2000 M[3][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1])
2001 );
2002 Result[3][2] = -RDet * (
2003 M[0][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) -
2004 M[1][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2005 M[3][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2006 );
2007 Result[3][3] = RDet * (
2008 M[0][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1]) -
2009 M[1][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1]) +
2010 M[2][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2011 );
2012
2013 memcpy( DstMatrix, &Result, sizeof(Result) );
2014}
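// Illustrative sanity-check sketch (hypothetical helper, tolerance chosen arbitrarily):
// the inverse above is a direct cofactor expansion, so for a non-singular matrix
// M * Inverse(M) should reproduce the identity up to float rounding.
inline bool ExampleVerifyInverse(const FMatrix44f& M)
{
    FMatrix44f Inv, Prod;
    VectorMatrixInverse(&Inv, &M);
    VectorMatrixMultiply(&Prod, &M, &Inv);
    const float* P = (const float*)&Prod; // same reinterpretation the functions above use
    for (int32 i = 0; i < 16; ++i)
    {
        const float Expected = (i % 5 == 0) ? 1.0f : 0.0f; // diagonal of the identity
        if (FMath::Abs(P[i] - Expected) > 1.e-3f)
        {
            return false;
        }
    }
    return true;
}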
2015
2016FORCEINLINE void VectorMatrixInverse(FMatrix44d* DstMatrix, const FMatrix44d* SrcMatrix)
2017{
2018 typedef double Double4x4[4][4];
2019 const Double4x4& M = *((const Double4x4*)SrcMatrix);
2020 Double4x4 Result;
2021 double Det[4];
2022 Double4x4 Tmp;
2023
2024 Tmp[0][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
2025 Tmp[0][1] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
2026 Tmp[0][2] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
2027
2028 Tmp[1][0] = M[2][2] * M[3][3] - M[2][3] * M[3][2];
2029 Tmp[1][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
2030 Tmp[1][2] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
2031
2032 Tmp[2][0] = M[1][2] * M[3][3] - M[1][3] * M[3][2];
2033 Tmp[2][1] = M[0][2] * M[3][3] - M[0][3] * M[3][2];
2034 Tmp[2][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
2035
2036 Tmp[3][0] = M[1][2] * M[2][3] - M[1][3] * M[2][2];
2037 Tmp[3][1] = M[0][2] * M[2][3] - M[0][3] * M[2][2];
2038 Tmp[3][2] = M[0][2] * M[1][3] - M[0][3] * M[1][2];
2039
2040 Det[0] = M[1][1] * Tmp[0][0] - M[2][1] * Tmp[0][1] + M[3][1] * Tmp[0][2];
2041 Det[1] = M[0][1] * Tmp[1][0] - M[2][1] * Tmp[1][1] + M[3][1] * Tmp[1][2];
2042 Det[2] = M[0][1] * Tmp[2][0] - M[1][1] * Tmp[2][1] + M[3][1] * Tmp[2][2];
2043 Det[3] = M[0][1] * Tmp[3][0] - M[1][1] * Tmp[3][1] + M[2][1] * Tmp[3][2];
2044
2045 double Determinant = M[0][0] * Det[0] - M[1][0] * Det[1] + M[2][0] * Det[2] - M[3][0] * Det[3];
2046 const double RDet = 1.0 / Determinant;
2047
2048 Result[0][0] = RDet * Det[0];
2049 Result[0][1] = -RDet * Det[1];
2050 Result[0][2] = RDet * Det[2];
2051 Result[0][3] = -RDet * Det[3];
2052 Result[1][0] = -RDet * (M[1][0] * Tmp[0][0] - M[2][0] * Tmp[0][1] + M[3][0] * Tmp[0][2]);
2053 Result[1][1] = RDet * (M[0][0] * Tmp[1][0] - M[2][0] * Tmp[1][1] + M[3][0] * Tmp[1][2]);
2054 Result[1][2] = -RDet * (M[0][0] * Tmp[2][0] - M[1][0] * Tmp[2][1] + M[3][0] * Tmp[2][2]);
2055 Result[1][3] = RDet * (M[0][0] * Tmp[3][0] - M[1][0] * Tmp[3][1] + M[2][0] * Tmp[3][2]);
2056 Result[2][0] = RDet * (
2057 M[1][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
2058 M[2][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) +
2059 M[3][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1])
2060 );
2061 Result[2][1] = -RDet * (
2062 M[0][0] * (M[2][1] * M[3][3] - M[2][3] * M[3][1]) -
2063 M[2][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
2064 M[3][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1])
2065 );
2066 Result[2][2] = RDet * (
2067 M[0][0] * (M[1][1] * M[3][3] - M[1][3] * M[3][1]) -
2068 M[1][0] * (M[0][1] * M[3][3] - M[0][3] * M[3][1]) +
2069 M[3][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
2070 );
2071 Result[2][3] = -RDet * (
2072 M[0][0] * (M[1][1] * M[2][3] - M[1][3] * M[2][1]) -
2073 M[1][0] * (M[0][1] * M[2][3] - M[0][3] * M[2][1]) +
2074 M[2][0] * (M[0][1] * M[1][3] - M[0][3] * M[1][1])
2075 );
2076 Result[3][0] = -RDet * (
2077 M[1][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
2078 M[2][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) +
2079 M[3][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1])
2080 );
2081 Result[3][1] = RDet * (
2082 M[0][0] * (M[2][1] * M[3][2] - M[2][2] * M[3][1]) -
2083 M[2][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2084 M[3][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1])
2085 );
2086 Result[3][2] = -RDet * (
2087 M[0][0] * (M[1][1] * M[3][2] - M[1][2] * M[3][1]) -
2088 M[1][0] * (M[0][1] * M[3][2] - M[0][2] * M[3][1]) +
2089 M[3][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2090 );
2091 Result[3][3] = RDet * (
2092 M[0][0] * (M[1][1] * M[2][2] - M[1][2] * M[2][1]) -
2093 M[1][0] * (M[0][1] * M[2][2] - M[0][2] * M[2][1]) +
2094 M[2][0] * (M[0][1] * M[1][2] - M[0][2] * M[1][1])
2095 );
2096
2097 memcpy(DstMatrix, &Result, sizeof(Result));
2098}
2099
2100/**
2101 * Calculate Homogeneous transform.
2102 *
2103 * @param VecP VectorRegister4Float
2104 * @param MatrixM FMatrix pointer to the Matrix to apply transform
2105 * @return VectorRegister4Float = VecP*MatrixM
2106 */
2107FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float& VecP, const FMatrix44f* MatrixM)
2108{
2109 float32x4x4_t M = vld1q_f32_x4((const float*)MatrixM);
2110 VectorRegister4Float Result;
2111
2112 Result = vmulq_n_f32(M.val[0], VecP[0]);
2113 Result = vfmaq_n_f32(Result, M.val[1], VecP[1]);
2114 Result = vfmaq_n_f32(Result, M.val[2], VecP[2]);
2115 Result = vfmaq_n_f32(Result, M.val[3], VecP[3]);
2116
2117 return Result;
2118}
2119
2120FORCEINLINE VectorRegister4Float VectorTransformVector(const VectorRegister4Float& VecP, const FMatrix44d* MatrixM)
2121{
2122 float64x2x4_t M1 = vld1q_f64_x4((const double*)MatrixM);
2123 float64x2x4_t M2 = vld1q_f64_x4(((const double*)MatrixM) + 8);
2124 VectorRegister4Double Result;
2125 VectorRegister4Double Vec(VecP);
2126
2127 Result.XY = vmulq_n_f64(M1.val[0], Vec.XY[0]);
2128 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], Vec.XY[1]);
2129 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], Vec.ZW[0]);
2130 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], Vec.ZW[1]);
2131
2132 Result.ZW = vmulq_n_f64(M1.val[1], Vec.XY[0]);
2133 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], Vec.XY[1]);
2134 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], Vec.ZW[0]);
2135 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], Vec.ZW[1]);
2136
2137 return MakeVectorRegisterFloatFromDouble(Result);
2138}
2139
2140FORCEINLINE VectorRegister4Double VectorTransformVector(const VectorRegister4Double& VecP, const FMatrix44d* MatrixM)
2141{
2142 float64x2x4_t M1 = vld1q_f64_x4((const double*)MatrixM);
2143 float64x2x4_t M2 = vld1q_f64_x4(((const double*)MatrixM) + 8);
2144 VectorRegister4Double Result;
2145
2146 //TODO: this can be rewritten to avoid using M2 var, saves some registers
2147 Result.XY = vmulq_n_f64(M1.val[0], VecP.XY[0]);
2148 Result.XY = vfmaq_n_f64(Result.XY, M1.val[2], VecP.XY[1]);
2149 Result.XY = vfmaq_n_f64(Result.XY, M2.val[0], VecP.ZW[0]);
2150 Result.XY = vfmaq_n_f64(Result.XY, M2.val[2], VecP.ZW[1]);
2151
2152 Result.ZW = vmulq_n_f64(M1.val[1], VecP.XY[0]);
2153 Result.ZW = vfmaq_n_f64(Result.ZW, M1.val[3], VecP.XY[1]);
2154 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[1], VecP.ZW[0]);
2155 Result.ZW = vfmaq_n_f64(Result.ZW, M2.val[3], VecP.ZW[1]);
2156
2157 return Result;
2158}
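// Illustrative usage sketch (hypothetical helper): transforming a position as
// VecP * MatrixM per the comment above; W is set to 1 so translation is applied.
inline VectorRegister4Float ExampleTransformPoint(const FMatrix44f& LocalToWorld, float X, float Y, float Z)
{
    const VectorRegister4Float P = MakeVectorRegister(X, Y, Z, 1.0f);
    return VectorTransformVector(P, &LocalToWorld);
}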
2159
2160/**
2161 * Returns the minimum values of two vectors (component-wise).
2162 *
2163 * @param Vec1 1st vector
2164 * @param Vec2 2nd vector
2165 * @return VectorRegister4Float( min(Vec1.x,Vec2.x), min(Vec1.y,Vec2.y), min(Vec1.z,Vec2.z), min(Vec1.w,Vec2.w) )
2166 */
2167FORCEINLINE VectorRegister4Float VectorMin(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
2168{
2169 return vminq_f32( Vec1, Vec2 );
2170}
2171
2172FORCEINLINE VectorRegister4Double VectorMin(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
2173{
2174 VectorRegister4Double Result;
2175 Result.XY = vminq_f64(Vec1.XY, Vec2.XY);
2176 Result.ZW = vminq_f64(Vec1.ZW, Vec2.ZW);
2177 return Result;
2178}
2179
2180/**
2181 * Returns the maximum values of two vectors (component-wise).
2182 *
2183 * @param Vec1 1st vector
2184 * @param Vec2 2nd vector
2185 * @return VectorRegister4Float( max(Vec1.x,Vec2.x), max(Vec1.y,Vec2.y), max(Vec1.z,Vec2.z), max(Vec1.w,Vec2.w) )
2186 */
2187FORCEINLINE VectorRegister4Float VectorMax(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
2188{
2189 return vmaxq_f32( Vec1, Vec2 );
2190}
2191
2192FORCEINLINE VectorRegister4Double VectorMax(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
2193{
2194 VectorRegister4Double Result;
2195 Result.XY = vmaxq_f64(Vec1.XY, Vec2.XY);
2196 Result.ZW = vmaxq_f64(Vec1.ZW, Vec2.ZW);
2197 return Result;
2198}
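// Illustrative usage sketch (hypothetical helper): a component-wise clamp composed
// from the min/max wrappers above.
inline VectorRegister4Float ExampleClamp(VectorRegister4Float V, VectorRegister4Float Lo, VectorRegister4Float Hi)
{
    return VectorMin(VectorMax(V, Lo), Hi);
}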
2199
2200/**
2201 * Merges the XYZ components of one vector with the W component of another vector and returns the result.
2202 *
2203 * @param VecXYZ Source vector for XYZ_
2204 * @param VecW Source register for ___W (note: the fourth component is used, not the first)
2205 * @return VectorRegister4Float(VecXYZ.x, VecXYZ.y, VecXYZ.z, VecW.w)
2206 */
2207FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float& VecXYZ, const VectorRegister4Float& VecW)
2208{
2209 return vsetq_lane_f32(vgetq_lane_f32(VecW, 3), VecXYZ, 3);
2210}
2211
2212FORCEINLINE VectorRegister4Double VectorMergeVecXYZ_VecW(const VectorRegister4Double& VecXYZ, const VectorRegister4Double& VecW)
2213{
2214 VectorRegister4Double Res;
2215 Res.XY = VecXYZ.XY;
2216 Res.ZW = vsetq_lane_f64(vgetq_lane_f64(VecW.ZW, 1), VecXYZ.ZW, 1);
2217 return Res;
2218}
2219
2220/**
2221 * Loads 4 uint8s from unaligned memory and converts them into 4 floats.
2222 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar floats after you've used this intrinsic!
2223 *
2224 * @param Ptr Unaligned memory pointer to the 4 uint8s.
2225 * @return VectorRegister4Float( float(Ptr[0]), float(Ptr[1]), float(Ptr[2]), float(Ptr[3]) )
2226 */
2227FORCEINLINE VectorRegister4Float VectorLoadByte4(const void* Ptr)
2228{
2229 // OPTIMIZE ME!
2230 const uint8 *P = (const uint8 *)Ptr;
2231 return MakeVectorRegister( (float)P[0], (float)P[1], (float)P[2], (float)P[3] );
2232}
2233
2234/**
2235* Loads 4 int8s from unaligned memory and converts them into 4 floats.
2236* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar floats after you've used this intrinsic!
2237*
2238* @param Ptr Unaligned memory pointer to the 4 uint8s.
2239* @return VectorRegister4Float( float(Ptr[0]), float(Ptr[1]), float(Ptr[2]), float(Ptr[3]) )
2240*/
2241FORCEINLINE VectorRegister4Float VectorLoadSignedByte4(const void* Ptr)
2242{
2243 // OPTIMIZE ME!
2244 const int8 *P = (const int8 *)Ptr;
2245 return MakeVectorRegister((float)P[0], (float)P[1], (float)P[2], (float)P[3]);
2246}
2247
2248/**
2249 * Loads 4 uint8s from unaligned memory and converts them into 4 floats in reversed order.
2250 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar floats after you've used this intrinsic!
2251 *
2252 * @param Ptr Unaligned memory pointer to the 4 uint8s.
2253 * @return VectorRegister4Float( float(Ptr[3]), float(Ptr[2]), float(Ptr[1]), float(Ptr[0]) )
2254 */
2255FORCEINLINE VectorRegister4Float VectorLoadByte4Reverse(const uint8* Ptr)
2256{
2257 // OPTIMIZE ME!
2258 const uint8 *P = (const uint8 *)Ptr;
2259 return MakeVectorRegister( (float)P[3], (float)P[2], (float)P[1], (float)P[0] );
2260}
2261
2262/**
2263 * Converts the 4 floats in the vector to 4 uint8s, clamped to [0,255], and stores to unaligned memory.
2264 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar floats after you've used this intrinsic!
2265 *
2266 * @param Vec Vector containing 4 floats
2267 * @param Ptr Unaligned memory pointer to store the 4 uint8s.
2268 */
2269FORCEINLINE void VectorStoreByte4( VectorRegister4Float Vec, void* Ptr )
2270{
2271 uint16x8_t u16x8 = (uint16x8_t)vcvtq_u32_f32(VectorMin(Vec, GlobalVectorConstants::Float255));
2272 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16( vuzpq_u16( u16x8, u16x8 ).val[0] );
2273 u8x8 = vuzp_u8( u8x8, u8x8 ).val[0];
2274 uint32_t buf[2];
2275 vst1_u8( (uint8_t *)buf, u8x8 );
2276 *(uint32_t *)Ptr = buf[0];
2277}
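// Illustrative round-trip sketch (hypothetical helper): packing four floats already in
// [0,255] to bytes with VectorStoreByte4, then expanding them again with VectorLoadByte4.
inline void ExampleByte4RoundTrip()
{
    uint8 Packed[4];
    VectorStoreByte4(MakeVectorRegister(0.0f, 64.0f, 128.0f, 255.0f), Packed);
    const VectorRegister4Float Restored = VectorLoadByte4(Packed); // (0, 64, 128, 255)
    (void)Restored;
}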
2278
2279/**
2280* Converts the 4 floats in the vector to 4 int8s, clamped to [-127, 127], and stores to unaligned memory.
2281* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar floats after you've used this intrinsic!
2282*
2283* @param Vec Vector containing 4 floats
2284* @param Ptr Unaligned memory pointer to store the 4 int8s.
2285*/
2286FORCEINLINE void VectorStoreSignedByte4(VectorRegister4Float Vec, void* Ptr)
2287{
2288 int16x8_t s16x8 = (int16x8_t)vcvtq_s32_f32(VectorMax(VectorMin(Vec, GlobalVectorConstants::Float127), GlobalVectorConstants::FloatNeg127));
2289 int8x8_t s8x8 = (int8x8_t)vget_low_s16(vuzpq_s16(s16x8, s16x8).val[0]);
2290 s8x8 = vuzp_s8(s8x8, s8x8).val[0];
2291 int32_t buf[2];
2292 vst1_s8((int8_t *)buf, s8x8);
2293 *(int32_t *)Ptr = buf[0];
2294}
2295
2296/**
2297 * Converts the 4 floats in the vector to 4 fp16 values and stores them to aligned or unaligned memory based on the bAligned template parameter.
2298 *
2299 * @param Vec Vector containing 4 floats
2300 * @param Ptr Memory pointer to store the 4 fp16's.
2301 */
2302template <bool bAligned>
2303FORCEINLINE void VectorStoreHalf4(VectorRegister4Float Vec, void* RESTRICT Ptr)
2304{
2305 float16x4_t f16x4 = vcvt_f16_f32(Vec);
2306
2307 if (bAligned)
2308 {
2309 vst1_u8( (uint8_t *)Ptr, f16x4 );
2310 }
2311 else
2312 {
2313 alignas(16) uint16_t Buf[4];
2314 vst1_u8( (uint8_t *)Buf, f16x4 );
2315 for (int i = 0; i < 4; ++i)
2316 {
2317 ((uint16_t*)Ptr)[i] = Buf[i];
2318 }
2319 }
2320}
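// Illustrative usage sketch (hypothetical helper): converting four floats to half
// precision; the template argument selects the aligned or unaligned store path above.
inline void ExamplePackHalf4(VectorRegister4Float V, FFloat16* Dst)
{
    VectorStoreHalf4<false>(V, Dst); // use <true> only when Dst is suitably aligned
}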
2321
2322/**
2323* Loads packed RGB10A2(4 bytes) from unaligned memory and converts them into 4 FLOATs.
2324* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
2325*
2326* @param Ptr Unaligned memory pointer to the RGB10A2(4 bytes).
2327* @return VectorRegister4Float with 4 FLOATs loaded from Ptr.
2328*/
2329FORCEINLINE VectorRegister4Float VectorLoadURGB10A2N(void* Ptr)
2330{
2331 alignas(16) float V[4];
2332 const uint32 E = *(uint32*)Ptr;
2333 V[0] = float((E >> 00) & 0x3FF);
2334 V[1] = float((E >> 10) & 0x3FF);
2335 V[2] = float((E >> 20) & 0x3FF);
2336 V[3] = float((E >> 30) & 0x3);
2337
2338 VectorRegister4Float Div = MakeVectorRegister(1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f);
2339 return VectorMultiply(MakeVectorRegister(V[0], V[1], V[2], V[3]), Div);
2340}
2341
2342/**
2343* Converts the 4 FLOATs in the vector to RGB10A2, clamped to [0, 1023] and [0, 3], and stores to unaligned memory.
2344* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
2345*
2346* @param Vec Vector containing 4 FLOATs
2347* @param Ptr Unaligned memory pointer to store the packed RGB10A2(4 bytes).
2348*/
2349FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float& Vec, void* Ptr)
2350{
2351 union U {
2352 VectorRegister4Float V; float F[4];
2353 FORCEINLINE U() : V() {}
2354 } Tmp;
2355 Tmp.V = VectorMax(Vec, VectorZeroFloat());
2356 Tmp.V = VectorMin(Tmp.V, VectorOneFloat());
2357 Tmp.V = VectorMultiply(Tmp.V, MakeVectorRegister(1023.0f, 1023.0f, 1023.0f, 3.0f));
2358
2359 uint32* Out = (uint32*)Ptr;
2360 *Out = (uint32(Tmp.F[0]) & 0x3FF) << 00 |
2361 (uint32(Tmp.F[1]) & 0x3FF) << 10 |
2362 (uint32(Tmp.F[2]) & 0x3FF) << 20 |
2363 (uint32(Tmp.F[3]) & 0x003) << 30;
2364}
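// Illustrative round-trip sketch (hypothetical helper): RGB10A2 packs three 10-bit
// channels plus a 2-bit alpha into one uint32, so normalized values survive with
// roughly 1/1023 (and 1/3 for alpha) precision.
inline void ExampleRGB10A2RoundTrip()
{
    uint32 Packed = 0;
    VectorStoreURGB10A2N(MakeVectorRegister(0.25f, 0.5f, 0.75f, 1.0f), &Packed);
    const VectorRegister4Float Unpacked = VectorLoadURGB10A2N(&Packed); // approximately (0.25, 0.5, 0.75, 1.0)
    (void)Unpacked;
}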
2365
2366/**
2367 * Returns non-zero if any element in Vec1 is greater than the corresponding element in Vec2, otherwise 0.
2368 *
2369 * @param Vec1 1st source vector
2370 * @param Vec2 2nd source vector
2371 * @return Non-zero integer if (Vec1.x > Vec2.x) || (Vec1.y > Vec2.y) || (Vec1.z > Vec2.z) || (Vec1.w > Vec2.w)
2372 */
2373FORCEINLINE int32 VectorAnyGreaterThan( VectorRegister4Float Vec1, VectorRegister4Float Vec2 )
2374{
2375 uint16x8_t u16x8 = (uint16x8_t)vcgtq_f32( Vec1, Vec2 );
2376 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16( vuzpq_u16( u16x8, u16x8 ).val[0] );
2377 u8x8 = vuzp_u8( u8x8, u8x8 ).val[0];
2378 uint32_t buf[2];
2379 vst1_u8( (uint8_t *)buf, u8x8 );
2380 return (int32)buf[0]; // each byte of output corresponds to a component comparison
2381}
2382
2383FORCEINLINE int32 VectorAnyGreaterThan(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
2384{
2385 uint16x8_t u16x8_1 = (uint16x8_t)vcgtq_f64(Vec1.XY, Vec2.XY);
2386 uint16x8_t u16x8_2 = (uint16x8_t)vcgtq_f64(Vec1.ZW, Vec2.ZW);
2387 uint16x8x2_t tmp = vuzpq_u16(u16x8_1, u16x8_2);
2388 uint8x8_t u8x8 = (uint8x8_t)vget_low_u16(vuzpq_u16(tmp.val[0], tmp.val[0]).val[0]);
2389 u8x8 = vuzp_u8(u8x8, u8x8).val[0];
2390 uint32_t buf[2];
2391 vst1_u8((uint8_t*)buf, u8x8);
2392 return (int32)buf[0]; // each byte of output corresponds to a component comparison
2393}
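// Illustrative usage sketch (hypothetical helper): VectorAnyGreaterThan yields a
// non-zero integer if any lane of the first register exceeds the second, which makes
// simple rejection tests branch-friendly.
inline bool ExampleAnyLaneAbove(VectorRegister4Float Values, VectorRegister4Float Limits)
{
    return VectorAnyGreaterThan(Values, Limits) != 0;
}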
2394
2395/**
2396 * Resets the floating point registers so that they can be used again.
2397 * Some intrinsics use these for MMX purposes (e.g. VectorLoadByte4 and VectorStoreByte4).
2398 */
2399#define VectorResetFloatRegisters()
2400
2401/**
2402 * Returns the control register.
2403 *
2404 * @return The uint32 control register
2405 */
2406#define VectorGetControlRegister() 0
2407
2408/**
2409 * Sets the control register.
2410 *
2411 * @param ControlStatus The uint32 control status value to set
2412 */
2413#define VectorSetControlRegister(ControlStatus)
2414
2415/**
2416 * Control status bit to round all floating point math results towards zero.
2417 */
2418#define VECTOR_ROUND_TOWARD_ZERO 0
2419
2420
2421/**
2422* Multiplies two quaternions; the order matters.
2423*
2424* Order matters when composing quaternions: C = VectorQuaternionMultiply2(A, B) will yield a quaternion C = A * B
2425* that logically first applies B then A to any subsequent transformation (right first, then left).
2426*
2427* @param Quat1 Pointer to the first quaternion
2428* @param Quat2 Pointer to the second quaternion
2429* @return Quat1 * Quat2
2430*/
2431FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float& Quat1, const VectorRegister4Float& Quat2)
2432{
2433 VectorRegister4Float Result = VectorMultiply(VectorReplicate(Quat1, 3), Quat2);
2434 Result = VectorMultiplyAdd( VectorMultiply(VectorReplicate(Quat1, 0), VectorSwizzle(Quat2, 3,2,1,0)), GlobalVectorConstants::QMULTI_SIGN_MASK0, Result);
2435 Result = VectorMultiplyAdd( VectorMultiply(VectorReplicate(Quat1, 1), VectorSwizzle(Quat2, 2,3,0,1)), GlobalVectorConstants::QMULTI_SIGN_MASK1, Result);
2436 Result = VectorMultiplyAdd( VectorMultiply(VectorReplicate(Quat1, 2), VectorSwizzle(Quat2, 1,0,3,2)), GlobalVectorConstants::QMULTI_SIGN_MASK2, Result);
2437
2438 return Result;
2439}
2440
2441FORCEINLINE VectorRegister4Double VectorQuaternionMultiply2(const VectorRegister4Double& Quat1, const VectorRegister4Double& Quat2)
2442{
2443 VectorRegister4Double Result = VectorMultiply(VectorReplicate(Quat1, 3), Quat2);
2444 Result = VectorMultiplyAdd(VectorMultiply(VectorReplicate(Quat1, 0), VectorSwizzle(Quat2, 3, 2, 1, 0)), GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK0, Result);
2445 Result = VectorMultiplyAdd(VectorMultiply(VectorReplicate(Quat1, 1), VectorSwizzle(Quat2, 2, 3, 0, 1)), GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK1, Result);
2446 Result = VectorMultiplyAdd(VectorMultiply(VectorReplicate(Quat1, 2), VectorSwizzle(Quat2, 1, 0, 3, 2)), GlobalVectorConstants::DOUBLE_QMULTI_SIGN_MASK2, Result);
2447
2448 return Result;
2449}
2450
2451/**
2452* Multiplies two quaternions; the order matters.
2453*
2454* When composing quaternions: VectorQuaternionMultiply(C, A, B) will yield a quaternion C = A * B
2455* that logically first applies B then A to any subsequent transformation (right first, then left).
2456*
2457* @param Result Pointer to where the result Quat1 * Quat2 should be stored
2458* @param Quat1 Pointer to the first quaternion (must not be the destination)
2459* @param Quat2 Pointer to the second quaternion (must not be the destination)
2460*/
2461FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Float* RESTRICT Result, const VectorRegister4Float* RESTRICT Quat1, const VectorRegister4Float* RESTRICT Quat2)
2462{
2463 *Result = VectorQuaternionMultiply2(*Quat1, *Quat2);
2464}
2465
2466FORCEINLINE void VectorQuaternionMultiply(VectorRegister4Double* RESTRICT Result, const VectorRegister4Double* RESTRICT Quat1, const VectorRegister4Double* RESTRICT Quat2)
2467{
2468 *Result = VectorQuaternionMultiply2(*Quat1, *Quat2);
2469}
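// Illustrative usage sketch (hypothetical helper): per the ordering comment above,
// Q = A * B applies B first and then A, so chaining a child rotation onto a parent
// looks like this.
inline VectorRegister4Float ExampleComposeRotations(VectorRegister4Float ParentQuat, VectorRegister4Float ChildQuat)
{
    return VectorQuaternionMultiply2(ParentQuat, ChildQuat);
}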
2470
2471/**
2472* Computes the sine and cosine of each component of a Vector.
2473*
2474* @param VSinAngles VectorRegister4Float Pointer to where the Sin result should be stored
2475* @param VCosAngles VectorRegister4Float Pointer to where the Cos result should be stored
2476* @param VAngles VectorRegister4Float Pointer to the input angles
2477*/
2478FORCEINLINE void VectorSinCos( VectorRegister4Float* RESTRICT VSinAngles, VectorRegister4Float* RESTRICT VCosAngles, const VectorRegister4Float* RESTRICT VAngles )
2479{
2480 // Map to [-pi, pi]
2481 // X = A - 2pi * round(A/2pi)
2482 // Note the round(), not truncate(). In this case round() can round halfway cases using round-to-nearest-even OR round-to-nearest.
2483
2484 // Quotient = round(A/2pi)
2485 VectorRegister4Float Quotient = VectorMultiply(*VAngles, GlobalVectorConstants::OneOverTwoPi);
2486 Quotient = vrndnq_f32(Quotient); // round to nearest even is the default rounding mode but that's fine here.
2487
2488 // X = A - 2pi * Quotient
2489 VectorRegister4Float X = VectorNegateMultiplyAdd(GlobalVectorConstants::TwoPi, Quotient, *VAngles);
2490
2491 // Map in [-pi/2,pi/2]
2492 VectorRegister4Float sign = VectorBitwiseAnd(X, GlobalVectorConstants::SignBit());
2493 VectorRegister4Float c = VectorBitwiseOr(GlobalVectorConstants::Pi, sign); // pi when x >= 0, -pi when x < 0
2494 VectorRegister4Float absx = VectorAbs(X);
2495 VectorRegister4Float rflx = VectorSubtract(c, X);
2496 VectorRegister4Float comp = VectorCompareGT(absx, GlobalVectorConstants::PiByTwo);
2497 X = VectorSelect(comp, rflx, X);
2498 sign = VectorSelect(comp, GlobalVectorConstants::FloatMinusOne, GlobalVectorConstants::FloatOne);
2499
2500 const VectorRegister4Float XSquared = VectorMultiply(X, X);
2501
2502 // 11-degree minimax approximation
2503 //*ScalarSin = (((((-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f) * y2 + 0.0083333310f) * y2 - 0.16666667f) * y2 + 1.0f) * y;
2504 const VectorRegister4Float SinCoeff0 = MakeVectorRegister(1.0f, -0.16666667f, 0.0083333310f, -0.00019840874f);
2505 const VectorRegister4Float SinCoeff1 = MakeVectorRegister(2.7525562e-06f, -2.3889859e-08f, /*unused*/ 0.f, /*unused*/ 0.f);
2506
2507 VectorRegister4Float S;
2508 S = VectorReplicate(SinCoeff1, 1);
2509 S = VectorMultiplyAdd(XSquared, S, VectorReplicate(SinCoeff1, 0));
2510 S = VectorMultiplyAdd(XSquared, S, VectorReplicate(SinCoeff0, 3));
2511 S = VectorMultiplyAdd(XSquared, S, VectorReplicate(SinCoeff0, 2));
2512 S = VectorMultiplyAdd(XSquared, S, VectorReplicate(SinCoeff0, 1));
2513 S = VectorMultiplyAdd(XSquared, S, VectorReplicate(SinCoeff0, 0));
2514 *VSinAngles = VectorMultiply(S, X);
2515
2516 // 10-degree minimax approximation
2517 //*ScalarCos = sign * (((((-2.6051615e-07f * y2 + 2.4760495e-05f) * y2 - 0.0013888378f) * y2 + 0.041666638f) * y2 - 0.5f) * y2 + 1.0f);
2518 const VectorRegister4Float CosCoeff0 = MakeVectorRegister(1.0f, -0.5f, 0.041666638f, -0.0013888378f);
2519 const VectorRegister4Float CosCoeff1 = MakeVectorRegister(2.4760495e-05f, -2.6051615e-07f, /*unused*/ 0.f, /*unused*/ 0.f);
2520
2521 VectorRegister4Float C;
2522 C = VectorReplicate(CosCoeff1, 1);
2523 C = VectorMultiplyAdd(XSquared, C, VectorReplicate(CosCoeff1, 0));
2524 C = VectorMultiplyAdd(XSquared, C, VectorReplicate(CosCoeff0, 3));
2525 C = VectorMultiplyAdd(XSquared, C, VectorReplicate(CosCoeff0, 2));
2526 C = VectorMultiplyAdd(XSquared, C, VectorReplicate(CosCoeff0, 1));
2527 C = VectorMultiplyAdd(XSquared, C, VectorReplicate(CosCoeff0, 0));
2528 *VCosAngles = VectorMultiply(C, sign);
2529}
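// Illustrative usage sketch (hypothetical helper): evaluating sine and cosine of four
// angles at once with the range reduction and minimax polynomials implemented above.
inline void ExampleSinCos4(float A0, float A1, float A2, float A3, float OutSin[4], float OutCos[4])
{
    VectorRegister4Float Angles = MakeVectorRegister(A0, A1, A2, A3);
    VectorRegister4Float S, C;
    VectorSinCos(&S, &C, &Angles);
    VectorStore(S, OutSin);
    VectorStore(C, OutCos);
}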
2530
2531// Returns true if the vector contains a component that is either NAN or +/-infinite.
2532inline bool VectorContainsNaNOrInfinite(const VectorRegister4Float& Vec)
2533{
2534 // https://en.wikipedia.org/wiki/IEEE_754-1985
2535 // Infinity is represented with all exponent bits set, with the correct sign bit.
2536 // NaN is represented with all exponent bits set, plus at least one fraction/significant bit set.
2537 // This means finite values will not have all exponent bits set, so check against those bits.
2538
2539 union { float F; uint32 U; } InfUnion;
2540 InfUnion.U = 0x7F800000;
2541 const float Inf = InfUnion.F;
2542 const VectorRegister4Float FloatInfinity = MakeVectorRegister(Inf, Inf, Inf, Inf);
2543
2544 // Mask off Exponent
2545 VectorRegister4Float ExpTest = VectorBitwiseAnd(Vec, FloatInfinity);
2546
2547 // Compare to full exponent & combine resulting flags into lane 0
2548 const int32x4_t Table = MakeVectorRegisterIntConstant(0x0C080400, 0, 0, 0);
2549
2550 uint8x16_t res = (uint8x16_t)VectorCompareEQ(ExpTest, FloatInfinity);
2551 // If we have all zeros, all elements are finite
2552 return vgetq_lane_u32((uint32x4_t)vqtbx1q_u8(res, res, Table), 0) != 0;
2553}
2554
2555inline bool VectorContainsNaNOrInfinite(const VectorRegister4Double& Vec)
2556{
2557 // https://en.wikipedia.org/wiki/IEEE_754-1985
2558 // Infinity is represented with all exponent bits set, with the correct sign bit.
2559 // NaN is represented with all exponent bits set, plus at least one fraction/significant bit set.
2560 // This means finite values will not have all exponent bits set, so check against those bits.
2561
2562 union { double F; uint64 U; } InfUnion;
2563 InfUnion.U = 0x7FF0000000000000ULL;
2564 const double Inf = InfUnion.F;
2565 const VectorRegister4Double DoubleInfinity = MakeVectorRegister(Inf, Inf, Inf, Inf);
2566
2567 // Mask off Exponent
2568 VectorRegister4Double ExpTest = VectorBitwiseAnd(Vec, DoubleInfinity);
2569
2570 // Compare to full exponent & combine resulting flags into lane 0
2571 const int32x4_t Table = MakeVectorRegisterIntConstant(0x18100800, 0, 0, 0);
2572
2573 VectorRegister4Double InfTestRes = VectorCompareEQ(ExpTest, DoubleInfinity);
2574
2575 // If we have all zeros, all elements are finite
2576 uint8x16_t ZeroVec = vdupq_n_u8(0);
2577 //TODO: there must be a better instruction to just get the top bits or smth
2578 return vgetq_lane_u32((uint32x4_t)vqtbx2q_u8(ZeroVec, *(uint8x16x2_t*)&InfTestRes, Table), 0) != 0;
2579}
2580
2581//TODO: Vectorize
2582FORCEINLINE VectorRegister4Float VectorExp(const VectorRegister4Float& X)
2583{
2584 AlignedFloat4 Val(X);
2585 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
2586}
2587
2588FORCEINLINE VectorRegister4Double VectorExp(const VectorRegister4Double& X)
2589{
2590 AlignedDouble4 Val(X);
2591 return MakeVectorRegister(FMath::Exp(Val[0]), FMath::Exp(Val[1]), FMath::Exp(Val[2]), FMath::Exp(Val[3]));
2592}
2593
2594//TODO: Vectorize
2595FORCEINLINE VectorRegister4Float VectorExp2(const VectorRegister4Float& X)
2596{
2597 AlignedFloat4 Val(X);
2598 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
2599}
2600
2601FORCEINLINE VectorRegister4Double VectorExp2(const VectorRegister4Double& X)
2602{
2603 AlignedDouble4 Val(X);
2604 return MakeVectorRegister(FMath::Exp2(Val[0]), FMath::Exp2(Val[1]), FMath::Exp2(Val[2]), FMath::Exp2(Val[3]));
2605}
2606
2607//TODO: Vectorize
2608FORCEINLINE VectorRegister4Float VectorLog(const VectorRegister4Float& X)
2609{
2610 AlignedFloat4 Val(X);
2611 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
2612}
2613
2614FORCEINLINE VectorRegister4Double VectorLog(const VectorRegister4Double& X)
2615{
2616 AlignedDouble4 Val(X);
2617 return MakeVectorRegister(FMath::Loge(Val[0]), FMath::Loge(Val[1]), FMath::Loge(Val[2]), FMath::Loge(Val[3]));
2618}
2619
2620//TODO: Vectorize
2621FORCEINLINE VectorRegister4Float VectorLog2(const VectorRegister4Float& X)
2622{
2623 AlignedFloat4 Val(X);
2624 return MakeVectorRegister(FMath::Log2(Val[0]), FMath::Log2(Val[1]), FMath::Log2(Val[2]), FMath::Log2(Val[3]));
2625}
2626
2627FORCEINLINE VectorRegister4Double VectorLog2(const VectorRegister4Double& X)
2628{
2629 AlignedDouble4 Val(X);
2630 return MakeVectorRegister(FMath::Log2(Val[0]), FMath::Log2(Val[1]), FMath::Log2(Val[2]), FMath::Log2(Val[3]));
2631}
2632
2633//TODO: Vectorize
2634FORCEINLINE VectorRegister4Float VectorTan(const VectorRegister4Float& X)
2635{
2636 AlignedFloat4 Val(X);
2637 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
2638}
2639
2640FORCEINLINE VectorRegister4Double VectorTan(const VectorRegister4Double& X)
2641{
2642 AlignedDouble4 Val(X);
2643 return MakeVectorRegister(FMath::Tan(Val[0]), FMath::Tan(Val[1]), FMath::Tan(Val[2]), FMath::Tan(Val[3]));
2644}
2645
2646//TODO: Vectorize
2647FORCEINLINE VectorRegister4Float VectorASin(const VectorRegister4Float& X)
2648{
2649 AlignedFloat4 Val(X);
2650 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
2651}
2652
2653FORCEINLINE VectorRegister4Double VectorASin(const VectorRegister4Double& X)
2654{
2655 AlignedDouble4 Val(X);
2656 return MakeVectorRegister(FMath::Asin(Val[0]), FMath::Asin(Val[1]), FMath::Asin(Val[2]), FMath::Asin(Val[3]));
2657}
2658
2659//TODO: Vectorize
2660FORCEINLINE VectorRegister4Float VectorACos(const VectorRegister4Float& X)
2661{
2662 AlignedFloat4 Val(X);
2663 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
2664}
2665
2666FORCEINLINE VectorRegister4Double VectorACos(const VectorRegister4Double& X)
2667{
2668 AlignedDouble4 Val(X);
2669 return MakeVectorRegister(FMath::Acos(Val[0]), FMath::Acos(Val[1]), FMath::Acos(Val[2]), FMath::Acos(Val[3]));
2670}
2671
2672//TODO: Vectorize
2673FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float& X)
2674{
2675 AlignedFloat4 Val(X);
2676 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
2677}
2678
2679FORCEINLINE VectorRegister4Double VectorATan(const VectorRegister4Double& X)
2680{
2681 AlignedDouble4 Val(X);
2682 return MakeVectorRegister(FMath::Atan(Val[0]), FMath::Atan(Val[1]), FMath::Atan(Val[2]), FMath::Atan(Val[3]));
2683}
2684
2685//TODO: Vectorize
2686FORCEINLINE VectorRegister4Float VectorATan2(const VectorRegister4Float& X, const VectorRegister4Float& Y)
2687{
2688 AlignedFloat4 ValX(X);
2689 AlignedFloat4 ValY(Y);
2690
2691 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2692 FMath::Atan2(ValX[1], ValY[1]),
2693 FMath::Atan2(ValX[2], ValY[2]),
2694 FMath::Atan2(ValX[3], ValY[3]));
2695}
2696
2697FORCEINLINE VectorRegister4Double VectorATan2(const VectorRegister4Double& X, const VectorRegister4Double& Y)
2698{
2699 AlignedDouble4 ValX(X);
2700 AlignedDouble4 ValY(Y);
2701
2702 return MakeVectorRegister(FMath::Atan2(ValX[0], ValY[0]),
2703 FMath::Atan2(ValX[1], ValY[1]),
2704 FMath::Atan2(ValX[2], ValY[2]),
2705 FMath::Atan2(ValX[3], ValY[3]));
2706}
2707
2708FORCEINLINE VectorRegister4Float VectorCeil(const VectorRegister4Float& X)
2709{
2710 return vrndpq_f32(X);
2711}
2712
2713FORCEINLINE VectorRegister4Double VectorCeil(const VectorRegister4Double& X)
2714{
2715 VectorRegister4Double Result;
2716 Result.XY = vrndpq_f64(X.XY);
2717 Result.ZW = vrndpq_f64(X.ZW);
2718 return Result;
2719}
2720
2721FORCEINLINE VectorRegister4Float VectorFloor(const VectorRegister4Float& X)
2722{
2723 return vrndmq_f32(X);
2724}
2725
2726FORCEINLINE VectorRegister4Double VectorFloor(const VectorRegister4Double& X)
2727{
2728 VectorRegister4Double Result;
2729 Result.XY = vrndmq_f64(X.XY);
2730 Result.ZW = vrndmq_f64(X.ZW);
2731 return Result;
2732}
2733
2734FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float& X)
2735{
2736 return vrndq_f32(X);
2737}
2738
2739FORCEINLINE VectorRegister4Double VectorTruncate(const VectorRegister4Double& X)
2740{
2741 VectorRegister4Double Result;
2742 Result.XY = vrndq_f64(X.XY);
2743 Result.ZW = vrndq_f64(X.ZW);
2744 return Result;
2745}
2746
2747FORCEINLINE VectorRegister4Float VectorMod(const VectorRegister4Float& X, const VectorRegister4Float& Y)
2748{
2749 // Check against invalid divisor
2750 VectorRegister4Float InvalidDivisorMask = VectorCompareLE(VectorAbs(Y), GlobalVectorConstants::SmallNumber);
2751
2752 AlignedFloat4 XFloats(X), YFloats(Y);
2753 XFloats[0] = fmodf(XFloats[0], YFloats[0]);
2754 XFloats[1] = fmodf(XFloats[1], YFloats[1]);
2755 XFloats[2] = fmodf(XFloats[2], YFloats[2]);
2756 XFloats[3] = fmodf(XFloats[3], YFloats[3]);
2757 VectorRegister4Float Result = XFloats.ToVectorRegister();
2758
2759 // Return 0 where divisor Y was too small
2760 Result = VectorSelect(InvalidDivisorMask, GlobalVectorConstants::FloatZero, Result);
2761 return Result;
2762}
2763
2764FORCEINLINE VectorRegister4Double VectorMod(const VectorRegister4Double& X, const VectorRegister4Double& Y)
2765{
2766 // Check against invalid divisor
2767 VectorRegister4Double InvalidDivisorMask = VectorCompareLE(VectorAbs(Y), GlobalVectorConstants::DoubleSmallNumber);
2768
2769 AlignedDouble4 XDoubles(X), YDoubles(Y);
2770 XDoubles[0] = fmod(XDoubles[0], YDoubles[0]);
2771 XDoubles[1] = fmod(XDoubles[1], YDoubles[1]);
2772 XDoubles[2] = fmod(XDoubles[2], YDoubles[2]);
2773 XDoubles[3] = fmod(XDoubles[3], YDoubles[3]);
2774 VectorRegister4Double DoubleResult = XDoubles.ToVectorRegister();
2775
2776 // Return 0 where divisor Y was too small
2777 DoubleResult = VectorSelect(InvalidDivisorMask, GlobalVectorConstants::DoubleZero, DoubleResult);
2778 return DoubleResult;
2779}
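// Illustrative usage sketch (hypothetical helper): VectorMod follows fmod semantics
// per lane but returns 0 where the divisor is nearly zero, so dividing by a
// caller-supplied range cannot produce NaN or Inf.
inline VectorRegister4Float ExampleModByRange(VectorRegister4Float Values, float Range)
{
    return VectorMod(Values, VectorSetFloat1(Range));
}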
2780
2781FORCEINLINE VectorRegister4Float VectorSign(const VectorRegister4Float& X)
2782{
2783 VectorRegister4Float Mask = VectorCompareGE(X, GlobalVectorConstants::FloatZero);
2784 return VectorSelect(Mask, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatMinusOne);
2785}
2786
2787FORCEINLINE VectorRegister4Double VectorSign(const VectorRegister4Double& X)
2788{
2789 VectorRegister4Double Mask = VectorCompareGE(X, GlobalVectorConstants::DoubleZero);
2790 return VectorSelect(Mask, GlobalVectorConstants::DoubleOne, GlobalVectorConstants::DoubleMinusOne);
2791}
2792
2793FORCEINLINE VectorRegister4Float VectorStep(const VectorRegister4Float& X)
2794{
2795 VectorRegister4Float Mask = VectorCompareGE(X, GlobalVectorConstants::FloatZero);
2796 return VectorSelect(Mask, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero);
2797}
2798
2799FORCEINLINE VectorRegister4Double VectorStep(const VectorRegister4Double& X)
2800{
2801 VectorRegister4Double Mask = VectorCompareGE(X, GlobalVectorConstants::DoubleZero);
2802 return VectorSelect(Mask, GlobalVectorConstants::DoubleOne, GlobalVectorConstants::DoubleZero);
2803}
2804
2805namespace VectorSinConstantsNEON
2806{
2807 static const float p = 0.225f;
2808 static const float a = 7.58946609f; // 16 * sqrtf(p)
2809 static const float b = 1.63384342f; // (1 - p) / sqrtf(p)
2810 static const VectorRegister4Float A = MakeVectorRegister(a, a, a, a);
2811 static const VectorRegister4Float B = MakeVectorRegister(b, b, b, b);
2812}
2813
2814FORCEINLINE VectorRegister4Float VectorSin(const VectorRegister4Float& X)
2815{
2816 //Sine approximation using a squared parabola restrained to f(0) = 0, f(PI) = 0, f(PI/2) = 1.
2817 //based on a good discussion here http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648
2818 //After approx 2.5 million tests comparing to sin():
2819 //Average error of 0.000128
2820 //Max error of 0.001091
2821 //
2822 // Error clarification - the *relative* error rises above 1.2% near
2823 // 0 and PI (as the result nears 0). This is enough to introduce
2824 // harmonic distortion when used as an oscillator - VectorSinCos
2825 // doesn't cost that much more and is significantly more accurate.
2826 // (though don't use either for an oscillator if you care about perf)
2827
2828 VectorRegister4Float Y = VectorMultiply(X, GlobalVectorConstants::OneOverTwoPi);
2829 Y = VectorSubtract(Y, VectorFloor(VectorAdd(Y, GlobalVectorConstants::FloatOneHalf)));
2830 Y = VectorMultiply(VectorSinConstantsNEON::A, VectorMultiply(Y, VectorSubtract(GlobalVectorConstants::FloatOneHalf, VectorAbs(Y))));
2831 return VectorMultiply(Y, VectorAdd(VectorSinConstantsNEON::B, VectorAbs(Y)));
2832}
2833
2834FORCEINLINE VectorRegister4Double VectorSin(const VectorRegister4Double& X)
2835{
2836 AlignedDouble4 Doubles(X);
2837 Doubles[0] = FMath::Sin(Doubles[0]);
2838 Doubles[1] = FMath::Sin(Doubles[1]);
2839 Doubles[2] = FMath::Sin(Doubles[2]);
2840 Doubles[3] = FMath::Sin(Doubles[3]);
2841 return Doubles.ToVectorRegister();
2842}
2843
2844FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float& X)
2845{
2846 return VectorSin(VectorAdd(X, GlobalVectorConstants::PiByTwo));
2847}
2848
2849FORCEINLINE VectorRegister4Double VectorCos(const VectorRegister4Double& X)
2850{
2851 AlignedDouble4 Doubles(X);
2852 Doubles[0] = FMath::Cos(Doubles[0]);
2853 Doubles[1] = FMath::Cos(Doubles[1]);
2854 Doubles[2] = FMath::Cos(Doubles[2]);
2855 Doubles[3] = FMath::Cos(Doubles[3]);
2856 return Doubles.ToVectorRegister();
2857}
2858
2859FORCEINLINE void VectorSinCos(VectorRegister4Double* RESTRICT VSinAngles, VectorRegister4Double* RESTRICT VCosAngles, const VectorRegister4Double* RESTRICT VAngles)
2860{
2861 *VSinAngles = VectorSin(*VAngles);
2862 *VCosAngles = VectorCos(*VAngles);
2863}
2864
2865/**
2866* Loads packed RGBA16(8 bytes) from unaligned memory and converts them into 4 FLOATs.
2867* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
2868*
2869* @param Ptr Unaligned memory pointer to the RGBA16(8 bytes).
2870* @return VectorRegister4Float with 4 FLOATs loaded from Ptr.
2871*/
2872FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(const uint16* E)
2873{
2874 alignas(16) float V[4];
2875 V[0] = float(E[0]);
2876 V[1] = float(E[1]);
2877 V[2] = float(E[2]);
2878 V[3] = float(E[3]);
2879
2880 return VectorLoad(V);
2881}
2882
2883/**
2884* Loads packed signed RGBA16(8 bytes) from unaligned memory and converts them into 4 FLOATs.
2885* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
2886*
2887* @param Ptr Unaligned memory pointer to the RGBA16(8 bytes).
2888* @return VectorRegister4Float with 4 FLOATs loaded from Ptr.
2889*/
2890FORCEINLINE VectorRegister4Float VectorLoadSRGBA16N(const void* Ptr)
2891{
2892 alignas(16) float V[4];
2893 int16* E = (int16*)Ptr;
2894
2895 V[0] = float(E[0]);
2896 V[1] = float(E[1]);
2897 V[2] = float(E[2]);
2898 V[3] = float(E[3]);
2899
2900 return VectorLoad(V);
2901}
2902
2903/**
2904* Converts the 4 FLOATs in the vector to RGBA16, clamped to [0, 65535], and stores to unaligned memory.
2905* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
2906*
2907* @param Vec Vector containing 4 FLOATs
2908* @param Ptr Unaligned memory pointer to store the packed RGBA16(8 bytes).
2909*/
2910FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float& Vec, uint16* Out)
2911{
2912 VectorRegister4Float Tmp;
2913 Tmp = VectorMax(Vec, VectorZeroFloat());
2914 Tmp = VectorMin(Tmp, VectorOneFloat());
2915 Tmp = VectorMultiplyAdd(Tmp, vdupq_n_f32(65535.0f), vdupq_n_f32(0.5f));
2916 Tmp = VectorTruncate(Tmp);
2917
2918 alignas(16) float F[4];
2919 VectorStoreAligned(Tmp, F);
2920
2921 Out[0] = (uint16)F[0];
2922 Out[1] = (uint16)F[1];
2923 Out[2] = (uint16)F[2];
2924 Out[3] = (uint16)F[3];
2925}
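// Illustrative usage sketch (hypothetical helper, expected outputs approximate):
// quantizing a normalized [0,1] color to four 16-bit channels with the
// clamp/scale/truncate sequence above.
inline void ExampleQuantizeRGBA16(uint16 OutPacked[4])
{
    VectorStoreURGBA16N(MakeVectorRegister(0.0f, 0.25f, 0.5f, 1.0f), OutPacked);
    // OutPacked ends up approximately { 0, 16384, 32768, 65535 }.
}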
2926
2927//////////////////////////////////////////////////////////////////////////
2928//Integer ops
2929
2930//Bitwise
2931/** = a & b */
2932#define VectorIntAnd(A, B) vandq_s32(A, B)
2933/** = a | b */
2934#define VectorIntOr(A, B) vorrq_s32(A, B)
2935/** = a ^ b */
2936#define VectorIntXor(A, B) veorq_s32(A, B)
2937/** = (~a) & b to match _mm_andnot_si128 */
2938#define VectorIntAndNot(A, B) vandq_s32(vmvnq_s32(A), B)
2939/** = ~a */
2940#define VectorIntNot(A) vmvnq_s32(A)
2941
2942//Comparison
2943#define VectorIntCompareEQ(A, B) vceqq_s32(A,B)
2944#define VectorIntCompareNEQ(A, B) VectorIntNot(VectorIntCompareEQ(A,B))
2945#define VectorIntCompareGT(A, B) vcgtq_s32(A,B)
2946#define VectorIntCompareLT(A, B) vcltq_s32(A,B)
2947#define VectorIntCompareGE(A, B) vcgeq_s32(A,B)
2948#define VectorIntCompareLE(A, B) vcleq_s32(A,B)
2949
2950
2951FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int& Mask, const VectorRegister4Int& Vec1, const VectorRegister4Int& Vec2)
2952{
2953 return VectorIntXor(Vec2, VectorIntAnd(Mask, VectorIntXor(Vec1, Vec2)));
2954}
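// Illustrative usage sketch (hypothetical helper): branchless per-lane select using a
// mask from the integer compare macros, mirroring the structure of the VectorIntSign
// macro further below.
inline VectorRegister4Int ExampleMaxWithZero(const VectorRegister4Int& A)
{
    // Lanes where A >= 0 keep A, all other lanes become 0 (i.e. max(A, 0) per lane).
    return VectorIntSelect(VectorIntCompareGE(A, GlobalVectorConstants::IntZero), A, GlobalVectorConstants::IntZero);
}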
2955
2956//Arithmetic
2957#define VectorIntAdd(A, B) vaddq_s32(A, B)
2958#define VectorIntSubtract(A, B) vsubq_s32(A, B)
2959#define VectorIntMultiply(A, B) vmulq_s32(A, B)
2960#define VectorIntNegate(A) vnegq_s32(A)
2961#define VectorIntMin(A, B) vminq_s32(A,B)
2962#define VectorIntMax(A, B) vmaxq_s32(A,B)
2963#define VectorIntClamp(A, B, C) VectorIntMin(VectorIntMax(A, B), C)
2964#define VectorIntAbs(A) vabdq_s32(A, GlobalVectorConstants::IntZero)
2965
2966#define VectorIntSign(A) VectorIntSelect( VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne )
2967
2968#define VectorIntToFloat(A) vcvtq_f32_s32(A)
2969
2970FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float& A)
2971{
2972 return vcvtq_s32_f32(A);
2973}
2974
2975FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Double& A)
2976{
2977 return VectorFloatToInt(MakeVectorRegisterFloatFromDouble(A));
2978}
2979
2980//Loads and stores
2981
2982/**
2983* Stores a vector to memory (aligned or unaligned).
2984*
2985* @param Vec Vector to store
2986* @param Ptr Memory pointer
2987*/
2988#define VectorIntStore( Vec, Ptr ) vst1q_s32( (int32*)(Ptr), Vec )
2989
2990/**
2991* Loads 4 int32s from unaligned memory.
2992*
2993* @param Ptr Unaligned memory pointer to the 4 int32s
2994* @return VectorRegister4Int(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
2995*/
2996#define VectorIntLoad( Ptr ) vld1q_s32( (int32*)((void*)(Ptr)) )
2997
2998/**
2999* Stores a vector to memory (aligned).
3000*
3001* @param Vec Vector to store
3002* @param Ptr Aligned Memory pointer
3003*/
3004#define VectorIntStoreAligned( Vec, Ptr ) vst1q_s32( (int32*)(Ptr), Vec )
3005
3006/**
3007* Loads 4 int32s from aligned memory.
3008*
3009* @param Ptr Aligned memory pointer to the 4 int32s
3010* @return VectorRegister4Int(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
3011*/
3012#define VectorIntLoadAligned( Ptr ) vld1q_s32( (int32*)((void*)(Ptr)) )
3013
3014/**
3015* Loads 1 int32 from unaligned memory into all components of a vector register.
3016*
3017* @param Ptr Unaligned memory pointer to the 4 int32s
3018* @return VectorRegister4Int(*Ptr, *Ptr, *Ptr, *Ptr)
3019*/
3020#define VectorIntLoad1( Ptr ) vld1q_dup_s32((int32*)(Ptr))
3021
3022#define VectorIntSet1(F) vdupq_n_s32(F)
3023#define VectorSetZero() vdupq_n_s32(0)
3024#define VectorSet1(F) vdupq_n_f32(F)
3025#define VectorCastIntToFloat(Vec) ((VectorRegister4f)vreinterpretq_f32_s32(Vec))
3026#define VectorCastFloatToInt(Vec) ((VectorRegister4i)vreinterpretq_s32_f32(Vec))
3027#define VectorShiftLeftImm(Vec, ImmAmt) vshlq_n_s32(Vec, ImmAmt)
3028#define VectorShiftRightImmArithmetic(Vec, ImmAmt) vshrq_n_s32(Vec, ImmAmt)
3029#define VectorShiftRightImmLogical(Vec, ImmAmt) vshrq_n_u32(Vec, ImmAmt)
3030#define VectorRound(Vec) vrndnq_f32(Vec)
3031
3032FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float& Vec)
3033{
3034 return vcvtnq_s32_f32(Vec);
3035}
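// Illustrative usage sketch (hypothetical helper): vcvtnq rounds ties to even, so
// 0.5 and 2.5 both round toward the even integer while 1.5 rounds up, unlike a
// truncating cast.
inline void ExampleRoundHalfToEven()
{
    const VectorRegister4Int Rounded = VectorRoundToIntHalfToEven(MakeVectorRegister(0.5f, 1.5f, 2.5f, -0.5f));
    // Rounded is (0, 2, 2, 0).
    (void)Rounded;
}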
3036
3037FORCEINLINE VectorRegister4i VectorIntExpandLow16To32(VectorRegister4i V) {
3038 int16x4x2_t res = vzip_s16(vget_low_u16(V), vdup_n_u16(0));
3039 return vcombine_s16(res.val[0], res.val[1]);
3040}
3041
3042// To be continued...
3043
3044PRAGMA_ENABLE_SHADOW_VARIABLE_WARNINGS
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloatMask(uint32 X, uint32 Y, uint32 Z, uint32 W)
FORCEINLINE VectorRegister4Float VectorBitwiseXor(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorReciprocalLen(const VectorRegister4Double &Vector)
FORCEINLINE VectorRegister4Double VectorBitwiseXor(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE void VectorMatrixInverse(FMatrix44f *DstMatrix, const FMatrix44f *SrcMatrix)
FORCEINLINE void VectorDeinterleave(VectorRegister4Float &OutEvens, VectorRegister4Float &OutOdds, const VectorRegister4Float &Lo, const VectorRegister4Float &Hi)
FORCEINLINE VectorRegister4Double VectorLoadFloat3_W1(const double *Ptr)
FORCEINLINE VectorRegister4Double VectorCombineHigh(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorQuaternionMultiply2(const VectorRegister4Float &Quat1, const VectorRegister4Float &Quat2)
FORCEINLINE void VectorStoreFloat3(const VectorRegister4Float &Vec, float *Ptr)
FORCEINLINE VectorRegister4Double VectorSetComponentImpl(const VectorRegister4Double &Vec, double Scalar)
FORCEINLINE VectorRegister4Double VectorCompareNE(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Double VectorCompareEQ(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
void VectorStoreAligned(const VectorRegister4Float &Vec, float *Ptr)
FORCEINLINE void VectorStoreURGBA16N(const VectorRegister4Float &Vec, uint16 *Out)
FORCEINLINE VectorRegister4Double VectorNegateMultiplyAdd(VectorRegister4Double Vec1, VectorRegister4Double Vec2, VectorRegister4Double Sub)
FORCEINLINE VectorRegister4Double VectorLog2(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Float VectorLoad(const float *Ptr)
FORCEINLINE VectorRegister4Float VectorCross(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE void VectorMatrixMultiply(FMatrix44f *Result, const FMatrix44f *Matrix1, const FMatrix44f *Matrix2)
FORCEINLINE VectorRegister4Float VectorBitwiseOr(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorDivide(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE VectorRegister4Int VectorFloatToInt(const VectorRegister4Float &A)
bool VectorContainsNaNOrInfinite(const VectorRegister4Float &Vec)
#define VectorSetComponent(Vec, ElementIndex, Scalar)
FORCEINLINE VectorRegister4Float VectorTruncate(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Double VectorCompareLT(const VectorRegister4Double &Vec1, const VectorRegister4Double &Vec2)
FORCEINLINE VectorRegister4Float VectorMax(VectorRegister4Float Vec1, VectorRegister4Float Vec2)
FORCEINLINE VectorRegister4Double VectorASin(const VectorRegister4Double &X)
FORCEINLINE VectorRegister4Double VectorPow(const VectorRegister4Double &Base, const VectorRegister4Double &Exponent)
FORCEINLINE VectorRegister4Float VectorMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Acc)
FORCEINLINE VectorRegister4Float VectorNegateMultiplyAdd(VectorRegister4Float Vec1, VectorRegister4Float Vec2, VectorRegister4Float Sub)
FORCEINLINE VectorRegister4Double MakeVectorRegisterDouble(const VectorRegister4Float &From)
VectorRegister4Float VectorRegister4f
FORCEINLINE VectorRegister4Double VectorReciprocalSqrtEstimate(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister2Double VectorSetComponentImpl(const VectorRegister2Double &Vec, double Scalar)
FORCEINLINE VectorRegister4Float VectorPow(const VectorRegister4Float &Base, const VectorRegister4Float &Exponent)
FORCEINLINE VectorRegister4Float VectorAbs(VectorRegister4Float Vec)
FORCEINLINE VectorRegister4Float VectorLoadURGBA16N(const uint16 *E)
FORCEINLINE void VectorStoreFloat1(VectorRegister4Float Vec, float *Ptr)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(float X, float Y, float Z, float W)
FORCEINLINE VectorRegister4Float VectorReciprocalEstimate(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorAbs(VectorRegister4Double Vec)
#define VectorIntMax(A, B)
FORCEINLINE VectorRegister4Float VectorATan(const VectorRegister4Float &X)
FORCEINLINE VectorRegister4Float VectorCompareLE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorCompareEQ(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Float VectorReplicateImpl(const VectorRegister4Float &Vec)
FORCEINLINE VectorRegister4Double VectorQuaternionMultiply2(const VectorRegister4Double &Quat1, const VectorRegister4Double &Quat2)
FORCEINLINE VectorRegister4Float MakeVectorRegisterFloat(uint32 X, uint32 Y, uint32 Z, uint32 W)
FORCEINLINE VectorRegister4Float VectorNegate(VectorRegister4Float Vec)
FORCEINLINE VectorRegister4Double VectorMergeVecXYZ_VecW(const VectorRegister4Double &VecXYZ, const VectorRegister4Double &VecW)
FORCEINLINE VectorRegister2Double MakeVectorRegister2Double(double X, double Y)
FORCEINLINE void VectorStoreByte4(VectorRegister4Float Vec, void *Ptr)
VectorRegister2Double VectorRegister2d
FORCEINLINE uint32 VectorMaskBits(VectorRegister4Float VecMask)
FORCEINLINE VectorRegister4Double VectorSqrt(const VectorRegister4Double &Vec)
FORCEINLINE VectorRegister4Float VectorCompareNE(const VectorRegister4Float &Vec1, const VectorRegister4Float &Vec2)
FORCEINLINE VectorRegister4Double VectorATan2(const VectorRegister4Double &X, const VectorRegister4Double &Y)
FORCEINLINE VectorRegister4Float VectorMergeVecXYZ_VecW(const VectorRegister4Float &VecXYZ, const VectorRegister4Float &VecW)
FORCEINLINE VectorRegister4Double VectorDot4(VectorRegister4Double Vec1, VectorRegister4Double Vec2)
FORCEINLINE void VectorStoreURGB10A2N(const VectorRegister4Float &Vec, void *Ptr)
FORCEINLINE VectorRegister4Float VectorCos(const VectorRegister4Float &X)
FFloat16 & operator=(float FP32Value)
Definition Float16.h:140
FORCEINLINE double operator[](int32 Index) const
FORCEINLINE AlignedDouble4(const VectorRegister4Double &Vec)
FORCEINLINE double & operator[](int32 Index)
FORCEINLINE VectorRegister4Double ToVectorRegister() const
FORCEINLINE AlignedFloat4(const VectorRegister4Float &Vec)
FORCEINLINE float & operator[](int32 Index)
FORCEINLINE float operator[](int32 Index) const
FORCEINLINE VectorRegister4Float ToVectorRegister() const
static UE_NODISCARD FORCEINLINE double Log2(double Value)
FORCEINLINE constexpr VectorRegister4Double(VectorRegister2Double xy, VectorRegister2Double zw, VectorRegisterConstInit)
FORCEINLINE VectorRegister4Double & operator=(const VectorRegister4Float &From)
FORCEINLINE VectorRegister4Double(const VectorRegister4Float &From)
FORCEINLINE VectorRegister4Double(const VectorRegister2Double &xy, const VectorRegister2Double &zw)
FORCEINLINE VectorRegister4Double()=default
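Taken together, the declarations indexed above form the NEON backend of Unreal's platform-abstracted SIMD layer: registers are built with MakeVectorRegister/VectorSetFloat1, combined with arithmetic and compare/select operations, and read back through helpers such as AlignedFloat4. The sketch below is illustrative only and is not part of UnrealMathNeon.h; it assumes a UE module context where "Math/UnrealMathUtility.h" (which selects the per-platform VectorRegister implementation, since this header is not included directly) is available. The names ClampedMulAdd_Example, SumLanes_Example, and the Threshold parameter are hypothetical; the functions it calls (VectorMultiplyAdd, VectorSetFloat1, VectorCompareGT, VectorSelect, AlignedFloat4) all appear in the index above.

// Illustrative sketch only; not part of UnrealMathNeon.h.
// Assumes a UE module where "Math/UnrealMathUtility.h" is available.
#include "Math/UnrealMathUtility.h"

// Hypothetical helper: computes (A * B) + Acc, then clamps each lane to Threshold.
FORCEINLINE VectorRegister4Float ClampedMulAdd_Example(
    VectorRegister4Float A, VectorRegister4Float B, VectorRegister4Float Acc, float Threshold)
{
    // Multiply-accumulate across all four lanes: (A * B) + Acc.
    const VectorRegister4Float Result = VectorMultiplyAdd(A, B, Acc);
    // Broadcast the scalar threshold into every lane.
    const VectorRegister4Float VThreshold = VectorSetFloat1(Threshold);
    // Per-lane mask: all bits set where Result > Threshold, zero otherwise.
    const VectorRegister4Float Mask = VectorCompareGT(Result, VThreshold);
    // Where the mask is set take VThreshold, otherwise keep Result.
    return VectorSelect(Mask, VThreshold, Result);
}

// Hypothetical helper: horizontal sum using AlignedFloat4 for indexed lane access.
FORCEINLINE float SumLanes_Example(const VectorRegister4Float& Vec)
{
    AlignedFloat4 Lanes(Vec);
    return Lanes[0] + Lanes[1] + Lanes[2] + Lanes[3];
}

A caller could widen such a result with MakeVectorRegisterDouble(const VectorRegister4Float&) or write it out with VectorStoreAligned, both of which also appear in the index above; the two _Example helpers themselves are purely for illustration.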