Ark Server API (ASA) - Wiki
Loading...
Searching...
No Matches
UnrealMathDirectX.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
5// HEADER_UNIT_SKIP - Not included directly
6
7#if defined __cplusplus_cli && !PLATFORM_HOLOLENS
8// there are compile issues with this file in managed mode, so use the FPU version
9#include "Math/UnrealMathFPU.h"
10#else
11
12#pragma warning( push )
13// We're in non-managed mode, ignore warnings about _M_CEE and _MANAGED for DirectXMath.h
14#pragma warning ( disable : 4668 )
15#include <DirectXMath.h>
16#pragma warning( pop )
17#include <DirectXPackedVector.h>
18
19/*=============================================================================
20 * Helpers:
21 *============================================================================*/
22
23/**
24 * float4 vector register type, where the first float (X) is stored in the lowest 32 bits, and so on.
25 */
26typedef DirectX::XMVECTOR VectorRegister;
27typedef __m128i VectorRegister4Int;
28
29// for an DirectX::XMVECTOR, we need a single set of braces (for clang)
30#define DECLARE_VECTOR_REGISTER(X, Y, Z, W) { X, Y, Z, W }
31
32
33/**
34 * Returns a bitwise equivalent vector based on 4 DWORDs.
35 *
36 * @param X 1st uint32 component
37 * @param Y 2nd uint32 component
38 * @param Z 3rd uint32 component
39 * @param W 4th uint32 component
40 * @return Bitwise equivalent vector with 4 floats
41 */
42FORCEINLINE VectorRegister MakeVectorRegister( uint32 X, uint32 Y, uint32 Z, uint32 W )
43{
44 using namespace DirectX;
45 return DirectX::XMVectorSetInt( X, Y, Z, W );
46}
47
48/**
49 * Returns a vector based on 4 FLOATs.
50 *
51 * @param X 1st float component
52 * @param Y 2nd float component
53 * @param Z 3rd float component
54 * @param W 4th float component
55 * @return Vector of the 4 FLOATs
56 */
57FORCEINLINE VectorRegister MakeVectorRegister( float X, float Y, float Z, float W )
58{
59 return DirectX::XMVectorSet( X, Y, Z, W );
60}
61
62/**
63 * Returns a vector based on 4 int32.
64 *
65 * @param X 1st int32 component
66 * @param Y 2nd int32 component
67 * @param Z 3rd int32 component
68 * @param W 4th int32 component
69 * @return Vector of the 4 int32
70 */
71FORCEINLINE VectorRegister4Int MakeVectorRegisterInt(int32 X, int32 Y, int32 Z, int32 W)
72{
73 return _mm_castps_si128(DirectX::XMVectorSetInt(X, Y, Z, W));
74}
75
76FORCEINLINE constexpr VectorRegister4Int MakeVectorRegisterIntConstant(int32 X, int32 Y, int32 Z, int32 W)
77{
78 return {static_cast<char>(X >> 0), static_cast<char>(X >> 8), static_cast<char>(X >> 16), static_cast<char>(X >> 24),
79 static_cast<char>(Y >> 0), static_cast<char>(Y >> 8), static_cast<char>(Y >> 16), static_cast<char>(Y >> 24),
80 static_cast<char>(Z >> 0), static_cast<char>(Z >> 8), static_cast<char>(Z >> 16), static_cast<char>(Z >> 24),
81 static_cast<char>(W >> 0), static_cast<char>(W >> 8), static_cast<char>(W >> 16), static_cast<char>(W >> 24)};
82}
83
84FORCEINLINE constexpr VectorRegister4Float MakeVectorRegisterFloatConstant(float X, float Y, float Z, float W)
85{
86 return VectorRegister4Float { X, Y, Z, W };
87}
88
89FORCEINLINE constexpr VectorRegister2Double MakeVectorRegister2DoubleConstant(double X, double Y)
90{
91 return VectorRegister2Double { X, Y };
92}
93
94/*=============================================================================
95 * Constants:
96 *============================================================================*/
97
98#include "Math/UnrealMathVectorConstants.h"
99
100
101/*=============================================================================
102 * Intrinsics:
103 *============================================================================*/
104
105/**
106 * Returns a vector with all zeros.
107 *
108 * @return VectorRegister(0.0f, 0.0f, 0.0f, 0.0f)
109 */
110#define VectorZero() DirectX::XMVectorZero()
111
112/**
113 * Returns a vector with all ones.
114 *
115 * @return VectorRegister(1.0f, 1.0f, 1.0f, 1.0f)
116 */
117#define VectorOne() DirectX::g_XMOne.v
118
119/**
120 * Loads 4 FLOATs from unaligned memory.
121 *
122 * @param Ptr Unaligned memory pointer to the 4 FLOATs
123 * @return VectorRegister(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
124 */
125#define VectorLoad( Ptr ) DirectX::XMLoadFloat4( (const DirectX::XMFLOAT4*)(Ptr) )
126
/**
 * Loads 3 FLOATs from unaligned memory. Callers must not rely on the value of W.
 *
 * Ptr is expanded exactly once, so an argument with side effects (e.g. Ptr++) is safe.
 *
 * @param Ptr	Unaligned memory pointer to the 3 FLOATs
 * @return		VectorRegister(Ptr[0], Ptr[1], Ptr[2], unspecified)
 */
#define VectorLoadFloat3( Ptr )			DirectX::XMLoadFloat3( (const DirectX::XMFLOAT3*)(Ptr) )
134
135
/**
 * Loads 3 FLOATs from unaligned memory and sets W=0.
 *
 * Ptr is expanded exactly once, so an argument with side effects (e.g. Ptr++) is safe.
 * XMLoadFloat3 initializes the W lane of its result to 0.
 *
 * @param Ptr	Unaligned memory pointer to the 3 FLOATs
 * @return		VectorRegister(Ptr[0], Ptr[1], Ptr[2], 0.0f)
 */
#define VectorLoadFloat3_W0( Ptr )		DirectX::XMLoadFloat3( (const DirectX::XMFLOAT3*)(Ptr) )
143
144
/**
 * Loads 3 FLOATs from unaligned memory and sets W=1.
 *
 * Ptr is expanded exactly once, so an argument with side effects (e.g. Ptr++) is safe.
 *
 * @param Ptr	Unaligned memory pointer to the 3 FLOATs
 * @return		VectorRegister(Ptr[0], Ptr[1], Ptr[2], 1.0f)
 */
#define VectorLoadFloat3_W1( Ptr )		DirectX::XMVectorSetW( DirectX::XMLoadFloat3( (const DirectX::XMFLOAT3*)(Ptr) ), 1.0f )
152
153/**
154 * Loads 4 FLOATs from aligned memory.
155 *
156 * @param Ptr Aligned memory pointer to the 4 FLOATs
157 * @return VectorRegister(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
158 */
159#define VectorLoadAligned( Ptr ) DirectX::XMLoadFloat4A( (const DirectX::XMFLOAT4A*)(Ptr) )
160
161/**
162 * Loads 1 float from unaligned memory and replicates it to all 4 elements.
163 *
164 * @param Ptr Unaligned memory pointer to the float
165 * @return VectorRegister(Ptr[0], Ptr[0], Ptr[0], Ptr[0])
166 */
167#define VectorLoadFloat1( Ptr ) DirectX::XMVectorReplicatePtr( (const float*)(Ptr) )
168
/**
 * Loads 2 floats from unaligned memory into X and Y and duplicates them in Z and W.
 *
 * Ptr is expanded exactly once, so an argument with side effects (e.g. Ptr++) is safe.
 *
 * @param Ptr	Unaligned memory pointer to the floats
 * @return		VectorRegister(Ptr[0], Ptr[1], Ptr[0], Ptr[1])
 */
#define VectorLoadFloat2( Ptr )			DirectX::XMVectorSwizzle<0,1,0,1>( DirectX::XMLoadFloat2( (const DirectX::XMFLOAT2*)(Ptr) ) )
176
177/**
178 * Loads 4 unaligned floats - 2 from the first pointer, 2 from the second, and packs
179 * them in to 1 vector.
180 *
181 * @param Ptr1 Unaligned memory pointer to the first 2 floats
182 * @param Ptr2 Unaligned memory pointer to the second 2 floats
183 * @return VectorRegister(Ptr1[0], Ptr1[1], Ptr2[0], Ptr2[1])
184 */
185FORCEINLINE VectorRegister VectorLoadTwoPairsFloat(const float* Ptr1, const float* Ptr2)
186{
187 __m128 Ret = _mm_castpd_ps(_mm_load_sd((double const*)(Ptr1)));
188 Ret = _mm_loadh_pi(Ret, (__m64 const*)(Ptr2));
189 return Ret;
190}
191
192
193/**
194 * Creates a vector out of three FLOATs and leaves W undefined.
195 *
196 * @param X 1st float component
197 * @param Y 2nd float component
198 * @param Z 3rd float component
199 * @return VectorRegister(X, Y, Z, undefined)
200 */
201#define VectorSetFloat3( X, Y, Z ) MakeVectorRegister( X, Y, Z, 0.0f )
202
/**
 * Propagates passed in float to all registers.
 *
 * X is expanded exactly once, so an argument with side effects or a costly
 * expression is evaluated a single time. The argument is taken as a float.
 *
 * @param X		float to replicate to all registers
 * @return		VectorRegister(X, X, X, X)
 */
#define VectorSetFloat1( X )			DirectX::XMVectorReplicate( X )
210
211 /**
212 * Creates a vector out of four FLOATs.
213 *
214 * @param X 1st float component
215 * @param Y 2nd float component
216 * @param Z 3rd float component
217 * @param W 4th float component
218 * @return VectorRegister(X, Y, Z, W)
219 */
220#define VectorSet( X, Y, Z, W ) MakeVectorRegister( X, Y, Z, W )
221
222/**
223 * Stores a vector to aligned memory.
224 *
225 * @param Vec Vector to store
226 * @param Ptr Aligned memory pointer
227 */
228#define VectorStoreAligned( Vec, Ptr ) DirectX::XMStoreFloat4A((DirectX::XMFLOAT4A*)(Ptr), Vec )
229
230 /**
231 * Performs non-temporal store of a vector to aligned memory without polluting the caches
232 *
233 * @param Vec Vector to store
234 * @param Ptr Aligned memory pointer
235 */
236#define VectorStoreAlignedStreamed( Vec, Ptr ) XM_STREAM_PS( (float*)(Ptr), Vec )
237
238/**
239 * Stores a vector to memory (aligned or unaligned).
240 *
241 * @param Vec Vector to store
242 * @param Ptr Memory pointer
243 */
244#define VectorStore( Vec, Ptr ) DirectX::XMStoreFloat4((DirectX::XMFLOAT4*)(Ptr), Vec )
245
246/**
247 * Stores the XYZ components of a vector to unaligned memory.
248 *
249 * @param Vec Vector to store XYZ
250 * @param Ptr Unaligned memory pointer
251 */
252#define VectorStoreFloat3( Vec, Ptr ) DirectX::XMStoreFloat3((DirectX::XMFLOAT3*)(Ptr), Vec )
253
254/**
255 * Stores the X component of a vector to unaligned memory.
256 *
257 * @param Vec Vector to store X
258 * @param Ptr Unaligned memory pointer
259 */
260#define VectorStoreFloat1( Vec, Ptr ) DirectX::XMStoreFloat((float*)(Ptr), Vec )
261
262
263/**
264 * Returns an component from a vector.
265 *
266 * @param Vec Vector register
267 * @param ComponentIndex Which component to get, X=0, Y=1, Z=2, W=3
268 * @return The component as a float
269 */
270FORCEINLINE float VectorGetComponent( VectorRegister Vec, uint32 ComponentIndex )
271{
272 switch (ComponentIndex)
273 {
274 case 0:
275 return DirectX::XMVectorGetX(Vec);
276 case 1:
277 return DirectX::XMVectorGetY(Vec);
278 case 2:
279 return DirectX::XMVectorGetZ(Vec);
280 case 3:
281 return DirectX::XMVectorGetW(Vec);
282 }
283
284 return 0.0f;
285}
286
287
288 /**
289 * Replicates one element into all four elements and returns the new vector.
290 *
291 * @param Vec Source vector
292 * @param ElementIndex Index (0-3) of the element to replicate
293 * @return VectorRegister( Vec[ElementIndex], Vec[ElementIndex], Vec[ElementIndex], Vec[ElementIndex] )
294 */
295#define VectorReplicate( Vec, ElementIndex ) DirectX::XMVectorSwizzle<ElementIndex,ElementIndex,ElementIndex,ElementIndex>(Vec)
296
297/**
298 * Returns the absolute value (component-wise).
299 *
300 * @param Vec Source vector
301 * @return VectorRegister( abs(Vec.x), abs(Vec.y), abs(Vec.z), abs(Vec.w) )
302 */
303#define VectorAbs( Vec ) DirectX::XMVectorAbs( Vec )
304
305/**
306 * Returns the negated value (component-wise).
307 *
308 * @param Vec Source vector
309 * @return VectorRegister( -Vec.x, -Vec.y, -Vec.z, -Vec.w )
310 */
311#define VectorNegate( Vec ) DirectX::XMVectorNegate( Vec )
312
313/**
314 * Adds two vectors (component-wise) and returns the result.
315 *
316 * @param Vec1 1st vector
317 * @param Vec2 2nd vector
318 * @return VectorRegister( Vec1.x+Vec2.x, Vec1.y+Vec2.y, Vec1.z+Vec2.z, Vec1.w+Vec2.w )
319 */
320#define VectorAdd( Vec1, Vec2 ) DirectX::XMVectorAdd( Vec1, Vec2 )
321
322/**
323 * Subtracts a vector from another (component-wise) and returns the result.
324 *
325 * @param Vec1 1st vector
326 * @param Vec2 2nd vector
327 * @return VectorRegister( Vec1.x-Vec2.x, Vec1.y-Vec2.y, Vec1.z-Vec2.z, Vec1.w-Vec2.w )
328 */
329 #define VectorSubtract( Vec1, Vec2 ) DirectX::XMVectorSubtract( Vec1, Vec2 )
330
331/**
332 * Multiplies two vectors (component-wise) and returns the result.
333 *
334 * @param Vec1 1st vector
335 * @param Vec2 2nd vector
336 * @return VectorRegister( Vec1.x*Vec2.x, Vec1.y*Vec2.y, Vec1.z*Vec2.z, Vec1.w*Vec2.w )
337 */
338#define VectorMultiply( Vec1, Vec2 ) DirectX::XMVectorMultiply( Vec1, Vec2 )
339
340 /**
341 * Divides two vectors (component-wise) and returns the result.
342 *
343 * @param Vec1 1st vector
344 * @param Vec2 2nd vector
345 * @return VectorRegister( Vec1.x/Vec2.x, Vec1.y/Vec2.y, Vec1.z/Vec2.z, Vec1.w/Vec2.w )
346 */
347#define VectorDivide( Vec1, Vec2 ) DirectX::XMVectorDivide( Vec1, Vec2 )
348
349/**
350 * Multiplies two vectors (component-wise), adds in the third vector and returns the result.
351 *
352 * @param Vec1 1st vector
353 * @param Vec2 2nd vector
354 * @param Vec3 3rd vector
355 * @return VectorRegister( Vec1.x*Vec2.x + Vec3.x, Vec1.y*Vec2.y + Vec3.y, Vec1.z*Vec2.z + Vec3.z, Vec1.w*Vec2.w + Vec3.w )
356 */
357#define VectorMultiplyAdd( Vec1, Vec2, Vec3 ) DirectX::XMVectorMultiplyAdd( Vec1, Vec2, Vec3 )
358
359/**
360 * Multiplies two vectors (component-wise), negates the results and adds it to the third vector i.e. -AB + C = C - AB
361 *
362 * @param Vec1 1st vector
363 * @param Vec2 2nd vector
364 * @param Vec3 3rd vector
365 * @return VectorRegister( Vec3.x - Vec1.x*Vec2.x, Vec3.y - Vec1.y*Vec2.y, Vec3.z - Vec1.z*Vec2.z, Vec3.w - Vec1.w*Vec2.w )
366 */
367#define VectorNegateMultiplyAdd(Vec1, Vec2, Vec3) DirectX::XMVectorNegativeMultiplySubtract( Vec1, Vec2, Vec3 )
368
369/**
370 * Calculates the dot3 product of two vectors and returns a vector with the result in all 4 components.
371 * Only really efficient on Xbox 360.
372 *
373 * @param Vec1 1st vector
374 * @param Vec2 2nd vector
375 * @return d = dot3(Vec1.xyz, Vec2.xyz), VectorRegister( d, d, d, d )
376 */
377#define VectorDot3( Vec1, Vec2 ) DirectX::XMVector3Dot( Vec1, Vec2 )
378
379/**
380 * Calculates the dot4 product of two vectors and returns a vector with the result in all 4 components.
381 * Only really efficient on Xbox 360.
382 *
383 * @param Vec1 1st vector
384 * @param Vec2 2nd vector
385 * @return d = dot4(Vec1.xyzw, Vec2.xyzw), VectorRegister( d, d, d, d )
386 */
387 #define VectorDot4( Vec1, Vec2 ) DirectX::XMVector4Dot( Vec1, Vec2 )
388
389/**
390 * Creates a four-part mask based on component-wise == compares of the input vectors
391 *
392 * @param Vec1 1st vector
393 * @param Vec2 2nd vector
394 * @return VectorRegister( Vec1.x == Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
395 */
396#define VectorCompareEQ( Vec1, Vec2 ) DirectX::XMVectorEqual( Vec1, Vec2 )
397
398/**
399 * Creates a four-part mask based on component-wise != compares of the input vectors
400 *
401 * @param Vec1 1st vector
402 * @param Vec2 2nd vector
403 * @return VectorRegister( Vec1.x != Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
404 */
405#define VectorCompareNE( Vec1, Vec2 ) DirectX::XMVectorNotEqual( Vec1, Vec2 )
406
407/**
408 * Creates a four-part mask based on component-wise > compares of the input vectors
409 *
410 * @param Vec1 1st vector
411 * @param Vec2 2nd vector
412 * @return VectorRegister( Vec1.x > Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
413 */
414#define VectorCompareGT( Vec1, Vec2 ) DirectX::XMVectorGreater( Vec1, Vec2 )
415
416/**
417 * Creates a four-part mask based on component-wise >= compares of the input vectors
418 *
419 * @param Vec1 1st vector
420 * @param Vec2 2nd vector
421 * @return VectorRegister( Vec1.x >= Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
422 */
423#define VectorCompareGE( Vec1, Vec2 ) DirectX::XMVectorGreaterOrEqual( Vec1, Vec2 )
424
425 /**
426 * Creates a four-part mask based on component-wise < compares of the input vectors
427 *
428 * @param Vec1 1st vector
429 * @param Vec2 2nd vector
430 * @return VectorRegister( Vec1.x < Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
431 */
432#define VectorCompareLT( Vec1, Vec2 ) _mm_cmplt_ps( Vec1, Vec2 )
433
434 /**
435 * Creates a four-part mask based on component-wise <= compares of the input vectors
436 *
437 * @param Vec1 1st vector
438 * @param Vec2 2nd vector
439 * @return VectorRegister( Vec1.x <= Vec2.x ? 0xFFFFFFFF : 0, same for yzw )
440 */
441#define VectorCompareLE( Vec1, Vec2 ) _mm_cmple_ps( Vec1, Vec2 )
442
443/**
444 * Does a bitwise vector selection based on a mask (e.g., created from VectorCompareXX)
445 *
446 * @param Mask Mask (when 1: use the corresponding bit from Vec1 otherwise from Vec2)
447 * @param Vec1 1st vector
448 * @param Vec2 2nd vector
449 * @return VectorRegister( for each bit i: Mask[i] ? Vec1[i] : Vec2[i] )
450 *
451 */
452#define VectorSelect( Mask, Vec1, Vec2 ) DirectX::XMVectorSelect( Vec2, Vec1, Mask )
453
454/**
455 * Combines two vectors using bitwise OR (treating each vector as a 128 bit field)
456 *
457 * @param Vec1 1st vector
458 * @param Vec2 2nd vector
459 * @return VectorRegister( for each bit i: Vec1[i] | Vec2[i] )
460 */
461#define VectorBitwiseOr( Vec1, Vec2 ) DirectX::XMVectorOrInt( Vec1, Vec2 )
462
463/**
464 * Combines two vectors using bitwise AND (treating each vector as a 128 bit field)
465 *
466 * @param Vec1 1st vector
467 * @param Vec2 2nd vector
468 * @return VectorRegister( for each bit i: Vec1[i] & Vec2[i] )
469 */
470#define VectorBitwiseAnd( Vec1, Vec2 ) DirectX::XMVectorAndInt( Vec1, Vec2 )
471
472/**
473 * Combines two vectors using bitwise XOR (treating each vector as a 128 bit field)
474 *
475 * @param Vec1 1st vector
476 * @param Vec2 2nd vector
477 * @return VectorRegister( for each bit i: Vec1[i] ^ Vec2[i] )
478 */
479#define VectorBitwiseXor( Vec1, Vec2 ) DirectX::XMVectorXorInt( Vec1, Vec2 )
480
481 /**
482 * Returns an integer bit-mask (0x00 - 0x0f) based on the sign-bit for each component in a vector.
483 *
484 * @param VecMask Vector
485 * @return Bit 0 = sign(VecMask.x), Bit 1 = sign(VecMask.y), Bit 2 = sign(VecMask.z), Bit 3 = sign(VecMask.w)
486 */
487#define VectorMaskBits( VecMask ) _mm_movemask_ps( VecMask )
488
489
490
491/**
492 * Calculates the cross product of two vectors (XYZ components). W is set to 0.
493 *
494 * @param Vec1 1st vector
495 * @param Vec2 2nd vector
496 * @return cross(Vec1.xyz, Vec2.xyz). W is set to 0.
497 */
498#define VectorCross( Vec1, Vec2 ) DirectX::XMVector3Cross( Vec1, Vec2 )
499
500/**
501 * Calculates x raised to the power of y (component-wise).
502 *
503 * @param Base Base vector
504 * @param Exponent Exponent vector
505 * @return VectorRegister( Base.x^Exponent.x, Base.y^Exponent.y, Base.z^Exponent.z, Base.w^Exponent.w )
506 */
507#define VectorPow( Vec1, Vec2 ) DirectX::XMVectorPow( Vec1, Vec2 )
508
509
510/**
511* Returns an estimate of 1/sqrt(c) for each component of the vector
512*
513* @param Vector Vector
* @return		VectorRegister(1/sqrt(Vec.x), 1/sqrt(Vec.y), 1/sqrt(Vec.z), 1/sqrt(Vec.w)), each an estimate
515*/
516#define VectorReciprocalSqrt( Vec ) DirectX::XMVectorReciprocalSqrtEst( Vec )
517
518/**
519 * Computes an estimate of the reciprocal of a vector (component-wise) and returns the result.
520 *
521 * @param Vec 1st vector
522 * @return VectorRegister( (Estimate) 1.0f / Vec.x, (Estimate) 1.0f / Vec.y, (Estimate) 1.0f / Vec.z, (Estimate) 1.0f / Vec.w )
523 */
524#define VectorReciprocal( Vec ) DirectX::XMVectorReciprocalEst( Vec )
525
526/**
527* Return Reciprocal Length of the vector
528*
529* @param Vector Vector
530* @return VectorRegister(rlen, rlen, rlen, rlen) when rlen = 1/sqrt(dot4(V))
531*/
532#define VectorReciprocalLen( Vec ) DirectX::XMVector4ReciprocalLengthEst( Vec )
533
534/**
535* Return the reciprocal of the square root of each component
536*
537* @param Vector Vector
538* @return VectorRegister(1/sqrt(Vec.X), 1/sqrt(Vec.Y), 1/sqrt(Vec.Z), 1/sqrt(Vec.W))
539*/
540#define VectorReciprocalSqrtAccurate( Vec ) DirectX::XMVectorReciprocalSqrt( Vec )
541
542/**
543 * Computes the reciprocal of a vector (component-wise) and returns the result.
544 *
545 * @param Vec 1st vector
546 * @return VectorRegister( 1.0f / Vec.x, 1.0f / Vec.y, 1.0f / Vec.z, 1.0f / Vec.w )
547 */
548#define VectorReciprocalAccurate( Vec ) DirectX::XMVectorReciprocal( Vec )
549
550/**
551* Normalize vector
552*
553* @param Vector Vector to normalize
554* @return Normalized VectorRegister
555*/
556#define VectorNormalize( Vec ) DirectX::XMVector4NormalizeEst( Vec )
557
558/**
559* Loads XYZ and sets W=0
560*
561* @param Vector VectorRegister
562* @return VectorRegister(X, Y, Z, 0.0f)
563*/
564#define VectorSet_W0( Vec ) DirectX::XMVectorAndInt( Vec , DirectX::g_XMMask3 )
565
566/**
567* Loads XYZ and sets W=1
568*
569* @param Vector VectorRegister
570* @return VectorRegister(X, Y, Z, 1.0f)
571*/
572#define VectorSet_W1( Vec ) DirectX::XMVectorPermute<0,1,2,7>( Vec, VectorOne() )
573
574/**
575 * Multiplies two 4x4 matrices.
576 *
577 * @param Result Pointer to where the result should be stored
578 * @param Matrix1 Pointer to the first matrix
579 * @param Matrix2 Pointer to the second matrix
580 */
581FORCEINLINE void VectorMatrixMultiply( FMatrix* Result, const FMatrix* Matrix1, const FMatrix* Matrix2 )
582{
583 using namespace DirectX;
584 XMMATRIX XMatrix1 = XMLoadFloat4x4A((const XMFLOAT4X4A*)(Matrix1));
585 XMMATRIX XMatrix2 = XMLoadFloat4x4A((const XMFLOAT4X4A*)(Matrix2));
586 XMMATRIX XMatrixR = XMMatrixMultiply( XMatrix1, XMatrix2);
587 XMStoreFloat4x4A( (XMFLOAT4X4A*)(Result), XMatrixR);
588}
589
590/**
591 * Calculate the inverse of an FMatrix.
592 *
593 * @param DstMatrix FMatrix pointer to where the result should be stored
594 * @param SrcMatrix FMatrix pointer to the Matrix to be inversed
595 */
596FORCEINLINE void VectorMatrixInverse( FMatrix* DstMatrix, const FMatrix* SrcMatrix )
597{
598 using namespace DirectX;
599 XMMATRIX XMSrcMatrix = XMLoadFloat4x4A((const XMFLOAT4X4A*)(SrcMatrix));
600 XMMATRIX XMDstMatrix = XMMatrixInverse( nullptr, XMSrcMatrix );
601 XMStoreFloat4x4A( (XMFLOAT4X4A*)(DstMatrix), XMDstMatrix);
602}
603
604/**
605 * Calculate Homogeneous transform.
606 *
607 * @param VecP VectorRegister
608 * @param MatrixM FMatrix pointer to the Matrix to apply transform
609 * @return VectorRegister = VecP*MatrixM
610 */
612{
613 using namespace DirectX;
614 XMMATRIX M1 = XMLoadFloat4x4A( (const XMFLOAT4X4A*)(MatrixM) );
615 return XMVector4Transform( VecP, M1 );
616}
617
618/**
619 * Returns the minimum values of two vectors (component-wise).
620 *
621 * @param Vec1 1st vector
622 * @param Vec2 2nd vector
623 * @return VectorRegister( min(Vec1.x,Vec2.x), min(Vec1.y,Vec2.y), min(Vec1.z,Vec2.z), min(Vec1.w,Vec2.w) )
624 */
625#define VectorMin( Vec1, Vec2 ) DirectX::XMVectorMin( Vec1, Vec2 )
626
627/**
628 * Returns the maximum values of two vectors (component-wise).
629 *
630 * @param Vec1 1st vector
631 * @param Vec2 2nd vector
632 * @return VectorRegister( max(Vec1.x,Vec2.x), max(Vec1.y,Vec2.y), max(Vec1.z,Vec2.z), max(Vec1.w,Vec2.w) )
633 */
634#define VectorMax( Vec1, Vec2 ) DirectX::XMVectorMax( Vec1, Vec2 )
635
636/**
637 * Swizzles the 4 components of a vector and returns the result.
638 *
639 * @param Vec Source vector
640 * @param X Index for which component to use for X (literal 0-3)
641 * @param Y Index for which component to use for Y (literal 0-3)
642 * @param Z Index for which component to use for Z (literal 0-3)
643 * @param W Index for which component to use for W (literal 0-3)
644 * @return The swizzled vector
645 */
646#define VectorSwizzle( Vec, X, Y, Z, W ) DirectX::XMVectorSwizzle<X,Y,Z,W>( Vec )
647
/**
 * Creates a vector through selecting two components from each vector via a shuffle mask.
 *
 * Z and W are offset by 4 because XMVectorPermute numbers the components of its
 * second argument 4-7. The index arguments are parenthesized so that an index given
 * as an expression is offset as a whole.
 *
 * @param Vec1		Source vector1
 * @param Vec2		Source vector2
 * @param X			Index for which component of Vector1 to use for X (literal 0-3)
 * @param Y			Index for which component of Vector1 to use for Y (literal 0-3)
 * @param Z			Index for which component of Vector2 to use for Z (literal 0-3)
 * @param W			Index for which component of Vector2 to use for W (literal 0-3)
 * @return			The swizzled vector
 */
#define VectorShuffle( Vec1, Vec2, X, Y, Z, W )	DirectX::XMVectorPermute<(X),(Y),(Z)+4,(W)+4>( Vec1, Vec2 )
660
661/**
662* Creates a vector by combining two high components from each vector
663*
664* @param Vec1 Source vector1
665* @param Vec2 Source vector2
666* @return The combined vector
667*/
669{
670 return VectorShuffle(Vec1, Vec2, 2, 3, 2, 3);
671}
672
673/**
674* Creates a vector by combining two low components from each vector
675*
676* @param Vec1 Source vector1
677* @param Vec2 Source vector2
678* @return The combined vector
679*/
681{
682 return VectorShuffle(Vec1, Vec2, 0, 1, 0, 1);
683}
684
685/**
686 * Deinterleaves the components of the two given vectors such that the even components
687 * are in one vector and the odds in another.
688 *
689 * @param Lo [Even0, Odd0, Even1, Odd1]
690 * @param Hi [Even2, Odd2, Even3, Odd3]
691 * @param OutEvens [Even0, Even1, Even2, Even3]
692 * @param OutOdds [Odd0, Odd1, Odd2, Odd3]
693*/
695{
696 OutEvens = _mm_shuffle_ps(Lo, Hi, _MM_SHUFFLE(2, 0, 2, 0));
697 OutOdds = _mm_shuffle_ps(Lo, Hi, _MM_SHUFFLE(3, 1, 3, 1));
698}
699
700
701/**
702 * These functions return a vector mask to indicate which components pass the comparison.
703 * Each component is 0xffffffff if it passes, 0x00000000 if it fails.
704 *
705 * @param Vec1 1st source vector
706 * @param Vec2 2nd source vector
707 * @return Vector with a mask for each component.
708 */
709#define VectorMask_LT( Vec1, Vec2 ) _mm_cmplt_ps( Vec1, Vec2 )
710#define VectorMask_LE( Vec1, Vec2 ) _mm_cmple_ps( Vec1, Vec2 )
711#define VectorMask_GT( Vec1, Vec2 ) DirectX::XMVectorGreater( Vec1, Vec2 )
712#define VectorMask_GE( Vec1, Vec2 ) DirectX::XMVectorGreaterOrEqual( Vec1, Vec2 )
713#define VectorMask_EQ( Vec1, Vec2 ) DirectX::XMVectorEqual( Vec1, Vec2 )
714#define VectorMask_NE( Vec1, Vec2 ) DirectX::XMVectorNotEqual( Vec1, Vec2 )
715
716/**
717 * Merges the XYZ components of one vector with the W component of another vector and returns the result.
718 *
719 * @param VecXYZ Source vector for XYZ_
720 * @param VecW Source register for ___W (note: the fourth component is used, not the first)
721 * @return VectorRegister(VecXYZ.x, VecXYZ.y, VecXYZ.z, VecW.w)
722 */
724{
725 using namespace DirectX;
726 return DirectX::XMVectorSelect( VecXYZ, VecW, g_XMMaskW );
727}
728
729/**
730 * Loads 4 BYTEs from unaligned memory and converts them into 4 FLOATs.
731 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
732 *
733 * @param Ptr Unaligned memory pointer to the 4 BYTEs.
734 * @return VectorRegister( float(Ptr[0]), float(Ptr[1]), float(Ptr[2]), float(Ptr[3]) )
735 */
736#define VectorLoadByte4( Ptr ) DirectX::PackedVector::XMLoadUByte4((const DirectX::PackedVector::XMUBYTE4*)(Ptr) )
737
738 /**
739 * Loads 4 signed BYTEs from unaligned memory and converts them into 4 FLOATs.
740 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
741 *
742 * @param Ptr Unaligned memory pointer to the 4 BYTEs.
743 * @return VectorRegister( float(Ptr[0]), float(Ptr[1]), float(Ptr[2]), float(Ptr[3]) )
744 */
745#define VectorLoadSignedByte4( Ptr ) DirectX::PackedVector::XMLoadByte4((const DirectX::PackedVector::XMBYTE4*)(Ptr) )
746
747/**
748 * Loads 4 BYTEs from unaligned memory and converts them into 4 FLOATs in reversed order.
749 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
750 *
751 * @param Ptr Unaligned memory pointer to the 4 BYTEs.
752 * @return VectorRegister( float(Ptr[3]), float(Ptr[2]), float(Ptr[1]), float(Ptr[0]) )
753 */
755{
757 return VectorSwizzle(Temp,3,2,1,0);
758}
759
760/**
761 * Converts the 4 FLOATs in the vector to 4 BYTEs, clamped to [0,255], and stores to unaligned memory.
762 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
763 *
764 * @param Vec Vector containing 4 FLOATs
765 * @param Ptr Unaligned memory pointer to store the 4 BYTEs.
766 */
767#define VectorStoreByte4( Vec, Ptr ) DirectX::PackedVector::XMStoreUByte4( (DirectX::PackedVector::XMUBYTE4*)(Ptr), Vec )
768
769 /**
770 * Converts the 4 FLOATs in the vector to 4 BYTEs, clamped to [-127,127], and stores to unaligned memory.
771 * IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
772 *
773 * @param Vec Vector containing 4 FLOATs
774 * @param Ptr Unaligned memory pointer to store the 4 BYTEs.
775 */
776#define VectorStoreSignedByte4( Vec, Ptr ) DirectX::PackedVector::XMStoreByte4( (DirectX::PackedVector::XMBYTE4*)(Ptr), Vec )
777
778/**
779* Loads packed RGB10A2(4 bytes) from unaligned memory and converts them into 4 FLOATs.
780* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
781*
782* @param Ptr Unaligned memory pointer to the RGB10A2(4 bytes).
783* @return VectorRegister with 4 FLOATs loaded from Ptr.
784*/
785#define VectorLoadURGB10A2N( Ptr ) DirectX::PackedVector::XMLoadUDecN4( (const DirectX::PackedVector::XMUDECN4*)(Ptr) )
786
/**
* Converts the 4 FLOATs in the vector to packed RGB10A2 and stores to unaligned memory.
* The packed channels span [0, 1023] (RGB) and [0, 3] (A).
* NOTE(review): XMStoreUDecN4 is the normalized variant, so inputs are presumably expected in [0, 1] - confirm.
* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
*
* The destination is cast to a non-const pointer: XMStoreUDecN4 writes through it,
* and a pointer-to-const destination does not compile.
*
* @param Vec			Vector containing 4 FLOATs
* @param Ptr			Unaligned memory pointer to store the packed RGB10A2(4 bytes).
*/
#define VectorStoreURGB10A2N( Vec, Ptr )	DirectX::PackedVector::XMStoreUDecN4( (DirectX::PackedVector::XMUDECN4*)(Ptr), Vec )
795
796/**
* Loads packed unsigned RGBA16 (8 bytes) from unaligned memory and converts them into 4 FLOATs.
798* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
799*
800* @param Ptr Unaligned memory pointer to the RGBA16(8 bytes).
801* @return VectorRegister with 4 FLOATs loaded from Ptr.
802*/
803#define VectorLoadURGBA16N( Ptr ) DirectX::PackedVector::XMLoadUShortN4( (const DirectX::PackedVector::XMUSHORTN4*)(Ptr) )
804
805/**
* Loads packed signed RGBA16 (8 bytes) from unaligned memory and converts them into 4 FLOATs.
807* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
808*
809* @param Ptr Unaligned memory pointer to the RGBA16(8 bytes).
810* @return VectorRegister with 4 FLOATs loaded from Ptr.
811*/
812#define VectorLoadSRGBA16N( Ptr ) DirectX::PackedVector::XMLoadShortN4( (const DirectX::PackedVector::XMSHORTN4*)(Ptr) )
813
/**
* Converts the 4 FLOATs in the vector to packed RGBA16 and stores to unaligned memory.
* The packed channels span [0, 65535].
* NOTE(review): XMStoreUShortN4 is the normalized variant, so inputs are presumably expected in [0, 1] - confirm.
* IMPORTANT: You need to call VectorResetFloatRegisters() before using scalar FLOATs after you've used this intrinsic!
*
* The destination is cast to a non-const pointer: XMStoreUShortN4 writes through it,
* and a pointer-to-const destination does not compile.
*
* @param Vec			Vector containing 4 FLOATs
* @param Ptr			Unaligned memory pointer to store the packed RGBA16(8 bytes).
*/
#define VectorStoreURGBA16N( Vec, Ptr )	DirectX::PackedVector::XMStoreUShortN4( (DirectX::PackedVector::XMUSHORTN4*)(Ptr), Vec )
822
823/**
824 * Returns non-zero if any element in Vec1 is greater than the corresponding element in Vec2, otherwise 0.
825 *
826 * @param Vec1 1st source vector
827 * @param Vec2 2nd source vector
828 * @return Non-zero integer if (Vec1.x > Vec2.x) || (Vec1.y > Vec2.y) || (Vec1.z > Vec2.z) || (Vec1.w > Vec2.w)
829 */
831{
832 using namespace DirectX;
833 // Returns a comparison value that can be examined using functions such as XMComparisonAllTrue
834 uint32_t comparisonValue = XMVector4GreaterR( Vec1, Vec2 );
835
836 //Returns true if any of the compared components are true
837 return (uint32)XMComparisonAnyTrue( comparisonValue );
838}
839
840/**
841 * Resets the floating point registers so that they can be used again.
842 * Some intrinsics use these for MMX purposes (e.g. VectorLoadByte4 and VectorStoreByte4).
843 */
844// This is no longer necessary now that we don't use MMX instructions
845#define VectorResetFloatRegisters()
846
847/**
848 * Returns the control register.
849 *
850 * @return The uint32 control register
851 */
852#define VectorGetControlRegister() _mm_getcsr()
853
854/**
855 * Sets the control register.
856 *
857 * @param ControlStatus The uint32 control status value to set
858 */
859#define VectorSetControlRegister(ControlStatus) _mm_setcsr( ControlStatus )
860
861/**
862 * Control status bit to round all floating point math results towards zero.
863 */
864#define VECTOR_ROUND_TOWARD_ZERO _MM_ROUND_TOWARD_ZERO
865
866/**
867* Multiplies two quaternions; the order matters.
868*
869* Order matters when composing quaternions: C = VectorQuaternionMultiply2(A, B) will yield a quaternion C = A * B
870* that logically first applies B then A to any subsequent transformation (right first, then left).
871*
872* @param Quat1 Pointer to the first quaternion
873* @param Quat2 Pointer to the second quaternion
874* @return Quat1 * Quat2
875*/
877{
878 // DirectXMath uses reverse parameter order to UnrealMath
879 // XMQuaternionMultiply( FXMVECTOR Q1, FXMVECTOR Q2)
880 // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2)
881
882 // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
883 // (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
884 // (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
885 // (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]
886 return DirectX::XMQuaternionMultiply( Quat2, Quat1 );
887}
888
889/**
890* Multiplies two quaternions; the order matters.
891*
892* When composing quaternions: VectorQuaternionMultiply(C, A, B) will yield a quaternion C = A * B
893* that logically first applies B then A to any subsequent transformation (right first, then left).
894*
895* @param Result Pointer to where the result Quat1 * Quat2 should be stored
896* @param Quat1 Pointer to the first quaternion (must not be the destination)
897* @param Quat2 Pointer to the second quaternion (must not be the destination)
898*/
899FORCEINLINE void VectorQuaternionMultiply( FQuat *Result, const FQuat* Quat1, const FQuat* Quat2)
900{
901 VectorRegister XMQuat1 = VectorLoadAligned(Quat1);
902 VectorRegister XMQuat2 = VectorLoadAligned(Quat2);
903 VectorRegister XMResult = VectorQuaternionMultiply2(XMQuat1, XMQuat2);
904 VectorStoreAligned(XMResult, Result);
905
906}
907
908/**
909* Multiplies two quaternions; the order matters.
910*
911* When composing quaternions: VectorQuaternionMultiply(C, A, B) will yield a quaternion C = A * B
912* that logically first applies B then A to any subsequent transformation (right first, then left).
913*
914* @param Result Pointer to where the result Quat1 * Quat2 should be stored
915* @param Quat1 Pointer to the first quaternion (must not be the destination)
916* @param Quat2 Pointer to the second quaternion (must not be the destination)
917*/
919{
920 *VResult = VectorQuaternionMultiply2(*VQuat1, *VQuat2);
921}
922
923FORCEINLINE void VectorQuaternionVector3Rotate( FVector *Result, const FVector* Vec, const FQuat* Quat)
924{
926 VectorRegister XMQuat = VectorLoadAligned(Quat);
927 VectorRegister XMResult = DirectX::XMVector3Rotate(XMVec, XMQuat);
928 VectorStoreFloat3(XMResult, Result);
929}
930
931FORCEINLINE void VectorQuaternionVector3InverseRotate( FVector *Result, const FVector* Vec, const FQuat* Quat)
932{
934 VectorRegister XMQuat = VectorLoadAligned(Quat);
935 VectorRegister XMResult = DirectX::XMVector3InverseRotate(XMVec, XMQuat);
936 VectorStoreFloat3(XMResult, Result);
937}
938
939
940/**
941* Computes the sine and cosine of each component of a Vector.
942*
943* @param VSinAngles VectorRegister Pointer to where the Sin result should be stored
944* @param VCosAngles VectorRegister Pointer to where the Cos result should be stored
945* @param VAngles VectorRegister Pointer to the input angles
946*/
948{
949 using namespace DirectX;
950 // Force the value within the bounds of pi
951 XMVECTOR x = XMVectorModAngles(*VAngles);
952
953 // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
954 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
955 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
956 __m128 absx = _mm_andnot_ps(sign, x); // |x|
957 __m128 rflx = _mm_sub_ps(c, x);
958 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
959 __m128 select0 = _mm_and_ps(comp, x);
960 __m128 select1 = _mm_andnot_ps(comp, rflx);
961 x = _mm_or_ps(select0, select1);
962 select0 = _mm_and_ps(comp, g_XMOne);
963 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
964 sign = _mm_or_ps(select0, select1);
965
966 __m128 x2 = _mm_mul_ps(x, x);
967
968 // Compute polynomial approximation of sine
969 const XMVECTOR SC1 = g_XMSinCoefficients1;
970 XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) );
971 __m128 Result = _mm_mul_ps(vConstants, x2);
972
973 const XMVECTOR SC0 = g_XMSinCoefficients0;
974 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) );
975 Result = _mm_add_ps(Result, vConstants);
976 Result = _mm_mul_ps(Result, x2);
977
978 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) );
979 Result = _mm_add_ps(Result, vConstants);
980 Result = _mm_mul_ps(Result, x2);
981
982 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) );
983 Result = _mm_add_ps(Result, vConstants);
984 Result = _mm_mul_ps(Result, x2);
985
986 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) );
987 Result = _mm_add_ps(Result, vConstants);
988 Result = _mm_mul_ps(Result, x2);
989 Result = _mm_add_ps(Result, g_XMOne);
990 Result = _mm_mul_ps(Result, x);
991 *VSinAngles = Result;
992
993 // Compute polynomial approximation of cosine
994 const XMVECTOR CC1 = g_XMCosCoefficients1;
995 vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) );
996 Result = _mm_mul_ps(vConstants, x2);
997
998 const XMVECTOR CC0 = g_XMCosCoefficients0;
999 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) );
1000 Result = _mm_add_ps(Result, vConstants);
1001 Result = _mm_mul_ps(Result, x2);
1002
1003 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) );
1004 Result = _mm_add_ps(Result, vConstants);
1005 Result = _mm_mul_ps(Result, x2);
1006
1007 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) );
1008 Result = _mm_add_ps(Result, vConstants);
1009 Result = _mm_mul_ps(Result, x2);
1010
1011 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) );
1012 Result = _mm_add_ps(Result, vConstants);
1013 Result = _mm_mul_ps(Result, x2);
1014 Result = _mm_add_ps(Result, g_XMOne);
1015 Result = _mm_mul_ps(Result, sign);
1016 *VCosAngles = Result;
1017}
1018
1019
1020// Returns true if the vector contains a component that is either NAN or +/-infinite.
1022{
1023 // https://en.wikipedia.org/wiki/IEEE_754-1985
1024 // Infinity is represented with all exponent bits set, with the correct sign bit.
1025 // NaN is represented with all exponent bits set, plus at least one fraction/significand bit set.
1026 // This means finite values will not have all exponent bits set, so check against those bits.
1027
1028 // Mask off Exponent
1029 const VectorRegister ExpTest = VectorBitwiseAnd(Vec, GlobalVectorConstants::FloatInfinity());
1030 // Compare to full exponent. If any are full exponent (not finite), the signs copied to the mask are non-zero, otherwise it's zero and finite.
1031 bool IsFinite = VectorMaskBits(VectorCompareEQ(ExpTest, GlobalVectorConstants::FloatInfinity())) == 0;
1032 return !IsFinite;
1033}
1034
1036{
1037 return DirectX::XMVectorExp(X);
1038}
1039
1041{
1042 return DirectX::XMVectorExp2(X);
1043}
1044
1046{
1047 return DirectX::XMVectorLog(X);
1048}
1049
1051{
1052 return DirectX::XMVectorLog2(X);
1053}
1054
1056{
1057 return DirectX::XMVectorSin(X);
1058}
1059
1061{
1062 return DirectX::XMVectorCos(X);
1063}
1064
1066{
1067 return DirectX::XMVectorTan(X);
1068}
1069
1071{
1072 return DirectX::XMVectorASin(X);
1073}
1074
1076{
1077 return DirectX::XMVectorACos(X);
1078}
1079
1081{
1082 return DirectX::XMVectorATan(X);
1083}
1084
1086{
1087 return DirectX::XMVectorATan2(X, Y);
1088}
1089
1091{
1092 return DirectX::XMVectorCeiling(X);
1093}
1094
1096{
1097 return DirectX::XMVectorFloor(X);
1098}
1099
1101{
1102 return DirectX::XMVectorTruncate(X);
1103}
1104
1106{
1107 return VectorSubtract(X, VectorTruncate(X));
1108}
1109
1111{
1112 VectorRegister AbsY = VectorAbs(Y);
1113 VectorRegister Result = DirectX::XMVectorMod(X, Y);
1114 // Clamp to [-AbsY, AbsY] because of possible failures for very large numbers (>1e10) due to precision loss.
1115 return DirectX::XMVectorClamp(Result, VectorNegate(AbsY), AbsY);
1116}
1117
1118//TODO: Vectorize
1120{
1121 return MakeVectorRegister(
1122 (float)(VectorGetComponent(X, 0) >= 0.0f ? 1.0f : -1.0f),
1123 (float)(VectorGetComponent(X, 1) >= 0.0f ? 1.0f : -1.0f),
1124 (float)(VectorGetComponent(X, 2) >= 0.0f ? 1.0f : -1.0f),
1125 (float)(VectorGetComponent(X, 3) >= 0.0f ? 1.0f : -1.0f));
1126}
1127
1128//TODO: Vectorize
1130{
1131 return MakeVectorRegister(
1132 (float)(VectorGetComponent(X, 0) >= 0.0f ? 1.0f : 0.0f),
1133 (float)(VectorGetComponent(X, 1) >= 0.0f ? 1.0f : 0.0f),
1134 (float)(VectorGetComponent(X, 2) >= 0.0f ? 1.0f : 0.0f),
1135 (float)(VectorGetComponent(X, 3) >= 0.0f ? 1.0f : 0.0f));
1136}
1137
//////////////////////////////////////////////////////////////////////////
//Integer ops
// Thin macro wrappers over SSE2 integer intrinsics.

//Bitwise
/** = a & b */
#define VectorIntAnd(A, B) _mm_and_si128(A, B)
/** = a | b */
#define VectorIntOr(A, B) _mm_or_si128(A, B)
/** = a ^ b */
#define VectorIntXor(A, B) _mm_xor_si128(A, B)
/** = (~a) & b */
#define VectorIntAndNot(A, B) _mm_andnot_si128(A, B)
/** = ~a (implemented as a XOR with GlobalVectorConstants::IntAllMask) */
#define VectorIntNot(A) _mm_xor_si128(A, GlobalVectorConstants::IntAllMask)

//Comparison
// Each macro compares the four 32-bit lanes and yields a per-lane mask.
// NEQ, GE and LE have no direct intrinsic here, so they are built by inverting EQ, LT and GT respectively.
/** = (a == b) per lane */
#define VectorIntCompareEQ(A, B) _mm_cmpeq_epi32(A,B)
/** = (a != b) per lane */
#define VectorIntCompareNEQ(A, B) VectorIntNot(_mm_cmpeq_epi32(A,B))
/** = (a > b) per lane */
#define VectorIntCompareGT(A, B) _mm_cmpgt_epi32(A,B)
/** = (a < b) per lane */
#define VectorIntCompareLT(A, B) _mm_cmplt_epi32(A,B)
/** = (a >= b) per lane, i.e. !(a < b) */
#define VectorIntCompareGE(A, B) VectorIntNot(VectorIntCompareLT(A,B))
/** = (a <= b) per lane, i.e. !(a > b) */
#define VectorIntCompareLE(A, B) VectorIntNot(VectorIntCompareGT(A,B))
1160
1161
1163{
1164 return _mm_xor_si128(Vec2, _mm_and_si128(Mask, _mm_xor_si128(Vec1, Vec2)));
1165}
1166
//Arithmetic
/** = a + b for each 32-bit lane */
#define VectorIntAdd(A, B) _mm_add_epi32(A, B)
/** = a - b for each 32-bit lane */
#define VectorIntSubtract(A, B) _mm_sub_epi32(A, B)
1170
1172{
1173 //SSE2 doesn't have a multiply op for 4 32bit ints. Ugh.
1174 __m128i Temp0 = _mm_mul_epu32(A, B);
1175 __m128i Temp1 = _mm_mul_epu32(_mm_srli_si128(A, 4), _mm_srli_si128(B, 4));
1176 return _mm_unpacklo_epi32(_mm_shuffle_epi32(Temp0, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(Temp1, _MM_SHUFFLE(0, 0, 2, 0)));
1177}
1178
/** = -a for each 32-bit lane, computed as GlobalVectorConstants::IntZero - a */
#define VectorIntNegate(A) VectorIntSubtract( GlobalVectorConstants::IntZero, A)
1180
1182{
1184 return VectorIntSelect(Mask, A, B);
1185}
1186
1188{
1190 return VectorIntSelect(Mask, A, B);
1191}
1192
1194{
1197}
1198
/** Per lane: GlobalVectorConstants::IntOne where a >= 0, GlobalVectorConstants::IntMinusOne otherwise */
#define VectorIntSign(A) VectorIntSelect( VectorIntCompareGE(A, GlobalVectorConstants::IntZero), GlobalVectorConstants::IntOne, GlobalVectorConstants::IntMinusOne )

/** Converts four int32 lanes to floats (expands to _mm_cvtepi32_ps) */
#define VectorIntToFloat(A) _mm_cvtepi32_ps(A)
/** Converts four float lanes to int32 with truncation (expands to _mm_cvttps_epi32) */
#define VectorFloatToInt(A) _mm_cvttps_epi32(A)
1203
1205{
1206 return _mm_cvtps_epi32(Vec);
1207}
1208
//Loads and stores

/**
* Stores a vector to memory (aligned or unaligned).
* Expands to the unaligned store intrinsic _mm_storeu_si128.
*
* @param Vec	Vector to store
* @param Ptr	Memory pointer
*/
#define VectorIntStore( Vec, Ptr ) _mm_storeu_si128( (VectorRegister4Int*)(Ptr), Vec )

/**
* Loads 4 int32s from unaligned memory.
* Expands to the unaligned load intrinsic _mm_loadu_si128.
*
* @param Ptr	Unaligned memory pointer to the 4 int32s
* @return		VectorRegister4Int(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
*/
#define VectorIntLoad( Ptr ) _mm_loadu_si128( (VectorRegister4Int*)(Ptr) )

/**
* Stores a vector to memory (aligned).
* Expands to the aligned store intrinsic _mm_store_si128.
*
* @param Vec	Vector to store
* @param Ptr	Aligned Memory pointer
*/
#define VectorIntStoreAligned( Vec, Ptr ) _mm_store_si128( (VectorRegister4Int*)(Ptr), Vec )

/**
* Loads 4 int32s from aligned memory.
* Expands to the aligned load intrinsic _mm_load_si128.
*
* @param Ptr	Aligned memory pointer to the 4 int32s
* @return		VectorRegister4Int(Ptr[0], Ptr[1], Ptr[2], Ptr[3])
*/
#define VectorIntLoadAligned( Ptr ) _mm_load_si128( (VectorRegister4Int*)(Ptr) )
1242
/**
* Loads 1 int32 from unaligned memory into all components of a vector register.
*
* Only the 4 bytes at Ptr are read: the value is fetched with the scalar broadcast load
* _mm_load1_ps and the register is then reinterpreted as integers without changing its bits.
* (A full 16-byte load followed by a shuffle would read 12 bytes past the int32.)
*
* @param Ptr	Unaligned memory pointer to the int32
* @return		VectorRegister4Int(*Ptr, *Ptr, *Ptr, *Ptr)
*/
#define VectorIntLoad1( Ptr ) _mm_castps_si128( _mm_load1_ps( (const float*)(Ptr) ) )
1250
1251#endif
#define FORCEINLINE
Definition Platform.h:644
#define RESTRICT
Definition Platform.h:650
FORCEINLINE VectorRegister VectorStep(const VectorRegister &X)
#define VectorStoreAligned(Vec, Ptr)
FORCEINLINE VectorRegister VectorFloor(const VectorRegister &X)
FORCEINLINE void VectorQuaternionMultiply(VectorRegister *VResult, const VectorRegister *VQuat1, const VectorRegister *VQuat2)
FORCEINLINE VectorRegister4Int VectorIntAbs(const VectorRegister4Int &A)
FORCEINLINE float VectorGetComponent(VectorRegister Vec, uint32 ComponentIndex)
FORCEINLINE VectorRegister4Int VectorIntMin(const VectorRegister4Int &A, const VectorRegister4Int &B)
#define VectorAbs(Vec)
FORCEINLINE VectorRegister VectorQuaternionMultiply2(const VectorRegister &Quat1, const VectorRegister &Quat2)
#define VectorMaskBits(VecMask)
FORCEINLINE void VectorQuaternionVector3Rotate(FVector *Result, const FVector *Vec, const FQuat *Quat)
FORCEINLINE bool VectorContainsNaNOrInfinite(const VectorRegister &Vec)
#define VectorShuffle(Vec1, Vec2, X, Y, Z, W)
FORCEINLINE VectorRegister VectorLoadByte4Reverse(const uint8 *Ptr)
FORCEINLINE VectorRegister VectorTruncate(const VectorRegister &X)
#define VectorIntCompareLT(A, B)
FORCEINLINE void VectorQuaternionMultiply(FQuat *Result, const FQuat *Quat1, const FQuat *Quat2)
FORCEINLINE VectorRegister VectorMod(const VectorRegister &X, const VectorRegister &Y)
FORCEINLINE VectorRegister VectorSin(const VectorRegister &X)
FORCEINLINE VectorRegister4Int VectorIntSelect(const VectorRegister4Int &Mask, const VectorRegister4Int &Vec1, const VectorRegister4Int &Vec2)
FORCEINLINE VectorRegister VectorExp(const VectorRegister &X)
DirectX::XMVECTOR VectorRegister
FORCEINLINE VectorRegister VectorMergeVecXYZ_VecW(const VectorRegister &VecXYZ, const VectorRegister &VecW)
FORCEINLINE VectorRegister VectorCos(const VectorRegister &X)
FORCEINLINE VectorRegister VectorLog(const VectorRegister &X)
#define VectorNegate(Vec)
FORCEINLINE VectorRegister VectorATan(const VectorRegister &X)
FORCEINLINE VectorRegister VectorTransformVector(const VectorRegister &VecP, const FMatrix *MatrixM)
FORCEINLINE VectorRegister VectorLog2(const VectorRegister &X)
FORCEINLINE VectorRegister VectorATan2(const VectorRegister &X, const VectorRegister &Y)
FORCEINLINE VectorRegister VectorCeil(const VectorRegister &X)
FORCEINLINE VectorRegister VectorCombineHigh(const VectorRegister &Vec1, const VectorRegister &Vec2)
#define VectorBitwiseAnd(Vec1, Vec2)
FORCEINLINE VectorRegister VectorASin(const VectorRegister &X)
FORCEINLINE VectorRegister4Int VectorRoundToIntHalfToEven(const VectorRegister4Float &Vec)
#define VectorIntCompareGT(A, B)
FORCEINLINE void VectorQuaternionVector3InverseRotate(FVector *Result, const FVector *Vec, const FQuat *Quat)
FORCEINLINE VectorRegister VectorACos(const VectorRegister &X)
#define VectorCompareEQ(Vec1, Vec2)
FORCEINLINE void VectorDeinterleave(VectorRegister &OutEvens, VectorRegister &OutOdds, const VectorRegister &Lo, const VectorRegister &Hi)
FORCEINLINE void VectorMatrixMultiply(FMatrix *Result, const FMatrix *Matrix1, const FMatrix *Matrix2)
#define VectorIntNegate(A)
FORCEINLINE VectorRegister VectorCombineLow(const VectorRegister &Vec1, const VectorRegister &Vec2)
FORCEINLINE VectorRegister VectorExp2(const VectorRegister &X)
FORCEINLINE VectorRegister VectorLoadTwoPairsFloat(const float *Ptr1, const float *Ptr2)
#define VectorOne()
#define VectorIntSubtract(A, B)
#define VectorSwizzle(Vec, X, Y, Z, W)
#define VectorIntCompareGE(A, B)
FORCEINLINE VectorRegister4Int VectorIntMax(const VectorRegister4Int &A, const VectorRegister4Int &B)
#define VectorLoadAligned(Ptr)
#define VectorLoadFloat3_W0(Ptr)
#define VectorLoadByte4(Ptr)
FORCEINLINE VectorRegister VectorTan(const VectorRegister &X)
FORCEINLINE uint32 VectorAnyGreaterThan(const VectorRegister &Vec1, const VectorRegister &Vec2)
#define VectorSubtract(Vec1, Vec2)
FORCEINLINE void VectorMatrixInverse(FMatrix *DstMatrix, const FMatrix *SrcMatrix)
#define VectorIntNot(A)
FORCEINLINE VectorRegister4Int VectorIntMultiply(const VectorRegister4Int &A, const VectorRegister4Int &B)
FORCEINLINE VectorRegister VectorSign(const VectorRegister &X)
FORCEINLINE void VectorSinCos(VectorRegister *RESTRICT VSinAngles, VectorRegister *RESTRICT VCosAngles, const VectorRegister *RESTRICT VAngles)
#define VectorStoreFloat3(Vec, Ptr)
__m128i VectorRegister4Int
FORCEINLINE VectorRegister4Float VectorFractional(const VectorRegister4Float &Vec)