Ark Server API (ASA) - Wiki
Loading...
Searching...
No Matches
UnrealPlatformMathSSE.h
Go to the documentation of this file.
1// Copyright Epic Games, Inc. All Rights Reserved.
2
3#pragma once
4
5// HEADER_UNIT_SKIP - Included through other header
6
7#include "CoreTypes.h"
8#include "HAL/PlatformMisc.h"
9// Code including this header is responsible for including the correct platform-specific header for SSE intrinsics.
10
11#if !PLATFORM_ENABLE_VECTORINTRINSICS || PLATFORM_ENABLE_VECTORINTRINSICS_NEON
12
// Fallback used when SSE is unavailable (no vector intrinsics, or NEON is used
// instead): the SSE layer contributes nothing and the base implementation
// passes through unchanged.
template<class Base>
struct TUnrealPlatformMathSSEBase : public Base
{
};
17
18#else
19
20namespace UE4
21{
22namespace SSE
23{
/**
 * Exact reciprocal square root: 1 / sqrt(InValue), computed with the full
 * precision sqrtss + divss sequence (no estimate instructions involved).
 */
FORCEINLINE float InvSqrt(float InValue)
{
	const __m128 Numerator = _mm_set_ss(1.0f);
	const __m128 Root = _mm_sqrt_ss(_mm_set_ss(InValue));
	float Result;
	_mm_store_ss(&Result, _mm_div_ss(Numerator, Root));
	return Result;
}
34
/**
 * Exact double-precision reciprocal square root: 1 / sqrt(InValue).
 * Note: _mm_sqrt_sd takes the upper lane from its first operand; only the
 * low lane (the actual result) is ever stored out.
 */
FORCEINLINE double InvSqrt(double InValue)
{
	const __m128d Numerator = _mm_set_sd(1.0);
	const __m128d Root = _mm_sqrt_sd(Numerator, _mm_set_sd(InValue));
	double Result;
	_mm_store_sd(&Result, _mm_div_sd(Numerator, Root));
	return Result;
}
45
/**
 * Fast approximate reciprocal square root.
 * rsqrtss provides a ~12-bit estimate; one Newton-Raphson step refines it to
 * roughly single-precision accuracy: x1 = x0 + x0 * (0.5 - (F/2) * x0^2).
 * The arithmetic order matches the exact sequence of the original code.
 */
FORCEINLINE float InvSqrtEst(float F)
{
	const __m128 Half = _mm_set_ss(0.5f);
	const __m128 Y = _mm_set_ss(F);
	const __m128 Estimate = _mm_rsqrt_ss(Y);      // 1/sqrt estimate (12 bits)
	const __m128 HalfY = _mm_mul_ss(Y, Half);

	// Single Newton-Raphson refinement step.
	__m128 Correction = _mm_mul_ss(Estimate, Estimate);
	Correction = _mm_sub_ss(Half, _mm_mul_ss(HalfY, Correction));
	const __m128 Refined = _mm_add_ss(Estimate, _mm_mul_ss(Estimate, Correction));

	float Result;
	_mm_store_ss(&Result, Refined);
	return Result;
}
65
/**
 * Estimate variant for doubles. x86 has no double-precision rsqrt estimate
 * instruction, so this forwards to the exact InvSqrt(double) overload.
 */
FORCEINLINE double InvSqrtEst(double InValue)
{
	return InvSqrt(InValue);
}
70
/** Convert to int32, truncating toward zero (cvttss2si). */
FORCEINLINE int32 TruncToInt32(float F)
{
	const __m128 Packed = _mm_set_ss(F);
	return _mm_cvtt_ss2si(Packed);
}
75
/** Convert to int32, truncating toward zero (cvttsd2si). */
FORCEINLINE int32 TruncToInt32(double InValue)
{
	const __m128d Packed = _mm_set_sd(InValue);
	return _mm_cvttsd_si32(Packed);
}
80
/** Convert to int64, truncating toward zero (64-bit cvttsd2si). */
FORCEINLINE int64 TruncToInt64(double InValue)
{
	const __m128d Packed = _mm_set_sd(InValue);
	return _mm_cvttsd_si64(Packed);
}
85
/**
 * floor(F) as int32 using the round-to-nearest cvtss2si instruction:
 * round(2F - 0.5) >> 1 == floor(F). Unlike the generic and SSE4 paths this
 * builds floor from a rounding instruction, so it needs the same times-2
 * transform (with a different offset) that RoundToInt32 uses to neutralize
 * round-half-to-even at exact .5 inputs; see the note on RoundToInt32.
 */
FORCEINLINE int32 FloorToInt32(float F)
{
	const float Biased = F + F - 0.5f;
	return _mm_cvt_ss2si(_mm_set_ss(Biased)) >> 1;
}
92
/** floor(InValue) as int32; same rounding-instruction trick as the float overload. */
FORCEINLINE int32 FloorToInt32(double InValue)
{
	const double Biased = InValue + InValue - 0.5;
	return _mm_cvtsd_si32(_mm_set_sd(Biased)) >> 1;
}
97
/** floor(InValue) as int64; same rounding-instruction trick as the 32-bit overloads. */
FORCEINLINE int64 FloorToInt64(double InValue)
{
	const double Biased = InValue + InValue - 0.5;
	return _mm_cvtsd_si64(_mm_set_sd(Biased)) >> 1;
}
102
/**
 * round(F) as int32: cvt(2F + 0.5) >> 1.
 * cvtss2si rounds halves to the nearest even number; doubling the input first
 * guarantees the instruction sees an odd integer whenever F was exactly N.5,
 * which turns banker's rounding into the expected round-half-up:
 *   N      -> 2N + 0.5 -> 2N (or 2N+1) -> N
 *   N + .5 -> 2N + 1.5 -> 2N + 2       -> N + 1
 */
FORCEINLINE int32 RoundToInt32(float F)
{
	const float Biased = F + F + 0.5f;
	return _mm_cvt_ss2si(_mm_set_ss(Biased)) >> 1;
}
112
/** round(InValue) as int32; see the float overload for the times-2 rationale. */
FORCEINLINE int32 RoundToInt32(double InValue)
{
	const double Biased = InValue + InValue + 0.5;
	return _mm_cvtsd_si32(_mm_set_sd(Biased)) >> 1;
}
117
/** round(InValue) as int64; see RoundToInt32(float) for the times-2 rationale. */
FORCEINLINE int64 RoundToInt64(double InValue)
{
	const double Biased = InValue + InValue + 0.5;
	return _mm_cvtsd_si64(_mm_set_sd(Biased)) >> 1;
}
122
/**
 * ceil(F) as int32, built as -floor(-F) on top of the rounding instruction
 * (SSE2 has no dedicated ceil); uses the same times-2 transform as
 * RoundToInt32 — see the note there.
 */
FORCEINLINE int32 CeilToInt32(float F)
{
	const float Biased = -0.5f - (F + F);
	return -(_mm_cvt_ss2si(_mm_set_ss(Biased)) >> 1);
}
129
/** ceil(InValue) as int32; same -floor(-x) construction as the float overload. */
FORCEINLINE int32 CeilToInt32(double InValue)
{
	const double Biased = -0.5 - (InValue + InValue);
	return -(_mm_cvtsd_si32(_mm_set_sd(Biased)) >> 1);
}
134
/** ceil(InValue) as int64; same -floor(-x) construction as the 32-bit overloads. */
FORCEINLINE int64 CeilToInt64(double InValue)
{
	const double Biased = -0.5 - (InValue + InValue);
	return -(_mm_cvtsd_si64(_mm_set_sd(Biased)) >> 1);
}
139
// Convert four packed FP32 lanes to FP16 (IEEE binary16) with round-to-nearest-even.
// SSE2-only implementation adapted from https://gist.github.com/rygorous/2156668
// (float_to_half_rtne_SSE2). Handles normals, subnormals, infinities and NaNs
// (NaN payloads survive in the low 9 bits and the quiet bit is forced on).
// Returns one half per 16-bit lane in the low 64 bits of the result (the upper
// 64 bits duplicate them, a side effect of the final packs).
// Fix vs. previous revision: dropped the redundant functional-style `__m128(...)`
// cast around _mm_castsi128_ps in the subnormal path — the intrinsic already
// yields __m128, and C-style casts on vector types are non-portable noise.
inline __m128i FloatToHalf(__m128 f)
{
	const __m128 mask_sign = _mm_set1_ps(-0.0f);
	const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23); // all FP32 values >= this round to +inf
	const __m128i c_nanbit = _mm_set1_epi32(0x200);            // FP16 quiet-NaN bit
	const __m128i c_nanlobits = _mm_set1_epi32(0x1ff);         // preserved NaN payload bits
	const __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
	const __m128i c_min_normal = _mm_set1_epi32((127 - 14) << 23); // smallest FP32 that yields a normalized FP16
	const __m128i c_subnorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
	const __m128i c_normal_bias = _mm_set1_epi32(0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding

	__m128 justsign = _mm_and_ps(f, mask_sign);                // f & mask_sign
	__m128 absf = _mm_andnot_ps(mask_sign, f);                 // f & ~mask_sign
	__m128i absf_int = _mm_castps_si128(absf);                 // the cast is "free" (extra bypass latency, but no thruput hit)
	__m128 b_isnan = _mm_cmpunord_ps(absf, absf);              // is this a NaN?
	__m128i b_isregular = _mm_cmpgt_epi32(c_f16max, absf_int); // (sub)normalized or special?
	__m128i nan_payload = _mm_and_si128(_mm_srli_epi32(absf_int, 13), c_nanlobits); // payload bits for NaNs
	__m128i nan_quiet = _mm_or_si128(nan_payload, c_nanbit);   // and set quiet bit
	__m128i nanfinal = _mm_and_si128(_mm_castps_si128(b_isnan), nan_quiet);
	__m128i inf_or_nan = _mm_or_si128(nanfinal, c_infty_as_fp16); // output for specials

	__m128i b_issub = _mm_cmpgt_epi32(c_min_normal, absf_int);

	// "result is subnormal" path
	__m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(c_subnorm_magic)); // magic value to round output mantissa
	__m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), c_subnorm_magic); // subtract out bias

	// "result is normal" path
	__m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
	__m128i mantodd = _mm_srai_epi32(mantoddbit, 31);       // -1 if FP16 mantissa odd, else 0

	__m128i round1 = _mm_add_epi32(absf_int, c_normal_bias);
	__m128i round2 = _mm_sub_epi32(round1, mantodd);        // if mantissa LSB odd, bias towards rounding up (RTNE)
	__m128i normal = _mm_srli_epi32(round2, 13);            // rounded result

	// combine the two non-specials
	__m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));

	// merge in specials as well
	__m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));

	__m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
	__m128i rgba_half32 = _mm_or_si128(joined, sign_shift);

	// there's now a half in each 32-bit lane
	// pack down to 64 bits :
	// packs works because rgba_half32 is sign-extended
	__m128i four_halfs_u64 = _mm_packs_epi32(rgba_half32, rgba_half32);

	return four_halfs_u64;
}
195
// Widen four packed FP16 values (expected in the low 64 bits of the input) to
// four FP32 lanes. SSE2-only, based on https://gist.github.com/rygorous/2156668.
// Inf/NaN halves map to FP32 Inf/NaN, with NaNs quieted via bit 22.
inline __m128 HalfToFloat(__m128i four_halfs_u64)
{
	// Zero-extend each 16-bit half into its own 32-bit lane.
	__m128i Half32 = _mm_unpacklo_epi16(four_halfs_u64, _mm_setzero_si128());

	const __m128i mask_nosign = _mm_set1_epi32(0x7fff);
	const __m128 magic_mult = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23)); // 2^(127-15): rescales the shifted exponent
	const __m128i was_infnan = _mm_set1_epi32(0x7bff);  // any half bit pattern above this was Inf or NaN
	const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23));
	const __m128i was_nan = _mm_set1_epi32(0x7c00);     // strictly above this was NaN
	const __m128i nan_quiet = _mm_set1_epi32(1 << 22);  // FP32 quiet-NaN bit

	const __m128i ExpMant = _mm_and_si128(mask_nosign, Half32);   // exponent + mantissa, sign stripped
	const __m128i SignOnly = _mm_xor_si128(Half32, ExpMant);      // isolates the sign bit
	const __m128i Shifted = _mm_slli_epi32(ExpMant, 13);          // move to FP32 bit positions
	const __m128 Scaled = _mm_mul_ps(_mm_castsi128_ps(Shifted), magic_mult); // fix up the exponent bias
	const __m128i WasInfNan = _mm_cmpgt_epi32(ExpMant, was_infnan);
	const __m128i SignShifted = _mm_slli_epi32(SignOnly, 16);     // sign to FP32 position
	const __m128 InfNanExp = _mm_and_ps(_mm_castsi128_ps(WasInfNan), exp_infnan);
	const __m128i WasNan = _mm_cmpgt_epi32(ExpMant, was_nan);
	const __m128i QuietBit = _mm_and_si128(WasNan, nan_quiet);
	const __m128 InfNanDone = _mm_or_ps(InfNanExp, _mm_castsi128_ps(QuietBit));

	const __m128 SignAndSpecials = _mm_or_ps(_mm_castsi128_ps(SignShifted), InfNanDone);
	return _mm_or_ps(Scaled, SignAndSpecials);
}
224}
225}
226
227template<class Base>
228struct TUnrealPlatformMathSSEBase : public Base
229{
230 template<typename T>
231 static FORCEINLINE int32 TruncToInt32(T F)
232 {
233 return UE4::SSE::TruncToInt32(F);
234 }
235
236 static FORCEINLINE int64 TruncToInt64(double F)
237 {
238 return UE4::SSE::TruncToInt64(F);
239 }
240
241 template<typename T>
242 static FORCEINLINE int32 RoundToInt32(T F)
243 {
244 return UE4::SSE::RoundToInt32(F);
245 }
246
247 static FORCEINLINE int64 RoundToInt64(double F)
248 {
249 return UE4::SSE::RoundToInt64(F);
250 }
251
252 template<typename T>
253 static FORCEINLINE int32 FloorToInt32(T F)
254 {
255 return UE4::SSE::FloorToInt32(F);
256 }
257
258 static FORCEINLINE int64 FloorToInt64(double F)
259 {
260 return UE4::SSE::FloorToInt64(F);
261 }
262
263 template<typename T>
264 static FORCEINLINE int32 CeilToInt32(T F)
265 {
266 return UE4::SSE::CeilToInt32(F);
267 }
268
269 static FORCEINLINE int64 CeilToInt64(double F)
270 {
271 return UE4::SSE::CeilToInt64(F);
272 }
273
274 //
275 // Wrappers for overloads in the base, required since calls declared in base struct won't redirect back to this class
276 //
277
278 static FORCEINLINE int32 TruncToInt(float F) { return TruncToInt32(F); }
279 static FORCEINLINE int64 TruncToInt(double F) { return TruncToInt64(F); }
280 static FORCEINLINE int32 FloorToInt(float F) { return FloorToInt32(F); }
281 static FORCEINLINE int64 FloorToInt(double F) { return FloorToInt64(F); }
282 static FORCEINLINE int32 RoundToInt(float F) { return RoundToInt32(F); }
283 static FORCEINLINE int64 RoundToInt(double F) { return RoundToInt64(F); }
284 static FORCEINLINE int32 CeilToInt(float F) { return CeilToInt32(F); }
285 static FORCEINLINE int64 CeilToInt(double F) { return CeilToInt64(F); }
286
287
288 template<typename T>
289 static FORCEINLINE T InvSqrt(T F)
290 {
291 return UE4::SSE::InvSqrt(F);
292 }
293
294 template<typename T>
295 static FORCEINLINE T InvSqrtEst(T F)
296 {
297 return UE4::SSE::InvSqrtEst(F);
298 }
299
300 static FORCEINLINE void VectorStoreHalf(uint16* RESTRICT Dst, const float* RESTRICT Src)
301 {
302 _mm_storeu_si64((__m128i*)Dst, UE4::SSE::FloatToHalf(_mm_loadu_ps(Src)));
303 }
304
305 static FORCEINLINE void VectorLoadHalf(float* RESTRICT Dst, const uint16* RESTRICT Src)
306 {
307 _mm_storeu_ps(Dst, UE4::SSE::HalfToFloat(_mm_loadu_si64((__m128i*)Src)));
308 }
309};
310
311#endif // PLATFORM_ENABLE_VECTORINTRINSICS
#define PLATFORM_ENABLE_VECTORINTRINSICS
Definition Platform.h:162
#define FORCEINLINE
Definition Platform.h:644
#define RESTRICT
Definition Platform.h:650
#define PLATFORM_ENABLE_VECTORINTRINSICS_NEON
Definition Platform.h:212