8#include "HAL/PlatformMisc.h"
// NOTE(review): fragment of the TUnrealPlatformMathSSEBase declaration — the
// template header above and the body are not visible in this chunk; confirm
// against the full file before editing.
14struct TUnrealPlatformMathSSEBase :
public Base
// -- InvSqrt(float) body fragment ---------------------------------------------
// Computes 1/sqrt(InValue) exactly: a scalar SSE square root followed by a
// divide, rather than the fast-but-approximate _mm_rsqrt_ss path used by
// InvSqrtEst below.
// NOTE(review): the enclosing signature, the "float temp;" declaration, and
// the final "return temp;" are not visible in this chunk.
26 const __m128 One = _mm_set_ss(1.0f);
27 const __m128 Y0 = _mm_set_ss(InValue);
// X0 = sqrt(InValue); R0 = 1 / sqrt(InValue).
28 const __m128 X0 = _mm_sqrt_ss(Y0);
29 const __m128 R0 = _mm_div_ss(One, X0);
// Spill the low lane back to scalar memory for the (not visible) return.
31 _mm_store_ss(&temp, R0);
// -- InvSqrt(double) body fragment --------------------------------------------
// Double-precision analogue of the block above: 1.0 / sqrt(InValue).
// _mm_sqrt_sd(One, Y0) takes the square root of Y0's low lane and copies the
// upper lane from One; only the low lane is stored below.
37 const __m128d One = _mm_set_sd(1.0);
38 const __m128d Y0 = _mm_set_sd(InValue);
39 const __m128d X0 = _mm_sqrt_sd(One, Y0);
40 const __m128d R0 = _mm_div_sd(One, X0);
// NOTE(review): "double temp;" and "return temp;" are outside this chunk.
42 _mm_store_sd(&temp, R0);
// -- InvSqrtEst(float) body fragment ------------------------------------------
// Fast approximate 1/sqrt: the hardware reciprocal-sqrt estimate (about 12
// accurate bits) refined with one Newton-Raphson step,
//   X1 = X0 + X0 * (0.5 - (F/2) * X0^2),
// which roughly doubles the number of accurate bits.
49 const __m128 fOneHalf = _mm_set_ss(0.5f);
50 __m128 Y0, X0, X1, FOver2;
// NOTE(review): the line loading InValue into Y0 is not visible in this chunk.
54 X0 = _mm_rsqrt_ss(Y0);
55 FOver2 = _mm_mul_ss(Y0, fOneHalf);
// One Newton-Raphson refinement of the estimate.
58 X1 = _mm_mul_ss(X0, X0);
59 X1 = _mm_sub_ss(fOneHalf, _mm_mul_ss(FOver2, X1));
60 X1 = _mm_add_ss(X0, _mm_mul_ss(X0, X1));
// NOTE(review): "float temp;" and "return temp;" are outside this chunk.
62 _mm_store_ss(&temp, X1);
68 return InvSqrt(InValue);
// -- TruncToInt32/TruncToInt64 body fragments ---------------------------------
// Round-toward-zero conversions using the truncating cvtt* instructions,
// independent of the current MXCSR rounding mode.
73 return _mm_cvtt_ss2si(_mm_set_ss(F));
// double -> int32 and double -> int64 variants.
78 return _mm_cvttsd_si32(_mm_set_sd(InValue));
83 return _mm_cvttsd_si64(_mm_set_sd(InValue));
// -- FloorToInt32/FloorToInt64 body fragments ---------------------------------
// cvt* rounds to nearest-even; converting (2F - 0.5) and shifting the result
// right by one yields floor(F) for all inputs — the doubling sidesteps the
// ties-to-even behaviour at exact .5 fractions (e.g. F = -0.5 -> cvt(-1.5) =
// -2 -> >>1 = -1 = floor(-0.5)).
90 return _mm_cvt_ss2si(_mm_set_ss(F + F - 0.5f)) >> 1;
95 return _mm_cvtsd_si32(_mm_set_sd(InValue + InValue - 0.5)) >> 1;
100 return _mm_cvtsd_si64(_mm_set_sd(InValue + InValue - 0.5)) >> 1;
// -- RoundToInt32/RoundToInt64 body fragments ---------------------------------
// Round to nearest with ties toward +infinity, i.e. equivalent to
// FloorToInt(F + 0.5). The x2 works around cvt*'s ties-to-even behaviour at
// .5 fractions (e.g. F = 0.5 -> cvt(1.5) = 2 -> >>1 = 1).
110 return _mm_cvt_ss2si(_mm_set_ss(F + F + 0.5f)) >> 1;
115 return _mm_cvtsd_si32(_mm_set_sd(InValue + InValue + 0.5)) >> 1;
120 return _mm_cvtsd_si64(_mm_set_sd(InValue + InValue + 0.5)) >> 1;
// -- CeilToInt32/CeilToInt64 body fragments -----------------------------------
// ceil(F) = -floor(-F): reuse the (2x - 0.5) >> 1 floor trick on the negated
// input and negate the result.
127 return -(_mm_cvt_ss2si(_mm_set_ss(-0.5f - (F + F))) >> 1);
132 return -(_mm_cvtsd_si32(_mm_set_sd(-0.5 - (InValue + InValue))) >> 1);
137 return -(_mm_cvtsd_si64(_mm_set_sd(-0.5 - (InValue + InValue))) >> 1);
// Converts 4 packed floats to 4 half-precision (IEEE 754 binary16) values,
// with round-to-nearest-even, NaN payload preservation, and subnormal
// handling — the well-known branch-free SSE2 float->half sequence.
// The four halves are returned in the low 4 x 16-bit lanes.
// NOTE(review): the function's opening/closing braces are not visible in this
// chunk.
142 inline __m128i FloatToHalf(__m128 f)
// Constants: sign-bit mask; smallest float too large for a finite half
// (exponent 127+16); fp16 quiet-NaN bit and 9-bit payload mask; +infinity in
// fp16 bit pattern; smallest normal half expressed as float bits; magic float
// used to rescale subnormals; and the rounding bias applied to normals.
144 const __m128 mask_sign = _mm_set1_ps(-0.0f);
145 const __m128i c_f16max = _mm_set1_epi32((127 + 16) << 23);
146 const __m128i c_nanbit = _mm_set1_epi32(0x200);
147 const __m128i c_nanlobits = _mm_set1_epi32(0x1ff);
148 const __m128i c_infty_as_fp16 = _mm_set1_epi32(0x7c00);
149 const __m128i c_min_normal = _mm_set1_epi32((127 - 14) << 23);
150 const __m128i c_subnorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
151 const __m128i c_normal_bias = _mm_set1_epi32(0xfff - ((127 - 15) << 23));
// Split off the sign; everything below classifies the absolute value, which
// also makes the signed integer compares valid (absf_int is non-negative).
154 __m128 justsign = _mm_and_ps( f , mask_sign );
156 __m128 absf = _mm_andnot_ps(mask_sign, f);
157 __m128i absf_int = _mm_castps_si128(absf);
// NaN detection (unordered self-compare) and "fits in a finite half" test.
158 __m128 b_isnan = _mm_cmpunord_ps(absf, absf);
159 __m128i b_isregular = _mm_cmpgt_epi32(c_f16max, absf_int);
// For NaNs: keep the top 9 mantissa payload bits and force the quiet bit so
// the result can never collapse to the infinity bit pattern.
160 __m128i nan_payload = _mm_and_si128(_mm_srli_epi32(absf_int, 13), c_nanlobits);
161 __m128i nan_quiet = _mm_or_si128(nan_payload, c_nanbit);
162 __m128i nanfinal = _mm_and_si128(_mm_castps_si128(b_isnan), nan_quiet);
163 __m128i inf_or_nan = _mm_or_si128(nanfinal, c_infty_as_fp16);
// Values below the smallest normal half become fp16 subnormals: adding the
// magic float aligns the mantissa so the sum's low bits are exactly the
// subnormal half (with correct rounding done by the FPU add).
165 __m128i b_issub = _mm_cmpgt_epi32(c_min_normal, absf_int);
168 __m128 subnorm1 = _mm_add_ps( absf , __m128(_mm_castsi128_ps(c_subnorm_magic)) );
169 __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), c_subnorm_magic);
// Round-to-nearest-even for normals: mantodd is all-ones when the resulting
// fp16 mantissa's bit 0 would be 1, so "round1 - mantodd" adds one extra ulp
// of bias only in that case (0xfff vs 0x1000), breaking ties to even.
172 __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13);
173 __m128i mantodd = _mm_srai_epi32(mantoddbit, 31);
175 __m128i round1 = _mm_add_epi32(absf_int, c_normal_bias);
176 __m128i round2 = _mm_sub_epi32(round1, mantodd);
177 __m128i normal = _mm_srli_epi32(round2, 13);
// Branch-free selects: subnormal vs normal, then regular vs inf/NaN.
180 __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
183 __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
// Move the sign from bit 31 to bit 15 (arithmetic shift sign-extends the high
// half, which the saturating pack below tolerates) and attach it.
185 __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
186 __m128i rgba_half32 = _mm_or_si128(joined, sign_shift);
// Pack the four 32-bit lanes into 16-bit lanes; safe because every lane is in
// int16 range after the shift above.
191 __m128i four_halfs_u64 = _mm_packs_epi32(rgba_half32, rgba_half32);
193 return four_halfs_u64;
// Converts the 4 half-precision values in the low 64 bits of the input to 4
// packed floats: shift the fp16 exponent+mantissa into fp32 position, fix the
// exponent bias with a power-of-two multiply (which also normalizes fp16
// subnormals), then patch inf/NaN exponents and force NaNs quiet.
// NOTE(review): the function's braces and the final "return result;" are not
// visible in this chunk.
197 inline __m128 HalfToFloat(__m128i four_halfs_u64)
// Widen the four u16 halves into the four 32-bit lanes.
199 __m128i rgba_half32 = _mm_unpacklo_epi16(four_halfs_u64, _mm_setzero_si128());
// Constants: fp16 non-sign mask; 2^(239-127) bias-fix multiplier as a float;
// 0x7bff = largest finite fp16 expmant (greater => inf or NaN); fp32
// all-ones exponent; 0x7c00 = fp16 infinity (greater => NaN); fp32 quiet bit.
201 const __m128i mask_nosign = _mm_set1_epi32(0x7fff);
202 const __m128 magic_mult = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23));
203 const __m128i was_infnan = _mm_set1_epi32(0x7bff);
204 const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23));
205 const __m128i was_nan = _mm_set1_epi32(0x7c00);
206 const __m128i nan_quiet = _mm_set1_epi32(1 << 22);
// Separate exponent+mantissa from the sign bit (xor removes expmant, leaving
// only the sign).
208 __m128i expmant = _mm_and_si128(mask_nosign, rgba_half32);
209 __m128i justsign = _mm_xor_si128(rgba_half32, expmant);
// Shift expmant into fp32 position and rescale: the multiply rebases the
// exponent from bias 15 to bias 127 and correctly normalizes subnormals.
210 __m128i shifted = _mm_slli_epi32(expmant, 13);
211 __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), magic_mult);
// Inputs that were inf/NaN need their exponent forced to all-ones.
212 __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant, was_infnan);
// Sign moves from fp16 bit 15 to fp32 bit 31.
213 __m128i sign = _mm_slli_epi32(justsign, 16);
214 __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), exp_infnan);
// Inputs that were NaN additionally get the fp32 quiet bit set.
215 __m128i b_wasnan = _mm_cmpgt_epi32(expmant, was_nan);
216 __m128i nanquiet = _mm_and_si128(b_wasnan, nan_quiet);
217 __m128 infnandone = _mm_or_ps(infnanexp, _mm_castsi128_ps(nanquiet));
// Merge sign, the inf/NaN patch, and the scaled value.
219 __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnandone);
220 __m128 result = _mm_or_ps(scaled, sign_inf);
// NOTE(review): header of the TUnrealPlatformMathSSEBase wrapper struct, whose
// members forward to the UE4::SSE free functions above. The template parameter
// list and the struct's closing brace are not visible in this chunk.
228struct TUnrealPlatformMathSSEBase :
public Base
// Thin dispatch bodies: each forwards to the matching free function in the
// UE4::SSE namespace above. NOTE(review): the enclosing method signatures
// (TruncToInt32/64, RoundToInt32/64, FloorToInt32/64, CeilToInt32/64) are not
// visible in this chunk.
233 return UE4::SSE::TruncToInt32(F);
238 return UE4::SSE::TruncToInt64(F);
244 return UE4::SSE::RoundToInt32(F);
249 return UE4::SSE::RoundToInt64(F);
255 return UE4::SSE::FloorToInt32(F);
260 return UE4::SSE::FloorToInt64(F);
266 return UE4::SSE::CeilToInt32(F);
271 return UE4::SSE::CeilToInt64(F);
278 static FORCEINLINE int32 TruncToInt(
float F) {
return TruncToInt32(F); }
279 static FORCEINLINE int64 TruncToInt(
double F) {
return TruncToInt64(F); }
280 static FORCEINLINE int32 FloorToInt(
float F) {
return FloorToInt32(F); }
281 static FORCEINLINE int64 FloorToInt(
double F) {
return FloorToInt64(F); }
282 static FORCEINLINE int32 RoundToInt(
float F) {
return RoundToInt32(F); }
283 static FORCEINLINE int64 RoundToInt(
double F) {
return RoundToInt64(F); }
284 static FORCEINLINE int32 CeilToInt(
float F) {
return CeilToInt32(F); }
285 static FORCEINLINE int64 CeilToInt(
double F) {
return CeilToInt64(F); }
// Dispatch bodies forwarding InvSqrt / InvSqrtEst to the SSE implementations
// above. NOTE(review): the enclosing method signatures are not visible in this
// chunk.
291 return UE4::SSE::InvSqrt(F);
297 return UE4::SSE::InvSqrtEst(F);
// Convert the 4 floats at Src to 4 halves (8 bytes) at Dst using
// unaligned-safe loads/stores.
302 _mm_storeu_si64((__m128i*)Dst, UE4::SSE::FloatToHalf(_mm_loadu_ps(Src)));
// Convert the 4 halves (8 bytes) at Src to 4 floats at Dst, unaligned-safe.
307 _mm_storeu_ps(Dst, UE4::SSE::HalfToFloat(_mm_loadu_si64((__m128i*)Src)));