2021-05-07 17:00:41 +02:00
// Do not include this header directly.
//
2024-09-28 12:05:45 +02:00
// Copyright 2020-2024 Binomial LLC
2021-05-07 17:00:41 +02:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The general goal of these vectorized estimated math functions is scalability/performance.
// There are explictly no checks NaN's/Inf's on the input arguments. There are no assertions either.
// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper
// engineering analysis before relying on them.
// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.
CPPSPMD_FORCE_INLINE vfloat fmod_inv ( const vfloat & a , const vfloat & b , const vfloat & b_inv )
{
vfloat c = frac ( abs ( a * b_inv ) ) * abs ( b ) ;
return spmd_ternaryf ( a < 0 , - c , c ) ;
}
CPPSPMD_FORCE_INLINE vfloat fmod_inv_p ( const vfloat & a , const vfloat & b , const vfloat & b_inv )
{
return frac ( a * b_inv ) * b ;
}
// Avoids dividing by zero or very small values.
CPPSPMD_FORCE_INLINE vfloat safe_div ( vfloat a , vfloat b , float fDivThresh = 1e-7 f )
{
return a / spmd_ternaryf ( abs ( b ) > fDivThresh , b , spmd_ternaryf ( b < 0.0f , - fDivThresh , fDivThresh ) ) ;
}
/*
clang 9.0 .0 for win / fp : precise release
f range : 0.0000000000001250 10000000000.0000000000000000 , vals : 1073741824
log2_est ( ) :
max abs err : 0.0000023076808731
max rel err : 0.0000000756678881
avg abs err : 0.0000007535452724
avg rel err : 0.0000000235117843
XMVectorLog2 ( ) :
max abs err : 0.0000023329709933
max rel err : 0.0000000826961046
avg abs err : 0.0000007564889684
avg rel err : 0.0000000236051899
std : : log2f ( ) :
max abs err : 0.0000020265979401
max rel err : 0.0000000626647654
avg abs err : 0.0000007494445227
avg rel err : 0.0000000233800985
*/
// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
inline vfloat spmd_kernel : : log2_est ( vfloat v )
{
vfloat signif , fexp ;
// Just clamp to a very small value, instead of checking for invalid inputs.
vfloat x = max ( v , 2.2e-38 f ) ;
/*
* Assume IEEE representation , which is sgn ( 1 ) : exp ( 8 ) : frac ( 23 )
* representing ( 1 + frac ) * 2 ^ ( exp - 127 ) . Call 1 + frac the significand
*/
// get exponent
vint ux1_i = cast_vfloat_to_vint ( x ) ;
vint exp = VUINT_SHIFT_RIGHT ( ux1_i & 0x7F800000 , 23 ) ;
// actual exponent is exp-127, will subtract 127 later
vint ux2_i ;
vfloat ux2_f ;
vint greater = ux1_i & 0x00400000 ; // true if signif > 1.5
SPMD_SIF ( greater ! = 0 )
{
// signif >= 1.5 so need to divide by 2. Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1
store_all ( ux2_i , ( ux1_i & 0x007FFFFF ) | 0x3f000000 ) ;
store_all ( ux2_f , cast_vint_to_vfloat ( ux2_i ) ) ;
// 126 instead of 127 compensates for division by 2
store_all ( fexp , vfloat ( exp - 126 ) ) ;
}
SPMD_SELSE ( greater ! = 0 )
{
// get signif by stuffing exp = 127 which corresponds to an exponent of 0
store ( ux2_i , ( ux1_i & 0x007FFFFF ) | 0x3f800000 ) ;
store ( ux2_f , cast_vint_to_vfloat ( ux2_i ) ) ;
store ( fexp , vfloat ( exp - 127 ) ) ;
}
SPMD_SENDIF
store_all ( signif , ux2_f ) ;
store_all ( signif , signif - 1.0f ) ;
const float a = 0.1501692f , b = 3.4226132f , c = 5.0225057f , d = 4.1130283f , e = 3.4813372f ;
vfloat xm1 = signif ;
vfloat xm1sqr = xm1 * xm1 ;
return fexp + ( ( a * ( xm1sqr * xm1 ) + b * xm1sqr + c * xm1 ) / ( xm1sqr + d * xm1 + e ) ) ;
// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
}
// Uses log2_est(), so this function must be <= the precision of that.
inline vfloat spmd_kernel : : log_est ( vfloat v )
{
return log2_est ( v ) * 0.693147181f ;
}
CPPSPMD_FORCE_INLINE void spmd_kernel : : reduce_expb ( vfloat & arg , vfloat & two_int_a , vint & adjustment )
{
// Assume we're using equation (2)
store_all ( adjustment , 0 ) ;
// integer part of the input argument
vint int_arg = ( vint ) arg ;
// if frac(arg) is in [0.5, 1.0]...
SPMD_SIF ( ( arg - int_arg ) > 0.5f )
{
store ( adjustment , 1 ) ;
// then change it to [0.0, 0.5]
store ( arg , arg - 0.5f ) ;
}
SPMD_SENDIF
// arg == just the fractional part
store_all ( arg , arg - ( vfloat ) int_arg ) ;
// Now compute 2** (int) arg.
store_all ( int_arg , min ( int_arg + 127 , 254 ) ) ;
store_all ( two_int_a , cast_vint_to_vfloat ( VINT_SHIFT_LEFT ( int_arg , 23 ) ) ) ;
}
/*
clang 9.0 .0 for win / fp : precise release
f range : - 50.0000000000000000 49.9999940395355225 , vals : 16777216
exp2_est ( ) :
Total passed near - zero check : 16777216
Total sign diffs : 0
max abs err : 1668910609.7500000000000000
max rel err : 0.0000015642030031
avg abs err : 10793794.4007573910057545
avg rel err : 0.0000003890893282
XMVectorExp2 ( ) :
Total passed near - zero check : 16777216
Total sign diffs : 0
max abs err : 1665552836.8750000000000000
max rel err : 0.0000114674862370
avg abs err : 10771868.2627860084176064
avg rel err : 0.0000011218880770
std : : exp2f ( ) :
Total passed near - zero check : 16777216
Total sign diffs : 0
max abs err : 1591636585.6250000000000000
max rel err : 0.0000014849731018
avg abs err : 10775800.3204844966530800
avg rel err : 0.0000003851496422
*/
// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
inline vfloat spmd_kernel : : exp2_est ( vfloat arg )
{
SPMD_BEGIN_CALL
const vfloat P00 = + 7.2152891521493f ;
const vfloat P01 = + 0.0576900723731f ;
const vfloat Q00 = + 20.8189237930062f ;
const vfloat Q01 = + 1.0f ;
const vfloat sqrt2 = 1.4142135623730950488f ; // sqrt(2) for scaling
vfloat result = 0.0f ;
// Return 0 if arg is too large.
// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
SPMD_IF ( abs ( arg ) > 126.0f )
{
spmd_return ( ) ;
}
SPMD_END_IF
// 2**(int(a))
vfloat two_int_a ;
// set to 1 by reduce_expb
vint adjustment ;
// 0 if arg is +; 1 if negative
vint negative = 0 ;
// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-1) = 1/(n**x).
SPMD_SIF ( arg < 0.0f )
{
store ( arg , - arg ) ;
store ( negative , 1 ) ;
}
SPMD_SENDIF
store_all ( arg , min ( arg , 126.0f ) ) ;
// reduce to [0.0, 0.5]
reduce_expb ( arg , two_int_a , adjustment ) ;
// The format of the polynomial is:
// answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
//
// The following computes the polynomial in several steps:
// Q(x**2)
vfloat Q = vfma ( Q01 , ( arg * arg ) , Q00 ) ;
// x*P(x**2)
vfloat x_P = arg * ( vfma ( P01 , arg * arg , P00 ) ) ;
vfloat answer = ( Q + x_P ) / ( Q - x_P ) ;
// Now correct for the scaling factor of 2**(int(a))
store_all ( answer , answer * two_int_a ) ;
// If the result had a fractional part > 0.5, correct for that
store_all ( answer , spmd_ternaryf ( adjustment ! = 0 , answer * sqrt2 , answer ) ) ;
// Correct for a negative input
SPMD_SIF ( negative ! = 0 )
{
store ( answer , 1.0f / answer ) ;
}
SPMD_SENDIF
store ( result , answer ) ;
return result ;
}
inline vfloat spmd_kernel : : exp_est ( vfloat arg )
{
// e^x = exp2(x / log_base_e(2))
// constant is 1.0/(log(2)/log(e)) or 1/log(2)
return exp2_est ( arg * 1.44269504f ) ;
}
inline vfloat spmd_kernel : : pow_est ( vfloat arg1 , vfloat arg2 )
{
return exp_est ( log_est ( arg1 ) * arg2 ) ;
}
/*
clang 9.0 .0 for win / fp : precise release
Total near - zero : 144 , output above near - zero tresh : 30
Total near - zero avg : 0.0000067941016621 max : 0.0000134706497192
Total near - zero sign diffs : 5
Total passed near - zero check : 16777072
Total sign diffs : 5
max abs err : 0.0000031375306036
max rel err : 0.1140846017075028
avg abs err : 0.0000003026226621
avg rel err : 0.0000033564977623
*/
// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
inline vfloat spmd_kernel : : sincos_est_a ( vfloat a , bool sin_flag )
{
const float c0_x = 0.0f , c0_y = 0.5f , c0_z = 1.0f ;
const float c1_x = 0.25f , c1_y = - 9.0f , c1_z = 0.75f , c1_w = 0.159154943091f ;
const float c2_x = 24.9808039603f , c2_y = - 24.9808039603f , c2_z = - 60.1458091736f , c2_w = 60.1458091736f ;
const float c3_x = 85.4537887573f , c3_y = - 85.4537887573f , c3_z = - 64.9393539429f , c3_w = 64.9393539429f ;
const float c4_x = 19.7392082214f , c4_y = - 19.7392082214f , c4_z = - 1.0f , c4_w = 1.0f ;
vfloat r0_x , r0_y , r0_z , r1_x , r1_y , r1_z , r2_x , r2_y , r2_z ;
store_all ( r1_x , sin_flag ? vfms ( c1_w , a , c1_x ) : c1_w * a ) ;
store_all ( r1_y , frac ( r1_x ) ) ;
store_all ( r2_x , ( vfloat ) ( r1_y < c1_x ) ) ;
store_all ( r2_y , ( vfloat ) ( r1_y > = c1_y ) ) ;
store_all ( r2_z , ( vfloat ) ( r1_y > = c1_z ) ) ;
store_all ( r2_y , vfma ( r2_x , c4_z , vfma ( r2_y , c4_w , r2_z * c4_z ) ) ) ;
store_all ( r0_x , c0_x - r1_y ) ;
store_all ( r0_y , c0_y - r1_y ) ;
store_all ( r0_z , c0_z - r1_y ) ;
store_all ( r0_x , r0_x * r0_x ) ;
store_all ( r0_y , r0_y * r0_y ) ;
store_all ( r0_z , r0_z * r0_z ) ;
store_all ( r1_x , vfma ( c2_x , r0_x , c2_z ) ) ;
store_all ( r1_y , vfma ( c2_y , r0_y , c2_w ) ) ;
store_all ( r1_z , vfma ( c2_x , r0_z , c2_z ) ) ;
store_all ( r1_x , vfma ( r1_x , r0_x , c3_x ) ) ;
store_all ( r1_y , vfma ( r1_y , r0_y , c3_y ) ) ;
store_all ( r1_z , vfma ( r1_z , r0_z , c3_x ) ) ;
store_all ( r1_x , vfma ( r1_x , r0_x , c3_z ) ) ;
store_all ( r1_y , vfma ( r1_y , r0_y , c3_w ) ) ;
store_all ( r1_z , vfma ( r1_z , r0_z , c3_z ) ) ;
store_all ( r1_x , vfma ( r1_x , r0_x , c4_x ) ) ;
store_all ( r1_y , vfma ( r1_y , r0_y , c4_y ) ) ;
store_all ( r1_z , vfma ( r1_z , r0_z , c4_x ) ) ;
store_all ( r1_x , vfma ( r1_x , r0_x , c4_z ) ) ;
store_all ( r1_y , vfma ( r1_y , r0_y , c4_w ) ) ;
store_all ( r1_z , vfma ( r1_z , r0_z , c4_z ) ) ;
store_all ( r0_x , vfnma ( r1_x , r2_x , vfnma ( r1_y , r2_y , r1_z * - r2_z ) ) ) ;
return r0_x ;
}
// positive values only
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : recip_est1 ( const vfloat & q )
{
//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
const int mag = 0x7EF311C3 ;
const float fMinThresh = .0000125f ;
vfloat l = spmd_ternaryf ( q > = fMinThresh , q , cast_vint_to_vfloat ( vint ( mag ) ) ) ;
vint x_l = vint ( mag ) - cast_vfloat_to_vint ( l ) ;
vfloat rcp_l = cast_vint_to_vfloat ( x_l ) ;
return rcp_l * vfnma ( rcp_l , q , 2.0f ) ;
}
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : recip_est1_pn ( const vfloat & t )
{
//const int mag = 0x7EF312AC; // 2 NR iters, 3 is 0x7EEEEBB3
const int mag = 0x7EF311C3 ;
const float fMinThresh = .0000125f ;
vfloat s = sign ( t ) ;
vfloat q = abs ( t ) ;
vfloat l = spmd_ternaryf ( q > = fMinThresh , q , cast_vint_to_vfloat ( vint ( mag ) ) ) ;
vint x_l = vint ( mag ) - cast_vfloat_to_vint ( l ) ;
vfloat rcp_l = cast_vint_to_vfloat ( x_l ) ;
return rcp_l * vfnma ( rcp_l , q , 2.0f ) * s ;
}
// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : rsqrt_est1 ( vfloat x0 )
{
vfloat xhalf = 0.5f * x0 ;
vfloat x = cast_vint_to_vfloat ( vint ( 0x5F375A82 ) - ( VINT_SHIFT_RIGHT ( cast_vfloat_to_vint ( x0 ) , 1 ) ) ) ;
return x * vfnma ( xhalf * x , x , 1.5008909f ) ;
}
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : rsqrt_est2 ( vfloat x0 )
{
vfloat xhalf = 0.5f * x0 ;
vfloat x = cast_vint_to_vfloat ( vint ( 0x5F37599E ) - ( VINT_SHIFT_RIGHT ( cast_vfloat_to_vint ( x0 ) , 1 ) ) ) ;
vfloat x1 = x * vfnma ( xhalf * x , x , 1.5 ) ;
vfloat x2 = x1 * vfnma ( xhalf * x1 , x1 , 1.5 ) ;
return x2 ;
}
// Math from: http://developer.download.nvidia.com/cg/atan2.html
// TODO: Needs more validation, parameter checking.
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : atan2_est ( vfloat y , vfloat x )
{
vfloat t1 = abs ( y ) ;
vfloat t3 = abs ( x ) ;
vfloat t0 = max ( t3 , t1 ) ;
store_all ( t1 , min ( t3 , t1 ) ) ;
store_all ( t3 , t1 / t0 ) ;
vfloat t4 = t3 * t3 ;
store_all ( t0 , vfma ( - 0.013480470f , t4 , 0.057477314f ) ) ;
store_all ( t0 , vfms ( t0 , t4 , 0.121239071f ) ) ;
store_all ( t0 , vfma ( t0 , t4 , 0.195635925f ) ) ;
store_all ( t0 , vfms ( t0 , t4 , 0.332994597f ) ) ;
store_all ( t0 , vfma ( t0 , t4 , 0.999995630f ) ) ;
store_all ( t3 , t0 * t3 ) ;
store_all ( t3 , spmd_ternaryf ( abs ( y ) > abs ( x ) , vfloat ( 1.570796327f ) - t3 , t3 ) ) ;
store_all ( t3 , spmd_ternaryf ( x < 0.0f , vfloat ( 3.141592654f ) - t3 , t3 ) ) ;
store_all ( t3 , spmd_ternaryf ( y < 0.0f , - t3 , t3 ) ) ;
return t3 ;
}
/*
clang 9.0 .0 for win / fp : precise release
Tested range : - 25.1327412287183449 25.1327382326621169 , vals : 16777216
Skipped angles near 90 / 270 within + - .001 radians .
Near - zero threshold : .0000125f
Near - zero output above check threshold : 1e-6 f
Total near - zero : 144 , output above near - zero tresh : 20
Total near - zero avg : 0.0000067510751968 max : 0.0000133514404297
Total near - zero sign diffs : 5
Total passed near - zero check : 16766400
Total sign diffs : 5
max abs err : 1.4982600811139264
max rel err : 0.1459155900188041
avg rel err : 0.0000054659502568
XMVectorTan ( ) precise :
Total near - zero : 144 , output above near - zero tresh : 18
Total near - zero avg : 0.0000067641216186 max : 0.0000133524126795
Total near - zero sign diffs : 0
Total passed near - zero check : 16766400
Total sign diffs : 0
max abs err : 1.9883573246424930
max rel err : 0.1459724171926864
avg rel err : 0.0000054965766843
std : : tanf ( ) :
Total near - zero : 144 , output above near - zero tresh : 0
Total near - zero avg : 0.0000067116930779 max : 0.0000127713074107
Total near - zero sign diffs : 11
Total passed near - zero check : 16766400
Total sign diffs : 11
max abs err : 0.8989131818294709
max rel err : 0.0573181403173166
avg rel err : 0.0000030791301203
Originally from :
http : //www.ganssle.com/approx.htm
*/
CPPSPMD_FORCE_INLINE vfloat spmd_kernel : : tan82 ( vfloat x )
{
// Original double version was 8.2 digits
//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
// Tuned float constants for lower avg rel error (without using FMA3):
const float c1 = 211.849350f , c2 = - 12.5288887f , c3 = 269.734985f , c4 = - 71.4145203f ;
vfloat x2 = x * x ;
return ( x * ( vfma ( c2 , x2 , c1 ) ) / ( vfma ( x2 , ( c4 + x2 ) , c3 ) ) ) ;
}
// Don't call this for angles close to 90/270!.
inline vfloat spmd_kernel : : tan_est ( vfloat x )
{
const float fPi = 3.141592653589793f , fOneOverPi = 0.3183098861837907f ;
CPPSPMD_DECL ( const uint8_t , s_table0 [ 16 ] ) = { 128 + 0 , 128 + 2 , 128 + - 2 , 128 + 4 , 128 + 0 , 128 + 2 , 128 + - 2 , 128 + 4 , 128 + 0 , 128 + 2 , 128 + - 2 , 128 + 4 , 128 + 0 , 128 + 2 , 128 + - 2 , 128 + 4 } ;
vint table = init_lookup4 ( s_table0 ) ; // a load
vint sgn = cast_vfloat_to_vint ( x ) & 0x80000000 ;
store_all ( x , abs ( x ) ) ;
vfloat orig_x = x ;
vfloat q = x * fOneOverPi ;
store_all ( x , q - floor ( q ) ) ;
vfloat x4 = x * 4.0f ;
vint octant = ( vint ) ( x4 ) ;
vfloat x0 = spmd_ternaryf ( ( octant & 1 ) ! = 0 , - x4 , x4 ) ;
vint k = table_lookup4_8 ( octant , table ) & 0xFF ; // a shuffle
vfloat bias = ( vfloat ) k + - 128.0f ;
vfloat y = x0 + bias ;
vfloat z = tan82 ( y ) ;
vfloat r ;
vbool octant_one_or_two = ( octant = = 1 ) | | ( octant = = 2 ) ;
// SPMD optimization - skip costly divide if we can
if ( spmd_any ( octant_one_or_two ) )
{
const float fDivThresh = .4371e-7 f ;
vfloat one_over_z = 1.0f / spmd_ternaryf ( abs ( z ) > fDivThresh , z , spmd_ternaryf ( z < 0.0f , - fDivThresh , fDivThresh ) ) ;
vfloat b = spmd_ternaryf ( octant_one_or_two , one_over_z , z ) ;
store_all ( r , spmd_ternaryf ( ( octant & 2 ) ! = 0 , - b , b ) ) ;
}
else
{
store_all ( r , spmd_ternaryf ( octant = = 0 , z , - z ) ) ;
}
// Small angle approximation, to decrease the max rel error near Pi.
SPMD_SIF ( x > = ( 1.0f - .0003125f * 4.0f ) )
{
store ( r , vfnma ( floor ( q ) + 1.0f , fPi , orig_x ) ) ;
}
SPMD_SENDIF
return cast_vint_to_vfloat ( cast_vfloat_to_vint ( r ) ^ sgn ) ;
}
inline void spmd_kernel : : seed_rand ( rand_context & x , vint seed )
{
store ( x . a , 0xf1ea5eed ) ;
store ( x . b , seed ^ 0xd8487b1f ) ;
store ( x . c , seed ^ 0xdbadef9a ) ;
store ( x . d , seed ) ;
for ( int i = 0 ; i < 20 ; + + i )
( void ) get_randu ( x ) ;
}
// https://burtleburtle.net/bob/rand/smallprng.html
// Returns 32-bit unsigned random numbers.
inline vint spmd_kernel : : get_randu ( rand_context & x )
{
vint e = x . a - VINT_ROT ( x . b , 27 ) ;
store ( x . a , x . b ^ VINT_ROT ( x . c , 17 ) ) ;
store ( x . b , x . c + x . d ) ;
store ( x . c , x . d + e ) ;
store ( x . d , e + x . a ) ;
return x . d ;
}
// Returns random numbers between [low, high), or low if low >= high
inline vint spmd_kernel : : get_randi ( rand_context & x , vint low , vint high )
{
vint rnd = get_randu ( x ) ;
vint range = high - low ;
vint rnd_range = mulhiu ( rnd , range ) ;
return spmd_ternaryi ( low < high , low + rnd_range , low ) ;
}
// Returns random numbers between [low, high), or low if low >= high
inline vfloat spmd_kernel : : get_randf ( rand_context & x , vfloat low , vfloat high )
{
vint rndi = get_randu ( x ) & 0x7fffff ;
vfloat rnd = ( vfloat ) ( rndi ) * ( 1.0f / 8388608.0f ) ;
return spmd_ternaryf ( low < high , vfma ( high - low , rnd , low ) , low ) ;
}
CPPSPMD_FORCE_INLINE void spmd_kernel : : init_reverse_bits ( vint & tab1 , vint & tab2 )
{
const uint8_t tab1_bytes [ 16 ] = { 0 , 8 , 4 , 12 , 2 , 10 , 6 , 14 , 1 , 9 , 5 , 13 , 3 , 11 , 7 , 15 } ;
const uint8_t tab2_bytes [ 16 ] = { 0 , 8 < < 4 , 4 < < 4 , 12 < < 4 , 2 < < 4 , 10 < < 4 , 6 < < 4 , 14 < < 4 , 1 < < 4 , 9 < < 4 , 5 < < 4 , 13 < < 4 , 3 < < 4 , 11 < < 4 , 7 < < 4 , 15 < < 4 } ;
store_all ( tab1 , init_lookup4 ( tab1_bytes ) ) ;
store_all ( tab2 , init_lookup4 ( tab2_bytes ) ) ;
}
CPPSPMD_FORCE_INLINE vint spmd_kernel : : reverse_bits ( vint k , vint tab1 , vint tab2 )
{
vint r0 = table_lookup4_8 ( k & 0x7F7F7F7F , tab2 ) ;
vint r1 = table_lookup4_8 ( VUINT_SHIFT_RIGHT ( k , 4 ) & 0x7F7F7F7F , tab1 ) ;
vint r3 = r0 | r1 ;
return byteswap ( r3 ) ;
}
CPPSPMD_FORCE_INLINE vint spmd_kernel : : count_leading_zeros ( vint x )
{
CPPSPMD_DECL ( const uint8_t , s_tab [ 16 ] ) = { 0 , 3 , 2 , 2 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 } ;
vint tab = init_lookup4 ( s_tab ) ;
//x <= 0x0000ffff
vbool c0 = ( x & 0xFFFF0000 ) = = 0 ;
vint n0 = spmd_ternaryi ( c0 , 16 , 0 ) ;
vint x0 = spmd_ternaryi ( c0 , VINT_SHIFT_LEFT ( x , 16 ) , x ) ;
//x <= 0x00ffffff
vbool c1 = ( x0 & 0xFF000000 ) = = 0 ;
vint n1 = spmd_ternaryi ( c1 , n0 + 8 , n0 ) ;
vint x1 = spmd_ternaryi ( c1 , VINT_SHIFT_LEFT ( x0 , 8 ) , x0 ) ;
//x <= 0x0fffffff
vbool c2 = ( x1 & 0xF0000000 ) = = 0 ;
vint n2 = spmd_ternaryi ( c2 , n1 + 4 , n1 ) ;
vint x2 = spmd_ternaryi ( c2 , VINT_SHIFT_LEFT ( x1 , 4 ) , x1 ) ;
return table_lookup4_8 ( VUINT_SHIFT_RIGHT ( x2 , 28 ) , tab ) + n2 ;
}
CPPSPMD_FORCE_INLINE vint spmd_kernel : : count_leading_zeros_alt ( vint x )
{
//x <= 0x0000ffff
vbool c0 = ( x & 0xFFFF0000 ) = = 0 ;
vint n0 = spmd_ternaryi ( c0 , 16 , 0 ) ;
vint x0 = spmd_ternaryi ( c0 , VINT_SHIFT_LEFT ( x , 16 ) , x ) ;
//x <= 0x00ffffff
vbool c1 = ( x0 & 0xFF000000 ) = = 0 ;
vint n1 = spmd_ternaryi ( c1 , n0 + 8 , n0 ) ;
vint x1 = spmd_ternaryi ( c1 , VINT_SHIFT_LEFT ( x0 , 8 ) , x0 ) ;
//x <= 0x0fffffff
vbool c2 = ( x1 & 0xF0000000 ) = = 0 ;
vint n2 = spmd_ternaryi ( c2 , n1 + 4 , n1 ) ;
vint x2 = spmd_ternaryi ( c2 , VINT_SHIFT_LEFT ( x1 , 4 ) , x1 ) ;
// x <= 0x3fffffff
vbool c3 = ( x2 & 0xC0000000 ) = = 0 ;
vint n3 = spmd_ternaryi ( c3 , n2 + 2 , n2 ) ;
vint x3 = spmd_ternaryi ( c3 , VINT_SHIFT_LEFT ( x2 , 2 ) , x2 ) ;
// x <= 0x7fffffff
vbool c4 = ( x3 & 0x80000000 ) = = 0 ;
return spmd_ternaryi ( c4 , n3 + 1 , n3 ) ;
}
CPPSPMD_FORCE_INLINE vint spmd_kernel : : count_trailing_zeros ( vint x )
{
// cast the least significant bit in v to a float
vfloat f = ( vfloat ) ( x & - x ) ;
// extract exponent and adjust
return VUINT_SHIFT_RIGHT ( cast_vfloat_to_vint ( f ) , 23 ) - 0x7F ;
}
CPPSPMD_FORCE_INLINE vint spmd_kernel : : count_set_bits ( vint x )
{
vint v = x - ( VUINT_SHIFT_RIGHT ( x , 1 ) & 0x55555555 ) ;
vint v1 = ( v & 0x33333333 ) + ( VUINT_SHIFT_RIGHT ( v , 2 ) & 0x33333333 ) ;
2024-09-28 12:05:45 +02:00
return VUINT_SHIFT_RIGHT ( ( ( v1 + ( VUINT_SHIFT_RIGHT ( v1 , 4 ) & 0xF0F0F0F ) ) * 0x1010101 ) , 24 ) ;
2021-05-07 17:00:41 +02:00
}
CPPSPMD_FORCE_INLINE vint cmple_epu16 ( const vint & a , const vint & b )
{
return cmpeq_epi16 ( subs_epu16 ( a , b ) , vint ( 0 ) ) ;
}
CPPSPMD_FORCE_INLINE vint cmpge_epu16 ( const vint & a , const vint & b )
{
return cmple_epu16 ( b , a ) ;
}
CPPSPMD_FORCE_INLINE vint cmpgt_epu16 ( const vint & a , const vint & b )
{
return andnot ( cmpeq_epi16 ( a , b ) , cmple_epu16 ( b , a ) ) ;
}
CPPSPMD_FORCE_INLINE vint cmplt_epu16 ( const vint & a , const vint & b )
{
return cmpgt_epu16 ( b , a ) ;
}
CPPSPMD_FORCE_INLINE vint cmpge_epi16 ( const vint & a , const vint & b )
{
return cmpeq_epi16 ( a , b ) | cmpgt_epi16 ( a , b ) ;
}
CPPSPMD_FORCE_INLINE vint cmple_epi16 ( const vint & a , const vint & b )
{
return cmpge_epi16 ( b , a ) ;
}
void spmd_kernel : : print_vint ( vint v )
{
for ( uint32_t i = 0 ; i < PROGRAM_COUNT ; i + + )
printf ( " %i " , extract ( v , i ) ) ;
printf ( " \n " ) ;
}
void spmd_kernel : : print_vbool ( vbool v )
{
for ( uint32_t i = 0 ; i < PROGRAM_COUNT ; i + + )
printf ( " %i " , extract ( v , i ) ? 1 : 0 ) ;
printf ( " \n " ) ;
}
void spmd_kernel : : print_vint_hex ( vint v )
{
for ( uint32_t i = 0 ; i < PROGRAM_COUNT ; i + + )
printf ( " 0x%X " , extract ( v , i ) ) ;
printf ( " \n " ) ;
}
void spmd_kernel : : print_active_lanes ( const char * pPrefix )
{
CPPSPMD_DECL ( int , flags [ PROGRAM_COUNT ] ) ;
memset ( flags , 0 , sizeof ( flags ) ) ;
storeu_linear ( flags , vint ( 1 ) ) ;
if ( pPrefix )
printf ( " %s " , pPrefix ) ;
for ( uint32_t i = 0 ; i < PROGRAM_COUNT ; i + + )
{
if ( flags [ i ] )
printf ( " %u " , i ) ;
}
printf ( " \n " ) ;
}
void spmd_kernel : : print_vfloat ( vfloat v )
{
for ( uint32_t i = 0 ; i < PROGRAM_COUNT ; i + + )
printf ( " %f " , extract ( v , i ) ) ;
printf ( " \n " ) ;
}