4
0
mirror of git://sourceware.org/git/newlib-cygwin.git synced 2025-01-27 01:27:21 +08:00
Kwok Cheung Yeung e18743072b amdgcn: Add vectorized math routines
This implements a set of vectorized math routines to be used by the
compiler auto-vectorizer.  Versions for vectors with 2 lanes up to
64 lanes (in powers of 2) are provided.

These routines are based on the scalar versions of the math routines in
libm/common, libm/math and libm/mathfp.  They make extensive use of the GCC
C vector extensions and GCN-specific builtins in GCC.
2023-01-18 13:22:58 -05:00

322 lines
10 KiB
C

/*
* Copyright 2023 Siemens
*
* The authors hereby grant permission to use, copy, modify, distribute,
* and license this software and its documentation for any purpose, provided
* that existing copyright notices are retained in all copies and that this
* notice is included verbatim in any distributions. No written agreement,
* license, or royalty fee is required for any of the authorized uses.
* Modifications to this software may be copyrighted by their authors
* and need not follow the licensing terms described here, provided that
* the new terms are clearly indicated on the first page of each file where
* they apply.
*/
/* Macro library used to help during conversion of scalar math functions to
vectorized SIMD equivalents on AMD GCN. */
/* Optimizing at -O2 and above currently results in ICEs when converting
between vector types, so everything after this pragma is built at -O1. */
#pragma GCC optimize ("O1")
/* Single-precision float vectors, 2 to 64 lanes.  The byte size handed to
   vector_size is spelled as lanes * element size.  */
typedef float v2sf __attribute__ ((vector_size (2 * sizeof (float))));
typedef float v4sf __attribute__ ((vector_size (4 * sizeof (float))));
typedef float v8sf __attribute__ ((vector_size (8 * sizeof (float))));
typedef float v16sf __attribute__ ((vector_size (16 * sizeof (float))));
typedef float v32sf __attribute__ ((vector_size (32 * sizeof (float))));
typedef float v64sf __attribute__ ((vector_size (64 * sizeof (float))));
/* Double-precision float vectors, 2 to 64 lanes.  */
typedef double v2df __attribute__ ((vector_size (2 * sizeof (double))));
typedef double v4df __attribute__ ((vector_size (4 * sizeof (double))));
typedef double v8df __attribute__ ((vector_size (8 * sizeof (double))));
typedef double v16df __attribute__ ((vector_size (16 * sizeof (double))));
typedef double v32df __attribute__ ((vector_size (32 * sizeof (double))));
typedef double v64df __attribute__ ((vector_size (64 * sizeof (double))));
/* Signed 32-bit integer vectors, 2 to 64 lanes.  */
typedef int v2si __attribute__ ((vector_size (2 * sizeof (int))));
typedef int v4si __attribute__ ((vector_size (4 * sizeof (int))));
typedef int v8si __attribute__ ((vector_size (8 * sizeof (int))));
typedef int v16si __attribute__ ((vector_size (16 * sizeof (int))));
typedef int v32si __attribute__ ((vector_size (32 * sizeof (int))));
typedef int v64si __attribute__ ((vector_size (64 * sizeof (int))));
/* Unsigned counterpart; only the 64-lane width is declared.  */
typedef unsigned int v64usi
  __attribute__ ((vector_size (64 * sizeof (unsigned int))));
/* 64-bit integer vectors, 2 to 64 lanes.  The lane counts in the names
   rely on long being 8 bytes wide, as the rest of this file does (see
   VECTOR_WIDTH).  */
typedef long v2di __attribute__ ((vector_size (2 * sizeof (long))));
typedef long v4di __attribute__ ((vector_size (4 * sizeof (long))));
typedef long v8di __attribute__ ((vector_size (8 * sizeof (long))));
typedef long v16di __attribute__ ((vector_size (16 * sizeof (long))));
typedef long v32di __attribute__ ((vector_size (32 * sizeof (long))));
typedef long v64di __attribute__ ((vector_size (64 * sizeof (long))));
/* Overlay of every vector type declared above.  All members share the same
   storage, so a value written through one member can be read back through
   another (VECTOR_INIT_MASK writes t_v64di and may return t_v64si).  The
   union is as large as its largest members (the 512-byte v64df/v64di).  */
typedef union {
/* Single-precision float views.  */
v2sf t_v2sf;
v4sf t_v4sf;
v8sf t_v8sf;
v16sf t_v16sf;
v32sf t_v32sf;
v64sf t_v64sf;
/* Double-precision float views.  */
v2df t_v2df;
v4df t_v4df;
v8df t_v8df;
v16df t_v16df;
v32df t_v32df;
v64df t_v64df;
/* 32-bit integer views (signed, then the one unsigned type).  */
v2si t_v2si;
v4si t_v4si;
v8si t_v8si;
v16si t_v16si;
v32si t_v32si;
v64si t_v64si;
v64usi t_v64usi;
/* 64-bit integer views.  */
v2di t_v2di;
v4di t_v4di;
v8di t_v8di;
v16di t_v16di;
v32di t_v32di;
v64di t_v64di;
} vector_union;
/* Cast between vectors with a different number of elements.

   TO_T is the destination vector type; FROM is the source vector value and
   is evaluated exactly once.  The low min (sizeof (TO_T), sizeof (FROM))
   bytes of FROM are copied into the result, so narrowing keeps the leading
   lanes.  When widening, the extra lanes of the result are zero: the copy
   is bounded by the size of the source, so nothing is read from beyond the
   end of the (smaller) source object, and no object is accessed through a
   pointer to an incompatible type.  */
#define RESIZE_VECTOR(to_t, from) \
({ \
  __auto_type __from = (from); \
  to_t __to = { 0 }; \
  __builtin_memcpy (&__to, &__from, \
                    sizeof (__to) < sizeof (__from) \
                    ? sizeof (__to) : sizeof (__from)); \
  __to; \
})
/* Bit-wise cast vector FROM to type TO_T.

   The object representation of FROM is reinterpreted as TO_T through a
   union; no value conversion takes place.  TO_T and the type of FROM must
   have the same size, which is checked at compile time.  FROM is evaluated
   once (sizeof and __typeof__ do not evaluate their operand).

   __typeof__ is used rather than typeof so that the macro also expands
   correctly when the including file is compiled in a strict ISO mode such
   as -std=c11, where the plain typeof keyword is not available.  */
#define CAST_VECTOR(to_t, from) \
({ \
  _Static_assert (sizeof (to_t) == sizeof (from), \
                  "CAST_VECTOR: source and destination sizes differ"); \
  union { \
    __typeof__ (from) __from; \
    to_t __to; \
  } __tmp; \
  __tmp.__from = (from); \
  __tmp.__to; \
})
/* Condition operand meaning "no extra condition": it expands to the
   enclosing function's lane mask __mask, so macros that AND their condition
   with __mask (VECTOR_COND_MOVE, and VECTOR_RETURN through it) act on every
   lane that is still enabled.  */
#define NO_COND __mask
/* Lane-wise select: the result takes the lanes of VEC1 where COND is set
   and the lanes of VEC2 where COND is clear.

   VEC1 and VEC2 must have compatible types (checked at compile time).  COND
   is converted with __builtin_convertvector to v64si when the operands are
   as large as v64si and to v64di otherwise, and is then used as a bit mask,
   so every lane of COND is expected to be either 0 or -1 (all bits set).
   Each argument is evaluated once; only the selected branch of
   __builtin_choose_expr is evaluated.

   __typeof__ and the two-argument form of _Static_assert are used so the
   macro also expands correctly in strict ISO modes such as -std=c11.

   Note - __mask is _not_ accounted for in VECTOR_MERGE!  */
#define VECTOR_MERGE(vec1, vec2, cond) \
({ \
  _Static_assert (__builtin_types_compatible_p (__typeof__ (vec1), \
                                                __typeof__ (vec2)), \
                  "VECTOR_MERGE: operand types differ"); \
  union { \
    __typeof__ (vec1) val; \
    v64si t_v64si; \
    v64di t_v64di; \
  } __vec1, __vec2, __res; \
  __vec1.val = (vec1); \
  __vec2.val = (vec2); \
  __builtin_choose_expr ( \
    sizeof (vec1) == sizeof (v64si), \
    ({ \
      v64si __bitmask = __builtin_convertvector ((cond), v64si); \
      __res.t_v64si = (__vec1.t_v64si & __bitmask) \
                      | (__vec2.t_v64si & ~__bitmask); \
    }), \
    ({ \
      v64di __bitmask = __builtin_convertvector ((cond), v64di); \
      __res.t_v64di = (__vec1.t_v64di & __bitmask) \
                      | (__vec2.t_v64di & ~__bitmask); \
    })); \
  __res.val; \
})
/* Record RETVAL as the function result for the lanes selected by COND and
   retire those lanes.

   Requires __ret (see FUNCTION_INIT) and the lane mask __mask to be in
   scope.  RETVAL must have the same type as __ret.  COND is converted to
   the type of __mask; VECTOR_COND_MOVE then stores RETVAL into __ret for
   the lanes where both COND and __mask are set, after which the lanes of
   COND are cleared from __mask so later code no longer affects them.
   RETVAL and COND are each evaluated once.

   The converted condition is deliberately not called __cond: that is the
   name VECTOR_COND_MOVE gives its own local, and passing a variable of the
   same name would make that local's initialiser refer to its own name,
   which only works where the compiler resolves it to the outer variable.  */
#define VECTOR_RETURN(retval, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (__typeof__ (retval), \
                                                __typeof__ (__ret)), \
                  "VECTOR_RETURN: value type differs from __ret"); \
  __auto_type __ret_lanes \
    = __builtin_convertvector ((cond), __typeof__ (__mask)); \
  __auto_type __retval = (retval); \
  VECTOR_COND_MOVE (__ret, __retval, __ret_lanes); \
  __mask &= ~__ret_lanes; \
} while (0)
/* Masked assignment: VAR = VAL in every lane where COND is set and the
   lane is enabled in __mask; all other lanes of VAR keep their value.

   VAR and VAL must have compatible types (checked at compile time).  COND
   is converted to the type of __mask before being combined with it.  VAL
   and COND are evaluated once; VAR is read and then written.  */
#define VECTOR_COND_MOVE(var, val, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (var), typeof (val))); \
  __auto_type __move_lanes \
    = __builtin_convertvector ((cond), typeof (__mask)) & __mask; \
  var = VECTOR_MERGE ((val), var, __move_lanes); \
} while (0)
/* Lane-wise if / else-if / else.  Usage:

     VECTOR_IF (c1, cond)
       ...
     VECTOR_ELSEIF (c2, cond)
       ...
     VECTOR_ELSE (cond)
       ...
     VECTOR_ENDIF

   VECTOR_IF opens a scope and declares COND_VAR, holding the lanes for
   which the current branch applies, plus __inv_cond, holding the lanes not
   yet claimed by any earlier branch.  A branch body is skipped entirely
   when ALL_ZEROES_P (COND_VAR) is true; inside a body, COND_VAR is what
   gets passed to VECTOR_COND_MOVE / VECTOR_RETURN to restrict the effect
   to the right lanes.  The same COND_VAR name must be used throughout one
   chain, and all conditions of a chain must have the same vector type.  */
#define VECTOR_IF(cond, cond_var) \
{ \
  __auto_type cond_var = (cond); \
  __auto_type __inv_cond = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* COND is evaluated exactly once.  COND_VAR becomes the unclaimed lanes for
   which COND holds, and those lanes are then removed from __inv_cond:
   __inv_cond & ~(__inv_cond & COND) equals __inv_cond & ~COND.  */
#define VECTOR_ELSEIF(cond, cond_var) \
  } \
  cond_var = __inv_cond & (cond); \
  __inv_cond &= ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* The final branch takes every lane no earlier branch claimed.  */
#define VECTOR_ELSE(cond_var) \
  } \
  cond_var = __inv_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* Nested variants of VECTOR_IF / VECTOR_ELSEIF / VECTOR_ELSE for use inside
   the body of an enclosing vector conditional.  PREV_COND_VAR is the
   condition variable of the enclosing construct; it is converted to the
   type of the new condition and ANDed in, so a nested branch only ever
   covers lanes that are also active in the enclosing branch.  The chain is
   closed with VECTOR_ENDIF.

   VECTOR_IF2 opens a scope and declares COND_VAR and __inv_cond.  Note that
   __inv_cond is the plain complement of COND; the restriction to
   PREV_COND_VAR is applied each time COND_VAR is computed.  COND is
   evaluated exactly once, through the temporary __if2_cond.  */
#define VECTOR_IF2(cond, cond_var, prev_cond_var) \
{ \
  __auto_type __if2_cond = (cond); \
  __auto_type cond_var \
    = __if2_cond \
      & __builtin_convertvector (prev_cond_var, __typeof__ (__if2_cond)); \
  __auto_type __inv_cond = ~__if2_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* COND is evaluated exactly once; the temporary holding it lives in its own
   small scope so that several VECTOR_ELSEIF2 uses in one chain do not
   redeclare it.  */
#define VECTOR_ELSEIF2(cond, cond_var, prev_cond_var) \
  } \
  { \
    __auto_type __elseif2_cond = (cond); \
    cond_var \
      = __elseif2_cond & __inv_cond \
        & __builtin_convertvector (prev_cond_var, \
                                   __typeof__ (__elseif2_cond)); \
    __inv_cond &= ~__elseif2_cond; \
  } \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* The final branch takes the lanes no earlier branch of this chain claimed,
   restricted to the lanes active in PREV_COND_VAR.  */
#define VECTOR_ELSE2(cond_var, prev_cond_var) \
  } \
  cond_var \
    = __inv_cond \
      & __builtin_convertvector (prev_cond_var, __typeof__ (__inv_cond)); \
  if (!ALL_ZEROES_P (cond_var)) \
  {
/* Terminate a VECTOR_IF or VECTOR_IF2 chain: closes the body of the last
   branch and then the scope that the opening macro created for the
   condition variables.  */
#define VECTOR_ENDIF \
} \
}
/* Build a vector of type TYPE with all 64 lanes set to the scalar X.

   X is evaluated exactly once and converted to the element type of TYPE by
   the initialiser.  The initialiser list is written out for 64 lanes, so
   TYPE is expected to be one of the 64-lane vector types.

   __typeof__ is used rather than typeof so that the macro also expands
   correctly in strict ISO modes such as -std=c11.  */
#define VECTOR_INIT_AUX(x, type) \
({ \
  __typeof__ (x) __e = (x); \
  type __tmp = { \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e, \
    __e, __e, __e, __e, __e, __e, __e, __e }; \
  __tmp; \
})
/* Broadcast the scalar X to all lanes of the 64-lane vector whose element
   type matches the type of X: float -> v64sf, double -> v64df,
   int -> v64si, unsigned -> v64usi, long -> v64di.  Any other scalar type
   is a compile-time error because the selection has no default.  */
#define VECTOR_INIT(x) \
  (_Generic ((x), \
             float: VECTOR_INIT_AUX ((x), v64sf), \
             double: VECTOR_INIT_AUX ((x), v64df), \
             int: VECTOR_INIT_AUX ((x), v64si), \
             unsigned: VECTOR_INIT_AUX ((x), v64usi), \
             long: VECTOR_INIT_AUX ((x), v64di)))
/* Number of lanes in the vector type TYPE.

   The lane count is the size of the vector divided by the size of one
   element, taken from the type itself.  This gives the right answer for
   every vector type, including ones with 32-bit lanes that V_SF_SI_P does
   not list.  TYPE is only used inside sizeof, so nothing is evaluated.  */
#define VECTOR_WIDTH(TYPE) (sizeof (TYPE) / sizeof (((TYPE) { 0 })[0]))
/* Nonzero if TYPE is one of the vector types with 32-bit lanes (float, int
   or unsigned int elements), zero otherwise.  The result is a compile-time
   constant, so it can be used with __builtin_choose_expr.

   v64usi is included: it is declared in this file with 32-bit lanes, and
   leaving it out made it count as a 64-bit-lane type.  */
#define V_SF_SI_P(TYPE) \
  (__builtin_types_compatible_p (TYPE, v2sf) \
   || __builtin_types_compatible_p (TYPE, v4sf) \
   || __builtin_types_compatible_p (TYPE, v8sf) \
   || __builtin_types_compatible_p (TYPE, v16sf) \
   || __builtin_types_compatible_p (TYPE, v32sf) \
   || __builtin_types_compatible_p (TYPE, v64sf) \
   || __builtin_types_compatible_p (TYPE, v2si) \
   || __builtin_types_compatible_p (TYPE, v4si) \
   || __builtin_types_compatible_p (TYPE, v8si) \
   || __builtin_types_compatible_p (TYPE, v16si) \
   || __builtin_types_compatible_p (TYPE, v32si) \
   || __builtin_types_compatible_p (TYPE, v64si) \
   || __builtin_types_compatible_p (TYPE, v64usi))
/* Build the initial lane mask for a function operating on vectors of type
   TYPE: the first VECTOR_WIDTH (TYPE) lanes are -1 (enabled) and all
   remaining lanes are 0.

   The result is a v64si when TYPE has 32-bit lanes (V_SF_SI_P) and a v64di
   otherwise.  The enabled lanes are written through the view that is
   returned.  Writing N lanes through the 64-bit view and then returning the
   32-bit view would enable 2 * N lanes, because every 64-bit lane overlays
   two 32-bit lanes.  V_SF_SI_P is a compile-time constant, so the test in
   the loop can be folded away by the compiler.  */
#define VECTOR_INIT_MASK(TYPE) \
({ \
  vector_union __mask; \
  __mask.t_v64di = VECTOR_INIT (0L); \
  for (unsigned int __lane = 0; __lane < VECTOR_WIDTH (TYPE); __lane++) \
    { \
      if (V_SF_SI_P (TYPE)) \
        __mask.t_v64si[__lane] = -1; \
      else \
        __mask.t_v64di[__lane] = -1; \
    } \
  __builtin_choose_expr (V_SF_SI_P (TYPE), __mask.t_v64si, __mask.t_v64di); \
})
/* Nonzero when COND_TO_BITMASK (X) is zero, i.e. when the bitmask computed
   for the condition vector X (restricted to __mask) has no bit set.  */
#define ALL_ZEROES_P(x) (!COND_TO_BITMASK (x))
/* Reduce the condition vector X to a scalar value of type long.

   X is first converted to the type of __mask and ANDed with __mask, so
   lanes that are disabled in __mask cannot contribute.  The masked vector
   is then compared against zero with a single inline-asm instruction:
   v_cmp_ne_u32_e64 when __mask is 256 bytes, v_cmp_ne_u64_e64 otherwise.
   The comparison result is written to the "Sg"-constrained output operand
   and becomes the value of the macro.  Requires __mask to be in scope.

   NOTE(review): presumably the instruction produces one result bit per
   lane, so that a zero result means "no enabled lane is set" (this is how
   ALL_ZEROES_P uses it) -- confirm against the GCN ISA documentation.  */
#define COND_TO_BITMASK(x) \
({ \
long __tmp = 0; \
__auto_type __x = __builtin_convertvector((x), typeof (__mask)) & __mask; \
__builtin_choose_expr (sizeof (__mask) == 256, \
({ asm ("v_cmp_ne_u32_e64 %0, %1, 0" \
: "=Sg" (__tmp) \
: "v" (__x)); }), \
({ asm ("v_cmp_ne_u64_e64 %0, %1, 0" \
: "=Sg" (__tmp) \
: "v" (__x)); })); \
__tmp; \
})
/* Lane-wise while loop.  Usage:

     VECTOR_WHILE (c, cond, prev_cond)
       ...
     VECTOR_ENDWHILE

   VECTOR_WHILE opens a scope and declares COND_VAR, initialised from
   PREV_COND_VAR.  At the top of every iteration COND_VAR is ANDed with
   COND (which is therefore re-evaluated each time round), so a lane that
   has dropped out never re-enters; the loop is left as soon as
   ALL_ZEROES_P (COND_VAR) is true.  Because the body is an ordinary for
   loop, a plain break or continue inside it applies to this loop.  */
#define VECTOR_WHILE(cond, cond_var, prev_cond_var) \
{ \
__auto_type cond_var = prev_cond_var; \
for (;;) { \
cond_var &= (cond); \
if (ALL_ZEROES_P (cond_var)) \
break;
/* Close the loop body and the scope opened by VECTOR_WHILE.  */
#define VECTOR_ENDWHILE \
} \
}
/* Emit the head of the 64-lane single-precision worker for NAME:
   RETTYPE v64sf_NAME_aux (<given parameters>, v64si __mask).  The caller
   supplies the parameter list (at least one parameter) and the function
   body follows the macro invocation.  */
#define DEF_VS_MATH_FUNC(rettype, name, ...) \
  rettype v64sf_##name##_aux (__VA_ARGS__, v64si __mask)
/* Emit the head of the 64-lane double-precision worker for NAME:
   RETTYPE v64df_NAME_aux (<given parameters>, v64di __mask).  The caller
   supplies the parameter list (at least one parameter) and the function
   body follows the macro invocation.  */
#define DEF_VD_MATH_FUNC(rettype, name, ...) \
  rettype v64df_##name##_aux (__VA_ARGS__, v64di __mask)
/* Variant of DEF_VD_MATH_FUNC for predicate functions, which take a vector
   of doubles but return a vector of ints: the worker is still named
   v64df_NAME_aux, but its lane mask is a v64si rather than a v64di.  */
#define DEF_VD_MATH_PRED(rettype, name, ...) \
  rettype v64df_##name##_aux (__VA_ARGS__, v64si __mask)
/* Declare __ret, the variable of type RETTYPE in which VECTOR_RETURN
   accumulates the function result and which FUNCTION_RETURN returns.  The
   expansion has no trailing semicolon and no initialiser.
   NOTE(review): since __ret starts out uninitialised, lanes that no
   VECTOR_RETURN ever writes hold indeterminate values -- confirm callers
   only consume lanes that were enabled in __mask.  */
#define FUNCTION_INIT(rettype) \
rettype __ret
/* Return the result accumulated in __ret (declared by FUNCTION_INIT).  The
   expansion has no trailing semicolon.  */
#define FUNCTION_RETURN \
return __ret
/* Define the COUNT-lane entry point vCOUNT<TARG>_FUN for a one-argument
   function, as a wrapper around the 64-lane worker v64<TARG>_FUN_aux.

   TRET and TARG are the type suffixes (sf, df, si, ...) of the return and
   argument vectors.  The wrapper builds a lane mask enabling the first
   COUNT lanes, widens the argument to 64 lanes, calls the worker and
   narrows its result back to COUNT lanes.  */
#define DEF_VARIANT(FUN, TRET, TARG, COUNT) \
v##COUNT##TRET \
v##COUNT##TARG##_##FUN (v##COUNT##TARG __arg) \
{ \
  __auto_type __mask = VECTOR_INIT_MASK (v##COUNT##TRET); \
  v64##TARG __wide_arg = RESIZE_VECTOR (v64##TARG, __arg); \
  __auto_type __wide_result = v64##TARG##_##FUN##_aux (__wide_arg, __mask); \
  return RESIZE_VECTOR (v##COUNT##TRET, __wide_result); \
}
/* Two-argument counterpart of DEF_VARIANT: define the COUNT-lane entry
   point vCOUNT<TARG>_FUN taking two vCOUNT<TARG> vectors, as a wrapper
   around the 64-lane worker v64<TARG>_FUN_aux.

   The wrapper builds a lane mask enabling the first COUNT lanes, widens
   both arguments to 64 lanes, calls the worker and narrows its result back
   to COUNT lanes.  */
#define DEF_VARIANT2(FUN, TRET, TARG, COUNT) \
v##COUNT##TRET \
v##COUNT##TARG##_##FUN (v##COUNT##TARG __arg1, v##COUNT##TARG __arg2) \
{ \
  __auto_type __mask = VECTOR_INIT_MASK (v##COUNT##TRET); \
  v64##TARG __wide_arg1 = RESIZE_VECTOR (v64##TARG, __arg1); \
  v64##TARG __wide_arg2 = RESIZE_VECTOR (v64##TARG, __arg2); \
  __auto_type __wide_result \
    = v64##TARG##_##FUN##_aux (__wide_arg1, __wide_arg2, __mask); \
  return RESIZE_VECTOR (v##COUNT##TRET, __wide_result); \
}
/* Instantiate DEF_VARIANT for FUN at every supported lane count: 2, 4, 8,
   16, 32 and 64.  RETTYPE and ARGTYPE are the type suffixes passed on as
   TRET and TARG.  */
#define DEF_VARIANTS(FUN, RETTYPE, ARGTYPE) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 2) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 4) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 8) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 16) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 32) \
DEF_VARIANT (FUN, RETTYPE, ARGTYPE, 64)
/* Instantiate DEF_VARIANT2 (the two-argument wrapper) for FUN at every
   supported lane count: 2, 4, 8, 16, 32 and 64.  RETTYPE and ARGTYPE are
   the type suffixes passed on as TRET and TARG.  */
#define DEF_VARIANTS2(FUN, RETTYPE, ARGTYPE) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 2) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 4) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 8) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 16) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 32) \
DEF_VARIANT2 (FUN, RETTYPE, ARGTYPE, 64)