/*
 * Copyright 2023 Siemens
 *
 * The authors hereby grant permission to use, copy, modify, distribute,
 * and license this software and its documentation for any purpose, provided
 * that existing copyright notices are retained in all copies and that this
 * notice is included verbatim in any distributions.  No written agreement,
 * license, or royalty fee is required for any of the authorized uses.
 * Modifications to this software may be copyrighted by their authors
 * and need not follow the licensing terms described here, provided that
 * the new terms are clearly indicated on the first page of each file where
 * they apply.
 */

/* Macro library used to help during conversion of scalar math functions to
   vectorized SIMD equivalents on AMD GCN.  */

/* Optimizations at -O2 and above currently result in ICEs (internal
   compiler errors) when converting between vector types, so request -O1
   with the pragma below.  */
#pragma GCC optimize ("O1")

/* GCC generic vector types.  The vector_size attribute gives the total
   size of the vector in bytes.  The vN prefix of each name matches the
   lane count only if float/int are 4 bytes and double/long are 8 bytes
   (the same assumption VECTOR_WIDTH makes).  */

/* Vectors of float.  */
typedef float v2sf __attribute__ ((vector_size (8)));
typedef float v4sf __attribute__ ((vector_size (16)));
typedef float v8sf __attribute__ ((vector_size (32)));
typedef float v16sf __attribute__ ((vector_size (64)));
typedef float v32sf __attribute__ ((vector_size (128)));
typedef float v64sf __attribute__ ((vector_size (256)));

/* Vectors of double.  */
typedef double v2df __attribute__ ((vector_size (16)));
typedef double v4df __attribute__ ((vector_size (32)));
typedef double v8df __attribute__ ((vector_size (64)));
typedef double v16df __attribute__ ((vector_size (128)));
typedef double v32df __attribute__ ((vector_size (256)));
typedef double v64df __attribute__ ((vector_size (512)));

/* Vectors of int.  */
typedef int v2si __attribute__ ((vector_size (8)));
typedef int v4si __attribute__ ((vector_size (16)));
typedef int v8si __attribute__ ((vector_size (32)));
typedef int v16si __attribute__ ((vector_size (64)));
typedef int v32si __attribute__ ((vector_size (128)));
typedef int v64si __attribute__ ((vector_size (256)));

/* Vector of unsigned int.  Only the 256-byte variant is declared.  */
typedef unsigned int v64usi __attribute__ ((vector_size (256)));

/* Vectors of long.  */
typedef long v2di __attribute__ ((vector_size (16)));
typedef long v4di __attribute__ ((vector_size (32)));
typedef long v8di __attribute__ ((vector_size (64)));
typedef long v16di __attribute__ ((vector_size (128)));
typedef long v32di __attribute__ ((vector_size (256)));
typedef long v64di __attribute__ ((vector_size (512)));

/* Union overlaying every vector type declared above.  All members share
   the same storage, so a value stored through one member can be read back
   through another; VECTOR_INIT_MASK uses this to fill a mask through one
   view and return it through the view matching the requested lane width.
   The largest members (t_v64df, t_v64di) are 512 bytes.  */
typedef union {
  /* float lanes.  */
  v2sf t_v2sf;
  v4sf t_v4sf;
  v8sf t_v8sf;
  v16sf t_v16sf;
  v32sf t_v32sf;
  v64sf t_v64sf;

  /* double lanes.  */
  v2df t_v2df;
  v4df t_v4df;
  v8df t_v8df;
  v16df t_v16df;
  v32df t_v32df;
  v64df t_v64df;

  /* int lanes.  */
  v2si t_v2si;
  v4si t_v4si;
  v8si t_v8si;
  v16si t_v16si;
  v32si t_v32si;
  v64si t_v64si;

  /* unsigned int lanes.  */
  v64usi t_v64usi;

  /* long lanes.  */
  v2di t_v2di;
  v4di t_v4di;
  v8di t_v8di;
  v16di t_v16di;
  v32di t_v32di;
  v64di t_v64di;
} vector_union;

/* Cast between vectors with a different number of elements.

   Yields a value of type TO_T whose leading bytes are a copy of FROM.
   If TO_T is smaller than FROM the trailing lanes of FROM are dropped;
   if TO_T is larger the additional lanes are zero.  FROM is evaluated
   exactly once.

   The copy is bounded by the smaller of the two sizes.  Dereferencing a
   (to_t *) cast of the source instead would read past the end of the
   source object when widening, and would also violate strict aliasing.  */

#define RESIZE_VECTOR(to_t, from) \
({ \
  __auto_type __from = (from); \
  to_t __to = { 0 }; \
  __builtin_memcpy (&__to, &__from, \
                    sizeof (__to) < sizeof (__from) \
                    ? sizeof (__to) : sizeof (__from)); \
  __to; \
})

/* Reinterpret the bits of vector FROM as type TO_T.

   No value conversion takes place: FROM is stored into one member of an
   anonymous union and the result is read from the other.  TO_T and FROM
   must have the same size, which is checked at compile time.  FROM is
   evaluated exactly once.  */

#define CAST_VECTOR(to_t, from) \
({ \
  _Static_assert (sizeof (from) == sizeof (to_t)); \
  union { \
    to_t __dst; \
    typeof (from) __src; \
  } __pun = { .__src = (from) }; \
  __pun.__dst; \
})

/* Expands to the identifier __mask, i.e. whichever lane mask is in scope
   at the point of use.  Passed as the COND argument of VECTOR_COND_MOVE or
   VECTOR_RETURN it selects every lane that __mask has set, because those
   macros AND the condition with __mask.  */
#define NO_COND __mask

/* Bit-wise select between two vectors of the same type.

   COND is converted lane by lane to an integer vector and used as a bit
   mask: the result is (VEC1 & mask) | (VEC2 & ~mask).  A lane of COND
   that is -1 therefore yields the lane of VEC1, and a lane that is 0
   yields the lane of VEC2.  VEC1 and VEC2 must have compatible types
   (checked at compile time).  When sizeof (VEC1) equals sizeof (v64si)
   the operation is carried out on v64si, otherwise on v64di.

   Note - __mask is _not_ accounted for in VECTOR_MERGE!  */
#define VECTOR_MERGE(vec1, vec2, cond) \
({ \
  _Static_assert (__builtin_types_compatible_p (typeof (vec1), typeof (vec2))); \
  union { \
    typeof (vec1) val; \
    v64si t_v64si; \
    v64di t_v64di; \
  } __merge_a, __merge_b, __merge_out; \
  __merge_a.val = (vec1); \
  __merge_b.val = (vec2); \
  __builtin_choose_expr ( \
        sizeof (vec1) == sizeof (v64si), \
        ({ \
          v64si __sel = __builtin_convertvector ((cond), v64si); \
          __merge_out.t_v64si = (__sel & __merge_a.t_v64si) \
                                | (~__sel & __merge_b.t_v64si); \
        }), \
        ({ \
          v64di __sel = __builtin_convertvector ((cond), v64di); \
          __merge_out.t_v64di = (__sel & __merge_a.t_v64di) \
                                | (~__sel & __merge_b.t_v64di); \
        })); \
  __merge_out.val; \
})

/* Record RETVAL as the function result for the lanes selected by COND.

   COND is converted to the type of __mask, RETVAL is merged into __ret
   through VECTOR_COND_MOVE (which restricts the store to lanes still set
   in __mask), and the lanes of COND are then cleared from __mask so that
   subsequent masked operations leave them alone.  RETVAL must have the
   same type as __ret (checked at compile time).  Expects __ret and __mask
   to be in scope.  COND is evaluated before RETVAL, each exactly once.  */
#define VECTOR_RETURN(retval, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (retval), typeof (__ret))); \
  __auto_type __ret_lanes = __builtin_convertvector ((cond), typeof (__mask)); \
  __auto_type __ret_value = (retval); \
  VECTOR_COND_MOVE (__ret, __ret_value, __ret_lanes); \
  __mask = __mask & ~__ret_lanes; \
} while (0)

/* Masked assignment: VAR = VAL in the lanes where both COND and __mask
   are set; the remaining lanes of VAR keep their previous contents.

   COND is first converted to the type of __mask.  VAR and VAL must have
   compatible types (checked at compile time).  Expects __mask to be in
   scope.  COND is evaluated before VAL.  */
#define VECTOR_COND_MOVE(var, val, cond) \
do { \
  _Static_assert (__builtin_types_compatible_p (typeof (var), typeof (val))); \
  __auto_type __move_lanes = __builtin_convertvector ((cond), typeof (__mask)); \
  __auto_type __live_lanes = __move_lanes & __mask; \
  var = VECTOR_MERGE ((val), var, __live_lanes); \
} while (0)

/* Vectorized if / else-if / else.  Intended use:

     VECTOR_IF (cond, cond_var)
       ...
     VECTOR_ELSEIF (cond2, cond_var)
       ...
     VECTOR_ELSE (cond_var)
       ...
     VECTOR_ENDIF

   VECTOR_IF opens a new scope and declares COND_VAR, initialised to COND,
   together with __inv_cond, the lanes not yet claimed by any branch.
   VECTOR_ELSEIF and VECTOR_ELSE assign to that same COND_VAR rather than
   declaring a new one, so the same name must be passed throughout.

   Each branch body is guarded by a scalar test and is skipped when
   ALL_ZEROES_P (COND_VAR) holds.  When it is entered it runs once for the
   whole vector; the body itself has to limit its effects to the lanes in
   COND_VAR, e.g. by passing COND_VAR to VECTOR_COND_MOVE.

   Every macro expands its COND argument exactly once.  VECTOR_ELSEIF
   removes the newly claimed lanes from __inv_cond via COND_VAR, which is
   bit-for-bit the same as __inv_cond & ~COND without evaluating COND a
   second time.  */
#define VECTOR_IF(cond, cond_var) \
{ \
  __auto_type cond_var = (cond); \
  __auto_type __inv_cond = ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSEIF(cond, cond_var) \
  } \
  cond_var = __inv_cond & (cond); \
  __inv_cond &= ~cond_var; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

#define VECTOR_ELSE(cond_var) \
  } \
  cond_var = __inv_cond; \
  if (!ALL_ZEROES_P (cond_var)) \
  {

/* Variants of VECTOR_IF / VECTOR_ELSEIF / VECTOR_ELSE that additionally
   restrict each branch to the lanes set in PREV_COND_VAR (converted to the
   vector type of COND, or of __inv_cond in VECTOR_ELSE2).

   VECTOR_IF2 opens a new scope declaring COND_VAR and __inv_cond;
   VECTOR_ELSEIF2 and VECTOR_ELSE2 assign to that same COND_VAR.  Each body
   is skipped when ALL_ZEROES_P (COND_VAR) holds.

   __inv_cond accumulates ~COND without the PREV_COND_VAR restriction; the
   restriction is applied each time COND_VAR is formed.

   NOTE: VECTOR_IF2 and VECTOR_ELSEIF2 expand COND twice as an evaluated
   expression, so COND should be free of side effects.  */
#define VECTOR_IF2(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = (cond) & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __auto_type __inv_cond = ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

/* Next branch: lanes where COND holds, no earlier branch matched, and
   PREV_COND_VAR is set.  */
#define VECTOR_ELSEIF2(cond, cond_var, prev_cond_var) \
  } \
  cond_var = (cond) & __inv_cond & __builtin_convertvector (prev_cond_var, typeof (cond)); \
  __inv_cond &= ~(cond); \
  if (!ALL_ZEROES_P (cond_var)) \
  {

/* Final branch: lanes where no earlier branch matched and PREV_COND_VAR
   is set.  */
#define VECTOR_ELSE2(cond_var, prev_cond_var) \
  } \
  cond_var = __inv_cond & __builtin_convertvector (prev_cond_var, typeof (__inv_cond)); \
  if (!ALL_ZEROES_P (cond_var)) \
  {


/* Terminates a VECTOR_IF or VECTOR_IF2 construct: closes the body of the
   last branch and then the scope opened by VECTOR_IF / VECTOR_IF2.  */
#define VECTOR_ENDIF \
  } \
}

/* Helpers for VECTOR_INIT_AUX: the token E repeated 8 and 64 times as a
   comma-separated list.  */
#define VECTOR_INIT_REP8(e) e, e, e, e, e, e, e, e
#define VECTOR_INIT_REP64(e) \
  VECTOR_INIT_REP8 (e), VECTOR_INIT_REP8 (e), \
  VECTOR_INIT_REP8 (e), VECTOR_INIT_REP8 (e), \
  VECTOR_INIT_REP8 (e), VECTOR_INIT_REP8 (e), \
  VECTOR_INIT_REP8 (e), VECTOR_INIT_REP8 (e)

/* Build a vector of type TYPE from 64 copies of the scalar X.  X is
   evaluated once and stored in a temporary of its own type before being
   used in the initializer.  */
#define VECTOR_INIT_AUX(x, type) \
({ \
  typeof (x) __lane = (x); \
  type __splat = { VECTOR_INIT_REP64 (__lane) }; \
  __splat; \
})

/* Broadcast the scalar X into a 64-lane vector whose element type follows
   the type of X: float -> v64sf, double -> v64df, int -> v64si,
   unsigned -> v64usi, long -> v64di.  No other scalar type has an
   association, so any other type is rejected at compile time.  */
#define VECTOR_INIT(x) \
  (_Generic ((x), float: VECTOR_INIT_AUX ((x), v64sf), \
                  double: VECTOR_INIT_AUX ((x), v64df), \
                  int: VECTOR_INIT_AUX ((x), v64si), \
                  unsigned: VECTOR_INIT_AUX ((x), v64usi), \
                  long: VECTOR_INIT_AUX ((x), v64di)))

/* Number of lanes in vector type TYPE: its size in bytes divided by 4 for
   the types V_SF_SI_P recognises, and by 8 for everything else.  */
#define VECTOR_WIDTH(TYPE) \
  (V_SF_SI_P (TYPE) ? sizeof (TYPE) / 4 : sizeof (TYPE) / 8)

/* Compile-time predicate: 1 if TYPE is one of the float or int vector
   types declared in this file (v2sf..v64sf, v2si..v64si), 0 otherwise.
   v64usi is not in the list, so it yields 0.  */
#define V_SF_SI_P(TYPE) \
  (__builtin_types_compatible_p (TYPE, v2sf) \
   || __builtin_types_compatible_p (TYPE, v2si) \
   || __builtin_types_compatible_p (TYPE, v4sf) \
   || __builtin_types_compatible_p (TYPE, v4si) \
   || __builtin_types_compatible_p (TYPE, v8sf) \
   || __builtin_types_compatible_p (TYPE, v8si) \
   || __builtin_types_compatible_p (TYPE, v16sf) \
   || __builtin_types_compatible_p (TYPE, v16si) \
   || __builtin_types_compatible_p (TYPE, v32sf) \
   || __builtin_types_compatible_p (TYPE, v32si) \
   || __builtin_types_compatible_p (TYPE, v64sf) \
   || __builtin_types_compatible_p (TYPE, v64si))

/* Build the initial lane mask for vector type TYPE: a 64-lane integer
   vector whose first VECTOR_WIDTH (TYPE) lanes are -1 and whose remaining
   lanes are 0.  The result is a v64si when V_SF_SI_P (TYPE) holds and a
   v64di otherwise.

   The lanes are written through the same view that is returned.  Writing
   them through t_v64di and then returning t_v64si would enable two 32-bit
   lanes per requested lane, i.e. twice as many lanes as TYPE has.  */
#define VECTOR_INIT_MASK(TYPE) \
({ \
  vector_union __mask; \
  /* Clear the whole union; t_v64di is one of its largest members.  */ \
  __mask.t_v64di = VECTOR_INIT (0L); \
  for (int __i = 0; __i < (int) VECTOR_WIDTH (TYPE); __i++) \
    { \
      if (V_SF_SI_P (TYPE)) \
        __mask.t_v64si[__i] = -1; \
      else \
        __mask.t_v64di[__i] = -1; \
    } \
  __builtin_choose_expr (V_SF_SI_P (TYPE), __mask.t_v64si, __mask.t_v64di); \
})

/* Non-zero when COND_TO_BITMASK (X) is 0, i.e. when X has no non-zero
   lane among the lanes enabled in __mask.  */
#define ALL_ZEROES_P(x) (!COND_TO_BITMASK (x))

/* Reduce the vector condition X to a scalar `long'.

   X is converted to the type of __mask and ANDed with __mask, so lanes
   that are disabled in __mask are compared as zero.  The masked vector is
   then compared against 0 with a single GCN instruction, chosen by the
   size of __mask: v_cmp_ne_u32_e64 when __mask is 256 bytes (the size of
   v64si), v_cmp_ne_u64_e64 otherwise.  The instruction's output operand
   is returned.

   NOTE(review): the result is presumably one bit per lane, set where the
   lane is non-zero, and "Sg" / "v" are presumably the GCN scalar- and
   vector-register constraints -- confirm against the GCN back-end
   documentation.  ALL_ZEROES_P only relies on the result being 0 when no
   lane is set.

   Expects __mask to be in scope.  The initial 0 stored in __tmp is
   overwritten by the asm output.  */
#define COND_TO_BITMASK(x) \
({ \
  long __tmp = 0; \
  __auto_type __x = __builtin_convertvector((x), typeof (__mask)) & __mask; \
  __builtin_choose_expr (sizeof (__mask) == 256, \
                         ({ asm ("v_cmp_ne_u32_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); }), \
                         ({ asm ("v_cmp_ne_u64_e64 %0, %1, 0" \
                                 : "=Sg" (__tmp) \
                                 : "v" (__x)); })); \
  __tmp; \
})

/* Vectorized while loop.  Intended use:

     VECTOR_WHILE (cond, cond_var, prev_cond_var)
       ...
     VECTOR_ENDWHILE

   Opens a new scope in which COND_VAR starts out as a copy of
   PREV_COND_VAR.  At the top of every iteration COND is re-evaluated and
   ANDed into COND_VAR, so a lane that has dropped out never becomes
   active again.  The loop exits once ALL_ZEROES_P (COND_VAR) holds.  The
   body runs once per iteration for the whole vector and has to limit its
   effects to the lanes in COND_VAR itself.  */
#define VECTOR_WHILE(cond, cond_var, prev_cond_var) \
{ \
  __auto_type cond_var = prev_cond_var; \
  for (;;) { \
    cond_var &= (cond); \
    if (ALL_ZEROES_P (cond_var)) \
      break;

/* Closes the loop body and the scope opened by VECTOR_WHILE.  */
#define VECTOR_ENDWHILE \
  } \
}

/* Function headers for the 64-lane implementations.  Each macro expands to
   `RETTYPE v64<mode>_NAME_aux (<args>, <mask type> __mask)'; the user
   supplies the body.  The trailing __mask parameter is the lane mask that
   the VECTOR_* macros refer to.  */

/* Single-precision function: v64sf_NAME_aux with a v64si mask.  */
#define DEF_VS_MATH_FUNC(rettype, name, ...) \
    rettype v64sf_##name##_aux (__VA_ARGS__, v64si __mask)

/* Double-precision function: v64df_NAME_aux with a v64di mask.  */
#define DEF_VD_MATH_FUNC(rettype, name, ...) \
    rettype v64df_##name##_aux (__VA_ARGS__, v64di __mask)

/* Use this for predicate functions that take a vector of doubles but
   return a vector of ints: v64df_NAME_aux with a v64si mask.  */
#define DEF_VD_MATH_PRED(rettype, name, ...) \
    rettype v64df_##name##_aux (__VA_ARGS__, v64si __mask)

/* Declare __ret, the variable of type RETTYPE that VECTOR_RETURN fills in
   lane by lane.  It is declared without an initializer.  */
#define FUNCTION_INIT(rettype) rettype __ret

/* Return the accumulated result held in __ret.  */
#define FUNCTION_RETURN return __ret

/* Define the COUNT-lane, one-argument entry point v<COUNT><TARG>_<FUN>,
   which takes a v<COUNT><TARG> and returns a v<COUNT><TRET>.

   The argument is resized to 64 lanes, a lane mask for v<COUNT><TRET> is
   built with VECTOR_INIT_MASK, the 64-lane worker v64<TARG>_<FUN>_aux is
   called with both, and its result is resized back to COUNT lanes.  */
#define DEF_VARIANT(FUN, TRET, TARG, COUNT) \
v##COUNT##TRET \
v##COUNT##TARG##_##FUN (v##COUNT##TARG __arg) \
{ \
  __auto_type __wide_arg = RESIZE_VECTOR (v64##TARG, __arg); \
  __auto_type __mask = VECTOR_INIT_MASK (v##COUNT##TRET); \
  return RESIZE_VECTOR (v##COUNT##TRET, \
                        v64##TARG##_##FUN##_aux (__wide_arg, __mask)); \
}

/* Define the COUNT-lane, two-argument entry point v<COUNT><TARG>_<FUN>,
   which takes two v<COUNT><TARG> values and returns a v<COUNT><TRET>.

   Both arguments are resized to 64 lanes, a lane mask for v<COUNT><TRET>
   is built with VECTOR_INIT_MASK, the 64-lane worker v64<TARG>_<FUN>_aux
   is called with all three, and its result is resized back to COUNT
   lanes.  */
#define DEF_VARIANT2(FUN, TRET, TARG, COUNT) \
v##COUNT##TRET \
v##COUNT##TARG##_##FUN (v##COUNT##TARG __arg1, v##COUNT##TARG __arg2) \
{ \
  __auto_type __wide_arg1 = RESIZE_VECTOR (v64##TARG, __arg1); \
  __auto_type __wide_arg2 = RESIZE_VECTOR (v64##TARG, __arg2); \
  __auto_type __mask = VECTOR_INIT_MASK (v##COUNT##TRET); \
  return RESIZE_VECTOR (v##COUNT##TRET, \
                        v64##TARG##_##FUN##_aux (__wide_arg1, __wide_arg2, \
                                                 __mask)); \
}

/* Expand GEN (FUN, RETTYPE, ARGTYPE, N) once for every supported lane
   count N, in increasing order: 2, 4, 8, 16, 32, 64.  */
#define DEF_VARIANTS_FOR_EACH_COUNT(GEN, FUN, RETTYPE, ARGTYPE) \
  GEN (FUN, RETTYPE, ARGTYPE, 2) \
  GEN (FUN, RETTYPE, ARGTYPE, 4) \
  GEN (FUN, RETTYPE, ARGTYPE, 8) \
  GEN (FUN, RETTYPE, ARGTYPE, 16) \
  GEN (FUN, RETTYPE, ARGTYPE, 32) \
  GEN (FUN, RETTYPE, ARGTYPE, 64)

/* Instantiate DEF_VARIANT (one-argument entry point) for every lane
   count.  */
#define DEF_VARIANTS(FUN, RETTYPE, ARGTYPE) \
  DEF_VARIANTS_FOR_EACH_COUNT (DEF_VARIANT, FUN, RETTYPE, ARGTYPE)

/* Instantiate DEF_VARIANT2 (two-argument entry point) for every lane
   count.  */
#define DEF_VARIANTS2(FUN, RETTYPE, ARGTYPE) \
  DEF_VARIANTS_FOR_EACH_COUNT (DEF_VARIANT2, FUN, RETTYPE, ARGTYPE)