1  /* Cell BEA specific SPU intrinsics to PPU/VMX intrinsics
       2     Copyright (C) 2007-2023 Free Software Foundation, Inc.
       3  
       4     This file is free software; you can redistribute it and/or modify it under
       5     the terms of the GNU General Public License as published by the Free
       6     Software Foundation; either version 3 of the License, or (at your option) 
       7     any later version.
       8  
       9     This file is distributed in the hope that it will be useful, but WITHOUT
      10     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
      11     FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      12     for more details.
      13  
      14     Under Section 7 of GPL version 3, you are granted additional
      15     permissions described in the GCC Runtime Library Exception, version
      16     3.1, as published by the Free Software Foundation.
      17  
      18     You should have received a copy of the GNU General Public License and
      19     a copy of the GCC Runtime Library Exception along with this program;
      20     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      21     <http://www.gnu.org/licenses/>.  */
      22  
      23  #ifndef _SI2VMX_H_
      24  #define _SI2VMX_H_	1
      25  
      26  #ifndef __SPU__
      27  
      28  #include <stdlib.h>
      29  #include <vec_types.h>
      30  
      31  
      32  /* Specify a default halt action for spu_hcmpeq and spu_hcmpgt intrinsics.
      33   * Users can override the action by defining it prior to including this 
      34   * header file.
      35   */
      36  #ifndef SPU_HALT_ACTION
      37  #define SPU_HALT_ACTION		abort()
      38  #endif
      39  
      40  /* Specify a default stop action for the spu_stop intrinsic.
      41   * Users can override the action by defining it prior to including this 
      42   * header file.
      43   */
      44  #ifndef SPU_STOP_ACTION
      45  #define SPU_STOP_ACTION		abort()
      46  #endif
      47  
      48  
      49  /* Specify a default action for unsupported intrinsic.
      50   * Users can override the action by defining it prior to including this 
      51   * header file.
      52   */
      53  #ifndef SPU_UNSUPPORTED_ACTION
      54  #define SPU_UNSUPPORTED_ACTION	abort()
      55  #endif
      56  
      57  
      58  /* Casting intrinsics - from scalar to quadword 
      59   */
      60  
/* Place an unsigned char into byte element 3 of a quadword (the SPU
   "preferred slot" for byte values); remaining bytes are uninitialized.  */
static __inline qword si_from_uchar(unsigned char c) {
  union {
    qword q;
    unsigned char c[16];
  } x;
  x.c[3] = c;
  return (x.q);
}

/* Place a signed char into byte element 3 of a quadword; remaining
   bytes are uninitialized.  */
static __inline qword si_from_char(signed char c) {
  union {
    qword q;
    signed char c[16];
  } x;
  x.c[3] = c;
  return (x.q);
}

/* Place an unsigned short into halfword element 1 of a quadword (the
   SPU preferred slot for halfwords); other elements uninitialized.  */
static __inline qword si_from_ushort(unsigned short s) {
  union {
    qword q;
    unsigned short s[8];
  } x;
  x.s[1] = s;
  return (x.q);
}

/* Place a signed short into halfword element 1 of a quadword; other
   elements uninitialized.  */
static __inline qword si_from_short(short s) {
  union {
    qword q;
    short s[8];
  } x;
  x.s[1] = s;
  return (x.q);
}


/* Place an unsigned int into word element 0 of a quadword (the SPU
   preferred slot for words); other elements uninitialized.  */
static __inline qword si_from_uint(unsigned int i) {
  union {
    qword q;
    unsigned int i[4];
  } x;
  x.i[0] = i;
  return (x.q);
}

/* Place a signed int into word element 0 of a quadword; other elements
   uninitialized.  */
static __inline qword si_from_int(int i) {
  union {
    qword q;
    int i[4];
  } x;
  x.i[0] = i;
  return (x.q);
}

/* Place an unsigned long long into doubleword element 0 of a quadword;
   the other doubleword is uninitialized.  */
static __inline qword si_from_ullong(unsigned long long l) {
  union {
    qword q;
    unsigned long long l[2];
  } x;
  x.l[0] = l;
  return (x.q);
}

/* Place a signed long long into doubleword element 0 of a quadword;
   the other doubleword is uninitialized.  */
static __inline qword si_from_llong(long long l) {
  union {
    qword q;
    long long l[2];
  } x;
  x.l[0] = l;
  return (x.q);
}

/* Place a float into word element 0 of a quadword; other elements
   uninitialized.  */
static __inline qword si_from_float(float f) {
  union {
    qword q;
    float f[4];
  } x;
  x.f[0] = f;
  return (x.q);
}

/* Place a double into doubleword element 0 of a quadword; the other
   doubleword is uninitialized.  */
static __inline qword si_from_double(double d) {
  union {
    qword q;
    double d[2];
  } x;
  x.d[0] = d;
  return (x.q);
}

/* Place a pointer into the low-order pointer-sized slot of a quadword
   via union overlay; remaining bytes are uninitialized.  */
static __inline qword si_from_ptr(void *ptr) {
  union {
    qword q;
    void *p;
  } x;
  x.p = ptr;
  return (x.q);
}
     160  
     161  
     162  /* Casting intrinsics - from quadword to scalar
     163   */
/* Extract the unsigned char held in byte element 3 of Q (the SPU
   preferred slot for bytes).  */
static __inline unsigned char si_to_uchar(qword q) {
  union {
    qword q;
    unsigned char c[16];
  } x;
  x.q = q;
  return (x.c[3]);
}

/* Extract the signed char held in byte element 3 of Q.  */
static __inline signed char si_to_char(qword q) {
  union {
    qword q;
    signed char c[16];
  } x;
  x.q = q;
  return (x.c[3]);
}

/* Extract the unsigned short held in halfword element 1 of Q (the SPU
   preferred slot for halfwords).  */
static __inline unsigned short si_to_ushort(qword q) {
  union {
    qword q;
    unsigned short s[8];
  } x;
  x.q = q;
  return (x.s[1]);
}

/* Extract the signed short held in halfword element 1 of Q.  */
static __inline short si_to_short(qword q) {
  union {
    qword q;
    short s[8];
  } x;
  x.q = q;
  return (x.s[1]);
}

/* Extract the unsigned int held in word element 0 of Q (the SPU
   preferred slot for words).  */
static __inline unsigned int si_to_uint(qword q) {
  union {
    qword q;
    unsigned int i[4];
  } x;
  x.q = q;
  return (x.i[0]);
}

/* Extract the signed int held in word element 0 of Q.  */
static __inline int si_to_int(qword q) {
  union {
    qword q;
    int i[4];
  } x;
  x.q = q;
  return (x.i[0]);
}

/* Extract the unsigned long long held in doubleword element 0 of Q.  */
static __inline unsigned long long si_to_ullong(qword q) {
  union {
    qword q;
    unsigned long long l[2];
  } x;
  x.q = q;
  return (x.l[0]);
}

/* Extract the signed long long held in doubleword element 0 of Q.  */
static __inline long long si_to_llong(qword q) {
  union {
    qword q;
    long long l[2];
  } x;
  x.q = q;
  return (x.l[0]);
}

/* Extract the float held in word element 0 of Q.  */
static __inline float si_to_float(qword q) {
  union {
    qword q;
    float f[4];
  } x;
  x.q = q;
  return (x.f[0]);
}

/* Extract the double held in doubleword element 0 of Q.  */
static __inline double si_to_double(qword q) {
  union {
    qword q;
    double d[2];
  } x;
  x.q = q;
  return (x.d[0]);
}

/* Extract the pointer held in the low-order pointer-sized slot of Q.  */
static __inline void * si_to_ptr(qword q) {
  union {
    qword q;
    void *p;
  } x;
  x.q = q;
  return (x.p);
}
     262  
     263  
     264  /* Absolute difference
     265   */
     266  static __inline qword si_absdb(qword a, qword b)
     267  {
     268    vec_uchar16 ac, bc, dc;
     269  
     270    ac = (vec_uchar16)(a);
     271    bc = (vec_uchar16)(b);
     272    dc = vec_sel(vec_sub(bc, ac), vec_sub(ac, bc), vec_cmpgt(ac, bc));
     273  
     274    return ((qword)(dc));
     275  }
     276  
     277  /* Add intrinsics 
     278   */
/* Modular 32-bit word add.  */
#define si_a(_a, _b)		((qword)(vec_add((vec_uint4)(_a), (vec_uint4)(_b))))

/* Modular 16-bit halfword add.  */
#define si_ah(_a, _b)		((qword)(vec_add((vec_ushort8)(_a), (vec_ushort8)(_b))))

/* Add the scalar immediate B to every word element of A.  The scalar is
   first placed in the preferred word slot, then splatted to all lanes.  */
static __inline qword si_ai(qword a, int b)
{
  return ((qword)(vec_add((vec_int4)(a), 
			  vec_splat((vec_int4)(si_from_int(b)), 0))));
}


/* Add the scalar immediate B to every halfword element of A (splatted
   from the preferred halfword slot, element 1).  */
static __inline qword si_ahi(qword a, short b)
{
  return ((qword)(vec_add((vec_short8)(a), 
			  vec_splat((vec_short8)(si_from_short(b)), 1))));
}


/* Single-precision float add, 4 lanes.  */
#define si_fa(_a, _b)	((qword)(vec_add((vec_float4)(_a), (vec_float4)(_b))))
     298  
     299  
     300  static __inline qword si_dfa(qword a, qword b)
     301  {
     302    union {
     303      vec_double2 v;
     304      double d[2];
     305    } ad, bd, dd;
     306  
     307    ad.v = (vec_double2)(a);
     308    bd.v = (vec_double2)(b);
     309    dd.d[0] = ad.d[0] + bd.d[0];
     310    dd.d[1] = ad.d[1] + bd.d[1];
     311  
     312    return ((qword)(dd.v));
     313  }
     314  
     315  /* Add word extended
     316   */
/* Add word extended: (a + b) + (c & 1), i.e. add with a carry-in taken
   from the least-significant bit of each word of C.  */
#define si_addx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_a), (vec_uint4)(_b)), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))


/* Bit-wise AND
 */
#define si_and(_a, _b)		((qword)(vec_and((vec_uint4)(_a), (vec_uint4)(_b))))
     324  
     325  
/* AND every byte element of A with the immediate B (splatted from the
   preferred byte slot, element 3).  */
static __inline qword si_andbi(qword a, signed char b)
{
  return ((qword)(vec_and((vec_char16)(a), 
			  vec_splat((vec_char16)(si_from_char(b)), 3))));
}

/* AND every halfword element of A with the immediate B (splatted from
   the preferred halfword slot, element 1).  */
static __inline qword si_andhi(qword a, signed short b)
{
  return ((qword)(vec_and((vec_short8)(a), 
			  vec_splat((vec_short8)(si_from_short(b)), 1))));
}


/* AND every word element of A with the immediate B (splatted from the
   preferred word slot, element 0).  */
static __inline qword si_andi(qword a, signed int b)
{
  return ((qword)(vec_and((vec_int4)(a),
			  vec_splat((vec_int4)(si_from_int(b)), 0))));
}
     344  
     345  
     346  /* Bit-wise AND with complement
     347   */
/* Bit-wise AND with complement: a & ~b.  */
#define si_andc(_a, _b)		((qword)(vec_andc((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Rounded average of unsigned byte vectors.  */
#define si_avgb(_a, _b)		((qword)(vec_avg((vec_uchar16)(_a), (vec_uchar16)(_b))))


/* Branch indirect and set link on external data.  These SPU channel
   operations have no PPU/VMX equivalent; the macros expand to nothing.  */
#define si_bisled(_func)	/* not mappable */
#define si_bisledd(_func)	/* not mappable */
#define si_bislede(_func)	/* not mappable */


/* Borrow generate: 1 per word if (b - a) does not borrow, via the VMX
   carry-out of b + ~a + 1.  */
#define si_bg(_a, _b)		((qword)(vec_subc((vec_uint4)(_b), (vec_uint4)(_a))))

/* Borrow generate extended: 1 per word when b > a, or b == a and the
   incoming borrow bit (LSB of c) is set.  */
#define si_bgx(_a, _b, _c)	((qword)(vec_and(vec_or(vec_cmpgt((vec_uint4)(_b), (vec_uint4)(_a)),		\
							vec_and(vec_cmpeq((vec_uint4)(_b), (vec_uint4)(_a)), 	\
								(vec_uint4)(_c))), vec_splat_u32(1))))
     370  
     371  /* Compare absolute equal
     372   */
     373  static __inline qword si_fcmeq(qword a, qword b)
     374  {
     375    vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
     376    
     377    return ((qword)(vec_cmpeq(vec_andc((vec_float4)(a), msb), 
     378  				  vec_andc((vec_float4)(b), msb))));
     379  }
     380  
/* Double-precision compare of absolute values for equality.  Each
   64-bit lane of the result is all-ones when |a| == |b| and neither
   input is a NaN, else zero.  Implemented with 32-bit word operations:
   each double occupies a (high word, low word) pair in big-endian
   order.  */
static __inline qword si_dfcmeq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  /* Shuffle pattern replicating each lane's high result word into a
     full 64-bit mask (words 0,4 and 2,6 of the two-operand perm).  */
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes: vec_slo takes its shift count (in bits) from byte
     element 15, i.e. x.i[3]'s low byte on big-endian.  */
  x.i[3] = 4 << 3;

  /*  Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*  A)  Check for bit equality, store in high word.  The 4-byte shift
      pulls each lane's low-word comparison up next to its high word.  */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)aabs,(vec_uint4)babs);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*  
      B)  Check if a is NaN, store in high word
        
      B1) If the high word is greater than max_exp (indicates a NaN)
      B2) If the low word is greater than 0 
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /*  B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /*  result = A and not B  */
  result = vec_andc(biteq, anan);

  /*  Promote high words to 64 bits and return  */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote)));
}
     431  
     432  
     433  /* Compare absolute greater than
     434   */
     435  static __inline qword si_fcmgt(qword a, qword b)
     436  {
     437    vec_float4 msb = (vec_float4)((vec_uint4){0x80000000, 0x80000000, 0x80000000, 0x80000000});
     438    
     439    return ((qword)(vec_cmpgt(vec_andc((vec_float4)(a), msb),
     440  				  vec_andc((vec_float4)(b), msb))));
     441  }
     442  
/* Double-precision compare of absolute values: per-64-bit-lane
   all-ones when |a| > |b| and neither input is NaN, else zero.
   Word-wise implementation: with the sign bit cleared, an IEEE double's
   magnitude ordering equals the unsigned ordering of its bit pattern,
   so high words are compared unsigned, with the low words breaking
   ties.  */
static __inline qword si_dfcmgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes: vec_slo's bit count lives in byte element 15.  */
  x.i[3] = 4 << 3;

  // absolute value of a,b 
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan: exponent all-ones with nonzero mantissa
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // A) Check if the high words compare greater (unsigned)
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aabs,babs);

  // B) Check if high word equal, and low word greater
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aabs, (vec_uint4)babs);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aabs, babs);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  //  If either A or B is true, return true (unless NaNs detected) 
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
     490  
     491  
     492  /* Compare equal
     493   */
/* Per-byte equality: all-ones byte where a == b, else zero.  */
static __inline qword si_ceqb(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* Per-halfword equality mask.  */
static __inline qword si_ceqh(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Per-word equality mask.  */
static __inline qword si_ceq(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_uint4)(a), (vec_uint4)(b))));
}

/* Per-word single-precision float equality mask.  */
static __inline qword si_fceq(qword a, qword b)
{
  return ((qword)(vec_cmpeq((vec_float4)(a), (vec_float4)(b))));
}

/* Per-byte equality against the splatted immediate B.  */
static __inline qword si_ceqbi(qword a, signed char b)
{
  return ((qword)(vec_cmpeq((vec_char16)(a), 
			    vec_splat((vec_char16)(si_from_char(b)), 3))));
}

/* Per-halfword equality against the splatted immediate B.  */
static __inline qword si_ceqhi(qword a, signed short b)
{
  return ((qword)(vec_cmpeq((vec_short8)(a), 
			  vec_splat((vec_short8)(si_from_short(b)), 1))));
}

/* Per-word equality against the splatted immediate B.  */
static __inline qword si_ceqi(qword a, signed int b)
{
  return ((qword)(vec_cmpeq((vec_int4)(a), 
			  vec_splat((vec_int4)(si_from_int(b)), 0))));
}
     531  
/* Double-precision equality.  Each 64-bit lane of the result is
   all-ones when a == b (IEEE semantics: +0 == -0, NaN != anything),
   else zero.  Word-wise implementation over the big-endian
   (high word, low word) pairs of each double.  */
static __inline qword si_dfceq(qword a, qword b)
{
  vec_uint4 sign_mask= (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x00000000, 0x7FF00000, 0x00000000 };
  /* Replicate each lane's high result word across the full 64 bits.  */
  vec_uchar16 hihi_promote = (vec_uchar16) { 0,1,2,3,  16,17,18,19,  8,9,10,11, 24,25,26,27};

  vec_uint4 biteq;
  vec_uint4 aabs;
  vec_uint4 babs;
  vec_uint4 a_gt;
  vec_uint4 ahi_inf;
  vec_uint4 anan;
  vec_uint4 iszero;
  vec_uint4 result;

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes: vec_slo's bit count lives in byte element 15.  */
  x.i[3] = 4 << 3;

  /*  A)  Check for bit equality, store in high word */
  biteq = (vec_uint4) vec_cmpeq((vec_uint4)a,(vec_uint4)b);
  biteq = vec_and(biteq,(vec_uint4)vec_slo((vec_uchar16)biteq,x.v));

  /*  Mask out sign bits */
  aabs = vec_and((vec_uint4)a,sign_mask);
  babs = vec_and((vec_uint4)b,sign_mask);

  /*  
      B)  Check if a is NaN, store in high word
        
      B1) If the high word is greater than max_exp (indicates a NaN)
      B2) If the low word is greater than 0 
  */
  a_gt = (vec_uint4)vec_cmpgt(aabs,nan_mask);

  /*  B3) Check if the high word is equal to the inf exponent */
  ahi_inf = (vec_uint4)vec_cmpeq(aabs,nan_mask);

  /*  anan = B1[hi] or (B2[lo] and B3[hi]) */
  anan = (vec_uint4)vec_or(a_gt,vec_and((vec_uint4)vec_slo((vec_uchar16)a_gt,x.v),ahi_inf));

  /*  C)  Check for 0 = -0 special case: both magnitudes all-zero  */
  iszero =(vec_uint4)vec_cmpeq((vec_uint4)vec_or(aabs,babs),(vec_uint4)vec_splat_u32(0));
  iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));

  /*  result = (A or C) and not B  */
  result = vec_or(biteq,iszero);
  result = vec_andc(result, anan);

  /*  Promote high words to 64 bits and return  */
  return ((qword)(vec_perm((vec_uchar16)result, (vec_uchar16)result, hihi_promote))); 
}
     588  
     589  
     590  /* Compare greater than
     591   */
/* Per-byte signed greater-than mask.  */
static __inline qword si_cgtb(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_char16)(a), (vec_char16)(b))));
}

/* Per-halfword signed greater-than mask.  */
static __inline qword si_cgth(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_short8)(a), (vec_short8)(b))));
}

/* Per-word signed greater-than mask.  */
static __inline qword si_cgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_int4)(a), (vec_int4)(b))));
}

/* Per-byte unsigned (logical) greater-than mask.  */
static __inline qword si_clgtb(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* Per-halfword unsigned greater-than mask.  */
static __inline qword si_clgth(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Per-word unsigned greater-than mask.  */
static __inline qword si_clgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_uint4)(a), (vec_uint4)(b))));
}

/* Per-word single-precision float greater-than mask.  */
static __inline qword si_fcgt(qword a, qword b)
{
  return ((qword)(vec_cmpgt((vec_float4)(a), (vec_float4)(b))));
}
     626  
/* Double-precision signed greater-than.  Each 64-bit lane of the
   result is all-ones when a > b and neither input is NaN, else zero.
   Strategy: convert each double's bit pattern from sign-magnitude to a
   two's-complement-like ordering (negate the magnitude when the sign
   bit is set), after which an unsigned word-pair compare gives the
   correct numeric ordering.  */
static __inline qword si_dfcgt(qword a, qword b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  /* Moves each lane's low-word borrow up into the high word; the 192
     entries select the constant-zero/0xFF bytes merged in via PAT.  */
  vec_uchar16 borrow_shuffle = (vec_uchar16) { 4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192 };
  vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };

  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes: vec_slo's bit count lives in byte element 15.  */
  x.i[3] = 4 << 3;

  // absolute value of a,b 
  vec_uint4 aabs = vec_and((vec_uint4)a, sign_mask);
  vec_uint4 babs = vec_and((vec_uint4)b, sign_mask);

  // check if a is nan: exponent all-ones with nonzero mantissa
  vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
  vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
  a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
  a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi);

  // check if b is nan
  vec_uint4 b_inf = (vec_uint4)vec_cmpeq(babs, nan_mask);
  vec_uint4 b_nan = (vec_uint4)vec_cmpgt(babs, nan_mask);
  b_nan = vec_or(b_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)b_nan,x.v),b_inf));
  b_nan = (vec_uint4)vec_perm((vec_uchar16)b_nan, (vec_uchar16)b_nan, splat_hi);

  // sign of a, splatted across both words of each lane
  vec_uint4 asel = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  asel = (vec_uint4)vec_perm((vec_uchar16)asel,(vec_uchar16)asel,splat_hi);

  // sign of b
  vec_uint4 bsel = (vec_uint4)vec_sra((vec_int4)(b), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  bsel = (vec_uint4)vec_perm((vec_uchar16)bsel,(vec_uchar16)bsel,splat_hi);

  // negative a: 64-bit negate of the magnitude, built from a 32-bit
  // subtract-carry whose borrow is propagated into the high word
  vec_uint4 abor = vec_subc((vec_uint4)vec_splat_u32(0), aabs);
  vec_uchar16 pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), vec_sr(borrow_shuffle, vec_splat_u8(3)), vec_sra(borrow_shuffle, vec_splat_u8(7)));
  abor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)abor, (vec_uchar16)abor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 aneg = vec_add(vec_add(vec_splat_u32(0), vec_nor(aabs, aabs)), vec_and(abor, vec_splat_u32(1)));

  // pick the one we want: magnitude for positive a, negated for negative
  vec_int4 aval = (vec_int4)vec_sel((vec_uchar16)aabs, (vec_uchar16)aneg, (vec_uchar16)asel);

  // negative b
  vec_uint4 bbor = vec_subc((vec_uint4)vec_splat_u32(0), babs);
  bbor = (vec_uint4)(vec_perm(vec_perm((vec_uchar16)bbor, (vec_uchar16)bbor, borrow_shuffle),((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),pat));
  vec_uint4 bneg = vec_add(vec_nor(babs, babs), vec_and(bbor, vec_splat_u32(1)));

  // pick the one we want
  vec_int4 bval=(vec_int4)vec_sel((vec_uchar16)babs, (vec_uchar16)bneg, (vec_uchar16)bsel);

  // A) Check if the high words compare greater (signed)
  vec_uint4 gt_hi = (vec_uint4)vec_cmpgt(aval,bval);

  // B) Check if high word equal, and low word greater (unsigned)
  vec_uint4 gt_lo = (vec_uint4)vec_cmpgt((vec_uint4)aval, (vec_uint4)bval);
  vec_uint4 eq = (vec_uint4)vec_cmpeq(aval, bval);
  vec_uint4 eqgt = vec_and(eq,vec_slo(gt_lo,x.v));

  //  If either A or B is true, return true (unless NaNs detected) 
  vec_uint4 r = vec_or(gt_hi, eqgt);

  // splat the high words of the comparison step
  r = (vec_uint4)vec_perm((vec_uchar16)r,(vec_uchar16)r,splat_hi);

  // correct for NaNs in input
  return ((qword)vec_andc(r,vec_or(a_nan,b_nan)));
}
     700  
/* Per-byte signed greater-than against the splatted immediate B.  */
static __inline qword si_cgtbi(qword a, signed char b)
{
  return ((qword)(vec_cmpgt((vec_char16)(a), 
			    vec_splat((vec_char16)(si_from_char(b)), 3))));
}

/* Per-halfword signed greater-than against the splatted immediate B.  */
static __inline qword si_cgthi(qword a, signed short b)
{
  return ((qword)(vec_cmpgt((vec_short8)(a), 
			    vec_splat((vec_short8)(si_from_short(b)), 1))));
}

/* Per-word signed greater-than against the splatted immediate B.  */
static __inline qword si_cgti(qword a, signed int b)
{
  return ((qword)(vec_cmpgt((vec_int4)(a), 
			    vec_splat((vec_int4)(si_from_int(b)), 0))));
}

/* Per-byte unsigned greater-than against the splatted immediate B.  */
static __inline qword si_clgtbi(qword a, unsigned char b)
{
  return ((qword)(vec_cmpgt((vec_uchar16)(a), 
			    vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* Per-halfword unsigned greater-than against the splatted immediate B.  */
static __inline qword si_clgthi(qword a, unsigned short b)
{
  return ((qword)(vec_cmpgt((vec_ushort8)(a),
			    vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* Per-word unsigned greater-than against the splatted immediate B.  */
static __inline qword si_clgti(qword a, unsigned int b)
{
  return ((qword)(vec_cmpgt((vec_uint4)(a), 
			    vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}
     736  
/* Double-precision test special value.  B is a bit mask selecting which
   classes to test each 64-bit lane of A for:
     0x40 NaN, 0x20 +inf, 0x10 -inf,
     0x08 +0,  0x04 -0,  0x02 +denorm, 0x01 -denorm.
   Each lane of the result is all-ones if A's value belongs to any
   selected class, else zero.  */
static __inline qword si_dftsv(qword a, char b)
{
  vec_uchar16 splat_hi = (vec_uchar16) { 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
  vec_uint4 sign_mask = (vec_uint4) { 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF };
  vec_uint4 result = (vec_uint4){0};
  /* Per-lane sign mask: arithmetic shift of the high word by 31,
     splatted across both words of the lane.  */
  vec_uint4 sign = (vec_uint4)vec_sra((vec_int4)(a), (vec_uint4)vec_splat(((vec_uint4)si_from_int(31)), 0));
  sign = (vec_uint4)vec_perm((vec_uchar16)sign,(vec_uchar16)sign,splat_hi);
  vec_uint4 aabs = vec_and((vec_uint4)a,sign_mask);
  
  union {
    vec_uchar16 v;
    int i[4];
  } x;

  /* Shift 4 bytes: vec_slo's bit count lives in byte element 15.  */
  x.i[3] = 4 << 3;
  
  /* Nan or +inf or -inf  */
  if (b & 0x70)
  {
    vec_uint4 nan_mask = (vec_uint4) { 0x7FF00000, 0x0, 0x7FF00000, 0x0 };
    vec_uint4 a_inf = (vec_uint4)vec_cmpeq(aabs, nan_mask);
     /* NaN: exponent all-ones and mantissa nonzero  */
     if (b & 0x40)
     {
       vec_uint4 a_nan = (vec_uint4)vec_cmpgt(aabs, nan_mask);
       a_nan = vec_or(a_nan, vec_and((vec_uint4)vec_slo((vec_uchar16)a_nan,x.v),a_inf));
       a_nan = (vec_uint4)vec_perm((vec_uchar16)a_nan, (vec_uchar16)a_nan, splat_hi); 
       result = vec_or(result, a_nan);
     }
     /* inf: both words match the infinity pattern exactly  */ 
     if (b & 0x30)
     {
       a_inf = vec_and((vec_uint4)vec_slo((vec_uchar16)a_inf,x.v), a_inf);
       a_inf = (vec_uint4)vec_perm((vec_uchar16)a_inf, (vec_uchar16)a_inf, splat_hi); 
        /* +inf  */
        if (b & 0x20)
          result = vec_or(vec_andc(a_inf, sign), result);
        /* -inf  */
        if (b & 0x10)
          result = vec_or(vec_and(a_inf, sign), result);
     } 
  }
  /* 0 or denorm  */
  if (b & 0xF)
  {
    vec_uint4 iszero =(vec_uint4)vec_cmpeq(aabs,(vec_uint4)vec_splat_u32(0));
    iszero = vec_and(iszero,(vec_uint4)vec_slo((vec_uchar16)iszero,x.v));
    /* denorm: zero exponent, nonzero mantissa (magnitude <= 0xFFFFF:lo
       but not zero)  */
    if (b & 0x3)
    {
      vec_uint4 denorm_mask = (vec_uint4){0xFFFFF, 0xFFFFF, 0xFFFFF, 0xFFFFF};
      vec_uint4 isdenorm = vec_nor((vec_uint4)vec_cmpgt(aabs, denorm_mask), iszero);
      isdenorm = (vec_uint4)vec_perm((vec_uchar16)isdenorm, (vec_uchar16)isdenorm, splat_hi);
      /* +denorm  */
     if (b & 0x2)
        result = vec_or(vec_andc(isdenorm, sign), result);
      /* -denorm  */
     if (b & 0x1)
        result = vec_or(vec_and(isdenorm, sign), result);
    }
    /* 0  */
    if (b & 0xC)
    {
      iszero = (vec_uint4)vec_perm((vec_uchar16)iszero, (vec_uchar16)iszero, splat_hi);
      /* +0  */
     if (b & 0x8)
        result = vec_or(vec_andc(iszero, sign), result);
      /* -0  */
     if (b & 0x4)
        result = vec_or(vec_and(iszero, sign), result);
    }
  }
  return ((qword)result);
}
     812  
     813  
     814  /* Carry generate
     815   */
/* Carry generate: 1 per word when a + b carries out of bit 31.  */
#define si_cg(_a, _b)		((qword)(vec_addc((vec_uint4)(_a), (vec_uint4)(_b))))

/* Carry generate extended: carry-out of a + b + (c & 1), formed as the
   OR of the carry from a+b and the carry from adding the carry-in.  */
#define si_cgx(_a, _b, _c)	((qword)(vec_or(vec_addc((vec_uint4)(_a), (vec_uint4)(_b)), 		\
						vec_addc(vec_add((vec_uint4)(_a), (vec_uint4)(_b)),	\
							 vec_and((vec_uint4)(_c), vec_splat_u32(1))))))
     821  
     822  
     823  /* Count ones for bytes
     824   */
     825  static __inline qword si_cntb(qword a)
     826  {
     827    vec_uchar16 nib_cnt = (vec_uchar16){0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
     828    vec_uchar16 four = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };
     829    vec_uchar16 av;
     830  
     831    av = (vec_uchar16)(a);
     832  
     833    return ((qword)(vec_add(vec_perm(nib_cnt, nib_cnt, av),
     834  			  vec_perm(nib_cnt, nib_cnt, vec_sr (av, four)))));
     835  }
     836  
/* Count leading zeros for words
 */
/* Count leading zeros of each 32-bit word of A.  Per-nibble leading
   zeros come from a lookup table; byte counts are formed by adding the
   low nibble's count only when the high nibble was all zero, and word
   counts by cascading byte counts in the same saturating fashion.  */
static __inline qword si_clz(qword a)
{
  vec_uchar16 av;
  vec_uchar16 cnt_hi, cnt_lo, cnt, tmp1, tmp2, tmp3;
  vec_uchar16 four    = vec_splat_u8(4);
  /* nib_cnt[n] = number of leading zeros in the 4-bit value n.  */
  vec_uchar16 nib_cnt = (vec_uchar16){4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0};
  vec_uchar16 eight   = vec_splat_u8(8);
  vec_uchar16 sixteen = (vec_uchar16){16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16};
  vec_uchar16 twentyfour = (vec_uchar16){24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24};

  av = (vec_uchar16)(a);

  cnt_hi = vec_perm(nib_cnt, nib_cnt, vec_sr(av, four));
  cnt_lo = vec_perm(nib_cnt, nib_cnt, av);

  /* Per-byte count: add the low nibble's count only where the high
     nibble contributed a full 4 (i.e. was zero).  */
  cnt = vec_add(cnt_hi, vec_and(cnt_lo, vec_cmpeq(cnt_hi, four)));

  /* Shift the byte counts up by 1, 2 and 3 byte positions within each
     word so each byte can see its lower neighbors' counts.  */
  tmp1 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(eight));
  tmp2 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(sixteen));
  tmp3 = (vec_uchar16)vec_sl((vec_uint4)(cnt), (vec_uint4)(twentyfour));

  /* Accumulate the next byte's count only where the running count shows
     all preceding bytes were zero (8, 16, 24 respectively).  */
  cnt = vec_add(cnt, vec_and(tmp1, vec_cmpeq(cnt, eight)));
  cnt = vec_add(cnt, vec_and(tmp2, vec_cmpeq(cnt, sixteen)));
  cnt = vec_add(cnt, vec_and(tmp3, vec_cmpeq(cnt, twentyfour)));
  
  /* The full word count has accumulated in each word's top byte; shift
     it down into place.  */
  return (qword)((vec_sr((vec_uint4)(cnt), (vec_uint4)(twentyfour))));
}
     866  
/* Convert to float
 *
 * _b is the scale (number of fraction bits), as for the SPU
 * cuflt/csflt immediates.
 */
#define si_cuflt(_a, _b)	((qword)(vec_ctf((vec_uint4)(_a), _b)))
#define si_csflt(_a, _b)	((qword)(vec_ctf((vec_int4)(_a), _b)))

/* Convert to signed int
 */
#define si_cflts(_a, _b)	((qword)(vec_cts((vec_float4)(_a), _b)))

/* Convert to unsigned int
 */
#define si_cfltu(_a, _b)	((qword)(vec_ctu((vec_float4)(_a), _b)))

/* Synchronize
 *
 * The SPU synchronization instructions are mapped to no-ops on the
 * PPU side.
 */
#define si_dsync()		/* do nothing */
#define si_sync()		/* do nothing */
#define si_syncc()		/* do nothing */
     885  
     886  
     887  /* Equivalence
     888   */
     889  static __inline qword si_eqv(qword a, qword b)
     890  {
     891    vec_uchar16 d;
     892  
     893    d = vec_xor((vec_uchar16)(a), (vec_uchar16)(b));
     894    return ((qword)(vec_nor(d, d)));
     895  }
     896  
/* Extend
 */
/* Sign extend the rightmost byte of each halfword to a full halfword.
 * The perm gathers the odd (low) bytes into the left half so
 * vec_unpackh can widen them.  */
static __inline qword si_xsbh(qword a)
{
  vec_char16 av;

  av = (vec_char16)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){1, 3, 5, 7, 9,11,13,15, 
						              0, 0, 0, 0, 0, 0, 0, 0})))));
}

/* Sign extend the rightmost halfword of each word to a full word.  */
static __inline qword si_xshw(qword a)
{
  vec_short8 av;

  av = (vec_short8)(a);
  return ((qword)(vec_unpackh(vec_perm(av, av, ((vec_uchar16){2, 3, 6, 7, 
					                      10,11,14,15,
							      0, 0, 0, 0, 
						              0, 0, 0, 0})))));
}

/* Sign extend the rightmost word of each doubleword.  vec_sra by 31
 * produces the sign words; the perm interleaves each sign word ahead
 * of the corresponding low word.  */
static __inline qword si_xswd(qword a)
{
  vec_int4 av;

  av = (vec_int4)(a);
  return ((qword)(vec_perm(av, vec_sra(av, ((vec_uint4){31,31,31,31})), 
			   ((vec_uchar16){20, 21, 22, 23,  
					   4,  5,  6,  7, 
				          28, 29, 30, 31, 
				          12, 13, 14, 15}))));
}
     930  
     931  static __inline qword si_fesd(qword a)
     932  {
     933    union {
     934      double d[2];
     935      vec_double2	vd;
     936    } out;
     937    union {
     938      float f[4];
     939      vec_float4 vf;
     940    } in;
     941  
     942    in.vf = (vec_float4)(a);
     943    out.d[0] = (double)(in.f[0]);
     944    out.d[1] = (double)(in.f[2]);
     945    return ((qword)(out.vd));
     946  }
     947  
/* Gather
 *
 * Concatenate the least significant bit of each element into a mask
 * delivered in the preferred (first) word slot of the result.
 */
static __inline qword si_gbb(qword a)
{
  vec_uchar16 bits;
  vec_uint4   bytes;

  /* Move each byte's LSB to a distinct bit position within its
   * 8-byte group, then sum to pack the bits together.  */
  bits  = vec_sl(vec_and((vec_uchar16)(a), vec_splat_u8(1)), ((vec_uchar16){7, 6, 5, 4, 3, 2, 1, 0,
								            7, 6, 5, 4, 3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sum2s((vec_int4)(vec_sum4s(bits, ((vec_uint4){0}))), ((vec_int4){0}));

  /* Merge the two 8-bit halves into the low 16 bits of word slot 0.  */
  return ((qword)(vec_perm(bytes, bytes, ((vec_uchar16){0, 0, 7,15, 0, 0, 0, 0,
					                0, 0, 0, 0, 0, 0, 0, 0}))));
}


/* Gather the LSB of each halfword into an 8-bit mask.  */
static __inline qword si_gbh(qword a)
{
  vec_ushort8 bits;
  vec_uint4   bytes;

  bits  = vec_sl(vec_and((vec_ushort8)(a), vec_splat_u16(1)), ((vec_ushort8){7, 6, 5, 4, 3, 2, 1, 0}));

  bytes = (vec_uint4)vec_sums((vec_int4)(vec_sum4s((vec_short8)(bits), (vec_int4){0})), (vec_int4){0});

  /* vec_sums leaves the total in element 3; rotate it into slot 0.  */
  return ((qword)(vec_sld(bytes, bytes, 12)));
}

/* Gather the LSB of each word into a 4-bit mask.  */
static __inline qword si_gb(qword a)
{
  vec_uint4 bits;
  vec_uint4 bytes;

  bits  = vec_sl(vec_and((vec_uint4)(a), vec_splat_u32(1)), ((vec_uint4){3, 2, 1, 0}));
  bytes = (vec_uint4)vec_sums((vec_int4)(bits), ((vec_int4){0}));
  /* vec_sums leaves the total in element 3; rotate it into slot 0.  */
  return ((qword)(vec_sld(bytes, bytes, 12)));
}
     985  
     986  
     987  /* Compare and halt 
     988   */
     989  static __inline void si_heq(qword a, qword b)
     990  {
     991    union {
     992      vector unsigned int v;
     993      unsigned int i[4];
     994    } aa, bb;
     995  
     996    aa.v = (vector unsigned int)(a);
     997    bb.v = (vector unsigned int)(b);
     998  
     999    if (aa.i[0] == bb.i[0]) { SPU_HALT_ACTION; };
    1000  }
    1001  
    1002  static __inline void si_heqi(qword a, unsigned int b)
    1003  {
    1004    union {
    1005      vector unsigned int v;
    1006      unsigned int i[4];
    1007    } aa;
    1008  
    1009    aa.v = (vector unsigned int)(a);
    1010  
    1011    if (aa.i[0] == b) { SPU_HALT_ACTION; };
    1012  }
    1013  
    1014  static __inline void si_hgt(qword a, qword b)
    1015  {
    1016    union {
    1017      vector signed int v;
    1018      signed int i[4];
    1019    } aa, bb;
    1020  
    1021    aa.v = (vector signed int)(a);
    1022    bb.v = (vector signed int)(b);
    1023  
    1024    if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
    1025  }
    1026  
    1027  static __inline void si_hgti(qword a, signed int b)
    1028  {
    1029    union {
    1030      vector signed int v;
    1031      signed int i[4];
    1032    } aa;
    1033  
    1034    aa.v = (vector signed int)(a);
    1035  
    1036    if (aa.i[0] > b) { SPU_HALT_ACTION; };
    1037  }
    1038  
    1039  static __inline void si_hlgt(qword a, qword b)
    1040  {
    1041    union {
    1042      vector unsigned int v;
    1043      unsigned int i[4];
    1044    } aa, bb;
    1045  
    1046    aa.v = (vector unsigned int)(a);
    1047    bb.v = (vector unsigned int)(b);
    1048  
    1049    if (aa.i[0] > bb.i[0]) { SPU_HALT_ACTION; };
    1050  }
    1051  
    1052  static __inline void si_hlgti(qword a, unsigned int b)
    1053  {
    1054    union {
    1055      vector unsigned int v;
    1056      unsigned int i[4];
    1057    } aa;
    1058  
    1059    aa.v = (vector unsigned int)(a);
    1060  
    1061    if (aa.i[0] > b) { SPU_HALT_ACTION; };
    1062  }
    1063  
    1064  
/* Multiply and Add
 */
/* 16-bit multiply of the lower (odd) halfword of each word, plus c.
 * Masking a's even halfwords to zero makes vec_msum contribute only
 * the odd-halfword product to the accumulation with c.  */
static __inline qword si_mpya(qword a, qword b, qword c)
{
  return ((qword)(vec_msum(vec_and((vec_short8)(a), 
				   ((vec_short8){0, -1, 0, -1, 0, -1, 0, -1})), 
			   (vec_short8)(b), (vec_int4)(c))));
}

/* Single-precision multiply-add: a * b + c.  */
static __inline qword si_fma(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}
    1078  
    1079  static __inline qword si_dfma(qword a, qword b, qword c)
    1080  {
    1081    union {
    1082      vec_double2 v;
    1083      double d[2];
    1084    } aa, bb, cc, dd;
    1085  
    1086    aa.v = (vec_double2)(a);
    1087    bb.v = (vec_double2)(b);
    1088    cc.v = (vec_double2)(c);
    1089    dd.d[0] = aa.d[0] * bb.d[0] + cc.d[0];
    1090    dd.d[1] = aa.d[1] * bb.d[1] + cc.d[1];
    1091    return ((qword)(dd.v));
    1092  }
    1093  
/* Form Mask
 */
#define si_fsmbi(_a)	si_fsmb(si_from_int(_a))

/* Expand the 16-bit mask held in bytes 2-3 of the preferred word into
 * a byte mask: mask bit i selects 0xFF or 0x00 for result byte i.  */
static __inline qword si_fsmb(qword a)
{
  vec_char16 mask;
  vec_ushort8 in;

  in = (vec_ushort8)(a);
  /* Replicate mask byte 2 across bytes 0-7 and byte 3 across 8-15.  */
  mask = (vec_char16)(vec_perm(in, in, ((vec_uchar16){2, 2, 2, 2, 2, 2, 2, 2,
					              3, 3, 3, 3, 3, 3, 3, 3})));
  /* Move each byte's selected bit into the MSB, then arithmetic-shift
   * right to fan it out across the whole byte.  */
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7,
				                      0, 1, 2, 3, 4, 5, 6, 7})),
			  vec_splat_u8(7))));
}


/* Expand the 8-bit mask in byte 3 of the preferred word into a
 * halfword mask.  */
static __inline qword si_fsmh(qword a)
{
  vec_uchar16 in;
  vec_short8 mask;

  in = (vec_uchar16)(a);
  mask = (vec_short8)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_ushort8){0, 1, 2, 3, 4, 5, 6, 7})), 
			  vec_splat_u16(15))));
}

/* Expand the 4-bit mask in byte 3 of the preferred word into a word
 * mask.  */
static __inline qword si_fsm(qword a)
{
  vec_uchar16 in;
  vec_int4 mask;

  in = (vec_uchar16)(a);
  mask = (vec_int4)(vec_splat(in, 3));
  return ((qword)(vec_sra(vec_sl(mask, ((vec_uint4){28, 29, 30, 31})),
			  ((vec_uint4){31,31,31,31}))));
}
    1133  
/* Move from/to registers
 *
 * The SPU FPSCR and special purpose registers are not mapped: reads
 * return zero, writes are discarded.
 */
#define si_fscrrd()		((qword)((vec_uint4){0}))
#define si_fscrwr(_a)

#define si_mfspr(_reg)		((qword)((vec_uint4){0}))
#define si_mtspr(_reg, _a)
    1141  
/* Multiply High High Add
 *
 * vec_mule multiplies the even (upper) halfword of each word; the
 * 32-bit products are accumulated with c.
 */
static __inline qword si_mpyhha(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_short8)(a), (vec_short8)(b)), (vec_int4)(c))));
}

/* Unsigned variant.  */
static __inline qword si_mpyhhau(qword a, qword b, qword c)
{
  return ((qword)(vec_add(vec_mule((vec_ushort8)(a), (vec_ushort8)(b)), (vec_uint4)(c))));
}
    1153  
/* Multiply Subtract
 */
/* Single-precision a * b - c, formed as a * b + (0 - c).  */
static __inline qword si_fms(qword a, qword b, qword c)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), 
			   vec_sub(((vec_float4){0.0f}), (vec_float4)(c)))));
}
    1161  
    1162  static __inline qword si_dfms(qword a, qword b, qword c)
    1163  {
    1164    union {
    1165      vec_double2 v;
    1166      double d[2];
    1167    } aa, bb, cc, dd;
    1168  
    1169    aa.v = (vec_double2)(a);
    1170    bb.v = (vec_double2)(b);
    1171    cc.v = (vec_double2)(c);
    1172    dd.d[0] = aa.d[0] * bb.d[0] - cc.d[0];
    1173    dd.d[1] = aa.d[1] * bb.d[1] - cc.d[1];
    1174    return ((qword)(dd.v));
    1175  }
    1176  
/* Multiply
 */
/* Single-precision multiply: a * b, via madd with a zero addend.  */
static __inline qword si_fm(qword a, qword b)
{
  return ((qword)(vec_madd((vec_float4)(a), (vec_float4)(b), ((vec_float4){0.0f}))));
}
    1183  
    1184  static __inline qword si_dfm(qword a, qword b)
    1185  {
    1186    union {
    1187      vec_double2 v;
    1188      double d[2];
    1189    } aa, bb, dd;
    1190  
    1191    aa.v = (vec_double2)(a);
    1192    bb.v = (vec_double2)(b);
    1193    dd.d[0] = aa.d[0] * bb.d[0];
    1194    dd.d[1] = aa.d[1] * bb.d[1];
    1195    return ((qword)(dd.v));
    1196  }
    1197  
/* Multiply High
 *
 * Forms (a_hi16 * b_lo16) << 16 per word: shifting b left 16 moves
 * its low halfword into the even (upper) position that vec_mule
 * multiplies against a's upper halfword.
 */
static __inline qword si_mpyh(qword a, qword b)
{
  vec_uint4 sixteen = (vec_uint4){16, 16, 16, 16};

  return ((qword)(vec_sl(vec_mule((vec_short8)(a), (vec_short8)(vec_sl((vec_uint4)(b), sixteen))), sixteen)));
}


/* Multiply High High
 *
 * 16 x 16 -> 32 bit product of the upper (even) halfword of each word.
 */
static __inline qword si_mpyhh(qword a, qword b)
{
  return ((qword)(vec_mule((vec_short8)(a), (vec_short8)(b))));
}

static __inline qword si_mpyhhu(qword a, qword b)
{
  return ((qword)(vec_mule((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Multiply Odd
 *
 * 16 x 16 -> 32 bit product of the lower (odd) halfword of each word.
 */
static __inline qword si_mpy(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_short8)(a), (vec_short8)(b))));
}

static __inline qword si_mpyu(qword a, qword b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Multiply by a splatted 16-bit immediate.  */
static __inline qword si_mpyi(qword a, short b)
{
  return ((qword)(vec_mulo((vec_short8)(a), 
			   vec_splat((vec_short8)(si_from_short(b)), 1))));
}

static __inline qword si_mpyui(qword a, unsigned short b)
{
  return ((qword)(vec_mulo((vec_ushort8)(a), 
			   vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* Multiply and Shift Right
 *
 * (a_lo16 * b_lo16) >> 16, arithmetic shift.
 */
static __inline qword si_mpys(qword a, qword b)
{
  return ((qword)(vec_sra(vec_mulo((vec_short8)(a), (vec_short8)(b)), ((vec_uint4){16,16,16,16}))));
}
    1250  
    1251  /* Nand
    1252   */
    1253  static __inline qword si_nand(qword a, qword b)
    1254  {
    1255    vec_uchar16 d;
    1256  
    1257    d = vec_and((vec_uchar16)(a), (vec_uchar16)(b));
    1258    return ((qword)(vec_nor(d, d)));
    1259  }
    1260  
    1261  /* Negative Multiply Add
    1262   */
    1263  static __inline qword si_dfnma(qword a, qword b, qword c)
    1264  {
    1265    union {
    1266      vec_double2 v;
    1267      double d[2];
    1268    } aa, bb, cc, dd;
    1269  
    1270    aa.v = (vec_double2)(a);
    1271    bb.v = (vec_double2)(b);
    1272    cc.v = (vec_double2)(c);
    1273    dd.d[0] = -cc.d[0] - aa.d[0] * bb.d[0];
    1274    dd.d[1] = -cc.d[1] - aa.d[1] * bb.d[1];
    1275    return ((qword)(dd.v));
    1276  }
    1277  
/* Negative Multiply and Subtract
 */
/* Single-precision c - a * b.  */
static __inline qword si_fnms(qword a, qword b, qword c)
{
  return ((qword)(vec_nmsub((vec_float4)(a), (vec_float4)(b), (vec_float4)(c))));
}
    1284  
    1285  static __inline qword si_dfnms(qword a, qword b, qword c)
    1286  {
    1287    union {
    1288      vec_double2 v;
    1289      double d[2];
    1290    } aa, bb, cc, dd;
    1291  
    1292    aa.v = (vec_double2)(a);
    1293    bb.v = (vec_double2)(b);
    1294    cc.v = (vec_double2)(c);
    1295    dd.d[0] = cc.d[0] - aa.d[0] * bb.d[0];
    1296    dd.d[1] = cc.d[1] - aa.d[1] * bb.d[1];
    1297    return ((qword)(dd.v));
    1298  }
    1299  
/* Nor
 */
static __inline qword si_nor(qword a, qword b)
{
  return ((qword)(vec_nor((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* Or
 */
static __inline qword si_or(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), (vec_uchar16)(b))));
}

/* OR with a splatted byte immediate.  */
static __inline qword si_orbi(qword a, unsigned char b)
{
  return ((qword)(vec_or((vec_uchar16)(a), 
			 vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
}

/* OR with a splatted halfword immediate.  */
static __inline qword si_orhi(qword a, unsigned short b)
{
  return ((qword)(vec_or((vec_ushort8)(a), 
			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
}

/* OR with a splatted word immediate.  */
static __inline qword si_ori(qword a, unsigned int b)
{
  return ((qword)(vec_or((vec_uint4)(a), 
			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
}

/* Or Complement
 *
 * a | ~b; nor(b, b) forms the complement.
 */
static __inline qword si_orc(qword a, qword b)
{
  return ((qword)(vec_or((vec_uchar16)(a), vec_nor((vec_uchar16)(b), (vec_uchar16)(b)))));
}
    1338  
    1339  
/* Or Across
 *
 * OR the four words together; the result is left in the preferred
 * (first) word slot with the remaining words cleared.
 */
static __inline qword si_orx(qword a)
{
  vec_uchar16 tmp;
  tmp = (vec_uchar16)(a);
  /* log2 reduction: fold at 8-byte then 4-byte granularity.  */
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 8));
  tmp = vec_or(tmp, vec_sld(tmp, tmp, 4));
  /* Keep only the first word.  */
  return ((qword)(vec_and(tmp, ((vec_uchar16){0xFF,0xFF,0xFF,0xFF, 0x00,0x00,0x00,0x00,
				              0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00}))));
}
    1351  
    1352  
/* Estimates
 */
/* Reciprocal estimate.  */
static __inline qword si_frest(qword a)
{
  return ((qword)(vec_re((vec_float4)(a))));
}

/* Reciprocal square root estimate.  */
static __inline qword si_frsqest(qword a)
{
  return ((qword)(vec_rsqrte((vec_float4)(a))));
}

/* Interpolate: the intermediate result is passed through unchanged
 * (no refinement step is performed for the VMX estimates above).  */
#define si_fi(_a, _d)		(_d)

/* Channel Read and Write
 *
 * SPU channels cannot be mapped: reads and counts return zero, writes
 * are discarded.
 */
#define si_rdch(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_rchcnt(_channel)		((qword)(vec_splat_u8(0)))	/* not mappable */
#define si_wrch(_channel, _a)		/* not mappable */
    1372  
/* Rotate Left
 */
/* Rotate each halfword left by the count in the matching halfword
 * of b.  */
static __inline qword si_roth(qword a, qword b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), (vec_ushort8)(b))));
}

/* Rotate each word left by the count in the matching word of b.  */
static __inline qword si_rot(qword a, qword b)
{
  return ((qword)(vec_rl((vec_uint4)(a), (vec_uint4)(b))));
}

/* Rotate each halfword left by an immediate count.  */
static __inline qword si_rothi(qword a, int b)
{
  return ((qword)(vec_rl((vec_ushort8)(a), 
			 vec_splat((vec_ushort8)(si_from_int(b)), 1))));
}

/* Rotate each word left by an immediate count.  */
static __inline qword si_roti(qword a, int b)
{
  return ((qword)(vec_rl((vec_uint4)(a), 
			 vec_splat((vec_uint4)(si_from_int(b)), 0))));
}
    1396  
/* Rotate Left with Mask
 *
 * These implement the SPU logical shift-right forms: the count is
 * negated, a logical shift right is performed, and the result is
 * zeroed once the count reaches the element width (vec_sr only honors
 * the count modulo the element size).
 */
static __inline qword si_rothm(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* All-ones when bit 4 of the negated count is set (count >= 16).  */
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

static __inline qword si_rotm(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  /* All-ones when bit 5 of the negated count is set (count >= 32).  */
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}

/* Immediate variant: the count is negated at the C level.  */
static __inline qword si_rothmi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sr((vec_ushort8)(a), neg_b), mask)));
}

static __inline qword si_rotmi(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sr((vec_uint4)(a), neg_b), mask)));
}
    1438  
    1439  
/* Rotate Left Algebraic with Mask
 *
 * SPU arithmetic shift-right forms: the count is negated and used for
 * an arithmetic shift right; once the count reaches the element width
 * it is forced to all-ones so the element fills with sign bits.
 */
static __inline qword si_rotmah(qword a, qword b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = (vec_ushort8)vec_sub(vec_splat_s16(0), (vec_short8)(b));
  /* All-ones when the negated count is >= 16.  */
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

static __inline qword si_rotma(qword a, qword b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = (vec_uint4)vec_sub(vec_splat_s32(0), (vec_int4)(b));
  /* All-ones when the negated count is >= 32.  */
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}


/* Immediate variants: the count is negated at the C level.  */
static __inline qword si_rotmahi(qword a, int b)
{
  vec_ushort8 neg_b;
  vec_ushort8 mask;

  neg_b = vec_splat((vec_ushort8)(si_from_int(-b)), 1);
  mask = vec_sra(vec_sl(neg_b, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_sra((vec_short8)(a), (vec_ushort8)vec_or(neg_b, mask))));
}

static __inline qword si_rotmai(qword a, int b)
{
  vec_uint4 neg_b;
  vec_uint4 mask;

  neg_b = vec_splat((vec_uint4)(si_from_int(-b)), 0);
  mask = vec_sra(vec_sl(neg_b, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_sra((vec_int4)(a), (vec_uint4)vec_or(neg_b, mask))));
}
    1482  
    1483  
/* Rotate Left Quadword by Bytes with Mask
 *
 * SPU shift-right-quadword-by-bytes forms: the byte count is negated,
 * scaled to bits for vec_sro, and the entire result is cleared once
 * 16 or more bytes are shifted out.
 */
static __inline qword si_rotqmbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  count = 0 - count;
  /* vec_sro takes its byte count from the low-order byte of the shift
   * vector; the remaining union bytes are don't-cares.  */
  x.i[3] = count << 3;
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}


static __inline qword si_rotqmby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  /* Negate the byte count and scale to bits.  */
  x.i[0] = cnt = (0 - x.i[0]) << 3;

  x.v = vec_splat(x.v, 3);
  /* Clear the result when 16 or more bytes would be shifted out.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
    1519  
    1520  
/* Rotate Left Quadword by Bytes
 *
 * A full rotate is composed from a left shift by N bytes OR'd with a
 * right shift by 16 - N bytes.
 */
static __inline qword si_rotqbyi(qword a, int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } left, right;
 
  /* Scale bytes to bits; vec_slo/vec_sro use only the low-order byte
   * of the shift vector, so the rest of the union is a don't-care.  */
  count <<= 3;
  left.i[3] = count;
  right.i[3] = 0 - count;
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left.v), vec_sro((vec_uchar16)(a), right.v))));
}

static __inline qword si_rotqby(qword a, qword count)
{
  vec_uchar16 left, right;
 
  /* left = byte count * 8; right = its negation (the complementary
   * shift count, modulo 256).  */
  left = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  right = vec_sub(vec_splat_u8(0), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}

/* Rotate Left Quadword by Bytes Bit Count
 *
 * The count is a bit count; only its whole-byte portion is honored by
 * vec_slo/vec_sro.
 */
static __inline qword si_rotqbybi(qword a, qword count)
{
  vec_uchar16 left, right;

  left = vec_splat((vec_uchar16)(count), 3);
  right = vec_sub(vec_splat_u8(7), left);
  return ((qword)(vec_or(vec_slo((vec_uchar16)(a), left), vec_sro((vec_uchar16)(a), right))));
}
    1555  
    1556  
/* Rotate Left Quadword by Bits
 *
 * Only the low 3 bits of the count are used (byte-granular rotates
 * are handled by the routines above).  The bits that wrap around are
 * recovered by moving the quadword right 120 bits and shifting the
 * remaining data right by 8 - count.
 */
static __inline qword si_rotqbii(qword a, int count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;
 
  x = vec_splat((vec_uchar16)(si_from_int(count & 7)), 3);
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}

static __inline qword si_rotqbi(qword a, qword count)
{
  vec_uchar16 x, y;
  vec_uchar16 result;
 
  /* Keep only the bit portion (0..7) of the count.  */
  x = vec_and(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(7));
  y = (vec_uchar16)(vec_sr((vec_uint4)vec_sro((vec_uchar16)(a), ((vec_uchar16)((vec_uint4){0,0,0,120}))),
			   (vec_uint4)vec_sub(vec_splat_u8(8), x)));
  
  result = vec_or(vec_sll((qword)(a), x), y);
  return ((qword)(result));
}
    1583  
    1584  
/* Rotate Left Quadword and Mask by Bits
 *
 * SPU shift-right-quadword forms: the bit count is negated so that
 * vec_srl shifts the quadword right; shifted-out bits are lost.
 */
static __inline qword si_rotqmbii(qword a, int count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_splat((vec_uchar16)(si_from_int(0 - count)), 3))));
}

static __inline qword si_rotqmbi(qword a, qword count)
{
  return ((qword)(vec_srl((vec_uchar16)(a), vec_sub(vec_splat_u8(0), vec_splat((vec_uchar16)(count), 3)))));
}


/* Rotate Left Quadword and Mask by Bytes with Bit Count
 *
 * Only the whole-byte portion of the bit count is used; the result is
 * cleared when 16 or more bytes would be shifted out.
 */
static __inline qword si_rotqmbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  int cnt;
  vec_uchar16 mask;

  x.v = (vec_uchar16)(count);
  /* Drop the sub-byte bits, then negate.  */
  x.i[0] = cnt = 0 - (x.i[0] & ~7);
  x.v = vec_splat(x.v, 3);
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);

  return ((qword)(vec_and(vec_sro((vec_uchar16)(a), x.v), mask)));
}
    1616  
    1617  
    1618  
    1619  
    1620  /* Round Double to Float
    1621   */
    1622  static __inline qword si_frds(qword a)
    1623  {
    1624    union {
    1625      vec_float4 v;
    1626      float f[4];
    1627    } d;
    1628    union {
    1629      vec_double2 v;
    1630      double d[2];
    1631    } in;
    1632  
    1633    in.v = (vec_double2)(a);
    1634    d.v = (vec_float4){0.0f};
    1635    d.f[0] = (float)in.d[0];
    1636    d.f[2] = (float)in.d[1];
    1637  
    1638    return ((qword)(d.v));
    1639  }
    1640  
/* Select Bits
 *
 * Bitwise select: result bit = (c bit == 0) ? a bit : b bit.
 */
static __inline qword si_selb(qword a, qword b, qword c)
{
  return ((qword)(vec_sel((vec_uchar16)(a), (vec_uchar16)(b), (vec_uchar16)(c))));
}


/* Shuffle Bytes
 *
 * vec_perm does the plain shuffle; a second perm emulates the SPU
 * special pattern codes: pattern bytes 0x80-0xBF produce 0x00,
 * 0xC0-0xDF produce 0xFF, and 0xE0-0xFF produce 0x80.
 */
static __inline qword si_shufb(qword a, qword b, qword pattern)
{
  vec_uchar16 pat;

  /* For ordinary pattern bytes (< 0x80) select the shuffled byte via
   * the identity index; for special codes (sign bit set) index the
   * constant table below with pattern >> 3.  */
  pat = vec_sel(((vec_uchar16){0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}), 
		vec_sr((vec_uchar16)(pattern), vec_splat_u8(3)),
		vec_sra((vec_uchar16)(pattern), vec_splat_u8(7)));
  return ((qword)(vec_perm(vec_perm(a, b, pattern), 
			   ((vec_uchar16){0, 0, 0, 0, 0, 0, 0, 0,
				          0xFF, 0xFF, 0xFF, 0xFF, 0x80, 0x80, 0x80, 0x80}),
			   pat)));
}
    1663  
    1664  
/* Shift Left
 *
 * vec_sl uses the count modulo the element width, so a mask zeroes
 * every element whose count reaches or exceeds that width.
 */
static __inline qword si_shlh(qword a, qword b)
{
  vec_ushort8 mask;

  /* All-ones when bit 4 of the count is set (count >= 16).  */
  mask = (vec_ushort8)vec_sra(vec_sl((vec_ushort8)(b), vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), (vec_ushort8)(b)), mask)));
}

static __inline qword si_shl(qword a, qword b)
{
  vec_uint4 mask;

  /* All-ones when bit 5 of the count is set (count >= 32).  */
  mask = (vec_uint4)vec_sra(vec_sl((vec_uint4)(b), ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), (vec_uint4)(b)), mask)));
}


/* Immediate variants: splat the count, then proceed as above.  */
static __inline qword si_shlhi(qword a, unsigned int b)
{
  vec_ushort8 mask;
  vec_ushort8 bv;

  bv = vec_splat((vec_ushort8)(si_from_int(b)), 1);
  mask = (vec_ushort8)vec_sra(vec_sl(bv, vec_splat_u16(11)), vec_splat_u16(15));
  return ((qword)(vec_andc(vec_sl((vec_ushort8)(a), bv), mask)));
}

static __inline qword si_shli(qword a, unsigned int b)
{
  vec_uint4 bv;
  vec_uint4 mask;

  bv = vec_splat((vec_uint4)(si_from_uint(b)), 0);
  mask = (vec_uint4)vec_sra(vec_sl(bv, ((vec_uint4){26,26,26,26})), ((vec_uint4){31,31,31,31}));
  return ((qword)(vec_andc(vec_sl((vec_uint4)(a), bv), mask)));
}
    1703  
    1704  
/* Shift Left Quadword
 *
 * Shift the entire quadword left by bits (vec_sll honors counts of
 * 0..7; byte-granular shifts are handled below).
 */
static __inline qword si_shlqbii(qword a, unsigned int count)
{
  vec_uchar16 x;

  x = vec_splat((vec_uchar16)(si_from_uint(count)), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}

static __inline qword si_shlqbi(qword a, qword count)
{
  vec_uchar16 x;

  /* The count lives in the preferred word slot (byte 3).  */
  x = vec_splat((vec_uchar16)(count), 3);
  return ((qword)(vec_sll((vec_uchar16)(a), x)));
}
    1722  
    1723  
/* Shift Left Quadword by Bytes
 *
 * vec_slo shifts by whole bytes; the result is cleared entirely when
 * the count is 16 bytes or more.
 */
static __inline qword si_shlqbyi(qword a, unsigned int count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  vec_uchar16 mask;

  /* Scale bytes to bits; vec_slo reads the low-order byte only.  */
  x.i[3] = count << 3;
  mask = (count & 0x10) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

static __inline qword si_shlqby(qword a, qword count)
{
  union {
    vec_uchar16 v;
    unsigned int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_sl(vec_splat((vec_uchar16)(count), 3), vec_splat_u8(3));
  cnt = x.i[0];
  /* Bit 7 of the scaled count set means count >= 16 bytes.  */
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}

/* Shift Left Quadword by Bytes with Bit Count
 *
 * The count is already a bit count; only its whole-byte portion is
 * honored by vec_slo.
 */
static __inline qword si_shlqbybi(qword a, qword count)
{
  union {
    vec_uchar16 v;
    int i[4];
  } x;
  unsigned int cnt;
  vec_uchar16 mask;

  x.v = vec_splat((vec_uchar16)(count), 3);
  cnt = x.i[0];
  mask = (cnt & 0x80) ? vec_splat_u8(0) : vec_splat_u8(-1);
  return ((qword)(vec_and(vec_slo((vec_uchar16)(a), x.v), mask)));
}
    1770  
    1771  
/* Stop and Signal
 *
 * The stop type / signal arguments are ignored; the user-overridable
 * SPU_STOP_ACTION macro (see the top of this header) supplies the
 * behavior.
 */
#define si_stop(_type)		SPU_STOP_ACTION
#define si_stopd(a, b, c)	SPU_STOP_ACTION
    1776  
    1777  
/* Subtract
 */
/* Subtract from: b - a, per halfword.  */
static __inline qword si_sfh(qword a, qword b)
{
  return ((qword)(vec_sub((vec_ushort8)(b), (vec_ushort8)(a))));
}

/* Subtract from: b - a, per word.  */
static __inline qword si_sf(qword a, qword b)
{
  return ((qword)(vec_sub((vec_uint4)(b), (vec_uint4)(a))));
}

/* Floating subtract: a - b.  */
static __inline qword si_fs(qword a, qword b)
{
  return ((qword)(vec_sub((vec_float4)(a), (vec_float4)(b))));
}
    1794  
    1795  static __inline qword si_dfs(qword a, qword b)
    1796  {
    1797    union {
    1798      vec_double2 v;
    1799      double d[2];
    1800    } aa, bb, dd;
    1801  
    1802    aa.v = (vec_double2)(a);
    1803    bb.v = (vec_double2)(b);
    1804    dd.d[0] = aa.d[0] - bb.d[0];
    1805    dd.d[1] = aa.d[1] - bb.d[1];
    1806    return ((qword)(dd.v));
    1807  }
    1808  
    1809  static __inline qword si_sfhi(qword a, short b)
    1810  {
    1811    return ((qword)(vec_sub(vec_splat((vec_short8)(si_from_short(b)), 1),
    1812  			  (vec_short8)(a))));
    1813  }
    1814  
    1815  static __inline qword si_sfi(qword a, int b)
    1816  {
    1817    return ((qword)(vec_sub(vec_splat((vec_int4)(si_from_int(b)), 0),
    1818  			  (vec_int4)(a))));
    1819  }
    1820  
/* Subtract word extended
 *
 * Per word element: rt = _b + ~_a + (_c & 1), i.e. a borrow-extended
 * subtract where bit 0 of each word of _c supplies the incoming
 * carry.  vec_nor(_a, _a) yields the one's complement of _a.
 * Implemented as a macro, so arguments may be evaluated more than
 * once; avoid operands with side effects.
 */
#define si_sfx(_a, _b, _c)	((qword)(vec_add(vec_add((vec_uint4)(_b), 				\
							 vec_nor((vec_uint4)(_a), (vec_uint4)(_a))), 	\
						 vec_and((vec_uint4)(_c), vec_splat_u32(1)))))
    1826  
    1827  
/* Sum Bytes into Shorts
 *
 * vec_sum4s adds the four bytes of each word of its input into a
 * 32-bit sum per word.  The vec_perm pattern then builds each result
 * word from the low halfword of B's sum (permute indices >= 16 select
 * from the second operand, sum_b) in the upper position and the low
 * halfword of A's sum in the lower position, matching the SPU sumb
 * layout.
 */
static __inline qword si_sumb(qword a, qword b)
{
  vec_uint4 zero = (vec_uint4){0};
  vec_ushort8 sum_a, sum_b;
  
  sum_a = (vec_ushort8)vec_sum4s((vec_uchar16)(a), zero);
  sum_b = (vec_ushort8)vec_sum4s((vec_uchar16)(b), zero);

  /* Indices 18,19 / 22,23 / ... pick sum_b's low halfwords; 2,3 /
   * 6,7 / ... pick sum_a's.
   */
  return ((qword)(vec_perm(sum_a, sum_b, ((vec_uchar16){18, 19,  2,  3, 22, 23,  6,  7,
					                26, 27, 10, 11, 30, 31, 14, 15}))));
}
    1841  
    1842  /* Exclusive OR
    1843   */
    1844  static __inline qword si_xor(qword a, qword b)
    1845  {
    1846    return ((qword)(vec_xor((vec_uchar16)(a), (vec_uchar16)(b))));
    1847  }
    1848  
    1849  static __inline qword si_xorbi(qword a, unsigned char b)
    1850  {
    1851    return ((qword)(vec_xor((vec_uchar16)(a), 
    1852  			  vec_splat((vec_uchar16)(si_from_uchar(b)), 3))));
    1853  }
    1854  
    1855  static __inline qword si_xorhi(qword a, unsigned short b)
    1856  {
    1857    return ((qword)(vec_xor((vec_ushort8)(a), 
    1858  			  vec_splat((vec_ushort8)(si_from_ushort(b)), 1))));
    1859  }
    1860  
    1861  static __inline qword si_xori(qword a, unsigned int b)
    1862  {
    1863    return ((qword)(vec_xor((vec_uint4)(a), 
    1864  			  vec_splat((vec_uint4)(si_from_uint(b)), 0))));
    1865  }
    1866  
    1867  
/* Generate Controls for Sub-Quadword Insertion
 *
 * These functions build a vec_perm control vector.  The base pattern
 * {0x10..0x1F} selects the second vec_perm operand unchanged; the
 * element store below overwrites the addressed slot with the byte
 * indices of the scalar's preferred slot of the first operand.  The
 * resulting control therefore inserts a byte/halfword/word/doubleword
 * into a quadword at position (address + immediate) mod 16.
 */
static __inline qword si_cbd(qword a, int imm)
{
  union {
    vec_uint4 v;
    unsigned char c[16];
  } shmask;

  /* Identity pattern for the target quadword.  */
  shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
  /* Punch in index 0x03, the preferred byte slot, at the addressed byte.  */
  shmask.c[(si_to_uint(a) + (unsigned int)(imm)) & 0xF] = 0x03;
  return ((qword)(shmask.v));
}
    1881  
    1882  static __inline qword si_cdd(qword a, int imm)
    1883  {
    1884    union {
    1885      vec_uint4 v;
    1886      unsigned long long ll[2];
    1887    } shmask;
    1888  
    1889    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1890    shmask.ll[((si_to_uint(a) + (unsigned int)(imm)) >> 3) & 0x1] = 0x0001020304050607ULL;
    1891    return ((qword)(shmask.v));
    1892  }
    1893  
    1894  static __inline qword si_chd(qword a, int imm)
    1895  {
    1896    union {
    1897      vec_uint4 v;
    1898      unsigned short s[8];
    1899    } shmask;
    1900  
    1901    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1902    shmask.s[((si_to_uint(a) + (unsigned int)(imm)) >> 1) & 0x7] = 0x0203;
    1903    return ((qword)(shmask.v));
    1904  }
    1905  
    1906  static __inline qword si_cwd(qword a, int imm)
    1907  {
    1908    union {
    1909      vec_uint4 v;
    1910      unsigned int i[4];
    1911    } shmask;
    1912  
    1913    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1914    shmask.i[((si_to_uint(a) + (unsigned int)(imm)) >> 2) & 0x3] = 0x00010203;
    1915    return ((qword)(shmask.v));
    1916  }
    1917  
    1918  static __inline qword si_cbx(qword a, qword b)
    1919  {
    1920    union {
    1921      vec_uint4 v;
    1922      unsigned char c[16];
    1923    } shmask;
    1924  
    1925    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1926    shmask.c[si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) & 0xF] = 0x03;
    1927    return ((qword)(shmask.v));
    1928  }
    1929  
    1930  
    1931  static __inline qword si_cdx(qword a, qword b)
    1932  {
    1933    union {
    1934      vec_uint4 v;
    1935      unsigned long long ll[2];
    1936    } shmask;
    1937  
    1938    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1939    shmask.ll[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 3) & 0x1] = 0x0001020304050607ULL;
    1940    return ((qword)(shmask.v));
    1941  }
    1942  
    1943  static __inline qword si_chx(qword a, qword b)
    1944  {
    1945    union {
    1946      vec_uint4 v;
    1947      unsigned short s[8];
    1948    } shmask;
    1949  
    1950    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1951    shmask.s[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 1) & 0x7] = 0x0203;
    1952    return ((qword)(shmask.v));
    1953  }
    1954  
    1955  static __inline qword si_cwx(qword a, qword b)
    1956  {
    1957    union {
    1958      vec_uint4 v;
    1959      unsigned int i[4];
    1960    } shmask;
    1961  
    1962    shmask.v = ((vec_uint4){0x10111213, 0x14151617, 0x18191A1B, 0x1C1D1E1F});
    1963    shmask.i[(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))) >> 2) & 0x3] = 0x00010203;
    1964    return ((qword)(shmask.v));
    1965  }
    1966  
    1967  
    1968  /* Constant Formation
    1969   */
    1970  static __inline qword si_il(signed short imm)
    1971  {
    1972    return ((qword)(vec_splat((vec_int4)(si_from_int((signed int)(imm))), 0)));
    1973  }
    1974  
    1975  
    1976  static __inline qword si_ila(unsigned int imm)
    1977  {
    1978    return ((qword)(vec_splat((vec_uint4)(si_from_uint(imm)), 0)));
    1979  }
    1980  
    1981  static __inline qword si_ilh(signed short imm)
    1982  {
    1983    return ((qword)(vec_splat((vec_short8)(si_from_short(imm)), 1)));
    1984  }
    1985  
    1986  static __inline qword si_ilhu(signed short imm)
    1987  {
    1988    return ((qword)(vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm) << 16)), 0)));
    1989  }
    1990  
    1991  static __inline qword si_iohl(qword a, unsigned short imm)
    1992  {
    1993    return ((qword)(vec_or((vec_uint4)(a), vec_splat((vec_uint4)(si_from_uint((unsigned int)(imm))), 0))));
    1994  }
    1995  
/* No Operation
 *
 * The SPU pipeline no-ops have no PPU counterpart; both expand to
 * nothing.
 */
#define si_lnop()		/* do nothing */
#define si_nop()		/* do nothing */
    2000  
    2001  
    2002  /* Memory Load and Store
    2003   */
    2004  static __inline qword si_lqa(unsigned int imm)
    2005  {
    2006    return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
    2007  }
    2008  
    2009  static __inline qword si_lqd(qword a, unsigned int imm)
    2010  {
    2011    return ((qword)(vec_ld(si_to_uint(a) & ~0xF, (vector unsigned char *)(imm))));
    2012  }
    2013  
    2014  static __inline qword si_lqr(unsigned int imm)
    2015  {
    2016    return ((qword)(vec_ld(0, (vector unsigned char *)(imm))));
    2017  }
    2018  
    2019  static __inline qword si_lqx(qword a, qword b)
    2020  {
    2021    return ((qword)(vec_ld(si_to_uint((qword)(vec_add((vec_uint4)(a), (vec_uint4)(b)))), (vector unsigned char *)(0))));
    2022  }
    2023  
    2024  static __inline void si_stqa(qword a, unsigned int imm)
    2025  {
    2026    vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
    2027  }
    2028  
    2029  static __inline void si_stqd(qword a, qword b, unsigned int imm)
    2030  {
    2031    vec_st((vec_uchar16)(a), si_to_uint(b) & ~0xF, (vector unsigned char *)(imm));
    2032  }
    2033  
    2034  static __inline void si_stqr(qword a, unsigned int imm)
    2035  {
    2036    vec_st((vec_uchar16)(a), 0, (vector unsigned char *)(imm));
    2037  }
    2038  
    2039  static __inline void si_stqx(qword a, qword b, qword c)
    2040  {
    2041    vec_st((vec_uchar16)(a), 
    2042  	 si_to_uint((qword)(vec_add((vec_uint4)(b), (vec_uint4)(c)))),
    2043  	 (vector unsigned char *)(0));
    2044  }
    2045  
    2046  #endif /* !__SPU__ */
    2047  #endif /* !_SI2VMX_H_ */
    2048