/* Restrict-qualified pointer typedefs used as the parameter types of the
   functions in this file.  `__restrict__` is the GCC spelling of the C99
   `restrict` qualifier: each such pointer is promised not to alias any
   other pointer used to access the same object within the function.  */
typedef int *__restrict__ pRINT;                  /* pointer to int */
typedef unsigned int *__restrict__ pRUINT;        /* pointer to unsigned int */
typedef long long *__restrict__ pRINT64;          /* pointer to long long */
typedef unsigned long long *__restrict__ pRUINT64; /* pointer to unsigned long long */
/* Prototype for the integer absolute-value function, declared here
   directly (no header is included in the visible part of the file).  */
extern int abs (int j);

/* Element-wise OR-NOT over 16 unsigned elements:
   c[k] = a[k] | ~b[k] for k in [0, 16).
   a, b: inputs (read only here); c: output (overwritten).  */
void test_orn (pRUINT a, pRUINT b, pRUINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const unsigned int inverted = ~b[k];
      c[k] = a[k] | inverted;
    }
}

/* Element-wise AND-NOT (bit clear) over 16 unsigned elements:
   c[k] = a[k] & ~b[k] for k in [0, 16).
   a, b: inputs (read only here); c: output (overwritten).  */
void test_bic (pRUINT a, pRUINT b, pRUINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const unsigned int inverted = ~b[k];
      c[k] = a[k] & inverted;
    }
}

/* Multiply-accumulate over 16 signed elements:
   c[k] = c[k] + a[k] * b[k] for k in [0, 16).
   a, b: inputs (read only here); c: accumulator, updated in place.  */
void mla (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int product = a[k] * b[k];
      c[k] = c[k] + product;
    }
}

/* Multiply-subtract over 16 signed elements:
   c[k] = c[k] - a[k] * b[k] for k in [0, 16).
   a, b: inputs (read only here); c: accumulator, updated in place.  */
void mls (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int product = a[k] * b[k];
      c[k] = c[k] - product;
    }
}

/* Element-wise signed maximum over 16 elements:
   c[k] receives the larger of a[k] and b[k] (b[k] when they are equal).
   a, b: inputs (read only here); c: output (overwritten).  */
void smax (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int lhs = a[k];
      const int rhs = b[k];

      if (lhs > rhs)
        c[k] = lhs;
      else
        c[k] = rhs;
    }
}

/* Element-wise signed minimum over 16 elements:
   c[k] receives the smaller of a[k] and b[k] (b[k] when they are equal).
   a, b: inputs (read only here); c: output (overwritten).  */
void smin (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int lhs = a[k];
      const int rhs = b[k];

      if (lhs < rhs)
        c[k] = lhs;
      else
        c[k] = rhs;
    }
}

/* Element-wise unsigned maximum over 16 elements:
   c[k] receives the larger of a[k] and b[k] (b[k] when they are equal).
   a, b: inputs (read only here); c: output (overwritten).  */
void umax (pRUINT a, pRUINT b, pRUINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const unsigned int lhs = a[k];
      const unsigned int rhs = b[k];

      if (lhs > rhs)
        c[k] = lhs;
      else
        c[k] = rhs;
    }
}

/* Element-wise unsigned minimum over 16 elements:
   c[k] receives the smaller of a[k] and b[k] (b[k] when they are equal).
   a, b: inputs (read only here); c: output (overwritten).  */
void umin (pRUINT a, pRUINT b, pRUINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const unsigned int lhs = a[k];
      const unsigned int rhs = b[k];

      if (lhs < rhs)
        c[k] = lhs;
      else
        c[k] = rhs;
    }
}

/* Return the largest of the 16 unsigned elements a[0] .. a[15].
   The running maximum is seeded with a[0], so the scan starts at 1.  */
unsigned int reduce_umax (pRUINT a)
{
  unsigned int best = a[0];

  for (int k = 1; k < 16; ++k)
    {
      const unsigned int candidate = a[k];
      if (!(best > candidate))
        best = candidate;
    }

  return best;
}

/* Return the smallest of the 16 unsigned elements a[0] .. a[15].
   The running minimum is seeded with a[0], so the scan starts at 1.  */
unsigned int reduce_umin (pRUINT a)
{
  unsigned int best = a[0];

  for (int k = 1; k < 16; ++k)
    {
      const unsigned int candidate = a[k];
      if (!(best < candidate))
        best = candidate;
    }

  return best;
}

/* Return the largest of the 16 signed elements a[0] .. a[15].
   The running maximum is seeded with a[0], so the scan starts at 1.  */
int reduce_smax (pRINT a)
{
  int best = a[0];

  for (int k = 1; k < 16; ++k)
    {
      const int candidate = a[k];
      if (!(best > candidate))
        best = candidate;
    }

  return best;
}

/* Return the smallest of the 16 signed elements a[0] .. a[15].
   The running minimum is seeded with a[0], so the scan starts at 1.  */
int reduce_smin (pRINT a)
{
  int best = a[0];

  for (int k = 1; k < 16; ++k)
    {
      const int candidate = a[k];
      if (!(best < candidate))
        best = candidate;
    }

  return best;
}

/* Sum the 16 elements a[0] .. a[15] into an unsigned int accumulator
   and return it.
   NOTE(review): the parameter type is the signed pointer typedef even
   though the name says u32; each element is converted to unsigned int
   by the usual arithmetic conversions before being added, so the sum
   wraps modulo 2^N rather than overflowing.  */
unsigned int reduce_add_u32 (pRINT a)
{
  unsigned int total = 0;

  for (int k = 0; k < 16; ++k)
    total = total + a[k];

  return total;
}

/* Return the signed sum of the 16 elements a[0] .. a[15],
   accumulated in index order starting from zero.  */
int reduce_add_s32 (pRINT a)
{
  int total = 0;

  for (int k = 0; k < 16; ++k)
    total = total + a[k];

  return total;
}

/* Return the unsigned long long sum of the 16 elements a[0] .. a[15],
   accumulated in index order starting from zero.  */
unsigned long long reduce_add_u64 (pRUINT64 a)
{
  unsigned long long total = 0;

  for (int k = 0; k < 16; ++k)
    total = total + a[k];

  return total;
}

/* Return the signed long long sum of the 16 elements a[0] .. a[15],
   accumulated in index order starting from zero.  */
long long reduce_add_s64 (pRINT64 a)
{
  long long total = 0;

  for (int k = 0; k < 16; ++k)
    total = total + a[k];

  return total;
}

/* Element-wise signed absolute difference over 16 elements:
   c[k] = abs (a[k] - b[k]) for k in [0, 16).
   a, b: inputs (read only here); c: output (overwritten).  */
void sabd (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int diff = a[k] - b[k];
      c[k] = abs (diff);
    }
}

/* Element-wise signed absolute difference and accumulate over 16
   elements: c[k] = c[k] + abs (a[k] - b[k]) for k in [0, 16).
   a, b: inputs (read only here); c: accumulator, updated in place.  */
void saba (pRINT a, pRINT b, pRINT c)
{
  for (int k = 0; k < 16; ++k)
    {
      const int diff = a[k] - b[k];
      c[k] = c[k] + abs (diff);
    }
}