1
/* IEEE single precision carries 23 explicit fraction bits.  FRAC is a
   relative tolerance of 2^-20, i.e. a few bits looser than that --
   presumably to absorb rounding differences when a reduction is
   evaluated in a different order (TODO confirm intent).  */
#define FRAC (1.0f / (1 << 20))
typedef float Type;

/* Return nonzero when B matches the reference value A to within a
   relative error of FRAC, and zero otherwise.

   The error is measured against the magnitude of A, so a negative
   reference is held to the same tolerance as a positive one.  A zero
   reference cannot be compared relatively; it matches only a B that is
   also zero.  A NaN in either argument yields zero.  */
int close_enough (Type a, Type b)
{
  Type diff = a - b;
  if (diff < 0)
    diff = -diff;

  /* Magnitude of the reference; dividing by the signed value would make
     the quotient negative (and thus always "small") for negative A.  */
  Type scale = a < 0 ? -a : a;

  /* Avoid 0/0 (NaN) and x/0 (infinity): fall back to an exact test.  */
  if (scale == 0)
    return diff == 0;

  return diff / scale < FRAC;
}
14
15 #define N 100
16
17 static int __attribute__ ((noinline))
18 vector (Type ary[N], Type sum, Type prod)
19 {
20 Type tsum = 0, tprod = 1;
21
22 #pragma acc parallel vector_length(32) copyin(ary[0:N])
23 {
24 #pragma acc loop vector reduction(+:tsum) reduction (*:tprod)
25 for (int ix = 0; ix < N; ix++)
26 {
27 tsum += ary[ix];
28 tprod *= ary[ix];
29 }
30 }
31
32 if (!close_enough (sum, tsum))
33 return 1;
34
35 if (!close_enough (prod, tprod))
36 return 1;
37
38 return 0;
39 }
40
41 static int __attribute__ ((noinline))
42 worker (Type ary[N], Type sum, Type prod)
43 {
44 Type tsum = 0, tprod = 1;
45
46 #pragma acc parallel num_workers(32) copyin(ary[0:N])
47 {
48 #pragma acc loop worker reduction(+:tsum) reduction (*:tprod)
49 for (int ix = 0; ix < N; ix++)
50 {
51 tsum += ary[ix];
52 tprod *= ary[ix];
53 }
54 }
55
56 if (!close_enough (sum, tsum))
57 return 1;
58
59 if (!close_enough (prod, tprod))
60 return 1;
61
62 return 0;
63 }
64
65 static int __attribute__ ((noinline))
66 gang (Type ary[N], Type sum, Type prod)
67 {
68 Type tsum = 0, tprod = 1;
69
70 #pragma acc parallel num_gangs (32) copyin(ary[0:N])
71 {
72 #pragma acc loop gang reduction(+:tsum) reduction (*:tprod)
73 for (int ix = 0; ix < N; ix++)
74 {
75 tsum += ary[ix];
76 tprod *= ary[ix];
77 }
78 }
79
80 if (!close_enough (sum, tsum))
81 return 1;
82
83 if (!close_enough (prod, tprod))
84 return 1;
85
86 return 0;
87 }
88
89 int main (void)
90 {
91 Type ary[N], sum = 0, prod = 1;
92
93 for (int ix = 0; ix < N; ix++)
94 {
95 float frac = ix * (1.0f / 1024) + 1.0f;
96
97 ary[ix] = frac;
98 sum += ary[ix];
99 prod *= ary[ix];
100 }
101
102 if (vector (ary, sum, prod))
103 return 1;
104
105 if (worker (ary, sum, prod))
106 return 1;
107
108 if (gang (ary, sum, prod))
109 return 1;
110
111 return 0;
112 }