/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
# error "This code requires VSX support (POWER7 and later). Please provide the -mvsx compiler flag."
#endif

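/* vec_ld()/vec_st() require 16-byte aligned addresses; the wrappers below use
 * the VSX unaligned load/store instead.  In this file vec_ld_unaligned() is
 * used for the previous-row pointer, which is never realigned, while the
 * current-row pointer is aligned with a scalar prologue before each vector
 * loop.
 */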
#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)


/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * (This is taken from ../intel/filter_sse2_intrinsics.c.)
 */

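/* Shared prologue for the filter functions below: 'rp' walks the current row
 * starting at 'offset', 'pp' walks the previous row, 'unaligned_top' is the
 * number of leading bytes to handle in scalar code until 'rp' reaches a
 * 16-byte boundary (0 if already aligned), and 'istop' is the number of bytes
 * left for the vector loop and its scalar tail.
 */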
#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   size_t unaligned_top = 16 - (((size_t)rp % 16));\
   size_t istop;\
   if(unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if((unaligned_top < istop))\
      istop -= unaligned_top;\
   else{\
      unaligned_top = istop;\
      istop = 0;\
   }

void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* AltiVec operations require 16-byte aligned data, but the input can be
    * unaligned, so process the leading bytes with scalar code until rp
    * reaches a 16-byte boundary.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
   {
      /* If the row's byte count is not divisible by 16, process the
       * remaining bytes with scalar code.
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }

}

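/* vec_perm() selector tables.  A selector byte in 0..15 copies that byte from
 * the first operand (the data vector) and 16 copies byte 0 of the second
 * operand, which is always VSX_CHAR_ZERO here, so 16 zeroes the lane.
 * VSX_LEFTSHIFTEDn_bpp move the bytes of pixel n-1 into the lanes of pixel n
 * (used to fetch the reconstructed left neighbour from the current row, or
 * the above-left bytes from the previous row), and VSX_NOT_SHIFTEDn_bpp pick
 * the previous-row bytes that sit directly above pixel n.
 */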
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};

#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif

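/* The CHAR_TO_SHORT tables widen the bytes of one pixel to zero-extended
 * 16-bit lanes (with endian-specific byte placement) so that the Paeth
 * differences below can be computed in signed 16-bit arithmetic without
 * overflow; the SHORT_TO_CHAR tables narrow the selected predictor back into
 * that pixel's byte lanes.
 */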
#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)

#ifdef PNG_USE_ABS
# define vsx_abs(number) abs(number)
#else
# define vsx_abs(number) ((number) > 0 ? (number) : -(number))
#endif

void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* AltiVec operations require 16-byte aligned data, but the input can be
    * unaligned, so process the leading bytes with scalar code until rp
    * reaches a 16-byte boundary.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
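      /* The first pixel of this 16-byte block depends on the last pixel of
       * the previous block, so reconstruct it in scalar code, then step back
       * bpp bytes so the aligned vector load picks it up in lanes 0..bpp-1
       * together with the not-yet-reconstructed bytes that follow.
       */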
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }

}

void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* AltiVec operations require 16-byte aligned data, but the input can be
    * unaligned, so process the leading bytes with scalar code until rp
    * reaches a 16-byte boundary.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);
      rp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the block must
       * be processed manually.
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

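   /* The first pixel has no left neighbour, so a == 0 and the Average
    * prediction is simply half of the byte above.
    */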
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* AltiVec operations require 16-byte aligned data, but the input can be
    * unaligned, so process the leading bytes with scalar code until rp
    * reaches a 16-byte boundary.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
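      /* vec_avg() rounds up ((x + y + 1) >> 1) but the PNG Average filter
       * needs the truncated (x + y) >> 1, so subtract the extra 1 that was
       * added whenever x + y is odd, i.e. whenever (x ^ y) & 1 is set.
       */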
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* AltiVec operations require 16-byte aligned data, but the input can be
    * unaligned, so process the leading bytes with scalar code until rp
    * reaches a 16-byte boundary.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the block must
       * be processed manually.
       */
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)

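/* Scalar Paeth step for one byte: used wherever a whole vector cannot be
 * processed, i.e. the unaligned head of the row, the first pixel of each
 * 16-byte block, the scalar tail and, for 3-byte pixels, the last byte of
 * each block.
 */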
#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
      }

void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because a and c are zero for the first pixel, so the only candidate
    * predictor is b, the byte above).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

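      /* Vector Paeth, one pixel position at a time: widen a (the already
       * reconstructed pixel to the left, from rp_vec), b (the byte above,
       * from pp_vec) and c (the byte above-left, also from pp_vec) to 16-bit
       * lanes, compute pa = |b-c|, pb = |a-c| and pc = |a+b-2c|, select
       * whichever of a, b, c has the smallest distance (ties prefer a, then
       * b), and add it into rp_vec before moving on to the next pixel.
       */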
      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 3;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because a and c are zero for the first pixel, so the only candidate
    * predictor is b, the byte above).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, the last byte of the block must
       * be processed manually.
       */
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */