1 /* filter_vsx_intrinsics.c - PowerPC optimised filter functions
3 * Copyright (c) 2018 Cosmin Truta
4 * Copyright (c) 2017 Glenn Randers-Pehrson
5 * Written by Vadim Barkov, 2017.
7 * This code is released under the libpng license.
8 * For conditions of distribution and use, see the disclaimer
14 #include "../pngpriv.h"
16 #ifdef PNG_READ_SUPPORTED
18 /* This code requires -maltivec and -mvsx on the command line: */
19 #if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
23 #if PNG_POWERPC_VSX_OPT > 0
26 # error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
/* Load/store a 16-byte vector at a possibly unaligned address.
 * vec_vsx_ld/vec_vsx_st are the VSX unaligned-capable load/store
 * intrinsics, unlike vec_ld/vec_st which require 16-byte alignment.
 * NOTE(review): 'vec' is the assignment target in the load macro, so it
 * is intentionally not parenthesised on the left-hand side.
 */
29 #define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
30 #define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
33 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
34 * They're positioned like this:
37 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
38 * whichever of a, b, or c is closest to p=a+b-c.
39 * ( this is taken from ../intel/filter_sse2_intrinsics.c )
/* Shared prologue for all the filter functions below.  Declares the
 * working row pointer 'rp' (offset past the bytes the caller handles
 * specially), the previous-row pointer 'pp', and computes
 * 'unaligned_top' -- the number of leading bytes that must be handled
 * with scalar code before 'rp' reaches 16-byte alignment (0 when 'rp'
 * is already aligned, hence the '== 16' correction below).  'istop'
 * tracks the number of bytes remaining for the SIMD part.
 * NOTE(review): several continuation lines of this macro (including the
 * declarations of 'i' and 'istop' and the else-branch of the size
 * adjustment) are elided in this listing -- confirm against the full
 * file before relying on the exact control flow.
 */
42 #define vsx_declare_common_vars(row_info,row,prev_row,offset) \
44 png_bytep rp = row + offset;\
45 png_const_bytep pp = prev_row;\
46 size_t unaligned_top = 16 - (((size_t)rp % 16));\
48 if(unaligned_top == 16)\
50 istop = row_info->rowbytes;\
51 if((unaligned_top < istop))\
52 istop -= unaligned_top;\
54 unaligned_top = istop;\
/* Undo the PNG 'Up' filter for one row: each output byte is the sum
 * (mod 256) of the filtered byte and the byte directly above it in
 * prev_row.  Scalar code handles the unaligned head and the tail; the
 * middle is done 16 bytes at a time with a single vector add.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, the rp/pp advances and several braces are not visible
 * here.
 */
58 void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
59 png_const_bytep prev_row)
61 vector unsigned char rp_vec;
62 vector unsigned char pp_vec;
63 vsx_declare_common_vars(row_info,row,prev_row,0)
65 /* Altivec operations require 16-byte aligned data
66 * but input can be unaligned. So we calculate
67 * unaligned part as usual.
69 for (i = 0; i < unaligned_top; i++)
   /* Scalar Up step: *rp += byte above; wraps mod 256. */
71 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
75 /* Using SIMD while we can */
   /* rp is now 16-byte aligned (vec_ld); pp may still be unaligned. */
78 rp_vec = vec_ld(0,rp);
79 vec_ld_unaligned(pp_vec,pp);
   /* Per-byte modular add of a whole 16-byte chunk. */
81 rp_vec = vec_add(rp_vec,pp_vec);
92 /* If byte count of row is not divisible by 16
93 * we will process remaining part as usual
95 for (i = 0; i < istop; i++)
97 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
/* Permute-control vectors for vec_perm(data, VSX_CHAR_ZERO, control):
 * control indices 0..15 select bytes from the first operand (the data
 * vector) and 16..31 select from the second operand.  Since the second
 * operand is always VSX_CHAR_ZERO here, the value 16 means "insert a
 * zero byte".
 *
 * VSX_LEFTSHIFTEDn_bpp moves pixel n of a 16-byte chunk into the byte
 * positions of pixel n+1 (supplying the "left neighbour" bytes);
 * VSX_NOT_SHIFTEDn_bpp extracts pixel n+1 in place (used for the
 * previous-row bytes).  The _4 tables are for 4-byte pixels, the _3
 * tables for 3-byte pixels.
 */
104 static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
105 static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
106 static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};
108 static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
109 static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
110 static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
111 static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};
113 static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
114 static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
115 static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};
117 static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
118 static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
119 static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
120 static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};
/* All-zero second operand for the vec_perm calls above. */
122 static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
123 #ifdef __LITTLE_ENDIAN__
/* CHAR_TO_SHORT tables zero-extend one pixel's bytes into 16-bit lanes;
 * on little-endian the data byte goes in the low (first) byte of each
 * short.  SHORT_TO_CHAR tables narrow the shorts back, taking the low
 * byte of each lane and placing the pixel at its in-row position.
 */
125 static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
126 static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
127 static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};
129 static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
130 static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
131 static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};
133 static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
134 static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
135 static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
136 static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};
138 static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
139 static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
140 static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
141 static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};
143 #elif defined(__BIG_ENDIAN__)
/* Big-endian variants: the data byte sits in the high (first) byte of
 * each 16-bit lane, so the byte offsets are shifted by one relative to
 * the little-endian tables above.
 */
145 static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
146 static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
147 static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};
149 static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
150 static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
151 static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};
153 static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
154 static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
155 static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
156 static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};
158 static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
159 static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
160 static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
161 static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};
/* Widen one pixel's bytes to unsigned shorts / narrow them back, using
 * the endian-specific permute tables above.  'offset' (1..4) selects
 * which pixel of the 16-byte chunk, 'bpp' is bytes per pixel (3 or 4);
 * both are pasted into the table name at preprocessing time.
 */
165 #define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
166 #define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
/* Absolute value of a small int expression, used by vsx_paeth_process.
 * Fixed: the fallback expansion was neither parenthesised as a whole
 * nor did it parenthesise its argument in the comparison, so an
 * expression such as '1 + vsx_abs(x)' parsed as '(1 + (x > 0)) ? x : -x'.
 * Both the argument and the full expansion are now parenthesised.
 * NOTE(review): the #if/#else that selects between the abs() form and
 * the open-coded form is not visible in this elided listing.
 */
# define vsx_abs(number) abs(number)
# define vsx_abs(number) ((number) > 0 ? (number) : -(number))
/* Undo the PNG 'Sub' filter for 4-byte pixels: each byte is the sum
 * (mod 256) of the filtered byte and the corresponding byte of the pixel
 * to its left (rp - bpp).  The vector part computes a 4-pixel prefix sum
 * within each 16-byte chunk via a cascade of shifted permutes and adds.
 * prev_row is unused by the Sub filter but kept for the common signature.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
174 void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
175 png_const_bytep prev_row)
179 vector unsigned char rp_vec;
180 vector unsigned char part_vec;
182 vsx_declare_common_vars(row_info,row,prev_row,bpp)
186 /* Altivec operations require 16-byte aligned data
187 * but input can be unaligned. So we calculate
188 * unaligned part as usual.
190 for (i = 0; i < unaligned_top; i++)
192 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
196 /* Using SIMD while we can */
   /* Finish the pixel that straddles the alignment boundary so the
    * chunk's first pixel has a fully-computed left neighbour. */
199 for(i=0;i < bpp ; i++)
201 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
206 rp_vec = vec_ld(0,rp);
   /* Add pixel 1 into pixel 2's position, then the updated pixel 2 into
    * pixel 3, then pixel 3 into pixel 4 -- a serial prefix sum, since
    * each Sub result depends on the previous pixel's result. */
207 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
208 rp_vec = vec_add(rp_vec,part_vec);
210 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
211 rp_vec = vec_add(rp_vec,part_vec);
213 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
214 rp_vec = vec_add(rp_vec,part_vec);
   /* Scalar tail for the bytes left over after the last full chunk. */
223 for (i = 0; i < istop % 16; i++)
225 *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
/* Undo the PNG 'Sub' filter for 3-byte pixels.  Same structure as the
 * 4-bpp variant, but a 16-byte chunk holds five 3-byte pixels plus one
 * leftover byte, so four permute/add stages are needed and the 16th
 * byte is finished with a scalar step.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
231 void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
232 png_const_bytep prev_row)
236 vector unsigned char rp_vec;
237 vector unsigned char part_vec;
239 vsx_declare_common_vars(row_info,row,prev_row,bpp)
243 /* Altivec operations require 16-byte aligned data
244 * but input can be unaligned. So we calculate
245 * unaligned part as usual.
247 for (i = 0; i < unaligned_top; i++)
249 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
253 /* Using SIMD while we can */
   /* Finish the pixel straddling the alignment boundary first. */
256 for(i=0;i < bpp ; i++)
258 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
263 rp_vec = vec_ld(0,rp);
   /* Serial prefix sum over the chunk's pixels (each stage feeds the
    * next, as required by the Sub recurrence). */
264 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
265 rp_vec = vec_add(rp_vec,part_vec);
267 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
268 rp_vec = vec_add(rp_vec,part_vec);
270 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
271 rp_vec = vec_add(rp_vec,part_vec);
273 part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
274 rp_vec = vec_add(rp_vec,part_vec);
280 /* Since 16 % bpp = 16 % 3 = 1, last element of array must
281 * be processed manually
283 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
   /* Scalar tail for the remaining bytes. */
288 for (i = 0; i < istop % 16; i++)
290 *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
/* Undo the PNG 'Avg' filter for 4-byte pixels: each byte is the sum
 * (mod 256) of the filtered byte and floor((a + b) / 2), where a is the
 * byte bpp to the left and b the byte above.  The first pixel of the
 * row has no left neighbour, so only b/2 is added.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
295 void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
296 png_const_bytep prev_row)
300 vector unsigned char rp_vec;
301 vector unsigned char pp_vec;
302 vector unsigned char pp_part_vec;
303 vector unsigned char rp_part_vec;
304 vector unsigned char avg_vec;
306 vsx_declare_common_vars(row_info,row,prev_row,bpp)
   /* First pixel: no left neighbour, predictor is b/2 only. */
311 for (i = 0; i < bpp; i++)
313 *rp = (png_byte)(((int)(*rp) +
314 ((int)(*pp++) / 2 )) & 0xff);
319 /* Altivec operations require 16-byte aligned data
320 * but input can be unaligned. So we calculate
321 * unaligned part as usual.
323 for (i = 0; i < unaligned_top; i++)
325 *rp = (png_byte)(((int)(*rp) +
326 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
331 /* Using SIMD while we can */
   /* Finish the pixel straddling the alignment boundary first. */
334 for(i=0;i < bpp ; i++)
336 *rp = (png_byte)(((int)(*rp) +
337 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
344 vec_ld_unaligned(pp_vec,pp);
345 rp_vec = vec_ld(0,rp);
   /* Per pixel: a = just-computed left pixel (LEFTSHIFTED), b = byte
    * above (NOT_SHIFTED).  vec_avg rounds up ((a+b+1)>>1); subtracting
    * (a^b)&1 converts that to the truncating average PNG requires. */
347 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
348 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
349 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
350 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
351 rp_vec = vec_add(rp_vec,avg_vec);
353 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
354 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
355 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
356 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
357 rp_vec = vec_add(rp_vec,avg_vec);
359 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
360 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
361 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
362 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
363 rp_vec = vec_add(rp_vec,avg_vec);
   /* Scalar tail for the bytes after the last full 16-byte chunk. */
373 for (i = 0; i < istop % 16; i++)
375 *rp = (png_byte)(((int)(*rp) +
376 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
/* Undo the PNG 'Avg' filter for 3-byte pixels.  Same structure as the
 * 4-bpp variant, but with four permute/average stages per 16-byte chunk
 * (five 3-byte pixels) and a scalar step for the leftover 16th byte.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
382 void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
383 png_const_bytep prev_row)
387 vector unsigned char rp_vec;
388 vector unsigned char pp_vec;
389 vector unsigned char pp_part_vec;
390 vector unsigned char rp_part_vec;
391 vector unsigned char avg_vec;
393 vsx_declare_common_vars(row_info,row,prev_row,bpp)
   /* First pixel: no left neighbour, predictor is b/2 only. */
398 for (i = 0; i < bpp; i++)
400 *rp = (png_byte)(((int)(*rp) +
401 ((int)(*pp++) / 2 )) & 0xff);
406 /* Altivec operations require 16-byte aligned data
407 * but input can be unaligned. So we calculate
408 * unaligned part as usual.
410 for (i = 0; i < unaligned_top; i++)
412 *rp = (png_byte)(((int)(*rp) +
413 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
418 /* Using SIMD while we can */
   /* Finish the pixel straddling the alignment boundary first. */
421 for(i=0;i < bpp ; i++)
423 *rp = (png_byte)(((int)(*rp) +
424 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
431 vec_ld_unaligned(pp_vec,pp);
432 rp_vec = vec_ld(0,rp);
   /* Per pixel: vec_avg rounds up; subtracting (a^b)&1 yields the
    * truncating average the PNG Avg filter requires. */
434 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
435 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
436 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
437 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
438 rp_vec = vec_add(rp_vec,avg_vec);
440 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
441 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
442 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
443 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
444 rp_vec = vec_add(rp_vec,avg_vec);
446 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
447 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
448 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
449 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
450 rp_vec = vec_add(rp_vec,avg_vec);
452 rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
453 pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
454 avg_vec = vec_avg(rp_part_vec,pp_part_vec);
455 avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
456 rp_vec = vec_add(rp_vec,avg_vec);
464 /* Since 16 % bpp = 16 % 3 = 1, last element of array must
465 * be processed manually
467 *rp = (png_byte)(((int)(*rp) +
468 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
   /* Scalar tail for the remaining bytes. */
473 for (i = 0; i < istop % 16; i++)
475 *rp = (png_byte)(((int)(*rp) +
476 (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
482 /* Bytewise c ? t : e. */
/* vec_sel(e, t, c) selects bits of 't' where the mask 'c' is set and
 * bits of 'e' elsewhere -- note the argument order versus C's ?:. */
483 #define if_then_else(c,t,e) vec_sel(e,t,c)
/* Scalar Paeth step for one byte: predict from left (a), above (b) and
 * upper-left (c); pick whichever of a/b/c is closest to p = a + b - c
 * (ties prefer a, then b), add it to *rp mod 256, and advance rp/pp.
 * NOTE(review): most of this macro's continuation lines (loading a/b/c,
 * computing p/pa/pb, and the b-vs-c selection) are elided in this
 * listing; comments cannot be interleaved between the visible
 * backslash-continued lines without altering the macro.
 */
485 #define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
493 pc = vsx_abs(p + pc);\
494 if (pb < pa) pa = pb, a = b;\
497 *rp++ = (png_byte)a;\
/* Undo the PNG 'Paeth' filter for 4-byte pixels.  The vector part widens
 * pixel bytes to 16-bit lanes so the signed predictor arithmetic cannot
 * overflow, then processes the chunk's pixels one at a time (each pixel
 * needs the previous pixel's decoded value) and narrows back to bytes.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
500 void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
501 png_const_bytep prev_row)
505 int a, b, c, pa, pb, pc, p;
506 vector unsigned char rp_vec;
507 vector unsigned char pp_vec;
508 vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
509 vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
511 vsx_declare_common_vars(row_info,row,prev_row,bpp)
516 /* Process the first pixel in the row completely (this is the same as 'up'
517 * because there is only one candidate predictor for the first row).
519 for(i = 0; i < bpp ; i++)
521 *rp = (png_byte)( *rp + *pp);
   /* Scalar Paeth over the unaligned head... */
526 for(i = 0; i < unaligned_top ; i++)
528 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   /* ...and over the pixel straddling the alignment boundary. */
533 for(i = 0; i < bpp ; i++)
535 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
540 rp_vec = vec_ld(0,rp);
541 vec_ld_unaligned(pp_vec,pp);
   /* Per pixel: a = decoded left pixel (LEFTSHIFTED from rp_vec),
    * b = above (NOT_SHIFTED from pp_vec), c = above-left (LEFTSHIFTED
    * from pp_vec).  pa=|b-c|, pb=|a-c|, pc=|(b-c)+(a-c)|; the nearest
    * of a/b/c is selected with ties preferring a, then b. */
543 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
544 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
545 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
546 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
547 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
548 pc_vec = vec_add(pa_vec,pb_vec);
549 pa_vec = vec_abs(pa_vec);
550 pb_vec = vec_abs(pb_vec);
551 pc_vec = vec_abs(pc_vec);
552 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
553 nearest_vec = if_then_else(
554 vec_cmpeq(pa_vec,smallest_vec),
557 vec_cmpeq(pb_vec,smallest_vec),
562 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));
   /* Second pixel of the chunk (depends on the result just computed). */
564 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
565 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
566 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
567 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
568 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
569 pc_vec = vec_add(pa_vec,pb_vec);
570 pa_vec = vec_abs(pa_vec);
571 pb_vec = vec_abs(pb_vec);
572 pc_vec = vec_abs(pc_vec);
573 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
574 nearest_vec = if_then_else(
575 vec_cmpeq(pa_vec,smallest_vec),
578 vec_cmpeq(pb_vec,smallest_vec),
583 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));
   /* Third pixel of the chunk. */
585 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
586 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
587 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
588 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
589 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
590 pc_vec = vec_add(pa_vec,pb_vec);
591 pa_vec = vec_abs(pa_vec);
592 pb_vec = vec_abs(pb_vec);
593 pc_vec = vec_abs(pc_vec);
594 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
595 nearest_vec = if_then_else(
596 vec_cmpeq(pa_vec,smallest_vec),
599 vec_cmpeq(pb_vec,smallest_vec),
   /* Scalar tail for the bytes after the last full chunk. */
604 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));
614 for (i = 0; i < istop % 16; i++)
616 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
/* Undo the PNG 'Paeth' filter for 3-byte pixels.  Same structure as the
 * 4-bpp variant, but with four widen/predict/narrow stages per 16-byte
 * chunk (five 3-byte pixels) and a scalar step for the leftover byte.
 * NOTE(review): this listing is elided -- the SIMD loop header, the
 * vec_st store, pointer advances and closing braces are not visible.
 */
620 void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
621 png_const_bytep prev_row)
625 int a, b, c, pa, pb, pc, p;
626 vector unsigned char rp_vec;
627 vector unsigned char pp_vec;
628 vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
629 vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;
631 vsx_declare_common_vars(row_info,row,prev_row,bpp)
636 /* Process the first pixel in the row completely (this is the same as 'up'
637 * because there is only one candidate predictor for the first row).
639 for(i = 0; i < bpp ; i++)
641 *rp = (png_byte)( *rp + *pp);
   /* Scalar Paeth over the unaligned head... */
646 for(i = 0; i < unaligned_top ; i++)
648 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   /* ...and over the pixel straddling the alignment boundary. */
653 for(i = 0; i < bpp ; i++)
655 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
660 rp_vec = vec_ld(0,rp);
661 vec_ld_unaligned(pp_vec,pp);
   /* Per pixel: a = decoded left pixel, b = above, c = above-left;
    * widened to shorts so pa/pb/pc arithmetic cannot overflow. */
663 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
664 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
665 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
666 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
667 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
668 pc_vec = vec_add(pa_vec,pb_vec);
669 pa_vec = vec_abs(pa_vec);
670 pb_vec = vec_abs(pb_vec);
671 pc_vec = vec_abs(pc_vec);
672 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
673 nearest_vec = if_then_else(
674 vec_cmpeq(pa_vec,smallest_vec),
677 vec_cmpeq(pb_vec,smallest_vec),
682 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));
   /* Second pixel of the chunk (depends on the result just computed). */
684 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
685 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
686 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
687 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
688 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
689 pc_vec = vec_add(pa_vec,pb_vec);
690 pa_vec = vec_abs(pa_vec);
691 pb_vec = vec_abs(pb_vec);
692 pc_vec = vec_abs(pc_vec);
693 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
694 nearest_vec = if_then_else(
695 vec_cmpeq(pa_vec,smallest_vec),
698 vec_cmpeq(pb_vec,smallest_vec),
703 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));
   /* Third pixel of the chunk. */
705 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
706 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
707 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
708 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
709 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
710 pc_vec = vec_add(pa_vec,pb_vec);
711 pa_vec = vec_abs(pa_vec);
712 pb_vec = vec_abs(pb_vec);
713 pc_vec = vec_abs(pc_vec);
714 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
715 nearest_vec = if_then_else(
716 vec_cmpeq(pa_vec,smallest_vec),
719 vec_cmpeq(pb_vec,smallest_vec),
724 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));
   /* Fourth pixel of the chunk. */
726 a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
727 b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
728 c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
729 pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
730 pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
731 pc_vec = vec_add(pa_vec,pb_vec);
732 pa_vec = vec_abs(pa_vec);
733 pb_vec = vec_abs(pb_vec);
734 pc_vec = vec_abs(pc_vec);
735 smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
736 nearest_vec = if_then_else(
737 vec_cmpeq(pa_vec,smallest_vec),
740 vec_cmpeq(pb_vec,smallest_vec),
745 rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));
753 /* Since 16 % bpp = 16 % 3 = 1, last element of array must
754 * be processed manually
756 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   /* Scalar tail for the remaining bytes. */
760 for (i = 0; i < istop % 16; i++)
762 vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
766 #endif /* PNG_POWERPC_VSX_OPT > 0 */
767 #endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */