/* filter_msa_intrinsics.c - MSA optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2016 Glenn Randers-Pehrson
 * Written by Mandar Sahastrabuddhe, August 2016.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED
/* This code requires -mmsa on the command line: */
#if PNG_MIPS_MSA_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <msa.h>
#include <stdint.h>
/* libpng row pointers are not necessarily aligned to any particular boundary,
 * however this code will only work with appropriate alignment. mips/mips_init.c
 * checks for this (and will not compile unless it is done). This code uses
 * variants of png_aligncast to avoid compiler warnings.
 */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
/* The following relies on a variable 'temp_pointer' being declared with type
 * 'type'. This is written this way just to hide the GCC strict aliasing
 * warning; note that the code is safe because there never is an alias between
 * the input and output pointers.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
#if PNG_MIPS_MSA_OPT > 0

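/* Scalar load/store helpers: LW/SH/SW/SD wrap single (possibly unaligned)
 * word, halfword and doubleword accesses in inline asm.  Pre-R6 GCC builds
 * use the unaligned ulw/ush/usw forms, and 32-bit targets synthesise the
 * 64-bit store from two 32-bit stores.
 */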
#ifdef CLANG_BUILD
   #define MSA_SRLI_B(a, b)   __msa_srli_b((v16i8) a, b)

   #define LW(psrc)                               \
   ( {                                            \
       uint8_t *psrc_lw_m = (uint8_t *) (psrc);   \
       uint32_t val_m;                            \
       asm volatile (                             \
           "lw  %[val_m],  %[psrc_lw_m]  \n\t"    \
           : [val_m] "=r" (val_m)                 \
           : [psrc_lw_m] "m" (*psrc_lw_m)         \
       );                                         \
       val_m;                                     \
   } )

   #define SH(val, pdst)                          \
   {                                              \
       uint8_t *pdst_sh_m = (uint8_t *) (pdst);   \
       uint16_t val_m = (val);                    \
       asm volatile (                             \
           "sh  %[val_m],  %[pdst_sh_m]  \n\t"    \
           : [pdst_sh_m] "=m" (*pdst_sh_m)        \
           : [val_m] "r" (val_m)                  \
       );                                         \
   }

   #define SW(val, pdst)                          \
   {                                              \
       uint8_t *pdst_sw_m = (uint8_t *) (pdst);   \
       uint32_t val_m = (val);                    \
       asm volatile (                             \
           "sw  %[val_m],  %[pdst_sw_m]  \n\t"    \
           : [pdst_sw_m] "=m" (*pdst_sw_m)        \
           : [val_m] "r" (val_m)                  \
       );                                         \
   }

   #if (__mips == 64)
       #define SD(val, pdst)                          \
       {                                              \
           uint8_t *pdst_sd_m = (uint8_t *) (pdst);   \
           uint64_t val_m = (val);                    \
           asm volatile (                             \
               "sd  %[val_m],  %[pdst_sd_m]  \n\t"    \
               : [pdst_sd_m] "=m" (*pdst_sd_m)        \
               : [val_m] "r" (val_m)                  \
           );                                         \
       }
   #else
       #define SD(val, pdst)                                          \
       {                                                              \
           uint8_t *pdst_sd_m = (uint8_t *) (pdst);                   \
           uint32_t val0_m, val1_m;                                   \
           val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
           val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
           SW(val0_m, pdst_sd_m);                                     \
           SW(val1_m, pdst_sd_m + 4);                                 \
       }
   #endif /* (__mips == 64) */
#else /* !CLANG_BUILD */
   #define MSA_SRLI_B(a, b)   (a >> b)

   #if (__mips_isa_rev >= 6)
       #define LW(psrc)                               \
       ( {                                            \
           uint8_t *psrc_lw_m = (uint8_t *) (psrc);   \
           uint32_t val_m;                            \
           asm volatile (                             \
               "lw  %[val_m],  %[psrc_lw_m]  \n\t"    \
               : [val_m] "=r" (val_m)                 \
               : [psrc_lw_m] "m" (*psrc_lw_m)         \
           );                                         \
           val_m;                                     \
       } )

       #define SH(val, pdst)                          \
       {                                              \
           uint8_t *pdst_sh_m = (uint8_t *) (pdst);   \
           uint16_t val_m = (val);                    \
           asm volatile (                             \
               "sh  %[val_m],  %[pdst_sh_m]  \n\t"    \
               : [pdst_sh_m] "=m" (*pdst_sh_m)        \
               : [val_m] "r" (val_m)                  \
           );                                         \
       }

       #define SW(val, pdst)                          \
       {                                              \
           uint8_t *pdst_sw_m = (uint8_t *) (pdst);   \
           uint32_t val_m = (val);                    \
           asm volatile (                             \
               "sw  %[val_m],  %[pdst_sw_m]  \n\t"    \
               : [pdst_sw_m] "=m" (*pdst_sw_m)        \
               : [val_m] "r" (val_m)                  \
           );                                         \
       }

       #if (__mips == 64)
           #define SD(val, pdst)                          \
           {                                              \
               uint8_t *pdst_sd_m = (uint8_t *) (pdst);   \
               uint64_t val_m = (val);                    \
               asm volatile (                             \
                   "sd  %[val_m],  %[pdst_sd_m]  \n\t"    \
                   : [pdst_sd_m] "=m" (*pdst_sd_m)        \
                   : [val_m] "r" (val_m)                  \
               );                                         \
           }
       #else
           #define SD(val, pdst)                                          \
           {                                                              \
               uint8_t *pdst_sd_m = (uint8_t *) (pdst);                   \
               uint32_t val0_m, val1_m;                                   \
               val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
               val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
               SW(val0_m, pdst_sd_m);                                     \
               SW(val1_m, pdst_sd_m + 4);                                 \
           }
       #endif /* (__mips == 64) */
   #else /* !(__mips_isa_rev >= 6) */
       #define LW(psrc)                               \
       ( {                                            \
           uint8_t *psrc_lw_m = (uint8_t *) (psrc);   \
           uint32_t val_m;                            \
           asm volatile (                             \
               "ulw  %[val_m],  %[psrc_lw_m]  \n\t"   \
               : [val_m] "=r" (val_m)                 \
               : [psrc_lw_m] "m" (*psrc_lw_m)         \
           );                                         \
           val_m;                                     \
       } )

       #define SH(val, pdst)                          \
       {                                              \
           uint8_t *pdst_sh_m = (uint8_t *) (pdst);   \
           uint16_t val_m = (val);                    \
           asm volatile (                             \
               "ush  %[val_m],  %[pdst_sh_m]  \n\t"   \
               : [pdst_sh_m] "=m" (*pdst_sh_m)        \
               : [val_m] "r" (val_m)                  \
           );                                         \
       }

       #define SW(val, pdst)                          \
       {                                              \
           uint8_t *pdst_sw_m = (uint8_t *) (pdst);   \
           uint32_t val_m = (val);                    \
           asm volatile (                             \
               "usw  %[val_m],  %[pdst_sw_m]  \n\t"   \
               : [pdst_sw_m] "=m" (*pdst_sw_m)        \
               : [val_m] "r" (val_m)                  \
           );                                         \
       }

       #define SD(val, pdst)                                          \
       {                                                              \
           uint8_t *pdst_sd_m = (uint8_t *) (pdst);                   \
           uint32_t val0_m, val1_m;                                   \
           val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
           val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
           SW(val0_m, pdst_sd_m);                                     \
           SW(val1_m, pdst_sd_m + 4);                                 \
       }

       #define SW_ZERO(pdst)                       \
       {                                           \
           uint8_t *pdst_m = (uint8_t *) (pdst);   \
           asm volatile (                          \
               "usw  $0,  %[pdst_m]  \n\t"         \
               : [pdst_m] "=m" (*pdst_m)           \
               :                                   \
           );                                      \
       }
   #endif /* (__mips_isa_rev >= 6) */
#endif /* CLANG_BUILD */

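/* Vector helpers: 16-byte vector loads/stores (LD_*, ST_*) plus thin
 * wrappers over the MSA interleave, slide and shuffle intrinsics used by
 * the filter functions below.
 */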
#define LD_B(RTYPE, psrc) *((RTYPE *) (psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)

#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE *) (pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)

#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

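/* Element-wise vector adds; ADD3/ADD4 are built from ADD2, keeping the
 * individual sums independent so the compiler can schedule them freely.
 */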
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD3(in0, in1, in2, in3, in4, in5,    \
             out0, out1, out2)                \
{                                             \
    ADD2(in0, in1, in2, in3, out0, out1);     \
    out2 = in4 + in5;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    ADD2(in0, in1, in2, in3, out0, out1);             \
    ADD2(in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)        \
{                                                             \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);    \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);    \
}
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)

#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)

#define ADD_ABS_H3(RTYPE, in0, in1, in2, out0, out1, out2)  \
{                                                           \
    v8i16 zero = { 0 };                                     \
    out0 = __msa_add_a_h((v8i16) zero, in0);                \
    out1 = __msa_add_a_h((v8i16) zero, in1);                \
    out2 = __msa_add_a_h((v8i16) zero, in2);                \
}
#define ADD_ABS_H3_SH(...) ADD_ABS_H3(v8i16, __VA_ARGS__)

#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)

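/* Used by the Paeth filters.  inp0..inp2 hold the Paeth distances pa, pb
 * and pc, and inp3..inp5 hold the candidate predictors a (left), b (above)
 * and c (upper-left).  The nearest candidate is selected with the standard
 * Paeth tie-breaking order (a, then b, then c) and added to the raw bytes
 * in out0.
 */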
#define CMP_AND_SELECT(inp0, inp1, inp2, inp3, inp4, inp5, out0)               \
{                                                                              \
    v8i16 _sel_h0, _sel_h1;                                                    \
    v16u8 _sel_b0, _sel_b1;                                                    \
    _sel_h0 = (v8i16) __msa_clt_u_h((v8u16) inp1, (v8u16) inp0);               \
    _sel_b0 = (v16u8) __msa_pckev_b((v16i8) _sel_h0, (v16i8) _sel_h0);         \
    inp0 = (v8i16) __msa_bmnz_v((v16u8) inp0, (v16u8) inp1, (v16u8) _sel_h0);  \
    inp4 = (v16u8) __msa_bmnz_v(inp3, inp4, _sel_b0);                          \
    _sel_h1 = (v8i16) __msa_clt_u_h((v8u16) inp2, (v8u16) inp0);               \
    _sel_b1 = (v16u8) __msa_pckev_b((v16i8) _sel_h1, (v16i8) _sel_h1);         \
    inp4 = (v16u8) __msa_bmnz_v(inp4, inp5, _sel_b1);                          \
    out0 += inp4;                                                              \
}

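/* 'Up' filter: each output byte is the raw byte plus the byte above it.
 * The additions are all independent, so the main loop handles 64 bytes per
 * iteration and the branches below mop up the remaining 16-byte blocks.
 */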
void png_read_filter_row_up_msa(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   size_t i, cnt, cnt16, cnt32;
   size_t istop = row_info->rowbytes;
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

   for (i = 0; i < (istop >> 6); i++)
   {
      LD_UB4(rp, 16, src0, src1, src2, src3);
      LD_UB4(pp, 16, src4, src5, src6, src7);
      pp += 64;

      ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
           src0, src1, src2, src3);

      ST_UB4(src0, src1, src2, src3, rp, 16);
      rp += 64;
   }

   if (istop & 0x3F)
   {
      cnt32 = istop & 0x20;
      cnt16 = istop & 0x10;
      cnt = istop & 0xF;

      if (cnt32)
      {
         if (cnt16 && cnt)
         {
            LD_UB4(rp, 16, src0, src1, src2, src3);
            LD_UB4(pp, 16, src4, src5, src6, src7);

            ADD4(src0, src4, src1, src5, src2, src6, src3, src7,
                 src0, src1, src2, src3);

            ST_UB4(src0, src1, src2, src3, rp, 16);
            rp += 64;
         }
         else if (cnt16 || cnt)
         {
            LD_UB2(rp, 16, src0, src1);
            LD_UB2(pp, 16, src4, src5);
            pp += 32;
            src2 = LD_UB(rp + 32);
            src6 = LD_UB(pp);

            ADD3(src0, src4, src1, src5, src2, src6, src0, src1, src2);

            ST_UB2(src0, src1, rp, 16);
            rp += 32;
            ST_UB(src2, rp);
            rp += 16;
         }
         else
         {
            LD_UB2(rp, 16, src0, src1);
            LD_UB2(pp, 16, src4, src5);

            ADD2(src0, src4, src1, src5, src0, src1);

            ST_UB2(src0, src1, rp, 16);
            rp += 32;
         }
      }
      else if (cnt16 && cnt)
      {
         LD_UB2(rp, 16, src0, src1);
         LD_UB2(pp, 16, src4, src5);

         ADD2(src0, src4, src1, src5, src0, src1);

         ST_UB2(src0, src1, rp, 16);
         rp += 32;
      }
      else if (cnt16 || cnt)
      {
         src0 = LD_UB(rp);
         src4 = LD_UB(pp);

         src0 += src4;

         ST_UB(src0, rp);
         rp += 16;
      }
   }
}

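/* 'Sub' filter, 4 bytes per pixel: out[x] = raw[x] + recon[x - 4].  The
 * serial dependency is broken by sliding the 16-byte vector down by 4, 8
 * and 12 bytes and accumulating, reconstructing four pixels per iteration;
 * the last pixel is carried in src0 for the next iteration.
 */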
void png_read_filter_row_sub4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 4;
   int32_t inp0;
   v16u8 src0, src1, src2, src3, src4;
   v16u8 dst0, dst1;
   v16u8 zero = { 0 };

   istop -= 4;

   inp0 = LW(src);
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);

   for (count = 0; count < istop; count += 16)
   {
      src1 = LD_UB(nxt);

      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 4);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 8);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 12);
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;

      ILVEV_W2_UB(src1, src2, src3, src4, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }

   PNG_UNUSED(prev_row)
}

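/* 'Sub' filter, 3 bytes per pixel: same scheme as the 4 bpp version with
 * slides of 3, 6 and 9; the four 3-byte results are repacked into 12
 * contiguous bytes with the vshf masks before being stored as an 8-byte
 * plus a 4-byte chunk.
 */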
void png_read_filter_row_sub3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t count;
   size_t istop = row_info->rowbytes;
   png_bytep src = row;
   png_bytep nxt = row + 3;
   int64_t out0;
   int32_t inp0, out1;
   v16u8 src0, src1, src2, src3, src4, dst0, dst1;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   istop -= 3;

   inp0 = LW(src);
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);

   for (count = 0; count < istop; count += 12)
   {
      src1 = LD_UB(nxt);

      src2 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 3);
      src3 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 6);
      src4 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) src1, 9);
      src1 += src0;
      src2 += src1;
      src3 += src2;
      src4 += src3;
      src0 = src4;

      VSHF_B2_UB(src1, src2, src3, src4, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }

   PNG_UNUSED(prev_row)
}

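/* 'Average' filter, 4 bytes per pixel: out[x] = raw[x] + (recon[x - 4] +
 * prev[x]) / 2.  The first pixel adds half of the byte above (its left
 * neighbour is zero); inside the loop each pixel's predictor is formed with
 * __msa_ave_u_b from the just-reconstructed left pixel and the pixel above.
 */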
void png_read_filter_row_avg4_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 4;
   int32_t inp0, inp1, out0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };

   inp0 = LW(pp);
   pp += 4;
   inp1 = LW(src);
   src += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   out0 = __msa_copy_s_w((v4i32) src1, 0);
   SW(out0, nxt);
   nxt += 4;

   for (i = 0; i < istop; i += 16)
   {
      src2 = LD_UB(pp);
      pp += 16;
      src6 = LD_UB(src);
      src += 16;

      SLDI_B2_0_UB(src2, src6, src3, src7, 4);
      SLDI_B2_0_UB(src2, src6, src4, src8, 8);
      SLDI_B2_0_UB(src2, src6, src5, src9, 12);
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;

      ILVEV_W2_UB(src6, src7, src8, src9, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }
}

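/* 'Average' filter, 3 bytes per pixel: as above with slides of 3, 6 and 9;
 * the twelve reconstructed bytes are repacked with the vshf masks and
 * stored as an 8-byte plus a 4-byte chunk.
 */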
void png_read_filter_row_avg3_msa(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   size_t i;
   png_bytep src = row;
   png_bytep nxt = row;
   png_const_bytep pp = prev_row;
   size_t istop = row_info->rowbytes - 3;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   inp0 = LW(pp);
   pp += 3;
   inp1 = LW(src);
   src += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);
   src0 = (v16u8) MSA_SRLI_B(src0, 1);
   src1 += src0;
   out2 = __msa_copy_s_h((v8i16) src1, 0);
   SH(out2, nxt);
   nxt += 2;
   *nxt = (uint8_t) __msa_copy_s_b((v16i8) src1, 2);
   nxt++;

   for (i = 0; i < istop; i += 12)
   {
      src2 = LD_UB(pp);
      pp += 12;
      src6 = LD_UB(src);
      src += 12;

      SLDI_B2_0_UB(src2, src6, src3, src7, 3);
      SLDI_B2_0_UB(src2, src6, src4, src8, 6);
      SLDI_B2_0_UB(src2, src6, src5, src9, 9);
      src2 = __msa_ave_u_b(src2, src1);
      src6 += src2;
      src3 = __msa_ave_u_b(src3, src6);
      src7 += src3;
      src4 = __msa_ave_u_b(src4, src7);
      src8 += src4;
      src5 = __msa_ave_u_b(src5, src8);
      src9 += src5;
      src1 = src9;

      VSHF_B2_UB(src6, src7, src8, src9, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
}

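/* Paeth filter, 4 bytes per pixel: the predictor is whichever of left (a),
 * above (b) and upper-left (c) is closest to p = a + b - c.  Signed
 * differences b - c and a - c are formed with HSUB on interleaved vectors,
 * their sum gives p - c, and taking absolute values yields the distances
 * pa, pb and pc that CMP_AND_SELECT uses to pick the winner and add it to
 * the raw bytes.  Four pixels are reconstructed per iteration.
 */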
void png_read_filter_row_paeth4_msa(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   int32_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int32_t inp0, inp1, res0;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
   v16u8 src10, src11, src12, src13, dst0, dst1;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };

   nxt = row;
   prev_nxt = prev_row;

   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 4;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);

   src1 += src0;
   res0 = __msa_copy_s_w((v4i32) src1, 0);
   SW(res0, nxt);
   nxt += 4;

   rp_end = row_info->rowbytes - 4;

   for (count = 0; count < rp_end; count += 16)
   {
      src2 = LD_UB(prev_nxt);
      prev_nxt += 16;
      src6 = LD_UB(prev_row);
      prev_row += 16;
      src10 = LD_UB(nxt);

      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 4);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 8);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 12);
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;
      ILVEV_W2_UB(src10, src11, src12, src1, dst0, dst1);
      dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);

      ST_UB(dst0, nxt);
      nxt += 16;
   }
}

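/* Paeth filter, 3 bytes per pixel: identical selection logic with slides of
 * 3, 6 and 9; the 12 output bytes are repacked with the vshf masks and
 * stored as an 8-byte plus a 4-byte chunk.
 */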
void png_read_filter_row_paeth3_msa(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   int32_t count, rp_end;
   png_bytep nxt;
   png_const_bytep prev_nxt;
   int64_t out0;
   int32_t inp0, inp1, out1;
   int16_t out2;
   v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, dst0, dst1;
   v16u8 src10, src11, src12, src13;
   v8i16 vec0, vec1, vec2;
   v16u8 zero = { 0 };
   v16i8 mask0 = { 0, 1, 2, 16, 17, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
   v16i8 mask1 = { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 0, 0, 0, 0 };

   nxt = row;
   prev_nxt = prev_row;

   inp0 = LW(nxt);
   inp1 = LW(prev_nxt);
   prev_nxt += 3;
   src0 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp0);
   src1 = (v16u8) __msa_insert_w((v4i32) zero, 0, inp1);

   src1 += src0;
   out2 = __msa_copy_s_h((v8i16) src1, 0);
   SH(out2, nxt);
   nxt += 2;
   *nxt = (uint8_t) __msa_copy_s_b((v16i8) src1, 2);
   nxt++;

   rp_end = row_info->rowbytes - 3;

   for (count = 0; count < rp_end; count += 12)
   {
      src2 = LD_UB(prev_nxt);
      prev_nxt += 12;
      src6 = LD_UB(prev_row);
      prev_row += 12;
      src10 = LD_UB(nxt);

      SLDI_B3_0_UB(src2, src6, src10, src3, src7, src11, 3);
      SLDI_B3_0_UB(src2, src6, src10, src4, src8, src12, 6);
      SLDI_B3_0_UB(src2, src6, src10, src5, src9, src13, 9);
      ILVR_B2_SH(src2, src6, src1, src6, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src1, src2, src6, src10);
      ILVR_B2_SH(src3, src7, src10, src7, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src10, src3, src7, src11);
      ILVR_B2_SH(src4, src8, src11, src8, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src11, src4, src8, src12);
      ILVR_B2_SH(src5, src9, src12, src9, vec0, vec1);
      HSUB_UB2_SH(vec0, vec1, vec0, vec1);
      vec2 = vec0 + vec1;
      ADD_ABS_H3_SH(vec0, vec1, vec2, vec0, vec1, vec2);
      CMP_AND_SELECT(vec0, vec1, vec2, src12, src5, src9, src13);
      src1 = src13;

      VSHF_B2_UB(src10, src11, src12, src13, mask0, mask0, dst0, dst1);
      dst0 = (v16u8) __msa_vshf_b(mask1, (v16i8) dst1, (v16i8) dst0);
      out0 = __msa_copy_s_d((v2i64) dst0, 0);
      out1 = __msa_copy_s_w((v4i32) dst0, 2);

      SD(out0, nxt);
      nxt += 8;
      SW(out1, nxt);
      nxt += 4;
   }
}

#endif /* PNG_MIPS_MSA_OPT > 0 */
#endif /* PNG_MIPS_MSA_IMPLEMENTATION == 1 (intrinsics) */
#endif /* PNG_READ_SUPPORTED */