(root)/
libpng-1.6.40/
arm/
filter_neon_intrinsics.c
       1  
       2  /* filter_neon_intrinsics.c - NEON optimised filter functions
       3   *
       4   * Copyright (c) 2018 Cosmin Truta
       5   * Copyright (c) 2014,2016 Glenn Randers-Pehrson
       6   * Written by James Yu <james.yu at linaro.org>, October 2013.
       7   * Based on filter_neon.S, written by Mans Rullgard, 2011.
       8   *
       9   * This code is released under the libpng license.
      10   * For conditions of distribution and use, see the disclaimer
      11   * and license in png.h
      12   */
      13  
      14  #include "../pngpriv.h"
      15  
      16  #ifdef PNG_READ_SUPPORTED
      17  
      18  /* This code requires -mfpu=neon on the command line: */
      19  #if PNG_ARM_NEON_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */
      20  
      21  #if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
      22  #  include <arm64_neon.h>
      23  #else
      24  #  include <arm_neon.h>
      25  #endif
      26  
      27  /* libpng row pointers are not necessarily aligned to any particular boundary,
      28   * however this code will only work with appropriate alignment.  arm/arm_init.c
      29   * checks for this (and will not compile unless it is done). This code uses
      30   * variants of png_aligncast to avoid compiler warnings.
      31   */
/* Convert 'pointer' to 'type *' by way of png_aligncast. */
#define png_ptr(type,pointer) png_aligncast(type *,pointer)
/* Convert 'pointer' to 'const type *' by way of png_aligncastconst. */
#define png_ptrc(type,pointer) png_aligncastconst(const type *,pointer)
      34  
/* png_ldr reads a value of type 'type' through 'pointer'.  It relies on a
 * variable named 'temp_pointer' being declared with type 'type *' in the
 * scope where the macro is used: the macro first assigns the converted
 * pointer to 'temp_pointer' and then dereferences it.  This is written this
 * way just to hide the GCC strict aliasing warning; note that the code is
 * safe because there never is an alias between the input and output pointers.
 *
 * When compiling with MSVC ARM64, the png_ldr macro can't be passed directly
 * to vst4_lane_u32, because of an internal compiler error inside MSVC.
 * To avoid this compiler bug, we use a temporary variable (vdest_val) to store
 * the result of png_ldr.
 */
#define png_ldr(type,pointer)\
   (temp_pointer = png_ptr(type,pointer), *temp_pointer)
      47  
      48  #if PNG_ARM_NEON_OPT > 0
      49  
      50  void
      51  png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
      52     png_const_bytep prev_row)
      53  {
      54     png_bytep rp = row;
      55     png_bytep rp_stop = row + row_info->rowbytes;
      56     png_const_bytep pp = prev_row;
      57  
      58     png_debug(1, "in png_read_filter_row_up_neon");
      59  
      60     for (; rp < rp_stop; rp += 16, pp += 16)
      61     {
      62        uint8x16_t qrp, qpp;
      63  
      64        qrp = vld1q_u8(rp);
      65        qpp = vld1q_u8(pp);
      66        qrp = vaddq_u8(qrp, qpp);
      67        vst1q_u8(rp, qrp);
      68     }
      69  }
      70  
      71  void
      72  png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
      73     png_const_bytep prev_row)
      74  {
      75     png_bytep rp = row;
      76     png_bytep rp_stop = row + row_info->rowbytes;
      77  
      78     uint8x16_t vtmp = vld1q_u8(rp);
      79     uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
      80     uint8x8x2_t vrp = *vrpt;
      81  
      82     uint8x8x4_t vdest;
      83     vdest.val[3] = vdup_n_u8(0);
      84  
      85     png_debug(1, "in png_read_filter_row_sub3_neon");
      86  
      87     for (; rp < rp_stop;)
      88     {
      89        uint8x8_t vtmp1, vtmp2;
      90        uint32x2_t *temp_pointer;
      91  
      92        vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      93        vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      94        vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      95        vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);
      96  
      97        vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      98        vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      99        vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);
     100  
     101        vtmp = vld1q_u8(rp + 12);
     102        vrpt = png_ptr(uint8x8x2_t, &vtmp);
     103        vrp = *vrpt;
     104  
     105        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
     106        rp += 3;
     107        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
     108        rp += 3;
     109        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
     110        rp += 3;
     111        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
     112        rp += 3;
     113     }
     114  
     115     PNG_UNUSED(prev_row)
     116  }
     117  
     118  void
     119  png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
     120     png_const_bytep prev_row)
     121  {
     122     png_bytep rp = row;
     123     png_bytep rp_stop = row + row_info->rowbytes;
     124  
     125     uint8x8x4_t vdest;
     126     vdest.val[3] = vdup_n_u8(0);
     127  
     128     png_debug(1, "in png_read_filter_row_sub4_neon");
     129  
     130     for (; rp < rp_stop; rp += 16)
     131     {
     132        uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
     133        uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
     134        uint8x8x4_t vrp = *vrpt;
     135        uint32x2x4_t *temp_pointer;
     136        uint32x2x4_t vdest_val;
     137  
     138        vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
     139        vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
     140        vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
     141        vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
     142  
     143        vdest_val = png_ldr(uint32x2x4_t, &vdest);
     144        vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
     145     }
     146  
     147     PNG_UNUSED(prev_row)
     148  }
     149  
     150  void
     151  png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
     152     png_const_bytep prev_row)
     153  {
     154     png_bytep rp = row;
     155     png_const_bytep pp = prev_row;
     156     png_bytep rp_stop = row + row_info->rowbytes;
     157  
     158     uint8x16_t vtmp;
     159     uint8x8x2_t *vrpt;
     160     uint8x8x2_t vrp;
     161     uint8x8x4_t vdest;
     162     vdest.val[3] = vdup_n_u8(0);
     163  
     164     vtmp = vld1q_u8(rp);
     165     vrpt = png_ptr(uint8x8x2_t,&vtmp);
     166     vrp = *vrpt;
     167  
     168     png_debug(1, "in png_read_filter_row_avg3_neon");
     169  
     170     for (; rp < rp_stop; pp += 12)
     171     {
     172        uint8x8_t vtmp1, vtmp2, vtmp3;
     173  
     174        uint8x8x2_t *vppt;
     175        uint8x8x2_t vpp;
     176  
     177        uint32x2_t *temp_pointer;
     178  
     179        vtmp = vld1q_u8(pp);
     180        vppt = png_ptr(uint8x8x2_t,&vtmp);
     181        vpp = *vppt;
     182  
     183        vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
     184        vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
     185        vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
     186  
     187        vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
     188        vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
     189        vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
     190        vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
     191  
     192        vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
     193        vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
     194  
     195        vtmp = vld1q_u8(rp + 12);
     196        vrpt = png_ptr(uint8x8x2_t,&vtmp);
     197        vrp = *vrpt;
     198  
     199        vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
     200        vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);
     201  
     202        vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
     203  
     204        vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
     205        vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
     206  
     207        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
     208        rp += 3;
     209        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
     210        rp += 3;
     211        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
     212        rp += 3;
     213        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
     214        rp += 3;
     215     }
     216  }
     217  
     218  void
     219  png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
     220     png_const_bytep prev_row)
     221  {
     222     png_bytep rp = row;
     223     png_bytep rp_stop = row + row_info->rowbytes;
     224     png_const_bytep pp = prev_row;
     225  
     226     uint8x8x4_t vdest;
     227     vdest.val[3] = vdup_n_u8(0);
     228  
     229     png_debug(1, "in png_read_filter_row_avg4_neon");
     230  
     231     for (; rp < rp_stop; rp += 16, pp += 16)
     232     {
     233        uint32x2x4_t vtmp;
     234        uint8x8x4_t *vrpt, *vppt;
     235        uint8x8x4_t vrp, vpp;
     236        uint32x2x4_t *temp_pointer;
     237        uint32x2x4_t vdest_val;
     238  
     239        vtmp = vld4_u32(png_ptr(uint32_t,rp));
     240        vrpt = png_ptr(uint8x8x4_t,&vtmp);
     241        vrp = *vrpt;
     242        vtmp = vld4_u32(png_ptrc(uint32_t,pp));
     243        vppt = png_ptr(uint8x8x4_t,&vtmp);
     244        vpp = *vppt;
     245  
     246        vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
     247        vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
     248        vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
     249        vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
     250        vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
     251        vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
     252        vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
     253        vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
     254  
     255        vdest_val = png_ldr(uint32x2x4_t, &vdest);
     256        vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
     257     }
     258  }
     259  
     260  static uint8x8_t
     261  paeth(uint8x8_t a, uint8x8_t b, uint8x8_t c)
     262  {
     263     uint8x8_t d, e;
     264     uint16x8_t p1, pa, pb, pc;
     265  
     266     p1 = vaddl_u8(a, b); /* a + b */
     267     pc = vaddl_u8(c, c); /* c * 2 */
     268     pa = vabdl_u8(b, c); /* pa */
     269     pb = vabdl_u8(a, c); /* pb */
     270     pc = vabdq_u16(p1, pc); /* pc */
     271  
     272     p1 = vcleq_u16(pa, pb); /* pa <= pb */
     273     pa = vcleq_u16(pa, pc); /* pa <= pc */
     274     pb = vcleq_u16(pb, pc); /* pb <= pc */
     275  
     276     p1 = vandq_u16(p1, pa); /* pa <= pb && pa <= pc */
     277  
     278     d = vmovn_u16(pb);
     279     e = vmovn_u16(p1);
     280  
     281     d = vbsl_u8(d, b, c);
     282     e = vbsl_u8(e, a, d);
     283  
     284     return e;
     285  }
     286  
     287  void
     288  png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
     289     png_const_bytep prev_row)
     290  {
     291     png_bytep rp = row;
     292     png_const_bytep pp = prev_row;
     293     png_bytep rp_stop = row + row_info->rowbytes;
     294  
     295     uint8x16_t vtmp;
     296     uint8x8x2_t *vrpt;
     297     uint8x8x2_t vrp;
     298     uint8x8_t vlast = vdup_n_u8(0);
     299     uint8x8x4_t vdest;
     300     vdest.val[3] = vdup_n_u8(0);
     301  
     302     vtmp = vld1q_u8(rp);
     303     vrpt = png_ptr(uint8x8x2_t,&vtmp);
     304     vrp = *vrpt;
     305  
     306     png_debug(1, "in png_read_filter_row_paeth3_neon");
     307  
     308     for (; rp < rp_stop; pp += 12)
     309     {
     310        uint8x8x2_t *vppt;
     311        uint8x8x2_t vpp;
     312        uint8x8_t vtmp1, vtmp2, vtmp3;
     313        uint32x2_t *temp_pointer;
     314  
     315        vtmp = vld1q_u8(pp);
     316        vppt = png_ptr(uint8x8x2_t,&vtmp);
     317        vpp = *vppt;
     318  
     319        vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
     320        vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
     321  
     322        vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
     323        vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
     324        vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
     325        vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);
     326  
     327        vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
     328        vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
     329        vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
     330        vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);
     331  
     332        vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
     333        vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);
     334  
     335        vtmp = vld1q_u8(rp + 12);
     336        vrpt = png_ptr(uint8x8x2_t,&vtmp);
     337        vrp = *vrpt;
     338  
     339        vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
     340        vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);
     341  
     342        vlast = vtmp2;
     343  
     344        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
     345        rp += 3;
     346        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
     347        rp += 3;
     348        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
     349        rp += 3;
     350        vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
     351        rp += 3;
     352     }
     353  }
     354  
     355  void
     356  png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
     357     png_const_bytep prev_row)
     358  {
     359     png_bytep rp = row;
     360     png_bytep rp_stop = row + row_info->rowbytes;
     361     png_const_bytep pp = prev_row;
     362  
     363     uint8x8_t vlast = vdup_n_u8(0);
     364     uint8x8x4_t vdest;
     365     vdest.val[3] = vdup_n_u8(0);
     366  
     367     png_debug(1, "in png_read_filter_row_paeth4_neon");
     368  
     369     for (; rp < rp_stop; rp += 16, pp += 16)
     370     {
     371        uint32x2x4_t vtmp;
     372        uint8x8x4_t *vrpt, *vppt;
     373        uint8x8x4_t vrp, vpp;
     374        uint32x2x4_t *temp_pointer;
     375        uint32x2x4_t vdest_val;
     376  
     377        vtmp = vld4_u32(png_ptr(uint32_t,rp));
     378        vrpt = png_ptr(uint8x8x4_t,&vtmp);
     379        vrp = *vrpt;
     380        vtmp = vld4_u32(png_ptrc(uint32_t,pp));
     381        vppt = png_ptr(uint8x8x4_t,&vtmp);
     382        vpp = *vppt;
     383  
     384        vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
     385        vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
     386        vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
     387        vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
     388        vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
     389        vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
     390        vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
     391        vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);
     392  
     393        vlast = vpp.val[3];
     394  
     395        vdest_val = png_ldr(uint32x2x4_t, &vdest);
     396        vst4_lane_u32(png_ptr(uint32_t,rp), vdest_val, 0);
     397     }
     398  }
     399  
     400  #endif /* PNG_ARM_NEON_OPT > 0 */
     401  #endif /* PNG_ARM_NEON_IMPLEMENTATION == 1 (intrinsics) */
     402  #endif /* READ */