1  
       2  /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
       3   *
       4   * Copyright (c) 2018 Cosmin Truta
       5   * Copyright (c) 2016-2017 Glenn Randers-Pehrson
       6   * Written by Mike Klein and Matt Sarett
       7   * Derived from arm/filter_neon_intrinsics.c
       8   *
       9   * This code is released under the libpng license.
      10   * For conditions of distribution and use, see the disclaimer
      11   * and license in png.h
      12   */
      13  
      14  #include "../pngpriv.h"
      15  
      16  #ifdef PNG_READ_SUPPORTED
      17  
      18  #if PNG_INTEL_SSE_IMPLEMENTATION > 0
      19  
      20  #include <immintrin.h>
      21  
/* Functions in this file look at up to 3 pixels (a,b,c) to predict the 4th
 * (d).  They're positioned like this:
      24   *    prev:  c b
      25   *    row:   a d
      26   * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
      27   * whichever of a, b, or c is closest to p=a+b-c.
      28   */
      29  
      30  static __m128i load4(const void* p) {
      31     int tmp;
      32     memcpy(&tmp, p, sizeof(tmp));
      33     return _mm_cvtsi32_si128(tmp);
      34  }
      35  
      36  static void store4(void* p, __m128i v) {
      37     int tmp = _mm_cvtsi128_si32(v);
      38     memcpy(p, &tmp, sizeof(int));
      39  }
      40  
      41  static __m128i load3(const void* p) {
      42     png_uint_32 tmp = 0;
      43     memcpy(&tmp, p, 3);
      44     return _mm_cvtsi32_si128(tmp);
      45  }
      46  
      47  static void store3(void* p, __m128i v) {
      48     int tmp = _mm_cvtsi128_si32(v);
      49     memcpy(p, &tmp, 3);
      50  }
      51  
      52  void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
      53     png_const_bytep prev)
      54  {
      55     /* The Sub filter predicts each pixel as the previous pixel, a.
      56      * There is no pixel to the left of the first pixel.  It's encoded directly.
      57      * That works with our main loop if we just say that left pixel was zero.
      58      */
      59     size_t rb;
      60  
      61     __m128i a, d = _mm_setzero_si128();
      62  
      63     png_debug(1, "in png_read_filter_row_sub3_sse2");
      64  
      65     rb = row_info->rowbytes;
      66     while (rb >= 4) {
      67        a = d; d = load4(row);
      68        d = _mm_add_epi8(d, a);
      69        store3(row, d);
      70  
      71        row += 3;
      72        rb  -= 3;
      73     }
      74     if (rb > 0) {
      75        a = d; d = load3(row);
      76        d = _mm_add_epi8(d, a);
      77        store3(row, d);
      78  
      79        row += 3;
      80        rb  -= 3;
      81     }
      82     PNG_UNUSED(prev)
      83  }
      84  
      85  void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
      86     png_const_bytep prev)
      87  {
      88     /* The Sub filter predicts each pixel as the previous pixel, a.
      89      * There is no pixel to the left of the first pixel.  It's encoded directly.
      90      * That works with our main loop if we just say that left pixel was zero.
      91      */
      92     size_t rb;
      93  
      94     __m128i a, d = _mm_setzero_si128();
      95  
      96     png_debug(1, "in png_read_filter_row_sub4_sse2");
      97  
      98     rb = row_info->rowbytes+4;
      99     while (rb > 4) {
     100        a = d; d = load4(row);
     101        d = _mm_add_epi8(d, a);
     102        store4(row, d);
     103  
     104        row += 4;
     105        rb  -= 4;
     106     }
     107     PNG_UNUSED(prev)
     108  }
     109  
     110  void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
     111     png_const_bytep prev)
     112  {
     113     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     114      * There's no pixel to the left of the first pixel.  Luckily, it's
     115      * predicted to be half of the pixel above it.  So again, this works
     116      * perfectly with our loop if we make sure a starts at zero.
     117      */
     118  
     119     size_t rb;
     120  
     121     const __m128i zero = _mm_setzero_si128();
     122  
     123     __m128i    b;
     124     __m128i a, d = zero;
     125  
     126     png_debug(1, "in png_read_filter_row_avg3_sse2");
     127     rb = row_info->rowbytes;
     128     while (rb >= 4) {
     129        __m128i avg;
     130               b = load4(prev);
     131        a = d; d = load4(row );
     132  
     133        /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     134        avg = _mm_avg_epu8(a,b);
     135        /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     136        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     137                                              _mm_set1_epi8(1)));
     138        d = _mm_add_epi8(d, avg);
     139        store3(row, d);
     140  
     141        prev += 3;
     142        row  += 3;
     143        rb   -= 3;
     144     }
     145     if (rb > 0) {
     146        __m128i avg;
     147               b = load3(prev);
     148        a = d; d = load3(row );
     149  
     150        /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     151        avg = _mm_avg_epu8(a,b);
     152        /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     153        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     154                                              _mm_set1_epi8(1)));
     155  
     156        d = _mm_add_epi8(d, avg);
     157        store3(row, d);
     158  
     159        prev += 3;
     160        row  += 3;
     161        rb   -= 3;
     162     }
     163  }
     164  
     165  void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
     166     png_const_bytep prev)
     167  {
     168     /* The Avg filter predicts each pixel as the (truncated) average of a and b.
     169      * There's no pixel to the left of the first pixel.  Luckily, it's
     170      * predicted to be half of the pixel above it.  So again, this works
     171      * perfectly with our loop if we make sure a starts at zero.
     172      */
     173     size_t rb;
     174     const __m128i zero = _mm_setzero_si128();
     175     __m128i    b;
     176     __m128i a, d = zero;
     177  
     178     png_debug(1, "in png_read_filter_row_avg4_sse2");
     179  
     180     rb = row_info->rowbytes+4;
     181     while (rb > 4) {
     182        __m128i avg;
     183               b = load4(prev);
     184        a = d; d = load4(row );
     185  
     186        /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
     187        avg = _mm_avg_epu8(a,b);
     188        /* ...but we can fix it up by subtracting off 1 if it rounded up. */
     189        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
     190                                              _mm_set1_epi8(1)));
     191  
     192        d = _mm_add_epi8(d, avg);
     193        store4(row, d);
     194  
     195        prev += 4;
     196        row  += 4;
     197        rb   -= 4;
     198     }
     199  }
     200  
     201  /* Returns |x| for 16-bit lanes. */
     202  static __m128i abs_i16(__m128i x) {
     203  #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
     204     return _mm_abs_epi16(x);
     205  #else
     206     /* Read this all as, return x<0 ? -x : x.
     207     * To negate two's complement, you flip all the bits then add 1.
     208      */
     209     __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
     210  
     211     /* Flip negative lanes. */
     212     x = _mm_xor_si128(x, is_negative);
     213  
     214     /* +1 to negative lanes, else +0. */
     215     x = _mm_sub_epi16(x, is_negative);
     216     return x;
     217  #endif
     218  }
     219  
     220  /* Bytewise c ? t : e. */
     221  static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
     222  #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
     223     return _mm_blendv_epi8(e,t,c);
     224  #else
     225     return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
     226  #endif
     227  }
     228  
     229  void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
     230     png_const_bytep prev)
     231  {
     232     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     233      * and two pixels from the previous row, b and c:
     234      *   prev: c b
     235      *   row:  a d
     236      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     237      * p=a+b-c.
     238      *
     239      * The first pixel has no left context, and so uses an Up filter, p = b.
     240      * This works naturally with our main loop's p = a+b-c if we force a and c
     241      * to zero.
     242      * Here we zero b and d, which become c and a respectively at the start of
     243      * the loop.
     244      */
     245     size_t rb;
     246     const __m128i zero = _mm_setzero_si128();
     247     __m128i c, b = zero,
     248             a, d = zero;
     249  
     250     png_debug(1, "in png_read_filter_row_paeth3_sse2");
     251  
     252     rb = row_info->rowbytes;
     253     while (rb >= 4) {
     254        /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     255         * intermediates.
     256         */
     257        __m128i pa,pb,pc,smallest,nearest;
     258        c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
     259        a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
     260  
     261        /* (p-a) == (a+b-c - a) == (b-c) */
     262  
     263        pa = _mm_sub_epi16(b,c);
     264  
     265        /* (p-b) == (a+b-c - b) == (a-c) */
     266        pb = _mm_sub_epi16(a,c);
     267  
     268        /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     269        pc = _mm_add_epi16(pa,pb);
     270  
     271        pa = abs_i16(pa);  /* |p-a| */
     272        pb = abs_i16(pb);  /* |p-b| */
     273        pc = abs_i16(pc);  /* |p-c| */
     274  
     275        smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     276  
     277        /* Paeth breaks ties favoring a over b over c. */
     278        nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     279                   if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     280                                                               c));
     281  
     282        /* Note `_epi8`: we need addition to wrap modulo 255. */
     283        d = _mm_add_epi8(d, nearest);
     284        store3(row, _mm_packus_epi16(d,d));
     285  
     286        prev += 3;
     287        row  += 3;
     288        rb   -= 3;
     289     }
     290     if (rb > 0) {
     291        /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     292         * intermediates.
     293         */
     294        __m128i pa,pb,pc,smallest,nearest;
     295        c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
     296        a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
     297  
     298        /* (p-a) == (a+b-c - a) == (b-c) */
     299        pa = _mm_sub_epi16(b,c);
     300  
     301        /* (p-b) == (a+b-c - b) == (a-c) */
     302        pb = _mm_sub_epi16(a,c);
     303  
     304        /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     305        pc = _mm_add_epi16(pa,pb);
     306  
     307        pa = abs_i16(pa);  /* |p-a| */
     308        pb = abs_i16(pb);  /* |p-b| */
     309        pc = abs_i16(pc);  /* |p-c| */
     310  
     311        smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     312  
     313        /* Paeth breaks ties favoring a over b over c. */
     314        nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     315                           if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     316                                                                       c));
     317  
     318        /* Note `_epi8`: we need addition to wrap modulo 255. */
     319        d = _mm_add_epi8(d, nearest);
     320        store3(row, _mm_packus_epi16(d,d));
     321  
     322        prev += 3;
     323        row  += 3;
     324        rb   -= 3;
     325     }
     326  }
     327  
     328  void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
     329     png_const_bytep prev)
     330  {
     331     /* Paeth tries to predict pixel d using the pixel to the left of it, a,
     332      * and two pixels from the previous row, b and c:
     333      *   prev: c b
     334      *   row:  a d
     335      * The Paeth function predicts d to be whichever of a, b, or c is nearest to
     336      * p=a+b-c.
     337      *
     338      * The first pixel has no left context, and so uses an Up filter, p = b.
     339      * This works naturally with our main loop's p = a+b-c if we force a and c
     340      * to zero.
     341      * Here we zero b and d, which become c and a respectively at the start of
     342      * the loop.
     343      */
     344     size_t rb;
     345     const __m128i zero = _mm_setzero_si128();
     346     __m128i pa,pb,pc,smallest,nearest;
     347     __m128i c, b = zero,
     348             a, d = zero;
     349  
     350     png_debug(1, "in png_read_filter_row_paeth4_sse2");
     351  
     352     rb = row_info->rowbytes+4;
     353     while (rb > 4) {
     354        /* It's easiest to do this math (particularly, deal with pc) with 16-bit
     355         * intermediates.
     356         */
     357        c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
     358        a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
     359  
     360        /* (p-a) == (a+b-c - a) == (b-c) */
     361        pa = _mm_sub_epi16(b,c);
     362  
     363        /* (p-b) == (a+b-c - b) == (a-c) */
     364        pb = _mm_sub_epi16(a,c);
     365  
     366        /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
     367        pc = _mm_add_epi16(pa,pb);
     368  
     369        pa = abs_i16(pa);  /* |p-a| */
     370        pb = abs_i16(pb);  /* |p-b| */
     371        pc = abs_i16(pc);  /* |p-c| */
     372  
     373        smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
     374  
     375        /* Paeth breaks ties favoring a over b over c. */
     376        nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
     377                           if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
     378                                                                       c));
     379  
     380        /* Note `_epi8`: we need addition to wrap modulo 255. */
     381        d = _mm_add_epi8(d, nearest);
     382        store4(row, _mm_packus_epi16(d,d));
     383  
     384        prev += 4;
     385        row  += 4;
     386        rb   -= 4;
     387     }
     388  }
     389  
     390  #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
     391  #endif /* READ */