(root)/
libpng-1.6.40/
arm/
palette_neon_intrinsics.c
       1  
       2  /* palette_neon_intrinsics.c - NEON optimised palette expansion functions
       3   *
       4   * Copyright (c) 2018-2019 Cosmin Truta
       5   * Copyright (c) 2017-2018 Arm Holdings. All rights reserved.
       6   * Written by Richard Townsend <Richard.Townsend@arm.com>, February 2017.
       7   *
       8   * This code is released under the libpng license.
       9   * For conditions of distribution and use, see the disclaimer
      10   * and license in png.h
      11   */
      12  
      13  #include "../pngpriv.h"
      14  
      15  #if PNG_ARM_NEON_IMPLEMENTATION == 1
      16  
      17  #if defined(_MSC_VER) && !defined(__clang__) && defined(_M_ARM64)
      18  #  include <arm64_neon.h>
      19  #else
      20  #  include <arm_neon.h>
      21  #endif
      22  
      23  /* Build an RGBA8 palette from the separate RGB and alpha palettes. */
      24  void
      25  png_riffle_palette_neon(png_structrp png_ptr)
      26  {
      27     png_const_colorp palette = png_ptr->palette;
      28     png_bytep riffled_palette = png_ptr->riffled_palette;
      29     png_const_bytep trans_alpha = png_ptr->trans_alpha;
      30     int num_trans = png_ptr->num_trans;
      31     int i;
      32  
      33     /* Initially black, opaque. */
      34     uint8x16x4_t w = {{
      35        vdupq_n_u8(0x00),
      36        vdupq_n_u8(0x00),
      37        vdupq_n_u8(0x00),
      38        vdupq_n_u8(0xff),
      39     }};
      40  
      41     png_debug(1, "in png_riffle_palette_neon");
      42  
      43     /* First, riffle the RGB colours into an RGBA8 palette.
      44      * The alpha component is set to opaque for now.
      45      */
      46     for (i = 0; i < 256; i += 16)
      47     {
      48        uint8x16x3_t v = vld3q_u8((png_const_bytep)(palette + i));
      49        w.val[0] = v.val[0];
      50        w.val[1] = v.val[1];
      51        w.val[2] = v.val[2];
      52        vst4q_u8(riffled_palette + (i << 2), w);
      53     }
      54  
      55     /* Fix up the missing transparency values. */
      56     for (i = 0; i < num_trans; i++)
      57        riffled_palette[(i << 2) + 3] = trans_alpha[i];
      58  }
      59  
      60  /* Expands a palettized row into RGBA8. */
      61  int
      62  png_do_expand_palette_rgba8_neon(png_structrp png_ptr, png_row_infop row_info,
      63      png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
      64  {
      65     png_uint_32 row_width = row_info->width;
      66     const png_uint_32 *riffled_palette =
      67        (const png_uint_32 *)png_ptr->riffled_palette;
      68     const png_uint_32 pixels_per_chunk = 4;
      69     png_uint_32 i;
      70  
      71     png_debug(1, "in png_do_expand_palette_rgba8_neon");
      72  
      73     PNG_UNUSED(row)
      74     if (row_width < pixels_per_chunk)
      75        return 0;
      76  
      77     /* This function originally gets the last byte of the output row.
      78      * The NEON part writes forward from a given position, so we have
      79      * to seek this back by 4 pixels x 4 bytes.
      80      */
      81     *ddp = *ddp - ((pixels_per_chunk * sizeof(png_uint_32)) - 1);
      82  
      83     for (i = 0; i < row_width; i += pixels_per_chunk)
      84     {
      85        uint32x4_t cur;
      86        png_bytep sp = *ssp - i, dp = *ddp - (i << 2);
      87        cur = vld1q_dup_u32 (riffled_palette + *(sp - 3));
      88        cur = vld1q_lane_u32(riffled_palette + *(sp - 2), cur, 1);
      89        cur = vld1q_lane_u32(riffled_palette + *(sp - 1), cur, 2);
      90        cur = vld1q_lane_u32(riffled_palette + *(sp - 0), cur, 3);
      91        vst1q_u32((void *)dp, cur);
      92     }
      93     if (i != row_width)
      94     {
      95        /* Remove the amount that wasn't processed. */
      96        i -= pixels_per_chunk;
      97     }
      98  
      99     /* Decrement output pointers. */
     100     *ssp = *ssp - i;
     101     *ddp = *ddp - (i << 2);
     102     return i;
     103  }
     104  
     105  /* Expands a palettized row into RGB8. */
     106  int
     107  png_do_expand_palette_rgb8_neon(png_structrp png_ptr, png_row_infop row_info,
     108      png_const_bytep row, png_bytepp ssp, png_bytepp ddp)
     109  {
     110     png_uint_32 row_width = row_info->width;
     111     png_const_bytep palette = (png_const_bytep)png_ptr->palette;
     112     const png_uint_32 pixels_per_chunk = 8;
     113     png_uint_32 i;
     114  
     115     png_debug(1, "in png_do_expand_palette_rgb8_neon");
     116  
     117     PNG_UNUSED(row)
     118     if (row_width <= pixels_per_chunk)
     119        return 0;
     120  
     121     /* Seeking this back by 8 pixels x 3 bytes. */
     122     *ddp = *ddp - ((pixels_per_chunk * sizeof(png_color)) - 1);
     123  
     124     for (i = 0; i < row_width; i += pixels_per_chunk)
     125     {
     126        uint8x8x3_t cur;
     127        png_bytep sp = *ssp - i, dp = *ddp - ((i << 1) + i);
     128        cur = vld3_dup_u8(palette + sizeof(png_color) * (*(sp - 7)));
     129        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 6)), cur, 1);
     130        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 5)), cur, 2);
     131        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 4)), cur, 3);
     132        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 3)), cur, 4);
     133        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 2)), cur, 5);
     134        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 1)), cur, 6);
     135        cur = vld3_lane_u8(palette + sizeof(png_color) * (*(sp - 0)), cur, 7);
     136        vst3_u8((void *)dp, cur);
     137     }
     138  
     139     if (i != row_width)
     140     {
     141        /* Remove the amount that wasn't processed. */
     142        i -= pixels_per_chunk;
     143     }
     144  
     145     /* Decrement output pointers. */
     146     *ssp = *ssp - i;
     147     *ddp = *ddp - ((i << 1) + i);
     148     return i;
     149  }
     150  
     151  #endif /* PNG_ARM_NEON_IMPLEMENTATION */