(root)/
coreutils-9.4/
src/
wc_avx2.c
       1  /* wc_avx - Count the number of newlines with avx2 instructions.
       2     Copyright (C) 2021-2023 Free Software Foundation, Inc.
       3  
       4     This program is free software: you can redistribute it and/or modify
       5     it under the terms of the GNU General Public License as published by
       6     the Free Software Foundation, either version 3 of the License, or
       7     (at your option) any later version.
       8  
       9     This program is distributed in the hope that it will be useful,
      10     but WITHOUT ANY WARRANTY; without even the implied warranty of
      11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12     GNU General Public License for more details.
      13  
      14     You should have received a copy of the GNU General Public License
      15     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
      16  
      17  #include <config.h>
      18  
      19  #include "system.h"
      20  #include "safe-read.h"
      21  
      22  #include <x86intrin.h>
      23  
      24  /* This must be below 16 KB (16384) or else the accumulators can
      25     theoretically overflow, producing wrong result. This is 2*32 bytes below,
      26     so there is no single bytes in the optimal case. */
      27  #define BUFSIZE (16320)
      28  
      29  extern bool
      30  wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
      31                 uintmax_t *bytes_out);
      32  
      33  extern bool
      34  wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out,
      35                 uintmax_t *bytes_out)
      36  {
      37    __m256i accumulator;
      38    __m256i accumulator2;
      39    __m256i zeroes;
      40    __m256i endlines;
      41    __m256i avx_buf[BUFSIZE / sizeof (__m256i)];
      42    __m256i *datap;
      43    uintmax_t lines = 0;
      44    uintmax_t bytes = 0;
      45    size_t bytes_read = 0;
      46  
      47  
      48    if (!lines_out || !bytes_out)
      49      return false;
      50  
      51    /* Using two parallel accumulators gave a good performance increase.
      52       Adding a third gave no additional benefit, at least on an
      53       Intel Xeon E3-1231v3.  Maybe on a newer CPU with additional vector
      54       execution engines it would be a win. */
      55    accumulator = _mm256_setzero_si256 ();
      56    accumulator2 = _mm256_setzero_si256 ();
      57    zeroes = _mm256_setzero_si256 ();
      58    endlines = _mm256_set1_epi8 ('\n');
      59  
      60    while ((bytes_read = safe_read (fd, avx_buf, sizeof (avx_buf))) > 0)
      61      {
      62        __m256i to_match;
      63        __m256i to_match2;
      64        __m256i matches;
      65        __m256i matches2;
      66  
      67        if (bytes_read == SAFE_READ_ERROR)
      68          {
      69            error (0, errno, "%s", quotef (file));
      70            return false;
      71          }
      72  
      73        bytes += bytes_read;
      74  
      75        datap = avx_buf;
      76        char *end = ((char *)avx_buf) + bytes_read;
      77  
      78        while (bytes_read >= 64)
      79          {
      80            to_match = _mm256_load_si256 (datap);
      81            to_match2 = _mm256_load_si256 (datap + 1);
      82  
      83            matches = _mm256_cmpeq_epi8 (to_match, endlines);
      84            matches2 = _mm256_cmpeq_epi8 (to_match2, endlines);
      85            /* Compare will set each 8 bit integer in the register to 0xFF
      86               on match.  When we subtract it the 8 bit accumulators
      87               will underflow, so this is equal to adding 1. */
      88            accumulator = _mm256_sub_epi8 (accumulator, matches);
      89            accumulator2 = _mm256_sub_epi8 (accumulator2, matches2);
      90  
      91            datap += 2;
      92            bytes_read -= 64;
      93          }
      94  
      95        /* Horizontally add all 8 bit integers in the register,
      96           and then reset it */
      97        accumulator = _mm256_sad_epu8 (accumulator, zeroes);
      98        lines +=   _mm256_extract_epi16 (accumulator, 0)
      99                 + _mm256_extract_epi16 (accumulator, 4)
     100                 + _mm256_extract_epi16 (accumulator, 8)
     101                 + _mm256_extract_epi16 (accumulator, 12);
     102        accumulator = _mm256_setzero_si256 ();
     103  
     104        accumulator2 = _mm256_sad_epu8 (accumulator2, zeroes);
     105        lines +=   _mm256_extract_epi16 (accumulator2, 0)
     106                 + _mm256_extract_epi16 (accumulator2, 4)
     107                 + _mm256_extract_epi16 (accumulator2, 8)
     108                 + _mm256_extract_epi16 (accumulator2, 12);
     109        accumulator2 = _mm256_setzero_si256 ();
     110  
     111        /* Finish up any left over bytes */
     112        char *p = (char *)datap;
     113        while (p != end)
     114          lines += *p++ == '\n';
     115      }
     116  
     117    *lines_out = lines;
     118    *bytes_out = bytes;
     119  
     120    return true;
     121  }