glibc-2.38/sysdeps/x86/dl-cacheinfo.h
/* Initialize x86 cache info.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

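/* Cache parameters for the descriptor bytes returned by CPUID leaf 0x02
   on Intel CPUs.  M() folds an _SC_* constant to its offset from
   _SC_LEVEL1_ICACHE_SIZE so REL_NAME fits in a byte.  The table is
   sorted by IDX; it is searched with bsearch below.  */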
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

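/* Comparison function for bsearch, ordering intel_02_known entries by
   their descriptor byte.  */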
static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


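/* Decode the four descriptor bytes packed in VALUE, one register of
   CPUID leaf 0x02, and return the cache parameter selected by NAME,
   or 0 if VALUE does not describe it.  */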
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
		  bool *no_level_2_or_3,
		  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
	{
	  *no_level_2_or_3 = true;

	  if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    /* No need to look further.  */
	    break;
	}
      else if (byte == 0xff)
	{
	  /* CPUID leaf 0x4 contains all the information.  We need to
	     iterate over it.  */
	  unsigned int eax;
	  unsigned int ebx;
	  unsigned int ecx;
	  unsigned int edx;

	  unsigned int round = 0;
	  while (1)
	    {
	      __cpuid_count (4, round, eax, ebx, ecx, edx);

	      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
	      if (type == null)
		/* That was the end.  */
		break;

	      unsigned int level = (eax >> 5) & 0x7;

	      if ((level == 1 && type == data
		   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
		  || (level == 1 && type == inst
		      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
		  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
		  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
		{
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size = ways * partitions * line size * sets;
		       each CPUID field holds its value minus 1.  */
		    return (((ebx >> 22) + 1)
			    * (((ebx >> 12) & 0x3ff) + 1)
			    * ((ebx & 0xfff) + 1)
			    * (ecx + 1));
		  if (offset == 1)
		    /* Associativity.  */
		    return (ebx >> 22) + 1;

		  assert (offset == 2);
		  /* Line size.  */
		  return (ebx & 0xfff) + 1;
		}

	      ++round;
	    }
	  /* There is no other cache information anywhere else.  */
	  break;
	}
      else
	{
	  if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    {
	      /* Intel reused this value.  For family 15, model 6 it
		 specifies the 3rd level cache.  Otherwise the 2nd
		 level cache.  */
	      unsigned int family = cpu_features->basic.family;
	      unsigned int model = cpu_features->basic.model;

	      if (family == 15 && model == 6)
		{
		  /* The level 3 cache is encoded for this model like
		     the level 2 cache is for other models.  Pretend
		     the caller asked for the level 2 cache.  */
		  name = (_SC_LEVEL2_CACHE_SIZE
			  + (name - _SC_LEVEL3_CACHE_SIZE));
		  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
		}
	    }

	  struct intel_02_cache_info *found;
	  struct intel_02_cache_info search;

	  search.idx = byte;
	  found = bsearch (&search, intel_02_known, nintel_02_known,
			   sizeof (intel_02_known[0]), intel_02_known_compare);
	  if (found != NULL)
	    {
	      if (found->rel_name == folded_rel_name)
		{
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size.  */
		    return found->size;
		  if (offset == 1)
		    return found->assoc;

		  assert (offset == 2);
		  return found->linesize;
		}

	      if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		*has_level_2 = true;
	    }
	}

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


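/* Look up the cache parameter NAME on Intel CPUs.  Returns -1 if the
   CPU predates CPUID leaf 0x02 or reports no L2/L3 cache, and 0 if the
   parameter is unknown.  */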
static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
	 rounds we have to make.  At least one, the one we are already
	 doing.  */
      if (cnt == 1)
	{
	  max = eax & 0xff;
	  eax &= 0xffffff00;
	}

      /* Process the individual registers' values.  */
      result = intel_check_word (name, eax, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, ebx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, ecx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, edx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


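/* Look up the cache parameter NAME on AMD CPUs via CPUID leaf
   0x8000001D, whose subleaves describe the L1 data, L1 instruction,
   L2 and L3 caches in that order.  */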
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  unsigned int count = 0x1;

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  if (name >= _SC_LEVEL3_CACHE_SIZE)
    count = 0x3;
  else if (name >= _SC_LEVEL2_CACHE_SIZE)
    count = 0x2;
  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
    count = 0x0;

  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);

  switch (name)
    {
    case _SC_LEVEL1_ICACHE_ASSOC:
    case _SC_LEVEL1_DCACHE_ASSOC:
    case _SC_LEVEL2_CACHE_ASSOC:
    case _SC_LEVEL3_CACHE_ASSOC:
      /* Associativity: EBX bits 22-31, encoded as the value minus 1.
	 A zero ECX means CPUID reported no cache at this level.  */
      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_LINESIZE:
    case _SC_LEVEL1_DCACHE_LINESIZE:
    case _SC_LEVEL2_CACHE_LINESIZE:
    case _SC_LEVEL3_CACHE_LINESIZE:
      /* Line size: EBX bits 0-11, encoded as the value minus 1.  */
      return ecx ? (ebx & 0xfff) + 1 : 0;
    case _SC_LEVEL1_ICACHE_SIZE:
    case _SC_LEVEL1_DCACHE_SIZE:
    case _SC_LEVEL2_CACHE_SIZE:
    case _SC_LEVEL3_CACHE_SIZE:
      /* Size = associativity * line size * sets (ECX + 1).  */
      return (ecx
	      ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1)
	      : 0);
    default:
      __builtin_unreachable ();
    }
  return -1;
}


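/* Look up the cache parameter NAME on Zhaoxin CPUs by iterating over
   CPUID leaf 4, or return 0 if nothing matches.  */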
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
        || (level == 1 && type == inst
            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size = ways * partitions * line size * sets;
               each CPUID field holds its value minus 1.  */
            return (((ebx >> 22) + 1)
                * (((ebx >> 12) & 0x3ff) + 1)
                * ((ebx & 0xfff) + 1)
                * (ecx + 1));
          if (offset == 1)
            /* Associativity.  */
            return (ebx >> 22) + 1;

          assert (offset == 2);
          /* Line size.  */
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

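/* Compute the total and per-thread size of the highest-level shared
   cache and the number of logical processors sharing it, starting from
   the L3 (or, failing that, L2) size already in *SHARED_PTR and the L2
   size in CORE.  Results are returned through the three pointers.  */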
static void
get_common_cache_info (long int *shared_ptr, long int *shared_per_thread_ptr,
                       unsigned int *threads_ptr, long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  long int shared_per_thread = *shared_per_thread_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level  = 2;
      shared = core;
      shared_per_thread = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                  default:
                    break;
                  case 2:
                    if ((check & 0x1))
                      {
                        /* Get maximum number of logical processors
                           sharing L2 cache.  */
                        threads_l2 = (eax >> 14) & 0x3ff;
                        check &= ~0x1;
                      }
                    break;
                  case 3:
                    if ((check & (0x1 << 1)))
                      {
                        /* Get maximum number of logical processors
                           sharing L3 cache.  */
                        threads_l3 = (eax >> 14) & 0x3ff;

                        /* Check if L2 and L3 caches are inclusive.  */
                        inclusive_cache = (edx & 0x2) != 0;
                        check &= ~(0x1 << 1);
                      }
                    break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask: BSR yields the index of
                             the highest set bit in THREADS_L2, so the
                             mask covers all bits up to and including it.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                        case 0x37:
                        case 0x4a:
                        case 0x4d:
                        case 0x5a:
                        case 0x5d:
                          /* Silvermont has L2 cache shared by 2 cores.  */
                          threads = 2;
                          break;
                        default:
                          break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
                     & 0xff);

          /* Get per-thread size of highest level cache.  */
          if (shared_per_thread > 0 && threads > 0)
            shared_per_thread /= threads;
        }
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
      shared_per_thread += core_per_thread;
      shared += core;
    }

  *shared_ptr = shared;
  *shared_per_thread_ptr = shared_per_thread;
  *threads_ptr = threads;
}

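/* Detect the processor kind and fill in the cache and threshold
   members of CPU_FEATURES.  */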
static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor we have.  */
  long int data = -1;
  long int shared = -1;
  long int shared_per_thread = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
      shared_per_thread = shared;

      level1_icache_size
	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
	= handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
	= handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
	= handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      if (shared <= 0)
	/* No shared L3 cache.  All we have is the L2 cache.  */
	shared = core;

      if (shared_per_thread <= 0)
	shared_per_thread = shared;
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  unsigned long int cachesize_non_temporal_divisor
      = cpu_features->cachesize_non_temporal_divisor;
  if (cachesize_non_temporal_divisor <= 0)
    cachesize_non_temporal_divisor = 4;

  /* The default setting for the non_temporal threshold is [1/8, 1/2] of the
     size of the chip's cache (depending on `cachesize_non_temporal_divisor`,
     which is microarch specific; the default is 1/4).  For most Intel
     processors with an initial release date between 2017 and 2023, a
     thread's typical share of the cache is 18-64MB.  Using a reasonable
     fraction of L3 is meant to estimate the point where non-temporal stores
     begin out-competing REP MOVSB, as well as the point where a copy is
     large enough that most of its lines would have been forced out to main
     memory anyway.  Note, concerns about the entire L3 cache being evicted
     by the copy are mostly alleviated by the fact that modern HW detects
     streaming patterns and provides proper LRU hints, so the maximum
     thrashing is capped at 1/associativity.  */
  unsigned long int non_temporal_threshold
      = shared / cachesize_non_temporal_divisor;

  /* If the computed non_temporal_threshold is less than 3/4 of the
     per-thread L3 size, we most likely have incorrect/incomplete cache
     info; in that case, default to 3/4 of the per-thread L3 to avoid
     regressions.  */
  unsigned long int non_temporal_threshold_lowbound
      = shared_per_thread * 3 / 4;
  if (non_temporal_threshold < non_temporal_threshold_lowbound)
    non_temporal_threshold = non_temporal_threshold_lowbound;

  /* If there is no ERMS, use the per-thread L3 chunking.  Normal cacheable
     stores run a higher risk of actually thrashing the cache, as they don't
     have a HW LRU hint.  Their performance in highly parallel situations is
     also noticeably worse.  */
  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    non_temporal_threshold = non_temporal_threshold_lowbound;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value
     of `x86_non_temporal_threshold` by `LOG_4X_MEMCPY_THRESH` (4), and it is
     best if that operation cannot overflow.  Minimum of 0x4040 (16448)
     because the L(large_memset_4x) loops need 64 bytes to cache align and
     enough space for at least one iteration of the 4x PAGE_SIZE unrolled
     loop.  Both values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;

  /* If `non_temporal_threshold` is less than
     `minimum_non_temporal_threshold`, it most likely means we failed to
     detect the cache info.  We don't want to default to
     `minimum_non_temporal_threshold` because such a small value, while
     correct, has bad performance.  We default to 64MB as a reasonable
     bound.  64MB is likely conservative in that most/all systems would
     choose a lower value, so it should never force non-temporal stores
     when they otherwise wouldn't be used.  */
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = 64 * 1024 * 1024;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;

  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
      minimum_rep_movsb_threshold = 64 * 8;
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
				    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
      minimum_rep_movsb_threshold = 32 * 8;
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
      minimum_rep_movsb_threshold = 16 * 8;
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
				     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
			   minimum_non_temporal_threshold,
			   maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
			   minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
			   SIZE_MAX);

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture, and it performs poorly on data above the L2 cache
     size.  Hence, add an upper-bound threshold to limit the use of
     Enhanced REP MOVSB operations, and set it to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For architectures other than AMD, set the upper bound of ERMS to
     the computed value of the non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}