1  /* mpn_invert_limb -- Invert a normalized limb.
       2  
       3  Copyright 1991, 2000, 2001 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
      31  #include "gmp-impl.h"
      32  #include "longlong.h"
      33  
      34  /*
      35    This is needed to make configure define HAVE_NATIVE_mpn_invert_limb:
      36    PROLOGUE(mpn_invert_limb)
      37  */
      38  
      39  static const unsigned short int approx_tab[0x100] =
      40  {
      41    /* 0x400, */
      42    0x3ff,
      43           0x3fc, 0x3f8, 0x3f4, 0x3f0, 0x3ec, 0x3e8, 0x3e4,
      44    0x3e0, 0x3dd, 0x3d9, 0x3d5, 0x3d2, 0x3ce, 0x3ca, 0x3c7,
      45    0x3c3, 0x3c0, 0x3bc, 0x3b9, 0x3b5, 0x3b2, 0x3ae, 0x3ab,
      46    0x3a8, 0x3a4, 0x3a1, 0x39e, 0x39b, 0x397, 0x394, 0x391,
      47    0x38e, 0x38b, 0x387, 0x384, 0x381, 0x37e, 0x37b, 0x378,
      48    0x375, 0x372, 0x36f, 0x36c, 0x369, 0x366, 0x364, 0x361,
      49    0x35e, 0x35b, 0x358, 0x355, 0x353, 0x350, 0x34d, 0x34a,
      50    0x348, 0x345, 0x342, 0x340, 0x33d, 0x33a, 0x338, 0x335,
      51    0x333, 0x330, 0x32e, 0x32b, 0x329, 0x326, 0x324, 0x321,
      52    0x31f, 0x31c, 0x31a, 0x317, 0x315, 0x313, 0x310, 0x30e,
      53    0x30c, 0x309, 0x307, 0x305, 0x303, 0x300, 0x2fe, 0x2fc,
      54    0x2fa, 0x2f7, 0x2f5, 0x2f3, 0x2f1, 0x2ef, 0x2ec, 0x2ea,
      55    0x2e8, 0x2e6, 0x2e4, 0x2e2, 0x2e0, 0x2de, 0x2dc, 0x2da,
      56    0x2d8, 0x2d6, 0x2d4, 0x2d2, 0x2d0, 0x2ce, 0x2cc, 0x2ca,
      57    0x2c8, 0x2c6, 0x2c4, 0x2c2, 0x2c0, 0x2be, 0x2bc, 0x2bb,
      58    0x2b9, 0x2b7, 0x2b5, 0x2b3, 0x2b1, 0x2b0, 0x2ae, 0x2ac,
      59    0x2aa, 0x2a8, 0x2a7, 0x2a5, 0x2a3, 0x2a1, 0x2a0, 0x29e,
      60    0x29c, 0x29b, 0x299, 0x297, 0x295, 0x294, 0x292, 0x291,
      61    0x28f, 0x28d, 0x28c, 0x28a, 0x288, 0x287, 0x285, 0x284,
      62    0x282, 0x280, 0x27f, 0x27d, 0x27c, 0x27a, 0x279, 0x277,
      63    0x276, 0x274, 0x273, 0x271, 0x270, 0x26e, 0x26d, 0x26b,
      64    0x26a, 0x268, 0x267, 0x265, 0x264, 0x263, 0x261, 0x260,
      65    0x25e, 0x25d, 0x25c, 0x25a, 0x259, 0x257, 0x256, 0x255,
      66    0x253, 0x252, 0x251, 0x24f, 0x24e, 0x24d, 0x24b, 0x24a,
      67    0x249, 0x247, 0x246, 0x245, 0x243, 0x242, 0x241, 0x240,
      68    0x23e, 0x23d, 0x23c, 0x23b, 0x239, 0x238, 0x237, 0x236,
      69    0x234, 0x233, 0x232, 0x231, 0x230, 0x22e, 0x22d, 0x22c,
      70    0x22b, 0x22a, 0x229, 0x227, 0x226, 0x225, 0x224, 0x223,
      71    0x222, 0x220, 0x21f, 0x21e, 0x21d, 0x21c, 0x21b, 0x21a,
      72    0x219, 0x218, 0x216, 0x215, 0x214, 0x213, 0x212, 0x211,
      73    0x210, 0x20f, 0x20e, 0x20d, 0x20c, 0x20b, 0x20a, 0x209,
      74    0x208, 0x207, 0x206, 0x205, 0x204, 0x203, 0x202, 0x201,
      75  };
      76  
      77  /* iteration: z = 2z-(z**2)d */
      78  
      79  mp_limb_t
      80  mpn_invert_limb (mp_limb_t d)
      81  {
      82    mp_limb_t z, z2l, z2h, tl, th;
      83    mp_limb_t xh, xl;
      84    mp_limb_t zh, zl;
      85  
      86  #if GMP_LIMB_BITS == 32
      87    z = approx_tab[(d >> 23) - 0x100] << 6;	/* z < 2^16 */
      88  
      89    z2l = z * z;					/* z2l < 2^32 */
      90    umul_ppmm (th, tl, z2l, d);
      91    z = (z << 17) - (th << 1);
      92  #endif
      93  #if GMP_LIMB_BITS == 64
      94    z = approx_tab[(d >> 55) - 0x100] << 6;	/* z < 2^16 */
      95  
      96    z2l = z * z;					/* z2l < 2^32 */
      97    th = z2l * (d >> 32);				/* th < 2^64 */
      98    z = (z << 17) - (th >> 31);			/* z < 2^32 */
      99  
     100    z2l = z * z;
     101    umul_ppmm (th, tl, z2l, d);
     102    z = (z << 33) - (th << 1);
     103  #endif
     104  
     105    umul_ppmm (z2h, z2l, z, z);
     106    umul_ppmm (th, tl, z2h, d);
     107    umul_ppmm (xh, xl, z2l, d);
     108    tl += xh;
     109    th += tl < xh;
     110    th = (th << 2) | (tl >> GMP_LIMB_BITS - 2);
     111    tl = tl << 2;
     112    sub_ddmmss (zh, zl, z << 2, 0, th, tl);
     113  
     114    umul_ppmm (xh, xl, d, zh);
     115    xh += d;		/* add_ssaaaa (xh, xl, xh, xl, d, 0); */
     116    if (~xh != 0)
     117      {
     118        add_ssaaaa (xh, xl, xh, xl, 0, d);
     119        zh++;
     120      }
     121  
     122    add_ssaaaa (xh, xl, xh, xl, 0, d);
     123    if (xh != 0)
     124      zh++;
     125  
     126    return zh;
     127  }