1  /* Copyright (C) 2019-2023 Free Software Foundation, Inc.
       2  
       3     This file is part of LIBF7, which is part of GCC.
       4  
       5     GCC is free software; you can redistribute it and/or modify it under
       6     the terms of the GNU General Public License as published by the Free
       7     Software Foundation; either version 3, or (at your option) any later
       8     version.
       9  
      10     GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      11     WARRANTY; without even the implied warranty of MERCHANTABILITY or
      12     FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      13     for more details.
      14  
      15     Under Section 7 of GPL version 3, you are granted additional
      16     permissions described in the GCC Runtime Library Exception, version
      17     3.1, as published by the Free Software Foundation.
      18  
      19     You should have received a copy of the GNU General Public License and
      20     a copy of the GCC Runtime Library Exception along with this program;
      21     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      22     <http://www.gnu.org/licenses/>.  */
      23  
      24  #include "libf7.h"
      25  
      26  #ifndef __AVR_TINY__
      27  
      28  #define ALIAS(X, Y) \
      29    F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
      30  
      31  #define DALIAS(...) // empty
      32  #define LALIAS(...) // empty
      33  
      34  #ifndef IN_LIBGCC2
      35  
      36  #include <stdio.h>
      37  #include <assert.h>
      38  
      39  #define in_libgcc false
      40  
      41  _Static_assert (sizeof (f7_t) == 10 && F7_MANT_BYTES == 7,
      42  		"libf7 will only work with 7-byte mantissa.");
      43  #else
      44  
      45  #define in_libgcc true
      46  
      47  #if __SIZEOF_DOUBLE__ == 8
      48  #undef  DALIAS
      49  #define DALIAS(X,Y) \
      50    F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
      51  #endif
      52  
      53  #if __SIZEOF_LONG_DOUBLE__ == 8
      54  #undef  LALIAS
      55  #define LALIAS(X,Y) \
      56    F7_WEAK __attribute__((__alias__(F7_STRINGY(X)))) __typeof__(X) Y;
      57  #endif
      58  
      59  #endif // in libgcc
      60  
      61  static F7_INLINE
      62  void f7_assert (bool x)
      63  {
      64    if (!in_libgcc && !x)
      65      __builtin_abort();
      66  }
      67  
      68  static F7_INLINE
      69  int16_t abs_ssat16 (int16_t a)
      70  {
      71    _Sat _Fract sa = __builtin_avr_rbits (a);
      72    return __builtin_avr_bitsr (__builtin_avr_absr (sa));
      73  }
      74  
      75  static F7_INLINE
      76  int16_t add_ssat16 (int16_t a, int16_t b)
      77  {
      78    _Sat _Fract sa = __builtin_avr_rbits (a);
      79    _Sat _Fract sb = __builtin_avr_rbits (b);
      80    return __builtin_avr_bitsr (sa + sb);
      81  }
      82  
      83  static F7_INLINE
      84  int16_t sub_ssat16 (int16_t a, int16_t b)
      85  {
      86    _Sat _Fract sa = __builtin_avr_rbits (a);
      87    _Sat _Fract sb = __builtin_avr_rbits (b);
      88    return __builtin_avr_bitsr (sa - sb);
      89  }
      90  
      91  static F7_INLINE
      92  int8_t ssat8_range (int16_t a, int8_t range)
      93  {
      94    if (a >= range)
      95      return range;
      96    if (a <= -range)
      97      return -range;
      98    return a;
      99  }
     100  
     101  
     102  #define IN_LIBF7_H
     103    #define F7_CONST_DEF(NAME, FLAGS, M6, M5, M4, M3, M2, M1, M0, EXPO) \
     104      F7_UNUSED static const uint8_t F7_(const_##NAME##_msb)  = M6;     \
     105      F7_UNUSED static const int16_t F7_(const_##NAME##_expo) = EXPO;
     106    #include "libf7-const.def"
     107    #undef F7_CONST_DEF
     108  #undef IN_LIBF7_H
     109  
     110  
     111  /*
     112    libgcc naming converntions for conversions:
     113  
     114     __float<fmode><fmode>  : Convert float modes.
     115     __floatun<imode><fmode>: Convert unsigned integral to float.
     116     __fix<fmode><imode>    : Convert float to signed integral.
     117     __fixuns<fmode><imode> : Convert float to unsigned integral.
     118  */
     119  
     120  
     121  #ifdef F7MOD_floatundidf_
     122  F7_WEAK
     123  f7_double_t __floatundidf (uint64_t x)
     124  {
     125    f7_t xx;
     126    f7_set_u64 (&xx, x);
     127    return f7_get_double (&xx);
     128  }
     129  #endif // F7MOD_floatundidf_
     130  
     131  
     132  #ifdef F7MOD_floatdidf_
     133  F7_WEAK
     134  f7_double_t __floatdidf (int64_t x)
     135  {
     136    f7_t xx;
     137    f7_set_s64 (&xx, x);
     138    return f7_get_double (&xx);
     139  }
     140  #endif // F7MOD_floatdidf_
     141  
     142  
     143  #ifdef F7MOD_init_
     144  f7_t* f7_init_impl (uint64_t mant, uint8_t flags, f7_t *cc, int16_t expo)
     145  {
     146    flags &= F7_FLAGS;
     147    if (f7_class_number (flags))
     148      {
     149        uint8_t msb;
     150        while ((__builtin_memcpy (&msb, (uint8_t*) &mant + 7, 1), msb))
     151  	{
     152  	  mant >>= 1;
     153  	  expo = add_ssat16 (expo, 1);
     154  	}
     155        *(uint64_t*) cc->mant = mant;
     156        cc->expo = add_ssat16 (expo, F7_MANT_BITS-1);
     157  
     158        cc = f7_normalize_asm (cc);
     159      }
     160  
     161    cc->flags = flags;
     162  
     163    return cc;
     164  }
     165  #endif // F7MOD_init_
     166  
     167  
     168  #ifdef F7MOD_set_s16_
     169  f7_t* f7_set_s16_impl (f7_t *cc, int16_t i16)
     170  {
     171    uint16_t u16 = (uint16_t) i16;
     172    uint8_t flags = 0;
     173    if (i16 < 0)
     174      {
     175        u16 = -u16;
     176        flags = F7_FLAG_sign;
     177      }
     178    f7_set_u16_impl (cc, u16);
     179    cc->flags = flags;
     180    return cc;
     181  }
     182  #endif // F7MOD_set_s16_
     183  
     184  
     185  #ifdef F7MOD_set_u16_
     186  f7_t* f7_set_u16_impl (f7_t *cc, uint16_t u16)
     187  {
     188    f7_clr (cc);
     189    F7_MANT_HI2 (cc) = u16;
     190    cc->expo = 15;
     191    return f7_normalize_asm (cc);
     192  }
     193  #endif // F7MOD_set_u16_
     194  
     195  
     196  #ifdef F7MOD_set_s32_
     197  f7_t* f7_set_s32 (f7_t *cc, int32_t i32)
     198  {
     199    uint32_t u32 = (uint32_t) i32;
     200    uint8_t flags = 0;
     201    if (i32 < 0)
     202      {
     203        u32 = -u32;
     204        flags = F7_FLAG_sign;
     205      }
     206    cc = f7_set_u32 (cc, u32);
     207    cc->flags = flags;
     208    return cc;
     209  }
     210  ALIAS (f7_set_s32, f7_floatsidf)
     211  #endif // F7MOD_set_s32_
     212  
     213  
     214  #ifdef F7MOD_set_u32_
     215  f7_t* f7_set_u32 (f7_t *cc, uint32_t u32)
     216  {
     217    f7_clr (cc);
     218    F7_MANT_HI4 (cc) = u32;
     219    cc->expo = 31;
     220    return f7_normalize_asm (cc);
     221  }
     222  ALIAS (f7_set_u32, f7_floatunsidf)
     223  #endif // F7MOD_set_u32_
     224  
     225  
     226  // IEEE 754 single
     227  // float =  s  bbbbbbbb mmmmmmmmmmmmmmmmmmmmmmm
     228  //	   31
     229  // s = sign
     230  // b = biased exponent, bias = 127
     231  // m = mantissa
     232  
     233  // +0	      =	 0 0 0
     234  // -0	      =	 1 0 0
     235  // Inf	      =	 S B 0	=  S * Inf, B = 0xff
     236  // NaN	      =	 S B M,	   B = 0xff, M != 0
     237  // Sub-normal =	 S 0 M	=  S * 0.M * 2^{1 - bias}, M != 0
     238  // Normal     =  S B M  =  S * 1.M * 2^{B - bias}, B = 1 ... 0xfe
     239  
     240  #define FLT_DIG_EXP   8
     241  #define FLT_DIG_MANT  (31 - FLT_DIG_EXP)
     242  #define FLT_MAX_EXP   ((1 << FLT_DIG_EXP) - 1)
     243  #define FLT_EXP_BIAS  (FLT_MAX_EXP >> 1)
     244  
     245  #ifdef F7MOD_set_float_
     246  F7_WEAK
     247  void f7_set_float (f7_t *cc, float f)
     248  {
     249    uint32_t val32;
     250  
     251    _Static_assert (__SIZEOF_FLOAT__ == 4, "");
     252    _Static_assert (__FLT_MANT_DIG__ == 24, "");
     253    __builtin_memcpy (&val32, &f, __SIZEOF_FLOAT__);
     254  
     255    uint16_t val16 = val32 >> 16;
     256    val16 >>= FLT_DIG_MANT - 16;
     257  
     258    uint8_t expo_biased = val16 & FLT_MAX_EXP;
     259    bool sign = val16 & (1u << FLT_DIG_EXP);
     260  
     261    f7_clr (cc);
     262  
     263    uint32_t mant = val32 & ((1ul << FLT_DIG_MANT) -1);
     264  
     265    if (mant == 0)
     266      {
     267        if (expo_biased == 0)
     268  	return;
     269        if (expo_biased >= FLT_MAX_EXP)
     270  	return f7_set_inf (cc, sign);
     271      }
     272  
     273    if (expo_biased == 0)
     274      expo_biased = 1;   // Sub-normal: biased expo of 1 was encoded as 0.
     275    else if (expo_biased < FLT_MAX_EXP)
     276      mant |= (1ul << FLT_DIG_MANT);
     277    else
     278      return f7_set_nan (cc);
     279  
     280    __builtin_memcpy (& F7_MANT_HI4 (cc), &mant, 4);
     281  
     282    cc->expo = expo_biased - FLT_EXP_BIAS + 31 - FLT_DIG_MANT;
     283    f7_normalize_asm (cc);
     284    f7_set_sign (cc, sign);
     285  }
     286  ALIAS (f7_set_float, f7_extendsfdf2)
     287  #endif // F7MOD_set_float_
     288  
     289  
     290  #ifdef F7MOD_get_float_
     291  static F7_INLINE
     292  float make_float (uint32_t x)
     293  {
     294    float ff;
     295    __builtin_memcpy (&ff, &x, 4);
     296    return ff;
     297  
     298  }
     299  
     300  F7_WEAK
     301  float f7_get_float (const f7_t *aa)
     302  {
     303    uint8_t a_class = f7_classify (aa);
     304  
     305    if (f7_class_nan (a_class))
     306      return make_float (0xffc00000 /* NaN: Biased expo = 0xff, mant != 0 */);
     307  
     308    uint32_t mant;
     309    __builtin_memcpy (&mant, &F7_MANT_CONST_HI4 (aa), 4);
     310  
     311    uint8_t expo8 = 0;
     312    uint8_t mant_offset = FLT_DIG_EXP;
     313    int16_t c_expo = add_ssat16 (aa->expo, FLT_EXP_BIAS);
     314  
     315    if (f7_class_zero (a_class) || c_expo <= -FLT_DIG_MANT)
     316      {
     317        // Zero or tiny.
     318        return 0.0f;
     319      }
     320    else if (c_expo >= FLT_MAX_EXP || f7_class_inf (a_class))
     321      {
     322        // Inf or overflow.
     323        expo8 = FLT_MAX_EXP;
     324        mant = 0;
     325      }
     326    else if (c_expo > 0)
     327      {
     328        // Normal.
     329        expo8 = c_expo;
     330      }
     331    else
     332      {
     333        // Sub-normal:  -DIG_MANT < c_expo <= 0.
     334        // Encoding of 0 represents a biased exponent of 1.
     335        // mant_offset in 9...31.
     336        expo8 = 0;
     337        mant_offset += 1 - c_expo;
     338      }
     339  
     340    uint16_t expo16 = expo8 << (FLT_DIG_MANT - 16);
     341  
     342    if (f7_class_sign (a_class))
     343      expo16 |= 1u << (FLT_DIG_EXP + FLT_DIG_MANT - 16);
     344  
     345    mant >>= mant_offset;
     346  
     347    __asm ("cbr %T0%t2, 1 << (7 & %2)"  "\n\t"
     348  	 "or  %C0, %A1"		      "\n\t"
     349  	 "or  %D0, %B1"
     350  	 : "+d" (mant)
     351  	 : "r" (expo16), "n" (FLT_DIG_MANT));
     352  
     353    return make_float (mant);
     354  }
     355  F7_PURE ALIAS (f7_get_float, f7_truncdfsf2)
     356  #endif // F7MOD_get_float_
     357  
     358  #define DBL_DIG_EXP   11
     359  #define DBL_DIG_MANT  (63 - DBL_DIG_EXP)
     360  #define DBL_MAX_EXP   ((1 << DBL_DIG_EXP) - 1)
     361  #define DBL_EXP_BIAS  (DBL_MAX_EXP >> 1)
     362  
     363  
     364  #ifdef F7MOD_set_double_
     365  void f7_set_double_impl (f7_double_t val64, f7_t *cc)
     366  {
     367    f7_clr (cc);
     368    register uint64_t mant __asm ("r18") = val64 & ((1ull << DBL_DIG_MANT) -1);
     369  
     370    uint16_t val16 = 3[(uint16_t*) & val64];
     371    val16 >>= DBL_DIG_MANT - 48;
     372  
     373    uint16_t expo_biased = val16 & DBL_MAX_EXP;
     374    bool sign = val16 & (1u << DBL_DIG_EXP);
     375  
     376    if (mant == 0)
     377      {
     378        if (expo_biased == 0)
     379  	return;
     380        if (expo_biased >= DBL_MAX_EXP)
     381  	return f7_set_inf (cc, sign);
     382      }
     383    __asm ("" : "+r" (mant));
     384  
     385    if (expo_biased == 0)
     386      expo_biased = 1;   // Sub-normal: biased expo of 1 was encoded as 0.
     387    else if (expo_biased < DBL_MAX_EXP)
     388      mant |= (1ull << DBL_DIG_MANT);
     389    else
     390      return f7_set_nan (cc);
     391  
     392    *(uint64_t*) & cc->mant = mant;
     393  
     394    cc->expo = expo_biased - DBL_EXP_BIAS + 63 - DBL_DIG_MANT - 8;
     395    f7_normalize_asm (cc);
     396    f7_set_sign (cc, sign);
     397  }
     398  #endif // F7MOD_set_double_
     399  
     400  
     401  #ifdef F7MOD_set_pdouble_
     402  void f7_set_pdouble (f7_t *cc, const f7_double_t *val64)
     403  {
     404    f7_set_double (cc, *val64);
     405  }
     406  #endif // F7MOD_set_pdouble_
     407  
     408  
     409  #ifdef F7MOD_get_double_
     410  static F7_INLINE
     411  uint64_t clr_r18 (void)
     412  {
     413    extern void __clr_8 (void);
     414    register uint64_t r18 __asm ("r18");
     415    __asm ("%~call %x[f]" : "=r" (r18) : [f] "i" (__clr_8));
     416    return r18;
     417  }
     418  
     419  static F7_INLINE
     420  f7_double_t make_double (uint64_t x)
     421  {
     422    register f7_double_t r18 __asm ("r18") = x;
     423    __asm ("" : "+r" (r18));
     424    return r18;
     425  }
     426  
     427  F7_WEAK
     428  f7_double_t f7_get_double (const f7_t *aa)
     429  {
     430    uint8_t a_class = f7_classify (aa);
     431  
     432    if (f7_class_nan (a_class))
     433      {
     434        uint64_t nan = clr_r18() | (0x7fffull << 48);
     435        return make_double (nan);
     436      }
     437  
     438    uint64_t mant;
     439    __builtin_memcpy (&mant, & aa->mant, 8);
     440  
     441    mant &= 0x00ffffffffffffff;
     442  
     443    // FIXME: For subnormals, rounding is premature and should be
     444    //	    done *after* the mantissa has been shifted into place
     445    //	    (or the round value be shifted left accordingly).
     446    // Round.
     447    mant += 1u << (F7_MANT_BITS - (1 + DBL_DIG_MANT) - 1);
     448  
     449    uint8_t dex;
     450    register uint64_t r18 __asm ("r18") = mant;
     451    // dex = Overflow ? 1 : 0.
     452    __asm ("bst %T[mant]%T[bitno]"  "\n\t"
     453  	 "clr %0"		  "\n\t"
     454  	 "bld %0,0"
     455  	 : "=r" (dex), [mant] "+r" (r18)
     456  	 : [bitno] "n" (64 - 8));
     457  
     458    mant = r18 >> dex;
     459  
     460    uint16_t expo16 = 0;
     461    uint16_t mant_offset = DBL_DIG_EXP - 8;
     462    int16_t c_expo = add_ssat16 (aa->expo, DBL_EXP_BIAS + dex);
     463  
     464    if (f7_class_zero (a_class) || c_expo <= -DBL_DIG_MANT)
     465      {
     466        // Zero or tiny.
     467        return make_double (clr_r18());
     468      }
     469    else if (c_expo >= DBL_MAX_EXP || f7_class_inf (a_class))
     470      {
     471        // Inf or overflow.
     472        expo16 = DBL_MAX_EXP;
     473        mant = clr_r18();
     474      }
     475    else if (c_expo > 0)
     476      {
     477        // Normal.
     478        expo16 = c_expo;
     479      }
     480    else
     481      {
     482        // Sub-normal:  -DIG_MANT < c_expo <= 0.
     483        // Encoding expo of 0 represents a biased exponent of 1.
     484        // mant_offset in 5...55 = 63-8.
     485        mant_offset += 1 - c_expo;
     486      }
     487  
     488    expo16 <<= (DBL_DIG_MANT - 48);
     489  
     490    if (f7_class_sign (a_class))
     491      expo16 |= 1u << (DBL_DIG_EXP + DBL_DIG_MANT - 48);
     492  
     493    // mant >>= mant_offset;
     494    mant = f7_lshrdi3 (mant, mant_offset);
     495  
     496    r18 = mant;
     497    __asm ("cbr %T0%t2, 1 << (7 & %2)"  "\n\t"
     498  	 "or  %r0+6, %A1"	      "\n\t"
     499  	 "or  %r0+7, %B1"
     500  	 : "+r" (r18)
     501  	 : "r" (expo16), "n" (DBL_DIG_MANT));
     502  
     503    return make_double (r18);
     504  }
     505  #endif // F7MOD_get_double_
     506  
     507  
     508  #ifdef F7MOD_fabs_
     509  F7_WEAK
     510  void f7_fabs (f7_t *cc, const f7_t *aa)
     511  {
     512    f7_abs (cc, aa);
     513  }
     514  #endif // F7MOD_fabs_
     515  
     516  
     517  #ifdef F7MOD_neg_
     518  F7_WEAK
     519  f7_t* f7_neg (f7_t *cc, const f7_t *aa)
     520  {
     521    f7_copy (cc, aa);
     522  
     523    uint8_t c_class = f7_classify (cc);
     524  
     525    if (! f7_class_zero (c_class))
     526      cc->sign = ! f7_class_sign (c_class);
     527  
     528    return cc;
     529  }
     530  #endif // F7MOD_neg_
     531  
     532  
     533  #ifdef F7MOD_frexp_
     534  F7_WEAK
     535  void f7_frexp (f7_t *cc, const f7_t *aa, int *expo)
     536  {
     537    uint8_t a_class = f7_classify (aa);
     538  
     539    if (f7_class_nan (a_class))
     540      return f7_set_nan (cc);
     541  
     542    if (f7_class_inf (a_class) || aa->expo == INT16_MAX)
     543      return f7_set_inf (cc, f7_class_sign (a_class));
     544  
     545    if (! f7_msbit (aa))
     546      {
     547        *expo = 0;
     548        return f7_clr (cc);
     549      }
     550  
     551    *expo = 1 + aa->expo;
     552    cc->flags = a_class & F7_FLAG_sign;
     553    cc->expo = -1;
     554    f7_copy_mant (cc, aa);
     555  }
     556  #endif // F7MOD_frexp_
     557  
     558  #ifdef F7MOD_get_s16_
     559  F7_WEAK
     560  int16_t f7_get_s16 (const f7_t *aa)
     561  {
     562    extern int16_t to_s16 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
     563    return to_s16 (aa, 0xf);
     564  }
     565  #endif // F7MOD_get_s16_
     566  
     567  
     568  #ifdef F7MOD_get_s32_
     569  F7_WEAK
     570  int32_t f7_get_s32 (const f7_t *aa)
     571  {
     572    extern int32_t to_s32 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
     573    return to_s32 (aa, 0x1f);
     574  }
     575  F7_PURE ALIAS (f7_get_s32, f7_fixdfsi)
     576  #endif // F7MOD_get_s32_
     577  
     578  
     579  #ifdef F7MOD_get_s64_
     580    F7_WEAK
     581    int64_t f7_get_s64 (const f7_t *aa)
     582  {
     583    extern int64_t to_s64 (const f7_t*, uint8_t) F7ASM(f7_to_integer_asm);
     584    return to_s64 (aa, 0x3f);
     585  }
     586  F7_PURE ALIAS (f7_get_s64, f7_fixdfdi)
     587  #endif // F7MOD_get_s64_
     588  
     589  #ifdef F7MOD_get_u16_
     590    F7_WEAK
     591    uint16_t f7_get_u16 (const f7_t *aa)
     592  {
     593    extern uint16_t to_u16 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
     594    return to_u16 (aa, 0xf);
     595  }
     596  #endif // F7MOD_get_u16_
     597  
     598  
     599  #ifdef F7MOD_get_u32_
     600  F7_WEAK
     601  uint32_t f7_get_u32 (const f7_t *aa)
     602  {
     603    extern uint32_t to_u32 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
     604    return to_u32 (aa, 0x1f);
     605  }
     606  F7_PURE ALIAS (f7_get_u32, f7_fixunsdfsi)
     607  #endif // F7MOD_get_u32_
     608  
     609  
     610  #ifdef F7MOD_get_u64_
     611  F7_WEAK
     612  uint64_t f7_get_u64 (const f7_t *aa)
     613  {
     614    extern int64_t to_u64 (const f7_t*, uint8_t) F7ASM(f7_to_unsigned_asm);
     615    return to_u64 (aa, 0x3f);
     616  }
     617  F7_PURE ALIAS (f7_get_u64, f7_fixunsdfdi)
     618  #endif // F7MOD_get_u64_
     619  
     620  
     621  #ifdef F7MOD_cmp_unordered_
     622  F7_NOINLINE
     623  static int8_t cmp_u8 (uint8_t a_class, uint8_t b_class, bool sign_a);
     624  
     625  F7_WEAK
     626  int8_t f7_cmp_unordered (const f7_t *aa, const f7_t *bb, bool with_sign)
     627  {
     628    uint8_t a_class = f7_classify (aa);
     629    uint8_t b_class = f7_classify (bb);
     630  
     631    uint8_t a_sign = f7_class_sign (a_class) & with_sign;
     632    uint8_t b_sign = f7_class_sign (b_class) & with_sign;
     633    uint8_t ab_class = a_class | b_class;
     634    ab_class &= with_sign - 2;
     635  
     636    if (f7_class_nan (ab_class))
     637      return INT8_MIN;
     638  
     639    if (a_sign != b_sign)
     640      return b_sign - a_sign;
     641  
     642    if (f7_class_inf (ab_class))
     643      return cmp_u8 (a_class, b_class, a_sign);
     644  
     645    if (f7_class_zero (ab_class))
     646      return cmp_u8 (b_class, a_class, a_sign);
     647  
     648    if (aa->expo < bb->expo)
     649      return a_sign ? 1 : -1;
     650  
     651    if (aa->expo > bb->expo)
     652      return a_sign ? -1 : 1;
     653  
     654    return cmp_u8 (1 + f7_cmp_mant (aa, bb), 1, a_sign);
     655  }
     656  
     657  
     658  int8_t cmp_u8 (uint8_t a_class, uint8_t b_class, bool sign_a)
     659  {
     660    int8_t c;
     661    __asm ("sub  %[a], %[b]"    "\n\t"
     662  	 "breq 1f"	      "\n\t"
     663  	 "sbc  %[c], %[c]"    "\n\t"
     664  	 "sbci %[c], -1"      "\n\t"
     665  	 "sbrc %[s], 0"	      "\n\t"
     666  	 "neg  %[c]"	      "\n\t"
     667  	 "1:"
     668  	 : [c] "=d" (c)
     669  	 : [a] "0" (a_class), [b] "r" (b_class), [s] "r" (sign_a));
     670    return c;
     671  }
     672  #endif // F7MOD_cmp_unordered_
     673  
     674  
     675  #ifdef F7MOD_cmp_abs_
     676  F7_WEAK
     677  int8_t f7_cmp_abs (const f7_t *aa, const f7_t *bb)
     678  {
     679    return f7_cmp_unordered (aa, bb, false /* no signs */);
     680  }
     681  #endif // F7MOD_cmp_abs_
     682  
     683  
     684  #ifdef F7MOD_cmp_
     685  F7_WEAK
     686  int8_t f7_cmp (const f7_t *aa, const f7_t *bb)
     687  {
     688    return f7_cmp_unordered (aa, bb, true /* with signs */);
     689  }
     690  #endif // F7MOD_cmp_
     691  
     692  
     693  #ifdef F7MOD_abscmp_msb_ge_
     694  // Compare absolute value of Number aa against a f7_t represented
     695  // by msb and expo.
     696  F7_WEAK
     697  bool f7_abscmp_msb_ge (const f7_t *aa, uint8_t msb, int16_t expo)
     698  {
     699    uint8_t a_msb = aa->mant[F7_MANT_BYTES - 1];
     700  
     701    if (0 == (0x80 & a_msb))
     702      // 0 or subnormal.
     703      return false;
     704  
     705    return aa->expo == expo
     706      ? a_msb >= msb
     707      : aa->expo > expo;
     708  }
     709  #endif // F7MOD_abscmp_msb_ge_
     710  
     711  #ifdef F7MOD_lt_
     712  F7_WEAK
     713  bool f7_lt_impl (const f7_t *aa, const f7_t *bb)
     714  {
     715    return f7_lt (aa, bb);
     716  }
     717  #endif // F7MOD_lt_
     718  
     719  #ifdef F7MOD_le_
     720  F7_WEAK
     721  bool f7_le_impl (const f7_t *aa, const f7_t *bb)
     722  {
     723    return f7_le (aa, bb);
     724  }
     725  #endif // F7MOD_le_
     726  
     727  #ifdef F7MOD_gt_
     728  F7_WEAK
     729  bool f7_gt_impl (const f7_t *aa, const f7_t *bb)
     730  {
     731    return f7_gt (aa, bb);
     732  }
     733  #endif // F7MOD_gt_
     734  
     735  #ifdef F7MOD_ge_
     736  F7_WEAK
     737  bool f7_ge_impl (const f7_t *aa, const f7_t *bb)
     738  {
     739    return f7_ge (aa, bb);
     740  }
     741  #endif // F7MOD_ge_
     742  
     743  #ifdef F7MOD_ne_
     744  F7_WEAK
     745  bool f7_ne_impl (const f7_t *aa, const f7_t *bb)
     746  {
     747    return f7_ne (aa, bb);
     748  }
     749  #endif // F7MOD_ne_
     750  
     751  #ifdef F7MOD_eq_
     752  F7_WEAK
     753  bool f7_eq_impl (const f7_t *aa, const f7_t *bb)
     754  {
     755    return f7_eq (aa, bb);
     756  }
     757  #endif // F7MOD_eq_
     758  
     759  
     760  #ifdef F7MOD_unord_
     761  F7_WEAK
     762  bool f7_unord_impl (const f7_t *aa, const f7_t *bb)
     763  {
     764    return f7_unordered (aa, bb);
     765  }
     766  #endif // F7MOD_unord_
     767  
     768  
     769  #ifdef F7MOD_minmax_
     770  F7_WEAK
     771  f7_t* f7_minmax (f7_t *cc, const f7_t *aa, const f7_t *bb, bool do_min)
     772  {
     773    int8_t cmp = f7_cmp_unordered (aa, bb, true /* with signs */);
     774  
     775    if (cmp == INT8_MIN)
     776      return (f7_set_nan (cc), cc);
     777  
     778    if (do_min)
     779      cmp = -cmp;
     780  
     781    return f7_copy (cc, cmp >= 0 ? aa : bb);
     782  }
     783  #endif // F7MOD_minmax_
     784  
     785  
     786  #ifdef F7MOD_fmax_
     787  F7_WEAK
     788  f7_t* f7_fmax (f7_t *cc, const f7_t *aa, const f7_t *bb)
     789  {
     790    return f7_minmax (cc, aa, bb, false);
     791  }
     792  ALIAS (f7_fmax, f7_max)
     793  #endif // F7MOD_fmax_
     794  
     795  
     796  #ifdef F7MOD_fmin_
     797  F7_WEAK
     798  f7_t* f7_fmin (f7_t *cc, const f7_t *aa, const f7_t *bb)
     799  {
     800    return f7_minmax (cc, aa, bb, true);
     801  }
     802  ALIAS (f7_fmin, f7_min)
     803  #endif // F7MOD_fmin_
     804  
     805  
     806  #ifdef F7MOD_mulx_
     807  F7_WEAK
     808  uint8_t f7_mulx (f7_t *cc, const f7_t *aa, const f7_t *bb, bool no_rounding)
     809  {
     810    uint8_t a_class = f7_classify (aa);
     811    uint8_t b_class = f7_classify (bb);
     812    // From this point on, no more access aa->flags or bb->flags
     813    // to avoid early-clobber when writing cc->flags.
     814  
     815    uint8_t ab_class = a_class | b_class;
     816    // If either value is NaN, return NaN.
     817    if (f7_class_nan (ab_class)
     818        // Any combination of Inf and 0.
     819        || (f7_class_zero (ab_class) && f7_class_inf (ab_class)))
     820      {
     821        cc->flags = F7_FLAG_nan;
     822        return 0;
     823      }
     824    // If either value is 0.0, return 0.0.
     825    if (f7_class_zero (ab_class))
     826      {
     827        f7_clr (cc);
     828        return 0;
     829      }
     830    // We have 2 non-zero numbers-or-INF.
     831  
     832    uint8_t c_sign = (a_class ^ b_class) & F7_FLAG_sign;
     833    uint8_t c_inf  = ab_class & F7_FLAG_inf;
     834    cc->flags = c_sign | c_inf;
     835    if (c_inf)
     836      return 0;
     837  
     838    int16_t expo = add_ssat16 (aa->expo, bb->expo);
     839    // Store expo and handle expo = INT16_MIN  and INT16_MAX.
     840    if (f7_store_expo (cc, expo))
     841      return 0;
     842  
     843    return f7_mul_mant_asm (cc, aa, bb, no_rounding);
     844  }
     845  #endif // F7MOD_mulx_
     846  
     847  
     848  #ifdef F7MOD_square_
     849  F7_WEAK
     850  void f7_square (f7_t *cc, const f7_t *aa)
     851  {
     852    f7_mul (cc, aa, aa);
     853  }
     854  #endif // F7MOD_square_
     855  
     856  
     857  #ifdef F7MOD_mul_
     858  F7_WEAK
     859  void f7_mul (f7_t *cc, const f7_t *aa, const f7_t *bb)
     860  {
     861    f7_mulx (cc, aa, bb, false);
     862  }
     863  #endif // F7MOD_mul_
     864  
     865  
     866  #ifdef F7MOD_Iadd_
     867  F7_WEAK void f7_Iadd (f7_t *cc, const f7_t *aa) { f7_add (cc, cc, aa); }
     868  #endif
     869  
     870  #ifdef F7MOD_Isub_
     871  F7_WEAK void f7_Isub (f7_t *cc, const f7_t *aa) { f7_sub (cc, cc, aa); }
     872  #endif
     873  
     874  #ifdef F7MOD_Imul_
     875  F7_WEAK void f7_Imul (f7_t *cc, const f7_t *aa) { f7_mul (cc, cc, aa); }
     876  #endif
     877  
     878  #ifdef F7MOD_Idiv_
     879  F7_WEAK void f7_Idiv (f7_t *cc, const f7_t *aa) { f7_div (cc, cc, aa); }
     880  #endif
     881  
     882  #ifdef F7MOD_IRsub_
     883  F7_WEAK void f7_IRsub (f7_t *cc, const f7_t *aa) { f7_sub (cc, aa, cc); }
     884  #endif
     885  
     886  #ifdef F7MOD_Ineg_
     887  F7_WEAK void f7_Ineg (f7_t *cc) { f7_neg (cc, cc); }
     888  #endif
     889  
     890  #ifdef F7MOD_Isqrt_
     891  F7_WEAK void f7_Isqrt (f7_t *cc) { f7_sqrt (cc, cc); }
     892  #endif
     893  
     894  #ifdef F7MOD_Isquare_
     895  F7_WEAK void f7_Isquare (f7_t *cc) { f7_square (cc, cc); }
     896  #endif
     897  
     898  #ifdef F7MOD_Ildexp_
     899  F7_WEAK f7_t* f7_Ildexp (f7_t *cc, int ex) { return f7_ldexp (cc, cc, ex); }
     900  #endif
     901  
     902  
     903  #ifdef F7MOD_add_
     904  F7_WEAK
     905  void f7_add (f7_t *cc, const f7_t *aa, const f7_t *bb)
     906  {
     907    f7_addsub (cc, aa, bb, false);
     908  }
     909  #endif // F7MOD_add_
     910  
     911  
     912  #ifdef F7MOD_sub_
     913  F7_WEAK
     914  void f7_sub (f7_t *cc, const f7_t *aa, const f7_t *bb)
     915  {
     916    f7_addsub (cc, aa, bb, true);
     917  }
     918  #endif // F7MOD_sub_
     919  
     920  
     921  #ifdef F7MOD_addsub_
     922  static void return_with_sign (f7_t *cc, const f7_t *aa, int8_t c_sign)
     923  {
     924    __asm (";;; return with sign");
     925    f7_copy (cc, aa);
     926    if (c_sign != -1)
     927      f7_set_sign (cc, c_sign);
     928  }
     929  
     930  F7_WEAK
     931  void f7_addsub (f7_t *cc, const f7_t *aa, const f7_t *bb, bool neg_b)
     932  {
     933    uint8_t a_class = f7_classify (aa);
     934    uint8_t b_class = f7_classify (bb);
     935    // From this point on, no more access aa->flags or bb->flags
     936    // to avoid early-clobber when writing cc->flags.
     937  
     938    // Hande NaNs.
     939    if (f7_class_nan (a_class | b_class))
     940      return f7_set_nan (cc);
     941  
     942    bool a_sign = f7_class_sign (a_class);
     943    bool b_sign = f7_class_sign (b_class) ^ neg_b;
     944  
     945    // Add the mantissae?
     946    bool do_add = a_sign == b_sign;
     947  
     948    // Handle +Infs and -Infs.
     949    bool a_inf = f7_class_inf (a_class);
     950    bool b_inf = f7_class_inf (b_class);
     951  
     952    if (a_inf && b_inf)
     953      {
     954        if (do_add)
     955  	return f7_set_inf (cc, a_sign);
     956        else
     957  	return f7_set_nan (cc);
     958      }
     959    else if (a_inf)
     960      return f7_set_inf (cc, a_sign);
     961    else if (b_inf)
     962      return f7_set_inf (cc, b_sign);
     963  
     964    int16_t shift16 = sub_ssat16 (aa->expo, bb->expo);
     965  
     966    // aa + 0 = aa.
     967    // Also check MSBit to get rid of Subnormals and 0.
     968    if (shift16 > F7_MANT_BITS || f7_is0 (bb))
     969      return return_with_sign (cc, aa, -1);
     970  
     971    // 0 + bb = bb.
     972    // 0 - bb = -bb.
     973    // Also check MSBit to get rid of Subnormals and 0.
     974    if (shift16 < -F7_MANT_BITS || f7_is0 (aa))
     975      return return_with_sign (cc, bb, b_sign);
     976  
     977    // Now aa and bb are non-zero, non-NaN, non-Inf.
     978    // shift > 0 ==> |a| > |b|
     979    // shift < 0 ==> |a| < |b|
     980    int8_t shift = (int8_t) shift16;
     981    bool c_sign = a_sign;
     982    if (shift < 0
     983        || (shift == 0 && f7_cmp_mant (aa, bb) < 0))
     984      {
     985        const f7_t *p = aa; aa = bb; bb = p;
     986        c_sign = b_sign;
     987        shift = -shift;
     988      }
     989    uint8_t shift2 = (uint8_t) (shift << 1);
     990  
     991    cc->expo = aa->expo;
     992    // From this point on, no more access aa->expo or bb->expo
     993    // to avoid early-clobber when writing cc->expo.
     994  
     995    cc->flags = c_sign;  _Static_assert (F7_FLAGNO_sign == 0, "");
     996  
     997    // This function uses neither .expo nor .flags from either aa or bb,
     998    // hence there is early-clobber for cc->expo and cc->flags.
     999    f7_addsub_mant_scaled_asm (cc, aa, bb, shift2 | do_add);
    1000  }
    1001  #endif // F7MOD_addsub_
    1002  
    1003  
    1004  #ifdef F7MOD_madd_msub_
    1005  F7_WEAK
    1006  void f7_madd_msub (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd,
    1007                     bool neg_d)
    1008  {
    1009    f7_t xx7, *xx = &xx7;
    1010    uint8_t x_lsb = f7_mulx (xx, aa, bb, true /* no rounding */);
    1011    uint8_t x_sign = f7_signbit (xx);
    1012    int16_t x_expo = xx->expo;
    1013    f7_addsub (xx, xx, dd, neg_d);
    1014    // Now add LSB.  If cancellation occured in the add / sub, then we have the
    1015    // chance of extra 8 bits of precision.  Turn LSByte into f7_t.
    1016    f7_clr (cc);
    1017    cc->expo = sub_ssat16 (x_expo, F7_MANT_BITS);
    1018    cc->mant[F7_MANT_BYTES - 1] = x_lsb;
    1019    cc = f7_normalize_asm (cc);
    1020    cc->flags = x_sign;
    1021    f7_Iadd (cc, xx);
    1022  }
    1023  #endif // F7MOD_madd_msub_
    1024  
    1025  #ifdef F7MOD_madd_
    1026  F7_WEAK
    1027  void f7_madd (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd)
    1028  {
    1029    f7_madd_msub (cc, aa, bb, dd, false);
    1030  }
    1031  #endif // F7MOD_madd_
    1032  
    1033  #ifdef F7MOD_msub_
    1034  F7_WEAK
    1035  void f7_msub (f7_t *cc, const f7_t *aa, const f7_t *bb, const f7_t *dd)
    1036  {
    1037    f7_madd_msub (cc, aa, bb, dd, true);
    1038  }
    1039  #endif // F7MOD_msub_
    1040  
    1041  
    1042  #ifdef F7MOD_ldexp_
    1043  F7_WEAK
    1044  f7_t* f7_ldexp (f7_t *cc, const f7_t *aa, int delta)
    1045  {
    1046    uint8_t a_class = f7_classify (aa);
    1047  
    1048    cc->flags = a_class;
    1049  
    1050    // Inf and NaN.
    1051    if (! f7_class_number (a_class))
    1052      return cc;
    1053  
    1054    if (f7_msbit (aa) == 0)
    1055      return (f7_clr (cc), cc);
    1056  
    1057    int16_t expo = add_ssat16 (delta, aa->expo);
    1058    // Store expo and handle expo = INT16_MIN  and INT16_MAX.
    1059    if (! f7_store_expo (cc, expo))
    1060      f7_copy_mant (cc, aa);
    1061  
    1062    return cc;
    1063  }
    1064  #endif // F7MOD_ldexp_
    1065  
    1066  
    1067  #if USE_LPM
    1068  #elif USE_LD
    1069  #else
    1070  #error need include "asm-defs.h"
    1071  #endif // USE_LPM
    1072  
    1073  /*
    1074    Handling constants:
    1075  
    1076    F7_PCONST (PVAR, X)
    1077  
    1078        Set  f7_t [const] *PVAR  to an LD address for one
    1079        of the  f7_const_X[_P]  constants.
    1080        PVAR might be set to point to a local auto that serves
    1081        as temporary storage for f7_const_X_P.  PVAR is only
    1082        valid in the current block.
    1083  
    1084    const f7_t* F7_PCONST_U16 (PVAR, <ident> X)       // USE_LD
    1085    f7_t*       F7_PCONST_U16 (PVAR, uint16_t X)      // USE_LPM
    1086  
    1087        Set  f7_t [const] *PVAR  to an LD address for one of the
    1088        f7_const_X[_P]  constants.  PVAR might be set to point to a
    1089        local auto that serves as temporary storage for X.  PVAR is
    1090        only valid in the current block.
    1091  
    1092    F7_PCONST_VAR (PVAR, VAR)
    1093  
    1094        VAR is a pointer variable holding the address of some f7_const_X[_P]
    1095        constant.  Set  [const] f7_t *PVAR  to a respective LD address.
    1096        PVAR might be set to point to a local auto that serves
    1097        as temporary storage for f7_const_X_P.  PVAR is only
    1098        valid in the current block.
    1099  
    1100    F7_CONST_ADDR (<ident> CST, f7_t* PTMP)
    1101  
    1102        Return an LD address to for some f7_const_X[_P] constant.
    1103        *PTMP might be needed to hold a copy of f7_const_X_P in RAM.
    1104  
    1105    f7_t*       F7_U16_ADDR (uint16_t     X, f7_t* PTMP)   // USE_LPM
    1106    const f7_t* F7_U16_ADDR (<cst-ident>  X, <unused>)     // USE_LD
    1107  
    1108        Return an LD address to compile-time constant  uint16_t X  which is
    1109        also known as  f7_const_X[_P].  *PTMP might be set to  (f7_t) X.
    1110  
    1111    f7_t* f7_const (f7_t *PVAR, <cst-ident> X)
    1112  
    1113        Copy  f7_const_X[_P]  to *PVAR.
    1114  
    1115    f7_t* f7_copy_flash (f7_t *DST, const f7_t *SRC)
    1116  
    1117        Copy to *DST with LD (from .rodata in flash) if the address
    1118        space is linear, or with  LPM (from .progmem.data) if the
    1119        address space is not linear.
    1120  
    1121    f7_t* f7_copy (f7_t *DST, const f7_t* SRC)
    1122  
    1123        Copy to RAM using LD.
    1124  
    1125    f7_t* f7_copy_P (f7_t *DST, const f7_t *SRC)
    1126  
    1127        Copy to RAM using LPM.
    1128  */
    1129  
    1130  #if USE_LPM
    1131    #define F7_RAW_CONST_ADDR(CST) \
    1132        & F7_(const_##CST##_P)
    1133  
    1134    #define F7_PCONST(PVAR, CST)				    \
    1135        f7_t _var_for_##CST;				    \
    1136        f7_copy_P (& _var_for_##CST, & F7_(const_##CST##_P)); \
    1137        PVAR = & _var_for_##CST
    1138  
    1139    #define F7_PCONST_U16(PVAR, CST)		\
    1140        f7_t _var_for_##CST;			\
    1141        PVAR = f7_set_u16 (& _var_for_##CST, CST)
    1142  
    1143    #define F7_PCONST_VAR(PVAR, VAR)		\
    1144        f7_t _var_for_##VAR;			\
    1145        f7_copy_P (& _var_for_##VAR, VAR);	\
    1146        PVAR = & _var_for_##VAR
    1147  
    1148    #define MAYBE_const // empty
    1149  
    1150    #define F7_CONST_ADDR(CST, PTMP) \
    1151        f7_copy_P ((PTMP), & F7_(const_##CST##_P))
    1152  
    1153    #define F7_U16_ADDR(CST, PTMP)   \
    1154        f7_set_u16 ((PTMP), CST)
    1155  
    1156  #elif USE_LD
    1157    #define F7_RAW_CONST_ADDR(CST)   \
    1158        & F7_(const_##CST)
    1159  
    1160    #define F7_PCONST(PVAR, CST)	   \
    1161        PVAR = & F7_(const_##CST)
    1162  
    1163    #define F7_PCONST_U16(PVAR, CST) \
    1164        PVAR = & F7_(const_##CST)
    1165  
    1166    #define F7_PCONST_VAR(PVAR, VAR) \
    1167        PVAR = (VAR)
    1168  
    1169    #define F7_CONST_ADDR(CST, PTMP) \
    1170        (& F7_(const_##CST))
    1171  
    1172    #define F7_U16_ADDR(CST, PTMP)   \
    1173        (& F7_(const_##CST))
    1174  
    1175    #define MAYBE_const const
    1176  #endif
    1177  
    1178  
    1179  
    1180  #define DD(str, X)		\
    1181    do {				\
    1182      LOG_PSTR (PSTR (str));	\
    1183      f7_dump (X);		\
    1184    } while (0)
    1185  
    1186  #undef DD
    1187  #define DD(...) (void) 0
    1188  
    1189  
    1190  #ifdef F7MOD_sqrt_
    1191  static void sqrt_worker (f7_t *cc, const f7_t *rr)
    1192  {
    1193    f7_t tmp7, *tmp = &tmp7;
    1194    f7_t aa7, *aa = &aa7;
    1195  
    1196    // aa in  [1/2, 2)  =>  aa->expo in { -1, 0 }.
    1197    int16_t a_expo = -(rr->expo & 1);
    1198    int16_t c_expo = (rr->expo - a_expo) >> 1;  // FIXME: r_expo = INT_MAX???
    1199  
    1200    __asm ("" : "+r" (aa));
    1201  
    1202    f7_copy (aa, rr);
    1203    aa->expo = a_expo;
    1204  
    1205    // No use of rr or *cc past this point:  We may use cc as temporary.
    1206    // Approximate square-root of  A  by  X <-- (X + A / X) / 2.
    1207  
    1208    f7_sqrt_approx_asm (cc, aa);
    1209  
    1210    // Iterate  X <-- (X + A / X) / 2.
    1211    // 3 Iterations with 16, 32, 58 bits of precision for the quotient.
    1212  
    1213    for (uint8_t prec = 16; (prec & 0x80) == 0; prec <<= 1)
    1214      {
    1215        f7_divx (tmp, aa, cc, (prec & 64) ? 2 + F7_MANT_BITS : prec);
    1216        f7_Iadd (cc, tmp);
    1217        // This will never underflow because |c_expo| is small.
    1218        cc->expo--;
    1219      }
    1220  
    1221    // Similar: |c_expo| is small, hence no ldexp needed.
    1222    cc->expo += c_expo;
    1223  }
    1224  
    1225  F7_WEAK
    1226  void f7_sqrt (f7_t *cc, const f7_t *aa)
    1227  {
    1228    uint8_t a_class = f7_classify (aa);
    1229  
    1230    if (f7_class_nan (a_class) || f7_class_sign (a_class))
    1231      return f7_set_nan (cc);
    1232  
    1233    if (f7_class_inf (a_class))
    1234      return f7_set_inf (cc, 0);
    1235  
    1236    if (f7_class_zero (a_class))
    1237      return f7_clr (cc);
    1238  
    1239    sqrt_worker (cc, aa);
    1240  }
    1241  #endif // F7MOD_sqrt_
    1242  
    1243  
    1244  #ifdef F7MOD_hypot_
    1245  F7_WEAK
    1246  void f7_hypot (f7_t *cc, const f7_t *aa, const f7_t *bb)
    1247  {
    1248    f7_t xx7, *xx = &xx7;
    1249  
    1250    f7_square (xx, aa);
    1251    f7_square (cc, bb);
    1252    f7_Iadd (cc, xx);
    1253    f7_Isqrt (cc);
    1254  }
    1255  #endif // F7MOD_hypot_
    1256  
    1257  
    1258  #ifdef F7MOD_const_m1_
    1259  #include "libf7-constdef.h"
    1260  #endif // -1
    1261  
    1262  #ifdef F7MOD_const_1_2_
    1263  #include "libf7-constdef.h"
    1264  #endif // 1/2
    1265  
    1266  #ifdef F7MOD_const_1_3_
    1267  #include "libf7-constdef.h"
    1268  #endif // 1/3
    1269  
    1270  #ifdef F7MOD_const_ln2_
    1271  #include "libf7-constdef.h"
    1272  #endif // ln2
    1273  
    1274  #ifdef F7MOD_const_1_ln2_
    1275  #include "libf7-constdef.h"
    1276  #endif // 1_ln2
    1277  
    1278  #ifdef F7MOD_const_ln10_
    1279  #include "libf7-constdef.h"
    1280  #endif // ln10
    1281  
    1282  #ifdef F7MOD_const_1_ln10_
    1283  #include "libf7-constdef.h"
    1284  #endif // 1_ln10
    1285  
    1286  #ifdef F7MOD_const_1_
    1287  #include "libf7-constdef.h"
    1288  #endif // 1
    1289  
    1290  #ifdef F7MOD_const_sqrt2_
    1291  #include "libf7-constdef.h"
    1292  #endif // sqrt2
    1293  
    1294  #ifdef F7MOD_const_2_
    1295  #include "libf7-constdef.h"
    1296  #endif // 2
    1297  
    1298  #ifdef F7MOD_const_pi_
    1299  #include "libf7-constdef.h"
    1300  #endif // pi
    1301  
    1302  
    1303  #ifdef F7MOD_divx_
    1304  
    1305  // C /= A
    1306  extern void f7_div_asm (f7_t*, const f7_t*, uint8_t);
    1307  
    1308  F7_WEAK
    1309  void f7_divx (f7_t *cc, const f7_t *aa, const f7_t *bb, uint8_t quot_bits)
    1310  {
    1311    uint8_t a_class = f7_classify (aa);
    1312    uint8_t b_class = f7_classify (bb);
    1313    // From this point on, no more access aa->flags or bb->flags
    1314    // to avoid early-clobber when writing cc->flags.
    1315  
    1316    // If either value is NaN, return NaN.
    1317    if (f7_class_nan (a_class | b_class)
    1318        // If both values are Inf or both are 0, return NaN.
    1319        || f7_class_zero (a_class & b_class)
    1320        || f7_class_inf (a_class & b_class)
    1321        // Inf / 0 = NaN.
    1322        || (f7_class_inf (a_class) && f7_class_zero (b_class)))
    1323      {
    1324        return f7_set_nan (cc);
    1325      }
    1326  
    1327    // 0 / B   = 0  for non-zero, non-NaN B.
    1328    // A / Inf = 0  for non-zero numbers A.
    1329    if (f7_class_zero (a_class) || f7_class_inf (b_class))
    1330      return f7_clr (cc);
    1331  
    1332    uint8_t c_sign = (a_class ^ b_class) & F7_FLAG_sign;
    1333  
    1334    if (f7_class_inf (a_class) || f7_class_zero (b_class))
    1335      return f7_set_inf (cc, c_sign);
    1336  
    1337    cc->flags = c_sign;     _Static_assert (F7_FLAGNO_sign == 0, "");
    1338    int16_t expo = sub_ssat16 (aa->expo, bb->expo);
    1339  
    1340    // Store expo and handle expo = INT16_MIN  and INT16_MAX.
    1341    if (f7_store_expo (cc, expo))
    1342      return;
    1343  
    1344    f7_t ss7, *ss = &ss7;
    1345    ss->flags = cc->flags;
    1346    ss->expo  = cc->expo;
    1347  
    1348    f7_copy_mant (ss, aa);
    1349    f7_div_asm (ss, bb, quot_bits);
    1350    f7_copy (cc, ss);
    1351  }
    1352  #endif // F7MOD_divx_
    1353  
    1354  
    1355  #ifdef F7MOD_div_
    1356  F7_WEAK
    1357  void f7_div (f7_t *cc, const f7_t *aa, const f7_t *bb)
    1358  {
    1359    /* When f7_divx calls f7_div_asm, dividend and divisor are valid
    1360       mantissae, i.e. their MSBit is set.  Therefore, the quotient will
    1361       be in  [0x0.ff..., 0x0.40...]  and to adjust it, at most 1 left-shift
    1362       is needed.  Compute F7_MANT_BITS + 2 bits of the quotient:
    1363       One bit is used for rounding, and one bit might be consumed by the
    1364       mentioned left-shift.  */
    1365  
    1366    f7_divx (cc, aa, bb, 2 + F7_MANT_BITS);
    1367  }
    1368  #endif // F7MOD_div_
    1369  
    1370  
    1371  #ifdef F7MOD_div1_
    1372  F7_WEAK
    1373  void f7_div1 (f7_t *cc, const f7_t *aa)
    1374  {
    1375    F7_PCONST_U16 (const f7_t *one, 1);
    1376    f7_div (cc, one, aa);
    1377  }
    1378  #endif // F7MOD_div_
    1379  
    1380  
    1381  #ifdef F7MOD_fmod_
    1382  F7_WEAK
    1383  void f7_fmod (f7_t *cc, const f7_t *aa, const f7_t *bb)
    1384  {
    1385    uint8_t a_class = f7_classify (aa);
    1386    uint8_t b_class = f7_classify (bb);
    1387  
    1388    if (! f7_class_number (a_class)
    1389        || f7_class_nan (b_class)
    1390        || f7_class_zero (b_class))
    1391      {
    1392        return f7_set_nan (cc);
    1393      }
    1394  
    1395    // A == 0 and B != 0  =>  0.
    1396    if (f7_class_zero (a_class))
    1397      return f7_clr (cc);
    1398  
    1399    f7_t zz7, *zz = & zz7;
    1400  
    1401    f7_div (zz, aa, bb);
    1402  
    1403    // Z in Z,  |Z| <= |A/B|.
    1404    f7_trunc (zz, zz);
    1405  
    1406    // C = A - Z * B.
    1407    f7_msub (cc, zz, bb, aa);
    1408    cc->flags ^= F7_FLAG_sign;
    1409  }
    1410  #endif // F7MOD_fmod_
    1411  
    1412  
    1413  #ifdef F7MOD_truncx_
    1414  F7_WEAK
    1415  f7_t* f7_truncx (f7_t *cc, const f7_t *aa, bool do_floor)
    1416  {
    1417    uint8_t a_class = f7_classify (aa);
    1418  
    1419    if (! f7_class_nonzero (a_class))
    1420      return f7_copy (cc, aa);
    1421  
    1422    bool sign = f7_class_sign (a_class);
    1423  
    1424    int16_t a_expo = aa->expo;
    1425  
    1426    if (a_expo < 0)
    1427      {
    1428        // |A| < 1.
    1429        if (sign & do_floor)
    1430  	return f7_set_s16 (cc, -1);
    1431  
    1432        f7_clr (cc);
    1433        return cc;
    1434      }
    1435    else if (a_expo >= F7_MANT_BITS - 1)
    1436      // A == floor (A).
    1437      return f7_copy (cc, aa);
    1438  
    1439    f7_t tmp7, *tmp = &tmp7;
    1440  
    1441    // Needed if aa === cc.
    1442    f7_copy (tmp, aa);
    1443  
    1444    cc->flags = sign;
    1445    cc->expo = a_expo;
    1446    f7_clr_mant_lsbs (cc, aa, F7_MANT_BITS - 1 - a_expo);
    1447  
    1448    if (do_floor && cc->sign && f7_cmp_mant (cc, tmp) != 0)
    1449      {
    1450        F7_PCONST_U16 (const f7_t *one, 1);
    1451        f7_Isub (cc, one);
    1452      }
    1453  
    1454    return cc;
    1455  }
    1456  #endif // F7MOD_truncx_
    1457  
    1458  
    1459  #ifdef F7MOD_floor_
    1460  F7_WEAK
    1461  f7_t* f7_floor (f7_t *cc, const f7_t *aa)
    1462  {
    1463    return f7_truncx (cc, aa, true);
    1464  }
    1465  #endif // F7MOD_floor_
    1466  
    1467  
    1468  #ifdef F7MOD_trunc_
    1469  F7_WEAK
    1470  f7_t* f7_trunc (f7_t *cc, const f7_t *aa)
    1471  {
    1472    return f7_truncx (cc, aa, false);
    1473  }
    1474  #endif // F7MOD_trunc_
    1475  
    1476  
    1477  #ifdef F7MOD_ceil_
    1478  F7_WEAK
    1479  void f7_ceil (f7_t *cc, const f7_t *aa)
    1480  {
    1481    cc = f7_copy (cc, aa);
    1482    cc->flags ^= F7_FLAG_sign;
    1483    cc = f7_floor (cc, cc);
    1484    f7_Ineg (cc);
    1485  }
    1486  #endif // F7MOD_ceil_
    1487  
    1488  
    1489  #ifdef F7MOD_round_
    1490  F7_WEAK
    1491  void f7_round (f7_t *cc, const f7_t *aa)
    1492  {
    1493    f7_t tmp;
    1494    (void) tmp;
    1495    const f7_t *half = F7_CONST_ADDR (1_2, &tmp);
    1496  
    1497    f7_addsub   (cc, aa, half, f7_signbit (aa));
    1498    f7_trunc (cc, cc);
    1499  }
    1500  #endif // F7MOD_round_
    1501  
    1502  
    1503  #ifdef F7MOD_horner_
    1504  
    1505  // Assertion when using this function is that either cc != xx,
    1506  // or if cc == xx, then tmp1 must be non-NULL and tmp1 != xx.
    1507  // In General, the calling functions have a spare f7_t object available
    1508  // and can pass it down to save some stack.
    1509  // Moreover, the power series must have degree 1 at least.
    1510  
    1511  F7_WEAK
    1512  void f7_horner (f7_t *cc, const f7_t *xx, uint8_t n_coeff, const f7_t *coeff,
    1513                  f7_t *tmp1)
    1514  {
    1515    f7_assert (n_coeff > 1);
    1516  
    1517    if (cc != xx)
    1518      tmp1 = cc;
    1519    else
    1520      f7_assert (tmp1 != NULL && tmp1 != xx);
    1521  
    1522    f7_t *yy = tmp1;
    1523    f7_t tmp27, *tmp2 = &tmp27;
    1524  
    1525    n_coeff--;
    1526    const f7_t *pcoeff = coeff + n_coeff;
    1527  
    1528    f7_copy_flash (yy, pcoeff);
    1529  
    1530    while (1)
    1531      {
    1532        --pcoeff;
    1533  #if 1
    1534        f7_Imul (yy, xx);
    1535        const f7_t *cst = USE_LD ? pcoeff : f7_copy_P (tmp2, pcoeff);
    1536        if (coeff == pcoeff)
    1537  	return f7_add (cc, yy, cst);
    1538        f7_Iadd (yy, cst);
    1539  #else
    1540        const f7_t *cst = USE_LD ? pcoeff : f7_copy_P (tmp2, pcoeff);
    1541        f7_madd (yy, yy, xx, cst);
    1542        if (coeff == pcoeff)
    1543          {
    1544  	  f7_copy (cc, yy);
    1545  	  return;
    1546          }
    1547  #endif
    1548      }
    1549  
    1550    __builtin_unreachable();
    1551  }
    1552  #endif // F7MOD_horner_
    1553  
    1554  
    1555  #ifdef F7MOD_log_
    1556  F7_WEAK
    1557  void f7_log (f7_t *cc, const f7_t *aa)
    1558  {
    1559      f7_logx (cc, aa, NULL);
    1560  }
    1561  #endif // F7MOD_log_
    1562  
    1563  
    1564  #ifdef F7MOD_log2_
    1565  F7_WEAK
    1566  void f7_log2 (f7_t *cc, const f7_t *aa)
    1567  {
    1568    f7_logx (cc, aa, F7_RAW_CONST_ADDR (1_ln2));
    1569  }
    1570  #endif // F7MOD_log2_
    1571  
    1572  
    1573  #ifdef F7MOD_log10_
    1574  F7_WEAK
    1575  void f7_log10 (f7_t *cc, const f7_t *aa)
    1576  {
    1577    f7_logx (cc, aa, F7_RAW_CONST_ADDR (1_ln10));
    1578  }
    1579  #endif // F7MOD_log10_
    1580  
    1581  
    1582  #ifdef F7MOD_logx_
    1583  
    1584  #define ARRAY_NAME coeff_artanh
    1585  #include "libf7-array.def"
    1586  #undef ARRAY_NAME
    1587  
    1588  // Compute P * ln(A)  if P != NULL and ln(A), otherwise.
    1589  // P is a LD-address if USE_LD and a LPM-address if USE_LPM.
    1590  // Assumption is that P > 0.
    1591  
    1592  F7_WEAK
    1593  void f7_logx (f7_t *cc, const f7_t *aa, const f7_t *p)
    1594  {
    1595    uint8_t a_class = f7_classify (aa);
    1596  
    1597    if (f7_class_nan (a_class) || f7_class_sign (a_class))
    1598      return f7_set_nan (cc);
    1599  
    1600    if (f7_class_inf (a_class))
    1601      return f7_set_inf (cc, 0);
    1602  
    1603    if (f7_class_zero (a_class))
    1604      return f7_set_inf (cc, 1);
    1605  
    1606    f7_t *yy = cc;
    1607    f7_t xx7, *xx = &xx7;
    1608    f7_t tmp7, *tmp = &tmp7;
    1609  
    1610    // Y in [1, 2]  =  A * 2 ^ (-a_expo).
    1611    int16_t a_expo = aa->expo;
    1612    f7_copy (yy, aa);
    1613    yy->expo = 0;
    1614  
    1615    // Y in [1 / sqrt2, sqrt2].
    1616  
    1617    if (f7_abscmp_msb_ge (yy, F7_(const_sqrt2_msb), F7_(const_sqrt2_expo)))
    1618      {
    1619        yy->expo = -1;
    1620        a_expo = add_ssat16 (a_expo, 1);
    1621      }
    1622  
    1623    const f7_t *one = F7_U16_ADDR (1, & tmp7);
    1624  
    1625    // X := (Y - 1) / (Y + 1),  |X| <= (sqrt2 - 1) / (sqrt2 + 1)  ~  0.172.
    1626    f7_sub (xx, yy, one);
    1627    f7_Iadd (yy, one);
    1628    f7_Idiv (xx, yy);
    1629  
    1630    // Y := X^2,  |Y| < 0.03.
    1631    f7_square (yy, xx);
    1632  
    1633    // Y := artanh (X^2) / X
    1634    f7_horner (yy, yy, n_coeff_artanh, coeff_artanh, tmp);
    1635  
    1636    // C = X * Y = ln A - a_expo * ln2.
    1637    f7_mul (cc, xx, yy);
    1638  
    1639    // X := a_expo * ln2.
    1640    f7_set_s16 (xx, a_expo);
    1641    f7_Imul (xx, F7_CONST_ADDR (ln2, & tmp7));
    1642  
    1643    // C = ln A.
    1644    f7_Iadd (cc, xx);
    1645  
    1646    if (p && USE_LPM)
    1647      f7_Imul (cc, f7_copy_P (tmp, p));
    1648    if (p && USE_LD)
    1649      f7_Imul (cc, p);
    1650  }
    1651  #endif // F7MOD_logx_
    1652  
    1653  
    1654  #ifdef F7MOD_exp_
    1655  
    1656  #define ARRAY_NAME coeff_exp
    1657  #include "libf7-array.def"
    1658  #undef ARRAY_NAME
    1659  
    1660  #define STATIC static
    1661  #include "libf7-constdef.h" // ln2_low
    1662  #undef STATIC
    1663  
    1664  F7_WEAK
    1665  void f7_exp (f7_t *cc, const f7_t *aa)
    1666  {
    1667    uint8_t a_class = f7_classify (aa);
    1668  
    1669    if (f7_class_nan (a_class))
    1670      return f7_set_nan (cc);
    1671  
    1672    /* The maximal exponent of 2 for a double is 1023, hence we may limit
    1673       to  |A| < 1023 * ln2 ~ 709.  We limit to  1024 ~ 1.99 * 2^9  */
    1674  
    1675    if (f7_class_inf (a_class)
    1676        || (f7_class_nonzero (a_class) && aa->expo >= 9))
    1677      {
    1678        if (f7_class_sign (a_class))
    1679  	return f7_clr (cc);
    1680        else
    1681  	return f7_set_inf (cc, 0);
    1682      }
    1683  
    1684    f7_t const *cst;
    1685    f7_t qq7, *qq = &qq7;
    1686  
    1687    F7_PCONST (cst, ln2);
    1688  
    1689    // We limited |A| to 1024 and are now dividing by ln2, hence Q will
    1690    // be at most 1024 / ln2 ~ 1477 and fit into 11 bits.  We will
    1691    // round Q anyway, hence only request 11 bits from the division and
    1692    // one additional bit that might be needed to normalize the quotient.
    1693    f7_divx (qq, aa, cst, 1 + 11);
    1694  
    1695    // Use the smallest (by absolute value) remainder system.
    1696    f7_round (qq, qq);
    1697    int16_t q = f7_get_s16 (qq);
    1698  
    1699    // Reducing A mod ln2 gives |C| <= ln2 / 2,  C = -A mod ln2.
    1700    f7_msub (cc, qq, cst, aa);
    1701  
    1702    // Corrigendum:  We added Q * ln2; now add Q times the low part of ln2
    1703    // for better precision.  Due to |C| < |A| this is not a no-op in general.
    1704    const f7_t *yy = F7_CONST_ADDR (ln2_low, &_var_for_ln2);
    1705    f7_madd (cc, qq, yy, cc);
    1706  
    1707    // Because we computed C = -A mod ...
    1708    cc->flags ^= F7_FLAG_sign;
    1709  
    1710    // Reduce further to |C| < ln2 / 8 which is the range of our MiniMax poly.
    1711    const uint8_t MAX_LN2_RED = 3;
    1712    int8_t scal2 = 0;
    1713  
    1714    while (f7_abscmp_msb_ge (cc, F7_(const_ln2_msb),
    1715  			   F7_(const_ln2_expo) - MAX_LN2_RED))
    1716      {
    1717        scal2++;
    1718        cc->expo--;
    1719      }
    1720  
    1721    f7_horner (cc, cc, n_coeff_exp, coeff_exp, qq);
    1722  
    1723    while (--scal2 >= 0)
    1724      f7_Isquare (cc);
    1725  
    1726    f7_Ildexp (cc, q);
    1727  }
    1728  #endif // F7MOD_exp_
    1729  
    1730  
    1731  #ifdef F7MOD_pow10_
    1732  F7_WEAK
    1733  void f7_pow10 (f7_t *cc, const f7_t *aa)
    1734  {
    1735    const f7_t *p_ln10;
    1736    F7_PCONST (p_ln10, ln10);
    1737    f7_mul (cc, aa, p_ln10);
    1738    f7_exp (cc, cc);
    1739  }
    1740  ALIAS (f7_pow10, f7_exp10)
    1741  #endif // F7MOD_pow10_
    1742  
    1743  
    1744  #ifdef F7MOD_cbrt_
    1745  F7_WEAK
    1746  void f7_cbrt (f7_t *cc, const f7_t *aa)
    1747  {
    1748    f7_copy (cc, aa);
    1749    const f7_t *p_1_3;
    1750    uint8_t c_flags = cc->flags;
    1751    cc->flags &= ~F7_FLAG_sign;
    1752    f7_log (cc, cc);
    1753    F7_PCONST (p_1_3, 1_3);
    1754    f7_Imul (cc, p_1_3);
    1755    f7_exp (cc, cc);
    1756  
    1757    if (c_flags & F7_FLAG_sign)
    1758      cc->flags |= F7_FLAG_sign;
    1759  }
    1760  #endif // F7MOD_cbrt_
    1761  
    1762  
    1763  #ifdef F7MOD_pow_
    1764  F7_WEAK
    1765  void f7_pow (f7_t *cc, const f7_t *aa, const f7_t *bb)
    1766  {
    1767  #if 0
    1768    f7_t slots[cc == bb];
    1769    f7_t *yy = cc == bb ? slots : cc;
    1770  #else
    1771    f7_t yy7, *yy = &yy7;
    1772  #endif
    1773    f7_log (yy, aa);
    1774    f7_Imul (yy, bb);
    1775    f7_exp (cc, yy);
    1776  }
    1777  #endif // F7MOD_pow_
    1778  
    1779  
    1780  #ifdef F7MOD_powi_
    1781  F7_WEAK
    1782  void f7_powi (f7_t *cc, const f7_t *aa, int ii)
    1783  {
    1784    uint16_t u16 = ii;
    1785    f7_t xx27, *xx2 = &xx27;
    1786  
    1787    if (ii < 0)
    1788      u16 = -u16;
    1789  
    1790    f7_copy (xx2, aa);
    1791  
    1792    f7_set_u16 (cc, 1);
    1793  
    1794    while (1)
    1795      {
    1796        if (u16 & 1)
    1797  	f7_Imul (cc, xx2);
    1798  
    1799        if (! f7_is_nonzero (cc))
    1800  	break;
    1801  
    1802        u16 >>= 1;
    1803        if (u16 == 0)
    1804  	break;
    1805        f7_Isquare (xx2);
    1806      }
    1807  
    1808    if (ii < 0)
    1809      f7_div1 (xx2, aa);
    1810  }
    1811  #endif // F7MOD_powi_
    1812  
    1813  
    1814  #ifdef F7MOD_sincos_
    1815  
    1816  #define ARRAY_NAME coeff_sin
    1817    #define FOR_SIN
    1818    #include "libf7-array.def"
    1819    #undef  FOR_SIN
    1820  #undef ARRAY_NAME
    1821  
    1822  #define ARRAY_NAME coeff_cos
    1823    #define FOR_COS
    1824    #include "libf7-array.def"
    1825    #undef  FOR_COS
    1826  #undef ARRAY_NAME
    1827  
    1828  #define STATIC static
    1829  #include "libf7-constdef.h" // pi_low
    1830  #undef STATIC
    1831  
    1832  typedef union
    1833  {
    1834    struct
    1835    {
    1836      bool    neg_sin : 1; // Must be bit F7_FLAGNO_sign.
    1837      bool    neg_cos : 1;
    1838      bool    do_sin: 1;
    1839      bool    do_cos: 1;
    1840      bool    swap_sincos : 1;
    1841      uint8_t res : 3;
    1842    };
    1843    uint8_t bits;
    1844  } sincos_t;
    1845  
    1846  
    1847  F7_WEAK
    1848  void f7_sincos (f7_t *ss, f7_t *cc, const f7_t *aa)
    1849  {
    1850    uint8_t a_class = f7_classify (aa);
    1851  
    1852    sincos_t sc = { .bits = a_class & F7_FLAG_sign };
    1853    if (ss != NULL) sc.do_sin = 1;
    1854    if (cc != NULL) sc.do_cos = 1;
    1855  
    1856    if (f7_class_nan (a_class) || f7_class_inf (a_class))
    1857      {
    1858        if (sc.do_sin)  f7_set_nan (ss);
    1859        if (sc.do_cos)  f7_set_nan (cc);
    1860        return;
    1861      }
    1862  
    1863    f7_t pi7, *pi = &pi7;
    1864    f7_t xx7, *xx = &xx7;
    1865    f7_t yy7, *yy = &yy7;
    1866    f7_t *hh = sc.do_sin ? ss : cc;
    1867  
    1868    // X = |A|
    1869    f7_copy (xx, aa);
    1870    xx->flags = 0;
    1871  
    1872    // Y is how often we subtract PI from X.
    1873    f7_clr (yy);
    1874    f7_const (pi, pi);
    1875  
    1876    if (f7_abscmp_msb_ge (xx, F7_(const_pi_msb), F7_(const_pi_expo) + 1))
    1877      {
    1878        pi->expo = 1 + F7_(const_pi_expo);  // 2*pi
    1879  
    1880        // Y = X / 2pi.
    1881        f7_div (yy, xx, pi);
    1882  
    1883        // The integral part of |A| / pi mod 2 is bit 55 - x_expo.
    1884        if (yy->expo >= F7_MANT_BITS && !f7_is_zero (yy))
    1885          {
    1886  	  // Too big for sensible calculation:  Should this be NaN instead?
    1887  	  if (sc.do_sin)  f7_clr (ss);
    1888  	  if (sc.do_cos)  f7_clr (cc);
    1889  	  return;
    1890          }
    1891  
    1892        // X -= 2pi * [ X / 2pi ]
    1893        f7_floor (yy, yy);
    1894  
    1895        f7_msub (xx, yy, pi, xx);
    1896        xx->flags ^= F7_FLAG_sign;
    1897  
    1898        // We divided by 2pi, but Y should count times we subtracted pi.
    1899        yy->expo++;
    1900      }
    1901  
    1902    pi->expo = F7_(const_pi_expo); // pi
    1903    f7_sub (hh, xx, pi);
    1904    if (!f7_signbit (hh))
    1905      {
    1906        // H = X - pi >= 0  =>  X >= pi
    1907        // sin(x) = -sin(x - pi)
    1908        // cos(x) = -cos(x - pi)
    1909        f7_copy (xx, hh);
    1910        // Y: We subtracted pi one more time.
    1911        f7_Iadd (yy, f7_set_u16 (hh, 1));
    1912        sc.neg_sin ^= 1;
    1913        sc.neg_cos ^= 1;
    1914      }
    1915  
    1916    pi->expo = F7_(const_pi_expo) - 1; // pi/2
    1917    if (f7_gt (xx, pi))
    1918      {
    1919        // x > pi/2
    1920        // sin(x) =  sin(pi - x)
    1921        // cos(x) = -cos(pi - x)
    1922        pi->expo = F7_(const_pi_expo); // pi
    1923        f7_IRsub (xx, pi);
    1924        // Y: We subtracted pi one more time (and then negated).
    1925        f7_Iadd (yy, f7_set_u16 (hh, 1));
    1926        yy->flags ^= F7_FLAG_sign;
    1927        sc.neg_cos ^= 1;
    1928      }
    1929  
    1930    pi->expo = F7_(const_pi_expo) - 2; // pi/4
    1931    if (f7_gt (xx, pi))
    1932      {
    1933        // x > pi/4
    1934        // sin(x) = cos(pi/2 - x)
    1935        // cos(x) = sin(pi/2 - x)
    1936        pi->expo = F7_(const_pi_expo) - 1; // pi/2
    1937        f7_IRsub (xx, pi);
    1938        // Y: We subtracted pi/2 one more time (and then negated).
    1939        f7_Iadd (yy, f7_set_1pow2 (hh, -1, 0));
    1940        yy->flags ^= F7_FLAG_sign;
    1941        sc.swap_sincos = 1;
    1942      }
    1943  
    1944    if (!f7_is0 (yy))
    1945      {
    1946        // Y counts how often we subtracted pi from X in order to
    1947        // get 0 <= X < pi/4 as small as possible (Y is 0 mod 0.5).
    1948        // Now also subtract the low part of pi:
    1949        // f7_const_pi_low = pi - f7_const_pi  in order to get more precise
    1950        // results in the cases where the final result is close to 0.
    1951        const f7_t *pi_low = F7_CONST_ADDR (pi_low, pi);
    1952        //f7_const (pi, pi_low);
    1953        f7_Imul (yy, pi_low);
    1954        f7_Isub (xx, yy);
    1955      }
    1956  
    1957    // X   in [0, pi/4].
    1958    // X^2 in [0, pi^2/16] ~ [0, 0.6169]
    1959  
    1960    f7_square (yy, xx);
    1961  
    1962    f7_t *x_sin = xx;
    1963    f7_t *x_cos = yy;
    1964  
    1965    if ((sc.do_sin && !sc.swap_sincos)
    1966        || (sc.do_cos && sc.swap_sincos))
    1967      {
    1968        f7_horner (hh, yy, n_coeff_sin, coeff_sin, NULL);
    1969        f7_mul (x_sin, hh, xx);
    1970      }
    1971  
    1972    if ((sc.do_cos && !sc.swap_sincos)
    1973        || (sc.do_sin && sc.swap_sincos))
    1974      {
    1975        f7_horner (x_cos, yy, n_coeff_cos, coeff_cos, hh);
    1976      }
    1977  
    1978    if (sc.swap_sincos)
    1979      {
    1980        x_sin = yy;
    1981        x_cos = xx;
    1982      }
    1983  
    1984    if (sc.do_sin)
    1985      {
    1986        x_sin->flags ^= sc.bits;
    1987        x_sin->flags &= F7_FLAG_sign;
    1988        f7_copy (ss, x_sin);
    1989      }
    1990  
    1991    if (sc.do_cos)
    1992      {
    1993        x_cos->flags = sc.neg_cos;
    1994        f7_copy (cc, x_cos);
    1995      }
    1996  }
    1997  #endif // F7MOD_sincos_
    1998  
    1999  #ifdef F7MOD_sin_
    2000  F7_WEAK
    2001  void f7_sin (f7_t *ss, const f7_t *aa)
    2002  {
    2003    f7_sincos (ss, NULL, aa);
    2004  }
    2005  #endif // F7MOD_sin_
    2006  
    2007  #ifdef F7MOD_cos_
    2008  F7_WEAK
    2009  void f7_cos (f7_t *cc, const f7_t *aa)
    2010  {
    2011    f7_sincos (NULL, cc, aa);
    2012  }
    2013  #endif // F7MOD_cos_
    2014  
    2015  
    2016  #ifdef F7MOD_tan_
    2017  F7_WEAK
    2018  void f7_tan (f7_t *tt, const f7_t *aa)
    2019  {
    2020    f7_t xcos;
    2021    f7_sincos (tt, & xcos, aa);
    2022    f7_Idiv (tt, & xcos);
    2023  }
    2024  #endif // F7MOD_tan_
    2025  
    2026  
    2027  #ifdef F7MOD_cotan_
    2028  F7_WEAK
    2029  void f7_cotan (f7_t *ct, const f7_t *aa)
    2030  {
    2031    f7_t xcos;
    2032    f7_sincos (ct, & xcos, aa);
    2033    f7_div (ct, & xcos, ct);
    2034  }
    2035  #endif // F7MOD_cotan_
    2036  
    2037  
    2038  #ifdef F7MOD_sinhcosh_
    2039  F7_WEAK
    2040  void f7_sinhcosh (f7_t *cc, const f7_t *aa, bool do_sinh)
    2041  {
    2042    f7_t xx7, *xx = &xx7;
    2043    // C = exp(A)
    2044    f7_exp (cc, aa);
    2045    // X = exp(-A)
    2046    f7_div (xx, f7_set_u16 (xx, 1), cc);
    2047    // sinh(A) = (exp(A) - exp(-A)) / 2
    2048    // cosh(A) = (exp(A) + exp(-A)) / 2
    2049    f7_addsub (cc, cc, xx, do_sinh);
    2050    cc->expo--;
    2051  }
    2052  #endif // F7MOD_sinhcosh_
    2053  
    2054  
    2055  #ifdef F7MOD_sinh_
    2056  F7_WEAK
    2057  void f7_sinh (f7_t *cc, const f7_t *aa)
    2058  {
    2059    f7_sinhcosh (cc, aa, true);
    2060  }
    2061  #endif // F7MOD_sinh_
    2062  
    2063  
    2064  #ifdef F7MOD_cosh_
    2065  F7_WEAK
    2066  void f7_cosh (f7_t *cc, const f7_t *aa)
    2067  {
    2068    f7_sinhcosh (cc, aa, false);
    2069  }
    2070  #endif // F7MOD_cosh_
    2071  
    2072  
    2073  #ifdef F7MOD_tanh_
    2074  F7_WEAK
    2075  void f7_tanh (f7_t *cc, const f7_t *aa)
    2076  {
    2077    // tanh(A) = (exp(2A) - 1) / (exp(2A) + 1)
    2078    f7_t xx7, *xx = &xx7;
    2079    F7_PCONST_U16 (const f7_t *one, 1);
    2080    // C = 2A
    2081    f7_copy (cc, aa);
    2082    cc->expo++;
    2083    // C = exp(2A)
    2084    f7_exp (cc, cc);
    2085    // X = exp(2A) + 1
    2086    f7_add (xx, cc, one);
    2087    // C = exp(2A) - 1
    2088    f7_Isub (cc, one);
    2089    // C = tanh(A)
    2090    f7_Idiv (cc, xx);
    2091  }
    2092  #endif // F7MOD_tanh_
    2093  
    2094  
    2095  #ifdef F7MOD_atan_
    2096  
    2097  #define MINIMAX_6_6_IN_0_1
    2098  
    2099  #define ARRAY_NAME coeff_atan_zahler
    2100  #define FOR_NUMERATOR
    2101  #include "libf7-array.def"
    2102  #undef FOR_NUMERATOR
    2103  #undef ARRAY_NAME
    2104  
    2105  #define ARRAY_NAME coeff_atan_nenner
    2106  #define FOR_DENOMINATOR
    2107  #include "libf7-array.def"
    2108  #undef FOR_DENOMINATOR
    2109  #undef ARRAY_NAME
    2110  
    2111  #include "libf7-constdef.h"
    2112  
    2113  F7_WEAK
    2114  void f7_atan (f7_t *cc, const f7_t *aa)
    2115  {
    2116    uint8_t a_class = f7_classify (aa);
    2117    uint8_t flags = a_class & F7_FLAG_sign;
    2118  
    2119    if (f7_class_nan (a_class))
    2120      return f7_set_nan (cc);
    2121  
    2122    f7_t yy7, *yy = &yy7;
    2123    f7_t zz7, *zz = &zz7;
    2124  
    2125    if (f7_class_inf (a_class))
    2126      {
    2127        f7_set_u16 (cc, 0);
    2128        goto do_Inf;
    2129      }
    2130  
    2131    // C = |A|
    2132    f7_copy (cc, aa);
    2133    cc->flags = 0;
    2134  
    2135    if (!f7_class_zero (a_class) && cc->expo >= 0)
    2136      {
    2137        // C >= 1:  use  atan (x) + atan (1/x) = pi / 2  to reduce to [0, 1].
    2138        flags |= 1 << 1;
    2139        f7_div (cc, f7_set_u16 (yy, 1), cc);
    2140      }
    2141  #if !defined (MINIMAX_6_6_IN_0_1)
    2142    const uint8_t const_a_msb = 0x89;
    2143    const int16_t const_a_expo = -2;
    2144    if (f7_abscmp_msb_ge (cc, const_a_msb, const_a_expo))
    2145      {
    2146        // We have C in [0, 1] and we want to use argument reduction by means
    2147        // of addition theorem  atan(x) - atan(y) = atan((x - y) / (1 + xy)).
    2148        // We want to split [0, 1] into  [0, a] u [a, 1]  in such a way that
    2149        // the upper interval will be mapped to  [-a, a].  The system is easy
    2150        // to solve and yiels
    2151        //    y = 1 / sqrt (3)       ~  0.57735     atan(y) = pi / 6
    2152        //    a = (1 - y) / (1 + y)  ~  0.26795  ~  0x0.8930A2F * 2^-1.
    2153        flags |= 1 << 2;
    2154        // C <- (C - Y) / (1 + C * Y)  in  [-a, a]
    2155        const f7_t *cst = F7_CONST_ADDR (1_sqrt3, zz);
    2156        f7_mul (yy, cc, cst);
    2157        f7_Isub (cc, cst);
    2158        f7_Iadd (yy, F7_U16_ADDR (1, zz));
    2159        f7_Idiv (cc, yy);
    2160      }
    2161  #endif
    2162    // C <- C * p(C^2) / q(C^2)
    2163    f7_square (yy, cc);
    2164    f7_horner (zz, yy, n_coeff_atan_zahler, coeff_atan_zahler, NULL);
    2165    f7_Imul (zz, cc);
    2166    f7_horner (cc, yy, n_coeff_atan_nenner, coeff_atan_nenner, NULL);
    2167    f7_div (cc, zz, cc);
    2168  
    2169  #if !defined (MINIMAX_6_6_IN_0_1)
    2170    if (flags & (1 << 2))
    2171      f7_Iadd (cc, F7_CONST_ADDR (pi_6, yy));
    2172  #endif
    2173  
    2174    if (flags & (1 << 1))
    2175      {
    2176      do_Inf:;
    2177        // Y = pi / 2
    2178        f7_const (yy, pi);
    2179        yy->expo = F7_(const_pi_expo) - 1;
    2180        f7_IRsub (cc, yy);
    2181      }
    2182  
    2183    cc->flags = a_class & F7_FLAG_sign;
    2184  }
    2185  #undef MINIMAX_6_6_IN_0_1
    2186  #endif // F7MOD_atan_
    2187  
    2188  
    2189  #ifdef F7MOD_asinacos_
    2190  
    2191  #define ARRAY_NAME coeff_func_a_zahler
    2192  #define FOR_NUMERATOR
    2193  #include "libf7-array.def"
    2194  #undef  FOR_NUMERATOR
    2195  #undef  ARRAY_NAME
    2196  
    2197  #define ARRAY_NAME coeff_func_a_nenner
    2198  #define FOR_DENOMINATOR
    2199  #include "libf7-array.def"
    2200  #undef  FOR_DENOMINATOR
    2201  #undef  ARRAY_NAME
    2202  
    2203  typedef union
    2204  {
    2205    struct
    2206    {
    2207      bool    sign : 1;       // Must be bit F7_FLAGNO_sign.
    2208      bool    do_acos : 1;    // From caller.
    2209      bool    have_acos : 1;  // What we compute from rational approx p/q.
    2210      uint8_t res : 5;
    2211    };
    2212    uint8_t bits;
    2213  } asinacos_t;
    2214  
    2215  F7_WEAK
    2216  void f7_asinacos (f7_t *cc, const f7_t *aa, uint8_t what)
    2217  {
    2218    f7_t xx7, *xx = &xx7;
    2219    f7_t xx27, *xx2 = &xx27;
    2220  
    2221    asinacos_t flags = { .bits = what | f7_signbit (aa) };
    2222  
    2223    f7_abs (xx, aa);
    2224  
    2225    int8_t cmp = f7_cmp (xx, f7_set_u16 (cc, 1));
    2226  
    2227    if (cmp == INT8_MIN
    2228        || cmp > 0)
    2229      {
    2230        return f7_set_nan (cc);
    2231      }
    2232  
    2233    if (xx->expo <= -2 || f7_is_zero (xx))
    2234      {
    2235        // |A| < 1/2:  asin(x) = x * a(2*x^2)
    2236        f7_square (xx2, xx);
    2237        xx2->expo ++;
    2238      }
    2239    else
    2240      {
    2241        // |A| > 1/2: acos (1-x) = sqrt(2*x) * a(x)
    2242        // C is 1 from above.
    2243        f7_IRsub (xx, cc);
    2244        f7_copy (xx2, xx);
    2245        flags.have_acos = 1;
    2246      }
    2247  
    2248    // MiniMax [5/4] numerator.
    2249    f7_horner (cc, xx2, n_coeff_func_a_zahler, coeff_func_a_zahler, NULL);
    2250  
    2251    if (flags.have_acos)
    2252      {
    2253        xx->expo ++;
    2254        f7_Isqrt (xx);
    2255      }
    2256    f7_Imul (cc, xx);
    2257  
    2258    // MiniMax [5/4] denominator.
    2259    f7_horner (xx, xx2, n_coeff_func_a_nenner, coeff_func_a_nenner, NULL);
    2260  
    2261    f7_Idiv (cc, xx);
    2262  
    2263    /*
    2264        With the current value of C, we have:
    2265  
    2266  		|	     |	      do_asin	    |	    do_acos
    2267  		|     C	     |	A <= 0	 |  A >= 0  |  A <= 0  |  A >= 0
    2268        ----------+------------+-----------+----------+----------+----------
    2269        have_asin | asin (|A|) |	  -C	 |    C	    | pi/2 + C | pi/2 - C
    2270        have_acos | acos (|A|) | -pi/2 + C | pi/2 - C |  pi - C  |     C
    2271  
    2272        Result = n_pi2 * pi/2 + C * (c_sign ? -1 : 1)
    2273        Result (A, do_asin) = asin (A)
    2274        Result (A, do_acos) = acos (A)
    2275  
    2276        with
    2277  	  c_sign = do_acos ^ have_acos ^ a_sign
    2278  	  n_pi2	 = do_acos + have_acos * (a_sign ^ do_acos) ? (-1 : 1)
    2279  	  n_pi2 in { -1, 0, 1, 2 }
    2280    */
    2281  
    2282    // All what matters for c_sign is bit 0.
    2283    uint8_t c_sign = flags.bits;
    2284    int8_t n_pi2 = flags.do_acos;
    2285    c_sign ^= flags.do_acos;
    2286    if (flags.have_acos)
    2287      {
    2288        n_pi2++;
    2289        __asm ("" : "+r" (n_pi2));
    2290        if (c_sign & 1)  // c_sign & 1  =  a_sign ^ do_acos
    2291  	n_pi2 -= 2;
    2292        c_sign++;
    2293      }
    2294  
    2295    cc->flags = c_sign & F7_FLAG_sign;
    2296  
    2297    if (n_pi2)
    2298      {
    2299        f7_const (xx, pi);
    2300        if (n_pi2 < 0)
    2301  	xx->sign = 1;
    2302        if (n_pi2 != 2)
    2303  	xx->expo = F7_(const_pi_expo) - 1;
    2304  
    2305        f7_Iadd (cc, xx);
    2306      }
    2307  }
    2308  #endif // F7MOD_asinacos_
    2309  
    2310  
    2311  #ifdef F7MOD_asin_
    2312  F7_WEAK
    2313  void f7_asin (f7_t *cc, const f7_t *aa)
    2314  {
    2315    f7_asinacos (cc, aa, 0);
    2316  }
    2317  #endif // F7MOD_asin_
    2318  
    2319  
    2320  #ifdef F7MOD_acos_
    2321  F7_WEAK
    2322  void f7_acos (f7_t *cc, const f7_t *aa)
    2323  {
    2324    f7_asinacos (cc, aa, 1 << 1);
    2325  }
    2326  #endif // F7MOD_acos_
    2327  
    2328  
    2329  #ifndef IN_LIBGCC2
    2330  
    2331  #ifdef F7MOD_put_C_
    2332  
    2333  #include <stdio.h>
    2334  #include <avr/pgmspace.h>
    2335  
    2336  static F7_INLINE
    2337  uint8_t f7_hex_digit (uint8_t nibble)
    2338  {
    2339    nibble = (uint8_t) (nibble + '0');
    2340    if (nibble > '9')
    2341      nibble = (uint8_t) (nibble + ('a' - '0' - 10));
    2342    return nibble;
    2343  }
    2344  
    2345  static void f7_put_hex2 (uint8_t x, FILE *stream)
    2346  {
    2347    putc ('0', stream);
    2348    if (x)
    2349      {
    2350        putc ('x', stream);
    2351        putc (f7_hex_digit (x >> 4), stream);
    2352        putc (f7_hex_digit (x & 0xf), stream);
    2353      }
    2354  }
    2355  
    2356  #define XPUT(str) \
    2357    fputs_P (PSTR (str), stream)
    2358  
    2359  // Write to STREAM a line that is appropriate for usage in libf7-const.def.
    2360  
    2361  F7_WEAK
    2362  void f7_put_CDEF (const char *name, const f7_t *aa, FILE *stream)
    2363  {
    2364    char buf[7];
    2365    XPUT ("F7_CONST_DEF (");
    2366    fputs (name, stream);
    2367    XPUT (",\t");
    2368    uint8_t a_class = f7_classify (aa);
    2369    if (! f7_class_nonzero (a_class))
    2370      {
    2371        f7_put_hex2 (a_class & F7_FLAGS, stream);
    2372        XPUT (",\t0,0,0,0,0,0,0,\t0)");
    2373        return;
    2374      }
    2375    putc ('0' + (a_class & F7_FLAGS), stream);
    2376    XPUT (",\t");
    2377  
    2378    for (uint8_t i = 0; i < F7_MANT_BYTES; i++)
    2379      {
    2380        f7_put_hex2 (aa->mant[F7_MANT_BYTES-1 - i], stream);
    2381        putc (',', stream);
    2382      }
    2383    putc ('\t', stream);
    2384  
    2385    itoa (aa->expo, buf, 10);
    2386    fputs (buf, stream);
    2387    XPUT (")");
    2388  }
    2389  
    2390  void f7_put_C (const f7_t *aa, FILE *stream)
    2391  {
    2392    char buf[7];
    2393  
    2394    uint8_t a_class = f7_classify (aa);
    2395    if (f7_class_nan (a_class))
    2396      {
    2397        XPUT ("{ .is_nan = 1 }");
    2398        return;
    2399      }
    2400    bool sign = a_class & F7_FLAG_sign;
    2401  
    2402    if (f7_class_inf (a_class))
    2403      {
    2404        XPUT ("{ .is_nan = 1, .sign = ");
    2405        putc ('0' + sign, stream);
    2406        XPUT (" }");
    2407        return;
    2408      }
    2409  
    2410    XPUT ("{ .sign = ");
    2411    putc ('0' + sign, stream);
    2412  
    2413    XPUT (", .mant = { ");
    2414    for (uint8_t i = 0; i < F7_MANT_BYTES; i++)
    2415      {
    2416        f7_put_hex2 (aa->mant[F7_MANT_BYTES-1 - i], stream);
    2417        if (i != F7_MANT_BYTES - 1)
    2418  	putc (',', stream);
    2419      }
    2420  
    2421    XPUT (" }, .expo = ");
    2422    itoa (aa->expo, buf, 10);
    2423    fputs (buf, stream);
    2424    XPUT (" }");
    2425  }
    2426  #endif //F7MOD_put_C_
    2427  
    2428  
    2429  #ifdef F7MOD_dump_
    2430  
    2431  #include <avr/pgmspace.h>
    2432  
    2433  #ifndef AVRTEST_H
    2434  
    2435  #include <stdio.h>
    2436  
    2437  static void LOG_PSTR (const char *str)
    2438  {
    2439    fputs_P (str, stdout);
    2440  }
    2441  
    2442  static void LOG_PFMT_U16 (const char *fmt, uint16_t x)
    2443  {
    2444    printf_P (fmt, x);
    2445  }
    2446  
    2447  static void LOG_PFMT_FLOAT (const char *fmt, float x)
    2448  {
    2449    printf_P (fmt, x);
    2450  }
    2451  
    2452  #define LOG_X8(X)               LOG_PFMT_U16 (PSTR (" 0x%02x "), (uint8_t)(X))
    2453  #define LOG_PFMT_S16(FMT, X)    LOG_PFMT_U16 (FMT, (unsigned)(X))
    2454  #define LOG_PFMT_ADDR(FMT, X)   LOG_PFMT_U16 (FMT, (unsigned)(X))
    2455  
    2456  #endif // AVRTEST_H
    2457  
    2458  static void dump_byte (uint8_t b)
    2459  {
    2460    LOG_PSTR (PSTR (" "));
    2461    for (uint8_t i = 0; i < 8; i++)
    2462      {
    2463        LOG_PSTR ((b & 0x80) ? PSTR ("1") : PSTR ("0"));
    2464        b = (uint8_t) (b << 1);
    2465      }
    2466  }
    2467  
    2468  void f7_dump_mant (const f7_t *aa)
    2469  {
    2470    LOG_PSTR (PSTR ("\tmant   ="));
    2471    for (int i = F7_MANT_BYTES - 1; i >= 0; i--)
    2472      LOG_X8 (aa->mant[i]);
    2473    LOG_PSTR (PSTR ("\n\t       ="));
    2474  
    2475    for (int i = F7_MANT_BYTES - 1; i >= 0; i--)
    2476      dump_byte (aa->mant[i]);
    2477    LOG_PSTR (PSTR ("\n"));
    2478  }
    2479  
    2480  void f7_dump (const f7_t *aa)
    2481  {
    2482    LOG_PFMT_ADDR (PSTR ("\n0x%04x\tflags  = "), aa);
    2483    dump_byte (aa->flags);
    2484    uint8_t a_class = f7_classify (aa);
    2485    LOG_PSTR (PSTR ("  = "));
    2486    LOG_PSTR (f7_class_sign (a_class) ? PSTR ("-") : PSTR ("+"));
    2487    if (f7_class_inf (a_class))    LOG_PSTR (PSTR ("Inf "));
    2488    if (f7_class_nan (a_class))    LOG_PSTR (PSTR ("NaN "));
    2489    if (f7_class_zero (a_class))   LOG_PSTR (PSTR ("0 "));
    2490    if (f7_class_number (a_class)) LOG_PSTR (PSTR ("Number "));
    2491  
    2492    LOG_PFMT_FLOAT (PSTR (" = %.10g\n"), f7_get_float (aa));
    2493    LOG_PFMT_S16 (PSTR ("\texpo   = %d\n"), aa->expo);
    2494  
    2495    f7_dump_mant (aa);
    2496  }
    2497  #endif // F7MOD_dump_
    2498  
    2499  #endif // ! libgcc
    2500  
    2501  #endif // !AVR_TINY