1 """
2 Basic statistics module.
3
4 This module provides functions for calculating statistics of data, including
5 averages, variance, and standard deviation.
6
7 Calculating averages
8 --------------------
9
10 ================== ==================================================
11 Function Description
12 ================== ==================================================
13 mean Arithmetic mean (average) of data.
14 fmean Fast, floating point arithmetic mean.
15 geometric_mean Geometric mean of data.
16 harmonic_mean Harmonic mean of data.
17 median Median (middle value) of data.
18 median_low Low median of data.
19 median_high High median of data.
20 median_grouped Median, or 50th percentile, of grouped data.
21 mode Mode (most common value) of data.
22 multimode List of modes (most common values of data).
23 quantiles Divide data into intervals with equal probability.
24 ================== ==================================================
25
26 Calculate the arithmetic mean ("the average") of data:
27
28 >>> mean([-1.0, 2.5, 3.25, 5.75])
29 2.625
30
31
32 Calculate the standard median of discrete data:
33
34 >>> median([2, 3, 4, 5])
35 3.5
36
37
38 Calculate the median, or 50th percentile, of data grouped into class intervals
39 centred on the data values provided. E.g. if your data points are rounded to
40 the nearest whole number:
41
42 >>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
43 2.8333333333...
44
45 This should be interpreted in this way: you have two data points in the class
46 interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47 the class interval 3.5-4.5. The median of these data points is 2.8333...
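

Divide data into intervals with equal probability (here, the quartile cut
points of a small dataset):

>>> quantiles([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
[2.75, 5.5, 8.25]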


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Statistics for relations between two inputs
-------------------------------------------

==================  ====================================================
Function            Description
==================  ====================================================
covariance          Sample covariance for two variables.
correlation         Pearson's correlation coefficient for two variables.
linear_regression   Intercept and slope for simple linear regression.
==================  ====================================================

Calculate covariance, Pearson's correlation, and simple linear regression
for two inputs:

>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> covariance(x, y)
0.75
>>> correlation(x, y) #doctest: +ELLIPSIS
0.31622776601...
>>> linear_regression(x, y)
LinearRegression(slope=0.1, intercept=1.5)


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]

import math
import numbers
import random
import sys

from fractions import Fraction
from decimal import Decimal
from itertools import groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
from operator import mul
from collections import Counter, namedtuple, defaultdict

_SQRT2 = sqrt(2.0)

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    types = set()
    types_add = types.add
    partials = {}
    partials_get = partials.get
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        total = sum(Fraction(n, d) for d, n in partials.items())
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, total, count)


def _ss(data, c=None):
    """Return the exact mean and sum of square deviations of sequence data.

    Calculations are done in a single pass, allowing the input to be an iterator.

    If given, *c* is used as the mean; otherwise, it is calculated from the data.
    Use the *c* argument with care, as it can lead to garbage results.

    """
    if c is not None:
        T, ssd, count = _sum((d := x - c) * d for x in data)
        return (T, ssd, c, count)
    count = 0
    types = set()
    types_add = types.add
    sx_partials = defaultdict(int)
    sxx_partials = defaultdict(int)
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            sx_partials[d] += n
            sxx_partials[d] += n * n
    if not count:
        ssd = c = Fraction(0)
    elif None in sx_partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        ssd = c = sx_partials[None]
        assert not _isfinite(ssd)
    else:
        sx = sum(Fraction(n, d) for d, n in sx_partials.items())
        sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
        # This formula has poor numeric properties for floats,
        # but with fractions it is exact.
        ssd = (count * sxx - sx * sx) / count
        c = sx / count
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, ssd, c, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
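
    Informal doctests for this internal helper:

    >>> _coerce(int, float)
    <class 'float'>
    >>> _coerce(Fraction, float)
    <class 'float'>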
261 """
262 # See http://bugs.python.org/issue24068.
263 assert T is not bool, "initial type T is bool"
264 # If the types are the same, no need to coerce anything. Put this
265 # first, so that the usual case (no coercion needed) happens as soon
266 # as possible.
267 if T is S: return T
268 # Mixed int & other coerce to the other type.
269 if S is int or S is bool: return T
270 if T is int: return S
271 # If one is a (strict) subclass of the other, coerce to the subclass.
272 if issubclass(S, T): return S
273 if issubclass(T, S): return T
274 # Ints coerce to the other type.
275 if issubclass(T, int): return S
276 if issubclass(S, int): return T
277 # Mixed fraction & float coerces to float (or float subclass).
278 if issubclass(T, Fraction) and issubclass(S, float):
279 return S
280 if issubclass(T, float) and issubclass(S, Fraction):
281 return T
282 # Any other combination is disallowed.
283 msg = "don't know how to coerce %s and %s"
284 raise TypeError(msg % (T.__name__, S.__name__))
285
286
287 def _exact_ratio(x):
288 """Return Real number x to exact (numerator, denominator) pair.
289
290 >>> _exact_ratio(0.25)
291 (1, 4)
292
293 x is expected to be an int, Fraction, Decimal or float.
294 """
295
296 # XXX We should revisit whether using fractions to accumulate exact
297 # ratios is the right way to go.
298
299 # The integer ratios for binary floats can have numerators or
300 # denominators with over 300 decimal digits. The problem is more
301 # acute with decimal floats where the default decimal context
302 # supports a huge range of exponents from Emin=-999999 to
303 # Emax=999999. When expanded with as_integer_ratio(), numbers like
304 # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
305 # numerators or denominators that will slow computation.
306
307 # When the integer ratios are accumulated as fractions, the size
308 # grows to cover the full range from the smallest magnitude to the
309 # largest. For example, Fraction(3.14E+300) + Fraction(3.14E-300),
310 # has a 616 digit numerator. Likewise,
311 # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
312 # has 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    return a | (a*a*m != n)
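
# Informal example of the round-to-odd step above: for n=25, m=4,
# isqrt(25 // 4) == 2, and since 2*2*4 != 25 the low bit is set, giving 3.
# Keeping inexact intermediate results odd avoids double-rounding error
# when the quotient is later rounded to float precision.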


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator  # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise: For decimal, computing (n/m).sqrt() can be off
    # by 1 ulp from the correctly rounded result.
    # Method: Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
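
    With optional *weights*, a weighted mean is computed (the values here
    are chosen so the result is exact in binary floating point):

    >>> fmean([3.5, 4.0, 5.25], weights=[0.25, 0.5, 0.25])
    4.1875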
444 """
445 try:
446 n = len(data)
447 except TypeError:
448 # Handle iterators that do not define __len__().
449 n = 0
450 def count(iterable):
451 nonlocal n
452 for n, x in enumerate(iterable, start=1):
453 yield x
454 data = count(data)
455 if weights is None:
456 total = fsum(data)
457 if not n:
458 raise StatisticsError('fmean requires at least one data point')
459 return total / n
460 try:
461 num_weights = len(weights)
462 except TypeError:
463 weights = list(weights)
464 num_weights = len(weights)
465 num = fsum(map(mul, data, weights))
466 if n != num_weights:
467 raise StatisticsError('data and weights must be the same length')
468 den = fsum(weights)
469 if not den:
470 raise StatisticsError('sum of weights must be non-zero')
471 return num / den
472
473
474 def geometric_mean(data):
475 """Convert data to floats and compute the geometric mean.
476
477 Raises a StatisticsError if the input dataset is empty,
478 if it contains a zero, or if it contains a negative value.
479
480 No special efforts are made to achieve exact results.
481 (However, this may change in the future.)
482
483 >>> round(geometric_mean([54, 24, 36]), 9)
484 36.0
485 """
486 try:
487 return exp(fmean(map(log, data)))
488 except ValueError:
489 raise StatisticsError('geometric mean requires a non-empty dataset '
490 'containing positive numbers') from None
491
492
493 def harmonic_mean(data, weights=None):
494 """Return the harmonic mean of data.
495
496 The harmonic mean is the reciprocal of the arithmetic mean of the
497 reciprocals of the data. It can be used for averaging ratios or
498 rates, for example speeds.
499
500 Suppose a car travels 40 km/hr for 5 km and then speeds-up to
501 60 km/hr for another 5 km. What is the average speed?
502
503 >>> harmonic_mean([40, 60])
504 48.0
505
506 Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
507 speeds-up to 60 km/hr for the remaining 30 km of the journey. What
508 is the average speed?
509
510 >>> harmonic_mean([40, 60], weights=[5, 30])
511 56.0
512
513 If ``data`` is empty, or any element is less than zero,
514 ``harmonic_mean`` will raise ``StatisticsError``.
515 """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin. At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

    >>> demographics = Counter({
    ...     25: 172,   # 20 to 30 years old
    ...     35: 484,   # 30 to 40 years old
    ...     45: 387,   # 40 to 50 years old
    ...     55:  22,   # 50 to 60 years old
    ...     65:   6,   # 60 to 70 years old
    ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort. That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old. A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40. For that, we use median_grouped().

    >>> data = list(demographics.elements())
    >>> median(data)
    35
    >>> round(median_grouped(data, interval=10), 1)
    37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*. This is essential for getting a
    correct result. The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with the same frequency, return the first
    one encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles. Here we offer
# two methods that serve common needs. Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".

# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice. Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0. The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300. Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice. Instead of the mean, it uses the mode of the beta
# distribution for the interior points. Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added. However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *data* into 100 equal-sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data. The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
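
    A small worked example showing both methods:

    >>> data = [1, 2, 3, 4]
    >>> quantiles(data, n=4)
    [1.25, 2.5, 3.75]
    >>> quantiles(data, n=4, method='inclusive')
    [1.75, 2.5, 3.25]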
788 """
789 if n < 1:
790 raise StatisticsError('n must be at least 1')
791 data = sorted(data)
792 ld = len(data)
793 if ld < 2:
794 raise StatisticsError('must have at least two data points')
795 if method == 'inclusive':
796 m = ld - 1
797 result = []
798 for i in range(1, n):
799 j, delta = divmod(i * m, n)
800 interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
801 result.append(interpolated)
802 return result
803 if method == 'exclusive':
804 m = ld + 1
805 result = []
806 for i in range(1, n):
807 j = i * m // n # rescale i to m/n
808 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
809 delta = i*m - j*n # exact integer math
810 interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
811 result.append(interpolated)
812 return result
813 raise ValueError(f'Unknown method: {method!r}')
814
815
816 # === Measures of spread ===
817
818 # See http://mathworld.wolfram.com/Variance.html
819 # http://mathworld.wolfram.com/SampleVariance.html
820
821
822 def variance(data, xbar=None):
823 """Return the sample variance of data.
824
825 data should be an iterable of Real-valued numbers, with at least two
826 values. The optional argument xbar, if given, should be the mean of
827 the data. If it is missing or None, the mean is automatically calculated.
828
829 Use this function when your data is a sample from a population. To
830 calculate the variance from the entire population, see ``pvariance``.
831
832 Examples:
833
834 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
835 >>> variance(data)
836 1.3720238095238095
837
838 If you have already calculated the mean of your data, you can pass it as
839 the optional second argument ``xbar`` to avoid recalculating it:
840
841 >>> m = mean(data)
842 >>> variance(data, m)
843 1.3720238095238095
844
845 This function does not check that ``xbar`` is actually the mean of
846 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
847 impossible results.
848
849 Decimals and Fractions are supported:
850
851 >>> from decimal import Decimal as D
852 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
853 Decimal('31.01875')
854
855 >>> from fractions import Fraction as F
856 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
857 Fraction(67, 108)
858
859 """
860 T, ss, c, n = _ss(data, xbar)
861 if n < 2:
862 raise StatisticsError('variance requires at least two data points')
863 return _convert(ss / (n - 1), T)
864
865
866 def pvariance(data, mu=None):
867 """Return the population variance of ``data``.
868
869 data should be a sequence or iterable of Real-valued numbers, with at least one
870 value. The optional argument mu, if given, should be the mean of
871 the data. If it is missing or None, the mean is automatically calculated.
872
873 Use this function to calculate the variance from the entire population.
874 To estimate the variance from a sample, the ``variance`` function is
875 usually a better choice.
876
877 Examples:
878
879 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
880 >>> pvariance(data)
881 1.25
882
883 If you have already calculated the mean of the data, you can pass it as
884 the optional second argument to avoid recalculating it:
885
886 >>> mu = mean(data)
887 >>> pvariance(data, mu)
888 1.25
889
890 Decimals and Fractions are supported:
891
892 >>> from decimal import Decimal as D
893 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
894 Decimal('24.815')
895
896 >>> from fractions import Fraction as F
897 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
898 Fraction(13, 72)
899
900 """
901 T, ss, c, n = _ss(data, mu)
902 if n < 1:
903 raise StatisticsError('pvariance requires at least one data point')
904 return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle Nans and Infs gracefully
        return float(xbar), float(xbar) / float(ss)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    return sxy / (n - 1)


def correlation(x, y, /):
    """Pearson's correlation coefficient

    Return the Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures the
    strength and direction of the linear relationship, where +1 means very
    strong, positive linear relationship, -1 very strong, negative linear
    relationship, and 0 no linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    sxx = fsum((d := xi - xbar) * d for xi in x)
    syy = fsum((d := yi - ybar) * d for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y) #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True) #doctest: +ELLIPSIS
    LinearRegression(slope=3.02447542484..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if proportional:
        sxy = fsum(xi * yi for xi, yi in zip(x, y))
        sxx = fsum(xi * xi for xi in x)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((d := xi - xbar) * d for xi in x)
    try:
        slope = sxy / sxx  # equivalent to: covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        return cls(*_mean_stdev(data))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        "Probability density function. P(x <= X < x+dx) / dx"
        variance = self._sigma * self._sigma
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        diff = x - self._mu
        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)

    def cdf(self, x):
        "Cumulative distribution function. P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2)))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function. x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
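
        For example, the median of the standard normal distribution is zero:

        >>> NormalDist().inv_cdf(0.5)
        0.0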
1221 """
1222 if p <= 0.0 or p >= 1.0:
1223 raise StatisticsError('p must be in the range 0.0 < p < 1.0')
1224 if self._sigma <= 0.0:
1225 raise StatisticsError('cdf() not defined when sigma at or below zero')
1226 return _normal_dist_inv_cdf(p, self._mu, self._sigma)
1227
1228 def quantiles(self, n=4):
1229 """Divide into *n* continuous intervals with equal probability.
1230
1231 Returns a list of (n - 1) cut points separating the intervals.
1232
1233 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
1234 Set *n* to 100 for percentiles which gives the 99 cuts points that
1235 separate the normal distribution in to 100 equal sized groups.
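
        For example, the quartiles of the standard normal distribution:

        >>> NormalDist().quantiles() #doctest: +ELLIPSIS
        [-0.6744897501..., 0.0, 0.6744897501...]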
1236 """
1237 return [self.inv_cdf(i / n) for i in range(1, n)]
1238
1239 def overlap(self, other):
1240 """Compute the overlapping coefficient (OVL) between two normal distributions.
1241
1242 Measures the agreement between two normal probability distributions.
1243 Returns a value between 0.0 and 1.0 giving the overlapping area in
1244 the two underlying probability density functions.
1245
1246 >>> N1 = NormalDist(2.4, 1.6)
1247 >>> N2 = NormalDist(3.2, 2.0)
1248 >>> N1.overlap(N2)
1249 0.8035050657330205
1250 """
1251 # See: "The overlapping coefficient as a measure of agreement between
1252 # probability distributions and point estimation of the overlap of two
1253 # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
1254 # http://dx.doi.org/10.1080/03610928908830127
1255 if not isinstance(other, NormalDist):
1256 raise TypeError('Expected another NormalDist instance')
1257 X, Y = self, other
1258 if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
1259 X, Y = Y, X
1260 X_var, Y_var = X.variance, Y.variance
1261 if not X_var or not Y_var:
1262 raise StatisticsError('overlap() not defined when sigma is zero')
1263 dv = Y_var - X_var
1264 dm = fabs(Y._mu - X._mu)
1265 if not dv:
1266 return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2))
1267 a = X._mu * Y_var - Y._mu * X_var
1268 b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
1269 x1 = (a + b) / dv
1270 x2 = (a - b) / dv
1271 return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1272
1273 def zscore(self, x):
1274 """Compute the Standard Score. (x - mean) / stdev
1275
1276 Describes *x* in terms of the number of standard deviations
1277 above or below the mean of the normal distribution.
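
        For example, a score of 130 on a scale with mean 100 and standard
        deviation 15:

        >>> NormalDist(100, 15).zscore(130)
        2.0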
1278 """
1279 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1280 if not self._sigma:
1281 raise StatisticsError('zscore() not defined when sigma is zero')
1282 return (x - self._mu) / self._sigma
1283
1284 @property
1285 def mean(self):
1286 "Arithmetic mean of the normal distribution."
1287 return self._mu
1288
1289 @property
1290 def median(self):
1291 "Return the median of the normal distribution"
1292 return self._mu
1293
1294 @property
1295 def mode(self):
1296 """Return the mode of the normal distribution
1297
1298 The mode is the value x where which the probability density
1299 function (pdf) takes its maximum value.
1300 """
1301 return self._mu
1302
1303 @property
1304 def stdev(self):
1305 "Standard deviation of the normal distribution."
1306 return self._sigma
1307
1308 @property
1309 def variance(self):
1310 "Square of the standard deviation."
1311 return self._sigma * self._sigma
1312
1313 def __add__(x1, x2):
1314 """Add a constant or another NormalDist instance.
1315
1316 If *other* is a constant, translate mu by the constant,
1317 leaving sigma unchanged.
1318
1319 If *other* is a NormalDist, add both the means and the variances.
1320 Mathematically, this works only if the two distributions are
1321 independent or if they are jointly normally distributed.
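
        For example, summing two independent distributions:

        >>> NormalDist(0, 3) + NormalDist(0, 4)
        NormalDist(mu=0.0, sigma=5.0)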
1322 """
1323 if isinstance(x2, NormalDist):
1324 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1325 return NormalDist(x1._mu + x2, x1._sigma)
1326
1327 def __sub__(x1, x2):
1328 """Subtract a constant or another NormalDist instance.
1329
1330 If *other* is a constant, translate by the constant mu,
1331 leaving sigma unchanged.
1332
1333 If *other* is a NormalDist, subtract the means and add the variances.
1334 Mathematically, this works only if the two distributions are
1335 independent or if they are jointly normally distributed.
1336 """
1337 if isinstance(x2, NormalDist):
1338 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1339 return NormalDist(x1._mu - x2, x1._sigma)
1340
1341 def __mul__(x1, x2):
1342 """Multiply both mu and sigma by a constant.
1343
1344 Used for rescaling, perhaps to change measurement units.
1345 Sigma is scaled with the absolute value of the constant.
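
        For example, rescaling by a constant factor:

        >>> NormalDist(10, 2.5) * 4
        NormalDist(mu=40.0, sigma=10.0)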
1346 """
1347 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
1348
1349 def __truediv__(x1, x2):
1350 """Divide both mu and sigma by a constant.
1351
1352 Used for rescaling, perhaps to change measurement units.
1353 Sigma is scaled with the absolute value of the constant.
1354 """
1355 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
1356
1357 def __pos__(x1):
1358 "Return a copy of the instance."
1359 return NormalDist(x1._mu, x1._sigma)
1360
1361 def __neg__(x1):
1362 "Negates mu while keeping sigma the same."
1363 return NormalDist(-x1._mu, x1._sigma)
1364
1365 __radd__ = __add__
1366
1367 def __rsub__(x1, x2):
1368 "Subtract a NormalDist from a constant or another NormalDist."
1369 return -(x1 - x2)
1370
1371 __rmul__ = __mul__
1372
1373 def __eq__(x1, x2):
1374 "Two NormalDist objects are equal if their mu and sigma are both equal."
1375 if not isinstance(x2, NormalDist):
1376 return NotImplemented
1377 return x1._mu == x2._mu and x1._sigma == x2._sigma
1378
1379 def __hash__(self):
1380 "NormalDist objects hash equal if their mu and sigma are both equal."
1381 return hash((self._mu, self._sigma))
1382
1383 def __repr__(self):
1384 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
1385
1386 def __getstate__(self):
1387 return self._mu, self._sigma
1388
1389 def __setstate__(self, state):
1390 self._mu, self._sigma = state