1 """
2 Basic statistics module.
3
4 This module provides functions for calculating statistics of data, including
5 averages, variance, and standard deviation.
6
7 Calculating averages
8 --------------------
9
10 ================== ==================================================
11 Function Description
12 ================== ==================================================
13 mean Arithmetic mean (average) of data.
14 fmean Fast, floating point arithmetic mean.
15 geometric_mean Geometric mean of data.
16 harmonic_mean Harmonic mean of data.
17 median Median (middle value) of data.
18 median_low Low median of data.
19 median_high High median of data.
20 median_grouped Median, or 50th percentile, of grouped data.
21 mode Mode (most common value) of data.
22 multimode List of modes (most common values of data).
23 quantiles Divide data into intervals with equal probability.
24 ================== ==================================================
25
26 Calculate the arithmetic mean ("the average") of data:
27
28 >>> mean([-1.0, 2.5, 3.25, 5.75])
29 2.625
30
31
32 Calculate the standard median of discrete data:
33
34 >>> median([2, 3, 4, 5])
35 3.5
36
37
38 Calculate the median, or 50th percentile, of data grouped into class intervals
39 centred on the data values provided. E.g. if your data points are rounded to
40 the nearest whole number:
41
42 >>> median_grouped([2, 2, 3, 3, 3, 4]) #doctest: +ELLIPSIS
43 2.8333333333...
44
45 This should be interpreted in this way: you have two data points in the class
46 interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47 the class interval 3.5-4.5. The median of these data points is 2.8333...
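

Divide data into intervals with equal probability (here, the quartile cut
points of a small dataset):

>>> quantiles([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
[2.75, 5.5, 8.25]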


Calculating variability or spread
---------------------------------

==================  =============================================
Function            Description
==================  =============================================
pvariance           Population variance of data.
variance            Sample variance of data.
pstdev              Population standard deviation of data.
stdev               Sample standard deviation of data.
==================  =============================================

Calculate the standard deviation of sample data:

>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75]) #doctest: +ELLIPSIS
4.38961843444...

If you have previously calculated the mean, you can pass it as the optional
second argument to the four "spread" functions to avoid recalculating it:

>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
>>> mu = mean(data)
>>> pvariance(data, mu)
2.5


Statistics for relations between two inputs
-------------------------------------------

==================  ====================================================
Function            Description
==================  ====================================================
covariance          Sample covariance for two variables.
correlation         Pearson's correlation coefficient for two variables.
linear_regression   Intercept and slope for simple linear regression.
==================  ====================================================

Calculate covariance, Pearson's correlation, and simple linear regression
for two inputs:

>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
>>> covariance(x, y)
0.75
>>> correlation(x, y) #doctest: +ELLIPSIS
0.31622776601...
>>> linear_regression(x, y)
LinearRegression(slope=0.1, intercept=1.5)


Exceptions
----------

A single exception is defined: StatisticsError is a subclass of ValueError.

"""

__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]

import math
import numbers
import random
import sys

from fractions import Fraction
from decimal import Decimal
from itertools import groupby, repeat
from bisect import bisect_left, bisect_right
from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
from functools import reduce
from operator import mul
from collections import Counter, namedtuple, defaultdict

_SQRT2 = sqrt(2.0)

# === Exceptions ===

class StatisticsError(ValueError):
    pass


# === Private utilities ===

def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Some sources of round-off error will be avoided:

    # Built-in sum returns zero.
    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    >>> from decimal import Decimal as D
    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
    >>> _sum(data)
    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    types = set()
    types_add = types.add
    partials = {}
    partials_get = partials.get
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            partials[d] = partials_get(d, 0) + n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Sum all the partial sums using builtin sum.
        total = sum(Fraction(n, d) for d, n in partials.items())
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, total, count)


def _ss(data, c=None):
    """Return the exact mean and sum of square deviations of sequence data.

    Calculations are done in a single pass, allowing the input to be an iterator.

    If given, *c* is used as the mean; otherwise, it is calculated from the data.
    Use the *c* argument with care, as it can lead to garbage results.

    """
    if c is not None:
        T, ssd, count = _sum((d := x - c) * d for x in data)
        return (T, ssd, c, count)
    count = 0
    types = set()
    types_add = types.add
    sx_partials = defaultdict(int)
    sxx_partials = defaultdict(int)
    for typ, values in groupby(data, type):
        types_add(typ)
        for n, d in map(_exact_ratio, values):
            count += 1
            sx_partials[d] += n
            sxx_partials[d] += n * n
    if not count:
        ssd = c = Fraction(0)
    elif None in sx_partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        ssd = c = sx_partials[None]
        assert not _isfinite(ssd)
    else:
        sx = sum(Fraction(n, d) for d, n in sx_partials.items())
        sxx = sum(Fraction(n, d*d) for d, n in sxx_partials.items())
        # This formula has poor numeric properties for floats,
        # but with fractions it is exact.
        ssd = (count * sxx - sx * sx) / count
        c = sx / count
    T = reduce(_coerce, types, int)  # or raise TypeError
    return (T, ssd, c, count)


def _isfinite(x):
    try:
        return x.is_finite()  # Likely a Decimal.
    except AttributeError:
        return math.isfinite(x)  # Coerces to float first.


def _coerce(T, S):
    """Coerce types T and S to a common type, or raise TypeError.

    Coercion rules are currently an implementation detail. See the CoerceTest
    test class in test_statistics for details.
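
    Informal doctests for this internal helper:

    >>> _coerce(int, float)
    <class 'float'>
    >>> _coerce(Fraction, float)
    <class 'float'>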
261 """
262 # See http://bugs.python.org/issue24068.
263 assert T is not bool, "initial type T is bool"
264 # If the types are the same, no need to coerce anything. Put this
265 # first, so that the usual case (no coercion needed) happens as soon
266 # as possible.
267 if T is S: return T
268 # Mixed int & other coerce to the other type.
269 if S is int or S is bool: return T
270 if T is int: return S
271 # If one is a (strict) subclass of the other, coerce to the subclass.
272 if issubclass(S, T): return S
273 if issubclass(T, S): return T
274 # Ints coerce to the other type.
275 if issubclass(T, int): return S
276 if issubclass(S, int): return T
277 # Mixed fraction & float coerces to float (or float subclass).
278 if issubclass(T, Fraction) and issubclass(S, float):
279 return S
280 if issubclass(T, float) and issubclass(S, Fraction):
281 return T
282 # Any other combination is disallowed.
283 msg = "don't know how to coerce %s and %s"
284 raise TypeError(msg % (T.__name__, S.__name__))
285
286
287 def _exact_ratio(x):
288 """Return Real number x to exact (numerator, denominator) pair.
289
290 >>> _exact_ratio(0.25)
291 (1, 4)
292
293 x is expected to be an int, Fraction, Decimal or float.
294 """
295
296 # XXX We should revisit whether using fractions to accumulate exact
297 # ratios is the right way to go.
298
299 # The integer ratios for binary floats can have numerators or
300 # denominators with over 300 decimal digits. The problem is more
301 # acute with decimal floats where the default decimal context
302 # supports a huge range of exponents from Emin=-999999 to
303 # Emax=999999. When expanded with as_integer_ratio(), numbers like
304 # Decimal('3.14E+5000') and Decimal('3.14E-5000') have large
305 # numerators or denominators that will slow computation.
306
307 # When the integer ratios are accumulated as fractions, the size
308 # grows to cover the full range from the smallest magnitude to the
309 # largest. For example, Fraction(3.14E+300) + Fraction(3.14E-300),
310 # has a 616 digit numerator. Likewise,
311 # Fraction(Decimal('3.14E+5000')) + Fraction(Decimal('3.14E-5000'))
312 # has 10,003 digit numerator.

    # This doesn't seem to have been a problem in practice, but it is a
    # potential pitfall.

    try:
        return x.as_integer_ratio()
    except AttributeError:
        pass
    except (OverflowError, ValueError):
        # float NAN or INF.
        assert not _isfinite(x)
        return (x, None)
    try:
        # x may be an Integral ABC.
        return (x.numerator, x.denominator)
    except AttributeError:
        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
        raise TypeError(msg)


def _convert(value, T):
    """Convert value to given numeric type T."""
    if type(value) is T:
        # This covers the cases where T is Fraction, or where value is
        # a NAN or INF (Decimal or float).
        return value
    if issubclass(T, int) and value.denominator != 1:
        T = float
    try:
        # FIXME: what do we do if this overflows?
        return T(value)
    except TypeError:
        if issubclass(T, Decimal):
            return T(value.numerator) / T(value.denominator)
        else:
            raise


def _fail_neg(values, errmsg='negative value'):
    """Iterate over values, failing if any are less than zero."""
    for x in values:
        if x < 0:
            raise StatisticsError(errmsg)
        yield x


def _integer_sqrt_of_frac_rto(n: int, m: int) -> int:
    """Square root of n/m, rounded to the nearest integer using round-to-odd."""
    # Reference: https://www.lri.fr/~melquion/doc/05-imacs17_1-expose.pdf
    a = math.isqrt(n // m)
    return a | (a*a*m != n)
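
# Informal example of the round-to-odd step above: for n=25, m=4,
# isqrt(25 // 4) == 2, and since 2*2*4 != 25 the low bit is set, giving 3.
# Keeping inexact intermediate results odd avoids double-rounding error
# when the quotient is later rounded to float precision.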


# For 53 bit precision floats, the bit width used in
# _float_sqrt_of_frac() is 109.
_sqrt_bit_width: int = 2 * sys.float_info.mant_dig + 3


def _float_sqrt_of_frac(n: int, m: int) -> float:
    """Square root of n/m as a float, correctly rounded."""
    # See principle and proof sketch at: https://bugs.python.org/msg407078
    q = (n.bit_length() - m.bit_length() - _sqrt_bit_width) // 2
    if q >= 0:
        numerator = _integer_sqrt_of_frac_rto(n, m << 2 * q) << q
        denominator = 1
    else:
        numerator = _integer_sqrt_of_frac_rto(n << -2 * q, m)
        denominator = 1 << -q
    return numerator / denominator  # Convert to float


def _decimal_sqrt_of_frac(n: int, m: int) -> Decimal:
    """Square root of n/m as a Decimal, correctly rounded."""
    # Premise: For decimal, computing (n/m).sqrt() can be off
    # by 1 ulp from the correctly rounded result.
    # Method: Check the result, moving up or down a step if needed.
    if n <= 0:
        if not n:
            return Decimal('0.0')
        n, m = -n, -m

    root = (Decimal(n) / Decimal(m)).sqrt()
    nr, dr = root.as_integer_ratio()

    plus = root.next_plus()
    np, dp = plus.as_integer_ratio()
    # test: n / m > ((root + plus) / 2) ** 2
    if 4 * n * (dr*dp)**2 > m * (dr*np + dp*nr)**2:
        return plus

    minus = root.next_minus()
    nm, dm = minus.as_integer_ratio()
    # test: n / m < ((root + minus) / 2) ** 2
    if 4 * n * (dr*dm)**2 < m * (dr*nm + dm*nr)**2:
        return minus

    return root


# === Measures of central tendency (averages) ===

def mean(data):
    """Return the sample arithmetic mean of data.

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    If ``data`` is empty, StatisticsError will be raised.
    """
    T, total, n = _sum(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    return _convert(total / n, T)


def fmean(data, weights=None):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.
    If the input dataset is empty, it raises a StatisticsError.

    >>> fmean([3.5, 4.0, 5.25])
    4.25
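
    With optional *weights*, a weighted mean is computed (the values here
    are chosen so the result is exact in binary floating point):

    >>> fmean([3.5, 4.0, 5.25], weights=[0.25, 0.5, 0.25])
    4.1875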
444 """
445 try:
446 n = len(data)
447 except TypeError:
448 # Handle iterators that do not define __len__().
449 n = 0
450 def count(iterable):
451 nonlocal n
452 for n, x in enumerate(iterable, start=1):
453 yield x
454 data = count(data)
455 if weights is None:
456 total = fsum(data)
457 if not n:
458 raise StatisticsError('fmean requires at least one data point')
459 return total / n
460 try:
461 num_weights = len(weights)
462 except TypeError:
463 weights = list(weights)
464 num_weights = len(weights)
465 num = fsum(map(mul, data, weights))
466 if n != num_weights:
467 raise StatisticsError('data and weights must be the same length')
468 den = fsum(weights)
469 if not den:
470 raise StatisticsError('sum of weights must be non-zero')
471 return num / den
472
473
474 def geometric_mean(data):
475 """Convert data to floats and compute the geometric mean.
476
477 Raises a StatisticsError if the input dataset is empty,
478 if it contains a zero, or if it contains a negative value.
479
480 No special efforts are made to achieve exact results.
481 (However, this may change in the future.)
482
483 >>> round(geometric_mean([54, 24, 36]), 9)
484 36.0
485 """
486 try:
487 return exp(fmean(map(log, data)))
488 except ValueError:
489 raise StatisticsError('geometric mean requires a non-empty dataset '
490 'containing positive numbers') from None
491
492
493 def harmonic_mean(data, weights=None):
494 """Return the harmonic mean of data.
495
496 The harmonic mean is the reciprocal of the arithmetic mean of the
497 reciprocals of the data. It can be used for averaging ratios or
498 rates, for example speeds.
499
500 Suppose a car travels 40 km/hr for 5 km and then speeds-up to
501 60 km/hr for another 5 km. What is the average speed?
502
503 >>> harmonic_mean([40, 60])
504 48.0
505
506 Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
507 speeds-up to 60 km/hr for the remaining 30 km of the journey. What
508 is the average speed?
509
510 >>> harmonic_mean([40, 60], weights=[5, 30])
511 56.0
512
513 If ``data`` is empty, or any element is less than zero,
514 ``harmonic_mean`` will raise ``StatisticsError``.
515 """
    if iter(data) is data:
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)


# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        i = n // 2
        return (data[i - 1] + data[i]) / 2


def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    if n % 2 == 1:
        return data[n // 2]
    else:
        return data[n // 2 - 1]


def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    return data[n // 2]


def median_grouped(data, interval=1.0):
    """Estimates the median for numeric data binned around the midpoints
    of consecutive, fixed-width intervals.

    The *data* can be any iterable of numeric data with each value being
    exactly the midpoint of a bin. At least one value must be present.

    The *interval* is the width of each bin.

    For example, demographic information may have been summarized into
    consecutive ten-year age groups with each group being represented
    by the 5-year midpoints of the intervals:

    >>> demographics = Counter({
    ...     25: 172,   # 20 to 30 years old
    ...     35: 484,   # 30 to 40 years old
    ...     45: 387,   # 40 to 50 years old
    ...     55:  22,   # 50 to 60 years old
    ...     65:   6,   # 60 to 70 years old
    ... })

    The 50th percentile (median) is the 536th person out of the 1071
    member cohort. That person is in the 30 to 40 year old age group.

    The regular median() function would assume that everyone in the
    tricenarian age group was exactly 35 years old. A more tenable
    assumption is that the 484 members of that age group are evenly
    distributed between 30 and 40. For that, we use median_grouped().

    >>> data = list(demographics.elements())
    >>> median(data)
    35
    >>> round(median_grouped(data, interval=10), 1)
    37.5

    The caller is responsible for making sure the data points are separated
    by exact multiples of *interval*. This is essential for getting a
    correct result. The function does not check this precondition.

    Inputs may be any numeric type that can be coerced to a float during
    the interpolation step.

    """
    data = sorted(data)
    n = len(data)
    if not n:
        raise StatisticsError("no median for empty data")

    # Find the value at the midpoint. Remember this corresponds to the
    # midpoint of the class interval.
    x = data[n // 2]

    # Using O(log n) bisection, find where all the x values occur in the data.
    # All x will lie within data[i:j].
    i = bisect_left(data, x)
    j = bisect_right(data, x, lo=i)

    # Coerce to floats, raising a TypeError if not possible
    try:
        interval = float(interval)
        x = float(x)
    except ValueError:
        raise TypeError('Value cannot be converted to a float')

    # Interpolate the median using the formula found at:
    # https://www.cuemath.com/data/median-of-grouped-data/
    L = x - interval / 2.0    # Lower limit of the median interval
    cf = i                    # Cumulative frequency of the preceding interval
    f = j - i                 # Number of elements in the median interval
    return L + interval * (n / 2 - cf) / f


def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
    3

    This also works with nominal (non-numeric) data:

    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
    'red'

    If there are multiple modes with the same frequency, return the first
    one encountered:

    >>> mode(['red', 'red', 'green', 'blue', 'blue'])
    'red'

    If *data* is empty, ``mode`` raises StatisticsError.

    """
    pairs = Counter(iter(data)).most_common(1)
    try:
        return pairs[0][0]
    except IndexError:
        raise StatisticsError('no mode for empty data') from None


def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    counts = Counter(iter(data))
    if not counts:
        return []
    maxcount = max(counts.values())
    return [value for value, count in counts.items() if count == maxcount]


# Notes on methods for computing quantiles
# ----------------------------------------
#
# There is no one perfect way to compute quantiles. Here we offer
# two methods that serve common needs. Most other packages
# surveyed offered at least one or both of these two, making them
# "standard" in the sense of "widely-adopted and reproducible".
# They are also easy to explain, easy to compute manually, and have
# straight-forward interpretations that aren't surprising.

# The default method is known as "R6", "PERCENTILE.EXC", or "expected
# value of rank order statistics". The alternative method is known as
# "R7", "PERCENTILE.INC", or "mode of rank order statistics".

# For sample data where there is a positive probability for values
# beyond the range of the data, the R6 exclusive method is a
# reasonable choice. Consider a random sample of nine values from a
# population with a uniform distribution from 0.0 to 1.0. The
# distribution of the third ranked sample point is described by
# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
# mean=0.300. Only the latter (which corresponds with R6) gives the
# desired cut point with 30% of the population falling below that
# value, making it comparable to a result from an inv_cdf() function.
# The R6 exclusive method is also idempotent.

# For describing population data where the end points are known to
# be included in the data, the R7 inclusive method is a reasonable
# choice. Instead of the mean, it uses the mode of the beta
# distribution for the interior points. Per Hyndman & Fan, "One nice
# property is that the vertices of Q7(p) divide the range into n - 1
# intervals, and exactly 100p% of the intervals lie to the left of
# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."

# If needed, other methods could be added. However, for now, the
# position is that fewer options make for easier choices and that
# external packages can be used for anything more advanced.

def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cut points that
    separate *data* into 100 equal-sized groups.

    The *data* can be any iterable containing sample data.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data. The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.
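
    A small worked example showing both methods:

    >>> data = [1, 2, 3, 4]
    >>> quantiles(data, n=4)
    [1.25, 2.5, 3.75]
    >>> quantiles(data, n=4, method='inclusive')
    [1.75, 2.5, 3.25]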
788 """
789 if n < 1:
790 raise StatisticsError('n must be at least 1')
791 data = sorted(data)
792 ld = len(data)
793 if ld < 2:
794 raise StatisticsError('must have at least two data points')
795 if method == 'inclusive':
796 m = ld - 1
797 result = []
798 for i in range(1, n):
799 j, delta = divmod(i * m, n)
800 interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
801 result.append(interpolated)
802 return result
803 if method == 'exclusive':
804 m = ld + 1
805 result = []
806 for i in range(1, n):
807 j = i * m // n # rescale i to m/n
808 j = 1 if j < 1 else ld-1 if j > ld-1 else j # clamp to 1 .. ld-1
809 delta = i*m - j*n # exact integer math
810 interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
811 result.append(interpolated)
812 return result
813 raise ValueError(f'Unknown method: {method!r}')
814
815
816 # === Measures of spread ===
817
818 # See http://mathworld.wolfram.com/Variance.html
819 # http://mathworld.wolfram.com/SampleVariance.html
820
821
822 def variance(data, xbar=None):
823 """Return the sample variance of data.
824
825 data should be an iterable of Real-valued numbers, with at least two
826 values. The optional argument xbar, if given, should be the mean of
827 the data. If it is missing or None, the mean is automatically calculated.
828
829 Use this function when your data is a sample from a population. To
830 calculate the variance from the entire population, see ``pvariance``.
831
832 Examples:
833
834 >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
835 >>> variance(data)
836 1.3720238095238095
837
838 If you have already calculated the mean of your data, you can pass it as
839 the optional second argument ``xbar`` to avoid recalculating it:
840
841 >>> m = mean(data)
842 >>> variance(data, m)
843 1.3720238095238095
844
845 This function does not check that ``xbar`` is actually the mean of
846 ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
847 impossible results.
848
849 Decimals and Fractions are supported:
850
851 >>> from decimal import Decimal as D
852 >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
853 Decimal('31.01875')
854
855 >>> from fractions import Fraction as F
856 >>> variance([F(1, 6), F(1, 2), F(5, 3)])
857 Fraction(67, 108)
858
859 """
860 T, ss, c, n = _ss(data, xbar)
861 if n < 2:
862 raise StatisticsError('variance requires at least two data points')
863 return _convert(ss / (n - 1), T)
864
865
866 def pvariance(data, mu=None):
867 """Return the population variance of ``data``.
868
869 data should be a sequence or iterable of Real-valued numbers, with at least one
870 value. The optional argument mu, if given, should be the mean of
871 the data. If it is missing or None, the mean is automatically calculated.
872
873 Use this function to calculate the variance from the entire population.
874 To estimate the variance from a sample, the ``variance`` function is
875 usually a better choice.
876
877 Examples:
878
879 >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
880 >>> pvariance(data)
881 1.25
882
883 If you have already calculated the mean of the data, you can pass it as
884 the optional second argument to avoid recalculating it:
885
886 >>> mu = mean(data)
887 >>> pvariance(data, mu)
888 1.25
889
890 Decimals and Fractions are supported:
891
892 >>> from decimal import Decimal as D
893 >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
894 Decimal('24.815')
895
896 >>> from fractions import Fraction as F
897 >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
898 Fraction(13, 72)
899
900 """
901 T, ss, c, n = _ss(data, mu)
902 if n < 1:
903 raise StatisticsError('pvariance requires at least one data point')
904 return _convert(ss / n, T)


def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827

    """
    T, ss, c, n = _ss(data, xbar)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251

    """
    T, ss, c, n = _ss(data, mu)
    if n < 1:
        raise StatisticsError('pstdev requires at least one data point')
    mss = ss / n
    if issubclass(T, Decimal):
        return _decimal_sqrt_of_frac(mss.numerator, mss.denominator)
    return _float_sqrt_of_frac(mss.numerator, mss.denominator)


def _mean_stdev(data):
    """In one pass, compute the mean and sample standard deviation as floats."""
    T, ss, xbar, n = _ss(data)
    if n < 2:
        raise StatisticsError('stdev requires at least two data points')
    mss = ss / (n - 1)
    try:
        return float(xbar), _float_sqrt_of_frac(mss.numerator, mss.denominator)
    except AttributeError:
        # Handle Nans and Infs gracefully
        return float(xbar), float(xbar) / float(ss)


# === Statistics for relations between two inputs ===

# See https://en.wikipedia.org/wiki/Covariance
#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
#     https://en.wikipedia.org/wiki/Simple_linear_regression


def covariance(x, y, /):
    """Covariance

    Return the sample covariance of two inputs *x* and *y*. Covariance
    is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5
    >>> covariance(z, x)
    -7.5

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    return sxy / (n - 1)


def correlation(x, y, /):
    """Pearson's correlation coefficient

    Return the Pearson's correlation coefficient for two inputs. Pearson's
    correlation coefficient *r* takes values between -1 and +1. It measures the
    strength and direction of the linear relationship, where +1 means very
    strong, positive linear relationship, -1 very strong, negative linear
    relationship, and 0 no linear relationship.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    xbar = fsum(x) / n
    ybar = fsum(y) / n
    sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
    sxx = fsum((d := xi - xbar) * d for xi in x)
    syy = fsum((d := yi - ybar) * d for yi in y)
    try:
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')


LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes relationship between an independent variable
    *x* and a dependent variable *y* in terms of a linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y) #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin.

    Since the *intercept* will always be 0.0, the underlying linear
    function simplifies to:

        y = slope * x + noise

    >>> y = [3 * x[i] + noise[i] for i in range(5)]
    >>> linear_regression(x, y, proportional=True) #doctest: +ELLIPSIS
    LinearRegression(slope=3.02447542484..., intercept=0.0)

    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if proportional:
        sxy = fsum(xi * yi for xi, yi in zip(x, y))
        sxx = fsum(xi * xi for xi in x)
    else:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((d := xi - xbar) * d for xi in x)
    try:
        slope = sxy / sxx  # equivalent to: covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)


## Normal Distribution #####################################################


def _normal_dist_inv_cdf(p, mu, sigma):
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution". Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    if q < 0.0:
        x = -x
    return mu + (x * sigma)


# If available, use C implementation
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass


class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }

    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        self._mu = float(mu)
        self._sigma = float(sigma)

    @classmethod
    def from_samples(cls, data):
        "Make a normal distribution instance from sample data."
        return cls(*_mean_stdev(data))

    def samples(self, n, *, seed=None):
        "Generate *n* samples for a given mean and standard deviation."
        gauss = random.gauss if seed is None else random.Random(seed).gauss
        mu, sigma = self._mu, self._sigma
        return [gauss(mu, sigma) for i in range(n)]

    def pdf(self, x):
        "Probability density function. P(x <= X < x+dx) / dx"
        variance = self._sigma * self._sigma
        if not variance:
            raise StatisticsError('pdf() not defined when sigma is zero')
        diff = x - self._mu
        return exp(diff * diff / (-2.0 * variance)) / sqrt(tau * variance)

    def cdf(self, x):
        "Cumulative distribution function. P(X <= x)"
        if not self._sigma:
            raise StatisticsError('cdf() not defined when sigma is zero')
        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * _SQRT2)))

    def inv_cdf(self, p):
        """Inverse cumulative distribution function. x : P(X <= x) = p

        Finds the value of the random variable such that the probability of
        the variable being less than or equal to that value equals the given
        probability.

        This function is also called the percent point function or quantile
        function.
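
        For example, the median of the standard normal distribution is zero:

        >>> NormalDist().inv_cdf(0.5)
        0.0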
1221 """
1222 if p <= 0.0 or p >= 1.0:
1223 raise StatisticsError('p must be in the range 0.0 < p < 1.0')
1224 if self._sigma <= 0.0:
1225 raise StatisticsError('cdf() not defined when sigma at or below zero')
1226 return _normal_dist_inv_cdf(p, self._mu, self._sigma)
1227
1228 def quantiles(self, n=4):
1229 """Divide into *n* continuous intervals with equal probability.
1230
1231 Returns a list of (n - 1) cut points separating the intervals.
1232
1233 Set *n* to 4 for quartiles (the default). Set *n* to 10 for deciles.
1234 Set *n* to 100 for percentiles which gives the 99 cuts points that
1235 separate the normal distribution in to 100 equal sized groups.
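
        For example, the quartiles of the standard normal distribution:

        >>> NormalDist().quantiles() #doctest: +ELLIPSIS
        [-0.6744897501..., 0.0, 0.6744897501...]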
1236 """
1237 return [self.inv_cdf(i / n) for i in range(1, n)]
1238
1239 def overlap(self, other):
1240 """Compute the overlapping coefficient (OVL) between two normal distributions.
1241
1242 Measures the agreement between two normal probability distributions.
1243 Returns a value between 0.0 and 1.0 giving the overlapping area in
1244 the two underlying probability density functions.
1245
1246 >>> N1 = NormalDist(2.4, 1.6)
1247 >>> N2 = NormalDist(3.2, 2.0)
1248 >>> N1.overlap(N2)
1249 0.8035050657330205
1250 """
1251 # See: "The overlapping coefficient as a measure of agreement between
1252 # probability distributions and point estimation of the overlap of two
1253 # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
1254 # http://dx.doi.org/10.1080/03610928908830127
1255 if not isinstance(other, NormalDist):
1256 raise TypeError('Expected another NormalDist instance')
1257 X, Y = self, other
1258 if (Y._sigma, Y._mu) < (X._sigma, X._mu): # sort to assure commutativity
1259 X, Y = Y, X
1260 X_var, Y_var = X.variance, Y.variance
1261 if not X_var or not Y_var:
1262 raise StatisticsError('overlap() not defined when sigma is zero')
1263 dv = Y_var - X_var
1264 dm = fabs(Y._mu - X._mu)
1265 if not dv:
1266 return 1.0 - erf(dm / (2.0 * X._sigma * _SQRT2))
1267 a = X._mu * Y_var - Y._mu * X_var
1268 b = X._sigma * Y._sigma * sqrt(dm * dm + dv * log(Y_var / X_var))
1269 x1 = (a + b) / dv
1270 x2 = (a - b) / dv
1271 return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1272
1273 def zscore(self, x):
1274 """Compute the Standard Score. (x - mean) / stdev
1275
1276 Describes *x* in terms of the number of standard deviations
1277 above or below the mean of the normal distribution.
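
        For example, a score of 130 on a scale with mean 100 and standard
        deviation 15:

        >>> NormalDist(100, 15).zscore(130)
        2.0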
1278 """
1279 # https://www.statisticshowto.com/probability-and-statistics/z-score/
1280 if not self._sigma:
1281 raise StatisticsError('zscore() not defined when sigma is zero')
1282 return (x - self._mu) / self._sigma
1283
1284 @property
1285 def mean(self):
1286 "Arithmetic mean of the normal distribution."
1287 return self._mu
1288
1289 @property
1290 def median(self):
1291 "Return the median of the normal distribution"
1292 return self._mu
1293
1294 @property
1295 def mode(self):
1296 """Return the mode of the normal distribution
1297
1298 The mode is the value x where which the probability density
1299 function (pdf) takes its maximum value.
1300 """
1301 return self._mu
1302
1303 @property
1304 def stdev(self):
1305 "Standard deviation of the normal distribution."
1306 return self._sigma
1307
1308 @property
1309 def variance(self):
1310 "Square of the standard deviation."
1311 return self._sigma * self._sigma
1312
1313 def __add__(x1, x2):
1314 """Add a constant or another NormalDist instance.
1315
1316 If *other* is a constant, translate mu by the constant,
1317 leaving sigma unchanged.
1318
1319 If *other* is a NormalDist, add both the means and the variances.
1320 Mathematically, this works only if the two distributions are
1321 independent or if they are jointly normally distributed.
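
        For example, summing two independent distributions:

        >>> NormalDist(0, 3) + NormalDist(0, 4)
        NormalDist(mu=0.0, sigma=5.0)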
1322 """
1323 if isinstance(x2, NormalDist):
1324 return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1325 return NormalDist(x1._mu + x2, x1._sigma)
1326
1327 def __sub__(x1, x2):
1328 """Subtract a constant or another NormalDist instance.
1329
1330 If *other* is a constant, translate by the constant mu,
1331 leaving sigma unchanged.
1332
1333 If *other* is a NormalDist, subtract the means and add the variances.
1334 Mathematically, this works only if the two distributions are
1335 independent or if they are jointly normally distributed.
1336 """
1337 if isinstance(x2, NormalDist):
1338 return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1339 return NormalDist(x1._mu - x2, x1._sigma)
1340
1341 def __mul__(x1, x2):
1342 """Multiply both mu and sigma by a constant.
1343
1344 Used for rescaling, perhaps to change measurement units.
1345 Sigma is scaled with the absolute value of the constant.
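
        For example, rescaling by a constant factor:

        >>> NormalDist(10, 2.5) * 4
        NormalDist(mu=40.0, sigma=10.0)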
1346 """
1347 return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
1348
1349 def __truediv__(x1, x2):
1350 """Divide both mu and sigma by a constant.
1351
1352 Used for rescaling, perhaps to change measurement units.
1353 Sigma is scaled with the absolute value of the constant.
1354 """
1355 return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
1356
1357 def __pos__(x1):
1358 "Return a copy of the instance."
1359 return NormalDist(x1._mu, x1._sigma)
1360
1361 def __neg__(x1):
1362 "Negates mu while keeping sigma the same."
1363 return NormalDist(-x1._mu, x1._sigma)
1364
1365 __radd__ = __add__
1366
1367 def __rsub__(x1, x2):
1368 "Subtract a NormalDist from a constant or another NormalDist."
1369 return -(x1 - x2)
1370
1371 __rmul__ = __mul__
1372
1373 def __eq__(x1, x2):
1374 "Two NormalDist objects are equal if their mu and sigma are both equal."
1375 if not isinstance(x2, NormalDist):
1376 return NotImplemented
1377 return x1._mu == x2._mu and x1._sigma == x2._sigma
1378
1379 def __hash__(self):
1380 "NormalDist objects hash equal if their mu and sigma are both equal."
1381 return hash((self._mu, self._sigma))
1382
1383 def __repr__(self):
1384 return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
1385
1386 def __getstate__(self):
1387 return self._mu, self._sigma
1388
1389 def __setstate__(self, state):
1390 self._mu, self._sigma = state