htswanalysis/MACS/lib/gsl/gsl-1.11/doc/statistics.texi

   1 @cindex statistics
   2 @cindex mean
   3 @cindex standard deviation
   4 @cindex variance
   5 @cindex estimated standard deviation
   6 @cindex estimated variance
   7 @cindex t-test
   8 @cindex range
   9 @cindex min
  10 @cindex max
  11
  12 This chapter describes the statistical functions in the library.  The
  13 basic statistical functions include routines to compute the mean,
  14 variance and standard deviation.  More advanced functions allow you to
  15 calculate absolute deviations, skewness, and kurtosis as well as the
  16 median and arbitrary percentiles.  The algorithms use recurrence
  17 relations to compute average quantities in a stable way, without large
  18 intermediate values that might overflow.
  19
  20 The functions are available in versions for datasets in the standard
  21 floating-point and integer types.  The versions for double precision
  22 floating-point data have the prefix @code{gsl_stats} and are declared in
  23 the header file @file{gsl_statistics_double.h}.  The versions for integer
  24 data have the prefix @code{gsl_stats_int} and are declared in the header
  25 file @file{gsl_statistics_int.h}.
  26
  27 @menu
  28 * Mean and standard deviation and variance::
  29 * Absolute deviation::
  30 * Higher moments (skewness and kurtosis)::
  31 * Autocorrelation::
  32 * Covariance::
  33 * Correlation::
  34 * Weighted Samples::
  35 * Maximum and Minimum values::
  36 * Median and Percentiles::
  37 * Example statistical programs::
  38 * Statistics References and Further Reading::
  39 @end menu
  40
  41 @node Mean and standard deviation and variance
  42 @section Mean, Standard Deviation and Variance
  43
  44 @deftypefun double gsl_stats_mean (const double @var{data}[], size_t @var{stride}, size_t @var{n})
  45 This function returns the arithmetic mean of @var{data}, a dataset of
  46 length @var{n} with stride @var{stride}.  The arithmetic mean, or
  47 @dfn{sample mean}, is denoted by @math{\Hat\mu} and defined as,
  48 @tex
  49 \beforedisplay
  50 $$
  51 {\Hat\mu} = {1 \over N} \sum x_i
  52 $$
  53 \afterdisplay
  54 @end tex
  55 @ifinfo
  56
  57 @example
  58 \Hat\mu = (1/N) \sum x_i
  59 @end example
  60
  61 @end ifinfo
  62 @noindent
  63 where @math{x_i} are the elements of the dataset @var{data}.  For
  64 samples drawn from a gaussian distribution the variance of
  65 @math{\Hat\mu} is @math{\sigma^2 / N}.
  66 @end deftypefun
  67
  68 @deftypefun double gsl_stats_variance (const double @var{data}[], size_t @var{stride}, size_t @var{n})
  69 This function returns the estimated, or @dfn{sample}, variance of
  70 @var{data}, a dataset of length @var{n} with stride @var{stride}.  The
  71 estimated variance is denoted by @math{\Hat\sigma^2} and is defined by,
  72 @tex
  73 \beforedisplay
  74 $$
  75 {\Hat\sigma}^2 = {1 \over (N-1)} \sum (x_i - {\Hat\mu})^2
  76 $$
  77 \afterdisplay
  78 @end tex
  79 @ifinfo
  80
  81 @example
  82 \Hat\sigma^2 = (1/(N-1)) \sum (x_i - \Hat\mu)^2
  83 @end example
  84
  85 @end ifinfo
  86 @noindent
  87 where @math{x_i} are the elements of the dataset @var{data}.  Note that
  88 the normalization factor of @math{1/(N-1)} results from the derivation
  89 of @math{\Hat\sigma^2} as an unbiased estimator of the population
  90 variance @math{\sigma^2}.  For samples drawn from a gaussian distribution
  91 the variance of @math{\Hat\sigma^2} itself is @math{2 \sigma^4 / N}.
  92
  93 This function computes the mean via a call to @code{gsl_stats_mean}.  If
  94 you have already computed the mean then you can pass it directly to
  95 @code{gsl_stats_variance_m}.
  96 @end deftypefun
  97
  98 @deftypefun double gsl_stats_variance_m (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
  99 This function returns the sample variance of @var{data} relative to the
 100 given value of @var{mean}.  The function is computed with @math{\Hat\mu}
 101 replaced by the value of @var{mean} that you supply,
 102 @tex
 103 \beforedisplay
 104 $$
 105 {\Hat\sigma}^2 = {1 \over (N-1)} \sum (x_i - mean)^2
 106 $$
 107 \afterdisplay
 108 @end tex
 109 @ifinfo
 110
 111 @example
 112 \Hat\sigma^2 = (1/(N-1)) \sum (x_i - mean)^2
 113 @end example
 114 @end ifinfo
 115 @end deftypefun
 116
 117 @deftypefun double gsl_stats_sd (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 118 @deftypefunx double gsl_stats_sd_m (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
 119 The standard deviation is defined as the square root of the variance.
 120 These functions return the square root of the corresponding variance
 121 functions above.
 122 @end deftypefun
 123
 124 @deftypefun double gsl_stats_tss (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 125 @deftypefunx double gsl_stats_tss_m (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
 126 These functions return the total sum of squares (TSS) of @var{data} about
 127 the mean.  For @code{gsl_stats_tss_m} the user-supplied value of
 128 @var{mean} is used, and for @code{gsl_stats_tss} it is computed using
 129 @code{gsl_stats_mean}.
 130
 131 @tex
 132 \beforedisplay
 133 $$
 134 {\rm TSS} = \sum (x_i - mean)^2
 135 $$
 136 \afterdisplay
 137 @end tex
 138 @ifinfo
 139
 140 @example
 141 TSS =  \sum (x_i - mean)^2
 142 @end example
 143 @end ifinfo
 144 @end deftypefun
 145
 146 @deftypefun double gsl_stats_variance_with_fixed_mean (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
 147 This function computes an unbiased estimate of the variance of
 148 @var{data} when the population mean @var{mean} of the underlying
 149 distribution is known @emph{a priori}.  In this case the estimator for
 150 the variance uses the factor @math{1/N} and the sample mean
 151 @math{\Hat\mu} is replaced by the known population mean @math{\mu},
 152 @tex
 153 \beforedisplay
 154 $$
 155 {\Hat\sigma}^2 = {1 \over N} \sum (x_i - \mu)^2
 156 $$
 157 \afterdisplay
 158 @end tex
 159 @ifinfo
 160
 161 @example
 162 \Hat\sigma^2 = (1/N) \sum (x_i - \mu)^2
 163 @end example
 164 @end ifinfo
 165 @end deftypefun
 166
 167
 168 @deftypefun double gsl_stats_sd_with_fixed_mean (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
 169 This function calculates the standard deviation of @var{data} for a
 170 fixed population mean @var{mean}.  The result is the square root of the
 171 corresponding variance function.
 172 @end deftypefun
 173
 174 @node Absolute deviation
 175 @section Absolute deviation
 176
 177 @deftypefun double gsl_stats_absdev (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 178 This function computes the absolute deviation from the mean of
 179 @var{data}, a dataset of length @var{n} with stride @var{stride}.  The
 180 absolute deviation from the mean is defined as,
 181 @tex
 182 \beforedisplay
 183 $$
 184 absdev  = {1 \over N} \sum |x_i - {\Hat\mu}|
 185 $$
 186 \afterdisplay
 187 @end tex
 188 @ifinfo
 189
 190 @example
 191 absdev  = (1/N) \sum |x_i - \Hat\mu|
 192 @end example
 193
 194 @end ifinfo
 195 @noindent
 196 where @math{x_i} are the elements of the dataset @var{data}.  The
 197 absolute deviation from the mean provides a more robust measure of the
 198 width of a distribution than the variance.  This function computes the
 199 mean of @var{data} via a call to @code{gsl_stats_mean}.
 200 @end deftypefun
 201
 202 @deftypefun double gsl_stats_absdev_m (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean})
 203 This function computes the absolute deviation of the dataset @var{data}
 204 relative to the given value of @var{mean},
 205 @tex
 206 \beforedisplay
 207 $$
 208 absdev  = {1 \over N} \sum |x_i - mean|
 209 $$
 210 \afterdisplay
 211 @end tex
 212 @ifinfo
 213
 214 @example
 215 absdev  = (1/N) \sum |x_i - mean|
 216 @end example
 217
 218 @end ifinfo
 219 @noindent
 220 This function is useful if you have already computed the mean of
 221 @var{data} (and want to avoid recomputing it), or wish to calculate the
 222 absolute deviation relative to another value (such as zero, or the
 223 median).
 224 @end deftypefun
 225
 226 @node Higher moments (skewness and kurtosis)
 227 @section Higher moments (skewness and kurtosis)
 228
 229 @deftypefun double gsl_stats_skew (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 230 This function computes the skewness of @var{data}, a dataset of length
 231 @var{n} with stride @var{stride}.  The skewness is defined as,
 232 @tex
 233 \beforedisplay
 234 $$
 235 skew = {1 \over N} \sum
 236  {\left( x_i - {\Hat\mu} \over {\Hat\sigma} \right)}^3
 237 $$
 238 \afterdisplay
 239 @end tex
 240 @ifinfo
 241
 242 @example
 243 skew = (1/N) \sum ((x_i - \Hat\mu)/\Hat\sigma)^3
 244 @end example
 245
 246 @end ifinfo
 247 @noindent
 248 where @math{x_i} are the elements of the dataset @var{data}.  The skewness
 249 measures the asymmetry of the tails of a distribution.
 250
 251 The function computes the mean and estimated standard deviation of
 252 @var{data} via calls to @code{gsl_stats_mean} and @code{gsl_stats_sd}.
 253 @end deftypefun
 254
 255 @deftypefun double gsl_stats_skew_m_sd (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean}, double @var{sd})
 256 This function computes the skewness of the dataset @var{data} using the
 257 given values of the mean @var{mean} and standard deviation @var{sd},
 258 @tex
 259 \beforedisplay
 260 $$
 261 skew = {1 \over N}
 262      \sum {\left( x_i - mean \over sd \right)}^3
 263 $$
 264 \afterdisplay
 265 @end tex
 266 @ifinfo
 267
 268 @example
 269 skew = (1/N) \sum ((x_i - mean)/sd)^3
 270 @end example
 271
 272 @end ifinfo
 273 @noindent
 274 These functions are useful if you have already computed the mean and
 275 standard deviation of @var{data} and want to avoid recomputing them.
 276 @end deftypefun
 277
 278 @deftypefun double gsl_stats_kurtosis (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 279 This function computes the kurtosis of @var{data}, a dataset of length
 280 @var{n} with stride @var{stride}.  The kurtosis is defined as,
 281 @tex
 282 \beforedisplay
 283 $$
 284 kurtosis = \left( {1 \over N} \sum
 285  {\left(x_i - {\Hat\mu} \over {\Hat\sigma} \right)}^4
 286  \right)
 287  - 3
 288 $$
 289 \afterdisplay
 290 @end tex
 291 @ifinfo
 292
 293 @example
 294 kurtosis = ((1/N) \sum ((x_i - \Hat\mu)/\Hat\sigma)^4)  - 3
 295 @end example
 296
 297 @end ifinfo
 298 @noindent
 299 The kurtosis measures how sharply peaked a distribution is, relative to
 300 its width.  The kurtosis is normalized to zero for a gaussian
 301 distribution.
 302 @end deftypefun
 303
 304 @deftypefun double gsl_stats_kurtosis_m_sd (const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{mean}, double @var{sd})
 305 This function computes the kurtosis of the dataset @var{data} using the
 306 given values of the mean @var{mean} and standard deviation @var{sd},
 307 @tex
 308 \beforedisplay
 309 $$
 310 kurtosis = {1 \over N}
 311   \left( \sum {\left(x_i - mean \over sd \right)}^4 \right)
 312   - 3
 313 $$
 314 \afterdisplay
 315 @end tex
 316 @ifinfo
 317
 318 @example
 319 kurtosis = ((1/N) \sum ((x_i - mean)/sd)^4) - 3
 320 @end example
 321
 322 @end ifinfo
 323 @noindent
 324 This function is useful if you have already computed the mean and
 325 standard deviation of @var{data} and want to avoid recomputing them.
 326 @end deftypefun
 327
 328 @node Autocorrelation
 329 @section Autocorrelation
 330
 331 @deftypefun double gsl_stats_lag1_autocorrelation (const double @var{data}[], const size_t @var{stride}, const size_t @var{n})
 332 This function computes the lag-1 autocorrelation of the dataset @var{data}.
 333 @tex
 334 \beforedisplay
 335 $$
 336 a_1 = {\sum_{i = 1}^{n} (x_{i} - \Hat\mu) (x_{i-1} - \Hat\mu)
 337 \over
 338 \sum_{i = 1}^{n} (x_{i} - \Hat\mu) (x_{i} - \Hat\mu)}
 339 $$
 340 \afterdisplay
 341 @end tex
 342 @ifinfo
 343
 344 @example
 345 a_1 = @{\sum_@{i = 1@}^@{n@} (x_@{i@} - \Hat\mu) (x_@{i-1@} - \Hat\mu)
 346        \over
 347        \sum_@{i = 1@}^@{n@} (x_@{i@} - \Hat\mu) (x_@{i@} - \Hat\mu)@}
 348 @end example
 349 @end ifinfo
 350 @end deftypefun
 351
 352
 353 @deftypefun double gsl_stats_lag1_autocorrelation_m (const double @var{data}[], const size_t @var{stride}, const size_t @var{n}, const double @var{mean})
 354 This function computes the lag-1 autocorrelation of the dataset
 355 @var{data} using the given value of the mean @var{mean}.
 356
 357 @end deftypefun
 358
 359 @node Covariance
 360 @section Covariance
 361 @cindex covariance, of two datasets
 362
 363 @deftypefun double gsl_stats_covariance (const double @var{data1}[], const size_t @var{stride1}, const double @var{data2}[], const size_t @var{stride2}, const size_t @var{n})
 364 This function computes the covariance of the datasets @var{data1} and
 365 @var{data2} which must both be of the same length @var{n}.
 366 @tex
 367 \beforedisplay
 368 $$
 369 covar = {1 \over (n - 1)} \sum_{i = 1}^{n} (x_{i} - \Hat x) (y_{i} - \Hat y)
 370 $$
 371 \afterdisplay
 372 @end tex
 373 @ifinfo
 374
 375 @example
 376 covar = (1/(n - 1)) \sum_@{i = 1@}^@{n@} (x_i - \Hat x) (y_i - \Hat y)
 377 @end example
 378 @end ifinfo
 379 @end deftypefun
 380
 381 @deftypefun double gsl_stats_covariance_m (const double @var{data1}[], const size_t @var{stride1}, const double @var{data2}[], const size_t @var{stride2}, const size_t @var{n}, const double @var{mean1}, const double @var{mean2})
 382 This function computes the covariance of the datasets @var{data1} and
 383 @var{data2} using the given values of the means, @var{mean1} and
 384 @var{mean2}.  This is useful if you have already computed the means of
 385 @var{data1} and @var{data2} and want to avoid recomputing them.
 386 @end deftypefun
 387
 388 @node Correlation
 389 @section Correlation
 390 @cindex correlation, of two datasets
 391
 392 @deftypefun double gsl_stats_correlation (const double @var{data1}[], const size_t @var{stride1}, const double @var{data2}[], const size_t @var{stride2}, const size_t @var{n})
 393 This function efficiently computes the Pearson correlation coefficient
 394 between the datasets @var{data1} and @var{data2} which must both be of
 395 the same length @var{n}.
 396 @tex
 397 \beforedisplay
 398 $$
 399 r = {cov(x, y) \over \Hat\sigma_x \Hat\sigma_y} =
 400 {{1 \over n-1} \sum (x_i - \Hat x) (y_i - \Hat y)
 401 \over
 402 \sqrt{{1 \over n-1} \sum (x_i - {\Hat x})^2}
 403 \sqrt{{1 \over n-1} \sum (y_i - {\Hat y})^2}
 404 }
 405 $$
 406 \afterdisplay
 407 @end tex
 408 @ifinfo
 409 @example
 410 r = cov(x, y) / (\Hat\sigma_x \Hat\sigma_y)
 411   = @{1/(n-1) \sum (x_i - \Hat x) (y_i - \Hat y)
 412      \over
 413      \sqrt@{1/(n-1) \sum (x_i - \Hat x)^2@} \sqrt@{1/(n-1) \sum (y_i - \Hat y)^2@}
 414     @}
 415 @end example
 416 @end ifinfo
 417 @end deftypefun
 418
 419 @node Weighted Samples
 420 @section Weighted Samples
 421
 422 The functions described in this section allow the computation of
 423 statistics for weighted samples.  The functions accept an array of
 424 samples, @math{x_i}, with associated weights, @math{w_i}.  Each sample
 425 @math{x_i} is considered as having been drawn from a Gaussian
 426 distribution with variance @math{\sigma_i^2}.  The sample weight
 427 @math{w_i} is defined as the reciprocal of this variance, @math{w_i =
 428 1/\sigma_i^2}.  Setting a weight to zero corresponds to removing a
 429 sample from a dataset.
 430
 431 @deftypefun double gsl_stats_wmean (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 432 This function returns the weighted mean of the dataset @var{data} with
 433 stride @var{stride} and length @var{n}, using the set of weights @var{w}
 434 with stride @var{wstride} and length @var{n}.  The weighted mean is defined as,
 435 @tex
 436 \beforedisplay
 437 $$
 438 {\Hat\mu} = {{\sum w_i x_i} \over {\sum w_i}}
 439 $$
 440 \afterdisplay
 441 @end tex
 442 @ifinfo
 443
 444 @example
 445 \Hat\mu = (\sum w_i x_i) / (\sum w_i)
 446 @end example
 447 @end ifinfo
 448 @end deftypefun
 449
 450
 451 @deftypefun double gsl_stats_wvariance (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 452 This function returns the estimated variance of the dataset @var{data}
 453 with stride @var{stride} and length @var{n}, using the set of weights
 454 @var{w} with stride @var{wstride} and length @var{n}.  The estimated
 455 variance of a weighted dataset is defined as,
 456 @tex
 457 \beforedisplay
 458 $$
 459 \Hat\sigma^2 = {{\sum w_i} \over {(\sum w_i)^2 - \sum (w_i^2)}}
 460                 \sum w_i (x_i - \Hat\mu)^2
 461 $$
 462 \afterdisplay
 463 @end tex
 464 @ifinfo
 465
 466 @example
 467 \Hat\sigma^2 = ((\sum w_i)/((\sum w_i)^2 - \sum (w_i^2)))
 468                 \sum w_i (x_i - \Hat\mu)^2
 469 @end example
 470
 471 @end ifinfo
 472 @noindent
 473 Note that this expression reduces to an unweighted variance with the
 474 familiar @math{1/(N-1)} factor when there are @math{N} equal non-zero
 475 weights.
 476 @end deftypefun
 477
 478 @deftypefun double gsl_stats_wvariance_m (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean})
 479 This function returns the estimated variance of the weighted dataset
 480 @var{data} using the given weighted mean @var{wmean}.
 481 @end deftypefun
 482
 483 @deftypefun double gsl_stats_wsd (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 484 The standard deviation is defined as the square root of the variance.
 485 This function returns the square root of the corresponding variance
 486 function @code{gsl_stats_wvariance} above.
 487 @end deftypefun
 488
 489 @deftypefun double gsl_stats_wsd_m (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean})
 490 This function returns the square root of the corresponding variance
 491 function @code{gsl_stats_wvariance_m} above.
 492 @end deftypefun
 493
 494 @deftypefun double gsl_stats_wvariance_with_fixed_mean (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, const double @var{mean})
 495 This function computes an unbiased estimate of the variance of weighted
 496 dataset @var{data} when the population mean @var{mean} of the underlying
 497 distribution is known @emph{a priori}.  In this case the estimator for
 498 the variance replaces the sample mean @math{\Hat\mu} by the known
 499 population mean @math{\mu},
 500 @tex
 501 \beforedisplay
 502 $$
 503 \Hat\sigma^2 = {{\sum w_i (x_i - \mu)^2} \over {\sum w_i}}
 504 $$
 505 \afterdisplay
 506 @end tex
 507 @ifinfo
 508
 509 @example
 510 \Hat\sigma^2 = (\sum w_i (x_i - \mu)^2) / (\sum w_i)
 511 @end example
 512 @end ifinfo
 513 @end deftypefun
 514
 515 @deftypefun double gsl_stats_wsd_with_fixed_mean (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, const double @var{mean})
 516 The standard deviation is defined as the square root of the variance.
 517 This function returns the square root of the corresponding variance
 518 function above.
 519 @end deftypefun
 520
 521 @deftypefun double gsl_stats_wtss (const double @var{w}[], const size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 522 @deftypefunx double gsl_stats_wtss_m (const double @var{w}[], const size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean})
 523 These functions return the weighted total sum of squares (TSS) of
 524 @var{data} about the weighted mean.  For @code{gsl_stats_wtss_m} the
 525 user-supplied value of @var{wmean} is used, and for @code{gsl_stats_wtss}
 526 it is computed using @code{gsl_stats_wmean}.
 527
 528 @tex
 529 \beforedisplay
 530 $$
 531 {\rm TSS} = \sum w_i (x_i - wmean)^2
 532 $$
 533 \afterdisplay
 534 @end tex
 535 @ifinfo
 536
 537 @example
 538 TSS =  \sum w_i (x_i - wmean)^2
 539 @end example
 540 @end ifinfo
 541 @end deftypefun
 542
 543 @deftypefun double gsl_stats_wabsdev (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 544 This function computes the weighted absolute deviation from the weighted
 545 mean of @var{data}.  The absolute deviation from the mean is defined as,
 546 @tex
 547 \beforedisplay
 548 $$
 549 absdev = {{\sum w_i |x_i - \Hat\mu|} \over {\sum w_i}}
 550 $$
 551 \afterdisplay
 552 @end tex
 553 @ifinfo
 554
 555 @example
 556 absdev = (\sum w_i |x_i - \Hat\mu|) / (\sum w_i)
 557 @end example
 558 @end ifinfo
 559 @end deftypefun
 560
 561 @deftypefun double gsl_stats_wabsdev_m (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean})
 562 This function computes the absolute deviation of the weighted dataset
 563 @var{data} about the given weighted mean @var{wmean}.
 564 @end deftypefun
 565
 566 @deftypefun double gsl_stats_wskew (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 567 This function computes the weighted skewness of the dataset @var{data}.
 568 @tex
 569 \beforedisplay
 570 $$
 571 skew = {{\sum w_i ((x_i - xbar)/\sigma)^3} \over {\sum w_i}}
 572 $$
 573 \afterdisplay
 574 @end tex
 575 @ifinfo
 576
 577 @example
 578 skew = (\sum w_i ((x_i - xbar)/\sigma)^3) / (\sum w_i)
 579 @end example
 580 @end ifinfo
 581 @end deftypefun
 582
 583 @deftypefun double gsl_stats_wskew_m_sd (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean}, double @var{wsd})
 584 This function computes the weighted skewness of the dataset @var{data}
 585 using the given values of the weighted mean and weighted standard
 586 deviation, @var{wmean} and @var{wsd}.
 587 @end deftypefun
 588
 589 @deftypefun double gsl_stats_wkurtosis (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 590 This function computes the weighted kurtosis of the dataset @var{data}.
 591
 592 @tex
 593 \beforedisplay
 594 $$
 595 kurtosis = {{\sum w_i ((x_i - xbar)/sigma)^4} \over {\sum w_i}} - 3
 596 $$
 597 \afterdisplay
 598 @end tex
 599 @ifinfo
 600
 601 @example
 602 kurtosis = ((\sum w_i ((x_i - xbar)/sigma)^4) / (\sum w_i)) - 3
 603 @end example
 604 @end ifinfo
 605 @end deftypefun
 606
 607 @deftypefun double gsl_stats_wkurtosis_m_sd (const double @var{w}[], size_t @var{wstride}, const double @var{data}[], size_t @var{stride}, size_t @var{n}, double @var{wmean}, double @var{wsd})
 608 This function computes the weighted kurtosis of the dataset @var{data}
 609 using the given values of the weighted mean and weighted standard
 610 deviation, @var{wmean} and @var{wsd}.
 611 @end deftypefun
 612
 613 @node Maximum and Minimum values
 614 @section Maximum and Minimum values
 615
 616 The following functions find the maximum and minimum values of a
 617 dataset (or their indices).  If the data contains @code{NaN}s then a
 618 @code{NaN} will be returned, since the maximum or minimum value is
 619 undefined.  For functions which return an index, the location of the
 620 first @code{NaN} in the array is returned.
 621
 622 @deftypefun double gsl_stats_max (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 623 This function returns the maximum value in @var{data}, a dataset of
 624 length @var{n} with stride @var{stride}.  The maximum value is defined
 625 as the value of the element @math{x_i} which satisfies @c{$x_i \ge x_j$}
 626 @math{x_i >= x_j} for all @math{j}.
 627
 628 If you want instead to find the element with the largest absolute
 629 magnitude you will need to apply @code{fabs} or @code{abs} to your data
 630 before calling this function.
 631 @end deftypefun
 632
 633 @deftypefun double gsl_stats_min (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 634 This function returns the minimum value in @var{data}, a dataset of
 635 length @var{n} with stride @var{stride}.  The minimum value is defined
 636 as the value of the element @math{x_i} which satisfies @c{$x_i \le x_j$}
 637 @math{x_i <= x_j} for all @math{j}.
 638
 639 If you want instead to find the element with the smallest absolute
 640 magnitude you will need to apply @code{fabs} or @code{abs} to your data
 641 before calling this function.
 642 @end deftypefun
 643
 644 @deftypefun void gsl_stats_minmax (double * @var{min}, double * @var{max}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 645 This function finds both the minimum and maximum values @var{min},
 646 @var{max} in @var{data} in a single pass.
 647 @end deftypefun
 648
 649 @deftypefun size_t gsl_stats_max_index (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 650 This function returns the index of the maximum value in @var{data}, a
 651 dataset of length @var{n} with stride @var{stride}.  The maximum value is
 652 defined as the value of the element @math{x_i} which satisfies
 653 @c{$x_i \ge x_j$}
 654 @math{x_i >= x_j} for all @math{j}.  When there are several equal maximum
 655 elements then the first one is chosen.
 656 @end deftypefun
 657
 658 @deftypefun size_t gsl_stats_min_index (const double @var{data}[], size_t @var{stride}, size_t @var{n})
 659 This function returns the index of the minimum value in @var{data}, a
 660 dataset of length @var{n} with stride @var{stride}.  The minimum value
 661 is defined as the value of the element @math{x_i} which satisfies
 662 @c{$x_i \ge x_j$}
 663 @math{x_i >= x_j} for all @math{j}.  When there are several equal
 664 minimum elements then the first one is chosen.
 665 @end deftypefun
 666
 667 @deftypefun void gsl_stats_minmax_index (size_t * @var{min_index}, size_t * @var{max_index}, const double @var{data}[], size_t @var{stride}, size_t @var{n})
 668 This function returns the indexes @var{min_index}, @var{max_index} of
 669 the minimum and maximum values in @var{data} in a single pass.
 670 @end deftypefun
 671
 672 @node Median and Percentiles
 673 @section Median and Percentiles
 674
 675 The median and percentile functions described in this section operate on
 676 sorted data.  For convenience we use @dfn{quantiles}, measured on a scale
 677 of 0 to 1, instead of percentiles (which use a scale of 0 to 100).
 678
 679 @deftypefun double gsl_stats_median_from_sorted_data (const double @var{sorted_data}[], size_t @var{stride}, size_t @var{n})
 680 This function returns the median value of @var{sorted_data}, a dataset
 681 of length @var{n} with stride @var{stride}.  The elements of the array
 682 must be in ascending numerical order.  There are no checks to see
 683 whether the data are sorted, so the function @code{gsl_sort} should
 684 always be used first.
 685
 686 When the dataset has an odd number of elements the median is the value
 687 of element @math{(n-1)/2}.  When the dataset has an even number of
 688 elements the median is the mean of the two nearest middle values,
 689 elements @math{(n-1)/2} and @math{n/2}.  Since the algorithm for
 690 computing the median involves interpolation this function always returns
 691 a floating-point number, even for integer data types.
 692 @end deftypefun
 693
 694 @deftypefun double gsl_stats_quantile_from_sorted_data (const double @var{sorted_data}[], size_t @var{stride}, size_t @var{n}, double @var{f})
 695 This function returns a quantile value of @var{sorted_data}, a
 696 double-precision array of length @var{n} with stride @var{stride}.  The
 697 elements of the array must be in ascending numerical order.  The
 698 quantile is determined by the @var{f}, a fraction between 0 and 1.  For
 699 example, to compute the value of the 75th percentile @var{f} should have
 700 the value 0.75.
 701
 702 There are no checks to see whether the data are sorted, so the function
 703 @code{gsl_sort} should always be used first.
 704
 705 The quantile is found by interpolation, using the formula
 706 @tex
 707 \beforedisplay
 708 $$
 709 \hbox{quantile} = (1 - \delta) x_i + \delta x_{i+1}
 710 $$
 711 \afterdisplay
 712 @end tex
 713 @ifinfo
 714
 715 @example
 716 quantile = (1 - \delta) x_i + \delta x_@{i+1@}
 717 @end example
 718
 719 @end ifinfo
 720 @noindent
 721 where @math{i} is @code{floor}(@math{(n - 1)f}) and @math{\delta} is
 722 @math{(n-1)f - i}.
 723
 724 Thus the minimum value of the array (@code{data[0*stride]}) is given by
 725 @var{f} equal to zero, the maximum value (@code{data[(n-1)*stride]}) is
 726 given by @var{f} equal to one and the median value is given by @var{f}
 727 equal to 0.5.  Since the algorithm for computing quantiles involves
 728 interpolation this function always returns a floating-point number, even
 729 for integer data types.
 730 @end deftypefun
 731
 732
 733 @comment @node Statistical tests
 734 @comment @section Statistical tests
 735
 736 @comment FIXME, do more work on the statistical tests
 737
 738 @comment -@deftypefun double gsl_stats_ttest (const double @var{data1}[], double @var{data2}[], size_t @var{n1}, size_t @var{n2})
 739 @comment -@deftypefunx Statistics double gsl_stats_int_ttest (const double @var{data1}[], double @var{data2}[], size_t @var{n1}, size_t @var{n2})
 740
 741 @comment The function @code{gsl_stats_ttest} computes the t-test statistic for
 742 @comment the two arrays @var{data1}[] and @var{data2}[], of lengths @var{n1} and
 743 @comment -@var{n2} respectively.
 744
 745 @comment The t-test statistic measures the difference between the means of two
 746 @comment datasets.
 747
 748 @node Example statistical programs
 749 @section Examples
 750 Here is a basic example of how to use the statistical functions:
 751
 752 @example
 753 @verbatiminclude examples/stat.c
 754 @end example
 755
 756 The program should produce the following output,
 757
 758 @example
 759 @verbatiminclude examples/stat.out
 760 @end example
 761
 762
 763 Here is an example using sorted data,
 764
 765 @example
 766 @verbatiminclude examples/statsort.c
 767 @end example
 768
 769 This program should produce the following output,
 770
 771 @example
 772 @verbatiminclude examples/statsort.out
 773 @end example
 774
 775 @node Statistics References and Further Reading
 776 @section References and Further Reading
 777
 778 The standard reference for almost any topic in statistics is the
 779 multi-volume @cite{Advanced Theory of Statistics} by Kendall and Stuart.
 780
 781 @itemize @asis
 782 @item
 783 Maurice Kendall, Alan Stuart, and J. Keith Ord.
 784 @cite{The Advanced Theory of Statistics} (multiple volumes)
 785 reprinted as @cite{Kendall's Advanced Theory of Statistics}.
 786 Wiley, ISBN 047023380X.
 787 @end itemize
 788
 789 @noindent
 790 Many statistical concepts can be more easily understood by a Bayesian
 791 approach.  The following book by Gelman, Carlin, Stern and Rubin gives a
 792 comprehensive coverage of the subject.
 793
 794 @itemize @asis
 795 @item
 796 Andrew Gelman, John B. Carlin, Hal S. Stern, Donald B. Rubin.
 797 @cite{Bayesian Data Analysis}.
 798 Chapman & Hall, ISBN 0412039915.
 799 @end itemize
 800
 801 @noindent
 802 For physicists the Particle Data Group provides useful reviews of
 803 Probability and Statistics in the ``Mathematical Tools'' section of its
 804 Annual Review of Particle Physics.
 805
 806 @itemize @asis
 807 @item
 808 @cite{Review of Particle Properties}
 809 R.M. Barnett et al., Physical Review D54, 1 (1996)
 810 @end itemize
 811
 812 @noindent
 813 The Review of Particle Physics is available online at
 814 the website @uref{http://pdg.lbl.gov/}.
 815
 816