Ignore:
Timestamp:
Oct 30, 2011, 3:36:17 AM (11 years ago)
Author:
Peter
Message:

improve docs for ROC and sister class AUC. closes #144

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/ROC.h

    r2592 r2594  
    3535
    3636  ///
    37   /// @brief Class for Reciever Operating Characteristic.
    38   ///   
     37  /// @brief Receiver Operating Characteristic.
     38  ///
    3939  /// As the area under an ROC curve is equivalent to Mann-Whitney U
    4040  /// statistic, this class can be used to perform a Mann-Whitney
    4141  /// U-test (aka Wilcoxon).
     42  ///
     43  /// \see AUC
    4244  ///
    4345  class ROC
     
    5153         
    5254    /**
    53        Adding a data value to ROC.
     55       \brief Add a data value.
     56
     57       \param value data value
     58       \param target \c true if value belongs to class positive
     59
     60       \param weight indicating how important the data point is. A
     61       zero weight implies the data point is ignored. A negative
     62       weight should be understood as removing a data point and thus
     63       typically only makes sense if there is a previously added data
     64       point with same \a value and \a target.
     65
    5466    */
    5567    void add(double value, bool target, double weight=1.0);
    5668
    5769    /**
    58        The area is defines as \f$ \frac{\sum w^+w^-} {\sum w^+w^-}\f$,
    59        where the sum in the numerator goes over all pairs where value+
    60        is larger than value-. The denominator goes over all pairs.
     70       \brief Area Under Curve, AUC
     71
     72       \see AUC for how the area is calculated
    6173
    6274       @return Area under curve.
     
    6476    double area(void);
    6577
    66     ///
    67     /// minimum_size is the threshold for when a normal
    68     /// approximation is used for the p-value calculation.
    69     ///
    70     /// @return reference to minimum_size
    71     ///
     78    /**
     79       \brief threshold for p_value calculation
     80
     81       Function can be used to change the minimum_size.
     82
     83       \return reference to threshold minimum size
     84     */
    7285    unsigned int& minimum_size(void);
    7386
    7487    /**
    75        minimum_size is the threshold for when a normal
    76        approximation is used for the p-value calculation.
     88       \brief threshold for p_value calculation
     89
     90       Threshold deciding whether p-value is computed using exact
     91       method or a Gaussian approximation. If both number of positive
     92       samples, n_pos(void), and number of negative samples,
     93       n_neg(void), are smaller than minimum_size the exact method is
     94       used.
    7795       
    78        @return const reference to minimum_size
     96       \see p_value
     97
     98       \return const reference to minimum_size
    7999    */
    80100    const unsigned int& minimum_size(void) const;
    81101
    82102    ///
     103    /// \brief number of samples
     104    ///
    83105    /// @return sum of weights
    84106    ///
     
    86108
    87109    ///
     110    /// \brief number of negative samples
     111    ///
    88112    /// @return sum of weights with negative target
    89113    ///
     
    91115
    92116    ///
     117    /// \brief number of positive samples
     118    ///
    93119    /// @return sum of weights with positive target
    94120    ///
    95121    double n_pos(void) const;
    96122
    97     ///
    98     ///Calculates the p-value, i.e. the probability of observing an
    99     ///area equally or larger if the null hypothesis is true. If P is
    100     ///near zero, this casts doubt on this hypothesis. The null
    101     ///hypothesis is that the values from the 2 classes are generated
    102     ///from 2 identical distributions. The alternative is that the
    103     ///median of the first distribution is shifted from the median of
    104     ///the second distribution by a non-zero amount. If the smallest
    105     ///group size is larger than minimum_size (default = 10), then P
    106     ///is calculated using a normal approximation. 
    107     ///
    108     /// \note Weights should be either zero or unity, else present
    109     /// implementation is nonsense.
    110     ///
    111     /// @return One-sided p-value.
    112     ///
     123    /**
     124       \brief One-sided P-value
     125
     126       Calculates the one-sided p-value, i.e., probability to get this
     127       area (or greater) given that there is no difference
     128       between the two classes.
     129
     130      \b Exact \b method: In the exact method the function goes
     131      through all permutations and counts the fraction for which the
     132      area is greater than (or equal to) the area in the original permutation.
     133
     134       \b Large-sample \b Approximation: When many data points are
     135       available, see minimum_size(), a Gaussian approximation is used
     136       and the p-value is calculated as
     137       \f[
     138       P = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^z
     139       \exp{\left(-\frac{t^2}{2}\right)} dt
     140       \f]
     141
     142       where
     143
     144       \f[
     145      z = \frac{\textrm{area} - 0.5 - 0.5/(n^+ \cdot n^-)}{s}
     146       \f]
     147
     148       and
     149
     150       \f[
     151       s^2 = \frac{n+1+\sum \left(n_x \cdot (n_x^2-1)\right)}
     152       {12\cdot n^+\cdot n^-}
     153       \f]
     154
     155      where the sum runs over the different data values (of ties) and \f$ n_x
     156      \f$ is the number of data points with that value. The sum is a
     157      correction term for ties and is zero if there are no ties.
     158
     159       \return \f$ P(a \ge \textrm{area}) \f$
     160
     161       \note Weights should be -1, 0, or 1; otherwise the p-value is
     162       undefined and may change in future versions.
     163     */
    113164    double p_value_one_sided(void) const;
    114165   
    115166    /**
    116        @brief Two-sided p-value.
    117 
    118        @return min(2*p_value_one_sided, 2-2*p_value_one_sided)
     167       \brief Two-sided p-value.
     168
     169       Calculates the probability to get an area, \c a, equal or more
     170       extreme than \c area
     171       \f[
     172       P(a \ge \textrm{max}(\textrm{area},1-\textrm{area})) +
     173       P(a \le \textrm{min}(\textrm{area}, 1-\textrm{area})) \f]
     174
     175       If there are no ties, distribution of \a a is symmetric, so if
     176       area is greater than 0.5, this boils down to \f$ P = 2*P(a \ge
     177       \textrm{area}) = 2*P_\textrm{one-sided}\f$.
     178
     179       \return two-sided p-value
     180
     181       \see p_value_one_sided
    119182    */
    120183    double p_value(void) const;
Note: See TracChangeset for help on using the changeset viewer.