Changeset 2594


Ignore:
Timestamp:
Oct 30, 2011, 3:36:17 AM (10 years ago)
Author:
Peter
Message:

improve docs for ROC and sister class AUC. closes #144

Location:
trunk/yat/statistics
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/statistics/AUC.h

    r2119 r2594  
    4040namespace statistics { 
    4141
    42   ///
    43   /// @brief Class calculating Area Under ROC Curve
    44   ///   
     42  /**
     43     \brief Area Under ROC Curve
     44
     45     Class calculates area under curve from values, Target, and
     46     possibly weights. If weights are left out, unity weights are
     47     assumed. The area under curve is defined as
     48     \f[
     49     \frac{\sum_{i,j} I(x_i^+-x_j^-) w_i^+w_j^-} {\sum_{i,j}
     50     w_i^+w_j^-} \f]
     51     where
     52     \f{eqnarray*}{
     53            &0; &x<0 \\
     54     I(x) = &0.5; &x=0 \\
     55            &1; &x>0
     56     \f}
     57
     58     Complexity of calculating the AUC is \f$ N \log N \f$.
     59  */ 
    4560  class AUC : public Score
    4661  {
     
    4863  public:
    4964    ///
    50     /// \brief Defaul Constructor
     65    /// \brief Default Constructor
    5166    /// \param absolute if true max(AUC, 1-AUC) is used
    5267    ///
    5368    AUC(bool absolute=true);
    5469
    55     /// Function taking \a value, \a target (+1 or -1) and vector
    56     /// defining what samples to use. The score is equivalent to
    57     /// Mann-Whitney statistics.
    58     /// @return the area under the ROC curve. If the area is less
    59     /// than 0.5 and absolute=true, 1-area is returned. Complexity is
    60     /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
    61     ///
     70    /**
     71       \return area under the ROC curve.
     72    */
    6273    double score(const classifier::Target& target,
    6374                 const utility::VectorBase& value) const;
    6475   
    65     /**
    66         Function taking values, target, weight and a vector defining
    67         what samples to use. The area is defines as \f$ \frac{\sum
    68         w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
    69         over all pairs where value+ is larger than value-. The
    70         denominator goes over all pairs. If target is equal to 1,
    71         sample belonges to class + otherwise sample belongs to class
    72         -. @return wheighted version of area under the ROC curve. If
    73         the area is less than 0.5 and absolute=true, 1-area is
    74         returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
    75         of samples.
     76    /**
     77       \return area under the ROC curve.
    7678    */
    7779    double score(const classifier::Target& target,
    7880                 const classifier::DataLookupWeighted1D& value) const;
    7981
    80     /**
    81         Function taking values, target, weight and a vector defining
    82         what samples to use. The area is defines as \f$ \frac{\sum
    83         w^+w^-}{\sum w^+w^-}\f$, where the sum in the numerator goes
    84         over all pairs where value+ is larger than value-. The
    85         denominator goes over all pairs. If target is equal to 1,
    86         sample belonges to class + otherwise sample belongs to class
    87         -. @return wheighted version of area under the ROC curve. If
    88         the area is less than 0.5 and absolute=true, 1-area is
    89         returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
    90         of samples.
     82    /**
     83       \return area under the ROC curve.
    9184    */
    9285    double score(const classifier::Target& target,
     
    9992    typedef std::multimap<double, std::pair<bool, double> > MultiMap;
    10093    double score(const MultiMap&) const;
    101    
    10294  };
    10395
  • trunk/yat/statistics/ROC.h

    r2592 r2594  
    3535
    3636  ///
    37   /// @brief Class for Reciever Operating Characteristic.
    38   ///   
     37  /// @brief Receiver Operating Characteristic.
     38  ///
    3939  /// As the area under an ROC curve is equivalent to Mann-Whitney U
    4040  /// statistic, this class can be used to perform a Mann-Whitney
    4141  /// U-test (aka Wilcoxon).
     42  ///
     43  /// \see AUC
    4244  ///
    4345  class ROC
     
    5153         
    5254    /**
    53        Adding a data value to ROC.
     55       \brief Add a data value.
     56
     57       \param value data value
     58       \param target \c true if value belongs to class positive
     59
     60       \param weight indicating how important the data point is. A
     61       zero weight implies the data point is ignored. A negative
     62       weight should be understood as removing a data point and thus
     63       typically only makes sense if there is a previously added data
     64       point with same \a value and \a target.
     65
    5466    */
    5567    void add(double value, bool target, double weight=1.0);
    5668
    5769    /**
    58        The area is defines as \f$ \frac{\sum w^+w^-} {\sum w^+w^-}\f$,
    59        where the sum in the numerator goes over all pairs where value+
    60        is larger than value-. The denominator goes over all pairs.
     70       \brief Area Under Curve, AUC
     71
     72       \see AUC for how the area is calculated
    6173
    6274       @return Area under curve.
     
    6476    double area(void);
    6577
    66     ///
    67     /// minimum_size is the threshold for when a normal
    68     /// approximation is used for the p-value calculation.
    69     ///
    70     /// @return reference to minimum_size
    71     ///
     78    /**
     79       \brief threshold for p_value calculation
     80
     81       Function can be used to change the minimum_size.
     82
     83       \return reference to threshold minimum size
     84     */
    7285    unsigned int& minimum_size(void);
    7386
    7487    /**
    75        minimum_size is the threshold for when a normal
    76        approximation is used for the p-value calculation.
     88       \brief threshold for p_value calculation
     89
     90       Threshold deciding whether p-value is computed using exact
     91       method or a Gaussian approximation. If both number of positive
     92       samples, n_pos(void), and number of negative samples,
     93       n_neg(void), are smaller than minimum_size the exact method is
     94       used.
    7795       
    78        @return const reference to minimum_size
     96       \see p_value
     97
     98       \return const reference to minimum_size
    7999    */
    80100    const unsigned int& minimum_size(void) const;
    81101
    82102    ///
     103    /// \brief number of samples
     104    ///
    83105    /// @return sum of weights
    84106    ///
     
    86108
    87109    ///
     110    /// \brief number of negative samples
     111    ///
    88112    /// @return sum of weights with negative target
    89113    ///
     
    91115
    92116    ///
     117    /// \brief number of positive samples
     118    ///
    93119    /// @return sum of weights with positive target
    94120    ///
    95121    double n_pos(void) const;
    96122
    97     ///
    98     ///Calculates the p-value, i.e. the probability of observing an
    99     ///area equally or larger if the null hypothesis is true. If P is
    100     ///near zero, this casts doubt on this hypothesis. The null
    101     ///hypothesis is that the values from the 2 classes are generated
    102     ///from 2 identical distributions. The alternative is that the
    103     ///median of the first distribution is shifted from the median of
    104     ///the second distribution by a non-zero amount. If the smallest
    105     ///group size is larger than minimum_size (default = 10), then P
    106     ///is calculated using a normal approximation. 
    107     ///
    108     /// \note Weights should be either zero or unity, else present
    109     /// implementation is nonsense.
    110     ///
    111     /// @return One-sided p-value.
    112     ///
     123    /**
     124       \brief One-sided P-value
     125
     126       Calculates the one-sided p-value, i.e., probability to get this
     127       area (or greater) given that there is no difference
     128       between the two classes.
     129
     130       \b Exact \b method: In the exact method the function goes
     131       through all permutations and counts the fraction for which the
     132       area is greater than (or equal to) the area in the original permutation.
     133
     134       \b Large-sample \b Approximation: When many data points are
     135       available, see minimum_size(), a Gaussian approximation is used
     136       and the p-value is calculated as
     137       \f[
     138       P = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^z
     139       \exp{\left(-\frac{t^2}{2}\right)} dt
     140       \f]
     141
     142       where
     143
     144       \f[
     145       z = \frac{\textrm{area} - 0.5 - 0.5/(n^+\cdot n^-)}{s}
     146       \f]
     147
     148       and
     149
     150       \f[
     151       s^2 = \frac{n+1+\sum \left(n_x \cdot (n_x^2-1)\right)}
     152       {12\cdot n^+\cdot n^-}
     153       \f]
     154
     155       where the sum runs over distinct data values (of ties) and \f$ n_x
     156       \f$ is the number of data points with that value. The sum is a
     157       correction term for ties and is zero if there are no ties.
     158
     159       \return \f$ P(a \ge \textrm{area}) \f$
     160
     161       \note Weights should be -1, 0, or 1; otherwise the p-value is
     162       undefined and may change in future versions.
     163     */
    113164    double p_value_one_sided(void) const;
    114165   
    115166    /**
    116        @brief Two-sided p-value.
    117 
    118        @return min(2*p_value_one_sided, 2-2*p_value_one_sided)
     167       \brief Two-sided p-value.
     168
     169       Calculates the probability to get an area, \c a, equal or more
     170       extreme than \c area
     171       \f[
     172       P(a \ge \textrm{max}(\textrm{area},1-\textrm{area})) +
     173       P(a \le \textrm{min}(\textrm{area}, 1-\textrm{area})) \f]
     174
     175       If there are no ties, distribution of \a a is symmetric, so if
     176       area is greater than 0.5, this boils down to \f$ P = 2*P(a \ge
     177       \textrm{area}) = 2*P_\textrm{one-sided}\f$.
     178
     179       \return two-sided p-value
     180
     181       \see p_value_one_sided
    119182    */
    120183    double p_value(void) const;
Note: See TracChangeset for help on using the changeset viewer.