Index: /trunk/yat/statistics/AUC.h
===================================================================
 /trunk/yat/statistics/AUC.h (revision 2593)
+++ /trunk/yat/statistics/AUC.h (revision 2594)
@@ 40,7 +40,22 @@
namespace statistics {
 ///
 /// @brief Class calculating Area Under ROC Curve
 ///
+ /**
+ \brief Area Under ROC Curve
+
+ Class calculates area under curve from values, Target, and
+ possibly weights. If weights are left out, unity weights are
+ assumed. The area under curve is defined as
+ \f[
+ \frac{\sum_{i,j} I(x_i^+ - x_j^-) w_i^+ w_j^-} {\sum_{i,j}
+ w_i^+ w_j^-} \f]
+ where
+ \f{eqnarray*}{
+ &0; &x<0 \\
+ I(x) = &0.5; &x=0 \\
+ &1; &x>0
+ \f}
+
+ Complexity of calculating the AUC is \f$ N \log N \f$.
+ */
class AUC : public Score
{
@@ 48,45 +63,23 @@
public:
///
 /// \brief Defaul Constructor
+ /// \brief Default Constructor
/// \param absolute if true max(AUC, 1-AUC) is used
///
AUC(bool absolute=true);
 /// Function taking \a value, \a target (+1 or -1) and vector
 /// defining what samples to use. The score is equivalent to
 /// Mann-Whitney statistics.
 /// @return the area under the ROC curve. If the area is less
 /// than 0.5 and absolute=true, 1-area is returned. Complexity is
 /// \f$ N\log N \f$ where \f$ N \f$ is number of samples.
 ///
+ /**
+ \return area under the ROC curve.
+ */
double score(const classifier::Target& target,
const utility::VectorBase& value) const;
 /**
 Function taking values, target, weight and a vector defining
 what samples to use. The area is defines as \f$ \frac{\sum
 w^+w^}{\sum w^+w^}\f$, where the sum in the numerator goes
 over all pairs where value+ is larger than value. The
 denominator goes over all pairs. If target is equal to 1,
 sample belonges to class + otherwise sample belongs to class
 . @return wheighted version of area under the ROC curve. If
 the area is less than 0.5 and absolute=true, 1area is
 returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
 of samples.
+ /**
+ \return area under the ROC curve.
*/
double score(const classifier::Target& target,
const classifier::DataLookupWeighted1D& value) const;
 /**
 Function taking values, target, weight and a vector defining
 what samples to use. The area is defines as \f$ \frac{\sum
 w^+w^}{\sum w^+w^}\f$, where the sum in the numerator goes
 over all pairs where value+ is larger than value. The
 denominator goes over all pairs. If target is equal to 1,
 sample belonges to class + otherwise sample belongs to class
 . @return wheighted version of area under the ROC curve. If
 the area is less than 0.5 and absolute=true, 1area is
 returned. Complexity is \f$ N^2 \f$ where \f$ N \f$ is number
 of samples.
+ /**
+ \return area under the ROC curve.
*/
double score(const classifier::Target& target,
@@ 99,5 +92,4 @@
typedef std::multimap > MultiMap;
double score(const MultiMap&) const;

};
Index: /trunk/yat/statistics/ROC.h
===================================================================
 /trunk/yat/statistics/ROC.h (revision 2593)
+++ /trunk/yat/statistics/ROC.h (revision 2594)
@@ 35,9 +35,11 @@
///
 /// @brief Class for Reciever Operating Characteristic.
 ///
+ /// @brief Receiver Operating Characteristic.
+ ///
/// As the area under an ROC curve is equivalent to Mann-Whitney U
/// statistic, this class can be used to perform a Mann-Whitney
/// U-test (aka Wilcoxon).
+ ///
+ /// \see AUC
///
class ROC
@@ 51,12 +53,22 @@
/**
 Adding a data value to ROC.
+ \brief Add a data value.
+
+ \param value data value
+ \param target \c true if value belongs to class positive
+
+ \param weight indicating how important the data point is. A
+ zero weight implies the data point is ignored. A negative
+ weight should be understood as removing a data point and thus
+ typically only makes sense if there is a previously added data
+ point with same \a value and \a target.
+
*/
void add(double value, bool target, double weight=1.0);
/**
 The area is defines as \f$ \frac{\sum w^+w^} {\sum w^+w^}\f$,
 where the sum in the numerator goes over all pairs where value+
 is larger than value. The denominator goes over all pairs.
+ \brief Area Under Curve, AUC
+
+ \see AUC for how the area is calculated
@return Area under curve.
@@ 64,21 +76,31 @@
double area(void);
 ///
 /// minimum_size is the threshold for when a normal
 /// approximation is used for the pvalue calculation.
 ///
 /// @return reference to minimum_size
 ///
+ /**
+ \brief threshold for p_value calculation
+
+ Function can be used to change the minimum_size.
+
+ \return reference to threshold minimum size
+ */
unsigned int& minimum_size(void);
/**
 minimum_size is the threshold for when a normal
 approximation is used for the pvalue calculation.
+ \brief threshold for p_value calculation
+
+ Threshold deciding whether p-value is computed using exact
+ method or a Gaussian approximation. If both number of positive
+ samples, n_pos(void), and number of negative samples,
+ n_neg(void), are smaller than minimum_size the exact method is
+ used.
 @return const reference to minimum_size
+ \see p_value
+
+ \return const reference to minimum_size
*/
const unsigned int& minimum_size(void) const;
///
+ /// \brief number of samples
+ ///
/// @return sum of weights
///
@@ 86,4 +108,6 @@
///
+ /// \brief number of negative samples
+ ///
/// @return sum of weights with negative target
///
@@ 91,30 +115,69 @@
///
+ /// \brief number of positive samples
+ ///
/// @return sum of weights with positive target
///
double n_pos(void) const;
 ///
 ///Calculates the pvalue, i.e. the probability of observing an
 ///area equally or larger if the null hypothesis is true. If P is
 ///near zero, this casts doubt on this hypothesis. The null
 ///hypothesis is that the values from the 2 classes are generated
 ///from 2 identical distributions. The alternative is that the
 ///median of the first distribution is shifted from the median of
 ///the second distribution by a nonzero amount. If the smallest
 ///group size is larger than minimum_size (default = 10), then P
 ///is calculated using a normal approximation.
 ///
 /// \note Weights should be either zero or unity, else present
 /// implementation is nonsense.
 ///
 /// @return Onesided pvalue.
 ///
+ /**
+ \brief One-sided P-value
+
+ Calculates the one-sided p-value, i.e., probability to get this
+ area (or greater) given that there is no difference
+ between the two classes.
+
+ \b Exact \b method: In the exact method the function goes
+ through all permutations and counts the fraction for which the
+ area is greater than (or equal to) the area in original permutation.
+
+ \b Large-sample \b Approximation: When many data points are
+ available, see minimum_size(), a Gaussian approximation is used
+ and the p-value is calculated as
+ \f[
+ P = \frac{1}{\sqrt{2\pi}} \int_{-\infty}^z
+ \exp{\left(-\frac{t^2}{2}\right)} dt
+ \f]
+
+ where
+
+ \f[
+ z = \frac{\textrm{area} - 0.5 - 0.5/(n^+ \cdot n^-)}{s}
+ \f]
+
+ and
+
+ \f[
+ s^2 = \frac{n+1+\sum \left(n_x \cdot (n_x^2-1)\right)}
+ {12\cdot n^+\cdot n^-}
+ \f]
+
+ where sum runs over different data values (of ties) and \f$ n_x
+ \f$ is number of data points with that value. The sum is a
+ correction term for ties and is zero if there are no ties.
+
+ \return \f$ P(a \ge \textrm{area}) \f$
+
+ \note Weights should be -1, 0, or 1; otherwise the p-value is
+ undefined and may change in future versions.
+ */
double p_value_one_sided(void) const;
/**
 @brief Twosided pvalue.

 @return min(2*p_value_one_sided, 2-2*p_value_one_sided)
+ \brief Two-sided p-value.
+
+ Calculates the probability to get an area, \c a, equal or more
+ extreme than \c area
+ \f[
+ P(a \ge \textrm{max}(\textrm{area},1-\textrm{area})) +
+ P(a \le \textrm{min}(\textrm{area}, 1-\textrm{area})) \f]
+
+ If there are no ties, distribution of \a a is symmetric, so if
+ area is greater than 0.5, this boils down to \f$ P = 2*P(a \ge
+ \textrm{area}) = 2*P_\textrm{one-sided}\f$.
+
+ \return two-sided p-value
+
+ \see p_value_one_sided
*/
double p_value(void) const;