source: trunk/lib/statistics/Fisher.h @ 447

Last change on this file since 447 was 447, checked in by Peter, 16 years ago

added copy constructor for KernelView? and added construction of KernelView? in test

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 4.8 KB
Line 
1// $Id: Fisher.h 447 2005-12-15 18:51:18Z peter $
2
3#ifndef _theplu_statistics_fisher_
4#define _theplu_statistics_fisher_
5
6#include <c++_tools/statistics/Score.h>
7#include <c++_tools/gslapi/vector.h>
8
9
10namespace theplu {
11namespace statistics { 
12  ///
13  /// @brief Fisher's exact test.   
14  /// Fisher's Exact test is a procedure that you can use for data
15  /// in a two by two contingency table: \f[ \begin{tabular}{|c|c|}
16  /// \hline a&b \tabularnewline \hline c&d \tabularnewline \hline
17  /// \end{tabular} \f] Fisher's Exact Test is based on exact
18  /// probabilities from a specific distribution (the hypergeometric
19  /// distribution). There's really no lower bound on the amount of
20  /// data that is needed for Fisher's Exact Test. You do have to
21  /// have at least one data value in each row and one data value in
22  /// each column. If an entire row or column is zero, then you
23  /// don't really have a 2 by 2 table. But you can use Fisher's
24  /// Exact Test when one of the cells in your table has a zero in
25  /// it. Fisher's Exact Test is also very useful for highly
26  /// imbalanced tables. If one or two of the cells in a two by two
27  /// table have numbers in the thousands and one or two of the
28  /// other cells has numbers less than 5, you can still use
29  /// Fisher's Exact Test. For very large tables (where all four
30  /// entries in the two by two table are large), your computer may
31  /// take too much time to compute Fisher's Exact Test. In these
32  /// situations, though, you might as well use the Chi-square test
33  /// because a large sample approximation (that the Chi-square test
34  /// relies on) is very reasonable. If all elements are larger than
35  /// 10 a Chi-square test is reasonable to use.
36  ///
37  /// @note The statistic assumes that each column sum and each row
38  /// sum is fixed. Just because you have a 2x2 table, this
39  /// assumption does not necessarily match your experimental setup.
40  /// See e.g. Barnard's test for an alternative.
41  ///
42 
43  class Fisher : public Score
44  {
45 
46  public:
47    ///
48    /// Default Constructor.
49    ///
50    Fisher(bool absolute=true);
51
52    ///
53    /// Destructor
54    ///
55    virtual ~Fisher(void) {};
56         
57   
58    ///
59    /// Cutoff sets the limit whether a value should go into the left
60    /// or the right column. @see score
61    ///
62    /// @return reference to cutoff for column
63    ///
64    inline double& cutoff_column(void) { return cutoff_column_; }
65
66    ///
67    /// Cutoff sets the limit whether a value should go into the left
68    /// or the right row. @see score
69    ///
70    /// @return reference to cutoff for row
71    ///
72    inline double& cutoff_row(void) { return cutoff_row_; }
73
74    ///
75    /// Calculates the expected values under the null hypothesis.
76    /// a' = (a+c)(a+b)/(a+b+c+d)
77    ///
78    void expected(u_int& a, u_int& b, u_unt& c, u_int& d);
79
80    ///
81    /// minimum_size is the threshold for when the p-value calculation
82    /// is performed using a Chi2 approximation.
83    ///
84    /// @return reference to minimum_size
85    ///
86    inline u_int& minimum_size(void){ return minimum_size_; } 
87
88    ///
89    /// If absolute, the p-value is the two-sided p-value. If all
90    /// elements in table is at least minimum_size, a Chi2
91    /// approximation is used.
92    ///
93    /// @return p-value
94    ///
95    double p_value() const;
96   
97    ///
98    /// Function calculating score from 2x2 table for which the
99    /// elements are calculated as follows \n
100    /// a: #data \f$ x=1 \f$ AND \f$ y=1 \f$ \n
101    /// b: #data \f$ x=-1 \f$ AND \f$ y=1 \f$ \n
102    /// c: #data \f$ x=1 \f$ AND \f$ y=-1 \f$ \n
103    /// d: #data \f$ x=-1 \f$ AND \f$ y=1 \f$ \n
104    ///
105    /// @return odds ratio. If absolute_ is true and odds ratio is
106    /// less than unity 1 divided by odds ratio is returned
107    ///
108    double score(const gslapi::vector& x, const gslapi::vector& y, 
109                 const std::vector<size_t>& = std::vector<size_t>());
110
111    ///
112    /// Weighted version of score. Each element in 2x2 table is
113    /// calculated as \f$ \sum w_i \f$, so when each weight is
114    /// unitary the same table is created as in the unweighted version
115    ///
116    /// @return odds ratio
117    ///
118    /// @note
119    ///
120    double score(const gslapi::vector& x, const gslapi::vector& y, 
121                 const gslapi::vector& w,
122                 const std::vector<size_t>& = std::vector<size_t>());
123
124    ///
125    /// \f$ \frac{ad}{bc} \f$
126    ///
127    /// @return odds ratio. If absolute_ is true and odds ratio is
128    /// less than unity, 1 divided by odds ratio is returned
129    ///
130    double score(const u_int a, const u_int b, 
131                 const u_int c, const u_int d); 
132   
133
134         
135  private:
136    double oddsratio(const double a, const double b, 
137                     const double c, const double d) const;
138
139    double p_value_approximative(void) const;
140    double p_value_exact(void) const;
141
142    std::vector<size_t> train_set_;
143    gslapi::vector weight_;
144    double a_;
145    double b_;
146    double c_;
147    double d_;
148    double cutoff_column_;
149    double cutoff_row_;
150    double minimum_size_;
151
152  };
153
154}} // of namespace statistics and namespace theplu
155
156#endif
157
Note: See TracBrowser for help on using the repository browser.