source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1737

Last change on this file since 1737 was 1737, checked in by Peter, 12 years ago

addresses #478 - preparing for weighted version

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.2 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataWeight.h"
25#include "yat/utility/iterator_traits.h"
26#include "yat/utility/Vector.h"
27#include "yat/utility/yat_assert.h"
28
29#include <algorithm>
30#include <iterator>
31#include <stdexcept>
32#include <vector>
33
34namespace theplu {
35namespace yat {
36namespace utility {
37  class VectorBase;
38}
39namespace normalizer {
40
41  /**
42     \brief Perform Q-quantile normalization
43
44     After a Q-quantile normalization each column has approximately
45     the same distribution of data (the Q-quantiles are the
46     same). Also, within each column the rank of an element is not
47     changed.
48
49     There is currently no weighted version of qQuantileNormalizer
50
51     The normalization goes like this
52     - Data is not assumed to be sorted.
     - Partition the sorted target data into N parts. N must be 3 or
       larger because of requirements from the underlying cspline fit
55     - Calculate the arithmetic mean for each part, the mean is
56       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
59     - For each part, calculate the difference between the target and
60       the source. Now we have N differences d_i with associated rank
61       (midpoint of each part).
62     - Create a cubic spline fit to this difference vector d. The
63       resulting curve is used to recalculate all column values.
64       - Use the cubic spline fit for values within the cubic spline
65         fit range [midpoint 1st part, midpoint last part].
66       - For data outside the cubic spline fit use linear
67         extrapolation, i.e., a constant shift. d_first for points
68         below fit range, and d_last for points above fit range.
69
70     \since New in yat 0.5
71   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Create a normalizer with a given target distribution.

       The target distribution is the data in the range [\a first, \a
       last), which is partitioned into \a Q parts.

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \a Q is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \a Q equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       Data in the range [\a first, \a last) is normalized against
       the target distribution and the result is written through \a
       result (in the same order as the input).

       It is possible to normalize "in place"; it is permissible for
       \a result to point to the beginning of the input range.

       \note the output range must have room for as many elements as
       the input range, i.e., last-first elements.

       \return iterator pointing one past the last written element
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    RandomAccessIterator2 operator()(RandomAccessIterator1 first, 
                                     RandomAccessIterator1 last,
                                     RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a vector of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.

       Data in [\a first, \a last) is partitioned into \a N parts.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"; dispatched to from the template
    // constructor when the iterator carries no weights
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // create partition from sorted data; fills average_ and index_
    void init(const utility::VectorBase&, unsigned int N);

    utility::Vector average_; // average of each part
    utility::Vector index_;   // mid point (rank) of each part
  };


    // partition of the target distribution built at construction
    Partitioner target_;
  };
157
158
159  // template implementations
160
161  template<typename BidirectionalIterator>
162  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
163                                           BidirectionalIterator last,
164                                           unsigned int Q)
165    : target_(Partitioner(first, last, Q))
166  {
167    utility::yat_assert<std::runtime_error>(Q>2, 
168                                            "qQuantileNormalizer: Q too small");
169  }
170
171
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  RandomAccessIterator2
  qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
                                  RandomAccessIterator1 last,
                                  RandomAccessIterator2 result) const
  {
    // Normalize [first, last) against the target distribution; the
    // i:th output element corresponds to the i:th input element.
    size_t N = last-first;
    // need at least as many data points as there are parts in target
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // sorted_index[r] is the position in [first, last) of the element
    // with rank r, i.e., a rank -> original-position map
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // partition the source data into as many parts as the target, and
    // compute per-part difference (source average - target average)
    Partitioner source(first, last, target_.size());
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    // cubic spline through the differences, as function of rank
    regression::CSplineInterpolation cspline(idx,diff);

    // linear interpolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below takes care of limiting case number
    // of parts approximately equal to the number of matrix rows and
    // the second condition makes sure that index is large enough
    // when using cspline below ... the static cast above takes the
    // floor whereas we want to take the "roof" forcing next index
    // range to be within interpolation range for the cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of limiting case number of parts approximately
    // equal to the number of matrix rows
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      // the element's rank determines its correction
      result[srow] = first[srow] - cspline.evaluate(row) ;
    }
   
    // linear interpolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
    // iterator pointing one past the last written element
    return result + N;
  }
231
232
233  template<typename BidirectionalIterator>
234  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
235                                                BidirectionalIterator last, 
236                                                unsigned int N)
237    : average_(utility::Vector(N)), index_(utility::Vector(N))
238  {
239    typedef typename 
240      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
241    build(first, last, N, tag());
242         
243  }
244
245
246  template<typename Iterator>
247  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
248                                               unsigned int N, 
249                                               utility::unweighted_iterator_tag)
250  {
251    utility::Vector vec(std::distance(first, last));
252    std::copy(first, last, vec.begin());
253    std::sort(vec.begin(), vec.end());
254    init(vec, N);
255  }
256
257
258}}} // end of namespace normalizer, yat and thep
259
260#endif
Note: See TracBrowser for help on using the repository browser.