source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1739

Last change on this file since 1739 was 1739, checked in by Peter, 12 years ago

fixes #479 and removes some debug output

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.8 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataWeight.h"
25#include "yat/utility/iterator_traits.h"
26#include "yat/utility/Vector.h"
27#include "yat/utility/yat_assert.h"
28
29#include <algorithm>
30#include <iterator>
31#include <stdexcept>
32#include <vector>
33
34namespace theplu {
35namespace yat {
36namespace utility {
37  class VectorBase;
38}
39namespace normalizer {
40
41  /**
42     \brief Perform Q-quantile normalization
43
44     After a Q-quantile normalization each column has approximately
45     the same distribution of data (the Q-quantiles are the
46     same). Also, within each column the rank of an element is not
47     changed.
48
49     There is currently no weighted version of qQuantileNormalizer
50
51     The normalization goes like this
52     - Data is not assumed to be sorted.
53     - Partition sorted target data in N parts. N must be 3 or larger
54       because of requirements from the underlying cspline fit
55     - Calculate the arithmetic mean for each part, the mean is
56       assigned to the mid point of each part.
57     - Do the above for the data to be transformed (called source
58       here).
59     - For each part, calculate the difference between the target and
60       the source. Now we have N differences d_i with associated rank
61       (midpoint of each part).
62     - Create a cubic spline fit to this difference vector d. The
63       resulting curve is used to recalculate all column values.
64       - Use the cubic spline fit for values within the cubic spline
65         fit range [midpoint 1st part, midpoint last part].
66       - For data outside the cubic spline fit use linear
67         extrapolation, i.e., a constant shift. d_first for points
68         below fit range, and d_last for points above fit range.
69
70     \since New in yat 0.5
71   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Construct a normalizer with the target distribution
       given by the range [\a first, \a last).

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \a Q is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \a Q equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       Data in [\a first, \a last) is normalized against the target
       distribution set at construction, and the normalized values
       are written through \a result.

       It is possible to normalize "in place"; it is permissible for
       \a result to refer to the same range as the input, because
       each element is read before its normalized value is written.

       \note \a result must be able to hold last-first elements, and
       the input range must contain at least as many data points as
       there are parts (Q).
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a vector of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    // average value of each part, assigned to the part's mid point
    utility::Vector average_;
    // mid point (rank) of each part; a double, not necessarily an
    // integer index
    utility::Vector index_;
  };


    // partitioning of the target distribution; source data is
    // normalized against this in operator()
    Partitioner target_;
  };
161
162
163  // template implementations
164
165  template<typename BidirectionalIterator>
166  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
167                                           BidirectionalIterator last,
168                                           unsigned int Q)
169    : target_(Partitioner(first, last, Q))
170  {
171    utility::yat_assert<std::runtime_error>(Q>2, 
172                                            "qQuantileNormalizer: Q too small");
173  }
174
175
  // Normalize [first, last) against the target distribution: each
  // element is shifted by a rank-dependent difference between the
  // source and target part averages, written through result.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // sorted_index[r] holds the position (within the input range) of
    // the element with rank r
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // partition the source data into as many parts as the target, and
    // compute the per-part shift: source average minus target average
    Partitioner source(first, last, target_.size());
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    // cubic spline through the shifts as a function of rank (the mid
    // point of each part)
    regression::CSplineInterpolation cspline(idx,diff);

    // linear interpolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below takes care of limiting case number
    // of parts approximately equal to the number of matrix rows and
    // the second condition makes sure that index is large enough
    // when using cspline below ... the static cast above takes the
    // floor whereas we want to take the "roof" forcing next index
    // range to be within interpolation range for the cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of limiting case number of parts approximately
    // equal to the number of matrix rows
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row) ;
    }

    // linear interpolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }
233
234
235  template<typename BidirectionalIterator>
236  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
237                                                BidirectionalIterator last, 
238                                                unsigned int N)
239    : average_(utility::Vector(N)), index_(utility::Vector(N))
240  {
241    typedef typename 
242      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
243    build(first, last, N, tag());
244         
245  }
246
247
248  template<typename Iterator>
249  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
250                                               unsigned int N, 
251                                               utility::unweighted_iterator_tag)
252  {
253    utility::Vector vec(std::distance(first, last));
254    std::copy(first, last, vec.begin());
255    std::sort(vec.begin(), vec.end());
256    init(vec, N);
257  }
258
259
260  template<typename Iterator>
261  void qQuantileNormalizer::Partitioner::build(Iterator first, 
262                                               Iterator last, unsigned int N, 
263                                               utility::weighted_iterator_tag)
264  {
265    std::vector<utility::DataWeight> vec;
266    vec.reserve(std::distance(first, last));
267    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
268    std::copy(first, last, inserter);
269    std::sort(vec.begin(), vec.end());
270    init(vec, N);
271  }
272
273
274}}} // end of namespace normalizer, yat and thep
275
276#endif
Note: See TracBrowser for help on using the repository browser.