source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1738

Last change on this file since 1738 was 1738, checked in by Peter, 12 years ago

added support for creation of a qQuantileNormalizer from a weighted range. Addresses #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.9 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/Vector.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <iterator>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     There is currently no weighted version of qQuantileNormalizer.

     The normalization works as follows:
     - Data is not assumed to be sorted.
     - Partition the sorted target data into N parts. N must be at
       least 3 because of requirements from the underlying cspline
       fit.
     - Calculate the arithmetic mean of each part; the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated rank
       (midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift: d_first for points
         below the fit range, and d_last for points above it.

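     A minimal usage sketch; the container names and the choice of Q
     below are illustrative only, not part of the interface:
     \code
     std::vector<double> target;          // reference distribution
     std::vector<double> data;            // data to be normalized
     // ... fill target and data ...
     std::vector<double> result(data.size());
     qQuantileNormalizer qqn(target.begin(), target.end(), 10u);
     qqn(data.begin(), data.end(), result.begin());
     \endcode
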
     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Construct a qQuantileNormalizer with a target distribution.

       The range [\a first, \a last) holds the target distribution.
       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \a Q is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \a Q equal to or less than the smallest number
       of data points in the target or in any data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
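
       For example, when several data sets are to be normalized
       against the same target, the largest permissible choice is
       (the names below are illustrative):
       \code
       unsigned int Q = std::min(target.size(), smallest_data_size);
       \endcode
       provided the resulting \a Q is at least three.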
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       It is possible to normalize "in place"; the input range
       [\a first, \a last) and the output range starting at \a result
       may refer to the same data.

       \note The output range must hold as many elements as the input
       range [\a first, \a last).
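
       A sketch of an in-place call, where \c qqn is a
       qQuantileNormalizer constructed as in the class example and
       \c data is an illustrative std::vector<double>:
       \code
       qqn(data.begin(), data.end(), data.begin());
       \endcode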
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    RandomAccessIterator2 operator()(RandomAccessIterator1 first,
                                     RandomAccessIterator1 last,
                                     RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of equal size.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };


    Partitioner target_;
  };


  // template implementations

  template<typename BidirectionalIterator>
  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first,
                                           BidirectionalIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
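    // enforce the documented lower bound on Q (at least three parts)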
    utility::yat_assert<std::runtime_error>(Q>2,
                                            "qQuantileNormalizer: Q too small");
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  RandomAccessIterator2
  qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                  RandomAccessIterator1 last,
                                  RandomAccessIterator2 result) const
  {
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);
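    // sorted_index[r] is the position in the original (unsorted) input
    // of the element with rank r; writing through it below keeps the
    // results in the original element order.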

    Partitioner source(first, last, target_.size());
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // constant shift (linear extrapolation) for the first part,
    // i.e., use the first diff for all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below handles the limiting case where the
    // number of parts is approximately equal to the number of data
    // points. The second condition makes sure that the index is
    // large enough when cspline is used below: the static_cast above
    // takes the floor, whereas we want the ceiling so that the next
    // index range stays within the interpolation range of the
    // cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // handle the limiting case where the number of parts is
    // approximately equal to the number of data points
    if (end==(static_cast<size_t>(N-1)))
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row);
    }

    // constant shift (linear extrapolation) for the last part, i.e.,
    // use the last diff for all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
    return result + N;
  }


  template<typename BidirectionalIterator>
  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first,
                                                BidirectionalIterator last,
                                                unsigned int N)
    : average_(utility::Vector(N)), index_(utility::Vector(N))
  {
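    // Dispatch on the iterator type: weighted_iterator_traits selects
    // the weighted or unweighted build() overload below.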
    typedef typename
      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
    build(first, last, N, tag());
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last,
                                               unsigned int N,
                                               utility::unweighted_iterator_tag)
  {
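    // Copy the (unweighted) values, sort them, and let init() compute
    // the part averages and mid point indices.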
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename Iterator>
  void qQuantileNormalizer::Partitioner::build(Iterator first,
                                               Iterator last, unsigned int N,
                                               utility::weighted_iterator_tag)
  {
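    // Copy the (data, weight) pairs, sort them, and let init() compute
    // the part statistics from the weighted data.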
    std::vector<utility::DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif