source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1751

Last change on this file since 1751 was 1751, checked in by Peter, 12 years ago

refs #478 - updated docs and simplified call to data_iterator and weight_iterator.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.6 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/Vector.h"
28#include "yat/utility/WeightIterator.h"
29#include "yat/utility/yat_assert.h"
30
31#include <algorithm>
32#include <iterator>
33#include <stdexcept>
34#include <vector>
35
36namespace theplu {
37namespace yat {
38namespace utility {
39  class VectorBase;
40}
41namespace normalizer {
42
  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     The normalization goes like this
     - Data is not assumed to be sorted.
     - Partition sorted target data in N parts. N must be 3 or larger
       because of requirements from the underlying cspline fit
     - Calculate the arithmetic mean for each part, the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated rank
       (midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. d_first for points
         below fit range, and d_last for points above fit range.

     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Construct a normalizer with the target distribution
       defined by the range [\a first, \a last).

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ N \f$ is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \f$ N \f$ equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same.
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"; sorts a copy of the data and delegates to init
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor"; sorts a copy of the data/weight pairs
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    // per-part averages and the (double-valued) mid point of each part
    utility::Vector average_;
    utility::Vector index_;
  };


    // partitioning of the target distribution set in the constructor
    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    // copy weights and apply unweighted version on data part
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
172
173
174  // template implementations
175
176  template<typename BidirectionalIterator>
177  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
178                                           BidirectionalIterator last,
179                                           unsigned int Q)
180    : target_(Partitioner(first, last, Q))
181  {
182    utility::yat_assert<std::runtime_error>(Q>2, 
183                                            "qQuantileNormalizer: Q too small");
184  }
185
186
187  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
188  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
189                                       RandomAccessIterator1 last,
190                                       RandomAccessIterator2 result) const
191  {
192    Partitioner source(first, last, target_.size());
193    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
194    normalize(source, first, last, result, tag);
195  }
196
197
  // Unweighted implementation. The elements of [first, last) are
  // ranked; the difference between source and target part averages is
  // interpolated with a cubic spline over the mid-part ranks, and each
  // element is shifted by the difference evaluated at its rank.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    // fewer input points than parts would leave some parts empty
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // sorted_index[r] is the position in [first, last) of the element
    // with rank r
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // per-part difference between source and target averages; this is
    // the shift to subtract from each source element
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear interpolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below takes care of limiting case number
    // of parts approximately equal to the number of matrix rows and
    // the second condition makes sure that index is large enough
    // when using cspline below ... the static cast above takes the
    // floor whereas we want to take the "roof" forcing next index
    // range to be within interpolation range for the cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of limiting case number of parts approximately
    // equal to the number of matrix rows
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row) ;
    }
   
    // linear interpolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }
259
260
261  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
262  void qQuantileNormalizer::normalize(const Partitioner& source,
263                                      RandomAccessIterator1 first, 
264                                      RandomAccessIterator1 last, 
265                                      RandomAccessIterator2 result,
266                                      utility::weighted_iterator_tag tag) const
267  {
268    // copy the weights
269    std::copy(utility::weight_iterator(first), 
270              utility::weight_iterator(last), 
271              utility::weight_iterator(result)); 
272    // apply algorithm on data part of range
273    normalize(source, utility::data_iterator(first),
274              utility::data_iterator(last),
275              utility::data_iterator(result),
276              utility::unweighted_iterator_tag());
277  }
278
279
280  template<typename BidirectionalIterator>
281  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
282                                                BidirectionalIterator last, 
283                                                unsigned int N)
284    : average_(utility::Vector(N)), index_(utility::Vector(N))
285  {
286    typedef typename 
287      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
288    build(first, last, N, tag());
289         
290  }
291
292
293  template<typename Iterator>
294  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
295                                               unsigned int N, 
296                                               utility::unweighted_iterator_tag)
297  {
298    utility::Vector vec(std::distance(first, last));
299    std::copy(first, last, vec.begin());
300    std::sort(vec.begin(), vec.end());
301    init(vec, N);
302  }
303
304
305  template<typename Iterator>
306  void qQuantileNormalizer::Partitioner::build(Iterator first, 
307                                               Iterator last, unsigned int N, 
308                                               utility::weighted_iterator_tag)
309  {
310    std::vector<utility::DataWeight> vec;
311    vec.reserve(std::distance(first, last));
312    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
313    std::copy(first, last, inserter);
314    std::sort(vec.begin(), vec.end());
315    init(vec, N);
316  }
317
318
319}}} // end of namespace normalizer, yat and thep
320
321#endif
Note: See TracBrowser for help on using the repository browser.