source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1750

Last change on this file since 1750 was 1750, checked in by Peter, 12 years ago

added a compilation test for qQN::operator() with weighted iterators. refs #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.9 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6
7  This file is part of the yat library, http://dev.thep.lu.se/yat
8
9  The yat library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU General Public License as
11  published by the Free Software Foundation; either version 3 of the
12  License, or (at your option) any later version.
13
14  The yat library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  General Public License for more details.
18
19  You should have received a copy of the GNU General Public License
20  along with yat. If not, see <http://www.gnu.org/licenses/>.
21*/
22
23#include "yat/regression/CSplineInterpolation.h"
24#include "yat/utility/DataIterator.h"
25#include "yat/utility/DataWeight.h"
26#include "yat/utility/iterator_traits.h"
27#include "yat/utility/Vector.h"
28#include "yat/utility/WeightIterator.h"
29#include "yat/utility/yat_assert.h"
30
31#include <algorithm>
32#include <iterator>
33#include <stdexcept>
34#include <vector>
35
36namespace theplu {
37namespace yat {
38namespace utility {
39  class VectorBase;
40}
41namespace normalizer {
42
  /**
     \brief Perform Q-quantile normalization

     After a Q-quantile normalization each column has approximately
     the same distribution of data (the Q-quantiles are the
     same). Also, within each column the rank of an element is not
     changed.

     Both unweighted and weighted iterators are accepted by
     operator(); for weighted data the weights are copied through
     unchanged and the normalization is applied to the data part of
     the range.

     The normalization goes like this
     - Data is not assumed to be sorted.
     - Partition sorted target data in N parts. N must be 3 or larger
       because of requirements from the underlying cspline fit
     - Calculate the arithmetic mean for each part, the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called source
       here).
     - For each part, calculate the difference between the target and
       the source. Now we have N differences d_i with associated rank
       (midpoint of each part).
     - Create a cubic spline fit to this difference vector d. The
       resulting curve is used to recalculate all column values.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. d_first for points
         below fit range, and d_last for points above fit range.

     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Create a normalizer with target distribution taken from
       the range [\a first, \a last).

       The target range does not need to be sorted.

       \a Q is the number of parts and must be within \f$ [3,N] \f$
       where \f$ N \f$ is the total number of data points in the
       target. However, if \f$ N \f$ is larger than the number of points
       in the data to be normalized the behaviour of the code is
       undefined. Keep \f$ N \f$ equal to or less than the smallest
       number of data points in the target or each data set to be
       normalized against a given target. The lower bound of three is
       due to restrictions in the cspline fit utilized in the
       normalization.
    */
    template<typename BidirectionalIterator>
    qQuantileNormalizer(BidirectionalIterator first, BidirectionalIterator last,
                        unsigned int Q);

    /**
       \brief perform the Q-quantile normalization.

       Data in the range [\a first, \a last) is normalized against
       the target distribution and written through \a result.

       It is possible to normalize "in place"; it is permissible for
       \a result to point to the same range as \a first.

       \note The range written through \a result must be able to hold
       as many elements as [\a first, \a last).
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a vector of data into equal sizes.

     The class also calculates the average of each part and assigns
     the average to the mid point of each part. The midpoint is a
     double, i.e., it is not forced to be an integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename BidirectionalIterator>
    Partitioner(BidirectionalIterator first, BidirectionalIterator last, 
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       \brief Return the mid point for each partition.

       \return The index vector.
    */
    const utility::Vector& index(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename Iterator>
    void build(Iterator first, Iterator last, unsigned int N, 
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector index_;
  };


    // partitioning of the target distribution; shared by all
    // subsequent normalizations
    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    // copy weights and apply unweighted version on data part
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first, 
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
176
177
178  // template implementations
179
  // Constructor: partitions the target range into Q parts (see
  // Partitioner) and stores the result for use by operator().
  template<typename BidirectionalIterator>
  qQuantileNormalizer::qQuantileNormalizer(BidirectionalIterator first, 
                                           BidirectionalIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    // NOTE(review): target_ is already constructed with Q in the
    // initializer list before this precondition fires, so an invalid
    // Q reaches Partitioner first — consider validating Q earlier.
    utility::yat_assert<std::runtime_error>(Q>2, 
                                            "qQuantileNormalizer: Q too small");
  }
189
190
191  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
192  void qQuantileNormalizer::operator()(RandomAccessIterator1 first, 
193                                       RandomAccessIterator1 last,
194                                       RandomAccessIterator2 result) const
195  {
196    Partitioner source(first, last, target_.size());
197    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
198    normalize(source, first, last, result, tag);
199  }
200
201
  // Unweighted normalization work-horse. Subtracts, per element, an
  // interpolated source-minus-target shift: constant d_first below the
  // first part's midpoint, a cspline fit between the first and last
  // midpoints, and constant d_last above the last midpoint. Elements
  // are processed in rank order via sorted_index so each element is
  // shifted according to its rank.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void 
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first, 
                                 RandomAccessIterator1 last, 
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    // sorted_index[r] is the position in [first, last) of the element
    // with rank r
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // d_i = source average - target average for part i; this is the
    // shift subtracted from the source data
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.index();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear interpolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end=static_cast<unsigned int>(idx(0));
    // The first condition below takes care of limiting case number
    // of parts approximately equal to the number of matrix rows and
    // the second condition makes sure that index is large enough
    // when using cspline below ... the static cast above takes the
    // floor whereas we want to take the "roof" forcing next index
    // range to be within interpolation range for the cspline.
    if ((end==0) || (end<idx(0)))
      ++end;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(0);
    }
   
    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end=static_cast<unsigned int>(idx(target_.size()-1));
    // take care of limiting case number of parts approximately
    // equal to the number of matrix rows
    if (end==(static_cast<size_t>(N-1)) )
      --end;
    for (size_t row=start; row<=end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - cspline.evaluate(row) ;
    }
   
    // linear interpolation for last part, i.e., use last diff for
    // all points in the last part.
    start=end+1;
    end=N;
    for (size_t row=start; row<end; ++row) {
      size_t srow=sorted_index[row];
      result[srow] = first[srow] - diff(diff.size()-1);
    }
  }
263
264
265  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
266  void qQuantileNormalizer::normalize(const Partitioner& source,
267                                      RandomAccessIterator1 first, 
268                                      RandomAccessIterator1 last, 
269                                      RandomAccessIterator2 result,
270                                      utility::weighted_iterator_tag tag) const
271  {
272    // copy the weights
273    std::copy(utility::weight_iterator<RandomAccessIterator1>(first), 
274              utility::weight_iterator<RandomAccessIterator1>(last), 
275              utility::weight_iterator<RandomAccessIterator2>(result)); 
276    // apply algorithm on data part of range
277    normalize(source, utility::data_iterator<RandomAccessIterator1>(first),
278              utility::data_iterator<RandomAccessIterator1>(last),
279              utility::data_iterator<RandomAccessIterator2>(result),
280              utility::unweighted_iterator_tag());
281  }
282
283
284  template<typename BidirectionalIterator>
285  qQuantileNormalizer::Partitioner::Partitioner(BidirectionalIterator first, 
286                                                BidirectionalIterator last, 
287                                                unsigned int N)
288    : average_(utility::Vector(N)), index_(utility::Vector(N))
289  {
290    typedef typename 
291      utility::weighted_iterator_traits<BidirectionalIterator>::type tag;
292    build(first, last, N, tag());
293         
294  }
295
296
297  template<typename Iterator>
298  void qQuantileNormalizer::Partitioner::build(Iterator first, Iterator last, 
299                                               unsigned int N, 
300                                               utility::unweighted_iterator_tag)
301  {
302    utility::Vector vec(std::distance(first, last));
303    std::copy(first, last, vec.begin());
304    std::sort(vec.begin(), vec.end());
305    init(vec, N);
306  }
307
308
309  template<typename Iterator>
310  void qQuantileNormalizer::Partitioner::build(Iterator first, 
311                                               Iterator last, unsigned int N, 
312                                               utility::weighted_iterator_tag)
313  {
314    std::vector<utility::DataWeight> vec;
315    vec.reserve(std::distance(first, last));
316    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
317    std::copy(first, last, inserter);
318    std::sort(vec.begin(), vec.end());
319    init(vec, N);
320  }
321
322
323}}} // end of namespace normalizer, yat and thep
324
325#endif
Note: See TracBrowser for help on using the repository browser.