source: trunk/yat/normalizer/qQuantileNormalizer.h @ 1814

Last change on this file since 1814 was 1814, checked in by Peter, 13 years ago

Finalized dox for qQuantileNormalizer. Closes #478

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 13.7 KB
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/sort_index.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <algorithm>
#include <cmath>
#include <iterator>
#include <numeric>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  class VectorBase;
}
namespace normalizer {

  /**
     \brief Perform Q-quantile normalization

     Perform a Q-quantile normalization on a \a source range, after
     which it will approximately have the same distribution of data as
     the \a target range (the Q-quantiles are the same). The rank of
     an element in the \a source range is not changed.

     The class also works with unweighted ranges, and a weighted \a
     source range does not require a weighted \a target range, or
     vice versa.

     Normalization goes like this:
     - Data are not assumed to be sorted.
     - Partition the sorted \a target data into N parts. N must be 3 or
       larger because of requirements from the underlying cspline fit.
     - Calculate the arithmetic (weighted) mean for each part; the mean
       is assigned to the midpoint of each part.
     - Do the same for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a target
       and the \a source. Now we have N differences \f$ d_i \f$, each
       with an associated rank (the midpoint of its part).
     - Create a cubic spline fit to this difference vector \a d. The
       resulting curve is used to recalculate all values in \a source.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit range use linear
         extrapolation, i.e., a constant shift: \f$ d_{first} \f$ for
         points below the fit range, and \f$ d_{last} \f$ for points
         above the fit range.

     \since New in yat 0.5
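
     \par Example
     A minimal usage sketch; the container sizes and the choice of
     Q = 10 are illustrative only:
     \code
     std::vector<double> target(1000);   // reference distribution
     std::vector<double> source(1000);   // data to be normalized
     // ... fill target and source ...
     std::vector<double> result(source.size());
     qQuantileNormalizer qqn(target.begin(), target.end(), 10u);
     qqn(source.begin(), source.end(), result.begin());
     \endcode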
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       Divides a sorted copy of the range [\a first, \a last) into \a Q
       parts. Parts are divided such that the sum of weights is
       approximately the same in the different parts. If a relative
       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
       be difficult to achieve, in which case an exception is
       thrown. In the unweighted case this implies that \a Q should be
       smaller than (or equal to) the number of elements in
       [\a first, \a last).
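
       For example (the weights are illustrative only): with weights
       5, 1, 1, 1, 1 the largest relative weight is 5/9, which exceeds
       1/Q for \a Q = 3, so the range cannot be split into three parts
       of approximately equal weight and an exception is thrown.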

       As the \a source range is also divided into \a Q parts (in
       operator()), it is recommended to keep \a Q smaller than (or
       equal to) the size of the ranges that will be normalized.

       Also, \a Q must not be smaller than 3 due to restrictions in
       the cspline fit utilized in the normalization.

       \tparam ForwardIterator is a model of \forward_iterator, and is
       either a \ref concept_weighted_iterator or unweighted with
       value type convertible to \c double.

    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized as described
       above and the result is assigned to [\a result, \a result +
       (\a last - \a first)). The input range [\a first, \a last) is
       not modified. If the ranges are weighted, the weights are
       copied from [\a first, \a last) to the \a result range.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same. However, as assignment
       occurs sequentially, the behavior is undefined if \a result is
       the same as any iterator in the open range (\a first, \a last).

       \tparam RandomAccessIterator1 is a model of
       \random_access_iterator, and is either a \ref
       concept_weighted_iterator or unweighted with value type
       convertible to \c double.

       \tparam RandomAccessIterator2 is a model of
       \random_access_iterator and its value type is convertible from
       \c double. \a RandomAccessIterator2 is mutable.

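       \par Example
       A sketch of in-place normalization, assuming \c data is a
       \c std::vector<double> and \c qqn a qQuantileNormalizer:
       \code
       qqn(data.begin(), data.end(), data.begin());
       \endcode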
     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into parts of approximately
     equal weight.

     Copy the range [first, last), sort the copy, and divide the
     sorted copy into N parts. The parts are created such that the
     total weight in a part is approximately W/N, where W is the total
     weight (over all parts).

     For each part the class calculates the (weighted) average and the
     "quantile", and assigns the average to the midpoint of the
     part. The midpoint is a double, i.e., it is not forced to be an
     integer index.
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       The quantile (here) is defined as (w_lower + w_upper) / 2W,
       where w_lower is the total weight of elements smaller than the
       smallest element in the part, w_upper is the total weight of
       elements smaller than (or equal to) the largest element in the
       part, and W is the total weight.

       In the unweighted case all weights are 1.0, which implies
       q_0 = (n_0/2)/W, q_1 = (n_0 + n_1/2)/W, and
       q_2 = (n_0 + n_1 + n_2/2)/W, where n_i is the number of
       elements in the i:th part and W the total number of elements.

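       As an illustration (the numbers are chosen only for the
       example): 10 unweighted elements divided into parts of sizes
       3, 3, and 4 give q_0 = 1.5/10 = 0.15, q_1 = (3 + 1.5)/10 = 0.45,
       and q_2 = (3 + 3 + 2)/10 = 0.80.
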
       \return The quantiles vector.
    */
    const utility::Vector& quantiles(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector quantiles_;
  };

    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source, RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };


  // template implementations

  template<typename ForwardIterator>
  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
                                           ForwardIterator last,
                                           unsigned int Q)
    : target_(Partitioner(first, last, Q))
  {
    utility::yat_assert<std::runtime_error>(Q>2,
                                            "qQuantileNormalizer: Q too small");
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                       RandomAccessIterator1 last,
                                       RandomAccessIterator2 result) const
  {
    Partitioner source(first, last, target_.size());
    typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
    normalize(source, first, last, result, tag);
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first,
                                 RandomAccessIterator1 last,
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    utility::check_iterator_is_unweighted(result);
    size_t N = last-first;
    utility::yat_assert<std::runtime_error>
      (N >= target_.size(), "qQuantileNormalizer: Input range too small");

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of the limiting case where the number of parts is
    // approximately equal to the number of elements in the range.
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(0);
    }

    using utility::yat_assert;

    // cspline interpolation for all data between the mid points of
    // the first and last part
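    // (an element with sorted rank i is mapped to the quantile
    // (i+0.5)/N, consistent with the Partitioner's quantiles in the
    // unweighted case)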
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];

      yat_assert<std::runtime_error>((i+0.5)/N>idx(0),
                               "qQuantileNormalizer: invalid input to cspline");
      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    }

    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end; i<N; ++i) {
      size_t si = sorted_index[i];
      result[si] = first[si] - diff(diff.size()-1);
    }
  }


  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void qQuantileNormalizer::normalize(const Partitioner& source,
                                      RandomAccessIterator1 first,
                                      RandomAccessIterator1 last,
                                      RandomAccessIterator2 result,
                                      utility::weighted_iterator_tag tag) const
  {
    // copy the weights
    std::copy(utility::weight_iterator(first),
              utility::weight_iterator(last),
              utility::weight_iterator(result));

    double total_w = std::accumulate(utility::weight_iterator(first),
                                     utility::weight_iterator(last), 0.0);

    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(utility::data_iterator(first),
                        utility::data_iterator(last), sorted_index);

    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    double sum_w=0;
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
    for (size_t i=0; i<sorted_index.size(); ++i) {
      size_t si = sorted_index[i];
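      // quantile of this element: accumulated weight below it plus
      // half of its own weight, divided by the total weight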
      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
      double correction = 0;
      if (w <= idx(0)) {
        // linear extrapolation for first part, i.e., use first diff for
        // all points in the first part.
        correction = diff(0);
      }
      else if (w < idx(idx.size()-1)) {
        // cspline interpolation for all data between the mid points of
        // the first and last part
        correction = cspline.evaluate(w);
      }
      else {
        // linear extrapolation for last part, i.e., use last diff for
        // all points in the last part.
        correction = diff(diff.size()-1);
      }
      traits2.data(result+si) = traits1.data(first+si) - correction;
      sum_w += traits1.weight(first+si);
    }
  }


  template<typename ForwardIterator>
  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
                                                ForwardIterator last,
                                                unsigned int N)
    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
  {
    build(first, last, N,
          typename utility::weighted_iterator_traits<ForwardIterator>::type());
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::unweighted_iterator_tag)
  {
    utility::Vector vec(std::distance(first, last));
    std::copy(first, last, vec.begin());
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


  template<typename ForwardIterator>
  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                               ForwardIterator last,
                                               unsigned int N,
                                               utility::weighted_iterator_tag)
  {
    std::vector<utility::DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<utility::DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    std::sort(vec.begin(), vec.end());
    init(vec, N);
  }


}}} // end of namespace normalizer, yat and theplu

#endif