source: trunk/yat/normalizer/qQuantileNormalizer.h @ 3571

Last change on this file since 3571 was 3571, checked in by Peter, 6 years ago

closes #875

The new version of sort_index (since yat 0.13) allows sorting ranges
of DataWeight, which behaves as desired; use that function rather than
copying the input range.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 13.8 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010, 2016 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "utility.h"
25
26#include "yat/regression/CSplineInterpolation.h"
27#include "yat/utility/concept_check.h"
28#include "yat/utility/DataIterator.h"
29#include "yat/utility/DataWeight.h"
30#include "yat/utility/Exception.h"
31#include "yat/utility/iterator_traits.h"
32#include "yat/utility/sort_index.h"
33#include "yat/utility/Vector.h"
34#include "yat/utility/WeightIterator.h"
35#include "yat/utility/yat_assert.h"
36
37#include <boost/concept_check.hpp>
38
39#include <algorithm>
40#include <cmath>
41#include <iterator>
42#include <limits>
43#include <numeric>
44#include <stdexcept>
45#include <vector>
46
47namespace theplu {
48namespace yat {
49namespace utility {
50  class VectorBase;
51}
52namespace normalizer {
53
54  /**
55     \brief Perform Q-quantile normalization
56
57     Perform a Q-quantile normalization on a \a source range, after
58     which it will approximately have the same distribution of data as
59     the \a target range (the Q-quantiles are the same). The rank of
60     an element in the \a source range is not changed.
61
62     The class works also with unweighed ranges, and there is no
63     restriction that weighted \a source range requires weighted \a
64     target range or vice versa.
65
66     Normalization goes like this:
67     - Data are not assumed to be sorted.
68     - Partition sorted \a target data in Q parts. Q must be 3 or larger
69       because of requirements from the underlying cubic spline fit
70     - Calculate the arithmetic (weighted) mean for each part, the mean is
71       assigned to the mid point of each part.
72     - Do the above for the data to be transformed (called \a source
73       here).
74     - For each part, calculate the difference between the \a target
75       and \a the source. Now we have \a Q differences \f$ d_i \f$
76       with associated rank (midpoint of each part).
77     - Create a cubic spline fit to this difference vector \a d. The
78       resulting curve is used to recalculate all values in \a source.
79       - Use the cubic spline fit for values within the cubic spline
80         fit range [midpoint 1st part, midpoint last part].
81       - For data outside the cubic spline fit use linear
82         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
83         points below fit range, and \f$ d_{last} \f$ for points above fit
84         range.
85
86     \since New in yat 0.5
87   */
88  class qQuantileNormalizer
89  {
90  public:
91    /**
92       \brief Constructor
93
94       Divides a sorted copy of range [\a first,\a last) into \a Q
95       parts. Parts are divided such that the sum of weights is
96       approximately the same in the different parts. If a relative
97       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
98       be difficult to achieve, in which case a an exception is
99       thrown. In the unweighted case this implies that \a Q should be
100       smaller (or equal) than number of elements in [\a first, \a
101       last).
102
103       The range supplied to the constructor sets the target
104       distribution.
105
106       As the \a source range is also divided into \a Q parts (when
107       operator() is called), it is recommended to keep \a Q smaller
108       (or equal) than the size of ranges that will be normalized.
109
110       Also, \a Q must not be smaller than 3 due to restrictions in
111       the cubic spline fit utilized in the normalization.
112
113       \b Type \b Requirements:
114       - \c ForwardIterator is a model of \forward_traversal_iterator
115       - \c ForwardIterator is a \ref concept_data_iterator
116    */
117    template<typename ForwardIterator>
118    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
119                        unsigned int Q);
120
121    /**
122       \brief Perform the Q-quantile normalization.
123
124       Elements in [\a first, \a last) are normalized as described
125       above and the result is assigned to [\a result, \a result + \a
126       last-\a first). Input range [\a first, \a last) is not
127       modified. If ranges are weighted, the weights are copied from
128       [\a first, \a last) to \a result range.
129
130       It is possible to normalize "in place"; it is permissible for
131       \a first and \a result to be the same. However, as assignment
132       occurs sequentially, the operation is undefined if \a result is
133       the same as any in range [\a first, \a last).
134
135       \b Type Requirements:
136       - \c RandomAccessIterator1 is a \ref concept_data_iterator
137       - \c RandomAccessIterator1 is a \random_access_traversal_iterator
138       - \c RandomAccessIterator2 is a \ref concept_data_iterator
139       - \c RandomAccessIterator2 is a \random_access_traversal_iterator
140       - \c RandomAccessIterator2 is a \writable_iterator
141
142     */
143    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
144    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
145                    RandomAccessIterator2 result) const;
146
147  private:
148
149  /**
150     \brief Partition a range of data into equal sizes.
151
152     Copy the range [first, last), sort the copy, and divide the
153     sorted copy in Q parts. The parts are created such that the total
154     weight in a part is approximately W/Q where W is the total weight
155     (over all parts). The class calculates the average value in each
156     part and also the "quantile".
157  */
158  class Partitioner
159  {
160  public:
161    /**
162       \brief Create the partition and perform required calculations.
163    */
164    template<typename ForwardIterator>
165    Partitioner(ForwardIterator first, ForwardIterator last,
166                unsigned int N);
167
168    /**
169       \brief Return the averages for each part.
170
171       \return The average vector.
172    */
173    const utility::Vector& averages(void) const;
174
175    /**
176       The quantile (here) is defined as (w_lower + w_upper) / 2W,
177       where w_lower is the total weight of elements smaller than the
178       smallest element in the part, and w_upper is the total weight
179       of elements smaller (or equal) than the largest value in the
180       part.
181
182       In the unweighted case all weights are 1.0, which implies q_0 =
183       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
184       is number of elements in ith part.
185
186       \return The quantiles vector.
187    */
188    const utility::Vector& quantiles(void) const;
189
190    /**
191       \return The number of parts.
192    */
193    size_t size(void) const;
194
195  private:
196    // unweighted "constructor"
197    template<typename ForwardIterator>
198    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
199               utility::unweighted_iterator_tag);
200    // weighted "constructor"
201    template<typename ForwardIterator>
202    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
203               utility::weighted_iterator_tag);
204    void init(const utility::VectorBase&, unsigned int N);
205    void init(const std::vector<utility::DataWeight>&, unsigned int N);
206
207    utility::Vector average_;
208    utility::Vector quantiles_;
209  };
210
211    Partitioner target_;
212
213    // unweighted version
214    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
215    void normalize(const Partitioner& source,RandomAccessIterator1 first,
216                   RandomAccessIterator1 last, RandomAccessIterator2 result,
217                   utility::unweighted_iterator_tag tag) const;
218
219    // weighted version
220    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
221    void normalize(const Partitioner& source,RandomAccessIterator1 first,
222                   RandomAccessIterator1 last, RandomAccessIterator2 result,
223                   utility::weighted_iterator_tag tag) const;
224  };
225
226
227  // template implementations
228
229  template<typename ForwardIterator>
230  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
231                                           ForwardIterator last,
232                                           unsigned int Q)
233    : target_(Partitioner(first, last, Q))
234  {
235    YAT_ASSERT(Q>2);
236  }
237
238
239  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
240  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
241                                       RandomAccessIterator1 last,
242                                       RandomAccessIterator2 result) const
243  {
244    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator1>));
245    using boost_concepts::RandomAccessTraversal;
246    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator1>));
247    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator2>));
248    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator2>));
249    using boost_concepts::WritableIterator;
250    BOOST_CONCEPT_ASSERT((WritableIterator<RandomAccessIterator2>));
251
252    Partitioner source(first, last, target_.size());
253    typename utility::weighted_if_any2<RandomAccessIterator1, RandomAccessIterator2>::type tag;
254    normalize(source, first, last, result, tag);
255  }
256
257
258  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
259  void
260  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
261                                 RandomAccessIterator1 first,
262                                 RandomAccessIterator1 last,
263                                 RandomAccessIterator2 result,
264                                 utility::unweighted_iterator_tag tag) const
265  {
266    utility::check_iterator_is_unweighted(first);
267    utility::check_iterator_is_unweighted(result);
268    size_t N = last-first;
269    YAT_ASSERT(N >= target_.size());
270
271    std::vector<size_t> sorted_index(last-first);
272    utility::sort_index(first, last, sorted_index);
273
274    utility::Vector diff(source.averages());
275    diff-=target_.averages();
276    const utility::Vector& idx=target_.quantiles();
277    regression::CSplineInterpolation cspline(idx,diff);
278
279    // linear extrapolation for first part, i.e., use first diff for
280    // all points in the first part.
281    size_t start=0;
282    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
283    // take care of limiting case number of parts approximately equal
284    // to the number of elements in range.
285    if (end==0)
286      ++end;
287    for (size_t i=start; i<end; ++i) {
288      size_t si = sorted_index[i];
289      result[si] = first[si] - diff(0);
290    }
291
292    using utility::yat_assert;
293
294    // cspline interpolation for all data between the mid points of
295    // the first and last part
296    start=end;
297    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
298    if (end>=N)
299      end = N-1;
300    for ( size_t i=start; i<end; ++i) {
301      size_t si = sorted_index[i];
302
303      YAT_ASSERT((i+0.5)/N>idx(0));
304      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
305    }
306
307    // linear extrapolation for last part, i.e., use last diff for
308    // all points in the last part.
309    for (size_t i=end ; i<N; ++i) {
310      size_t si = sorted_index[i];
311      result[si] = first[si] - diff(diff.size()-1);
312    }
313  }
314
315
316  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
317  void qQuantileNormalizer::normalize(const Partitioner& source,
318                                      RandomAccessIterator1 first,
319                                      RandomAccessIterator1 last,
320                                      RandomAccessIterator2 result,
321                                      utility::weighted_iterator_tag tag) const
322  {
323    // copy the weights
324    detail::copy_weight_if_weighted(first, last, result);
325
326    double total_w = std::accumulate(utility::weight_iterator(first),
327                                     utility::weight_iterator(last), 0.0);
328
329    std::vector<size_t> sorted_index(last-first);
330    utility::sort_index(first, last, sorted_index);
331
332    utility::Vector diff(source.averages());
333    diff-=target_.averages();
334    const utility::Vector& idx=target_.quantiles();
335    regression::CSplineInterpolation cspline(idx,diff);
336
337    double sum_w=0;
338    utility::iterator_traits<RandomAccessIterator1> traits1;
339    utility::iterator_traits<RandomAccessIterator2> traits2;
340    for (size_t i=0; i<sorted_index.size(); ++i) {
341      size_t si = sorted_index[i];
342      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
343      double correction = 0;
344      if (w <= idx(0)) {
345        // linear extrapolation for first part, i.e., use first diff for
346        // all points in the first part.
347        correction = diff(0);
348      }
349      else if (w < idx(idx.size()-1) ) {
350        // cspline interpolation for all data between the mid points of
351        // the first and last part
352        correction = cspline.evaluate(w);
353      }
354      else {
355        // linear extrapolation for last part, i.e., use last diff for
356        // all points in the last part.
357        correction = diff(diff.size()-1);
358      }
359      traits2.data(result+si) = traits1.data(first+si) - correction;
360      sum_w += traits1.weight(first+si);
361    }
362  }
363
364
365  template<typename ForwardIterator>
366  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
367                                                ForwardIterator last,
368                                                unsigned int N)
369    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
370  {
371    BOOST_CONCEPT_ASSERT((boost_concepts::ForwardTraversal<ForwardIterator>));
372    BOOST_CONCEPT_ASSERT((utility::DataIterator<ForwardIterator>));
373    build(first, last, N,
374          typename utility::weighted_iterator_traits<ForwardIterator>::type());
375  }
376
377
378  template<typename ForwardIterator>
379  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
380                                               ForwardIterator last,
381                                               unsigned int N,
382                                               utility::unweighted_iterator_tag)
383  {
384    utility::Vector vec(std::distance(first, last));
385    std::copy(first, last, vec.begin());
386    std::sort(vec.begin(), vec.end());
387    init(vec, N);
388  }
389
390
391  template<typename ForwardIterator>
392  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
393                                               ForwardIterator last,
394                                               unsigned int N,
395                                               utility::weighted_iterator_tag)
396  {
397    using namespace utility;
398    std::vector<DataWeight> vec(first, last);
399    std::sort(vec.begin(), vec.end());
400    init(vec, N);
401  }
402
403
404}}} // end of namespace normalizer, yat and thep
405
406#endif
Note: See TracBrowser for help on using the repository browser.