source: trunk/yat/normalizer/qQuantileNormalizer.h

Last change on this file was 3701, checked in by Peter, 4 years ago

avoid includes when not needed

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 14.0 KB
Line 
1#ifndef _theplu_yat_normalizer_qquantile_normalizer_
2#define _theplu_yat_normalizer_qquantile_normalizer_
3
4/*
5  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
6  Copyright (C) 2010, 2016, 2017 Peter Johansson
7
8  This file is part of the yat library, http://dev.thep.lu.se/yat
9
10  The yat library is free software; you can redistribute it and/or
11  modify it under the terms of the GNU General Public License as
12  published by the Free Software Foundation; either version 3 of the
13  License, or (at your option) any later version.
14
15  The yat library is distributed in the hope that it will be useful,
16  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  General Public License for more details.
19
20  You should have received a copy of the GNU General Public License
21  along with yat. If not, see <http://www.gnu.org/licenses/>.
22*/
23
24#include "utility.h"
25
26#include "yat/regression/CSplineInterpolation.h"
27#include "yat/utility/concept_check.h"
28#include "yat/utility/DataIterator.h"
29#include "yat/utility/DataWeight.h"
30#include "yat/utility/iterator_traits.h"
31#include "yat/utility/sort_index.h"
32#include "yat/utility/Vector.h"
33#include "yat/utility/WeightIterator.h"
34#include "yat/utility/yat_assert.h"
35
36#include <boost/concept_check.hpp>
37
38#include <algorithm>
39#include <cmath>
40#include <iterator>
41#include <limits>
42#include <numeric>
43#include <stdexcept>
44#include <vector>
45
46namespace theplu {
47namespace yat {
48namespace utility {
49  class VectorBase;
50}
51namespace normalizer {
52
  /**
     \brief Perform Q-quantile normalization

     Perform a Q-quantile normalization on a \a source range, after
     which it will approximately have the same distribution of data as
     the \a target range (the Q-quantiles are the same). The rank of
     an element in the \a source range is not changed.

     The class works also with unweighted ranges, and there is no
     restriction that weighted \a source range requires weighted \a
     target range or vice versa.

     Normalization goes like this:
     - Data are not assumed to be sorted.
     - Partition sorted \a target data in Q parts. Q must be 3 or larger
       because of requirements from the underlying cubic spline fit
     - Calculate the arithmetic (weighted) mean for each part, the mean is
       assigned to the mid point of each part.
     - Do the above for the data to be transformed (called \a source
       here).
     - For each part, calculate the difference between the \a target
       and the \a source. Now we have \a Q differences \f$ d_i \f$
       with associated rank (midpoint of each part).
     - Create a cubic spline fit to this difference vector \a d. The
       resulting curve is used to recalculate all values in \a source.
       - Use the cubic spline fit for values within the cubic spline
         fit range [midpoint 1st part, midpoint last part].
       - For data outside the cubic spline fit use linear
         extrapolation, i.e., a constant shift. \f$ d_{first} \f$ for
         points below fit range, and \f$ d_{last} \f$ for points above fit
         range.

     \since New in yat 0.5
   */
  class qQuantileNormalizer
  {
  public:
    /**
       \brief Constructor

       Divides a sorted copy of range [\a first,\a last) into \a Q
       parts. Parts are divided such that the sum of weights is
       approximately the same in the different parts. If a relative
       weight, \f$ w_i / \sum w_i \f$, is larger than 1/Q this might
       be difficult to achieve, in which case an exception is
       thrown. In the unweighted case this implies that \a Q should be
       smaller (or equal) than number of elements in [\a first, \a
       last).

       The range supplied to the constructor sets the target
       distribution.

       As the \a source range is also divided into \a Q parts (when
       operator() is called), it is recommended to keep \a Q smaller
       (or equal) than the size of ranges that will be normalized.

       Also, \a Q must not be smaller than 3 due to restrictions in
       the cubic spline fit utilized in the normalization.

       \b Type \b Requirements:
       - \c ForwardIterator is a model of \forward_traversal_iterator
       - \c ForwardIterator is a \ref concept_data_iterator
    */
    template<typename ForwardIterator>
    qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                        unsigned int Q);

    /**
       \brief Perform the Q-quantile normalization.

       Elements in [\a first, \a last) are normalized as described
       above and the result is assigned to [\a result, \a result + \a
       last-\a first). Input range [\a first, \a last) is not
       modified. If ranges are weighted, the weights are copied from
       [\a first, \a last) to \a result range.

       It is possible to normalize "in place"; it is permissible for
       \a first and \a result to be the same. However, as assignment
       occurs sequentially, the operation is undefined if \a result is
       the same as any in range [\a first, \a last).

       \b Type Requirements:
       - \c RandomAccessIterator1 is a \ref concept_data_iterator
       - \c RandomAccessIterator1 is a \random_access_traversal_iterator
       - \c RandomAccessIterator2 is a \ref concept_data_iterator
       - \c RandomAccessIterator2 is a \random_access_traversal_iterator
       - \c RandomAccessIterator2 is a \writable_iterator

     */
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                    RandomAccessIterator2 result) const;

  private:

  /**
     \brief Partition a range of data into equal sizes.

     Copy the range [first, last), sort the copy, and divide the
     sorted copy in Q parts. The parts are created such that the total
     weight in a part is approximately W/Q where W is the total weight
     (over all parts). The class calculates the average value in each
     part and also the "quantile".
  */
  class Partitioner
  {
  public:
    /**
       \brief Create the partition and perform required calculations.
    */
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /**
       \brief Return the averages for each part.

       \return The average vector.
    */
    const utility::Vector& averages(void) const;

    /**
       The quantile (here) is defined as (w_lower + w_upper) / 2W,
       where w_lower is the total weight of elements smaller than the
       smallest element in the part, and w_upper is the total weight
       of elements smaller (or equal) than the largest value in the
       part.

       In the unweighted case all weights are 1.0, which implies q_0 =
       n_0/N, q_1 = (n_0+n_1/2)/N, q_2 = (n_0+n_1+n_2/2)/N where n_i
       is number of elements in ith part.

       \return The quantiles vector.
    */
    const utility::Vector& quantiles(void) const;

    /**
       \return The number of parts.
    */
    size_t size(void) const;

  private:
    // unweighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor"
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    // compute average_ and quantiles_ from sorted (weighted) data
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;
    utility::Vector quantiles_;
  };

    // the target distribution, set once in the constructor
    Partitioner target_;

    // unweighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::unweighted_iterator_tag tag) const;

    // weighted version
    template<typename RandomAccessIterator1, typename RandomAccessIterator2>
    void normalize(const Partitioner& source,RandomAccessIterator1 first,
                   RandomAccessIterator1 last, RandomAccessIterator2 result,
                   utility::weighted_iterator_tag tag) const;
  };
224
225
226  // template implementations
227
228  template<typename ForwardIterator>
229  qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
230                                           ForwardIterator last,
231                                           unsigned int Q)
232    : target_(Partitioner(first, last, Q))
233  {
234    YAT_ASSERT(Q>2);
235  }
236
237
238  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
239  void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
240                                       RandomAccessIterator1 last,
241                                       RandomAccessIterator2 result) const
242  {
243    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator1>));
244    using boost_concepts::RandomAccessTraversal;
245    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator1>));
246    BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator2>));
247    BOOST_CONCEPT_ASSERT((RandomAccessTraversal<RandomAccessIterator2>));
248    using boost_concepts::WritableIterator;
249    BOOST_CONCEPT_ASSERT((WritableIterator<RandomAccessIterator2>));
250
251    Partitioner source(first, last, target_.size());
252    typename utility::weighted_iterator_traits<RandomAccessIterator1>::type tag;
253    normalize(source, first, last, result, tag);
254  }
255
256
  // Unweighted normalization: elements are corrected by the
  // target-vs-source difference evaluated at their mid-point rank.
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void
  qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                                 RandomAccessIterator1 first,
                                 RandomAccessIterator1 last,
                                 RandomAccessIterator2 result,
                                 utility::unweighted_iterator_tag tag) const
  {
    utility::check_iterator_is_unweighted(first);
    // copy the weights if needed
    detail::copy_weight_if_weighted(first, last, result);

    size_t N = last-first;
    YAT_ASSERT(N >= target_.size());

    // sorted_index[i] holds the offset (into [first, last)) of the
    // i:th smallest element; the input range itself is not reordered.
    std::vector<size_t> sorted_index(last-first);
    utility::sort_index(first, last, sorted_index);

    // diff(q) = source part average - target part average; idx holds
    // the quantile (mid-point rank) of each part.
    utility::Vector diff(source.averages());
    diff-=target_.averages();
    const utility::Vector& idx=target_.quantiles();
    regression::CSplineInterpolation cspline(idx,diff);

    // linear extrapolation for first part, i.e., use first diff for
    // all points in the first part.
    size_t start=0;
    // elements with mid-point rank (i+0.5)/N below idx(0) belong here
    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    // take care of limiting case number of parts approximately equal
    // to the number of elements in range.
    utility::iterator_traits<RandomAccessIterator1> traits1;
    utility::iterator_traits<RandomAccessIterator2> traits2;
    if (end==0)
      ++end;
    for (size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];
      traits2.data(result+si) = traits1.data(first+si) - diff(0);
    }

    using utility::yat_assert;

    // cspline interpolation for all data between the mid points of
    // the first and last part
    start=end;
    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
    if (end>=N)
      end = N-1;
    for ( size_t i=start; i<end; ++i) {
      size_t si = sorted_index[i];

      // mid-point rank must lie inside the spline's fit range
      YAT_ASSERT((i+0.5)/N>idx(0));
      traits2.data(result+si) =
        traits1.data(first+si) - cspline.evaluate((i+0.5)/N);
    }

    // linear extrapolation for last part, i.e., use last diff for
    // all points in the last part.
    for (size_t i=end ; i<N; ++i) {
      size_t si = sorted_index[i];
      traits2.data(result+si) = traits1.data(first+si) - diff(diff.size()-1);
    }
  }
318
319
320  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
321  void qQuantileNormalizer::normalize(const Partitioner& source,
322                                      RandomAccessIterator1 first,
323                                      RandomAccessIterator1 last,
324                                      RandomAccessIterator2 result,
325                                      utility::weighted_iterator_tag tag) const
326  {
327    // copy the weights
328    detail::copy_weight_if_weighted(first, last, result);
329
330    double total_w = std::accumulate(utility::weight_iterator(first),
331                                     utility::weight_iterator(last), 0.0);
332
333    std::vector<size_t> sorted_index(last-first);
334    utility::sort_index(first, last, sorted_index);
335
336    utility::Vector diff(source.averages());
337    diff-=target_.averages();
338    const utility::Vector& idx=target_.quantiles();
339    regression::CSplineInterpolation cspline(idx,diff);
340
341    double sum_w=0;
342    utility::iterator_traits<RandomAccessIterator1> traits1;
343    utility::iterator_traits<RandomAccessIterator2> traits2;
344    for (size_t i=0; i<sorted_index.size(); ++i) {
345      size_t si = sorted_index[i];
346      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
347      double correction = 0;
348      if (w <= idx(0)) {
349        // linear extrapolation for first part, i.e., use first diff for
350        // all points in the first part.
351        correction = diff(0);
352      }
353      else if (w < idx(idx.size()-1) ) {
354        // cspline interpolation for all data between the mid points of
355        // the first and last part
356        correction = cspline.evaluate(w);
357      }
358      else {
359        // linear extrapolation for last part, i.e., use last diff for
360        // all points in the last part.
361        correction = diff(diff.size()-1);
362      }
363      traits2.data(result+si) = traits1.data(first+si) - correction;
364      sum_w += traits1.weight(first+si);
365    }
366  }
367
368
369  template<typename ForwardIterator>
370  qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
371                                                ForwardIterator last,
372                                                unsigned int N)
373    : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
374  {
375    BOOST_CONCEPT_ASSERT((boost_concepts::ForwardTraversal<ForwardIterator>));
376    BOOST_CONCEPT_ASSERT((utility::DataIterator<ForwardIterator>));
377    build(first, last, N,
378          typename utility::weighted_iterator_traits<ForwardIterator>::type());
379  }
380
381
382  template<typename ForwardIterator>
383  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
384                                               ForwardIterator last,
385                                               unsigned int N,
386                                               utility::unweighted_iterator_tag)
387  {
388    utility::Vector vec(std::distance(first, last));
389    std::copy(first, last, vec.begin());
390    std::sort(vec.begin(), vec.end());
391    init(vec, N);
392  }
393
394
395  template<typename ForwardIterator>
396  void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
397                                               ForwardIterator last,
398                                               unsigned int N,
399                                               utility::weighted_iterator_tag)
400  {
401    using namespace utility;
402    std::vector<DataWeight> vec(first, last);
403    std::sort(vec.begin(), vec.end());
404    init(vec, N);
405  }
406
407
408}}} // end of namespace normalizer, yat and thep
409
410#endif
Note: See TracBrowser for help on using the repository browser.