Changeset 1778


Ignore:
Timestamp:
Feb 6, 2009, 2:28:33 PM (11 years ago)
Author:
Peter
Message:

refs #478. Fixed the tests for weighted qQN so they now pass. To align
weighted and unweighted, I needed to change the definition of
index. The weighted index is more natural as the sum of weights with
value less than x plus half of the weight for x. The half-of-its-own-
weight part makes it quite symmetric. Anyway, for unity weights it
becomes, e.g., for a vector of size 4: 0.5, 1.5, 2.5, 3.5. Obviously
this doesn't change anything in the behavior since we are just adding
0.5 to all indices.

Also I decided to re-scale the index inside the Partitioner, so the
indices are now within the range (0, 1).

The weighted version of normalize had to be implemented separately and
could not reuse the unweighted version, because the weights come into
play in this step as well.

I want to test with a weighted range as target as well, and when that
works I think we can close this ticket.

Location:
trunk
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/test/normalization_test.cc

    r1775 r1778  
    193193 
    194194  using utility::DataWeight;
    195   // test with unweighted target and weighted source
     195  suite.err() << "Testing with unweighted target and weighted source\n";
    196196  std::vector<utility::DataWeight> src_w(source.size(), DataWeight(0, 1));
    197197  std::copy(source.begin(), source.end(),
     
    200200  std::vector<utility::DataWeight> result_w(src_w.size());
    201201  qQN(src_w.begin(), src_w.end(), result_w.begin());
    202   // FIXME: this test fails
    203   suite.xadd(suite.equal_range(result.begin(), result.end(),
     202  suite.add(suite.equal_range(result.begin(), result.end(),
    204203                              utility::data_iterator(result_w.begin())));
    205204
     205  suite.err() << "Testing with missing value in source\n";
    206206  // adding a missing value
    207207  src_w.insert(src_w.begin(), DataWeight(5.2, 0.0));
    208   result_w.resize(src_w.size());
    209   qQN(src_w.begin(), src_w.end(), result_w.begin());
     208  std::vector<utility::DataWeight> result_w2(src_w.size());
     209  qQN(src_w.begin(), src_w.end(), result_w2.begin());
    210210  // excluding missing value (result_w[0])
    211   suite.xadd(suite.equal_range(result.begin(), result.end(),
    212                               ++utility::data_iterator(result_w.begin())));
     211  suite.add(suite.equal_range(utility::data_iterator(result_w.begin()),
     212                               utility::data_iterator(result_w.end()),
     213                               ++utility::data_iterator(result_w2.begin())));
    213214}
    214215
  • trunk/yat/normalizer/qQuantileNormalizer.cc

    r1739 r1778  
    5959        av.add(sortedvec(r));
    6060      average_(i) = av.mean();
    61       index_(i)   = 0.5*(end+start-1);
     61      index_(i)   = 0.5*(end+start);
    6262      start=end;
    6363    }
     64    // rescale index to be in range (0,1)
     65    index_ *= 1.0/sortedvec.size();
    6466  }
    6567
     
    8183      double end_sum_w = (i+1) * total_w / N - sum_w;
    8284      if (i!=N-1) {
    83         while(av.sum_w() < end_sum_w) {
     85        while(av.sum_w() + iter->weight() <= end_sum_w) {
    8486          av.add(iter->data(), iter->weight());
    8587          ++iter;
     
    98100      }
    99101      average_(i) = av.mean();
    100       index_(i)   = sum_w + 0.5*av.sum_w();
     102      index_(i)   = (sum_w + 0.5*av.sum_w())/total_w;
    101103      sum_w += av.sum_w();
    102104    }
  • trunk/yat/normalizer/qQuantileNormalizer.h

    r1777 r1778  
    3333#include <cmath>
    3434#include <iterator>
     35#include <numeric>
    3536#include <stdexcept>
    3637#include <vector>
     
    156157  };
    157158
    158 
    159     size_t N_target_;
    160159    Partitioner target_;
    161160
     
    181180                                           BidirectionalIterator last,
    182181                                           unsigned int Q)
    183     : N_target_(std::distance(first, last)),
    184       target_(Partitioner(first, last, Q))
    185   {
    186     utility::yat_assert<std::runtime_error>(Q>2,
     182    : target_(Partitioner(first, last, Q))
     183  {
     184    utility::yat_assert<std::runtime_error>(Q>2,
    187185                                            "qQuantileNormalizer: Q too small");
    188186  }
     
    219217    utility::Vector diff(source.averages());
    220218    diff-=target_.averages();
    221     utility::Vector idx=target_.index();
    222     // if N_target_ is different from N_source_ we need to rescale the
    223     // idx so they stretch over same range
    224     idx *= static_cast<double>(N)/static_cast<double>(N_target_);
    225 
     219    const utility::Vector& idx=target_.index();
    226220    regression::CSplineInterpolation cspline(idx,diff);
    227221
     
    229223    // all points in the first part.
    230224    size_t start=0;
    231     size_t end=static_cast<unsigned int>(std::ceil(idx(0)));
     225    size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
    232226    // take care of limiting case number of parts approximately equal
    233227    // to the number of elements in range.
    234228    if (end==0)
    235229      ++end;
    236     for (size_t row=start; row<end; ++row) {
    237       size_t srow=sorted_index[row];
    238       result[srow] = first[srow] - diff(0);
     230    for (size_t i=start; i<end; ++i) {
     231      size_t si = sorted_index[i];
     232      result[si] = first[si] - diff(0);
    239233    }
     234
     235    using utility::yat_assert;
    240236   
    241237    // cspline interpolation for all data between the mid points of
    242238    // the first and last part
    243239    start=end;
    244     end=static_cast<unsigned int>(idx(target_.size()-1));
    245     // take care of limiting case number of parts approximately
    246     // equal to the number of matrix rows
    247     if (end==(static_cast<size_t>(N-1)) )
    248       --end;
    249     for (size_t row=start; row<=end; ++row) {
    250       size_t srow=sorted_index[row];
    251       result[srow] = first[srow] - cspline.evaluate(row) ;
     240    end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
     241    if (end>=N)
     242      end = N-1;
     243    for ( size_t i=start; i<end; ++i) {
     244      size_t si = sorted_index[i];
     245     
     246      yat_assert<std::runtime_error>((i+0.5)/N>idx(0),
     247                               "qQuantileNormalizer: invalid input to cspline");
     248      result[si] = first[si] - cspline.evaluate((i+0.5)/N);
    252249    }
    253250   
    254251    // linear extrapolation for last part, i.e., use last diff for
    255252    // all points in the last part.
    256     start=end+1;
    257     end=N;
    258     for (size_t row=start; row<end; ++row) {
    259       size_t srow=sorted_index[row];
    260       result[srow] = first[srow] - diff(diff.size()-1);
     253    for (size_t i=end ; i<N; ++i) {
     254      size_t si = sorted_index[i];
     255      result[si] = first[si] - diff(diff.size()-1);
    261256    }
    262257  }
     
    274269              utility::weight_iterator(last),
    275270              utility::weight_iterator(result));
    276     // apply algorithm on data part of range
    277     normalize(source, utility::data_iterator(first),
    278               utility::data_iterator(last),
    279               utility::data_iterator(result),
    280               utility::unweighted_iterator_tag());
     271
     272    double total_w = std::accumulate(utility::weight_iterator(first),
     273                                     utility::weight_iterator(last), 0.0);
     274
     275    std::vector<size_t> sorted_index(last-first);
     276    utility::sort_index(utility::data_iterator(first),
     277                        utility::data_iterator(last), sorted_index);
     278
     279    utility::Vector diff(source.averages());
     280    diff-=target_.averages();
     281    const utility::Vector& idx=target_.index();
     282    regression::CSplineInterpolation cspline(idx,diff);
     283
     284    double sum_w=0;
     285    utility::iterator_traits<RandomAccessIterator2> traits1;
     286    utility::iterator_traits<RandomAccessIterator2> traits2;
     287    for (size_t i=0; i<sorted_index.size(); ++i) {
     288      size_t si = sorted_index[i];
     289      double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
     290      double correction = 0;
     291      if (w <= idx(0)) {
     292        // linear extrapolation for first part, i.e., use first diff for
     293        // all points in the first part.
     294        correction = diff(0);
     295      }
     296      else if (w < idx(idx.size()-1) ) {
     297        // cspline interpolation for all data between the mid points of
     298        // the first and last part
     299        correction = cspline.evaluate(w);
     300      }
     301      else {
     302        // linear extrapolation for last part, i.e., use last diff for
     303        // all points in the last part.
     304        correction = diff(diff.size()-1);
     305      }
     306      traits2.data(result+si) = traits1.data(first+si) - correction;
     307      sum_w += traits1.weight(first+si);
     308    }
    281309  }
    282310
Note: See TracChangeset for help on using the changeset viewer.