Changeset 1712


Ignore:
Timestamp:
Jan 13, 2009, 7:41:09 PM (12 years ago)
Author:
Jari Häkkinen
Message:

Addresses #425. Added linear extrapolation at ends. Added documentation. Cleanup of code.

Location:
trunk/yat/normalizer
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/yat/normalizer/qQuantileNormalizer.cc

    r1709 r1712  
    2525#include "yat/statistics/Averager.h"
    2626#include "yat/utility/Matrix.h"
    27 #include "yat/utility/VectorConstView.h"
     27#include "yat/utility/Vector.h"
     28#include "yat/utility/VectorBase.h"
    2829
    2930#include <algorithm>
     
    3536
    3637
    37   Partitioner::Partitioner(const utility::VectorConstView& vec,
     38  Partitioner::Partitioner(const utility::VectorBase& vec,
    3839                           unsigned int N)
    3940    : average_(utility::Vector(N)), index_(utility::Vector(N))
     
    7778
    7879
    79   qQuantileNormalizer::qQuantileNormalizer(const
    80                                            utility::VectorConstView& target,
     80  qQuantileNormalizer::qQuantileNormalizer(const utility::VectorBase& target,
    8181                                           unsigned int Q)
    8282    : target_(Partitioner(target,Q))
     
    102102      diff-=target_.averages();
    103103      const utility::Vector& idx=target_.index();
     104      regression::CSplineInterpolation cspline(idx,diff);
    104105
    105       // add linear interpolation for first part
    106       for (size_t row=0; row<idx(0); ++row) {
     106      // linear interpolation for first part, i.e., use first diff for
     107      // all points in the first part.
     108      size_t start=0;
     109      size_t end=idx(0);
     110      for (size_t row=start; row<end; ++row) {
    107111        size_t srow=sorted_index[column][row];
    108         result(srow,column) = matrix(srow,column);
     112        result(srow,column) = matrix(srow,column) + diff(0);
    109113      }
    110114
    111115      // cspline interpolation for all data between the first and last
    112116      // parts
    113       regression::CSplineInterpolation cspline(idx,diff);
    114       for (size_t row=idx(0); row<=idx(target_.size()-1); ++row) {
     117      start=idx(0);
     118      end=idx(target_.size()-1);
     119      for (size_t row=start; row<=end; ++row) {
    115120        size_t srow=sorted_index[column][row];
    116         result(srow,column) = ( matrix(srow,column) + cspline.evaluate(row) );
     121        result(srow,column) = matrix(srow,column) + cspline.evaluate(row) ;
    117122      }
    118123
    119       // add linear interpolation for last part
    120       for (size_t row=idx(target_.size()-1)+1; row<result.rows(); ++row) {
     124      // linear interpolation for last part, i.e., use last diff for
     125      // all points in the last part.
     126      start=idx(target_.size()-1)+1;
     127      end=result.rows();
     128      for (size_t row=start; row<end; ++row) {
    121129        size_t srow=sorted_index[column][row];
    122         result(srow,column) = matrix(srow,column);
     130        result(srow,column) = matrix(srow,column) + diff(diff.size()-1);
    123131      }
    124132    }
  • trunk/yat/normalizer/qQuantileNormalizer.h

    r1711 r1712  
    2727namespace utility {
    2828  class Matrix;
    29   class VectorConstView;
     29  class VectorBase;
    3030}
    3131namespace normalizer {
    3232
    3333  /**
    34      \brief Documentation please.
     34     \brief Partition a vector of data into equal sizes.
     35
     36     The class also calculates the average of each part and assigns
     37     the average to the mid point of each part. The midpoint is a
     38     double, i.e., it is not forced to be an integer index.
    3539  */
    3640  class Partitioner
     
    3842  public:
    3943    /**
    40        \brief Documentation please.
     44       \brief Create the partition and perform required calculations.
    4145    */
    42     Partitioner(const utility::VectorConstView& vec, unsigned int N);
     46    Partitioner(const utility::VectorBase& vec, unsigned int N);
    4347
    4448    /**
    45        \brief Documentation please.
     49       \brief Return the averages for each part.
     50
     51       \return The average vector.
    4652    */
    4753    const utility::Vector& averages(void) const;
    4854
    4955    /**
    50        \brief Documentation please.
     56       \brief Return the mid point for each partition.
     57
     58       \return The index vector.
    5159    */
    5260    const utility::Vector& index(void) const;
    5361
    5462    /**
    55        \brief The number of parts.
     63       \return The number of parts.
    5664    */
    5765    size_t size(void) const;
     
    6674     \brief Perform Q-quantile normalization
    6775
    68      After a Q-quantile normalization each column has the same
    69      distribution of data (the Q-quantiles are the same). Also, within
    70      each column the rank of an element is not changed.
     76     After a Q-quantile normalization each column has approximately
     77     the same distribution of data (the Q-quantiles are the
     78     same). Also, within each column the rank of an element is not
     79     changed.
    7180
    7281     There is currently no weighted version of qQuantileNormalizer
    7382
    7483     The normalization goes like this
    75 
    76      0. Data is not assumed to be sorted.
    77      
    78      1. Partition the target data in N+1 parts. The ends have half
    79      size of the "normal" part size ( = \#targetdata/N )
    80 
    81      2. Calculate the arithmetic mean for each part
    82 
    83      3. Do the same for the data to be tranformed (called source
    84      here).
    85 
    86      4. For each part, calculate the difference between the target and
    87      the source. Now we have N differences d_i.
    88 
    89      5. Create a cubic spline fit to this difference vector d. The
    90      resulting curve is used to recalculate all column values.
    91 
    92          I. For values in parts 1 through N-1 we use a cubic spline
    93          fit.
    94 
    95          II. For end parts 0 and N linear interpolation is used
    96 
    97     Linear interpolation simply means a translation.
     84     - Data is not assumed to be sorted.
     85     - Partition the target data in N parts.
     86     - Calculate the arithmetic mean for each part, the mean is
     87       assigned to the mid point of each part.
     88     - Do the same for the data to be transformed (called source
     89       here).
     90     - For each part, calculate the difference between the target and
     91       the source. Now we have N differences d_i with associated rank
     92       (midpoint of each part).
     93     - Create a cubic spline fit to this difference vector d. The
     94       resulting curve is used to recalculate all column values.
     95       - Use the cubic spline fit for values within the cubic spline
     96         fit range [midpoint 1st part, midpoint last part].
     97       - For data outside the cubic spline fit use linear
     98         extrapolation, i.e., a constant shift. d_first for points
     99         below fit range, and d_last for points above fit range.
    98100
    99101     \since New in yat 0.5
     
    111113       undefined. Keep \f$ N \f$ equal to or less than the smallest
    112114       number of data points in the target or each data set to be
    113        normalized with a ginven target.
     115       normalized against a given target.
    114116    */
    115     qQuantileNormalizer(const utility::VectorConstView& target,
    116                         unsigned int Q);
     117    qQuantileNormalizer(const utility::VectorBase& target, unsigned int Q);
    117118
    118119    /**
Note: See TracChangeset for help on using the changeset viewer.