Changeset 4446


Ignore:
Timestamp:
Mar 13, 2013, 3:45:40 PM (10 years ago)
Author:
marianne
Message:

Refs #774. Tolerances for extracting matching features are calculated from the set of matching sequences.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/plugin/src/org/proteios/plugins/FeatureSequencePropagator.java

    r4442 r4446  
    3636import java.util.Comparator;
    3737import java.util.Date;
    38 import java.util.Iterator;
    3938import java.util.List;
    4039import java.util.ListIterator;
     
    4746import org.apache.commons.collections.ListUtils;
    4847import org.apache.commons.collections.functors.NotNullPredicate;
    49 import org.apache.commons.lang.ArrayUtils;
    5048import org.apache.commons.math.ArgumentOutsideDomainException;
    5149import org.apache.commons.math.MathException;
     
    545543           
    546544            if (totNbrMatches < MIN_NBR_MATCHING_SEQ) {
    547 
    548               double mzForPointMatchByTol = 0.01;
    549               double rtForPointMatchByTol = 5;
    550 
    551               //TODO:fill up with tolerances matches, check for duplicates when finding tolerance matches instead of clearing the points list
    552               points.clear();
    553 
     545             
    554546              writer.println("There are too few common sequences for file "
    555547                  + (fileNbr + 1)
    556548                  + " and "
    557                   + (secFileNbr + 1) + ".");
    558               writer.println("Alignment will be performed by using tolerances: " +mzForPointMatchByTol +" Da and " +rtForPointMatchByTol +" min.");
     549                  + (secFileNbr + 1) + ". Adding landmarks by tolerances.");
    559550              log.debug("There are too few common sequences for file "
    560551                  + (fileNbr + 1)
    561552                  + " and "
    562                   + (secFileNbr + 1) + ".");
    563               log.debug("Alignment will be performed by using tolerances: " +mzForPointMatchByTol +" Da and " +rtForPointMatchByTol +" min.");
     553                  + (secFileNbr + 1) + ". Adding landmarks by tolerances.");
     554             
     555             
     556              double mzForPointMatchByTol = 0.01;
     557              double rtForPointMatchByTol = 10;
     558             
     559             
     560              if (totNbrMatches >= 20){
     561               
     562                writer.println("Estimating tolerances from sequences.");
     563                log.debug("Estimating tolerances from sequences.");
     564               
     565                int quartileFactor = 10;
     566                double[] initialMzRTVec = getInitialMzAndRT(points, quartileFactor, writer);
     567               
     568                mzForPointMatchByTol = initialMzRTVec[0];
     569                rtForPointMatchByTol = initialMzRTVec[1];
     570               
     571              }
     572
     573              points.clear();
     574
     575             
     576              writer.println("Feature pairs will be extracted by using tolerances: " +mzForPointMatchByTol +" Da and " +rtForPointMatchByTol +" min.");
     577             
     578              log.debug("Feature pairs will be extracted by using tolerances: " +mzForPointMatchByTol +" Da and " +rtForPointMatchByTol +" min.");
    564579
    565580              simValue = getPointMatchByTol(points,
     
    573588              simTemp = simValue;
    574589
    575               //TODO: integrate simValue for using both sequences and tolerances
    576590              simValue = getPointMatchByTol(points,
    577591                  uniqueMsFiles.get
     
    596610              if (totNbrMatches < MIN_NBR_MATCHING_SEQ) {
    597611
    598                 writer.println("There are still too few common sequences for file "
     612                writer.println("There are still too few common features for file "
    599613                    + (fileNbr + 1)
    600614                    + " and "
    601615                    + (secFileNbr + 1) + ".");
    602616                writer.println("No alignment will be performed for this file-pair.");
    603                 log.debug("There are still too few common sequences for file "
     617                log.debug("There are still too few common features for file "
    604618                    + (fileNbr + 1)
    605619                    + " and "
     
    614628            // considered an outlier
    615629            int quartileFactor = 10;
    616             int nbrOfOutliers = 0;
    617 
    618             Collections.sort(points, new DiffComparator());
    619 
    620             double lowerQuartile = points.get(
    621                 (int) Math.round(0.25 * points.size()))
    622                 .getDiff();
    623             double upperQuartile = points.get(
    624                 (int) Math.round(0.75 * points.size()))
    625                 .getDiff();
    626             double medianQuartile = points.get(
    627                 (int) Math.round(0.5 * points.size()))
    628                 .getDiff();
    629             double interQuartileRange = upperQuartile
    630                 - lowerQuartile;
    631             double upperFence = upperQuartile + quartileFactor
    632                 * interQuartileRange;
    633 
    634             writer.println("Median RT difference before alignment: "
    635                 + medianQuartile);
    636             writer.println("Largest RT difference before alignment: "
    637                 + points.get(points.size() - 1).getDiff());
    638 
    639             int maxCutOff = points.size() - 1;
    640             while (points.get(maxCutOff).getDiff() > upperFence) {
    641               points.remove(maxCutOff);
    642               maxCutOff--;
    643               nbrOfOutliers++;
    644             }
    645 
    646             writer.println("Removing "
    647                 + nbrOfOutliers
    648                 + " outliers. Total number of sequence matches is now: "
    649                 + points.size());
    650 
    651             medianQuartile = points.get(
    652                 (int) Math.round(0.5 * points.size()))
    653                 .getDiff();
    654 
    655             writer.println("Median RT difference is now: "
    656                 + medianQuartile);
    657             writer.println("Largest RT difference is now: "
    658                 + points.get(points.size() - 1).getDiff());
    659 
    660             Collections.sort(points, new MzDiffComparator());
    661             double medianQuartileMz = points.get(
    662                 (int) Math.round(0.5 * points.size()))
    663                 .getMzDiff();
    664             writer.println("Median mz difference before alignment: "
    665                 + medianQuartileMz);
    666             writer.println("Largest mz difference before alignment: "
    667                 + points.get(points.size() - 1).getMzDiff());
    668 
    669             // the mz tolrance is set as the largest (rounded) mz
    670             double mzTol = points.get(points.size() - 1)
    671                 .getMzDiff();
    672             mzTol = Math.round(100000 * mzTol + 1)
    673                 / (double) 100000;
    674 
    675             log.debug("Mz tol is: " + mzTol);
    676 
    677             Collections.sort(points, new RTComparator());
    678 
     630           
     631            double[] initialMzRTVec = getInitialMzAndRT(points, quartileFactor, writer);
     632            double mzTol = initialMzRTVec[0];
     633           
     634           
    679635            if (simValue == 0) {
    680636              simValue = points.size() * 2.0
     
    14711427    }
    14721428
     1429  }
     1430 
     1431  private double[] getInitialMzAndRT(ArrayList<Point> points, int quartileFactor, PrintWriter writer){
             // Derives an mz tolerance and an RT tolerance from the supplied points.
             //
             // Returns a two-element array: [0] is the largest remaining getMzDiff() and
             // [1] is the largest remaining getDiff(); both are rounded to five decimals
             // after being increased by 1e-5.
             //
             // Side effects: points whose getDiff() exceeds the upper fence are removed
             // from the caller's list, summary statistics are printed to writer, and the
             // list is left sorted with RTComparator.
             //
             // NOTE(review): the quartile lookups use Math.round(q * points.size()) as an
             // index, so a list with fewer than three points throws
             // IndexOutOfBoundsException. The visible call sites appear to guard on a
             // minimum number of matches -- confirm that this always bounds points.size().
     1432   
     1433    double[] initialMzRTVec = new double[2];
     1434   
     1435    int nbrOfOutliers = 0;
     1436
             // Order by getDiff() so the quartiles can be read by position.
             // Presumably DiffComparator sorts ascending -- comparator not shown, confirm.
     1437    Collections.sort(points, new DiffComparator());
     1438
     1439    double lowerQuartile = points.get(
     1440        (int) Math.round(0.25 * points.size()))
     1441        .getDiff();
     1442    double upperQuartile = points.get(
     1443        (int) Math.round(0.75 * points.size()))
     1444        .getDiff();
     1445    double medianQuartile = points.get(
     1446        (int) Math.round(0.5 * points.size()))
     1447        .getDiff();
     1448    double interQuartileRange = upperQuartile
     1449        - lowerQuartile;
             // Outlier threshold: upper quartile plus quartileFactor inter-quartile ranges.
             // Only an upper fence is computed; small diffs are never removed.
     1450    double upperFence = upperQuartile + quartileFactor
     1451        * interQuartileRange;
     1452
     1453    writer.println("Median RT difference before alignment: "
     1454        + medianQuartile);
     1455    writer.println("Largest RT difference before alignment: "
     1456        + points.get(points.size() - 1).getDiff());
     1457
             // Strip points from the end of the list while they lie above the fence.
             // NOTE(review): maxCutOff has no explicit lower bound; termination relies on
             // the element at the upper-quartile index not exceeding upperFence, which
             // holds if the list is ascending by getDiff() and quartileFactor >= 0 -- confirm.
     1458    int maxCutOff = points.size() - 1;
     1459    while (points.get(maxCutOff).getDiff() > upperFence) {
     1460      points.remove(maxCutOff);
     1461      maxCutOff--;
     1462      nbrOfOutliers++;
     1463    }
     1464
     1465    writer.println("Removing "
     1466        + nbrOfOutliers
     1467        + " outliers. Total number of sequence matches is now: "
     1468        + points.size());
     1469
             // Recompute the median on the trimmed list for the report below.
     1470    medianQuartile = points.get(
     1471        (int) Math.round(0.5 * points.size()))
     1472        .getDiff();
     1473
     1474    writer.println("Median RT difference is now: "
     1475        + medianQuartile);
     1476   
     1477    double largestRTDiff = points.get(points.size() - 1).getDiff();
     1478   
     1479   
     1480    writer.println("Largest RT difference is now: "
     1481        +largestRTDiff );
     1482   
             // Round to five decimals after adding one unit in the fifth decimal (1e-5).
     1483    largestRTDiff = Math.round(100000 * largestRTDiff + 1)
     1484        / (double) 100000;
     1485   
             // Re-sort by MzDiffComparator; the outliers removed above stay removed, so the
             // mz statistics below are taken from the trimmed list.
     1486    Collections.sort(points, new MzDiffComparator());
     1487    double medianQuartileMz = points.get(
     1488        (int) Math.round(0.5 * points.size()))
     1489        .getMzDiff();
     1490    writer.println("Median mz difference before alignment: "
     1491        + medianQuartileMz);
     1492    writer.println("Largest mz difference before alignment: "
     1493        + points.get(points.size() - 1).getMzDiff());
     1494
     1495    // the mz tolerance is set as the largest mz difference, rounded (see below)
     1496    double mzTol = points.get(points.size() - 1)
     1497        .getMzDiff();
             // Same rounding as for largestRTDiff: five decimals, plus 1e-5.
     1498    mzTol = Math.round(100000 * mzTol + 1)
     1499        / (double) 100000;
     1500
     1501    log.debug("Mz tol is: " + mzTol);
     1502   
     1503    initialMzRTVec[0]=mzTol;
     1504    initialMzRTVec[1]=largestRTDiff;
     1505
             // Leave the list ordered by RTComparator before returning it to the caller.
     1506    Collections.sort(points, new RTComparator());
     1507   
     1508    return initialMzRTVec;
    14731509  }
    14741510 
Note: See TracChangeset for help on using the changeset viewer.