Changeset 4091


Ignore:
Timestamp:
Jan 18, 2008, 2:21:13 PM (14 years ago)
Author:
Johan Enell
Message:

Merged from 2.5.1

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/plugins/core/net/sf/basedb/plugins/IlluminaRawDataImporter.java

    r4083 r4091  
    3838
    3939
    40 import net.sf.basedb.core.ArrayDesign;
    4140import net.sf.basedb.core.BaseException;
    4241import net.sf.basedb.core.BasicItem;
     
    4443import net.sf.basedb.core.DbControl;
    4544import net.sf.basedb.core.Experiment;
    46 import net.sf.basedb.core.FeatureIdentificationMethod;
    4745import net.sf.basedb.core.File;
    4846import net.sf.basedb.core.Include;
     
    8886import net.sf.basedb.util.parser.FlatFileParser.Data;
    8987import net.sf.basedb.util.parser.FlatFileParser.Line;
     88import net.sf.basedb.util.parser.FlatFileParser.LineType;
    9089
    9190/**
     
    9493  to an experiment.
    9594  <p>
    96   Since the data files don't have any coordinate information the importer will
    97   use one of the following methods.
    98  
    99   <ul>
    100   <li>
    101     If no array design has been selected or if the array design is using the
    102     {@link FeatureIdentificationMethod#COORDINATES} method for identifying features:
    103     The plug-in creates fake coordinates like this: block=1, column=1, row=line number in file.
    104     The line numbers start with 1 at the first data line, i.e. header lines are not counted.
    105 
    106   <li>
    107     If the array design uses the {@link FeatureIdentificationMethod#POSITION}
    108     method for identifying features: The plug-in sets the position=line number in file.
    109     The line numbers start with 1 at the first data line, i.e. header lines are not counted.
    110    
    111   <li>
    112     If the array design uses the {@link FeatureIdentificationMethod#FEATURE_ID}
    113     method for identifying features. The plug-in assumes that the feature ID is the
    114     same as the reporter ID.
    115   </ul>
    116  
    117   <p>
    118   NOTE! Since the methods are not conflicting with each other, there will not
    119   be an actual check which method to use by this plug-in. We will simply set all
    120   values as specified above and let the BASE core handle the identification.
     95  Since the data files don't have any coordinate information, the plug-in creates
     96  fake coordinates like this: block=1, column=1, row=line number in file.
     97  The line numbers start with 1 at the first data line, i.e. header lines are not counted.
    12198
    12299  @author nicklas
     
    152129    ));
    153130 
    154   private static final PluginParameter<String> featureIdentificationParameter =
    155     new PluginParameter<String>(
    156       "featureIdentification",
    157       "Identify features by",
    158       "Choose which method to use for identifying features. If no value " +
    159       "is selected the identification method used on the array design is used: \n\n" +
    160       "* COORDINATES: Use auto-generated coordinates (block = 1, column = 1, row = line number in file)\n" +
    161       "* POSITION: Use auto-generated position = line number in file\n" +
    162       "* FEATURE_ID: Use TargetID (requires that each Target ID only appears once in the file)\n\n" +
    163       "NOTE! This parameter has no meaning unless an array design is selected",
    164       new StringParameterType(255, null, false, 1, 0, 0,
    165         Arrays.asList(new String[] {"COORDINATE", "POSITION", "FEATURE_ID" } ))
    166     );
    167 
    168131 
    169132  private static final PluginParameter<String> invalidColumnsErrorParameter = new PluginParameter<String>(
     
    290253        storeValue(job, request, fileParameter);
    291254        storeValue(job, request, ri.getParameter(CHARSET));
     255        storeValue(job, request, ri.getParameter(DECIMAL_SEPARATOR));
    292256       
    293257        // Associations
    294258        storeValue(job, request, ri.getParameter("experiment"));
    295         storeValue(job, request, ri.getParameter("arrayDesign"));
    296         storeValue(job, request, featureIdentificationParameter);
    297259        storeValue(job, request, ri.getParameter("scan"));
    298260        storeValue(job, request, ri.getParameter("protocol"));
     
    324286  private DbControl dc;
    325287  private Experiment experiment;
    326   private ArrayDesign design;
    327288  private Scan scan;
    328289  private Software software;
    329290  private Protocol protocol;
    330   private FeatureIdentificationMethod fiMethod;
    331291  private List<RawBioAssay> rawBioAssays;
    332292  private List<BatchAndMapHolder> holders;
     
    343303    Create a FlatFileParser that can parse Illumina data files:
    344304    <ul>
    345     <li>Data splitter: ,
     305    <li>Data splitter: (,|\t)
    346306    <li>Header regexp: (.+)=(.*?),*
    347     <li>Data header: TargetId,.*
     307    <li>Data header: TargetID(,|\t).*
    348308    </ul>
     309    NOTE! To begin with we support both comma and tab as column splitter but
     310    later on (in {@link #isImportable(FlatFileParser)}) when we know which one is actually
     311    used, we change this in the parser. We need to do this since numbers may
     312    use comma as decimal separator.
    349313  */
    350314  @Override
     
    352316    throws BaseException
    353317  {
     318    String separator = "(,|\\t)";
    354319    FlatFileParser ffp = new FlatFileParser();
    355     ffp.setDataSplitterRegexp(Pattern.compile(","));
    356     ffp.setDataHeaderRegexp(Pattern.compile("TargetID,.*"));
     320    ffp.setDataSplitterRegexp(Pattern.compile(separator));
     321    ffp.setDataHeaderRegexp(Pattern.compile("TargetID"+separator + ".*"));
    357322    ffp.setHeaderRegexp(Pattern.compile("(.+)=(.*?),*"));
    358323    return ffp;
    359324  }
    360325  /**
    361     @return Always "dot"
     326    @return As specified by job parameter or "dot" if not
    362327  */
    363328  @Override
    364329  protected String getDecimalSeparator()
    365330  {
    366     return "dot";
     331    String separator = super.getDecimalSeparator();
     332    if (separator == null) separator = "dot";
     333    return separator;
    367334  }
    368335
     
    376343  {
    377344    String firstLine = ffp.getLineCount() >= 1 ? ffp.getLine(0).line() : null;
    378     return firstLine != null && firstLine.contains("Illumina") ;
     345    boolean isIllumina = firstLine != null && firstLine.contains("Illumina");
     346    if (isIllumina)
     347    {
     348      String separator = ",";
     349      FlatFileParser.Line lastLine = ffp.getLine(ffp.getLineCount()-1);
     350      if (lastLine.type() == LineType.DATA_HEADER)
     351      {
     352        int firstTab = lastLine.line().indexOf("\t");
     353        if (firstTab > 0 && firstTab < lastLine.line().indexOf(","))
     354        {
     355          separator = "\\t";
     356        }
     357        ffp.setDataSplitterRegexp(Pattern.compile(separator));
     358      }
     359    }
     360    return isIllumina;
    379361  }
    380362 
     
    389371    this.headerLines = new LinkedList<Line>();
    390372    this.experiment = (Experiment)job.getValue("experiment");
    391     this.design = (ArrayDesign)job.getValue("arrayDesign");
    392373    this.scan = (Scan)job.getValue("scan");
    393374    this.protocol = (Protocol)job.getValue("protocol");
    394375    this.software = (Software)job.getValue("software");
    395376   
    396     // Feature identification
    397     try
    398     {
    399       String fiTemp = (String)job.getValue("featureIdentification");
    400       fiMethod = FeatureIdentificationMethod.valueOf(fiTemp);
    401     }
    402     catch (Exception ex)
    403     {
    404       fiMethod = null;
    405     }
    406377    // Setup error handling
    407378    this.nullIfException = "null".equals(getErrorOption("numberFormatError"));
     
    460431      // Need to reload raw bioassay with current DbControl
    461432      rba = RawBioAssay.getById(dc, rba.getId());
    462       RawDataBatcher batcher = rba.getRawDataBatcher(fiMethod);
     433      RawDataBatcher batcher = rba.getRawDataBatcher();
    463434      batcher.useNullIfReporterNotFound(nullIfMissingReporter);
    464435      holders.add(new BatchAndMapHolder(batcher, illumina, ffp));
     
    471442  {
    472443    String externalId = reporterMapper.getValue(data);
    473     int lineNo = data.dataLineNo();
    474444    for (BatchAndMapHolder holder : holders)
    475445    {
    476446      RawData raw = holder.batcher.newRawData();
    477       raw.setPosition(lineNo);
    478447      raw.setBlock(1);
    479448      raw.setColumn(1);
    480       raw.setRow(lineNo);
     449      raw.setRow(data.dataLineNo());
    481450      for (Map.Entry<RawDataProperty, Mapper> entry : holder.mappers.entrySet())
    482451      {
     
    485454        raw.setExtended(ep.getName(), ep.parseString(m.getValue(data), numberFormat, nullIfException));
    486455      }
    487       holder.batcher.insert(raw, externalId, externalId);
     456      holder.batcher.insert(raw, externalId);
    488457    }
    489458    numInserted++;
     
    597566            RawBioAssay rba = RawBioAssay.getNew(dc, generic, illumina);
    598567            rba.setName(arrayName);
    599             if (design != null) rba.setArrayDesign(design);
    600568            if (scan != null) rba.setScan(scan);
    601569            if (protocol != null) rba.setProtocol(protocol);
     
    649617        parameters.add(fileParameter);
    650618        parameters.add(getCharsetParameter(null, null, null));
    651  
    652         // parameters for scan, protocol, software and array design
     619        parameters.add(getDecimalSeparatorParameter(null, null, (String)job.getValue(DECIMAL_SEPARATOR)));
     620 
     621        // parameters for scan, protocol and software
    653622        dc = sc.newDbControl();
    654623        List<Scan> scans = getItems(dc, Scan.getQuery());
     
    665634            )
    666635          );
    667         List<ArrayDesign> designs = getItems(dc, ArrayDesign.getQuery(),
    668           Restrictions.gt(
    669               Hql.property("numDbFeatures"),
    670               Expressions.integer(0)
    671             )
    672           );
    673636       
    674637        boolean hasAssociations =
    675638          context.getItem() == Item.EXPERIMENT || scans.size() > 0 ||
    676           protocols.size() > 0 || software.size() > 0 || designs.size() > 0;
     639          protocols.size() > 0 || software.size() > 0;
    677640       
    678641        if (hasAssociations)
     
    689652              ));
    690653          }
    691           if (!designs.isEmpty())
    692           {
    693             parameters.add(new PluginParameter<ArrayDesign>(
    694               "arrayDesign",
    695               "Array design",
    696               "The imported raw bioassays will be linked to the selected array design.",
    697               new ItemParameterType<ArrayDesign>(ArrayDesign.class, null, false, 1, designs)
    698             ));
    699             parameters.add(featureIdentificationParameter);
    700           }
    701654          if (!scans.isEmpty())
    702655          {
Note: See TracChangeset for help on using the changeset viewer.