Changeset 619


Ignore:
Timestamp:
Mar 6, 2008, 4:44:32 PM (13 years ago)
Author:
Martin Svensson
Message:

Fixes #101. The plug-in was tested from both the raw bioassay list view and the experiment single-item view. Code has been cleaned up and documentation added.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • plugins/base2/net.sf.basedb.illumina/trunk/src/net/sf/basedb/illumina/plugins/SnpRawDataImporter.java

    r618 r619  
    3232import net.sf.basedb.core.Experiment;
    3333import net.sf.basedb.core.File;
     34import net.sf.basedb.core.FileType;
    3435import net.sf.basedb.core.Include;
    3536import net.sf.basedb.core.InvalidDataException;
    3637import net.sf.basedb.core.Item;
     38import net.sf.basedb.core.ItemAlreadyExistsException;
     39import net.sf.basedb.core.ItemNotFoundException;
    3740import net.sf.basedb.core.ItemParameterType;
    3841import net.sf.basedb.core.ItemQuery;
     
    5356import net.sf.basedb.core.Software;
    5457import net.sf.basedb.core.SoftwareType;
    55 import net.sf.basedb.core.StringParameterType;
    5658import net.sf.basedb.core.SystemItems;
    5759import net.sf.basedb.core.plugin.About;
     
    7375import net.sf.basedb.util.parser.Mapper;
    7476import net.sf.basedb.util.parser.FlatFileParser.Data;
    75 import net.sf.basedb.util.parser.FlatFileParser.Line;
    7677import net.sf.basedb.util.parser.FlatFileParser.LineType;
    7778
    7879import java.io.BufferedWriter;
    7980import java.io.IOException;
    80 import java.io.OutputStream;
    8181import java.io.OutputStreamWriter;
    82 import java.io.PrintWriter;
    83 import java.text.NumberFormat;
    8482import java.util.ArrayList;
    8583import java.util.Arrays;
     
    9492
    9593/**
     94  This plug-in is part of the Illumina plug-in package.
     95  It extracts a SNP raw data file into one file for each raw bio-assay,
     96  creates the raw bio-assays and connect them to the right file.
     97  Common properties for the raw bioassays can be configured.
     98  An experiment can be associated with the raw bioassay if the plug-in is started
     99  from the experiment's item-view. 
     100   
    96101    @author Martin
    97102    @version 2.6
     
    114119      Illumina.EMAIL,
    115120      Illumina.URL
    116   );
    117  
     121  ); 
    118122  private static final Set<GuiContext> guiContexts =
    119123    Collections.unmodifiableSet(new HashSet<GuiContext>(
     
    123127      )
    124128    ));
    125  
    126   private static final PluginParameter<String> invalidColumnsErrorParameter = new PluginParameter<String>(
    127       "invalidColumnsError",
    128       "Mismatch of columns",
    129       "What to do if , for example " +
    130       "if one array has GType and Score but another only has GType\n\n"+
    131       "ignore = Ignore this and import the data that is there\n"+
    132       "fail = Stop with an error message",
    133       new StringParameterType(255, null, false, 1, 0, 0,
    134         Arrays.asList( new String[] { "ignore", "fail"} ))
    135     );
    136 
    137 //  private static final PluginParameter<String> missingReporterErrorParameter = new PluginParameter<String>(
    138 //      "missingReporterError",
    139 //      "Reporter not found",
    140 //      "How to handle errors that are caused by a reporter not beeing present in the datbase. If not specified the " +
    141 //        "default error handling is used.\n\n"+
    142 //        "null = Import the data but set the reporter to null\n"+
    143 //        "skip = Skip the current data line and continue\n"+
    144 //        "fail = Stop with an error message",
    145 //        new StringParameterType(255, null, false, 1, 0, 0,
    146 //            Arrays.asList( new String[] { "null", "skip", "fail"} ))
    147 //      );
    148 
    149129  private static final PluginParameter<String> associationsSection = new PluginParameter<String>(
    150130    "associationsSection",
     
    153133    null
    154134  );
    155 
    156135  private static final PluginParameter<String> splitFilesDirectoryParameter = new PluginParameter<String>(
    157136    "splitFileDirectory",
    158     "Directory of split files",
     137    "Directory of extracted files",
    159138    "The directory were to put the " +
    160139    "new files when splitting up the original " +
    161140    "raw datafile. Leave this parameter empty to " +
    162141    "put the split files in a new sub-directory in " +
    163     "the original file's location",
     142    "the original file's location.",
    164143    new PathParameterType(Path.Type.DIRECTORY, null, false)
    165144  );
     
    167146  private RequestInformation configureJob;
    168147 
     148  //Columns that are required for the raw data.
    169149  private String[] requiredColumnNames = {"Address", "GenTrain Score", ".GType", ".B Allele Freq", ".Log R Ratio"};
    170150 
     151  //Maps the samples with their columns
    171152  private Map<String, List<String>> sampleMappings = new HashMap<String, List<String>>();
    172    
     153 
    173154  //Original Illumina SNP raw data file type
    174155  private DataFileType originalDataFileType;
     156 
    175157  //Split Illumina SNP raw data file type
    176158  private DataFileType splitDataFileType;
     159 
     160  //SNP raw data type
     161  private RawDataType illuminaSNP;
     162 
     163  //Flat file parser that is used to read the original file
     164  private FlatFileParser ffp;
     165 
     166  //Dbcontrol to access the database with
     167  private DbControl dc;
     168 
     169  //A list with the raw bioassays and mapped column index
     170  private List<MapHolder> holders;
     171 
     172  //Experiment that uses the raw bioassays
     173  private Experiment experiment;
     174 
     175  //Array design associated with the raw bioassays
     176  private ArrayDesign design;
     177 
     178  //Scan associated with the raw bioassays
     179  private Scan scan;
     180 
     181  //Software associated with the raw bioassays
     182  private Software software;
     183
     184  //Protocol associated with the raw bioassays
     185  private Protocol protocol;
    177186 
    178187  public SnpRawDataImporter()
     
    214223        message = "The object is null";
    215224      }
    216       else if (! (item instanceof Experiment))
     225      else if (!(item instanceof Experiment))
    217226      {
    218227        message = "The object is not an Experiment: " + item;
    219228      }
    220229      Experiment experiment = (Experiment)item;
    221       RawDataType illuminaSnp = RawDataTypes.getSafeRawDataType(Illumina.SNP_VARIANT_ID);
     230      RawDataType illuminaSnp = RawDataTypes.getRawDataType("variant." + Illumina.SNP_VARIANT_ID);
    222231      if (illuminaSnp != experiment.getRawDataType())
    223232      {
     
    228237        experiment.checkPermission(Permission.WRITE);
    229238      }
    230     }   
     239    }
    231240    return message;
    232241  }
     242 
    233243  public void configure(GuiContext context, Request request, Response response)
    234244  {
     
    258268       
    259269        // Error handling parameters
    260         storeValue(job, request, invalidColumnsErrorParameter);
     270        storeValue(job, request, defaultErrorParameter);
    261271       
    262272        response.setDone("Job configuration complete", Job.ExecutionTime.SHORT);
     
    267277      response.setError(ex.getMessage(), Arrays.asList(ex));
    268278    }
    269   }
     279    finally
     280    {
     281      if (dc != null) dc.close();
     282    }
     283  }
     284 
    270285  public Set<GuiContext> getGuiContexts()
    271286  {
    272287    return guiContexts;
    273288  }
     289
    274290  public RequestInformation getRequestInformation(GuiContext context, String command)
    275291      throws BaseException
     
    281297    }
    282298    return requestInformation;
     299  }
     300
     301  /**
     302    Checks that there not is already a directory with the same
     303    name as the file, but only if directory parameter is left empty.
     304   */
     305  @Override
     306  protected List<Throwable> validateRequestParameters(List<PluginParameter<?>> parameters, Request request)
     307  {
     308    List<Throwable> errors = super.validateRequestParameters(parameters, request);
     309    DbControl dc = null;
     310    String directoryPath = (String)request.getParameterValue(splitFilesDirectoryParameter.getName());
     311    File rawDataFile = (File)request.getParameterValue(fileParameter.getName());
     312    if (directoryPath == null || directoryPath == "")
     313    {
     314      dc = sc.newDbControl();         
     315      dc.refreshItem(rawDataFile);
     316      try
     317      {
     318        Directory dir = Directory.getByPath(dc, new Path(rawDataFile.getPath().toString(), Path.Type.DIRECTORY));
     319        if (dir != null)
     320        {
     321          errors = errors == null ? new ArrayList<Throwable>() : errors;
     322          errors.add(new ItemAlreadyExistsException("It seems that the raw data file already has been extracted into a directory with the file's name. " +
     323              "Please select another directory to put the extracted files in"));
     324        }
     325      }
     326      catch(ItemNotFoundException ex)
     327      {}
     328      finally
     329      {
     330        if (dc != null) dc.close();
     331      }
     332    }
     333   
     334    return errors;
    283335  }
    284336 
     
    288340    From the AbstractFlatFileImporter
    289341   */
    290   private RawDataType illuminaSNP;
    291   private FlatFileParser ffp;
    292   private DbControl dc;
    293   private List<MapHolder> holders;
    294   private Experiment experiment;
    295   private ArrayDesign design;
    296   private Scan scan;
    297   private Software software;
    298   private Protocol protocol;
    299   private List<RawBioAssay> rawBioAssays;
    300 //  private Map<String, List<Mapper>> mapper;
    301 //  private List<Line> headerLines;
    302 //  private Mapper reporterMapper;
    303 //  private int numInserted;
    304 //  private int numRawBioAssays;
    305 //  private NumberFormat numberFormat;
    306 //  private boolean nullIfException;
    307   private boolean verifyColumns;
    308 //  private boolean nullIfMissingReporter;
    309  
    310   /**
    311     Initalize a FlatFileParser so it can parse Illumina SNP data files.
     342  /**
     343    Initalize a FlatFileParser so it can parse Illumina SNP data file.
    312344    <ul>
    313345    <li>Data splitter: (\t)
     
    322354    String separator = "\\t";
    323355    FlatFileParser ffp = new FlatFileParser();
     356    //The data files should always be tab-separated.
    324357    ffp.setDataSplitterRegexp(Pattern.compile(separator));
     358    //Assume that all data files have the same two columns first
    325359    ffp.setDataHeaderRegexp(Pattern.compile("Address"+separator + "GenTrain Score"+separator+".*"));
    326360    ffp.setHeaderRegexp(Pattern.compile("(.+)=(.*?),*"));
     
    329363 
    330364  /**
    331     @return As specified by job parameter or "dot" if not
    332   */
    333   @Override
    334   protected String getDecimalSeparator()
    335   {
    336     String separator = super.getDecimalSeparator();
    337     if (separator == null) separator = "dot";
    338     return separator;
    339   }
    340  
    341   /**
    342365    Check that the first line is the column names and that it begins with
    343366    columns 'Address' and 'GenTrain Score'
    344     @return TRUE if the column names are right in the first line, FALSE
    345       otherwise
     367    @return TRUE if the first line is the column headers, FALSE otherwise.
    346368  */
    347369  @Override
     
    350372    FlatFileParser.Line firstLine = ffp.getLineCount() >= 1 ? ffp.getLine(0) : null;
    351373    boolean isSNPData = firstLine != null && firstLine.type() == LineType.DATA_HEADER;
    352    
    353374    return isSNPData;
    354375  }
    355376 
     377  /**
     378    Initialise the local variables
     379   */
    356380  @Override
    357381  protected void begin(FlatFileParser ffp)
     
    366390    this.protocol = (Protocol)job.getValue("protocol");
    367391    this.software = (Software)job.getValue("software");
    368    
    369     // Setup error handling
    370     this.verifyColumns = "fail".equals(getErrorOption("invalidColumnsError"));
    371   }
    372  
    373   /**
    374     Check column headers and map them to raw bioassays.
    375     Create raw bioassays. Initialise column <code>Mapper</code>:s.
     392  }
     393 
     394  /**
     395    Get column headers, create raw bioassays and map them to the headers.
     396    Initialise the data import.
    376397  */
    377398  @Override
     
    379400  {
    380401    File rawDataFile = (File)job.getValue("file");
    381     String splitFilesPath = (String)job.getValue("splitFileDirectory");
     402    String directoryPath = (String)job.getValue("splitFileDirectory");
     403    Directory splitDir = null;
    382404    this.dc = sc.newDbControl();
    383405   
    384     this.splitDataFileType = DataFileType.getByExternalId(dc, Illumina.SNP_SPLITDATA_FILE_ID);
    385     this.originalDataFileType = DataFileType.getByExternalId(dc, Illumina.SNP_DATA_FILE_ID);
    386     sampleMappings = extractSamplesAndColumns(ffp, Arrays.asList(requiredColumnNames), verifyColumns);
    387     this.rawBioAssays = extractAndCreateRawBioAssays(dc, sampleMappings,
    388         rawDataFile, new Path(splitFilesPath, Path.Type.DIRECTORY));
    389     this.holders = new ArrayList<MapHolder>(rawBioAssays.size());     
     406    if (directoryPath == null || directoryPath == "")
     407    {
     408     
     409      dc.refreshItem(rawDataFile);
     410      splitDir = rawDataFile.getDirectory().newSubDirectory();
     411      dc.saveItem(splitDir);
     412      splitDir.setName(rawDataFile.getName());
     413      splitDir.setDescription("Contains files that have been created by split up the raw data file: " + rawDataFile.getName());
     414    }
     415    else
     416    {
     417      Path path = new Path(directoryPath, Path.Type.DIRECTORY);
     418      splitDir = Directory.getByPath(dc, path);
     419    }
     420       
     421    splitDataFileType = DataFileType.getByExternalId(dc, Illumina.SNP_SPLITDATA_FILE_ID);
     422    originalDataFileType = DataFileType.getByExternalId(dc, Illumina.SNP_DATA_FILE_ID);
     423    sampleMappings = extractSamplesAndColumns(ffp, Arrays.asList(requiredColumnNames));
     424    List<RawBioAssay> rawBioAssays = extractAndCreateRawBioAssays(dc, sampleMappings, rawDataFile, splitDir);
     425    holders = new ArrayList<MapHolder>(rawBioAssays.size());     
    390426   
    391427    for (RawBioAssay rba : rawBioAssays)
     
    395431  }
    396432
     433  @Override
    397434  protected void handleData(Data data)
    398435    throws BaseException
     
    403440      try
    404441      {
    405         if (data.dataLineNo() < 5) System.out.println(data.dataLineNo());
     442        //Write the column names to file if this is the first line of data.
    406443        if (data.dataLineNo() == 1)
    407444        {
     
    416453          holder.bw.flush();
    417454        }
    418                
     455       
     456        //Extract data line for each sample
    419457        separator = "";
    420458        for (String columnName : sampleMappings.get(holder.rba.getName()))
     
    429467      catch(IOException io)
    430468      {
    431        
    432       }
    433     }     
    434   }
    435  
     469        throw new BaseException("Could not split up the data into files: " + io);
     470      }
     471      holder.numInserted++;
     472    }
     473  }
     474
     475  @Override
    436476  protected void end (boolean success)
    437477  {
    438     for (MapHolder holder : holders)
    439     {
    440       try
     478    try
     479    {
     480      if (experiment != null)
     481      {
     482        experiment = Experiment.getById(dc, experiment.getId());
     483      }
     484      for (MapHolder holder : holders)
    441485      {
    442486        holder.bw.flush();
    443         holder.bw.close();
    444       }
    445       catch(IOException io)
    446       {}     
    447     }   
     487        holder.bw.close(); 
     488        if (experiment != null)
     489        {
     490          experiment.addRawBioAssay(holder.rba);
     491        }
     492        holder.rba.getFileSet().getMember(splitDataFileType).setValid(true, null);
     493        holder.rba.setNumFileSpots(holder.numInserted);
     494      }
     495      dc.commit();
     496    }
     497    catch (Exception ex)
     498    {
     499      success = false;
     500    }
     501    finally
     502    {
     503      if (dc != null) dc.close();
     504    }
     505  }
     506 
     507  @Override
     508  protected String getSuccessMessage(int skippedLines)
     509  {
     510    String skipped = skippedLines > 0 ? skippedLines + "parsed lines in the original data file were skipped due to errors\n": "";
     511    String message = holders.size() + " raw bioassays were created, with " + holders.get(0).numInserted + " data entries each." + skipped;   
     512    return message;
     513  }
     514  // -------------------------------------------
     515 
     516  private RequestInformation getConfigureJobParameters(GuiContext context)
     517  {
     518    DbControl dc = null;
    448519    try
    449520    {
    450       dc.commit();
    451     }
    452     finally
    453     {
    454       if (dc != null) dc.close();
    455     }
    456   }
    457  
    458   // -------------------------------------------
    459  
    460   private RequestInformation getConfigureJobParameters(GuiContext context)
    461   {
    462     net.sf.basedb.core.DbControl dc = null;
    463     try
    464     {
    465521      if (configureJob == null)
    466522      {
    467523        List<PluginParameter<?>> parameters = new ArrayList<PluginParameter<?>>();
    468        
    469524        parameters.add(fileParameter);
    470525        parameters.add(splitFilesDirectoryParameter);
     
    487542        List<ArrayDesign> designs = getItems(dc, ArrayDesign.getQuery(),
    488543          Restrictions.gt(
    489               Hql.property("numDbFeatures"),
     544              Hql.property("numFileFeatures"),
    490545              Expressions.integer(0)
    491546            )
     
    549604        // Error handling parameters
    550605        parameters.add(errorSection);
    551         parameters.add(invalidColumnsErrorParameter);
     606        parameters.add(defaultErrorParameter);
    552607       
    553608        configureJob = new RequestInformation
     
    564619      if (dc != null) dc.close();
    565620    }
    566    
    567621    return configureJob;
    568622  }
     
    590644   */
    591645  private List<RawBioAssay> extractAndCreateRawBioAssays(DbControl dc,
    592       Map<String, List<String>> sampleMappings, File rawDataFile, Path splitFilesPath)
     646      Map<String, List<String>> sampleMappings, File rawDataFile, Directory splitFilesDirectory)
    593647  {
    594648    //The created raw bioassays
     
    596650   
    597651    PlatformVariant snpVariant = PlatformVariant.getByExternalId(dc, Illumina.SNP_VARIANT_ID);
    598    
    599     //The directory to place the split files in
    600     Directory splitDir = null;
    601     if (splitFilesPath != null)
    602     {
    603       splitDir = Directory.getByPath(dc, splitFilesPath);
    604     }
    605     else
    606     {
    607       dc.refreshItem(rawDataFile);
    608       splitDir = rawDataFile.getDirectory().newSubDirectory();
    609       splitDir.setName(rawDataFile.getName());
    610       dc.saveItem(splitDir);
    611     }   
    612     //Create raw bio assays and files with columns set.
     652 
     653    //Create raw bioassays and files with columns set.
    613654    for (Map.Entry<String, List<String>> entry : sampleMappings.entrySet())
    614655    {
    615       File splitFile = File.getFile(dc, splitDir, entry.getKey()+".split", true);
     656      File splitFile = File.getFile(dc, splitFilesDirectory, entry.getKey()+".split", true);
    616657      splitFile.setLocation(Location.PRIMARY);
     658      splitFile.setFileType(FileType.getById(dc, SystemItems.getId(FileType.RAW_DATA)));
    617659      dc.saveItem(splitFile);
    618660     
     
    627669      rba.setDescription("Raw bioassay for sample " + entry.getKey() + " in file " + rawDataFile.getName());
    628670      dc.saveItem(rba);
    629       dc.reattachItem(rba);
    630671      createdRba.add(rba);
    631     }
    632    
     672    }   
    633673    return createdRba;
    634674  }
    635675 
    636   private Map<String, List<String>> extractSamplesAndColumns (FlatFileParser ffp, 
    637       List<String> requiredColumns, boolean verifyColumns)
     676  private Map<String, List<String>> extractSamplesAndColumns (FlatFileParser ffp, List<String> requiredColumns)
    638677  {   
    639678    Map<String, List<String>> sampleMappings = new HashMap<String, List<String>>();   
    640 //    //Holds column names common for all samples
    641 //    Map <String, Mapper> commonCol = new HashMap<String, Mapper>();
    642    
    643     List<String> commonCol = new ArrayList<String>();
    644    
     679    List<String> commonCol = new ArrayList<String>();   
    645680    List<String> columnHeaders = ffp.getColumnHeaders();
     681
    646682    //Extract column headers common for all samples
    647683    for (String header : columnHeaders)
     
    670706    }
    671707   
    672     //VerifyColumns
     708    //VerifyColumns for each sample
    673709    for (Map.Entry<String, List<String>> entry : sampleMappings.entrySet())
    674710    {
    675       //Verify columns - if selected
    676       if (verifyColumns)
    677       {
    678         for (String required : requiredColumns)
    679         {
    680           String fullColumnName = required.startsWith(".") ? entry.getKey()+required : required;
    681           if (!entry.getValue().contains(fullColumnName))
    682           {
    683             throw new InvalidDataException("Missing data column: '" + fullColumnName + "' for one of the raw bio assays");
    684           }
     711      for (String required : requiredColumns)
     712      {
     713        String fullColumnName = required.startsWith(".") ? entry.getKey()+required : required;
     714        if (!entry.getValue().contains(fullColumnName))
     715        {
     716          throw new InvalidDataException("Missing data column: '" + fullColumnName + "' for raw bio assays '" + entry.getKey());
    685717        }
    686718      }     
     
    689721  }
    690722 
    691 
    692723  private static String getFileSpecificColumnName(String fullColumnName)
    693724  {
     
    695726    return dotIndex > -1 ? fullColumnName.substring(dotIndex+1) : fullColumnName;
    696727  }
    697    
     728 
     729  /**
     730    Internal class that holds an output stream,
     731    number of inserted data lines and mapped column index for a raw bioassay
     732   */
    698733  private class MapHolder
    699734  {
     
    701736    private final HashMap<String, Mapper> mappers;
    702737    private final BufferedWriter bw;
    703    
     738    private int numInserted;   
    704739   
    705740    private MapHolder(DbControl dc, RawBioAssay rba, FlatFileParser ffp, List<String> colNames)
     
    709744      File splitFile = rba.getFileSet().getMember(splitDataFileType).getFile();         
    710745      bw = new BufferedWriter(new OutputStreamWriter(splitFile.getUploadStream(true)));
     746      numInserted = 0;
    711747      createMappers(ffp, colNames);
    712748    }
Note: See TracChangeset for help on using the changeset viewer.