Changeset 2497


Ignore:
Timestamp:
Jun 5, 2014, 3:41:07 PM (9 years ago)
Author:
Nicklas Nordborg
Message:

References #547: Start Demux and Merge

Added trimmomatic to the script. This is called after the merge. Since the merge and copy to project_archive happened at the same time before the needed some bigger changes.

After the demux all FASTQ files are in the FASTQ folder (we can no longer use the COMPRESS_OUTPUTS option since trimmomatic don't like concatenated gz files).

The second step is to merge and compress the FASTQ files into 'fastq.merged' directory.

The third step runs trimmomatic which save the modified FASTQ files (compressed) in the 'fastq.trimmomatic' directory. Options for trimmomatic are configured in the reggie-ogs-hosts.xml file. Information about what has happened is written to a log file 'trimmomatic.out' in the job folder. When the job has been completed this information is parsed and number of surviving reads are stored in annotation PT_READS.

Last step is to simply copy the generated files to the project_archive.

Location:
extensions/net.sf.basedb.reggie/trunk
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • extensions/net.sf.basedb.reggie/trunk/reggie-ogs-hosts.xml

    r2464 r2497  
    5959    </demux>
    6060
     61    <!-- Settings for the Trimmomatic step -->
     62    <trimmomatic>
     63      <!-- full path to the JAR file with the Trimmomatic program -->
     64      <jar-path>/usr/local/packages/trimmomatic/0.32/trimmomatic-0.32.jar</jar-path>
     65      <!-- full path to the file with Illumina adapter information -->
     66      <adapter-file>/usr/local/packages/trimmomatic/0.32/adapters/TruSeq3-PE-2.fa</adapter-file>
     67      <!-- static options for Trimmomatic -->
     68      <options>ILLUMINACLIP:${AdapterFile}:2:30:12:1:true LEADING:3 TRAILING:3 MAXINFO:40:0.9 MINLEN:20</options>
     69    </trimmomatic>
     70
    6171    <pe-filter>
    6272      <!-- parallel environment option to the queue system -->
  • extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/dao/Annotationtype.java

    r2454 r2497  
    867867
    868868  /**
     869    The "PT_READS" annotation, used for derived bioassays (MergedSequences).
     870    The annotation is the number of sequences that remains for a library after
     871    running trimmomatic.
     872    @since 2.16
     873  */
     874  public static final Annotationtype PT_READS =
     875    new Annotationtype("PT_READS", Type.LONG, false, Item.DERIVEDBIOASSAY);
     876
     877  /**
    869878    The "PF_NNNN_PCT" annotation, used for derived bioassays (DemuxedSequences).
    870879    The annotation is the percentage of sequences that has could not be assigned
  • extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/servlet/DemuxMergeServlet.java

    r2464 r2497  
    1313import java.util.Set;
    1414import java.util.TreeMap;
     15import java.util.regex.Matcher;
    1516import java.util.regex.Pattern;
    1617
     
    285286          mg.loadAnnotations(dc, "READS", Annotationtype.READS, null);
    286287          mg.loadAnnotations(dc, "PF_READS", Annotationtype.PF_READS, null);
     288          mg.loadAnnotations(dc, "PT_READS", Annotationtype.PT_READS, null);
    287289          jsonMerged.add(mg.asJSONObject());
    288290        }
     
    376378        String debug_tileLimit = cluster.getConfig("demux/debug-tile-limit", "16");
    377379       
     380        String trim_jarPath = cluster.getRequiredConfig("trimmomatic/jar-path");
     381        String trim_adapterFile = cluster.getRequiredConfig("trimmomatic/adapter-file");
     382        String trim_options = cluster.getRequiredConfig("trimmomatic/options");
     383       
    378384        String projectRoot = cluster.getRequiredConfig("project-archive");
    379385        String projectFolder = projectName+(debug ? ".debug" : "");
     
    430436            script.cmd("export PicardDir="+demux_picardDir);
    431437            script.cmd("export PicardMemory="+demux_picardMemory);
     438            script.cmd("export TrimmomaticJAR="+trim_jarPath);
     439            script.cmd("export AdapterFile="+trim_adapterFile);
    432440            script.newLine();
    433441            script.cmd("mkdir -p " + tmpFolder);
    434442            script.cmd("cd " + tmpFolder);
    435443            script.cmd("mkdir fastq");
     444            script.cmd("mkdir fastq.merged");
     445            script.cmd("mkdir fastq.trimmomatic");
    436446            script.cmd("cp ${ScriptDir}/picard .");
     447            script.cmd("cp ${ScriptDir}/trimmomatic .");
    437448            script.newLine();
    438449          }
     
    555566          List<Tag> allTags = barcodeQuery.list(dc);
    556567          String totalMetricsFile = jobFolder + "/demultiplex_metrics.txt";
     568          String trimmomaticOut = jobFolder + "/trimmomatic.out";
    557569
    558570          int currentLane = 0;
     
    612624           
    613625              script.comment(flowCellBarcode + "; lane " + lane);
    614               // Progress between 5-75%
    615               int percent = 5 + (currentLane * 70 / totalLanes);
     626              // Progress between 5-50%
     627              int percent = 5 + ((currentLane * 45) / totalLanes);
    616628              currentLane++;
    617629              script.progress(percent, "Extracting barcodes: " + flowCellBarcode + "; lane "+ lane);
     
    632644              if (debug)
    633645              {
    634                 //extractCmd += " TILE_LIMIT="+debug_tileLimit;
     646                extractCmd += " TILE_LIMIT="+debug_tileLimit;
    635647              }
    636648              extractCmd += " >> " + demuxName + "_" + lane + ".out";
     
    654666                fastqCmd += " MACHINE_NAME="+sequencerName;
    655667              fastqCmd += " NUM_PROCESSORS=0";
    656               fastqCmd += " COMPRESS_OUTPUTS=true";
     668              //fastqCmd += " COMPRESS_OUTPUTS=true";
    657669              if (demux_fastqOptions != null)
    658670              {
     
    681693            String fastqFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, merged);
    682694            script.comment("Consolidating FASTQ: " + merged.getName());
    683             // Progress between 75-95%
    684             int percent = 75 + (currentToMove * 20 / totalToMove);
     695            // Progress between 50-95%
     696            int percent = 50 + ((currentToMove * 45) / totalToMove);
    685697            currentToMove++;
     698           
     699            String R1_name = merged.getName()+"_R1.fastq.gz";
     700            String R2_name = merged.getName()+"_R2.fastq.gz";
     701           
    686702            script.progress(percent, "Consolidating FASTQ: " + merged.getName());
     703            script.bkgr("cat fastq/" + merged.getName() + "_*.1.fastq | gzip > fastq.merged/"+R1_name);
     704            script.cmd("cat fastq/" + merged.getName() + "_*.2.fastq | gzip > fastq.merged/"+R2_name);
     705            script.newLine();
     706
     707            script.progress(percent, "Trimmomatic: " + merged.getName());
     708            script.cmd("echo [" + merged.getName() + "] >> " + trimmomaticOut);
     709            String trimCmd = "./trimmomatic PE";
     710            //trimCmd += " -trimlog fastq.trimmomatic/" + merged.getName()+".log";
     711            trimCmd += " fastq.merged/"+R1_name;
     712            trimCmd += " fastq.merged/"+R2_name;
     713            trimCmd += " fastq.trimmomatic/"+R1_name;
     714            trimCmd += " fastq.trimmomatic/un_"+R1_name;
     715            trimCmd += " fastq.trimmomatic/"+R2_name;
     716            trimCmd += " fastq.trimmomatic/un_"+R2_name;
     717
     718            if (trim_options != null)
     719            {
     720              trimCmd += " " + trim_options;
     721            }
     722            trimCmd += " >> "+trimmomaticOut+" 2>&1";
     723            script.cmd(trimCmd);
     724            script.newLine();
     725           
     726            script.progress(percent, "Archiving FASTQ: " + merged.getName());
    687727            script.cmd("mkdir -p " + projectRoot + "/" + fastqFolder);
    688             script.cmd("cat fastq/" + merged.getName() + "_*.1.fastq.gz > " + projectRoot + "/" + fastqFolder + "/"+merged.getName()+"_R1.fastq.gz");
    689             script.cmd("cat fastq/" + merged.getName() + "_*.2.fastq.gz > " + projectRoot + "/" + fastqFolder + "/"+merged.getName()+"_R2.fastq.gz");
     728            script.bkgr("cat fastq.trimmomatic/" + R1_name + " > " + projectRoot + "/" + fastqFolder + "/"+R1_name);
     729            script.bkgr("cat fastq.trimmomatic/" + R2_name + " > " + projectRoot + "/" + fastqFolder + "/"+R2_name);
    690730            script.newLine();
    691731          }
     
    9771017    {
    9781018      CmdResult metrics = cluster.executeCmd(ssh, "cat " + cluster.getJobFolder() + "/" + jobStatus.getJobName() + "/demultiplex_metrics.txt", 2);
    979       if (metrics.getExitStatus() == 0)
    980       {
    981         Reads total = parseDemultiplexMetrics(sc, metrics.getStdout());
    982         String msg = Values.formatNumber(total.reads/1000000f, 1) + "M reads; " + Values.formatNumber(total.passedFilter/1000000f, 1) + "M passed filter";
     1019      CmdResult trimmomatic = cluster.executeCmd(ssh, "cat " + cluster.getJobFolder() + "/" + jobStatus.getJobName() + "/trimmomatic.out", 2);
     1020      if (metrics.getExitStatus() != 0)
     1021      {
     1022        job.setDescription(metrics.getStderr());
     1023      }
     1024      else if (trimmomatic.getExitStatus() != 0)
     1025      {
     1026        job.setDescription(trimmomatic.getStderr());
     1027      }
     1028      else
     1029      {
     1030        Reads total = parseDemultiplexMetrics(sc, metrics.getStdout(), trimmomatic.getStdout());
     1031        String msg = Values.formatNumber(total.reads/1000000f, 1) + "M reads; ";
     1032        msg += Values.formatNumber(total.passedFilter/1000000f, 1) + "M passed filter; ";
     1033        msg += Values.formatNumber(total.passedTrimmomatic/1000000f, 1) + "M passed trimmomatic; ";
    9831034        if (total.warnings.size() > 0)
    9841035        {
     
    9871038        return msg;
    9881039      }
    989       else
    990       {
    991         job.setDescription(metrics.getStderr());
    992       }
    9931040      return null;
    9941041    }
    9951042   
    996     private Reads parseDemultiplexMetrics(SessionControl sc, String metrics)
    997     {
     1043    private Reads parseDemultiplexMetrics(SessionControl sc, String metrics, String trimmomatic)
     1044    {
     1045      Map<String, Reads> sumReads = new HashMap<String, Reads>();
     1046
     1047      // Parse the demultiplex_metrics file
    9981048      FlatFileParser ffp = new FlatFileParser();
    9991049      ffp.setIgnoreRegexp(Pattern.compile("#.*"));
     
    10051055      ffp.setMinDataColumns(10);
    10061056
    1007       Map<String, Reads> sumReads = new HashMap<String, Reads>();
    10081057      Set<String> demuxNames = new HashSet<String>();
    10091058      try
     
    10941143        throw new RuntimeException(ex);
    10951144      }
    1096            
     1145     
     1146      // Parse the trimmomatic.out file
     1147      Pattern libPattern = Pattern.compile("\\[(.*)\\]");
     1148      Pattern dataPattern = Pattern.compile(".*Both Surviving:\\s+(\\d+).*");
     1149      Reads currentLib = null;
     1150      int lineNo = 0;
     1151      for (String line : trimmomatic.split("\n"))
     1152      {
     1153        lineNo++;
     1154        Matcher m = libPattern.matcher(line);
     1155        if (m.matches())
     1156        {
     1157          String libName = m.group(1);
     1158          currentLib = sumReads.get(libName);
     1159          if (currentLib == null)
     1160          {
     1161            logger.error("At line " + lineNo + ": Found trimmomatic section for lib '" + libName + "' but not demultiplex metrics");
     1162          }
     1163          continue;
     1164        }
     1165        m = dataPattern.matcher(line);
     1166        if (m.matches())
     1167        {
     1168          if (currentLib == null)
     1169          {
     1170            logger.error("At line " + lineNo + ": Found trimmomatic data but has not found a library name");
     1171          }
     1172          else
     1173          {
     1174            currentLib.passedTrimmomatic = Values.getLong(m.group(1), -1);
     1175            if (logger.isDebugEnabled())
     1176            {
     1177              logger.debug("Trimmomatic: " + currentLib.libName + "; " + currentLib.passedTrimmomatic);
     1178            }
     1179            currentLib = null;
     1180          }
     1181        }
     1182      }
     1183     
    10971184      DbControl dc = null;
    10981185      Reads total = new Reads(null);
     
    11441231          if (logger.isDebugEnabled())
    11451232          {
    1146             logger.debug(r.libName + "; " + r.reads + "; " + r.passedFilter);
     1233            logger.debug(r.libName + "; " + r.reads + "; " + r.passedFilter + "; " + r.passedTrimmomatic);
    11471234          }
    11481235         
     
    11501237          if (merged != null)
    11511238          {
    1152             Annotationtype.READS.setAnnotationValue(dc, merged.getItem(), r.reads);
    1153             Annotationtype.PF_READS.setAnnotationValue(dc, merged.getItem(), r.passedFilter);
     1239            DerivedBioAssay m = merged.getItem();
     1240            Annotationtype.READS.setAnnotationValue(dc, m, r.reads);
     1241            Annotationtype.PF_READS.setAnnotationValue(dc, m, r.passedFilter);
     1242            Annotationtype.PT_READS.setAnnotationValue(dc, m, r.passedTrimmomatic);
    11541243            total.reads += r.reads;
    11551244            total.passedFilter += r.passedFilter;
     1245            total.passedTrimmomatic += r.passedTrimmomatic;
    11561246          }
    11571247        }
     
    11741264    long reads = 0;
    11751265    long passedFilter = 0;
     1266    long passedTrimmomatic = 0;
    11761267   
    11771268    Reads(String libName)
  • extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/servlet/InstallServlet.java

    r2454 r2497  
    388388        jsonChecks.add(checkAnnotationType(dc, Annotationtype.READS, 1, null, effectivePermissionsUse, createIfMissing));
    389389        jsonChecks.add(checkAnnotationType(dc, Annotationtype.PF_READS, 1, null, effectivePermissionsUse, createIfMissing));
     390        jsonChecks.add(checkAnnotationType(dc, Annotationtype.PT_READS, 1, null, effectivePermissionsUse, createIfMissing));
    390391        jsonChecks.add(checkAnnotationType(dc, Annotationtype.PF_NNNN_PCT, 1, null, effectivePermissionsUse, createIfMissing));
    391392        jsonChecks.add(checkAnnotationType(dc, Annotationtype.PF_UNUSED_PCT, 1, null, effectivePermissionsUse, createIfMissing));
     
    505506
    506507        jsonChecks.add(checkAnnotationTypeCategory(dc, Subtype.MERGED_SEQUENCES, createIfMissing,
    507             Annotationtype.ANALYSIS_RESULT, Annotationtype.READS, Annotationtype.PF_READS,
     508            Annotationtype.ANALYSIS_RESULT,
     509            Annotationtype.READS, Annotationtype.PF_READS, Annotationtype.PT_READS,
    508510            Annotationtype.DATA_FILES_FOLDER,
    509511            Annotationtype.AUTO_PROCESSING
Note: See TracChangeset for help on using the changeset viewer.