Changeset 5841


Ignore:
Timestamp:
Feb 25, 2020, 11:47:30 AM (22 months ago)
Author:
Nicklas Nordborg
Message:

References #1218: Implement MIPs alignment

Finalizing the MIPs alignment script by implementing the last two steps for collecting metrics. Results files are copied back to the project archive and linked in BASE to the alignment item.

Still need to implement importing some numbers as annotations from the metrics data.

Location:
extensions/net.sf.basedb.reggie/trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • extensions/net.sf.basedb.reggie/trunk/config/reggie-config.xml

    r5831 r5841  
    364364        <genome-fasta>${ReferenceDir}/gatk_bundle/2.8/b37/human_g1k_v37_decoy.fasta</genome-fasta>
    365365       
    366         <mark-duplicate-options>-CREATE_INDEX true -CREATE_MD5_FILE true -MAX_FILE_HANDLES 20000 -ALLOW_MISSING_UMIS false -DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES -MAX_EDIT_DISTANCE_TO_JOIN 1 -REMOVE_DUPLICATES true</mark-duplicate-options>
    367        
     366        <mark-duplicates-options>-CREATE_INDEX true -CREATE_MD5_FILE true -MAX_FILE_HANDLES 20000 -ALLOW_MISSING_UMIS false -DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES -MAX_EDIT_DISTANCE_TO_JOIN 1 -REMOVE_DUPLICATES true</mark-duplicates-options>
     367        <pcr-metrics-options>-METRIC_ACCUMULATION_LEVEL null -METRIC_ACCUMULATION_LEVEL ALL_READS -MINIMUM_MAPPING_QUALITY 1 -MINIMUM_BASE_QUALITY 20 -CLIP_OVERLAPPING_READS true -COVERAGE_CAP 500 -NEAR_DISTANCE 5</pcr-metrics-options>
    368368      </align-mips>
    369369     
  • extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/grid/MipsAlignJobCreator.java

    r5839 r5841  
    99import org.slf4j.LoggerFactory;
    1010
     11import net.sf.basedb.core.AnyToAny;
     12import net.sf.basedb.core.DataFileType;
    1113import net.sf.basedb.core.DbControl;
    1214import net.sf.basedb.core.DerivedBioAssay;
     15import net.sf.basedb.core.Directory;
    1316import net.sf.basedb.core.File;
     17import net.sf.basedb.core.FileServer;
     18import net.sf.basedb.core.FileSetMember;
    1419import net.sf.basedb.core.InvalidDataException;
    1520import net.sf.basedb.core.ItemList;
     
    1722import net.sf.basedb.core.ItemSubtype;
    1823import net.sf.basedb.core.Job;
     24import net.sf.basedb.core.Path;
    1925import net.sf.basedb.core.Protocol;
    2026import net.sf.basedb.core.Sample;
     
    3238import net.sf.basedb.reggie.Reggie;
    3339import net.sf.basedb.reggie.XmlConfig;
     40import net.sf.basedb.reggie.dao.AlignedSequences;
    3441import net.sf.basedb.reggie.dao.Annotationtype;
    3542import net.sf.basedb.reggie.dao.BiomaterialList;
     
    3744import net.sf.basedb.reggie.dao.Dna;
    3845import net.sf.basedb.reggie.dao.DoNotUse;
     46import net.sf.basedb.reggie.dao.Fileserver;
    3947import net.sf.basedb.reggie.dao.Library;
    4048import net.sf.basedb.reggie.dao.MergedSequences;
     
    159167    String genome_dict = cfg.getRequiredConfig("align-mips/genome-dict", alignParameterSet);
    160168    String genome_fasta = cfg.getRequiredConfig("align-mips/genome-fasta", alignParameterSet);
    161     String mark_duplicate_options = cfg.getRequiredConfig("align-mips/mark-duplicate-options", alignParameterSet);
     169    String mark_duplicates_options = cfg.getRequiredConfig("align-mips/mark-duplicates-options", alignParameterSet);
     170    String pcr_metrics_options = cfg.getRequiredConfig("align-mips/pcr-metrics-options", alignParameterSet);
    162171
    163172    // Load common items
     
    294303      script.cmd("  " + Values.getString(fastqPrefix, "\n  ", true));
    295304      script.cmd(")");
     305      script.cmd("AlignFolder=${ArchiveRoot}"+alignFolder);
    296306      script.cmd("TrimOptionsAdapter=\""+align_trimmomaticOptions1+"\"");
    297307      script.cmd("TrimOptionsQual=\""+align_trimmomaticOptions2+"\"");
    298308      script.cmd("NovoAlignOptions=\""+novoalign_options+"\"");
    299       script.cmd("MarkDuplicatesOptions=\""+mark_duplicate_options+"\"");
     309      script.cmd("MarkDuplicatesOptions=\""+mark_duplicates_options+"\"");
     310      script.cmd("PcrMetricsOptions=\""+pcr_metrics_options+"\"");
    300311     
    301312      script.newLine();
     
    364375      script.comment("Run novoalign");
    365376      script.progress(40, "Aligning with novoalign");
    366       script.cmd("min_insert=$(awk 'NR == 1 || $3 - $2 < min {min = $3 - $2}END{print min - 1}' \"${AmpliconsBed}\")");
    367       script.cmd("max_insert=$(awk 'NR == 1 || $3 - $2 > max {max = $3 - $2}END{print max + 1}' \"${AmpliconsBed}\")");
     377      script.cmd("get_min_max_insert");
    368378      script.cmd("n=0");
    369379      script.cmd("AlignedBams=''");
     
    377387      script.newLine();
    378388
    379       script.comment("Re-header BAM file");
     389      script.comment("Re-header and split BAM file");
    380390      script.progress(70, "Post-processing aligned BAM file");
    381391      script.cmd("reheader_bam");
    382       script.newLine();
    383      
    384       script.comment("Split BAM file into concordant, discordant and unmapped");
    385392      script.cmd("split_bam");
    386393      script.newLine();
     
    392399      script.newLine();
    393400
    394       /*
     401      script.comment("Collect metrics");
     402      script.progress(90, "Collecting PCR and alignment metrics");
     403      script.cmd("collect_pcr_metrics");
     404      script.cmd("collect_alignment_metrics");
     405      script.newLine();
     406     
    395407      script.progress(95, "Copying result files to project archive");
    396       script.cmd("mkdir -p ${HisatFolder}");
    397       script.cmd("rm -rf ${HisatFolder}/*");
    398       script.cmd("cp masked/*.out ${WD}");
    399       script.cmd("cp aligned/*.out ${WD}");
    400       script.cmd("cp aligned/alignment_picardmetrics.csv ${WD}");
    401       script.cmd("cp aligned/* ${HisatFolder}");
     408      script.cmd("mkdir -p ${AlignFolder}");
     409      script.cmd("rm -rf ${AlignFolder}/*");
     410      script.cmd("cp out/* ${AlignFolder}");
     411      // TODO - copy some files to $WD so that we can update some annotations like ALIGNED_PAIRS, etc.
    402412      if (externalGroup != null)
    403413      {
    404         script.cmd("chgrp -R " + externalGroup + " ${HisatFolder} 2>> ${WD}/chgrp.out || echo [" + alignedName +"] >> ${WD}/chgrp.out");
    405       }
    406       script.cmd("ls -1 ${HisatFolder}/* >> ${WD}/files.out");
    407       script.newLine();
    408       */
     414        script.cmd("chgrp -R " + externalGroup + " ${AlignFolder} 2>> ${WD}/chgrp.out || echo [" + alignedName +"] >> ${WD}/chgrp.out");
     415      }
     416      script.cmd("ls -1 ${AlignFolder}/* >> ${WD}/files.out");
     417      script.newLine();
    409418     
    410419      script.progress(99, "Cleaning up temporary folders");
     
    435444    public String jobCompleted(SessionControl sc, OpenGridSession session, Job job, JobStatus status)
    436445    {
     446      String jobName = status.getName();
     447      String files = session.getJobFileAsString(jobName, "files.out", "UTF-8");
     448
     449      Metrics metrics = parseAlignedOut(sc, job, files);
    437450      return null;
    438451    }
    439452   
     453    private Metrics parseAlignedOut(SessionControl sc, Job job, String filesOut)
     454    {
     455      Metrics metrics = new Metrics();
     456     
     457      DbControl dc = null;
     458      try
     459      {
     460        dc = sc.newDbControl();
     461       
     462        AlignedSequences alignedSequences = AlignedSequences.getByJob(dc, job);
     463        DerivedBioAssay aligned = alignedSequences.getItem();
     464
     465        // Create file links
     466        boolean useExternalProjectArchive = Reggie.isExternalItem(aligned.getName());
     467        FileServer fileArchive = useExternalProjectArchive ? Fileserver.EXTERNAL_ARCHIVE.load(dc) : Fileserver.PROJECT_ARCHIVE.load(dc);
     468        String analysisDir = useExternalProjectArchive ? Reggie.EXTERNAL_ANALYSIS_DIR : Reggie.SECONDARY_ANALYSIS_DIR;
     469
     470        String dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, aligned);
     471        String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(dataFilesFolder);
     472        Directory localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY));
     473        DataFileType bamData = Datafiletype.BAM.load(dc);
     474        ItemSubtype bamType = bamData.getGenericType();
     475 
     476        int lineNo = 0;
     477        for (String line : filesOut.split("\n"))
     478        {
     479          lineNo++;
     480         
     481          File f = File.getFile(dc, localDataDir, line.substring(line.lastIndexOf("/")+1), true);
     482          f.setFileServer(fileArchive);
     483          String fileUrl = "sftp://" + fileArchive.getHost() + dataFilesFolder + "/" + f.getName();
     484          try
     485          {
     486            f.setUrl(fileUrl, true);
     487          }
     488          catch (RuntimeException ex)
     489          {
     490            f.setUrl(fileUrl, false);
     491          }
     492         
     493          if (!f.isInDatabase())
     494          {
     495            dc.saveItem(f);
     496          }
     497          if (f.getName().equals("concordant.bam"))
     498          {
     499            //f.setDescription(metrics.numReadsAfterAlign + " ALIGNED PAIRS");
     500            f.setItemSubtype(bamType);
     501            FileSetMember member = aligned.getFileSet().addMember(f, bamData);
     502          }
     503          else
     504          {
     505            AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, f.getName(), f, true);
     506            if (!link.isInDatabase()) dc.saveItem(link);
     507          }
     508        }
     509       
     510        dc.commit();
     511      }
     512      finally
     513      {
     514        if (dc != null) dc.close();
     515      }
     516 
     517      return metrics;
     518    }
     519
     520  }
     521
     522  static class Metrics
     523  {
    440524  }
    441525
Note: See TracChangeset for help on using the changeset viewer.