Changeset 5720


Ignore:
Timestamp:
Nov 13, 2019, 8:17:15 AM (3 years ago)
Author:
Nicklas Nordborg
Message:

References #1199: Implement Variant calling pipeline

Checked the implementation to create child raw bioassay for the filtered part of the variant calling.

This means that if we run the pipeline in raw-only mode, the raw variant calling files will be attached to the alignment and no child raw bioassay is created.

If we run the pipeline in filtering mode, a child raw bioassay is created and the filtered vcf file is attached to that.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/grid/VariantCallingJobCreator.java

    r5717 r5720  
    11package net.sf.basedb.reggie.grid;
    22
     3import java.io.IOException;
    34import java.io.InputStream;
    45import java.util.ArrayList;
    56import java.util.List;
     7import java.util.Set;
    68import java.util.zip.GZIPInputStream;
    79
     
    911import org.slf4j.LoggerFactory;
    1012
     13import net.sf.basedb.core.Annotatable;
    1114import net.sf.basedb.core.AnyToAny;
     15import net.sf.basedb.core.BasicItem;
    1216import net.sf.basedb.core.BooleanParameterType;
     17import net.sf.basedb.core.DataFileType;
    1318import net.sf.basedb.core.DbControl;
    1419import net.sf.basedb.core.DerivedBioAssay;
     
    1621import net.sf.basedb.core.File;
    1722import net.sf.basedb.core.FileServer;
     23import net.sf.basedb.core.FileSetMember;
    1824import net.sf.basedb.core.ItemList;
    1925import net.sf.basedb.core.ItemNotFoundException;
     
    2228import net.sf.basedb.core.Job;
    2329import net.sf.basedb.core.Path;
     30import net.sf.basedb.core.RawBioAssay;
    2431import net.sf.basedb.core.Sample;
    2532import net.sf.basedb.core.SessionControl;
     
    4148import net.sf.basedb.reggie.dao.Fileserver;
    4249import net.sf.basedb.reggie.dao.Library;
     50import net.sf.basedb.reggie.dao.Rawbioassay;
     51import net.sf.basedb.reggie.dao.Rawdatatype;
    4352import net.sf.basedb.reggie.dao.Subtype;
    4453import net.sf.basedb.reggie.vcf.VcfData;
     
    190199    ItemList vcallPipeline = BiomaterialList.VARIANT_CALLING_PIPELINE.load(dc);
    191200
     201    // Create VariantCall raw bioassays
     202    Rawdatatype variantCallType = Rawdatatype.VARIANT_CALL;
     203
    192204    // Options common for all jobs
    193205    JobConfig jobConfig = new JobConfig();
     
    251263      dc.saveItem(vCallJob);
    252264
     265      // Create raw bioassay if filtering is included
     266      String vCallFolder = null;
     267      if (!rawOnly)
     268      {
     269        String rawName = as.getNextRawBioAssayName(dc, variantCallType);
     270        RawBioAssay raw = variantCallType.createRawBioAssay(dc);
     271//        Pipeline.RNASEQ_HISAT_STRINGTIE.setAnnotation(dc, raw);
     272        raw.setJob(vCallJob);
     273        raw.setName(rawName);
     274        raw.setParentExtract(lib.getExtract());
     275        raw.setSoftware(software);
     276        raw.setProtocol(null); // To prevent a default value
     277        raw.setArrayDesign(null); // To prevent the default hg38
     278        raw.setParentBioAssay(aligned);
     279        dc.saveItem(raw);
     280       
     281        vCallFolder = bamFolder + "/"+raw.getName().substring(aligned.getName().length()+1);
     282        if (debug && !vCallFolder.startsWith("/debug"))
     283        {
     284          vCallFolder = "/debug" + vCallFolder;
     285        }
     286        Annotationtype.DATA_FILES_FOLDER.setAnnotationValue(dc, raw, vCallFolder);
     287        if (autoConfirm)
     288        {
     289          // TODO -- what about raw-only mode??
     290          Annotationtype.AUTO_PROCESSING.setAnnotationValue(dc, raw, "AutoConfirm");
     291        }
     292       
     293        // Checks to make sure no bad things are included in script file
     294        ScriptUtil.checkValidPath(vCallFolder, true, true);
     295        ScriptUtil.checkValidScriptParameter(raw.getName());
     296      }
     297     
    253298      // Temporary link to the job so that it is possible to handle failed jobs in the VariantCallingAutoConfirmer
    254299      if (autoConfirm)
    255300      {
     301        // TODO -- what about filtering mode??
    256302        AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, "variants-all.vcf.gz", vCallJob, false);
    257303        if (!link.isInDatabase()) dc.saveItem(link);
     
    267313      script.cmd("ArchiveRoot="+archiveRoot);
    268314      script.cmd("BamFolder=${ArchiveRoot}"+bamFolder);
     315      if (vCallFolder != null)
     316      {
     317        script.cmd("FilteredFolder=${ArchiveRoot}"+vCallFolder);
     318      }
     319
    269320      script.cmd("ReferenceDir=" + referenceRoot);
    270321      script.cmd("Genome=${ReferenceDir}/" + genome_ref);
     
    294345      script.cmd("mkdir mosdepth");
    295346      script.cmd("mkdir tmp");
    296       script.cmd("mkdir results");
     347      script.cmd("mkdir resultsraw");
     348      script.cmd("mkdir resultsfilter");
    297349      script.newLine();
    298350     
     
    322374        script.newLine();
    323375        // Filter regions with depth >= 'minDepth' and merge
    324         script.cmd("zcat mosdepth/alignment.per-base.bed.gz | awk '$4 > "+(minDepth-1)+"' | " + bedtools_path + " merge > results/variants-callable.bed");
     376        script.cmd("zcat mosdepth/alignment.per-base.bed.gz | awk '$4 > "+(minDepth-1)+"' | " + bedtools_path + " merge > resultsraw/variants-callable.bed");
    325377        script.newLine();
    326378
     
    330382        vardictCmd += " " +vardict_options;
    331383        vardictCmd += " -b bam/alignment.bam";
    332         vardictCmd += " results/variants-callable.bed";
     384        vardictCmd += " resultsraw/variants-callable.bed";
    333385        vardictCmd += " | ${VarDictPath}/bin/teststrandbias.R";
    334386        vardictCmd += " | ${VarDictPath}/bin/var2vcf_valid.pl";
     
    351403        script.cmd("tabix tmp/gc-50.vcf.gz");
    352404        script.cmd("./stderrwrap.sh " + vcfanno_path + " gc_stat.toml tmp/variants-raw-1.vcf > tmp/variants-raw-2.vcf 3>> vcfanno.out");
    353         script.cmd("cat tmp/variants-raw-2.vcf | bgzip -c > results/variants-all.vcf.gz");
     405        script.cmd("cat tmp/variants-raw-2.vcf | bgzip -c > resultsraw/variants-all.vcf.gz");
    354406        script.newLine();
    355407      }
     
    383435        snpEffCmd += " 3> snpeff.out";
    384436        snpEffCmd += " | bgzip -c";
    385         snpEffCmd += " > results/variants-annotated.vcf.gz";
     437        snpEffCmd += " > resultsfilter/variants-annotated.vcf.gz";
    386438       
    387439        script.comment("Second annotation step with snpEff");
     
    393445        snpSiftCmd += " filter";
    394446        snpSiftCmd += " " + snpsift_options;
    395         snpSiftCmd += " results/variants-annotated.vcf.gz";
     447        snpSiftCmd += " resultsfilter/variants-annotated.vcf.gz";
    396448        snpSiftCmd += " 3> snpsift.out";
    397         snpSiftCmd += " > results/variants-filtered.vcf";
     449        snpSiftCmd += " > resultsfilter/variants-filtered.vcf";
    398450       
    399451        script.comment("Filtering step with SnpSift");
     
    404456
    405457      script.progress(90, "Copying result files to project archive");
    406       script.cmd("\\cp results/variants-* ${BamFolder}");  // "\cp" will enable overwriting existing files
    407      
    408       if (externalGroup != null)
    409       {
    410         script.cmd("chgrp -R " + externalGroup + " ${BamFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" );
    411       }
    412       script.cmd("ls -1 results/variants-* >> ${WD}/files.out");
    413       script.comment("Collect some statistics that we import into BASE");
    414458      if (!rawCallingSkipped)
    415459      {
    416         script.cmd("echo \"Callable bases: `awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 } END{print SUM}' results/variants-callable.bed`\" >> ${WD}/stats.out");
    417         script.cmd("echo \"All variants: `zcat results/variants-all.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
     460        script.cmd("\\cp resultsraw/variants-* ${BamFolder}");  // "\cp" will enable overwriting existing files
     461        script.cmd("ls -1 resultsraw/variants-* >> ${WD}/files.out");
     462        script.cmd("echo \"Callable bases: `awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 } END{print SUM}' resultsraw/variants-callable.bed`\" >> ${WD}/stats.out");
     463        script.cmd("echo \"All variants: `zcat resultsraw/variants-all.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
     464        if (externalGroup != null)
     465        {
     466          script.cmd("chgrp -R " + externalGroup + " ${BamFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" );
     467        }
    418468      }
    419469      if (!rawOnly)
    420470      {
    421         script.cmd("echo \"Annotated variants: `zcat results/variants-annotated.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
    422         script.cmd("echo \"Filtered variants: `cat results/variants-filtered.vcf | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
     471        script.cmd("mkdir -p ${FilteredFolder}");
     472        script.cmd("rm -rf ${FilteredFolder}/*");
     473        script.cmd("cp resultsfilter/variants-* ${FilteredFolder}");
     474        script.cmd("ls -1 resultsfilter/variants-* >> ${WD}/files.out");
     475        script.cmd("echo \"Annotated variants: `zcat resultsfilter/variants-annotated.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
     476        script.cmd("echo \"Filtered variants: `cat resultsfilter/variants-filtered.vcf | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out");
     477        if (externalGroup != null)
     478        {
     479          script.cmd("chgrp -R " + externalGroup + " ${FilteredFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" );
     480        }
    423481      }
    424482      script.newLine();
     
    471529        boolean rawCallingSkipped = Boolean.TRUE.equals(job.getParameterValue("rawCallingSkipped"));
    472530        boolean filterSkipped = Boolean.TRUE.equals(job.getParameterValue("filterSkipped"));
     531        RawBioAssay raw = null;
     532        if (!filterSkipped)
     533        {
     534          Rawbioassay rawVCall = Rawbioassay.getByJob(dc, job);
     535          raw = rawVCall.getItem();
     536        }
    473537 
    474538        // Create file links
     
    477541        String analysisDir = useExternalProjectArchive ? Reggie.EXTERNAL_ANALYSIS_DIR : Reggie.SECONDARY_ANALYSIS_DIR;
    478542 
    479         String dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, aligned);
    480         String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(dataFilesFolder);
    481         Directory localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY));
    482         ItemSubtype vcfType = Subtype.VARIANT_CALL_FORMAT.load(dc);
    483  
    484         int lineNo = 0;
    485         int numCallableBases = 0;
    486         int numAllVariants = 0;
    487         int numAnnotated = 0;
    488         int numFiltered = 0;
    489        
    490         for (String line : statsOut.split("\n"))
    491         {
    492           String[] stat = line.split(":", 2);
    493           String key = stat[0].strip();
    494           String val = stat[1].strip();
    495           if ("Callable bases".equals(key))
     543        ParseFiles pf = new ParseFiles();
     544        pf.stat = Stats.parse(statsOut);
     545        pf.software = software;
     546        pf.fileArchive = fileArchive;
     547        pf.vcfType = Subtype.VARIANT_CALL_FORMAT.load(dc);
     548        pf.vcfData = Datafiletype.VCF.load(dc);
     549       
     550        if (!rawCallingSkipped)
     551        {
     552          FileOwner alignedOwner = FileOwner.create(dc, aligned, analysisDir);
     553          pf.parseFiles(dc, alignedOwner, filesOut, Set.of("variants-callable.bed", "variants-all.vcf.gz"));
     554        }
     555        if (!filterSkipped)
     556        {
     557          FileOwner vCallOwner = FileOwner.create(dc, raw, analysisDir);
     558          msg = pf.parseFiles(dc, vCallOwner, filesOut, Set.of("variants-annotated.vcf.gz", "variants-filtered.vcf"));
     559        }
     560
     561        if (msg == null)
     562        {
     563          if (rawCallingSkipped)
    496564          {
    497             numCallableBases = Values.getInt(val);
     565            msg = "Used existing raw variants; annotated " + pf.stat.numAnnotated + " variants; " + pf.stat.numFiltered + " passed filter.";
    498566          }
    499           else if ("All variants".equals(key))
     567          else if (filterSkipped)
    500568          {
    501             numAllVariants = Values.getInt(val);
     569            msg = "Found " + pf.stat.numAllVariants + " variants; skipped filtering; ";
     570            msg += Values.formatNumber(pf.stat.numCallableBases/1000000f, 1) + "M callable bases.";
    502571          }
    503           else if ("Annotated variants".equals(key))
     572          else
    504573          {
    505             numAnnotated = Values.getInt(val);
     574            msg = "Found " + pf.stat.numAllVariants + " variants; " + pf.stat.numFiltered + " passed filter. ";
     575            msg += Values.formatNumber(pf.stat.numCallableBases/1000000f, 1) + "M callable bases.";
    506576          }
    507           else if ("Filtered variants".equals(key))
    508           {
    509             numFiltered = Values.getInt(val);
    510           }
    511         }
    512        
    513         for (String line : filesOut.split("\n"))
    514         {
    515           lineNo++;
    516          
    517           String filename = line.substring(line.lastIndexOf("/")+1);
    518           File f = File.getFile(dc, localDataDir, filename, true);
    519           f.setFileServer(fileArchive);
    520           String fileUrl = "sftp://" + fileArchive.getHost() + dataFilesFolder + "/" + f.getName();
     577        }
     578        dc.commit();
     579      }
     580      finally
     581      {
     582        if (dc != null) dc.close();
     583      }
     584 
     585      return msg == null ? "" : msg;
     586    }
     587   
     588
     589
     590  }
     591 
     592  /**
     593    Statistics from the variant calling. Parsed from
     594    stats.out.
     595  */
     596  static class Stats
     597  {
     598    int numCallableBases = 0;
     599    int numAllVariants = 0;
     600    int numAnnotated = 0;
     601    int numFiltered = 0;
     602   
     603    static Stats parse(String statsOut)
     604    {
     605      Stats s = new Stats();
     606      for (String line : statsOut.split("\n"))
     607      {
     608        String[] stat = line.split(":", 2);
     609        String key = stat[0].strip();
     610        int val = Values.getInt(stat[1].strip());
     611        if ("Callable bases".equals(key))
     612        {
     613          s.numCallableBases = val;
     614        }
     615        else if ("All variants".equals(key))
     616        {
     617          s.numAllVariants = val;
     618        }
     619        else if ("Annotated variants".equals(key))
     620        {
     621          s.numAnnotated = val;
     622        }
     623        else if ("Filtered variants".equals(key))
     624        {
     625          s.numFiltered = val;
     626        }
     627      }
     628      return s;
     629    }
     630
     631  }
     632 
     633  /**
     634    Helper class for wrapping information about were files to an
     635    item should go. From the owner 'item' we get the DataFilesFolder
     636    annotation and the corresponding local directory in the BASE file
     637    system.
     638  */
     639  static class FileOwner
     640  {
     641   
     642    static FileOwner create(DbControl dc, BasicItem item, String analysisDir)
     643    {
     644      FileOwner owner = new FileOwner();
     645      owner.item = item;
     646      owner.dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, (Annotatable)item);
     647      String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(owner.dataFilesFolder);
     648      owner.localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY));
     649      return owner;
     650    }
     651   
     652    BasicItem item;
     653    String dataFilesFolder;
     654    Directory localDataDir;
     655  }
     656 
     657  /**
     658    Helper class for parsing the 'files.out' file and attaching files
     659    to the specified owner. A filter with filenames need to be specified.
     660  */
     661  static class ParseFiles
     662  {
     663    Software software;
     664    FileServer fileArchive;
     665    Stats stat;
     666    ItemSubtype vcfType;
     667    DataFileType vcfData;
     668   
     669    String parseFiles(DbControl dc, FileOwner owner, String filesOut, Set<String> filenames)
     670    {
     671      String msg = null;
     672     
     673      for (String line : filesOut.split("\n"))
     674      {
     675        String filename = line.substring(line.lastIndexOf("/")+1);
     676        if (!filenames.contains(filename)) continue; // Skip files not owned by this owner
     677       
     678        File f = File.getFile(dc, owner.localDataDir, filename, true);
     679        f.setFileServer(fileArchive);
     680        String fileUrl = "sftp://" + fileArchive.getHost() + owner.dataFilesFolder + "/" + f.getName();
     681        try
     682        {
     683          f.setUrl(fileUrl, true);
     684        }
     685        catch (RuntimeException ex)
     686        {
     687          f.setUrl(fileUrl, false);
     688        }
     689       
     690        if (!f.isInDatabase())
     691        {
     692          dc.saveItem(f);
     693        }
     694       
     695        if (filename.equals("variants-filtered.vcf"))
     696        {
     697          RawBioAssay raw = (RawBioAssay)owner.item;
     698          FileSetMember member = raw.getFileSet().addMember(f, vcfData);
    521699          try
    522700          {
    523             f.setUrl(fileUrl, true);
     701            VcfData x = parseVcf(f);
     702            member.setValid(true, null);
     703            raw.setNumFileSpots(x.getGtCount());
    524704          }
    525           catch (RuntimeException ex)
     705          catch (IOException | RuntimeException ex)
    526706          {
    527             f.setUrl(fileUrl, false);
     707            msg = "Could not parse " + filename + " (" + ex.getMessage() + ")";
     708            member.setValid(false, ex.getMessage());
     709            logger.warn("Could not parse file: " + f, ex);
    528710          }
    529          
    530           if (!f.isInDatabase())
    531           {
    532             dc.saveItem(f);
    533           }
    534          
    535           AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, f.getName(), f, false);
     711        }
     712        else
     713        {
     714          AnyToAny link = AnyToAny.getNewOrExisting(dc, owner.item, f.getName(), f, false);
    536715          if (!link.isInDatabase()) dc.saveItem(link);
    537716          if (software != null)
     
    539718            link.setDescription("Created with " + software.getName());
    540719          }
    541          
    542           if (filename.endsWith(".vcf.gz")) f.setMimeTypeAuto("application/x-gzip", vcfType);
    543           if (filename.endsWith(".vcf")) f.setMimeTypeAuto("text/plain", vcfType);
    544          
    545           if (filename.equals("variants-all.vcf.gz"))
    546           {
    547             f.setDescription(numAllVariants + " variants.");
    548           }
    549           else if (filename.equals("variants-annotated.vcf.gz"))
    550           {
    551             f.setDescription(numAnnotated + " variants.");
    552           }
    553           else if (filename.equals("variants-filtered.vcf"))
    554           {
    555             f.setDescription(numFiltered + " variants.");
    556           }
    557           else if (filename.equals("variants-callable.bed"))
    558           {
    559             f.setDescription(Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases");
    560           }
    561         }
    562         if (rawCallingSkipped)
    563         {
    564           msg = "Used existing raw variants; annotated " + numAnnotated + " variants; " + numFiltered + " passed filter.";
    565         }
    566         else if (filterSkipped)
    567         {
    568           msg = "Found " + numAllVariants + " variants; skipped filtering; ";
    569           msg += Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases.";
    570         }
    571         else
    572         {
    573           msg = "Found " + numAllVariants + " variants; " + numFiltered + " passed filter. ";
    574           msg += Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases.";
    575         }
    576         dc.commit();
    577       }
    578       finally
    579       {
    580         if (dc != null) dc.close();
    581       }
    582  
    583       return msg == null ? "" : msg;
     720        }
     721       
     722        if (filename.endsWith(".vcf.gz")) f.setMimeTypeAuto("application/x-gzip", vcfType);
     723        if (filename.endsWith(".vcf")) f.setMimeTypeAuto("text/plain", vcfType);
     724       
     725        if (filename.equals("variants-all.vcf.gz"))
     726        {
     727          f.setDescription(stat.numAllVariants + " variants.");
     728        }
     729        else if (filename.equals("variants-annotated.vcf.gz"))
     730        {
     731          f.setDescription(stat.numAnnotated + " variants.");
     732        }
     733        else if (filename.equals("variants-filtered.vcf"))
     734        {
     735          f.setDescription(stat.numFiltered + " variants.");
     736        }
     737        else if (filename.equals("variants-callable.bed"))
     738        {
     739          f.setDescription(Values.formatNumber(stat.numCallableBases/1000000f, 1) + "M callable bases");
     740        }
     741      }
     742     
     743      return msg;
    584744    }
    585745   
    586746    /**
    587       Helper method for copying the VCF file from the file server
    588       while at the same time parsing it and extracting genotype
    589       information and statistics.
     747      Parses the VCF file for validation.
    590748    */
    591     private VcfData parseVcf(VcfParser parser, File vcfFile)
    592     {
    593       // Stream for copying the vcfFile
     749    private VcfData parseVcf(File vcfFile)
     750      throws IOException
     751    {
     752      VcfParser parser = new VcfParser();
    594753      InputStream fromFileServer = null;
    595      
    596754      VcfData vcfData = null;
    597755      try
    598756      {
    599         fromFileServer = new GZIPInputStream(vcfFile.getDownloadStream(0));
     757        fromFileServer = vcfFile.getDownloadStream(0);
     758        if (vcfFile.getName().endsWith(".gz"))
     759        {
     760          fromFileServer = new GZIPInputStream(fromFileServer);
     761        }
    600762        parser.setUseLineNoAsId(true);
    601763        vcfData = parser.parse(fromFileServer, vcfFile.getName());
    602764      }
    603       catch (Exception ex)
    604       {
    605         logger.warn("Could not parse VCF file: "+ vcfFile, ex);
    606       }
    607765      finally
    608766      {
     
    611769      return vcfData;
    612770    }
    613 
    614   }
     771  }
     772 
    615773}
Note: See TracChangeset for help on using the changeset viewer.