Changeset 5720
- Timestamp:
- Nov 13, 2019, 8:17:15 AM (3 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/grid/VariantCallingJobCreator.java
r5717 r5720 1 1 package net.sf.basedb.reggie.grid; 2 2 3 import java.io.IOException; 3 4 import java.io.InputStream; 4 5 import java.util.ArrayList; 5 6 import java.util.List; 7 import java.util.Set; 6 8 import java.util.zip.GZIPInputStream; 7 9 … … 9 11 import org.slf4j.LoggerFactory; 10 12 13 import net.sf.basedb.core.Annotatable; 11 14 import net.sf.basedb.core.AnyToAny; 15 import net.sf.basedb.core.BasicItem; 12 16 import net.sf.basedb.core.BooleanParameterType; 17 import net.sf.basedb.core.DataFileType; 13 18 import net.sf.basedb.core.DbControl; 14 19 import net.sf.basedb.core.DerivedBioAssay; … … 16 21 import net.sf.basedb.core.File; 17 22 import net.sf.basedb.core.FileServer; 23 import net.sf.basedb.core.FileSetMember; 18 24 import net.sf.basedb.core.ItemList; 19 25 import net.sf.basedb.core.ItemNotFoundException; … … 22 28 import net.sf.basedb.core.Job; 23 29 import net.sf.basedb.core.Path; 30 import net.sf.basedb.core.RawBioAssay; 24 31 import net.sf.basedb.core.Sample; 25 32 import net.sf.basedb.core.SessionControl; … … 41 48 import net.sf.basedb.reggie.dao.Fileserver; 42 49 import net.sf.basedb.reggie.dao.Library; 50 import net.sf.basedb.reggie.dao.Rawbioassay; 51 import net.sf.basedb.reggie.dao.Rawdatatype; 43 52 import net.sf.basedb.reggie.dao.Subtype; 44 53 import net.sf.basedb.reggie.vcf.VcfData; … … 190 199 ItemList vcallPipeline = BiomaterialList.VARIANT_CALLING_PIPELINE.load(dc); 191 200 201 // Create VariantCall raw bioassays 202 Rawdatatype variantCallType = Rawdatatype.VARIANT_CALL; 203 192 204 // Options common for all jobs 193 205 JobConfig jobConfig = new JobConfig(); … … 251 263 dc.saveItem(vCallJob); 252 264 265 // Create raw bioassay if filtering is included 266 String vCallFolder = null; 267 if (!rawOnly) 268 { 269 String rawName = as.getNextRawBioAssayName(dc, variantCallType); 270 RawBioAssay raw = variantCallType.createRawBioAssay(dc); 271 // Pipeline.RNASEQ_HISAT_STRINGTIE.setAnnotation(dc, raw); 272 raw.setJob(vCallJob); 273 raw.setName(rawName); 274 raw.setParentExtract(lib.getExtract()); 275 raw.setSoftware(software); 276 raw.setProtocol(null); // To prevent a default value 277 raw.setArrayDesign(null); // To prevent the default hg38 278 raw.setParentBioAssay(aligned); 279 dc.saveItem(raw); 280 281 vCallFolder = bamFolder + "/"+raw.getName().substring(aligned.getName().length()+1); 282 if (debug && !vCallFolder.startsWith("/debug")) 283 { 284 vCallFolder = "/debug" + vCallFolder; 285 } 286 Annotationtype.DATA_FILES_FOLDER.setAnnotationValue(dc, raw, vCallFolder); 287 if (autoConfirm) 288 { 289 // TODO -- what about raw-only mode?? 290 Annotationtype.AUTO_PROCESSING.setAnnotationValue(dc, raw, "AutoConfirm"); 291 } 292 293 // Checks to make sure no bad things are included in script file 294 ScriptUtil.checkValidPath(vCallFolder, true, true); 295 ScriptUtil.checkValidScriptParameter(raw.getName()); 296 } 297 253 298 // Temporary link to the job so that it is possible to handle failed jobs in the VariantCallingAutoConfirmer 254 299 if (autoConfirm) 255 300 { 301 // TODO -- what about filtering mode?? 256 302 AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, "variants-all.vcf.gz", vCallJob, false); 257 303 if (!link.isInDatabase()) dc.saveItem(link); … … 267 313 script.cmd("ArchiveRoot="+archiveRoot); 268 314 script.cmd("BamFolder=${ArchiveRoot}"+bamFolder); 315 if (vCallFolder != null) 316 { 317 script.cmd("FilteredFolder=${ArchiveRoot}"+vCallFolder); 318 } 319 269 320 script.cmd("ReferenceDir=" + referenceRoot); 270 321 script.cmd("Genome=${ReferenceDir}/" + genome_ref); … … 294 345 script.cmd("mkdir mosdepth"); 295 346 script.cmd("mkdir tmp"); 296 script.cmd("mkdir results"); 347 script.cmd("mkdir resultsraw"); 348 script.cmd("mkdir resultsfilter"); 297 349 script.newLine(); 298 350 … … 322 374 script.newLine(); 323 375 // Filter regions with depth >= 'minDepth' and merge 324 script.cmd("zcat mosdepth/alignment.per-base.bed.gz | awk '$4 > "+(minDepth-1)+"' | " + bedtools_path + " merge > results /variants-callable.bed");376 script.cmd("zcat mosdepth/alignment.per-base.bed.gz | awk '$4 > "+(minDepth-1)+"' | " + bedtools_path + " merge > resultsraw/variants-callable.bed"); 325 377 script.newLine(); 326 378 … … 330 382 vardictCmd += " " +vardict_options; 331 383 vardictCmd += " -b bam/alignment.bam"; 332 vardictCmd += " results /variants-callable.bed";384 vardictCmd += " resultsraw/variants-callable.bed"; 333 385 vardictCmd += " | ${VarDictPath}/bin/teststrandbias.R"; 334 386 vardictCmd += " | ${VarDictPath}/bin/var2vcf_valid.pl"; … … 351 403 script.cmd("tabix tmp/gc-50.vcf.gz"); 352 404 script.cmd("./stderrwrap.sh " + vcfanno_path + " gc_stat.toml tmp/variants-raw-1.vcf > tmp/variants-raw-2.vcf 3>> vcfanno.out"); 353 script.cmd("cat tmp/variants-raw-2.vcf | bgzip -c > results /variants-all.vcf.gz");405 script.cmd("cat tmp/variants-raw-2.vcf | bgzip -c > resultsraw/variants-all.vcf.gz"); 354 406 script.newLine(); 355 407 } … … 383 435 snpEffCmd += " 3> snpeff.out"; 384 436 snpEffCmd += " | bgzip -c"; 385 snpEffCmd += " > results /variants-annotated.vcf.gz";437 snpEffCmd += " > resultsfilter/variants-annotated.vcf.gz"; 386 438 387 439 script.comment("Second annotation step with snpEff"); … … 393 445 snpSiftCmd += " filter"; 394 446 snpSiftCmd += " " + snpsift_options; 395 snpSiftCmd += " results /variants-annotated.vcf.gz";447 snpSiftCmd += " resultsfilter/variants-annotated.vcf.gz"; 396 448 snpSiftCmd += " 3> snpsift.out"; 397 snpSiftCmd += " > results /variants-filtered.vcf";449 snpSiftCmd += " > resultsfilter/variants-filtered.vcf"; 398 450 399 451 script.comment("Filtering step with SnpSift"); … … 404 456 405 457 script.progress(90, "Copying result files to project archive"); 406 script.cmd("\\cp results/variants-* ${BamFolder}"); // "\cp" will enable overwriting existing files407 408 if (externalGroup != null)409 {410 script.cmd("chgrp -R " + externalGroup + " ${BamFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" );411 }412 script.cmd("ls -1 results/variants-* >> ${WD}/files.out");413 script.comment("Collect some statistics that we import into BASE");414 458 if (!rawCallingSkipped) 415 459 { 416 script.cmd("echo \"Callable bases: `awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 } END{print SUM}' results/variants-callable.bed`\" >> ${WD}/stats.out"); 417 script.cmd("echo \"All variants: `zcat results/variants-all.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 460 script.cmd("\\cp resultsraw/variants-* ${BamFolder}"); // "\cp" will enable overwriting existing files 461 script.cmd("ls -1 resultsraw/variants-* >> ${WD}/files.out"); 462 script.cmd("echo \"Callable bases: `awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 } END{print SUM}' resultsraw/variants-callable.bed`\" >> ${WD}/stats.out"); 463 script.cmd("echo \"All variants: `zcat resultsraw/variants-all.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 464 if (externalGroup != null) 465 { 466 script.cmd("chgrp -R " + externalGroup + " ${BamFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" ); 467 } 418 468 } 419 469 if (!rawOnly) 420 470 { 421 script.cmd("echo \"Annotated variants: `zcat results/variants-annotated.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 422 script.cmd("echo \"Filtered variants: `cat results/variants-filtered.vcf | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 471 script.cmd("mkdir -p ${FilteredFolder}"); 472 script.cmd("rm -rf ${FilteredFolder}/*"); 473 script.cmd("cp resultsfilter/variants-* ${FilteredFolder}"); 474 script.cmd("ls -1 resultsfilter/variants-* >> ${WD}/files.out"); 475 script.cmd("echo \"Annotated variants: `zcat resultsfilter/variants-annotated.vcf.gz | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 476 script.cmd("echo \"Filtered variants: `cat resultsfilter/variants-filtered.vcf | grep -v '^#\' | wc -l`\" >> ${WD}/stats.out"); 477 if (externalGroup != null) 478 { 479 script.cmd("chgrp -R " + externalGroup + " ${FilteredFolder}/variants-* 2>> ${WD}/chgrp.out || echo [" + aligned.getName() +"] >> ${WD}/chgrp.out" ); 480 } 423 481 } 424 482 script.newLine(); … … 471 529 boolean rawCallingSkipped = Boolean.TRUE.equals(job.getParameterValue("rawCallingSkipped")); 472 530 boolean filterSkipped = Boolean.TRUE.equals(job.getParameterValue("filterSkipped")); 531 RawBioAssay raw = null; 532 if (!filterSkipped) 533 { 534 Rawbioassay rawVCall = Rawbioassay.getByJob(dc, job); 535 raw = rawVCall.getItem(); 536 } 473 537 474 538 // Create file links … … 477 541 String analysisDir = useExternalProjectArchive ? Reggie.EXTERNAL_ANALYSIS_DIR : Reggie.SECONDARY_ANALYSIS_DIR; 478 542 479 String dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, aligned); 480 String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(dataFilesFolder); 481 Directory localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY)); 482 ItemSubtype vcfType = Subtype.VARIANT_CALL_FORMAT.load(dc); 483 484 int lineNo = 0; 485 int numCallableBases = 0; 486 int numAllVariants = 0; 487 int numAnnotated = 0; 488 int numFiltered = 0; 489 490 for (String line : statsOut.split("\n")) 491 { 492 String[] stat = line.split(":", 2); 493 String key = stat[0].strip(); 494 String val = stat[1].strip(); 495 if ("Callable bases".equals(key)) 543 ParseFiles pf = new ParseFiles(); 544 pf.stat = Stats.parse(statsOut); 545 pf.software = software; 546 pf.fileArchive = fileArchive; 547 pf.vcfType = Subtype.VARIANT_CALL_FORMAT.load(dc); 548 pf.vcfData = Datafiletype.VCF.load(dc); 549 550 if (!rawCallingSkipped) 551 { 552 FileOwner alignedOwner = FileOwner.create(dc, aligned, analysisDir); 553 pf.parseFiles(dc, alignedOwner, filesOut, Set.of("variants-callable.bed", "variants-all.vcf.gz")); 554 } 555 if (!filterSkipped) 556 { 557 FileOwner vCallOwner = FileOwner.create(dc, raw, analysisDir); 558 msg = pf.parseFiles(dc, vCallOwner, filesOut, Set.of("variants-annotated.vcf.gz", "variants-filtered.vcf")); 559 } 560 561 if (msg == null) 562 { 563 if (rawCallingSkipped) 496 564 { 497 numCallableBases = Values.getInt(val);565 msg = "Used existing raw variants; annotated " + pf.stat.numAnnotated + " variants; " + pf.stat.numFiltered + " passed filter."; 498 566 } 499 else if ( "All variants".equals(key))567 else if (filterSkipped) 500 568 { 501 numAllVariants = Values.getInt(val); 569 msg = "Found " + pf.stat.numAllVariants + " variants; skipped filtering; "; 570 msg += Values.formatNumber(pf.stat.numCallableBases/1000000f, 1) + "M callable bases."; 502 571 } 503 else if ("Annotated variants".equals(key))572 else 504 573 { 505 numAnnotated = Values.getInt(val); 574 msg = "Found " + pf.stat.numAllVariants + " variants; " + pf.stat.numFiltered + " passed filter. "; 575 msg += Values.formatNumber(pf.stat.numCallableBases/1000000f, 1) + "M callable bases."; 506 576 } 507 else if ("Filtered variants".equals(key)) 508 { 509 numFiltered = Values.getInt(val); 510 } 511 } 512 513 for (String line : filesOut.split("\n")) 514 { 515 lineNo++; 516 517 String filename = line.substring(line.lastIndexOf("/")+1); 518 File f = File.getFile(dc, localDataDir, filename, true); 519 f.setFileServer(fileArchive); 520 String fileUrl = "sftp://" + fileArchive.getHost() + dataFilesFolder + "/" + f.getName(); 577 } 578 dc.commit(); 579 } 580 finally 581 { 582 if (dc != null) dc.close(); 583 } 584 585 return msg == null ? "" : msg; 586 } 587 588 589 590 } 591 592 /** 593 Statistics from the variant calling. Parsed from 594 stats.out. 595 */ 596 static class Stats 597 { 598 int numCallableBases = 0; 599 int numAllVariants = 0; 600 int numAnnotated = 0; 601 int numFiltered = 0; 602 603 static Stats parse(String statsOut) 604 { 605 Stats s = new Stats(); 606 for (String line : statsOut.split("\n")) 607 { 608 String[] stat = line.split(":", 2); 609 String key = stat[0].strip(); 610 int val = Values.getInt(stat[1].strip()); 611 if ("Callable bases".equals(key)) 612 { 613 s.numCallableBases = val; 614 } 615 else if ("All variants".equals(key)) 616 { 617 s.numAllVariants = val; 618 } 619 else if ("Annotated variants".equals(key)) 620 { 621 s.numAnnotated = val; 622 } 623 else if ("Filtered variants".equals(key)) 624 { 625 s.numFiltered = val; 626 } 627 } 628 return s; 629 } 630 631 } 632 633 /** 634 Helper class for wrapping information about were files to an 635 item should go. From the owner 'item' we get the DataFilesFolder 636 annotation and the corresponding local directory in the BASE file 637 system. 638 */ 639 static class FileOwner 640 { 641 642 static FileOwner create(DbControl dc, BasicItem item, String analysisDir) 643 { 644 FileOwner owner = new FileOwner(); 645 owner.item = item; 646 owner.dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, (Annotatable)item); 647 String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(owner.dataFilesFolder); 648 owner.localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY)); 649 return owner; 650 } 651 652 BasicItem item; 653 String dataFilesFolder; 654 Directory localDataDir; 655 } 656 657 /** 658 Helper class for parsing the 'files.out' file and attaching files 659 to the specified owner. A filter with filenames need to be specified. 660 */ 661 static class ParseFiles 662 { 663 Software software; 664 FileServer fileArchive; 665 Stats stat; 666 ItemSubtype vcfType; 667 DataFileType vcfData; 668 669 String parseFiles(DbControl dc, FileOwner owner, String filesOut, Set<String> filenames) 670 { 671 String msg = null; 672 673 for (String line : filesOut.split("\n")) 674 { 675 String filename = line.substring(line.lastIndexOf("/")+1); 676 if (!filenames.contains(filename)) continue; // Skip files not owned by this owner 677 678 File f = File.getFile(dc, owner.localDataDir, filename, true); 679 f.setFileServer(fileArchive); 680 String fileUrl = "sftp://" + fileArchive.getHost() + owner.dataFilesFolder + "/" + f.getName(); 681 try 682 { 683 f.setUrl(fileUrl, true); 684 } 685 catch (RuntimeException ex) 686 { 687 f.setUrl(fileUrl, false); 688 } 689 690 if (!f.isInDatabase()) 691 { 692 dc.saveItem(f); 693 } 694 695 if (filename.equals("variants-filtered.vcf")) 696 { 697 RawBioAssay raw = (RawBioAssay)owner.item; 698 FileSetMember member = raw.getFileSet().addMember(f, vcfData); 521 699 try 522 700 { 523 f.setUrl(fileUrl, true); 701 VcfData x = parseVcf(f); 702 member.setValid(true, null); 703 raw.setNumFileSpots(x.getGtCount()); 524 704 } 525 catch ( RuntimeException ex)705 catch (IOException | RuntimeException ex) 526 706 { 527 f.setUrl(fileUrl, false); 707 msg = "Could not parse " + filename + " (" + ex.getMessage() + ")"; 708 member.setValid(false, ex.getMessage()); 709 logger.warn("Could not parse file: " + f, ex); 528 710 } 529 530 if (!f.isInDatabase()) 531 { 532 dc.saveItem(f); 533 } 534 535 AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, f.getName(), f, false); 711 } 712 else 713 { 714 AnyToAny link = AnyToAny.getNewOrExisting(dc, owner.item, f.getName(), f, false); 536 715 if (!link.isInDatabase()) dc.saveItem(link); 537 716 if (software != null) … … 539 718 link.setDescription("Created with " + software.getName()); 540 719 } 541 542 if (filename.endsWith(".vcf.gz")) f.setMimeTypeAuto("application/x-gzip", vcfType); 543 if (filename.endsWith(".vcf")) f.setMimeTypeAuto("text/plain", vcfType); 544 545 if (filename.equals("variants-all.vcf.gz")) 546 { 547 f.setDescription(numAllVariants + " variants."); 548 } 549 else if (filename.equals("variants-annotated.vcf.gz")) 550 { 551 f.setDescription(numAnnotated + " variants."); 552 } 553 else if (filename.equals("variants-filtered.vcf")) 554 { 555 f.setDescription(numFiltered + " variants."); 556 } 557 else if (filename.equals("variants-callable.bed")) 558 { 559 f.setDescription(Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases"); 560 } 561 } 562 if (rawCallingSkipped) 563 { 564 msg = "Used existing raw variants; annotated " + numAnnotated + " variants; " + numFiltered + " passed filter."; 565 } 566 else if (filterSkipped) 567 { 568 msg = "Found " + numAllVariants + " variants; skipped filtering; "; 569 msg += Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases."; 570 } 571 else 572 { 573 msg = "Found " + numAllVariants + " variants; " + numFiltered + " passed filter. "; 574 msg += Values.formatNumber(numCallableBases/1000000f, 1) + "M callable bases."; 575 } 576 dc.commit(); 577 } 578 finally 579 { 580 if (dc != null) dc.close(); 581 } 582 583 return msg == null ? "" : msg; 720 } 721 722 if (filename.endsWith(".vcf.gz")) f.setMimeTypeAuto("application/x-gzip", vcfType); 723 if (filename.endsWith(".vcf")) f.setMimeTypeAuto("text/plain", vcfType); 724 725 if (filename.equals("variants-all.vcf.gz")) 726 { 727 f.setDescription(stat.numAllVariants + " variants."); 728 } 729 else if (filename.equals("variants-annotated.vcf.gz")) 730 { 731 f.setDescription(stat.numAnnotated + " variants."); 732 } 733 else if (filename.equals("variants-filtered.vcf")) 734 { 735 f.setDescription(stat.numFiltered + " variants."); 736 } 737 else if (filename.equals("variants-callable.bed")) 738 { 739 f.setDescription(Values.formatNumber(stat.numCallableBases/1000000f, 1) + "M callable bases"); 740 } 741 } 742 743 return msg; 584 744 } 585 745 586 746 /** 587 Helper method for copying the VCF file from the file server 588 while at the same time parsing it and extracting genotype 589 information and statistics. 747 Parses the VCF file for validation. 590 748 */ 591 private VcfData parseVcf(VcfParser parser, File vcfFile) 592 { 593 // Stream for copying the vcfFile 749 private VcfData parseVcf(File vcfFile) 750 throws IOException 751 { 752 VcfParser parser = new VcfParser(); 594 753 InputStream fromFileServer = null; 595 596 754 VcfData vcfData = null; 597 755 try 598 756 { 599 fromFileServer = new GZIPInputStream(vcfFile.getDownloadStream(0)); 757 fromFileServer = vcfFile.getDownloadStream(0); 758 if (vcfFile.getName().endsWith(".gz")) 759 { 760 fromFileServer = new GZIPInputStream(fromFileServer); 761 } 600 762 parser.setUseLineNoAsId(true); 601 763 vcfData = parser.parse(fromFileServer, vcfFile.getName()); 602 764 } 603 catch (Exception ex)604 {605 logger.warn("Could not parse VCF file: "+ vcfFile, ex);606 }607 765 finally 608 766 { … … 611 769 return vcfData; 612 770 } 613 614 }771 } 772 615 773 }
Note: See TracChangeset
for help on using the changeset viewer.