source: extensions/net.sf.basedb.reggie/trunk/config/reggie-config.xml @ 5829

Last change on this file since 5829 was 5829, checked in by Nicklas Nordborg, 2 years ago

References #1218: Implement MIPs alignment

Added AnnotateBamWithUmis step.

File size: 21.8 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<reggie>
3
4  <!-- Section for enabling/disabling experimental features -->
5  <!-- The list of features that are considered experimental may change over time -->
6  <!-- 0=The feature is disabled, 1=The feature is enabled -->
7  <experimental-features>
8  </experimental-features>
9 
10  <!-- Configuration options related to how external samples (RNA or DNA) are handled -->
11  <external-samples>
12    <!-- Files generated in the secondary analysis can be shared with read permission to -->
13    <!-- a group if this is specified here. The prefix attribute is the sample name prefix -->
14    <!-- and the value is the group name. This translates to a 'chgrp' command in the secondary -->
15    <!-- analysis. Samples with a prefix that is not mapped here are not shared to other groups. -->
16    <!--  <groupname prefix="BR">brcalab</groupname> -->
17  </external-samples>
18
19  <!-- Settings for the Activity log that is displayed on the Reggie start page -->
20  <activity-log>
21    <!-- Max number of entries to display in the log (exception: all events within the last two days are always displayed) -->
22    <max-entries>35</max-entries>
23    <!-- Max age (in days) of entries to display (even if the max number hasn't been reached) -->
24    <max-age-in-days>14</max-age-in-days>
25    <quote-of-the-day>
26      <!-- URL to quote-of-the-day endpoint (optional, set an empty URL to disable this feature) -->
27      <url>https://quotes.rest/qod.json</url>
28      <!-- Default is 12 hours; do not set to less than 3600 since the external API has a limit -->
29      <max-age-in-seconds>43200</max-age-in-seconds>
30    </quote-of-the-day>
31  </activity-log>
32
33  <!-- Options related to R that is executed on the local server -->
34  <rscript>
35    <!-- Full or partial path to 'Rscript' executable -->
36    <path>Rscript</path>
37    <!-- Set the locale to use when running R -->
38    <!-- If not set, use whatever locale the operating system provides -->
39    <locale>en_US.UTF-8</locale>
40   
41    <!-- options for the 'geneReport' script -->
42    <gene-report>
43      <!-- full path to the R script -->
44      <path>/path/to/R_RNAseq_scanb_geneReport.R</path>
45      <!-- full path to directory with SCAN-B reference data -->
46      <!-- default is same directory as the R script -->
47      <ref-dir-scanb></ref-dir-scanb>
48      <!-- full path to directory with validation reference data -->
49      <!-- default is same directory as the R script -->
50      <ref-dir-validation></ref-dir-validation>
51      <!-- full path to the PDF template -->
52      <!-- default is 'template.pdf' in the same directory as the R script -->
53      <template></template>
54      <!-- file name in BASE for storing the generated report  -->
55      <pdf-name>genereport.pdf</pdf-name>
56    </gene-report>
57   
58    <!-- options for the 'pilot report' script -->
59    <pilot-report>
60      <!-- full path to the R script -->
61      <path>/path/to/pilot-report.R</path>
62      <!-- full path to directory with reference data -->
63      <!-- default is 'referenceData' directory inside -->
64      <!-- the same directory as the R script -->
65      <ref-dir></ref-dir>
66      <!-- full path to directory with source code -->
67      <!-- default is 'source' directory inside -->
68      <!-- the same directory as the R script -->
69      <source-dir></source-dir>
70      <!-- full path to the PDF template -->
71      <!-- default is 'template.pdf' in the same directory as the R script -->
72      <template></template>
73      <!-- file name in BASE for storing the generated report  -->
74      <pdf-name>pilotreport.pdf</pdf-name>
75    </pilot-report>
76   
77  </rscript>
78
79  <!-- Logotype information for the different sites -->
80  <!-- Uncomment as needed and set full path to image file -->
81  <!-- Supported file formats: WMF, PNG, JPG (and possibly more) -->
82  <logos>
83    <!-- <region-skåne></region-skåne>  -->
84    <!-- <landstinget-kronoberg></landstinget-kronoberg>  -->
85    <!-- <uppsala-landsting></uppsala-landsting>  -->
86    <!-- <region-halland></region-halland>  -->
87    <!-- <landstinget-blekinge></landstinget-blekinge>  -->
88    <!-- <jönköpings-län></jönköpings-län>  -->
89  </logos>
90
91  <remote-hosts>
92    <!-- one or more hosts entries. Each entry should match an -->
93    <!-- entry in the opengrid-config.xml. The 'ID' of an Open Grid cluster -->
94    <!-- is a combination of the username, address and port: user@host:port -->
95    <!-- A comma-separated list is allowed -->
96    <!-- Note that the default port number (22) must be included in the ID  -->
97    <!-- even if it is not specified in the opengrid-config.xml file. -->
98 
99    <host 
100      id="user@address:port in opengrid-config.xml (one or more separated by comma)"
101      >
102     
103      <!-- full path to the location where HiSeq/NextSeq data is stored (required) -->
104      <run-archive>/casa2/run_archive</run-archive>
105      <!-- Alternate paths in search order in case data is not found in the primary -->
106      <!-- run archive. Add more entries as needed, but it is important that they -->
107      <!-- are numbered in strictly increasing order from '2' and up. -->
108      <run-archive-2></run-archive-2>
109     
110      <!-- Full path to the location where data files should be archived (required) -->
111      <!-- The path should include the name of the project -->
112      <project-archive>/casa4/project_archive/scanb</project-archive>
113      <!-- Full path to the location where external data files should be archived (optional) -->
114      <!-- If not specified, the 'project-archive' path is used -->
115      <external-archive></external-archive>
116     
117      <!-- Full path to the root location where reference genomes are located -->
118      <!-- Do not include name of project -->
119      <reference-folder>/reference</reference-folder>
120     
121      <!-- Information about programs used by reggie -->
122      <!-- Unless otherwise noted, all paths must be the same on all nodes -->
123      <programs>
124        <java>
125          <!-- full path to java binary to use (1.8 is required by GATK!) -->
126          <path>/usr/local/packages/jre/8.0_144/bin/java</path>
127        </java>
128        <pipeline-scripts>
129          <!-- folder where the pipeline scripts are located (required). -->
130          <path>/home/scanb/lorry-pipeline/pipeline-2.16</path>
131        </pipeline-scripts>
132        <picard>
133          <!-- full path to the directory with Picard jar files (required) -->
134          <path>/usr/local/packages/picard-tools/2.20.8</path>
135        </picard>
136        <genseq>
137          <!-- full path to the genseq_check_illumina_dir.pl script (required) -->
138          <path>/usr/local/packages/genseq_tools/v0.01/genseq_check_illumina_dir.pl</path>
139        </genseq>
140        <trimmomatic>
141          <!-- full path to the JAR file with the Trimmomatic program (required) -->
142          <path>/usr/local/packages/trimmomatic/0.32/trimmomatic-0.32.jar</path>
143          <!-- full path to the file with Illumina adapter information -->
144          <adapter-file>/usr/local/packages/trimmomatic/0.32/adapters/TruSeq3-PE-2.fa</adapter-file>
145        </trimmomatic>
146        <bowtie2>
147          <!-- full or partial path to bowtie2 (required) -->
148          <path>/usr/local/packages/bowtie/2.2.4/bin/bowtie2</path>
149        </bowtie2>
150        <tophat>
151          <!-- full or partial path to tophat (required) -->
152          <path>/usr/local/packages/tophat/2.0.12/bin/tophat</path>
153        </tophat>
154        <hisat>
155          <!-- full or partial path to hisat (required) -->
156          <path>/usr/local/packages/hisat/2.1.0/bin/hisat2</path>
157        </hisat>
158        <samtools>
159          <!-- full or partial path to samtools (required) -->
160          <path>/usr/local/packages/samtools/1.4/samtools</path>
161        </samtools>
162        <bedtools>
163          <!-- full or partial path to bedtools (required) -->
164          <path>/usr/local/packages/bedtools/2.26.0/bin/bedtools</path>
165        </bedtools>
166        <cufflinks>
167          <!-- full or partial path to cufflinks (required) -->
168          <path>/usr/local/packages/cufflinks/2.2.1/bin/cufflinks</path>
169        </cufflinks>
170        <stringtie>
171          <!-- full or partial path to stringtie (required) -->
172          <path>/usr/local/packages/stringtie/1.3.3b/bin/stringtie</path>
173        </stringtie>
174        <gatk>
175          <!-- full path to GenomeAnalysisToolkit JAR file (required) -->
176          <path>/usr/local/packages/GenomeAnalysisTK/3.8/GenomeAnalysisTK.jar</path>
177        </gatk>
178        <mosdepth>
179          <!-- full or partial path to mosdepth (required) -->
180          <path>/usr/local/packages/mosdepth/0.2.6/bin/mosdepth</path>
181        </mosdepth>
182        <vardict>
183          <!-- path to the directory where VarDict is installed -->
184          <!-- NOTE! not including the 'bin/VarDict' part since -->
185          <!-- that will be added automatically -->
186          <path>/usr/local/packages/vardict/1.6.0</path>
187        </vardict>
188        <vcfanno>
189          <!-- full or partial path to vcfanno (required) -->
190          <path>/usr/local/packages/vcfanno/0.3.2/bin/vcfanno</path>
191        </vcfanno>
192        <snpeff>
193          <!-- full path to the snpEff.jar file (required) -->
194          <path>/usr/local/packages/snpeff/4.3s/snpEff.jar</path>
195        </snpeff>
196        <snpsift>
197          <!-- full path to the SnpSift.jar file (required) -->
198          <path>/usr/local/packages/snpeff/4.3s/SnpSift.jar</path>
199        </snpsift>
200        <fgbio>
201          <!-- full path to the fgbio.jar file (required) -->
202          <path>/usr/local/packages/fgbio/0.8.1/fgbio.jar</path>
203        </fgbio>
204      </programs>
205     
206      <!-- priority values that are selectable in the web interface -->
207      <!-- allowed range is -1023 to 1024 -->
208      <!-- NOTE! positive values require special permissions on the cluster -->
209      <priorities>
210        <!-- <priority name="high" value="500" /> -->
211        <priority name="normal" value="0" default="true" />
212        <priority name="low" value="-500" />
213      </priorities>
214     
215      <!-- settings for the demuxing step (RNAseq) -->
216      <demux>
217        <!-- parallel environment option to the queue system -->
218        <!-- the default setting requests 4 slots -->
219        <parallel-environment>smp 4-4</parallel-environment>
220        <!-- Number of open files to set with 'ulimit -n' command -->
221        <!-- if not specified, the default on the server is used -->
222        <ulimit></ulimit>
223        <!-- amount of memory to give to Picard (default is 50g)-->
224        <picard-memory>50g</picard-memory>
225        <!-- static options for the picard ExtractIlluminaBarcodes step -->
226        <extract-options>-QUIET true -VERBOSITY WARNING</extract-options>
227        <!-- static options for the picard IlluminaBasecallsToFastq step -->
228        <fastq-options>-INCLUDE_NON_PF_READS false -MAX_READS_IN_RAM_PER_TILE 5000000 -QUIET true -VERBOSITY WARNING</fastq-options>
229        <!-- number of tiles to process when debugging (default=2 (HiSeq), 16 (NextSeq)) -->
230        <debug-tile-limit-hiseq>2</debug-tile-limit-hiseq>
231        <debug-tile-limit-nextseq>16</debug-tile-limit-nextseq>
232        <!-- static options for Bowtie when used for estimating fragment size -->
233        <bowtie-options>-q --fr -k 1 --phred33 --local --no-hd --no-unal -t -u 100000</bowtie-options>
234        <!-- the smallest number of fragments that must have been used in the fragment -->
235        <!-- size estimation, or we will set FragmentSizeAvg and FragmentSizeStdev to -1 -->
236        <bowtie-fragment-count-limit>20000</bowtie-fragment-count-limit>
237        <!-- static options for Trimmomatic -->
238        <trimmomatic-options>
239          <!-- The first step should ONLY filter Illumina adapters-->
240          <step-1>ILLUMINACLIP:${AdapterFile}:2:30:12:1:true MINLEN:20</step-1>
241          <!-- The second step is for all other filters -->
242          <step-2>MAXINFO:40:0.9 MINLEN:20</step-2>
243        </trimmomatic-options>
244        <!-- static options for gzip compression with pigz (default=-5) -->
245        <!-- NOTE! Number of threads (-p) is set automatically and should not be included here -->
246        <pigz-options>-5</pigz-options>
247      </demux>
248 
249      <!-- settings for the demuxing step (MIPs) -->
250      <demux-mips>
251        <!-- parallel environment option to the queue system -->
252        <!-- the default setting requests 8-16 slots -->
253        <parallel-environment>smp 8-16</parallel-environment>
254        <!-- amount of memory to give to Picard (default is 50g)-->
255        <picard-memory>50g</picard-memory>
256        <!-- static options for the picard ExtractIlluminaBarcodes step -->
257        <extract-options>-MINIMUM_BASE_QUALITY 0 -MINIMUM_QUALITY 2 -MAX_MISMATCHES 2 -MIN_MISMATCH_DELTA 2 -MAX_NO_CALLS 2 -QUIET true -VERBOSITY WARNING</extract-options>
258        <!-- static options for the picard IlluminaBasecallsToFastq step -->
259        <fastq-options>-INCLUDE_NON_PF_READS false -APPLY_EAMSS_FILTER false -MINIMUM_QUALITY 2 -MAX_READS_IN_RAM_PER_TILE 5000000 -QUIET true -VERBOSITY WARNING</fastq-options>
260        <!-- static options to put into the "Read group" files -->
261        <readgroup-options>PL=ILLUMINA CN=BRCAlab</readgroup-options>
262        <!-- number of tiles to process when debugging (default=2 (HiSeq), 16 (NextSeq)) -->
263        <debug-tile-limit-hiseq>2</debug-tile-limit-hiseq>
264        <debug-tile-limit-nextseq>16</debug-tile-limit-nextseq>
265        <!-- static options for gzip compression with pigz (default=-5) -->
266        <!-- NOTE! Number of threads (-p) is set automatically and should not be included here -->
267        <pigz-options>-5</pigz-options>
268      </demux-mips>
269 
270      <mask>
271        <!-- relative path from <reference-folder> to the reference genome used for masking -->
272        <!-- This is the -x option used for bowtie -->
273        <reference-name>scanb/ribo_phix_repeats_filter/ribo_phix_repeats_filter</reference-name>
274       
275        <!-- static options for bowtie -->
276        <bowtie-options>-q --fr -k 1 --phred33 -t --local</bowtie-options>
277       
278        <!-- max number of sequences to align when running in debug mode (default=2 million) -->
279        <debug-max-align>2000000</debug-max-align>
280      </mask>
281 
282      <align>
283        <!-- relative path from <reference-folder> to the reference genome used for alignment -->
284        <!-- TODO selectable in GUI? saved as annotation? -->
285        <reference-gidx>hg38/hg38.analysisSet/hg38.analysisSet</reference-gidx>
286        <reference-tidx>hg38/UCSC_hg38_knownGenes_22sep2014/knownGenes.vs.hg38.analysisSet</reference-tidx>
287       
288        <!-- static options for tophat -->
289        <tophat-options>--library-type fr-firststrand --keep-fasta-order --no-coverage-search --max-insertion-length 20 --max-deletion-length 20 --read-gap-length 20 --read-edit-dist 22</tophat-options>
290        <!-- adjustment values for the 'mate-inner-dist' and 'mate-std-dev' -->
291        <!-- parameters to tophat. The specified values are added to those -->
292        <!-- calculated by bowtie -->
293        <adjust-mate-inner-dist>13</adjust-mate-inner-dist>
294        <adjust-mate-std-dev>10</adjust-mate-std-dev>
295       
296        <!-- static options for the picard MarkDuplicates step -->
297        <mark-duplicates-options>-REMOVE_DUPLICATES false -ASSUME_SORTED true -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 2000 -QUIET true -VERBOSITY WARNING</mark-duplicates-options>
298      </align>
299     
300      <!-- settings for aligning with Hisat -->
301      <align-hisat>
302        <!-- parallel environment option to the queue system -->
303        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
304        <parallel-environment>smp 8-16</parallel-environment>
305       
306        <!-- relative path from <reference-folder> to the reference genome used for alignment -->
307        <reference-tidx>hg38/hg38.analysisSet_gencode27_snp150/genome_snp_tran</reference-tidx>
308       
309        <!-- static options for hisat -->
310        <hisat-options>-q --fr --phred33 -t --dta --dta-cufflink --new-summary --no-unal --non-deterministic --novel-splicesite-outfile aligned/splicesites.tsv --rna-strandness RF --summary-file aligned/summary.txt --rg PL:Illumina --rg CN:SCANB-prim</hisat-options>
311       
312        <!-- static options for the picard MarkDuplicates step -->
313        <mark-duplicates-options>-REMOVE_DUPLICATES false -ASSUME_SORTED true -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 2000 -QUIET true -VERBOSITY WARNING</mark-duplicates-options>
314       
315        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
316        <haplotypecaller-ref>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</haplotypecaller-ref>
317       
318        <!-- relative path from <reference-folder> to VCF file with SNPs that we should look for -->
319        <haplotypecaller-dbsnp>scanb/genotyping-213-snp_feb2018.vcf</haplotypecaller-dbsnp>
320       
321        <!-- static options for the HaplotypeCaller step -->
322        <haplotypecaller-options>-stand_call_conf 20 --filter_reads_with_N_cigar --annotation AlleleBalance --no_cmdline_in_header</haplotypecaller-options>
323      </align-hisat>
324     
325      <!-- settings for aligning MIPs sequencing -->
326      <align-mips>
327        <!-- parallel environment option to the queue system -->
328        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
329        <parallel-environment>smp 8-16</parallel-environment>
330       
331        <!-- Options for Trimmomatic -->
332        <trimmomatic>
333          <!-- Optional path to Trimmomatic, if not specified the default in the 'programs' section is used -->
334          <path>/usr/local/packages/trimmomatic/0.39/trimmomatic.jar</path>
335          <!-- The first step should filter Illumina adapters-->
336          <step-1>ILLUMINACLIP:adapter.fa:3:12:7:1:true MINLEN:30</step-1>
337          <!-- The second step is for all other filters -->
338          <step-2>MAXINFO:30:0.25 MINLEN:30</step-2>
339        </trimmomatic>
340       
341       
342      </align-mips>
343     
344      <mbaf>
345        <!-- parallel environment option to the queue system -->
346        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
347        <parallel-environment>smp 8-16</parallel-environment>
348       
349        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
350        <!-- this should probably be the same as in <align-hisat>/<haplotypecaller-ref> -->
351        <haplotypecaller-ref>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</haplotypecaller-ref>
352       
353        <!-- relative path from <reference-folder> to VCF file with SNPs that we should look for -->
354        <haplotypecaller-dbsnp>scanb/genotyping-mbaf-snp_oct2018.vcf</haplotypecaller-dbsnp>
355       
356        <!-- static options for the HaplotypeCaller step -->
357        <haplotypecaller-options>-stand_call_conf 20 --filter_reads_with_N_cigar --no_cmdline_in_header</haplotypecaller-options>
358      </mbaf>
359     
360      <!-- settings for variant calling -->
361      <variant-call>
362        <!-- parallel environment option to the queue system -->
363        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
364        <parallel-environment>smp 8-16</parallel-environment>
365
366        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
367        <!-- this should probably be the same as in <align-hisat>/<haplotypecaller-ref> -->
368        <genome-fasta>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</genome-fasta>
369       
370        <!-- Full path to base directory with databases and other stuff needed by the pipeline -->
371        <!-- This value can be used in other options as ${BaseDir} -->
372        <base-dir>${ReferenceDir}/scanb/rnaseqvarcall-feb2020</base-dir>
373       
374        <!-- static options for 'mosdepth' for regular and debug modes (optional) -->
375        <mosdepth-options></mosdepth-options>
376        <mosdepth-options-debug>-c chr6</mosdepth-options-debug>
377       
378        <!-- the required depth for a base to be callable for variants (optional, default=5) -->
379        <min-depth>5</min-depth>
380       
381        <!-- static options for VarDict (required) -->
382        <vardict-options>-f 0.02 -c 1 -S 2 -E 3 -g 4 -Q 20 -r 2 -q 20 --nosv</vardict-options>
383       
384        <!-- static options for var2vcf_valid.pl (required) -->
385        <var2vcf-options>-A -f 0.02</var2vcf-options>
386       
387        <!-- static options for vcfanno command line (required) -->
388        <!-- See https://github.com/brentp/vcfanno for more information -->
389        <vcfanno-options>-p 8 -lua ${BaseDir}/vcfanno.lua -base-path ${BaseDir} ${BaseDir}/allDbs.toml</vcfanno-options>
390       
391        <!-- static options for the snpEff command (required) -->
392        <snpeff-options>-configOption data.dir=${BaseDir}/snpEff_v4_3_hg38/data -noLog -noStats -canon hg38</snpeff-options>
393
394        <!-- static options for the SnpSift command (required) -->
395        <snpsift-options>-s ${BaseDir}/rna_chr_set.txt -s ${BaseDir}/intogen-BRCA-genes-list_patch.txt -e ${BaseDir}/filter_expression.txt</snpsift-options>
396       
397        <!-- path to the COSMIC mutation signature data -->
398        <mutation-signature>${BaseDir}/COSMIC_Cancer_signatures_probabilities.RData</mutation-signature>
399      </variant-call>
400     
401      <cufflinks>
402        <!-- parallel environment option to the queue system -->
403        <!-- the default setting uses between 8 and 16 slots on hosts with at least 8 slots available -->
404        <parallel-environment>smp 8-16</parallel-environment>
405 
406        <!-- relative path from <reference-folder> to the reference genome used by cufflinks -->
407        <reference-gidx>hg38/hg38.analysisSet/hg38.analysisSet.fa</reference-gidx>
408        <reference-gtf>hg38/UCSC_hg38_knownGenes_22sep2014.gtf</reference-gtf>
409       
410        <!-- static options for cufflinks -->
411        <options>--multi-read-correct --library-type fr-firststrand --total-hits-norm --max-bundle-frags 10000000 --no-update-check --quiet</options>
412       
413        <!-- if the aligned sequences item has more reads than this limit (when running in debug mode) -->
414        <!-- the accepted_hits.bam will be limited to chr1 before running cufflinks -->
415        <debug-max-aligned>2000000</debug-max-aligned>
416       
417        <!-- path to a file containing pairs of tracking_id values -->
418        <!-- *.fpkm_tracking files are searched and values from the -->
419        <!-- second column are replaced with values in the first column -->
420        <!-- If no mapping file is specified, no replacement is done -->
421        <tracking-id-map>hg38/UCSC_hg38_knownGenes_22sep2014_duplicate_transcript_id.txt</tracking-id-map>
422      </cufflinks>
423     
424      <stringtie>
425        <!-- parallel environment option to the queue system -->
426        <!-- the default setting uses between 8 and 16 slots on hosts with at least 8 slots available -->
427        <parallel-environment>smp 8-16</parallel-environment>
428       
429        <!-- relative path from <reference-folder> to the reference genome used by stringtie -->
430        <reference-gtf>hg38/hg38.analysisSet_gencode27_snp150/gencode.v27.primary_assembly.annotation_subset_transcripttype_proteincoding.gtf</reference-gtf>
431
432        <!-- static options for stringtie -->
433        <options>--rf -B -e</options>
434
435      </stringtie>
436    </host>
437 
438   
439  </remote-hosts>
440
441</reggie>
Note: See TracBrowser for help on using the repository browser.