source: extensions/net.sf.basedb.reggie/trunk/config/reggie-config.xml @ 5827

Last change on this file since 5827 was 5827, checked in by Nicklas Nordborg, 3 years ago

References #1218: Implement MIPs alignment

Added Trimmomatic steps to the script. Logging and error handling is not fully implemented yet.

File size: 21.7 KB
Line 
1<?xml version="1.0" encoding="UTF-8"?>
2<reggie>
3
4  <!-- Section for enabling/disabling experimental features -->
5  <!-- The list of features that are considered experimental may change over time -->
6  <!-- 0=The feature is disabled, 1=The feature is enabled -->
7  <experimental-features>
8  </experimental-features>
9 
10  <!-- Configuration options related to how external samples (RNA or DNA) are handled -->
11  <external-samples>
12    <!-- Files generated in the secondary analysis can be shared with read permission to -->
13    <!-- a group if this is specified here. The prefix attribute is the sample name prefix -->
14    <!-- and the value is the group name. This translates to a 'chgrp' command in the secondary -->
15    <!-- analysis. Samples with a prefix that is not mapped here are not shared to other groups. -->
16    <!--  <groupname prefix="BR">brcalab</groupname> -->
17  </external-samples>
18
19  <!-- Settings for the Activity log that is displayed on the Reggie start page -->
20  <activity-log>
21    <!-- Max number of entries to display in the log (exception: all events within the last two days are always displayed) -->
22    <max-entries>35</max-entries>
23    <!-- Max age (in days) of entries to display (even if the max number hasn't been reached) -->
24    <max-age-in-days>14</max-age-in-days>
25    <quote-of-the-day>
26      <!-- URL to quote-of-the-day endpoint (optional, set an empty URL to disable this feature) -->
27      <url>https://quotes.rest/qod.json</url>
28      <!-- Default is 12 hours; do not set to less than 3600 since the external API has a limit -->
29      <max-age-in-seconds>43200</max-age-in-seconds>
30    </quote-of-the-day>
31  </activity-log>
32
33  <!-- Options related to R that is executed on the local server -->
34  <rscript>
35    <!-- Full or partial path to 'Rscript' executable -->
36    <path>Rscript</path>
37    <!-- Set the locale to use when running R -->
38    <!-- If not set, use whatever locale the operating system provides -->
39    <locale>en_US.UTF-8</locale>
40   
41    <!-- options for the 'geneReport' script -->
42    <gene-report>
43      <!-- full path to the R script -->
44      <path>/path/to/R_RNAseq_scanb_geneReport.R</path>
45      <!-- full path to directory with SCAN-B reference data -->
46      <!-- default is same directory as the R script -->
47      <ref-dir-scanb></ref-dir-scanb>
48      <!-- full path to directory with validation reference data -->
49      <!-- default is same directory as the R script -->
50      <ref-dir-validation></ref-dir-validation>
51      <!-- full path to the PDF template -->
52      <!-- default is 'template.pdf' in the same directory as the R script -->
53      <template></template>
54      <!-- file name in BASE for storing the generated report  -->
55      <pdf-name>genereport.pdf</pdf-name>
56    </gene-report>
57   
58    <!-- options for the 'pilot report' script -->
59    <pilot-report>
60      <!-- full path to the R script -->
61      <path>/path/to/pilot-report.R</path>
62      <!-- full path to directory with reference data -->
63      <!-- default is 'referenceData' directory inside -->
64      <!-- the same directory as the R script -->
65      <ref-dir></ref-dir>
66      <!-- full path to directory with source code -->
67      <!-- default is 'source' directory inside -->
68      <!-- the same directory as the R script -->
69      <source-dir></source-dir>
70      <!-- full path to the PDF template -->
71      <!-- default is 'template.pdf' in the same directory as the R script -->
72      <template></template>
73      <!-- file name in BASE for storing the generated report  -->
74      <pdf-name>pilotreport.pdf</pdf-name>
75    </pilot-report>
76   
77  </rscript>
78
79  <!-- Logotype information for the different sites -->
80  <!-- Uncomment as needed and set full path to image file -->
81  <!-- Supported file formats: WMF, PNG, JPG (and possibly more) -->
82  <logos>
83    <!-- <region-skåne></region-skåne>  -->
84    <!-- <landstinget-kronoberg></landstinget-kronoberg>  -->
85    <!-- <uppsala-landsting></uppsala-landsting>  -->
86    <!-- <region-halland></region-halland>  -->
87    <!-- <landstinget-blekinge></landstinget-blekinge>  -->
88    <!-- <jönköpings-län></jönköpings-län>  -->
89  </logos>
90
91  <remote-hosts>
92    <!-- one or more host entries. Each entry should match an -->
93    <!-- entry in the opengrid-config.xml. The 'ID' of an Open Grid cluster -->
94    <!-- is a combination of the username, address and port: user@host:port -->
95    <!-- A comma-separated list is allowed -->
96    <!-- Note that the default port number (22) must be included in the ID  -->
97    <!-- even if it is not specified in the opengrid-config.xml file. -->
98 
99    <host 
100      id="user@address:port in opengrid-config.xml (one or more separated by comma)"
101      >
102     
103      <!-- full path to the location where HiSeq/NextSeq data is stored (required) -->
104      <run-archive>/casa2/run_archive</run-archive>
105      <!-- Alternate paths in search order in case data is not found in the primary -->
106      <!-- run archive. Add more entries as needed, but it is important that they -->
107      <!-- are numbered in strictly increasing order from '2' and up. -->
108      <run-archive-2></run-archive-2>
109     
110      <!-- Full path to the location where data files should be archived (required) -->
111      <!-- The path should include the name of the project -->
112      <project-archive>/casa4/project_archive/scanb</project-archive>
113      <!-- Full path to the location where external data files should be archived (optional) -->
114      <!-- If not specified, the 'project-archive' path is used -->
115      <external-archive></external-archive>
116     
117      <!-- Full path to the root location where reference genomes are located -->
118      <!-- Do not include name of project -->
119      <reference-folder>/reference</reference-folder>
120     
121      <!-- Information about programs used by reggie -->
122      <!-- Unless otherwise noted, all paths must be the same on all nodes -->
123      <programs>
124        <java>
125          <!-- full path to java binary to use (1.8 is required by GATK!) -->
126          <path>/usr/local/packages/jre/8.0_144/bin/java</path>
127        </java>
128        <pipeline-scripts>
129          <!-- folder where the pipeline scripts are located (required). -->
130          <path>/home/scanb/lorry-pipeline/pipeline-2.16</path>
131        </pipeline-scripts>
132        <picard>
133          <!-- full path to the directory with Picard jar files (required) -->
134          <path>/usr/local/packages/picard-tools/2.20.8</path>
135        </picard>
136        <genseq>
137          <!-- full path to the genseq_check_illumina_dir.pl script (required) -->
138          <path>/usr/local/packages/genseq_tools/v0.01/genseq_check_illumina_dir.pl</path>
139        </genseq>
140        <trimmomatic>
141          <!-- full path to the JAR file with the Trimmomatic program (required) -->
142          <path>/usr/local/packages/trimmomatic/0.32/trimmomatic-0.32.jar</path>
143          <!-- full path to the file with Illumina adapter information -->
144          <adapter-file>/usr/local/packages/trimmomatic/0.32/adapters/TruSeq3-PE-2.fa</adapter-file>
145        </trimmomatic>
146        <bowtie2>
147          <!-- full or partial path to bowtie2 (required) -->
148          <path>/usr/local/packages/bowtie/2.2.4/bin/bowtie2</path>
149        </bowtie2>
150        <tophat>
151          <!-- full or partial path to tophat (required) -->
152          <path>/usr/local/packages/tophat/2.0.12/bin/tophat</path>
153        </tophat>
154        <hisat>
155          <!-- full or partial path to hisat (required) -->
156          <path>/usr/local/packages/hisat/2.1.0/bin/hisat2</path>
157        </hisat>
158        <samtools>
159          <!-- full or partial path to samtools (required) -->
160          <path>/usr/local/packages/samtools/1.4/samtools</path>
161        </samtools>
162        <bedtools>
163          <!-- full or partial path to bedtools (required) -->
164          <path>/usr/local/packages/bedtools/2.26.0/bin/bedtools</path>
165        </bedtools>
166        <cufflinks>
167          <!-- full or partial path to cufflinks (required) -->
168          <path>/usr/local/packages/cufflinks/2.2.1/bin/cufflinks</path>
169        </cufflinks>
170        <stringtie>
171          <!-- full or partial path to stringtie (required) -->
172          <path>/usr/local/packages/stringtie/1.3.3b/bin/stringtie</path>
173        </stringtie>
174        <gatk>
175          <!-- full path to GenomeAnalysisToolkit JAR file (required) -->
176          <path>/usr/local/packages/GenomeAnalysisTK/3.8/GenomeAnalysisTK.jar</path>
177        </gatk>
178        <mosdepth>
179          <!-- full or partial path to mosdepth (required) -->
180          <path>/usr/local/packages/mosdepth/0.2.6/bin/mosdepth</path>
181        </mosdepth>
182        <vardict>
183          <!-- path to the directory where VarDict is installed -->
184          <!-- NOTE! not including the 'bin/VarDict' part since -->
185          <!-- that will be added automatically -->
186          <path>/usr/local/packages/vardict/1.6.0</path>
187        </vardict>
188        <vcfanno>
189          <!-- full or partial path to vcfanno (required) -->
190          <path>/usr/local/packages/vcfanno/0.3.2/bin/vcfanno</path>
191        </vcfanno>
192        <snpeff>
193          <!-- full path to the snpEff.jar file (required) -->
194          <path>/usr/local/packages/snpeff/4.3s/snpEff.jar</path>
195        </snpeff>
196        <snpsift>
197          <!-- full path to the SnpSift.jar file (required) -->
198          <path>/usr/local/packages/snpeff/4.3s/SnpSift.jar</path>
199        </snpsift>
200      </programs>
201     
202      <!-- priority values that are selectable in the web interface -->
203      <!-- allowed range is -1023 to 1024 -->
204      <!-- NOTE! positive values require special permissions on the cluster -->
205      <priorities>
206        <!-- <priority name="high" value="500" /> -->
207        <priority name="normal" value="0" default="true" />
208        <priority name="low" value="-500" />
209      </priorities>
210     
211      <!-- settings for the demuxing step (RNAseq) -->
212      <demux>
213        <!-- parallel environment option to the queue system -->
214        <!-- the default setting requests 4 slots -->
215        <parallel-environment>smp 4-4</parallel-environment>
216        <!-- Number of open files to set with 'ulimit -n' command -->
217        <!-- if not specified, the default on the server is used -->
218        <ulimit></ulimit>
219        <!-- amount of memory to give to Picard (default is 50g) -->
220        <picard-memory>50g</picard-memory>
221        <!-- static options for the picard ExtractIlluminaBarcodes step -->
222        <extract-options>-QUIET true -VERBOSITY WARNING</extract-options>
223        <!-- static options for the picard IlluminaBasecallsToFastq step -->
224        <fastq-options>-INCLUDE_NON_PF_READS false -MAX_READS_IN_RAM_PER_TILE 5000000 -QUIET true -VERBOSITY WARNING</fastq-options>
225        <!-- number of tiles to process when debugging (default=2 (HiSeq), 16 (NextSeq)) -->
226        <debug-tile-limit-hiseq>2</debug-tile-limit-hiseq>
227        <debug-tile-limit-nextseq>16</debug-tile-limit-nextseq>
228        <!-- static options for Bowtie when used for estimating fragment size -->
229        <bowtie-options>-q --fr -k 1 --phred33 --local --no-hd --no-unal -t -u 100000</bowtie-options>
230        <!-- the smallest number of fragments that must have been used in the fragment -->
231        <!-- size estimation, or we will set FragmentSizeAvg and FragmentSizeStdev to -1 -->
232        <bowtie-fragment-count-limit>20000</bowtie-fragment-count-limit>
233        <!-- static options for Trimmomatic -->
234        <trimmomatic-options>
235          <!-- The first step should ONLY filter Illumina adapters -->
236          <step-1>ILLUMINACLIP:${AdapterFile}:2:30:12:1:true MINLEN:20</step-1>
237          <!-- The second step is for all other filters -->
238          <step-2>MAXINFO:40:0.9 MINLEN:20</step-2>
239        </trimmomatic-options>
240        <!-- static options for gzip compression with pigz (default=-5) -->
241        <!-- NOTE! Number of threads (-p) is set automatically and should not be included here -->
242        <pigz-options>-5</pigz-options>
243      </demux>
244 
245      <!-- settings for the demuxing step (MIPs) -->
246      <demux-mips>
247        <!-- parallel environment option to the queue system -->
248        <!-- the default setting requests 8-16 slots -->
249        <parallel-environment>smp 8-16</parallel-environment>
250        <!-- amount of memory to give to Picard (default is 50g) -->
251        <picard-memory>50g</picard-memory>
252        <!-- static options for the picard ExtractIlluminaBarcodes step -->
253        <extract-options>-MINIMUM_BASE_QUALITY 0 -MINIMUM_QUALITY 2 -MAX_MISMATCHES 2 -MIN_MISMATCH_DELTA 2 -MAX_NO_CALLS 2 -QUIET true -VERBOSITY WARNING</extract-options>
254        <!-- static options for the picard IlluminaBasecallsToFastq step -->
255        <fastq-options>-INCLUDE_NON_PF_READS false -APPLY_EAMSS_FILTER false -MINIMUM_QUALITY 2 -MAX_READS_IN_RAM_PER_TILE 5000000 -QUIET true -VERBOSITY WARNING</fastq-options>
256        <!-- static options to put into the "Read group" files -->
257        <readgroup-options>PL=ILLUMINA CN=BRCAlab</readgroup-options>
258        <!-- number of tiles to process when debugging (default=2 (HiSeq), 16 (NextSeq)) -->
259        <debug-tile-limit-hiseq>2</debug-tile-limit-hiseq>
260        <debug-tile-limit-nextseq>16</debug-tile-limit-nextseq>
261        <!-- static options for gzip compression with pigz (default=-5) -->
262        <!-- NOTE! Number of threads (-p) is set automatically and should not be included here -->
263        <pigz-options>-5</pigz-options>
264      </demux-mips>
265 
266      <mask>
267        <!-- relative path from <reference-folder> to the reference genome used for masking -->
268        <!-- This is the -x option used for bowtie -->
269        <reference-name>scanb/ribo_phix_repeats_filter/ribo_phix_repeats_filter</reference-name>
270       
271        <!-- static options for bowtie -->
272        <bowtie-options>-q --fr -k 1 --phred33 -t --local</bowtie-options>
273       
274        <!-- max number of sequences to align when running in debug mode (default=2 million) -->
275        <debug-max-align>2000000</debug-max-align>
276      </mask>
277 
278      <align>
279        <!-- relative path from <reference-folder> to the reference genome used for alignment -->
280        <!-- TODO selectable in GUI? saved as annotation? -->
281        <reference-gidx>hg38/hg38.analysisSet/hg38.analysisSet</reference-gidx>
282        <reference-tidx>hg38/UCSC_hg38_knownGenes_22sep2014/knownGenes.vs.hg38.analysisSet</reference-tidx>
283       
284        <!-- static options for tophat -->
285        <tophat-options>--library-type fr-firststrand --keep-fasta-order --no-coverage-search --max-insertion-length 20 --max-deletion-length 20 --read-gap-length 20 --read-edit-dist 22</tophat-options>
286        <!-- adjustment values for the 'mate-inner-dist' and 'mate-std-dev' -->
287        <!-- parameters to tophat. The specified values are added to those -->
288        <!-- calculated by bowtie -->
289        <adjust-mate-inner-dist>13</adjust-mate-inner-dist>
290        <adjust-mate-std-dev>10</adjust-mate-std-dev>
291       
292        <!-- static options for the picard MarkDuplicates step -->
293        <mark-duplicates-options>-REMOVE_DUPLICATES false -ASSUME_SORTED true -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 2000 -QUIET true -VERBOSITY WARNING</mark-duplicates-options>
294      </align>
295     
296      <!-- settings for aligning with Hisat -->
297      <align-hisat>
298        <!-- parallel environment option to the queue system -->
299        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
300        <parallel-environment>smp 8-16</parallel-environment>
301       
302        <!-- relative path from <reference-folder> to the reference genome used for alignment -->
303        <reference-tidx>hg38/hg38.analysisSet_gencode27_snp150/genome_snp_tran</reference-tidx>
304       
305        <!-- static options for hisat -->
306        <hisat-options>-q --fr --phred33 -t --dta --dta-cufflink --new-summary --no-unal --non-deterministic --novel-splicesite-outfile aligned/splicesites.tsv --rna-strandness RF --summary-file aligned/summary.txt --rg PL:Illumina --rg CN:SCANB-prim</hisat-options>
307       
308        <!-- static options for the picard MarkDuplicates step -->
309        <mark-duplicates-options>-REMOVE_DUPLICATES false -ASSUME_SORTED true -MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 2000 -QUIET true -VERBOSITY WARNING</mark-duplicates-options>
310       
311        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
312        <haplotypecaller-ref>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</haplotypecaller-ref>
313       
314        <!-- relative path from <reference-folder> to VCF file with SNPs that we should look for -->
315        <haplotypecaller-dbsnp>scanb/genotyping-213-snp_feb2018.vcf</haplotypecaller-dbsnp>
316       
317        <!-- static options for the HaplotypeCaller step -->
318        <haplotypecaller-options>-stand_call_conf 20 --filter_reads_with_N_cigar --annotation AlleleBalance --no_cmdline_in_header</haplotypecaller-options>
319      </align-hisat>
320     
321      <!-- settings for aligning MIPs sequencing -->
322      <align-mips>
323        <!-- parallel environment option to the queue system -->
324        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
325        <parallel-environment>smp 8-16</parallel-environment>
326       
327        <!-- Options for Trimmomatic -->
328        <trimmomatic>
329          <!-- Optional path to Trimmomatic, if not specified the default in the 'programs' section is used -->
330          <path>/usr/local/packages/trimmomatic/0.39/trimmomatic.jar</path>
331          <!-- The first step should filter Illumina adapters -->
332          <step-1>ILLUMINACLIP:adapter.fa:3:12:7:1:true MINLEN:30</step-1>
333          <!-- The second step is for all other filters -->
334          <step-2>MAXINFO:30:0.25 MINLEN:30</step-2>
335        </trimmomatic>
336       
337      </align-mips>
338     
339      <mbaf>
340        <!-- parallel environment option to the queue system -->
341        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
342        <parallel-environment>smp 8-16</parallel-environment>
343       
344        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
345        <!-- this should probably be the same as in <align-hisat>/<haplotypecaller-ref> -->
346        <haplotypecaller-ref>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</haplotypecaller-ref>
347       
348        <!-- relative path from <reference-folder> to VCF file with SNPs that we should look for -->
349        <haplotypecaller-dbsnp>scanb/genotyping-mbaf-snp_oct2018.vcf</haplotypecaller-dbsnp>
350       
351        <!-- static options for the HaplotypeCaller step -->
352        <haplotypecaller-options>-stand_call_conf 20 --filter_reads_with_N_cigar --no_cmdline_in_header</haplotypecaller-options>
353      </mbaf>
354     
355      <!-- settings for variant calling -->
356      <variant-call>
357        <!-- parallel environment option to the queue system -->
358        <!-- the default setting uses up to 16 slots on hosts with at least 8 slots available -->
359        <parallel-environment>smp 8-16</parallel-environment>
360
361        <!-- relative path from <reference-folder> to FASTA file used as reference for the alignment  -->
362        <!-- this should probably be the same as in <align-hisat>/<haplotypecaller-ref> -->
363        <genome-fasta>hg38/hg38.analysisSet_gencode27_snp150/hg38.analysisSet_gencodeid.fa</genome-fasta>
364       
365        <!-- Full path to base directory with databases and other stuff needed by the pipeline -->
366        <!-- This value can be used in other options as ${BaseDir} -->
367        <base-dir>${ReferenceDir}/scanb/rnaseqvarcall-feb2020</base-dir>
368       
369        <!-- static options for 'mosdepth' for regular and debug modes (optional) -->
370        <mosdepth-options></mosdepth-options>
371        <mosdepth-options-debug>-c chr6</mosdepth-options-debug>
372       
373        <!-- the required depth for a base to be callable for variants (optional, default=5) -->
374        <min-depth>5</min-depth>
375       
376        <!-- static options for VarDict (required) -->
377        <vardict-options>-f 0.02 -c 1 -S 2 -E 3 -g 4 -Q 20 -r 2 -q 20 --nosv</vardict-options>
378       
379        <!-- static options for var2vcf_valid.pl (required) -->
380        <var2vcf-options>-A -f 0.02</var2vcf-options>
381       
382        <!-- static options for vcfanno command line (required) -->
383        <!-- See https://github.com/brentp/vcfanno for more information -->
384        <vcfanno-options>-p 8 -lua ${BaseDir}/vcfanno.lua -base-path ${BaseDir} ${BaseDir}/allDbs.toml</vcfanno-options>
385       
386        <!-- static options for the snpEff command (required) -->
387        <snpeff-options>-configOption data.dir=${BaseDir}/snpEff_v4_3_hg38/data -noLog -noStats -canon hg38</snpeff-options>
388
389        <!-- static options for the SnpSift command (required) -->
390        <snpsift-options>-s ${BaseDir}/rna_chr_set.txt -s ${BaseDir}/intogen-BRCA-genes-list_patch.txt -e ${BaseDir}/filter_expression.txt</snpsift-options>
391       
392        <!-- path to the COSMIC mutation signature data -->
393        <mutation-signature>${BaseDir}/COSMIC_Cancer_signatures_probabilities.RData</mutation-signature>
394      </variant-call>
395     
396      <cufflinks>
397        <!-- parallel environment option to the queue system -->
398        <!-- the default setting uses between 8 and 16 slots on hosts with at least 8 slots available -->
399        <parallel-environment>smp 8-16</parallel-environment>
400 
401        <!-- relative path from <reference-folder> to the reference genome used by cufflinks -->
402        <reference-gidx>hg38/hg38.analysisSet/hg38.analysisSet.fa</reference-gidx>
403        <reference-gtf>hg38/UCSC_hg38_knownGenes_22sep2014.gtf</reference-gtf>
404       
405        <!-- static options for cufflinks -->
406        <options>--multi-read-correct --library-type fr-firststrand --total-hits-norm --max-bundle-frags 10000000 --no-update-check --quiet</options>
407       
408        <!-- if the aligned sequences item has more reads than this limit (when running in debug mode) -->
409        <!-- the accepted_hits.bam will be limited to chr1 before running cufflinks -->
410        <debug-max-aligned>2000000</debug-max-aligned>
411       
412        <!-- path to a file containing pairs of tracking_id values -->
413        <!-- *.fpkm_tracking files are searched and values from the -->
414        <!-- second column are replaced with values in the first column -->
415        <!-- If no mapping file is specified, no replacement is done -->
416        <tracking-id-map>hg38/UCSC_hg38_knownGenes_22sep2014_duplicate_transcript_id.txt</tracking-id-map>
417      </cufflinks>
418     
419      <stringtie>
420        <!-- parallel environment option to the queue system -->
421        <!-- the default setting uses between 8 and 16 slots on hosts with at least 8 slots available -->
422        <parallel-environment>smp 8-16</parallel-environment>
423       
424        <!-- relative path from <reference-folder> to the reference genome used by stringtie -->
425        <reference-gtf>hg38/hg38.analysisSet_gencode27_snp150/gencode.v27.primary_assembly.annotation_subset_transcripttype_proteincoding.gtf</reference-gtf>
426
427        <!-- static options for stringtie -->
428        <options>--rf -B -e</options>
429
430      </stringtie>
431    </host>
432 
433   
434  </remote-hosts>
435
436</reggie>
Note: See TracBrowser for help on using the repository browser.