1 | # $Id: mips_functions.sh 5838 2020-02-24 14:45:46Z nicklas $ |
---|
2 | # Helper functions for MIPS alignment. This script can't be used by itself. |
---|
3 | # It has been designed to be included in a main script that also need to |
---|
4 | # set several global variables. For details see each function definition. |
---|
5 | # |
---|
6 | # Copy FASTQ and RG files from/to given folders |
---|
7 | # Parameters: |
---|
8 | # $1: Folder with FASTQ and RG files |
---|
9 | # $2: Folder to copy the files to |
---|
10 | function copy_fastq_and_rg { |
---|
11 | local srcFolder=$1 |
---|
12 | local destFolder=$2 |
---|
13 | if [ ! -d "${srcFolder}" ]; then |
---|
14 | echo "Can't find FASTQ data folder ${srcFolder}" 1>&2 |
---|
15 | exit 1 |
---|
16 | fi |
---|
17 | cp ${srcFolder}/*.fastq.gz $2 |
---|
18 | cp ${srcFolder}/*.rg $2 |
---|
19 | } |
---|
20 | |
---|
21 | # Adapter sequences |
---|
22 | # Trimmomatic adapter sequences: |
---|
23 | # "For 'Palindrome' clipping, a matched pair of adapter sequences must be provided. |
---|
24 | # The sequence names should both start with 'Prefix', and end in '/1' for the |
---|
25 | # forward adapter and '/2' for the reverse adapter. The part of the name between |
---|
26 | # 'Prefix' and '/1' or '/2' must match exactly within each pair." |
---|
27 | # Checking the adapter fasta files provided by Trimmomatic, it seems as the |
---|
28 | # Prefix/1 adapter is the i5 adapter read from the flowcell surface (i.e. NOT |
---|
29 | # as it will appear in read2 when reading through a short fragment) |
---|
30 | # Prefix/2 adapter is the i7 adapter, also as read from the flowcell surface |
---|
31 | # MIPS BDC i5 adapter sequence after the index for trimmomatic: CATACGAGATCCGTAATCGGGAAGCTGAAG |
---|
32 | # MIPS BDC i7 adapter sequence after the index for trimmomatic: ACACGCACGATCCGACGGTAGTGT |
---|
33 | adapter1=CATACGAGATCCGTAATCGGGAAGCTGAAG |
---|
34 | adapter2=ACACGCACGATCCGACGGTAGTGT |
---|
35 | # Picard CollectAlignmentSummaryMetrics adapter sequences: |
---|
36 | # I haven't figured out in what direction it wants the adapters, |
---|
37 | # for simplicity, I use both directions |
---|
38 | # Reverse complement of MIPS BDC i5 adapter: CTTCAGCTTCCCGATTACGGATCTCGTATG |
---|
39 | # Reverse complement of MIPS BDC i7 adapter: ACACTACCGTCGGATCGTGCGTGT |
---|
40 | adapter1rc=CTTCAGCTTCCCGATTACGGATCTCGTATG |
---|
41 | adapter2rc=ACACTACCGTCGGATCGTGCGTGT |
---|
42 | # Novoalign adapter sequences: |
---|
43 | # Adapter1 is the i7 adapter sequence as it appears in read1 when reading |
---|
44 | # through a short fragment |
---|
45 | # Adapter2 is the i5 adapter sequence as it appears in read2 when reading |
---|
46 | # through as short fragment |
---|
47 | # MIPS BDC read1 (i7) adapter (before i7 index): ACACTACCGTCGGATCGTGCGTGT |
---|
48 | # MIPS BDC read2 (i5) adapter (before i5 index): CTTCAGCTTCCCGATTACGGATCTCGTATG |
---|
49 | # Use 12 first bases of each for novoalign (if using -a) |
---|
50 | novo_adapter1=ACACTACCGTCG |
---|
51 | novo_adapter2=CTTCAGCTTCCC |
---|
52 | |
---|
53 | # Create an adapter file that can be used with Trimmomatic for adapter trimming |
---|
54 | # Parameters: |
---|
55 | # $1: Name of the file to create |
---|
56 | function create_adapter_fa { |
---|
57 | local faFile=$1 |
---|
58 | echo -e ">PrefixPE/1\n${adapter1}\n>PrefixPE/2\n${adapter2}" > ${faFile} |
---|
59 | } |
---|
60 | |
---|
61 | # Run two Trimmomatic steps. First trim adapters and then quality |
---|
62 | # Parameters: |
---|
63 | # $1: Prefix to FASTQ files "_R?.fastq.gz" is added automatically |
---|
64 | # $2: Prefix for generated files (counter) |
---|
65 | # Global parameters: |
---|
66 | # $JAVA: Path to Java exectable |
---|
67 | # $TrimmomaticJAR: Path to Trimmomatic JAR file |
---|
68 | # $NumThreads: Number of threads to use |
---|
69 | # $TrimOptionsAdapter: Options for adapter trimming |
---|
70 | # $TrimOptionsQual: Options for quality trimming |
---|
71 | # $nAfterTrim: Counter for number of FASTQ files with data remaining after trimming |
---|
72 | function run_trimmomatic { |
---|
73 | local prefix=$1 |
---|
74 | local n=$2 |
---|
75 | |
---|
76 | ## Adapter trimming |
---|
77 | ./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \ |
---|
78 | -threads ${NumThreads} \ |
---|
79 | -phred33 \ |
---|
80 | -trimlog tmp/${n}.adaptrim.log \ |
---|
81 | fastq/${prefix}_R1.fastq.gz fastq/${prefix}_R2.fastq.gz \ |
---|
82 | tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.u.1.fastq tmp/${n}.adaptrim.2.fastq tmp/${n}.adaptrim.u.2.fastq \ |
---|
83 | ${TrimOptionsAdapter} \ |
---|
84 | >> trimmomatic.out |
---|
85 | |
---|
86 | if [ ! -s "tmp/${n}.adaptrim.1.fastq" ]; then |
---|
87 | rm -f tmp/${n}.adaptrim.*.fastq |
---|
88 | echo "${prefix}: No paired end reads surviving adapter trimming" |
---|
89 | return |
---|
90 | fi |
---|
91 | |
---|
92 | ## Quality trimming |
---|
93 | ./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \ |
---|
94 | -threads ${NumThreads} \ |
---|
95 | -phred33 \ |
---|
96 | -trimlog tmp/${n}.qualtrim.log \ |
---|
97 | tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.2.fastq \ |
---|
98 | tmp/${n}.qualtrim.1.fastq tmp/${n}.qualtrim.u.1.fastq tmp/${n}.qualtrim.2.fastq tmp/${n}.qualtrim.u.2.fastq \ |
---|
99 | ${TrimOptionsQual} \ |
---|
100 | >> trimmomatic.out |
---|
101 | |
---|
102 | if [ ! -s "tmp/${n}.qualtrim.1.fastq" ]; then |
---|
103 | rm -f tmp/${n}.qualtrim.*.fastq |
---|
104 | echo "${prefix}: No paired end reads surviving quality trimming" |
---|
105 | return |
---|
106 | fi |
---|
107 | |
---|
108 | nAfterTrim=$((nAfterTrim + 1)) |
---|
109 | } |
---|
110 | |
---|
111 | # Generate statistics from Trimmomatic log files |
---|
112 | # Parameters: |
---|
113 | # $1: Input file name pattern (*.log files from Trimmomatic) |
---|
114 | # $2: Output file name |
---|
115 | function trimlog_stats { |
---|
116 | local input=$1 |
---|
117 | local output=$2 |
---|
118 | |
---|
119 | echo -e "Trim\tCount" > ${output} |
---|
120 | if ls $input > /dev/null 2>&1; then |
---|
121 | awk '{print $ (NF)}' ${input} | sort -n | uniq -c | awk -v OFS="\t" '{print $2,$1}' >> ${output} |
---|
122 | fi |
---|
123 | } |
---|
124 | |
---|
125 | # Read RG file and convert to Picard2 syntax. |
---|
126 | # Parameters: |
---|
127 | # $1: Path to RG file |
---|
128 | # Returns: |
---|
129 | # Converted RG value |
---|
130 | function read_and_fix_rg { |
---|
131 | local rg=$1 |
---|
132 | |
---|
133 | read -r DATA < ${rg} |
---|
134 | |
---|
135 | DATA=${DATA/RG=/-READ_GROUP_NAME } |
---|
136 | DATA=${DATA/SM=/-SAMPLE_NAME } |
---|
137 | DATA=${DATA/LB=/-LIBRARY_NAME } |
---|
138 | DATA=${DATA/PU=/-PLATFORM_UNIT } |
---|
139 | DATA=${DATA/DT=/-RUN_DATE } |
---|
140 | DATA=${DATA/PL=/-PLATFORM } |
---|
141 | DATA=${DATA/CN=/-SEQUENCING_CENTER } |
---|
142 | |
---|
143 | echo ${DATA} |
---|
144 | } |
---|
145 | |
---|
146 | # Run Picard to convert FASTQ files to BAM file |
---|
147 | # Parameters: |
---|
148 | # $1: Prefix to RG file ".rg" is added automatically |
---|
149 | # $2: Prefix for generated files (counter) |
---|
150 | function fastq_to_bam { |
---|
151 | local prefix=$1 |
---|
152 | local n=$2 |
---|
153 | |
---|
154 | if [ ! -f tmp/${n}.qualtrim.1.fastq ]; then |
---|
155 | return |
---|
156 | fi |
---|
157 | |
---|
158 | local rg=$(read_and_fix_rg "fastq/${prefix}.rg") |
---|
159 | |
---|
160 | ./stdwrap.sh ./picard2 FastqToSam \ |
---|
161 | -SORT_ORDER queryname \ |
---|
162 | -QUALITY_FORMAT Standard \ |
---|
163 | ${rg} \ |
---|
164 | -FASTQ tmp/${n}.qualtrim.1.fastq \ |
---|
165 | -FASTQ2 tmp/${n}.qualtrim.2.fastq \ |
---|
166 | -OUTPUT tmp/${n}.u.bam \ |
---|
167 | >> fastqtosam.out |
---|
168 | } |
---|
169 | |
---|
170 | # Run FgBio to annotate the BAM file with UMI information |
---|
171 | # Parameters: |
---|
172 | # $1: Prefix to UMI file "_UMI.fastq.gz" is added automatically |
---|
173 | # $2: Prefix for generated files (counter) |
---|
174 | # Global parameters: |
---|
175 | # $JAVA: Path to Java exectable |
---|
176 | # $FgBioJAR: Path to FgBio JAR file |
---|
177 | function annotate_umi { |
---|
178 | local prefix=$1 |
---|
179 | local n=$2 |
---|
180 | |
---|
181 | if [ ! -f tmp/${n}.u.bam ]; then |
---|
182 | return |
---|
183 | fi |
---|
184 | |
---|
185 | ./stdwrap.sh ${JAVA} -jar ${FgBioJAR} AnnotateBamWithUmis \ |
---|
186 | --fail-fast true -t RX \ |
---|
187 | -i tmp/${n}.u.bam \ |
---|
188 | -f fastq/${prefix}_UMI.fastq.gz \ |
---|
189 | -o tmp/${n}.umi.bam \ |
---|
190 | >> annotatebam.out |
---|
191 | |
---|
192 | } |
---|
193 | |
---|
194 | # Align with novoalign |
---|
195 | # Parameters: |
---|
196 | # $1: Prefix of sample (used in output files) |
---|
197 | # $2: Prefix for generated files (counter) |
---|
198 | # Global parameters: |
---|
199 | # $NovoAlign: Path to novoalign exectable |
---|
200 | # $NovoAlignOptions: Options for novoalign |
---|
201 | # $NovoIndex: Path to index file |
---|
202 | # $AmpliconsBed: Path to amplicons BED file |
---|
203 | # $NovoSort: Path to novosort executable |
---|
204 | # $NumThreads: Number of threads to use |
---|
205 | # $AlignedBams: For returning paths to aligned BAM files (used for input in the merge step) |
---|
206 | function novoalign { |
---|
207 | local prefix=$1 |
---|
208 | local n=$2 |
---|
209 | |
---|
210 | if [ ! -f tmp/${n}.umi.bam ]; then |
---|
211 | return |
---|
212 | fi |
---|
213 | |
---|
214 | ./stderrwrap.sh ${NovoAlign} \ |
---|
215 | -c ${NumThreads} \ |
---|
216 | ${NovoAlignOptions} \ |
---|
217 | -d ${NovoIndex} \ |
---|
218 | --tags MC,ZP \ |
---|
219 | -i PE ${min_insert}-${max_insert} \ |
---|
220 | --amplicons ${AmpliconsBed} \ |
---|
221 | 1 out/${prefix}.novo_coverage.bed \ |
---|
222 | -f tmp/${n}.umi.bam \ |
---|
223 | > tmp/${n}.nosort.bam \ |
---|
224 | 3> out/${prefix}.novo.log |
---|
225 | |
---|
226 | ./stderrwrap.sh ${NovoSort} \ |
---|
227 | -c ${NumThreads} -t . -i \ |
---|
228 | -o tmp/${n}.novo.bam \ |
---|
229 | tmp/${n}.nosort.bam \ |
---|
230 | 3> tmp/${n}.sort.log |
---|
231 | |
---|
232 | AlignedBams="${AlignedBams} -INPUT tmp/${n}.novo.bam" |
---|
233 | } |
---|
234 | |
---|
235 | # Merge BAM files |
---|
236 | # Global parameters: |
---|
237 | # $AlignedBams: BAM files to merge (created by 'novoalign') |
---|
238 | function merge_bam { |
---|
239 | |
---|
240 | ./stdwrap.sh ./picard2 MergeSamFiles \ |
---|
241 | -SORT_ORDER coordinate -ASSUME_SORTED true \ |
---|
242 | ${AlignedBams} \ |
---|
243 | -OUTPUT tmp/novo.bam \ |
---|
244 | > mergebam.out |
---|
245 | } |
---|
246 | |
---|