source: other/pipeline/trunk/mips_functions.sh @ 5838

Last change on this file since 5838 was 5838, checked in by Nicklas Nordborg, 3 years ago

References #1218: Implement MIPs alignment

Added functions for the novoalign and merge steps.

  • Property svn:eol-style set to LF
  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 7.5 KB
Line 
1# $Id: mips_functions.sh 5838 2020-02-24 14:45:46Z nicklas $
2# Helper functions for MIPS alignment. This script can't be used by itself.
3# It has been designed to be included in a main script that also need to
4#  set several global variables. For details see each function definition.
5#
6# Copy FASTQ and RG files from/to given folders
7# Parameters:
8#   $1: Folder with FASTQ and RG files
9#   $2: Folder to copy the files to         
10function copy_fastq_and_rg {
11  local srcFolder=$1
12  local destFolder=$2
13  if [ ! -d "${srcFolder}" ]; then
14    echo "Can't find FASTQ data folder ${srcFolder}" 1>&2
15    exit 1
16  fi
17  cp ${srcFolder}/*.fastq.gz $2
18  cp ${srcFolder}/*.rg $2
19}
20
21# Adapter sequences
22# Trimmomatic adapter sequences:
23# "For 'Palindrome' clipping, a matched pair of adapter sequences must be provided.
24#  The sequence names should both start with 'Prefix', and end in '/1' for the
25#  forward adapter and '/2' for the reverse adapter. The part of the name between
26# 'Prefix' and '/1' or '/2' must match exactly within each pair."
27# Checking the adapter fasta files provided by Trimmomatic, it seems as the
28# Prefix/1 adapter is the i5 adapter read from the flowcell surface (i.e. NOT
29# as it will appear in read2 when reading through a short fragment)
30# Prefix/2 adapter is the i7 adapter, also as read from the flowcell surface
31# MIPS BDC i5 adapter sequence after the index for trimmomatic: CATACGAGATCCGTAATCGGGAAGCTGAAG
32# MIPS BDC i7 adapter sequence after the index for trimmomatic: ACACGCACGATCCGACGGTAGTGT
33adapter1=CATACGAGATCCGTAATCGGGAAGCTGAAG
34adapter2=ACACGCACGATCCGACGGTAGTGT
35# Picard CollectAlignmentSummaryMetrics adapter sequences:
36# I haven't figured out in what direction it wants the adapters,
37# for simplicity, I use both directions
38# Reverse complement of MIPS BDC i5 adapter: CTTCAGCTTCCCGATTACGGATCTCGTATG
39# Reverse complement of MIPS BDC i7 adapter: ACACTACCGTCGGATCGTGCGTGT
40adapter1rc=CTTCAGCTTCCCGATTACGGATCTCGTATG
41adapter2rc=ACACTACCGTCGGATCGTGCGTGT
42# Novoalign adapter sequences:
43# Adapter1 is the i7 adapter sequence as it appears in read1 when reading
44# through a short fragment
45# Adapter2 is the i5 adapter sequence as it appears in read2 when reading
46# through as short fragment
47# MIPS BDC read1 (i7) adapter (before i7 index): ACACTACCGTCGGATCGTGCGTGT
48# MIPS BDC read2 (i5) adapter (before i5 index): CTTCAGCTTCCCGATTACGGATCTCGTATG
49# Use 12 first bases of each for novoalign (if using -a)
50novo_adapter1=ACACTACCGTCG
51novo_adapter2=CTTCAGCTTCCC
52
53# Create an adapter file that can be used with Trimmomatic for adapter trimming
54# Parameters:
55#   $1: Name of the file to create
56function create_adapter_fa {
57  local faFile=$1
58  echo -e ">PrefixPE/1\n${adapter1}\n>PrefixPE/2\n${adapter2}" > ${faFile}
59}
60
61# Run two Trimmomatic steps. First trim adapters and then quality
62# Parameters:
63#   $1: Prefix to FASTQ files "_R?.fastq.gz" is added automatically
64#   $2: Prefix for generated files (counter)
65# Global parameters:
66#   $JAVA: Path to Java exectable
67#   $TrimmomaticJAR: Path to Trimmomatic JAR file
68#   $NumThreads: Number of threads to use
69#   $TrimOptionsAdapter: Options for adapter trimming
70#   $TrimOptionsQual: Options for quality trimming
71#   $nAfterTrim: Counter for number of FASTQ files with data remaining after trimming
72function run_trimmomatic {
73  local prefix=$1
74  local n=$2
75 
76  ## Adapter trimming
77  ./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \
78    -threads ${NumThreads} \
79    -phred33 \
80    -trimlog tmp/${n}.adaptrim.log \
81    fastq/${prefix}_R1.fastq.gz fastq/${prefix}_R2.fastq.gz \
82    tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.u.1.fastq tmp/${n}.adaptrim.2.fastq tmp/${n}.adaptrim.u.2.fastq \
83    ${TrimOptionsAdapter} \
84    >> trimmomatic.out
85   
86  if [ ! -s "tmp/${n}.adaptrim.1.fastq" ]; then
87    rm -f tmp/${n}.adaptrim.*.fastq
88    echo "${prefix}: No paired end reads surviving adapter trimming"
89    return
90  fi
91   
92  ## Quality trimming
93  ./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \
94    -threads ${NumThreads} \
95    -phred33 \
96    -trimlog tmp/${n}.qualtrim.log \
97    tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.2.fastq \
98    tmp/${n}.qualtrim.1.fastq tmp/${n}.qualtrim.u.1.fastq tmp/${n}.qualtrim.2.fastq tmp/${n}.qualtrim.u.2.fastq \
99    ${TrimOptionsQual} \
100    >> trimmomatic.out
101   
102  if [ ! -s "tmp/${n}.qualtrim.1.fastq" ]; then
103    rm -f tmp/${n}.qualtrim.*.fastq
104    echo "${prefix}: No paired end reads surviving quality trimming"
105    return
106  fi
107 
108  nAfterTrim=$((nAfterTrim + 1))
109}
110
111# Generate statistics from Trimmomatic log files
112# Parameters:
113#   $1: Input file name pattern (*.log files from Trimmomatic)
114#   $2: Output file name
115function trimlog_stats {
116  local input=$1
117  local output=$2
118 
119  echo -e "Trim\tCount" > ${output}
120  if ls $input > /dev/null 2>&1; then
121    awk '{print $ (NF)}' ${input} | sort -n | uniq -c | awk -v OFS="\t" '{print $2,$1}' >> ${output}
122  fi
123}
124
125# Read RG file and convert to Picard2 syntax.
126# Parameters:
127#   $1: Path to RG file
128# Returns:
129#   Converted RG value
130function read_and_fix_rg {
131  local rg=$1
132 
133  read -r DATA < ${rg}
134 
135  DATA=${DATA/RG=/-READ_GROUP_NAME }
136  DATA=${DATA/SM=/-SAMPLE_NAME }
137  DATA=${DATA/LB=/-LIBRARY_NAME }
138  DATA=${DATA/PU=/-PLATFORM_UNIT }
139  DATA=${DATA/DT=/-RUN_DATE }
140  DATA=${DATA/PL=/-PLATFORM  }
141  DATA=${DATA/CN=/-SEQUENCING_CENTER }
142 
143  echo ${DATA}
144}
145
146# Run Picard to convert FASTQ files to BAM file
147# Parameters:
148#   $1: Prefix to RG file ".rg" is added automatically
149#   $2: Prefix for generated files (counter)
150function fastq_to_bam {
151  local prefix=$1
152  local n=$2
153 
154  if [ ! -f tmp/${n}.qualtrim.1.fastq ]; then
155    return
156  fi
157 
158  local rg=$(read_and_fix_rg "fastq/${prefix}.rg")
159 
160  ./stdwrap.sh ./picard2 FastqToSam \
161    -SORT_ORDER queryname \
162    -QUALITY_FORMAT Standard \
163    ${rg} \
164    -FASTQ tmp/${n}.qualtrim.1.fastq \
165    -FASTQ2 tmp/${n}.qualtrim.2.fastq \
166    -OUTPUT tmp/${n}.u.bam \
167    >> fastqtosam.out
168}
169
170# Run FgBio to annotate the BAM file with UMI information
171# Parameters:
172#   $1: Prefix to UMI file "_UMI.fastq.gz" is added automatically
173#   $2: Prefix for generated files (counter)
174# Global parameters:
175#   $JAVA: Path to Java exectable
176#   $FgBioJAR: Path to FgBio JAR file
177function annotate_umi {
178  local prefix=$1
179  local n=$2
180 
181  if [ ! -f tmp/${n}.u.bam ]; then
182    return
183  fi
184 
185  ./stdwrap.sh ${JAVA} -jar ${FgBioJAR} AnnotateBamWithUmis \
186    --fail-fast true -t RX \
187    -i tmp/${n}.u.bam \
188    -f fastq/${prefix}_UMI.fastq.gz \
189    -o tmp/${n}.umi.bam \
190    >> annotatebam.out
191
192}
193
194# Align with novoalign
195# Parameters:
196#   $1: Prefix of sample (used in output files)
197#   $2: Prefix for generated files (counter)
198# Global parameters:
199#   $NovoAlign: Path to novoalign exectable
200#   $NovoAlignOptions: Options for novoalign
201#   $NovoIndex: Path to index file
202#   $AmpliconsBed: Path to amplicons BED file
203#   $NovoSort: Path to novosort executable
204#   $NumThreads: Number of threads to use
205#   $AlignedBams: For returning paths to aligned BAM files (used for input in the merge step)
206function novoalign {
207  local prefix=$1
208  local n=$2
209
210  if [ ! -f tmp/${n}.umi.bam ]; then
211    return
212  fi
213
214  ./stderrwrap.sh ${NovoAlign} \
215    -c ${NumThreads} \
216    ${NovoAlignOptions} \
217    -d ${NovoIndex} \
218    --tags MC,ZP \
219    -i PE ${min_insert}-${max_insert} \
220    --amplicons ${AmpliconsBed} \
221    1 out/${prefix}.novo_coverage.bed \
222    -f tmp/${n}.umi.bam \
223    > tmp/${n}.nosort.bam \
224    3> out/${prefix}.novo.log
225   
226  ./stderrwrap.sh ${NovoSort} \
227    -c ${NumThreads} -t . -i \
228    -o tmp/${n}.novo.bam \
229    tmp/${n}.nosort.bam \
230    3> tmp/${n}.sort.log
231
232  AlignedBams="${AlignedBams} -INPUT tmp/${n}.novo.bam"
233}
234
235# Merge BAM files
236# Global parameters:
237#   $AlignedBams: BAM files to merge (created by 'novoalign')
238function merge_bam {
239
240  ./stdwrap.sh ./picard2 MergeSamFiles \
241    -SORT_ORDER coordinate -ASSUME_SORTED true \
242    ${AlignedBams} \
243    -OUTPUT tmp/novo.bam \
244    > mergebam.out
245}
246
Note: See TracBrowser for help on using the repository browser.