source: trunk/yat/utility/Aligner.h @ 3336

Last change on this file since 3336 was 3336, checked in by Peter, 7 years ago

closes #816

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.3 KB
Line 
1#ifndef _theplu_yat_utility_aligner_
2#define _theplu_yat_utility_aligner_
3
4// $Id: Aligner.h 3336 2014-10-24 23:27:45Z peter $
5
6/*
7  Copyright (C) 2012 Peter Johansson
8  Copyright (C) 2013 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2014 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include "CigarIterator.h"
28
29#include <boost/cstdint.hpp>
30
31#include <deque>
32#include <cstddef>
33#include <iosfwd>
34#include <vector>
35
36namespace theplu {
37namespace yat {
38namespace utility {
39
40  class Matrix;
41
42  /**
43     \brief Aligning two sequences
44
45     General class aligning two sequences described in a dot-matrix,
46     D, in which \f$ D(i,j) \f$ describes how similar element \c i in
47     the first sequence is with element \c j in the second sequence. A
48     path through this matrix corresponds to an alignment of the two
49     sequences, in which a diagonal step correspoinds to a match
50     between corresponding sequence elements, and a vertical or
51     horizontal step correspond correspond to inserting a gap into one
52     of the sequences.
53
54     \since New in yat 0.9
55   */
56  class Aligner
57  {
58  public:
59    /**
60       Enum used to describe alignment.
61
62       \see alignment
63     */
64    enum direction { none, right, down, diagonal };
65
66    /**
67       \brief Constructor
68
69       Same as Aligner(gap, open_gap, gap, open_gap)
70     */
71    explicit Aligner(double gap=0, double open_gap=0);
72
73    /**
74       \brief constructor
75
76       \param vertical_gap Penalty for extending a vertical gap. A
77       vertical gap means alignment consumes an element in second
78       element and not, in the first element, i.e., an element has
79       been inserted to the second element or equivalently an element
80       has been deleted in first sequence.
81
82       \param open_vertical_gap Penalty for open a vertical gap. Total
83       cost for a vertical gap is open_vertical_gap + N *
84       vertical_gap.
85
86       \param horizon_gap Penalty for extending a horizontal gap
87
88       \param open_horizon_gap Penalty for open a horizontal
89       gap. Total penalty for a horizontal gap is open_horizon_gap + N
90       * horizon_gap.
91     */
92    Aligner(double vertical_gap, double open_vertical_gap,
93            double horizon_gap, double open_horizon_gap);
94
95    /**
96       Initialize a score matrix with \f$ S_{k,0} = -
97       \textrm{open\_vertical\_gap} - k * \textrm{vertical\_gap} \f$
98       and \f$ S_{0,k} = - \textrm{open\_horizon\_gap} - k *
99       \textrm{horizon\_gap} \f$ and align using operator().
100
101       \return most lower right element of score matrix
102     */
103    double needleman_wunsch(const Matrix& d);
104
105    /**
106       Initialize a score matrix with all elements equal to zero,
107       align using operator().
108
109       \return max element in score matrix.
110
111       \see SmithWaterman
112     */
113    double smith_waterman(const Matrix& d);
114
115    /**
116       \brief calculate score matrix based on the \a dot matrix
117
118       The algorithm calculates a maximum \a score matrix as
119
120       \f$ S_{i,j} = \textrm{max} \{ S_{ij}; S_{i,j-1} - F(\textrm{gap}_V);
121       S_{i-1,j-1} + D_{i-1,j-1}; S_{i-1,j} - F(\textrm{gap}_H) \} \f$
122
123       where \f$ F(\textrm{gap}) \f$ is gap if there was already a
124       gap, and gap + open_gap otherwise.
125
126       To get the CIGAR to behave as expected, the each row in \a dot
127       corresponds to an element in reference sequence and each row in
128       \a dot corresponds to an element in query reference. If that is
129       transposed, insertions and deletions are swapped in CIGAR.
130     */
131    void operator()(const Matrix& dot, Matrix& score);
132
133    /**
134       If, for example, alignment(i,j) returns Aligner::diagonal,
135       implies that score(i,j) was maximized as score(i-1,j-1) +
136       dot(i-1, j-1), in other words, element i-1 in first sequence
137       was aligned with element j-1 in second sequence.
138    */
139    const direction& alignment(size_t i, size_t j) const;
140
141    /**
142       \brief Compact Idiosyncratic Gapped Alignment Report
143
144       A CIGAR is a representation of an alignment, an array of
145       operation/length where the operation and length is packed into
146       a uint32_t. So a CIGAR of e.g. '3M1D5M' means alignment is
147       three matches, one deletion, and five matches, which is
148       represented by an array of length 3.
149
150       <table>
151       <tr><td>\c \#define</td><td>value</td><td>char</td><td>type</td>
152       <td>consume ref</td><td>consume query</td></tr>
153       <tr><td>BAM_CMATCH</td>    <td>0</td><td>M</td><td>3</td>
154       <td>\c true</td><td>\c true</td></tr>
155       <tr><td>BAM_CINS</td>      <td>1</td><td>I</td><td>1</td>
156       <td>\c false</td><td>\c true</td></tr>
157       <tr><td>BAM_CDEL</td>      <td>2</td><td>D</td><td>2</td>
158       <td>\c true</td><td>\c false</td></tr>
159       <tr><td>BAM_CREF_SKIP</td> <td>3</td><td>N</td><td>2</td>
160       <td>\c true</td><td>\c false</td></tr>
161       <tr><td>BAM_CSOFT_CLIP</td><td>4</td><td>S</td><td>1</td>
162       <td>\c false</td><td>\c true</td></tr>
163       <tr><td>BAM_CHARD_CLIP</td><td>5</td><td>H</td><td>0</td>
164       <td>\c false</td><td>\c false</td></tr>
165       <tr><td>BAM_CPAD</td>      <td>6</td><td>P</td><td>0</td>
166       <td>\c false</td><td>\c false</td></tr>
167       <tr><td>BAM_CEQUAL</td>    <td>7</td><td>=</td><td>3</td>
168       <td>\c true</td><td>\c true</td></tr>
169       <tr><td>BAM_CDIFF</td>     <td>8</td><td>X</td><td>3</td>
170       <td>\c true</td><td>\c true</td></tr>
171       <tr><td>BAM_CBACK</td>     <td>9</td><td>B</td><td>0</td>
172       <td>\c false</td><td>\c false</td></tr>
173       </table>
174
175       \since new in yat 0.12
176
177       \see file yat/utility/Cigar.h for some useful macros
178     */
179    class Cigar
180    {
181    public:
182      /**
183         \since New in yat 0.13
184       */
185      typedef
186      CigarIterator<std::deque<uint32_t>::const_iterator > const_iterator;
187
188      /**
189         \return iterator pointing to first element
190
191         \since New in yat 0.13
192       */
193      const_iterator begin(void) const;
194
195      /**
196         Erases all elements.
197       */
198      void clear(void);
199
200      /**
201         \return iterator pointing to one passed last element
202
203         \since New in yat 0.13
204       */
205      const_iterator end(void) const;
206
207      /**
208         Total length of operations counting operations that consume
209         either (or both) query and reference e.g. match, insertion,
210         or deletion. HardClip or Padding operations are ignored.
211       */
212      uint32_t length(void) const;
213
214      /**
215         See table in class documntation.
216
217         \return operation part of element \a i
218       */
219      uint8_t op(size_t i) const;
220
221      /**
222         Translate the op(i) to a descriptive character of one of the
223         following "MIDNSHP=XB". See table in class documentation.
224
225         \return character describing operation i
226       */
227      char opchr(size_t i) const;
228
229      /**
230         \return length of element i
231       */
232      uint32_t oplen(size_t i) const;
233
234      /**
235         Erase last \a len operation.
236       */
237      void pop_back(uint32_t len=1);
238
239      /**
240         Erase first \a len operation.
241       */
242      void pop_front(uint32_t len=1);
243
244      /**
245         Add an operation \a op of length \a len at back of cigar. If
246         last operation equals \a op, length of that element is
247         changed and the size() is not modified.
248       */
249      void push_back(uint8_t op, uint32_t len=1);
250
251      /**
252         Add an operation \a op of length \a len at front of cigar.
253       */
254      void push_front(uint8_t op, uint32_t len=1);
255
256      /**
257         \brief Translate CIGAR to a query length.
258
259         Calculate total length of operations which consume a query
260         e.g. match or insetion. See table in class documentation.
261
262         This is the same function as bam_cigar2qlen in samtools C
263         API.
264       */
265      uint32_t query_length(void) const;
266
267      /**
268         \brief Translate CIGAR to a reference length.
269
270         Calculate total length of operations which consume a reference
271         e.g. match or deletion. See table in class documentation.
272
273         Similar to bam_calend but does not handle \c BAM_CBACK as it has
274         not been included in SAM specification.
275         http://sourceforge.net/p/samtools/mailman/message/29373646/
276       */
277      uint32_t reference_length(void) const;
278
279      /**
280         \return cigar element \a i
281       */
282      uint32_t operator[](size_t i) const;
283
284      /**
285         \return number of elements in cigar
286       */
287      size_t size(void) const;
288    private:
289      std::deque<uint32_t> cigar_;
290
291      // calculate length only counting operations whose type is set
292      // in mask, i.e., mask & type returns true
293      uint32_t length(uint8_t mask) const;
294      // using compiler generated copy
295      // Cigar(const Cigar& other);
296      // Cigar& operator=(const Cigar&);
297    };
298
299    /**
300       \since new in yat 0.12
301     */
302    const Cigar cigar(size_t i, size_t j) const;
303
304  private:
305    direction& directions(size_t i, size_t j);
306
307    size_t columns_;
308    double vertical_gap_;
309    double open_vertical_gap_;
310    double horizon_gap_;
311    double open_horizon_gap_;
312    std::vector<direction> alignment_;
313  };
314
315
316  /**
317     Output e.g. '5S44M5I98M' if the cigar has four elements. Each
318     element is output as the length followed by the character
319     descibing the operation (see opchr).
320
321     \relates Aligner::Cigar
322   */
323  std::ostream& operator<<(std::ostream& os, const Aligner::Cigar& cigar);
324
325}}} // of namespace utility, yat, and theplu
326
327#endif
Note: See TracBrowser for help on using the repository browser.