source: trunk/yat/utility/Aligner.h @ 3211

Last change on this file since 3211 was 3211, checked in by Peter, 9 years ago

improve docs of CIGAR

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 7.7 KB
Line 
1#ifndef _theplu_yat_utility_aligner_
2#define _theplu_yat_utility_aligner_
3
4// $Id: Aligner.h 3211 2014-05-05 06:38:55Z peter $
5
6/*
7  Copyright (C) 2012 Peter Johansson
8  Copyright (C) 2013 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2014 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include <boost/cstdint.hpp>
28
29#include <deque>
30#include <cstddef>
31#include <iosfwd>
32#include <vector>
33
34namespace theplu {
35namespace yat {
36namespace utility {
37
38  class Matrix;
39
40  /**
41     \brief Aligning two sequences
42
43     General class aligning two sequences described in a dot-matrix,
44     D, in which \f$ D(i,j) \f$ describes how similar element \c i in
45     the first sequence is with element \c j in the second sequence. A
46     path through this matrix corresponds to an alignment of the two
47     sequences, in which a diagonal step correspoinds to a match
48     between corresponding sequence elements, and a vertical or
49     horizontal step correspond correspond to inserting a gap into one
50     of the sequences.
51
52     \since New in yat 0.9
53   */
54  class Aligner
55  {
56  public:
57    /**
58       Enum used to describe alignment.
59
60       \see alignment
61     */
62    enum direction { none, right, down, diagonal };
63
64    /**
65       \brief Constructor
66
67       Same as Aligner(gap, open_gap, gap, open_gap)
68     */
69    explicit Aligner(double gap=0, double open_gap=0);
70
71    /**
72       \brief constructor
73
74       \param vertical_gap Penalty for extending a vertical gap. A
75       vertical gap means alignment consumes an element in second
76       element and not, in the first element, i.e., an element has
77       been inserted to the second element or equivalently an element
78       has been deleted in first sequence.
79
80       \param open_vertical_gap Penalty for open a vertical gap. Total
81       cost for a vertical gap is open_vertical_gap + N *
82       vertical_gap.
83
84       \param horizon_gap Penalty for extending a horizontal gap
85
86       \param open_horizon_gap Penalty for open a horizontal
87       gap. Total penalty for a horizontal gap is open_horizon_gap + N
88       * horizon_gap.
89     */
90    Aligner(double vertical_gap, double open_vertical_gap,
91            double horizon_gap, double open_horizon_gap);
92
93    /**
94       Initialize a score matrix with \f$ S_{k,0} = -
95       \textrm{open\_vertical\_gap} - k * \textrm{vertical\_gap} \f$
96       and \f$ S_{0,k} = - \textrm{open\_horizon\_gap} - k *
97       \textrm{horizon\_gap} \f$ and align using operator().
98
99       \return most lower right element of score matrix
100     */
101    double needleman_wunsch(const Matrix& d);
102
103    /**
104       Initialize a score matrix with all elements equal to zero,
105       align using operator().
106
107       \return max element in score matrix.
108
109       \see SmithWaterman
110     */
111    double smith_waterman(const Matrix& d);
112
113    /**
114       \brief calculate score matrix based on the \a dot matrix
115
116       The algorithm calculates a maximum \a score matrix as
117
118       \f$ S_{i,j} = \textrm{max} \{ S_{ij}; S_{i,j-1} - F(\textrm{gap}_V);
119       S_{i-1,j-1} + D_{i-1,j-1}; S_{i-1,j} - F(\textrm{gap}_H) \} \f$
120
121       where \f$ F(\textrm{gap}) \f$ is gap if there was already a
122       gap, and gap + open_gap otherwise.
123
124       To get the CIGAR to behave as expected, the each row in \a dot
125       corresponds to an element in reference sequence and each row in
126       \a dot corresponds to an element in query reference. If that is
127       transposed, insertions and deletions are swapped in CIGAR.
128     */
129    void operator()(const Matrix& dot, Matrix& score);
130
131    /**
132       If, for example, alignment(i,j) returns Aligner::diagonal,
133       implies that score(i,j) was maximized as score(i-1,j-1) +
134       dot(i-1, j-1), in other words, element i-1 in first sequence
135       was aligned with element j-1 in second sequence.
136    */
137    const direction& alignment(size_t i, size_t j) const;
138
139    /**
140       \brief Compact Idiosyncratic Gapped Alignment Report
141
142       A CIGAR is a representation of an alignment, an array of
143       operation/length where the operation and length is packed into
144       a uint32_t. So a CIGAR of e.g. '3M1D5M' means alignment is
145       three matches, one deletion, and five matches, which is
146       represented by an array of length 3.
147
148       <table>
149       <tr><td>\c \#define</td><td>value</td><td>char</td><td>type</td>
150       <td>consume ref</td><td>consume query</td></tr>
151       <tr><td>BAM_CMATCH</td>    <td>0</td><td>M</td><td>3</td>
152       <td>\c true</td><td>\c true</td></tr>
153       <tr><td>BAM_CINS</td>      <td>1</td><td>I</td><td>1</td>
154       <td>\c false</td><td>\c true</td></tr>
155       <tr><td>BAM_CDEL</td>      <td>2</td><td>D</td><td>2</td>
156       <td>\c true</td><td>\c false</td></tr>
157       <tr><td>BAM_CREF_SKIP</td> <td>3</td><td>N</td><td>2</td>
158       <td>\c true</td><td>\c false</td></tr>
159       <tr><td>BAM_CSOFT_CLIP</td><td>4</td><td>S</td><td>1</td>
160       <td>\c false</td><td>\c true</td></tr>
161       <tr><td>BAM_CHARD_CLIP</td><td>5</td><td>H</td><td>0</td>
162       <td>\c false</td><td>\c false</td></tr>
163       <tr><td>BAM_CPAD</td>      <td>6</td><td>P</td><td>0</td>
164       <td>\c false</td><td>\c false</td></tr>
165       <tr><td>BAM_CEQUAL</td>    <td>7</td><td>=</td><td>3</td>
166       <td>\c true</td><td>\c true</td></tr>
167       <tr><td>BAM_CDIFF</td>     <td>8</td><td>X</td><td>3</td>
168       <td>\c true</td><td>\c true</td></tr>
169       <tr><td>BAM_CBACK</td>     <td>9</td><td>B</td><td>0</td>
170       <td>\c false</td><td>\c false</td></tr>
171       </table>
172
173       \since new in yat 0.12
174     */
175    class Cigar
176    {
177    public:
178      /**
179         Erases all elements.
180       */
181      void clear(void);
182
183      /**
184         See table in class documntation.
185
186         \return operation part of element \a i
187       */
188      uint8_t op(size_t i) const;
189
190      /**
191         Translate the op(i) to a descriptive character of one of the
192         following "MIDNSHP=XB". See table in class documentation.
193
194         \return character describing operation i
195       */
196      char opchr(size_t i) const;
197
198      /**
199         \return length of element i
200       */
201      uint32_t oplen(size_t i) const;
202
203      /**
204         Erase last \a len operation.
205       */
206      void pop_back(uint32_t len=1);
207
208      /**
209         Erase first \a len operation.
210       */
211      void pop_front(uint32_t len=1);
212
213      /**
214         Add an operation \a op of length \a len at back of cigar. If
215         last operation equals \a op, length of that element is
216         changed and the size() is not modified.
217       */
218      void push_back(uint8_t op, uint32_t len=1);
219
220      /**
221         Add an operation \a op of length \a len at front of cigar.
222       */
223      void push_front(uint8_t op, uint32_t len=1);
224
225      /**
226         \return cigar element \a i
227       */
228      uint32_t operator[](size_t i) const;
229
230      /**
231         \return number of elements in cigar
232       */
233      size_t size(void) const;
234    private:
235      std::deque<uint32_t> cigar_;
236
237      // using compiler generated copy
238      // Cigar(const Cigar& other);
239      // Cigar& operator=(const Cigar&);
240    };
241
242    /**
243       \since new in yat 0.12
244     */
245    const Cigar cigar(size_t i, size_t j) const;
246
247  private:
248    direction& directions(size_t i, size_t j);
249
250    size_t columns_;
251    double vertical_gap_;
252    double open_vertical_gap_;
253    double horizon_gap_;
254    double open_horizon_gap_;
255    std::vector<direction> alignment_;
256  };
257
258
259  /**
260     Output e.g. '5S44M5I98M' if the cigar has four elements. Each
261     element is output as the length followed by the character
262     descibing the operation (see opchr).
263
264     \relates Aligner::Cigar
265   */
266  std::ostream& operator<<(std::ostream& os, const Aligner::Cigar& cigar);
267
268}}} // of namespace utility, yat, and theplu
269
270#endif
Note: See TracBrowser for help on using the repository browser.