source: trunk/yat/utility/Aligner.h @ 3213

Last change on this file since 3213 was 3213, checked in by Peter, 8 years ago

three new cigar functions

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.8 KB
Line 
1#ifndef _theplu_yat_utility_aligner_
2#define _theplu_yat_utility_aligner_
3
4// $Id: Aligner.h 3213 2014-05-05 07:51:59Z peter $
5
6/*
7  Copyright (C) 2012 Peter Johansson
8  Copyright (C) 2013 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2014 Peter Johansson
10
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25*/
26
27#include <boost/cstdint.hpp>
28
29#include <deque>
30#include <cstddef>
31#include <iosfwd>
32#include <vector>
33
34namespace theplu {
35namespace yat {
36namespace utility {
37
38  class Matrix;
39
40  /**
41     \brief Aligning two sequences
42
43     General class aligning two sequences described in a dot-matrix,
44     D, in which \f$ D(i,j) \f$ describes how similar element \c i in
45     the first sequence is with element \c j in the second sequence. A
46     path through this matrix corresponds to an alignment of the two
47     sequences, in which a diagonal step correspoinds to a match
48     between corresponding sequence elements, and a vertical or
49     horizontal step correspond correspond to inserting a gap into one
50     of the sequences.
51
52     \since New in yat 0.9
53   */
54  class Aligner
55  {
56  public:
57    /**
58       Enum used to describe alignment.
59
60       \see alignment
61     */
62    enum direction { none, right, down, diagonal };
63
64    /**
65       \brief Constructor
66
67       Same as Aligner(gap, open_gap, gap, open_gap)
68     */
69    explicit Aligner(double gap=0, double open_gap=0);
70
71    /**
72       \brief constructor
73
74       \param vertical_gap Penalty for extending a vertical gap. A
75       vertical gap means alignment consumes an element in second
76       element and not, in the first element, i.e., an element has
77       been inserted to the second element or equivalently an element
78       has been deleted in first sequence.
79
80       \param open_vertical_gap Penalty for open a vertical gap. Total
81       cost for a vertical gap is open_vertical_gap + N *
82       vertical_gap.
83
84       \param horizon_gap Penalty for extending a horizontal gap
85
86       \param open_horizon_gap Penalty for open a horizontal
87       gap. Total penalty for a horizontal gap is open_horizon_gap + N
88       * horizon_gap.
89     */
90    Aligner(double vertical_gap, double open_vertical_gap,
91            double horizon_gap, double open_horizon_gap);
92
93    /**
94       Initialize a score matrix with \f$ S_{k,0} = -
95       \textrm{open\_vertical\_gap} - k * \textrm{vertical\_gap} \f$
96       and \f$ S_{0,k} = - \textrm{open\_horizon\_gap} - k *
97       \textrm{horizon\_gap} \f$ and align using operator().
98
99       \return most lower right element of score matrix
100     */
101    double needleman_wunsch(const Matrix& d);
102
103    /**
104       Initialize a score matrix with all elements equal to zero,
105       align using operator().
106
107       \return max element in score matrix.
108
109       \see SmithWaterman
110     */
111    double smith_waterman(const Matrix& d);
112
113    /**
114       \brief calculate score matrix based on the \a dot matrix
115
116       The algorithm calculates a maximum \a score matrix as
117
118       \f$ S_{i,j} = \textrm{max} \{ S_{ij}; S_{i,j-1} - F(\textrm{gap}_V);
119       S_{i-1,j-1} + D_{i-1,j-1}; S_{i-1,j} - F(\textrm{gap}_H) \} \f$
120
121       where \f$ F(\textrm{gap}) \f$ is gap if there was already a
122       gap, and gap + open_gap otherwise.
123
124       To get the CIGAR to behave as expected, the each row in \a dot
125       corresponds to an element in reference sequence and each row in
126       \a dot corresponds to an element in query reference. If that is
127       transposed, insertions and deletions are swapped in CIGAR.
128     */
129    void operator()(const Matrix& dot, Matrix& score);
130
131    /**
132       If, for example, alignment(i,j) returns Aligner::diagonal,
133       implies that score(i,j) was maximized as score(i-1,j-1) +
134       dot(i-1, j-1), in other words, element i-1 in first sequence
135       was aligned with element j-1 in second sequence.
136    */
137    const direction& alignment(size_t i, size_t j) const;
138
139    /**
140       \brief Compact Idiosyncratic Gapped Alignment Report
141
142       A CIGAR is a representation of an alignment, an array of
143       operation/length where the operation and length is packed into
144       a uint32_t. So a CIGAR of e.g. '3M1D5M' means alignment is
145       three matches, one deletion, and five matches, which is
146       represented by an array of length 3.
147
148       <table>
149       <tr><td>\c \#define</td><td>value</td><td>char</td><td>type</td>
150       <td>consume ref</td><td>consume query</td></tr>
151       <tr><td>BAM_CMATCH</td>    <td>0</td><td>M</td><td>3</td>
152       <td>\c true</td><td>\c true</td></tr>
153       <tr><td>BAM_CINS</td>      <td>1</td><td>I</td><td>1</td>
154       <td>\c false</td><td>\c true</td></tr>
155       <tr><td>BAM_CDEL</td>      <td>2</td><td>D</td><td>2</td>
156       <td>\c true</td><td>\c false</td></tr>
157       <tr><td>BAM_CREF_SKIP</td> <td>3</td><td>N</td><td>2</td>
158       <td>\c true</td><td>\c false</td></tr>
159       <tr><td>BAM_CSOFT_CLIP</td><td>4</td><td>S</td><td>1</td>
160       <td>\c false</td><td>\c true</td></tr>
161       <tr><td>BAM_CHARD_CLIP</td><td>5</td><td>H</td><td>0</td>
162       <td>\c false</td><td>\c false</td></tr>
163       <tr><td>BAM_CPAD</td>      <td>6</td><td>P</td><td>0</td>
164       <td>\c false</td><td>\c false</td></tr>
165       <tr><td>BAM_CEQUAL</td>    <td>7</td><td>=</td><td>3</td>
166       <td>\c true</td><td>\c true</td></tr>
167       <tr><td>BAM_CDIFF</td>     <td>8</td><td>X</td><td>3</td>
168       <td>\c true</td><td>\c true</td></tr>
169       <tr><td>BAM_CBACK</td>     <td>9</td><td>B</td><td>0</td>
170       <td>\c false</td><td>\c false</td></tr>
171       </table>
172
173       \since new in yat 0.12
174
175       \see file yat/utility/Cigar.h for some useful macros
176     */
177    class Cigar
178    {
179    public:
180      /**
181         Erases all elements.
182       */
183      void clear(void);
184
185      /**
186         Total length of operations counting operations that consume
187         either (or both) query and reference e.g. match, insertion,
188         or deletion. HardClip or Padding operations are ignored.
189       */
190      uint32_t length(void) const;
191
192      /**
193         See table in class documntation.
194
195         \return operation part of element \a i
196       */
197      uint8_t op(size_t i) const;
198
199      /**
200         Translate the op(i) to a descriptive character of one of the
201         following "MIDNSHP=XB". See table in class documentation.
202
203         \return character describing operation i
204       */
205      char opchr(size_t i) const;
206
207      /**
208         \return length of element i
209       */
210      uint32_t oplen(size_t i) const;
211
212      /**
213         Erase last \a len operation.
214       */
215      void pop_back(uint32_t len=1);
216
217      /**
218         Erase first \a len operation.
219       */
220      void pop_front(uint32_t len=1);
221
222      /**
223         Add an operation \a op of length \a len at back of cigar. If
224         last operation equals \a op, length of that element is
225         changed and the size() is not modified.
226       */
227      void push_back(uint8_t op, uint32_t len=1);
228
229      /**
230         Add an operation \a op of length \a len at front of cigar.
231       */
232      void push_front(uint8_t op, uint32_t len=1);
233
234      /**
235         \brief Translate CIGAR to a query length.
236
237         Calculate total length of operations which consume a query
238         e.g. match or insetion. See table in class documentation.
239
240         This is the same function as bam_cigar2qlen in samtools C
241         API.
242       */
243      uint32_t query_length(void) const;
244
245      /**
246         \brief Translate CIGAR to a reference length.
247
248         Calculate total length of operations which consume a reference
249         e.g. match or deletion. See table in class documentation.
250
251         Similar to bam_calend but does not handle \c BAM_CBACK as it has
252         not been included in SAM specification.
253         http://sourceforge.net/p/samtools/mailman/message/29373646/
254       */
255      uint32_t reference_length(void) const;
256
257      /**
258         \return cigar element \a i
259       */
260      uint32_t operator[](size_t i) const;
261
262      /**
263         \return number of elements in cigar
264       */
265      size_t size(void) const;
266    private:
267      std::deque<uint32_t> cigar_;
268
269      // calculate length only counting operations whose type is set
270      // in mask, i.e., mask & type returns true
271      uint32_t length(uint8_t mask) const;
272      // using compiler generated copy
273      // Cigar(const Cigar& other);
274      // Cigar& operator=(const Cigar&);
275    };
276
277    /**
278       \since new in yat 0.12
279     */
280    const Cigar cigar(size_t i, size_t j) const;
281
282  private:
283    direction& directions(size_t i, size_t j);
284
285    size_t columns_;
286    double vertical_gap_;
287    double open_vertical_gap_;
288    double horizon_gap_;
289    double open_horizon_gap_;
290    std::vector<direction> alignment_;
291  };
292
293
294  /**
295     Output e.g. '5S44M5I98M' if the cigar has four elements. Each
296     element is output as the length followed by the character
297     descibing the operation (see opchr).
298
299     \relates Aligner::Cigar
300   */
301  std::ostream& operator<<(std::ostream& os, const Aligner::Cigar& cigar);
302
303}}} // of namespace utility, yat, and theplu
304
305#endif
Note: See TracBrowser for help on using the repository browser.