source: trunk/yat/omic/Pileup.h @ 3801

Last change on this file since 3801 was 3801, checked in by Peter, 2 months ago

implement reverse iterator in Pileup. closes #919

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.7 KB
Line 
1#ifndef theplu_yat_omic_pileup
2#define theplu_yat_omic_pileup
3
4// $Id: Pileup.h 3801 2019-05-04 07:47:28Z peter $
5
6/*
7  Copyright (C) 2014, 2016, 2017, 2018 Peter Johansson
8
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23*/
24
25#include "BamRead.h"
26
27#include "yat/utility/CigarIterator.h"
28#include "yat/utility/yat_assert.h"
29
30#include <boost/iterator/iterator_traits.hpp>
31#include <boost/iterator/iterator_concepts.hpp>
32#include <boost/concept/assert.hpp>
33#include <boost/shared_ptr.hpp>
34
35#include <algorithm>
36#include <list>
37
38namespace theplu {
39namespace yat {
40namespace omic {
41
42  /**
43     Pileup takes a range of BamReads and pile them up on the current
44     position. User can investigate the pile of reads at current
45     (locus) position via begin() and end(), which provide access to
46     reads at current position. The position can be incremented and in
47     that way the user can investigate each position of the genome.
48
49     \since new in yat 0.13
50
51     Type Requirments:
52     - \c Iterator is a \readable_iterator
53     - \c Iterator is a \single_pass_iterator
54     - \c value_type is convertible to BamRead
55   */
56  template<typename Iterator>
57  class Pileup
58  {
59  public:
60    /**
61       Essentially a BamRead but with additional information regarding
62       base we currently pointing to
63     */
64    class Entry
65    {
66    public:
67      /**
68         \brief default constructor
69       */
70      Entry(void) {}
71
72      /**
73         Create an Entry with read \a b. Entry is initialized to point
74         to first aligned base (b.pos()).
75       */
76      Entry(const BamRead& b);
77
78      /**
79         \return const reference to BamRead
80       */
81      const BamRead& bam(void) const
82      {
83        YAT_ASSERT(bam_.get());
84        return *bam_;
85      }
86
87      /**
88         \return cigar operation at current position
89       */
90      uint8_t cigar_op(void) const { return *cigar_; }
91
92
93      /**
94         \return length of current cigar element
95       */
96      uint32_t cigar_oplen(void) const
97      {
98        YAT_ASSERT(bam_.get());
99        YAT_ASSERT(cigar_.base() >= bam_->cigar());
100        return bam_cigar_oplen(*cigar_.base());
101      }
102
103
104      /**
105         \return base quality of current position
106       */
107      unsigned short qual(void) const
108      {
109        YAT_ASSERT(bam_.get());
110        YAT_ASSERT(qpos_ < bam_->sequence_length());
111        return bam_->qual(qpos_);
112      }
113
114      /**
115         Returned base is encoded in 4 bits and can be converted to
116         char using nt16_str(uint8_t).
117
118         \return base of current position
119       */
120      uint8_t sequence(void) const;
121
122      /**
123         Increment CIGAR. If current CIGAR element is consuming query
124         (not deletion), increment to next base on query.
125       */
126      void increment(void);
127
128      /**
129         \return true if this position was deleted in this query
130       */
131      bool is_del(void) const
132      {
133        return cigar_type()==2;
134      }
135
136      /**
137         \return index of base at this position
138       */
139      size_t qpos(void) const { return qpos_; }
140    private:
141      boost::shared_ptr<BamRead> bam_;
142      // index of base pointed to
143      size_t qpos_;
144      yat::utility::CigarIterator<const uint32_t*> cigar_;
145      size_t skip_;
146      // return 0, 1, 2, or 3 see bam_cigar_type in yat/utility/Cigar.h
147      uint8_t cigar_type(void) const { return bam_cigar_type(cigar_op()); }
148      friend class Pileup;
149    };
150
151  public:
152    /**
153       Const iterator that can be used to iterate over reads that
154       overlap with current position (as defined by tid() and pos()).
155     */
156    typedef typename std::list<Entry>::const_iterator const_iterator;
157
158    /**
159       Same as const_iterator but iterates reversed.
160
161       \see rbegin(void) rend(void)
162
163       \since new in yat 0.17
164     */
165    typedef typename std::list<Entry>::const_reverse_iterator
166    const_reverse_iterator;
167
168    /**
169       Create an empty Pileup
170     */
171    Pileup(void) {}
172
173    /**
174       Create a Pileup that will use reads in range [first, last). The
175       range must be sorted as defined by BamLessPos.
176
177       Iterator is a \readable_iterator.
178     */
179    Pileup(Iterator first, Iterator last);
180
181    /**
182       \return an iterator that points to the first Entry in Pileup.
183
184       Iteration is done in same order as range provided in
185       constructor. Reads that do not overlap with current position
186       (tid:pos) are ignored.
187     */
188    const_iterator begin(void) const;
189
190    /**
191       \return an iterator that points one past the last element in
192       the Pileup.
193     */
194    const_iterator end(void) const;
195
196    /**
197       \return true if there are more positions to increment into
198     */
199    bool good(void) const { return !(bam_ == bam_end_ && data_.empty()); }
200
201    /**
202       \brief step to next position.
203
204       If any Entry has a insertion at current position, increment
205       each Entry that has an insertion. Otherwise increment to next
206       position and update each Entry accordingly.
207
208       If the position is not covered by any read, the Pileup
209       fastforward to next covered locus.
210     */
211    void increment(void);
212
213    /**
214       \return an iterator that points to the last Entry in Pileup.
215
216       Iteration is done in reversed order; othwerwise behaves as begin(void)
217
218       \since new in yat 0.17
219     */
220    const_reverse_iterator rbegin(void) const;
221
222    /**
223       \return const reverse iterator to reverse end
224
225       \see end(void)
226       \see rbegin(void)
227
228       \since new in yat 0.17
229     */
230    const_reverse_iterator rend(void) const;
231
232    /**
233       \return template id as defined in bam file header
234     */
235    int32_t tid(void) const { return tid_; }
236
237    /**
238       \return position
239     */
240    int32_t pos(void) const { return pos_; }
241
242    /**
243       \return true if position does not exist in reference, i.e.,
244       inserted region.
245     */
246    bool skip_ref(void) const;
247  private:
248    typedef typename std::list<Entry>::iterator iterator;
249
250    Iterator bam_;
251    Iterator bam_end_;
252    int32_t pos_;
253    int32_t tid_;
254    std::list<Entry> data_;
255    uint32_t skip_ref_;
256  };
257
258
259  /**
260     \relates Pileup
261
262     \since new in yat 0.13
263   */
264  template<typename Iterator>
265  Pileup<Iterator> make_pileup(Iterator first, Iterator last)
266  {
267    return Pileup<Iterator>(first, last);
268  }
269
270  // implementations
271
272  template<typename Iterator>
273  Pileup<Iterator>::Pileup(Iterator first, Iterator last)
274    : bam_(first), bam_end_(last), pos_(0), tid_(0), skip_ref_(0)
275  {
276    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<Iterator>));
277    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<Iterator>));
278    using boost::Convertible;
279    typedef typename boost::iterator_value<Iterator>::type value_type;
280    BOOST_CONCEPT_ASSERT((Convertible<value_type, BamRead>));
281    increment();
282  }
283
284
285  template<typename Iterator>
286  typename Pileup<Iterator>::const_iterator Pileup<Iterator>::begin(void) const
287  {
288    return data_.begin();
289  }
290
291
292  template<typename Iterator>
293  typename Pileup<Iterator>::const_reverse_iterator
294  Pileup<Iterator>::rbegin(void) const
295  {
296    return data_.rbegin();
297  }
298
299
300  template<typename Iterator>
301  typename Pileup<Iterator>::const_iterator Pileup<Iterator>::end(void) const
302  {
303    return data_.end();
304  }
305
306
307  template<typename Iterator>
308  typename Pileup<Iterator>::const_reverse_iterator
309  Pileup<Iterator>::rend(void) const
310  {
311    return data_.rend();
312  }
313
314
315  template<typename Iterator>
316  void Pileup<Iterator>::increment(void)
317  {
318    if (!data_.empty()) {
319      // in insertion
320      if (skip_ref()) {
321        for (iterator it = data_.begin(); it!=data_.end(); ++it) {
322          if (it->cigar_type() == 1) {
323            it->increment();
324          }
325          --it->skip_;
326        }
327        --skip_ref_;
328      }
329      // walk one base forward on reference
330      else {
331        ++pos_;
332        typename std::list<Entry>::iterator read = data_.begin();
333        while (read != data_.end()) {
334          read->increment();
335          // remove entry if read is entirely left of pos
336          if (read->bam().end() <= pos_) { // FIXME speedup possible
337            data_.erase(read++);
338            continue;
339          }
340          // check if we're stepping into insertion
341          if (read->cigar_op() == BAM_CINS) {
342            skip_ref_ = std::max(skip_ref_, read->cigar_oplen());
343          }
344          ++read;
345        }
346
347        // stepped into an insertion
348        if (skip_ref_)
349          for (iterator iter=data_.begin(); iter!=data_.end(); ++iter)
350            iter->skip_ = skip_ref_;
351      }
352    }
353
354
355    // fast forward to next covered locus
356    if (data_.empty()) {
357      if (bam_==bam_end_)
358        return;
359      tid_ = bam_->core().tid;
360      pos_ = bam_->core().pos;
361    }
362
363    // read data
364    while (bam_!=bam_end_ && bam_->core().tid==tid_ && bam_->pos()<=pos_) {
365      // skip unmapped reads
366      if (!(bam_->core().flag & BAM_FUNMAP)) {
367        data_.push_back(Entry(*bam_));
368      }
369      ++bam_;
370    }
371  }
372
373
374  template<typename Iterator>
375  bool Pileup<Iterator>::skip_ref(void) const
376  {
377    return skip_ref_;
378  }
379
380
381  template<typename Iterator>
382  Pileup<Iterator>::Entry::Entry(const BamRead& b)
383    : bam_(new BamRead(b)), qpos_(0), cigar_(bam_->cigar()), skip_(0)
384  {
385    YAT_ASSERT(cigar_.base() == bam_->cigar());
386    if (bam_->core().n_cigar==0)
387      return;
388    // iterate cigar to first match
389    while (qpos_ < bam_->sequence_length() && cigar_type() != 3) {
390      // type is either 0 (hardclipped) or 1 (softclipped)
391      YAT_ASSERT(cigar_type()!=2);
392      if (cigar_type() == 1)
393        ++qpos_;
394      ++cigar_;
395    }
396  }
397
398
399  template<typename Iterator>
400  void Pileup<Iterator>::Entry::increment(void)
401  {
402    YAT_ASSERT(cigar_.base() >= bam_->cigar());
403    if (cigar_type() & 1)
404      ++qpos_;
405    ++cigar_;
406    YAT_ASSERT(cigar_.base() >= bam_->cigar());
407  }
408
409
410  template<typename Iterator>
411  uint8_t Pileup<Iterator>::Entry::sequence(void) const
412  {
413    if (skip_==0) {
414      if (cigar_type() & 1)
415        return bam_->sequence(qpos_);
416      return 0;
417    }
418    if (cigar_op()!=BAM_CINS)
419      return 0;
420    return bam_->sequence(qpos_);
421  }
422
423}}}
424#endif
Note: See TracBrowser for help on using the repository browser.