source: trunk/yat/omic/Pileup.h @ 3802

Last change on this file since 3802 was 3802, checked in by Peter, 5 months ago

closes #918. implement function Pileup::size

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 10.0 KB
Line 
1#ifndef theplu_yat_omic_pileup
2#define theplu_yat_omic_pileup
3
4// $Id: Pileup.h 3802 2019-05-12 04:31:06Z peter $
5
6/*
7  Copyright (C) 2014, 2016, 2017, 2018 Peter Johansson
8
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23*/
24
25#include "BamRead.h"
26
27#include "yat/utility/CigarIterator.h"
28#include "yat/utility/yat_assert.h"
29
30#include <boost/iterator/iterator_traits.hpp>
31#include <boost/iterator/iterator_concepts.hpp>
32#include <boost/concept/assert.hpp>
33#include <boost/shared_ptr.hpp>
34
35#include <algorithm>
36#include <list>
37
38namespace theplu {
39namespace yat {
40namespace omic {
41
42  /**
43     Pileup takes a range of BamReads and pile them up on the current
44     position. User can investigate the pile of reads at current
45     (locus) position via begin() and end(), which provide access to
46     reads at current position. The position can be incremented and in
47     that way the user can investigate each position of the genome.
48
49     \since new in yat 0.13
50
51     Type Requirments:
52     - \c Iterator is a \readable_iterator
53     - \c Iterator is a \single_pass_iterator
54     - \c value_type is convertible to BamRead
55   */
56  template<typename Iterator>
57  class Pileup
58  {
59  public:
60    /**
61       Essentially a BamRead but with additional information regarding
62       base we currently pointing to
63     */
64    class Entry
65    {
66    public:
67      /**
68         \brief default constructor
69       */
70      Entry(void) {}
71
72      /**
73         Create an Entry with read \a b. Entry is initialized to point
74         to first aligned base (b.pos()).
75       */
76      Entry(const BamRead& b);
77
78      /**
79         \return const reference to BamRead
80       */
81      const BamRead& bam(void) const
82      {
83        YAT_ASSERT(bam_.get());
84        return *bam_;
85      }
86
87      /**
88         \return cigar operation at current position
89       */
90      uint8_t cigar_op(void) const { return *cigar_; }
91
92
93      /**
94         \return length of current cigar element
95       */
96      uint32_t cigar_oplen(void) const
97      {
98        YAT_ASSERT(bam_.get());
99        YAT_ASSERT(cigar_.base() >= bam_->cigar());
100        return bam_cigar_oplen(*cigar_.base());
101      }
102
103
104      /**
105         \return base quality of current position
106       */
107      unsigned short qual(void) const
108      {
109        YAT_ASSERT(bam_.get());
110        YAT_ASSERT(qpos_ < bam_->sequence_length());
111        return bam_->qual(qpos_);
112      }
113
114      /**
115         Returned base is encoded in 4 bits and can be converted to
116         char using nt16_str(uint8_t).
117
118         \return base of current position
119       */
120      uint8_t sequence(void) const;
121
122      /**
123         Increment CIGAR. If current CIGAR element is consuming query
124         (not deletion), increment to next base on query.
125       */
126      void increment(void);
127
128      /**
129         \return true if this position was deleted in this query
130       */
131      bool is_del(void) const
132      {
133        return cigar_type()==2;
134      }
135
136      /**
137         \return index of base at this position
138       */
139      size_t qpos(void) const { return qpos_; }
140    private:
141      boost::shared_ptr<BamRead> bam_;
142      // index of base pointed to
143      size_t qpos_;
144      yat::utility::CigarIterator<const uint32_t*> cigar_;
145      size_t skip_;
146      // return 0, 1, 2, or 3 see bam_cigar_type in yat/utility/Cigar.h
147      uint8_t cigar_type(void) const { return bam_cigar_type(cigar_op()); }
148      friend class Pileup;
149    };
150
151  public:
152    /**
153       Const iterator that can be used to iterate over reads that
154       overlap with current position (as defined by tid() and pos()).
155     */
156    typedef typename std::list<Entry>::const_iterator const_iterator;
157
158    /**
159       Same as const_iterator but iterates reversed.
160
161       \see rbegin(void) rend(void)
162
163       \since new in yat 0.17
164     */
165    typedef typename std::list<Entry>::const_reverse_iterator
166    const_reverse_iterator;
167
168    /**
169       Create an empty Pileup
170     */
171    Pileup(void) {}
172
173    /**
174       Create a Pileup that will use reads in range [first, last). The
175       range must be sorted as defined by BamLessPos.
176
177       Iterator is a \readable_iterator.
178     */
179    Pileup(Iterator first, Iterator last);
180
181    /**
182       \return an iterator that points to the first Entry in Pileup.
183
184       Iteration is done in same order as range provided in
185       constructor. Reads that do not overlap with current position
186       (tid:pos) are ignored.
187     */
188    const_iterator begin(void) const;
189
190    /**
191       \return an iterator that points one past the last element in
192       the Pileup.
193     */
194    const_iterator end(void) const;
195
196    /**
197       \return true if there are more positions to increment into
198     */
199    bool good(void) const { return !(bam_ == bam_end_ && data_.empty()); }
200
201    /**
202       \brief step to next position.
203
204       If any Entry has a insertion at current position, increment
205       each Entry that has an insertion. Otherwise increment to next
206       position and update each Entry accordingly.
207
208       If the position is not covered by any read, the Pileup
209       fastforward to next covered locus.
210     */
211    void increment(void);
212
213    /**
214       \return number of reads in pileup. Reads with a deletion of the
215       current position is also counted.
216
217       \since new in yat 0.17
218     */
219    size_t size(void) const;
220
221    /**
222       \return an iterator that points to the last Entry in Pileup.
223
224       Iteration is done in reversed order; othwerwise behaves as begin(void)
225
226       \since new in yat 0.17
227     */
228    const_reverse_iterator rbegin(void) const;
229
230    /**
231       \return const reverse iterator to reverse end
232
233       \see end(void)
234       \see rbegin(void)
235
236       \since new in yat 0.17
237     */
238    const_reverse_iterator rend(void) const;
239
240    /**
241       \return template id as defined in bam file header
242     */
243    int32_t tid(void) const { return tid_; }
244
245    /**
246       \return position
247     */
248    int32_t pos(void) const { return pos_; }
249
250    /**
251       \return true if position does not exist in reference, i.e.,
252       inserted region.
253     */
254    bool skip_ref(void) const;
255  private:
256    typedef typename std::list<Entry>::iterator iterator;
257
258    Iterator bam_;
259    Iterator bam_end_;
260    int32_t pos_;
261    int32_t tid_;
262    std::list<Entry> data_;
263    uint32_t skip_ref_;
264  };
265
266
267  /**
268     \relates Pileup
269
270     \since new in yat 0.13
271   */
272  template<typename Iterator>
273  Pileup<Iterator> make_pileup(Iterator first, Iterator last)
274  {
275    return Pileup<Iterator>(first, last);
276  }
277
278  // implementations
279
280  template<typename Iterator>
281  Pileup<Iterator>::Pileup(Iterator first, Iterator last)
282    : bam_(first), bam_end_(last), pos_(0), tid_(0), skip_ref_(0)
283  {
284    BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<Iterator>));
285    BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<Iterator>));
286    using boost::Convertible;
287    typedef typename boost::iterator_value<Iterator>::type value_type;
288    BOOST_CONCEPT_ASSERT((Convertible<value_type, BamRead>));
289    increment();
290  }
291
292
293  template<typename Iterator>
294  typename Pileup<Iterator>::const_iterator Pileup<Iterator>::begin(void) const
295  {
296    return data_.begin();
297  }
298
299
300  template<typename Iterator>
301  typename Pileup<Iterator>::const_reverse_iterator
302  Pileup<Iterator>::rbegin(void) const
303  {
304    return data_.rbegin();
305  }
306
307
308  template<typename Iterator>
309  typename Pileup<Iterator>::const_iterator Pileup<Iterator>::end(void) const
310  {
311    return data_.end();
312  }
313
314
315  template<typename Iterator>
316  typename Pileup<Iterator>::const_reverse_iterator
317  Pileup<Iterator>::rend(void) const
318  {
319    return data_.rend();
320  }
321
322
323  template<typename Iterator>
324  void Pileup<Iterator>::increment(void)
325  {
326    if (!data_.empty()) {
327      // in insertion
328      if (skip_ref()) {
329        for (iterator it = data_.begin(); it!=data_.end(); ++it) {
330          if (it->cigar_type() == 1) {
331            it->increment();
332          }
333          --it->skip_;
334        }
335        --skip_ref_;
336      }
337      // walk one base forward on reference
338      else {
339        ++pos_;
340        typename std::list<Entry>::iterator read = data_.begin();
341        while (read != data_.end()) {
342          read->increment();
343          // remove entry if read is entirely left of pos
344          if (read->bam().end() <= pos_) { // FIXME speedup possible
345            data_.erase(read++);
346            continue;
347          }
348          // check if we're stepping into insertion
349          if (read->cigar_op() == BAM_CINS) {
350            skip_ref_ = std::max(skip_ref_, read->cigar_oplen());
351          }
352          ++read;
353        }
354
355        // stepped into an insertion
356        if (skip_ref_)
357          for (iterator iter=data_.begin(); iter!=data_.end(); ++iter)
358            iter->skip_ = skip_ref_;
359      }
360    }
361
362
363    // fast forward to next covered locus
364    if (data_.empty()) {
365      if (bam_==bam_end_)
366        return;
367      tid_ = bam_->core().tid;
368      pos_ = bam_->core().pos;
369    }
370
371    // read data
372    while (bam_!=bam_end_ && bam_->core().tid==tid_ && bam_->pos()<=pos_) {
373      // skip unmapped reads
374      if (!(bam_->core().flag & BAM_FUNMAP)) {
375        data_.push_back(Entry(*bam_));
376      }
377      ++bam_;
378    }
379  }
380
381
382  template<typename Iterator>
383  bool Pileup<Iterator>::skip_ref(void) const
384  {
385    return skip_ref_;
386  }
387
388
389  template<typename Iterator>
390  size_t Pileup<Iterator>::size(void) const
391  {
392    return data_.size();
393  }
394
395
396  template<typename Iterator>
397  Pileup<Iterator>::Entry::Entry(const BamRead& b)
398    : bam_(new BamRead(b)), qpos_(0), cigar_(bam_->cigar()), skip_(0)
399  {
400    YAT_ASSERT(cigar_.base() == bam_->cigar());
401    if (bam_->core().n_cigar==0)
402      return;
403    // iterate cigar to first match
404    while (qpos_ < bam_->sequence_length() && cigar_type() != 3) {
405      // type is either 0 (hardclipped) or 1 (softclipped)
406      YAT_ASSERT(cigar_type()!=2);
407      if (cigar_type() == 1)
408        ++qpos_;
409      ++cigar_;
410    }
411  }
412
413
414  template<typename Iterator>
415  void Pileup<Iterator>::Entry::increment(void)
416  {
417    YAT_ASSERT(cigar_.base() >= bam_->cigar());
418    if (cigar_type() & 1)
419      ++qpos_;
420    ++cigar_;
421    YAT_ASSERT(cigar_.base() >= bam_->cigar());
422  }
423
424
425  template<typename Iterator>
426  uint8_t Pileup<Iterator>::Entry::sequence(void) const
427  {
428    if (skip_==0) {
429      if (cigar_type() & 1)
430        return bam_->sequence(qpos_);
431      return 0;
432    }
433    if (cigar_op()!=BAM_CINS)
434      return 0;
435    return bam_->sequence(qpos_);
436  }
437
438}}}
439#endif
Note: See TracBrowser for help on using the repository browser.