source: trunk/yat/omic/BamRead.cc @ 3032

Last change on this file since 3032 was 3032, checked in by Peter, 9 years ago

refs #746. Simplify cigar function using logic similar to sequence(2). Avoid bug as pointers to data get invalidated when calling reserve. Forgot to set data_len and avoid compiler warning using char as index to array

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 7.5 KB
Line 
1// $Id: BamRead.cc 3032 2013-04-27 15:06:52Z peter $
2
3/*
4  Copyright (C) 2012, 2013 Peter Johansson
5
6  This file is part of the yat library, http://dev.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 3 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20*/
21
22#include <config.h>
23
24#include "BamRead.h"
25#include "config_bam.h"
26
27#include YAT_BAM_HEADER
28
29#include "yat/utility/Exception.h"
30
31#include <algorithm>
32#include <cassert>
33#include <sstream>
34
35// bam1_seq_seti is not defined in 0.1.18 (or older), so
36// backport. Code is taken bam.h in version 0.1.19-dev.
37#ifndef bam1_seq_seti
38#define bam1_seq_seti(s, i, c) ( (s)[(i)>>1] = \
39  ((s)[(i)>>1] & 0xf<<(((i)&1)<<2)) | (c)<<((~(i)&1)<<2) )
40#endif
41
42namespace theplu {
43namespace yat {
44namespace omic {
45
46  BamRead::BamRead(void)
47    : bam_(bam_init1())
48  {}
49
50
51  BamRead::BamRead(const BamRead& other)
52    : bam_(bam_dup1(other.bam_))
53  {
54  }
55
56
57  BamRead::~BamRead(void)
58  {
59    bam_destroy1(bam_);
60  }
61
62
63  const BamRead& BamRead::operator=(const BamRead& rhs)
64  {
65    if (bam_!=rhs.bam_)
66      bam_copy1(bam_, rhs.bam_);
67    return *this;
68  }
69
70
71  const uint8_t* BamRead::aux(void) const
72  {
73    assert(bam_);
74    return bam1_aux(bam_);
75  }
76
77
78  const uint8_t* BamRead::aux(const char tag[2]) const
79  {
80    assert(bam_);
81    return bam_aux_get(bam_, tag);
82  }
83
84
85  void BamRead::aux_append(const char tag[2], char type,int len,uint8_t* data)
86  {
87    assert(bam_);
88    bam_aux_append(bam_, tag, type, len, data);
89  }
90
91
92  void BamRead::aux_del(const char tag[2])
93  {
94    assert(bam_);
95    uint8_t* s = bam_aux_get(bam_, tag);
96    if (!s) {
97      std::stringstream msg("BamRead::aux_del: absent tag: ");
98      msg << "'" << tag << "'";
99      throw utility::runtime_error(msg.str());
100    }
101    bam_aux_del(bam_, s);
102  }
103
104
105  int BamRead::aux_size(void) const
106  {
107    assert(bam_);
108    return bam_->l_aux;
109  }
110
111
112  const uint32_t* BamRead::cigar(void) const
113  {
114    return bam1_cigar(bam_);
115  }
116
117
118  uint32_t BamRead::cigar(size_t i) const
119  {
120    assert(i<core().n_cigar);
121    return bam1_cigar(bam_)[i];
122  }
123
124
125  uint32_t BamRead::cigar_op(size_t i) const
126  {
127    assert(i<core().n_cigar);
128    return bam_cigar_op(cigar(i));
129  }
130
131
132  uint32_t BamRead::cigar_oplen(size_t i) const
133  {
134    assert(i<core().n_cigar);
135    return bam_cigar_oplen(cigar(i));
136  }
137
138
139  std::string BamRead::cigar_str(void) const
140  {
141    std::ostringstream os;
142    std::string str = BAM_CIGAR_STR;
143    for (uint32_t i=0; i<core().n_cigar; ++i) {
144      os << bam_cigar_oplen(bam1_cigar(bam_)[i])
145         << str[bam_cigar_op(bam1_cigar(bam_)[i])];
146    }
147    return os.str();
148  }
149
150
151  void BamRead::cigar(const std::vector<uint32_t>& c)
152  {
153    int offset = 4*c.size() - 4*core().n_cigar;
154    reserve(bam_->data_len + offset);
155    /*
156      data comes as qname, cigar, rest
157     */
158    // first move remaining data to new location
159    if (offset)
160      memmove(bam1_seq(bam_)+offset, bam1_seq(bam_),
161              bam_->data_len - 4*core().n_cigar - core().l_qname);
162
163    // copy new cigar
164    if (c.size())
165      // each cigar element is 4 bytes
166      memcpy(bam1_cigar(bam_), &c[0], 4*c.size());
167    bam_->data_len += offset;
168    core().n_cigar = c.size();
169    assert(bam_->data_len <= bam_->m_data);
170  }
171
172
173  const bam1_core_t& BamRead::core(void) const
174  { return bam_->core; }
175
176
177  bam1_core_t& BamRead::core(void)
178  { return bam_->core; }
179
180
181  const char* BamRead::name(void) const
182  { return bam1_qname(bam_); }
183
184
185  uint8_t BamRead::qual(size_t i) const
186  { return *(bam1_qual(bam_)+i); }
187
188
189  void BamRead::qual(size_t i, uint8_t q)
190  { *(bam1_qual(bam_)+i)=q; }
191
192
193  void BamRead::reserve(int n)
194  {
195    assert(bam_);
196    if (bam_->m_data >= n)
197      return;
198    // at least double capacity
199    bam_->m_data = std::max(bam_->m_data<<1, n);
200    bam_->data = (uint8_t*)realloc(bam_->data, bam_->m_data);
201    if (!bam_->data) {
202      throw utility::runtime_error("BamRead::reserve failed");
203    }
204  }
205
206
207  std::string BamRead::sequence(void) const
208  {
209    std::string result(sequence_length(), ' ');
210#ifdef HAVE_BAM_NT16_REV_TABLE
211    for (size_t i=0; i<result.size(); ++i)
212      result[i] = bam_nt16_rev_table[sequence(i)];
213#else
214    std::string table = "=ACMGRSVTWYHKDBN";
215    for (size_t i=0; i<result.size(); ++i)
216      result[i] = table[sequence(i)];
217#endif
218    return result;
219  }
220
221
222  void BamRead::sequence(const std::string& seq,
223                         const std::vector<uint8_t>& qual)
224  {
225    assert(seq.size()==qual.size());
226    // data consist of
227    // |-- qname -->-- cigar -->-- seq -->-- qual -->-- aux -->
228
229    int aux_pos = bam1_seq(bam_) - bam_->data + (seq.size()+1)/2 + qual.size();
230    int len = aux_pos + bam_->l_aux;
231    reserve(len);
232    // move aux to its new location
233    memmove(static_cast<void*>(bam_->data+aux_pos),
234            static_cast<const void*>(bam1_aux(bam_)), bam_->l_aux);
235    // copy sequence
236    for (size_t i=0; i<seq.size(); ++i)
237      sequence(i, bam_nt16_table[static_cast<size_t>(seq[i])]);
238    core().l_qseq = seq.size();
239    // copy quality
240    if (qual.size()) // FIXME is this needed or does memcpy work with n=0?
241      memcpy(static_cast<void*>(bam1_qual(bam_)),
242             static_cast<const void*>(&qual[0]), qual.size());
243    bam_->data_len = len;
244    assert(bam_->data_len <= bam_->m_data);
245  }
246
247
248  void BamRead::sequence(size_t i, uint8_t seq)
249  {
250    assert(bam_);
251    assert(seq < 16);
252    bam1_seq_seti(bam1_seq(bam_), i, seq);
253  }
254
255
256  uint8_t BamRead::sequence(size_t index) const
257  {
258    assert(bam_);
259    return bam1_seqi(bam1_seq(bam_),index);
260  }
261
262
263  uint32_t BamRead::sequence_length(void) const
264  {
265    assert(bam_);
266    return bam_->core.l_qseq;
267  }
268
269
270  int32_t BamRead::pos(void) const
271  {
272    assert(bam_);
273    return bam_->core.pos;
274  }
275
276
277  int32_t BamRead::tid(void) const
278  {
279    assert(bam_);
280    return bam_->core.tid;
281  }
282
283
284  int32_t BamRead::mtid(void) const
285  {
286    assert(bam_);
287    return bam_->core.mtid;
288  }
289
290
291  int32_t BamRead::mpos(void) const
292  {
293    assert(bam_);
294    return bam_->core.mpos;
295  }
296
297
298  int32_t BamRead::end(void) const
299  {
300    assert(bam_);
301    return bam_calend(&core(), bam1_cigar(bam_));
302  }
303
304
305  void BamRead::swap(BamRead& other)
306  {
307    std::swap(bam_, other.bam_);
308  }
309
310
311  void swap(BamRead& lhs, BamRead& rhs)
312  {
313    lhs.swap(rhs);
314  }
315
316  bool same_query_name(const BamRead& lhs, const BamRead& rhs)
317  {
318    if (lhs.core().l_qname != rhs.core().l_qname)
319      return false;
320
321    return std::equal(lhs.name(),
322                      lhs.name() + lhs.core().l_qname,
323                      rhs.name());
324  }
325
326
327  bool soft_clipped(const BamRead& bam)
328  {
329    return left_soft_clipped(bam) || right_soft_clipped(bam);
330  }
331
332
333  uint32_t left_soft_clipped(const BamRead& bam)
334  {
335    for (size_t i=0; i<bam.core().n_cigar; ++i) {
336      if (bam.cigar_op(i) == BAM_CSOFT_CLIP)
337        return bam.cigar_oplen(i);
338      if (bam_cigar_type(bam.cigar_op(i))&1)
339        return 0;
340    }
341    return 0;
342  }
343
344
345  uint32_t right_soft_clipped(const BamRead& bam)
346  {
347    if (bam.core().n_cigar==0)
348      return 0;
349    // right clip can't be at first (most left cigar element)
350    for (size_t i=bam.core().n_cigar-1; i>0; --i) {
351      if (bam.cigar_op(i) == BAM_CSOFT_CLIP)
352        return bam.cigar_oplen(i);
353      if (bam_cigar_type(bam.cigar_op(i))&1)
354        return 0;
355    }
356    return 0;
357  }
358
359
360  bool BamLessPos::operator()(const BamRead& lhs, const BamRead& rhs) const
361  {
362    return lhs.tid() < rhs.tid() ||
363      (lhs.tid() == rhs.tid() && lhs.pos() < rhs.pos());
364  }
365
366
367  bool BamLessEnd::operator()(const BamRead& lhs, const BamRead& rhs) const
368  {
369    return lhs.tid() < rhs.tid() ||
370      (lhs.tid() == rhs.tid() && lhs.end() < rhs.end());
371  }
372
373}}}
Note: See TracBrowser for help on using the repository browser.