source: trunk/yat/omic/BamHeader.cc @ 3477

Last change on this file since 3477 was 3477, checked in by Peter, 8 years ago

add functionality to access which RG IDs that are present in a BamHeader?

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.0 KB
Line 
1// $Id: BamHeader.cc 3477 2016-03-09 08:26:38Z peter $
2
3/*
4  Copyright (C) 2012, 2013, 2014, 2015 Peter Johansson
5
6  This file is part of the yat library, http://dev.thep.lu.se/yat
7
8  The yat library is free software; you can redistribute it and/or
9  modify it under the terms of the GNU General Public License as
10  published by the Free Software Foundation; either version 3 of the
11  License, or (at your option) any later version.
12
13  The yat library is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  General Public License for more details.
17
18  You should have received a copy of the GNU General Public License
19  along with this program. If not, see <http://www.gnu.org/licenses/>.
20*/
21
22#include <config.h>
23
24#include "BamHeader.h"
25
26#include "yat/utility/Exception.h"
27#include "yat/utility/split.h"
28#include "yat/utility/stl_utility.h"
29
30// we need to include 'sam_header.h' when compiling against libbam
31#ifndef YAT_HAVE_HTSLIB
32  #if YAT_HAVE_BAM_H
33  #include <sam_header.h>
34  #elif YAT_HAVE_BAM_BAM_H
35  #include <bam/sam_header.h>
36  #elif YAT_HAVE_SAMTOOLS_SAM_H
37  #include <samtools/sam_header.h>
38  #else
39  #error cannot end up here
40  #endif
41#endif
42
43#include <cassert>
44#include <cstdlib>
45#include <cstring>
46#include <string>
47#include <sstream>
48#include <utility>
49
50namespace theplu {
51namespace yat {
52namespace omic {
53
54  BamHeader::BamHeader(void)
55    : header_(NULL)
56  {
57  }
58
59
60  BamHeader::~BamHeader(void)
61  {
62#if YAT_HAVE_HTSLIB
63    bam_hdr_destroy(header_);
64#else
65    bam_header_destroy(header_);
66#endif
67  }
68
69
70  BamHeader::BamHeader(const BamHeader& other)
71    : read_group_(other.read_group_), program_group_(other.program_group_)
72  {
73#ifndef YAT_HAVE_HTSLIB
74    using namespace detail; // for bam_hdr_dup in YAT_HAVE_LIBBAM mode
75#endif
76    header_ = bam_hdr_dup(other.header_);
77  }
78
79
80  const std::string& BamHeader::group(strMap2& table,const std::string& type,
81                                      const std::string& id,
82                                      const std::string& key) const
83  {
84    update_group(table, type);
85    strMap2::const_iterator line_table = table.find(id);
86    if (line_table == table.end()) {
87      std::string msg = "@" + type + ": unknown ID: '" + id + "'";
88      throw utility::runtime_error(msg);
89    }
90    strMap::const_iterator it = line_table->second.find(key);
91    if (it == line_table->second.end()) {
92      std::string msg = "@" + type + ": unknown KEY: '" + key + "'";
93      throw utility::runtime_error(msg);
94    }
95    return it->second;
96  }
97
98
99  void BamHeader::update_group(strMap2& table, const std::string& type) const
100  {
101    assert(type.size()==2);
102    if (table.empty()) { // parse header
103      std::istringstream ss(text());
104      std::string line;
105      while (getline(ss, line)) {
106        if (line.substr(1,2)!=type)
107          continue;
108        // find ID value
109        size_t pos = line.find("\tID:", 3);
110        size_t end = line.find('\t', pos+4);
111        std::string id_val = line.substr(pos+4, end-(pos+4));
112        std::vector<std::string> vec;
113        utility::split(vec, line, '\t');
114        strMap& m = table[id_val];
115        for (size_t i=1; i<vec.size(); ++i) {
116          m[vec[i].substr(0,2)] = vec[i].substr(3);
117        }
118      }
119    }
120  }
121
122
123  void BamHeader::parse_region(const std::string& reg, int& id, int& begin,
124                               int& end) const
125  {
126    assert(header_);
127#if YAT_HAVE_HTSLIB
128    const char* b = reg.c_str();
129    const char* e = hts_parse_reg(b, &begin, &end);
130    // If begin > end suggests something is wrong and that is also how
131    // bam_parse_region (below in samtools impl) used to detect error
132    // and return non-zero.
133    if (begin<=end) {
134      std::string chr = reg.substr(0, e-b);
135      try {
136        id = tid(chr);
137        return;
138      }
139      catch (utility::runtime_error& e) {
140        ; // throw below instead
141      }
142    }
143#else
144    if (!bam_parse_region(header_, reg.c_str(), &id, &begin, &end))
145      return;
146#endif
147    std::ostringstream ss;
148    ss << "invalid region: '" << reg << "'";
149    throw utility::runtime_error(ss.str());
150  }
151
152
153  const std::string& BamHeader::program_group(const std::string& id,
154                                              const std::string& key) const
155  {
156    return group(program_group_, "PG", id, key);
157  }
158
159
160  const std::string& BamHeader::read_group(const std::string& id,
161                                           const std::string& key) const
162  {
163    return group(read_group_, "RG", id, key);
164  }
165
166
167  const std::map<std::string, std::string>&
168  BamHeader::read_group(const std::string& id) const
169  {
170    update_group(read_group_, "RG");
171    strMap2::const_iterator line_table = read_group_.find(id);
172    if (line_table == read_group_.end()) {
173      std::string msg = "@RG: unknown ID: '" + id + "'";
174      throw utility::runtime_error(msg);
175    }
176    return line_table->second;
177  }
178
179
180  BamHeader::read_group_iterator BamHeader::read_group_begin(void) const
181  {
182    update_group(read_group_, "RG");
183    BamHeader::strMap2::const_iterator it(read_group_.begin());
184    return utility::pair_first_iterator(it);
185  }
186
187
188  BamHeader::read_group_iterator BamHeader::read_group_end(void) const
189  {
190    update_group(read_group_, "RG");
191    BamHeader::strMap2::const_iterator it(read_group_.end());
192    return utility::pair_first_iterator(it);
193  }
194
195
196  void BamHeader::swap(BamHeader& other)
197  {
198    std::swap(header_, other.header_);
199    std::swap(read_group_, other.read_group_);
200    std::swap(program_group_, other.program_group_);
201  }
202
203
204  uint32_t BamHeader::target_length(size_t tid) const
205  {
206    assert(tid < static_cast<size_t>(n_targets()));
207    return header_->target_len[tid];
208  }
209
210
211  const char* BamHeader::target_name(size_t tid) const
212  {
213    assert(tid < static_cast<size_t>(n_targets()));
214    return header_->target_name[tid];
215  }
216
217
218  std::string BamHeader::text(void) const
219  {
220    assert(header_);
221    return std::string(header_->text, header_->l_text);
222  }
223
224
225  void BamHeader::text(const std::string& txt)
226  {
227    read_group_.clear();
228    program_group_.clear();
229#ifdef YAT_HAVE_HTSLIB
230    // destroy the old
231    bam_hdr_destroy(header_);
232    // and create header pointer
233    header_ = sam_hdr_parse(txt.size(), txt.c_str());
234#endif
235    header_->l_text = txt.size();
236    header_->text = (char*)realloc(header_->text, txt.size()+1);
237    memcpy(header_->text, txt.c_str(), txt.size()+1);
238#ifndef YAT_HAVE_HTSLIB
239    // destroy dict to ensure that things are updated during parsing
240    if (header_->dict) {
241      sam_header_free(header_->dict);
242      header_->dict = 0;
243    }
244    sam_header_parse(header_);
245#endif
246  }
247
248
249  int32_t BamHeader::tid(const std::string& name) const
250  {
251#if YAT_HAVE_HTSLIB
252    int res = bam_name2id(header_, name.c_str());
253    if (res==-1) {
254      std::string msg("invalid segment name:");
255      msg += name;
256      throw utility::runtime_error(msg);
257    }
258    return res;
259#else
260    if (!header_->hash) {
261      // We would like to call something like bam_init_header_hash,
262      // but unfortunately that function is not defined in any header
263      // (only in two source files). Instead we call bam_parse_region
264      // because a documented side-effect of that function is that
265      // hash get populated.
266      int tid, b, e;
267      bam_parse_region(header_, name.c_str(), &tid, &b, &e);
268      assert(header_->hash);
269      assert(tid == bam_get_tid(header_, name.c_str()));
270      return tid;
271    }
272    return bam_get_tid(header_, name.c_str());
273#endif
274  }
275
276
277  int32_t BamHeader::n_targets(void) const
278  {
279    assert(header_);
280    return header_->n_targets;
281  }
282
283
284  void swap(BamHeader& lhs, BamHeader& rhs)
285  {
286    lhs.swap(rhs);
287  }
288
289
290  BamHeader& BamHeader::operator=(const BamHeader& rhs)
291  {
292    BamHeader tmp(rhs);
293    swap(tmp);
294    return *this;
295  }
296
297#ifndef YAT_HAVE_HTSLIB
298  namespace detail {
299    bam_header_t * bam_hdr_dup(const bam_header_t* other)
300    {
301      // taken from htslib 1.2.1 and adjusted to fit old struct bam_header_t
302      if (!other)
303        return NULL;
304      bam_header_t* h = bam_header_init();
305      if (!h)
306        return NULL;
307      // copy stuff
308      h->n_targets = other->n_targets;
309      h->l_text = other->l_text;
310      h->n_text = other->n_text;
311      h->dict = NULL;
312      h->hash = NULL;
313      h->rg2lib = NULL;
314
315      h->text = (char*)calloc(h->l_text + 1, 1);
316      memcpy(h->text, other->text, h->l_text);
317      h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
318      h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
319      for (int i = 0; i < h->n_targets; ++i) {
320        h->target_len[i] = other->target_len[i];
321        h->target_name[i] = strdup(other->target_name[i]);
322      }
323      return h;
324    }
325  }
326#endif
327
328}}}
Note: See TracBrowser for help on using the repository browser.