SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <seqan3/std/concepts>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <string>
20 #include <vector>
21 
48 
49 namespace seqan3
50 {
51 
63 {
64 public:
68  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
69  format_bam() = default;
70  format_bam(format_bam const &) = default;
71  format_bam & operator=(format_bam const &) = default;
72  format_bam(format_bam &&) = default;
73  format_bam & operator=(format_bam &&) = default;
74  ~format_bam() = default;
75 
77 
80  {
81  { "bam" }
82  };
83 
84 protected:
85  template <typename stream_type, // constraints checked by file
86  typename seq_legal_alph_type,
87  typename ref_seqs_type,
88  typename ref_ids_type,
89  typename seq_type,
90  typename id_type,
91  typename offset_type,
92  typename ref_seq_type,
93  typename ref_id_type,
94  typename ref_offset_type,
95  typename align_type,
96  typename cigar_type,
97  typename flag_type,
98  typename mapq_type,
99  typename qual_type,
100  typename mate_type,
101  typename tag_dict_type,
102  typename e_value_type,
103  typename bit_score_type>
104  void read_alignment_record(stream_type & stream,
105  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
106  ref_seqs_type & ref_seqs,
108  seq_type & seq,
109  qual_type & qual,
110  id_type & id,
111  offset_type & offset,
112  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
113  ref_id_type & ref_id,
114  ref_offset_type & ref_offset,
115  align_type & align,
116  cigar_type & cigar_vector,
117  flag_type & flag,
118  mapq_type & mapq,
119  mate_type & mate,
120  tag_dict_type & tag_dict,
121  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
122  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
123 
124  template <typename stream_type,
125  typename header_type,
126  typename seq_type,
127  typename id_type,
128  typename ref_seq_type,
129  typename ref_id_type,
130  typename align_type,
131  typename cigar_type,
132  typename qual_type,
133  typename mate_type,
134  typename tag_dict_type>
135  void write_alignment_record([[maybe_unused]] stream_type & stream,
136  [[maybe_unused]] sam_file_output_options const & options,
137  [[maybe_unused]] header_type && header,
138  [[maybe_unused]] seq_type && seq,
139  [[maybe_unused]] qual_type && qual,
140  [[maybe_unused]] id_type && id,
141  [[maybe_unused]] int32_t const offset,
142  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
143  [[maybe_unused]] ref_id_type && ref_id,
144  [[maybe_unused]] std::optional<int32_t> ref_offset,
145  [[maybe_unused]] align_type && align,
146  [[maybe_unused]] cigar_type && cigar_vector,
147  [[maybe_unused]] sam_flag const flag,
148  [[maybe_unused]] uint8_t const mapq,
149  [[maybe_unused]] mate_type && mate,
150  [[maybe_unused]] tag_dict_type && tag_dict,
151  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
152  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
153 
154 private:
156  bool header_was_read{false};
157 
160 
163  { // naming corresponds to official SAM/BAM specifications
164  int32_t block_size;
165  int32_t refID;
166  int32_t pos;
167  uint32_t l_read_name:8;
168  uint32_t mapq:8;
169  uint32_t bin:16;
170  uint32_t n_cigar_op:16;
172  int32_t l_seq;
173  int32_t next_refID;
174  int32_t next_pos;
175  int32_t tlen;
176  };
177 
180  {
181  [] () constexpr
182  {
184 
185  using index_t = std::make_unsigned_t<char>;
186 
187  // ret['M'] = 0; set anyway by initialization
188  ret[static_cast<index_t>('I')] = 1;
189  ret[static_cast<index_t>('D')] = 2;
190  ret[static_cast<index_t>('N')] = 3;
191  ret[static_cast<index_t>('S')] = 4;
192  ret[static_cast<index_t>('H')] = 5;
193  ret[static_cast<index_t>('P')] = 6;
194  ret[static_cast<index_t>('=')] = 7;
195  ret[static_cast<index_t>('X')] = 8;
196 
197  return ret;
198  }()
199  };
200 
202  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
203  {
204  --end;
205  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
206  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
207  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
208  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
209  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
210  return 0;
211  }
212 
213  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
214 
221  template <typename stream_view_type, std::integral number_type>
222  void read_field(stream_view_type && stream_view, number_type & target)
223  {
224  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
225  }
226 
232  template <typename stream_view_type>
233  void read_field(stream_view_type && stream_view, float & target)
234  {
235  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
236  }
237 
248  template <typename stream_view_type, typename optional_value_type>
249  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
250  {
251  optional_value_type tmp;
252  read_field(std::forward<stream_view_type>(stream_view), tmp);
253  target = tmp;
254  }
255 
256  template <typename stream_view_type, typename value_type>
258  stream_view_type && stream_view,
259  value_type const & SEQAN3_DOXYGEN_ONLY(value));
260 
261  template <typename stream_view_type>
262  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
263 
264  template <typename cigar_input_type>
265  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
266 
267  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
268 };
269 
271 template <typename stream_type, // constraints checked by file
272  typename seq_legal_alph_type,
273  typename ref_seqs_type,
274  typename ref_ids_type,
275  typename seq_type,
276  typename id_type,
277  typename offset_type,
278  typename ref_seq_type,
279  typename ref_id_type,
280  typename ref_offset_type,
281  typename align_type,
282  typename cigar_type,
283  typename flag_type,
284  typename mapq_type,
285  typename qual_type,
286  typename mate_type,
287  typename tag_dict_type,
288  typename e_value_type,
289  typename bit_score_type>
290 inline void format_bam::read_alignment_record(stream_type & stream,
291  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
292  ref_seqs_type & ref_seqs,
294  seq_type & seq,
295  qual_type & qual,
296  id_type & id,
297  offset_type & offset,
298  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
299  ref_id_type & ref_id,
300  ref_offset_type & ref_offset,
301  align_type & align,
302  cigar_type & cigar_vector,
303  flag_type & flag,
304  mapq_type & mapq,
305  mate_type & mate,
306  tag_dict_type & tag_dict,
307  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
308  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
309 {
310  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
311  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
312  "The ref_offset must be a specialisation of std::optional.");
313 
314  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
315  "The type of field::mapq must be uint8_t.");
316 
317  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
318  "The type of field::flag must be seqan3::sam_flag.");
319 
320  auto stream_view = seqan3::views::istreambuf(stream);
321 
322  // these variables need to be stored to compute the ALIGNMENT
323  [[maybe_unused]] int32_t offset_tmp{};
324  [[maybe_unused]] int32_t soft_clipping_end{};
325  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
326  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
327 
328  // Header
329  // -------------------------------------------------------------------------------------------------------------
330  if (!header_was_read)
331  {
332  // magic BAM string
333  if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
334  throw format_error{"File is not in BAM format."};
335 
336  int32_t tmp32{};
337  read_field(stream_view, tmp32);
338 
339  bool const header_present{tmp32 > 0};
340 
341  if (header_present) // header text is present
342  read_header(stream_view | views::take_exactly_or_throw(tmp32), header, ref_seqs);
343 
344  int32_t n_ref;
345  read_field(stream_view, n_ref);
346 
347  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
348  {
349  read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
350 
351  string_buffer.resize(tmp32 - 1);
352  std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
353  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
354 
355  read_field(stream_view, tmp32); // l_ref (length of reference sequence)
356 
357  // If there was no header text, we parse reference sequences block as header information
358  if (!header_present)
359  {
360  header.ref_ids().push_back(string_buffer);
361  header.ref_id_info.emplace_back(tmp32, "");
362  header.ref_dict[(header.ref_ids())[(header.ref_ids()).size() - 1]] = (header.ref_ids()).size() - 1;
363  continue;
364  }
365 
366  auto id_it = header.ref_dict.find(string_buffer);
367 
368  // sanity checks of reference information to existing header object:
369  if (id_it == header.ref_dict.end()) // [unlikely]
370  {
371  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
372  "' found in BAM file header (header.ref_ids():",
373  header.ref_ids(), ").")};
374  }
375  else if (id_it->second != ref_idx) // [unlikely]
376  {
377  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
378  " does not correspond to the position ", id_it->second,
379  " in the header (header.ref_ids():", header.ref_ids(), ").")};
380  }
381  else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
382  {
383  throw format_error{"Provided reference has unequal length as specified in the header."};
384  }
385  }
386 
387  header_was_read = true;
388 
389  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
390  return;
391  }
392 
393  // read alignment record into buffer
394  // -------------------------------------------------------------------------------------------------------------
396  std::ranges::copy(stream_view | views::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
397 
398  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
399  {
400  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
401  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
402  }
403  else if (core.refID > -1) // not unmapped
404  {
405  ref_id = core.refID; // field::ref_id
406  }
407 
408  flag = core.flag; // field::flag
409  mapq = core.mapq; // field::mapq
410 
411  if (core.pos > -1) // [[likely]]
412  ref_offset = core.pos; // field::ref_offset
413 
414  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
415  {
416  if (core.next_refID > -1)
417  get<0>(mate) = core.next_refID;
418 
419  if (core.next_pos > -1) // [[likely]]
420  get<1>(mate) = core.next_pos;
421 
422  get<2>(mate) = core.tlen;
423  }
424 
425  // read id
426  // -------------------------------------------------------------------------------------------------------------
427  read_field(stream_view | views::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
428  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
429 
430  // read cigar string
431  // -------------------------------------------------------------------------------------------------------------
432  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
433  {
434  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
435  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
436  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
437  }
438  else
439  {
441  }
442 
443  offset = offset_tmp;
444 
445  // read sequence
446  // -------------------------------------------------------------------------------------------------------------
447  if (core.l_seq > 0) // sequence information is given
448  {
449  auto seq_stream = stream_view
450  | views::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
452  {
453  return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
454  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
455  });
456 
457  if constexpr (detail::decays_to_ignore_v<seq_type>)
458  {
459  auto skip_sequence_bytes = [&] ()
460  {
461  detail::consume(seq_stream);
462  if (core.l_seq & 1)
463  std::ranges::next(std::ranges::begin(stream_view));
464  };
465 
466  if constexpr (!detail::decays_to_ignore_v<align_type>)
467  {
468  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
469  "If you want to read ALIGNMENT but not SEQ, the alignment"
470  " object must store a sequence container at the second (query) position.");
471 
472  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
473  {
474  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
475  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
476  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
477 
478  get<1>(align).reserve(seq_length);
479 
480  auto tmp_iter = std::ranges::begin(seq_stream);
481  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
482 
483  if (offset_tmp & 1)
484  {
485  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
486  ++tmp_iter;
487  --seq_length;
488  }
489 
490  for (size_t i = (seq_length / 2); i > 0; --i)
491  {
492  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
493  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
494  ++tmp_iter;
495  }
496 
497  if (seq_length & 1)
498  {
499  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
500  ++tmp_iter;
501  --soft_clipping_end;
502  }
503 
504  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
505  }
506  else
507  {
508  skip_sequence_bytes();
509  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
510  }
511  }
512  else
513  {
514  skip_sequence_bytes();
515  }
516  }
517  else
518  {
519  using alph_t = std::ranges::range_value_t<decltype(seq)>;
520  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
521 
522  for (auto [d1, d2] : seq_stream)
523  {
524  seq.push_back(from_dna16[to_rank(d1)]);
525  seq.push_back(from_dna16[to_rank(d2)]);
526  }
527 
528  if (core.l_seq & 1)
529  {
530  dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
531  seq.push_back(from_dna16[to_rank(d)]);
532  std::ranges::next(std::ranges::begin(stream_view));
533  }
534 
535  if constexpr (!detail::decays_to_ignore_v<align_type>)
536  {
537  assign_unaligned(get<1>(align),
538  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
539  std::ranges::distance(seq) - soft_clipping_end));
540  }
541  }
542  }
543 
544  // read qual string
545  // -------------------------------------------------------------------------------------------------------------
546  read_field(stream_view | views::take_exactly_or_throw(core.l_seq)
547  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
548 
549  // All remaining optional fields if any: SAM tags dictionary
550  // -------------------------------------------------------------------------------------------------------------
551  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
552  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
553  assert(remaining_bytes >= 0);
554  auto tags_view = stream_view | views::take_exactly_or_throw(remaining_bytes);
555 
556  while (tags_view.size() > 0)
557  read_field(tags_view, tag_dict);
558 
559  // DONE READING - wrap up
560  // -------------------------------------------------------------------------------------------------------------
561  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
562  {
563  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
564  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
565  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
566  if (core.l_seq != 0 && offset_tmp == core.l_seq)
567  {
568  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
569  { // maybe only throw in debug mode and otherwise return an empty alignment?
570  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
571  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
572  "stored in the optional field CG. You need to read in the field::tags and "
573  "field::seq in order to access this information.")};
574  }
575  else
576  {
577  auto it = tag_dict.find("CG"_tag);
578 
579  if (it == tag_dict.end())
580  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
581  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
582  "stored in the optional field CG but this tag is not present in the given ",
583  "record.")};
584 
585  auto cigar_view = std::views::all(std::get<std::string>(it->second));
586  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
587  offset_tmp = soft_clipping_end = 0;
588  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
589  tag_dict.erase(it); // remove redundant information
590 
591  if constexpr (!detail::decays_to_ignore_v<align_type>)
592  {
593  assign_unaligned(get<1>(align),
594  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
595  std::ranges::distance(seq) - soft_clipping_end));
596  }
597  }
598  }
599  }
600 
601  // Alignment object construction
602  if constexpr (!detail::decays_to_ignore_v<align_type>)
603  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
604 
605  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
606  std::swap(cigar_vector, tmp_cigar_vector);
607 }
608 
610 template <typename stream_type,
611  typename header_type,
612  typename seq_type,
613  typename id_type,
614  typename ref_seq_type,
615  typename ref_id_type,
616  typename align_type,
617  typename cigar_type,
618  typename qual_type,
619  typename mate_type,
620  typename tag_dict_type>
621 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
622  [[maybe_unused]] sam_file_output_options const & options,
623  [[maybe_unused]] header_type && header,
624  [[maybe_unused]] seq_type && seq,
625  [[maybe_unused]] qual_type && qual,
626  [[maybe_unused]] id_type && id,
627  [[maybe_unused]] int32_t const offset,
628  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
629  [[maybe_unused]] ref_id_type && ref_id,
630  [[maybe_unused]] std::optional<int32_t> ref_offset,
631  [[maybe_unused]] align_type && align,
632  [[maybe_unused]] cigar_type && cigar_vector,
633  [[maybe_unused]] sam_flag const flag,
634  [[maybe_unused]] uint8_t const mapq,
635  [[maybe_unused]] mate_type && mate,
636  [[maybe_unused]] tag_dict_type && tag_dict,
637  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
638  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
639 {
640  // ---------------------------------------------------------------------
641  // Type Requirements (as static asserts for user friendliness)
642  // ---------------------------------------------------------------------
643  static_assert((std::ranges::forward_range<seq_type> &&
644  alphabet<std::ranges::range_reference_t<seq_type>>),
645  "The seq object must be a std::ranges::forward_range over "
646  "letters that model seqan3::alphabet.");
647 
648  static_assert((std::ranges::forward_range<id_type> &&
649  alphabet<std::ranges::range_reference_t<id_type>>),
650  "The id object must be a std::ranges::forward_range over "
651  "letters that model seqan3::alphabet.");
652 
653  static_assert((std::ranges::forward_range<ref_seq_type> &&
654  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
655  "The ref_seq object must be a std::ranges::forward_range "
656  "over letters that model seqan3::alphabet.");
657 
658  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
659  {
660  static_assert((std::ranges::forward_range<ref_id_type> ||
661  std::integral<std::remove_reference_t<ref_id_type>> ||
662  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
663  "The ref_id object must be a std::ranges::forward_range "
664  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
665  }
666 
668  "The align object must be a std::pair of two ranges whose "
669  "value_type is comparable to seqan3::gap");
670 
671  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
672  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
673  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
674  "The align object must be a std::pair of two ranges whose "
675  "value_type is comparable to seqan3::gap");
676 
677  static_assert((std::ranges::forward_range<qual_type> &&
678  alphabet<std::ranges::range_reference_t<qual_type>>),
679  "The qual object must be a std::ranges::forward_range "
680  "over letters that model seqan3::alphabet.");
681 
683  "The mate object must be a std::tuple of size 3 with "
684  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
685  "2) a std::integral or std::optional<std::integral>, and "
686  "3) a std::integral.");
687 
688  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
689  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
690  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
691  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
692  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
693  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
694  "The mate object must be a std::tuple of size 3 with "
695  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
696  "2) a std::integral or std::optional<std::integral>, and "
697  "3) a std::integral.");
698 
699  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
700  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
701 
702  if constexpr (detail::decays_to_ignore_v<header_type>)
703  {
704  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
705  "You can either construct the output file with ref_ids and ref_seqs information and "
706  "the header will be created for you, or you can access the `header` member directly."};
707  }
708  else
709  {
710  // ---------------------------------------------------------------------
711  // logical Requirements
712  // ---------------------------------------------------------------------
713 
714  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
715  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
716 
717  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
718 
719  // ---------------------------------------------------------------------
720  // Writing the BAM Header on first call
721  // ---------------------------------------------------------------------
722  if (!header_was_written)
723  {
724  stream << "BAM\1";
726  write_header(os, options, header); // write SAM header to temporary stream to query the size.
727  int32_t l_text{static_cast<int32_t>(os.str().size())};
728  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
729 
730  stream << os.str();
731 
732  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
733  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
734 
735  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
736  {
737  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
738  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
739  // write reference name:
740  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
741  stream_it = '\0';
742  // write reference sequence length:
743  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
744  }
745 
746  header_was_written = true;
747  }
748 
749  // ---------------------------------------------------------------------
750  // Writing the Record
751  // ---------------------------------------------------------------------
752  int32_t ref_length{};
753 
754  // if alignment is non-empty, replace cigar_vector.
755  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
756  if (!std::ranges::empty(cigar_vector))
757  {
758  int32_t dummy_seq_length{};
759  for (auto & [count, operation] : cigar_vector)
760  detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
761  }
762  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
763  {
764  ref_length = std::ranges::distance(get<1>(align));
765 
766  // compute possible distance from alignment end to sequence end
767  // which indicates soft clipping at the end.
768  // This should be replaced by a free count_gaps function for
769  // aligned sequences which is more efficient if possible.
770  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
771 
772  for (auto chr : get<1>(align))
773  if (chr == gap{})
774  ++off_end;
775 
776  off_end -= ref_length;
777  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
778  }
779 
780  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
781  {
782  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
783  cigar_vector.resize(2);
784  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
785  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
786  }
787 
788  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
789 
790  // Compute the value for the l_read_name field for the bam record.
791  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
792  // the data type to store the value is uint8_t and 255 is the maximal size.
793  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
794  // 2 bytes.
795  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
796  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
797 
799  {
800  /* block_size */ 0, // will be initialised right after
801  /* refID */ -1, // will be initialised right after
802  /* pos */ ref_offset.value_or(-1),
803  /* l_read_name */ read_name_size,
804  /* mapq */ mapq,
805  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
806  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
807  /* flag */ flag,
808  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
809  /* next_refId */ -1, // will be initialised right after
810  /* next_pos */ get<1>(mate).value_or(-1),
811  /* tlen */ get<2>(mate)
812  };
813 
814  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
815  [[maybe_unused]] auto & id_target)
816  {
817  using id_t = std::remove_reference_t<decltype(id_source)>;
818 
819  if constexpr (!detail::decays_to_ignore_v<id_t>)
820  {
821  if constexpr (std::integral<id_t>)
822  {
823  id_target = id_source;
824  }
825  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
826  {
827  id_target = id_source.value_or(-1);
828  }
829  else
830  {
831  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
832  {
833  auto id_it = header.ref_dict.end();
834 
835  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
836  std::ranges::sized_range<decltype(id_source)> &&
837  std::ranges::borrowed_range<decltype(id_source)>)
838  {
839  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
840  std::ranges::size(id_source)});
841  }
842  else
843  {
844  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
845 
846  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
847  "The ref_id type is not convertible to the reference id information stored in the "
848  "reference dictionary of the header object.");
849 
850  id_it = header.ref_dict.find(id_source);
851  }
852 
853  if (id_it == header.ref_dict.end())
854  {
855  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
856  "not be found in BAM header ref_dict: ",
857  header.ref_dict, ".")};
858  }
859 
860  id_target = id_it->second;
861  }
862  }
863  }
864  };
865 
866  // initialise core.refID
867  check_and_assign_id_to(ref_id, core.refID);
868 
869  // initialise core.next_refID
870  check_and_assign_id_to(get<0>(mate), core.next_refID);
871 
872  // initialise core.block_size
873  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
874  core.l_read_name +
875  core.n_cigar_op * 4 + // each int32_t has 4 bytes
876  (core.l_seq + 1) / 2 + // bitcompressed seq
877  core.l_seq + // quality string
878  tag_dict_binary_str.size();
879 
880  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
881 
882  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
883  stream_it = '*';
884  else
885  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
886  stream_it = '\0';
887 
888  // write cigar
889  for (auto [cigar_count, op] : cigar_vector)
890  {
891  cigar_count = cigar_count << 4;
892  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
893  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
894  }
895 
896  // write seq (bit-compressed: dna16sam characters go into one byte)
897  using alph_t = std::ranges::range_value_t<seq_type>;
898  constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
899 
900  auto sit = std::ranges::begin(seq);
901  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
902  {
903  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
904  ++sidx, ++sit;
905  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
906  stream_it = static_cast<char>(compressed_chr);
907  }
908 
909  if (core.l_seq & 1) // write one more
910  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
911 
912  // write qual
913  if (std::ranges::empty(qual))
914  {
915  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
916  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
917  }
918  else
919  {
920  if (std::ranges::distance(qual) != core.l_seq)
921  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
922  core.l_seq, ". Got quality with size ",
923  std::ranges::distance(qual), " instead.")};
924 
925  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
926  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
927  }
928 
929  // write optional fields
930  stream << tag_dict_binary_str;
931  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
932 }
933 
935 template <typename stream_view_type, typename value_type>
937  stream_view_type && stream_view,
938  value_type const & SEQAN3_DOXYGEN_ONLY(value))
939 {
940  int32_t count;
941  read_field(stream_view, count); // read length of vector
942  std::vector<value_type> tmp_vector;
943  tmp_vector.reserve(count);
944 
945  while (count > 0)
946  {
947  value_type tmp{};
948  read_field(stream_view, tmp);
949  tmp_vector.push_back(std::move(tmp));
950  --count;
951  }
952  variant = std::move(tmp_vector);
953 }
954 
972 template <typename stream_view_type>
973 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
974 {
975  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
976  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
977  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
978  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
979  by the length (int32_t) of the array, followed by the values.
980  */
981  auto it = std::ranges::begin(stream_view);
982  uint16_t tag = static_cast<uint16_t>(*it) << 8;
983  ++it; // skip char read before
984 
985  tag += static_cast<uint16_t>(*it);
986  ++it; // skip char read before
987 
988  char type_id = *it;
989  ++it; // skip char read before
990 
991  switch (type_id)
992  {
993  case 'A' : // char
994  {
995  target[tag] = *it;
996  ++it; // skip char that has been read
997  break;
998  }
999  // all integer sizes are possible
1000  case 'c' : // int8_t
1001  {
1002  int8_t tmp;
1003  read_field(stream_view, tmp);
1004  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1005  break;
1006  }
1007  case 'C' : // uint8_t
1008  {
1009  uint8_t tmp;
1010  read_field(stream_view, tmp);
1011  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1012  break;
1013  }
1014  case 's' : // int16_t
1015  {
1016  int16_t tmp;
1017  read_field(stream_view, tmp);
1018  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1019  break;
1020  }
1021  case 'S' : // uint16_t
1022  {
1023  uint16_t tmp;
1024  read_field(stream_view, tmp);
1025  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1026  break;
1027  }
1028  case 'i' : // int32_t
1029  {
1030  int32_t tmp;
1031  read_field(stream_view, tmp);
1032  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1033  break;
1034  }
1035  case 'I' : // uint32_t
1036  {
1037  uint32_t tmp;
1038  read_field(stream_view, tmp);
1039  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1040  break;
1041  }
1042  case 'f' : // float
1043  {
1044  float tmp;
1045  read_field(stream_view, tmp);
1046  target[tag] = tmp;
1047  break;
1048  }
1049  case 'Z' : // string
1050  {
1051  string_buffer.clear();
1052  while (!is_char<'\0'>(*it))
1053  {
1054  string_buffer.push_back(*it);
1055  ++it;
1056  }
1057  ++it; // skip \0
1058  target[tag] = string_buffer;
1059  break;
1060  }
1061  case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1062  {
1063  std::vector<std::byte> byte_array;
1064  std::byte value;
1065  while (!is_char<'\0'>(*it))
1066  {
1067  string_buffer.clear();
1068  string_buffer.push_back(*it);
1069  ++it;
1070 
1071  if (*it == '\0')
1072  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1073 
1074  string_buffer.push_back(*it);
1075  ++it;
1076  read_field(string_buffer, value);
1077  byte_array.push_back(value);
1078  }
1079  ++it; // skip \0
1080  target[tag] = byte_array;
1081  break;
1082  }
1083  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1084  {
1085  char array_value_type_id = *it;
1086  ++it; // skip char read before
1087 
1088  switch (array_value_type_id)
1089  {
1090  case 'c' : // int8_t
1091  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1092  break;
1093  case 'C' : // uint8_t
1094  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1095  break;
1096  case 's' : // int16_t
1097  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1098  break;
1099  case 'S' : // uint16_t
1100  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1101  break;
1102  case 'i' : // int32_t
1103  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1104  break;
1105  case 'I' : // uint32_t
1106  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1107  break;
1108  case 'f' : // float
1109  read_sam_dict_vector(target[tag], stream_view, float{});
1110  break;
1111  default:
1112  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1113  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1114  }
1115  break;
1116  }
1117  default:
1118  throw format_error{detail::to_string("The second character in the numerical id of a "
1119  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1120  }
1121 }
1122 
1137 template <typename cigar_input_type>
1138 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1139 {
1140  std::vector<cigar> operations{};
1141  char operation{'\0'};
1142  uint32_t count{};
1143  int32_t ref_length{}, seq_length{};
1144  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1145  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1146  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1147 
1148  if (n_cigar_op == 0) // [[unlikely]]
1149  return std::tuple{operations, ref_length, seq_length};
1150 
1151  // parse the rest of the cigar
1152  // -------------------------------------------------------------------------------------------------------------
1153  while (n_cigar_op > 0) // until stream is not empty
1154  {
1155  std::ranges::copy_n(std::ranges::begin(cigar_input),
1156  sizeof(operation_and_count),
1157  reinterpret_cast<char*>(&operation_and_count));
1158  operation = cigar_mapping[operation_and_count & cigar_mask];
1159  count = operation_and_count >> 4;
1160 
1161  detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1162  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1163  --n_cigar_op;
1164  }
1165 
1166  return std::tuple{operations, ref_length, seq_length};
1167 }
1168 
1173 {
1174  std::string result{};
1175 
1176  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1177  {
1178  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1179  using T = std::remove_cvref_t<decltype(arg)>;
1180 
1181  if constexpr (std::same_as<T, int32_t>)
1182  {
1183  // always choose the smallest possible representation [cCsSiI]
1184  size_t const absolute_arg = std::abs(arg);
1185  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1186  bool const negative = arg < 0;
1187  n = n * n + 2 * negative; // for switch case order
1188 
1189  switch (n)
1190  {
1191  case 0:
1192  {
1193  result[result.size() - 1] = 'C';
1194  result.append(reinterpret_cast<char const *>(&arg), 1);
1195  break;
1196  }
1197  case 1:
1198  {
1199  result[result.size() - 1] = 'S';
1200  result.append(reinterpret_cast<char const *>(&arg), 2);
1201  break;
1202  }
1203  case 2:
1204  {
1205  result[result.size() - 1] = 'c';
1206  int8_t tmp = static_cast<int8_t>(arg);
1207  result.append(reinterpret_cast<char const *>(&tmp), 1);
1208  break;
1209  }
1210  case 3:
1211  {
1212  result[result.size() - 1] = 's';
1213  int16_t tmp = static_cast<int16_t>(arg);
1214  result.append(reinterpret_cast<char const *>(&tmp), 2);
1215  break;
1216  }
1217  default:
1218  {
1219  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1220  break;
1221  }
1222  }
1223  }
1224  else if constexpr (std::same_as<T, std::string>)
1225  {
1226  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1227  }
1228  else if constexpr (!std::ranges::range<T>) // char, float
1229  {
1230  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1231  }
1232  else // std::vector of some arithmetic_type type
1233  {
1234  int32_t sz{static_cast<int32_t>(arg.size())};
1235  result.append(reinterpret_cast<char *>(&sz), 4);
1236  result.append(reinterpret_cast<char const *>(arg.data()),
1237  arg.size() * sizeof(std::ranges::range_value_t<T>));
1238  }
1239  };
1240 
1241  for (auto & [tag, variant] : tag_dict)
1242  {
1243  result.push_back(static_cast<char>(tag / 256));
1244  result.push_back(static_cast<char>(tag % 256));
1245 
1246  result.push_back(detail::sam_tag_type_char[variant.index()]);
1247 
1248  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1249  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1250 
1251  std::visit(stream_variant_fn, variant);
1252  }
1253 
1254  return result;
1255 }
1256 
1257 } // namespace seqan3
Provides seqan3::detail::convert_through_char_representation.
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:264
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:59
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:39
The alignment base format.
Definition: format_sam_base.hpp:62
void write_header(stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:637
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:189
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:83
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:419
void construct_alignment(align_type &align, std::vector< cigar > &cigar_vector, [[maybe_unused]] int32_t rid, [[maybe_unused]] ref_seqs_type &ref_seqs, [[maybe_unused]] int32_t ref_start, size_t ref_length)
Construct the field::alignment depending on the given information.
Definition: format_sam_base.hpp:231
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only.
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:63
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:290
void read_field(stream_view_type &&stream_view, std::optional< optional_value_type > &target)
Delegate parsing of std::optional types to parsing of the inner value type.
Definition: format_bam.hpp:249
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:621
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
void read_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:233
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1138
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1172
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:159
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:202
void read_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:222
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:936
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:156
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:180
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:80
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:33
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:158
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:155
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:326
T clear(T... args)
The Concepts library.
Provides various transformation traits used by the range module.
T data(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::debug_stream and related types.
Provides type traits for working with templates.
Provides seqan3::dna16sam.
T emplace_back(T... args)
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
std::vector< cigar > get_cigar_vector(alignment_type &&alignment, uint32_t const query_start_pos=0, uint32_t const query_end_pos=0, bool const extended_cigar=false)
Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition: cigar.hpp:201
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: cigar.hpp:263
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: misc.hpp:28
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:168
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:150
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:145
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:114
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Provides various utility functions.
Auxiliary functions for the alignment IO.
Provides seqan3::views::istreambuf.
T min(T... args)
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: cigar.hpp:134
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:38
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: cigar.hpp:105
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:36
std::string to_string(value_type &&...values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
Provides the seqan3::sam_file_header class.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_input_options.
Provides seqan3::sam_file_output_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:163
uint32_t bin
The bin number.
Definition: format_bam.hpp:169
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:171
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:170
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:173
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:172
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:165
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:166
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:164
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:167
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:175
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:168
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:174
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
T tuple_size_v
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.
T visit(T... args)