22 #include <seqan3/alphabet/detail/convert.hpp>
85 template <
typename stream_type,
86 typename seq_legal_alph_type,
87 typename ref_seqs_type,
88 typename ref_ids_type,
92 typename ref_seq_type,
94 typename ref_offset_type,
101 typename tag_dict_type,
102 typename e_value_type,
103 typename bit_score_type>
106 ref_seqs_type & ref_seqs,
111 offset_type & offset,
112 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
113 ref_id_type & ref_id,
114 ref_offset_type & ref_offset,
116 cigar_type & cigar_vector,
120 tag_dict_type & tag_dict,
121 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
122 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
124 template <
typename stream_type,
125 typename header_type,
128 typename ref_seq_type,
129 typename ref_id_type,
134 typename tag_dict_type>
137 [[maybe_unused]] header_type && header,
138 [[maybe_unused]] seq_type && seq,
139 [[maybe_unused]] qual_type && qual,
140 [[maybe_unused]] id_type &&
id,
141 [[maybe_unused]] int32_t
const offset,
142 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
143 [[maybe_unused]] ref_id_type && ref_id,
145 [[maybe_unused]] align_type && align,
146 [[maybe_unused]] cigar_type && cigar_vector,
147 [[maybe_unused]]
sam_flag const flag,
148 [[maybe_unused]] uint8_t
const mapq,
149 [[maybe_unused]] mate_type && mate,
150 [[maybe_unused]] tag_dict_type && tag_dict,
151 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
152 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
156 bool header_was_read{
false};
162 struct alignment_record_core
167 uint32_t l_read_name:8;
170 uint32_t n_cigar_op:16;
188 ret[
static_cast<index_t
>(
'I')] = 1;
189 ret[
static_cast<index_t
>(
'D')] = 2;
190 ret[
static_cast<index_t
>(
'N')] = 3;
191 ret[
static_cast<index_t
>(
'S')] = 4;
192 ret[
static_cast<index_t
>(
'H')] = 5;
193 ret[
static_cast<index_t
>(
'P')] = 6;
194 ret[
static_cast<index_t
>(
'=')] = 7;
195 ret[
static_cast<index_t
>(
'X')] = 8;
202 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
205 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
206 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
207 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
208 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
209 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
213 using format_sam_base::read_field;
221 template <
typename stream_view_type, std::
integral number_type>
222 void read_field(stream_view_type && stream_view, number_type & target)
224 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
232 template <
typename stream_view_type>
233 void read_field(stream_view_type && stream_view,
float & target)
235 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
248 template <
typename stream_view_type,
typename optional_value_type>
251 optional_value_type tmp;
252 read_field(std::forward<stream_view_type>(stream_view), tmp);
256 template <
typename stream_view_type,
typename value_type>
258 stream_view_type && stream_view,
259 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
261 template <
typename stream_view_type>
262 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
264 template <
typename cigar_input_type>
265 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
267 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
271 template <
typename stream_type,
272 typename seq_legal_alph_type,
273 typename ref_seqs_type,
274 typename ref_ids_type,
277 typename offset_type,
278 typename ref_seq_type,
279 typename ref_id_type,
280 typename ref_offset_type,
287 typename tag_dict_type,
288 typename e_value_type,
289 typename bit_score_type>
292 ref_seqs_type & ref_seqs,
297 offset_type & offset,
298 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
299 ref_id_type & ref_id,
300 ref_offset_type & ref_offset,
302 cigar_type & cigar_vector,
306 tag_dict_type & tag_dict,
307 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
308 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
310 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
311 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
312 "The ref_offset must be a specialisation of std::optional.");
314 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
315 "The type of field::mapq must be uint8_t.");
317 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
318 "The type of field::flag must be seqan3::sam_flag.");
323 [[maybe_unused]] int32_t offset_tmp{};
324 [[maybe_unused]] int32_t soft_clipping_end{};
326 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
330 if (!header_was_read)
337 read_field(stream_view, tmp32);
339 bool const header_present{tmp32 > 0};
345 read_field(stream_view, n_ref);
347 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
349 read_field(stream_view, tmp32);
351 string_buffer.
resize(tmp32 - 1);
352 std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.
data());
353 std::ranges::next(std::ranges::begin(stream_view));
355 read_field(stream_view, tmp32);
360 header.
ref_ids().push_back(string_buffer);
366 auto id_it = header.
ref_dict.find(string_buffer);
371 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
372 "' found in BAM file header (header.ref_ids():",
375 else if (id_it->second != ref_idx)
377 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
378 " does not correspond to the position ", id_it->second,
379 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
381 else if (std::get<0>(header.
ref_id_info[id_it->second]) != tmp32)
383 throw format_error{
"Provided reference has unequal length as specified in the header."};
387 header_was_read =
true;
389 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
395 alignment_record_core core;
398 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
400 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
401 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
403 else if (core.refID > -1)
414 if constexpr (!detail::decays_to_ignore_v<mate_type>)
416 if (core.next_refID > -1)
417 get<0>(
mate) = core.next_refID;
419 if (core.next_pos > -1)
420 get<1>(
mate) = core.next_pos;
422 get<2>(
mate) = core.tlen;
428 std::ranges::next(std::ranges::begin(stream_view));
432 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
434 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
435 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
449 auto seq_stream = stream_view
457 if constexpr (detail::decays_to_ignore_v<seq_type>)
459 auto skip_sequence_bytes = [&] ()
461 detail::consume(seq_stream);
463 std::ranges::next(std::ranges::begin(stream_view));
466 if constexpr (!detail::decays_to_ignore_v<align_type>)
469 "If you want to read ALIGNMENT but not SEQ, the alignment"
470 " object must store a sequence container at the second (query) position.");
472 if (!tmp_cigar_vector.empty())
474 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
475 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
476 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
478 get<1>(align).reserve(seq_length);
480 auto tmp_iter = std::ranges::begin(seq_stream);
481 std::ranges::advance(tmp_iter, offset_tmp / 2);
485 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
490 for (
size_t i = (seq_length / 2); i > 0; --i)
492 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
493 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
499 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
504 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
508 skip_sequence_bytes();
514 skip_sequence_bytes();
519 using alph_t = std::ranges::range_value_t<decltype(
seq)>;
520 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
522 for (
auto [d1, d2] : seq_stream)
532 std::ranges::next(std::ranges::begin(stream_view));
535 if constexpr (!detail::decays_to_ignore_v<align_type>)
537 assign_unaligned(get<1>(align),
538 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
539 std::ranges::distance(
seq) - soft_clipping_end));
551 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
552 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
553 assert(remaining_bytes >= 0);
556 while (tags_view.size() > 0)
557 read_field(tags_view, tag_dict);
561 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
566 if (core.l_seq != 0 && offset_tmp == core.l_seq)
568 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
570 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
571 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
572 "stored in the optional field CG. You need to read in the field::tags and "
573 "field::seq in order to access this information.")};
577 auto it = tag_dict.find(
"CG"_tag);
579 if (it == tag_dict.end())
580 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
581 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
582 "stored in the optional field CG but this tag is not present in the given ",
585 auto cigar_view = std::views::all(std::get<std::string>(it->second));
586 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
587 offset_tmp = soft_clipping_end = 0;
588 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
591 if constexpr (!detail::decays_to_ignore_v<align_type>)
593 assign_unaligned(get<1>(align),
594 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
595 std::ranges::distance(
seq) - soft_clipping_end));
602 if constexpr (!detail::decays_to_ignore_v<align_type>)
603 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
605 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
606 std::swap(cigar_vector, tmp_cigar_vector);
610 template <
typename stream_type,
611 typename header_type,
614 typename ref_seq_type,
615 typename ref_id_type,
620 typename tag_dict_type>
623 [[maybe_unused]] header_type && header,
624 [[maybe_unused]] seq_type && seq,
625 [[maybe_unused]] qual_type && qual,
626 [[maybe_unused]] id_type &&
id,
627 [[maybe_unused]] int32_t
const offset,
628 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
629 [[maybe_unused]] ref_id_type && ref_id,
631 [[maybe_unused]] align_type && align,
632 [[maybe_unused]] cigar_type && cigar_vector,
633 [[maybe_unused]]
sam_flag const flag,
634 [[maybe_unused]] uint8_t
const mapq,
635 [[maybe_unused]] mate_type && mate,
636 [[maybe_unused]] tag_dict_type && tag_dict,
637 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
638 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
643 static_assert((std::ranges::forward_range<seq_type> &&
644 alphabet<std::ranges::range_reference_t<seq_type>>),
645 "The seq object must be a std::ranges::forward_range over "
646 "letters that model seqan3::alphabet.");
648 static_assert((std::ranges::forward_range<id_type> &&
649 alphabet<std::ranges::range_reference_t<id_type>>),
650 "The id object must be a std::ranges::forward_range over "
651 "letters that model seqan3::alphabet.");
653 static_assert((std::ranges::forward_range<ref_seq_type> &&
654 alphabet<std::ranges::range_reference_t<ref_seq_type>>),
655 "The ref_seq object must be a std::ranges::forward_range "
656 "over letters that model seqan3::alphabet.");
658 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
660 static_assert((std::ranges::forward_range<ref_id_type> ||
663 "The ref_id object must be a std::ranges::forward_range "
664 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
668 "The align object must be a std::pair of two ranges whose "
669 "value_type is comparable to seqan3::gap");
672 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
673 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
674 "The align object must be a std::pair of two ranges whose "
675 "value_type is comparable to seqan3::gap");
677 static_assert((std::ranges::forward_range<qual_type> &&
678 alphabet<std::ranges::range_reference_t<qual_type>>),
679 "The qual object must be a std::ranges::forward_range "
680 "over letters that model seqan3::alphabet.");
683 "The mate object must be a std::tuple of size 3 with "
684 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
685 "2) a std::integral or std::optional<std::integral>, and "
686 "3) a std::integral.");
688 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
694 "The mate object must be a std::tuple of size 3 with "
695 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
696 "2) a std::integral or std::optional<std::integral>, and "
697 "3) a std::integral.");
700 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
702 if constexpr (detail::decays_to_ignore_v<header_type>)
704 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
705 "You can either construct the output file with ref_ids and ref_seqs information and "
706 "the header will be created for you, or you can access the `header` member directly."};
717 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
722 if (!header_was_written)
726 write_header(os, options, header);
727 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
728 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
732 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
733 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
735 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
737 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
738 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
740 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
743 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
746 header_was_written =
true;
752 int32_t ref_length{};
756 if (!std::ranges::empty(cigar_vector))
758 int32_t dummy_seq_length{};
759 for (
auto & [
count, operation] : cigar_vector)
760 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
762 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
764 ref_length = std::ranges::distance(get<1>(align));
770 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) -
offset};
772 for (
auto chr : get<1>(align))
776 off_end -= ref_length;
777 cigar_vector = detail::get_cigar_vector(align,
offset, off_end);
780 if (cigar_vector.size() >= (1 << 16))
782 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
783 cigar_vector.resize(2);
784 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
785 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_operation};
788 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
795 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
796 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
798 alignment_record_core core
806 static_cast<uint16_t
>(cigar_vector.size()),
808 static_cast<int32_t
>(std::ranges::distance(
seq)),
810 get<1>(
mate).value_or(-1),
814 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
815 [[maybe_unused]]
auto & id_target)
819 if constexpr (!detail::decays_to_ignore_v<id_t>)
821 if constexpr (std::integral<id_t>)
823 id_target = id_source;
825 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
827 id_target = id_source.value_or(-1);
831 if (!std::ranges::empty(id_source))
835 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
836 std::ranges::sized_range<decltype(id_source)> &&
837 std::ranges::borrowed_range<decltype(id_source)>)
847 "The ref_id type is not convertible to the reference id information stored in the "
848 "reference dictionary of the header object.");
850 id_it = header.
ref_dict.find(id_source);
855 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
856 "not be found in BAM header ref_dict: ",
860 id_target = id_it->second;
867 check_and_assign_id_to(
ref_id, core.refID);
870 check_and_assign_id_to(get<0>(
mate), core.next_refID);
873 core.block_size =
sizeof(core) - 4 +
875 core.n_cigar_op * 4 +
876 (core.l_seq + 1) / 2 +
878 tag_dict_binary_str.
size();
880 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
882 if (std::ranges::empty(
id))
885 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
889 for (
auto [cigar_count, op] : cigar_vector)
891 cigar_count = cigar_count << 4;
892 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
893 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
897 using alph_t = std::ranges::range_value_t<seq_type>;
898 constexpr
auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
900 auto sit = std::ranges::begin(
seq);
901 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
906 stream_it =
static_cast<char>(compressed_chr);
910 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
913 if (std::ranges::empty(
qual))
916 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
920 if (std::ranges::distance(
qual) != core.l_seq)
921 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
922 core.l_seq,
". Got quality with size ",
923 std::ranges::distance(
qual),
" instead.")};
926 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
930 stream << tag_dict_binary_str;
935 template <
typename stream_view_type,
typename value_type>
937 stream_view_type && stream_view,
938 value_type const & SEQAN3_DOXYGEN_ONLY(value))
941 read_field(stream_view,
count);
948 read_field(stream_view, tmp);
972 template <
typename stream_view_type>
973 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
982 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
985 tag +=
static_cast<uint16_t
>(*it);
1003 read_field(stream_view, tmp);
1004 target[tag] =
static_cast<int32_t
>(tmp);
1010 read_field(stream_view, tmp);
1011 target[tag] =
static_cast<int32_t
>(tmp);
1017 read_field(stream_view, tmp);
1018 target[tag] =
static_cast<int32_t
>(tmp);
1024 read_field(stream_view, tmp);
1025 target[tag] =
static_cast<int32_t
>(tmp);
1031 read_field(stream_view, tmp);
1038 read_field(stream_view, tmp);
1039 target[tag] =
static_cast<int32_t
>(tmp);
1045 read_field(stream_view, tmp);
1051 string_buffer.
clear();
1052 while (!is_char<'\0'>(*it))
1058 target[tag] = string_buffer;
1065 while (!is_char<'\0'>(*it))
1067 string_buffer.
clear();
1072 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1076 read_field(string_buffer, value);
1080 target[tag] = byte_array;
1085 char array_value_type_id = *it;
1088 switch (array_value_type_id)
1091 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1094 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1097 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1100 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1103 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1106 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1109 read_sam_dict_vector(target[tag], stream_view,
float{});
1112 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1113 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1118 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1119 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1137 template <
typename cigar_input_type>
1138 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1141 char operation{
'\0'};
1143 int32_t ref_length{}, seq_length{};
1144 uint32_t operation_and_count{};
1145 constexpr
char const * cigar_mapping =
"MIDNSHP=X*******";
1146 constexpr uint32_t cigar_mask = 0x0f;
1148 if (n_cigar_op == 0)
1149 return std::tuple{operations, ref_length, seq_length};
1153 while (n_cigar_op > 0)
1155 std::ranges::copy_n(std::ranges::begin(cigar_input),
1156 sizeof(operation_and_count),
1157 reinterpret_cast<char*
>(&operation_and_count));
1158 operation = cigar_mapping[operation_and_count & cigar_mask];
1159 count = operation_and_count >> 4;
1161 detail::update_alignment_lengths(ref_length, seq_length, operation,
count);
1166 return std::tuple{operations, ref_length, seq_length};
1172 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1176 auto stream_variant_fn = [&result] (
auto && arg)
1181 if constexpr (std::same_as<T, int32_t>)
1184 size_t const absolute_arg = std::abs(arg);
1186 bool const negative = arg < 0;
1187 n = n * n + 2 * negative;
1193 result[result.size() - 1] =
'C';
1194 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1199 result[result.size() - 1] =
'S';
1200 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1205 result[result.size() - 1] =
'c';
1206 int8_t tmp =
static_cast<int8_t
>(arg);
1207 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1212 result[result.size() - 1] =
's';
1213 int16_t tmp =
static_cast<int16_t
>(arg);
1214 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1219 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1224 else if constexpr (std::same_as<T, std::string>)
1226 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1228 else if constexpr (!std::ranges::range<T>)
1230 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1234 int32_t sz{
static_cast<int32_t
>(arg.size())};
1235 result.append(
reinterpret_cast<char *
>(&sz), 4);
1236 result.append(
reinterpret_cast<char const *
>(arg.data()),
1237 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1241 for (
auto & [tag, variant] : tag_dict)
1243 result.push_back(
static_cast<char>(tag / 256));
1244 result.push_back(
static_cast<char>(tag % 256));
1246 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1248 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1249 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:264
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:59
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:98
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:326
Provides various transformation traits used by the range module.
Auxiliary for pretty printing of exception messages.
Provides seqan3::debug_stream and related types.
Provides type traits for working with templates.
Provides seqan3::dna16sam.
T emplace_back(T... args)
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:168
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:150
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:145
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:114
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Provides various utility functions.
Auxiliary functions for the alignment IO.
Provides seqan3::views::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
Provides seqan3::sam_file_output_options.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
Provides seqan3::views::slice.
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.