diff --git a/.github/workflows/dnmtools_release_linux.yml b/.github/workflows/dnmtools_release_linux.yml index 55ec3b16..24967e2c 100644 --- a/.github/workflows/dnmtools_release_linux.yml +++ b/.github/workflows/dnmtools_release_linux.yml @@ -26,6 +26,17 @@ jobs: apt-get update && apt-get install --no-install-recommends -y automake libgsl-dev && \ find /usr -name libz.so -exec rm {} \; && \ find /usr -name libgsl\*.so -exec rm {} \; && \ + git clone https://github.com/ebiggers/libdeflate.git && \ + cd libdeflate && \ + cmake -B build \ + -DLIBDEFLATE_BUILD_GZIP=off \ + -DLIBDEFLATE_BUILD_TESTS=off \ + -DLIBDEFLATE_BUILD_SHARED_LIB=off \ + -DCMAKE_VERBOSE_MAKEFILE=on \ + -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j4 && \ + cmake --install build --prefix=/usr/local && \ + cd .. && \ git clone --recursive https://github.com/samtools/htslib.git && \ cd htslib && \ autoreconf -i && \ @@ -35,14 +46,14 @@ jobs: --disable-libcurl \ --disable-lzma \ --disable-ref-cache \ - LDADD="-L/usr/local/lib" && \ + --with-libdeflate && \ make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a && \ cp libhts.a /usr/local/lib/ && \ cp -r ../htslib /usr/local/include/ && \ cd /workspace && \ autoreconf -i && \ mkdir build && cd build && \ - ../configure && \ + ../configure --with-libdeflate && \ make -j4 LDFLAGS="-static-libgcc -static-libstdc++ -s" run: | docker exec build-container bash -c "$SCRIPT" diff --git a/.github/workflows/dnmtools_release_macos.yml b/.github/workflows/dnmtools_release_macos.yml index 481ee31d..ea7e54e5 100644 --- a/.github/workflows/dnmtools_release_macos.yml +++ b/.github/workflows/dnmtools_release_macos.yml @@ -22,6 +22,18 @@ jobs: sudo cp $(brew --prefix gsl)/lib/*.a /opt/dnmtools/lib sudo cp -r $(brew --prefix zlib)/include/* /opt/dnmtools/include sudo cp -r $(brew --prefix gsl)/include/* /opt/dnmtools/include + - name: Build and install libdeflate + run: | + git clone https://github.com/ebiggers/libdeflate.git && \ + cd libdeflate && \ + cmake -B build \ + -DLIBDEFLATE_BUILD_GZIP=off \ + -DLIBDEFLATE_BUILD_TESTS=off \ + -DLIBDEFLATE_BUILD_SHARED_LIB=off \ + -DCMAKE_VERBOSE_MAKEFILE=on \ + -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j4 && \ + sudo cmake --install build --prefix=/opt/dnmtools - name: Build and install HTSlib run: | git clone --recursive https://github.com/samtools/htslib.git @@ -34,15 +46,18 @@ jobs: --disable-libcurl \ --disable-lzma \ --disable-ref-cache \ - --without-libdeflate \ - LDADD="-L/usr/local/lib" + --with-libdeflate \ + LDFLAGS="-L/opt/dnmtools/lib" CPPFLAGS="-I/opt/dnmtools/include" make -j4 CFLAGS="-Wall -O2 -fvisibility=hidden" libhts.a sudo cp libhts.a /opt/dnmtools/lib - name: Build dnmtools run: | ./autogen.sh mkdir build && cd build - ../configure CXX=g++-14 LDFLAGS="-L/opt/dnmtools/lib -static-libgcc -static-libstdc++ -Wl,-dead_strip" CPPFLAGS="-I/opt/dnmtools/include" + ../configure --with-libdeflate \ + CXX=g++-14 \ + LDFLAGS="-L/opt/dnmtools/lib -static-libgcc -static-libstdc++ -Wl,-dead_strip" \ + CPPFLAGS="-I/opt/dnmtools/include" make -j4 - name: Rename the binary run: mv build/dnmtools dnmtools_$(uname -m) diff --git a/configure.ac b/configure.ac index 512e1704..9916e10b 100644 --- a/configure.ac +++ b/configure.ac @@ -43,10 +43,20 @@ the LDFLAGS and CPPFLAGS variables to specify the directories where the GSL library and headers can be found. " +dnl arg for using libdeflate, which might happen by default anyway +AC_ARG_WITH([libdeflate], + [AS_HELP_STRING([--with-libdeflate], [use libdeflate for BAM output])], + [with_libdeflate=yes], [with_libdeflate=no]) + dnl check for required libraries AC_SEARCH_LIBS([pthread_create], [pthread], [], [AC_MSG_FAILURE(["pthread library not found"])]) AC_SEARCH_LIBS([gzopen], [z], [], [AC_MSG_FAILURE(["Zlib library not found"])]) -AC_SEARCH_LIBS([hts_version], [hts], [], [AC_MSG_FAILURE([$hts_fail_msg])], [-lz -lpthread]) +AS_IF([test "x$with_libdeflate" = "xyes"], + [ + AC_SEARCH_LIBS([libdeflate_deflate_compress], [deflate], [], + [AC_MSG_ERROR([--with-libdeflate specified but libdeflate not found])]) + ]) +AC_SEARCH_LIBS([hts_version], [hts], [], [AC_MSG_FAILURE([$hts_fail_msg])]) AC_SEARCH_LIBS([cblas_dgemm], [gslcblas], [], [AC_MSG_FAILURE([$gsl_fail_msg])]) AC_SEARCH_LIBS([gsl_blas_dgemm], [gsl], [], [AC_MSG_FAILURE([$gsl_fail_msg])]) diff --git a/src/abismal b/src/abismal index 634d0301..b63cd99e 160000 --- a/src/abismal +++ b/src/abismal @@ -1 +1 @@ -Subproject commit 634d030113e24f2c020e81093fd5c706210fc38a +Subproject commit b63cd99e99ba988657200d0117f7b3f8b4f1803e diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index b2721c55..7944afcc 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -41,10 +41,6 @@ /* HTSlib */ #include -using bamxx::bam_header; -using bamxx::bam_rec; -using bamxx::bgzf_file; - [[nodiscard]] inline bool is_cytosine(const char c) { return c == 'c' || c == 'C'; @@ -73,7 +69,7 @@ is_cpg(const std::string &s, const std::size_t i) { static void read_fasta_file(const std::string &filename, std::vector &names, std::vector &sequences) { - bgzf_file in(filename, "r"); + bamxx::bgzf_file in(filename, "r"); if (!in) throw std::runtime_error("error opening genome file: " + filename); names.clear(); @@ -96,7 +92,7 @@ read_fasta_file(const std::string &filename, std::vector &names, } [[nodiscard]] static std::string -get_basecall_model(const bam_header &hdr) { +get_basecall_model(const bamxx::bam_header &hdr) { kstring_t ks{}; ks = {0, 0, nullptr}; @@ -583,7 +579,7 @@ struct match_counter { }; static void -count_states_fwd(const bam_rec &aln, std::vector &counts, +count_states_fwd(const bamxx::bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and read positions without @@ -612,9 +608,12 @@ count_states_fwd(const bam_rec &aln, std::vector &counts, const decltype(qpos) end_qpos = qpos + n; for (; qpos < end_qpos; ++qpos) { const auto r_nuc = *ref_itr++; - mc.add_fwd(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, - seq_nt16_str[bam_seqi(seq, qpos)], - qpos == q_lim ? 'N' : seq_nt16_str[bam_seqi(seq, qpos + 1)]); + if (qpos + 1 < end_qpos) { + mc.add_fwd(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, + seq_nt16_str[bam_seqi(seq, qpos)], + qpos == q_lim ? 'N' + : seq_nt16_str[bam_seqi(seq, qpos + 1)]); + } counts[rpos++].add_count_fwd(*hydroxy_prob_itr, *methyl_prob_itr); ++methyl_prob_itr; ++hydroxy_prob_itr; @@ -638,7 +637,7 @@ count_states_fwd(const bam_rec &aln, std::vector &counts, } static void -count_states_rev(const bam_rec &aln, std::vector &counts, +count_states_rev(const bamxx::bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom, match_counter &mc) { /* Move through cigar, reference and (*backward*) through read @@ -672,8 +671,10 @@ count_states_rev(const bam_rec &aln, std::vector &counts, const auto r_nuc = *ref_itr++; const auto q_nuc = seq_nt16_str[bam_seqi(seq, q_idx)]; ++q_idx; - mc.add_rev(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, - q_idx == l_qseq ? 'N' : seq_nt16_str[bam_seqi(seq, q_idx)]); + if (end_qpos + 1 < qpos) + mc.add_rev(r_nuc, ref_itr == end_ref ? 'N' : *ref_itr, q_nuc, + q_idx == l_qseq ? 'N' + : seq_nt16_str[bam_seqi(seq, q_idx)]); --methyl_prob_itr; --hydroxy_prob_itr; counts[rpos++].add_count_rev(*hydroxy_prob_itr, *methyl_prob_itr); @@ -699,7 +700,7 @@ count_states_rev(const bam_rec &aln, std::vector &counts, [[nodiscard]] static std::tuple, std::set> get_tid_to_idx( - const bam_header &hdr, + const bamxx::bam_header &hdr, const std::unordered_map &name_to_idx) { std::set missing_tids; std::map tid_to_idx; @@ -718,7 +719,7 @@ get_tid_to_idx( } [[nodiscard]] static bool -consistent_targets(const bam_header &hdr, +consistent_targets(const bamxx::bam_header &hdr, const std::map &tid_to_idx, const std::vector &names, const std::vector &sizes) { @@ -741,7 +742,8 @@ consistent_targets(const bam_header &hdr, [[nodiscard]] static bool consistent_existing_targets( - const bam_header &hdr, const std::map &tid_to_idx, + const bamxx::bam_header &hdr, + const std::map &tid_to_idx, const std::vector &names, const std::vector &sizes) { const std::size_t n_targets = hdr.h->n_targets; @@ -805,10 +807,10 @@ struct read_processor { output_skipped_chromosome( const std::int32_t tid, const std::map &tid_to_idx, - const bam_header &hdr, + const bamxx::bam_header &hdr, const std::vector::const_iterator chroms_beg, const std::vector &chrom_sizes, std::vector &counts, - bgzf_file &out) const { + bamxx::bgzf_file &out) const { // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); @@ -831,7 +833,7 @@ struct read_processor { } void - write_output_all(const bam_header &hdr, bgzf_file &out, + write_output_all(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, const std::int32_t tid, const std::string &chrom, const std::vector &counts) const { static constexpr auto out_fmt = "%ld\t%c\t%s\t%.6g\t%d\t%.6g\t%.6g\n"; @@ -886,7 +888,7 @@ struct read_processor { } void - write_output_sym(const bam_header &hdr, bgzf_file &out, + write_output_sym(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, const std::int32_t tid, const std::string &chrom, const std::vector &counts) const { static constexpr auto out_fmt = "%ld\t+\tCpG\t%.6g\t%d\t%.6g\t%.6g\n"; @@ -952,8 +954,8 @@ struct read_processor { } void - write_output(const bam_header &hdr, bgzf_file &out, const std::int32_t tid, - const std::string &chrom, + write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, + const std::int32_t tid, const std::string &chrom, const std::vector &counts) const { if (symmetric) write_output_sym(hdr, out, tid, chrom, counts); @@ -989,7 +991,7 @@ struct read_processor { if (!hts) throw std::runtime_error("failed to open input file"); // load the input file's header - bam_header hdr(hts); + bamxx::bam_header hdr(hts); if (!hdr) throw std::runtime_error("failed to read header"); @@ -1011,7 +1013,7 @@ struct read_processor { // open the output file const std::string output_mode = compress_output ? "w" : "wu"; - bgzf_file out(outfile, output_mode); + bamxx::bgzf_file out(outfile, output_mode); if (!out) throw std::runtime_error("error opening output file: " + outfile); @@ -1044,7 +1046,7 @@ struct read_processor { // now iterate over the reads, switching chromosomes and writing // output as needed - bam_rec aln; + bamxx::bam_rec aln; std::int32_t prev_tid = -1; std::vector::const_iterator chrom_itr{}; @@ -1190,7 +1192,7 @@ main_nanocount(int argc, const char **argv) { throw std::runtime_error("thread count cannot be negative"); std::ostringstream cmd; - copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); + std::copy(argv, argv + argc, std::ostream_iterator(cmd, " ")); // file types from HTSlib use "-" for the filename to go to stdout if (outfile.empty())