From 9da5c2ecbac84814621cb43955a926c1d0363427 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 001/106] src/amrfinder/allelicmeth.cpp: changes to add static analysis --- src/amrfinder/allelicmeth.cpp | 139 ++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 64 deletions(-) diff --git a/src/amrfinder/allelicmeth.cpp b/src/amrfinder/allelicmeth.cpp index 4a65aca0..3818269d 100644 --- a/src/amrfinder/allelicmeth.cpp +++ b/src/amrfinder/allelicmeth.cpp @@ -16,51 +16,58 @@ * General Public License for more details. */ -#include -#include +#include +#include +#include #include #include -#include -#include #include -#include #include +#include #include #include +#include +#include #include #include -#include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "GenomicRegion.hpp" #include "MSite.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" #include "Epiread.hpp" -using std::string; -using std::vector; -using std::cout; using std::cerr; +using std::cout; using std::endl; -using std::unordered_map; -using std::unordered_set; using std::max; using std::min; using std::runtime_error; +using std::string; +using std::unordered_map; +using std::unordered_set; +using std::vector; static inline double log_sum_log(const double p, const double q) { - if (p == 0) {return q;} - else if (q == 0) {return p;} + if (p == 0) { + return q; + } + else if (q == 0) { + return p; + } return p > q ? p + log(1.0 + exp(q - p)) : q + log(1.0 + exp(p - q)); } static inline double lnchoose(const unsigned int n, unsigned int m) { - if (m == n || m == 0) return 0; - if (m * 2 > n) m = n - m; + if (m == n || m == 0) + return 0; + if (m * 2 > n) + m = n - m; using std::lgamma; return lgamma(n + 1.0) - lgamma(m + 1.0) - lgamma((n - m) + 1.0); } @@ -73,9 +80,9 @@ log_hyper_g(const size_t k, const size_t n1, const size_t n2, const size_t t) { static double fishers_exact(size_t a, size_t b, size_t c, size_t d) { - const size_t m = a + c; // sum of first column - const size_t n = b + d; // sum of second column - const size_t k = a + b; // sum of first row + const size_t m = a + c; // sum of first column + const size_t n = b + d; // sum of second column + const size_t k = a + b; // sum of first row // ADS: want more extreme than "observed" const double observed = log_hyper_g(a, m, n, k); double p = 0.0; @@ -92,48 +99,61 @@ state_pair_to_index(const string &s, const size_t idx) { assert(idx < s.length() - 1); const char a = s[idx]; if (a == 'C') { - const char b = s[idx+1]; - if (b == 'C') return 0; - if (b == 'T') return 1; + const char b = s[idx + 1]; + if (b == 'C') + return 0; + if (b == 'T') + return 1; return 4; } if (a == 'T') { - const char b = s[idx+1]; - if (b == 'C') return 2; - if (b == 'T') return 3; + const char b = s[idx + 1]; + if (b == 'C') + return 2; + if (b == 'T') + return 3; return 4; } return 4; } -template -struct PairStateCounter { +template struct PairStateCounter { T CC; T CT; T TC; T TT; - double score() const { - return (CC*TT > CT*TC) ? - fishers_exact(CC, CT, TC, TT) : fishers_exact(CT, CC, TT, TC); + double + score() const { + return (CC * TT > CT * TC) ? 
fishers_exact(CC, CT, TC, TT) + : fishers_exact(CT, CC, TT, TC); + } + double + total() const { + return CC + CT + TC + TT; } - double total() const {return CC + CT + TC + TT;} - string tostring() const { + string + tostring() const { return toa(CC) + '\t' + toa(CT) + '\t' + toa(TC) + '\t' + toa(TT); } - void increment(const size_t state) { - if (state == 0) ++CC; - else if (state == 1) ++CT; - else if (state == 2) ++TC; - else if (state == 3) ++TT; + void + increment(const size_t state) { + if (state == 0) + ++CC; + else if (state == 1) + ++CT; + else if (state == 2) + ++TC; + else if (state == 3) + ++TT; } }; - -template void -fit_states(const epiread &er, vector > &counts) { +template +void +fit_states(const epiread &er, vector> &counts) { for (size_t i = 0; i < er.length() - 1; ++i) { const size_t pos = er.pos + i; assert(pos < counts.size()); @@ -167,22 +187,18 @@ convert_coordinates(const string &chrom, vector &sites) { } } - template void add_cytosine(const string &chrom_name, const size_t start_cpg, - vector> &counts, - vector &cytosines) { + vector> &counts, vector &cytosines) { std::ostringstream s; - s << counts[start_cpg].score() << "\t" - << counts[start_cpg].total() << "\t" + s << counts[start_cpg].score() << "\t" << counts[start_cpg].total() << "\t" << counts[start_cpg].tostring(); const string name(s.str()); cytosines.push_back(MSite(chrom_name, start_cpg, '+', name, 0, 0)); } - -template +template void process_chrom(const string &chrom_name, const vector &epireads, vector &cytosines, vector> &counts) { @@ -192,7 +208,6 @@ process_chrom(const string &chrom_name, const vector &epireads, add_cytosine(chrom_name, i, counts, cytosines); } - static void update_chroms_seen(const string &chrom_name, unordered_set &chroms_seen) { @@ -202,7 +217,6 @@ update_chroms_seen(const string &chrom_name, chroms_seen.insert(chrom_name); } - static void verify_chroms_available(const string &chrom_name, unordered_map &chrom_lookup) { @@ -211,14 +225,12 @@ verify_chroms_available(const string &chrom_name, throw runtime_error("chrom not found: " + chrom_name); } - int main_allelicmeth(int argc, char *argv[]) { try { - static const string description = - "computes probability of allele-specific \ + static const string description = "computes probability of allele-specific \ methylation at each tuple of CpGs"; static const string fasta_suffix = "fa"; @@ -230,8 +242,8 @@ main_allelicmeth(int argc, char *argv[]) { OptionParser opt_parse(strip_path(argv[0]), description, ""); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", false, outfile); - opt_parse.add_opt("chrom", 'c', "genome sequence file/directory", - true, chroms_dir); + opt_parse.add_opt("chrom", 'c', "genome sequence file/directory", true, + chroms_dir); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -258,7 +270,7 @@ main_allelicmeth(int argc, char *argv[]) { vector chrom_names; vector chroms; read_fasta_file_short_names(chroms_dir, chrom_names, chroms); - for (auto &&i: chroms) + for (auto &&i : chroms) transform(begin(i), end(i), begin(i), [](const char c) { return std::toupper(c); }); @@ -271,7 +283,7 @@ main_allelicmeth(int argc, char *argv[]) { for (size_t i = 0; i < chrom_names.size(); ++i) { size_t cpg_count = 0; for (size_t j = 0; j < chroms[i].size() - 1; ++j) - cpg_count += (chroms[i][j] == 'C' && chroms[i][j+1] == 'G'); + cpg_count += (chroms[i][j] == 'C' && chroms[i][j + 1] == 'G'); 
chrom_sizes.insert(make_pair(chrom_names[i], cpg_count)); } @@ -283,7 +295,8 @@ main_allelicmeth(int argc, char *argv[]) { throw runtime_error("cannot open input file: " + epi_file); std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); unordered_set chroms_seen; @@ -304,10 +317,9 @@ main_allelicmeth(int argc, char *argv[]) { process_chrom(chrom, epireads, cytosines, counts); const size_t chrom_idx = chrom_lookup[chrom]; convert_coordinates(chroms[chrom_idx], cytosines); - for (size_t i = 0; i < cytosines.size()-1; ++i) { - out << cytosines[i].chrom << "\t" - << cytosines[i].pos << "\t+\tCpG\t" - << cytosines[i].context << endl; + for (size_t i = 0; i < cytosines.size() - 1; ++i) { + out << cytosines[i].chrom << "\t" << cytosines[i].pos + << "\t+\tCpG\t" << cytosines[i].context << endl; } } epireads.clear(); @@ -322,8 +334,7 @@ main_allelicmeth(int argc, char *argv[]) { const size_t chrom_idx = chrom_lookup[chrom]; convert_coordinates(chroms[chrom_idx], cytosines); for (size_t i = 0; i < cytosines.size() - 1; ++i) { - out << cytosines[i].chrom << "\t" - << cytosines[i].pos << "\t+\tCpG\t" + out << cytosines[i].chrom << "\t" << cytosines[i].pos << "\t+\tCpG\t" << cytosines[i].context << endl; } } From f40c7a62ff998fbdb394e8cf2aa483024f44c64d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 002/106] src/amrfinder/amrfinder.cpp: changes to add static analysis --- src/amrfinder/amrfinder.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index 49f4e6c3..fd0eaf3e 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -19,6 +19,7 @@ #include "EpireadStats.hpp" +#include "Epiread.hpp" #include "Interval.hpp" #include "Interval6.hpp" @@ -34,6 +35,7 @@ #include #include // #include // ADS: needs c++20 +#include #include #include #include @@ -300,8 +302,8 @@ get_n_cpgs(const std::vector &reads) { template [[nodiscard]] static inline auto -get_block_bounds(const T n_elements, const T n_chunks) - -> std::vector> { +get_block_bounds(const T n_elements, + const T n_chunks) -> std::vector> { const auto q = n_elements / n_chunks; const auto r = n_elements - q * n_chunks; std::vector> chunks(n_chunks); From 1e18f259d0c1827a35cad81ba0073d45ee17d9e4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 003/106] src/amrfinder/amrtester.cpp: changes to add static analysis --- src/amrfinder/amrtester.cpp | 76 +++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 41 deletions(-) diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index c3cb3a76..c4d3986a 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -20,29 +20,30 @@ * along with this program. If not, see . 
*/ -#include -#include +#include #include #include +#include +#include +#include #include -#include #include -#include +#include #include "Epiread.hpp" #include "EpireadStats.hpp" -using std::streampos; -using std::string; -using std::vector; -using std::cout; +using std::begin; using std::cerr; +using std::cout; +using std::end; using std::endl; -using std::unordered_map; using std::runtime_error; -using std::begin; -using std::end; +using std::streampos; +using std::string; +using std::unordered_map; +using std::vector; using epi_r = small_epiread; @@ -58,7 +59,6 @@ backup_to_start_of_current_record(std::ifstream &in) { std::to_string(assumed_max_valid_line_width)); } - static streampos find_first_epiread_ending_after_position(const string &query_chrom, const size_t query_pos, @@ -74,13 +74,13 @@ find_first_epiread_ending_after_position(const string &query_chrom, // This is just binary search on disk while (high_pos > low_pos + 1) { - const size_t mid_pos = (low_pos + high_pos)/2; + const size_t mid_pos = (low_pos + high_pos) / 2; in.seekg(mid_pos); backup_to_start_of_current_record(in); // we've hit the end of file without finding an epiread - if(low_pos == eof-2) + if (low_pos == eof - 2) return -1; if (!(in >> chrom >> start >> seq)) { @@ -95,10 +95,8 @@ find_first_epiread_ending_after_position(const string &query_chrom, return low_pos; } - static void -load_reads(const string &reads_file_name, - const GenomicRegion ®ion, +load_reads(const string &reads_file_name, const GenomicRegion ®ion, vector &the_reads) { // open and check the file @@ -117,34 +115,32 @@ load_reads(const string &reads_file_name, string chrom, seq; size_t start = 0ul; - while ((in >> chrom >> start >> seq) && - chrom == query_chrom && start < query_end) + while ((in >> chrom >> start >> seq) && chrom == query_chrom && + start < query_end) the_reads.emplace_back(start, seq); } - static void convert_coordinates(const vector &cpg_positions, GenomicRegion ®ion) { const size_t start_pos = lower_bound(cbegin(cpg_positions), cend(cpg_positions), - region.get_start()) - cbegin(cpg_positions); + region.get_start()) - + cbegin(cpg_positions); const size_t end_pos = - lower_bound(cbegin(cpg_positions), cend(cpg_positions), - region.get_end()) - cbegin(cpg_positions); + lower_bound(cbegin(cpg_positions), cend(cpg_positions), region.get_end()) - + cbegin(cpg_positions); region.set_start(start_pos); region.set_end(end_pos); } - inline static bool is_cpg(const string &s, const size_t idx) { return toupper(s[idx]) == 'C' && toupper(s[idx + 1]) == 'G'; } - static void collect_cpgs(const string &s, vector &cpgs) { const size_t lim = s.length() - 1; @@ -153,14 +149,11 @@ collect_cpgs(const string &s, vector &cpgs) { cpgs.push_back(i); } - static void -clip_reads(const size_t start_pos, const size_t end_pos, - vector &r) { +clip_reads(const size_t start_pos, const size_t end_pos, vector &r) { size_t j = 0; for (size_t i = 0; i < r.size(); ++i) { - if (start_pos < r[i].pos + r[i].seq.length() && - r[i].pos < end_pos) { + if (start_pos < r[i].pos + r[i].seq.length() && r[i].pos < end_pos) { if (r[i].pos < start_pos) { assert(start_pos - r[i].pos < r[i].seq.length()); r[i].seq = r[i].seq.substr(start_pos - r[i].pos); @@ -175,17 +168,15 @@ clip_reads(const size_t start_pos, const size_t end_pos, r.erase(begin(r) + j, end(r)); } - // give names to regions if they do not exist static void ensure_regions_are_named(vector ®ions) { auto region_name_idx = 0u; - for (auto region: regions) + for (auto region : regions) if 
(region.get_name().empty()) region.set_name("region" + std::to_string(++region_name_idx)); } - int main_amrtester(int argc, char *argv[]) { @@ -209,11 +200,11 @@ main_amrtester(int argc, char *argv[]) { OptionParser opt_parse(strip_path(argv[0]), "resolve epi-alleles", " "); opt_parse.add_opt("output", 'o', "output file", false, outfile); - opt_parse.add_opt("chrom", 'c', "reference genome fasta file", - true, chrom_file); + opt_parse.add_opt("chrom", 'c', "reference genome fasta file", true, + chrom_file); opt_parse.add_opt("itr", 'i', "max iterations", false, max_itr); - opt_parse.add_opt("nordc", 'r', "turn off read count correction", - false, correct_for_read_count); + opt_parse.add_opt("nordc", 'r', "turn off read count correction", false, + correct_for_read_count); opt_parse.add_opt("bic", 'b', "use BIC to compare models", false, use_bic); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.add_opt("progress", 'P', "show progress", false, show_progress); @@ -243,7 +234,7 @@ main_amrtester(int argc, char *argv[]) { /****************** END COMMAND LINE OPTIONS *****************/ const EpireadStats epistat{low_prob, high_prob, critical_value, - max_itr, use_bic, correct_for_read_count}; + max_itr, use_bic, correct_for_read_count}; if (!validate_epiread_file(reads_file_name)) throw runtime_error("invalid states file: " + reads_file_name); @@ -276,7 +267,8 @@ main_amrtester(int argc, char *argv[]) { vector cpg_positions; std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); bool is_significant = false; @@ -309,12 +301,14 @@ main_amrtester(int argc, char *argv[]) { clip_reads(conv_region.get_start(), conv_region.get_end(), reads); - const auto score = reads.empty() ? 1.0 : epistat.test_asm(reads, is_significant); + const auto score = + reads.empty() ? 
1.0 : epistat.test_asm(reads, is_significant); region.set_score(score); region.set_name(region.get_name() + ":" + toa(reads.size())); out << region << '\n'; } - if (show_progress) cerr << "\r100%" << endl; + if (show_progress) + cerr << "\r100%" << endl; } catch (const runtime_error &e) { cerr << e.what() << endl; From 97b8165700e33dee539e462a70b821eff7d8a572 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 004/106] src/analysis/autocorr.cpp: changes to add static analysis --- src/analysis/autocorr.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/analysis/autocorr.cpp b/src/analysis/autocorr.cpp index 9e44a62e..b0e298dd 100644 --- a/src/analysis/autocorr.cpp +++ b/src/analysis/autocorr.cpp @@ -43,8 +43,8 @@ The value of N is the number of observations contributing.)"; struct genomic_interval { std::string chrom; - std::uint32_t start_pos; - std::uint32_t end_pos; + std::uint32_t start_pos{}; + std::uint32_t end_pos{}; auto operator<(const genomic_interval &other) const -> bool { @@ -348,16 +348,16 @@ newline_terminated_file(const std::string &filename) -> bool { } auto -main_autocorr(int argc, char *argv[]) -> int { +main_autocorr(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) static constexpr auto default_max_dist = 4000; static constexpr auto default_min_reads = 10; static constexpr auto default_min_sites = 500; static constexpr auto header = "distance\tcorrelation\tN\tsdX\tsdY\tcovXY\n"; - const auto megabytes = [](const double x) { + const auto megabytes = [](const auto x) { constexpr auto mb = 1024 * 1024; - return std::to_string(x / mb) + "MB"; + return std::to_string(static_cast(x) / mb) + "MB"; }; std::string input_filename; @@ -376,7 +376,8 @@ main_autocorr(int argc, char *argv[]) -> int { bool include_header = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(argv[0], description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.set_prog_descr_raw(); opt_parse.add_opt("input", 'i', "input file name", true, input_filename); opt_parse.add_opt("output", 'o', "output file name", true, outfile); From f45e4543fa58f43e5246359b826e30fdec648af7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 005/106] src/analysis/bsrate.cpp: changes to add static analysis --- src/analysis/bsrate.cpp | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/analysis/bsrate.cpp b/src/analysis/bsrate.cpp index e5bbc6e6..a49d9adf 100644 --- a/src/analysis/bsrate.cpp +++ b/src/analysis/bsrate.cpp @@ -21,20 +21,34 @@ #include "bam_record_utils.hpp" #include "bsutils.hpp" #include "dnmt_error.hpp" -#include "smithlab_utils.hpp" +#include "smithlab_os.hpp" #include +#include + #include +#include +#include +#include +#include +#include #include +#include #include +#include +#include #include +#include #include #include #include #include +#include #include +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) + struct bsrate_summary { // converted_count_positive is the number of nucleotides covering a // cytosine in the reference that show a thymine in the read, and @@ -89,7 +103,8 @@ struct bsrate_summary { [[nodiscard]] double bisulfite_conversion_rate_positive() const { return static_cast(converted_count_positive) / - std::max(total_count_positive, static_cast(1)); + static_cast( + 
std::max(total_count_positive, static_cast(1))); } // bisulfite_conversion_rate_negative is equal to converted_count_negative @@ -99,7 +114,8 @@ struct bsrate_summary { [[nodiscard]] double bisulfite_conversion_rate_negative() const { return static_cast(converted_count_negative) / - std::max(total_count_negative, static_cast(1)); + static_cast( + std::max(total_count_negative, static_cast(1))); } // bisulfite_conversion_rate is equal to converted_count divided by @@ -109,7 +125,8 @@ struct bsrate_summary { [[nodiscard]] double bisulfite_conversion_rate() const { return static_cast(converted_count()) / - std::max(total_count(), static_cast(1)); + static_cast( + std::max(total_count(), static_cast(1))); } // error_count is equal to the sum of error_count_positive and @@ -132,7 +149,8 @@ struct bsrate_summary { [[nodiscard]] double error_rate() const { return static_cast(error_count()) / - std::max(valid_count(), static_cast(1)); + static_cast( + std::max(valid_count(), static_cast(1))); } void @@ -476,6 +494,7 @@ write_per_read_histogram(const std::vector> &tab, const std::size_t n_hist_bins, std::ostream &out) { const auto hist = format_histogram(tab, n_hist_bins); out << std::fixed; + // NOLINTNEXTLINE(cert-flp30-c,clang-analyzer-security*) for (auto i = 0.0; i < hist.size(); ++i) out << std::setprecision(3) << i / hist.size() << '\t' << std::setprecision(3) << (i + 1) / hist.size() << '\t' @@ -483,7 +502,7 @@ write_per_read_histogram(const std::vector> &tab, } int -main_bsrate(int argc, char *argv[]) { +main_bsrate(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { // assumed maximum length of a fragment static constexpr const std::size_t output_size = 10000; @@ -507,7 +526,7 @@ main_bsrate(int argc, char *argv[]) { std::string seq_to_use; // use only this chrom/sequence in the analysis /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "Program to compute the " "BS conversion rate from BS-seq " "reads mapped to a genome", @@ -662,3 +681,5 @@ main_bsrate(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) From fc994361b7fafdd0cd08a733079bb54af17cc2f0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 006/106] src/analysis/cpgbins.cpp: changes to add static analysis --- src/analysis/cpgbins.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/src/analysis/cpgbins.cpp b/src/analysis/cpgbins.cpp index 949204d0..6ab28693 100644 --- a/src/analysis/cpgbins.cpp +++ b/src/analysis/cpgbins.cpp @@ -17,29 +17,26 @@ #include "GenomicRegion.hpp" #include "LevelsCounter.hpp" -#include "MSite.hpp" #include "OptionParser.hpp" #include "bsutils.hpp" -#include "smithlab_utils.hpp" #include "xcounts_utils.hpp" -#include - #include -#include #include +#include #include #include #include #include +#include #include +#include #include #include +#include #include -using bamxx::bgzf_file; - -namespace fs = std::filesystem; +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) static std::string format_levels_counter(const LevelsCounter &lc) { @@ -68,6 +65,7 @@ format_levels_counter(const LevelsCounter &lc) { static std::unordered_map get_chrom_sizes(const std::string &chrom_sizes_file) { + // NOLINTBEGIN(performance-inefficient-string-concatenation) std::unordered_map chrom_sizes; 
std::ifstream in(chrom_sizes_file); @@ -93,10 +91,12 @@ get_chrom_sizes(const std::string &chrom_sizes_file) { chrom_sizes[chrom_name] = chrom_size; } return chrom_sizes; + // NOLINTEND(performance-inefficient-string-concatenation) } static std::vector get_chrom_names(const std::string &chrom_sizes_file) { + // NOLINTBEGIN(performance-inefficient-string-concatenation) std::ifstream in(chrom_sizes_file); if (!in) throw std::runtime_error("failed to open file: " + chrom_sizes_file); @@ -114,6 +114,7 @@ get_chrom_names(const std::string &chrom_sizes_file) { chrom_names.push_back(chrom_name); } return chrom_names; + // NOLINTEND(performance-inefficient-string-concatenation) } static void @@ -196,7 +197,7 @@ process_chrom(const bool report_more_info, const std::string &chrom_name, } int -main_cpgbins(int argc, char *argv[]) { +main_cpgbins(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static const std::string description = R"""( Compute average site methylation levels in each non-overlapping @@ -213,8 +214,6 @@ Columns (beyond the first 6) in the BED format output: (13) total number of observations from reads in the region )"""; - static const std::string default_name_prefix = "X"; - bool verbose = false; bool report_more_info = false; std::uint32_t n_threads = 1; @@ -224,8 +223,8 @@ Columns (beyond the first 6) in the BED format output: std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " "); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " "); opt_parse.set_show_defaults(); opt_parse.add_opt("output", 'o', "name of output file (default: stdout)", true, outfile); @@ -262,11 +261,11 @@ Columns (beyond the first 6) in the BED format output: const std::string xcounts_file = leftover_args.back(); /****************** END COMMAND LINE OPTIONS *****************/ - if (!fs::is_regular_file(chrom_sizes_file)) + if (!std::filesystem::is_regular_file(chrom_sizes_file)) throw std::runtime_error("chromosome sizes file not a regular file: " + chrom_sizes_file); - if (!fs::is_regular_file(xcounts_file)) + if (!std::filesystem::is_regular_file(xcounts_file)) throw std::runtime_error("xsym file not a regular file: " + xcounts_file); const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xcounts_file); @@ -296,3 +295,5 @@ Columns (beyond the first 6) in the BED format output: } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From d5024b9fb4e4378f60c18709d5c79228e87c6a0c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 007/106] src/analysis/hmr-rep.cpp: changes to add static analysis --- src/analysis/hmr-rep.cpp | 470 +++++++++++++++++++-------------------- 1 file changed, 229 insertions(+), 241 deletions(-) diff --git a/src/analysis/hmr-rep.cpp b/src/analysis/hmr-rep.cpp index fb34bc4c..ec40f9ca 100644 --- a/src/analysis/hmr-rep.cpp +++ b/src/analysis/hmr-rep.cpp @@ -13,41 +13,32 @@ * GNU General Public License for more details. 
*/ -#include -#include -#include -#include -#include -#include -#include // for [u]int[0-9]+_t -#include - -#include - -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "GenomicRegion.hpp" +#include "MSite.hpp" #include "OptionParser.hpp" - #include "TwoStateHMM.hpp" -#include "MSite.hpp" -using std::string; -using std::vector; -using std::cout; -using std::endl; -using std::cerr; -using std::numeric_limits; -using std::max; -using std::min; -using std::pair; -using std::make_pair; -using std::runtime_error; -using std::to_string; -using std::begin; -using std::end; - -using bamxx::bgzf_file; +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) static GenomicRegion as_gen_rgn(const MSite &s) { @@ -55,26 +46,31 @@ as_gen_rgn(const MSite &s) { } static double -get_stepup_cutoff(vector scores, const double cutoff) { - if (cutoff <= 0) return numeric_limits::max(); - else if (cutoff > 1) return numeric_limits::min(); +get_stepup_cutoff(std::vector scores, const double cutoff) { + if (cutoff <= 0) + return std::numeric_limits::max(); + else if (cutoff > 1) + return std::numeric_limits::min(); const size_t n = scores.size(); - std::sort(begin(scores), end(scores)); + std::sort(std::begin(scores), std::end(scores)); size_t i = 1; - while (i < n && scores[i-1] < (cutoff*i)/n) ++i; + while (i < n && scores[i - 1] < (cutoff * i) / n) + ++i; return scores[i - 1]; } -template T -pair_sum(const std::pair &t) {return t.first + t.second;} +template +[[nodiscard]] T +pair_sum(const std::pair &t) { + return t.first + t.second; +} static void -get_domain_scores_rep(const vector &state_ids, - const vector > > &meth, - const vector &reset_points, - vector &scores) { - +get_domain_scores_rep( + const std::vector &state_ids, + const std::vector>> &meth, + const std::vector &reset_points, std::vector &scores) { const size_t n_reps = meth.size(); size_t reset_idx = 1; bool in_domain = false; @@ -92,7 +88,7 @@ get_domain_scores_rep(const vector &state_ids, in_domain = true; for (size_t r = 0; r < n_reps; ++r) if (pair_sum(meth[r][i]) >= 1) - score += 1.0 - meth[r][i].first/pair_sum(meth[r][i]); + score += 1.0 - meth[r][i].first / pair_sum(meth[r][i]); } else if (in_domain) { in_domain = false; @@ -102,13 +98,11 @@ get_domain_scores_rep(const vector &state_ids, } } - static void -build_domains(const vector &cpgs, - const vector &reset_points, - const vector &state_ids, - vector &domains) { - +build_domains(const std::vector &cpgs, + const std::vector &reset_points, + const std::vector &state_ids, + std::vector &domains) { size_t n_cpgs = 0, n_domains = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; for (size_t i = 0; i < state_ids.size(); ++i) { @@ -125,7 +119,7 @@ build_domains(const vector &cpgs, if (!in_domain) { in_domain = true; domains.push_back(as_gen_rgn(cpgs[i])); - domains.back().set_name("HYPO" + to_string(n_domains++)); + domains.back().set_name("HYPO" + std::to_string(n_domains++)); } ++n_cpgs; } @@ -139,17 +133,14 @@ build_domains(const vector &cpgs, } } - template static void -separate_regions(const bool VERBOSE, - const size_t desert_size, - vector &cpgs, - vector > &meth, - vector > &reads, - vector &reset_points) { +separate_regions(const bool VERBOSE, const size_t desert_size, + std::vector &cpgs, std::vector> &meth, + std::vector> &reads, + std::vector &reset_points) { if 
(VERBOSE) - cerr << "[separating by cpg desert]" << endl; + std::cerr << "[separating by cpg desert]\n"; // eliminate the zero-read cpg sites if no coverage in any replicates const size_t n_reps = meth.size(); @@ -170,17 +161,18 @@ separate_regions(const bool VERBOSE, } } - cpgs.erase(begin(cpgs) + j, end(cpgs)); + cpgs.erase(std::begin(cpgs) + j, std::end(cpgs)); for (size_t r = 0; r < n_reps; ++r) { - meth[r].erase(begin(meth[r]) + j, end(meth[r])); - reads[r].erase(begin(reads[r]) + j, end(reads[r])); + meth[r].erase(std::begin(meth[r]) + j, std::end(meth[r])); + reads[r].erase(std::begin(reads[r]) + j, std::end(reads[r])); } // segregate cpgs size_t prev_cpg = 0; for (size_t i = 0; i < cpgs.size(); ++i) { - const size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) ? - cpgs[i].pos - prev_cpg : numeric_limits::max(); + const size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom) + ? cpgs[i].pos - prev_cpg + : std::numeric_limits::max(); if (dist > desert_size) reset_points.push_back(i); prev_cpg = cpgs[i].pos; @@ -188,163 +180,151 @@ separate_regions(const bool VERBOSE, reset_points.push_back(cpgs.size()); if (VERBOSE) - cerr << "[cpgs retained: " << cpgs.size() << "]" << endl - << "[deserts removed: " << reset_points.size() - 2 << "]" << endl; + std::cerr << "[cpgs retained: " << cpgs.size() << "]\n" + << "[deserts removed: " << reset_points.size() - 2 << "]\n"; } static void shuffle_cpgs_rep(const size_t rng_seed, const TwoStateHMM &hmm, - vector > > meth, - vector reset_points, + std::vector>> + meth, // cppcheck-suppress passedByValue + const std::vector &reset_points, const double f_to_b_trans, const double b_to_f_trans, - const vector &fg_alpha, const vector &fg_beta, - const vector &bg_alpha, const vector &bg_beta, - vector &domain_scores) { - + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta, + std::vector &domain_scores) { auto eng = std::default_random_engine(rng_seed); - for (size_t r = 0 ; r < meth.size(); ++r) - std::shuffle(begin(meth[r]), end(meth[r]), eng); + for (size_t r = 0; r < meth.size(); ++r) + std::shuffle(std::begin(meth[r]), std::end(meth[r]), eng); - vector state_ids; - vector scores; + std::vector state_ids; + std::vector scores; hmm.PosteriorDecoding(meth, reset_points, f_to_b_trans, b_to_f_trans, - fg_alpha, fg_beta, bg_alpha, bg_beta, - state_ids, scores); + fg_alpha, fg_beta, bg_alpha, bg_beta, state_ids, + scores); get_domain_scores_rep(state_ids, meth, reset_points, domain_scores); - sort(begin(domain_scores), end(domain_scores)); + std::sort(std::begin(domain_scores), std::end(domain_scores)); } - static void -assign_p_values(const vector &random_scores, - const vector &observed_scores, - vector &p_values) { - const double n_randoms = max(random_scores.size(), 1ul); +assign_p_values(const std::vector &random_scores, + const std::vector &observed_scores, + std::vector &p_values) { + const double n_randoms = std::max(random_scores.size(), 1ul); for (size_t i = 0; i < observed_scores.size(); ++i) - p_values.push_back((end(random_scores) - - upper_bound(begin(random_scores), - end(random_scores), - observed_scores[i]))/n_randoms); + p_values.push_back( + (std::end(random_scores) - upper_bound(std::begin(random_scores), + std::end(random_scores), + observed_scores[i])) / + n_randoms); } - static void -read_params_file(const bool VERBOSE, - const string ¶ms_file, - double &fg_alpha, double &fg_beta, - double &bg_alpha, double &bg_beta, - double &f_to_b_trans, double 
&b_to_f_trans, +read_params_file(const bool VERBOSE, const std::string ¶ms_file, + double &fg_alpha, double &fg_beta, double &bg_alpha, + double &bg_beta, double &f_to_b_trans, double &b_to_f_trans, double &fdr_cutoff) { - - string jnk; + std::string jnk; std::ifstream in(params_file); if (!in) - throw runtime_error("failed to parse params file: " + params_file); + throw std::runtime_error("failed to parse params file: " + params_file); - in >> jnk >> fg_alpha - >> jnk >> fg_beta - >> jnk >> bg_alpha - >> jnk >> bg_beta - >> jnk >> f_to_b_trans - >> jnk >> b_to_f_trans - >> jnk >> fdr_cutoff; + in >> jnk >> fg_alpha >> jnk >> fg_beta >> jnk >> bg_alpha >> jnk >> + bg_beta >> jnk >> f_to_b_trans >> jnk >> b_to_f_trans >> jnk >> fdr_cutoff; if (VERBOSE) - cerr << "read in params from " << params_file << endl - << "FG_ALPHA\t" << fg_alpha << endl - << "FG_BETA\t" << fg_beta << endl - << "BG_ALPHA\t" << bg_alpha << endl - << "BG_BETA\t" << bg_beta << endl - << "F_B\t" << f_to_b_trans << endl - << "B_F\t" << b_to_f_trans << endl - << "FDR_CUTOFF\t" << fdr_cutoff << endl; + std::cerr << "read in params from " << params_file << '\n' + << "FG_ALPHA\t" << fg_alpha << '\n' + << "FG_BETA\t" << fg_beta << '\n' + << "BG_ALPHA\t" << bg_alpha << '\n' + << "BG_BETA\t" << bg_beta << '\n' + << "F_B\t" << f_to_b_trans << '\n' + << "B_F\t" << b_to_f_trans << '\n' + << "FDR_CUTOFF\t" << fdr_cutoff << '\n'; } - static void -write_params_file(const string &outfile, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - const double f_to_b_trans, - const double b_to_f_trans, - const double fdr_cutoff) { - +write_params_file(const std::string &outfile, + const std::vector &fg_alpha, + const std::vector &fg_beta, + const std::vector &bg_alpha, + const std::vector &bg_beta, const double f_to_b_trans, + const double b_to_f_trans, const double fdr_cutoff) { std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out.precision(30); - for (size_t r =0; r < fg_alpha.size(); ++r) - out << "FG_ALPHA_" << r+1 << '\t' << fg_alpha[r] << '\t' - << "FG_BETA_" << r+1 << '\t' << fg_beta[r] << '\t' - << "BG_ALPHA_" << r+1 << '\t' << bg_alpha[r] << '\t' - << "BG_BETA_" << r+1 << '\t' << bg_beta[r] << endl; - - out << "F_B\t" << f_to_b_trans << endl - << "B_F\t" << b_to_f_trans << endl - << "FDR_CUTOFF\t" << fdr_cutoff << endl - ; + for (size_t r = 0; r < fg_alpha.size(); ++r) + out << "FG_ALPHA_" << r + 1 << '\t' << fg_alpha[r] << '\t' << "FG_BETA_" + << r + 1 << '\t' << fg_beta[r] << '\t' << "BG_ALPHA_" << r + 1 << '\t' + << bg_alpha[r] << '\t' << "BG_BETA_" << r + 1 << '\t' << bg_beta[r] + << '\n'; + + out << "F_B\t" << f_to_b_trans << '\n' + << "B_F\t" << b_to_f_trans << '\n' + << "FDR_CUTOFF\t" << fdr_cutoff << '\n'; } static void -load_cpgs(const string &cpgs_file, vector &cpgs, - vector > &meth, - vector &reads) { - - bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("failed opening file: " + cpgs_file); +load_cpgs(const std::string &cpgs_file, std::vector &cpgs, + std::vector> &meth, + std::vector &reads) { + bamxx::bgzf_file in(cpgs_file, "r"); + if (!in) + throw std::runtime_error("failed opening file: " + cpgs_file); MSite the_site; while (read_site(in, the_site)) { cpgs.push_back(the_site); reads.push_back(the_site.n_reads); - meth.push_back(make_pair(the_site.n_meth(), the_site.n_unmeth())); + meth.push_back(std::make_pair(the_site.n_meth(), the_site.n_unmeth())); } } static void -check_consistent_sites(const string &expected_filename, - const vector &expected, - const string &observed_filename, - const vector &observed) { +check_consistent_sites(const std::string &expected_filename, + const std::vector &expected, + const std::string &observed_filename, + const std::vector &observed) { if (expected.size() != observed.size()) { std::ostringstream err_msg; - err_msg << "inconsistent number of sites" << endl + err_msg << "inconsistent number of sites\n" << "file=" << expected_filename << "," - << "sites=" << expected.size() << endl + << "sites=" << expected.size() << '\n' << "file=" << observed_filename << "," - << "sites=" << observed.size() << endl; - throw runtime_error(err_msg.str()); + << "sites=" << observed.size() << '\n'; + throw std::runtime_error(err_msg.str()); } } -template double +template +[[nodiscard]] double get_mean(InputIterator first, InputIterator last) { - return accumulate(first, last, 0.0)/std::distance(first, last); + return std::accumulate(first, last, 0.0) / std::distance(first, last); } -static vector -split_comma(const string &orig) { - string tmp(orig); - replace(begin(tmp), end(tmp), ',', ' '); +static std::vector +split_comma(const std::string &orig) { + std::string tmp(orig); + std::replace(std::begin(tmp), std::end(tmp), ',', ' '); std::istringstream iss(tmp); - vector parts; + std::vector parts; while (iss >> tmp) parts.push_back(tmp); return parts; } int -main_hmr_rep(int argc, char *argv[]) { - +main_hmr_rep(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - string outfile; - string hypo_post_outfile; - string meth_post_outfile; + std::string outfile; + std::string hypo_post_outfile; + std::string meth_post_outfile; size_t desert_size = 1000; size_t max_iterations = 10; @@ -353,67 +333,72 @@ main_hmr_rep(int argc, char *argv[]) { // run mode flags bool VERBOSE = false; - const double tolerance = 1e-10; // corrections for small values + const double tolerance = 1e-10; // corrections for small values - string 
params_in_files; - string params_out_file; + std::string params_in_files; + std::string params_out_file; - const string description = - "Identify HMRs in a set of replicate methylomes. Methylation must be \ - provided in the methcounts format (chrom, position, strand, context, \ - methylation, reads). See the methcounts documentation for details \ - for details. This program assumes only data at CpG sites and that \ - strands are collapsed so only the positive site appears in the file."; + const std::string description = + R"(Identify HMRs in a set of replicate methylomes. Methylation must be + provided in the methcounts format (chrom, position, strand, context, + methylation, reads). See the methcounts documentation for details + for details. This program assumes only data at CpG sites and that + strands are collapsed so only the positive site appears in the file.)"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " ..."); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", - false, outfile); + opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, + outfile); opt_parse.add_opt("desert", 'd', "max dist btwn cpgs with reads in HMR", false, desert_size); opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - opt_parse.add_opt("post-hypo", '\0', "output file for single-CpG posteiror " + opt_parse.add_opt("post-hypo", '\0', + "output file for single-CpG posteiror " "hypomethylation probability (default: NULL)", false, hypo_post_outfile); - opt_parse.add_opt("post-meth", '\0', "output file for single-CpG posteiror " + opt_parse.add_opt("post-meth", '\0', + "output file for single-CpG posteiror " "methylation probability (default: NULL)", false, meth_post_outfile); - opt_parse.add_opt("params-in", 'P', "HMM parameter files for " + opt_parse.add_opt("params-in", 'P', + "HMM parameter files for " "individual methylomes (separated with comma)", false, params_in_files); opt_parse.add_opt("params-out", 'p', "write HMM parameters to this file", false, params_out_file); opt_parse.add_opt("seed", 's', "specify random seed", false, rng_seed); opt_parse.set_show_defaults(); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const vector cpgs_files(leftover_args); + const std::vector cpgs_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ - for (auto &filename : cpgs_files) + for (const auto &filename : cpgs_files) + // cppcheck-suppress useStlAlgorithm if (!is_msite_file(filename)) - throw runtime_error("malformed counts file: " + filename); + throw 
std::runtime_error("malformed counts file: " + filename); - vector params_in_file; + std::vector params_in_file; if (!params_in_files.empty()) { params_in_file = split_comma(params_in_files); assert(cpgs_files.size() == params_in_file.size()); @@ -421,57 +406,60 @@ main_hmr_rep(int argc, char *argv[]) { const size_t n_reps = cpgs_files.size(); - vector cpgs; - vector > > meth(n_reps); - vector > reads(n_reps); + std::vector cpgs; + std::vector>> meth(n_reps); + std::vector> reads(n_reps); if (VERBOSE) - cerr << "[reading methylation levels]" << endl; + std::cerr << "[reading methylation levels]\n"; for (size_t i = 0; i < n_reps; ++i) { if (VERBOSE) - cerr << "[filename=" << cpgs_files[i] << "]" << endl; - vector curr_rep; + std::cerr << "[filename=" << cpgs_files[i] << "]\n"; + std::vector curr_rep; load_cpgs(cpgs_files[i], curr_rep, meth[i], reads[i]); if (VERBOSE) - cerr << "[total_cpgs=" << curr_rep.size() << "]" << endl - << "[mean_coverage=" - << get_mean(begin(reads[i]), end(reads[i])) << "]" << endl; + std::cerr << "[total_cpgs=" << curr_rep.size() << "]\n" + << "[mean_coverage=" + << get_mean(std::begin(reads[i]), std::end(reads[i])) + << "]\n"; if (i > 0) check_consistent_sites(cpgs_files[0], cpgs, cpgs_files[i], curr_rep); - else swap(cpgs, curr_rep); + else + swap(cpgs, curr_rep); } // separate the regions by chrom and by desert, and eliminate // those isolated CpGs - vector reset_points; + std::vector reset_points; separate_regions(VERBOSE, desert_size, cpgs, meth, reads, reset_points); /****************** initalize params *****************/ const TwoStateHMM hmm(tolerance, max_iterations, VERBOSE); - vector fg_alpha(n_reps), fg_beta(n_reps); - vector bg_alpha(n_reps), bg_beta(n_reps); + std::vector fg_alpha(n_reps), fg_beta(n_reps); + std::vector bg_alpha(n_reps), bg_beta(n_reps); double fdr_cutoff = std::numeric_limits::max(); double f_to_b_trans = 0.25; double b_to_f_trans = 0.25; - if (!params_in_file.empty()) { // read parameters files - double fdr_cutoff_rep; // ignore this cutoff + if (!params_in_file.empty()) { // read parameters files + double fdr_cutoff_rep{}; // ignore this cutoff for (size_t i = 0; i < n_reps; ++i) - read_params_file(VERBOSE, params_in_file[i], fg_alpha[i], - fg_beta[i], bg_alpha[i], bg_beta[i], - f_to_b_trans, b_to_f_trans, fdr_cutoff_rep); + read_params_file(VERBOSE, params_in_file[i], fg_alpha[i], fg_beta[i], + bg_alpha[i], bg_beta[i], f_to_b_trans, b_to_f_trans, + fdr_cutoff_rep); max_iterations = 0; } else { for (size_t i = 0; i < n_reps; ++i) { // JQU: there are many 0s in reads[r], but the parameter start // points don't need to be perfect - const double mean_reads = get_mean(begin(reads[i]), end(reads[i])); - fg_alpha[i] = 0.33*mean_reads; - fg_beta[i] = 0.67*mean_reads; - bg_alpha[i] = 0.67*mean_reads; - bg_beta[i] = 0.33*mean_reads; + const double mean_reads = + get_mean(std::begin(reads[i]), std::end(reads[i])); + fg_alpha[i] = 0.33 * mean_reads; + fg_beta[i] = 0.67 * mean_reads; + bg_alpha[i] = 0.67 * mean_reads; + bg_beta[i] = 0.33 * mean_reads; } } @@ -479,67 +467,68 @@ main_hmr_rep(int argc, char *argv[]) { hmm.BaumWelchTraining(meth, reset_points, f_to_b_trans, b_to_f_trans, fg_alpha, fg_beta, bg_alpha, bg_beta); - vector state_ids; - vector posteriors; + std::vector state_ids; + std::vector posteriors; hmm.PosteriorDecoding(meth, reset_points, f_to_b_trans, b_to_f_trans, - fg_alpha, fg_beta, bg_alpha, bg_beta, - state_ids, posteriors); + fg_alpha, fg_beta, bg_alpha, bg_beta, state_ids, + posteriors); - vector domain_scores; + 
std::vector domain_scores; get_domain_scores_rep(state_ids, meth, reset_points, domain_scores); - vector random_scores; - shuffle_cpgs_rep(rng_seed, hmm, meth, reset_points, - f_to_b_trans, b_to_f_trans, - fg_alpha, fg_beta, bg_alpha, bg_beta, random_scores); + std::vector random_scores; + shuffle_cpgs_rep(rng_seed, hmm, meth, reset_points, f_to_b_trans, + b_to_f_trans, fg_alpha, fg_beta, bg_alpha, bg_beta, + random_scores); - vector p_values; + std::vector p_values; assign_p_values(random_scores, domain_scores, p_values); - if (fdr_cutoff == numeric_limits::max()) + if (fdr_cutoff == std::numeric_limits::max()) fdr_cutoff = get_stepup_cutoff(p_values, 0.01); - vector domains; + std::vector domains; build_domains(cpgs, reset_points, state_ids, domains); std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); size_t good_hmr_count = 0; for (size_t i = 0; i < domains.size(); ++i) if (p_values[i] < fdr_cutoff) { - domains[i].set_name("HYPO" + to_string(good_hmr_count++)); + domains[i].set_name("HYPO" + std::to_string(good_hmr_count++)); out << domains[i] << '\n'; } // write all the hmm parameters if requested if (!params_out_file.empty()) - write_params_file(params_out_file, fg_alpha, fg_beta, - bg_alpha, bg_beta, f_to_b_trans, b_to_f_trans, - fdr_cutoff); + write_params_file(params_out_file, fg_alpha, fg_beta, bg_alpha, bg_beta, + f_to_b_trans, b_to_f_trans, fdr_cutoff); if (!hypo_post_outfile.empty()) { if (VERBOSE) - cerr << "[writing=" << hypo_post_outfile << "]" << endl; - std::ofstream out(hypo_post_outfile); + std::cerr << "[writing=" << hypo_post_outfile << "]\n"; + std::ofstream out_post(hypo_post_outfile); for (size_t i = 0; i < cpgs.size(); ++i) { size_t m_reads = 0, u_reads = 0; - for (size_t j = 0; j < n_reps; ++j){ + for (size_t j = 0; j < n_reps; ++j) { m_reads += meth[j][i].first; u_reads += meth[j][i].second; } GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name("CpG:" + to_string(m_reads) + ":" + to_string(u_reads)); + cpg.set_name("CpG:" + std::to_string(m_reads) + ":" + + std::to_string(u_reads)); cpg.set_score(posteriors[i]); - out << cpg << '\n'; + out_post << cpg << '\n'; } } if (!meth_post_outfile.empty()) { - std::ofstream out(meth_post_outfile); + std::ofstream out_post(meth_post_outfile); if (VERBOSE) - cerr << "[writing=" << meth_post_outfile << "]" << endl; + std::cerr << "[writing=" << meth_post_outfile << "]\n"; for (size_t i = 0; i < cpgs.size(); ++i) { size_t m_reads = 0, u_reads = 0; for (size_t j = 0; j < n_reps; ++j) { @@ -547,19 +536,18 @@ main_hmr_rep(int argc, char *argv[]) { u_reads += meth[j][i].second; } GenomicRegion cpg(as_gen_rgn(cpgs[i])); - cpg.set_name("CpG:" + to_string(m_reads) + ":" + to_string(u_reads)); + cpg.set_name("CpG:" + std::to_string(m_reads) + ":" + + std::to_string(u_reads)); cpg.set_score(1.0 - posteriors[i]); - out << cpg << '\n'; + out_post << cpg << '\n'; } } } - catch (const runtime_error &e) { - cerr << "ERROR:\t" << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From be3048814b3865a3026818df5e10994fc6838ecd Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 008/106] src/analysis/hmr.cpp: changes to add 
static analysis --- src/analysis/hmr.cpp | 94 +++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/src/analysis/hmr.cpp b/src/analysis/hmr.cpp index 9fac1574..a9e4260f 100644 --- a/src/analysis/hmr.cpp +++ b/src/analysis/hmr.cpp @@ -14,29 +14,37 @@ * GNU General Public License for more details. */ +#include "GenomicRegion.hpp" #include "MSite.hpp" +#include "OptionParser.hpp" #include "TwoStateHMM.hpp" #include "counts_header.hpp" -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - #include -#include -#include // for [u]int[0-9]+_t +#include +#include +#include +#include #include +#include #include +#include +#include +#include #include #include +#include #include #include #include +#include +#include + +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) struct hmr_summary { - hmr_summary(const std::vector &hmrs) { + explicit hmr_summary(const std::vector &hmrs) { hmr_count = hmrs.size(); hmr_total_size = std::accumulate(std::cbegin(hmrs), std::cend(hmrs), 0ul, @@ -96,7 +104,6 @@ get_domain_scores(const std::vector &state_ids, const std::vector> &meth, const std::vector &reset_points, std::vector &scores) { - std::size_t reset_idx = 1; bool in_domain = false; double score = 0; @@ -129,7 +136,6 @@ build_domains(const std::vector &cpgs, const std::vector &reset_points, const std::vector &state_ids, std::vector &domains) { - std::size_t n_cpgs = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; for (std::size_t i = 0; i < state_ids.size(); ++i) { @@ -230,11 +236,10 @@ make_partial_meth(const std::vector &reads, static void shuffle_cpgs(const std::size_t rng_seed, const TwoStateHMM &hmm, std::vector> meth, - std::vector reset_points, const double p_fb, + const std::vector &reset_points, const double p_fb, const double p_bf, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, std::vector &domain_scores) { - auto eng = std::default_random_engine(rng_seed); std::shuffle(std::begin(meth), std::end(meth), eng); @@ -243,7 +248,7 @@ shuffle_cpgs(const std::size_t rng_seed, const TwoStateHMM &hmm, hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta, bg_alpha, bg_beta, state_ids, scores); get_domain_scores(state_ids, meth, reset_points, domain_scores); - sort(std::begin(domain_scores), std::end(domain_scores)); + std::sort(std::begin(domain_scores), std::end(domain_scores)); } static void @@ -285,7 +290,6 @@ write_params_file(const std::string &outfile, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, const double p_fb, const double p_bf, const double domain_score_cutoff) { - std::ofstream of; if (!outfile.empty()) of.open(outfile.c_str()); @@ -317,7 +321,6 @@ static void load_cpgs(const std::string &cpgs_file, std::vector &cpgs, std::vector> &meth, std::vector &reads) { - bamxx::bgzf_file in(cpgs_file, "r"); if (!in) throw std::runtime_error("failed opening file: " + cpgs_file); @@ -348,7 +351,6 @@ get_mean(InputIterator first, InputIterator last) -> T { template static void check_sorted_within_chroms(T first, const T last) { - // empty, or a single element if (first == last || first + 1 == last) return; @@ -378,10 +380,8 @@ check_sorted_within_chroms(T first, const T last) { } int -main_hmr(int argc, char *argv[]) { - +main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - constexpr double min_coverage = 1.0; std::string 
outfile; @@ -405,17 +405,17 @@ main_hmr(int argc, char *argv[]) { std::string params_in_file; std::string params_out_file; - const std::string description = - "Identify HMRs in methylomes. Methylation must be provided in the \ - methcounts format (chrom, position, strand, context, \ - methylation, reads). See the methcounts documentation for \ - details. This program assumes only data at CpG sites and that \ - strands are collapsed so only the positive site appears in the \ - file."; + constexpr auto description = + R"(Identify HMRs in methylomes. Methylation must be provided + in the methcounts format (chrom, position, strand, context, + methylation, reads). See the methcounts documentation for + details. This program assumes only data at CpG sites and that + strands are collapsed so only the positive site appears in the + file.)"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, outfile); opt_parse.add_opt("desert", 'd', "max dist btwn covered cpgs in HMR", false, @@ -568,23 +568,25 @@ main_hmr(int argc, char *argv[]) { if (!domain_scores.empty()) build_domains(cpgs, reset_points, state_ids, domains); - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); - - std::size_t good_hmr_count = 0; - for (auto i = 0u; i < size(domains); ++i) - if (p_values[i] < domain_score_cutoff) { - domains[good_hmr_count] = domains[i]; - domains[good_hmr_count].set_name("HYPO" + - std::to_string(good_hmr_count)); - ++good_hmr_count; - } - domains.resize(good_hmr_count); - - for (const auto &d : domains) - out << d << '\n'; + { + std::ofstream of; + if (!outfile.empty()) + of.open(outfile); + std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); + + std::size_t good_hmr_count = 0; + for (auto i = 0u; i < size(domains); ++i) + if (p_values[i] < domain_score_cutoff) { + domains[good_hmr_count] = domains[i]; + domains[good_hmr_count].set_name("HYPO" + + std::to_string(good_hmr_count)); + ++good_hmr_count; + } + domains.resize(good_hmr_count); + + for (const auto &d : domains) + out << d << '\n'; + } if (!hypo_post_outfile.empty()) { if (verbose) @@ -622,3 +624,5 @@ main_hmr(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) From 13389a8a8f5dd8b7612b55ff99a27d65c67a4b4f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 009/106] src/analysis/hypermr.cpp: changes to add static analysis --- src/analysis/hypermr.cpp | 122 +++++++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/src/analysis/hypermr.cpp b/src/analysis/hypermr.cpp index e5855bcc..4f9d1698 100644 --- a/src/analysis/hypermr.cpp +++ b/src/analysis/hypermr.cpp @@ -20,19 +20,26 @@ * 02110-1301 USA */ +#include "BetaBin.hpp" +#include "GenomicRegion.hpp" +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "ThreeStateHMM.hpp" + #include -#include +#include +#include +#include #include +#include +#include +#include #include #include - -#include "GenomicRegion.hpp" -#include "MSite.hpp" -#include "OptionParser.hpp" -#include "ThreeStateHMM.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include +#include +#include using std::begin; using std::cerr; @@ -53,6 +60,8 @@ using std::vector; using std::ofstream; using std::ostream_iterator; +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) + static GenomicRegion as_gen_rgn(const MSite &s) { return GenomicRegion(s.chrom, s.pos, s.pos + 1); @@ -62,18 +71,21 @@ static void load_cpgs(const string &cpgs_file, vector &cpgs, vector> &meth) { bamxx::bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("failed opening file: " + cpgs_file); + if (!in) + throw runtime_error("failed opening file: " + cpgs_file); MSite the_site; string line; - while (getline(in, line)) cpgs.push_back(MSite(line)); + while (getline(in, line)) + cpgs.push_back(MSite(line)); meth.resize(cpgs.size()); for (size_t i = 0; i < cpgs.size(); ++i) meth[i] = make_pair(cpgs[i].n_meth(), cpgs[i].n_unmeth()); } -template static void +template +static void separate_regions(const size_t desert_size, vector &cpgs, vector &meth, vector &reset_points, size_t &total_bases, size_t &bases_in_deserts) { @@ -97,16 +109,17 @@ separate_regions(const size_t desert_size, vector &cpgs, vector &meth, : numeric_limits::max(); if (dist > desert_size) { reset_points.push_back(i); - if (dist < numeric_limits::max()) bases_in_deserts += dist; + if (dist < numeric_limits::max()) + bases_in_deserts += dist; } - if (dist < numeric_limits::max()) total_bases += dist; + if (dist < numeric_limits::max()) + total_bases += dist; prev_pos = cpgs[i].pos; } reset_points.push_back(cpgs.size()); } - // ADS (!!!) 
this function seems to not be working at all right now static void read_params_file(const string ¶ms_file, @@ -115,7 +128,8 @@ read_params_file(const string ¶ms_file, // betabin &HYPO_emission, vector> &trans) { std::ifstream in(params_file); - if (!in) throw runtime_error("failed to read param file: " + params_file); + if (!in) + throw runtime_error("failed to read param file: " + params_file); string hypo_emission_str; getline(in, hypo_emission_str); @@ -128,7 +142,8 @@ read_params_file(const string ¶ms_file, trans.resize(3, vector(3, 0.0)); for (size_t i = 0; i < trans.size(); ++i) - for (size_t j = 0; j < trans[i].size(); ++j) in >> trans[i][j]; + for (size_t j = 0; j < trans[i].size(); ++j) + in >> trans[i][j]; } static void @@ -136,12 +151,13 @@ write_params_file(const string ¶ms_file, const betabin &hypo_emission, const betabin &HYPER_emission, const betabin &HYPO_emission, const vector> &trans) { ofstream out(params_file); - out << hypo_emission.tostring() << endl - << HYPER_emission.tostring() << endl - << HYPO_emission.tostring() << endl; - for (auto &i : trans) { - copy(begin(i), end(i), ostream_iterator(out, "\t")); - out << endl; + out << hypo_emission.tostring() << '\n' + << HYPER_emission.tostring() << '\n' + << HYPO_emission.tostring() << '\n'; + for (const auto &i : trans) { + std::copy(std::cbegin(i), std::cend(i), + ostream_iterator(out, "\t")); + out << '\n'; } } @@ -189,7 +205,8 @@ build_domains(const vector &cpgs, domain.set_name(label + ":" + std::to_string(n)); domain.set_score(meth_sum); domain.set_strand('+'); - if (prev_state == HYPER || prev_state == HYPO) domains.push_back(domain); + if (prev_state == HYPER || prev_state == HYPO) + domains.push_back(domain); } } @@ -212,7 +229,7 @@ initialize_transitions(vector> &trans) { } int -main_hypermr(int argc, char *argv[]) { +main_hypermr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static const string description = "Identify regions of elevated methylation. 
Designed for " @@ -223,7 +240,6 @@ main_hypermr(int argc, char *argv[]) { string outfile; string scores_file; - string trans_file; size_t desert_size = 1000; size_t max_iterations = 10; @@ -241,7 +257,8 @@ main_hypermr(int argc, char *argv[]) { string params_out_file; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(argv[0], description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("out", 'o', "output file (BED format)", false, outfile); opt_parse.add_opt("scores", 's', "output file for posterior scores", false, scores_file); @@ -263,20 +280,20 @@ main_hypermr(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string cpgs_file = leftover_args.front(); @@ -285,19 +302,19 @@ main_hypermr(int argc, char *argv[]) { if (!is_msite_file(cpgs_file)) throw runtime_error("malformed counts file: " + cpgs_file); - if (VERBOSE) cerr << "[loading_data]" << endl; + if (VERBOSE) + cerr << "[loading_data]" << '\n'; vector cpgs; vector> meth; load_cpgs(cpgs_file, cpgs, meth); const size_t n_sites = cpgs.size(); - - double mean_cov = 0.0; - for (auto &&c : cpgs) mean_cov += c.n_reads; - mean_cov /= n_sites; - - if (VERBOSE) - cerr << "[n_sites=" << n_sites << "]" << endl - << "[mean_coverage=" << mean_cov << "]" << endl; + const double mean_cov = + std::accumulate( + std::cbegin(cpgs), std::cend(cpgs), 0.0, + [](const auto a, const auto &c) { return a + c.n_reads; }) / + static_cast(n_sites); + cerr << "[n_sites=" << n_sites << "]" << '\n' + << "[mean_coverage=" << mean_cov << "]" << '\n'; // separate the regions by chrom and by desert, and eliminate // those isolated CpGs @@ -308,9 +325,9 @@ main_hypermr(int argc, char *argv[]) { if (VERBOSE) { const auto des_frac = static_cast(bases_in_deserts) / total_bases; - cerr << "[n_sites_retained=" << cpgs.size() << "]" << endl - << "[deserts_removed=" << reset_points.size() - 2 << "]" << endl - << "[remaining_genome_fraction=" << 1.0 - des_frac << "]" << endl; + cerr << "[n_sites_retained=" << cpgs.size() << "]" << '\n' + << "[deserts_removed=" << reset_points.size() - 2 << "]" << '\n' + << "[remaining_genome_fraction=" << 1.0 - des_frac << "]" << '\n'; } ThreeStateHMM hmm(meth, reset_points, tolerance, max_iterations, VERBOSE); @@ -339,7 +356,8 @@ main_hypermr(int argc, char *argv[]) { } hmm.set_parameters(hypo_emission, HYPER_emission, HYPO_emission, trans); - if (max_iterations > 0) hmm.BaumWelchTraining(); + if (max_iterations > 0) + hmm.BaumWelchTraining(); hmm.get_parameters(hypo_emission, HYPER_emission, HYPO_emission, trans); if (!params_out_file.empty()) @@ -361,31 +379,35 @@ main_hypermr(int argc, char *argv[]) { // write the results ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + 
of.open(outfile); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); copy(begin(domains), end(domains), ostream_iterator(out, "\n")); // if requested, write the posterior scores if (!scores_file.empty()) { - if (USE_VITERBI_DECODING) hmm.PosteriorDecoding(); + if (USE_VITERBI_DECODING) + hmm.PosteriorDecoding(); vector scores; hmm.get_state_posteriors(scores); ofstream score_out(scores_file); for (size_t i = 0; i < cpgs.size(); ++i) { score_out << cpgs[i] << "\t"; if (classes[i] == hypo) - score_out << "hypo\n" << scores[i].hypo << endl; + score_out << "hypo\n" << scores[i].hypo << '\n'; else if (classes[i] == HYPER) - score_out << "HYPER\n" << scores[i].HYPER << endl; + score_out << "HYPER\n" << scores[i].HYPER << '\n'; else // if (classes[i] == HYPO) - score_out << "HYPO\n" << scores[i].HYPO << endl; + score_out << "HYPO\n" << scores[i].HYPO << '\n'; } } } catch (const std::exception &e) { - cerr << "ERROR:\t" << e.what() << endl; + cerr << "ERROR:\t" << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From 629c7c91c720d1ff1020c71e8a44505709ebd180 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 010/106] src/analysis/levels.cpp: changes to add static analysis --- src/analysis/levels.cpp | 111 ++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/src/analysis/levels.cpp b/src/analysis/levels.cpp index 0f516e5c..3acf6699 100644 --- a/src/analysis/levels.cpp +++ b/src/analysis/levels.cpp @@ -1,18 +1,4 @@ -/* levels: a program to compute coverage statistics, mutation rates, - * and three different formulas for methylation levels described in - * the paper: - * - * 'Leveling' the playing field for analyses of single-base - * resolution DNA methylomes - * Schultz, Schmitz & Ecker (TIG 2012) - * - * Note: the fractional methylation level calculated in this program - * is inspired but different from the paper. What we are doing here is - * using binomial test to determine significantly hyper/hypomethylated - * sites, and only use these subset of sites to calculate methylation - * level. - * - * Copyright (C) 2014-2023 University of Southern California and +/* Copyright (C) 2014-2023 University of Southern California and * Andrew D. Smith and Benjamin E Decato * * Authors: Andrew D. Smith and Benjamin E Decato @@ -28,42 +14,51 @@ * General Public License for more details. */ +/* levels: a program to compute coverage statistics, mutation rates, + * and three different formulas for methylation levels described in + * the paper: + * + * 'Leveling' the playing field for analyses of single-base + * resolution DNA methylomes + * Schultz, Schmitz & Ecker (TIG 2012) + * + * Note: the fractional methylation level calculated in this program + * is inspired but different from the paper. What we are doing here is + * using binomial test to determine significantly hyper/hypomethylated + * sites, and only use these subset of sites to calculate methylation + * level. 
+ */ + #include "LevelsCounter.hpp" #include "MSite.hpp" #include "OptionParser.hpp" -#include "bsutils.hpp" #include "counts_header.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" #include -#include +#include +#include #include #include #include -#include +#include #include #include -using std::cerr; -using std::endl; -using std::ios_base; -using std::runtime_error; -using std::string; -using std::vector; - -using bamxx::bgzf_file; - -enum class counts_file_format { ordinary, asym_cpg, sym_cpg }; +enum class counts_file_format : std::uint8_t { + ordinary, + asym_cpg, + sym_cpg, +}; static counts_file_format -guess_counts_file_format(const string &filename) { +guess_counts_file_format(const std::string &filename) { static const uint64_t n_lines_to_check = 10000; const bool has_counts_header = get_has_counts_header(filename); - bgzf_file in(filename, "r"); - if (!in) throw ios_base::failure{"bad input file: " + filename}; + bamxx::bgzf_file in(filename, "r"); + if (!in) + throw std::runtime_error{"bad input file: " + filename}; if (has_counts_header) skip_counts_header(in); @@ -83,16 +78,18 @@ guess_counts_file_format(const string &filename) { } int -main_levels(int argc, char *argv[]) { +main_levels(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { bool verbose = false; bool relaxed_mode = false; - string outfile; + std::string outfile; - const string description = "compute global summary of methylation levels"; + const std::string description = + "compute global summary of methylation levels"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, outfile); opt_parse.add_opt("alpha", 'a', @@ -102,46 +99,48 @@ main_levels(int argc, char *argv[]) { "run on data that appears to have sites filtered", false, relaxed_mode); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string meth_file = leftover_args.front(); + const std::string meth_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ if (!is_msite_file(meth_file)) - throw runtime_error{"malformed counts file: " + meth_file}; + throw std::runtime_error{"malformed counts file: " + meth_file}; const counts_file_format guessed_format = guess_counts_file_format(meth_file); if (guessed_format != counts_file_format::ordinary) { if (verbose) - cerr << "input might be only CpG sites (" - << (guessed_format == counts_file_format::asym_cpg ? 
"not " : "") - << "symmetric)" << endl; + std::cerr << "input might be only CpG sites (" + << (guessed_format == counts_file_format::asym_cpg ? "not " + : "") + << "symmetric)" << '\n'; if (!relaxed_mode) - throw runtime_error{ + throw std::runtime_error{ "unexpected input format (consider using -relaxed)"}; } const bool has_counts_header = get_has_counts_header(meth_file); - bgzf_file in(meth_file, "r"); - if (!in) throw std::runtime_error("bad input file: " + meth_file); + bamxx::bgzf_file in(meth_file, "r"); + if (!in) + throw std::runtime_error("bad input file: " + meth_file); if (has_counts_header) skip_counts_header(in); @@ -156,7 +155,8 @@ main_levels(int argc, char *argv[]) { while (read_site(in, site)) { if (site.chrom != prev_site.chrom) - if (verbose) cerr << "processing " << site.chrom << endl; + if (verbose) + std::cerr << "processing " << site.chrom << '\n'; cytosines.update(site); @@ -174,7 +174,7 @@ main_levels(int argc, char *argv[]) { else if (site.is_cxg()) cxg.update(site); else - throw runtime_error{"bad site context: " + site.context}; + throw std::runtime_error{"bad site context: " + site.context}; prev_site = site; } @@ -182,7 +182,8 @@ main_levels(int argc, char *argv[]) { std::ofstream of; if (!outfile.empty()) { of.open(outfile); - if (!of) throw ios_base::failure{"bad output file: " + outfile}; + if (!of) + throw std::runtime_error{"bad output file: " + outfile}; } std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); @@ -194,7 +195,7 @@ main_levels(int argc, char *argv[]) { << cxg << '\n'; } catch (const std::exception &e) { - cerr << e.what() << endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 50b79d03a47e99d5433044c8a7905b8b10c0d67f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 011/106] src/analysis/metagene.cpp: changes to add static analysis --- src/analysis/metagene.cpp | 96 +++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/src/analysis/metagene.cpp b/src/analysis/metagene.cpp index 3529ffa5..e20d2f6d 100644 --- a/src/analysis/metagene.cpp +++ b/src/analysis/metagene.cpp @@ -19,20 +19,29 @@ * . */ -#include -#include -#include - #include "GenomicRegion.hpp" #include "LevelsCounter.hpp" #include "MSite.hpp" #include "OptionParser.hpp" -#include "bsutils.hpp" -#include "smithlab_utils.hpp" + +#include "bamxx.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include using std::cerr; using std::cout; -using std::endl; using std::ostream; using std::pair; using std::runtime_error; @@ -77,37 +86,40 @@ process_chrom(const uint32_t region_size, const vector &genes, } } -template static void +template +static void collapse_bins(const uint32_t bin_size, vector &v) { const uint32_t n_bins = std::ceil(static_cast(v.size()) / bin_size); vector vv(n_bins); - for (auto i = 0u; i < v.size(); ++i) vv[i / bin_size] += v[i]; + for (auto i = 0u; i < v.size(); ++i) + vv[i / bin_size] += v[i]; v.swap(vv); } int -metagene(int argc, char *argv[]) { +metagene(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) constexpr auto description = - "Compute the information needed for metagene plots of DNA methylation \ - levels. The columns in the output correspond to the fields calculated \ - globally by the `levels` and per-region by the `roi` command. 
Input \ - for features is in BED format, and when present the 6th column is used \ - to indicate strand. For features of non-zero width (where the 2nd and \ - 3rd columns are not identical) the negative strand will indicate that \ - 3rd column should be used. This means, for example, if the features are \ - genes, and the promoters are of interest, the strand will be used \ - correctly."; - + R"( +Compute the information needed for metagene plots of DNA methylation +levels. The columns in the output correspond to the fields calculated +globally by the `levels` and per-region by the `roi` command. Input +for features is in BED format, and when present the 6th column is used +to indicate strand. For features of non-zero width (where the 2nd and +3rd columns are not identical) the negative strand will indicate that +3rd column should be used. This means, for example, if the features are +genes, and the promoters are of interest, the strand will be used +correctly. +)"; try { string outfile; - uint32_t region_size = 5000; + uint32_t region_size = 5000; // NOLINT(*-avoid-magic-numbers) bool verbose = false; bool show_progress = false; - uint32_t bin_size = 50; + uint32_t bin_size = 50; // NOLINT(*-avoid-magic-numbers) /****************** GET COMMAND LINE ARGUMENTS ***************************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " "); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " "); opt_parse.add_opt("output", 'o', "output file (default: terminal)", false, outfile); opt_parse.add_opt("size", 's', "analyze this size in both directions", @@ -118,31 +130,32 @@ metagene(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string features_file_name = leftover_args.front(); const string cpg_file_name = leftover_args.back(); /**********************************************************************/ - if (verbose) cerr << "[loading feature annotations data]" << endl; + if (verbose) + cerr << "[loading feature annotations data]" << '\n'; vector features; ReadBEDFile(features_file_name, features); sort(begin(features), end(features)); if (verbose) - cerr << "[number of features: " << features.size() << "]" << endl; + cerr << "[number of features: " << features.size() << "]" << '\n'; // identify the start and end of ranges for each chromosome unordered_map> lookup; @@ -152,7 +165,8 @@ metagene(int argc, char *argv[]) { auto prev_idx = 0u; for (auto i = 0u; i < features.size(); ++i) if (features[i].get_chrom() != chrom_name) { - if (!chrom_name.empty()) lookup.insert({chrom_name, {prev_idx, i}}); + if (!chrom_name.empty()) + lookup.insert({chrom_name, {prev_idx, i}}); prev_idx = i; chrom_name = features[i].get_chrom(); } @@ -167,7 +181,8 @@ metagene(int argc, char *argv[]) { vector levels(2 * region_size); bamxx::bgzf_file cpgin(cpg_file_name, "r"); - if (!cpgin) throw runtime_error("failed to open 
file: " + cpg_file_name); + if (!cpgin) + throw runtime_error("failed to open file: " + cpg_file_name); vector sites; string line; @@ -182,10 +197,11 @@ metagene(int argc, char *argv[]) { const auto n_features = pair_diff(bounds); if (show_progress) cerr << "[sites=" << sites.size() << " features=" << n_features - << "]" << endl; + << "]" << '\n'; sites.clear(); } - if (show_progress) cerr << "[processing: " << the_site.chrom << "]"; + if (show_progress) + cerr << "[processing: " << the_site.chrom << "]"; chrom_name = the_site.chrom; } sites.push_back(std::move(the_site)); @@ -198,24 +214,24 @@ metagene(int argc, char *argv[]) { const auto n_features = pair_diff(bounds); if (show_progress) cerr << "[sites=" << sites.size() << " features=" << n_features << "]" - << endl; + << '\n'; } collapse_bins(bin_size, levels); if (verbose) - cerr << "output columns:\n" - << LevelsCounter::format_header() << endl; + cerr << "output columns:\n" << LevelsCounter::format_header() << '\n'; std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(of.is_open() ? of.rdbuf() : std::cout.rdbuf()); for (auto i = 0u; i < levels.size(); ++i) - out << i * bin_size << '\t' << levels[i].format_row() << endl; + out << i * bin_size << '\t' << levels[i].format_row() << '\n'; } catch (std::exception &e) { - cerr << e.what() << endl; + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 36a87b706f05c739b3d079addfb3257460fd3869 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 012/106] src/analysis/methcounts.cpp: changes to add static analysis --- src/analysis/methcounts.cpp | 51 ++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/analysis/methcounts.cpp b/src/analysis/methcounts.cpp index 9caa219a..c66b8d5e 100644 --- a/src/analysis/methcounts.cpp +++ b/src/analysis/methcounts.cpp @@ -16,24 +16,35 @@ * more details. 
*/ +#include "OptionParser.hpp" #include "bam_record_utils.hpp" #include "bsutils.hpp" #include "counts_header.hpp" +#include "smithlab_os.hpp" -#include "OptionParser.hpp" +#include -/* HTSlib */ #include -#include // for [u]int[0-9]+_t +#include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include #include #include #include -#include +#include +#include #include +// NOLINTBEGIN(*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) + // ADS: we should never have to worry about coverage over > 32767 in // any downstream analysis, so using "int16_t" here would allow to // detect wrap around and report it as some kind of weird thing, maybe @@ -186,7 +197,8 @@ has_mutated(const char base, const CountSet &cs) { : (cs.pG < mutation_defining_frac * (cs.pos_total())); } -static const char *tag_values[] = { +// NOLINTBEGIN(*-avoid-c-arrays) +static const char *const tag_values[] = { "CpG", // 0 "CHH", // 1 "CXG", // 2 @@ -198,6 +210,7 @@ static const char *tag_values[] = { "CCGx", // 8 "Nx" // 9 }; +// NOLINTEND(*-avoid-c-arrays) static const std::uint32_t MUT_OFFSET = 5; [[nodiscard]] static inline std::uint32_t @@ -210,14 +223,13 @@ static void write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, const int32_t tid, const std::string &chrom, const std::vector &counts, bool CPG_ONLY) { - constexpr auto buf_size = 1024; // max width of a line for counts output + static constexpr auto buf_size = 1024; // max width of output line constexpr const char *fmt = "%s\t%ld\t%c\t%s\t%.6g\t%d\n"; - char buf[buf_size]; + std::array buf{}; for (size_t i = 0; i < size(counts); ++i) { const char base = chrom[i]; if (is_cytosine(base) || is_guanine(base)) { - const std::uint32_t the_tag = get_tag_from_genome(chrom, i); if (CPG_ONLY && the_tag != 0) continue; @@ -234,7 +246,7 @@ write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, const bool mut = has_mutated(base, counts[i]); // clang-format off - const int n = std::snprintf(buf, buf_size, fmt, + const int n = std::snprintf(buf.data(), buf_size, fmt, sam_hdr_tid2name_ptr(hdr, tid), i, (is_c ? '+' : '-'), @@ -242,7 +254,7 @@ write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out, (n_reads > 0 ? 
unconverted / n_reads : 0.0), n_reads); // clang-format on - if (n < 0 || !out.write(buf, n)) + if (n < 0 || !out.write(buf.data(), n)) throw std::runtime_error("error formatting output"); } } @@ -314,7 +326,7 @@ count_states_neg(const bamxx::bam_rec &aln, std::vector &counts) { [[nodiscard]] static std::unordered_map get_tid_to_idx(const bamxx::bam_header &hdr, - const std::unordered_map name_to_idx) { + const std::unordered_map &name_to_idx) { std::unordered_map tid_to_idx; for (int32_t i = 0; i < hdr.h->n_targets; ++i) { // "curr_name" gives a "tid_to_name" mapping allowing to jump @@ -337,7 +349,6 @@ output_skipped_chromosome( const std::vector::const_iterator chroms_beg, const std::vector &chrom_sizes, std::vector &counts, bamxx::bgzf_file &out) { - // get the index of the next chrom sequence const auto chrom_idx = tid_to_idx.find(tid); if (chrom_idx == std::cend(tid_to_idx)) @@ -378,7 +389,7 @@ template static void process_reads(const bool VERBOSE, const bool show_progress, const bool compress_output, const bool include_header, - const size_t n_threads, const std::string &infile, + const std::int32_t n_threads, const std::string &infile, const std::string &outfile, const std::string &chroms_file, const bool CPG_ONLY) { // first get the chromosome names and sequences from the FASTA file @@ -438,7 +449,7 @@ process_reads(const bool VERBOSE, const bool show_progress, // this is where all the counts are accumulated std::vector counts; - std::vector::const_iterator chrom_itr; + std::vector::const_iterator chrom_itr{}; while (hts.read(hdr, aln)) { const int32_t tid = get_tid(aln); @@ -499,10 +510,8 @@ process_reads(const bool VERBOSE, const bool show_progress, } int -main_counts(int argc, char *argv[]) { - +main_counts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - bool VERBOSE = false; bool show_progress = false; bool CPG_ONLY = false; @@ -512,10 +521,10 @@ main_counts(int argc, char *argv[]) { std::string chroms_file; std::string outfile; - int n_threads = 1; + std::int32_t n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "get methylation levels from " "mapped bisulfite sequencing reads", "-c "); @@ -584,3 +593,5 @@ main_counts(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) From 2f8877bf30461a25675f0d2d88e86f10172c09b4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 013/106] src/analysis/methentropy.cpp: changes to add static analysis --- src/analysis/methentropy.cpp | 120 +++++++++++++++++------------------ 1 file changed, 58 insertions(+), 62 deletions(-) diff --git a/src/analysis/methentropy.cpp b/src/analysis/methentropy.cpp index 03fdfa68..f00cebcc 100644 --- a/src/analysis/methentropy.cpp +++ b/src/analysis/methentropy.cpp @@ -1,33 +1,42 @@ -/* Copyright (C) 2013-2025 University of Southern California - * Andrew D Smith and Jenny Qu +/* Copyright (C) 2013-2025 Andrew D Smith * - * Author: Jenny Qu and Andrew D. Smith + * Author: Andrew D. Smith * - * This is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
+ * This is free software; you can redistribute it and/or modify it under the + * terms of the GNU General Public License as published by the Free Software + * Foundation; either version 2 of the License, or (at your option) any later + * version. * - * This is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ +#include "OptionParser.hpp" +#include "smithlab_utils.hpp" + +#include +#include +#include +#include #include +#include #include #include +#include +#include +#include +#include #include #include #include +#include #include -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +// NOLINTBEGIN(*-narrowing-conversions) -[[nodiscard]] static auto +static auto read_fasta(const std::string &fasta_file, std::unordered_map &chrom_names, std::vector &chroms) { @@ -113,11 +122,12 @@ struct epiread { bool epiread::flip_states() { + static constexpr auto one_half = 0.5; const std::size_t meth_states_count = std::count(seq.begin(), seq.end(), 'C'); const auto sz = std::size(seq); - if (meth_states_count < 0.5 * sz) { + if (meth_states_count < one_half * sz) { for (std::size_t i = 0; i < sz; ++i) - seq[i] = (seq[i] == 'T') ? 'C' : ((seq[i] == 'C') ? 'T' : seq[i]); + seq[i] = seq[i] == 'T' ? 'C' : (seq[i] == 'C' ? 'T' : seq[i]); return true; } return false; @@ -175,12 +185,10 @@ compute_entropy_for_window(const std::vector &site_probs, const std::size_t start_cpg, const std::size_t end_cpg, std::size_t &reads_in_window) { - const std::size_t n_states = 1ul << (end_cpg - start_cpg); double entropy = 0.0; for (std::size_t i = 0; i < n_states; ++i) { - double state_prob = 0.0; reads_in_window = 0; for (std::size_t j = start_idx; j < end_idx; ++j) @@ -197,22 +205,17 @@ compute_entropy_for_window(const std::vector &site_probs, return entropy; } -//////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////// -/////// -/////// CODE FOR SLIDING THE WINDOW ALONG THE CHROMOSOME BELOW HERE -/////// +// code for sliding the window along the chromosome below here -/* This function just basically computes the same thing as methcounts - output, so that unobserved states can be imputed */ +// This function computes the same thing as methcounts output, so that +// unobserved states can be imputed static std::vector compute_site_probs(const std::size_t n_cpgs, const std::vector &epireads) { - std::vector site_probs(n_cpgs); std::vector totals(n_cpgs); - for (std::size_t i = 0; i < epireads.size(); ++i) { + for (std::size_t i = 0; i < std::size(epireads); ++i) { const std::size_t len = std::size(epireads[i]); std::size_t idx = epireads[i].pos; for (std::size_t j = 0; j < len; ++j, ++idx) { @@ -220,11 +223,8 @@ compute_site_probs(const std::size_t n_cpgs, totals[idx] += (epireads[i].seq[j] != 'N'); } } - for (auto i = 0u; i < std::size(site_probs); ++i) - if (totals[i] > 0.0) - site_probs[i] /= totals[i]; - + site_probs[i] /= totals[i] > 0.0 ? 
totals[i] : 1.0; return site_probs; } @@ -232,7 +232,7 @@ static void move_start_index(const std::size_t max_epiread_len, const std::vector &epireads, const std::size_t start_cpg, std::size_t &idx) { - while (idx < epireads.size() && + while (idx < std::size(epireads) && epireads[idx].get_end() + max_epiread_len <= start_cpg) ++idx; } @@ -246,29 +246,27 @@ move_end_index(const std::vector &epireads, } static void -process_chrom(const bool VERBOSE, const std::size_t cpg_window, +process_chrom(const bool verbose, const std::size_t cpg_window, const std::vector &epireads, const std::unordered_map &cpg_lookup, std::ostream &out) { - const std::string chrom(epireads.front().chr); if (!check_sorted(epireads)) throw std::runtime_error("epireads not sorted in chrom: " + chrom); const std::size_t n_cpgs = cpg_lookup.size(); - if (VERBOSE) + if (verbose) std::cerr << "processing " << chrom << " (cpgs = " << n_cpgs << ")\n"; const auto site_probs = compute_site_probs(n_cpgs, epireads); - std::size_t max_epiread_len = 0; - for (const auto &er : epireads) - max_epiread_len = std::max(max_epiread_len, std::size(er)); + std::size_t max_epiread_len = std::accumulate( + std::cbegin(epireads), std::cend(epireads), 0ul, + [](const auto a, const auto &er) { return std::max(a, std::size(er)); }); std::size_t start_cpg = 0; std::size_t start_idx = 0, end_idx = 0; while (start_cpg + cpg_window < n_cpgs) { - move_start_index(max_epiread_len, epireads, start_cpg, start_idx); move_end_index(epireads, start_cpg, cpg_window, end_idx); @@ -286,44 +284,42 @@ process_chrom(const bool VERBOSE, const std::size_t cpg_window, } int -main_methentropy(int argc, char *argv[]) { - +main_methentropy(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - bool VERBOSE = false; + bool verbose = false; bool FLIP_MAJORITY_STATE = false; - std::size_t cpg_window = 4; + std::size_t cpg_window = 4; // NOLINT(*-avoid-magic-numbers) std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "compute methylation entropy in sliding window", " "); opt_parse.add_opt("window", 'w', "number of CpGs in sliding window", false, cpg_window); opt_parse.add_opt("flip", 'F', "flip read majority state to meth", false, FLIP_MAJORITY_STATE); - opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, + opt_parse.add_opt("output", 'o', "output file (default: stdout)", true, outfile); - opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - std::cerr << opt_parse.help_message() << std::endl - << opt_parse.about_message() << std::endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - std::cerr << opt_parse.about_message() << std::endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - std::cerr << opt_parse.option_missing_message() << std::endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - std::cerr << opt_parse.help_message() << std::endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const std::string genome_file = 
leftover_args.front(); @@ -338,16 +334,14 @@ main_methentropy(int argc, char *argv[]) { if (!in) throw std::runtime_error("cannot open input file: " + epi_file); - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); std::unordered_map cpg_lookup; std::vector epireads; epiread tmp_er; - const std::string chrom; while (in >> tmp_er) { const auto &name = tmp_er.chr; if (!epireads.empty() && name != epireads.back().chr) { @@ -356,7 +350,7 @@ main_methentropy(int argc, char *argv[]) { if (itr == std::cend(chrom_names)) throw std::runtime_error("chrom not found: " + chrom); build_coordinate_converter(chroms[itr->second], cpg_lookup); - process_chrom(VERBOSE, cpg_window, epireads, cpg_lookup, out); + process_chrom(verbose, cpg_window, epireads, cpg_lookup, out); epireads.clear(); } if (FLIP_MAJORITY_STATE) @@ -369,12 +363,14 @@ main_methentropy(int argc, char *argv[]) { if (itr == std::cend(chrom_names)) throw std::runtime_error("chrom not found: " + chrom); build_coordinate_converter(chroms[itr->second], cpg_lookup); - process_chrom(VERBOSE, cpg_window, epireads, cpg_lookup, out); + process_chrom(verbose, cpg_window, epireads, cpg_lookup, out); } } catch (const std::exception &e) { - std::cerr << e.what() << "\n"; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-narrowing-conversions) From c2fcf3a37afd1194839cc95eb50c2b94d66f865d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 014/106] src/analysis/methstates.cpp: changes to add static analysis --- src/analysis/methstates.cpp | 115 +++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 46 deletions(-) diff --git a/src/analysis/methstates.cpp b/src/analysis/methstates.cpp index 4d18e007..4bf95964 100644 --- a/src/analysis/methstates.cpp +++ b/src/analysis/methstates.cpp @@ -17,26 +17,30 @@ * General Public License for more details. 
*/ +#include "OptionParser.hpp" +#include "bam_record_utils.hpp" +#include "dnmt_error.hpp" +#include "smithlab_os.hpp" + +#include + #include +#include +#include +#include +#include #include -#include -#include +#include +#include +#include #include +#include #include +#include #include -#include "OptionParser.hpp" -#include "bam_record_utils.hpp" -#include "cigar_utils.hpp" -#include "dnmt_error.hpp" -#include "htslib_wrapper.hpp" -#include "sam_record.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - using std::cerr; using std::cout; -using std::endl; using std::lower_bound; using std::string; using std::unordered_map; @@ -45,26 +49,31 @@ using std::vector; using bamxx::bam_rec; -static const char b2c[] = "TNGNNNCNNNNNNNNNNNNA"; +static const char b2c[] = "TNGNNNCNNNNNNNNNNNNA"; // NOLINT(*-avoid-c-arrays) struct quick_buf : public std::ostringstream, public std::basic_stringbuf { // ADS: consider putting this class in a header somewhere quick_buf() { static_cast &>(*this).rdbuf(this); } - void clear() { setp(pbase(), pbase()); } + void + clear() { + setp(pbase(), pbase()); + } - char const *c_str() { + char const * + c_str() { *pptr() = '\0'; return pbase(); } }; -template +template // constexpr // since C++20 OutputIt revcomp_copy(BidirIt first, BidirIt last, OutputIt d_first) { - for (; first != last; ++d_first) *d_first = b2c[*(--last) - 'A']; + for (; first != last; ++d_first) + *d_first = b2c[*(--last) - 'A']; // NOLINT(*-constant-array-index) return d_first; } @@ -78,7 +87,8 @@ collect_cpgs(const string &s, vector &cpgs) { cpgs.clear(); const uint64_t lim = std::size(s) - 1; for (auto i = 0u; i < lim; ++i) - if (is_cpg(s, i)) cpgs.push_back(i); + if (is_cpg(s, i)) + cpgs.push_back(i); } static bool @@ -102,12 +112,14 @@ convert_meth_states_pos(const vector &cpgs, auto cpg_itr = lower_bound(begin(cpgs), end(cpgs), seq_start); auto first_cpg_itr = end(cpgs); - if (cpg_itr == end(cpgs)) return false; + if (cpg_itr == end(cpgs)) + return false; for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end; cpg_itr++) { const char x = seq_str[*cpg_itr - seq_start]; states += (x == 'T') ? 'T' : ((x == 'C') ? 'C' : 'N'); - if (first_cpg_itr == end(cpgs)) first_cpg_itr = cpg_itr; + if (first_cpg_itr == end(cpgs)) + first_cpg_itr = cpg_itr; } if (first_cpg_itr != end(cpgs)) @@ -152,12 +164,15 @@ convert_meth_states_neg(const vector &cpgs, lower_bound(begin(cpgs), end(cpgs), seq_start > 0 ? seq_start - 1 : 0); auto first_cpg_itr = end(cpgs); - if (cpg_itr == end(cpgs)) { return false; } + if (cpg_itr == end(cpgs)) { + return false; + } else { for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end - 1; cpg_itr++) { const char x = seq_str[*cpg_itr - seq_start + 1]; states += (x == 'G') ? 'C' : ((x == 'A') ? 
'T' : 'N'); - if (first_cpg_itr == end(cpgs)) first_cpg_itr = cpg_itr; + if (first_cpg_itr == end(cpgs)) + first_cpg_itr = cpg_itr; } } @@ -176,20 +191,21 @@ get_chrom(const string &chrom_name, const vector &all_chroms, throw dnmt_error("could not find chrom: " + chrom_name); chrom = all_chroms[the_chrom->second]; - if (chrom.empty()) throw dnmt_error("problem with chrom: " + chrom_name); + if (chrom.empty()) + throw dnmt_error("problem with chrom: " + chrom_name); } int -main_methstates(int argc, char *argv[]) { +main_methstates(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - const string description = - "Convert mapped reads in SAM format into a format that indicates binary \ - sequences of methylation states in each read, indexed by the identity \ - of the CpG they cover, along with the chromosome. Only reads that \ - cover a CpG site are included in the output. All output is relative to \ - the positive reference strand. This format is used as input to other \ - tools, and is not intended to be human-interpretable. All chromosome \ - sequences are loaded at once."; + const auto description = + R"(Convert mapped reads in SAM format into a format that indicates binary + sequences of methylation states in each read, indexed by the identity + of the CpG they cover, along with the chromosome. Only reads that + cover a CpG site are included in the output. All output is relative to + the positive reference strand. This format is used as input to other + tools, and is not intended to be human-interpretable. All chromosome + sequences are loaded at once.)"; bool VERBOSE = false; bool compress_output = false; @@ -199,7 +215,8 @@ main_methstates(int argc, char *argv[]) { int n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(argv[0], description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file name", false, outfile); opt_parse.add_opt("chrom", 'c', "fasta format reference genome file", true, chrom_file); @@ -211,26 +228,27 @@ main_methstates(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string mapped_reads_file = leftover_args.front(); /****************** END COMMAND LINE OPTIONS *****************/ - if (n_threads < 0) throw dnmt_error("thread count cannot be negative"); + if (n_threads < 0) + throw dnmt_error("thread count cannot be negative"); /* first load in all the chromosome sequences and names, and make a map from chromosome name to the location of the chromosome @@ -245,19 +263,23 @@ main_methstates(int argc, char *argv[]) { for (uint64_t i = 0; i < chrom_names.size(); ++i) chrom_lookup[chrom_names[i]] = i; - if (VERBOSE) cerr << "n_chroms: " << all_chroms.size() << endl; + if (VERBOSE) + cerr << 
"n_chroms: " << all_chroms.size() << '\n'; bamxx::bam_tpool tp(n_threads); // declared first; destroyed last bamxx::bam_in in(mapped_reads_file); - if (!in) throw dnmt_error("cannot open input file " + mapped_reads_file); + if (!in) + throw dnmt_error("cannot open input file " + mapped_reads_file); bamxx::bam_header hdr(in); - if (!hdr) throw dnmt_error("cannot read heade" + mapped_reads_file); + if (!hdr) + throw dnmt_error("cannot read heade" + mapped_reads_file); // open the output file const string output_mode = compress_output ? "w" : "wu"; bamxx::bgzf_file out(outfile, output_mode); - if (!out) throw dnmt_error("error opening output file: " + outfile); + if (!out) + throw dnmt_error("error opening output file: " + outfile); /* set the threads for the input file decompression */ if (n_threads > 1) { @@ -285,7 +307,8 @@ main_methstates(int argc, char *argv[]) { if (chroms_seen.find(chrom_name) != end(chroms_seen)) throw dnmt_error("chroms out of order (check SAM file sorted)"); - if (VERBOSE) cerr << "processing " << chrom_name << endl; + if (VERBOSE) + cerr << "processing " << chrom_name << '\n'; get_chrom(chrom_name, all_chroms, chrom_lookup, chrom); collect_cpgs(chrom, cpgs); @@ -304,14 +327,14 @@ main_methstates(int argc, char *argv[]) { buf << sam_hdr_tid2name(hdr, aln) << '\t' << first_cpg_index << '\t' << seq << '\n'; if (!out.write(buf.c_str(), buf.tellp())) { - cerr << "failure writing output" << endl; + cerr << "failure writing output" << '\n'; return EXIT_FAILURE; } } } } catch (const std::exception &e) { - cerr << e.what() << endl; + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 556e65e8a16e61485fba42b707135d1ef73d579f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 015/106] src/analysis/multimethstat.cpp: changes to add static analysis --- src/analysis/multimethstat.cpp | 351 +++++++++++++++------------------ 1 file changed, 164 insertions(+), 187 deletions(-) diff --git a/src/analysis/multimethstat.cpp b/src/analysis/multimethstat.cpp index e03fe3ad..c745bd82 100644 --- a/src/analysis/multimethstat.cpp +++ b/src/analysis/multimethstat.cpp @@ -15,54 +15,54 @@ * General Public License for more details. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "GenomicRegion.hpp" - #include "MSite.hpp" - +#include "OptionParser.hpp" #include "bsutils.hpp" -using std::string; -using std::vector; -using std::cout; +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + using std::cerr; +using std::cout; using std::endl; -using std::pair; -using std::make_pair; -using std::ios_base; -using std::runtime_error; using std::ifstream; -using std::isfinite; +using std::ios_base; using std::is_sorted; +using std::isfinite; +using std::make_pair; +using std::pair; +using std::runtime_error; +using std::string; +using std::vector; using bamxx::bgzf_file; +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) + static pair meth_unmeth_calls(const size_t n_meth, const size_t n_unmeth) { static const double alpha = 0.95; // get info for binomial test double lower = 0.0, upper = 0.0; const size_t total = n_meth + n_unmeth; - wilson_ci_for_binomial(alpha, total, - static_cast(n_meth)/total, lower, upper); + wilson_ci_for_binomial(alpha, total, static_cast(n_meth) / total, + lower, upper); return make_pair(lower > 0.5, upper < 0.5); } - static std::pair region_bounds(const vector &sites, const GenomicRegion ®ion) { const string chrom(region.get_chrom()); @@ -77,7 +77,6 @@ region_bounds(const vector &sites, const GenomicRegion ®ion) { return make_pair(a_ins - begin(sites), b_ins - begin(sites)); } - /* This function is used to make sure the output is done the same way in the different places it might be generated. One issue is that @@ -86,61 +85,58 @@ region_bounds(const vector &sites, const GenomicRegion ®ion) { is binned. 
*/ static string -format_output_line(const bool report_more_information, - const GenomicRegion &r, - const vector &score, - const vector &weighted_mean_meth, - const vector &mean_meth, - const vector &fractional_meth, - const size_t total_cpgs, - const vector &cpgs_with_reads, - const vector &total_meth, - const vector &total_reads) { - +format_output_line( + const bool report_more_information, const GenomicRegion &r, + const vector &score, const vector &weighted_mean_meth, + const vector &mean_meth, const vector &fractional_meth, + const size_t total_cpgs, const vector &cpgs_with_reads, + const vector &total_meth, const vector &total_reads) { std::ostringstream oss; - oss << r.get_chrom() << '\t' - << r.get_start() << '\t' - << r.get_end() << '\t' + oss << r.get_chrom() << '\t' << r.get_start() << '\t' << r.get_end() << '\t' << r.get_name() << '\t'; const size_t n_samples = score.size(); for (size_t i = 0; i < n_samples; ++i) { - if (isfinite(score[i])) oss << score[i]; - else oss << "NA"; + if (isfinite(score[i])) + oss << score[i]; + else + oss << "NA"; // GS: commenting this out because we shouldn't be reporting // strand several times // oss << '\t' << r.get_strand(); if (report_more_information) { - oss << '\t'; - if (isfinite(weighted_mean_meth[i])) oss << weighted_mean_meth[i]; - else oss << "NA"; + if (isfinite(weighted_mean_meth[i])) + oss << weighted_mean_meth[i]; + else + oss << "NA"; oss << '\t'; - if (isfinite(mean_meth[i])) oss << mean_meth[i]; - else oss << "NA"; + if (isfinite(mean_meth[i])) + oss << mean_meth[i]; + else + oss << "NA"; oss << '\t'; - if (isfinite(fractional_meth[i])) oss << fractional_meth[i]; - else oss << "NA"; + if (isfinite(fractional_meth[i])) + oss << fractional_meth[i]; + else + oss << "NA"; - oss << '\t' << total_cpgs - << '\t' << cpgs_with_reads[i] - << '\t' << total_meth[i] - << '\t' << total_reads[i]; + oss << '\t' << total_cpgs << '\t' << cpgs_with_reads[i] << '\t' + << total_meth[i] << '\t' << total_reads[i]; } - if (i < n_samples - 1) oss << '\t'; + if (i < n_samples - 1) + oss << '\t'; } return oss.str(); } static void -get_site_and_values(string &line, MSite &site, - vector &values) { - +get_site_and_values(string &line, MSite &site, vector &values) { // in tabular, chrom/pos/strand is separated by colons replace(begin(line), end(line), ':', '\t'); std::istringstream iss(line); @@ -155,7 +151,8 @@ get_site_and_values(string &line, MSite &site, static bool all_is_finite(const vector &scores) { for (auto it(begin(scores)); it != end(scores); ++it) - if (!isfinite(*it)) return false; + if (!isfinite(*it)) + return false; return true; } @@ -165,13 +162,11 @@ process_with_cpgs_loaded(const bool VERBOSE, // const bool sort_data_if_needed, const bool PRINT_NUMERIC_ONLY, const bool report_more_information, - const char level_code, - const string &cpgs_file, - vector ®ions, - std::ostream &out) { - + const char level_code, const string &cpgs_file, + vector ®ions, std::ostream &out) { bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("cannot open file: " + cpgs_file); + if (!in) + throw runtime_error("cannot open file: " + cpgs_file); string header; getline(in, header); @@ -184,17 +179,17 @@ process_with_cpgs_loaded(const bool VERBOSE, auto end_uniq = std::unique(begin(col_names), end(col_names)); // make sure all columns come in pairs of counts - if (2*static_cast(distance(begin(col_names), end_uniq)) - != col_names.size()) + if (2 * static_cast(distance(begin(col_names), end_uniq)) != + col_names.size()) throw runtime_error("wrong header 
format:\n" + header); col_names.resize(distance(begin(col_names), end_uniq)); const size_t n_samples = col_names.size(); if (VERBOSE) - cerr << "[n_samples=" << n_samples << "]" << endl; + cerr << "[n_samples=" << n_samples << "]\n"; vector cpgs; - vector > values; + vector> values; MSite the_cpg; string line; while (getline(in, line)) { @@ -202,14 +197,14 @@ process_with_cpgs_loaded(const bool VERBOSE, get_site_and_values(line, the_cpg, values.back()); cpgs.push_back(the_cpg); - if (values.back().size() != 2*n_samples) - throw runtime_error("wrong row length. Expected " - + std::to_string(2*n_samples) + ", got " + std::to_string(values.back().size()) - + "\n" + line); + if (values.back().size() != 2 * n_samples) + throw runtime_error("wrong row length. Expected " + + std::to_string(2 * n_samples) + ", got " + + std::to_string(values.back().size()) + "\n" + line); } if (VERBOSE) - cerr << "[n_cpgs=" << cpgs.size() << "]" << endl; + cerr << "[n_cpgs=" << cpgs.size() << "]\n"; if (!is_sorted(begin(cpgs), end(cpgs))) { /* GS: need a way to sort cpgs and values to preserve the order. @@ -217,11 +212,11 @@ process_with_cpgs_loaded(const bool VERBOSE, if (sort_data_if_needed) sort(begin(cpgs), end(cpgs)); else*/ - throw runtime_error("data not sorted: " + cpgs_file); + throw runtime_error("data not sorted: " + cpgs_file); } if (VERBOSE) - cerr << "[n_cpgs=" << cpgs.size() << "]" << endl; + cerr << "[n_cpgs=" << cpgs.size() << "]\n"; // write header as first column for (size_t i = 0; i < n_samples; ++i) @@ -240,17 +235,16 @@ process_with_cpgs_loaded(const bool VERBOSE, vector score(n_samples, 0); for (size_t i = 0; i < regions.size(); ++i) { - const pair bounds(region_bounds(cpgs, regions[i])); for (size_t j = 0; j < n_samples; ++j) { - total_meth[j] = total_reads[j] = cpgs_with_reads[j] = - called_total[j] = called_meth[j] = 0; + total_meth[j] = total_reads[j] = cpgs_with_reads[j] = called_total[j] = + called_meth[j] = 0; double mean_meth = 0.0; for (size_t k = bounds.first; k < bounds.second; ++k) { - const size_t meth = values[k][2*j + 1]; - const size_t tot = values[k][2*j]; + const size_t meth = values[k][2 * j + 1]; + const size_t tot = values[k][2 * j]; if (tot > 0) { total_reads[j] += tot; total_meth[j] += meth; @@ -260,31 +254,32 @@ process_with_cpgs_loaded(const bool VERBOSE, called_total[j] += (calls.first || calls.second); called_meth[j] += calls.first; - mean_meth += meth/static_cast(tot); + mean_meth += meth / static_cast(tot); } } - fractional_meth[j] = static_cast(called_meth[j])/called_total[j]; - weighted_mean_meth[j] = static_cast(total_meth[j])/total_reads[j]; - unweighted_mean_meth[j] = mean_meth/cpgs_with_reads[j]; + fractional_meth[j] = + static_cast(called_meth[j]) / called_total[j]; + weighted_mean_meth[j] = + static_cast(total_meth[j]) / total_reads[j]; + unweighted_mean_meth[j] = mean_meth / cpgs_with_reads[j]; - score[j] = (level_code == 'w' ? weighted_mean_meth[j] : - (level_code == 'u' ? unweighted_mean_meth[j] : - fractional_meth[j])); + score[j] = + (level_code == 'w' ? weighted_mean_meth[j] + : (level_code == 'u' ? 
unweighted_mean_meth[j] + : fractional_meth[j])); } const size_t total_cpgs = bounds.second - bounds.first; if (!PRINT_NUMERIC_ONLY || all_is_finite(score)) out << format_output_line(report_more_information, regions[i], score, weighted_mean_meth, unweighted_mean_meth, - fractional_meth, - total_cpgs, cpgs_with_reads, + fractional_meth, total_cpgs, cpgs_with_reads, total_meth, total_reads) - << endl; + << '\n'; } } - //////////////////////////////////////////////////////////////////////// /// /// CODE BELOW HERE IS FOR SEARCHING ON DISK @@ -292,7 +287,7 @@ process_with_cpgs_loaded(const bool VERBOSE, static void move_to_start_of_line(ifstream &in) { - char next; + char next{}; while (in.good() && in.get(next) && next != '\n') { in.unget(); in.unget(); @@ -313,7 +308,6 @@ get_chr_and_idx(ifstream &in, string &chrom, size_t &pos) { static void find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { - cpg_in.seekg(0, ios_base::beg); const size_t begin_pos = cpg_in.tellg(); cpg_in.seekg(0, ios_base::end); @@ -322,7 +316,7 @@ find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { if (end_pos - begin_pos < 2) throw runtime_error("empty meth file"); - size_t step_size = (end_pos - begin_pos)/2; + size_t step_size = (end_pos - begin_pos) / 2; cpg_in.seekg(0, ios_base::beg); string low_chr; @@ -333,7 +327,7 @@ find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { cpg_in.seekg(-2, ios_base::end); move_to_start_of_line(cpg_in); string high_chr; - size_t high_idx; + size_t high_idx{}; get_chr_and_idx(cpg_in, high_chr, high_idx); size_t pos = step_size; @@ -362,23 +356,18 @@ find_start_line(const string &chr, const size_t idx, ifstream &cpg_in) { } } - static bool cpg_not_past_region(const GenomicRegion ®ion, const size_t end_pos, const MSite &cpg) { return (cpg.chrom == region.get_chrom() && cpg.pos < end_pos) || - cpg.chrom < region.get_chrom(); + cpg.chrom < region.get_chrom(); } - static void -get_cpg_stats(ifstream &cpg_in, const GenomicRegion region, - vector &total_meth, - vector &total_reads, - size_t &total_cpgs, - vector &cpgs_with_reads, - vector &called_total, - vector &called_meth, +get_cpg_stats(ifstream &cpg_in, const GenomicRegion ®ion, + vector &total_meth, vector &total_reads, + size_t &total_cpgs, vector &cpgs_with_reads, + vector &called_total, vector &called_meth, vector &mean_meth) { const string chrom(region.get_chrom()); const size_t start_pos = region.get_start(); @@ -389,24 +378,23 @@ get_cpg_stats(ifstream &cpg_in, const GenomicRegion region, MSite cpg; vector values; - values.reserve(2*n_samples); + values.reserve(2 * n_samples); string line; while (getline(cpg_in, line)) { values.clear(); get_site_and_values(line, cpg, values); - if (values.size() != 2*n_samples) - throw runtime_error("wrong row length. Expected " - + std::to_string(2*n_samples) + ", got " + std::to_string(values.size()) - + "\n" + line); + if (values.size() != 2 * n_samples) + throw runtime_error("wrong row length. 
Expected " + + std::to_string(2 * n_samples) + ", got " + + std::to_string(values.size()) + "\n" + line); if (cpg_not_past_region(region, end_pos, cpg)) { if (start_pos <= cpg.pos && cpg.chrom == chrom) { ++total_cpgs; for (size_t j = 0; j < n_samples; ++j) { - const size_t meth = values[2*j + 1]; - const size_t tot = values[2*j]; - if (values[2*j] > 0) { - + const size_t meth = values[2 * j + 1]; + const size_t tot = values[2 * j]; + if (values[2 * j] > 0) { total_meth[j] += meth; total_reads[j] += tot; ++cpgs_with_reads[j]; @@ -415,7 +403,7 @@ get_cpg_stats(ifstream &cpg_in, const GenomicRegion region, called_total[j] += (calls.first || calls.second); called_meth[j] += calls.first; - mean_meth[j] += meth/static_cast(tot); + mean_meth[j] += meth / static_cast(tot); } } } @@ -424,15 +412,11 @@ get_cpg_stats(ifstream &cpg_in, const GenomicRegion region, cpg_in.clear(); } - static void process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, const bool report_more_information, - const char level_code, - const string &cpgs_file, - vector ®ions, - std::ostream &out) { - + const char level_code, const string &cpgs_file, + vector ®ions, std::ostream &out) { ifstream in(cpgs_file); string header; getline(in, header); @@ -445,8 +429,8 @@ process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, auto end_uniq = std::unique(begin(col_names), end(col_names)); // make sure all columns come in pairs of counts - if (2*static_cast(distance(begin(col_names), end_uniq)) - != col_names.size()) + if (2 * static_cast(distance(begin(col_names), end_uniq)) != + col_names.size()) throw runtime_error("wrong header format:\n" + header); col_names.resize(distance(begin(col_names), end_uniq)); @@ -469,47 +453,44 @@ process_with_cpgs_on_disk(const bool PRINT_NUMERIC_ONLY, vector score(n_samples, 0.0); vector mean_meth(n_samples, 0.0); - vector values(2*n_samples, 0.0); + vector values(2 * n_samples, 0.0); for (size_t i = 0; i < regions.size() && in; ++i) { size_t total_cpgs = 0; mean_meth = vector(n_samples, 0.0); for (size_t j = 0; j < n_samples; ++j) { - total_meth[j] = total_reads[j] = cpgs_with_reads[j] = - called_total[j] = called_meth[j] = 0; + total_meth[j] = total_reads[j] = cpgs_with_reads[j] = called_total[j] = + called_meth[j] = 0; } - get_cpg_stats(in, regions[i], total_meth, total_reads, total_cpgs, cpgs_with_reads, - called_total, called_meth, mean_meth); + get_cpg_stats(in, regions[i], total_meth, total_reads, total_cpgs, + cpgs_with_reads, called_total, called_meth, mean_meth); for (size_t j = 0; j < n_samples; ++j) { - fractional_meth[j] = static_cast(called_meth[j])/called_total[j]; - weighted_mean_meth[j] = static_cast(total_meth[j])/total_reads[j]; - unweighted_mean_meth[j] = mean_meth[j]/cpgs_with_reads[j]; - - score[j] = (level_code == 'w' ? weighted_mean_meth[j] : - (level_code == 'u' ? unweighted_mean_meth[j] : - fractional_meth[j])); + fractional_meth[j] = + static_cast(called_meth[j]) / called_total[j]; + weighted_mean_meth[j] = + static_cast(total_meth[j]) / total_reads[j]; + unweighted_mean_meth[j] = mean_meth[j] / cpgs_with_reads[j]; + + score[j] = + (level_code == 'w' ? weighted_mean_meth[j] + : (level_code == 'u' ? 
unweighted_mean_meth[j] + : fractional_meth[j])); } if (!PRINT_NUMERIC_ONLY || all_is_finite(score)) out << format_output_line(report_more_information, regions[i], score, weighted_mean_meth, unweighted_mean_meth, - fractional_meth, - total_cpgs, cpgs_with_reads, + fractional_meth, total_cpgs, cpgs_with_reads, total_meth, total_reads) - << endl; + << '\n'; } } -/// -/// END OF CODE FOR SEARCHING ON DISK -/// -//////////////////////////////////////////////////////////////////////// - +// end of code for searching on disk -static size_t +[[nodiscard]] static size_t check_bed_format(const string ®ions_file) { - ifstream in(regions_file); if (!in) throw runtime_error("cannot open file: " + regions_file); @@ -526,12 +507,9 @@ check_bed_format(const string ®ions_file) { return n_columns; } - int -main_multimethstat(int argc, char *argv[]) { - +main_multimethstat(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const string default_name_prefix = "X"; bool VERBOSE = false; @@ -545,7 +523,8 @@ main_multimethstat(int argc, char *argv[]) { string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "Compute average CpG " + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + "Compute average CpG " "methylation in each of a set of genomic intervals", " "); opt_parse.set_show_defaults(); @@ -553,40 +532,42 @@ main_multimethstat(int argc, char *argv[]) { false, outfile); opt_parse.add_opt("numeric", 'N', "print numeric values only (not NAs)", false, PRINT_NUMERIC_ONLY); - opt_parse.add_opt("preload", 'L', "Load all CpG sites", - false, load_entire_file); - opt_parse.add_opt("sort", 's', "sort data if needed", - false, sort_data_if_needed); - opt_parse.add_opt("level", 'l', "the level to report as score column " - "in bed format output (w, u or f)", false, level_code); + opt_parse.add_opt("preload", 'L', "Load all CpG sites", false, + load_entire_file); + opt_parse.add_opt("sort", 's', "sort data if needed", false, + sort_data_if_needed); + opt_parse.add_opt("level", 'l', + "the level to report as score column " + "in bed format output (w, u or f)", + false, level_code); opt_parse.add_opt("more-levels", 'M', "report more methylation information", false, report_more_information); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (sort_data_if_needed && !load_entire_file) { - cerr << "cannot sort data unless all data is loaded" << endl; + cerr << "cannot sort data unless all data is loaded\n"; return EXIT_SUCCESS; } if (level_code != "w" && level_code != "u" && level_code != "f") { - cerr << "selected level must be in {w, u, f}" << endl; + cerr << "selected level must be in {w, u, f}\n"; return EXIT_SUCCESS; } const string regions_file = 
leftover_args.front(); @@ -594,7 +575,7 @@ main_multimethstat(int argc, char *argv[]) { /****************** END COMMAND LINE OPTIONS *****************/ if (VERBOSE) - cerr << "loading regions" << endl; + cerr << "loading regions\n"; const size_t n_columns = check_bed_format(regions_file); // MAGIC: this allows for exactly 3 or at least 6 columns in the @@ -607,42 +588,38 @@ main_multimethstat(int argc, char *argv[]) { if (!is_sorted(begin(regions), end(regions))) { if (sort_data_if_needed) { if (VERBOSE) - cerr << "sorting regions" << endl; + cerr << "sorting regions\n"; sort(begin(regions), end(regions)); } - else throw runtime_error("regions not sorted in file: " + regions_file); + else + throw runtime_error("regions not sorted in file: " + regions_file); } - if (n_columns == 3) // then we should name the regions + if (n_columns == 3) // then we should name the regions for (size_t i = 0; i < regions.size(); ++i) regions[i].set_name(default_name_prefix + std::to_string(i)); if (VERBOSE) - cerr << "[n_regions=" << regions.size() << "]" << endl; + cerr << "[n_regions=" << regions.size() << "]\n"; std::ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); if (load_entire_file) - process_with_cpgs_loaded(VERBOSE, // sort_data_if_needed, - PRINT_NUMERIC_ONLY, - report_more_information, - level_code[0], - cpgs_file, regions, out); + process_with_cpgs_loaded(VERBOSE, // sort_data_if_needed, + PRINT_NUMERIC_ONLY, report_more_information, + level_code[0], cpgs_file, regions, out); else - process_with_cpgs_on_disk(PRINT_NUMERIC_ONLY, - report_more_information, - level_code[0], - cpgs_file, regions, out); - } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; + process_with_cpgs_on_disk(PRINT_NUMERIC_ONLY, report_more_information, + level_code[0], cpgs_file, regions, out); } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From 92ece4d4b0ff8759b1e1592e58f17e538cf2b3c7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:31 -0800 Subject: [PATCH 016/106] src/analysis/nanopore.cpp: changes to add static analysis --- src/analysis/nanopore.cpp | 75 ++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 7d41ff07..29a3438f 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -16,11 +16,24 @@ * more details. 
*/ +#include "counts_header.hpp" + +#include "OptionParser.hpp" +#include "bam_record_utils.hpp" + +#include "bamxx.hpp" + +#include + #include #include #include +#include #include -#include +#include +#include +#include +#include #include #include #include @@ -31,15 +44,12 @@ #include #include #include +#include +#include +#include #include -#include "OptionParser.hpp" -#include "bam_record_utils.hpp" -#include "counts_header.hpp" -#include "dnmt_error.hpp" - -/* HTSlib */ -#include +// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-pro-bounds-constant-array-index,*-pro-bounds-pointer-arithmetic) [[nodiscard]] inline bool is_cytosine(const char c) { @@ -115,7 +125,6 @@ get_basecall_model(const bamxx::bam_header &hdr) { const std::vector parts( (std::istream_iterator(buffer)), {}); - std::string basecall_model; for (const auto &key_val : parts) { const auto equal_pos = key_val.find('='); if (equal_pos == std::string::npos) @@ -346,7 +355,7 @@ next_mod_pos(T &b, const T e) -> std::int32_t { b = std::find_if(b, e, isdig); if (b == e) return -1; - auto r = atoi(b); + auto r = atoi(b); // NOLINT(cert-err34-c) b = std::find_if_not(b, e, isdig); return r; }; @@ -469,7 +478,7 @@ static const std::uint8_t encoding[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 }; static constexpr auto n_dinucs = 16u; -static const auto dinucs = std::vector{ +static const auto dinucs = std::vector{ // NOLINT(cert-err58-cpp) "AA", "AC", "AG", "AT", "CA", "CC", "CG", "CT", "GA", "GC", "GG", "GT", @@ -790,7 +799,7 @@ struct read_processor { bool symmetric{}; bool cpg_only{}; bool force{}; - std::uint32_t n_threads{1}; + std::int32_t n_threads{1}; int strand{0}; std::string expected_basecall_model{}; @@ -855,7 +864,7 @@ struct read_processor { const std::vector &counts) const { static constexpr auto out_fmt = "%ld\t%c\t%s\t%.6g\t%d\t%.6g\t%.6g\n"; static constexpr auto buf_size = 1024; - char buffer[buf_size]; + std::array buffer{}; // Put chrom name in buffer and then skip that part for each site because // it doesn't change. @@ -864,15 +873,14 @@ struct read_processor { throw std::runtime_error("failed to identify chrom for tid: " + std::to_string(tid)); const auto chrom_name_offset = - std::snprintf(buffer, buf_size, "%s\t", chrom_name); + std::snprintf(buffer.data(), buf_size, "%s\t", chrom_name); if (chrom_name_offset < 0) throw std::runtime_error("failed to write to output buffer"); - auto buffer_after_chrom = buffer + chrom_name_offset; + auto buffer_after_chrom = buffer.data() + chrom_name_offset; for (auto chrom_posn = 0ul; chrom_posn < std::size(chrom); ++chrom_posn) { const char base = chrom[chrom_posn]; if (is_cytosine(base) || is_guanine(base)) { - const std::uint32_t the_tag = get_tag_from_genome(chrom, chrom_posn); if (cpg_only && the_tag != 0) continue; @@ -902,7 +910,7 @@ struct read_processor { if (r < 0) throw std::runtime_error("failed to write to output buffer"); - out.write(buffer); + out.write(buffer.data()); } } } @@ -913,7 +921,7 @@ struct read_processor { const std::vector &counts) const { static constexpr auto out_fmt = "%ld\t+\tCpG\t%.6g\t%d\t%.6g\t%.6g\n"; static constexpr auto buf_size = 1024; - char buffer[buf_size]; + std::array buffer{}; // Put chrom name in buffer and then skip that part for each site because // it doesn't change. 
@@ -922,10 +930,10 @@ struct read_processor {
       throw std::runtime_error("failed to identify chrom for tid: " +
                                std::to_string(tid));
     const auto chrom_name_offset =
-      std::snprintf(buffer, buf_size, "%s\t", chrom_name);
+      std::snprintf(buffer.data(), buf_size, "%s\t", chrom_name);
     if (chrom_name_offset < 0)
       throw std::runtime_error("failed to write to output buffer");
-    auto buffer_after_chrom = buffer + chrom_name_offset;
+    auto buffer_after_chrom = buffer.data() + chrom_name_offset;

     bool prev_was_c{false};
     std::uint32_t n_reads_pos{};
@@ -969,7 +977,7 @@ struct read_processor {

         if (r < 0)
           throw std::runtime_error("failed to write to output buffer");
-        out.write(buffer);
+        out.write(buffer.data());
       }
       prev_was_c = false;
     }
@@ -997,7 +1005,7 @@ struct read_processor {
                    [](const char c) { return std::toupper(c); });
     if (verbose)
       std::cerr << "[n chroms in reference: " << std::size(chroms) << "]"
-                << std::endl;
+                << '\n';

     const auto chroms_beg = std::cbegin(chroms);
     std::unordered_map name_to_idx;
@@ -1051,13 +1059,13 @@ struct read_processor {
     if (verbose)
       std::cerr << "[observed basecall model: "
                 << (basecall_model.empty() ? "NA" : basecall_model) << "]"
-                << std::endl;
+                << '\n';
     if (!expected_basecall_model.empty() &&
         basecall_model != expected_basecall_model) {
       std::cerr << "failed to match basecall model:\n"
                 << "observed="
                 << (basecall_model.empty() ? "NA" : basecall_model) << "\n"
-                << "expected=" << expected_basecall_model_str() << std::endl;
+                << "expected=" << expected_basecall_model_str() << '\n';
       return {{}, {}};
     }

@@ -1109,7 +1117,7 @@ struct read_processor {
                       std::string(sam_hdr_tid2name(hdr, tid)));

       if (show_progress && current_target_present)
-        std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << std::endl;
+        std::cerr << "processing " << sam_hdr_tid2name(hdr, tid) << '\n';

       // reset the counts
       counts.clear();
@@ -1215,10 +1223,9 @@ check_modification_sites(const std::string &infile,
 }

 int
-main_nanocount(int argc, char *argv[]) { // NOLINT
+main_nanocount(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
   static constexpr auto n_reads_to_check = 1000;
   try {
-
     read_processor rp;

     std::string chroms_file;
@@ -1226,7 +1233,7 @@ main_nanocount(int argc, char *argv[]) { // NOLINT
     std::string stats_file;

     /****************** COMMAND LINE OPTIONS ********************/
-    OptionParser opt_parse(std::filesystem::path{argv[0]}.filename(),
+    OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic)
                            "get methylation levels from mapped nanopore reads",
                            "-c ");
     opt_parse.add_opt("threads", 't', "threads to use (few needed)", false,
@@ -1259,12 +1266,12 @@ main_nanocount(int argc, char *argv[]) { // NOLINT
     opt_parse.parse(argc, argv, leftover_args);
     if (opt_parse.about_requested() || opt_parse.help_requested() ||
         leftover_args.empty()) {
-      std::cerr << opt_parse.help_message() << std::endl
-                << opt_parse.about_message() << std::endl;
+      std::cerr << opt_parse.help_message() << '\n'
+                << opt_parse.about_message() << '\n';
       return EXIT_SUCCESS;
     }
     if (opt_parse.option_missing()) {
-      std::cerr << opt_parse.option_missing_message() << std::endl;
+      std::cerr << opt_parse.option_missing_message() << '\n';
       return EXIT_SUCCESS;
     }
     const std::string mapped_reads_file = leftover_args.front();
@@ -1277,7 +1284,7 @@ main_nanocount(int argc, char *argv[]) { // NOLINT
       rp.cpg_only = true;

     if (rp.n_threads <= 0)
-      throw std::runtime_error("thread count cannot be negative");
+      throw std::runtime_error("thread count must be positive");

     std::ostringstream cmd;
     std::copy(argv, argv + argc,
std::ostream_iterator(cmd, " ")); @@ -1306,8 +1313,10 @@ main_nanocount(int argc, char *argv[]) { // NOLINT } } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-pro-bounds-constant-array-index,*-pro-bounds-pointer-arithmetic) From 66c28305af6a61da81fac73d70044fb1cd768059 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 017/106] src/analysis/pmd.cpp: changes to add static analysis --- src/analysis/pmd.cpp | 811 ++++++++++++++++++++----------------------- 1 file changed, 378 insertions(+), 433 deletions(-) diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index ba6dedeb..1c237b3f 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -14,58 +14,70 @@ * GNU General Public License for more details. */ -#include +#include "GenomicRegion.hpp" +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "TwoStateHMM_PMD.hpp" +#include "bsutils.hpp" +#include "counts_header.hpp" +#include "smithlab_utils.hpp" #include -#include +#include + +#include +#include #include +#include +#include #include #include +#include +#include +#include #include -#include -#include +#include +#include #include #include +#include +#include +#include +#include +#include -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "GenomicRegion.hpp" -#include "OptionParser.hpp" -#include "bsutils.hpp" -#include "counts_header.hpp" - -#include "TwoStateHMM_PMD.hpp" -#include "MSite.hpp" +// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-init-variables,*-narrowing-conversions,*-owning-memory,*-pointer-arithmetic) -using std::string; -using std::vector; -using std::cout; -using std::endl; using std::cerr; -using std::numeric_limits; +using std::cout; +using std::make_pair; using std::max; using std::min; -using std::make_pair; -using std::pair; +using std::numeric_limits; using std::runtime_error; +using std::string; +using std::vector; -using std::ostream; using std::ofstream; +using std::ostream; using std::to_string; using bamxx::bgzf_file; -template using num_lim = std::numeric_limits; +template using num_lim = std::numeric_limits; struct pmd_summary { - pmd_summary(const vector &pmds) { - pmd_count = pmds.size(); + explicit pmd_summary(const vector &pmds) : + pmd_count{std::size(pmds)} { + // NOLINTBEGIN(*-prefer-member-initializer) pmd_total_size = accumulate(cbegin(pmds), cend(pmds), 0ul, [](const uint64_t t, const GenomicRegion &p) { - return t + p.get_width(); }); - pmd_mean_size = - static_cast(pmd_total_size)/std::max(pmd_count, static_cast(1)); + return t + p.get_width(); + }); + pmd_mean_size = static_cast(pmd_total_size) / + std::max(pmd_count, static_cast(1)); + // NOLINTEND(*-prefer-member-initializer) } // pmd_count is the number of identified PMDs. 
uint64_t pmd_count{}; @@ -74,12 +86,13 @@ struct pmd_summary { // mean_pmd_size is the mean size of the identified PMDs double pmd_mean_size{}; - string tostring() { + string + tostring() { std::ostringstream oss; - oss << "pmd_count: " << pmd_count << endl - << "pmd_total_size: " << pmd_total_size << endl - << "pmd_mean_size: " - << std::fixed << std::setprecision(2) << pmd_mean_size; + oss << "pmd_count: " << pmd_count << '\n' + << "pmd_total_size: " << pmd_total_size << '\n' + << "pmd_mean_size: " << std::fixed << std::setprecision(2) + << pmd_mean_size; return oss.str(); } @@ -93,28 +106,20 @@ get_adjacent_distances(const vector &pmds, dists.push_back(pmds[i].get_start() - pmds[i - 1].get_end()); } - static bool -precedes(const string &chrom, const size_t position, - const GenomicRegion &r) { +precedes(const string &chrom, const size_t position, const GenomicRegion &r) { return (chrom < r.get_chrom() || - (chrom == r.get_chrom() && - position < r.get_start())); + (chrom == r.get_chrom() && position < r.get_start())); } - static bool -succeeds(const string &chrom, const size_t position, - const GenomicRegion &r) { +succeeds(const string &chrom, const size_t position, const GenomicRegion &r) { return (r.get_chrom() < chrom || - (chrom == r.get_chrom() && - r.get_end() <= position)); + (chrom == r.get_chrom() && r.get_end() <= position)); } - static void -merge_nearby_pmd(const size_t max_merge_dist, - vector &pmds) { +merge_nearby_pmd(const size_t max_merge_dist, vector &pmds) { size_t j = 0; for (size_t i = 1; i < pmds.size(); ++i) { if (pmds[j].same_chrom(pmds[i]) && @@ -123,7 +128,8 @@ merge_nearby_pmd(const size_t max_merge_dist, const string combined_name(pmds[j].get_name() + pmds[i].get_name()); pmds[j].set_name(combined_name); } - else pmds[++j] = pmds[i]; + else + pmds[++j] = pmds[i]; } pmds.resize(j); } @@ -132,23 +138,17 @@ inline double beta_max_likelihood(const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, const double p_low, const double p_hi) { - return (fg_alpha - 1.0)*log(p_low) + - (fg_beta - 1.0)*log(1.0 - p_low) - - gsl_sf_lnbeta(fg_alpha, fg_beta) + - (bg_alpha - 1.0)*log(p_hi) + - (bg_beta - 1.0)*log(1.0 - p_hi) - - gsl_sf_lnbeta(bg_alpha, bg_beta); + return (fg_alpha - 1.0) * log(p_low) + (fg_beta - 1.0) * log(1.0 - p_low) - + gsl_sf_lnbeta(fg_alpha, fg_beta) + (bg_alpha - 1.0) * log(p_hi) + + (bg_beta - 1.0) * log(1.0 - p_hi) - gsl_sf_lnbeta(bg_alpha, bg_beta); } static size_t find_best_bound(const bool IS_RIGHT_BOUNDARY, - std::map > &pos_meth_tot, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta) { - - vector > meth_tot; + std::map> &pos_meth_tot, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta) { + vector> meth_tot; vector positions; for (auto &&it : pos_meth_tot) { positions.push_back(it.first); @@ -160,18 +160,18 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY, vector cumu_right_meth(meth_tot.size(), 0); vector cumu_right_tot(meth_tot.size(), 0); if (meth_tot.size() > 0) - for (size_t i = 1; i < meth_tot.size()-1; ++i) { + for (size_t i = 1; i < meth_tot.size() - 1; ++i) { const size_t j = meth_tot.size() - 1 - i; - cumu_left_meth[i] = cumu_left_meth[i-1] + meth_tot[i-1].first; - cumu_left_tot[i] = cumu_left_tot[i-1] + meth_tot[i-1].second; - cumu_right_meth[j] = cumu_right_meth[j+1] + meth_tot[j+1].first; - cumu_right_tot[j] = cumu_right_tot[j+1] + meth_tot[j+1].second; + cumu_left_meth[i] = cumu_left_meth[i - 1] + 
meth_tot[i - 1].first; + cumu_left_tot[i] = cumu_left_tot[i - 1] + meth_tot[i - 1].second; + cumu_right_meth[j] = cumu_right_meth[j + 1] + meth_tot[j + 1].first; + cumu_right_tot[j] = cumu_right_tot[j + 1] + meth_tot[j + 1].second; } size_t best_idx = 0; double best_score = -num_lim::max(); if (meth_tot.size() > 0) - for (size_t i = 1; i < meth_tot.size()-1; ++i) { + for (size_t i = 1; i < meth_tot.size() - 1; ++i) { size_t N_low, k_low, N_hi, k_hi; if (!IS_RIGHT_BOUNDARY) { N_low = cumu_right_tot[i] + meth_tot[i].second; @@ -187,14 +187,13 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY, } if (N_hi > 0 && N_low > 0) { double score = 0; - const double p_hi = static_cast(k_hi)/N_hi; - const double p_low = static_cast(k_low)/N_low; + const double p_hi = static_cast(k_hi) / N_hi; + const double p_low = static_cast(k_low) / N_low; for (size_t j = 0; j < fg_alpha.size(); ++j) { - score += beta_max_likelihood(fg_alpha[j], fg_beta[j], - bg_alpha[j], bg_beta[j], - p_low, p_hi); - } // beta max likelihood using learned emissions + score += beta_max_likelihood(fg_alpha[j], fg_beta[j], bg_alpha[j], + bg_beta[j], p_low, p_hi); + } // beta max likelihood using learned emissions score /= fg_alpha.size(); if (p_hi > p_low && score > best_score) { best_idx = i; @@ -202,62 +201,58 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY, } } } - return (best_score > -num_lim::max()) ? - positions[best_idx] : num_lim::max(); + return (best_score > -num_lim::max()) ? positions[best_idx] + : num_lim::max(); } - static void get_boundary_positions(vector &bounds, const vector &pmds, const size_t &bin_size) { for (size_t i = 0; i < pmds.size(); ++i) { bounds.push_back(pmds[i]); - pmds[i].get_start() > bin_size ? - bounds.back().set_start(pmds[i].get_start() - bin_size) : - bounds.back().set_start(0); + pmds[i].get_start() > bin_size + ? bounds.back().set_start(pmds[i].get_start() - bin_size) + : bounds.back().set_start(0); bounds.back().set_end(pmds[i].get_start() + bin_size); bounds.push_back(pmds[i]); - pmds[i].get_end() > bin_size ? - bounds.back().set_start(pmds[i].get_end() - bin_size) : - bounds.back().set_start(0); + pmds[i].get_end() > bin_size + ? 
bounds.back().set_start(pmds[i].get_end() - bin_size) + : bounds.back().set_start(0); bounds.back().set_end(pmds[i].get_end() + bin_size); } } static void -get_optimized_boundary_likelihoods(const vector &cpgs_file, - vector &bounds, - const vector &array_status, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - vector &boundary_scores, - vector &boundary_certainties) { +get_optimized_boundary_likelihoods( + const vector &cpgs_file, vector &bounds, + const vector &array_status, const vector &fg_alpha, + const vector &fg_beta, const vector &bg_alpha, + const vector &bg_beta, vector &boundary_scores, + vector &boundary_certainties) { // MAGIC NUMBER FOR WEIGHTING ARRAY // CONTRIBUTION TO BOUNDARY OBSERVATIONS static const double array_coverage_constant = 10; - vector in(cpgs_file.size()); + vector in(cpgs_file.size()); for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); if (get_has_counts_header(cpgs_file[i])) - skip_counts_header(*in[i]); + skip_counts_header(*in[i]); // cppcheck-suppress shadowVariable } - std::map > pos_meth_tot; - size_t n_meth = 0ul, n_reads = 0ul; - size_t bound_idx = 0; - for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary + std::map> pos_meth_tot; + size_t n_meth{}; + size_t n_reads{}; + size_t bound_idx{}; + for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary for (size_t i = 0; i < in.size(); ++i) { // get totals for all CpGs overlapping that boundary MSite site; while (read_site(*in[i], site) && !succeeds(site.chrom, site.pos, bounds[bound_idx])) { - if (array_status[i]) site.n_reads = array_coverage_constant; @@ -279,9 +274,9 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, } auto it(pos_meth_tot.find(site.pos)); if (it == end(pos_meth_tot)) - pos_meth_tot[site.pos] = make_pair(n_meth, n_reads); + pos_meth_tot[site.pos] = std::make_pair(n_meth, n_reads); - else { // add this file's contribution to the site's methylation + else { // add this file's contribution to the site's methylation pos_meth_tot[site.pos].first += n_meth; pos_meth_tot[site.pos].second += n_reads; } @@ -291,7 +286,7 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, // Get the boundary position size_t boundary_position = - (bounds[bound_idx].get_start() + bounds[bound_idx].get_end())/2; + (bounds[bound_idx].get_start() + bounds[bound_idx].get_end()) / 2; size_t N_low = 0, k_low = 0, N_hi = 0, k_hi = 0; for (auto it = begin(pos_meth_tot); it != end(pos_meth_tot); ++it) { @@ -299,31 +294,27 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, N_low += it->second.second; k_low += it->second.first; } - else{ + else { N_hi += it->second.second; k_hi += it->second.first; } } double score = 0; - const double p_hi = static_cast(k_hi)/N_hi; - const double p_low = static_cast(k_low)/N_low; + const double p_hi = static_cast(k_hi) / N_hi; + const double p_low = static_cast(k_low) / N_low; - if (bound_idx % 2) { // its a right boundary, p_low should go with fg + if (bound_idx % 2) { // its a right boundary, p_low should go with fg for (size_t j = 0; j < fg_alpha.size(); ++j) - score += beta_max_likelihood(fg_alpha[j], fg_beta[j], - bg_alpha[j], bg_beta[j], - p_low, p_hi); + score += beta_max_likelihood(fg_alpha[j], fg_beta[j], bg_alpha[j], + bg_beta[j], p_low, p_hi); } - else { // its a left boundary, p_low should go with bg + else { // its a left boundary, p_low should go with bg for (size_t j = 0; j < fg_alpha.size(); ++j) - score += 
beta_max_likelihood(bg_alpha[j], bg_beta[j], - fg_alpha[j], fg_beta[j], - p_low, p_hi); - - + score += beta_max_likelihood(bg_alpha[j], bg_beta[j], fg_alpha[j], + fg_beta[j], p_low, p_hi); } - boundary_certainties.push_back(std::min(N_low,N_hi)); + boundary_certainties.push_back(std::min(N_low, N_hi)); score /= fg_alpha.size(); boundary_scores.push_back(exp(score)); pos_meth_tot.clear(); @@ -333,38 +324,34 @@ get_optimized_boundary_likelihoods(const vector &cpgs_file, delete fp; } - static void -find_exact_boundaries(const vector &cpgs_file, - vector &bounds, - const vector &array_status, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - vector &bound_site) { +find_exact_boundaries( + const vector &cpgs_file, vector &bounds, + const vector &array_status, const vector &fg_alpha, + const vector &fg_beta, const vector &bg_alpha, + const vector &bg_beta, vector &bound_site) { // MAGIC NUMBER FOR WEIGHTING ARRAY // CONTRIBUTION TO BOUNDARY OBSERVATIONS static const double array_coverage_constant = 10; - vector in(cpgs_file.size()); + vector in(cpgs_file.size()); for (size_t i = 0; i < cpgs_file.size(); ++i) { in[i] = new bgzf_file(cpgs_file[i], "r"); if (get_has_counts_header(cpgs_file[i])) - skip_counts_header(*in[i]); + skip_counts_header(*in[i]); // cppcheck-suppress shadowVariable } - std::map > pos_meth_tot; - size_t n_meth = 0ul, n_reads = 0ul; - size_t bound_idx = 0; - for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary + std::map> pos_meth_tot; + size_t n_meth{}; + size_t n_reads{}; + size_t bound_idx{}; + for (; bound_idx < bounds.size(); ++bound_idx) { // for each boundary for (size_t i = 0; i < in.size(); ++i) { // get totals for all CpGs overlapping that boundary MSite site; while (read_site(*in[i], site) && !succeeds(site.chrom, site.pos, bounds[bound_idx])) { - if (array_status[i]) site.pos = array_coverage_constant; @@ -385,52 +372,48 @@ find_exact_boundaries(const vector &cpgs_file, n_reads = site.n_reads; } auto it = pos_meth_tot.find(site.pos); - if (it == end(pos_meth_tot)) {// does not exist in map - pos_meth_tot.emplace(site.pos, make_pair(n_meth, n_reads)); + if (it == end(pos_meth_tot)) { // does not exist in map + pos_meth_tot.emplace(site.pos, std::make_pair(n_meth, n_reads)); } - else { // add this file's contribution to the CpG's methylation + else { // add this file's contribution to the CpG's methylation pos_meth_tot[site.pos].first += site.n_meth(); pos_meth_tot[site.pos].second += site.n_reads; } } } } - bound_site.push_back(find_best_bound(bound_idx % 2, pos_meth_tot, - fg_alpha, fg_beta, - bg_alpha, bg_beta)); + bound_site.push_back(find_best_bound(bound_idx % 2, pos_meth_tot, fg_alpha, + fg_beta, bg_alpha, bg_beta)); pos_meth_tot.clear(); } - for (size_t i = 0; i < in.size(); ++i) + + for (size_t i = 0; i < std::size(in); ++i) delete in[i]; } - static void -optimize_boundaries(const size_t bin_size, - const vector &cpgs_file, +optimize_boundaries(const size_t bin_size, const vector &cpgs_file, vector &pmds, const vector &array_status, const vector &fg_alpha, const vector &fg_beta, const vector &bg_alpha, const vector &bg_beta) { - vector bounds; get_boundary_positions(bounds, pmds, bin_size); vector bound_site; - find_exact_boundaries(cpgs_file, bounds, array_status, fg_alpha, - fg_beta, bg_alpha, bg_beta, - bound_site); + find_exact_boundaries(cpgs_file, bounds, array_status, fg_alpha, fg_beta, + bg_alpha, bg_beta, bound_site); 
//////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// ///// NOW RESET THE STARTS AND ENDS OF PMDs ///// for (size_t i = 0; i < pmds.size(); ++i) { - const size_t start_site = bound_site[2*i]; + const size_t start_site = bound_site[2 * i]; if (start_site != num_lim::max()) pmds[i].set_start(start_site); - const size_t end_site = bound_site[2*i + 1]; + const size_t end_site = bound_site[2 * i + 1]; if (end_site != num_lim::max()) pmds[i].set_end(end_site + 1); } @@ -447,41 +430,39 @@ optimize_boundaries(const size_t bin_size, //////////////////////////////////////////////////////////////////////// // NEED TO USE SOME RANDOMIZATION METHOD HERE TO FIGURE OUT THE // MERGING DISTANCE - vector > dist_hist; + vector> dist_hist; size_t first = 0; for (size_t i = 1; i < dists.size(); ++i) if (dists[i] != dists[i - 1]) { - dist_hist.push_back(make_pair(dists[i - 1], i - first)); + dist_hist.push_back(std::make_pair(dists[i - 1], i - first)); first = i; } - merge_nearby_pmd(2*bin_size, pmds); + merge_nearby_pmd(2 * bin_size, pmds); /////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////// // LAST, GET THE CPG SITES WITHIN 1 BIN OF EACH BOUNDARY AND COMPUTE // THE LIKELIHOOD TO GET A "SCORE" ON THE BOUNDARY /////// - bounds.clear(); // need updated boundaries after merging nearby PMDs + bounds.clear(); // need updated boundaries after merging nearby PMDs get_boundary_positions(bounds, pmds, bin_size); vector boundary_scores; vector boundary_certainties; - get_optimized_boundary_likelihoods(cpgs_file, bounds, array_status, - fg_alpha, fg_beta, bg_alpha, - bg_beta, boundary_scores, - boundary_certainties); + get_optimized_boundary_likelihoods(cpgs_file, bounds, array_status, fg_alpha, + fg_beta, bg_alpha, bg_beta, + boundary_scores, boundary_certainties); // Add the boundary scores to the PMD names for (size_t i = 0; i < pmds.size(); ++i) - pmds[i].set_name(pmds[i].get_name() - + ":" + to_string(boundary_scores[2*i]) - + ":" + to_string(boundary_certainties[2*i]) - + ":" + to_string(boundary_scores[2*i+1]) - + ":" + to_string(boundary_certainties[2*i+1])); + pmds[i].set_name(pmds[i].get_name() + ":" + + to_string(boundary_scores[2 * i]) + ":" + + to_string(boundary_certainties[2 * i]) + ":" + + to_string(boundary_scores[2 * i + 1]) + ":" + + to_string(boundary_certainties[2 * i + 1])); } - double get_score_cutoff_for_fdr(const vector &scores, const double fdr) { if (fdr <= 0) @@ -492,24 +473,22 @@ get_score_cutoff_for_fdr(const vector &scores, const double fdr) { std::sort(begin(local), end(local)); size_t i = 0; for (; i < local.size() - 1 && - local[i+1] < fdr*static_cast(i+1)/local.size(); ++i); - return local[i] + 1.0/scores.size(); + local[i + 1] < fdr * static_cast(i + 1) / local.size(); + ++i) + ; + return local[i] + 1.0 / scores.size(); } - static inline double -score_contribution(const pair &m) { +score_contribution(const std::pair &m) { const double denom = m.first + m.second; - return (denom > 0) ? 1.0 - m.first/denom : 0.0; + return (denom > 0) ? 
1.0 - m.first / denom : 0.0; } - static void get_domain_scores(const vector &classes, - const vector > > &meth, - const vector &reset_points, - vector &scores) { - + const vector>> &meth, + const vector &reset_points, vector &scores) { const size_t n_replicates = meth.size(); size_t reset_idx = 1; bool in_domain = false; @@ -524,7 +503,7 @@ get_domain_scores(const vector &classes, ++reset_idx; } if (classes[i]) { - for (size_t r = 0; r < n_replicates ; ++r) + for (size_t r = 0; r < n_replicates; ++r) score += score_contribution(meth[r][i]); in_domain = true; } @@ -538,13 +517,10 @@ get_domain_scores(const vector &classes, scores.push_back(score); } - static void build_domains(const vector &bins, - const vector &reset_points, - const vector &classes, + const vector &reset_points, const vector &classes, vector &domains) { - size_t n_bins = 0, reset_idx = 1, prev_end = 0; bool in_domain = false; for (size_t i = 0; i < classes.size(); ++i) { @@ -574,12 +550,12 @@ build_domains(const vector &bins, } } - -//Modified to take multiple replicates -template static void +// Modified to take multiple replicates +template +static void separate_regions(const size_t desert_size, - vector > &bins, - vector > &meth, vector> &reads, + vector> &bins, + vector> &meth, vector> &reads, vector &reset_points, vector &dists_btwn_bins) { const size_t n_replicates = bins.size(); @@ -591,8 +567,10 @@ separate_regions(const size_t desert_size, bool all_empty = true; size_t rep_idx = 0; while (all_empty && rep_idx < n_replicates) { - if (reads[rep_idx][i] == 0) ++rep_idx; - else all_empty = false; + if (reads[rep_idx][i] == 0) + ++rep_idx; + else + all_empty = false; } if (!all_empty) { dists_btwn_bins.push_back(bins[0][i].get_start() - end_coord_of_prev); @@ -615,8 +593,9 @@ separate_regions(const size_t desert_size, // segregate bins size_t prev_cpg = 0; for (size_t i = 0; i < bins[0].size(); ++i) { - const size_t dist = (i > 0 && bins[0][i].same_chrom(bins[0][i - 1])) ? - bins[0][i].get_start() - prev_cpg : num_lim::max(); + const size_t dist = (i > 0 && bins[0][i].same_chrom(bins[0][i - 1])) + ? 
bins[0][i].get_start() - prev_cpg + : num_lim::max(); if (dist > desert_size) reset_points.push_back(i); prev_cpg = bins[0][i].get_start(); @@ -625,36 +604,31 @@ separate_regions(const size_t desert_size, reset_points.push_back(bins[0].size()); } - static void -shuffle_bins(const size_t rng_seed, - const TwoStateHMM &hmm, - vector > > meth, +shuffle_bins(const size_t rng_seed, const TwoStateHMM &hmm, + vector>> meth, const vector &reset_points, const vector &start_trans, - const vector > &trans, - const vector &end_trans, - const vector &fg_alpha, const vector &fg_beta, - const vector &bg_alpha, const vector &bg_beta, - vector &domain_scores, - vector &array_status) { - + const vector> &trans, + const vector &end_trans, const vector &fg_alpha, + const vector &fg_beta, const vector &bg_alpha, + const vector &bg_beta, vector &domain_scores, + const vector &array_status) { size_t n_replicates = meth.size(); auto eng = std::default_random_engine(rng_seed); - for (size_t r =0 ; r < n_replicates; ++r) + for (size_t r = 0; r < n_replicates; ++r) std::shuffle(begin(meth[r]), end(meth[r]), eng); vector classes; vector scores; - hmm.PosteriorDecoding_rep(meth, reset_points, start_trans, trans, - end_trans, fg_alpha, fg_beta, bg_alpha, - bg_beta, classes, scores, array_status); + hmm.PosteriorDecoding_rep(meth, reset_points, start_trans, trans, end_trans, + fg_alpha, fg_beta, bg_alpha, bg_beta, classes, + scores, array_status); get_domain_scores(classes, meth, reset_points, domain_scores); sort(begin(domain_scores), end(domain_scores)); } - static void assign_p_values(const vector &random_scores, const vector &observed_scores, @@ -667,100 +641,81 @@ assign_p_values(const vector &random_scores, } } - static void -read_params_file(const bool verbose, - const string ¶ms_file, - double &fg_alpha, - double &fg_beta, - double &bg_alpha, - double &bg_beta, - vector &start_trans, - vector > &trans, - vector &end_trans, +read_params_file(const bool verbose, const string ¶ms_file, + double &fg_alpha, double &fg_beta, double &bg_alpha, + double &bg_beta, vector &start_trans, + vector> &trans, vector &end_trans, double &fdr_cutoff) { string jnk; std::ifstream in(params_file.c_str()); - in >> jnk >> fg_alpha - >> jnk >> fg_beta - >> jnk >> bg_alpha - >> jnk >> bg_beta - >> jnk >> start_trans[0] - >> jnk >> start_trans[1] - >> jnk >> trans[0][0] - >> jnk >> trans[0][1] - >> jnk >> trans[1][0] - >> jnk >> trans[1][1] - >> jnk >> end_trans[0] - >> jnk >> end_trans[1] - >> jnk >> fdr_cutoff; + in >> jnk >> fg_alpha >> jnk >> fg_beta >> jnk >> bg_alpha >> jnk >> + bg_beta >> jnk >> start_trans[0] >> jnk >> start_trans[1] >> jnk >> + trans[0][0] >> jnk >> trans[0][1] >> jnk >> trans[1][0] >> jnk >> + trans[1][1] >> jnk >> end_trans[0] >> jnk >> end_trans[1] >> jnk >> + fdr_cutoff; if (verbose) - cerr << "Read in params from " << params_file << endl - << "FG_ALPHA\t" << fg_alpha << endl - << "FG_BETA\t" << fg_beta << endl - << "BG_ALPHA\t" << bg_alpha << endl - << "BG_BETA\t" << bg_beta << endl - << "S_F\t" << start_trans[0] << endl - << "S_B\t" << start_trans[1] << endl - << "F_F\t" << trans[0][0] << endl - << "F_B\t" << trans[0][1] << endl - << "B_F\t" << trans[1][0] << endl - << "B_B\t" << trans[1][1] << endl - << "F_E\t" << end_trans[0] << endl - << "B_E\t" << end_trans[1] << endl - << "FDR_CUTOFF\t" << fdr_cutoff << endl; + cerr << "Read in params from " << params_file << '\n' + << "FG_ALPHA\t" << fg_alpha << '\n' + << "FG_BETA\t" << fg_beta << '\n' + << "BG_ALPHA\t" << bg_alpha << '\n' + << "BG_BETA\t" << bg_beta 
<< '\n' + << "S_F\t" << start_trans[0] << '\n' + << "S_B\t" << start_trans[1] << '\n' + << "F_F\t" << trans[0][0] << '\n' + << "F_B\t" << trans[0][1] << '\n' + << "B_F\t" << trans[1][0] << '\n' + << "B_B\t" << trans[1][1] << '\n' + << "F_E\t" << end_trans[0] << '\n' + << "B_E\t" << end_trans[1] << '\n' + << "FDR_CUTOFF\t" << fdr_cutoff << '\n'; } - static void write_posteriors_file(const string &posteriors_file, - const vector > &bins, + const vector> &bins, const vector &scores) { static const size_t decimal_precision = 10; ofstream out(posteriors_file); out.precision(decimal_precision); for (size_t r = 0; r < scores.size(); ++r) - out << bins[0][r] << '\t' << scores[r] << endl; + out << bins[0][r] << '\t' << scores[r] << '\n'; } - static void -write_params_file(const string &outfile, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, +write_params_file(const string &outfile, const vector &fg_alpha, + const vector &fg_beta, const vector &bg_alpha, const vector &bg_beta, const vector &start_trans, - const vector > &trans, + const vector> &trans, const vector &end_trans) { static const size_t decimal_precision = 30; ofstream out(outfile); out.precision(decimal_precision); - for (size_t r =0; r < fg_alpha.size(); ++r) - out << "FG_ALPHA_" << r+1 << "\t" << std::setw(14) << fg_alpha[r] << "\t" - << "FG_BETA_" << r+1 << "\t" << std::setw(14) << fg_beta[r] << "\t" - << "BG_ALPHA_" << r+1 << "\t" << std::setw(14) << bg_alpha[r] << "\t" - << "BG_BETA_" << r+1 << "\t" << std::setw(14) << bg_beta[r] << endl; - - out << "S_F\t" << start_trans[0] << endl - << "S_B\t" << start_trans[1] << endl - << "F_F\t" << trans[0][0] << endl - << "F_B\t" << trans[0][1] << endl - << "B_F\t" << trans[1][0] << endl - << "B_B\t" << trans[1][1] << endl - << "F_E\t" << end_trans[0] << endl - << "B_E\t" << end_trans[1] << endl; + for (size_t r = 0; r < fg_alpha.size(); ++r) + out << "FG_ALPHA_" << r + 1 << "\t" << std::setw(14) << fg_alpha[r] << "\t" + << "FG_BETA_" << r + 1 << "\t" << std::setw(14) << fg_beta[r] << "\t" + << "BG_ALPHA_" << r + 1 << "\t" << std::setw(14) << bg_alpha[r] << "\t" + << "BG_BETA_" << r + 1 << "\t" << std::setw(14) << bg_beta[r] << '\n'; + + out << "S_F\t" << start_trans[0] << '\n' + << "S_B\t" << start_trans[1] << '\n' + << "F_F\t" << trans[0][0] << '\n' + << "F_B\t" << trans[0][1] << '\n' + << "B_F\t" << trans[1][0] << '\n' + << "B_B\t" << trans[1][1] << '\n' + << "F_E\t" << end_trans[0] << '\n' + << "B_E\t" << end_trans[1] << '\n'; } - static bool check_if_array_data(const string &infile) { - bgzf_file in(infile, "r"); - if (!in) throw std::runtime_error("bad file: " + infile); + if (!in) + throw std::runtime_error("bad file: " + infile); if (get_has_counts_header(infile)) skip_counts_header(in); @@ -773,18 +728,17 @@ check_if_array_data(const string &infile) { return (!(iss >> cov)); } - static void -load_array_data(const size_t bin_size, - const string &cpgs_file, +load_array_data(const size_t bin_size, const string &cpgs_file, vector &bins, - vector > &meth, + vector> &meth, vector &reads) { // MAGIC. GS: minimum value for array? 
static const double meth_min = 1.0e-2; bgzf_file in(cpgs_file, "r"); - if (!in) throw std::runtime_error("bad sites file: " + cpgs_file); + if (!in) + throw std::runtime_error("bad sites file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); @@ -796,11 +750,10 @@ load_array_data(const size_t bin_size, MSite site; while (read_site(in, site)) { - // TODO: MN: I think that the block below should be placed later - // in this scope. At this location, the methylation level of the - // first site in a new chrom is contributed to the last bin of the - // previous chrom. - if (site.n_reads > 0) { // its covered by a probe + // TODO(MN): I think that the block below should be placed later in this + // scope. At this location, the methylation level of the first site in a + // new chrom is contributed to the last bin of the previous chrom. + if (site.n_reads > 0) { // its covered by a probe ++num_probes_in_bin; if (site.meth < meth_min) array_meth_bin += meth_min; @@ -813,13 +766,13 @@ load_array_data(const size_t bin_size, if (curr_chrom != site.chrom) { if (!curr_chrom.empty()) { if (site.chrom < curr_chrom) - throw runtime_error("CpGs not sorted in file \"" - + cpgs_file + "\""); - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, - prev_pos + 1)); - meth.push_back(make_pair(array_meth_bin, num_probes_in_bin)); - if (num_probes_in_bin > 0) reads.push_back(1); - else reads.push_back(0); + throw runtime_error("CpGs not sorted in file \"" + cpgs_file + "\""); + bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, prev_pos + 1)); + meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); + if (num_probes_in_bin > 0) + reads.push_back(1); + else + reads.push_back(0); } curr_chrom = site.chrom; curr_pos = site.pos; @@ -830,25 +783,25 @@ load_array_data(const size_t bin_size, throw std::runtime_error("CpGs not sorted in file \"" + cpgs_file + "\""); } else if (site.pos > curr_pos + bin_size) { - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, - curr_pos + bin_size)); - meth.push_back(make_pair(array_meth_bin, num_probes_in_bin)); + bins.push_back( + SimpleGenomicRegion(curr_chrom, curr_pos, curr_pos + bin_size)); + meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); (num_probes_in_bin > 0) ? 
reads.push_back(1) : reads.push_back(0); array_meth_bin = 0.0; num_probes_in_bin = 0.0; curr_pos += bin_size; while (curr_pos + bin_size < site.pos) { - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, - curr_pos + bin_size)); + bins.push_back( + SimpleGenomicRegion(curr_chrom, curr_pos, curr_pos + bin_size)); reads.push_back(0); - meth.push_back(make_pair(0.0, 0.0)); + meth.push_back(std::make_pair(0.0, 0.0)); curr_pos += bin_size; } } } - if (site.meth != -1 ) { // its covered by a probe + if (site.meth != -1) { // its covered by a probe ++num_probes_in_bin; if (site.meth < meth_min) array_meth_bin += meth_min; @@ -859,28 +812,27 @@ load_array_data(const size_t bin_size, } prev_pos = site.pos; if (!curr_chrom.empty()) { - bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, - prev_pos + 1)); - meth.push_back(make_pair(array_meth_bin, num_probes_in_bin)); + bins.push_back(SimpleGenomicRegion(curr_chrom, curr_pos, prev_pos + 1)); + meth.push_back(std::make_pair(array_meth_bin, num_probes_in_bin)); if (num_probes_in_bin > 0) reads.push_back(1); - else reads.push_back(0); + else + reads.push_back(0); } } - static void load_wgbs_data(const size_t bin_size, const string &cpgs_file, vector &bins, - vector > &meth, - vector &reads) { - reads.clear(); // for safety + vector> &meth, vector &reads) { + reads.clear(); // for safety meth.clear(); bins.clear(); // ADS: loading data each iteration should be put outside loop bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("bad sites file: " + cpgs_file); + if (!in) + throw runtime_error("bad sites file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); @@ -894,14 +846,15 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, size_t sites_in_bin = 0ul; while (read_site(in, site)) { - if (curr_chrom != site.chrom) { // handle change of chrom - if (sites_in_bin > 0) bins.back().set_end(prev_pos); + if (curr_chrom != site.chrom) { // handle change of chrom + if (sites_in_bin > 0) + bins.back().set_end(prev_pos); if (chroms_seen.find(site.chrom) != end(chroms_seen)) throw runtime_error("sites not sorted"); chroms_seen.insert(site.chrom); curr_chrom = site.chrom; reads.push_back(0); - meth.push_back(make_pair(0.0, 0.0)); + meth.push_back(std::make_pair(0.0, 0.0)); bins.push_back(SimpleGenomicRegion(site.chrom, 0, bin_size)); sites_in_bin = 0; } @@ -911,7 +864,7 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, while (bins.back().get_end() < site.pos) { sites_in_bin = 0; reads.push_back(0); - meth.push_back(make_pair(0.0, 0.0)); + meth.push_back(std::make_pair(0.0, 0.0)); bins.push_back(SimpleGenomicRegion(site.chrom, bins.back().get_end(), bins.back().get_end() + bin_size)); } @@ -920,13 +873,13 @@ load_wgbs_data(const size_t bin_size, const string &cpgs_file, meth.back().second += site.n_unmeth(); sites_in_bin++; } - if (sites_in_bin > 0) bins.back().set_end(prev_pos); + if (sites_in_bin > 0) + bins.back().set_end(prev_pos); } - static void remove_empty_bins_at_chrom_start(vector &bins, - vector > &meth, + vector> &meth, vector &reads) { bool chrom_start = true; size_t j = 0; @@ -950,15 +903,15 @@ remove_empty_bins_at_chrom_start(vector &bins, reads.erase(begin(reads) + j, end(reads)); } - static void load_read_counts(const string &cpgs_file, const size_t bin_size, vector &reads) { - reads.clear(); // for safety + reads.clear(); // for safety // ADS: loading data each iteration should be put outside loop bgzf_file in(cpgs_file, "r"); - if (!in) throw runtime_error("bad methcounts 
file: " + cpgs_file); + if (!in) + throw runtime_error("bad methcounts file: " + cpgs_file); if (get_has_counts_header(cpgs_file)) skip_counts_header(in); @@ -970,7 +923,7 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, MSite site; while (read_site(in, site)) { - if (curr_chrom != site.chrom) { // handle change of chrom + if (curr_chrom != site.chrom) { // handle change of chrom if (chroms_seen.find(site.chrom) != end(chroms_seen)) throw runtime_error("sites not sorted"); chroms_seen.insert(site.chrom); @@ -986,18 +939,16 @@ load_read_counts(const string &cpgs_file, const size_t bin_size, } } - static double good_bins_frac(const vector &cumulative, const size_t min_bin_size, const size_t bin_size, const size_t min_cov_to_pass) { - // make sure the target bin size is a multiple of the minimum so we // have the resolution to construct the new bins assert(bin_size % min_bin_size == 0); // the step size corresponds to the number of minium sized bins that // would make up a new bin of the target size - const size_t step_size = bin_size/min_bin_size; + const size_t step_size = bin_size / min_bin_size; size_t passing_bins = 0, covered_bins = 0; @@ -1017,7 +968,7 @@ good_bins_frac(const vector &cumulative, const size_t min_bin_size, passing_bins += (bin_count >= min_cov_to_pass); } - return static_cast(passing_bins)/std::max(1ul, covered_bins); + return static_cast(passing_bins) / std::max(1ul, covered_bins); } static size_t @@ -1029,20 +980,17 @@ get_min_reads_for_confidence(const double conf_level) { // ADS: should be doubling first, followed by bisection while (1.0 - conf_level < upper - lower) { ++n_reads; - wilson_ci_for_binomial(1.0 - conf_level, n_reads, - fixed_phat, lower, upper); + wilson_ci_for_binomial(1.0 - conf_level, n_reads, fixed_phat, lower, upper); } return n_reads; } - // ADS: this function will return num_lim::max() if the // fraction of "good" bins is zero for all attempted bin sizes. static size_t binsize_selection(const size_t resolution, const size_t min_bin_sz, const size_t max_bin_sz, const double conf_level, const double min_frac_passed, const string &cpgs_file) { - const size_t min_cov_to_pass = get_min_reads_for_confidence(conf_level); vector reads; @@ -1061,14 +1009,11 @@ binsize_selection(const size_t resolution, const size_t min_bin_sz, return frac_passed < min_frac_passed ? 
num_lim::max() : bin_size; } - static void -load_bins(const size_t bin_size, - const string &cpgs_file, +load_bins(const size_t bin_size, const string &cpgs_file, vector &bins, - vector > &meth, - vector &reads, vector &array_status) { - + vector> &meth, vector &reads, + vector &array_status) { const bool is_array_data = check_if_array_data(cpgs_file); array_status.push_back(is_array_data); @@ -1082,19 +1027,18 @@ load_bins(const size_t bin_size, } static void -get_union_of_bins(const vector > &orig, +get_union_of_bins(const vector> &orig, vector &bins) { - // flatten the set of sorted bins bins.clear(); - for (auto &&i: orig) + for (auto &&i : orig) bins.insert(end(bins), begin(i), end(i)); // merge each sorted interval of bins const auto first = begin(bins); auto middle = begin(bins); for (size_t i = 1; i < orig.size(); ++i) { - middle += orig[i-1].size(); + middle += orig[i - 1].size(); std::inplace_merge(first, middle, middle + orig[i].size()); } // ensure unique bins @@ -1103,20 +1047,18 @@ get_union_of_bins(const vector > &orig, // make sure all bins are aligned at same boundaries for (size_t i = 1; i < bins.size(); ++i) - if (bins[i-1].overlaps(bins[i])) + if (bins[i - 1].overlaps(bins[i])) throw std::runtime_error("bins from reps not aligned"); } - static void add_missing_bins(const vector &all_bins, vector &bins, - vector> &meth) { - + vector> &meth) { const size_t n_bins = all_bins.size(); - vector> tmp_meth(n_bins); + vector> tmp_meth(n_bins); - size_t j = 0; // assume j range no larger than i range + size_t j = 0; // assume j range no larger than i range for (size_t i = 0; i < n_bins; ++i) { if (all_bins[i] == bins[j]) tmp_meth[i] = meth[j++]; @@ -1127,28 +1069,24 @@ add_missing_bins(const vector &all_bins, bins = all_bins; } - static void write_empty_summary(const string &summary_file) { if (!summary_file.empty()) { ofstream summary_out(summary_file); if (!summary_out) throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary({}).tostring() << endl; + summary_out << pmd_summary({}).tostring() << '\n'; } } - int -main_pmd(int argc, char *argv[]) { +main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const size_t min_observations_for_inference = 100; static const size_t max_bin_size = 500000; static const size_t min_bin_size = 1000; size_t resolution = 500; - const char* sep = ","; string outfile; size_t rng_seed = 408; @@ -1164,7 +1102,7 @@ main_pmd(int argc, char *argv[]) { // MAGIC: corrections for small values (not parameters): static const double tolerance = 1e-5; - static const double min_prob = 1e-10; + static const double min_prob = 1e-10; string summary_file; @@ -1172,60 +1110,58 @@ main_pmd(int argc, char *argv[]) { string params_out_file; string posteriors_out_prefix; - - const string description = - "Identify PMDs in methylomes. Methylation must be provided in the \ - methcounts file format (chrom, position, strand, context, \ - methylation, reads). See the methcounts documentation for \ - details. This program assumes only data at CpG sites and that \ - strands are collapsed so only the positive site appears in the \ - file, but reads counts are from both strands."; + constexpr auto description = + R"(Identify PMDs in methylomes. Methylation must be provided in the + methcounts file format (chrom, position, strand, context, + methylation, reads). See the methcounts documentation for + details. 
This program assumes only data at CpG sites and that + strands are collapsed so only the positive site appears in the + file, but reads counts are from both strands.)"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - ""); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", - false, outfile); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); + opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, + outfile); opt_parse.add_opt("desert", 'd', "max dist between bins with data in PMD", false, desert_size); opt_parse.add_opt("fixedbin", 'f', "Fixed bin size", false, fixed_bin_size); opt_parse.add_opt("bin", 'b', "Starting bin size", false, bin_size); - opt_parse.add_opt("arraymode",'a', "All samples are array", - false, ARRAY_MODE); + opt_parse.add_opt("arraymode", 'a', "All samples are array", false, + ARRAY_MODE); opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); opt_parse.add_opt("debug", 'D', "print more run info", false, DEBUG); - opt_parse.add_opt("params-in", 'P', "HMM parameter files for " + opt_parse.add_opt("params-in", 'P', + "HMM parameter files for " "individual methylomes (separated with comma)", false, params_in_files); opt_parse.add_opt("posteriors-out", 'r', "write out posterior probabilities in methcounts format", false, posteriors_out_prefix); - opt_parse.add_opt("summary", 'S', - "write summary output here", - false, summary_file); + opt_parse.add_opt("summary", 'S', "write summary output here", false, + summary_file); opt_parse.add_opt("params-out", 'p', "write HMM parameters to this file", false, params_out_file); - opt_parse.add_opt("seed", 's', "specify random seed", - false, rng_seed); + opt_parse.add_opt("seed", 's', "specify random seed", false, rng_seed); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } /****************** END COMMAND LINE OPTIONS *****************/ @@ -1235,36 +1171,42 @@ main_pmd(int argc, char *argv[]) { vector cpgs_file = leftover_args; vector params_in_file; if (!params_in_files.empty()) { - params_in_file = smithlab::split(params_in_files, sep, false); + params_in_file = smithlab::split(params_in_files, ",", false); assert(cpgs_file.size() == params_in_file.size()); } const size_t n_replicates = cpgs_file.size(); - for (auto &filename : cpgs_file) - if (!is_msite_file(filename)) - throw runtime_error("malformed counts file: " + filename); - - bool insufficient_data = false; // ADS: this is used now to detect - // when the counts files have - // lines for CpG sites, but no - // counts. 
+ std::for_each(std::cbegin(cpgs_file), std::cend(cpgs_file), + [](const auto &x) { + if (!is_msite_file(x)) + throw runtime_error("malformed counts file: " + x); + }); + // return !is_msite_file(x); })) + // // for (const auto &filename : cpgs_file) + // // if (!is_msite_file(filename)) + // throw runtime_error("malformed counts file: " + filename); + + bool insufficient_data = false; // ADS: this is used now to detect + // when the counts files have + // lines for CpG sites, but no + // counts. // Sanity checks input file format and dynamically selects bin // size from WGBS samples. if (!fixed_bin_size && !ARRAY_MODE) { if (verbose) - cerr << "[DYNAMICALLY SELECTING BIN SIZE]" << endl; + cerr << "[DYNAMICALLY SELECTING BIN SIZE]" << '\n'; double confidence_interval = 0.80; double prop_accept = 0.80; for (size_t i = 0; i < n_replicates && !insufficient_data; ++i) { const bool arrayData = check_if_array_data(cpgs_file[i]); if (!arrayData) { - bin_size = binsize_selection(resolution, min_bin_size, max_bin_size, - confidence_interval, prop_accept, - cpgs_file[i]); + bin_size = + binsize_selection(resolution, min_bin_size, max_bin_size, + confidence_interval, prop_accept, cpgs_file[i]); if (bin_size == num_lim::max()) insufficient_data = true; - desert_size = 5*bin_size; // TODO: explore extrapolation number + desert_size = 5 * bin_size; // TODO(ADS): what should this be? } else { // same as the parameters below @@ -1274,8 +1216,8 @@ main_pmd(int argc, char *argv[]) { } } else if (ARRAY_MODE) { - bin_size = 1000; // MAGIC NUMBERS FROM PAPER - desert_size = 200000; // PERFORM WITH HIGHEST JACCARD INDEX TO WGBS + bin_size = 1000; // MAGIC NUMBERS FROM PAPER + desert_size = 200000; // PERFORM WITH HIGHEST JACCARD INDEX TO WGBS } else { desert_size = max(desert_size, bin_size); @@ -1284,42 +1226,45 @@ main_pmd(int argc, char *argv[]) { if (insufficient_data) { // ADS: first check for insufficient data; another is needed if // fixed bin size is used - if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; - if (!summary_file.empty()) write_empty_summary(summary_file); + if (verbose) + cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + if (!summary_file.empty()) + write_empty_summary(summary_file); return EXIT_SUCCESS; } if (verbose) - cerr << "[READING IN AT BIN SIZE " << bin_size << "]" << endl; + cerr << "[READING IN AT BIN SIZE " << bin_size << "]" << '\n'; // separate the regions by chrom and by desert - vector > bins(n_replicates); - vector > > meth(n_replicates); - vector > reads(n_replicates); + vector> bins(n_replicates); + vector>> meth(n_replicates); + vector> reads(n_replicates); vector array_status; for (size_t i = 0; i < n_replicates && !insufficient_data; ++i) { if (verbose) - cerr << "[READING CPGS AND METH PROPS] from " << cpgs_file[i] << endl; + cerr << "[READING CPGS AND METH PROPS] from " << cpgs_file[i] << '\n'; - load_bins(bin_size, cpgs_file[i], bins[i], meth[i], - reads[i], array_status); + load_bins(bin_size, cpgs_file[i], bins[i], meth[i], reads[i], + array_status); const double total_observations = accumulate(begin(reads[i]), end(reads[i]), 0); if (total_observations <= num_lim::min()) insufficient_data = true; if (verbose) - cerr << "TOTAL BINS: " << bins[i].size() << endl + cerr << "TOTAL BINS: " << bins[i].size() << '\n' << "MEAN COVERAGE: " - << total_observations/std::max(1ul, reads[i].size()) - << endl; + << total_observations / std::max(1ul, reads[i].size()) << '\n'; } if (insufficient_data) { // ADS: second check for insufficient data; another is needed if // 
filtered number of bins is too few - if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; - if (!summary_file.empty()) write_empty_summary(summary_file); + if (verbose) + cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + if (!summary_file.empty()) + write_empty_summary(summary_file); return EXIT_SUCCESS; } @@ -1342,28 +1287,31 @@ main_pmd(int argc, char *argv[]) { vector reset_points; vector dists_btwn_bins; if (verbose) - cerr << "[separating by CpG desert]" << endl; - separate_regions(desert_size, bins, meth, reads, - reset_points, dists_btwn_bins); + cerr << "[separating by CpG desert]" << '\n'; + separate_regions(desert_size, bins, meth, reads, reset_points, + dists_btwn_bins); if (size(bins[0]) < min_observations_for_inference) insufficient_data = true; if (insufficient_data) { // ADS: final check for sufficient data failed; too few bins // after filtering - if (verbose) cerr << "EXITING: INSUFFICIENT DATA" << endl; - if (!summary_file.empty()) write_empty_summary(summary_file); + if (verbose) + cerr << "EXITING: INSUFFICIENT DATA" << '\n'; + if (!summary_file.empty()) + write_empty_summary(summary_file); return EXIT_SUCCESS; } if (verbose) - cerr << "bins retained: " << std::size(bins[0]) << endl - << "number of distances between: " << std::size(dists_btwn_bins) << endl - << "deserts removed: " << size(reset_points) - 2 << endl; + cerr << "bins retained: " << std::size(bins[0]) << '\n' + << "number of distances between: " << std::size(dists_btwn_bins) + << '\n' + << "deserts removed: " << size(reset_points) - 2 << '\n'; /****************** Read in params *****************/ vector start_trans(2, 0.5), end_trans(2, 1e-10); - vector > trans(2, vector(2, 0.01)); + vector> trans(2, vector(2, 0.01)); trans[0][0] = trans[1][1] = 0.99; const TwoStateHMM hmm(min_prob, tolerance, max_iterations, verbose, DEBUG); vector reps_fg_alpha(n_replicates, 0.05); @@ -1374,7 +1322,7 @@ main_pmd(int argc, char *argv[]) { if (!params_in_file.empty()) { // read parameters files - for (size_t i= 0; i < n_replicates; ++i) + for (size_t i = 0; i < n_replicates; ++i) read_params_file(verbose, params_in_file[i], reps_fg_alpha[i], reps_fg_beta[i], reps_bg_alpha[i], reps_bg_beta[i], start_trans, trans, end_trans, score_cutoff_for_fdr); @@ -1382,17 +1330,15 @@ main_pmd(int argc, char *argv[]) { // train model (default behavior; not done when params supplied) if (max_iterations > 0) - hmm.BaumWelchTraining_rep(meth, reset_points, - start_trans, trans, end_trans, - reps_fg_alpha, reps_fg_beta, + hmm.BaumWelchTraining_rep(meth, reset_points, start_trans, trans, + end_trans, reps_fg_alpha, reps_fg_beta, reps_bg_alpha, reps_bg_beta, array_status); if (!params_out_file.empty()) { // write all the HMM parameters - write_params_file(params_out_file, - reps_fg_alpha, reps_fg_beta, - reps_bg_alpha, reps_bg_beta, - start_trans, trans, end_trans); + write_params_file(params_out_file, reps_fg_alpha, reps_fg_beta, + reps_bg_alpha, reps_bg_beta, start_trans, trans, + end_trans); } /***********************************/ @@ -1401,15 +1347,15 @@ main_pmd(int argc, char *argv[]) { vector into_scores; hmm.TransitionPosteriors_rep(meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, reps_fg_beta, - reps_bg_alpha, reps_bg_beta, array_status, - 2, into_scores); + reps_bg_alpha, reps_bg_beta, array_status, 2, + into_scores); write_posteriors_file(posteriors_out_prefix + ".intoTrans", bins, into_scores); vector outof_scores; hmm.TransitionPosteriors_rep(meth, reset_points, start_trans, trans, end_trans, reps_fg_alpha, 
reps_fg_beta, - reps_bg_alpha, reps_bg_beta, array_status, - 1, outof_scores); + reps_bg_alpha, reps_bg_beta, array_status, 1, + outof_scores); write_posteriors_file(posteriors_out_prefix + ".outofTrans", bins, outof_scores); } @@ -1417,18 +1363,18 @@ main_pmd(int argc, char *argv[]) { vector classes; vector scores; hmm.PosteriorDecoding_rep(meth, reset_points, start_trans, trans, end_trans, - reps_fg_alpha, reps_fg_beta, - reps_bg_alpha, reps_bg_beta, classes, - scores, array_status); + reps_fg_alpha, reps_fg_beta, reps_bg_alpha, + reps_bg_beta, classes, scores, array_status); if (!posteriors_out_prefix.empty()) - write_posteriors_file(posteriors_out_prefix + ".posteriors", bins, scores); + write_posteriors_file(posteriors_out_prefix + ".posteriors", bins, + scores); vector domain_scores; get_domain_scores(classes, meth, reset_points, domain_scores); if (verbose) - cerr << "[RANDOMIZING SCORES FOR FDR]" << endl; + cerr << "[RANDOMIZING SCORES FOR FDR]" << '\n'; vector random_scores; shuffle_bins(rng_seed, hmm, meth, reset_points, start_trans, trans, @@ -1438,14 +1384,13 @@ main_pmd(int argc, char *argv[]) { vector p_values; assign_p_values(random_scores, domain_scores, p_values); - if (score_cutoff_for_fdr == num_lim::max() && - !p_values.empty()) + if (score_cutoff_for_fdr == num_lim::max() && !p_values.empty()) score_cutoff_for_fdr = get_score_cutoff_for_fdr(p_values, 0.01); if (!params_out_file.empty()) { ofstream out(params_out_file, std::ios::app); - out << "FDR_CUTOFF\t" - << std::setprecision(30) << score_cutoff_for_fdr << endl; + out << "FDR_CUTOFF\t" << std::setprecision(30) << score_cutoff_for_fdr + << '\n'; } vector domains; @@ -1460,29 +1405,29 @@ main_pmd(int argc, char *argv[]) { } optimize_boundaries(bin_size, cpgs_file, good_domains, array_status, - reps_fg_alpha, reps_fg_beta, - reps_bg_alpha, reps_bg_beta); + reps_fg_alpha, reps_fg_beta, reps_bg_alpha, + reps_bg_beta); if (!summary_file.empty()) { ofstream summary_out(summary_file); - if (!summary_out) throw runtime_error("failed to open: " + summary_file); - summary_out << pmd_summary(good_domains).tostring() << endl; + if (!summary_out) + throw runtime_error("failed to open: " + summary_file); + summary_out << pmd_summary(good_domains).tostring() << '\n'; } ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); copy(begin(good_domains), end(good_domains), std::ostream_iterator(out, "\n")); } - catch (const runtime_error &e) { - cerr << "ERROR:\t" << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-init-variables,*-narrowing-conversions,*-owning-memory,*-pointer-arithmetic) From 61abb5dcc0364d196fe9069b2e61ce9acd6d8ca5 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 018/106] src/analysis/roimethstat.cpp: changes to add static analysis --- src/analysis/roimethstat.cpp | 51 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/src/analysis/roimethstat.cpp b/src/analysis/roimethstat.cpp index 4555e816..120030c6 100644 --- a/src/analysis/roimethstat.cpp +++ b/src/analysis/roimethstat.cpp @@ -15,27 +15,31 @@ * more details. 
*/ +#include "GenomicRegion.hpp" +#include "LevelsCounter.hpp" +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "bsutils.hpp" +#include "xcounts_utils.hpp" + #include #include -#include +#include +#include +#include #include #include #include #include +#include #include #include #include #include #include -#include "GenomicRegion.hpp" -#include "LevelsCounter.hpp" -#include "MSite.hpp" -#include "OptionParser.hpp" -#include "bsutils.hpp" -#include "smithlab_utils.hpp" -#include "xcounts_utils.hpp" +// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) [[nodiscard]] static std::string format_levels_counter(const LevelsCounter &lc) { @@ -88,7 +92,6 @@ static void process_chrom(const bool report_more_info, const char level_code, const std::vector &intervals, const std::vector &sites, std::ostream &out) { - std::uint64_t j = 0; for (auto i = 0ul; i < std::size(intervals); ++i) { while (j < std::size(sites) && sites[j].pos < intervals[i].get_start()) @@ -130,20 +133,19 @@ process_chrom(const bool report_more_info, static void process_from_xcounts(const std::uint32_t n_threads, const bool report_more_info, const char level_code, const std::string &xsym_file, - const std::vector &intervals, + const std::vector &intervals_in, std::ostream &out) { - const auto sites_by_chrom = read_xcounts_by_chrom(n_threads, xsym_file); // const auto intervals = get_GenomicRegions(intervals_file); std::vector> intervals_by_chrom; std::string prev_chrom; - for (auto i = 0u; i < std::size(intervals); ++i) { - if (intervals[i].get_chrom() != prev_chrom) { + for (auto i = 0u; i < std::size(intervals_in); ++i) { + if (intervals_in[i].get_chrom() != prev_chrom) { intervals_by_chrom.push_back(std::vector()); - prev_chrom = intervals[i].get_chrom(); + prev_chrom = intervals_in[i].get_chrom(); } - intervals_by_chrom.back().push_back(intervals[i]); + intervals_by_chrom.back().push_back(intervals_in[i]); } for (const auto &intervals : intervals_by_chrom) { @@ -184,8 +186,8 @@ get_chrom_order(const T &order, const GenomicRegion &r) { } struct cmp_chrom_order { - cmp_chrom_order(const std::unordered_map &m) : - order{m} {} + explicit cmp_chrom_order( + const std::unordered_map &m) : order{m} {} template [[nodiscard]] bool @@ -200,7 +202,7 @@ struct cmp_chrom_order { return c < 0 || (c == 0 && cmp_within_chrom(a, b)); } - const std::unordered_map ℴ + const std::unordered_map ℴ // NOLINT }; [[nodiscard]] static std::vector @@ -229,7 +231,7 @@ get_chroms(const std::string &filename) { if (x != std::cend(chroms_seen)) throw std::runtime_error("chroms not sorted"); chroms_seen.push_back(chrom); - prev_chrom = std::move(chrom); + prev_chrom = chrom; } } return chroms_seen; @@ -294,7 +296,6 @@ process_preloaded( const std::string &sites_file, const std::unordered_map &chrom_order, const std::vector ®ions, std::ostream &out) { - const auto sites = read_sites(sites_file); if (sites.empty()) throw std::runtime_error("failed to read sites: " + sites_file); @@ -305,7 +306,7 @@ process_preloaded( if (VERBOSE) std::cerr << "[n_sites=" << std::size(sites) << "]\n"; - for (auto &r : regions) { + for (const auto &r : regions) { LevelsCounter lc; const auto b = region_bounds(chrom_order, std::cbegin(sites), std::cend(sites), r); @@ -379,7 +380,7 @@ process_on_disk( if (!in) throw std::runtime_error("failed to open file: " + sites_file); - for (auto ®ion : regions) { + for (const auto ®ion : regions) { const auto lc = calc_site_stats(in, region, chrom_order); const double score = 
level_code == 'w' @@ -445,7 +446,7 @@ Columns (beyond the first 6) in the BED format output: std::string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, + OptionParser opt_parse(argv[0], description, " "); opt_parse.set_show_defaults(); opt_parse.add_opt("output", 'o', "Name of output file (default: stdout)", @@ -505,7 +506,7 @@ Columns (beyond the first 6) in the BED format output: // ensure regions are sorted in the same way std::unordered_map chrom_order; if (!is_xcounts) - for (auto &i : get_chroms(sites_file)) + for (const auto &i : get_chroms(sites_file)) chrom_order.emplace(i, std::size(chrom_order)); if (VERBOSE) @@ -571,3 +572,5 @@ Columns (beyond the first 6) in the BED format output: } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) From 9c61cbaf99636e70ed1df57c81b2db03d396dea6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 019/106] src/common/BetaBin.cpp: changes to add static analysis --- src/common/BetaBin.cpp | 74 ++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/src/common/BetaBin.cpp b/src/common/BetaBin.cpp index aaf9b8f1..08dbc62b 100644 --- a/src/common/BetaBin.cpp +++ b/src/common/BetaBin.cpp @@ -17,26 +17,26 @@ #include "BetaBin.hpp" +#include #include - +#include #include -#include #include +#include #include +#include -#include #include +#include - -using std::vector; -using std::pair; -using std::setw; +using std::cerr; using std::max; using std::min; -using std::cerr; -using std::endl; -using std::string; +using std::pair; using std::setprecision; +using std::setw; +using std::string; +using std::vector; ////////////////////////////////////////////// ////// struct betabin ////// @@ -44,8 +44,7 @@ using std::setprecision; const double betabin::tolerance = 1e-10; -betabin::betabin() : - alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} +betabin::betabin() : alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} betabin::betabin(const double a, const double b) : alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} @@ -56,37 +55,35 @@ betabin::betabin(const string &str) { iss >> name >> alpha >> beta; if (name != "betabin" || alpha < 0 || beta < 0) { cerr << "betabin::betabin: " - << "bad string representation of betabin distribution: " - << str << endl; + << "bad string representation of betabin distribution: " << str + << '\n'; throw "bad string representation of betabin distribution"; } lnbeta_helper = gsl_sf_lnbeta(alpha, beta); } - string betabin::tostring() const { std::ostringstream os; - os << "betabin " << setprecision(4) << alpha << " " - << setprecision(4) << beta; + os << "betabin " << setprecision(4) << alpha << " " << setprecision(4) + << beta; return os.str(); } - double betabin::operator()(const pair &val) const { - const size_t x = static_cast(val.first); - const size_t n = static_cast(x + val.second); - return gsl_sf_lnchoose(n, x) + - gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; + const std::uint32_t x = static_cast(val.first); + const std::uint32_t n = static_cast(val.first + val.second); + return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - + lnbeta_helper; } double betabin::log_likelihood(const pair &val) const { - const size_t x = static_cast(val.first); - const size_t n = static_cast(x + val.second); - return gsl_sf_lnchoose(n, 
x) + - gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; + const std::uint32_t x = static_cast(val.first); + const std::uint32_t n = static_cast(val.first + val.second); + return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - + lnbeta_helper; } double @@ -96,29 +93,36 @@ betabin::sign(const double x) { double betabin::invpsi(const double tolerance, const double x) { - double L = 1.0, Y = std::exp(x); + double L = 1.0; + double Y = std::exp(x); while (L > tolerance) { - Y += L*sign(x - gsl_sf_psi(Y)); - L /= 2.0; + Y += L * sign(x - gsl_sf_psi(Y)); + L /= 2.0; // NOLINT(*-avoid-magic-numbers) } return Y; } double betabin::movement(const double curr, const double prev) { - return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev)); + return std::abs(curr - prev) / std::max(std::fabs(curr), std::fabs(prev)); } void betabin::fit(const vector &vals_a, const vector &vals_b, const vector &p) { + static constexpr auto initial_param_vals = 0.01; const double p_total = std::accumulate(p.begin(), p.end(), 0.0); - const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(), - p.begin(), 0.0)/p_total; - const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(), - p.begin(), 0.0)/p_total; + const double alpha_rhs = + std::inner_product(std::cbegin(vals_a), std::cend(vals_a), std::cbegin(p), + 0.0) / + p_total; + const double beta_rhs = + std::inner_product(std::cbegin(vals_b), std::cend(vals_b), std::cbegin(p), + 0.0) / + p_total; double prev_alpha = 0.0, prev_beta = 0.0; - alpha = beta = 0.01; + alpha = beta = initial_param_vals; + while (movement(alpha, prev_alpha) > tolerance && movement(beta, prev_beta) > tolerance) { prev_alpha = alpha; From ec17833b940bed901d87e92dec516046c3efbfa1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 020/106] src/common/BetaBin.hpp: changes to add static analysis --- src/common/BetaBin.hpp | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/common/BetaBin.hpp b/src/common/BetaBin.hpp index 290dd65a..94c63dc4 100644 --- a/src/common/BetaBin.hpp +++ b/src/common/BetaBin.hpp @@ -18,26 +18,33 @@ #ifndef BETABIN_HPP #define BETABIN_HPP -#include +#include // IWYU pragma: keep #include +#include #include struct betabin { betabin(); betabin(const double a, const double b); - betabin(const std::string &str); - double operator()(const std::pair &val) const; - double log_likelihood(const std::pair &val) const; - double sign(const double x); - double invpsi(const double tolerance, const double x); - double movement(const double curr, const double prev); - void fit(const std::vector &vals_a, - const std::vector &vals_b, - const std::vector &p); - std::string tostring() const; - double alpha; - double beta; - double lnbeta_helper; + explicit betabin(const std::string &str); + double + operator()(const std::pair &val) const; + double + log_likelihood(const std::pair &val) const; + double + sign(const double x); + double + invpsi(const double tolerance, const double x); + double + movement(const double curr, const double prev); + void + fit(const std::vector &vals_a, const std::vector &vals_b, + const std::vector &p); + std::string + tostring() const; + double alpha{}; + double beta{}; + double lnbeta_helper{}; static const double tolerance; }; From b7f2dd96b34d07c4667ce8f241a4b61ae5e56e8c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 021/106] 
src/common/EmissionDistribution.cpp: changes to add static analysis --- src/common/EmissionDistribution.cpp | 151 +++++++++++++++------------- 1 file changed, 80 insertions(+), 71 deletions(-) diff --git a/src/common/EmissionDistribution.cpp b/src/common/EmissionDistribution.cpp index 8c2aa472..778d61fe 100644 --- a/src/common/EmissionDistribution.cpp +++ b/src/common/EmissionDistribution.cpp @@ -18,127 +18,136 @@ #include "EmissionDistribution.hpp" -using std::vector; -using std::pair; -using std::setw; +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using std::cerr; using std::max; using std::min; -using std::cerr; -using std::endl; -using std::string; +using std::pair; using std::setprecision; +using std::setw; +using std::string; +using std::vector; EmissionDistribution::EmissionDistribution() : - alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} + alpha(1), beta(1), lnbeta_helper(gsl_sf_lnbeta(1, 1)) {} EmissionDistribution::EmissionDistribution(const double a, const double b) : - alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} + alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} EmissionDistribution::EmissionDistribution(const string &str) { - std::istringstream iss(str, std::istringstream::in); - string name; - iss >> name >> alpha >> beta; - if (name != "edtn" || alpha < 0 || beta < 0) - { - cerr << "EmissionDistribution::EmissionDistribution: " - << "bad string representation of emission distribution: " - << str << endl; - throw "bad string representation of emission distribution"; - } - lnbeta_helper = gsl_sf_lnbeta(alpha, beta); + std::istringstream iss(str, std::istringstream::in); + string name; + iss >> name >> alpha >> beta; + if (name != "edtn" || alpha < 0 || beta < 0) { + cerr << "EmissionDistribution::EmissionDistribution: " + << "bad string representation of emission distribution: " << str + << '\n'; + throw "bad string representation of emission distribution"; + } + lnbeta_helper = gsl_sf_lnbeta(alpha, beta); } EmissionDistribution::~EmissionDistribution() {} string EmissionDistribution::tostring() const { - std::ostringstream os; - os << "Emission dtn params: " << setprecision(4) << alpha << " " - << setprecision(4) << beta; - return os.str(); + std::ostringstream os; + os << "Emission dtn params: " << setprecision(4) << alpha << " " + << setprecision(4) << beta; + return os.str(); } - double EmissionDistribution::sign(const double x) { - return (x >= 0) ? 1.0 : -1.0; + return (x >= 0) ? 
1.0 : -1.0; } - double EmissionDistribution::invpsi(const double tolerance, const double x) { - double L = 1.0, Y = std::exp(x); - while (L > tolerance) - { - Y += L*sign(x - gsl_sf_psi(Y)); - L /= 2.0; - } - return Y; + double L = 1.0, Y = std::exp(x); + while (L > tolerance) { + Y += L * sign(x - gsl_sf_psi(Y)); + L /= 2.0; // NOLINT(*-avoid-magic-numbers) + } + return Y; } - double EmissionDistribution::movement(const double curr, const double prev) { - return std::abs(curr - prev)/std::max(std::fabs(curr), std::fabs(prev)); + return std::abs(curr - prev) / std::max(std::fabs(curr), std::fabs(prev)); } - void EmissionDistribution::fit(const vector &vals_a, - const vector &vals_b, const vector &p) { - const double p_total = std::accumulate(p.begin(), p.end(), 0.0); - const double alpha_rhs = inner_product(vals_a.begin(), vals_a.end(), - p.begin(), 0.0)/p_total; - const double beta_rhs = inner_product(vals_b.begin(), vals_b.end(), - p.begin(), 0.0)/p_total; - - double prev_alpha = 0.0, prev_beta = 0.0; - alpha = beta = 0.01; - while (movement(alpha, prev_alpha) > tolerance && - movement(beta, prev_beta) > tolerance) - { - prev_alpha = alpha; - prev_beta = beta; - alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs); - beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs); - } - lnbeta_helper = gsl_sf_lnbeta(alpha, beta); + const vector &vals_b, + const vector &p) { + static constexpr auto initial_param_vals = 0.01; + const double p_total = std::accumulate(p.begin(), p.end(), 0.0); + const double alpha_rhs = + inner_product(vals_a.begin(), vals_a.end(), p.begin(), 0.0) / p_total; + const double beta_rhs = + inner_product(vals_b.begin(), vals_b.end(), p.begin(), 0.0) / p_total; + + double prev_alpha = 0.0, prev_beta = 0.0; + alpha = beta = initial_param_vals; + + while (movement(alpha, prev_alpha) > tolerance && + movement(beta, prev_beta) > tolerance) { + prev_alpha = alpha; + prev_beta = beta; + alpha = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + alpha_rhs); + beta = invpsi(tolerance, gsl_sf_psi(prev_alpha + prev_beta) + beta_rhs); + } + lnbeta_helper = gsl_sf_lnbeta(alpha, beta); } Beta::Beta() : EmissionDistribution() {} -Beta::Beta(const double a, const double b) : EmissionDistribution(a,b) {} +Beta::Beta(const double a, const double b) : EmissionDistribution(a, b) {} Beta::Beta(const std::string &str) : EmissionDistribution(str) {} double Beta::operator()(const pair &val) const { - const double p = val.first/val.second; - return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta); + const double p = val.first / val.second; + return (alpha - 1.0) * log(p) + (beta - 1.0) * log(1.0 - p) - + gsl_sf_lnbeta(alpha, beta); } double Beta::log_likelihood(const pair &val) const { - const double p = val.first/val.second; - return (alpha-1.0)*log(p) + (beta-1.0)*log(1.0-p) - gsl_sf_lnbeta(alpha, beta); + const double p = val.first / val.second; + return (alpha - 1.0) * log(p) + (beta - 1.0) * log(1.0 - p) - + gsl_sf_lnbeta(alpha, beta); } BetaBinomial::BetaBinomial() : EmissionDistribution() {} -BetaBinomial::BetaBinomial(const double a, const double b) - : EmissionDistribution(a,b) {} -BetaBinomial::BetaBinomial(const std::string &str) - : EmissionDistribution(str) {} +BetaBinomial::BetaBinomial(const double a, const double b) : + EmissionDistribution(a, b) {} +BetaBinomial::BetaBinomial(const std::string &str) : + EmissionDistribution(str) {} double BetaBinomial::operator()(const pair &val) const { - const size_t x = 
static_cast(val.first); - const size_t n = static_cast(x + val.second); - return gsl_sf_lnchoose(n, x) + - gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; + const std::uint32_t x = static_cast(val.first); + const std::uint32_t n = static_cast(val.first + val.second); + return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - + lnbeta_helper; } double BetaBinomial::log_likelihood(const pair &val) const { - const size_t x = static_cast(val.first); - const size_t n = static_cast(x + val.second); - return gsl_sf_lnchoose(n, x) + - gsl_sf_lnbeta(alpha + x, beta + val.second) - lnbeta_helper; + const std::uint32_t x = static_cast(val.first); + const std::uint32_t n = static_cast(val.first + val.second); + return gsl_sf_lnchoose(n, x) + gsl_sf_lnbeta(alpha + x, beta + val.second) - + lnbeta_helper; } From fc09428af96ee5d0dc90477b39b0d8b7c6b83e39 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 022/106] src/common/EmissionDistribution.hpp: changes to add static analysis --- src/common/EmissionDistribution.hpp | 101 +++++++++++++++------------- 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/src/common/EmissionDistribution.hpp b/src/common/EmissionDistribution.hpp index 988d6514..31cda6ab 100644 --- a/src/common/EmissionDistribution.hpp +++ b/src/common/EmissionDistribution.hpp @@ -19,15 +19,9 @@ #ifndef EM_DTN #define EM_DTN -#include -#include -#include -#include -#include -#include -#include -#include +#include // IWYU pragma: keep #include +#include #include /** Emission distributions for methylation should be modeled either as @@ -35,51 +29,64 @@ * helpful to have an abstraction so that we can put them in the same * container. */ -class EmissionDistribution -{ - public: - EmissionDistribution(); - virtual ~EmissionDistribution(); - EmissionDistribution(const double a, const double b); - EmissionDistribution(const std::string &str); - virtual double operator()(const std::pair &val) const = 0; - virtual double log_likelihood(const std::pair &val) const = 0; - std::string tostring() const; - double getalpha() { return alpha; }; - double getbeta() { return beta; }; - void fit(const std::vector &vals_a, - const std::vector &vals_b, - const std::vector &p); +class EmissionDistribution { +public: + EmissionDistribution(); + virtual ~EmissionDistribution(); + EmissionDistribution(const double a, const double b); + EmissionDistribution(const std::string &str); + virtual double + operator()(const std::pair &val) const = 0; + virtual double + log_likelihood(const std::pair &val) const = 0; + std::string + tostring() const; + double + getalpha() { + return alpha; + }; + double + getbeta() { + return beta; + }; + void + fit(const std::vector &vals_a, const std::vector &vals_b, + const std::vector &p); - protected: - double sign(const double x); - double invpsi(const double tolerance, const double x); - double movement(const double curr, const double prev); - double alpha; - double beta; - double lnbeta_helper; +protected: + double + sign(const double x); + double + invpsi(const double tolerance, const double x); + double + movement(const double curr, const double prev); + double alpha{}; + double beta{}; + double lnbeta_helper{}; - const double tolerance = 1e-10; + static constexpr double tolerance = 1e-10; }; -class Beta : public EmissionDistribution -{ - public: - Beta(); - Beta(const double a, const double b); - Beta(const std::string &str); - double operator()(const std::pair &val) const; - double 
log_likelihood(const std::pair &val) const; +class Beta : public EmissionDistribution { +public: + Beta(); + Beta(const double a, const double b); + explicit Beta(const std::string &str); + double + operator()(const std::pair &val) const override; + double + log_likelihood(const std::pair &val) const override; }; -class BetaBinomial : public EmissionDistribution -{ - public: - BetaBinomial(); - BetaBinomial(const double a, const double b); - BetaBinomial(const std::string &str); - double operator()(const std::pair &val) const; - double log_likelihood(const std::pair &val) const; +class BetaBinomial : public EmissionDistribution { +public: + BetaBinomial(); + BetaBinomial(const double a, const double b); + explicit BetaBinomial(const std::string &str); + double + operator()(const std::pair &val) const override; + double + log_likelihood(const std::pair &val) const override; }; #endif From 89784de3526aa4c589b8e961cda6aae0d6d4141c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 023/106] src/common/Epiread.cpp: changes to add static analysis --- src/common/Epiread.cpp | 51 ++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/src/common/Epiread.cpp b/src/common/Epiread.cpp index 5a989af1..22969c90 100644 --- a/src/common/Epiread.cpp +++ b/src/common/Epiread.cpp @@ -14,19 +14,20 @@ * GNU General Public License for more details. */ -#include -#include -#include -#include -#include - #include "Epiread.hpp" -using std::vector; -using std::string; +#include +#include +#include +#include +#include +#include +#include +#include +#include size_t -adjust_read_offsets(vector &reads) { +adjust_read_offsets(std::vector &reads) { size_t first_read_offset = std::numeric_limits::max(); for (size_t i = 0; i < reads.size(); ++i) first_read_offset = std::min(reads[i].pos, first_read_offset); @@ -36,16 +37,16 @@ adjust_read_offsets(vector &reads) { } size_t -get_n_cpgs(const vector &reads) { +get_n_cpgs(const std::vector &reads) { size_t n_cpgs = 0; for (size_t i = 0; i < reads.size(); ++i) n_cpgs = std::max(n_cpgs, reads[i].end()); return n_cpgs; } -std::istream& +std::istream & operator>>(std::istream &in, epiread &er) { - string buffer; + std::string buffer; if (getline(in, buffer)) { std::istringstream is(buffer); if (!(is >> er.chr >> er.pos >> er.seq)) @@ -54,48 +55,52 @@ operator>>(std::istream &in, epiread &er) { return in; } -std::ostream& +std::ostream & operator<<(std::ostream &out, const epiread &er) { return out << er.chr << '\t' << er.pos << '\t' << er.seq; } bool -validate_epiread_file(const string &filename) { +validate_epiread_file(const std::string &filename) { const size_t max_lines_to_validate = 10000; std::ifstream in(filename); if (!in) throw std::runtime_error("failed to open file: " + filename); - string c, s, other; + std::string c, s, other; size_t p = 0; size_t n_lines = 0; - string line; + std::string line; while (getline(in, line) && n_lines++ < max_lines_to_validate) { std::istringstream iss(line); - if (!(iss >> c >> p >> s) || iss >> other) return false; + if (!(iss >> c >> p >> s) || iss >> other) + return false; } return true; } -epiread::epiread(const string &line) { +epiread::epiread(const std::string &line) { constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; + using std::distance; using std::find_if; using std::from_chars; - using std::distance; bool failed = false; + // 
NOLINTBEGIN(*-pointer-arithmetic) + const auto c = line.data(); const auto c_end = c + line.size(); auto field_s = c; auto field_e = find_if(field_s + 1, c_end, is_sep); - if (field_e == c_end) failed = true; + if (field_e == c_end) + failed = true; - chr = string{field_s, static_cast(distance(field_s, field_e))}; + chr = std::string{field_s, static_cast(distance(field_s, field_e))}; field_s = find_if(field_e + 1, c_end, not_sep); field_e = find_if(field_s + 1, c_end, is_sep); @@ -108,11 +113,13 @@ epiread::epiread(const string &line) { field_e = find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e != c_end); - seq = string{field_s, static_cast(distance(field_s, field_e))}; + seq = std::string{field_s, static_cast(distance(field_s, field_e))}; if (failed) { throw std::runtime_error("bad epiread line: " + line); // ADS: the value below would work for a flag // pos = std::numeric_limits::max(); } + + // NOLINTEND(*-pointer-arithmetic) } From cc0795b68fe012a6adab1d7e760fba70cd0e63d8 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 024/106] src/common/Epiread.hpp: changes to add static analysis --- src/common/Epiread.hpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/common/Epiread.hpp b/src/common/Epiread.hpp index 31331b31..a971a7e3 100644 --- a/src/common/Epiread.hpp +++ b/src/common/Epiread.hpp @@ -17,29 +17,39 @@ #ifndef EPIREAD #define EPIREAD +#include +#include #include #include -#include "smithlab_utils.hpp" struct epiread { std::string chr{}; size_t pos{}; std::string seq{}; epiread() = default; - epiread(const std::string &line); + explicit epiread(const std::string &line); epiread(const size_t p, const std::string &s) : pos(p), seq(s) {} - epiread(const std::string &c, const size_t p, const std::string &s) - : chr(c), pos(p), seq(s) {} + epiread(const std::string &c, const size_t p, const std::string &s) : + chr(c), pos(p), seq(s) {} - bool operator<(const epiread &other) const { + bool + operator<(const epiread &other) const { return (chr < other.chr || (chr == other.chr && pos < other.pos)); } - size_t end() const {return pos + seq.length();} - size_t length() const {return seq.length();} + size_t + end() const { + return pos + seq.length(); + } + size_t + length() const { + return seq.length(); + } }; -std::istream& operator>>(std::istream &in, epiread &er); -std::ostream& operator<<(std::ostream &out, const epiread &er); +std::istream & +operator>>(std::istream &in, epiread &er); +std::ostream & +operator<<(std::ostream &out, const epiread &er); size_t adjust_read_offsets(std::vector &reads); From 1f3644f3adb3975d8176c6081c2b823c7bc6f99c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 025/106] src/common/EpireadStats.cpp: changes to add static analysis --- src/common/EpireadStats.cpp | 153 ++++++++++++++++++++---------------- 1 file changed, 87 insertions(+), 66 deletions(-) diff --git a/src/common/EpireadStats.cpp b/src/common/EpireadStats.cpp index be1ba312..759e6c4c 100644 --- a/src/common/EpireadStats.cpp +++ b/src/common/EpireadStats.cpp @@ -16,34 +16,36 @@ #include "EpireadStats.hpp" -#include -#include -#include +#include #include -#include +#include +#include #include -#include -#include +#include +#include +#include +#include -#include #include -using std::string; -using std::vector; using std::isfinite; -using std::min; using std::max; +using std::min; +using std::string; +using std::vector; -template 
using num_lim = std::numeric_limits; +template using num_lim = std::numeric_limits; using epi_r = small_epiread; static const double PSEUDOCOUNT = 1e-10; static inline uint32_t adjust_read_offsets(vector &reads) { - auto first_read_offset = num_lim::max(); - for (auto &r : reads) - first_read_offset = min(r.pos, first_read_offset); + auto first_read_offset = std::accumulate( + std::cbegin(reads), std::cend(reads), num_lim::max(), + [](const std::uint32_t a, const auto &r) { return std::min(a, r.pos); }); + // for (const auto &r : reads) + // first_read_offset = min(r.pos, first_read_offset); for (auto &r : reads) r.pos -= first_read_offset; return first_read_offset; @@ -51,10 +53,13 @@ adjust_read_offsets(vector &reads) { static inline uint32_t get_n_cpgs(const vector &reads) { - auto n_cpgs = 0u; - for (auto &r : reads) - n_cpgs = std::max(n_cpgs, r.end()); - return n_cpgs; + return std::accumulate( + std::cbegin(reads), std::cend(reads), 0u, + [](const std::uint32_t a, const auto &r) { return std::max(a, r.end()); }); + // auto n_cpgs = 0u; + // for (const auto &r : reads) + // n_cpgs = std::max(n_cpgs, r.end()); + // return n_cpgs; } // static inline bool @@ -69,8 +74,8 @@ log_likelihood(const epi_r &r, const vector &a) { for (size_t i = 0; i < r.seq.length(); ++i) // if (is_meth(r.seq[i]) || un_meth(r.seq[i])) { if (r.seq[i] == 'C' || r.seq[i] == 'T') { - // const double val = (is_meth(r, i) ? a[r.pos + i] : (1.0 - a[r.pos + i])); - // assert(isfinite(log(val))); + // const double val = (is_meth(r, i) ? a[r.pos + i] : (1.0 - a[r.pos + + // i])); assert(isfinite(log(val))); ll += log(r.seq[i] == 'C' ? a[r.pos + i] : (1.0 - a[r.pos + i])); } return ll; @@ -89,7 +94,8 @@ log_likelihood(const epi_r &r, const vector &a) { // } static inline std::pair -log_likelihood_pair(const epi_r &r, const vector &a1, const vector &a2) { +log_likelihood_pair(const epi_r &r, const vector &a1, + const vector &a2) { double ll1 = 0.0, ll2 = 0.0; // auto a1_itr = cbegin(a1) + r.pos; // auto a2_itr = cbegin(a2) + r.pos; @@ -110,8 +116,9 @@ log_likelihood_pair(const epi_r &r, const vector &a1, const vector &a1, const vector &a2) { +log_likelihood(const epi_r &r, const double log_mixing1, + const double log_mixing2, const vector &a1, + const vector &a2) { auto [ll1, ll2] = log_likelihood_pair(r, a1, a2); return log(exp(log_mixing1 + ll1) + exp(log_mixing2 + ll2)); } @@ -136,7 +143,8 @@ expectation_step(const vector &reads, const double mixing, double score = 0.0; auto ind_itr = begin(indicators); - for (auto &r : reads) { /// for (uint32_t i = 0; i < reads.size(); ++i) { + for (const auto &r : reads) { + // for (uint32_t i = 0; i < reads.size(); ++i) { const double ll1 = log_mixing1 + log_likelihood(r, a1); const double ll2 = log_mixing2 + log_likelihood(r, a2); // assert(isfinite(ll1) && isfinite(ll2)); @@ -165,15 +173,15 @@ expectation_step(const vector &reads, const double mixing, // for (uint32_t i = 0; i < n_cpgs; ++i) a[i] /= total[i]; // } - -template void +template +void fit_epiallele(const double pseudo, const vector &reads, vector::const_iterator indic_itr, vector &a) { vector total(size(a), 2 * pseudo); auto t_beg = begin(total); auto a_beg = begin(a); - fill_n(a_beg, size(a), pseudo); - for (auto &r : reads) { + std::fill_n(a_beg, std::size(a), pseudo); + for (const auto &r : reads) { const double weight = inverse ? 
1.0 - *indic_itr++ : *indic_itr++; auto m_itr = a_beg + r.pos; auto t_itr = t_beg + r.pos; @@ -187,17 +195,18 @@ fit_epiallele(const double pseudo, const vector &reads, } void -fit_epiallele(const double pseudo, const vector &reads, vector &a) { +fit_epiallele(const double pseudo, const vector &reads, + vector &a) { const uint32_t n_cpgs = a.size(); vector total(n_cpgs, 2 * pseudo); auto t_beg = begin(total); auto a_beg = begin(a); - fill_n(a_beg, n_cpgs, pseudo); - for (auto &r : reads) { + std::fill_n(a_beg, n_cpgs, pseudo); + for (const auto &r : reads) { const uint32_t start = r.pos; auto m_itr = a_beg + start; auto t_itr = t_beg + start; - for (auto s : r.seq) { + for (const auto s : r.seq) { *m_itr++ += (s == 'C'); *t_itr++ += (s != 'N'); } @@ -207,16 +216,24 @@ fit_epiallele(const double pseudo, const vector &reads, vector &a } static inline void -rescale_indicators(const double mixing, vector &indic) { - const double ratio = reduce(cbegin(indic), cend(indic), 0.0)/indic.size(); +rescale_indicators( + const double mixing, + vector &indic) { // cppcheck-suppress constParameterReference + const double ratio = std::reduce(std::cbegin(indic), std::cend(indic), 0.0) / + static_cast(std::size(indic)); if (mixing < ratio) { - const double adjustment = mixing/ratio; - for (auto &i : indic) i *= adjustment; + const double adjustment = mixing / ratio; + std::transform(std::cbegin(indic), std::cend(indic), std::begin(indic), + [&](const auto x) { return x * adjustment; }); + // for (auto &i : indic) + // i *= adjustment; } else { - const double adjustment = mixing/(1.0 - ratio); - for (auto &i : indic) - i = 1.0 - (1.0 - i)*adjustment; + const double adjustment = mixing / (1.0 - ratio); + std::transform(std::cbegin(indic), std::cend(indic), std::begin(indic), + [&](const auto x) { return 1.0 - (1.0 - x) * adjustment; }); + // for (auto &i : indic) + // i = 1.0 - (1.0 - i) * adjustment; } } @@ -224,33 +241,32 @@ static double expectation_maximization(const size_t max_itr, const vector &reads, const double &mixing, vector &indicators, vector &a1, vector &a2) { - static constexpr double EPIREAD_STATS_TOLERANCE = 1e-10; double prev_score = -num_lim::max(); for (auto i = 0u; i < max_itr; ++i) { - const double score = expectation_step(reads, mixing, a1, a2, indicators); rescale_indicators(mixing, indicators); - if ((prev_score - score)/prev_score < EPIREAD_STATS_TOLERANCE) + if ((prev_score - score) / prev_score < EPIREAD_STATS_TOLERANCE) break; // maximization_step(reads, indicators, a1, a2); - fit_epiallele(0.5*PSEUDOCOUNT, reads, cbegin(indicators), a1); - fit_epiallele(0.5*PSEUDOCOUNT, reads, cbegin(indicators), a2); + + // NOLINTBEGIN(*-avoid-magic-numbers) + fit_epiallele(0.5 * PSEUDOCOUNT, reads, cbegin(indicators), a1); + fit_epiallele(0.5 * PSEUDOCOUNT, reads, cbegin(indicators), a2); + // NOLINTEND(*-avoid-magic-numbers) prev_score = score; } return prev_score; } - double resolve_epialleles(const size_t max_itr, const vector &reads, const double &mixing, vector &indicators, vector &a1, vector &a2) { - indicators.clear(); indicators.resize(reads.size(), 0.0); for (size_t i = 0; i < reads.size(); ++i) { @@ -268,8 +284,8 @@ fit_single_epiallele(const vector &reads, vector &a) { fit_epiallele(PSEUDOCOUNT, reads, a); double score = 0.0; - for (auto r : reads) { - score += log_likelihood(r, a); + for (const auto &r : reads) { + score += log_likelihood(r, a); // cppcheck-suppress useStlAlgorithm // assert(isfinite(score)); } return score; @@ -278,13 +294,12 @@ fit_single_epiallele(const vector 
&reads, vector &a) { void compute_model_likelihoods(double &single_score, double &pair_score, const size_t &max_itr, const double &low_prob, - const double &high_prob, const size_t &n_cpgs, + const double &high_prob, const size_t n_cpgs, const vector &reads) { - static constexpr double mixing = 0.5; // try a single epi-allele and compute its log likelihood - vector a0(n_cpgs, 0.5); + vector a0(n_cpgs, 0.5); // NOLINT(*-magic-numbers) single_score = fit_single_epiallele(reads, a0); // initialize the pair epi-alleles and indicators, and do the actual @@ -296,15 +311,16 @@ compute_model_likelihoods(double &single_score, double &pair_score, const double log_mixing2 = log(1.0 - mixing); pair_score = transform_reduce( - cbegin(reads), cend(reads), 0.0, std::plus<>(), - [&](const epi_r &r) { return log_likelihood(r, log_mixing1, log_mixing2, a1, a2); }); + cbegin(reads), cend(reads), 0.0, std::plus<>(), [&](const epi_r &r) { + return log_likelihood(r, log_mixing1, log_mixing2, a1, a2); + }); } - double test_asm_lrt(const size_t max_itr, const bool correct_for_read_count, - const double low_prob, - const double high_prob, vector &reads) { + const double low_prob, const double high_prob, + vector &reads) { + // NOLINTBEGIN(*-avoid-magic-numbers) double single_score = num_lim::min(); double pair_score = num_lim::min(); const auto first_read_offset = adjust_read_offsets(reads); @@ -322,20 +338,21 @@ test_asm_lrt(const size_t max_itr, const bool correct_for_read_count, // correction for numbers of reads if (correct_for_read_count) - pair_score += size(reads)*log(0.5); + pair_score += static_cast(std::size(reads)) * std::log(0.5); - const double llr_stat = -2*(single_score - pair_score); - const double p_value = 1.0 - gsl_cdf_chisq_P(llr_stat, df); + const double llr_stat = -2.0 * (single_score - pair_score); + const double p_value = + 1.0 - gsl_cdf_chisq_P(llr_stat, static_cast(df)); + // NOLINTEND(*-avoid-magic-numbers) return p_value; } - double test_asm_bic(const size_t max_itr, const bool correct_for_read_count, - const double low_prob, - const double high_prob, vector &reads) { - + const double low_prob, const double high_prob, + vector &reads) { + // NOLINTBEGIN(*-avoid-magic-numbers) double single_score = num_lim::min(); double pair_score = num_lim::min(); const auto first_read_offset = adjust_read_offsets(reads); @@ -346,13 +363,17 @@ test_asm_bic(const size_t max_itr, const bool correct_for_read_count, // correction for numbers of reads if (correct_for_read_count) - pair_score += size(reads)*log(0.5); + pair_score += static_cast(std::size(reads)) * log(0.5); for (auto &read : reads) read.pos += first_read_offset; + const double n_reads = static_cast(std::size(reads)); + const auto n_cpgs_log_n_reads = n_cpgs * std::log(n_reads); + // compute bic scores and compare - const double bic_single = n_cpgs*log(reads.size()) - 2*single_score; - const double bic_pair = 2*n_cpgs*log(reads.size()) - 2*pair_score; + const double bic_single = n_cpgs_log_n_reads - 2 * single_score; + const double bic_pair = 2 * n_cpgs_log_n_reads - 2 * pair_score; return bic_pair - bic_single; + // NOLINTEND(*-avoid-magic-numbers) } From 9d32587f26f80f2c2c887f53b3bae45701485842 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 026/106] src/common/EpireadStats.hpp: changes to add static analysis --- src/common/EpireadStats.hpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/common/EpireadStats.hpp b/src/common/EpireadStats.hpp 
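For reference, the likelihood-ratio test performed by test_asm_lrt above reduces to comparing -2 times the difference between the single-epiallele and two-epiallele log likelihoods against a chi-square tail. The following is a minimal standalone sketch of that step, not part of these patches; it assumes GSL is linked, and the helper name lrt_p_value and its df parameter are illustrative only.

#include <gsl/gsl_cdf.h>

// Given log likelihoods of a null (one epiallele) and an alternative
// (two epialleles) model, return the LRT p-value with df degrees of freedom.
static double
lrt_p_value(const double ll_null, const double ll_alt, const double df) {
  const double llr_stat = -2.0 * (ll_null - ll_alt);
  return 1.0 - gsl_cdf_chisq_P(llr_stat, df);
}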
index 2e18d62e..388966fa 100644 --- a/src/common/EpireadStats.hpp +++ b/src/common/EpireadStats.hpp @@ -17,20 +17,27 @@ #ifndef EPIREAD_STATS #define EPIREAD_STATS +#include #include +#include +#include #include -#include "Epiread.hpp" - struct small_epiread { uint32_t pos{}; std::string seq{}; - small_epiread(uint32_t p, std::string s): pos{p}, seq{s} {} + small_epiread(const std::uint32_t p, const std::string &s) : pos{p}, seq{s} {} - uint32_t end() const { return pos + std::size(seq); } + uint32_t + end() const { + return pos + std::size(seq); + } - uint32_t length() const { return std::size(seq); } + uint32_t + length() const { + return std::size(seq); + } }; double @@ -71,8 +78,8 @@ test_asm_bic(const size_t max_itr, const bool crct_for_read_count, std::vector &reads); struct EpireadStats { - double test_asm(std::vector &reads, - bool &is_significant) const { + double + test_asm(std::vector &reads, bool &is_significant) const { const double score = use_bic ? test_asm_bic(max_itr, crct_for_read_count, low_prob, high_prob, reads) : test_asm_lrt(max_itr, crct_for_read_count, From c2af9ddd8e7542d95cc33206110488e60d33aa6b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 027/106] src/common/Interval.cpp: changes to add static analysis --- src/common/Interval.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/Interval.cpp b/src/common/Interval.cpp index 3b0b6915..415c2c2a 100644 --- a/src/common/Interval.cpp +++ b/src/common/Interval.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include auto @@ -60,6 +61,7 @@ Interval::initialize(const char *c, const char *c_end) -> bool { const auto [ptr, ec] = std::from_chars(field_s, field_e, stop); failed = failed || ec != std::errc{}; } + // NOLINTEND(*-pointer-arithmetic) return !failed; } From a7af1574d47bf4beb1a5101f7777b30eb3296232 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 028/106] src/common/Interval6.cpp: changes to add static analysis --- src/common/Interval6.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/Interval6.cpp b/src/common/Interval6.cpp index 2a4fa1de..a966a483 100644 --- a/src/common/Interval6.cpp +++ b/src/common/Interval6.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include auto From 5aa469479ffab00e9af0b8560d5b68348019d277 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 029/106] src/common/LevelsCounter.cpp: changes to add static analysis --- src/common/LevelsCounter.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/common/LevelsCounter.cpp b/src/common/LevelsCounter.cpp index 7931a613..92004c45 100644 --- a/src/common/LevelsCounter.cpp +++ b/src/common/LevelsCounter.cpp @@ -14,6 +14,7 @@ */ #include "LevelsCounter.hpp" +#include "MSite.hpp" #include "bsutils.hpp" #include @@ -22,6 +23,7 @@ void LevelsCounter::update(const MSite &s) { + static constexpr auto half = 0.5; if (s.is_mutated()) { ++mutations; } @@ -31,10 +33,12 @@ LevelsCounter::update(const MSite &s) { total_c += s.n_meth(); total_t += s.n_reads - s.n_meth(); total_meth += s.meth; - double lower = 0.0, upper = 0.0; - wilson_ci_for_binomial(alpha, s.n_reads, s.meth, lower, upper); - called_meth += (lower > 0.5); - called_unmeth += (upper < 0.5); + double lower{}; + double upper{}; + wilson_ci_for_binomial(alpha, static_cast(s.n_reads), s.meth, lower, + upper); + called_meth += (lower > half); + 
called_unmeth += (upper < half); } ++total_sites; } @@ -87,9 +91,9 @@ LevelsCounter::format_row() const { << total_meth << '\t'; // derived values oss << coverage() << '\t' - << static_cast(sites_covered)/total_sites << '\t' - << static_cast(coverage())/total_sites << '\t' - << static_cast(coverage())/sites_covered << '\t' + << static_cast(sites_covered)/static_cast(total_sites) << '\t' + << static_cast(coverage())/static_cast(total_sites) << '\t' + << static_cast(coverage())/static_cast(sites_covered) << '\t' << (good ? mean_meth() : 0.0) << '\t' << (good ? mean_meth_weighted() : 0.0) << '\t' << (good ? fractional_meth() : 0.0); @@ -121,7 +125,7 @@ LevelsCounter::format_header() { return oss.str(); } -double LevelsCounter::alpha = 0.05; +double LevelsCounter::alpha = 0.05; // NOLINT(*-avoid-magic-numbers) std::ostream & operator<<(std::ostream &out, const LevelsCounter &cs) { @@ -129,7 +133,7 @@ operator<<(std::ostream &out, const LevelsCounter &cs) { } static void -check_label(const std::string &observed, const std::string expected) { +check_label(const std::string &observed, const std::string &expected) { if (observed != expected) throw std::runtime_error("bad levels format [" + observed + "," + expected + "]"); @@ -138,7 +142,7 @@ check_label(const std::string &observed, const std::string expected) { std::istream & operator>>(std::istream &in, LevelsCounter &cs) { in >> cs.context; // get the context - cs.context = cs.context.substr(0, cs.context.find_first_of(":")); + cs.context = cs.context.substr(0, cs.context.find_first_of(':')); std::string label; in >> label >> cs.total_sites; // the total sites From 9dcd30da5db29ac2aec498f1b7de1e55b657acfb Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 030/106] src/common/LevelsCounter.hpp: changes to add static analysis --- src/common/LevelsCounter.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/LevelsCounter.hpp b/src/common/LevelsCounter.hpp index 2ce520fc..f6c24e75 100644 --- a/src/common/LevelsCounter.hpp +++ b/src/common/LevelsCounter.hpp @@ -16,10 +16,11 @@ #ifndef LEVELS_COUNTER_HPP #define LEVELS_COUNTER_HPP -#include "MSite.hpp" - +#include +#include #include #include +struct MSite; struct LevelsCounter { From 17d0943cab2ba779e2bbb8aa891f59af8cb6ffe9 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 031/106] src/common/MSite.cpp: changes to add static analysis --- src/common/MSite.cpp | 65 ++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/src/common/MSite.cpp b/src/common/MSite.cpp index d4abd2fc..053addc0 100644 --- a/src/common/MSite.cpp +++ b/src/common/MSite.cpp @@ -15,23 +15,27 @@ #include "MSite.hpp" +#include "bamxx.hpp" +#include "counts_header.hpp" + #include #include -#include +#include +#include #include #include #include #include #include - -#include "counts_header.hpp" -#include "smithlab_utils.hpp" +#include using std::cbegin; using std::cend; using std::end; using std::find_if; using std::from_chars; +using std::ifstream; +using std::ios_base; using std::regex_match; using std::runtime_error; using std::string; @@ -45,6 +49,7 @@ MSite::initialize(const char *c, const char *c_end) { bool failed = false; + // NOLINTBEGIN(*-pointer-arithmetic) auto field_s = c; auto field_e = find_if(field_s + 1, c_end, is_sep); if (field_e == c_end) @@ -103,15 +108,19 @@ MSite::initialize(const char *c, const char *c_end) { // ADS: the value 
below would work for a flag // pos = std::numeric_limits::max(); + // NOLINTEND(*-pointer-arithmetic) + return !failed; } MSite::MSite(const string &line) { - if (!initialize(line.data(), line.data() + size(line))) + // NOLINTNEXTLINE(*-pointer-arithmetic) + if (!initialize(line.data(), line.data() + std::size(line))) throw runtime_error("bad count line: " + line); } MSite::MSite(const char *line, const int n) { + // NOLINTNEXTLINE(*-pointer-arithmetic) if (!initialize(line, line + n)) throw runtime_error("bad count line: " + string(line)); } @@ -130,12 +139,9 @@ distance(const MSite &a, const MSite &b) { : std::numeric_limits::max(); } -using std::ifstream; -using std::ios_base; - static void move_to_start_of_line(ifstream &in) { - char next; + char next{}; // move backwards by: one step forward, two steps back while (in.good() && in.get(next) && next != '\n') { in.unget(); @@ -150,16 +156,15 @@ move_to_start_of_line(ifstream &in) { void find_offset_for_msite(const std::string &chr, const size_t idx, std::ifstream &site_in) { - site_in.seekg(0, ios_base::beg); - const size_t begin_pos = site_in.tellg(); + const auto begin_pos = site_in.tellg(); site_in.seekg(0, ios_base::end); - const size_t end_pos = site_in.tellg(); + const auto end_pos = site_in.tellg(); if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos) / 2; + std::streamoff step_size = (end_pos - begin_pos) / 2; site_in.seekg(0, ios_base::beg); string low_chr; @@ -171,19 +176,18 @@ find_offset_for_msite(const std::string &chr, const size_t idx, site_in.seekg(-2, ios_base::end); move_to_start_of_line(site_in); string high_chr; - size_t high_idx; + size_t high_idx{}; if (!(site_in >> high_chr >> high_idx)) throw runtime_error("failed search in counts file"); - size_t pos = step_size; + std::streamoff pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + std::streamoff prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical while (step_size > 0 && prev_pos != pos) { - // track (mid) position in file to make sure it keeps moving prev_pos = pos; @@ -194,11 +198,13 @@ find_offset_for_msite(const std::string &chr, const size_t idx, // this check will never give a false indication of unsorted, but // might catch an unsorted file - if (mid_chr < low_chr || mid_chr > high_chr) + if (mid_chr < low_chr || mid_chr > high_chr) { + // NOLINTBEGIN(performance-inefficient-string-concatenation) throw runtime_error("chromosomes unsorted inside file: " "low=" + low_chr + ",mid=" + mid_chr + ",high=" + high_chr); - + // NOLINTEND(performance-inefficient-string-concatenation) + } step_size /= 2; // cut the range in half if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { @@ -222,16 +228,15 @@ void find_offset_for_msite( const std::unordered_map &chrom_order, const std::string &chr, const size_t idx, std::ifstream &site_in) { - site_in.seekg(0, ios_base::beg); - const size_t begin_pos = site_in.tellg(); + const auto begin_pos = site_in.tellg(); site_in.seekg(0, ios_base::end); - const size_t end_pos = site_in.tellg(); + const auto end_pos = site_in.tellg(); if (end_pos - begin_pos < 2) throw runtime_error("empty counts file"); - size_t step_size = (end_pos - begin_pos) / 2; + std::streamoff step_size = (end_pos - begin_pos) / 2; const auto chr_idx_itr = chrom_order.find(chr); if 
(chr_idx_itr == end(chrom_order)) { @@ -264,15 +269,14 @@ find_offset_for_msite( throw runtime_error("inconsistent chromosome order info"); auto high_chr_idx = high_chr_idx_itr->second; - size_t pos = step_size; + std::streamoff pos = step_size; site_in.seekg(pos, ios_base::beg); move_to_start_of_line(site_in); - size_t prev_pos = 0; // keep track of previous position in file + std::streamoff prev_pos = 0; // keep track of previous position in file // binary search inside sorted file on disk // iterate until step size is 0 or positions are identical while (step_size > 0 && prev_pos != pos) { - // track (mid) position in file to make sure it keeps moving prev_pos = pos; @@ -289,25 +293,28 @@ find_offset_for_msite( // this check will never give a false indication of unsorted, but // might catch an unsorted file using std::to_string; - if (mid_chr_idx < low_chr_idx || mid_chr_idx > high_chr_idx) + if (mid_chr_idx < low_chr_idx || mid_chr_idx > high_chr_idx) { + // NOLINTBEGIN(performance-inefficient-string-concatenation) throw runtime_error("chromosomes unsorted inside file: " "low=" + to_string(low_chr_idx) + ",mid=" + to_string(mid_chr_idx) + ",high=" + to_string(high_chr_idx)); + // NOLINTEND(performance-inefficient-string-concatenation) + } step_size /= 2; // cut the range in half if (chr_idx < mid_chr_idx || (chr_idx == mid_chr_idx && idx <= mid_idx)) { // move to the left high_chr_idx = mid_chr_idx; - high_idx = mid_idx; + // high_idx = mid_idx; pos -= step_size; } else { // move to the left low_chr_idx = mid_chr_idx; - low_idx = mid_idx; + // low_idx = mid_idx; pos += step_size; } site_in.seekg(pos, ios_base::beg); From 56bd937acb10018a3da6dc51f5da5bee7cced62b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 032/106] src/common/MSite.hpp: changes to add static analysis --- src/common/MSite.hpp | 99 ++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 44 deletions(-) diff --git a/src/common/MSite.hpp b/src/common/MSite.hpp index a74fc36a..159789ec 100644 --- a/src/common/MSite.hpp +++ b/src/common/MSite.hpp @@ -1,18 +1,17 @@ /* - Copyright (C) 2015-2022 University of Southern California - Andrew D Smith + Copyright (C) 2015-2025 Andrew D Smith Authors: Andrew D. Smith - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your option) + any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + This program is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. */ #ifndef MSITE_HPP @@ -20,14 +19,16 @@ #include +#include #include #include +#include +#include // IWYU pragma: keep #include #include #include struct MSite { - // This defaults to true, and when true MSite lines cannot be parsed if they // have additional columns in the file. 
static bool no_extra_fields; @@ -47,28 +48,37 @@ struct MSite { explicit MSite(const std::string &line); explicit MSite(const char *line, const int n); - bool + [[nodiscard]] bool initialize(const char *c, const char *c_end); - bool + [[nodiscard]] bool operator<(const MSite &other) const { - int r = chrom.compare(other.chrom); + const int r = chrom.compare(other.chrom); return (r < 0 || (r == 0 && (pos < other.pos || (pos == other.pos && strand < other.strand)))); } - size_t + [[nodiscard]] size_t n_meth() const { return std::round(meth * n_reads); } - size_t + + [[nodiscard]] size_t n_unmeth() const { return n_reads - n_meth(); } - ////////////////////////////////////////////////////////////// - /// FUNCTIONS BELOW ARE FOR MANIPULATING SYMMETRIC CPG SITES - ////////////////////////////////////////////////////////////// + [[nodiscard]] double + n_meth_f() const { + return meth * n_reads; + } + + [[nodiscard]] double + n_unmeth_f() const { + return n_reads - n_meth_f(); + } + + // functions below are for manipulating symmetric CpG sites void add(const MSite &other) { // ADS: possible that this function has specific behavior that @@ -85,41 +95,43 @@ struct MSite { // ADS: function below has redundant check for is_cpg, which is // expensive and might be ok to remove - bool + [[nodiscard]] bool is_mate_of(const MSite &first) { return (first.pos + 1 == pos && first.is_cpg() && is_cpg() && first.strand == '+' && strand == '-'); } - //////////////////////////////////////////////////////////////////////// - ///// Functions below test the type of site. These are CpG, CHH, and - ///// CHG divided into two kinds: CCG and CXG, the former including a - ///// CpG within. Also included is a function that tests if a site - ///// has a mutation. - //////////////////////////////////////////////////////////////////////// - bool + // Functions below test the type of site. These are CpG, CHH, and CHG + // divided into two kinds: CCG and CXG, the former including a CpG + // within. Also included is a function that tests if a site has a mutation. 
+ + [[nodiscard]] bool is_cpg() const { - return context.length() >= 3 && + return std::size(context) >= 3 && (context[0] == 'C' && context[1] == 'p' && context[2] == 'G'); } - bool + + [[nodiscard]] bool is_chh() const { - return context.length() >= 3 && + return std::size(context) >= 3 && (context[0] == 'C' && context[1] == 'H' && context[2] == 'H'); } - bool + + [[nodiscard]] bool is_ccg() const { - return context.length() >= 3 && + return std::size(context) >= 3 && (context[0] == 'C' && context[1] == 'C' && context[2] == 'G'); } - bool + + [[nodiscard]] bool is_cxg() const { - return context.length() >= 3 && + return std::size(context) >= 3 && (context[0] == 'C' && context[1] == 'X' && context[2] == 'G'); } - bool + + [[nodiscard]] bool is_mutated() const { - return context.length() == 4 && context[3] == 'x'; + return std::size(context) == 4 && context[3] == 'x'; } void @@ -127,13 +139,14 @@ struct MSite { if (!is_mutated()) context += 'x'; } + void set_unmutated() { if (is_mutated()) - context.resize(context.length() - 1); + context.resize(std::size(context) - 1); } - std::string + [[nodiscard]] std::string tostring() const; }; @@ -153,7 +166,7 @@ operator<<(T &out, const MSite &s) { return out; } -size_t +[[nodiscard]] size_t distance(const MSite &a, const MSite &b); // find the byte offset within the given file of the first site in the @@ -176,10 +189,8 @@ find_offset_for_msite( inline bamxx::bgzf_file & write_site(bamxx::bgzf_file &f, const MSite &s) { - // ADS: to slow?? - std::ostringstream oss; - oss << s.tostring() << '\n'; - f.write(oss.str()); + // ADS: too slow?? + f.write(s.tostring() + "\n"); return f; } @@ -196,7 +207,7 @@ read_site(bamxx::bgzf_file &f, MSite &s) { return f; } -bool +[[nodiscard]] bool is_msite_file(const std::string &file); #endif From 826f54f27f58a67e899dafc5a751eee8530c8160 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 033/106] src/common/Smoothing.cpp: changes to add static analysis --- src/common/Smoothing.cpp | 61 ++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/src/common/Smoothing.cpp b/src/common/Smoothing.cpp index e98007aa..d421a65f 100644 --- a/src/common/Smoothing.cpp +++ b/src/common/Smoothing.cpp @@ -21,31 +21,32 @@ #include "Smoothing.hpp" -#include +#include + #include +#include +#include #include #include -#include #include +#include -#include "smithlab_utils.hpp" +// NOLINTBEGIN -using std::vector; -using std::transform; using std::divides; using std::runtime_error; +using std::transform; +using std::vector; static double Epanechnikov_kernel(double i, double j, double bandwidth) { - const double u = (j - i)/bandwidth; - return 0.75*(1.0 - u*u); + const double u = (j - i) / bandwidth; + return 0.75 * (1.0 - u * u); // NOLINT(*-avoid-magic-numbers) } void -KernelSmoothing(const double bandwidth, - const vector &x_vals, - const vector &y_vals, - const vector &x_target, +KernelSmoothing(const double bandwidth, const vector &x_vals, + const vector &y_vals, const vector &x_target, vector &y_target) { assert(x_vals.size() == y_vals.size()); @@ -58,7 +59,6 @@ KernelSmoothing(const double bandwidth, // iterate over the x target vals for (size_t i = 0; i < x_target.size(); ++i) { - // calculate the x starting point while (x_start < x_vals.size() && x_vals[x_start] < x_target[i] - bandwidth) ++x_start; @@ -76,24 +76,22 @@ KernelSmoothing(const double bandwidth, // calculate the weights vector weights(lim); for (size_t j = 0; j < lim; ++j) - 
weights[j] = Epanechnikov_kernel(x_target[i], x_vals[x_start+j], bandwidth); + weights[j] = + Epanechnikov_kernel(x_target[i], x_vals[x_start + j], bandwidth); const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); transform(weights.begin(), weights.end(), weights.begin(), - [weight_sum] (const double w) {return w / weight_sum;}); + [weight_sum](const double w) { return w / weight_sum; }); // apply the weights y_target[i] = 0; for (size_t j = 0; j < lim; ++j) - y_target[i] += y_vals[x_start + j]*weights[j]; + y_target[i] += y_vals[x_start + j] * weights[j]; } } - - void KernelSmoothing(const double bandwidth, const vector &y_vals, vector &y_target) { - // allocate the space for the new y vals y_target.resize(y_vals.size(), 0); @@ -103,9 +101,8 @@ KernelSmoothing(const double bandwidth, const vector &y_vals, // iterate over the x target vals for (size_t i = 0; i < y_vals.size(); ++i) { - // calculate the x starting point - while (x_start < y_vals.size() && x_start < i - bandwidth) + while (x_start < y_vals.size() && x_start + bandwidth < i) ++x_start; // calculate the x ending point @@ -125,24 +122,20 @@ KernelSmoothing(const double bandwidth, const vector &y_vals, const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); transform(weights.begin(), weights.end(), weights.begin(), - [weight_sum] (const double w) {return w / weight_sum;}); + [weight_sum](const double w) { return w / weight_sum; }); // apply the weights y_target[i] = 0; for (size_t j = 0; j < lim; ++j) - y_target[i] += y_vals[x_start + j]*weights[j]; + y_target[i] += y_vals[x_start + j] * weights[j]; } } -#include - void -LocalLinearRegression(const double bandwidth, - const vector &x_vals, +LocalLinearRegression(const double bandwidth, const vector &x_vals, const vector &y_vals, const vector &x_target, vector &y_target) { - // Make sure the x and y vectors are of the same length assert(x_vals.size() == y_vals.size()); @@ -154,7 +147,6 @@ LocalLinearRegression(const double bandwidth, // iterate over the x target vals for (size_t i = 0; i < x_target.size(); ++i) { - // calculate the x starting point while (x_start < x_vals.size() && x_vals[x_start] < x_target[i] - bandwidth) ++x_start; @@ -172,16 +164,19 @@ LocalLinearRegression(const double bandwidth, // calculate the weights vector weights(lim); for (size_t j = 0; j < lim; ++j) - weights[j] = Epanechnikov_kernel(x_target[i], x_vals[x_start+j], bandwidth); + weights[j] = + Epanechnikov_kernel(x_target[i], x_vals[x_start + j], bandwidth); const double weight_sum = accumulate(weights.begin(), weights.end(), 0.0); transform(weights.begin(), weights.end(), weights.begin(), - [weight_sum] (const double w) {return w / weight_sum;}); + [weight_sum](const double w) { return w / weight_sum; }); double intercept = 0, slope = 0; double c00 = 0, c10 = 0, c11 = 0; double ssq = 0; - gsl_fit_wlinear(&x_vals[x_start], 1, &weights[0], 1, &y_vals[x_start], 1, lim, - &intercept, &slope, &c00, &c10, &c11, &ssq); - y_target[i] = intercept + slope*x_target[i]; + gsl_fit_wlinear(&x_vals[x_start], 1, &weights[0], 1, &y_vals[x_start], 1, + lim, &intercept, &slope, &c00, &c10, &c11, &ssq); + y_target[i] = intercept + slope * x_target[i]; } } + +// NOLINTEND From ddf5c1bcb62de6b412e55782da611fa67689432c Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 034/106] src/common/ThreeStateHMM.cpp: changes to add static analysis --- src/common/ThreeStateHMM.cpp | 301 ++++++++++++++++++----------------- 1 file changed, 154 
insertions(+), 147 deletions(-) diff --git a/src/common/ThreeStateHMM.cpp b/src/common/ThreeStateHMM.cpp index 1955e1af..1b3b8689 100644 --- a/src/common/ThreeStateHMM.cpp +++ b/src/common/ThreeStateHMM.cpp @@ -25,11 +25,16 @@ #include "BetaBin.hpp" #include "numerical_utils.hpp" +#include +#include #include +#include #include +#include +#include // IWYU pragma: keep #include -#include #include +#include using std::cerr; using std::endl; @@ -46,6 +51,8 @@ using std::setw; using std::string; using std::vector; +// NOLINTBEGIN(*-avoid-magic-numbers) + static STATE_LABELS max_state(const Triplet &likelihoods) { if (likelihoods.hypo >= max(likelihoods.HYPER, likelihoods.HYPO)) @@ -66,71 +73,65 @@ max_value(const Triplet &likelihoods) { return likelihoods.HYPO; } -ThreeStateHMM::ThreeStateHMM(vector> &_observations, - const vector &_reset_points, +ThreeStateHMM::ThreeStateHMM(vector> &observations, + const vector &reset_points, const double tol, const size_t max_itr, - const bool v) - : reset_points(_reset_points), - meth_lp(_observations.size()), - unmeth_lp(_observations.size()), - hypo_log_likelihood(_observations.size()), - HYPER_log_likelihood(_observations.size()), - HYPO_log_likelihood(_observations.size()), - forward(_observations.size()), - backward(_observations.size()), - hypo_posteriors(_observations.size()), - HYPER_posteriors(_observations.size()), - HYPO_posteriors(_observations.size()), - hypo_hypo(_observations.size()), - hypo_HYPER(_observations.size()), - HYPER_hypo(_observations.size()), - HYPER_HYPER(_observations.size()), - HYPER_HYPO(_observations.size()), - HYPO_HYPER(_observations.size()), - HYPO_HYPO(_observations.size()), - classes(_observations.size()), - state_posteriors(_observations.size()), - tolerance(tol), max_iterations(max_itr), VERBOSE(v) { - - std::swap(observations, _observations); - - for (size_t i = 0; i < observations.size(); ++i) { + const bool v) : + observations{observations}, reset_points{reset_points}, + meth_lp(std::size(observations)), unmeth_lp(std::size(observations)), + hypo_log_likelihood(std::size(observations)), + HYPER_log_likelihood(std::size(observations)), + HYPO_log_likelihood(std::size(observations)), + forward(std::size(observations)), backward(std::size(observations)), + hypo_posteriors(std::size(observations)), + HYPER_posteriors(std::size(observations)), + HYPO_posteriors(std::size(observations)), hypo_hypo(std::size(observations)), + hypo_HYPER(std::size(observations)), HYPER_hypo(std::size(observations)), + HYPER_HYPER(std::size(observations)), HYPER_HYPO(std::size(observations)), + HYPO_HYPER(std::size(observations)), HYPO_HYPO(std::size(observations)), + classes(std::size(observations)), state_posteriors(std::size(observations)), + tolerance(tol), max_iterations(max_itr), VERBOSE(v) { + static constexpr auto epsilon = 1.0e-02; + for (std::size_t i = 0; i < std::size(observations); ++i) { const double m = observations[i].first; const double u = observations[i].second; - meth_lp[i] = log(min(max(m / (m + u), 1e-2), 1.0 - 1e-2)); - unmeth_lp[i] = log(min(max(u / (m + u), 1e-2), 1.0 - 1e-2)); + meth_lp[i] = + std::log(std::min(std::max(m / (m + u), epsilon), 1.0 - epsilon)); + unmeth_lp[i] = + std::log(std::min(std::max(u / (m + u), epsilon), 1.0 - epsilon)); } } void -ThreeStateHMM::set_parameters(const betabin &_hypo_emission, - const betabin &_HYPER_emission, - const betabin &_HYPO_emission, - const vector> &_trans) { - hypo_emission = _hypo_emission; - HYPER_emission = _HYPER_emission; - HYPO_emission = _HYPO_emission; 
+ThreeStateHMM::set_parameters(const betabin &hypo_emission_in, + const betabin &HYPER_emission_in, + const betabin &HYPO_emission_in, + const vector> &trans_in) { + hypo_emission = hypo_emission_in; + HYPER_emission = HYPER_emission_in; + HYPO_emission = HYPO_emission_in; update_observation_likelihood(); - lp_start.hypo = log(0.5); - lp_start.HYPER = log(0.25); - lp_start.HYPO = log(0.25); + lp_start.hypo = std::log(0.5); + lp_start.HYPER = std::log(0.25); + lp_start.HYPO = std::log(0.25); - lp_end.hypo = log(1e-10); - lp_end.HYPER = log(1e-10); - lp_end.HYPO = log(1e-10); + lp_end.hypo = std::log(1e-10); + lp_end.HYPER = std::log(1e-10); + lp_end.HYPO = std::log(1e-10); - trans = _trans; + trans = trans_in; } void -ThreeStateHMM::get_parameters(betabin &_hypo_emission, betabin &_HYPER_emission, - betabin &_HYPO_emission, - vector> &_trans) const { - _hypo_emission = hypo_emission; - _HYPER_emission = HYPER_emission; - _HYPO_emission = HYPO_emission; - _trans = trans; +ThreeStateHMM::get_parameters(betabin &hypo_emission_out, + betabin &HYPER_emission_out, + betabin &HYPO_emission_out, + vector> &trans_out) const { + hypo_emission_out = hypo_emission; + HYPER_emission_out = HYPER_emission; + HYPO_emission_out = HYPO_emission; + trans_out = trans; } ////////////////////////////////////////////// @@ -178,9 +179,6 @@ ThreeStateHMM::HYPO_segment_log_likelihood(const size_t start, double ThreeStateHMM::forward_algorithm(const size_t start, const size_t end) { - ///// - // cerr << "check enter forward_algorithm: "<< "OK" << endl; - ///// for (size_t i = start; i < end; ++i) forward[i].hypo = forward[i].HYPER = forward[i].HYPO = 0.0; @@ -194,21 +192,21 @@ ThreeStateHMM::forward_algorithm(const size_t start, const size_t end) { for (size_t i = start + 1; i < end; ++i) { // hypomethylated CpG in HypoMR segment forward[i].hypo = - log_sum_log(forward[i - 1].hypo + log(trans[hypo][hypo]), - forward[i - 1].HYPER + log(trans[HYPER][hypo])) + + log_sum_log(forward[i - 1].hypo + std::log(trans[hypo][hypo]), + forward[i - 1].HYPER + std::log(trans[HYPER][hypo])) + hypo_segment_log_likelihood(i, i + 1); // hypermethylated CpG in HyperMR segment forward[i].HYPER = - log_sum_log(forward[i - 1].hypo + log(trans[hypo][HYPER]), - forward[i - 1].HYPER + log(trans[HYPER][HYPER]), - forward[i - 1].HYPO + log(trans[HYPO][HYPER])) + + log_sum_log(forward[i - 1].hypo + std::log(trans[hypo][HYPER]), + forward[i - 1].HYPER + std::log(trans[HYPER][HYPER]), + forward[i - 1].HYPO + std::log(trans[HYPO][HYPER])) + HYPER_segment_log_likelihood(i, i + 1); // hypomethylated CpG in HyperMR segment forward[i].HYPO = - log_sum_log(forward[i - 1].HYPER + log(trans[HYPER][HYPO]), - forward[i - 1].HYPO + log(trans[HYPO][HYPO])) + + log_sum_log(forward[i - 1].HYPER + std::log(trans[HYPER][HYPO]), + forward[i - 1].HYPO + std::log(trans[HYPO][HYPO])) + HYPO_segment_log_likelihood(i, i + 1); } @@ -219,11 +217,8 @@ ThreeStateHMM::forward_algorithm(const size_t start, const size_t end) { double ThreeStateHMM::backward_algorithm(const size_t start, const size_t end) { - // ///// - // cerr << "check backward_algorithm: "<< "OK" << endl; - // ///// - - const int start_int(start), end_int(end); + const int start_int = static_cast(start); + const int end_int = static_cast(end); for (size_t i = start; i < end; ++i) backward[i].hypo = backward[i].HYPER = backward[i].HYPO = 0.0; @@ -235,25 +230,25 @@ ThreeStateHMM::backward_algorithm(const size_t start, const size_t end) { for (int i = end_int - 2; i >= start_int; --i) { // i in hypo-methylated 
state of HypoMR backward[i].hypo = log_sum_log( - log(trans[hypo][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + + std::log(trans[hypo][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].hypo, - log(trans[hypo][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + - backward[i + 1].HYPER); + std::log(trans[hypo][HYPER]) + + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER); // i in hyper-methylated state of HyperMR backward[i].HYPER = log_sum_log( - log(trans[HYPER][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + + std::log(trans[HYPER][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].hypo, - log(trans[HYPER][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + - backward[i + 1].HYPER, - log(trans[HYPER][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + + std::log(trans[HYPER][HYPER]) + + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER, + std::log(trans[HYPER][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPO); // i in hypo-methylated state of HyperMR backward[i].HYPO = log_sum_log( - log(trans[HYPO][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + - backward[i + 1].HYPER, - log(trans[HYPO][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + + std::log(trans[HYPO][HYPER]) + + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER, + std::log(trans[HYPO][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPO); } @@ -272,7 +267,6 @@ ThreeStateHMM::backward_algorithm(const size_t start, const size_t end) { // Expectation void ThreeStateHMM::estimate_state_posterior(const size_t start, const size_t end) { - vector hypo_evidence(end - start, 0), HYPER_evidence(end - start, 0), HYPO_evidence(end - start, 0); @@ -288,19 +282,21 @@ ThreeStateHMM::estimate_state_posterior(const size_t start, const size_t end) { denom = log_sum_log(hypo_evidence[i - start], HYPER_evidence[i - start], HYPO_evidence[i - start]); - if (i > start) assert(fabs(exp(prev_denom - denom) - 1) < 1e-6); + if (i > start) + assert(fabs(expm1(prev_denom - denom)) < 1e-6); #ifndef NDEBUG prev_denom = denom; #endif } for (size_t i = start; i < end; ++i) { - hypo_posteriors[i] = exp(hypo_evidence[i - start] - denom); - HYPER_posteriors[i] = exp(HYPER_evidence[i - start] - denom); - HYPO_posteriors[i] = exp(HYPO_evidence[i - start] - denom); + hypo_posteriors[i] = std::exp(hypo_evidence[i - start] - denom); + HYPER_posteriors[i] = std::exp(HYPER_evidence[i - start] - denom); + HYPO_posteriors[i] = std::exp(HYPO_evidence[i - start] - denom); // renormalize the probabilities - const double sum = hypo_posteriors[i] + HYPER_posteriors[i] + HYPO_posteriors[i]; + const double sum = + hypo_posteriors[i] + HYPER_posteriors[i] + HYPO_posteriors[i]; hypo_posteriors[i] /= sum; HYPER_posteriors[i] /= sum; HYPO_posteriors[i] /= sum; @@ -318,45 +314,47 @@ ThreeStateHMM::estimate_posterior_trans_prob(const size_t start, forward[start].HYPO + backward[start].HYPO); for (size_t i = start; i < end - 1; ++i) { - hypo_hypo[i] = forward[i].hypo + log(trans[hypo][hypo]) + + hypo_hypo[i] = forward[i].hypo + std::log(trans[hypo][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].hypo - denom; - hypo_HYPER[i] = forward[i].hypo + log(trans[hypo][HYPER]) + + hypo_HYPER[i] = forward[i].hypo + std::log(trans[hypo][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER - denom; - HYPER_hypo[i] = forward[i].HYPER + log(trans[HYPER][hypo]) + + HYPER_hypo[i] = 
forward[i].HYPER + std::log(trans[HYPER][hypo]) + hypo_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].hypo - denom; - HYPER_HYPER[i] = forward[i].HYPER + log(trans[HYPER][HYPER]) + + HYPER_HYPER[i] = forward[i].HYPER + std::log(trans[HYPER][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER - denom; - HYPER_HYPO[i] = forward[i].HYPER + log(trans[HYPER][HYPO]) + + HYPER_HYPO[i] = forward[i].HYPER + std::log(trans[HYPER][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPO - denom; - HYPO_HYPER[i] = forward[i].HYPO + log(trans[HYPO][HYPER]) + + HYPO_HYPER[i] = forward[i].HYPO + std::log(trans[HYPO][HYPER]) + HYPER_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPER - denom; - HYPO_HYPO[i] = forward[i].HYPO + log(trans[HYPO][HYPO]) + + HYPO_HYPO[i] = forward[i].HYPO + std::log(trans[HYPO][HYPO]) + HYPO_segment_log_likelihood(i + 1, i + 2) + backward[i + 1].HYPO - denom; - double sum = exp(hypo_hypo[i]) + exp(hypo_HYPER[i]) + exp(HYPER_hypo[i]) + - exp(HYPER_HYPER[i]) + exp(HYPER_HYPO[i]) + exp(HYPO_HYPER[i]) + - exp(HYPO_HYPO[i]); - - hypo_hypo[i] -= log(sum); - hypo_HYPER[i] -= log(sum); - HYPER_hypo[i] -= log(sum); - HYPER_HYPER[i] -= log(sum); - HYPER_HYPO[i] -= log(sum); - HYPO_HYPER[i] -= log(sum); - HYPO_HYPO[i] -= log(sum); - - assert(fabs(exp(hypo_hypo[i]) + exp(hypo_HYPER[i]) + exp(HYPER_hypo[i]) + - exp(HYPER_HYPER[i]) + exp(HYPER_HYPO[i]) + exp(HYPO_HYPER[i]) + - exp(HYPO_HYPO[i]) - 1) < 1e-3); + double sum = std::exp(hypo_hypo[i]) + std::exp(hypo_HYPER[i]) + + std::exp(HYPER_hypo[i]) + std::exp(HYPER_HYPER[i]) + + std::exp(HYPER_HYPO[i]) + std::exp(HYPO_HYPER[i]) + + std::exp(HYPO_HYPO[i]); + + hypo_hypo[i] -= std::log(sum); + hypo_HYPER[i] -= std::log(sum); + HYPER_hypo[i] -= std::log(sum); + HYPER_HYPER[i] -= std::log(sum); + HYPER_HYPO[i] -= std::log(sum); + HYPO_HYPER[i] -= std::log(sum); + HYPO_HYPO[i] -= std::log(sum); + + assert(fabs(std::exp(hypo_hypo[i]) + std::exp(hypo_HYPER[i]) + + std::exp(HYPER_hypo[i]) + std::exp(HYPER_HYPER[i]) + + std::exp(HYPER_HYPO[i]) + std::exp(HYPO_HYPER[i]) + + std::exp(HYPO_HYPO[i]) - 1) < 1e-3); } } @@ -376,28 +374,28 @@ ThreeStateHMM::estimate_parameters() { HYPER_emission.fit(meth_lp, unmeth_lp, HYPER_posteriors); const double sum_hypo_hypo = - exp(log_sum_log(hypo_hypo.begin(), hypo_hypo.end())); + std::exp(log_sum_log(hypo_hypo.begin(), hypo_hypo.end())); const double sum_hypo_HYPER = - exp(log_sum_log(hypo_HYPER.begin(), hypo_HYPER.end())); + std::exp(log_sum_log(hypo_HYPER.begin(), hypo_HYPER.end())); const double sum_hypo = sum_hypo_hypo + sum_hypo_HYPER; trans[hypo][hypo] = sum_hypo_hypo / sum_hypo; trans[hypo][HYPER] = sum_hypo_HYPER / sum_hypo; const double sum_HYPER_hypo = - exp(log_sum_log(begin(HYPER_hypo), end(HYPER_hypo))); + std::exp(log_sum_log(begin(HYPER_hypo), end(HYPER_hypo))); const double sum_HYPER_HYPER = - exp(log_sum_log(begin(HYPER_HYPER), end(HYPER_HYPER))); + std::exp(log_sum_log(begin(HYPER_HYPER), end(HYPER_HYPER))); const double sum_HYPER_HYPO = - exp(log_sum_log(begin(HYPER_HYPO), end(HYPER_HYPO))); + std::exp(log_sum_log(begin(HYPER_HYPO), end(HYPER_HYPO))); const double sum_HYPER = sum_HYPER_hypo + sum_HYPER_HYPER + sum_HYPER_HYPO; trans[HYPER][hypo] = sum_HYPER_hypo / sum_HYPER; trans[HYPER][HYPER] = sum_HYPER_HYPER / sum_HYPER; trans[HYPER][HYPO] = sum_HYPER_HYPO / sum_HYPER; const double sum_HYPO_HYPER = - exp(log_sum_log(begin(HYPO_HYPER), end(HYPO_HYPER))); + std::exp(log_sum_log(begin(HYPO_HYPER), end(HYPO_HYPER))); const double 
sum_HYPO_HYPO = - exp(log_sum_log(begin(HYPO_HYPO), HYPO_HYPO.end())); + std::exp(log_sum_log(begin(HYPO_HYPO), HYPO_HYPO.end())); const double sum_HYPO = sum_HYPO_HYPER + sum_HYPO_HYPO; trans[HYPO][HYPER] = sum_HYPO_HYPER / sum_HYPO; trans[HYPO][HYPO] = sum_HYPO_HYPO / sum_HYPO; @@ -415,7 +413,7 @@ ThreeStateHMM::single_iteration() { #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1]); + backward_algorithm(reset_points[i], reset_points[i + 1]); assert(fabs((forward_score - backward_score) / max(forward_score, backward_score)) < 1e-10); @@ -430,8 +428,8 @@ ThreeStateHMM::single_iteration() { double ThreeStateHMM::BaumWelchTraining() { - - if (VERBOSE) cerr << "Baum-Welch Training" << endl; + if (VERBOSE) + cerr << "Baum-Welch Training\n"; double prev_total = -numeric_limits::max(); @@ -446,13 +444,13 @@ ThreeStateHMM::BaumWelchTraining() { if (VERBOSE) { cerr << "Itration: " << setw(2) << i + 1 << ";\t" << "Log-Likelihood: " << total << ";\t" - << "Delta: " << (total - prev_total) / fabs(total) << endl + << "Delta: " << (total - prev_total) / fabs(total) << '\n' << "hypo: " << old_hypo_emission.tostring() << ";\t" << "HYPER: " << old_HYPER_emission.tostring() << ";\t" - << "HYPO: " << old_HYPO_emission.tostring() << endl; + << "HYPO: " << old_HYPO_emission.tostring() << '\n'; cerr << setw(5) << "" << setw(10) << "hypo" << setw(10) << "HYPER" - << setw(10) << "HYPO" << endl; + << setw(10) << "HYPO\n"; for (size_t r = 0; r < 3; ++r) { switch (r) { case 0: @@ -466,10 +464,11 @@ ThreeStateHMM::BaumWelchTraining() { break; } - for (size_t c = 0; c < 3; ++c) cerr << setw(10) << old_trans[r][c]; - cerr << endl; + for (size_t c = 0; c < 3; ++c) + cerr << setw(10) << old_trans[r][c]; + cerr << '\n'; } - cerr << endl; + cerr << '\n'; } if ((total - prev_total) / fabs(total) < tolerance) { @@ -479,7 +478,8 @@ ThreeStateHMM::BaumWelchTraining() { update_observation_likelihood(); trans = old_trans; - if (VERBOSE) cerr << "CONVERGED" << endl << endl; + if (VERBOSE) + cerr << "CONVERGED\n\n"; break; } prev_total = total; @@ -501,7 +501,7 @@ ThreeStateHMM::PosteriorDecoding() { #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1]); + backward_algorithm(reset_points[i], reset_points[i + 1]); assert(fabs((forward_score - backward_score) / max(forward_score, backward_score)) < 1e-10); @@ -522,7 +522,8 @@ ThreeStateHMM::PosteriorDecoding() { double ThreeStateHMM::ViterbiDecoding(const size_t start, const size_t end) { - if (start >= end) throw runtime_error("Invalid HMM sequence indices"); + if (start >= end) + throw runtime_error("Invalid HMM sequence indices"); const size_t lim = end - start; @@ -540,50 +541,55 @@ ThreeStateHMM::ViterbiDecoding(const size_t start, const size_t end) { for (size_t i = 1; i < lim; ++i) { // hypo: - const double hypo_hypo = llh[i - 1].hypo + log(trans[hypo][hypo]); - const double HYPER_hypo = llh[i - 1].HYPER + log(trans[HYPER][hypo]); - if (hypo_hypo > HYPER_hypo) { + const double hypo_hypo_var = llh[i - 1].hypo + std::log(trans[hypo][hypo]); + const double HYPER_hypo_var = + llh[i - 1].HYPER + std::log(trans[HYPER][hypo]); + if (hypo_hypo_var > HYPER_hypo_var) { llh[i].hypo = - hypo_hypo + hypo_segment_log_likelihood(start + i, start + i + 1); + hypo_hypo_var + hypo_segment_log_likelihood(start + i, start + i + 1); trace_hypo[i] = hypo; } else { llh[i].hypo = - HYPER_hypo + hypo_segment_log_likelihood(start + i, start + i + 1); + HYPER_hypo_var + 
hypo_segment_log_likelihood(start + i, start + i + 1); trace_hypo[i] = HYPER; } // HYPER - const double hypo_HYPER = llh[i - 1].hypo + log(trans[hypo][HYPER]); - const double HYPER_HYPER = llh[i - 1].HYPER + log(trans[HYPER][HYPER]); - const double HYPO_HYPER = llh[i - 1].HYPER + log(trans[HYPO][HYPER]); - if (hypo_HYPER >= max(HYPER_HYPER, HYPO_HYPER)) { + const double hypo_HYPER_var = + llh[i - 1].hypo + std::log(trans[hypo][HYPER]); + const double HYPER_HYPER_var = + llh[i - 1].HYPER + std::log(trans[HYPER][HYPER]); + const double HYPO_HYPER_var = + llh[i - 1].HYPER + std::log(trans[HYPO][HYPER]); + if (hypo_HYPER_var >= max(HYPER_HYPER_var, HYPO_HYPER_var)) { llh[i].HYPER = - hypo_HYPER + HYPER_segment_log_likelihood(start + i, start + i + 1); + hypo_HYPER_var + HYPER_segment_log_likelihood(start + i, start + i + 1); trace_HYPER[i] = hypo; } - else if (HYPER_HYPER >= max(hypo_HYPER, HYPO_HYPER)) { - llh[i].HYPER = - HYPER_HYPER + HYPER_segment_log_likelihood(start + i, start + i + 1); + else if (HYPER_HYPER_var >= max(hypo_HYPER_var, HYPO_HYPER_var)) { + llh[i].HYPER = HYPER_HYPER_var + + HYPER_segment_log_likelihood(start + i, start + i + 1); trace_HYPER[i] = HYPER; } else { llh[i].HYPER = - HYPO_HYPER + HYPER_segment_log_likelihood(start + i, start + i + 1); + HYPO_HYPER_var + HYPER_segment_log_likelihood(start + i, start + i + 1); trace_HYPER[i] = HYPO; } // HYPO - const double HYPER_HYPO = llh[i - 1].HYPER + log(trans[HYPER][HYPO]); - const double HYPO_HYPO = llh[i - 1].HYPO + log(trans[HYPO][HYPO]); - if (HYPER_HYPO > HYPO_HYPO) { + const double HYPER_HYPO_var = + llh[i - 1].HYPER + std::log(trans[HYPER][HYPO]); + const double HYPO_HYPO_var = llh[i - 1].HYPO + std::log(trans[HYPO][HYPO]); + if (HYPER_HYPO_var > HYPO_HYPO_var) { llh[i].HYPO = - HYPER_HYPO + HYPO_segment_log_likelihood(start + i, start + i + 1); + HYPER_HYPO_var + HYPO_segment_log_likelihood(start + i, start + i + 1); trace_HYPO[i] = HYPER; } else { llh[i].HYPO = - HYPO_HYPO + HYPO_segment_log_likelihood(start + i, start + i + 1); + HYPO_HYPO_var + HYPO_segment_log_likelihood(start + i, start + i + 1); trace_HYPO[i] = HYPO; } } @@ -601,16 +607,15 @@ ThreeStateHMM::ViterbiDecoding(const size_t start, const size_t end) { case HYPER: curr = trace_HYPER[lim - i - 1]; break; - case HYPO:; - break; + case HYPO: curr = trace_HYPO[lim - i - 1]; break; } } reverse(inner_ml_classes.begin(), inner_ml_classes.end()); - copy(inner_ml_classes.begin(), inner_ml_classes.end(), - classes.begin() + start); + std::copy(cbegin(inner_ml_classes), std::cend(inner_ml_classes), + std::begin(classes) + static_cast(start)); return max_value(llh.back()); } @@ -635,3 +640,5 @@ void ThreeStateHMM::get_classes(vector &ml_classes) const { ml_classes = classes; } + +// NOLINTEND(*-avoid-magic-numbers) From 441fb53b9af927a562247ee2c3f4a9b56fa45e09 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 035/106] src/common/ThreeStateHMM.hpp: changes to add static analysis --- src/common/ThreeStateHMM.hpp | 100 ++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/src/common/ThreeStateHMM.hpp b/src/common/ThreeStateHMM.hpp index 490d8e5a..ec95cd04 100644 --- a/src/common/ThreeStateHMM.hpp +++ b/src/common/ThreeStateHMM.hpp @@ -20,91 +20,105 @@ #ifndef THREE_STATE_HMM_HPP #define THREE_STATE_HMM_HPP -#include +#include "BetaBin.hpp" + +#include +#include +#include // IWYU pragma: keep #include -#include #include -#include "smithlab_utils.hpp" -#include 
"BetaBin.hpp" - -enum STATE_LABELS {hypo, HYPER, HYPO}; +enum STATE_LABELS { hypo, HYPER, HYPO }; -struct Triplet {double hypo, HYPER, HYPO;}; +struct Triplet { + double hypo, HYPER, HYPO; +}; class ThreeStateHMM { public: - ThreeStateHMM(std::vector> &obs, - const std::vector &res, - const double tol, const size_t max_itr, const bool v); + const std::vector &res, const double tol, + const size_t max_itr, const bool v); void - set_parameters(const betabin & hypo_em, - const betabin & HYPER_em, - const betabin & HYPO_em, + set_parameters(const betabin &hypo_em, const betabin &HYPER_em, + const betabin &HYPO_em, const std::vector> &tr); void - get_parameters(betabin & hypo_em, - betabin & HYPER_em, - betabin & HYPO_em, - std::vector > &tr) const; + get_parameters(betabin &hypo_em, betabin &HYPER_em, betabin &HYPO_em, + std::vector> &tr) const; - double BaumWelchTraining(); + double + BaumWelchTraining(); - double PosteriorDecoding(); + double + PosteriorDecoding(); - double ViterbiDecoding(); + double + ViterbiDecoding(); - void get_state_posteriors(std::vector &scores) const; + void + get_state_posteriors(std::vector &scores) const; - void get_classes(std::vector &classes) const; + void + get_classes(std::vector &classes) const; // private: //////////// methods //////////// - double single_iteration(); - double forward_algorithm(const size_t start, const size_t end); - double backward_algorithm(const size_t start, const size_t end); - double hypo_segment_log_likelihood(const size_t start, const size_t end); - double HYPER_segment_log_likelihood(const size_t start, const size_t end); - double HYPO_segment_log_likelihood(const size_t start, const size_t end); + double + single_iteration(); + double + forward_algorithm(const size_t start, const size_t end); + double + backward_algorithm(const size_t start, const size_t end); + double + hypo_segment_log_likelihood(const size_t start, const size_t end); + double + HYPER_segment_log_likelihood(const size_t start, const size_t end); + double + HYPO_segment_log_likelihood(const size_t start, const size_t end); - void estimate_state_posterior(const size_t start, const size_t end); - void estimate_posterior_trans_prob(const size_t start, const size_t end); - void estimate_parameters(); - void update_observation_likelihood(); + void + estimate_state_posterior(const size_t start, const size_t end); + void + estimate_posterior_trans_prob(const size_t start, const size_t end); + void + estimate_parameters(); + void + update_observation_likelihood(); - double ViterbiDecoding(const size_t start, const size_t end); + double + ViterbiDecoding(const size_t start, const size_t end); - //////// data //////// std::vector> observations; std::vector reset_points; std::vector meth_lp, unmeth_lp; - std::vector hypo_log_likelihood, HYPER_log_likelihood, HYPO_log_likelihood; + std::vector hypo_log_likelihood, HYPER_log_likelihood, + HYPO_log_likelihood; // HMM internal data betabin hypo_emission, HYPER_emission, HYPO_emission; - Triplet lp_start, lp_end; - std::vector > trans; + Triplet lp_start{}; + Triplet lp_end{}; + std::vector> trans; std::vector forward; std::vector backward; std::vector hypo_posteriors, HYPER_posteriors, HYPO_posteriors; - std::vector hypo_hypo, hypo_HYPER, - HYPER_hypo, HYPER_HYPER, HYPER_HYPO, - HYPO_HYPER, HYPO_HYPO; + std::vector hypo_hypo, hypo_HYPER, HYPER_hypo, HYPER_HYPER, + HYPER_HYPO, HYPO_HYPER, HYPO_HYPO; // result std::vector classes; std::vector state_posteriors; // parameters - double tolerance; - size_t max_iterations; - bool 
VERBOSE; + double tolerance{}; + size_t max_iterations{}; + bool VERBOSE{}; }; #endif From 47b12824e60c705bd01e2708bdc1700145ccfd1f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 036/106] src/common/TwoStateHMM.cpp: changes to add static analysis --- src/common/TwoStateHMM.cpp | 683 +++++++++++++++++-------------------- 1 file changed, 318 insertions(+), 365 deletions(-) diff --git a/src/common/TwoStateHMM.cpp b/src/common/TwoStateHMM.cpp index 7241145d..b202dfd5 100644 --- a/src/common/TwoStateHMM.cpp +++ b/src/common/TwoStateHMM.cpp @@ -14,40 +14,47 @@ */ #include "TwoStateHMM.hpp" +#include "smithlab_utils.hpp" -#include -#include -#include -#include -#include - -#include #include +#include -#include "smithlab_utils.hpp" +#include +#include +#include +#include +#include +#include // IWYU pragma: keep +#include +#include +#include +#include +#include -using std::vector; -using std::pair; -using std::setw; -using std::max; -using std::min; using std::abs; using std::cerr; -using std::endl; -using std::string; -using std::setprecision; using std::isfinite; using std::make_pair; +using std::max; +using std::min; +using std::pair; +using std::setprecision; +using std::setw; +using std::string; +using std::vector; using smithlab::log_sum_log_vec; struct TwoStateBetaBin { TwoStateBetaBin(const double a, const double b) : alpha(a), beta(b), lnbeta_helper(gsl_sf_lnbeta(a, b)) {} - double operator()(const pair &val) const; - void fit(const vector &vals_a, const vector &vals_b, - const vector &p); - string tostring() const; + double + operator()(const pair &val) const; + void + fit(const vector &vals_a, const vector &vals_b, + const vector &p); + string + tostring() const; double alpha; double beta; double lnbeta_helper; @@ -56,8 +63,8 @@ struct TwoStateBetaBin { string TwoStateBetaBin::tostring() const { std::ostringstream os; - os << std::fixed << setprecision(3) << alpha << " " - << std::fixed << setprecision(3) << beta; + os << std::fixed << setprecision(3) << alpha << " " << std::fixed + << setprecision(3) << beta; return os.str(); } @@ -78,29 +85,31 @@ inline static double invpsi(const double tolerance, const double x) { double L = 1.0, Y = std::exp(x); while (L > tolerance) { - Y += L*sign(x - gsl_sf_psi(Y)); - L /= 2.0; + Y += L * sign(x - gsl_sf_psi(Y)); + L /= 2.0; // NOLINT(*-avoid-magic-numbers) } return Y; } inline static double movement(const double curr, const double prev) { - return std::abs(curr - prev)/max(std::abs(curr), std::abs(prev)); + return std::abs(curr - prev) / max(std::abs(curr), std::abs(prev)); } void TwoStateBetaBin::fit(const vector &vals_a, const vector &vals_b, - const vector &p) { + const vector &p) { + static constexpr auto initial_param_vals = 0.01; const double p_total = std::accumulate(begin(p), end(p), 0.0); const double alpha_rhs = - inner_product(begin(vals_a), end(vals_a), begin(p), 0.0)/p_total; + inner_product(begin(vals_a), end(vals_a), begin(p), 0.0) / p_total; const double beta_rhs = - inner_product(begin(vals_b), end(vals_b), begin(p), 0.0)/p_total; + inner_product(begin(vals_b), end(vals_b), begin(p), 0.0) / p_total; double prev_alpha = 0.0, prev_beta = 0.0; - alpha = beta = 0.01; + alpha = beta = initial_param_vals; + while (movement(alpha, prev_alpha) > tolerance && movement(beta, prev_beta) > tolerance) { prev_alpha = alpha; @@ -117,99 +126,98 @@ TwoStateBetaBin::fit(const vector &vals_a, const vector &vals_b, static inline double log_sum_log(const double p, const double q) { - if (p == 0.0) 
{return q;} - else if (q == 0.0) {return p;} + if (p == 0.0) { + return q; + } + else if (q == 0.0) { + return p; + } const double larger = (p > q) ? p : q; const double smaller = (p > q) ? q : p; - return larger + log(1.0 + exp(smaller - larger)); + return larger + std::log1p(std::exp(smaller - larger)); } static double log_sum_log_vec(const vector &vals, size_t a, size_t b) { - auto x = std::max_element(begin(vals) + a, begin(vals) + b); + auto x = std::max_element(std::cbegin(vals) + static_cast(a), + std::cbegin(vals) + static_cast(b)); const double max_val = *x; - const size_t max_idx = x - begin(vals); + const size_t max_idx = std::distance(std::cbegin(vals), x); double sum = 1.0; for (size_t i = a; i < b; ++i) { if (i != max_idx) sum += exp(vals[i] - max_val); } - return max_val + log(sum); + return max_val + std::log(sum); } static double log_sum_log_vec(const vector &vals, const vector &resets) { - vector w(resets.size() - 1); - for (size_t i = 0; i < resets.size() - 1; ++i) - w[i] = log_sum_log_vec(vals, resets[i], resets[i+1] - 1); - return log_sum_log_vec(w, w.size()); + vector w(std::size(resets) - 1); + for (size_t i = 0; i + 1 < std::size(resets); ++i) + w[i] = log_sum_log_vec(vals, resets[i], resets[i + 1] - 1); + return log_sum_log_vec(w, std::size(w)); } - //////////////////////////////////////////////////////////////////////// /////////// WRAPPER FUNCTIONS //////////////////////////////////////////////////////////////////////// double -TwoStateHMM::ViterbiDecoding(const vector > &values, +TwoStateHMM::ViterbiDecoding(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, vector &classes) const { - const TwoStateBetaBin fg_distro(fg_alpha, fg_beta); const TwoStateBetaBin bg_distro(bg_alpha, bg_beta); - return ViterbiDecoding(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro, classes); + return ViterbiDecoding(values, reset_points, p_fb, p_bf, fg_distro, bg_distro, + classes); } - double -TwoStateHMM::PosteriorDecoding(const vector > &values, +TwoStateHMM::PosteriorDecoding(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, vector &classes, vector &posteriors) const { - const TwoStateBetaBin fg_distro(fg_alpha, fg_beta); const TwoStateBetaBin bg_distro(bg_alpha, bg_beta); - return PosteriorDecoding(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro, classes, posteriors); + return PosteriorDecoding(values, reset_points, p_fb, p_bf, fg_distro, + bg_distro, classes, posteriors); } void -TwoStateHMM::PosteriorScores(const vector > &values, +TwoStateHMM::PosteriorScores(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, const bool fg_class, vector &posteriors) const { - const TwoStateBetaBin fg_distro(fg_alpha, fg_beta); const TwoStateBetaBin bg_distro(bg_alpha, bg_beta); - PosteriorScores(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro, fg_class, posteriors); + PosteriorScores(values, reset_points, p_fb, p_bf, fg_distro, bg_distro, + fg_class, posteriors); } double -TwoStateHMM::BaumWelchTraining(const std::vector > &values, +TwoStateHMM::BaumWelchTraining(const std::vector> &values, const std::vector &reset_points, - double &p_fb, double &p_bf, - double &fg_alpha, double &fg_beta, 
- double &bg_alpha, double &bg_beta) const { - + double &p_fb, double &p_bf, double &fg_alpha, + double &fg_beta, double &bg_alpha, + double &bg_beta) const { TwoStateBetaBin fg_distro(fg_alpha, fg_beta); TwoStateBetaBin bg_distro(bg_alpha, bg_beta); - const double score = BaumWelchTraining(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro); + const double score = + BaumWelchTraining(values, reset_points, p_fb, p_bf, fg_distro, bg_distro); fg_alpha = fg_distro.alpha; fg_beta = fg_distro.beta; bg_alpha = bg_distro.alpha; @@ -223,41 +231,37 @@ TwoStateHMM::BaumWelchTraining(const std::vector > &values, //////////////////////////////////////////////////////////////////////// static void -get_emissions(vector >::const_iterator v, - const vector >::const_iterator v_end, +get_emissions(vector>::const_iterator v, + const vector>::const_iterator v_end, vector::iterator emit, const TwoStateBetaBin &distr) { while (v != v_end) *emit++ = distr(*v++); } - static double -forward_algorithm(const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, +forward_algorithm(const size_t start, const size_t end, const double lp_sf, + const double lp_sb, const double lp_ff, const double lp_fb, const double lp_bf, const double lp_bb, const vector &fg_emit, const vector &bg_emit, - vector > &f) { + vector> &f) { f[start].first = fg_emit[start] + lp_sf; f[start].second = bg_emit[start] + lp_sb; for (size_t i = start + 1; i < end; ++i) { const size_t k = i - 1; - f[i].first = fg_emit[i] + log_sum_log(f[k].first + lp_ff, - f[k].second + lp_bf); - f[i].second = bg_emit[i] + log_sum_log(f[k].first + lp_fb, - f[k].second + lp_bb); + f[i].first = + fg_emit[i] + log_sum_log(f[k].first + lp_ff, f[k].second + lp_bf); + f[i].second = + bg_emit[i] + log_sum_log(f[k].first + lp_fb, f[k].second + lp_bb); } return log_sum_log(f[end - 1].first, f[end - 1].second); } static double -backward_algorithm(const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, +backward_algorithm(const size_t start, const size_t end, const double lp_sf, + const double lp_sb, const double lp_ff, const double lp_fb, const double lp_bf, const double lp_bb, - const vector &fg_emit, - const vector &bg_emit, - vector > &b) { + const vector &fg_emit, const vector &bg_emit, + vector> &b) { b[end - 1].first = 0; b[end - 1].second = 0; for (size_t k = end - 1; k > start; --k) { @@ -271,8 +275,8 @@ backward_algorithm(const size_t start, const size_t end, b[start].second + bg_emit[start] + lp_sb); } - -template static void +template +static void one_minus(T a, const T a_end, T b) { while (a != a_end) *b++ = 1.0 - *a++; @@ -285,26 +289,24 @@ get_posterior(const pair &f, const pair &b) { } inline static void -get_posteriors(const vector > &forward, - const vector > &backward, +get_posteriors(const vector> &forward, + const vector> &backward, vector &posteriors) { posteriors.resize(forward.size()); for (size_t i = 0; i < forward.size(); ++i) posteriors[i] = get_posterior(forward[i], backward[i]); } - static void summarize_transitions(const size_t start, const size_t end, - const vector > &f, - const vector > &b, - const double total, - const vector &fg_emit, const vector &bg_emit, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, - vector &ff_vals, vector &fb_vals, - vector &bf_vals, vector &bb_vals) { - + const vector> &f, + const vector> &b, const double total, + const vector &fg_emit, + const 
vector &bg_emit, const double lp_ff, + const double lp_fb, const double lp_bf, + const double lp_bb, vector &ff_vals, + vector &fb_vals, vector &bf_vals, + vector &bb_vals) { for (size_t i = start + 1; i < end; ++i) { const double b_first = b[i].first + fg_emit[i] - total; const double b_second = b[i].second + bg_emit[i] - total; @@ -321,21 +323,19 @@ summarize_transitions(const size_t start, const size_t end, } } - static double -single_iteration(const vector > &values, +single_iteration(const vector> &values, const vector &vals_a, const vector &vals_b, const vector &reset_points, - vector > &forward, - vector > &backward, - double &p_fb, double &p_bf, - TwoStateBetaBin &fg_distro, TwoStateBetaBin &bg_distro, - vector &fg_emit, vector &bg_emit, - vector &ff_vals, vector &fb_vals, - vector &bf_vals, vector &bb_vals) { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + vector> &forward, + vector> &backward, double &p_fb, + double &p_bf, TwoStateBetaBin &fg_distro, + TwoStateBetaBin &bg_distro, vector &fg_emit, + vector &bg_emit, vector &ff_vals, + vector &fb_vals, vector &bf_vals, + vector &bb_vals) { + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); @@ -348,25 +348,23 @@ single_iteration(const vector > &values, double total_loglik = 0; for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const double score = - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); - summarize_transitions(reset_points[i], reset_points[i + 1], - forward, backward, score, fg_emit, bg_emit, - lp_ff, lp_fb, lp_bf, lp_bb, - ff_vals, fb_vals, bf_vals, bb_vals); + summarize_transitions(reset_points[i], reset_points[i + 1], forward, + backward, score, fg_emit, bg_emit, lp_ff, lp_fb, + lp_bf, lp_bb, ff_vals, fb_vals, bf_vals, bb_vals); total_loglik += score; } @@ -374,14 +372,14 @@ single_iteration(const vector > &values, const double p_ff_update = exp(log_sum_log_vec(ff_vals, reset_points)); const double p_fb_update = exp(log_sum_log_vec(fb_vals, reset_points)); const double f_denom = p_ff_update + p_fb_update; - assert(p_fb_update/f_denom > tolerance); - p_fb = p_fb_update/f_denom; + assert(p_fb_update / f_denom > tolerance); + p_fb = p_fb_update / f_denom; const double p_bf_update = exp(log_sum_log_vec(bf_vals, reset_points)); const double p_bb_update = exp(log_sum_log_vec(bb_vals, reset_points)); const double b_denom = p_bb_update + p_bf_update; - assert(p_bf_update/b_denom > tolerance); - p_bf = p_bf_update/b_denom; + assert(p_bf_update / b_denom > tolerance); + p_bf = p_bf_update / b_denom; vector posteriors; get_posteriors(forward, backward, posteriors); @@ -393,50 +391,40 @@ single_iteration(const vector > &values, return 
total_loglik; } - static void report_param_header_for_verbose() { - cerr << setw(3) << "ITR" - << setw(8) << "F size" - << setw(8) << "B size" - << setw(14) << "F PARAMS" - << setw(14) << "B PARAMS" - << setw(11) << "DELTA" - << endl; + // NOLINTBEGIN(*-avoid-magic-numbers) + cerr << setw(3) << "ITR" << setw(8) << "F size" << setw(8) << "B size" + << setw(14) << "F PARAMS" << setw(14) << "B PARAMS" << setw(11) + << "DELTA\n"; + // NOLINTEND(*-avoid-magic-numbers) } static inline double get_delta(const double a, const double b) { - return (b - a)/max(abs(a), abs(b)); + return (b - a) / max(abs(a), abs(b)); } - static void -report_params_for_verbose(const size_t i, - const double p_fb_est, +report_params_for_verbose(const size_t i, const double p_fb_est, const double p_bf_est, const TwoStateBetaBin &fg_distro, - const TwoStateBetaBin &bg_distro, - const double total, + const TwoStateBetaBin &bg_distro, const double total, const double prev_total) { + // NOLINTBEGIN(*-avoid-magic-numbers) std::ios_base::fmtflags orig_flags(cerr.flags()); cerr.precision(2); - cerr << setw(3) << i + 1 - << setw(8) << std::fixed << 1/p_fb_est - << setw(8) << std::fixed << 1/p_bf_est - << setw(14) << fg_distro.tostring() - << setw(14) << bg_distro.tostring() - << setw(11) << std::scientific - << abs(get_delta(prev_total, total)) - << endl; + cerr << setw(3) << i + 1 << setw(8) << std::fixed << 1 / p_fb_est << setw(8) + << std::fixed << 1 / p_bf_est << setw(14) << fg_distro.tostring() + << setw(14) << bg_distro.tostring() << setw(11) << std::scientific + << abs(get_delta(prev_total, total)) << '\n'; cerr.flags(orig_flags); + // NOLINTEND(*-avoid-magic-numbers) } - static void -extract_fractional_values(const vector > &values, +extract_fractional_values(const vector> &values, vector &vals_a, vector &vals_b) { - const size_t n_vals = values.size(); vals_a.resize(n_vals); vals_b.resize(n_vals); @@ -447,30 +435,29 @@ extract_fractional_values(const vector > &values, // const double a = values[i].first + pseudocount; // const double b = values[i].second + pseudocount; // const double val = a/(a + b); - const double val = values[i].first/(values[i].first + values[i].second); + const double val = values[i].first / (values[i].first + values[i].second); const double adjusted_val = min(max(val, epsilon), 1 - epsilon); vals_a[i] = log(adjusted_val); vals_b[i] = log(1.0 - adjusted_val); } } - double -TwoStateHMM::BaumWelchTraining(const vector > &values, - const vector &reset_points, - double &p_fb, double &p_bf, - TwoStateBetaBin &fg_distro, TwoStateBetaBin &bg_distro) const { - +TwoStateHMM::BaumWelchTraining(const vector> &values, + const vector &reset_points, double &p_fb, + double &p_bf, TwoStateBetaBin &fg_distro, + TwoStateBetaBin &bg_distro) const { vector vals_a, vals_b; extract_fractional_values(values, vals_a, vals_b); const size_t n_vals = values.size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); - vector ff_vals(n_vals), fb_vals(n_vals); // for estimating transitions + vector ff_vals(n_vals), + fb_vals(n_vals); // for estimating transitions vector bf_vals(n_vals), bb_vals(n_vals); - vector fg_emit(n_vals), bg_emit(n_vals); // avoid recomp of emissions + vector fg_emit(n_vals), bg_emit(n_vals); // avoid recomp of emissions if (VERBOSE) report_param_header_for_verbose(); @@ -478,17 +465,16 @@ TwoStateHMM::BaumWelchTraining(const vector > &values, double prev_total 
= -std::numeric_limits::max(); bool converged = false; for (size_t i = 0; i < max_iterations && !converged; ++i) { - double p_fb_est = p_fb, p_bf_est = p_bf; const double total = single_iteration(values, vals_a, vals_b, reset_points, forward, backward, - p_fb_est, p_bf_est, fg_distro, bg_distro, - ff_vals, fb_vals, bf_vals, bb_vals, fg_emit, bg_emit); + p_fb_est, p_bf_est, fg_distro, bg_distro, ff_vals, + fb_vals, bf_vals, bb_vals, fg_emit, bg_emit); if (VERBOSE) - report_params_for_verbose(i, p_fb_est, p_bf_est, - fg_distro, bg_distro, total, prev_total); + report_params_for_verbose(i, p_fb_est, p_bf_est, fg_distro, bg_distro, + total, prev_total); // ADS: removing the check based on expected log likelihood from // forward/backward as these seem to have some problem... @@ -498,7 +484,7 @@ TwoStateHMM::BaumWelchTraining(const vector > &values, if (converged) { if (VERBOSE) - cerr << "CONVERGED" << endl; + cerr << "CONVERGED\n"; } else { p_fb = p_fb_est; @@ -509,17 +495,16 @@ TwoStateHMM::BaumWelchTraining(const vector > &values, return prev_total; } - void -TwoStateHMM::PosteriorScores(const vector > &values, +TwoStateHMM::PosteriorScores(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, + const TwoStateBetaBin &fg_distro, + const TwoStateBetaBin &bg_distro, const bool fg_class, vector &posteriors) const { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); @@ -529,8 +514,8 @@ TwoStateHMM::PosteriorScores(const vector > &values, isfinite(lp_fb) && isfinite(lp_bf) && isfinite(lp_bb)); const size_t n_vals = values.size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); vector fg_emit(n_vals), bg_emit(n_vals); get_emissions(begin(values), end(values), begin(fg_emit), fg_distro); @@ -540,18 +525,18 @@ TwoStateHMM::PosteriorScores(const vector > &values, #ifndef NDEBUG const double score = #endif - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); } get_posteriors(forward, backward, posteriors); @@ -559,30 +544,26 @@ TwoStateHMM::PosteriorScores(const vector > &values, one_minus(begin(posteriors), end(posteriors), begin(posteriors)); } - void -TwoStateHMM::TransitionPosteriors(const vector > &values, +TwoStateHMM::TransitionPosteriors(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, const 
size_t transition, vector &posteriors) const { - const TwoStateBetaBin fg_distro(fg_alpha, fg_beta); const TwoStateBetaBin bg_distro(bg_alpha, bg_beta); - return TransitionPosteriors(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro, transition, posteriors); + return TransitionPosteriors(values, reset_points, p_fb, p_bf, fg_distro, + bg_distro, transition, posteriors); } - static void get_joint_posteriors(const pair &forward, - const pair &backward, - const double fg_emit, const double bg_emit, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, + const pair &backward, const double fg_emit, + const double bg_emit, const double lp_ff, + const double lp_fb, const double lp_bf, const double lp_bb, double &ff_c, double &fb_c, double &bf_c, double &bb_c) { // (forward val) + transition + emission + (backward val offset by 1) ff_c = forward.first + lp_ff + fg_emit + backward.first; @@ -591,28 +572,26 @@ get_joint_posteriors(const pair &forward, bb_c = forward.second + lp_bb + bg_emit + backward.second; } - void -TwoStateHMM::TransitionPosteriors(const vector > &values, +TwoStateHMM::TransitionPosteriors(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, const size_t transition, vector &scores) const { - assert(transition < 4); - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); const double lp_bb = log(1.0 - p_bf); const size_t n_vals = values.size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); vector fg_emit(n_vals), bg_emit(n_vals); get_emissions(begin(values), end(values), begin(fg_emit), fg_distro); @@ -622,18 +601,18 @@ TwoStateHMM::TransitionPosteriors(const vector > &values, #ifndef NDEBUG const double score = #endif - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); } scores.clear(); @@ -643,40 +622,44 @@ TwoStateHMM::TransitionPosteriors(const vector > &values, if (i == reset_points[j]) ++j; else { - double ff_c, fb_c, bf_c, bb_c; + double ff_c{}; + double fb_c{}; + double bf_c{}; + double bb_c{}; get_joint_posteriors(forward[i - 1], backward[i], fg_emit[i], bg_emit[i], lp_ff, lp_fb, lp_bf, lp_bb, ff_c, fb_c, bf_c, bb_c); double numerator = ff_c; - if (transition == 1) numerator = fb_c; - if (transition == 2) numerator = bf_c; - if (transition == 3) numerator = bb_c; - const double denom = log_sum_log(log_sum_log(ff_c, fb_c), - log_sum_log(bf_c, bb_c)); + if (transition == 
1) + numerator = fb_c; + if (transition == 2) + numerator = bf_c; + if (transition == 3) + numerator = bb_c; + const double denom = + log_sum_log(log_sum_log(ff_c, fb_c), log_sum_log(bf_c, bb_c)); scores[i] = exp(numerator - denom); } } } - double -TwoStateHMM::PosteriorDecoding(const vector > &values, +TwoStateHMM::PosteriorDecoding(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, vector &classes, vector &posteriors) const { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); const double lp_bb = log(1.0 - p_bf); const size_t n_vals = values.size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); vector fg_emit(n_vals), bg_emit(n_vals); get_emissions(begin(values), end(values), begin(fg_emit), fg_distro); @@ -685,18 +668,18 @@ TwoStateHMM::PosteriorDecoding(const vector > &values, double total_loglik = 0; for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); total_loglik += score; } @@ -705,21 +688,20 @@ TwoStateHMM::PosteriorDecoding(const vector > &values, classes.resize(n_vals); for (size_t i = 0; i < n_vals; ++i) - classes[i] = (posteriors[i] > 0.5); + classes[i] = (posteriors[i] > 0.5); // NOLINT(*-avoid-magic-numbers) return total_loglik; } - double -TwoStateHMM::ViterbiDecoding(const vector > &values, +TwoStateHMM::ViterbiDecoding(const vector> &values, const vector &reset_points, const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, + const TwoStateBetaBin &fg_distro, + const TwoStateBetaBin &bg_distro, vector &ml_classes) const { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); @@ -727,17 +709,15 @@ TwoStateHMM::ViterbiDecoding(const vector > &values, double total = 0; for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const size_t lim = reset_points[i + 1] - reset_points[i]; - vector > v(lim, make_pair(0.0, 0.0)); - vector > trace(lim, make_pair(0ul, 0ul)); + vector> v(lim, make_pair(0.0, 0.0)); + vector> trace(lim, make_pair(0ul, 0ul)); v[0].first = fg_distro(values[reset_points[i]]) + lp_sf; v[0].second = 
bg_distro(values[reset_points[i]]) + lp_sb; for (size_t j = 1; j < lim; ++j) { - const double ff = v[j - 1].first + lp_ff; const double bf = v[j - 1].second + lp_bf; const double fg_log_emmit = fg_distro(values[reset_points[i] + j]); @@ -788,36 +768,33 @@ TwoStateHMM::ViterbiDecoding(const vector > &values, } } reverse(begin(inner_ml_classes), end(inner_ml_classes)); - ml_classes.insert(end(ml_classes), - begin(inner_ml_classes), end(inner_ml_classes)); + ml_classes.insert(end(ml_classes), begin(inner_ml_classes), + end(inner_ml_classes)); total += max(v.back().first, v.back().second); } return total; } - //////////////////////////////////////////////////////////////////////////////// /////////////// FOR MULTIPLE REPLICATES // WRAPPER FUNCTIONS double -TwoStateHMM::BaumWelchTraining(const vector > > &values, - const vector &reset_points, - double &p_fb, double &p_bf, - vector &fg_alpha, - vector &fg_beta, - vector &bg_alpha, - vector &bg_beta) const { +TwoStateHMM::BaumWelchTraining( + const vector>> &values, + const vector &reset_points, double &p_fb, double &p_bf, + vector &fg_alpha, vector &fg_beta, vector &bg_alpha, + vector &bg_beta) const { vector fg_distro, bg_distro; for (size_t i = 0; i < values.size(); ++i) { fg_distro.push_back(TwoStateBetaBin(fg_alpha[i], fg_beta[i])); bg_distro.push_back(TwoStateBetaBin(bg_alpha[i], bg_beta[i])); } - const double score = BaumWelchTraining(values, reset_points, - p_fb, p_bf, fg_distro, bg_distro); + const double score = + BaumWelchTraining(values, reset_points, p_fb, p_bf, fg_distro, bg_distro); for (size_t r = 0; r < values.size(); ++r) { fg_alpha[r] = fg_distro[r].alpha; fg_beta[r] = fg_distro[r].beta; @@ -828,44 +805,35 @@ TwoStateHMM::BaumWelchTraining(const vector > > &val } double -TwoStateHMM::PosteriorDecoding(const vector > > &values, - const vector &reset_points, - const double p_fb, const double p_bf, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - vector &classes, - vector &posteriors) const { - +TwoStateHMM::PosteriorDecoding( + const vector>> &values, + const vector &reset_points, const double p_fb, const double p_bf, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta, + vector &classes, vector &posteriors) const { vector fg_distro, bg_distro; for (size_t i = 0; i < values.size(); ++i) { fg_distro.push_back(TwoStateBetaBin(fg_alpha[i], fg_beta[i])); bg_distro.push_back(TwoStateBetaBin(bg_alpha[i], bg_beta[i])); } - return PosteriorDecoding(values, reset_points, p_fb, p_bf, - fg_distro, bg_distro, classes, posteriors); + return PosteriorDecoding(values, reset_points, p_fb, p_bf, fg_distro, + bg_distro, classes, posteriors); } - void -TwoStateHMM::PosteriorScores(const vector > > &values, - const vector &reset_points, - const double p_fb, const double p_bf, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - const bool &fg_class, - vector &posteriors) const { - +TwoStateHMM::PosteriorScores( + const vector>> &values, + const vector &reset_points, const double p_fb, const double p_bf, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta, + const bool &fg_class, vector &posteriors) const { vector fg_distro, bg_distro; for (size_t i = 0; i < values.size(); ++i) { fg_distro.push_back(TwoStateBetaBin(fg_alpha[i], fg_beta[i])); bg_distro.push_back(TwoStateBetaBin(bg_alpha[i], bg_beta[i])); } - return PosteriorScores(values, reset_points, p_fb, 
p_bf, - fg_distro, bg_distro, fg_class, posteriors); + return PosteriorScores(values, reset_points, p_fb, p_bf, fg_distro, bg_distro, + fg_class, posteriors); } // INTERNAL FUNCTIONS (FOR REPLICATES) @@ -876,7 +844,7 @@ has_data(const pair &p) { } static void -get_emissions_rep(const vector > > &v, +get_emissions_rep(const vector>> &v, vector &emit, const vector &distr) { fill(begin(emit), end(emit), 0.0); for (size_t r = 0; r < v.size(); ++r) @@ -885,13 +853,12 @@ get_emissions_rep(const vector > > &v, emit[i] += distr[r](v[r][i]); } - static void -fit_distro_rep(TwoStateBetaBin &distro, const vector > &values, +fit_distro_rep(TwoStateBetaBin &distro, + const vector> &values, const vector &vals_a, const vector &vals_b, - const vector &posteriors, - vector &tmp_a, vector &tmp_b, - vector &tmp_p) { + const vector &posteriors, vector &tmp_a, + vector &tmp_b, vector &tmp_p) { tmp_a.clear(); tmp_b.clear(); tmp_p.clear(); @@ -904,22 +871,17 @@ fit_distro_rep(TwoStateBetaBin &distro, const vector > &val distro.fit(tmp_a, tmp_b, tmp_p); } - static double -single_iteration_rep(const vector > > &values, - const vector > &vals_a, - const vector > &vals_b, - const vector &reset_points, - vector > &forward, - vector > &backward, - double &p_fb, double &p_bf, - vector &fg_distro, vector &bg_distro, - vector &fg_emit, vector &bg_emit, - vector &ff_vals, vector &fb_vals, - vector &bf_vals, vector &bb_vals) { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); +single_iteration_rep( + const vector>> &values, + const vector> &vals_a, const vector> &vals_b, + const vector &reset_points, vector> &forward, + vector> &backward, double &p_fb, double &p_bf, + vector &fg_distro, vector &bg_distro, + vector &fg_emit, vector &bg_emit, vector &ff_vals, + vector &fb_vals, vector &bf_vals, vector &bb_vals) { + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); @@ -931,23 +893,22 @@ single_iteration_rep(const vector > > &values, double total_loglik = 0; for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); - summarize_transitions(reset_points[i], reset_points[i + 1], - forward, backward, score, fg_emit, bg_emit, - lp_ff, lp_fb, lp_bf, lp_bb, - ff_vals, fb_vals, bf_vals, bb_vals); + summarize_transitions(reset_points[i], reset_points[i + 1], forward, + backward, score, fg_emit, bg_emit, lp_ff, lp_fb, + lp_bf, lp_bb, ff_vals, fb_vals, bf_vals, bb_vals); total_loglik += score; } @@ -955,14 +916,14 @@ single_iteration_rep(const vector > > &values, const double p_ff_update = exp(log_sum_log_vec(ff_vals, reset_points)); const double p_fb_update = exp(log_sum_log_vec(fb_vals, 
reset_points)); const double f_denom = p_ff_update + p_fb_update; - assert(p_fb_update/f_denom > tolerance); - p_fb = p_fb_update/f_denom; + assert(p_fb_update / f_denom > tolerance); + p_fb = p_fb_update / f_denom; const double p_bf_update = exp(log_sum_log_vec(bf_vals, reset_points)); const double p_bb_update = exp(log_sum_log_vec(bb_vals, reset_points)); const double b_denom = p_bb_update + p_bf_update; - assert(p_bf_update/b_denom > tolerance); - p_bf = p_bf_update/b_denom; + assert(p_bf_update / b_denom > tolerance); + p_bf = p_bf_update / b_denom; vector posteriors; get_posteriors(forward, backward, posteriors); @@ -975,39 +936,37 @@ single_iteration_rep(const vector > > &values, tmp_b.reserve(n_vals); tmp_p.reserve(n_vals); for (size_t r = 0; r < n_reps; ++r) - fit_distro_rep(fg_distro[r], values[r], - vals_a[r], vals_b[r], posteriors, tmp_a, tmp_b, tmp_p); + fit_distro_rep(fg_distro[r], values[r], vals_a[r], vals_b[r], posteriors, + tmp_a, tmp_b, tmp_p); one_minus(begin(posteriors), end(posteriors), begin(posteriors)); for (size_t r = 0; r < n_reps; ++r) - fit_distro_rep(bg_distro[r], values[r], - vals_a[r], vals_b[r], posteriors, tmp_a, tmp_b, tmp_p); + fit_distro_rep(bg_distro[r], values[r], vals_a[r], vals_b[r], posteriors, + tmp_a, tmp_b, tmp_p); return total_loglik; } - - double -TwoStateHMM::BaumWelchTraining(const vector > > &values, - const vector &reset_points, - double &p_fb, double &p_bf, - vector &fg_distro, - vector &bg_distro) const { - +TwoStateHMM::BaumWelchTraining( + const vector>> &values, + const vector &reset_points, double &p_fb, double &p_bf, + vector &fg_distro, + vector &bg_distro) const { // extract the fractional values (both fraction meth and unmeth) const size_t n_reps = values.size(); - vector > vals_a(n_reps), vals_b(n_reps); + vector> vals_a(n_reps), vals_b(n_reps); for (size_t r = 0; r < n_reps; ++r) extract_fractional_values(values[r], vals_a[r], vals_b[r]); const size_t n_vals = values[0].size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); - vector ff_vals(n_vals), fb_vals(n_vals); // for estimating transitions + vector ff_vals(n_vals), + fb_vals(n_vals); // for estimating transitions vector bf_vals(n_vals), bb_vals(n_vals); - vector fg_emit(n_vals), bg_emit(n_vals); // avoid recomp of emissions + vector fg_emit(n_vals), bg_emit(n_vals); // avoid recomp of emissions if (VERBOSE) report_param_header_for_verbose(); @@ -1015,17 +974,16 @@ TwoStateHMM::BaumWelchTraining(const vector > > &val double prev_total = -std::numeric_limits::max(); bool converged = false; for (size_t i = 0; i < max_iterations && !converged; ++i) { - double p_fb_est = p_fb, p_bf_est = p_bf; - const double total = - single_iteration_rep(values, vals_a, vals_b, reset_points, forward, backward, - p_fb_est, p_bf_est, fg_distro, bg_distro, - ff_vals, fb_vals, bf_vals, bb_vals, fg_emit, bg_emit); + const double total = single_iteration_rep( + values, vals_a, vals_b, reset_points, forward, backward, p_fb_est, + p_bf_est, fg_distro, bg_distro, ff_vals, fb_vals, bf_vals, bb_vals, + fg_emit, bg_emit); - if (VERBOSE) // reporting for first replicate - report_params_for_verbose(i, p_fb_est, p_bf_est, - fg_distro[0], bg_distro[0], total, prev_total); + if (VERBOSE) // reporting for first replicate + report_params_for_verbose(i, p_fb_est, p_bf_est, fg_distro[0], + bg_distro[0], total, prev_total); // ADS: removing the check based on expected 
log likelihood from // forward/backward as these seem to have some problem... @@ -1035,7 +993,7 @@ TwoStateHMM::BaumWelchTraining(const vector > > &val if (converged) { if (VERBOSE) - cerr << "CONVERGED" << endl; + cerr << "CONVERGED\n"; } else { p_fb = p_fb_est; @@ -1046,26 +1004,24 @@ TwoStateHMM::BaumWelchTraining(const vector > > &val return prev_total; } - void -TwoStateHMM::PosteriorScores(const vector > > &values, +TwoStateHMM::PosteriorScores(const vector>> &values, const vector &reset_points, const double p_fb, const double p_bf, const vector &fg_distro, const vector &bg_distro, const bool fg_class, vector &posteriors) const { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); const double lp_bb = log(1.0 - p_bf); const size_t n_vals = values[0].size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); vector fg_emit(n_vals), bg_emit(n_vals); get_emissions_rep(values, fg_emit, fg_distro); @@ -1075,18 +1031,18 @@ TwoStateHMM::PosteriorScores(const vector > > &value #ifndef NDEBUG const double score = #endif - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); } get_posteriors(forward, backward, posteriors); @@ -1094,26 +1050,23 @@ TwoStateHMM::PosteriorScores(const vector > > &value one_minus(begin(posteriors), end(posteriors), begin(posteriors)); } - double -TwoStateHMM::PosteriorDecoding(const vector > > &values, - const vector &reset_points, - const double p_fb, const double p_bf, - const vector &fg_distro, - const vector &bg_distro, - vector &classes, - vector &posteriors) const { - - const double lp_sf = log(p_bf/(p_bf + p_fb)); - const double lp_sb = log(p_fb/(p_bf + p_fb)); +TwoStateHMM::PosteriorDecoding( + const vector>> &values, + const vector &reset_points, const double p_fb, const double p_bf, + const vector &fg_distro, + const vector &bg_distro, vector &classes, + vector &posteriors) const { + const double lp_sf = log(p_bf / (p_bf + p_fb)); + const double lp_sb = log(p_fb / (p_bf + p_fb)); const double lp_ff = log(1.0 - p_fb); const double lp_fb = log(p_fb); const double lp_bf = log(p_bf); const double lp_bb = log(1.0 - p_bf); const size_t n_vals = values[0].size(); - vector > forward(n_vals, make_pair(0.0, 0.0)); - vector > backward(n_vals, make_pair(0.0, 0.0)); + vector> forward(n_vals, make_pair(0.0, 0.0)); + vector> backward(n_vals, make_pair(0.0, 0.0)); vector fg_emit(n_vals), bg_emit(n_vals); get_emissions_rep(values, fg_emit, fg_distro); @@ -1122,18 +1075,18 @@ TwoStateHMM::PosteriorDecoding(const 
vector > > &val double total_loglik = 0; for (size_t i = 0; i < reset_points.size() - 1; ++i) { const double score = - forward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, forward); + forward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, forward); #ifndef NDEBUG const double backward_score = #endif - backward_algorithm(reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_bf, lp_bb, - fg_emit, bg_emit, backward); + backward_algorithm(reset_points[i], reset_points[i + 1], lp_sf, lp_sb, + lp_ff, lp_fb, lp_bf, lp_bb, fg_emit, bg_emit, + backward); - assert(fabs(score - backward_score)/max(score, backward_score) < tolerance); + assert(fabs(score - backward_score) / max(score, backward_score) < + tolerance); total_loglik += score; } @@ -1142,7 +1095,7 @@ TwoStateHMM::PosteriorDecoding(const vector > > &val classes.resize(n_vals); for (size_t i = 0; i < n_vals; ++i) - classes[i] = (posteriors[i] > 0.5); + classes[i] = (posteriors[i] > 0.5); // NOLINT(*-avoid-magic-numbers) return total_loglik; } From 805d67e38157098291535dbb2675d33a8540adbc Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 037/106] src/common/TwoStateHMM.hpp: changes to add static analysis --- src/common/TwoStateHMM.hpp | 145 +++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 79 deletions(-) diff --git a/src/common/TwoStateHMM.hpp b/src/common/TwoStateHMM.hpp index bc297c9f..015bb1d2 100644 --- a/src/common/TwoStateHMM.hpp +++ b/src/common/TwoStateHMM.hpp @@ -16,35 +16,34 @@ #ifndef TWO_STATE_HMM_HPP #define TWO_STATE_HMM_HPP -#include +#include +#include // IWYU pragma: keep +#include #include - struct TwoStateBetaBin; class TwoStateHMM { public: - TwoStateHMM(const double tol, const size_t max_itr, const bool v) : tolerance(tol), max_iterations(max_itr), VERBOSE(v) {} double - ViterbiDecoding(const std::vector > &values, + ViterbiDecoding(const std::vector> &values, const std::vector &reset_points, const double f_to_b_trans, const double b_to_f_trans, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, std::vector &ml_classes) const; - double - BaumWelchTraining(const std::vector > &values, + BaumWelchTraining(const std::vector> &values, const std::vector &reset_points, double &f_to_b_trans, double &b_to_f_trans, - double &fg_alpha, double &fg_beta, - double &bg_alpha, double &bg_beta) const; + double &fg_alpha, double &fg_beta, double &bg_alpha, + double &bg_beta) const; double - PosteriorDecoding(const std::vector > &values, + PosteriorDecoding(const std::vector> &values, const std::vector &reset_points, const double f_to_b_trans, const double b_to_f_trans, const double fg_alpha, const double fg_beta, @@ -53,16 +52,15 @@ class TwoStateHMM { std::vector &llr_scores) const; void - PosteriorScores(const std::vector > &values, + PosteriorScores(const std::vector> &values, const std::vector &reset_points, const double f_to_b_trans, const double b_to_f_trans, const double fg_alpha, const double fg_beta, const double bg_alpha, const double bg_beta, - const bool class_id, - std::vector &llr_scores) const; + const bool class_id, std::vector &llr_scores) const; void - TransitionPosteriors(const std::vector > &values, + TransitionPosteriors(const std::vector> &values, const std::vector &reset_points, const double f_to_b_trans, const double b_to_f_trans, const double fg_alpha, const 
double fg_beta, @@ -72,104 +70,93 @@ class TwoStateHMM { // FOR MULTIPLE REPLICATES double - BaumWelchTraining(const std::vector > > &values, - const std::vector &reset_points, - double &f_to_b_trans, double &b_to_f_trans, - std::vector &fg_alpha, - std::vector &fg_beta, - std::vector &bg_alpha, - std::vector &bg_beta) const; + BaumWelchTraining( + const std::vector>> &values, + const std::vector &reset_points, double &f_to_b_trans, + double &b_to_f_trans, std::vector &fg_alpha, + std::vector &fg_beta, std::vector &bg_alpha, + std::vector &bg_beta) const; double - PosteriorDecoding(const std::vector > > &values, - const std::vector &reset_points, - const double f_to_b_trans, const double b_to_f_trans, - const std::vector &fg_alpha, - const std::vector &fg_beta, - const std::vector &bg_alpha, - const std::vector &bg_beta, - std::vector &classes, - std::vector &llr_scores) const; + PosteriorDecoding( + const std::vector>> &values, + const std::vector &reset_points, const double f_to_b_trans, + const double b_to_f_trans, const std::vector &fg_alpha, + const std::vector &fg_beta, const std::vector &bg_alpha, + const std::vector &bg_beta, std::vector &classes, + std::vector &llr_scores) const; void - PosteriorScores(const std::vector > > &values, - const std::vector &reset_points, - const double f_to_b_trans, const double b_to_f_trans, - const std::vector &fg_alpha, - const std::vector &fg_beta, - const std::vector &bg_alpha, - const std::vector &bg_beta, - const bool &fg_class, - std::vector &llr_scores) const; - + PosteriorScores( + const std::vector>> &values, + const std::vector &reset_points, const double f_to_b_trans, + const double b_to_f_trans, const std::vector &fg_alpha, + const std::vector &fg_beta, const std::vector &bg_alpha, + const std::vector &bg_beta, const bool &fg_class, + std::vector &llr_scores) const; private: - double - ViterbiDecoding(const std::vector > &values, - const std::vector &reset_points, - const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, + ViterbiDecoding(const std::vector> &values, + const std::vector &reset_points, const double p_fb, + const double p_bf, const TwoStateBetaBin &fg_distro, + const TwoStateBetaBin &bg_distro, std::vector &ml_classes) const; double - BaumWelchTraining(const std::vector > &values, - const std::vector &reset_points, - double &p_fb, double &p_bf, - TwoStateBetaBin &fg_distro, TwoStateBetaBin &bg_distro) const; + BaumWelchTraining(const std::vector> &values, + const std::vector &reset_points, double &p_fb, + double &p_bf, TwoStateBetaBin &fg_distro, + TwoStateBetaBin &bg_distro) const; double - PosteriorDecoding(const std::vector > &values, - const std::vector &reset_points, - const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, + PosteriorDecoding(const std::vector> &values, + const std::vector &reset_points, const double p_fb, + const double p_bf, const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, std::vector &classes, std::vector &llr_scores) const; void - PosteriorScores(const std::vector > &values, - const std::vector &reset_points, - const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, - const bool class_id, + PosteriorScores(const std::vector> &values, + const std::vector &reset_points, const double p_fb, + const double p_bf, const TwoStateBetaBin &fg_distro, + const TwoStateBetaBin &bg_distro, const bool class_id, std::vector &llr_scores) const; void - 
TransitionPosteriors(const std::vector > &values, + TransitionPosteriors(const std::vector> &values, const std::vector &reset_points, const double p_fb, const double p_bf, - const TwoStateBetaBin &fg_distro, const TwoStateBetaBin &bg_distro, + const TwoStateBetaBin &fg_distro, + const TwoStateBetaBin &bg_distro, const size_t transition, std::vector &scores) const; // FOR MULTIPLE REPLICATES double - BaumWelchTraining(const std::vector > > &values, - const std::vector &reset_points, - double &p_fb, double &p_bf, - std::vector &fg_distro, - std::vector &bg_distro) const; + BaumWelchTraining( + const std::vector>> &values, + const std::vector &reset_points, double &p_fb, double &p_bf, + std::vector &fg_distro, + std::vector &bg_distro) const; void - PosteriorScores(const std::vector > > &values, - const std::vector &reset_points, - const double p_fb, const double p_bf, - const std::vector &fg_distro, - const std::vector &bg_distro, - const bool fg_class, - std::vector &llr_scores) const; + PosteriorScores( + const std::vector>> &values, + const std::vector &reset_points, const double p_fb, + const double p_bf, const std::vector &fg_distro, + const std::vector &bg_distro, const bool fg_class, + std::vector &llr_scores) const; double - PosteriorDecoding(const std::vector > > &values, - const std::vector &reset_points, - const double p_fb, const double p_bf, - const std::vector &fg_distro, - const std::vector &bg_distro, - std::vector &classes, - std::vector &llr_scores) const; - + PosteriorDecoding( + const std::vector>> &values, + const std::vector &reset_points, const double p_fb, + const double p_bf, const std::vector &fg_distro, + const std::vector &bg_distro, std::vector &classes, + std::vector &llr_scores) const; double tolerance; size_t max_iterations; From d0ae17f1f5f9805a267de38b041b1ba15fc631ba Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 038/106] src/common/TwoStateHMM_PMD.cpp: changes to add static analysis --- src/common/TwoStateHMM_PMD.cpp | 658 ++++++++++++++------------------- 1 file changed, 280 insertions(+), 378 deletions(-) diff --git a/src/common/TwoStateHMM_PMD.cpp b/src/common/TwoStateHMM_PMD.cpp index 189ba59c..98eee926 100644 --- a/src/common/TwoStateHMM_PMD.cpp +++ b/src/common/TwoStateHMM_PMD.cpp @@ -18,31 +18,45 @@ #include "TwoStateHMM_PMD.hpp" -using std::vector; -using std::pair; -using std::setw; -using std::max; -using std::min; +#include +#include +#include +#include +#include +#include // IWYU pragma: keep +#include +#include + using std::cerr; using std::endl; -using std::string; -using std::setprecision; using std::isfinite; -using std::unique_ptr; using std::make_pair; +using std::max; +using std::min; +using std::pair; +using std::setprecision; +using std::setw; +using std::string; +using std::unique_ptr; +using std::vector; + +// NOLINTBEGIN(*-avoid-magic-numbers,*-owning-memory,*-narrowing-conversions) -template using num_lim = std::numeric_limits; +template using num_lim = std::numeric_limits; inline double TwoStateHMM::log_sum_log(const double p, const double q) const { - if (p == 0) {return q;} - else if (q == 0) {return p;} + if (p == 0) { + return q; + } + else if (q == 0) { + return p; + } const double larger = (p > q) ? p : q; const double smaller = (p > q) ? 
q : p; - return larger + log(1.0 + exp(smaller - larger)); + return larger + std::log1p(std::exp(smaller - larger)); } - double TwoStateHMM::log_sum_log_vec(const vector &vals, size_t limit) const { const vector::const_iterator x = @@ -56,15 +70,14 @@ TwoStateHMM::log_sum_log_vec(const vector &vals, size_t limit) const { assert(isfinite(sum)); } } - return max_val + log(sum); + return max_val + std::log(sum); } - void TwoStateHMM::estimate_emissions(const vector> &f, - const vector> &b, - vector &fg_probs, - vector &bg_probs) const { + const vector> &b, + vector &fg_probs, + vector &bg_probs) const { for (size_t i = 0; i < b.size(); ++i) { const double fg = (f[i].first + b[i].first); const double bg = (f[i].second + b[i].second); @@ -74,28 +87,21 @@ TwoStateHMM::estimate_emissions(const vector> &f, } } - void TwoStateHMM::TransitionPosteriors_rep( - const vector> > &values, - const vector &reset_points, - const vector &start_trans, - const vector> &trans, - const vector &end_trans, - const vector &fg_alpha, - const vector &fg_beta, - const vector &bg_alpha, - const vector &bg_beta, - const vector &array_status, - const size_t transition, - vector &llr_scores) const { - + const vector>> &values, + const vector &reset_points, const vector &start_trans, + const vector> &trans, const vector &end_trans, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta, + const vector &array_status, const size_t transition, + vector &llr_scores) const { vector> fg_distro; vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { - if(array_status[i]) { + if (array_status[i]) { fg_distro.emplace_back(new Beta(fg_alpha[i], fg_beta[i])); bg_distro.emplace_back(new Beta(bg_alpha[i], bg_beta[i])); } @@ -110,11 +116,10 @@ TwoStateHMM::TransitionPosteriors_rep( assert(trans.size() >= 2); for (size_t i = 0; i < trans.size(); ++i) assert(trans[i].size() >= 2); - return TransitionPosteriors_rep(values, reset_points, - start_trans[0], start_trans[1], - trans[0][0], trans[0][1], end_trans[0], - trans[1][0], trans[1][1], end_trans[1], - fg_distro, bg_distro, transition, llr_scores); + return TransitionPosteriors_rep( + values, reset_points, start_trans[0], start_trans[1], trans[0][0], + trans[0][1], end_trans[0], trans[1][0], trans[1][1], end_trans[1], + fg_distro, bg_distro, transition, llr_scores); } //////////////////////////////////////////////////////////////////////////////// @@ -124,21 +129,19 @@ TwoStateHMM::TransitionPosteriors_rep( //////////////////////////////////////////////////////////////////////////////// double TwoStateHMM::forward_algorithm_rep( - const vector> > &vals, - const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, - const double lp_ft, const double lp_bf, - const double lp_bb, const double lp_bt, - const vector> &fg_distro, - const vector> &bg_distro, - vector> &f) const { + const vector>> &vals, const size_t start, + const size_t end, const double lp_sf, const double lp_sb, const double lp_ff, + const double lp_fb, const double lp_ft, const double lp_bf, + const double lp_bb, const double lp_bt, + const vector> &fg_distro, + const vector> &bg_distro, + vector> &f) const { size_t NREP = vals.size(); f[start].first = lp_sf; f[start].second = lp_sb; for (size_t r = 0; r < NREP; ++r) { - if (vals[r][start].first + vals[r][start].second >= 1) { + if (vals[r][start].first + vals[r][start].second >= 1) { f[start].first += (*fg_distro[r])(vals[r][start]); 
f[start].second += (*bg_distro[r])(vals[r][start]); } @@ -150,27 +153,24 @@ TwoStateHMM::forward_algorithm_rep( f[i].second = log_sum_log(f[k].first + lp_fb, f[k].second + lp_bb); for (size_t r = 0; r < NREP; ++r) { - if(vals[r][i].first + vals[r][i].second >= 1) { - f[i].first += (*fg_distro[r])(vals[r][i]); - f[i].second += (*bg_distro[r])(vals[r][i]); + if (vals[r][i].first + vals[r][i].second >= 1) { + f[i].first += (*fg_distro[r])(vals[r][i]); + f[i].second += (*bg_distro[r])(vals[r][i]); } } } return log_sum_log(f[end - 1].first + lp_ft, f[end - 1].second + lp_bt); } - double -TwoStateHMM::backward_algorithm_rep(const vector> > &vals, - const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, - const double lp_ft, const double lp_bf, - const double lp_bb, const double lp_bt, - const vector> &fg_distro, - const vector> &bg_distro, - vector> &b) const { - +TwoStateHMM::backward_algorithm_rep( + const vector>> &vals, const size_t start, + const size_t end, const double lp_sf, const double lp_sb, const double lp_ff, + const double lp_fb, const double lp_ft, const double lp_bf, + const double lp_bb, const double lp_bt, + const vector> &fg_distro, + const vector> &bg_distro, + vector> &b) const { size_t NREP = vals.size(); b[end - 1].first = lp_ft; b[end - 1].second = lp_bt; @@ -203,23 +203,17 @@ TwoStateHMM::backward_algorithm_rep(const vector> > b[start].second + emission_t1_b + lp_sb); } - -//ff_vals: ksi_t(1,1), where 1 is the S_1, i.e. posterior prob of transitions +// ff_vals: ksi_t(1,1), where 1 is the S_1, i.e. posterior prob of transitions void TwoStateHMM::estimate_transitions_rep( - const vector> > &vals, - const size_t start, const size_t end, - const vector> &f, - const vector> &b, - const double total, - const vector> &fg_distro, - const vector> &bg_distro, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, - vector &ff_vals, - vector &fb_vals, - vector &bf_vals, - vector &bb_vals) const { + const vector>> &vals, const size_t start, + const size_t end, const vector> &f, + const vector> &b, const double total, + const vector> &fg_distro, + const vector> &bg_distro, const double lp_ff, + const double lp_fb, const double lp_bf, const double lp_bb, + vector &ff_vals, vector &fb_vals, vector &bf_vals, + vector &bb_vals) const { size_t NREP = vals.size(); for (size_t i = start + 1; i < end; ++i) { const size_t k = i - 1; @@ -227,9 +221,9 @@ TwoStateHMM::estimate_transitions_rep( double b_second = b[i].second - total; for (size_t r = 0; r < NREP; ++r) { - if(vals[r][i].first + vals[r][i].second >= 1) { - b_first += (*fg_distro[r])(vals[r][i]); - b_second += (*bg_distro[r])(vals[r][i]); + if (vals[r][i].first + vals[r][i].second >= 1) { + b_first += (*fg_distro[r])(vals[r][i]); + b_second += (*bg_distro[r])(vals[r][i]); } } const double ff = f[k].first; @@ -243,21 +237,17 @@ TwoStateHMM::estimate_transitions_rep( } } - double -TwoStateHMM::single_iteration_rep(const vector> > &values, - const vector> &vals_a_reps, - const vector> &vals_b_reps, - const vector &reset_points, - vector> &forward, - vector> &backward, - double &p_sf, double &p_sb, - double &p_ff, double &p_fb, double &p_ft, - double &p_bf, double &p_bb, double &p_bt, - vector> &fg_distro, - vector> &bg_distro) const { - - size_t NREP= values.size(); +TwoStateHMM::single_iteration_rep( + const vector>> &values, + const vector> &vals_a_reps, + const vector> &vals_b_reps, const vector &reset_points, + vector> &forward, vector> 
&backward, + double &p_sf, double &p_sb, double &p_ff, double &p_fb, double &p_ft, + double &p_bf, double &p_bb, double &p_bt, + vector> &fg_distro, + vector> &bg_distro) const { + size_t NREP = values.size(); vector log_fg_expected; vector log_bg_expected; @@ -272,9 +262,9 @@ TwoStateHMM::single_iteration_rep(const vector> > &v const double lp_bb = log(p_bb); const double lp_bt = log(p_bt); - assert(isfinite(lp_sf) && isfinite(lp_sb) && - isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && - isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); + assert(isfinite(lp_sf) && isfinite(lp_sb) && isfinite(lp_ff) && + isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && + isfinite(lp_bb) && isfinite(lp_bt)); // for estimating transitions vector ff_vals(values[0].size(), 0); @@ -284,36 +274,24 @@ TwoStateHMM::single_iteration_rep(const vector> > &v // #pragma omp parallel for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const double score = - forward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - forward); - - const double backward_score = - backward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - backward); - - if (DEBUG && (fabs(score - backward_score)/ - max(score, backward_score)) > 1e-10) { + const double score = forward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, forward); + + const double backward_score = backward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, backward); + + if (DEBUG && + (fabs(score - backward_score) / max(score, backward_score)) > 1e-10) { cerr << "fabs(score - backward_score)/" - << "max(score, backward_score) > 1e-10" << endl; + << "max(score, backward_score) > 1e-10\n"; } estimate_transitions_rep(values, reset_points[i], reset_points[i + 1], forward, backward, score, fg_distro, bg_distro, - lp_ff, lp_fb, lp_bf, lp_bb, - ff_vals, fb_vals, bf_vals, bb_vals); + lp_ff, lp_fb, lp_bf, lp_bb, ff_vals, fb_vals, + bf_vals, bb_vals); total_score += score; } @@ -324,7 +302,7 @@ TwoStateHMM::single_iteration_rep(const vector> > &v // because the final term in each block has no // meaning since there is no transition to be counted // from the final observation (they all must go to terminal state) - size_t NBLOCKS= reset_points.size()-1; //should equal to #deserts+1 + size_t NBLOCKS = reset_points.size() - 1; // should equal to #deserts+1 const double p_ff_new_estimate = exp(log_sum_log_vec(ff_vals, values[0].size() - 1)) - (NBLOCKS - 1); const double p_fb_new_estimate = @@ -335,39 +313,39 @@ TwoStateHMM::single_iteration_rep(const vector> > &v exp(log_sum_log_vec(bb_vals, values[0].size() - 1)) - (NBLOCKS - 1); double denom = p_ff_new_estimate + p_fb_new_estimate; - p_ff = (p_ff_new_estimate)/denom - p_ft/2.0; - p_fb = (p_fb_new_estimate)/denom - p_ft/2.0; + p_ff = (p_ff_new_estimate) / denom - p_ft / 2.0; + p_fb = (p_fb_new_estimate) / denom - p_ft / 2.0; if (p_ff < MIN_PROB) { if (DEBUG) - cerr << "p_ff < MIN_PROB" << endl; + cerr << "p_ff < MIN_PROB\n"; p_ff = MIN_PROB; } if (p_fb < MIN_PROB) { if (DEBUG) - cerr << "p_fb < MIN_PROB" << endl; + cerr << "p_fb < MIN_PROB\n"; p_fb = MIN_PROB; } denom = p_bf_new_estimate + p_bb_new_estimate; - p_bf = 
p_bf_new_estimate/denom - p_bt/2.0; - p_bb = p_bb_new_estimate/denom - p_bt/2.0; + p_bf = p_bf_new_estimate / denom - p_bt / 2.0; + p_bb = p_bb_new_estimate / denom - p_bt / 2.0; if (p_bf < MIN_PROB) { if (DEBUG) - cerr << "p_bf < MIN_PROB" << endl; + cerr << "p_bf < MIN_PROB\n"; p_bf = MIN_PROB; } if (p_bb < MIN_PROB) { if (DEBUG) - cerr << "p_bb < MIN_PROB" << endl; + cerr << "p_bb < MIN_PROB\n"; p_bb = MIN_PROB; } - p_sb = (p_bb + p_fb)/2.0; - p_sf = (p_bf + p_ff)/2.0; + p_sb = (p_bb + p_fb) / 2.0; + p_sf = (p_bf + p_ff) / 2.0; vector fg_probs(values[0].size(), 0); vector bg_probs(values[0].size(), 0); @@ -376,18 +354,18 @@ TwoStateHMM::single_iteration_rep(const vector> > &v vector vals_a, vals_b; vector fg_prob, bg_prob; for (size_t r = 0; r < NREP; ++r) { - //individual replicate may have 0 coverage at some sites - //remove these sites before fitting + // individual replicate may have 0 coverage at some sites + // remove these sites before fitting vals_a.clear(); vals_b.clear(); fg_prob.clear(); bg_prob.clear(); for (size_t i = 0; i < values[0].size(); ++i) { if (values[r][i].first + values[r][i].second >= 1) { - vals_a.push_back(vals_a_reps[r][i]); - vals_b.push_back(vals_b_reps[r][i]); - fg_prob.push_back(fg_probs[i]); // use the common posterior prob - bg_prob.push_back(bg_probs[i]); + vals_a.push_back(vals_a_reps[r][i]); + vals_b.push_back(vals_b_reps[r][i]); + fg_prob.push_back(fg_probs[i]); // use the common posterior prob + bg_prob.push_back(bg_probs[i]); } } fg_distro[r]->fit(vals_a, vals_b, fg_prob); @@ -398,21 +376,17 @@ TwoStateHMM::single_iteration_rep(const vector> > &v double TwoStateHMM::BaumWelchTraining_rep( - const vector> > &values, - const vector &reset_points, - vector &start_trans, - vector> &trans, - vector &end_trans, - vector &fg_alpha, vector &fg_beta, - vector &bg_alpha, vector &bg_beta, - const vector &array_status) const { - + const vector>> &values, + const vector &reset_points, vector &start_trans, + vector> &trans, vector &end_trans, + vector &fg_alpha, vector &fg_beta, vector &bg_alpha, + vector &bg_beta, const vector &array_status) const { vector> fg_distro; vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { - if(array_status[i]) { + if (array_status[i]) { fg_distro.emplace_back(new Beta(fg_alpha[i], fg_beta[i])); bg_distro.emplace_back(new Beta(bg_alpha[i], bg_beta[i])); } @@ -428,13 +402,10 @@ TwoStateHMM::BaumWelchTraining_rep( for (size_t i = 0; i < trans.size(); ++i) assert(trans[i].size() >= 2); - const double score = BaumWelchTraining_rep(values, reset_points, - start_trans[0], start_trans[1], - trans[0][0], trans[0][1], - end_trans[0], trans[1][0], - trans[1][1], end_trans[1], - fg_distro, bg_distro, - array_status); + const double score = BaumWelchTraining_rep( + values, reset_points, start_trans[0], start_trans[1], trans[0][0], + trans[0][1], end_trans[0], trans[1][0], trans[1][1], end_trans[1], + fg_distro, bg_distro, array_status); for (size_t r = 0; r < NREP; ++r) { fg_alpha[r] = fg_distro[r]->getalpha(); fg_beta[r] = fg_distro[r]->getbeta(); @@ -445,32 +416,23 @@ TwoStateHMM::BaumWelchTraining_rep( return score; } - double TwoStateHMM::BaumWelchTraining_rep( - const vector> > &values, - const vector &reset_points, - double &p_sf, double &p_sb, - double &p_ff, double &p_fb, double &p_ft, - double &p_bf, double &p_bb, double &p_bt, - vector> &fg_distro, - vector> &bg_distro, - const vector &array_status) const { - + const vector>> &values, + const vector &reset_points, double &p_sf, double &p_sb, double 
&p_ff, + double &p_fb, double &p_ft, double &p_bf, double &p_bb, double &p_bt, + vector> &fg_distro, + vector> &bg_distro, + const vector &array_status) const { size_t NREP = values.size(); - vector> forward(values[0].size(), - make_pair(0.0, 0.0)); - vector> backward(values[0].size(), - make_pair(0.0, 0.0)); + vector> forward(values[0].size(), make_pair(0.0, 0.0)); + vector> backward(values[0].size(), make_pair(0.0, 0.0)); if (VERBOSE) { - cerr << "MAX_ITER=" << max_iterations << "\tTOLERANCE=" - << tolerance << endl; - cerr << setw(5) << "ITR" - << setw(10) << "F size" - << setw(10) << "B size" - << setw(14) << "DELTA" - << endl; + cerr << "MAX_ITER=" << max_iterations << "\tTOLERANCE=" << tolerance + << '\n'; + cerr << setw(5) << "ITR" << setw(10) << "F size" << setw(10) << "B size" + << setw(14) << "DELTA\n"; } double prev_total = -num_lim::max(); @@ -481,33 +443,32 @@ TwoStateHMM::BaumWelchTraining_rep( if (array_status[r]) { for (size_t i = 0; i < values[0].size(); ++i) { if (values[r][i].second > 0) { - vals_a_reps[r][i] = - log(min(max(values[r][i].first/values[r][i].second, - 1e-2), 1.0 - 1e-2)); + vals_a_reps[r][i] = log(min( + max(values[r][i].first / values[r][i].second, 1e-2), 1.0 - 1e-2)); vals_b_reps[r][i] = - log(1.0 - min(max(values[r][i].first/values[r][i].second, - 1e-2), 1.0 - 1e-2)); + log(1.0 - min(max(values[r][i].first / values[r][i].second, 1e-2), + 1.0 - 1e-2)); } } } else { for (size_t i = 0; i < values[0].size(); ++i) { if (values[r][i].first + values[r][i].second >= 1) { - vals_a_reps[r][i] = - log(min(max(values[r][i].first/ - (values[r][i].first + - values[r][i].second), 1e-2), 1.0 - 1e-2)); - vals_b_reps[r][i] = - log(1 - min(max(values[r][i].first/ - (values[r][i].first + - values[r][i].second), 1e-2), 1.0 - 1e-2)); + vals_a_reps[r][i] = log(min( + max(values[r][i].first / (values[r][i].first + values[r][i].second), + 1e-2), + 1.0 - 1e-2)); + vals_b_reps[r][i] = + log(1 - min(max(values[r][i].first / + (values[r][i].first + values[r][i].second), + 1e-2), + 1.0 - 1e-2)); } } } } for (size_t i = 0; i < max_iterations; ++i) { - double p_sf_est = p_sf; double p_sb_est = p_sb; double p_ff_est = p_ff; @@ -517,45 +478,39 @@ TwoStateHMM::BaumWelchTraining_rep( double p_ft_est = p_ft; double p_bt_est = p_bt; - double total = single_iteration_rep(values, vals_a_reps, vals_b_reps, - reset_points, - forward, backward, - p_sf_est, p_sb_est, - p_ff_est, p_fb_est, p_ft_est, - p_bf_est, p_bb_est, p_bt_est, - fg_distro, bg_distro); + double total = single_iteration_rep( + values, vals_a_reps, vals_b_reps, reset_points, forward, backward, + p_sf_est, p_sb_est, p_ff_est, p_fb_est, p_ft_est, p_bf_est, p_bb_est, + p_bt_est, fg_distro, bg_distro); if (DEBUG) { - cerr << "S_F\t" << p_sf_est << endl - << "S_B\t" << p_sb_est << endl - << "F_F\t" << p_ff_est << endl - << "F_B\t" << p_fb_est << endl - << "B_F\t" << p_bf_est << endl - << "B_B\t" << p_bb_est << endl - << "F_E\t" << p_ft_est << endl - << "B_E\t" << p_bt_est << endl - << endl; + cerr << "S_F\t" << p_sf_est << '\n' + << "S_B\t" << p_sb_est << '\n' + << "F_F\t" << p_ff_est << '\n' + << "F_B\t" << p_fb_est << '\n' + << "B_F\t" << p_bf_est << '\n' + << "B_B\t" << p_bb_est << '\n' + << "F_E\t" << p_ft_est << '\n' + << "B_E\t" << p_bt_est << '\n' + << '\n'; for (size_t r = 0; r < NREP; ++r) - cerr << "Emission parameters for Rep" << r+1 - << setw(14) << fg_distro[r]->getalpha() << setw(14) - << fg_distro[r]->getbeta() << setw(14) << bg_distro[r]->getalpha() - << setw(14) << bg_distro[r]->getbeta() << endl; + cerr << "Emission 
parameters for Rep" << r + 1 << setw(14) + << fg_distro[r]->getalpha() << setw(14) << fg_distro[r]->getbeta() + << setw(14) << bg_distro[r]->getalpha() << setw(14) + << bg_distro[r]->getbeta() << '\n'; } if (VERBOSE) { - cerr << setw(5) << i + 1 - << setw(10) << 1/p_fb_est - << setw(10) << 1/p_bf_est - << setw(14) << total - << setw(14) << prev_total - << setw(14) << total - prev_total - << setw(14) << (total - prev_total)/std::fabs(total) - << endl; + cerr << setw(5) << i + 1 << setw(10) << 1 / p_fb_est << setw(10) + << 1 / p_bf_est << setw(14) << total << setw(14) << prev_total + << setw(14) << total - prev_total << setw(14) + << (total - prev_total) / std::fabs(total) << '\n'; } if (std::abs(total - prev_total) < tolerance) { if (VERBOSE) - cerr << "CONVERGED" << "\t" << std::abs(total - prev_total) - << "\t" << tolerance << endl << endl; + cerr << "CONVERGED" << "\t" << std::abs(total - prev_total) << "\t" + << tolerance << '\n' + << '\n'; break; } @@ -573,27 +528,21 @@ TwoStateHMM::BaumWelchTraining_rep( return prev_total; } - void -TwoStateHMM::PosteriorScores_rep(const vector> > &values, - const vector &reset_points, - const vector &start_trans, - const vector> &trans, - const vector &end_trans, - const vector fg_alpha, - const vector fg_beta, - const vector bg_alpha, - const vector bg_beta, - const vector &classes, - vector &llr_scores, - const vector &array_status) const { - +TwoStateHMM::PosteriorScores_rep( + const vector>> &values, + const vector &reset_points, const vector &start_trans, + const vector> &trans, const vector &end_trans, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta, + const vector &classes, vector &llr_scores, + const vector &array_status) const { vector> fg_distro; vector> bg_distro; size_t NREP = values.size(); for (size_t i = 0; i < NREP; ++i) { - if(array_status[i]) { + if (array_status[i]) { fg_distro.emplace_back(new Beta(fg_alpha[i], fg_beta[i])); bg_distro.emplace_back(new Beta(bg_alpha[i], bg_beta[i])); } @@ -609,24 +558,20 @@ TwoStateHMM::PosteriorScores_rep(const vector> > &va for (size_t i = 0; i < trans.size(); ++i) assert(trans[i].size() >= 2); - return PosteriorScores_rep(values, reset_points, - start_trans[0], start_trans[1], - trans[0][0], trans[0][1], end_trans[0], - trans[1][0], trans[1][1], end_trans[1], - fg_distro, bg_distro, classes, llr_scores); + return PosteriorScores_rep( + values, reset_points, start_trans[0], start_trans[1], trans[0][0], + trans[0][1], end_trans[0], trans[1][0], trans[1][1], end_trans[1], + fg_distro, bg_distro, classes, llr_scores); } void TwoStateHMM::PosteriorScores_rep( const vector>> &values, - const vector &reset_points, - double p_sf, double p_sb, - double p_ff, double p_fb, - double p_ft, double p_bf, double p_bb, double p_bt, + const vector &reset_points, double p_sf, double p_sb, double p_ff, + double p_fb, double p_ft, double p_bf, double p_bb, double p_bt, const vector> &fg_distro, const vector> &bg_distro, const vector &classes, vector &llr_scores) const { - const double lp_sf = log(p_sf); const double lp_sb = log(p_sb); const double lp_ff = log(p_ff); @@ -636,39 +581,26 @@ TwoStateHMM::PosteriorScores_rep( const double lp_bb = log(p_bb); const double lp_bt = log(p_bt); - assert(isfinite(lp_sf) && isfinite(lp_sb) && - isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && - isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); + assert(isfinite(lp_sf) && isfinite(lp_sb) && isfinite(lp_ff) && + isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && 
+ isfinite(lp_bb) && isfinite(lp_bt)); - vector> forward(values[0].size(), - make_pair(0.0, 0.0)); - vector> backward(values[0].size(), - make_pair(0.0, 0.0)); + vector> forward(values[0].size(), make_pair(0.0, 0.0)); + vector> backward(values[0].size(), make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const double score = forward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - forward); - - const double backward_score = - backward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - backward); - - if (DEBUG && (fabs(score - backward_score)/ - max(score, backward_score)) > 1e-10) + const double score = forward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, forward); + + const double backward_score = backward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, backward); + + if (DEBUG && + (fabs(score - backward_score) / max(score, backward_score)) > 1e-10) cerr << "fabs(score - backward_score)/" - << "max(score, backward_score) > 1e-10" << endl; + << "max(score, backward_score) > 1e-10\n"; } llr_scores.resize(values[0].size()); @@ -683,17 +615,14 @@ TwoStateHMM::PosteriorScores_rep( } double -TwoStateHMM::PosteriorDecoding_rep(const vector> > &values, - const vector &reset_points, - const vector &start_trans, - const vector> &trans, - const vector &end_trans, - const vector fg_alpha, const vector fg_beta, - const vector bg_alpha, const vector bg_beta, - vector &classes, - vector &llr_scores, - const vector &array_status) const { - +TwoStateHMM::PosteriorDecoding_rep( + const vector>> &values, + const vector &reset_points, const vector &start_trans, + const vector> &trans, const vector &end_trans, + const vector &fg_alpha, const vector &fg_beta, + const vector &bg_alpha, const vector &bg_beta, + vector &classes, vector &llr_scores, + const vector &array_status) const { vector> fg_distro; vector> bg_distro; @@ -715,26 +644,20 @@ TwoStateHMM::PosteriorDecoding_rep(const vector> > & for (size_t i = 0; i < trans.size(); ++i) assert(trans[i].size() >= 2); - return PosteriorDecoding_rep(values, reset_points, - start_trans[0], start_trans[1], - trans[0][0], trans[0][1], end_trans[0], - trans[1][0], trans[1][1], end_trans[1], - fg_distro, bg_distro, classes, llr_scores); + return PosteriorDecoding_rep( + values, reset_points, start_trans[0], start_trans[1], trans[0][0], + trans[0][1], end_trans[0], trans[1][0], trans[1][1], end_trans[1], + fg_distro, bg_distro, classes, llr_scores); } - void TwoStateHMM::TransitionPosteriors_rep( - const vector>> &values, - const vector &reset_points, - double p_sf, double p_sb, - double p_ff, double p_fb, double p_ft, - double p_bf, double p_bb, double p_bt, - const vector> &fg_distro, - const vector> &bg_distro, - const size_t transition, - vector &scores) const { - + const vector>> &values, + const vector &reset_points, double p_sf, double p_sb, double p_ff, + double p_fb, double p_ft, double p_bf, double p_bb, double p_bt, + const vector> &fg_distro, + const vector> &bg_distro, + const size_t transition, vector &scores) const { size_t NREP = values.size(); const double lp_sf = log(p_sf); const double lp_sb = log(p_sb); @@ -745,32 +668,24 @@ 
TwoStateHMM::TransitionPosteriors_rep( const double lp_bb = log(p_bb); const double lp_bt = log(p_bt); - assert(isfinite(lp_sf) && isfinite(lp_sb) && - isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && - isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); + assert(isfinite(lp_sf) && isfinite(lp_sb) && isfinite(lp_ff) && + isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && + isfinite(lp_bb) && isfinite(lp_bt)); - vector> forward(values[0].size(), - make_pair(0.0, 0.0)); - vector> backward(values[0].size(), - make_pair(0.0, 0.0)); + vector> forward(values[0].size(), make_pair(0.0, 0.0)); + vector> backward(values[0].size(), make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const double score = forward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, forward); - const double backward_score = - backward_algorithm_rep(values, reset_points[i], reset_points[i + 1], - lp_sf, lp_sb, lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, backward); - - if (DEBUG && (fabs(score - backward_score)/ - max(score, backward_score)) > 1e-10) + const double score = forward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, forward); + const double backward_score = backward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, backward); + + if (DEBUG && + (fabs(score - backward_score) / max(score, backward_score)) > 1e-10) cerr << "fabs(score - backward_score)/" - << "max(score, backward_score) > 1e-10" << endl; + << "max(score, backward_score) > 1e-10\n"; } scores.resize(values[0].size()); size_t j = 0; @@ -780,26 +695,26 @@ TwoStateHMM::TransitionPosteriors_rep( scores[i] = 0; } else { - double fg_vals=0; - double bg_vals=0; - for(size_t r = 0; r < NREP; ++r) { - fg_vals+=(*fg_distro[r])(values[r][i]); - bg_vals+=(*bg_distro[r])(values[r][i]); + double fg_vals = 0; + double bg_vals = 0; + for (size_t r = 0; r < NREP; ++r) { + fg_vals += (*fg_distro[r])(values[r][i]); + bg_vals += (*bg_distro[r])(values[r][i]); } fg_vals /= NREP; bg_vals /= NREP; - const double fg_to_fg_state = forward[i - 1].first + lp_ff + - fg_vals + backward[i].first; - const double fg_to_bg_state = forward[i - 1].first + lp_fb + - bg_vals + backward[i].second; - const double bg_to_fg_state = forward[i - 1].second + lp_bf + - fg_vals + backward[i].first; - const double bg_to_bg_state = forward[i - 1].second + lp_bb + - bg_vals + backward[i].second; - const double denom = log_sum_log( - log_sum_log(fg_to_fg_state, fg_to_bg_state), - log_sum_log(bg_to_fg_state, bg_to_bg_state)); + const double fg_to_fg_state = + forward[i - 1].first + lp_ff + fg_vals + backward[i].first; + const double fg_to_bg_state = + forward[i - 1].first + lp_fb + bg_vals + backward[i].second; + const double bg_to_fg_state = + forward[i - 1].second + lp_bf + fg_vals + backward[i].first; + const double bg_to_bg_state = + forward[i - 1].second + lp_bb + bg_vals + backward[i].second; + const double denom = + log_sum_log(log_sum_log(fg_to_fg_state, fg_to_bg_state), + log_sum_log(bg_to_fg_state, bg_to_bg_state)); double numerator = fg_to_fg_state; if (transition == 1) numerator = fg_to_bg_state; @@ -812,17 +727,14 @@ TwoStateHMM::TransitionPosteriors_rep( } } - double -TwoStateHMM::PosteriorDecoding_rep(const vector> > &values, - const 
vector &reset_points, - double p_sf, double p_sb, - double p_ff, double p_fb, double p_ft, - double p_bf, double p_bb, double p_bt, - const vector> &fg_distro, - const vector> &bg_distro, - vector &classes, - vector &llr_scores) const { +TwoStateHMM::PosteriorDecoding_rep( + const vector>> &values, + const vector &reset_points, double p_sf, double p_sb, double p_ff, + double p_fb, double p_ft, double p_bf, double p_bb, double p_bt, + const vector> &fg_distro, + const vector> &bg_distro, + vector &classes, vector &llr_scores) const { double total_score = 0; const double lp_sf = log(p_sf); @@ -834,36 +746,25 @@ TwoStateHMM::PosteriorDecoding_rep(const vector> > & const double lp_bb = log(p_bb); const double lp_bt = log(p_bt); - assert(isfinite(lp_sf) && isfinite(lp_sb) && - isfinite(lp_ff) && isfinite(lp_fb) && isfinite(lp_ft) && - isfinite(lp_bf) && isfinite(lp_bb) && isfinite(lp_bt)); + assert(isfinite(lp_sf) && isfinite(lp_sb) && isfinite(lp_ff) && + isfinite(lp_fb) && isfinite(lp_ft) && isfinite(lp_bf) && + isfinite(lp_bb) && isfinite(lp_bt)); vector> forward(values[0].size(), make_pair(0.0, 0.0)); vector> backward(values[0].size(), make_pair(0.0, 0.0)); for (size_t i = 0; i < reset_points.size() - 1; ++i) { - const double score = forward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - forward); - - const double backward_score = - backward_algorithm_rep(values, - reset_points[i], - reset_points[i + 1], - lp_sf, lp_sb, - lp_ff, lp_fb, lp_ft, - lp_bf, lp_bb, lp_bt, - fg_distro, bg_distro, - backward); - - if (DEBUG && (fabs(score - backward_score)/ - max(score, backward_score)) > 1e-10) + const double score = forward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, forward); + + const double backward_score = backward_algorithm_rep( + values, reset_points[i], reset_points[i + 1], lp_sf, lp_sb, lp_ff, lp_fb, + lp_ft, lp_bf, lp_bb, lp_bt, fg_distro, bg_distro, backward); + + if (DEBUG && + (fabs(score - backward_score) / max(score, backward_score)) > 1e-10) cerr << "fabs(score - backward_score)/" - << "max(score, backward_score) > 1e-10" << endl; + << "max(score, backward_score) > 1e-10\n"; total_score += score; } @@ -881,5 +782,6 @@ TwoStateHMM::PosteriorDecoding_rep(const vector> > & return total_score; } -/***********End of functions for multiple replicates**************/ -//////////////////////////////////////////////////////////////////////////////// +// End of functions for multiple replicates + +// NOLINTEND(*-avoid-magic-numbers,*-owning-memory,*-narrowing-conversions) From 35ceb8fa97c5e7aca80fc1ff3bdaa084f73cc6a0 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 039/106] src/common/TwoStateHMM_PMD.hpp: changes to add static analysis --- src/common/TwoStateHMM_PMD.hpp | 261 +++++++++++++++------------------ 1 file changed, 117 insertions(+), 144 deletions(-) diff --git a/src/common/TwoStateHMM_PMD.hpp b/src/common/TwoStateHMM_PMD.hpp index ea27e24b..8d5c0275 100644 --- a/src/common/TwoStateHMM_PMD.hpp +++ b/src/common/TwoStateHMM_PMD.hpp @@ -18,79 +18,68 @@ #ifndef TWO_STATE_HMM_PMD_HPP #define TWO_STATE_HMM_PMD_HPP -#include -#include -#include -#include -#include - -#include -#include -#include "smithlab_utils.hpp" + #include "EmissionDistribution.hpp" +#include +#include // IWYU pragma: keep +#include +#include +#include +#include + class 
TwoStateHMM { public: - - TwoStateHMM(const double mp, const double tol, - const size_t max_itr, const bool v, bool d = false) : - MIN_PROB(mp), tolerance(tol), max_iterations(max_itr), - VERBOSE(v), DEBUG(d) {} + TwoStateHMM(const double mp, const double tol, const size_t max_itr, + const bool v, bool d = false) : + MIN_PROB(mp), tolerance(tol), max_iterations(max_itr), VERBOSE(v), + DEBUG(d) {} /***************************/ /* for multiple replicates */ double BaumWelchTraining_rep( - const std::vector > > &values, - const std::vector &reset_points, - std::vector &start_trans, - std::vector > &trans, - std::vector &end_trans, - std::vector &fg_alpha, std::vector &fg_beta, - std::vector &bg_alpha, std::vector &bg_beta, - const std::vector &array_status) const; + const std::vector>> &values, + const std::vector &reset_points, std::vector &start_trans, + std::vector> &trans, std::vector &end_trans, + std::vector &fg_alpha, std::vector &fg_beta, + std::vector &bg_alpha, std::vector &bg_beta, + const std::vector &array_status) const; double PosteriorDecoding_rep( - const std::vector > > &values, - const std::vector &reset_points, - const std::vector &start_trans, - const std::vector > &trans, - const std::vector &end_trans, - const std::vector fg_alpha, const std::vector fg_beta, - const std::vector bg_alpha, const std::vector bg_beta, - std::vector &classes, - std::vector &llr_scores, - const std::vector &array_status) const; - - void PosteriorScores_rep( const std::vector>> &values, const std::vector &reset_points, const std::vector &start_trans, const std::vector> &trans, - const std::vector &end_trans, - const std::vector fg_alpha, - const std::vector fg_beta, - const std::vector bg_alpha, - const std::vector bg_beta, - const std::vector &classes, + const std::vector &end_trans, const std::vector &fg_alpha, + const std::vector &fg_beta, const std::vector &bg_alpha, + const std::vector &bg_beta, std::vector &classes, + std::vector &llr_scores, + const std::vector &array_status) const; + + void + PosteriorScores_rep( + const std::vector>> &values, + const std::vector &reset_points, + const std::vector &start_trans, + const std::vector> &trans, + const std::vector &end_trans, const std::vector &fg_alpha, + const std::vector &fg_beta, const std::vector &bg_alpha, + const std::vector &bg_beta, const std::vector &classes, std::vector &llr_scores, const std::vector &array_status) const; void TransitionPosteriors_rep( - const std::vector > > &values, - const std::vector &reset_points, - const std::vector &start_trans, - const std::vector > &trans, - const std::vector &end_trans, - const std::vector &fg_alpha, - const std::vector &fg_beta, - const std::vector &bg_alpha, - const std::vector &bg_beta, - const std::vector &array_status, - const size_t transition, - std::vector &scores) const; + const std::vector>> &values, + const std::vector &reset_points, + const std::vector &start_trans, + const std::vector> &trans, + const std::vector &end_trans, const std::vector &fg_alpha, + const std::vector &fg_beta, const std::vector &bg_alpha, + const std::vector &bg_beta, const std::vector &array_status, + const size_t transition, std::vector &scores) const; /***************************/ @@ -101,112 +90,98 @@ class TwoStateHMM { static const size_t BG_TO_FG_TRANSITION = 2; private: - double log_sum_log_vec(const std::vector &vals, size_t limit) const; void - estimate_emissions(const std::vector > &f, - const std::vector > &b, + estimate_emissions(const std::vector> &f, + const std::vector> &b, std::vector 
&fg_probs, std::vector &bg_probs) const; void - estimate_transitions(const std::vector > &vals, - const size_t start, const size_t end, - const std::vector > &f, - const std::vector > &b, - const double total, - const EmissionDistribution &fg_distro, - const EmissionDistribution &bg_distro, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, - const double lp_ft, const double lp_bt, - std::vector &ff_vals, - std::vector &fb_vals, - std::vector &bf_vals, - std::vector &bb_vals) const; - + estimate_transitions( + const std::vector> &vals, const size_t start, + const size_t end, const std::vector> &f, + const std::vector> &b, const double total, + const EmissionDistribution &fg_distro, + const EmissionDistribution &bg_distro, const double lp_ff, + const double lp_fb, const double lp_bf, const double lp_bb, + const double lp_ft, const double lp_bt, std::vector &ff_vals, + std::vector &fb_vals, std::vector &bf_vals, + std::vector &bb_vals) const; /***************************/ /* for multiple replicates */ double forward_algorithm_rep( - const std::vector > > &vals, - const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, const double lp_ft, - const double lp_bf, const double lp_bb, const double lp_bt, - const std::vector > &fg_distro, - const std::vector > &bg_distro, - std::vector > &f) const; + const std::vector>> &vals, + const size_t start, const size_t end, const double lp_sf, + const double lp_sb, const double lp_ff, const double lp_fb, + const double lp_ft, const double lp_bf, const double lp_bb, + const double lp_bt, + const std::vector> &fg_distro, + const std::vector> &bg_distro, + std::vector> &f) const; double backward_algorithm_rep( - const std::vector > > &vals, - const size_t start, const size_t end, - const double lp_sf, const double lp_sb, - const double lp_ff, const double lp_fb, const double lp_ft, - const double lp_bf, const double lp_bb, const double lp_bt, - const std::vector > &fg_distro, - const std::vector > &bg_distro, - std::vector > &b) const; + const std::vector>> &vals, + const size_t start, const size_t end, const double lp_sf, + const double lp_sb, const double lp_ff, const double lp_fb, + const double lp_ft, const double lp_bf, const double lp_bb, + const double lp_bt, + const std::vector> &fg_distro, + const std::vector> &bg_distro, + std::vector> &b) const; void estimate_transitions_rep( - const std::vector > > &vals, - const size_t start, const size_t end, - const std::vector > &f, - const std::vector > &b, - const double total, - const std::vector > &fg_distro, - const std::vector > &bg_distro, - const double lp_ff, const double lp_fb, - const double lp_bf, const double lp_bb, - std::vector &ff_vals, - std::vector &fb_vals, - std::vector &bf_vals, - std::vector &bb_vals) const; + const std::vector>> &vals, + const size_t start, const size_t end, + const std::vector> &f, + const std::vector> &b, const double total, + const std::vector> &fg_distro, + const std::vector> &bg_distro, + const double lp_ff, const double lp_fb, const double lp_bf, + const double lp_bb, std::vector &ff_vals, + std::vector &fb_vals, std::vector &bf_vals, + std::vector &bb_vals) const; double single_iteration_rep( - const std::vector > > &values, - const std::vector > &vals_a, - const std::vector > &vals_b, - const std::vector &reset_points, - std::vector > &forward, - std::vector > &backward, - double &p_sf, double &p_sb, - double &p_ff, double &p_fb, double &p_ft, - double &p_bf, double 
&p_bb, double &p_bt, - std::vector > &fg_distro, - std::vector > &bg_distro) const; - + const std::vector>> &values, + const std::vector> &vals_a, + const std::vector> &vals_b, + const std::vector &reset_points, + std::vector> &forward, + std::vector> &backward, double &p_sf, + double &p_sb, double &p_ff, double &p_fb, double &p_ft, double &p_bf, + double &p_bb, double &p_bt, + std::vector> &fg_distro, + std::vector> &bg_distro) const; double BaumWelchTraining_rep( - const std::vector > > &values, - const std::vector &reset_points, - double &p_sf, double &p_sb, - double &p_ff, double &p_fb, double &p_ft, - double &p_bf, double &p_bb, double &p_bt, - std::vector > &fg_distro, - std::vector > &bg_distro, - const std::vector &array_status) const; + const std::vector>> &values, + const std::vector &reset_points, double &p_sf, double &p_sb, + double &p_ff, double &p_fb, double &p_ft, double &p_bf, double &p_bb, + double &p_bt, std::vector> &fg_distro, + std::vector> &bg_distro, + const std::vector &array_status) const; void PosteriorScores_rep( - const std::vector > > &values, - const std::vector &reset_points, - double p_sf, double p_sb, - double p_ff, double p_fb, double p_ft, - double p_bf, double p_bb, double p_bt, - const std::vector > &fg_distro, - const std::vector > &bg_distro, - const std::vector &classes, - std::vector &llr_scores) const; - - double PosteriorDecoding_rep( + const std::vector>> &values, + const std::vector &reset_points, double p_sf, double p_sb, + double p_ff, double p_fb, double p_ft, double p_bf, double p_bb, + double p_bt, + const std::vector> &fg_distro, + const std::vector> &bg_distro, + const std::vector &classes, std::vector &llr_scores) const; + + double + PosteriorDecoding_rep( const std::vector>> &values, const std::vector &reset_points, double p_sf, double p_sb, double p_ff, double p_fb, double p_ft, double p_bf, double p_bb, @@ -215,30 +190,28 @@ class TwoStateHMM { const std::vector> &bg_distro, std::vector &classes, std::vector &llr_scores) const; - void TransitionPosteriors_rep( + void + TransitionPosteriors_rep( const std::vector>> &values, - const std::vector &reset_points, - double p_sf, double p_sb, - double p_ff, double p_fb, - double p_ft, double p_bf, - double p_bb, double p_bt, + const std::vector &reset_points, double p_sf, double p_sb, + double p_ff, double p_fb, double p_ft, double p_bf, double p_bb, + double p_bt, const std::vector> &fg_distro, const std::vector> &bg_distro, - const size_t transition, - std::vector &scores) const; + const size_t transition, std::vector &scores) const; /***************************/ double log_sum_log(const double p, const double q) const; - double MIN_PROB; - double tolerance; - size_t max_iterations; - bool VERBOSE; - bool DEBUG; + double MIN_PROB{}; + double tolerance{}; + size_t max_iterations{}; + bool VERBOSE{}; + bool DEBUG{}; - mutable size_t emission_correction_count; + mutable size_t emission_correction_count{}; }; #endif From 83cb6fcf085b7ec4ce675ac87afa6179c049a080 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 040/106] src/common/bam_record_utils.cpp: changes to add static analysis --- src/common/bam_record_utils.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index 8e2c2076..4cdf3d32 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -14,16 +14,21 @@ */ #include "bam_record_utils.hpp" +#include 
"dnmt_error.hpp" + +#include #include #include +#include +#include +#include +#include #include #include #include - -#include "dnmt_error.hpp" -#include "smithlab_utils.hpp" +#include using std::runtime_error; using std::string; @@ -34,6 +39,8 @@ using std::vector; using bamxx::bam_header; using bamxx::bam_rec; +// NOLINTBEGIN(*-pointer-arithmetic,*-avoid-magic-numbers,*-type-reinterpret-cast,*-owning-memory,*-no-malloc,*-narrowing-conversions,*-avoid-c-arrays,*-constant-array-index) + /// functions in place of undefd macro static inline bool bam_is_rev(const bam1_t *b) { @@ -838,7 +845,7 @@ keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) { // ADS: will move to using this function once it is written static inline void standardize_format(const string &input_format, bam1_t *aln) { - int err_code; // = 0; + int err_code{}; if (input_format == "abismal" || input_format == "walt") return; @@ -926,7 +933,8 @@ standardize_format(const string &input_format, bam1_t *aln) { } void -standardize_format(const string &input_format, bam_rec &aln) { +standardize_format(const string &input_format, + bam_rec &aln) { // cppcheck-suppress constParameterReference standardize_format(input_format, aln.b); } @@ -988,7 +996,8 @@ to_string(const bam_header &hdr, const bam_rec &aln) { throw runtime_error("Can't format record: " + to_string(hdr, aln)); } const std::string s = string(ks.s); - if (ks.s != nullptr) - free(ks.s); + ks_free(&ks); return s; } + +// NOLINTEND(*-pointer-arithmetic,*-avoid-magic-numbers,*-type-reinterpret-cast,*-owning-memory,*-no-malloc,*-narrowing-conversions,*-avoid-c-arrays,*-constant-array-index) From cf7f8957458d20f1d8e2dbe29272d63854f6b85f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 041/106] src/common/bam_record_utils.hpp: changes to add static analysis --- src/common/bam_record_utils.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/bam_record_utils.hpp b/src/common/bam_record_utils.hpp index 46221960..972f2ffa 100644 --- a/src/common/bam_record_utils.hpp +++ b/src/common/bam_record_utils.hpp @@ -29,6 +29,10 @@ #include +#include + +#include +#include #include #ifdef bam_is_rev @@ -250,7 +254,8 @@ inline bool precedes_by_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { const auto end_a = bam_endpos(a.b); const auto end_b = bam_endpos(b.b); - return end_a < end_b || (end_a == end_b && bam_is_rev(a) < bam_is_rev(b)); + return end_a < end_b || + (end_a == end_b && bam_is_rev(a) == false && bam_is_rev(b) == true); } inline bool From 74273db69961ac80a41ede328920c10a87f555a1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 042/106] src/common/bsutils.cpp: changes to add static analysis --- src/common/bsutils.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/common/bsutils.cpp b/src/common/bsutils.cpp index 7d0a661e..d1e857da 100644 --- a/src/common/bsutils.cpp +++ b/src/common/bsutils.cpp @@ -17,11 +17,16 @@ #include "dnmtools_gaussinv.hpp" #include -#include +#include +#include #include +#include +#include #include #include +#include +#include #include void @@ -61,7 +66,6 @@ void relative_sort(const std::vector &mapped_locations, const std::vector &names, std::vector &lookup) { - std::unordered_map names_map; for (std::size_t i = 0; i < std::size(names); ++i) names_map[names[i]] = i; From 34518108cdb775e401284b49e3f57964a0ea2840 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: 
Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 043/106] src/common/bsutils.hpp: changes to add static analysis --- src/common/bsutils.hpp | 59 ++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/src/common/bsutils.hpp b/src/common/bsutils.hpp index 156e47a4..30aa50d6 100644 --- a/src/common/bsutils.hpp +++ b/src/common/bsutils.hpp @@ -18,56 +18,56 @@ #ifndef BSUTILS_HPP #define BSUTILS_HPP +#include #include #include -#include - -#include -#include +class GenomicRegion; inline bool -is_cytosine(char c) {return (c == 'c' || c == 'C');} +is_cytosine(char c) { + return (c == 'c' || c == 'C'); +} inline bool -is_guanine(char c) {return (c == 'g' || c == 'G');} +is_guanine(char c) { + return (c == 'g' || c == 'G'); +} inline bool -is_thymine(char c) {return (c == 't' || c == 'T');} +is_thymine(char c) { + return (c == 't' || c == 'T'); +} inline bool -is_adenine(char c) {return (c == 'a' || c == 'A');} - +is_adenine(char c) { + return (c == 'a' || c == 'A'); +} //// CONFIDENCE INTERVALS //**************//////////////////////// void -wilson_ci_for_binomial(const double alpha, const double n, - const double p_hat, double &lower, double &upper); - +wilson_ci_for_binomial(const double alpha, const double n, const double p_hat, + double &lower, double &upper); inline bool is_cpg(const std::string &s, size_t i) { - return (i < (s.length() - 1)) && - is_cytosine(s[i]) && is_guanine(s[i + 1]); + return (i < (s.length() - 1)) && is_cytosine(s[i]) && is_guanine(s[i + 1]); } - void -adjust_region_ends(const std::vector > &clusters, +adjust_region_ends(const std::vector> &clusters, std::vector ®ions); - void relative_sort(const std::vector &mapped_locations, const std::vector &names, std::vector &lookup); - -template static void +template +static void separate_regions(const std::vector &big_regions, - const std::vector ®ions, - const std::vector &seqs, - std::vector > &sep_regions, - std::vector > &sep_seqs) { + const std::vector ®ions, const std::vector &seqs, + std::vector> &sep_regions, + std::vector> &sep_seqs) { size_t rr_id = 0; const size_t n_regions = regions.size(); assert(n_regions <= seqs.size()); @@ -79,14 +79,12 @@ separate_regions(const std::vector &big_regions, const std::string current_chrom(big_regions[i].get_chrom()); const size_t current_start = big_regions[i].get_start(); const size_t current_end = big_regions[i].get_end(); - while (rr_id < n_regions && - (regions[rr_id].get_chrom() < current_chrom || - (regions[rr_id].get_chrom() == current_chrom && - regions[rr_id].get_end() <= current_start))) + while (rr_id < n_regions && (regions[rr_id].get_chrom() < current_chrom || + (regions[rr_id].get_chrom() == current_chrom && + regions[rr_id].get_end() <= current_start))) ++rr_id; - while (rr_id < n_regions && - (regions[rr_id].get_chrom() == current_chrom && - regions[rr_id].get_start() < current_end)) { + while (rr_id < n_regions && (regions[rr_id].get_chrom() == current_chrom && + regions[rr_id].get_start() < current_end)) { sep_regions[i].push_back(regions[rr_id]); sep_seqs[i].push_back(seqs[rr_id]); ++rr_id; @@ -94,5 +92,4 @@ separate_regions(const std::vector &big_regions, } } - #endif From fd6fc8c5883e296a0371c990ede5809516ece36e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 044/106] src/common/counts_header.cpp: changes to add static analysis --- src/common/counts_header.cpp | 40 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git 
a/src/common/counts_header.cpp b/src/common/counts_header.cpp index b835ec53..d99ab1ad 100644 --- a/src/common/counts_header.cpp +++ b/src/common/counts_header.cpp @@ -17,23 +17,25 @@ */ #include "counts_header.hpp" +#include "bam_record_utils.hpp" +#include "dnmt_error.hpp" + +#include + +#include + +#include +#include #include +#include #include #include -#include +#include #include #include #include -#include "bam_record_utils.hpp" - -// generated by autotools -#include - -#include "bamxx.hpp" -#include "dnmt_error.hpp" - using std::string; using std::to_string; using std::unordered_map; @@ -54,7 +56,7 @@ write_counts_header_from_chrom_sizes(const vector &chrom_names, for (auto i = 0u; i < size(chrom_sizes); ++i) { const string tmp = "#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n"; - out.write(tmp.data()); + out.write(tmp); chrom_order.emplace(chrom_names[i], chrom_count++); } out.write("#\n"); @@ -86,28 +88,26 @@ write_counts_header_from_file(const string &header_file, bgzf_file &out) { bamxx::bgzf_file & skip_counts_header(bamxx::bgzf_file &in) { - // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); if (ret) return in; - while (bamxx::getline(in, line) && line.s[0] == '#') { - if (line.s[0] == '#' && line.l == 1) + while (bamxx::getline(in, line) && line.s[0] == '#') // NOLINT + if (line.l == 1) { + ks_free(&line); return in; - } - // otherwise we have missed the final line of the header - assert(line.s[0] != '#'); + } + ks_free(&line); return in; } int -get_chrom_sizes_for_counts_header(const uint32_t n_threads, +get_chrom_sizes_for_counts_header(const std::int32_t n_threads, const string &filename, vector &chrom_names, vector &chrom_sizes) { - bamxx::bam_tpool tpool(n_threads); bgzf_file in(filename, "r"); @@ -127,10 +127,10 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads, uint64_t chrom_size = 0; while (getline(in, line)) { - if (line.s[0] == '>') { + if (line.s[0] == '>') { // NOLINT(*-pointer-arithmetic) if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size); - chrom_names.emplace_back(line.s + 1); + chrom_names.emplace_back(line.s + 1); // NOLINT(*-pointer-arithmetic) chrom_size = 0; } else From 4f251b27ae77aa3384d05fb64bb96387f5574be1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 045/106] src/common/counts_header.hpp: changes to add static analysis --- src/common/counts_header.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/common/counts_header.hpp b/src/common/counts_header.hpp index 5f380074..dd8e0bef 100644 --- a/src/common/counts_header.hpp +++ b/src/common/counts_header.hpp @@ -23,8 +23,12 @@ #include #include #include - -#include "bamxx.hpp" +namespace bamxx { +struct bam_header; +} +namespace bamxx { +struct bgzf_file; +} std::unordered_map write_counts_header_from_chrom_sizes( @@ -37,7 +41,7 @@ write_counts_header_from_file(const std::string &header_file, // returns -1 on failure, 0 on success int -get_chrom_sizes_for_counts_header(const uint32_t n_threads, +get_chrom_sizes_for_counts_header(const std::int32_t n_threads, const std::string &filename, std::vector &chrom_names, std::vector &chrom_sizes); From 5d9f503952cbf7e66a5d25c03fa755b303c71f93 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 046/106] src/common/dnmt_error.hpp: changes to add static analysis --- src/common/dnmt_error.hpp | 24 +++++++++++++----------- 
1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/common/dnmt_error.hpp b/src/common/dnmt_error.hpp index 38a714c1..9632cf69 100644 --- a/src/common/dnmt_error.hpp +++ b/src/common/dnmt_error.hpp @@ -16,17 +16,17 @@ #ifndef DNMT_ERROR_HPP #define DNMT_ERROR_HPP -#include -#include +#include // for int64_t #include -#include // for int64_t #include +#include +#include -struct dnmt_error: public std::exception { - int64_t err; // error possibly from HTSlib - int the_errno; // ERRNO at time of construction - std::string msg; // the message - std::string the_what; // to report +struct dnmt_error : public std::exception { + int64_t err; // error possibly from HTSlib + int the_errno; // ERRNO at time of construction + std::string msg; // the message + std::string the_what; // to report dnmt_error(const int64_t _err, const std::string &_msg) : err{_err}, the_errno{errno}, msg{_msg} { std::ostringstream oss; @@ -34,9 +34,11 @@ struct dnmt_error: public std::exception { << "[" << strerror(the_errno) << "][" << msg << "]"; the_what = oss.str(); } - dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {} - const char* - what() const noexcept override {return the_what.c_str();} + explicit dnmt_error(const std::string &_msg) : dnmt_error(0, _msg) {} + const char * + what() const noexcept override { + return the_what.c_str(); + } }; #endif From d8c90979978403fbce1cfcc90a4ce63ef82ef9b4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 047/106] src/common/dnmtools_gaussinv.cpp: changes to add static analysis --- src/common/dnmtools_gaussinv.cpp | 181 +++++++++++++++++++++---------- 1 file changed, 123 insertions(+), 58 deletions(-) diff --git a/src/common/dnmtools_gaussinv.cpp b/src/common/dnmtools_gaussinv.cpp index 681229af..bae01391 100644 --- a/src/common/dnmtools_gaussinv.cpp +++ b/src/common/dnmtools_gaussinv.cpp @@ -74,40 +74,61 @@ * */ +#include "dnmtools_gaussinv.hpp" + #include +#include #include -using std::sqrt; +// ADS: they are all magic... 
-static double +// NOLINTBEGIN(*-avoid-magic-numbers,*-avoid-c-arrays,*-pointer-arithmetic,*-array-to-pointer-decay,*-constant-array-index) + +[[nodiscard]] static inline double rat_eval(const double a[], const size_t na, const double b[], const size_t nb, const double x) { double u = a[na - 1]; - for (size_t i = na - 1; i > 0; i--) { u = x * u + a[i - 1]; } + for (size_t i = na - 1; i > 0; i--) { + u = x * u + a[i - 1]; + } double v = b[nb - 1]; - for (size_t j = nb - 1; j > 0; j--) { v = x * v + b[j - 1]; } + for (size_t j = nb - 1; j > 0; j--) { + v = x * v + b[j - 1]; + } return u / v; } static double small(double q) { - const double a[8] = {3.387132872796366608, 133.14166789178437745, - 1971.5909503065514427, 13731.693765509461125, - 45921.953931549871457, 67265.770927008700853, - 33430.575583588128105, 2509.0809287301226727}; - - const double b[8] = {1.0, - 42.313330701600911252, - 687.1870074920579083, - 5394.1960214247511077, - 21213.794301586595867, - 39307.89580009271061, - 28729.085735721942674, - 5226.495278852854561}; + // clang-format off + const double a[8] = { + 3.387132872796366608, + 133.14166789178437745, + 1971.5909503065514427, + 13731.693765509461125, + 45921.953931549871457, + 67265.770927008700853, + 33430.575583588128105, + 2509.0809287301226727, + }; + // clang-format on + + // clang-format off + const double b[8] = { + 1.0, + 42.313330701600911252, + 687.1870074920579083, + 5394.1960214247511077, + 21213.794301586595867, + 39307.89580009271061, + 28729.085735721942674, + 5226.495278852854561, + }; + // clang-format on const double r = 0.180625 - q * q; @@ -116,38 +137,62 @@ small(double q) { static double intermediate(double r) { - const double a[] = {1.42343711074968357734, 4.6303378461565452959, - 5.7694972214606914055, 3.64784832476320460504, - 1.27045825245236838258, 0.24178072517745061177, - 0.0227238449892691845833, 7.7454501427834140764e-4}; - - const double b[] = {1.0, - 2.05319162663775882187, - 1.6763848301838038494, - 0.68976733498510000455, - 0.14810397642748007459, - 0.0151986665636164571966, - 5.475938084995344946e-4, - 1.05075007164441684324e-9}; + // clang-format off + const double a[] = { + 1.42343711074968357734, + 4.6303378461565452959, + 5.7694972214606914055, + 3.64784832476320460504, + 1.27045825245236838258, + 0.24178072517745061177, + 0.0227238449892691845833, + 7.7454501427834140764e-4, + }; + // clang-format on + + // clang-format off + const double b[] = { + 1.0, + 2.05319162663775882187, + 1.6763848301838038494, + 0.68976733498510000455, + 0.14810397642748007459, + 0.0151986665636164571966, + 5.475938084995344946e-4, + 1.05075007164441684324e-9, + }; + // clang-format on return rat_eval(a, 8, b, 8, (r - 1.6)); } static double tail(double r) { - const double a[] = {6.6579046435011037772, 5.4637849111641143699, - 1.7848265399172913358, 0.29656057182850489123, - 0.026532189526576123093, 0.0012426609473880784386, - 2.71155556874348757815e-5, 2.01033439929228813265e-7}; - - const double b[] = {1.0, - 0.59983220655588793769, - 0.13692988092273580531, - 0.0148753612908506148525, - 7.868691311456132591e-4, - 1.8463183175100546818e-5, - 1.4215117583164458887e-7, - 2.04426310338993978564e-15}; + // clang-format off + const double a[] = { + 6.6579046435011037772, + 5.4637849111641143699, + 1.7848265399172913358, + 0.29656057182850489123, + 0.026532189526576123093, + 0.0012426609473880784386, + 2.71155556874348757815e-5, + 2.01033439929228813265e-7, + }; + // clang-format on + + // clang-format off + const double b[] = { + 1.0, + 
0.59983220655588793769, + 0.13692988092273580531, + 0.0148753612908506148525, + 7.868691311456132591e-4, + 1.8463183175100546818e-5, + 1.4215117583164458887e-7, + 2.04426310338993978564e-15, + }; + // clang-format on return rat_eval(a, 8, b, 8, (r - 5.0)); } @@ -161,11 +206,12 @@ dnmt_gsl_cdf_ugaussian_Pinv(const double P) { else if (P == 0.0) return -std::numeric_limits::infinity(); - if (fabs(dP) <= 0.425) return small(dP); + if (fabs(dP) <= 0.425) + return small(dP); const double pp = (P < 0.5) ? P : 1.0 - P; - const double r = sqrt(-log(pp)); + const double r = std::sqrt(-log(pp)); const double x = (r <= 5.0) ? intermediate(r) : tail(r); @@ -176,14 +222,19 @@ double dnmt_gsl_cdf_ugaussian_Qinv(const double Q) { const double dQ = Q - 0.5; - if (Q == 1.0) { return -std::numeric_limits::infinity(); } - else if (Q == 0.0) { return std::numeric_limits::infinity(); } + if (Q == 1.0) { + return -std::numeric_limits::infinity(); + } + else if (Q == 0.0) { + return std::numeric_limits::infinity(); + } - if (fabs(dQ) <= 0.425) return -small(dQ); + if (fabs(dQ) <= 0.425) + return -small(dQ); const double pp = (Q < 0.5) ? Q : 1.0 - Q; - const double r = sqrt(-log(pp)); + const double r = std::sqrt(-log(pp)); const double x = (r <= 5.0) ? intermediate(r) : tail(r); @@ -239,9 +290,9 @@ get_del(double x, double rational) { */ static double gauss_small(const double x) { - double xsq; - double xnum; - double xden; + double xsq{}; + double xnum{}; + double xden{}; const double a[5] = {2.2352520354606839287, 161.02823106855587881, 1067.6894854603709582, 18154.981253343561249, @@ -333,7 +384,9 @@ dnmt_gsl_cdf_ugaussian_P(const double x) { else if (absx < dnmt_SQRT32) { result = gauss_medium(x); - if (x > 0.0) { result = 1.0 - result; } + if (x > 0.0) { + result = 1.0 - result; + } return result; } @@ -346,7 +399,9 @@ dnmt_gsl_cdf_ugaussian_P(const double x) { else { result = gauss_large(x); - if (x > 0.0) { result = 1.0 - result; } + if (x > 0.0) { + result = 1.0 - result; + } } return result; @@ -363,15 +418,21 @@ dnmt_gsl_cdf_ugaussian_Q(const double x) { else if (absx < 0.66291) { result = gauss_small(x); - if (x < 0.0) { result = fabs(result) + 0.5; } - else { result = 0.5 - result; } + if (x < 0.0) { + result = fabs(result) + 0.5; + } + else { + result = 0.5 - result; + } return result; } else if (absx < dnmt_SQRT32) { result = gauss_medium(x); - if (x < 0.0) { result = 1.0 - result; } + if (x < 0.0) { + result = 1.0 - result; + } return result; } @@ -384,7 +445,9 @@ dnmt_gsl_cdf_ugaussian_Q(const double x) { else { result = gauss_large(x); - if (x < 0.0) { result = 1.0 - result; } + if (x < 0.0) { + result = 1.0 - result; + } } return result; } @@ -398,3 +461,5 @@ double dnmt_gsl_cdf_gaussian_Q(const double x, const double sigma) { return dnmt_gsl_cdf_ugaussian_Q(x / sigma); } + +// NOLINTEND(*-avoid-magic-numbers,*-avoid-c-arrays,*-pointer-arithmetic,*-array-to-pointer-decay,*-constant-array-index) From df96788562eca2866d0dc058856996418870c42f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 048/106] src/common/dnmtools_utils.cpp: changes to add static analysis --- src/common/dnmtools_utils.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/common/dnmtools_utils.cpp b/src/common/dnmtools_utils.cpp index 93f7e4d2..e4206781 100644 --- a/src/common/dnmtools_utils.cpp +++ b/src/common/dnmtools_utils.cpp @@ -26,12 +26,15 @@ using std::ostringstream; using std::string; auto -get_command_line(const int argc, char *argv[]) -> 
std::string { +get_command_line(const int argc, + char *argv[]) -> std::string { // NOLINT(*-c-arrays) if (argc == 0) return std::string{}; std::ostringstream cmd; cmd << '"'; + // NOLINTBEGIN(*-pointer-arithmetic) copy(argv, argv + (argc - 1), ostream_iterator(cmd, " ")); cmd << argv[argc - 1] << '"'; + // NOLINTEND(*-pointer-arithmetic) return cmd.str(); } From 5b84b0093584f3a4538dbf2495e449da5b1128d1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 049/106] src/common/dnmtools_utils.hpp: changes to add static analysis --- src/common/dnmtools_utils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/dnmtools_utils.hpp b/src/common/dnmtools_utils.hpp index dc5a16b6..0a57f97d 100644 --- a/src/common/dnmtools_utils.hpp +++ b/src/common/dnmtools_utils.hpp @@ -19,6 +19,7 @@ #include auto -get_command_line(const int argc, char *argv[]) -> std::string; +get_command_line(const int argc, + char *argv[]) -> std::string; // NOLINT(*-c-arrays) #endif From 116d6fe52be15b89b7fabc8c2b415eea58dd1e6a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 050/106] src/common/numerical_utils.cpp: changes to add static analysis --- src/common/numerical_utils.cpp | 51 +++++++++++++--------------------- 1 file changed, 20 insertions(+), 31 deletions(-) diff --git a/src/common/numerical_utils.cpp b/src/common/numerical_utils.cpp index a77be36b..020056ad 100644 --- a/src/common/numerical_utils.cpp +++ b/src/common/numerical_utils.cpp @@ -16,43 +16,32 @@ #include "numerical_utils.hpp" +#include #include +#include // IWYU pragma: keep #include -#include - -using std::vector; double -log_sum_log_vec(const std::vector &vals, const size_t limit) -{ - const std::vector::const_iterator x = - std::max_element(vals.begin(), vals.begin() + limit); - const double max_val = *x; - const size_t max_idx = x - vals.begin(); - double sum = 1.0; - for (size_t i = 0; i < limit; ++i) - { - if (i != max_idx) - { - sum += exp(vals[i] - max_val); - } - } - return max_val + log(sum); +log_sum_log_vec(const std::vector &vals, const size_t limit) { + const auto x = std::max_element( + std::cbegin(vals), std::cbegin(vals) + static_cast(limit)); + const double max_val = *x; + const std::size_t max_idx = std::distance(std::cbegin(vals), x); + double sum = 1.0; + for (std::size_t i = 0; i < limit; ++i) + if (i != max_idx) + sum += std::exp(vals[i] - max_val); // cppcheck-suppress useStlAlgorithm + return max_val + std::log(sum); } double log_sum_log(const std::vector::const_iterator &begin, - const std::vector::const_iterator &end) -{ - const std::vector::const_iterator max_itr = - std::max_element(begin, end); - const double max_val = *max_itr; - - double sum = 1.0; - for (std::vector::const_iterator itr = begin; itr < end; ++itr) - if (itr != max_itr) sum += exp(*itr - max_val); - - return max_val + log(sum); + const std::vector::const_iterator &end) { + const auto max_itr = std::max_element(begin, end); + const double max_val = *max_itr; + double sum = 1.0; + for (auto itr = begin; itr < end; ++itr) + if (itr != max_itr) + sum += std::exp(*itr - max_val); // cppcheck-suppress useStlAlgorithm + return max_val + std::log(sum); } - - From 94752cb3c26acc3816a9ad683a68765d9136f242 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 051/106] src/common/numerical_utils.hpp: changes to add static analysis --- src/common/numerical_utils.hpp | 25 +++++++++++++------------ 1 
file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/common/numerical_utils.hpp b/src/common/numerical_utils.hpp index a9da9b80..c5f2de1c 100644 --- a/src/common/numerical_utils.hpp +++ b/src/common/numerical_utils.hpp @@ -18,23 +18,25 @@ #define NUMERICAL_UTILS_HPP #include +#include #include -#include inline double -log_sum_log(const double p, const double q) -{ - if (p == 0) {return q;} - else if (q == 0) {return p;} - const double larger = (p > q) ? p : q; - const double smaller = (p > q) ? q : p; - return larger + log(1.0 + exp(smaller - larger)); +log_sum_log(const double p, const double q) { + if (p == 0) { + return q; + } + else if (q == 0) { + return p; + } + const double larger = (p > q) ? p : q; + const double smaller = (p > q) ? q : p; + return larger + log1p(exp(smaller - larger)); } inline double -log_sum_log(const double p, const double q, const double r) -{ - return log_sum_log(log_sum_log(p, q), r); +log_sum_log(const double p, const double q, const double r) { + return log_sum_log(log_sum_log(p, q), r); } double @@ -45,4 +47,3 @@ log_sum_log(const std::vector::const_iterator &begin, const std::vector::const_iterator &end); #endif - From 64b0b713de2cbffa99014db43cf32bbd11e33565 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 052/106] src/common/xcounts_utils.cpp: changes to add static analysis --- src/common/xcounts_utils.cpp | 91 +++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/src/common/xcounts_utils.cpp b/src/common/xcounts_utils.cpp index b4cb0d58..c7828cbc 100644 --- a/src/common/xcounts_utils.cpp +++ b/src/common/xcounts_utils.cpp @@ -17,41 +17,42 @@ */ #include "xcounts_utils.hpp" +#include "counts_header.hpp" +#include "dnmt_error.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include -#include "counts_header.hpp" -#include "bam_record_utils.hpp" +#include -#include "bamxx.hpp" -#include "dnmt_error.hpp" +#include +#include +#include +#include +#include +#include +#include +#include -using std::vector; +using std::runtime_error; using std::string; using std::to_string; using std::unordered_map; -using std::runtime_error; +using std::vector; using bamxx::bgzf_file; - // careful: this could get big unordered_map> -read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) { +read_xcounts_by_chrom(const std::int32_t n_threads, + const string &xcounts_file) { bamxx::bam_tpool tp(n_threads); bamxx::bgzf_file in(xcounts_file, "r"); - if (!in) throw runtime_error("failed to open input file"); + if (!in) + throw runtime_error("failed to open input file"); // set the threads for the input file decompression - if (n_threads > 1 && in.is_bgzf()) tp.set_io(in); + if (n_threads > 1 && in.is_bgzf()) + tp.set_io(in); kstring_t line{0, 0, nullptr}; string chrom_name; @@ -62,9 +63,11 @@ read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) { vector curr_chrom; while (bamxx::getline(in, line)) { - if (is_counts_header_line(line.s)) continue; // ADS: early loop exit + if (is_counts_header_line(line.s)) + continue; // ADS: early loop exit - if (!std::isdigit(line.s[0])) { // check if we have a chrom line + // check if we have a chrom line + if (!std::isdigit(line.s[0])) { // NOLINT(*-pointer-arithmetic) if (!chrom_name.empty()) { sites_by_chrom.insert({chrom_name, curr_chrom}); curr_chrom.clear(); @@ -74,11 +77,15 @@ read_xcounts_by_chrom(const uint32_t n_threads, const string 
&xcounts_file) { continue; } - uint32_t pos_step = 0, n_meth = 0, n_unmeth = 0; + std::uint32_t pos_step{}; + std::uint32_t n_meth{}; + std::uint32_t n_unmeth{}; + // NOLINTBEGIN(*-pointer-arithmetic) const auto end_line = line.s + line.l; auto res = std::from_chars(line.s, end_line, pos_step); res = std::from_chars(res.ptr + 1, end_line, n_meth); res = std::from_chars(res.ptr + 1, end_line, n_unmeth); + // NOLINTEND(*-pointer-arithmetic) const auto curr_pos = pos + pos_step; @@ -87,17 +94,19 @@ read_xcounts_by_chrom(const uint32_t n_threads, const string &xcounts_file) { pos = curr_pos; } - if (!chrom_name.empty()) sites_by_chrom.insert({chrom_name, curr_chrom}); + if (!chrom_name.empty()) + sites_by_chrom.insert({chrom_name, curr_chrom}); + ks_free(&line); return sites_by_chrom; } - bool get_is_xcounts_file(const std::string &filename) { static constexpr auto max_lines_to_check = 1000ul; bamxx::bgzf_file in(filename, "r"); - if (!in) throw dnmt_error{"failed to open input file: " + filename}; + if (!in) + throw dnmt_error{"failed to open input file: " + filename}; kstring_t line{0, 0, nullptr}; @@ -109,22 +118,40 @@ get_is_xcounts_file(const std::string &filename) { if (is_counts_header_line(line.s)) { found_header = true; } - else if (!std::isdigit(line.s[0])) { // check if we have a chrom line - if (!found_header) + // check if we have a chrom line + else if (!std::isdigit(line.s[0])) { // NOLINT(*-pointer-arithmetic) + if (!found_header) { + ks_free(&line); return false; + } found_chrom = true; } else { - if (!found_chrom) return false; - int64_t pos_step = 0, n_meth = 0, n_unmeth = 0; + if (!found_chrom) { + ks_free(&line); + return false; + } + std::int64_t pos_step = 0, n_meth = 0, n_unmeth = 0; + // NOLINTBEGIN(*-pointer-arithmetic) const auto end_line = line.s + line.l; auto res = std::from_chars(line.s, end_line, pos_step); - if (res.ec != std::errc()) return false; + if (res.ec != std::errc()) { + ks_free(&line); + return false; + } res = std::from_chars(res.ptr + 1, end_line, n_meth); - if (res.ec != std::errc()) return false; + if (res.ec != std::errc()) { + ks_free(&line); + return false; + } res = std::from_chars(res.ptr + 1, end_line, n_unmeth); - if (res.ec != std::errc()) return false; + if (res.ec != std::errc()) { + ks_free(&line); + return false; + } + // NOLINTEND(*-pointer-arithmetic) } } + ks_free(&line); return true; } From 1aed78b7823030ac987e5bf641e3585c5d494b69 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 053/106] src/common/xcounts_utils.hpp: changes to add static analysis --- src/common/xcounts_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/xcounts_utils.hpp b/src/common/xcounts_utils.hpp index 91f0ae61..e2bf5837 100644 --- a/src/common/xcounts_utils.hpp +++ b/src/common/xcounts_utils.hpp @@ -47,7 +47,7 @@ operator<<(std::ostream &o, const xcounts_entry &e) { } std::unordered_map> -read_xcounts_by_chrom(const std::uint32_t n_threads, +read_xcounts_by_chrom(const std::int32_t n_threads, const std::string &xcounts_file); bool From 9070676eeaea8b43a6fb52172d74c6a2ed93b38f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 054/106] src/radmeth/dmr.cpp: changes to add static analysis --- src/radmeth/dmr.cpp | 228 +++++++++++++++++++++----------------------- 1 file changed, 111 insertions(+), 117 deletions(-) diff --git a/src/radmeth/dmr.cpp b/src/radmeth/dmr.cpp index 00403e14..779a0bb1 100644 --- a/src/radmeth/dmr.cpp +++ 
b/src/radmeth/dmr.cpp @@ -17,48 +17,51 @@ * General Public License for more details. */ -#include -#include -#include -#include -#include -#include -#include - -#include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "GenomicRegion.hpp" #include "MSite.hpp" +#include "OptionParser.hpp" +#include "smithlab_utils.hpp" #include -using std::string; -using std::vector; -using std::cout; +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + using std::cerr; +using std::cout; using std::endl; -using std::pair; -using std::max; +using std::find_if; +using std::from_chars; using std::ifstream; +using std::max; +using std::pair; using std::runtime_error; -using std::from_chars; -using std::find_if; - -using bamxx::bgzf_file; - +using std::string; +using std::vector; static bool -parse_methdiff_line(const char *c, const char *c_end, - string &chrom, uint32_t &pos, char &strand, - string &context, double &diffscore, - uint32_t &n_meth_a, uint32_t &n_unmeth_a, +parse_methdiff_line(const char *c, const char *c_end, string &chrom, + uint32_t &pos, char &strand, string &context, + double &diffscore, uint32_t &n_meth_a, uint32_t &n_unmeth_a, uint32_t &n_meth_b, uint32_t &n_unmeth_b) { constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; }; constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; }; + // NOLINTBEGIN(*-pointer-arithmetic) auto field_s = c; - auto field_e = find_if(field_s + 1, c_end, is_sep); + auto field_e = std::find_if(field_s + 1, c_end, is_sep); bool failed = field_e == c_end; // chromosome name @@ -67,18 +70,18 @@ parse_methdiff_line(const char *c, const char *c_end, chrom = string{field_s, d}; } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || field_e == c_end; // position { - const auto [ptr, ec] = from_chars(field_s, field_e, pos); + const auto [ptr, ec] = std::from_chars(field_s, field_e, pos); failed = failed || ec != std::errc(); } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); // below because strand is 1 base wide failed = failed || field_e != field_s + 1 || field_e == c_end; @@ -86,8 +89,8 @@ parse_methdiff_line(const char *c, const char *c_end, strand = *field_s; failed = failed || (strand != '-' && strand != '+'); - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || field_e == c_end; // context @@ -96,8 +99,8 @@ parse_methdiff_line(const char *c, const char *c_end, context = string{field_s, d}; } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || field_e == c_end; // score for difference in methylation (contingency table p-value) @@ -106,59 +109,60 @@ parse_methdiff_line(const char *c, const char *c_end, const int ret = std::sscanf(field_s, "%lf", &diffscore); failed = failed || ret < 1; #else - const auto [ptr, ec] = from_chars(field_s, 
field_e, diffscore); + const auto [ptr, ec] = std::from_chars(field_s, field_e, diffscore); failed = failed || ec != std::errc(); #endif } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || (field_e == c_end); // counts methylated in methylome "a" { - const auto [ptr, ec] = from_chars(field_s, c_end, n_meth_a); + const auto [ptr, ec] = std::from_chars(field_s, c_end, n_meth_a); failed = failed || ec != std::errc(); } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || field_e == c_end; // counts unmethylated in methylome "a" { - const auto [ptr, ec] = from_chars(field_s, c_end, n_unmeth_a); + const auto [ptr, ec] = std::from_chars(field_s, c_end, n_unmeth_a); failed = failed || ec != std::errc(); } - field_s = find_if(field_e + 1, c_end, not_sep); - field_e = find_if(field_s + 1, c_end, is_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); + field_e = std::find_if(field_s + 1, c_end, is_sep); failed = failed || field_e == c_end; // counts methylated in methylome "b" { - const auto [ptr, ec] = from_chars(field_s, c_end, n_meth_b); + const auto [ptr, ec] = std::from_chars(field_s, c_end, n_meth_b); failed = failed || ec != std::errc(); } - field_s = find_if(field_e + 1, c_end, not_sep); + field_s = std::find_if(field_e + 1, c_end, not_sep); // counts unmethylated in methylome "a" { - const auto [ptr, ec] = from_chars(field_s, c_end, n_unmeth_b); + const auto [ptr, ec] = std::from_chars(field_s, c_end, n_unmeth_b); // final field needs to fail if we haven't reached the end failed = failed || ec != std::errc() || ptr != c_end; } + // NOLINTEND(*-pointer-arithmetic) + return !failed; } - static vector read_diffs_file(const string &diffs_file) { - - bgzf_file in(diffs_file, "r"); - if (!in) throw runtime_error("could not open file: " + diffs_file); + bamxx::bgzf_file in(diffs_file, "r"); + if (!in) + throw runtime_error("could not open file: " + diffs_file); string chrom, name; char strand{}; @@ -168,11 +172,12 @@ read_diffs_file(const string &diffs_file) { vector cpgs; string line; while (getline(in, line)) { - - if (!parse_methdiff_line(line.data(), line.data() + size(line), - chrom, pos, strand, name, diffscore, - meth_a, unmeth_a, meth_b, unmeth_b)) + // NOLINTBEGIN(*-pointer-arithmetic) + if (!parse_methdiff_line(line.data(), line.data() + size(line), chrom, pos, + strand, name, diffscore, meth_a, unmeth_a, meth_b, + unmeth_b)) throw runtime_error("bad methdiff line: " + line); + // NOLINTEND(*-pointer-arithmetic) cpgs.emplace_back(chrom, pos, strand, name, diffscore, 1); } @@ -197,8 +202,8 @@ static vector get_chrom_ends(const vector &r) { vector ends; for (size_t i = 0; i < r.size() - 1; ++i) - if (!r[i].same_chrom(r[i+1])) - ends.push_back(i+1); + if (!r[i].same_chrom(r[i + 1])) + ends.push_back(i + 1); ends.push_back(r.size()); return ends; } @@ -218,52 +223,45 @@ complement_regions(const size_t max_end, const vector &r) { static bool check_no_overlap(const vector ®ions) { for (size_t i = 1; i < regions.size(); ++i) - if (regions[i].same_chrom(regions[i-1]) && + if (regions[i].same_chrom(regions[i - 1]) && regions[i].get_start() < regions[i - 1].get_end()) return false; return true; } - static inline MSite get_left_msite(const 
GenomicRegion &r) { return {r.get_chrom(), r.get_start(), r.get_strand(), r.get_name(), 0.0, 1u}; } - static inline MSite get_right_msite(const GenomicRegion &r) { return {r.get_chrom(), r.get_end(), r.get_strand(), r.get_name(), 0.0, 1u}; } - -static vector > -separate_sites(const vector &dmrs, - const vector &sites) { +static vector> +separate_sites(const vector &dmrs, const vector &sites) { vector> sep_sites; - for (const auto &dmr: dmrs) { + for (const auto &dmr : dmrs) { const auto a = get_left_msite(dmr); const auto b = get_right_msite(dmr); - const auto a_insert = lower_bound(cbegin(sites), cend(sites), a); - const auto b_insert = lower_bound(cbegin(sites), cend(sites), b); - sep_sites.emplace_back(distance(cbegin(sites), a_insert), - distance(cbegin(sites), b_insert)); + const auto a_insert = lower_bound(std::cbegin(sites), std::cend(sites), a); + const auto b_insert = lower_bound(std::cbegin(sites), std::cend(sites), b); + sep_sites.emplace_back(distance(std::cbegin(sites), a_insert), + distance(std::cbegin(sites), b_insert)); } return sep_sites; } - static inline double pval_from_msite(const MSite &s) { - return s.meth; // abused as a p-value here + return s.meth; // abused as a p-value here } - static void get_cpg_stats(const bool LOW_CUTOFF, const double sig_cutoff, - const vector &cpgs, - const size_t start_idx, const size_t end_idx, - size_t &total_cpgs, size_t &total_sig) { + const vector &cpgs, const size_t start_idx, + const size_t end_idx, size_t &total_cpgs, size_t &total_sig) { total_cpgs = end_idx - start_idx; for (size_t i = start_idx; i < end_idx; ++i) { const auto pval = pval_from_msite(cpgs[i]); @@ -273,21 +271,21 @@ get_cpg_stats(const bool LOW_CUTOFF, const double sig_cutoff, } } - int -main_dmr(int argc, char *argv[]) { - +main_dmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static const string description = - "computes DMRs based on HMRs and probability of differences at \ - individual CpGs"; + R"( +computes DMRs based on HMRs and probability of differences at +individual CpGs"; +)"; bool VERBOSE = false; - double sig_cutoff = 0.05; + double sig_cutoff = 0.05; // NOLINT(*-avoid-magic-numbers) /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " " " "); opt_parse.set_show_defaults(); @@ -296,20 +294,20 @@ main_dmr(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() != 5) { - cerr << opt_parse.help_message() << endl; + if (leftover_args.size() != 5) { // NOLINT(*-avoid-magic-numbers) + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string diffs_file = leftover_args[0]; @@ -320,7 +318,7 @@ main_dmr(int argc, char *argv[]) { /****************** END COMMAND LINE OPTIONS *****************/ if (VERBOSE) - cerr << "[LOADING HMRS] " << hmr1_file << endl; + cerr << "[LOADING 
HMRS] " << hmr1_file << '\n'; vector regions_a; ReadBEDFile(hmr1_file, regions_a); @@ -331,7 +329,7 @@ main_dmr(int argc, char *argv[]) { throw runtime_error("regions overlap in file: " + hmr1_file); if (VERBOSE) - cerr << "[LOADING HMRS] " << hmr2_file << endl; + cerr << "[LOADING HMRS] " << hmr2_file << '\n'; vector regions_b; ReadBEDFile(hmr2_file, regions_b); @@ -342,11 +340,13 @@ main_dmr(int argc, char *argv[]) { throw runtime_error("regions overlap in file: " + hmr2_file); if (VERBOSE) - cerr << "[COMPUTING SYMMETRIC DIFFERENCE]" << endl; + cerr << "[COMPUTING SYMMETRIC DIFFERENCE]" << '\n'; size_t max_end = 0; - for (const auto &r: regions_a) max_end = max(max_end, r.get_end()); - for (const auto &r: regions_b) max_end = max(max_end, r.get_end()); + for (const auto &r : regions_a) + max_end = max(max_end, r.get_end()); // cppcheck-suppress useStlAlgorithm + for (const auto &r : regions_b) + max_end = max(max_end, r.get_end()); // cppcheck-suppress useStlAlgorithm const auto a_cmpl = complement_regions(max_end, regions_a); const auto b_cmpl = complement_regions(max_end, regions_b); @@ -357,56 +357,50 @@ main_dmr(int argc, char *argv[]) { // separate the regions by chrom and by desert if (VERBOSE) - cerr << "[READING CPG METH DIFFS]" << endl; + cerr << "[READING CPG METH DIFFS]" << '\n'; const auto cpgs = read_diffs_file(diffs_file); if (VERBOSE) - cerr << "[read " << size(cpgs) - << " sites from " + diffs_file << "]" << endl; + cerr << "[read " << size(cpgs) << " sites from " + diffs_file << "]" + << '\n'; if (!check_sorted(cpgs)) throw runtime_error("CpGs not sorted in: " + diffs_file); if (VERBOSE) - cerr << "[TOTAL CPGS]: " << cpgs.size() << endl; + cerr << "[TOTAL CPGS]: " << cpgs.size() << '\n'; auto sep_sites = separate_sites(dmrs_a, cpgs); for (size_t i = 0; i < dmrs_a.size(); ++i) { size_t total_cpgs = 0, total_sig = 0; - get_cpg_stats(true, sig_cutoff, - cpgs, sep_sites[i].first, sep_sites[i].second, - total_cpgs, total_sig); + get_cpg_stats(true, sig_cutoff, cpgs, sep_sites[i].first, + sep_sites[i].second, total_cpgs, total_sig); dmrs_a[i].set_name(dmrs_a[i].get_name() + ":" + toa(total_cpgs)); - dmrs_a[i].set_score(total_sig); + dmrs_a[i].set_score(static_cast(total_sig)); } sep_sites = separate_sites(dmrs_b, cpgs); for (size_t i = 0; i < dmrs_b.size(); ++i) { size_t total_cpgs = 0, total_sig = 0; - get_cpg_stats(false, sig_cutoff, - cpgs, sep_sites[i].first, sep_sites[i].second, - total_cpgs, total_sig); + get_cpg_stats(false, sig_cutoff, cpgs, sep_sites[i].first, + sep_sites[i].second, total_cpgs, total_sig); dmrs_b[i].set_name(dmrs_b[i].get_name() + ":" + toa(total_cpgs)); - dmrs_b[i].set_score(total_sig); + dmrs_b[i].set_score(static_cast(total_sig)); } std::ofstream out_a(outfile_a); - copy(cbegin(dmrs_a), cend(dmrs_a), - std::ostream_iterator(out_a, "\n")); + std::copy(std::cbegin(dmrs_a), std::cend(dmrs_a), + std::ostream_iterator(out_a, "\n")); std::ofstream out_b(outfile_b); - copy(cbegin(dmrs_b), cend(dmrs_b), - std::ostream_iterator(out_b, "\n")); + std::copy(std::cbegin(dmrs_b), std::cend(dmrs_b), + std::ostream_iterator(out_b, "\n")); if (VERBOSE) - cerr << "[OUTPUT FORMAT] COL4=NAME:N_COVERED_CPGS COL5=N_SIG_CPGS" << endl; - } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; + cerr << "[OUTPUT FORMAT] COL4=NAME:N_COVERED_CPGS COL5=N_SIG_CPGS\n"; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } 
return EXIT_SUCCESS; From 97521e2fd93b421f7fd60d2997f1abdaa53f2f59 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 055/106] src/radmeth/methdiff.cpp: changes to add static analysis --- src/radmeth/methdiff.cpp | 72 +++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/src/radmeth/methdiff.cpp b/src/radmeth/methdiff.cpp index 33d4d474..8afdb640 100644 --- a/src/radmeth/methdiff.cpp +++ b/src/radmeth/methdiff.cpp @@ -17,21 +17,29 @@ * General Public License for more details. */ +#include "MSite.hpp" + +#include "OptionParser.hpp" +#include "smithlab_os.hpp" + #include + +#include +#include #include +#include +#include #include +#include +#include +#include #include +#include +#include #include #include #include - -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "MSite.hpp" - -using bamxx::bgzf_file; +#include static inline double log_sum_log(const double p, const double q) { @@ -102,10 +110,12 @@ write_methdiff_site(T &out, const MSite &a, const MSite &b, "\n"; // clang-format on static constexpr auto buf_size = 1024; - static char buffer[buf_size]; + static std::array buffer; // clang-format off - const int r = std::snprintf(buffer, buf_size, out_fmt, + const int r = std::snprintf(buffer.data(), // NOLINT(*-pro-type-vararg) + buf_size, + out_fmt, a.chrom.data(), a.pos, a.strand, @@ -118,7 +128,7 @@ write_methdiff_site(T &out, const MSite &a, const MSite &b, // clang-format on if (r < 0) throw std::runtime_error("failed to write to output buffer"); - out.write(buffer, r); + out.write(buffer.data(), r); return out; } @@ -146,8 +156,8 @@ get_chrom_id(std::unordered_map &chrom_order, static std::string bad_order(const std::unordered_map &chrom_order, - const std::string prev_chrom, const std::size_t prev_pos, - const std::string chrom, const std::size_t pos) { + const std::string &prev_chrom, const std::size_t prev_pos, + const std::string &chrom, const std::size_t pos) { std::ostringstream oss; const std::size_t chrom_id = chrom_order.find(chrom)->second; const std::size_t prev_chrom_id = chrom_order.find(prev_chrom)->second; @@ -164,8 +174,9 @@ bad_order(const std::unordered_map &chrom_order, template static void -process_sites(const bool show_progress, bgzf_file &in_a, bgzf_file &in_b, - const bool allow_uncovered, const double pseudocount, T &out) { +process_sites(const bool show_progress, bamxx::bgzf_file &in_a, + bamxx::bgzf_file &in_b, const bool allow_uncovered, + const double pseudocount, T &out) { // chromosome order in the files std::unordered_map chrom_order; std::unordered_set chroms_seen_a, chroms_seen_b; @@ -182,13 +193,12 @@ process_sites(const bool show_progress, bgzf_file &in_a, bgzf_file &in_b, bool advance_b = true; while (true) { - while (advance_a && read_site(in_a, a)) { if (prev_chrom_a.compare(a.chrom) != 0) { prev_chrom_id_a = chrom_id_a; chrom_id_a = get_chrom_id(chrom_order, chroms_seen_a, a); if (show_progress) - std::cerr << "processing " << a.chrom << std::endl; + std::cerr << "processing " << a.chrom << '\n'; prev_chrom_a = a.chrom; } if (site_precedes(chrom_id_a, a.pos, prev_chrom_id_a, prev_pos_a)) @@ -216,10 +226,12 @@ process_sites(const bool show_progress, bgzf_file &in_a, bgzf_file &in_b, if (chrom_id_a == chrom_id_b && a.pos == b.pos) { if (allow_uncovered || std::min(a.n_reads, b.n_reads) > 0) { + // NOLINTBEGIN(*-narrowing-conversions) const std::size_t meth_a = a.n_meth() + pseudocount; const 
std::size_t unmeth_a = a.n_unmeth() + pseudocount; const std::size_t meth_b = b.n_meth() + pseudocount; const std::size_t unmeth_b = b.n_unmeth() + pseudocount; + // NOLINTEND(*-narrowing-conversions) const double diffscore = test_greater_population(meth_b, unmeth_b, meth_a, unmeth_a); @@ -237,7 +249,7 @@ process_sites(const bool show_progress, bgzf_file &in_a, bgzf_file &in_b, } int -main_methdiff(int argc, char *argv[]) { +main_methdiff(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { std::string outfile; double pseudocount = 1.0; @@ -247,7 +259,7 @@ main_methdiff(int argc, char *argv[]) { bool verbose = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "compute probability that site " "has higher methylation in file A than B", " "); @@ -261,20 +273,20 @@ main_methdiff(int argc, char *argv[]) { std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - std::cerr << opt_parse.help_message() << std::endl - << opt_parse.about_message() << std::endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - std::cerr << opt_parse.about_message() << std::endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - std::cerr << opt_parse.option_missing_message() << std::endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - std::cerr << opt_parse.help_message() << std::endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const std::string cpgs_file_a = leftover_args[0]; @@ -282,14 +294,14 @@ main_methdiff(int argc, char *argv[]) { /****************** END COMMAND LINE OPTIONS *****************/ if (verbose) - std::cerr << "[opening counts file: " << cpgs_file_a << "]" << std::endl; - bgzf_file in_a(cpgs_file_a, "r"); + std::cerr << "[opening counts file: " << cpgs_file_a << "]\n"; + bamxx::bgzf_file in_a(cpgs_file_a, "r"); if (!in_a) throw std::runtime_error("cannot open file: " + cpgs_file_a); if (verbose) - std::cerr << "[opening counts file: " << cpgs_file_b << "]" << std::endl; - bgzf_file in_b(cpgs_file_b, "r"); + std::cerr << "[opening counts file: " << cpgs_file_b << "]\n"; + bamxx::bgzf_file in_b(cpgs_file_b, "r"); if (!in_b) throw std::runtime_error("cannot open file: " + cpgs_file_b); @@ -298,12 +310,12 @@ main_methdiff(int argc, char *argv[]) { process_sites(verbose, in_a, in_b, allow_uncovered, pseudocount, out); } else { - bgzf_file out(outfile, "w"); + bamxx::bgzf_file out(outfile, "w"); process_sites(verbose, in_a, in_b, allow_uncovered, pseudocount, out); } } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 39e333388fbcf32296adb90ae893fec41d741064 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 056/106] src/radmeth/radmeth-adjust.cpp: changes to add static analysis --- src/radmeth/radmeth-adjust.cpp | 277 ++++++++++++++++++--------------- 1 file changed, 150 insertions(+), 127 deletions(-) diff --git a/src/radmeth/radmeth-adjust.cpp b/src/radmeth/radmeth-adjust.cpp index 03903f83..44ede484 100644 --- a/src/radmeth/radmeth-adjust.cpp +++ b/src/radmeth/radmeth-adjust.cpp @@ -15,90 +15,111 @@ * 
GNU General Public License for more details. */ -#include +#include "OptionParser.hpp" +#include "dnmtools_gaussinv.hpp" + +#include +#include +#include +#include +#include #include +#include +#include +#include #include -#include -#include -#include #include -#include -#include -#include - -#include "dnmtools_gaussinv.hpp" - -// smithlab headers -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include +#include -using std::string; -using std::vector; -using std::istringstream; using std::cerr; using std::cout; using std::endl; -using std::istream; -using std::ostream; using std::ifstream; +using std::istream; +using std::istringstream; +using std::min; using std::ofstream; +using std::ostream; using std::runtime_error; -using std::min; +using std::string; +using std::vector; + +// NOLINTBEGIN(*-avoid-do-while,*-narrowing-conversions,cert-err34-c) /***************** COMBINE P-VALUES *****************/ struct PvalLocus { - size_t pos; - double raw_pval; - double combined_pval; - double corrected_pval; + size_t pos{}; + double raw_pval{}; + double combined_pval{}; + double corrected_pval{}; }; class BinForDistance { public: - BinForDistance(string spec_string); + explicit BinForDistance(string spec_string); - size_t which_bin(size_t value) const; + size_t + which_bin(size_t value) const; - size_t num_bins() const {return num_bins_;} - size_t invalid_bin() const {return invalid_bin_;} - size_t max_dist() const {return max_dist_;} + size_t + num_bins() const { + return num_bins_; + } + size_t + invalid_bin() const { + return invalid_bin_; + } + size_t + max_dist() const { + return max_dist_; + } private: - size_t min_dist_; - size_t max_dist_; - size_t bin_size_; - size_t num_bins_; - size_t invalid_bin_; + size_t min_dist_{}; + size_t max_dist_{}; + size_t bin_size_{}; + size_t num_bins_{}; + size_t invalid_bin_{}; }; class ProximalLoci { public: - ProximalLoci(vector &loci, size_t max_distance) - : loci_(loci), max_distance_(max_distance), next_pos_(loci.begin()) {}; - bool get(vector &neighbors); - PvalLocus cur_region() {return *(next_pos_ - 1);} + ProximalLoci(const vector &loci, size_t max_distance) : + loci_{loci}, max_distance_{max_distance}, next_pos_{std::begin(loci)} {}; + bool + get(vector &neighbors); + PvalLocus + cur_region() { + return *(next_pos_ - 1); + } private: + // NOLINTBEGIN(*-avoid-const-or-ref-data-members) const vector &loci_; size_t max_distance_; vector::const_iterator next_pos_; + // NOLINTEND(*-avoid-const-or-ref-data-members) }; class DistanceCorrelation { public: - DistanceCorrelation(BinForDistance bin_for_dist) - : bin_for_dist_(bin_for_dist) {}; - vector correlation_table(const vector &loci); + explicit DistanceCorrelation(const BinForDistance &bin_for_dist) : + bin_for_dist_{bin_for_dist} {}; + vector + correlation_table(const vector &loci); private: - double correlation(const vector &x, - const vector &y); - void bin(const vector &loci); - vector< vector > x_pvals_for_bin_; - vector< vector > y_pvals_for_bin_; + double + correlation(const vector &x, const vector &y); + void + do_bin(const vector &loci); + vector> x_pvals_for_bin_; + vector> y_pvals_for_bin_; + // NOLINTBEGIN(*-avoid-const-or-ref-data-members) const BinForDistance bin_for_dist_; + // NOLINTEND(*-avoid-const-or-ref-data-members) }; static double @@ -114,8 +135,7 @@ to_zscore(double pval) { } static double -stouffer_liptak(const vector > &corr_mat, vector &pvals) { - +stouffer_liptak(const vector> &corr_mat, vector &pvals) { double correction = 0; for 
(size_t row_ind = 0; row_ind < corr_mat.size(); ++row_ind) for (size_t col_ind = row_ind + 1; col_ind < corr_mat.size(); ++col_ind) @@ -126,19 +146,20 @@ stouffer_liptak(const vector > &corr_mat, vector &pvals) const double sum = std::accumulate(zscores.begin(), zscores.end(), 0.0); const double test_stat = - sum/sqrt(static_cast(pvals.size()) + 2.0*correction); + sum / sqrt(static_cast(pvals.size()) + 2.0 * correction); return 1.0 - dnmt_gsl_cdf_gaussian_P(test_stat, 1.0); } -static bool -is_number(const string& str) { - for (const char &c : str) - if (c != '.' && c != 'e' && c != '-' && !std::isdigit(c)) return false; - return true; +[[nodiscard]] static inline bool +is_number(const std::string &s) { + return std::all_of(std::cbegin(s), std::cend(s), [](const char c) { + return c == '.' || c == 'e' || c == '-' || std::isdigit(c); + }); } -template inline bool +template +inline bool isnan(const T x) { return !(x == x); } @@ -152,11 +173,11 @@ static void update_pval_loci(std::istream &input_encoding, const vector &pval_loci, std::ostream &output_encoding) { - string record, chrom, name, sign; - size_t position, coverage_factor, meth_factor, coverage_rest, meth_rest; + size_t position{}, coverage_factor{}, meth_factor{}, coverage_rest{}, + meth_rest{}; string pval_str; - double pval; + double pval{}; vector::const_iterator cur_locus_iter = pval_loci.begin(); @@ -166,29 +187,34 @@ update_pval_loci(std::istream &input_encoding, try { std::istringstream iss(record); iss.exceptions(std::ios::failbit); - iss >> chrom >> position >> sign >> name >> pval_str - >> coverage_factor >> meth_factor >> coverage_rest >> meth_rest; - - pval = (is_number(pval_str) ? atof(pval_str.c_str()) : 1.0); + iss >> chrom >> position >> sign >> name >> pval_str >> coverage_factor >> + meth_factor >> coverage_rest >> meth_rest; + if (is_number(pval_str)) { + std::istringstream pval_converter(pval_str); + pval_converter >> pval; + } + else { + pval = 1.0; + } } - catch (std::exception const & err) { - cerr << err.what() << endl << "could not parse line:\n" - << record << endl; + catch (std::exception const &err) { + cerr << err.what() << '\n' << "could not parse line:\n" << record << '\n'; std::terminate(); } - output_encoding << chrom << "\t" << position << "\t" << sign << "\t" - << name << "\t" << pval << "\t"; + output_encoding << chrom << "\t" << position << "\t" << sign << "\t" << name + << "\t" << pval << "\t"; if (0.0 <= pval && pval < 1.0) { output_encoding << fix_pval_nan(cur_locus_iter->combined_pval) << "\t" << fix_pval_nan(cur_locus_iter->corrected_pval) << "\t"; ++cur_locus_iter; } - else output_encoding << 1.0 << "\t" << 1.0 << "\t"; // MAGIC?? + else + output_encoding << 1.0 << "\t" << 1.0 << "\t"; // MAGIC?? output_encoding << coverage_factor << "\t" << meth_factor << "\t" - << coverage_rest << "\t" << meth_rest << endl; + << coverage_rest << "\t" << meth_rest << '\n'; } } @@ -198,8 +224,10 @@ BinForDistance::BinForDistance(std::string spec_string) { std::istringstream iss(spec_string); iss >> min_dist_ >> max_dist_ >> bin_size_; + // NOLINTBEGIN(*-prefer-member-initializer) num_bins_ = (max_dist_ - min_dist_) / bin_size_; invalid_bin_ = num_bins_ + 1; + // NOLINTEND(*-prefer-member-initializer) } size_t @@ -207,9 +235,9 @@ BinForDistance::which_bin(size_t value) const { if (value < min_dist_) return invalid_bin_; - const size_t bin = (value - min_dist_)/bin_size_; + const size_t bin = (value - min_dist_) / bin_size_; - //Bin numbering is 0 based. + // Bin numbering is 0 based. 
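  // Editor's note (illustration, not part of the original patch): a worked
  // example of this mapping, using hypothetical values min_dist_ = 1,
  // max_dist_ = 200 and bin_size_ = 25, so num_bins_ = (200 - 1) / 25 = 7 and
  // invalid_bin_ = 8:
  //
  //   which_bin(0)   -> 8 (invalid: below min_dist_)
  //   which_bin(1)   -> (1 - 1) / 25   = 0
  //   which_bin(60)  -> (60 - 1) / 25  = 2
  //   which_bin(175) -> (175 - 1) / 25 = 6
  //   which_bin(176) -> bin 7, which is >= num_bins_, so 8 (invalid)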
if (bin >= num_bins_) return invalid_bin_; @@ -218,7 +246,6 @@ BinForDistance::which_bin(size_t value) const { bool ProximalLoci::get(vector &neighbors) { - if (next_pos_ == loci_.end()) return false; @@ -226,7 +253,7 @@ ProximalLoci::get(vector &neighbors) { neighbors.clear(); neighbors.push_back(*cur_pos); - if ( cur_pos != loci_.begin() ) { + if (cur_pos != loci_.begin()) { vector::const_iterator up_pos = cur_pos; bool too_far = false; @@ -234,15 +261,15 @@ ProximalLoci::get(vector &neighbors) { --up_pos; size_t up_dist = cur_pos->pos - (up_pos->pos + 1); - if(up_dist <= max_distance_) { + if (up_dist <= max_distance_) { neighbors.push_back(*up_pos); - } else + } + else too_far = true; - } while (!too_far && up_pos != loci_.begin()); } - std::reverse(neighbors.begin(), neighbors.end()); + std::reverse(std::begin(neighbors), std::end(neighbors)); if (cur_pos != loci_.end() - 1) { bool too_far = false; @@ -255,8 +282,8 @@ ProximalLoci::get(vector &neighbors) { if (down_dist <= max_distance_) { neighbors.push_back(*down_pos); } - else too_far = true; - + else + too_far = true; } while (!too_far && down_pos != loci_.end() - 1); } @@ -265,7 +292,7 @@ ProximalLoci::get(vector &neighbors) { } void -DistanceCorrelation::bin(const vector &loci) { +DistanceCorrelation::do_bin(const vector &loci) { x_pvals_for_bin_.clear(); y_pvals_for_bin_.clear(); vector::const_iterator it = loci.begin(); @@ -278,7 +305,7 @@ DistanceCorrelation::bin(const vector &loci) { const size_t dist = forward_it->pos - (it->pos + 1); const size_t bin = bin_for_dist_.which_bin(dist); - //check if the appropriate bin exists + // check if the appropriate bin exists if (bin != bin_for_dist_.invalid_bin()) { x_pvals_for_bin_[bin].push_back(to_zscore(it->raw_pval)); y_pvals_for_bin_[bin].push_back(to_zscore(forward_it->raw_pval)); @@ -298,7 +325,8 @@ double DistanceCorrelation::correlation(const vector &x, const vector &y) { // correlation is 0 when all bins are empty - if (x.size() <= 1) return 0.0; + if (x.size() <= 1) + return 0.0; using std::inner_product; const auto X = accumulate(begin(x), end(x), 0.0); @@ -309,43 +337,41 @@ DistanceCorrelation::correlation(const vector &x, const auto N = x.size(); // Sum XY - N.mu(X).mu(Y) = Sum XY - Sum(X)Sum(Y)/N - const auto covXY = XY - (X*Y)/N; + const auto covXY = XY - (X * Y) / N; // sqrt[SSX - N.mu(X).mu(X)] - const auto sdX = std::sqrt(XX - (X*X)/N); + const auto sdX = std::sqrt(XX - (X * X) / N); // sqrt[SSY - N.mu(Y).mu(Y)] - const auto sdY = std::sqrt(YY - (Y*Y)/N); + const auto sdY = std::sqrt(YY - (Y * Y) / N); // Pearson correlation - return covXY/(sdX*sdY); + return covXY / (sdX * sdY); } - vector DistanceCorrelation::correlation_table(const vector &loci) { const size_t num_bins = bin_for_dist_.num_bins(); x_pvals_for_bin_.resize(num_bins); y_pvals_for_bin_.resize(num_bins); - bin(loci); - vector correlation_table; - + do_bin(loci); + vector correlation_table(num_bins); for (size_t bin = 0; bin < num_bins; ++bin) - correlation_table.push_back( - correlation(x_pvals_for_bin_[bin], y_pvals_for_bin_[bin])); + correlation_table[bin] = + correlation(x_pvals_for_bin_[bin], y_pvals_for_bin_[bin]); return correlation_table; } void -distance_corr_matrix(BinForDistance bin_for_dist, +distance_corr_matrix(const BinForDistance &bin_for_dist, const std::vector &acor_for_bin, const std::vector &neighbors, - std::vector< std::vector > &corr_matrix) { + std::vector> &corr_matrix) { corr_matrix.clear(); const size_t num_neighbors = neighbors.size(); corr_matrix.resize(num_neighbors); - for 
(std::vector >::iterator row = corr_matrix.begin(); + for (std::vector>::iterator row = corr_matrix.begin(); row != corr_matrix.end(); ++row) row->resize(num_neighbors); @@ -361,7 +387,8 @@ distance_corr_matrix(BinForDistance bin_for_dist, if (bin == bin_for_dist.invalid_bin()) { corr_matrix[row][col] = 0; - } else + } + else corr_matrix[row][col] = corr_matrix[col][row] = acor_for_bin[bin]; } } @@ -378,7 +405,7 @@ combine_pvals(vector &loci, const BinForDistance &bin_for_distance) { size_t i = 0; while (proximal_loci.get(neighbors)) { - vector > correlation_matrix; + vector> correlation_matrix; vector p_vals; for (auto it = begin(neighbors); it != end(neighbors); ++it) { @@ -386,8 +413,8 @@ combine_pvals(vector &loci, const BinForDistance &bin_for_distance) { p_vals.push_back(pval); } - distance_corr_matrix(bin_for_distance, correlation_for_bin, - neighbors, correlation_matrix); + distance_corr_matrix(bin_for_distance, correlation_for_bin, neighbors, + correlation_matrix); double combined_pval = stouffer_liptak(correlation_matrix, p_vals); loci[i].combined_pval = combined_pval; @@ -407,19 +434,16 @@ ls_locus_position(const PvalLocus &r1, const PvalLocus &r2) { void fdr(vector &loci) { - sort(begin(loci), end(loci), lt_locus_pval); for (size_t idx = 0; idx < loci.size(); ++idx) { const double current_score = loci[idx].combined_pval; - //Assign a new one. - const double corrected_pval = loci.size()*current_score/(idx + 1); + // Assign a new one. + const double corrected_pval = loci.size() * current_score / (idx + 1); loci[idx].corrected_pval = corrected_pval; } - for (vector::reverse_iterator it = loci.rbegin() + 1; - it != loci.rend(); ++it) { - + for (auto it = loci.rbegin() + 1; it != loci.rend(); ++it) { const PvalLocus &prev_locus = *(it - 1); PvalLocus &cur_locus = *(it); @@ -434,41 +458,38 @@ fdr(vector &loci) { sort(begin(loci), end(loci), ls_locus_position); } - int -main_radmeth_adjust(int argc, char *argv[]) { - +main_radmeth_adjust(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - string outfile; string bin_spec = "1:200:1"; bool VERBOSE = false; /**************** GET COMMAND LINE ARGUMENTS *************************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "compute adjusted p-values using autocorrelation", ""); opt_parse.set_show_defaults(); - opt_parse.add_opt("out", 'o', "output file (default: stdout)", - false, outfile); - opt_parse.add_opt("bins", 'b', "corrlation bin specs", false , bin_spec); + opt_parse.add_opt("out", 'o', "output file (default: stdout)", false, + outfile); + opt_parse.add_opt("bins", 'b', "corrlation bin specs", false, bin_spec); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string bed_filename = leftover_args.front(); @@ -481,7 +502,7 @@ main_radmeth_adjust(int argc, 
char *argv[]) { throw "could not open file: " + bed_filename; if (VERBOSE) - cerr << "[reading input]" << endl; + cerr << "[reading input]\n"; // Read in all p-value loci. The loci that are not correspond to valid // p-values (i.e. values in [0, 1]) are skipped. @@ -491,11 +512,10 @@ main_radmeth_adjust(int argc, char *argv[]) { size_t chrom_offset = 0; while (getline(bed_file, input_line)) { - istringstream iss(input_line); string chrom, sign, name; - size_t position; - double pval; + size_t position{}; + double pval{}; string pval_str; if (!(iss >> chrom >> position >> sign >> name >> pval_str)) throw runtime_error("failed to parse line: " + input_line); @@ -507,7 +527,7 @@ main_radmeth_adjust(int argc, char *argv[]) { if (!prev_chrom.empty() && prev_chrom != chrom) chrom_offset += pvals.back().pos; - PvalLocus plocus; + PvalLocus plocus{}; plocus.raw_pval = pval; plocus.pos = chrom_offset + bin_for_dist.max_dist() + 1 + position; @@ -517,26 +537,29 @@ main_radmeth_adjust(int argc, char *argv[]) { } if (VERBOSE) - cerr << "[combining p-values]" << endl; + cerr << "[combining p-values]\n"; combine_pvals(pvals, bin_for_dist); if (VERBOSE) - cerr << "[running multiple test adjustment]" << endl; + cerr << "[running multiple test adjustment]\n"; fdr(pvals); ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); ifstream original_bed_file(bed_filename); update_pval_loci(original_bed_file, pvals, out); - //TODO: Check that the regions do not overlap & sorted + // TODO(ADS): Check that the regions do not overlap & sorted } catch (const std::exception &e) { - cerr << "ERROR: " << e.what() << endl; + cerr << "ERROR: " << e.what() << '\n'; exit(EXIT_FAILURE); } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-do-while,*-narrowing-conversions,cert-err34-c) From 8b281c794f7064002fbb81a53e71c20e8bcc84e1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 057/106] src/radmeth/radmeth-merge.cpp: changes to add static analysis --- src/radmeth/radmeth-merge.cpp | 118 ++++++++++++++++------------------ 1 file changed, 55 insertions(+), 63 deletions(-) diff --git a/src/radmeth/radmeth-merge.cpp b/src/radmeth/radmeth-merge.cpp index 538e7a5d..cc519072 100644 --- a/src/radmeth/radmeth-merge.cpp +++ b/src/radmeth/radmeth-merge.cpp @@ -15,56 +15,55 @@ * General Public License for more details. */ -#include +#include "GenomicRegion.hpp" +#include "OptionParser.hpp" + +#include +#include #include +#include #include -#include -#include -#include #include -#include -#include - -// smithlab headers -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" -#include "GenomicRegion.hpp" +#include +#include -using std::string; -using std::vector; using std::cerr; using std::cout; using std::endl; -using std::istringstream; -using std::istream; -using std::ostream; using std::ifstream; +using std::istream; +using std::istringstream; using std::ofstream; +using std::ostream; using std::runtime_error; +using std::string; +using std::vector; + +// NOLINTBEGIN(*-narrowing-conversions) // Attemps to find the next significant CpG site. Returns true if one was found // and flase otherwise. 
static bool read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, double cutoff, bool &skipped_any, bool &n_sig_sites, - size_t &test_cov, size_t &test_meth, - size_t &rest_cov, size_t &rest_meth) { + size_t &test_cov, size_t &test_meth, size_t &rest_cov, + size_t &rest_meth) { GenomicRegion region; skipped_any = false; n_sig_sites = false; string cpg_encoding; while (getline(cpg_stream, cpg_encoding)) { - string record, chrom, name, sign; - size_t position; - double raw_pval, adjusted_pval, corrected_pval; + string chrom, name, sign; + size_t position{}; + double raw_pval{}; + double adjusted_pval{}; + double corrected_pval{}; istringstream iss(cpg_encoding); iss.exceptions(std::ios::failbit); - iss >> chrom >> position >> sign >> name >> raw_pval - >> adjusted_pval >> corrected_pval - >> test_cov >> test_meth >> rest_cov >> rest_meth; + iss >> chrom >> position >> sign >> name >> raw_pval >> adjusted_pval >> + corrected_pval >> test_cov >> test_meth >> rest_cov >> rest_meth; if (corrected_pval >= 0.0 && corrected_pval < cutoff) { cpg.set_chrom(chrom); @@ -81,7 +80,6 @@ read_next_significant_cpg(istream &cpg_stream, GenomicRegion &cpg, static void merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { - GenomicRegion dmr; dmr.set_name("dmr"); @@ -96,10 +94,11 @@ merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { size_t rest_meth = 0; // Find the first significant CpG, or terminate the function if none exist. - bool skipped_last_cpg, n_sig_sites; + bool skipped_last_cpg{}; + bool n_sig_sites{}; if (!read_next_significant_cpg(cpg_stream, dmr, cutoff, skipped_last_cpg, - n_sig_sites, test_cov, test_meth, - rest_cov, rest_meth)) + n_sig_sites, test_cov, test_meth, rest_cov, + rest_meth)) return; dmr.set_score(n_sig_sites); @@ -112,18 +111,16 @@ merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { cpg.set_name("dmr"); while (read_next_significant_cpg(cpg_stream, cpg, cutoff, skipped_last_cpg, - n_sig_sites, test_cov, test_meth, - rest_cov, rest_meth)) { - + n_sig_sites, test_cov, test_meth, rest_cov, + rest_meth)) { if (skipped_last_cpg || cpg.get_chrom() != dmr.get_chrom()) { if (dmr.get_score() != 0) - dmr_stream << dmr.get_chrom() << '\t' - << dmr.get_start() << '\t' - << dmr.get_end() << '\t' - << dmr.get_name() << '\t' + dmr_stream << dmr.get_chrom() << '\t' << dmr.get_start() << '\t' + << dmr.get_end() << '\t' << dmr.get_name() << '\t' << dmr.get_score() << '\t' - << double(dmr_test_meth)/dmr_test_cov - - double(dmr_rest_meth)/dmr_rest_cov << endl; + << static_cast(dmr_test_meth) / dmr_test_cov - + static_cast(dmr_rest_meth) / dmr_rest_cov + << '\n'; dmr = cpg; dmr.set_score(n_sig_sites); dmr_test_cov = test_cov; @@ -141,59 +138,53 @@ merge(istream &cpg_stream, ostream &dmr_stream, double cutoff) { } } if (dmr.get_score() != 0) { - const double diff = - static_cast(dmr_test_meth)/dmr_test_cov - - static_cast(dmr_rest_meth)/dmr_rest_cov; - dmr_stream << dmr.get_chrom() << '\t' - << dmr.get_start() << '\t' - << dmr.get_end() << '\t' - << dmr.get_name() << '\t' - << dmr.get_score() << '\t' - << diff << endl; + const double diff = static_cast(dmr_test_meth) / dmr_test_cov - + static_cast(dmr_rest_meth) / dmr_rest_cov; + dmr_stream << dmr.get_chrom() << '\t' << dmr.get_start() << '\t' + << dmr.get_end() << '\t' << dmr.get_name() << '\t' + << dmr.get_score() << '\t' << diff << '\n'; } } int -main_radmeth_merge(int argc, char *argv[]) { - +main_radmeth_merge(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - /* FILES */ string 
outfile; - double cutoff = 0.01; + double cutoff = 0.01; // NOLINT(*-avoid-magic-numbers) /**************** GET COMMAND LINE ARGUMENTS *************************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "merge significantly differentially" " methylated CpGs into DMRs", ""); opt_parse.set_show_defaults(); - opt_parse.add_opt("output", 'o', - "output file (default: stdout)", false, outfile); - opt_parse.add_opt("cutoff", 'p', "p-value cutoff", false , cutoff); + opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, + outfile); + opt_parse.add_opt("cutoff", 'p', "p-value cutoff", false, cutoff); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string bed_filename = leftover_args.front(); /************************************************************************/ ofstream of; - if (!outfile.empty()) of.open(outfile); + if (!outfile.empty()) + of.open(outfile); ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); ifstream in(bed_filename); @@ -201,11 +192,12 @@ main_radmeth_merge(int argc, char *argv[]) { throw runtime_error("could not open file: " + bed_filename); merge(in, out, cutoff); - } catch (const std::exception &e) { - cerr << "ERROR: " << e.what() << endl; + cerr << "ERROR: " << e.what() << '\n'; exit(EXIT_FAILURE); } return EXIT_SUCCESS; } + +// NOLINTEND(*-narrowing-conversions) From 8874cb4897e4e27f6121309391563ac5230573a9 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 058/106] src/radmeth/radmeth.cpp: changes to add static analysis --- src/radmeth/radmeth.cpp | 216 ++++++++++++++++++++-------------------- 1 file changed, 109 insertions(+), 107 deletions(-) diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 68eab100..4f5e3a0a 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -20,22 +20,21 @@ #include "radmeth_optimize_series.hpp" #include "radmeth_utils.hpp" -// smithlab headers -#include "GenomicRegion.hpp" #include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" -#include #include -#include +#include +#include +#include +#include #include -#include #include -#include -#include +#include +#include #include #include +#include +#include #include template @@ -137,109 +136,112 @@ that the design matrix and the proportion table are correctly formatted. 
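  // Editor's note (illustration, not part of the original patch): the worker
  // threads created just below take rows in an interleaved, round-robin
  // fashion via the "b % n_threads != thread_id" test, rather than in
  // contiguous blocks. With n_threads = 4 (a hypothetical value), thread 0
  // handles rows 0, 4, 8, ..., thread 1 handles rows 1, 5, 9, ..., and so on,
  // so runs of expensive consecutive rows (e.g. high-coverage sites) are
  // spread across threads. The same partitioning could equivalently be
  // written as:
  //
  //   for (std::size_t b = thread_id; b < n_lines; b += n_threads)
  //     /* process row b */;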
std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { - threads.emplace_back([&, thread_id] { - std::vector p_estim_alt; - std::vector p_estim_null; - double phi_estim_alt{}; - double phi_estim_null{}; - - auto &t_alt_model = alt_models[thread_id]; - auto &t_null_model = null_models[thread_id]; - for (auto b = 0u; b < n_lines; ++b) { - // ADS: rows done by different threads are interleaved because the - // difficult (e.g., high-coverage) rows can be consecutive and this - // balances work better. - if (b % n_threads != thread_id) - continue; - t_alt_model.parse(lines[b]); - if (t_alt_model.props_size() != n_samples) - throw std::runtime_error("found row with wrong number of columns"); - - const auto p_val_status = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all - // controls, or (2) the site is completely methylated or - // completely unmethylated across all samples. - if (has_low_coverage(t_alt_model, test_factor_idx)) - return std::tuple{1.0, row_status::na_low_cov}; - - if (has_extreme_counts(t_alt_model)) - return std::tuple{1.0, row_status::na_extreme_cnt}; - - fit_regression_model(t_alt_model, p_estim_alt, phi_estim_alt); - - t_null_model.mc = t_alt_model.mc; - t_null_model.rowname = t_alt_model.rowname; - - fit_regression_model(t_null_model, p_estim_null, phi_estim_null); - - const double p_value = - llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); - - return (p_value != p_value) ? std::tuple{1.0, row_status::na} - : std::tuple{p_value, row_status::ok}; - }(); - // ADS: avoid capture structured binding in C++17 - const auto p_val = std::get<0>(p_val_status); - const auto status = std::get<1>(p_val_status); - - n_bytes[b] = [&] { - auto bufsize = std::size(bufs[b]); - // clang-format off + threads.emplace_back( // NOLINT(performance-inefficient-vector-operation) + [&, thread_id] { + std::vector p_estim_alt; + std::vector p_estim_null; + double phi_estim_alt{}; + double phi_estim_null{}; + + auto &t_alt_model = alt_models[thread_id]; + auto &t_null_model = null_models[thread_id]; + for (auto b = 0u; b < n_lines; ++b) { + // ADS: rows done by different threads are interleaved because the + // difficult (e.g., high-coverage) rows can be consecutive and this + // balances work better. + if (b % n_threads != thread_id) + continue; + t_alt_model.parse(lines[b]); + if (t_alt_model.props_size() != n_samples) + throw std::runtime_error( + "found row with wrong number of columns"); + + const auto p_val_status = [&]() -> std::tuple { + // Skip the test if (1) no coverage in all cases or in all + // controls, or (2) the site is completely methylated or + // completely unmethylated across all samples. + if (has_low_coverage(t_alt_model, test_factor_idx)) + return std::tuple{1.0, row_status::na_low_cov}; + + if (has_extreme_counts(t_alt_model)) + return std::tuple{1.0, row_status::na_extreme_cnt}; + + fit_regression_model(t_alt_model, p_estim_alt, phi_estim_alt); + + t_null_model.mc = t_alt_model.mc; + t_null_model.rowname = t_alt_model.rowname; + + fit_regression_model(t_null_model, p_estim_null, phi_estim_null); + + const double p_value = + llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); + + return (p_value != p_value) ? 
std::tuple{1.0, row_status::na} + : std::tuple{p_value, row_status::ok}; + }(); + // ADS: avoid capture structured binding in C++17 + const auto p_val = std::get<0>(p_val_status); + const auto status = std::get<1>(p_val_status); + + n_bytes[b] = [&] { + auto bufsize = std::size(bufs[b]); + // clang-format off const int n_prefix_bytes = std::snprintf(bufs[b].data(), bufsize, "%s\t", t_alt_model.rowname.data()); - // clang-format on - if (n_prefix_bytes < 0) - return n_prefix_bytes; - - bufsize -= n_prefix_bytes; - auto cursor = bufs[b].data() + n_prefix_bytes; - - const int n_pval_bytes = [&] { - if (status == row_status::ok) - return std::snprintf(cursor, bufsize, "%.6g", p_val); - if (!more_na_info || status == row_status::na) - return std::snprintf(cursor, bufsize, "NA"); - if (status == row_status::na_extreme_cnt) - return std::snprintf(cursor, bufsize, "NA_EXTREME_CNT"); - // if (status == row_status::na_low_cov) - return std::snprintf(cursor, bufsize, "NA_LOW_COV"); - }(); - - if (n_pval_bytes < 0) - return n_pval_bytes; - - bufsize -= n_pval_bytes; - cursor += n_pval_bytes; - - const int n_param_bytes = [&] { - std::int32_t n_param_bytes = 0; - if (p_estim_alt.empty()) - p_estim_alt.resize(n_groups, 0.0); - for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { - const int n = - std::snprintf(cursor, bufsize, "\t%f", p_estim_alt[g_idx]); - bufsize -= n; - cursor += n; + // clang-format on + if (n_prefix_bytes < 0) + return n_prefix_bytes; + + bufsize -= n_prefix_bytes; + // NOLINTNEXTLINE(*-pointer-arithmetic) + auto cursor = bufs[b].data() + n_prefix_bytes; + + const int n_pval_bytes = [&] { + if (status == row_status::ok) + return std::snprintf(cursor, bufsize, "%.6g", p_val); + if (!more_na_info || status == row_status::na) + return std::snprintf(cursor, bufsize, "NA"); + if (status == row_status::na_extreme_cnt) + return std::snprintf(cursor, bufsize, "NA_EXTREME_CNT"); + // if (status == row_status::na_low_cov) + return std::snprintf(cursor, bufsize, "NA_LOW_COV"); + }(); + + if (n_pval_bytes < 0) + return n_pval_bytes; + + bufsize -= n_pval_bytes; + cursor += n_pval_bytes; // NOLINT(*-pointer-arithmetic) + + const int n_param_bytes = [&] { + std::int32_t n_param_bytes = 0; + if (p_estim_alt.empty()) + p_estim_alt.resize(n_groups, 0.0); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { + const int n = + std::snprintf(cursor, bufsize, "\t%f", p_estim_alt[g_idx]); + bufsize -= n; + cursor += n; // NOLINT(*-pointer-arithmetic) + if (n < 0) + return n; + n_param_bytes += n; + } + const auto od = overdispersion_factor(n_samples, phi_estim_alt); + const int n = std::snprintf(cursor, bufsize, "\t%f\n", od); if (n < 0) return n; n_param_bytes += n; - } - const auto od = overdispersion_factor(n_samples, phi_estim_alt); - const int n = std::snprintf(cursor, bufsize, "\t%f\n", od); - if (n < 0) - return n; - n_param_bytes += n; - return n_param_bytes; - }(); + return n_param_bytes; + }(); - if (n_param_bytes < 0) - return n_param_bytes; + if (n_param_bytes < 0) + return n_param_bytes; - return n_prefix_bytes + n_pval_bytes + n_param_bytes; - }(); - } - }); + return n_prefix_bytes + n_pval_bytes + n_param_bytes; + }(); + } + }); } for (auto &thread : threads) @@ -263,7 +265,7 @@ that the design matrix and the proportion table are correctly formatted. 
} int -main_radmeth(int argc, char *argv[]) { +main_radmeth(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static const std::string description = "calculate differential methylation scores"; @@ -276,8 +278,8 @@ main_radmeth(int argc, char *argv[]) { std::uint32_t n_threads{1}; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " "); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " "); opt_parse.set_show_defaults(); opt_parse.add_opt("out", 'o', "output file", true, outfile); opt_parse.add_opt("threads", 't', "number of threads to use", false, From e4ad44a10e560c2f33600dbd5ed05900a0f9659f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 059/106] src/radmeth/radmeth_design.cpp: changes to add static analysis --- src/radmeth/radmeth_design.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/radmeth/radmeth_design.cpp b/src/radmeth/radmeth_design.cpp index f08dd9ea..3c849727 100644 --- a/src/radmeth/radmeth_design.cpp +++ b/src/radmeth/radmeth_design.cpp @@ -16,14 +16,14 @@ #include "radmeth_design.hpp" #include -#include #include #include #include -#include #include +#include #include #include +#include #include static void @@ -101,7 +101,7 @@ operator>>(std::istream &is, Design &design) { } [[nodiscard]] Design -Design::drop_factor(const std::size_t factor_idx) { +Design::drop_factor(const std::uint32_t factor_idx) { // clang-format off Design design{ factor_names, @@ -255,8 +255,10 @@ Design::get_test_factor_idx(const std::string &test_factor) const { [[nodiscard]] bool Design::has_two_values(const std::size_t test_factor) const { const auto &tcol = tmatrix[test_factor]; - for (const auto x : tcol) - if (x != tcol[0]) - return true; - return false; + return std::any_of(std::cbegin(tcol), std::cend(tcol), + [&](const auto x) { return x != tcol[0]; }); + // for (const auto x : tcol) + // if (x != tcol[0]) + // return true; + // return false; } From 20543c823b0bf43cf5c9636a93dedb1eb4a6e064 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 060/106] src/radmeth/radmeth_design.hpp: changes to add static analysis --- src/radmeth/radmeth_design.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/radmeth/radmeth_design.hpp b/src/radmeth/radmeth_design.hpp index 38d37cc6..0b4e6baf 100644 --- a/src/radmeth/radmeth_design.hpp +++ b/src/radmeth/radmeth_design.hpp @@ -16,9 +16,10 @@ #ifndef RADMETH_DESIGN_HPP #define RADMETH_DESIGN_HPP +#include #include #include -#include +#include #include #include @@ -49,7 +50,7 @@ struct Design { } [[nodiscard]] Design - drop_factor(const std::size_t factor_idx); + drop_factor(const std::uint32_t factor_idx); void order_samples(const std::vector &ordered_names); From 7955dd99c6456c4df3ef5e57e17057c0d642053d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 061/106] src/radmeth/radmeth_nano.cpp: changes to add static analysis --- src/radmeth/radmeth_nano.cpp | 219 ++++++++++++++++++----------------- 1 file changed, 111 insertions(+), 108 deletions(-) diff --git a/src/radmeth/radmeth_nano.cpp b/src/radmeth/radmeth_nano.cpp index 367b8e1e..5b76f495 100644 --- a/src/radmeth/radmeth_nano.cpp +++ b/src/radmeth/radmeth_nano.cpp @@ -19,22 +19,21 @@ #include "radmeth_optimize_params.hpp" #include "radmeth_utils.hpp" -// smithlab headers 
-#include "GenomicRegion.hpp" #include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" -#include #include -#include +#include +#include +#include +#include #include -#include #include -#include -#include +#include +#include #include #include +#include +#include #include [[nodiscard]] static bool @@ -134,110 +133,114 @@ that the design matrix and the proportion table are correctly formatted. std::vector threads; for (auto thread_id = 0u; thread_id < n_threads; ++thread_id) { - threads.emplace_back([&, thread_id] { - std::vector p_estim_alt; - std::vector p_estim_null; - double phi_estim_alt{}; - double phi_estim_null{}; - - auto &t_alt_model = alt_models[thread_id]; - auto &t_null_model = null_models[thread_id]; - for (auto b = 0u; b < n_lines; ++b) { - // ADS: rows done by different threads are interleaved because the - // difficult (e.g., high-coverage) rows can be consecutive and this - // balances work better. - if (b % n_threads != thread_id) - continue; - t_alt_model.parse(lines[b]); - if (t_alt_model.props_size() != n_samples) - throw std::runtime_error("found row with wrong number of columns"); - - const auto p_val_status = [&]() -> std::tuple { - // Skip the test if (1) no coverage in all cases or in all - // controls, or (2) the site is completely methylated or - // completely unmethylated across all samples. - if (has_low_coverage(t_alt_model, test_factor_idx)) - return std::tuple{1.0, row_status::na_low_cov}; - - if (has_extreme_counts(t_alt_model)) - return std::tuple{1.0, row_status::na_extreme_cnt}; - - fit_regression_model_gamma(t_alt_model, p_estim_alt, phi_estim_alt); - - t_null_model.mc = t_alt_model.mc; - t_null_model.rowname = t_alt_model.rowname; - - fit_regression_model_gamma(t_null_model, p_estim_null, - phi_estim_null); - - const double p_value = - llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); - - return (p_value != p_value) ? std::tuple{1.0, row_status::na} - : std::tuple{p_value, row_status::ok}; - }(); - // ADS: avoid capture structured binding in C++17 - const auto p_val = std::get<0>(p_val_status); - const auto status = std::get<1>(p_val_status); - - n_bytes[b] = [&] { - auto bufsize = std::size(bufs[b]); - // clang-format off + threads.emplace_back( // NOLINT(performance-inefficient-vector-operation) + [&, thread_id] { + std::vector p_estim_alt; + std::vector p_estim_null; + double phi_estim_alt{}; + double phi_estim_null{}; + + auto &t_alt_model = alt_models[thread_id]; + auto &t_null_model = null_models[thread_id]; + for (auto b = 0u; b < n_lines; ++b) { + // ADS: rows done by different threads are interleaved because the + // difficult (e.g., high-coverage) rows can be consecutive and this + // balances work better. + if (b % n_threads != thread_id) + continue; + t_alt_model.parse(lines[b]); + if (t_alt_model.props_size() != n_samples) + throw std::runtime_error( + "found row with wrong number of columns"); + + const auto p_val_status = [&]() -> std::tuple { + // Skip the test if (1) no coverage in all cases or in all + // controls, or (2) the site is completely methylated or + // completely unmethylated across all samples. 
+ if (has_low_coverage(t_alt_model, test_factor_idx)) + return std::tuple{1.0, row_status::na_low_cov}; + + if (has_extreme_counts(t_alt_model)) + return std::tuple{1.0, row_status::na_extreme_cnt}; + + fit_regression_model_gamma(t_alt_model, p_estim_alt, + phi_estim_alt); + + t_null_model.mc = t_alt_model.mc; + t_null_model.rowname = t_alt_model.rowname; + + fit_regression_model_gamma(t_null_model, p_estim_null, + phi_estim_null); + + const double p_value = + llr_test(t_null_model.max_loglik, t_alt_model.max_loglik); + + return (p_value != p_value) ? std::tuple{1.0, row_status::na} + : std::tuple{p_value, row_status::ok}; + }(); + // ADS: avoid capture structured binding in C++17 + const auto p_val = std::get<0>(p_val_status); + const auto status = std::get<1>(p_val_status); + + n_bytes[b] = [&] { + auto bufsize = std::size(bufs[b]); + // clang-format off const int n_prefix_bytes = std::snprintf(bufs[b].data(), bufsize, "%s\t", t_alt_model.rowname.data()); - // clang-format on - if (n_prefix_bytes < 0) - return n_prefix_bytes; - - bufsize -= n_prefix_bytes; - auto cursor = bufs[b].data() + n_prefix_bytes; - - const int n_pval_bytes = [&] { - if (status == row_status::ok) - return std::snprintf(cursor, bufsize, "%.6g", p_val); - if (!more_na_info || status == row_status::na) - return std::snprintf(cursor, bufsize, "NA"); - if (status == row_status::na_extreme_cnt) - return std::snprintf(cursor, bufsize, "NA_EXTREME_CNT"); - // if (status == row_status::na_low_cov) - return std::snprintf(cursor, bufsize, "NA_LOW_COV"); - }(); - - if (n_pval_bytes < 0) - return n_pval_bytes; - - bufsize -= n_pval_bytes; - cursor += n_pval_bytes; - - const int n_param_bytes = [&] { - std::int32_t n_param_bytes = 0; - if (p_estim_alt.empty()) - p_estim_alt.resize(n_groups, 0.0); - for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { - const int n = - std::snprintf(cursor, bufsize, "\t%f", p_estim_alt[g_idx]); - bufsize -= n; - cursor += n; + // clang-format on + if (n_prefix_bytes < 0) + return n_prefix_bytes; + + bufsize -= n_prefix_bytes; + // NOLINTNEXTLINE(*-pointer-arithmetic) + auto cursor = bufs[b].data() + n_prefix_bytes; + + const int n_pval_bytes = [&] { + if (status == row_status::ok) + return std::snprintf(cursor, bufsize, "%.6g", p_val); + if (!more_na_info || status == row_status::na) + return std::snprintf(cursor, bufsize, "NA"); + if (status == row_status::na_extreme_cnt) + return std::snprintf(cursor, bufsize, "NA_EXTREME_CNT"); + // if (status == row_status::na_low_cov) + return std::snprintf(cursor, bufsize, "NA_LOW_COV"); + }(); + + if (n_pval_bytes < 0) + return n_pval_bytes; + + bufsize -= n_pval_bytes; + cursor += n_pval_bytes; // NOLINT(*-pointer-arithmetic) + + const int n_param_bytes = [&] { + std::int32_t n_param_bytes = 0; + if (p_estim_alt.empty()) + p_estim_alt.resize(n_groups, 0.0); + for (auto g_idx = 0u; g_idx < n_groups; ++g_idx) { + const int n = + std::snprintf(cursor, bufsize, "\t%f", p_estim_alt[g_idx]); + bufsize -= n; + cursor += n; // NOLINT(*-pointer-arithmetic) + if (n < 0) + return n; + n_param_bytes += n; + } + const auto od = overdispersion_factor(n_samples, phi_estim_alt); + const int n = std::snprintf(cursor, bufsize, "\t%f\n", od); if (n < 0) return n; n_param_bytes += n; - } - const auto od = overdispersion_factor(n_samples, phi_estim_alt); - const int n = std::snprintf(cursor, bufsize, "\t%f\n", od); - if (n < 0) - return n; - n_param_bytes += n; - return n_param_bytes; - }(); + return n_param_bytes; + }(); - if (n_param_bytes < 0) - return n_param_bytes; + if 
(n_param_bytes < 0) + return n_param_bytes; - return n_prefix_bytes + n_pval_bytes + n_param_bytes; - }(); - } - }); + return n_prefix_bytes + n_pval_bytes + n_param_bytes; + }(); + } + }); } for (auto &thread : threads) @@ -261,7 +264,7 @@ that the design matrix and the proportion table are correctly formatted. } int -main_radmeth_nano(int argc, char *argv[]) { +main_radmeth_nano(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static const std::string description = "calculate differential methylation scores for nanopore data"; @@ -274,8 +277,8 @@ main_radmeth_nano(int argc, char *argv[]) { std::uint32_t n_threads{1}; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " "); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " "); opt_parse.set_show_defaults(); opt_parse.add_opt("out", 'o', "output file", true, outfile); opt_parse.add_opt("threads", 't', "number of threads to use", false, From 45e5a6f51209e56d3a176ef7847637885ecc8fd3 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 062/106] src/radmeth/radmeth_optimize_gamma.cpp: changes to add static analysis --- src/radmeth/radmeth_optimize_gamma.cpp | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/radmeth/radmeth_optimize_gamma.cpp b/src/radmeth/radmeth_optimize_gamma.cpp index 533ee81d..5bb97f6f 100644 --- a/src/radmeth/radmeth_optimize_gamma.cpp +++ b/src/radmeth/radmeth_optimize_gamma.cpp @@ -18,16 +18,19 @@ #include "radmeth_model.hpp" #include "radmeth_optimize_params.hpp" -#include -#include - -#include #include #include +#include +#include #include -#include #include +#include +#include +#include + +// NOLINTBEGIN(*-pointer-arithmetic,*-avoid-magic-numbers,*-constant-array-index,*-avoid-do-while) + [[nodiscard]] static inline auto logistic(const double x) -> double { return 1.0 / (1.0 / std::exp(x) + 1.0); @@ -92,7 +95,7 @@ log_likelihood(const gsl_vector *params, Regression ®) -> double { range 0-1 */ // clang-format off static constexpr std::array psi1_cs { - -0.038057080835217922, // == -0.019028540417608961*2 + -0.038057080835217922, // == -0.019028540417608961*2 +0.491415393029387130, -0.056815747821244730, +0.008357821225914313, @@ -122,7 +125,7 @@ static constexpr std::array psi1_cs { function */ // clang-format off static constexpr std::array psi2_cs = { - -0.0204749044678185, // == -0.01023745223390925*2 + -0.0204749044678185, // == -0.01023745223390925*2 -0.0101801271534859, +0.0000559718725387, -0.0000012917176570, @@ -344,3 +347,5 @@ fit_regression_model_gamma(Regression &r, double &dispersion_estimate) { fit_regression_model(r, p_estimates, dispersion_estimate); } + +// NOLINTEND(*-pointer-arithmetic,*-avoid-magic-numbers,*-constant-array-index,*-avoid-do-while) From e3587f5ad0980491168f73a49562498395864b77 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 063/106] src/radmeth/radmeth_optimize_series.cpp: changes to add static analysis --- src/radmeth/radmeth_optimize_series.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/radmeth/radmeth_optimize_series.cpp b/src/radmeth/radmeth_optimize_series.cpp index bd54f087..4707e35f 100644 --- a/src/radmeth/radmeth_optimize_series.cpp +++ b/src/radmeth/radmeth_optimize_series.cpp @@ -14,19 +14,23 @@ */ #include "radmeth_optimize_series.hpp" - +#include "radmeth_design.hpp" 
#include "radmeth_model.hpp" #include "radmeth_optimize_params.hpp" -#include -#include - #include +#include #include +#include #include -#include #include +#include +#include +#include + +// NOLINTBEGIN(*-pointer-arithmetic,*-narrowing-conversions,*-constant-array-index,*-avoid-do-while,cert-flp30-c) + [[nodiscard]] static inline auto logistic(const double x) -> double { return 1.0 / (1.0 / std::exp(x) + 1.0); @@ -65,7 +69,8 @@ get_cached_log1p_factors(Regression ®, const double phi) { static inline auto get_cached_dispersion_effect(Regression ®, const double phi) { const std::size_t max_k = reg.max_r_count; - auto &cache_dispersion_effect = reg.cache_dispersion_effect; + auto &cache_dispersion_effect = // cppcheck-suppress constVariableReference + reg.cache_dispersion_effect; double j = -1.0; const auto lim = std::cbegin(cache_dispersion_effect) + max_k; for (auto it = std::begin(cache_dispersion_effect); it != lim; ++it, ++j) @@ -306,3 +311,5 @@ fit_regression_model(Regression &r, gsl_multimin_fdfminimizer_free(s); gsl_vector_free(params); } + +// NOLINTEND(*-pointer-arithmetic,*-narrowing-conversions,*-constant-array-index,*-avoid-do-while,cert-flp30-c) From a01de7101518d4db31de6ceeef5952ce483efae8 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 064/106] src/radmeth/radmeth_utils.cpp: changes to add static analysis --- src/radmeth/radmeth_utils.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_utils.cpp b/src/radmeth/radmeth_utils.cpp index 54a39c1c..530199b6 100644 --- a/src/radmeth/radmeth_utils.cpp +++ b/src/radmeth/radmeth_utils.cpp @@ -19,10 +19,14 @@ #include #include #include +#include #include +#include #include #include +// NOLINTBEGIN(*-narrowing-conversions,*-avoid-magic-numbers) + [[nodiscard]] std::string format_duration(const std::chrono::duration elapsed) { static constexpr auto s_per_h = 3600; @@ -45,7 +49,8 @@ file_progress::file_progress(const std::string &filename) : one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} void -file_progress::operator()(std::ifstream &in) { +file_progress::operator()( + std::ifstream &in) { // cppcheck-suppress constParameterReference const std::size_t curr_offset = in.eof() ? 
1000 : in.tellg() * one_thousand_over_filesize; if (curr_offset <= prev_offset) @@ -142,3 +147,5 @@ llr_test(const double null_loglik, const double full_loglik) { return p_value; } + +// NOLINTEND(*-narrowing-conversions,*-avoid-magic-numbers) From cd37dfd4d41cce22292fad4c9848a655dcc9cd66 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 065/106] src/radmeth/radmeth_utils.hpp: changes to add static analysis --- src/radmeth/radmeth_utils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/radmeth/radmeth_utils.hpp b/src/radmeth/radmeth_utils.hpp index cae962c1..4522fe55 100644 --- a/src/radmeth/radmeth_utils.hpp +++ b/src/radmeth/radmeth_utils.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -29,7 +30,7 @@ struct file_progress { std::size_t prev_offset{}; explicit file_progress(const std::string &filename); void - operator()(std::ifstream &in); + operator()(std::ifstream &in); // cppcheck-suppress constParameterReference }; [[nodiscard]] double From aa6e06ce44c069bf36d08cf7e2c923fd72381cf4 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 066/106] src/utils/clean-hairpins.cpp: changes to add static analysis --- src/utils/clean-hairpins.cpp | 245 ++++++++++++++++++----------------- 1 file changed, 123 insertions(+), 122 deletions(-) diff --git a/src/utils/clean-hairpins.cpp b/src/utils/clean-hairpins.cpp index 24147870..29f56462 100644 --- a/src/utils/clean-hairpins.cpp +++ b/src/utils/clean-hairpins.cpp @@ -16,85 +16,77 @@ * General Public License for more details. */ +#include "OptionParser.hpp" +#include "smithlab_utils.hpp" + #include #include #include -#include // std::size_t -#include // std::uint32_t and std::uint64_t +#include +#include +#include +#include #include -#include #include #include #include +#include +#include #include #include #include -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -using std::array; -using std::cerr; -using std::cout; -using std::endl; -using std::floor; -using std::ifstream; -using std::istream; -using std::min; -using std::ofstream; -using std::ostream; -using std::ostringstream; -using std::runtime_error; -using std::size_t; -using std::string; -using std::uint32_t; -using std::uint64_t; -using std::vector; - -using bamxx::bgzf_file; +// NOLINTBEGIN(*-narrowing-conversions,*-magic-numbers) // store each read from one end struct FASTQRecord { - string name; - string seq; - string qual; + std::string name; + std::string seq; + std::string qual; }; -bgzf_file & -operator<<(bgzf_file &out, const FASTQRecord &r) { +bamxx::bgzf_file & +operator<<(bamxx::bgzf_file &out, const FASTQRecord &r) { // below, the other chars are @, + and four newlines - static const uint32_t other_chars = 6; - const size_t buf_size = + static const std::uint32_t other_chars = 6; + const std::size_t buf_size = size(r.name) + size(r.seq) + size(r.qual) + other_chars; - string buffer(buf_size, '\0'); - string::iterator b = begin(buffer); + std::string buffer(buf_size, '\0'); + std::string::iterator b = begin(buffer); *b++ = '@'; - b = copy(cbegin(r.name), cend(r.name), b); + b = copy(std::cbegin(r.name), std::cend(r.name), b); *b++ = '\n'; - b = copy(cbegin(r.seq), cend(r.seq), b); + b = copy(std::cbegin(r.seq), std::cend(r.seq), b); *b++ = '\n'; *b++ = '+'; *b++ = '\n'; - b = copy(cbegin(r.qual), cend(r.qual), b); + b = copy(std::cbegin(r.qual), std::cend(r.qual), b); *b++ = '\n'; 
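  // Editor's note (not part of the original patch): at this point the buffer
  // holds one complete four-line FASTQ record -- "@" + name, the sequence, a
  // bare "+" separator line, and the quality string, each newline-terminated
  // -- which is what the single write below emits.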
out.write(buffer); return out; } // Read 4 lines one time from fastq and fill in the FASTQRecord structure -static bgzf_file & -operator>>(bgzf_file &s, FASTQRecord &r) { - constexpr auto n_error_codes = 5u; - - enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual }; - - static const array error_msg = { - runtime_error(""), runtime_error("failed to parse fastq name line"), - runtime_error("failed to parse fastq sequence line"), - runtime_error("failed to parse fastq plus line"), - runtime_error("failed to parse fastq qual line")}; +static bamxx::bgzf_file & +operator>>(bamxx::bgzf_file &s, FASTQRecord &r) { + static constexpr auto n_error_codes = 5u; + + enum err_code : std::uint8_t { + none, + bad_name, + bad_seq, + bad_plus, + bad_qual, + }; + + static const std::array error_msg = { + std::runtime_error(""), + std::runtime_error("failed to parse fastq name line"), + std::runtime_error("failed to parse fastq sequence line"), + std::runtime_error("failed to parse fastq plus line"), + std::runtime_error("failed to parse fastq qual line"), + }; err_code ec = err_code::none; @@ -105,13 +97,14 @@ operator>>(bgzf_file &s, FASTQRecord &r) { ec = err_code::bad_name; const auto nm_end = r.name.find_first_of(" \t"); - const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1; - r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name)); + const auto nm_sz = (nm_end == std::string::npos ? r.name.size() : nm_end) - 1; + r.name.erase(copy_n(std::cbegin(r.name) + 1, nm_sz, begin(r.name)), + std::cend(r.name)); if (!getline(s, r.seq)) ec = err_code::bad_seq; - string tmp; + std::string tmp; if (!getline(s, tmp)) ec = err_code::bad_plus; @@ -119,7 +112,7 @@ operator>>(bgzf_file &s, FASTQRecord &r) { ec = err_code::bad_qual; if (ec != err_code::none) - throw error_msg[ec]; + throw error_msg[ec]; // NOLINT return s; } @@ -132,12 +125,13 @@ similar_letters_bisulfite_tc_and_ag(const char a, const char b) { // compare two reads to detect the overlapped region static double -similarity_both_bisulfite_conversions(const string &s1, const string &s2) { - const size_t lim = min(size(s1), size(s2)); +similarity_both_bisulfite_conversions(const std::string &s1, + const std::string &s2) { + const std::size_t lim = std::min(size(s1), size(s2)); - uint32_t total_letters = 0; - uint32_t matching_letters = 0; - for (size_t i = 0; i < lim; ++i) { + std::uint32_t total_letters = 0; + std::uint32_t matching_letters = 0; + for (std::size_t i = 0; i < lim; ++i) { matching_letters += (similar_letters_bisulfite_tc_and_ag(s1[i], s2[i])); total_letters += (valid_base(s1[i]) && valid_base(s2[i])); } @@ -146,20 +140,19 @@ similarity_both_bisulfite_conversions(const string &s1, const string &s2) { } struct hp_summary { - explicit hp_summary(const double cutoff) : cutoff{cutoff} {} // n_reads is the total number of read pairs in the input fastq // files. - uint64_t n_reads{}; + std::uint64_t n_reads{}; // n_good_reads is the total number of read pairs that together have // a fraction of good bases above the minimum specified. - uint64_t n_good_reads{}; + std::uint64_t n_good_reads{}; // n_hairpin_reads is the number of read pairs identified as being // hairpins using the criteria in the "cutoff" variable. - uint64_t n_hairpin_reads{}; + std::uint64_t n_hairpin_reads{}; // sum_percent_match_good is the sum of the percent matches between // the read ends for reads that do not meet the criteria for hairpin. 
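// Editor's sketch (not part of the original patch): the hairpin detection in
// this file scores read pairs with a base comparison that tolerates bisulfite
// conversion, i.e. T and C count as a match, as do A and G. The helper below
// is an independent illustration of that idea, not the exact function used
// above; in particular, handling of 'N' and other non-ACGT characters may
// differ.
static bool
similar_bases_bisulfite(const char a, const char b) {
  if (a == b)
    return a == 'A' || a == 'C' || a == 'G' || a == 'T';
  return (a == 'T' && b == 'C') || (a == 'C' && b == 'T') ||
         (a == 'A' && b == 'G') || (a == 'G' && b == 'A');
}
// Example: comparing "TTGA" with "CTGG" gives 4/4 similar positions
// (T/C, T/T, G/G, A/G), which would push percent_match toward the hairpin
// cutoff.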
@@ -182,14 +175,16 @@ struct hp_summary { // sum_percent_match_bad over the total hairpin reads. double mean_percent_match_hairpin{}; - auto assign_values() -> void { + auto + assign_values() -> void { mean_percent_match_non_hairpin = sum_percent_match_good / (n_reads - n_hairpin_reads); mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads; } - auto tostring() const -> string { - ostringstream oss; + auto + tostring() const -> std::string { + std::ostringstream oss; oss << "total_reads_pairs: " << n_reads << '\n' << "good_reads_pairs: " << n_good_reads << '\n' << "hairpin_read_pairs: " << n_hairpin_reads << '\n' @@ -204,65 +199,71 @@ struct hp_summary { }; static void -write_histogram(const string &hist_outfile, vector hist) { - ofstream hist_out(hist_outfile); +write_histogram(const std::string &hist_outfile, std::vector hist) { + std::ofstream hist_out(hist_outfile); if (!hist_out) - throw runtime_error("failed to open file: " + hist_outfile); - const auto total = accumulate(cbegin(hist), cend(hist), 0.0); - transform(cbegin(hist), cend(hist), begin(hist), - [&total](const double t) { return t / total; }); + throw std::runtime_error("failed to open file: " + hist_outfile); + const auto total = std::accumulate(std::cbegin(hist), std::cend(hist), 0.0); + std::transform(std::cbegin(hist), std::cend(hist), std::begin(hist), + [&](const double t) { return t / total; }); const double increment = 1.0 / size(hist); auto idx = 0; hist_out << std::fixed; hist_out.precision(3); + // NOLINTBEGIN(clang-analyzer-*,cert-flp30-c) for (double offset = 0.0; offset < 1.0; offset += increment) hist_out << offset << '\t' << hist[idx++] << '\n'; + // NOLINTEND(clang-analyzer-*,cert-flp30-c) } static void -write_statistics(const string &filename, hp_summary hps) { +write_statistics(const std::string &filename, hp_summary hps) { hps.assign_values(); - ofstream out(filename); + std::ofstream out(filename); if (!out) - throw runtime_error("failed to open file: " + filename); + throw std::runtime_error("failed to open file: " + filename); out << hps.tostring(); } static inline double fraction_good_bases(const FASTQRecord &a, const FASTQRecord &b) { - const double a_bases = count_if(cbegin(a.seq), cend(a.seq), &valid_base); - const double b_bases = count_if(cbegin(b.seq), cend(b.seq), &valid_base); + const double a_bases = + std::count_if(std::cbegin(a.seq), std::cend(a.seq), &valid_base); + const double b_bases = + std::count_if(std::cbegin(b.seq), std::cend(b.seq), &valid_base); return (a_bases + b_bases) / (size(a.seq) + size(b.seq)); } struct clean_hairpin { double cutoff{0.95}; - // ADS: this was uint64_t but g++-14.2.0 on macOS had a problem - size_t n_reads_to_check{std::numeric_limits::max()}; + // ADS: this was std::uint64_t but g++-14.2.0 on macOS had a problem + std::size_t n_reads_to_check{std::numeric_limits::max()}; double min_good_base_percent{0.5}; - uint32_t min_read_length{32}; - uint32_t n_hist_bins{20}; + std::uint32_t min_read_length{32}; + std::uint32_t n_hist_bins{20}; bool invert_output{false}; - hp_summary analyze_reads(const string &outfile1, const string &outfile2, - bgzf_file &in1, bgzf_file &in2, - vector &hist) const; - hp_summary analyze_reads(bgzf_file &in1, bgzf_file &in2, - vector &hist) const; + hp_summary + analyze_reads(const std::string &outfile1, const std::string &outfile2, + bamxx::bgzf_file &in1, bamxx::bgzf_file &in2, + std::vector &hist) const; + hp_summary + analyze_reads(bamxx::bgzf_file &in1, bamxx::bgzf_file &in2, + std::vector &hist) const; }; 
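// Typical use of clean_hairpin, condensed from main_clean_hairpins below
// (error handling omitted; file names are hypothetical):
//
//   clean_hairpin ch{};                        // cutoff 0.95, 20 histogram bins
//   std::vector<double> hist(ch.n_hist_bins, 0.0);
//   bamxx::bgzf_file in1("reads_1.fq.gz", "r");
//   bamxx::bgzf_file in2("reads_2.fq.gz", "r");
//   const hp_summary hps =
//     ch.analyze_reads("clean_1.fq.gz", "clean_2.fq.gz", in1, in2, hist);
//   // hps carries the read-pair and hairpin counts (see hp_summary above);
//   // hist accumulates the distribution of per-pair percent-match values.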
hp_summary -clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2, - bgzf_file &in1, bgzf_file &in2, - vector &hist) const { - +clean_hairpin::analyze_reads(const std::string &outfile1, + const std::string &outfile2, bamxx::bgzf_file &in1, + bamxx::bgzf_file &in2, + std::vector &hist) const { // output files for read1 and read2 with hairpins removed - bgzf_file out1(outfile1, "w"); + bamxx::bgzf_file out1(outfile1, "w"); if (!out1) - throw runtime_error("cannot open output file: " + outfile1); - bgzf_file out2(outfile2, "w"); + throw std::runtime_error("cannot open output file: " + outfile1); + bamxx::bgzf_file out2(outfile2, "w"); if (!out2) - throw runtime_error("cannot open output file: " + outfile2); + throw std::runtime_error("cannot open output file: " + outfile2); hp_summary hps{cutoff}; FASTQRecord r1, r2; @@ -280,7 +281,7 @@ clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2, similarity_both_bisulfite_conversions(r1.seq, r2.seq); // ADS: need a bitter way to get this bin identifier - ++hist[floor(percent_match * n_hist_bins)]; + ++hist[std::floor(percent_match * n_hist_bins)]; if (percent_match > cutoff) { hps.sum_percent_match_bad += percent_match; @@ -302,8 +303,8 @@ clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2, } hp_summary -clean_hairpin::analyze_reads(bgzf_file &in1, bgzf_file &in2, - vector &hist) const { +clean_hairpin::analyze_reads(bamxx::bgzf_file &in1, bamxx::bgzf_file &in2, + std::vector &hist) const { hp_summary hps{cutoff}; FASTQRecord r1, r2; while (hps.n_reads < n_reads_to_check && in1 >> r1 && in2 >> r2) { @@ -317,7 +318,7 @@ clean_hairpin::analyze_reads(bgzf_file &in1, bgzf_file &in2, const double percent_match = similarity_both_bisulfite_conversions(r1.seq, r2.seq); - ++hist[floor(percent_match * n_hist_bins)]; + ++hist[std::floor(percent_match * n_hist_bins)]; if (percent_match > cutoff) { hps.sum_percent_match_bad += percent_match; @@ -330,24 +331,21 @@ clean_hairpin::analyze_reads(bgzf_file &in1, bgzf_file &in2, } int -main_clean_hairpins(int argc, char *argv[]) { - - static const string description = "fix and stat invdup/hairpin reads"; - +main_clean_hairpins(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) + static constexpr auto description = "fix and stat invdup/hairpin reads"; try { - - string outfile1; - string outfile2; - string stat_outfile; - string hist_outfile; + std::string outfile1; + std::string outfile2; + std::string stat_outfile; + std::string hist_outfile; clean_hairpin ch{}; bool verbose = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " ", true); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " ", true); opt_parse.set_show_defaults(); opt_parse.add_opt("out1", 'o', "output file for read 1", false, outfile1); opt_parse.add_opt("out2", 'p', "output file for read 2", false, outfile2); @@ -367,42 +365,43 @@ main_clean_hairpins(int argc, char *argv[]) { opt_parse.add_opt("bins", 'b', "number of histograms bins", false, ch.n_hist_bins); opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if 
(opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested() || leftover_args.size() != 2) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } - const string reads_file1(leftover_args.front()); - const string reads_file2(leftover_args.back()); + const std::string reads_file1(leftover_args.front()); + const std::string reads_file2(leftover_args.back()); /****************** END COMMAND LINE OPTIONS *****************/ if (outfile1.empty() != outfile2.empty()) { // ADS: add message about number of reads to check and output files - cerr << "error: specify both or neither of out1/o and out2/p" << endl; + std::cerr << "error: specify both or neither of out1/o and out2/p" + << '\n'; return EXIT_FAILURE; } const bool write_reads = !outfile1.empty(); - vector hist(ch.n_hist_bins, 0.0); + std::vector hist(ch.n_hist_bins, 0.0); // Input: paired-end reads with end1 and end2 - bgzf_file in1(reads_file1, "r"); + bamxx::bgzf_file in1(reads_file1, "r"); if (!in1) - throw runtime_error("cannot open input file: " + reads_file1); - bgzf_file in2(reads_file2, "r"); + throw std::runtime_error("cannot open input file: " + reads_file1); + bamxx::bgzf_file in2(reads_file2, "r"); if (!in2) - throw runtime_error("cannot open input file: " + reads_file2); + throw std::runtime_error("cannot open input file: " + reads_file2); const hp_summary hps = write_reads ? ch.analyze_reads(outfile1, outfile2, in1, in2, hist) @@ -415,8 +414,10 @@ main_clean_hairpins(int argc, char *argv[]) { write_histogram(hist_outfile, hist); } catch (const std::exception &e) { - cerr << e.what() << endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-narrowing-conversions,*-magic-numbers) From 7c1f417f304014ad246b5a98a8f776384b1fc40e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 067/106] src/utils/covered.cpp: changes to add static analysis --- src/utils/covered.cpp | 79 +++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 36 deletions(-) diff --git a/src/utils/covered.cpp b/src/utils/covered.cpp index 7836ed15..7c0f7c54 100644 --- a/src/utils/covered.cpp +++ b/src/utils/covered.cpp @@ -15,90 +15,93 @@ * General Public License for more details. 
*/ +#include "OptionParser.hpp" + #include +#include +#include + +#include +#include +#include +#include #include +#include #include #include #include -// from smithlab_cpp -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -using std::cerr; -using std::cout; -using std::endl; -using std::runtime_error; using std::string; -using bamxx::bgzf_file; - - static inline bool get_is_mutated(const kstring_t &line) { - const auto end_itr = line.s + line.l; + const auto end_itr = line.s + line.l; // NOLINT(*-pointer-arithmetic) return std::find(line.s, end_itr, 'x') != end_itr; } static inline uint32_t get_n_reads(const kstring_t &line) { - const auto end_itr = std::make_reverse_iterator(line.s + line.l); - const auto beg_itr = std::make_reverse_iterator(line.s); + const auto end_itr = std::make_reverse_iterator( + line.s + line.l); // NOLINT(*-pointer-arithmetic) + const auto beg_itr = + std::make_reverse_iterator(line.s); // NOLINT(*-pointer-arithmetic) auto n_reads_pos = std::find_if( end_itr, beg_itr, [](const char c) { return c == ' ' || c == '\t'; }); ++n_reads_pos; - return atoi(n_reads_pos.base()); + return atoi(n_reads_pos.base()); // NOLINT(cert-err34-c) } int -main_covered(int argc, char *argv[]) { +main_covered(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - size_t n_threads = 1; + std::int32_t n_threads = 1; - string outfile{"-"}; - const string description = + std::string outfile{"-"}; + const std::string description = "filter counts files so they only have covered sites"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " (\"-\" for standard input)", 1); opt_parse.add_opt("output", 'o', "output file (default is standard out)", false, outfile); opt_parse.add_opt("threads", 't', "threads for compression (use few)", false, n_threads); - std::vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } - const string filename(leftover_args.front()); + const std::string filename(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ bamxx::bam_tpool tpool(n_threads); - bgzf_file in(filename, "r"); - if (!in) throw runtime_error("could not open file: " + filename); + bamxx::bgzf_file in(filename, "r"); + if (!in) + throw std::runtime_error("could not open file: " + filename); const auto outfile_mode = in.is_compressed() ? 
"w" : "wu"; - bgzf_file out(outfile, outfile_mode); - if (!out) throw runtime_error("error opening output file: " + outfile); + bamxx::bgzf_file out(outfile, outfile_mode); + if (!out) + throw std::runtime_error("error opening output file: " + outfile); if (n_threads > 1) { // ADS: something breaks when we use the thread for the input @@ -110,25 +113,29 @@ main_covered(int argc, char *argv[]) { // use the kstring_t type to more directly use the BGZF file kstring_t line{0, 0, nullptr}; const int ret = ks_resize(&line, 1024); - if (ret) throw runtime_error("failed to acquire buffer"); + if (ret) + throw std::runtime_error("failed to acquire buffer"); bool write_ok = true; while (bamxx::getline(in, line) && write_ok) { const bool is_mutated = get_is_mutated(line); const uint32_t n_reads = get_n_reads(line); if (n_reads > 0u || is_mutated) { + // NOLINTBEGIN(*-pointer-arithmetic) line.s[line.l++] = '\n'; write_ok = (bgzf_write(out.f, line.s, line.l) == static_cast(line.l)); + // NOLINTEND(*-pointer-arithmetic) } } if (!write_ok) { - cerr << "failed writing to: " << outfile << '\n'; + std::cerr << "failed writing to: " << outfile << '\n'; return EXIT_FAILURE; } + ks_free(&line); } - catch (const std::runtime_error &e) { - cerr << e.what() << endl; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 246d7a12b21719cb9af3b02a9847c6c6bf893db2 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 068/106] src/utils/fast-liftover.cpp: changes to add static analysis --- src/utils/fast-liftover.cpp | 74 ++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/src/utils/fast-liftover.cpp b/src/utils/fast-liftover.cpp index cede0633..f5a0291c 100644 --- a/src/utils/fast-liftover.cpp +++ b/src/utils/fast-liftover.cpp @@ -16,43 +16,40 @@ * GNU General Public License for more details. 
*/ - /* Sample indexfile line: [T-chr] [T-start] [T-end] [S-chr]:[S-start]:[S-end]:[S-strand] [] [T-strand] chr21 26608683 26608684 chr1:3007015:3007016:- 0 + */ -#include -#include +#include "MSite.hpp" +#include "OptionParser.hpp" + +#include +#include #include -#include #include -#include -#include +#include +#include #include -#include -#include // for [u]int[0-9]+_t - -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "OptionParser.hpp" -#include "GenomicRegion.hpp" -#include "MSite.hpp" +#include +#include +#include +#include -using std::string; -using std::ios_base; -using std::vector; -using std::cout; using std::cerr; +using std::cout; using std::endl; -using std::unordered_map; +using std::ios_base; using std::runtime_error; +using std::string; +using std::unordered_map; +using std::vector; struct SimpleSite { string chrom; - uint32_t pos; - char strand; + uint32_t pos{}; + char strand{}; SimpleSite() {} SimpleSite(const string &c, const uint32_t p, const char s) : chrom(c), pos(p), strand(s) {} @@ -66,18 +63,16 @@ flip_strand(SimpleSite &s) { } } -typedef -unordered_map liftover_index; +typedef unordered_map liftover_index; static void read_index_file(const bool plus_strand, const string &index_file, unordered_map &index) { - std::ifstream in(index_file); if (!in) throw runtime_error("problem opening index file: " + index_file); - size_t from_pos, to_pos; + size_t from_pos{}, to_pos{}; string from_chrom, to_chrom; string to_strand; MSite curr_site; @@ -92,7 +87,6 @@ read_index_file(const bool plus_strand, const string &index_file, static bool lift_site(const unordered_map &index, MSite &meth_site) { - auto chrom_index = index.find(meth_site.chrom); if (chrom_index == end(index)) return false; @@ -108,7 +102,7 @@ lift_site(const unordered_map &index, } int -main_fast_liftover(int argc, char *argv[]) { +main_fast_liftover(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { string indexfile; string tofile; @@ -119,8 +113,8 @@ main_fast_liftover(int argc, char *argv[]) { bool plus_strand = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), - "Fast liftOver-all cytosine-by strand" ); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + "Fast liftOver-all cytosine-by strand"); opt_parse.add_opt("indexfile", 'i', "index file", true, indexfile); opt_parse.add_opt("from", 'f', "Original file", true, fromfile); opt_parse.add_opt("to", 't', "Output file liftovered", true, tofile); @@ -134,22 +128,22 @@ main_fast_liftover(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } /****************** END COMMAND LINE OPTIONS *****************/ unordered_map index; if (VERBOSE) - cerr << "[loading liftover index file " << indexfile << "]" << endl; + cerr << "[loading liftover index file " << indexfile << "]\n"; read_index_file(plus_strand, indexfile, index); std::ifstream in(fromfile); @@ -165,22 +159,18 @@ main_fast_liftover(int argc, char *argv[]) { unlifted.open(unlifted_file.c_str()); if 
(VERBOSE) - cerr << "[lifting from: " << fromfile << " to: " << tofile << "]" << endl; + cerr << "[lifting from: " << fromfile << " to: " << tofile << "]\n"; MSite lifted, meth_site; while (in >> meth_site) { if (lift_site(index, meth_site)) - out << meth_site << endl; + out << meth_site << '\n'; else if (!unlifted_file.empty()) - unlifted << meth_site << endl; + unlifted << meth_site << '\n'; } } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 9bc0f1eb8739f9a78c46b18a1b499e230b5c9cb3 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 069/106] src/utils/format-reads.cpp: changes to add static analysis --- src/utils/format-reads.cpp | 73 +++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/src/utils/format-reads.cpp b/src/utils/format-reads.cpp index f2b674d9..dfd7a94a 100644 --- a/src/utils/format-reads.cpp +++ b/src/utils/format-reads.cpp @@ -27,38 +27,46 @@ * A-rich and then changing that format to indicate T-rich. */ -// from dnmtools +#include "OptionParser.hpp" #include "bam_record_utils.hpp" #include "dnmt_error.hpp" -// from smithlab -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include #include #include -#include // for [u]int[0-9]+_t +#include +#include +#include +#include #include -#include +#include #include +#include +#include +#include #include -#include #include #include +#include #include +// NOLINTBEGIN(*-avoid-magic-numbers,cert-*-cpp) + static std::int32_t -merge_ends(bamxx::bam_rec &one, bamxx::bam_rec &two, bamxx::bam_rec &merged) { +merge_ends(const bamxx::bam_rec &one, const bamxx::bam_rec &two, + bamxx::bam_rec &merged) { if (!are_mates(one, two)) return std::numeric_limits::min(); // arithmetic easier using base 0 so subtracting 1 from pos + // NOLINTBEGIN(*-narrowing-conversions) const int one_s = get_pos(one); const int one_e = get_endpos(one); const int two_s = get_pos(two); const int two_e = get_endpos(two); + // NOLINTEND(*-narrowing-conversions) assert(one_s >= 0 && two_s >= 0); const int spacer = two_s - one_e; @@ -175,6 +183,8 @@ get_max_repeat_count(const std::vector &names, // allow the repeat_count to go to 2, which might not be the "max" // but still would indicate that this suffix length is too long and // would result in more that two reads identified mutually as mates. 
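  // In other words: after dropping the last suff_len characters, a usable
  // suffix length leaves each read name shared by at most two records (the
  // two mates). A compact sketch of the same check (not the streaming
  // comparison over adjacent sorted names used below; needs <map>):
  //
  //   static bool suffix_len_ok(std::vector<std::string> names,
  //                             const std::size_t suff_len) {
  //     std::map<std::string, int> counts;
  //     for (auto &name : names) {
  //       // assumes suff_len is smaller than every name, as check_suff_len enforces
  //       name.resize(name.size() - suff_len);
  //       if (++counts[name] > 2)
  //         return false;  // three or more reads would be mutual "mates"
  //     }
  //     return true;
  //   }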
+ + // NOLINTBEGIN(*-narrowing-conversions) for (std::size_t i = 1; i < names.size() && repeat_count < 2; ++i) { if (names[i - 1].size() == names[i].size() && std::equal(std::cbegin(names[i - 1]), @@ -184,6 +194,7 @@ get_max_repeat_count(const std::vector &names, tmp_repeat_count = 0; repeat_count = std::max(repeat_count, tmp_repeat_count); } + // NOLINTEND(*-narrowing-conversions) return repeat_count; } @@ -194,9 +205,14 @@ check_suff_len(const std::string &inputfile, const std::size_t suff_len, more than two reads being mutually considered mates */ auto names(load_read_names(inputfile, n_names_to_check)); // get the minimum read name length - std::size_t min_name_len = std::numeric_limits::max(); - for (auto &&i : names) - min_name_len = std::min(min_name_len, i.size()); + const std::size_t min_name_len = + std::accumulate(std::cbegin(names), std::cend(names), + std::numeric_limits::max(), + [](const auto a, const auto &name) { + return std::min(a, std::size(name)); + }); + // for (auto &&i : names) + // min_name_len = std::min(min_name_len, std::size(i)); if (min_name_len <= suff_len) throw dnmt_error("given suffix length exceeds min read name length"); std::sort(std::begin(names), std::end(names)); @@ -214,9 +230,16 @@ guess_suff_len(const std::string &inputfile, const std::size_t n_names_to_check, } // get the minimum read name length - std::size_t min_name_len = std::numeric_limits::max(); - for (auto &&i : names) - min_name_len = std::min(min_name_len, i.size()); + const std::size_t min_name_len = + std::accumulate(std::cbegin(names), std::cend(names), + std::numeric_limits::max(), + [](const auto a, const auto &name) { + return std::min(a, std::size(name)); + }); + + // std::size_t min_name_len = std::numeric_limits::max(); + // for (auto &&i : names) + // min_name_len = std::min(min_name_len, i.size()); assert(min_name_len > 0); std::sort(std::begin(names), std::end(names)); @@ -313,12 +336,12 @@ same_name(const bamxx::bam_rec &a, const bamxx::bam_rec &b, } static inline void -swap(bamxx::bam_rec &a, bamxx::bam_rec &b) { +swap(bamxx::bam_rec &a, bamxx::bam_rec &b) noexcept { std::swap(a.b, b.b); } static void -format(const std::string &cmd, const std::size_t n_threads, +format(const std::string &cmd, const std::int32_t n_threads, const std::string &inputfile, const std::string &outfile, const bool bam_format, const std::string &input_format, const std::size_t suff_len, const std::int32_t max_frag_len) { @@ -346,11 +369,10 @@ format(const std::string &cmd, const std::size_t n_threads, } bamxx::bam_rec aln, prev_aln, merged; - bool previous_was_merged = false; - const bool empty_reads_file = !hts.read(hdr, aln); if (!empty_reads_file) { + bool previous_was_merged = false; // ADS: if input is strict SAM that means any read with the rev bits set // in the flags has been reverse complemented and is not the original @@ -416,7 +438,7 @@ format(const std::string &cmd, const std::size_t n_threads, } auto -get_command_line(const int argc, char *argv[], +get_command_line(const int argc, char *argv[], // NOLINT(*-avoid-c-arrays) const std::string &prefix = std::string{}) -> std::string { if (argc == 0) return std::string{}; @@ -425,12 +447,12 @@ get_command_line(const int argc, char *argv[], cmd << prefix << " "; std::copy(argv, argv + (argc - 1), std::ostream_iterator(cmd, " ")); - cmd << argv[argc - 1]; + cmd << argv[argc - 1]; // NOLINT(*-pointer-arithmetic) return cmd.str(); } int -main_format(int argc, char *argv[]) { +main_format(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try 
{ std::size_t n_reads_to_check{1000000}; @@ -444,11 +466,12 @@ main_format(int argc, char *argv[]) { bool single_end{false}; bool verbose{false}; bool force{false}; - std::size_t n_threads{1}; + std::int32_t n_threads{1}; const auto description = "convert SAM/BAM mapped bs-seq reads " "to standard dnmtools format"; - const auto prog = std::string{"dnmtools"} + " " + argv[0]; + const auto prog = + std::string{"dnmtools"} + " " + argv[0]; // NOLINT(*-pointer-arithmetic) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(prog, description, " [out-file]", 2); @@ -559,3 +582,5 @@ main_format(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,cert-*-cpp) From 09e9c21f4079f70ea8e22ed31fb3e1a3c3849857 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 070/106] src/utils/guessprotocol.cpp: changes to add static analysis --- src/utils/guessprotocol.cpp | 264 ++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 118 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index a2ef5923..85a3e5ff 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -16,36 +16,32 @@ * General Public License for more details. */ +#include "OptionParser.hpp" +#include "numerical_utils.hpp" + #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include +#include #include #include #include #include -#include - -#include "OptionParser.hpp" -#include "numerical_utils.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -using std::array; -using std::cerr; -using std::cout; -using std::endl; -using std::min; -using std::runtime_error; -using std::string; -using std::vector; -using bamxx::bgzf_file; +// NOLINTBEGIN(*-pointer-arithmetic,*-constant-array-index,*-narrowing-conversions,cert-err09-cpp,cert-err61-cpp) -constexpr int nuc_to_idx[] = { +// clang-format off +constexpr std::array nuc_to_idx = { /* 0*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 16*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 32*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -63,47 +59,51 @@ constexpr int nuc_to_idx[] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, }; +// clang-format on struct nucleotide_model { - vector pr{}; - vector lpr{}; + std::vector pr{}; + std::vector lpr{}; double bisulfite_conversion_rate{}; bool is_t_rich{}; - nucleotide_model(const vector &bc, const double conv_rate, - const bool itr) - : pr{bc}, bisulfite_conversion_rate{conv_rate}, is_t_rich{itr} { + nucleotide_model(const std::vector &bc, const double conv_rate, + const bool itr) : + pr{bc}, bisulfite_conversion_rate{conv_rate}, is_t_rich{itr} { auto nuc_from = is_t_rich ? 1 : 2; auto nuc_to = is_t_rich ? 3 : 0; pr[nuc_to] += bisulfite_conversion_rate * pr[nuc_from]; pr[nuc_from] *= (1.0 - bisulfite_conversion_rate); - assert(reduce(cbegin(pr), cend(pr), 0.0) == 1.0); + assert(std::reduce(std::cbegin(pr), std::cend(pr), 0.0) == 1.0); lpr.resize(std::size(pr)); - transform(cbegin(pr), cend(pr), begin(lpr), + transform(std::cbegin(pr), std::cend(pr), std::begin(lpr), [](const double x) { return log(x); }); } - double operator()(const string &s) const { - return accumulate(cbegin(s), cend(s), 0.0, - [&](const double x, const char c) { - const auto i = nuc_to_idx[static_cast(c)]; - return i == 4 ? 
x : x + lpr[i]; - }); + double + operator()(const std::string &s) const { + return std::accumulate(std::cbegin(s), std::cend(s), 0.0, + [&](const double x, const char c) { + const auto i = nuc_to_idx[static_cast(c)]; + return i == 4 ? x : x + lpr[i]; + }); }; - string tostring() const { + std::string + tostring() const { std::ostringstream oss; oss << "pr:\n"; - for (auto i : pr) oss << i << '\n'; + for (auto i : pr) + oss << i << '\n'; oss << "log pr:\n"; - for (auto i : lpr) oss << i << '\n'; + for (auto i : lpr) + oss << i << '\n'; oss << bisulfite_conversion_rate << '\n' << is_t_rich; return oss.str(); } }; struct guessprotocol_summary { - static constexpr auto wgbs_cutoff_confident = 0.99; static constexpr auto wgbs_cutoff_unconfident = 0.9; static constexpr auto rpbat_cutoff_confident_high = 0.8; @@ -111,14 +111,14 @@ struct guessprotocol_summary { static constexpr auto pbat_cutoff_unconfident = 0.1; static constexpr auto pbat_cutoff_confident = 0.01; - // protocol is the guessed protocol (wgbs, pbat, rpbat, or inconclusive) - // based on the content of the reads. - string protocol; + // protocol is the guessed protocol (wgbs, pbat or rpbat) based on the + // content of the reads. + std::string protocol; // confidence indicates the level of confidence in the guess for the // protocol. - string confidence; + std::string confidence; // layout indicates whether the reads are paired or single-ended. - string layout; + std::string layout; // n_reads_wgbs is the average number of reads (for single-ended reads) or // read pairs (for paired reads) where read1 is T-rich. double n_reads_wgbs{}; @@ -128,10 +128,9 @@ struct guessprotocol_summary { // the read1 of a read pair (for paired reads) is T-rich. double wgbs_fraction{}; - void evaluate() { - + void + evaluate() { const auto frac = n_reads_wgbs / n_reads; - protocol = "inconclusive"; // assigning wgbs (near one) if (frac > wgbs_cutoff_confident) { @@ -165,11 +164,12 @@ struct guessprotocol_summary { wgbs_fraction = frac; } - string tostring() const { + std::string + tostring() const { std::ostringstream oss; oss << "protocol: " << protocol << '\n' << "confidence: " << confidence << '\n' - << "wgbs_fraction: " << wgbs_fraction << '\n' + << "wgbs_fraction: " << wgbs_fraction << '\n' << "n_reads_wgbs: " << n_reads_wgbs << '\n' << "n_reads: " << n_reads; return oss.str(); @@ -178,111 +178,135 @@ struct guessprotocol_summary { // store each read from one end struct FASTQRecord { - string name; - string seq; + std::string name; + std::string seq; }; // see if two reads from two ends match to each other (they should // have the same name) static bool -mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends +mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends const FASTQRecord &a, const FASTQRecord &b) { assert(to_ignore_at_end < std::size(a.name)); - return equal(cbegin(a.name), cend(a.name) - to_ignore_at_end, - cbegin(b.name)); + return equal(std::cbegin(a.name), std::cend(a.name) - to_ignore_at_end, + std::cbegin(b.name)); } // Read 4 lines one time from fastq and fill in the FASTQRecord structure -static bgzf_file & -operator>>(bgzf_file &s, FASTQRecord &r) { - constexpr auto n_error_codes = 5u; - - enum err_code { none, bad_name, bad_seq, bad_plus, bad_qual }; +static bamxx::bgzf_file & +operator>>(bamxx::bgzf_file &s, FASTQRecord &r) { + static constexpr auto n_error_codes = 5u; + + enum err_code : std::uint8_t { + none, + bad_name, + bad_seq, + bad_plus, + bad_qual, + }; - static const array 
error_msg = { - runtime_error(""), runtime_error("failed to parse fastq name line"), - runtime_error("failed to parse fastq sequence line"), - runtime_error("failed to parse fastq plus line"), - runtime_error("failed to parse fastq qual line") + static const std::array error_msg = { + std::runtime_error(""), + std::runtime_error("failed to parse fastq name line"), + std::runtime_error("failed to parse fastq sequence line"), + std::runtime_error("failed to parse fastq plus line"), + std::runtime_error("failed to parse fastq qual line"), }; err_code ec = err_code::none; - if (!getline(s, r.name)) return s; + if (!getline(s, r.name)) + return s; - if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name; + if (r.name.empty() || r.name[0] != '@') + ec = err_code::bad_name; const auto nm_end = r.name.find_first_of(" \t"); - const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1; - r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name)); + const auto nm_sz = (nm_end == std::string::npos ? r.name.size() : nm_end) - 1; + r.name.erase(std::copy_n(std::cbegin(r.name) + 1, nm_sz, std::begin(r.name)), + std::cend(r.name)); - if (!getline(s, r.seq)) ec = err_code::bad_seq; + if (!getline(s, r.seq)) + ec = err_code::bad_seq; - string tmp; - if (!getline(s, tmp)) ec = err_code::bad_plus; + std::string tmp; + if (!getline(s, tmp)) + ec = err_code::bad_plus; - if (!getline(s, tmp)) ec = err_code::bad_qual; + if (!getline(s, tmp)) + ec = err_code::bad_qual; - if (ec != err_code::none) throw error_msg[ec]; + if (ec != err_code::none) + throw error_msg[ec]; // NOLINT(runtime/arrays) return s; } int -main_guessprotocol(int argc, char *argv[]) { - +main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - static const vector human_base_comp = {0.295, 0.205, 0.205, 0.295}; - static const vector flat_base_comp = {0.25, 0.25, 0.25, 0.25}; + static const std::vector human_base_comp = { + 0.295, + 0.205, + 0.205, + 0.295, + }; + static const std::vector flat_base_comp = { + 0.25, + 0.25, + 0.25, + 0.25, + }; constexpr auto description = "guess bisulfite protocol for a library"; bool verbose = false; bool use_human = false; - string outfile; - size_t reads_to_check = 1000000; + std::string outfile; + size_t reads_to_check = 1000000; // NOLINT(*-avoid-magic-numbers) size_t name_suffix_len = 0; - double bisulfite_conversion_rate = 0.98; + double bisulfite_conversion_rate = 0.98; // NOLINT(*-avoid-magic-numbers) namespace fs = std::filesystem; - const string cmd_name = std::filesystem::path(argv[0]).filename(); + const std::string cmd_name = std::filesystem::path(argv[0]).filename(); /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(cmd_name, description, " []"); - opt_parse.add_opt("nreads", 'n', "number of reads in initial check", - false, reads_to_check); - opt_parse.add_opt("ignore", 'i', "length of read name suffix " - "to ignore when matching", false, name_suffix_len); - opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate", - false, bisulfite_conversion_rate); - opt_parse.add_opt("human", 'H', "assume human genome", - false, use_human); + opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false, + reads_to_check); + opt_parse.add_opt("ignore", 'i', + "length of read name suffix " + "to ignore when matching", + false, name_suffix_len); + opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate", false, + bisulfite_conversion_rate); + opt_parse.add_opt("human", 'H', "assume 
human genome", false, use_human); opt_parse.add_opt("output", 'o', "output file name", false, outfile); opt_parse.add_opt("verbose", 'v', - "report available information during the run", - false, verbose); - vector leftover_args; + "report available information during the run", false, + verbose); + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested() || leftover_args.size() > 2) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } - const vector reads_files(leftover_args); + const std::vector reads_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ auto base_comp = flat_base_comp; - if (use_human) base_comp = human_base_comp; + if (use_human) + base_comp = human_base_comp; nucleotide_model t_rich_model(base_comp, bisulfite_conversion_rate, true); nucleotide_model a_rich_model(base_comp, bisulfite_conversion_rate, false); @@ -292,36 +316,37 @@ main_guessprotocol(int argc, char *argv[]) { if (verbose) { if (reads_files.size() == 2) - cerr << "data layout: " - << "paired" << '\n' - << "read1 file: " << reads_files.front() << '\n' - << "read2 file: " << reads_files.back() << '\n'; + std::cerr << "data layout: " + << "paired" << '\n' + << "read1 file: " << reads_files.front() << '\n' + << "read2 file: " << reads_files.back() << '\n'; else - cerr << "data layout: " - << "single" << '\n' - << "read file: " << reads_files.front() << '\n'; - cerr << "reads to check: " << reads_to_check << '\n' - << "read name suffix length: " << name_suffix_len << '\n' - << "bisulfite conversion: " << bisulfite_conversion_rate << '\n'; + std::cerr << "data layout: " + << "single" << '\n' + << "read file: " << reads_files.front() << '\n'; + std::cerr << "reads to check: " << reads_to_check << '\n' + << "read name suffix length: " << name_suffix_len << '\n' + << "bisulfite conversion: " << bisulfite_conversion_rate + << '\n'; } if (reads_files.size() == 2) { - // input: paired-end reads with end1 and end2 - bgzf_file in1(reads_files.front(), "r"); + bamxx::bgzf_file in1(reads_files.front(), "r"); if (!in1) - throw runtime_error("cannot open file: " + reads_files.front()); + throw std::runtime_error("cannot open file: " + reads_files.front()); - bgzf_file in2(reads_files.back(), "r"); + bamxx::bgzf_file in2(reads_files.back(), "r"); if (!in2) - throw runtime_error("cannot open file: " + reads_files.back()); + throw std::runtime_error("cannot open file: " + reads_files.back()); FASTQRecord r1, r2; while (in1 >> r1 && in2 >> r2 && summary.n_reads < reads_to_check) { summary.n_reads++; if (!mates(name_suffix_len, r1, r2)) - throw runtime_error("expected mates: " + r1.name + ", " + r2.name); + throw std::runtime_error("expected mates: " + r1.name + ", " + + r2.name); const double ta = t_rich_model(r1.seq) + a_rich_model(r2.seq); const double at = a_rich_model(r1.seq) + t_rich_model(r2.seq); @@ -331,11 +356,10 @@ main_guessprotocol(int argc, char *argv[]) { } } else { - // input: single-end reads - bgzf_file in(reads_files.front(), "r"); + bamxx::bgzf_file 
in(reads_files.front(), "r"); if (!in) - throw runtime_error("cannot open file: " + reads_files.front()); + throw std::runtime_error("cannot open file: " + reads_files.front()); FASTQRecord r; while (in >> r && summary.n_reads < reads_to_check) { @@ -353,14 +377,18 @@ main_guessprotocol(int argc, char *argv[]) { if (!outfile.empty()) { std::ofstream out(outfile); - if (!out) throw runtime_error("failed to open: " + outfile); - out << summary.tostring() << endl; + if (!out) + throw std::runtime_error("failed to open: " + outfile); + out << summary.tostring() << '\n'; } - else cout << summary.tostring() << endl; + else + std::cout << summary.tostring() << '\n'; } - catch (const runtime_error &e) { - cerr << e.what() << endl; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-pointer-arithmetic,*-constant-array-index,*-narrowing-conversions,cert-err09-cpp,cert-err61-cpp) From 3ed5a295f5c6923137d543249fceca3d18f2759f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 071/106] src/utils/kmersites.cpp: changes to add static analysis --- src/utils/kmersites.cpp | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/utils/kmersites.cpp b/src/utils/kmersites.cpp index 15318cda..6c7cf29a 100644 --- a/src/utils/kmersites.cpp +++ b/src/utils/kmersites.cpp @@ -17,23 +17,23 @@ * General Public License for more details. */ +#include "OptionParser.hpp" + +#include + #include -#include // for [u]int[0-9]+_t -#include +#include +#include +#include #include #include #include -#include #include #include #include +#include #include -#include "OptionParser.hpp" -#include "smithlab_os.hpp" - -#include - static inline auto process_chrom_wig(const std::string &kmer, const int offset, const std::string &name, const std::string &chrom, @@ -54,7 +54,7 @@ process_chrom_wig(const std::string &kmer, const int offset, const auto end_chrom = std::cend(chrom); auto chrom_itr = std::cbegin(chrom); - auto chrom_itr_k = chrom_itr + kmer_size; + auto chrom_itr_k = chrom_itr + kmer_size; // NOLINT(*-narrowing-conversions) auto pos = 0; while (chrom_itr_k != end_chrom) { @@ -67,7 +67,6 @@ process_chrom_wig(const std::string &kmer, const int offset, [[nodiscard]] static auto read_fasta_file(const std::string &filename) -> std::tuple, std::vector> { - std::ifstream in(filename); if (!in) throw std::runtime_error("cannot open input file " + filename); @@ -96,7 +95,6 @@ process_chrom_with_named_lines(const std::string &kmer, const int offset, const std::string &name, const std::string &chrom, bamxx::bgzf_file &out) { - const auto kmer_size = size(kmer); const auto chrom_size = size(chrom); if (kmer_size > chrom_size) @@ -109,7 +107,7 @@ process_chrom_with_named_lines(const std::string &kmer, const int offset, const auto end_chrom = std::cend(chrom); auto chrom_itr = std::cbegin(chrom); - auto chrom_itr_k = chrom_itr + kmer_size; + auto chrom_itr_k = chrom_itr + kmer_size; // NOLINT(*-narrowing-conversions) auto pos = 0; while (chrom_itr_k != end_chrom) { @@ -129,9 +127,8 @@ bad_dna_kmer(const std::string &kmer) -> bool { } auto -kmersites(const int argc, char *argv[]) -> int { +kmersites(const int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) try { - bool verbose{false}; bool show_progress{false}; bool compress_output{false}; @@ -194,7 +191,7 @@ kmersites(const int argc, char *argv[]) -> int { << "[command line: " << cmd.str() << "]\n"; auto [names, 
chroms] = read_fasta_file(chroms_file); - for (auto &chrom : chroms) + for (auto &chrom : chroms) // cppcheck-suppress constVariableReference std::transform(std::cbegin(chrom), std::cend(chrom), std::begin(chrom), [](const char c) { return std::toupper(c); }); From 09335fa53a9e6717eb4f1de074920ecfe44625c2 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 072/106] src/utils/lc-approx.cpp: changes to add static analysis --- src/utils/lc-approx.cpp | 124 +++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 66 deletions(-) diff --git a/src/utils/lc-approx.cpp b/src/utils/lc-approx.cpp index 479ae690..6d38010e 100644 --- a/src/utils/lc-approx.cpp +++ b/src/utils/lc-approx.cpp @@ -16,122 +16,114 @@ * GNU General Public License for more details. */ -#include -#include +#include "OptionParser.hpp" +#include "smithlab_os.hpp" + +#include +#include +#include +#include +#include #include -#include #include -#include +#include // IWYU pragma: keep #include +#include +#include -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "OptionParser.hpp" - -using std::string; -using std::ios_base; -using std::vector; -using std::cout; -using std::cerr; -using std::endl; -using std::runtime_error; - -static size_t -get_approx_line_count(const bool VERBOSE, const string &filename, - const size_t n_samples, size_t sample_size) { +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) - static const size_t megabyte = (1ul << 20); - static const size_t kilobyte = (1ul << 10); +static std::size_t +get_approx_line_count(const bool VERBOSE, const std::string &filename, + const std::size_t n_samples, std::size_t sample_size) { + static const std::size_t megabyte = (1ul << 20); + static const std::size_t kilobyte = (1ul << 10); - const size_t filesize = get_filesize(filename); + const std::size_t filesize = get_filesize(filename); if (sample_size == 0) - sample_size = std::min(megabyte/10, filesize/n_samples); + sample_size = std::min(megabyte / 10, filesize / n_samples); - const size_t increment = - std::floor((filesize - sample_size*n_samples)/ - (n_samples - 1.0)) + sample_size; + const std::size_t increment = + std::floor((filesize - sample_size * n_samples) / (n_samples - 1.0)) + + sample_size; assert(filesize > n_samples && filesize > sample_size && - filesize > n_samples*sample_size); + filesize > n_samples * sample_size); if (VERBOSE) { - cerr << "[PROCESSING FILE: " << filename << "]" << endl - << "[FILESIZE: " - << static_cast(filesize)/megabyte << "MB]" << endl - << "[CHUNK SIZE: " - << static_cast(1.0*sample_size/kilobyte) << "KB]" << endl - << "[NUM CHUNKS: " << n_samples << "]" << endl - << "[TOTAL SAMPLE: " - << (1.0*n_samples*sample_size)/megabyte << "MB]" << endl; + std::cerr << "[PROCESSING FILE: " << filename << "]" << '\n' + << "[FILESIZE: " << static_cast(filesize) / megabyte + << "MB]" << '\n' + << "[CHUNK SIZE: " + << static_cast(1.0 * sample_size / kilobyte) << "KB]" + << '\n' + << "[NUM CHUNKS: " << n_samples << "]" << '\n' + << "[TOTAL SAMPLE: " << (1.0 * n_samples * sample_size) / megabyte + << "MB]" << '\n'; } - std::ifstream in(filename.c_str(), ios_base::binary); + std::ifstream in(filename, std::ios_base::binary); if (!in) - throw runtime_error("cannot open input file " + string(filename)); + throw std::runtime_error("cannot open input file " + filename); - vector buffer(sample_size); + std::vector buffer(sample_size); double total_lines = 0.0; - for (size_t i = 0; i < filesize && in.good(); i 
+= increment) { - in.seekg(i, ios_base::beg); + for (std::size_t i = 0; i < filesize && in.good(); i += increment) { + in.seekg(i, std::ios_base::beg); in.read(&buffer.front(), sample_size); if (in.good()) total_lines += (0.5 + count(buffer.begin(), buffer.end(), '\n')); } - return (filesize*total_lines)/(n_samples*sample_size); + return (filesize * total_lines) / (n_samples * sample_size); } - - int -main_lc_approx(int argc, char *argv[]) { +main_lc_approx(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - size_t n_samples = 100; - size_t sample_size = 0; + std::size_t n_samples = 100; + std::size_t sample_size = 0; bool VERBOSE = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "approximate line counting in large files", - " ..." ); + " ..."); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); opt_parse.add_opt("samples", 'n', "number of samples", false, n_samples); opt_parse.add_opt("size", 'z', "sample size (bytes)", false, sample_size); - vector leftover_args; + std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() < 1) { - cerr << opt_parse.help_message() << endl; + std::cerr << opt_parse.help_message() << '\n'; return EXIT_FAILURE; } - vector filenames(leftover_args); + std::vector filenames(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ - ////////////////////////////////////////////////////////////// - for (size_t i = 0; i < filenames.size(); ++i) - cout << filenames[i] << "\t" - << get_approx_line_count(VERBOSE, filenames[i], - n_samples, sample_size) << endl; + for (auto i = 0u; i < std::size(filenames); ++i) + std::cout << filenames[i] << "\t" + << get_approx_line_count(VERBOSE, filenames[i], n_samples, + sample_size) + << '\n'; } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From a8e0c69beba276e600e081cddf64c3c4317d5fee Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 073/106] src/utils/lift-filter.cpp: changes to add static analysis --- src/utils/lift-filter.cpp | 54 +++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/src/utils/lift-filter.cpp b/src/utils/lift-filter.cpp index 7a3686fb..ef932cec 100644 --- a/src/utils/lift-filter.cpp +++ b/src/utils/lift-filter.cpp @@ -16,23 +16,22 @@ * GNU General Public License for more details. 
*/ -#include -#include +#include "MSite.hpp" +#include "OptionParser.hpp" + +#include #include -#include +#include +#include #include -#include - -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "OptionParser.hpp" -#include "MSite.hpp" +#include +#include -using std::string; -using std::vector; using std::cerr; using std::endl; using std::runtime_error; +using std::string; +using std::vector; static bool same_chrom_pos_strand(const MSite &a, const MSite &b) { @@ -40,14 +39,14 @@ same_chrom_pos_strand(const MSite &a, const MSite &b) { } int -main_lift_filter(int argc, char *argv[]) { - try{ +main_lift_filter(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) + try { string pfile; bool VERBOSE = false; bool UNIQUE = false; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) "Process duplicated sites from fast-liftover output", ""); opt_parse.add_opt("output", 'o', "Output processed methcount", true, pfile); @@ -57,19 +56,19 @@ main_lift_filter(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string mfile(leftover_args.front()); @@ -80,9 +79,9 @@ main_lift_filter(int argc, char *argv[]) { throw runtime_error("cannot open input file: " + mfile); std::ofstream out(pfile); - //if (!of) - // throw runtime_error("cannot open output file: " + pfile); - //std::ostream out(of.rdbuf()); + // if (!of) + // throw runtime_error("cannot open output file: " + pfile); + // std::ostream out(of.rdbuf()); // read first site MSite curr_site; @@ -98,21 +97,16 @@ main_lift_filter(int argc, char *argv[]) { } else { if (!UNIQUE || site_is_unique) - out << curr_site << endl; + out << curr_site << '\n'; site_is_unique = true; curr_site = next_site; } } if (!UNIQUE || site_is_unique) - out << curr_site << endl; - - } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; + out << curr_site << '\n'; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 6424c68f1bd7f0d946831c5eb710a312b05cd5e8 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 074/106] src/utils/merge-bsrate.cpp: changes to add static analysis --- src/utils/merge-bsrate.cpp | 147 +++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 81 deletions(-) diff --git a/src/utils/merge-bsrate.cpp b/src/utils/merge-bsrate.cpp index 74f8f9e1..068062e1 100644 --- a/src/utils/merge-bsrate.cpp +++ b/src/utils/merge-bsrate.cpp @@ -18,83 +18,79 @@ * GNU General Public License for more details. 
*/ -#include -#include -#include -#include +#include "OptionParser.hpp" + +#include #include -#include -#include -#include -#include #include +#include +#include +#include +#include #include +#include +#include -#include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" -#include "GenomicRegion.hpp" - -#include "bsutils.hpp" - -using std::string; -using std::vector; -using std::cout; using std::cerr; +using std::cout; using std::endl; +using std::runtime_error; using std::setw; +using std::string; using std::stringstream; -using std::runtime_error; +using std::vector; + +// NOLINTBEGIN(*-owning-memory,*-avoid-magic-numbers,*-narrowing-conversions) -bool readline(std::vector& infiles, - std::vector& cur_line) { - for ( size_t i = 0; i < infiles.size(); ++i) { - if (infiles[i]->eof() ) +bool +readline(std::vector &infiles, std::vector &cur_line) { + for (size_t i = 0; i < infiles.size(); ++i) { + if (infiles[i]->eof()) return false; else - getline(*infiles[i],cur_line[i]); - if(!cur_line[i].compare("")) + getline(*infiles[i], cur_line[i]); + if (!cur_line[i].compare("")) return false; } return true; } int -main_merge_bsrate(int argc, char *argv[]) { - +main_merge_bsrate(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { bool VERBOSE = false; string outfile; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "Program to merge the " + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + "Program to merge the " "BS conversion rate from two sets of BS-seq " "reads mapped to a genome", ", ..., "); opt_parse.add_opt("output", 'o', "Name of output file (default: stdout)", false, outfile); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); - vector leftover_args; // list of mapped-read files to merge + vector leftover_args; // list of mapped-read files to merge opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.empty()) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } /****************** END COMMAND LINE OPTIONS *****************/ - vector infiles(leftover_args.size()); + vector infiles(leftover_args.size()); for (size_t i = 0; i < leftover_args.size(); ++i) { infiles[i] = new std::ifstream(leftover_args[i].c_str()); if (!infiles[i]) @@ -102,7 +98,8 @@ main_merge_bsrate(int argc, char *argv[]) { } std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); static const size_t precision_val = 5; out.precision(precision_val); @@ -120,18 +117,17 @@ main_merge_bsrate(int argc, char *argv[]) { size_t sum_pos = 0ul; size_t sum_neg = 0ul; size_t base = 1; - for (size_t i = 0; i < infiles.size(); ++i) - { - getline(*infiles[i],overall_line[i]); - getline(*infiles[i],pos_line[i]); - getline(*infiles[i],neg_line[i]); - getline(*infiles[i],title_line[i]); - } + for (size_t i = 0; i < infiles.size(); ++i) { + getline(*infiles[i], overall_line[i]); + getline(*infiles[i], pos_line[i]); + getline(*infiles[i], neg_line[i]); + getline(*infiles[i], title_line[i]); + } vector ostrings; ostrings.clear(); - while(readline(infiles, cur_line)) { + while (readline(infiles, cur_line)) { // declare all values vector p_total(cur_line.size()); vector n_total(cur_line.size()); @@ -149,12 +145,12 @@ main_merge_bsrate(int argc, char *argv[]) { vector all(cur_line.size()); vector err_rate(cur_line.size()); - for (size_t j=0; j< cur_line.size(); ++j) { - //parse the line + for (size_t j = 0; j < cur_line.size(); ++j) { + // parse the line stringstream ss(cur_line[j]); string item; vector elems; - while(getline(ss,item,'\t')){ + while (getline(ss, item, '\t')) { elems.push_back(item); } p_total[j] = strtod(elems[1].c_str(), NULL); @@ -178,11 +174,11 @@ main_merge_bsrate(int argc, char *argv[]) { size_t err_out = 0, all_out = 0; double prate_out = 0, nrate_out = 0, bthrate_out = 0, errrate_out = 0; - for (size_t k=0; k Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 075/106] src/utils/merge-methcounts.cpp: changes to add static analysis --- src/utils/merge-methcounts.cpp | 129 ++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 43 deletions(-) diff --git a/src/utils/merge-methcounts.cpp b/src/utils/merge-methcounts.cpp index 8aec4083..1f1e7bec 100644 --- a/src/utils/merge-methcounts.cpp +++ b/src/utils/merge-methcounts.cpp @@ -18,25 +18,31 @@ #include "MSite.hpp" #include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" #include #include -#include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include -#include #include #include #include #include #include +#include #include +// NOLINTBEGIN(*-owning-memory,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) + static void set_invalid(MSite &s) { s.pos = std::numeric_limits::max(); @@ -52,7 +58,6 @@ any_sites_unprocessed( const std::vector &filenames, const std::vector> &infiles, std::vector &outdated, std::vector &sites) { - const std::size_t n_files = std::size(sites); bool sites_remain = false; @@ -94,7 +99,6 @@ find_minimum_site( const std::vector &sites, const std::unordered_map &chroms_order, const std::vector &outdated) { - const std::size_t n_files = std::size(sites); // ms_id is the id of the minimum site, the next one to print @@ -128,7 +132,6 @@ collect_sites_to_print( const std::vector &sites, const std::unordered_map &chroms_order, const std::vector &outdated, std::vector &to_print) { - const std::size_t n_files = std::size(sites); const std::size_t min_site_idx = @@ -151,13 +154,14 @@ any_mutated(const std::vector &to_print, return (i < std::size(sites)); } +template static void -write_line_for_tabular(const bool write_fractional, +write_line_for_tabular(std::array &buffer, + const bool write_fractional, const bool report_any_mutated, const std::size_t min_reads, std::ostream &out, const std::vector &to_print, const std::vector &sites, MSite min_site) { - const 
std::size_t n_files = std::size(sites); min_site.set_unmutated(); @@ -165,33 +169,69 @@ write_line_for_tabular(const bool write_fractional, min_site.set_mutated(); // ADS: is this the format we want for the row names? - out << min_site.chrom << ':' << min_site.pos << ':' << min_site.strand << ':' - << min_site.context; + auto cursor = buffer.data(); + auto bytes_left = buffer_size; + { + const auto n_bytes = + std::snprintf(cursor, bytes_left, "%s:%zu:%c:%s", min_site.chrom.data(), + min_site.pos, min_site.strand, min_site.context.data()); + if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) + throw std::runtime_error("failed to write output line"); + cursor += n_bytes; + bytes_left -= n_bytes; + } + + // out << min_site.chrom << ':' << min_site.pos << ':' << min_site.strand << + // ':' + // << min_site.context; if (write_fractional) { for (std::size_t i = 0; i < n_files; ++i) { const std::size_t r = sites[i].n_reads; - if (to_print[i] && r >= min_reads) - out << '\t' << sites[i].meth; - else - out << '\t' << "NA"; + const auto n_bytes = [&] { + return (to_print[i] && r >= min_reads) + ? std::snprintf(cursor, bytes_left, "\t%.6g", sites[i].meth) + : std::snprintf(cursor, bytes_left, "\tNA"); + }(); + if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) + throw std::runtime_error("failed to write output line"); + cursor += n_bytes; + bytes_left -= n_bytes; } } else for (std::size_t i = 0; i < n_files; ++i) { - if (to_print[i]) - out << '\t' << sites[i].n_reads << '\t' << sites[i].n_meth(); - else - out << '\t' << 0 << '\t' << 0; + const auto n_bytes = [&] { + return to_print[i] + ? std::snprintf(cursor, bytes_left, "\t%zu\t%.6g", + sites[i].n_reads, sites[i].n_meth_f()) + : std::snprintf(cursor, bytes_left, "\t0\t0"); + }(); + if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) + throw std::runtime_error("failed to write output line"); + cursor += n_bytes; + bytes_left -= n_bytes; + // if (to_print[i]) + // out << '\t' << sites[i].n_reads << '\t' << sites[i].n_meth(); + // else + // out << '\t' << 0 << '\t' << 0; } - out << '\n'; + if (std::distance(buffer.data(), cursor) + 1 < + static_cast(buffer_size)) + *cursor++ = '\n'; + + if (std::distance(buffer.data(), cursor) < + static_cast(buffer_size)) { + *cursor++ = '\0'; + out.write(buffer.data(), + std::distance(buffer.data(), cursor)); // "\n"); // out << '\n'; + } } static void write_line_for_merged_counts(std::ostream &out, const bool report_any_mutated, const std::vector &to_print, const std::vector &sites, MSite min_site) { - const std::size_t n_files = std::size(sites); min_site.set_unmutated(); @@ -212,7 +252,7 @@ write_line_for_merged_counts(std::ostream &out, const bool report_any_mutated, [[nodiscard]] static std::string remove_extension(const std::string &filename) { - const std::size_t last_dot = filename.find_last_of("."); + const std::size_t last_dot = filename.find_last_of('.'); if (last_dot == std::string::npos) return filename; else @@ -254,7 +294,6 @@ get_orders_by_file(const std::string &filename, static void get_chroms_order(const std::vector &filenames, std::unordered_map &chroms_order) { - // get order of chroms in each file std::vector> orders(std::size(filenames)); for (std::size_t i = 0; i < std::size(filenames); ++i) @@ -326,9 +365,9 @@ get_chroms_order(const std::vector &filenames, output is in counts or fractions. 
*/ int -main_merge_methcounts(int argc, char *argv[]) { +main_merge_methcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) + static constexpr auto buffer_size = 65536; try { - static const std::string description = "merge multiple methcounts files"; std::string outfile; @@ -347,10 +386,10 @@ main_merge_methcounts(int argc, char *argv[]) { std::size_t min_reads = 1; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - ""); - opt_parse.add_opt("output", 'o', "output file name (default: stdout)", - false, outfile); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); + opt_parse.add_opt("output", 'o', "output file name (default: stdout)", true, + outfile); opt_parse.add_opt("header", 'H', "header to print (ignored for tabular)", false, header_info); opt_parse.add_opt("tabular", 't', "output as table", false, @@ -438,21 +477,20 @@ main_merge_methcounts(int argc, char *argv[]) { throw std::runtime_error("cannot open file: " + meth_files[i]); } - std::ofstream of; - if (!outfile.empty()) - of.open(outfile); - std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open output file: " + outfile); // print header if user specifies or if tabular output format if (write_tabular_format) { - - std::vector colnames; - for (const auto &i : meth_files) - colnames.push_back(strip_path(i)); - - for (auto &i : colnames) - i = suffix_to_remove.empty() ? remove_extension(i) - : remove_suffix(suffix_to_remove, i); + std::vector colnames(std::size(meth_files)); + std::transform(std::cbegin(meth_files), std::cend(meth_files), + std::begin(colnames), [&](const auto &x) { + auto fn = std::filesystem::path{x}.filename().string(); + return suffix_to_remove.empty() + ? 
remove_extension(fn) + : remove_suffix(suffix_to_remove, fn); + }); if (!write_fractional && !radmeth_format) { std::vector tmp; @@ -477,6 +515,8 @@ main_merge_methcounts(int argc, char *argv[]) { std::vector sites_to_print; // declared here to keep allocation std::vector> chroms_seen(n_files); + std::array buffer{}; + while (any_sites_unprocessed(meth_files, infiles, outdated, sites)) { sites_to_print.clear(); sites_to_print.resize(n_files, false); @@ -487,8 +527,9 @@ main_merge_methcounts(int argc, char *argv[]) { // output the appropriate sites' data if (write_tabular_format) - write_line_for_tabular(write_fractional, report_any_mutated, min_reads, - out, sites_to_print, sites, sites[idx]); + write_line_for_tabular(buffer, write_fractional, report_any_mutated, + min_reads, out, sites_to_print, sites, + sites[idx]); else write_line_for_merged_counts(out, report_any_mutated, sites_to_print, sites, sites[idx]); @@ -502,3 +543,5 @@ main_merge_methcounts(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-owning-memory,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) From 2087ea0441c3f5b4d60a0580a447a7fe89021880 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 076/106] src/utils/recovered.cpp: changes to add static analysis --- src/utils/recovered.cpp | 64 +++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/src/utils/recovered.cpp b/src/utils/recovered.cpp index 7e834719..dc89254d 100644 --- a/src/utils/recovered.cpp +++ b/src/utils/recovered.cpp @@ -17,28 +17,33 @@ */ #include "MSite.hpp" +#include "OptionParser.hpp" #include "bsutils.hpp" #include "counts_header.hpp" +#include "smithlab_os.hpp" #include -// from smithlab_cpp -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - +#include +#include +#include #include +#include +#include #include +#include +#include +#include #include #include #include #include +#include #include static void verify_chrom_orders( - const bool verbose, const std::uint32_t n_threads, - const std::string &filename, + const bool verbose, const std::int32_t n_threads, const std::string &filename, const std::unordered_map &chroms_order) { bamxx::bgzf_file in(filename, "r"); if (!in) @@ -96,8 +101,8 @@ struct quick_buf : public std::ostringstream, setp(pbase(), pbase()); } char const * - c_str() { - /* between c_str and insertion make sure to clear() */ + data() { + /* between data and insertion make sure to clear() */ *pptr() = '\0'; return pbase(); } @@ -156,13 +161,15 @@ get_tag_from_genome_g(const std::string &s, const std::size_t pos) { return 3; } -static const char *tag_values[] = { +// NOLINTBEGIN(*-avoid-c-arrays) +static const char *const tag_values[] = { "CpG", // 0 "CHH", // 1 "CXG", // 2 "CCG", // 3 "N" // 4 }; +// NOLINTEND(*-avoid-c-arrays) static void write_missing_sites(const std::string &name, const std::string &chrom, @@ -177,9 +184,11 @@ write_missing_sites(const std::string &name, const std::string &chrom, const std::uint32_t the_tag = is_c ? get_tag_from_genome_c(chrom, pos) : get_tag_from_genome_g(chrom, pos); buf.clear(); + // NOLINTBEGIN(*-constant-array-index) buf << name_tab << pos << (is_c ? 
"\t+\t" : "\t-\t") << tag_values[the_tag] << "\t0\t0\n"; - if (!out.write(buf.c_str(), buf.tellp())) + // NOLINTEND(*-constant-array-index) + if (!out.write(buf.data(), buf.tellp())) throw std::runtime_error("error writing output"); } } @@ -189,7 +198,7 @@ static void write_current_site(const MSite &site, bamxx::bgzf_file &out) { quick_buf buf; // keep underlying buffer space? buf << site << '\n'; - if (!out.write(buf.c_str(), buf.tellp())) + if (!out.write(buf.data(), buf.tellp())) throw std::runtime_error("error writing site: " + site.tostring()); } @@ -215,10 +224,9 @@ get_chrom_idx(const std::unordered_map &name_to_idx, static void process_sites(const bool verbose, const bool add_missing_chroms, - const bool compress_output, const std::size_t n_threads, + const bool compress_output, const std::int32_t n_threads, const std::string &infile, const std::string &outfile, const std::string &chroms_file) { - // first get the chromosome names and sequences from the FASTA file std::vector chroms, names; read_fasta_file_short_names(chroms_file, names, chroms); @@ -231,10 +239,10 @@ process_sites(const bool verbose, const bool add_missing_chroms, std::unordered_map chrom_lookup; std::unordered_map name_to_idx; std::vector chrom_sizes(size(chroms), 0); - for (std::size_t i = 0; i < size(chroms); ++i) { - chrom_lookup[names[i]] = cbegin(chroms) + i; - name_to_idx[names[i]] = i; - chrom_sizes[i] = size(chroms[i]); + for (auto i = 0u; i < std::size(chroms); ++i) { + chrom_lookup[names[i]] = cbegin(chroms) + static_cast(i); + name_to_idx[names[i]] = static_cast(i); + chrom_sizes[i] = std::size(chroms[i]); } if (add_missing_chroms) @@ -264,7 +272,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, // ADS: this is probably a poor strategy since we already would know // the index of the chrom sequence in the vector. 
- chrom_itr_t chrom_itr; + chrom_itr_t chrom_itr{}; std::string line; while (getline(in, line)) { @@ -272,9 +280,10 @@ process_sites(const bool verbose, const bool add_missing_chroms, write_counts_header_line(line, out); continue; } - site.initialize(line.data(), line.data() + size(line)); + // NOLINTNEXTLINE(*-pointer-arithmetic) + if (!site.initialize(line.data(), line.data() + std::size(line))) + throw std::runtime_error("failed to parse line:\n" + line); if (site.chrom != chrom_name) { - if (pos != std::numeric_limits::max()) write_missing_sites(chrom_name, *chrom_itr, pos, size(*chrom_itr), out); @@ -302,7 +311,7 @@ process_sites(const bool verbose, const bool add_missing_chroms, write_missing_sites(chrom_name, *chrom_itr, pos, size(*chrom_itr), out); if (add_missing_chroms) { - const std::int32_t chrom_idx = size(chroms); + const std::int32_t chrom_idx = static_cast(std::size(chroms)); for (auto i = prev_chrom_idx + 1; i < chrom_idx; ++i) { if (verbose) std::cerr << "processing: " << names[i] << " (missing)\n"; @@ -312,13 +321,12 @@ process_sites(const bool verbose, const bool add_missing_chroms, } int -main_recovered(int argc, char *argv[]) { +main_recovered(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - bool verbose = false; bool add_missing_chroms = false; bool compress_output = false; - std::size_t n_threads = 1; + std::int32_t n_threads = 1; std::string outfile; std::string chroms_file; @@ -326,8 +334,8 @@ main_recovered(int argc, char *argv[]) { "add sites that are missing as non-covered sites"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file (required)", true, outfile); opt_parse.add_opt("missing", 'm', "add missing chroms", false, add_missing_chroms); @@ -362,7 +370,7 @@ main_recovered(int argc, char *argv[]) { filename, outfile, chroms_file); } catch (const std::exception &e) { - std::cerr << e.what() << "\n"; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 66271d2724f76dae1a7ebd833e0394b2af692201 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 077/106] src/utils/selectsites.cpp: changes to add static analysis --- src/utils/selectsites.cpp | 68 ++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/src/utils/selectsites.cpp b/src/utils/selectsites.cpp index b0389ac4..92bde671 100644 --- a/src/utils/selectsites.cpp +++ b/src/utils/selectsites.cpp @@ -16,27 +16,36 @@ * GNU General Public License for more details. 
*/ -#include +#include "GenomicRegion.hpp" +#include "MSite.hpp" +#include "OptionParser.hpp" + #include + +#include + +#include + +#include +#include +#include +#include +#include #include #include #include #include #include +#include +#include #include -#include +#include #include +#include #include -#include "GenomicRegion.hpp" -#include "MSite.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - using std::cerr; using std::cout; -using std::endl; using std::ios_base; using std::runtime_error; using std::string; @@ -69,7 +78,6 @@ struct quick_buf : public std::ostringstream, }; struct selectsites_summary { - // command_line is the command used to produce this summary file and // the corresponding results std::string command_line{}; @@ -132,13 +140,13 @@ write_stats_output(const selectsites_summary &summary, std::ofstream out_summary(summary_file); if (!out_summary) throw runtime_error("bad summary output file"); - out_summary << summary.tostring() << endl; + out_summary << summary.tostring() << '\n'; } } static void collapsebed(std::vector ®ions) { - std::size_t j = 0; + std::ptrdiff_t j = 0; for (std::size_t i = 1; i < std::size(regions); ++i) { if (regions[j].same_chrom(regions[i]) && regions[i].get_start() <= regions[j].get_end()) { @@ -148,7 +156,7 @@ collapsebed(std::vector ®ions) { regions[++j] = regions[i]; } } - regions.erase(begin(regions) + j + 1, end(regions)); + regions.erase(std::begin(regions) + j + 1, std::end(regions)); } static inline bool @@ -182,7 +190,7 @@ process_all_sites( ++n_sites_total; if (the_site.chrom != prev_site.chrom) { if (VERBOSE) - cerr << "processing " << the_site.chrom << endl; + cerr << "processing " << the_site.chrom << '\n'; const auto r = regions.find(the_site.chrom); chrom_is_relevant = (r != cend(regions)); if (chrom_is_relevant) { @@ -234,7 +242,7 @@ get_sites_in_region(std::ifstream &site_in, const GenomicRegion ®ion, static auto process_with_sites_on_disk(const std::string &sites_file, - std::vector ®ions, + const std::vector ®ions, bgzf_file &out) -> std::uint64_t { std::ifstream in(sites_file); if (!in) @@ -276,21 +284,21 @@ is_compressed_file(const std::string &filename) { } static auto -get_command_line(const int argc, const char *const argv[]) -> std::string { +get_command_line(const int argc, + const char *const argv[] // NOLINT(*-avoid-c-arrays) + ) -> std::string { if (argc == 0) return std::string(); std::ostringstream cmd; cmd << '"'; copy(argv, argv + (argc - 1), std::ostream_iterator(cmd, " ")); - cmd << argv[argc - 1] << '"'; + cmd << argv[argc - 1] << '"'; // NOLINT(*-pointer-arithmetic) return cmd.str(); } int -main_selectsites(int argc, char *argv[]) { - +main_selectsites(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - bool VERBOSE = false; bool keep_file_on_disk = false; bool compress_output = false; @@ -305,8 +313,8 @@ main_selectsites(int argc, char *argv[]) { "Intervals must be specified in bed format."; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - " ", 2); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " ", 2); opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, outfile); opt_parse.add_opt("disk", 'd', @@ -323,20 +331,20 @@ main_selectsites(int argc, char *argv[]) { std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << 
opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (std::size(leftover_args) != 2) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const std::string regions_file = leftover_args.front(); @@ -354,7 +362,7 @@ main_selectsites(int argc, char *argv[]) { if (is_compressed_file(sites_file)) { keep_file_on_disk = false; if (VERBOSE) - cerr << "input file is so must be loaded" << endl; + cerr << "input file is so must be loaded\n"; } std::vector regions; @@ -368,7 +376,7 @@ main_selectsites(int argc, char *argv[]) { collapsebed(regions); if (VERBOSE && n_orig_regions != std::size(regions)) cerr << "[number of regions merged due to overlap: " - << n_orig_regions - std::size(regions) << "]" << endl; + << n_orig_regions - std::size(regions) << "]\n"; std::tie(summary.n_target_regions_collapsed, summary.target_region_collapsed_size) = @@ -394,7 +402,7 @@ main_selectsites(int argc, char *argv[]) { write_stats_output(summary, summary_file); } catch (const std::exception &e) { - cerr << e.what() << endl; + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 1fff8fab8bb4a0b62559841615ffbdb91e1bd482 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 078/106] src/utils/symmetric-cpgs.cpp: changes to add static analysis --- src/utils/symmetric-cpgs.cpp | 59 ++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/src/utils/symmetric-cpgs.cpp b/src/utils/symmetric-cpgs.cpp index 1884a8fa..0caf2b69 100644 --- a/src/utils/symmetric-cpgs.cpp +++ b/src/utils/symmetric-cpgs.cpp @@ -16,24 +16,27 @@ * General Public License for more details. 
*/ +#include "MSite.hpp" +#include "OptionParser.hpp" +#include "counts_header.hpp" + #include +#include +#include +#include +#include #include #include #include +#include #include #include +#include #include +#include #include -// from smithlab_cpp -#include "MSite.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "counts_header.hpp" - using std::cerr; using std::cout; using std::endl; @@ -41,8 +44,6 @@ using std::runtime_error; using std::string; using std::unordered_set; -using bamxx::bgzf_file; - static inline bool found_symmetric(const MSite &prev, const MSite &curr) { // assumes check for CpG already done @@ -67,7 +68,9 @@ get_first_site(T &in, T &out) { write_counts_header_line(line, out); } else { - prev_site.initialize(line.data(), line.data() + size(line)); + // NOLINTNEXTLINE(*-pointer-arithmetic) + if (!prev_site.initialize(line.data(), line.data() + std::size(line))) + throw std::runtime_error("failed to parse line:\n" + line); if (prev_site.is_cpg()) prev_is_cpg = true; within_header = false; @@ -79,7 +82,6 @@ get_first_site(T &in, T &out) { template static bool process_sites(const bool verbose, T &in, T &out) { - // get the first site while dealing with the header auto [prev_site, prev_is_cpg] = get_first_site(in, out); @@ -105,7 +107,7 @@ process_sites(const bool verbose, T &in, T &out) { } else { if (verbose) - cerr << "processing: " << curr_site.chrom << endl; + cerr << "processing: " << curr_site.chrom << '\n'; if (chroms_seen.find(curr_site.chrom) != cend(chroms_seen)) return false; chroms_seen.insert(curr_site.chrom); @@ -127,7 +129,7 @@ process_sites(const bool verbose, T &in, T &out) { } int -main_symmetric_cpgs(int argc, char *argv[]) { +main_symmetric_cpgs(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { // file types from HTSlib use "-" for the filename to go to stdout string outfile{"-"}; @@ -140,8 +142,8 @@ main_symmetric_cpgs(int argc, char *argv[]) { "Get CpG sites and make methylation levels symmetric."; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, - ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file (default: stdout)", false, outfile); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); @@ -152,20 +154,20 @@ main_symmetric_cpgs(int argc, char *argv[]) { std::vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string filename(leftover_args.front()); @@ -178,7 +180,7 @@ main_symmetric_cpgs(int argc, char *argv[]) { bamxx::bam_tpool tp(n_threads); // const bool show_progress = VERBOSE && isatty(fileno(stderr)); - bgzf_file in(filename, "r"); + bamxx::bgzf_file in(filename, "r"); if (!in) throw runtime_error("could 
not open file: " + filename); @@ -197,16 +199,15 @@ main_symmetric_cpgs(int argc, char *argv[]) { const bool sites_are_sorted = process_sites(verbose, in, out); if (!sites_are_sorted) { - cerr << "sites are not sorted in: " << filename << endl; - namespace fs = std::filesystem; - const fs::path outpath{outfile}; - if (fs::exists(outpath)) - fs::remove(outpath); + cerr << "sites are not sorted in: " << filename << '\n'; + const std::filesystem::path outpath{outfile}; + if (std::filesystem::exists(outpath) && !std::filesystem::remove(outpath)) + throw std::runtime_error("failed to remove file: " + outpath.string()); return EXIT_FAILURE; } } - catch (const runtime_error &e) { - cerr << e.what() << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From c135b741bc16a7070f5c14cf2353c998a62327af Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 079/106] src/utils/uniq.cpp: changes to add static analysis --- src/utils/uniq.cpp | 76 +++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/src/utils/uniq.cpp b/src/utils/uniq.cpp index 9339ba75..5b284c91 100644 --- a/src/utils/uniq.cpp +++ b/src/utils/uniq.cpp @@ -18,22 +18,27 @@ * more details. */ -#include "GenomicRegion.hpp" #include "OptionParser.hpp" #include "bam_record_utils.hpp" -#include "bsutils.hpp" #include "dnmt_error.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" -// generated by autotools +#include "bamxx.hpp" + #include -#include // for [u]int[0-9]+_t +#include +#include +#include +#include +#include #include +#include +#include #include +#include #include #include +#include #include namespace uniq_random { @@ -42,9 +47,9 @@ namespace uniq_random { // could be different. This meant testing didn't work. 
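The comments here concern the seeded generator declared just below: a default_random_engine plus a uniform_int_distribution, seeded with 408 so that the choice among duplicate reads comes out the same on every run. A minimal standalone sketch of that selection, with the size_t distribution type filled in as an assumption for the elided template argument:

#include <cstddef>
#include <iostream>
#include <random>
#include <string>
#include <vector>

int
main() {
  // fixed seed, as in the patch, so repeated runs pick the same element
  std::default_random_engine gen(408);
  // assumed to be uniform_int_distribution<std::size_t> in the real code
  std::uniform_int_distribution<std::size_t> dist;
  const std::vector<std::string> duplicates{"read_a", "read_b", "read_c"};
  const std::size_t selected = dist(gen) % duplicates.size();
  std::cout << duplicates[selected] << '\n';
}

This mirrors how process_inner_buffer further down takes the generated value modulo the number of duplicate reads to keep one representative; the fixed seed is what makes that choice deterministic, which the surrounding comments say earlier randomness broke in testing.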
// ADS: (TODO) refactor this -bool initialized{false}; -std::default_random_engine e; -std::uniform_int_distribution di; +bool initialized{false}; // NOLINT +std::default_random_engine e; // NOLINT +std::uniform_int_distribution di; // NOLINT void initialize(const std::size_t the_seed) { @@ -100,7 +105,7 @@ struct uniq_summary { [[nodiscard]] double non_duplicate_fraction() const { return static_cast(unique_reads - duplicate_reads) / - std::max(1ul, total_reads); + static_cast(std::max(1ul, total_reads)); } // duplication_rate is the average number of duplicates for the reads with @@ -108,19 +113,19 @@ struct uniq_summary { [[nodiscard]] double duplication_rate() const { return static_cast(reads_removed() + duplicate_reads) / - std::max(1ul, duplicate_reads); + static_cast(std::max(1ul, duplicate_reads)); } std::string to_string() { std::ostringstream oss; - oss << "total_reads: " << total_reads << "\n" - << "total_bases: " << total_bases << "\n" - << "unique_reads: " << unique_reads << "\n" - << "unique_read_bases: " << unique_read_bases << "\n" - << "non_duplicate_fraction: " << non_duplicate_fraction() << "\n" - << "duplicate_reads: " << duplicate_reads << "\n" - << "reads_removed: " << reads_removed() << "\n" + oss << "total_reads: " << total_reads << '\n' + << "total_bases: " << total_bases << '\n' + << "unique_reads: " << unique_reads << '\n' + << "unique_read_bases: " << unique_read_bases << '\n' + << "non_duplicate_fraction: " << non_duplicate_fraction() << '\n' + << "duplicate_reads: " << duplicate_reads << '\n' + << "reads_removed: " << reads_removed() << '\n' << "duplication_rate: " << duplication_rate(); return oss.str(); } @@ -133,7 +138,7 @@ write_stats_output(const rd_stats &rs_in, const rd_stats &rs_out, std::ofstream out_stat(statfile); if (!out_stat) throw std::runtime_error("bad stats output file"); - out_stat << summary.to_string() << "\n"; + out_stat << summary.to_string() << '\n'; } static void @@ -154,19 +159,19 @@ static void process_inner_buffer(const bool add_dup_count, const std::vector::iterator it, const std::vector::iterator jt, - bamxx::bam_header &hdr, bamxx::bam_out &out, + const bamxx::bam_header &hdr, bamxx::bam_out &out, rd_stats &rs_out, std::size_t &reads_duped, std::vector &hist) { - constexpr char du_tag[2] = {'D', 'U'}; + constexpr char du_tag[2] = {'D', 'U'}; // NOLINT(*-avoid-c-arrays) const std::size_t n_reads = std::distance(it, jt); const std::size_t selected = uniq_random::rand() % n_reads; + // NOLINTBEGIN(*-narrowing-conversions,*-array-to-pointer-decay) if (add_dup_count) { const int ret = bam_aux_update_int(*(it + selected), du_tag, n_reads); if (ret < 0) throw dnmt_error("error adding duplicate count aux field"); } - if (!out.write(hdr, *(it + selected))) throw std::runtime_error("failed writing bam record"); if (hist.size() <= n_reads) @@ -174,6 +179,7 @@ process_inner_buffer(const bool add_dup_count, hist[n_reads]++; rs_out.update(*(it + selected)); reads_duped += (n_reads > 1); + // NOLINTEND(*-narrowing-conversions,*-array-to-pointer-decay) } /* The buffer corresponds to reads sharing the same mapping chromosome @@ -181,8 +187,8 @@ process_inner_buffer(const bool add_dup_count, static void process_buffer(const bool add_dup_count, rd_stats &rs_out, std::size_t &reads_duped, std::vector &hist, - std::vector &buffer, bamxx::bam_header &hdr, - bamxx::bam_out &out) { + std::vector &buffer, + const bamxx::bam_header &hdr, bamxx::bam_out &out) { std::sort(std::begin(buffer), std::end(buffer), precedes_by_end_and_strand); auto it = 
std::begin(buffer); auto jt = it + 1; @@ -199,7 +205,7 @@ process_buffer(const bool add_dup_count, rd_stats &rs_out, static void uniq(const bool add_dup_count, const std::uint32_t max_buffer_size, - const std::size_t n_threads, const std::string &cmd, + const std::int32_t n_threads, const std::string &cmd, const std::string &infile, const std::string &statfile, const std::string &histfile, const bool bam_format, const std::string &outfile) { @@ -273,7 +279,7 @@ uniq(const bool add_dup_count, const std::uint32_t max_buffer_size, out); if (size(buffer) < max_buffer_size || add_dup_count) buffer.push_back(aln); - else if (!add_dup_count) + else std::swap(buffer[uniq_random::rand() % max_buffer_size], aln); } process_buffer(add_dup_count, rs_out, reads_duped, hist, buffer, hdr, out); @@ -287,7 +293,7 @@ uniq(const bool add_dup_count, const std::uint32_t max_buffer_size, } int -main_uniq(int argc, char *argv[]) { +main_uniq(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { std::uint32_t max_buffer_size = std::numeric_limits::max(); bool VERBOSE = false; @@ -298,11 +304,11 @@ main_uniq(int argc, char *argv[]) { // ADS: Not recommended to change this seed. It shouldn't matter // at all, and we want results to behave as deterministic. - std::size_t the_seed = 408; + std::size_t the_seed = 408; // NOLINT std::string outfile; std::string statfile; std::string histfile; - std::size_t n_threads = 1; + std::int32_t n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse("dnmtools uniq", @@ -330,18 +336,18 @@ main_uniq(int argc, char *argv[]) { opt_parse.parse(argc, argv, leftover_args); if (opt_parse.about_requested() || opt_parse.help_requested() || leftover_args.empty()) { - std::cerr << opt_parse.help_message() << std::endl - << opt_parse.about_message() << "\n"; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - std::cerr << opt_parse.option_missing_message() << "\n"; + std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if ((leftover_args.size() == 1 && !use_stdout) || (leftover_args.size() == 2 && use_stdout)) { - std::cerr << opt_parse.help_message() << std::endl - << opt_parse.about_message() << "\n"; + std::cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } const std::string infile(leftover_args.front()); @@ -375,7 +381,7 @@ main_uniq(int argc, char *argv[]) { histfile, bam_format, outfile); } catch (const std::exception &e) { - std::cerr << e.what() << "\n"; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 9e6620c121cabab47bfef0896406e87624954f1e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 080/106] src/utils/unxcounts.cpp: changes to add static analysis --- src/utils/unxcounts.cpp | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/utils/unxcounts.cpp b/src/utils/unxcounts.cpp index 6a7bd519..90447d21 100644 --- a/src/utils/unxcounts.cpp +++ b/src/utils/unxcounts.cpp @@ -16,25 +16,37 @@ * General Public License for more details. 
*/ -#include "MSite.hpp" +#include "OptionParser.hpp" #include "bsutils.hpp" #include "counts_header.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" #include -// from smithlab_cpp -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include +#include +#include +#include +#include #include +#include +#include +#include #include +#include +#include +#include #include #include #include #include +#include #include +// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) + static void read_fasta_file_short_names_uppercase(const std::string &chroms_file, std::vector &names, @@ -390,12 +402,12 @@ process_sites(const bool verbose, const bool add_missing_chroms, std::string chrom_name; std::uint32_t nm_sz{}; - std::int32_t prev_chr_id = -1; + std::int32_t prev_chr_id{-1}; std::uint64_t pos = std::numeric_limits::max(); // ADS: this is probably a poor strategy since we already would know // the index of the chrom sequence in the vector. - chrom_itr_t ch_itr; + chrom_itr_t ch_itr{}; while (getline(in, line)) { if (is_counts_header_line(line.s)) { @@ -404,7 +416,6 @@ process_sites(const bool verbose, const bool add_missing_chroms, } if (!std::isdigit(line.s[0])) { // check if we have a chrom line - if (!require_covered && pos != std::numeric_limits::max()) write_missing(nm_sz, *ch_itr, pos + 1, size(*ch_itr), buf, out); @@ -516,7 +527,7 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms, // ADS: this is probably a poor strategy since we already would know // the index of the chrom sequence in the vector. - chrom_itr_t ch_itr; + chrom_itr_t ch_itr{}; while (getline(in, line)) { if (is_counts_header_line(line.s)) { @@ -525,7 +536,6 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms, } if (!std::isdigit(line.s[0])) { // check if we have a chrom line - if (!require_covered && pos != std::numeric_limits::max()) write_missing_cpg(nm_sz, *ch_itr, pos + 1, size(*ch_itr), buf, out); @@ -568,7 +578,7 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms, } int -main_unxcounts(int argc, char *argv[]) { +main_unxcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { bool verbose = false; bool add_missing_chroms = false; @@ -583,7 +593,8 @@ main_unxcounts(int argc, char *argv[]) { "convert compressed counts format back to full counts"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file (required)", true, outfile); opt_parse.add_opt("missing", 'm', "add missing chroms", false, add_missing_chroms); @@ -641,3 +652,5 @@ main_unxcounts(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) From 9942822d21e75d4c5ab607437bfe178b91093e22 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:02:32 -0800 Subject: [PATCH 081/106] src/utils/xcounts.cpp: changes to add static analysis --- src/utils/xcounts.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/utils/xcounts.cpp b/src/utils/xcounts.cpp index dfce6d67..42e7e3fa 100644 --- a/src/utils/xcounts.cpp +++ b/src/utils/xcounts.cpp 
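This patch and the neighboring ones rely on clang-tidy suppression comments wherever a finding is accepted rather than fixed. Three forms appear: NOLINT on the offending line, NOLINTNEXTLINE for the line that follows, and a NOLINTBEGIN/NOLINTEND pair around a region, each taking a check name or glob. A small self-contained sketch of the three forms (the function and values are illustrative only, not project code):

#include <iostream>

// NOLINTBEGIN(*-avoid-magic-numbers)
static double
as_percent(const double part, const double total) {
  // the named check glob is disabled between BEGIN and END
  return 100.0 * part / total;
}
// NOLINTEND(*-avoid-magic-numbers)

int
main(int argc, char *argv[]) {  // NOLINT(*-avoid-c-arrays)
  // NOLINTNEXTLINE(*-pointer-arithmetic)
  const char *label = argc > 1 ? argv[1] : "reads";
  std::cout << label << ": " << as_percent(3.0, 4.0) << "%\n";
  return 0;
}

The check names accept globs, which is why the patches write patterns like *-avoid-c-arrays instead of spelling out full check names.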
@@ -17,27 +17,32 @@ */ #include "MSite.hpp" +#include "OptionParser.hpp" #include "counts_header.hpp" #include "dnmt_error.hpp" #include -// from smithlab_cpp -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include +#include #include #include +#include +#include #include -#include +#include #include #include -#include // std::underlying_type_t +#include +#include #include +#include #include -enum class xcounts_err { +// NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) + +enum class xcounts_err : std::uint8_t { // clang-format off ok = 0, open_failure = 1, @@ -96,7 +101,7 @@ fill_output_buffer(const std::uint32_t offset, const MSite &s, T &buf) { } int -main_xcounts(int argc, char *argv[]) { +main_xcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { // ADS: It might happen that a "chromosome" has no CpG sites (like // Scaffold113377 in strPur2). Therefore, we can't assume each chrom will @@ -114,7 +119,8 @@ main_xcounts(int argc, char *argv[]) { "compress counts files by removing context information"; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " (\"-\" for standard input)", 1); opt_parse.add_opt("output", 'o', "output file (default is standard out)", false, outfile); @@ -277,3 +283,5 @@ main_xcounts(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*-narrowing-conversions,*-constant-array-index,*-pointer-arithmetic) From 75f078ca95c94574ff227a70ad670fd56ae2501b Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:03:38 -0800 Subject: [PATCH 082/106] adding static analysis config files --- .clang-format | 12 ++++++++++++ .clang-tidy | 31 +++++++++++++++++++++++++++++++ .cppcheck_suppress | 44 ++++++++++++++++++++++++++++++++++++++++++++ CPPLINT.cfg | 36 ++++++++++++++++++++++++++++++++++++ iwyu.json | 5 +++++ 5 files changed, 128 insertions(+) create mode 100644 .clang-format create mode 100644 .clang-tidy create mode 100644 .cppcheck_suppress create mode 100644 CPPLINT.cfg create mode 100644 iwyu.json diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..07139cf1 --- /dev/null +++ b/.clang-format @@ -0,0 +1,12 @@ +BasedOnStyle: LLVM +ColumnLimit: 80 +IndentWidth: 2 +AlwaysBreakAfterReturnType: All +ContinuationIndentWidth: 2 +ConstructorInitializerIndentWidth: 2 +BraceWrapping: + BeforeElse: true + BeforeCatch: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +SpacesBeforeTrailingComments: 2 diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 00000000..fd5b64ea --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,31 @@ +Checks: 'cert-*,cppcoreguidelines-*,performance-*,clang-diagnostic-*,clang-analyzer-*,-clang-diagnostic-unqualified-std-cast-call,-clang-diagnostic-unknown-warning-option,-clang-analyzer-unix.BlockInCriticalSection,-cppcoreguidelines-pro-type-vararg' +WarningsAsErrors: '*' +HeaderFileExtensions: + - '' + - h + - hh + - hpp + - hxx +ImplementationFileExtensions: + - c + - cc + - cpp + - cxx +HeaderFilterRegex: '' +ExcludeHeaderFilterRegex: 'OptionParser.hpp' +FormatStyle: none +CheckOptions: + cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU' + cert-err33-c.AllowCastToVoid: 'true' + 
cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;' + cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false' + cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false' + cppcoreguidelines-non-private-member-variables-in-classes.IgnorePublicMemberVariables: 'true' + google-readability-braces-around-statements.ShortStatementLines: '1' + google-readability-function-size.StatementThreshold: '800' + google-readability-namespace-comments.ShortNamespaceLines: '10' + google-readability-namespace-comments.SpacesBeforeComments: '2' + llvm-else-after-return.WarnOnConditionVariables: 'false' + llvm-else-after-return.WarnOnUnfixable: 'false' + llvm-qualified-auto.AddConstToQualified: 'false' +SystemHeaders: 'false' diff --git a/.cppcheck_suppress b/.cppcheck_suppress new file mode 100644 index 00000000..c925e537 --- /dev/null +++ b/.cppcheck_suppress @@ -0,0 +1,44 @@ +# MIT License +# +# Copyright (c) 2024 Andrew Smith +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +missingIncludeSystem +constVariablePointer +checkersReport +unknownMacro +unmatchedSuppression +# Ignore unused function because it's too hard to get right +unusedFunction +# Ignore unused struct member because this won't go unnoticed anyway +unusedStructMember +# Ignore missing includes because if they are real things won't build +missingInclude +# Exclude external files +*:*CLI11.hpp +*:*json.hpp +*:*asio* +*:*smithlab_cpp* +*:*ssl.hpp +*:*indicators.hpp +# Problem caused by external files +toomanyconfigs +# More problems caused by external files -- with too many ifdefs +normalCheckLevelMaxBranches diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 00000000..92c4683f --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,36 @@ +# MIT License +# +# Copyright (c) 2025 Andrew Smith +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
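The ids above silence cppcheck findings project-wide through the suppressions list; because the cppcheck command in cmake/static_analysis.cmake also passes --inline-suppr, a single finding can instead be silenced next to the code it concerns. A hypothetical sketch of the inline form, reusing unusedStructMember from the list above (struct and field names are made up):

#include <iostream>

struct run_totals {
  unsigned long long reads{};
  // cppcheck-suppress unusedStructMember
  unsigned long long bases{};  // never read in this sketch
};

int
main() {
  run_totals totals;
  totals.reads = 1;
  std::cout << totals.reads << '\n';
}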
+set noparent +filter=-runtime/references +filter=-build/include_subdir +filter=-build/include_order +filter=-build/c++11 +filter=-build/c++17 +# Formatting below handled by clang-format +filter=-whitespace/line_length +filter=-whitespace/newline +filter=-readability/braces +filter=-whitespace/semicolon +filter=-whitespace/indent +filter=-whitespace/braces +filter=-whitespace/parens +filter=-readability/nolint diff --git a/iwyu.json b/iwyu.json new file mode 100644 index 00000000..2896ec40 --- /dev/null +++ b/iwyu.json @@ -0,0 +1,5 @@ +[ + { "include": ["", "private", "", "public"] }, + { "include": ["@[\"<]htslib/kstring.h[\">]", "private", "", "public"] }, + { "include": ["@[\"<]htslib/hts.h[\">]", "private", "", "public"] }, +] From e07a63d13c95e1a062799699d89f0ac5bc74ab09 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 17:04:01 -0800 Subject: [PATCH 083/106] cmake/FindHTSLIB.cmake cmake/static_analysis.cmake: adding the cmake files for htslib and static analysis --- cmake/FindHTSLIB.cmake | 171 ++++++++++++++++++++++++++++++++++++ cmake/static_analysis.cmake | 119 +++++++++++++++++++++++++ 2 files changed, 290 insertions(+) create mode 100644 cmake/FindHTSLIB.cmake create mode 100644 cmake/static_analysis.cmake diff --git a/cmake/FindHTSLIB.cmake b/cmake/FindHTSLIB.cmake new file mode 100644 index 00000000..dd0bec76 --- /dev/null +++ b/cmake/FindHTSLIB.cmake @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: GPL-3.0-or-later; (c) 2025 Andrew D Smith (author) +#[=======================================================================[.rst: +FindHTSLIB +-------- + +Find the native HTSLib includes and library. Based on the ZLIB module. + +#]=======================================================================] + +cmake_policy(PUSH) +cmake_policy(SET CMP0159 NEW) # file(STRINGS) with REGEX updates CMAKE_MATCH_ + +if(HTSLIB_FIND_COMPONENTS AND NOT HTSLIB_FIND_QUIETLY) + message(AUTHOR_WARNING + "HTSLib does not provide any COMPONENTS. Calling\n" + " find_package(HTSLIB COMPONENTS ...)\n" + "will always fail." + ) +endif() + +set(_HTSLIB_SEARCHES) + +# Search HTSLIB_ROOT first if it is set. +if(HTSLIB_ROOT) + set(_HTSLIB_SEARCH_ROOT PATHS ${HTSLIB_ROOT} NO_DEFAULT_PATH) + list(APPEND _HTSLIB_SEARCHES _HTSLIB_SEARCH_ROOT) +endif() + +# Normal search. +# Windows stuff +set(_HTSLIB_x86 "(x86)") +set(_HTSLIB_SEARCH_NORMAL + PATHS "$ENV{ProgramFiles}/htslib" + "$ENV{ProgramFiles${_HTSLIB_x86}}/htslib") +unset(_HTSLIB_x86) +list(APPEND _HTSLIB_SEARCHES _HTSLIB_SEARCH_NORMAL) + +if(HTSLIB_USE_STATIC_LIBS) + set(HTSLIB_NAMES hts) + set(HTSLIB_NAMES_DEBUG hts) +else() + set(HTSLIB_NAMES hts) + set(HTSLIB_NAMES_DEBUG hts) +endif() + +# Try each search configuration. 
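The version handling later in this module pulls HTS_VERSION out of htslib/hts.h and slices the six-digit value as XYYYZZ: X is the major version, YYY the minor and ZZ the patch, so the 101300 in the module's own example means htslib 1.13. The same decode written as arithmetic, purely as a standalone cross-check of the encoding and not something the module runs:

#include <iostream>

int
main() {
  const int hts_version = 101300;  // value from the example in this module
  const int major = hts_version / 100000;
  const int minor = (hts_version / 100) % 1000;
  const int patch = hts_version % 100;
  std::cout << major << '.' << minor << '.' << patch << '\n';  // 1.13.0
}

The module itself drops a trailing patch level of "00" when it assembles HTSLIB_VERSION, so this value would be reported as 1.13.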
+foreach(search ${_HTSLIB_SEARCHES}) + find_path(HTSLIB_INCLUDE_DIR NAMES htslib ${${search}} PATH_SUFFIXES include) +endforeach() + +# Allow HTSLIB_LIBRARY to be set manually, as the location of the htslib library +if(NOT HTSLIB_LIBRARY) + if(DEFINED CMAKE_FIND_LIBRARY_PREFIXES) + set(_htslib_ORIG_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") + else() + set(_htslib_ORIG_CMAKE_FIND_LIBRARY_PREFIXES) + endif() + if(DEFINED CMAKE_FIND_LIBRARY_SUFFIXES) + set(_htslib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_FIND_LIBRARY_SUFFIXES}") + else() + set(_htslib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES) + endif() + # Prefix/suffix of the win32/Makefile.gcc build + if(WIN32) + list(APPEND CMAKE_FIND_LIBRARY_PREFIXES "" "lib") + list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES ".dll.a") + endif() + # Support preference of static libs by adjusting CMAKE_FIND_LIBRARY_SUFFIXES + if(HTSLIB_USE_STATIC_LIBS) + if(WIN32) + set(CMAKE_FIND_LIBRARY_SUFFIXES .lib .a ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else() + set(CMAKE_FIND_LIBRARY_SUFFIXES .a) + endif() + endif() + + foreach(search ${_HTSLIB_SEARCHES}) + find_library(HTSLIB_LIBRARY_RELEASE NAMES ${HTSLIB_NAMES} NAMES_PER_DIR ${${search}} PATH_SUFFIXES lib) + find_library(HTSLIB_LIBRARY_DEBUG NAMES ${HTSLIB_NAMES_DEBUG} NAMES_PER_DIR ${${search}} PATH_SUFFIXES lib) + endforeach() + + # Restore the original find library ordering + if(DEFINED _htslib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES) + set(CMAKE_FIND_LIBRARY_SUFFIXES "${_htslib_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES}") + else() + set(CMAKE_FIND_LIBRARY_SUFFIXES) + endif() + if(DEFINED _htslib_ORIG_CMAKE_FIND_LIBRARY_PREFIXES) + set(CMAKE_FIND_LIBRARY_PREFIXES "${_htslib_ORIG_CMAKE_FIND_LIBRARY_PREFIXES}") + else() + set(CMAKE_FIND_LIBRARY_PREFIXES) + endif() + + include(SelectLibraryConfigurations) + select_library_configurations(HTSLIB) +endif() + +unset(HTSLIB_NAMES) +unset(HTSLIB_NAMES_DEBUG) + +mark_as_advanced(HTSLIB_INCLUDE_DIR) + +if(HTSLIB_INCLUDE_DIR AND EXISTS "${HTSLIB_INCLUDE_DIR}/htslib/hts.h") + # Example: #define HTS_VERSION 101300 + file(STRINGS "${HTSLIB_INCLUDE_DIR}/htslib/hts.h" HTSLIB_H_LIST REGEX "^#define HTS_VERSION") + list(GET HTSLIB_H_LIST 0 HTSLIB_H) # Take the first matching line + if (HTSLIB_H MATCHES "#define[ \t]+HTS_VERSION[ \t]+\([0-9]+\)") + set(NUMERIC_VERSION "${CMAKE_MATCH_1}") + # Extract digits by position in string + # XYYYZZ => X = major, YYY = minor, ZZ = patch + string(SUBSTRING "${NUMERIC_VERSION}" 0 1 HTSLIB_VERSION_MAJOR) + string(SUBSTRING "${NUMERIC_VERSION}" 1 3 HTSLIB_VERSION_MINOR) + string(SUBSTRING "${NUMERIC_VERSION}" 4 2 HTSLIB_VERSION_PATCH) + else() + set(HTSLIB_VERSION_STRING "") + set(HTSLIB_VERSION_MAJOR "") + set(HTSLIB_VERSION_MINOR "") + set(HTSLIB_VERSION_PATCH "") + endif() + # Set canonical variables + set(HTSLIB_MAJOR_VERSION "${HTSLIB_VERSION_MAJOR}") + set(HTSLIB_MINOR_VERSION "${HTSLIB_VERSION_MINOR}") + set(HTSLIB_PATCH_VERSION "${HTSLIB_VERSION_PATCH}") + # Build the standard version string + set(HTSLIB_VERSION "${HTSLIB_VERSION_MAJOR}.${HTSLIB_VERSION_MINOR}") + # Only append patch if it's not "00" + if(NOT HTSLIB_VERSION_PATCH STREQUAL "00") + set(HTSLIB_VERSION "${HTSLIB_VERSION}.${HTSLIB_VERSION_PATCH}") + endif() +endif() + +include(FindPackageHandleStandardArgs) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( + HTSLIB + REQUIRED_VARS + HTSLIB_LIBRARY + HTSLIB_INCLUDE_DIR + VERSION_VAR + HTSLIB_VERSION + HANDLE_COMPONENTS +) + +if(HTSLIB_FOUND) + set(HTSLIB_INCLUDE_DIRS ${HTSLIB_INCLUDE_DIR}) + if(NOT HTSLIB_LIBRARIES) + set(HTSLIB_LIBRARIES 
${HTSLIB_LIBRARY}) + endif() + if(NOT TARGET HTSLIB::HTSLIB) + add_library(HTSLIB::HTSLIB UNKNOWN IMPORTED) + set_target_properties(HTSLIB::HTSLIB PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${HTSLIB_INCLUDE_DIRS}") + if(HTSLIB_LIBRARY_RELEASE) + set_property(TARGET HTSLIB::HTSLIB APPEND PROPERTY + IMPORTED_CONFIGURATIONS RELEASE) + set_target_properties(HTSLIB::HTSLIB PROPERTIES + IMPORTED_LOCATION_RELEASE "${HTSLIB_LIBRARY_RELEASE}") + endif() + if(HTSLIB_LIBRARY_DEBUG) + set_property(TARGET HTSLIB::HTSLIB APPEND PROPERTY + IMPORTED_CONFIGURATIONS DEBUG) + set_target_properties(HTSLIB::HTSLIB PROPERTIES + IMPORTED_LOCATION_DEBUG "${HTSLIB_LIBRARY_DEBUG}") + endif() + if(NOT HTSLIB_LIBRARY_RELEASE AND NOT HTSLIB_LIBRARY_DEBUG) + set_property(TARGET HTSLIB::HTSLIB APPEND PROPERTY + IMPORTED_LOCATION "${HTSLIB_LIBRARY}") + endif() + endif() +endif() + +cmake_policy(POP) diff --git a/cmake/static_analysis.cmake b/cmake/static_analysis.cmake new file mode 100644 index 00000000..adbc027f --- /dev/null +++ b/cmake/static_analysis.cmake @@ -0,0 +1,119 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +# StaticAnalysis +message(STATUS "Enabling static analysis") +# If no specific static analysis is requested, do them all +if(NOT RUN_CPPCHECK AND NOT RUN_IWYU AND + NOT RUN_CPPLINT AND NOT RUN_CLANG_TIDY) + set(RUN_CPPCHECK on) + set(RUN_IWYU on) + set(RUN_CPPLINT on) + set(RUN_CLANG_TIDY on) +endif() + +set(STATIC_ANALYSIS_CHECKS "") +if(RUN_CPPCHECK) + list(APPEND STATIC_ANALYSIS_CHECKS "cppcheck") +endif() +if(RUN_CPPLINT) + list(APPEND STATIC_ANALYSIS_CHECKS "cpplint") +endif() +if(RUN_IWYU) + list(APPEND STATIC_ANALYSIS_CHECKS "iwyu") +endif() +if(RUN_CLANG_TIDY) + list(APPEND STATIC_ANALYSIS_CHECKS "clang-tidy") +endif() + +message(STATUS "Requested static analysis: ${STATIC_ANALYSIS_CHECKS}") + +# cpplint: all options are in the config file +if ("cpplint" IN_LIST STATIC_ANALYSIS_CHECKS) + find_program(FOUND_CPPLINT cpplint) + if(FOUND_CPPLINT) + message(STATUS "Enabling cpplint analysis") + set(CMAKE_CXX_CPPLINT cpplint --quiet) + else() + message(STATUS "Could not find cpplint; disabling cpplint") + endif() +endif() + +# include-what-you-use: config is a mappings file +if ("iwyu" IN_LIST STATIC_ANALYSIS_CHECKS) + find_program(FOUND_IWYU include-what-you-use) + if(FOUND_IWYU) + message(STATUS "Enabling include-what-you-use analysis") + set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE + include-what-you-use + -Xiwyu + --comment_style=long + -Xiwyu + --quoted_includes_first + -Xiwyu + --mapping_file=${PROJECT_SOURCE_DIR}/iwyu.json + ) + else() + message(STATUS "Could not find iwyu; disabling iwyu") + endif() +endif() + +# cppcheck: options on the command line as there is no config file +if ("cppcheck" IN_LIST STATIC_ANALYSIS_CHECKS) + find_program(FOUND_CPPCHECK cppcheck) + if(FOUND_CPPCHECK) + message(STATUS "Enabling cppcheck analysis") + set(CMAKE_CXX_CPPCHECK + cppcheck + --quiet + --enable=all + 
--inline-suppr + --max-configs=1 + --suppressions-list=${PROJECT_SOURCE_DIR}/.cppcheck_suppress + ) + else() + message(STATUS "Could not find cppcheck; disabling cppcheck") + endif() +endif() + +# clang-tidy: need to make sure version is at least 20 +if ("clang-tidy" IN_LIST STATIC_ANALYSIS_CHECKS) + find_program(CLANG_TIDY_EXECUTABLE NAMES clang-tidy) + # Minimum required version + set(MIN_CLANG_TIDY_VERSION "20.0.0") + if(CLANG_TIDY_EXECUTABLE) + execute_process( + COMMAND + bash -c + "${CLANG_TIDY_EXECUTABLE} --version | grep version | tr -cd '0-9.\n'" + OUTPUT_VARIABLE CLANG_TIDY_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + # Compare the version numbers + if(CLANG_TIDY_VERSION VERSION_GREATER_EQUAL MIN_CLANG_TIDY_VERSION) + message(STATUS "Enabling clang-tidy (version: ${CLANG_TIDY_VERSION})") + set(CMAKE_CXX_CLANG_TIDY + clang-tidy + --quiet + --allow-no-checks + -p ${PROJECT_BINARY_DIR} + ) + else() + message(STATUS "Not enabling clang-tidy (min version not found") + endif() + else() + message(STATUS "Could not find clang-tidy; disabling clang-tidy") + endif() +endif() From 3576d9e205a3f86f9322079b39a8263700bdef1d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:03:47 -0800 Subject: [PATCH 084/106] cmake/static_analysis.cmake: removing the hints for iwyu --- cmake/static_analysis.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/static_analysis.cmake b/cmake/static_analysis.cmake index adbc027f..f1323ad8 100644 --- a/cmake/static_analysis.cmake +++ b/cmake/static_analysis.cmake @@ -59,7 +59,7 @@ if ("iwyu" IN_LIST STATIC_ANALYSIS_CHECKS) set(CMAKE_CXX_INCLUDE_WHAT_YOU_USE include-what-you-use -Xiwyu - --comment_style=long + --comment_style=none -Xiwyu --quoted_includes_first -Xiwyu From dc7d5f6e157c1b76963cdf382b95f2e22dadcc8a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:04:08 -0800 Subject: [PATCH 085/106] src/amrfinder/CMakeLists.txt: adding this cmake --- src/amrfinder/CMakeLists.txt | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/amrfinder/CMakeLists.txt diff --git a/src/amrfinder/CMakeLists.txt b/src/amrfinder/CMakeLists.txt new file mode 100644 index 00000000..9365e13b --- /dev/null +++ b/src/amrfinder/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +find_package(HTSLIB REQUIRED) + +file(GLOB cpp_files "*.cpp") + +# Gather all the object files that will be put in the static library +# and prepare to compile them. 
+set(LIBRARY_OBJECTS "") +foreach(cpp_file ${cpp_files}) + get_filename_component(BASE_NAME ${cpp_file} NAME_WE) + add_library(${BASE_NAME} OBJECT ${cpp_file}) + target_link_libraries(${BASE_NAME} PUBLIC + dnmtools_objs + bamxx + smithlab_cpp + HTSLIB::HTSLIB + ) + ## Below is to make sure 'config.h' is visible for includes and any + ## of the headers for configured libraries + target_include_directories(${BASE_NAME} PUBLIC + ${PROJECT_BINARY_DIR} + ) +endforeach() From 3bbe729d33b46df8aff021f894f4505ea6902ae7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:04:24 -0800 Subject: [PATCH 086/106] src/amrfinder/allelicmeth.cpp: linting --- src/amrfinder/allelicmeth.cpp | 73 +++++++++++++++++------------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/src/amrfinder/allelicmeth.cpp b/src/amrfinder/allelicmeth.cpp index 3818269d..e1705d3b 100644 --- a/src/amrfinder/allelicmeth.cpp +++ b/src/amrfinder/allelicmeth.cpp @@ -16,12 +16,25 @@ * General Public License for more details. */ +#include "Epiread.hpp" +#include "MSite.hpp" + +#include "GenomicRegion.hpp" +#include "OptionParser.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" + #include #include +#include +#include +#include +#include #include #include #include -#include +#include +#include #include #include #include @@ -29,20 +42,8 @@ #include #include -#include -#include - -#include "GenomicRegion.hpp" -#include "MSite.hpp" -#include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" - -#include "Epiread.hpp" - using std::cerr; using std::cout; -using std::endl; using std::max; using std::min; using std::runtime_error; @@ -51,6 +52,8 @@ using std::unordered_map; using std::unordered_set; using std::vector; +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) + static inline double log_sum_log(const double p, const double q) { if (p == 0) { @@ -59,7 +62,7 @@ log_sum_log(const double p, const double q) { else if (q == 0) { return p; } - return p > q ? p + log(1.0 + exp(q - p)) : q + log(1.0 + exp(p - q)); + return p > q ? 
p + log1p(exp(q - p)) : q + log1p(exp(p - q)); } static inline double @@ -226,20 +229,18 @@ verify_chroms_available(const string &chrom_name, } int -main_allelicmeth(int argc, char *argv[]) { - +main_allelicmeth(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - - static const string description = "computes probability of allele-specific \ - methylation at each tuple of CpGs"; - - static const string fasta_suffix = "fa"; + static const auto description = R"( +computes probability of allele-specific methylation at each tuple of CpGs +)"; bool VERBOSE = false; string outfile; string chroms_dir; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", false, outfile); opt_parse.add_opt("chrom", 'c', "genome sequence file/directory", true, @@ -248,20 +249,20 @@ main_allelicmeth(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 1) { - cerr << opt_parse.help_message() << endl; + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } const string epi_file(leftover_args.front()); @@ -288,7 +289,7 @@ main_allelicmeth(int argc, char *argv[]) { } if (VERBOSE) - cerr << "number of chromosomes: " << chrom_sizes.size() << endl; + cerr << "number of chromosomes: " << chrom_sizes.size() << '\n'; std::ifstream in(epi_file); if (!in) @@ -309,7 +310,7 @@ main_allelicmeth(int argc, char *argv[]) { verify_chroms_available(er.chr, chrom_lookup); if (VERBOSE) - cerr << "[processing " << er.chr << "]" << endl; + cerr << "[processing " << er.chr << "]" << '\n'; if (!chrom.empty()) { vector> counts(chrom_sizes[chrom] - 1); @@ -319,7 +320,7 @@ main_allelicmeth(int argc, char *argv[]) { convert_coordinates(chroms[chrom_idx], cytosines); for (size_t i = 0; i < cytosines.size() - 1; ++i) { out << cytosines[i].chrom << "\t" << cytosines[i].pos - << "\t+\tCpG\t" << cytosines[i].context << endl; + << "\t+\tCpG\t" << cytosines[i].context << '\n'; } } epireads.clear(); @@ -335,17 +336,15 @@ main_allelicmeth(int argc, char *argv[]) { convert_coordinates(chroms[chrom_idx], cytosines); for (size_t i = 0; i < cytosines.size() - 1; ++i) { out << cytosines[i].chrom << "\t" << cytosines[i].pos << "\t+\tCpG\t" - << cytosines[i].context << endl; + << cytosines[i].context << '\n'; } } } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer) From c8a9f8718e747357b4cbaceba26e706ddca5a716 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:04:53 -0800 
Subject: [PATCH 087/106] src/common/EpireadStats.hpp: adding a size function for std::size() --- src/common/EpireadStats.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/common/EpireadStats.hpp b/src/common/EpireadStats.hpp index 388966fa..8d1abf64 100644 --- a/src/common/EpireadStats.hpp +++ b/src/common/EpireadStats.hpp @@ -38,6 +38,11 @@ struct small_epiread { length() const { return std::size(seq); } + + std::size_t + size() const { + return std::size(seq); + } }; double From 8a2edbadd5d3b8e209c5a2d5dff5c5dd1e386951 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:05:37 -0800 Subject: [PATCH 088/106] src/amrfinder/amrfinder.cpp: linting --- src/amrfinder/amrfinder.cpp | 56 ++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index fd0eaf3e..8d29d57a 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -17,9 +17,8 @@ * more details. */ -#include "EpireadStats.hpp" - #include "Epiread.hpp" +#include "EpireadStats.hpp" #include "Interval.hpp" #include "Interval6.hpp" @@ -29,28 +28,39 @@ #include +#include #include +#include +#include +#include +#include +#include #include +#include #include #include #include -// #include // ADS: needs c++20 -#include #include #include #include #include +#include +#include #include +// #include // ADS: needs c++20 + +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) struct amr_summary { - amr_summary(const std::vector &amrs) { - amr_count = size(amrs); - amr_total_size = accumulate( + // NOLINTBEGIN(*-prefer-member-initializer) + amr_summary(const std::vector &amrs) : amr_count{std::size(amrs)} { + amr_total_size = std::accumulate( std::cbegin(amrs), std::cend(amrs), 0ul, [](const std::size_t t, const Interval6 &p) { return t + size(p); }); amr_mean_size = static_cast(amr_total_size) / std::max(amr_count, static_cast(1)); } + // NOLINTEND(*-prefer-member-initializer) // amr_count is the number of identified AMRs, which are the merged AMRs // that are found to be significant when tested as a single interval @@ -194,7 +204,8 @@ get_chrom_partition(const std::vector &r) { } [[nodiscard]] static bool -convert_coordinates(const std::size_t n_threads, const std::string &genome_file, +convert_coordinates(const std::uint32_t n_threads, + const std::string &genome_file, std::vector &amrs) { std::unordered_map chrom_lookup; @@ -212,14 +223,14 @@ convert_coordinates(const std::size_t n_threads, const std::string &genome_file, } const auto chrom_parts = get_chrom_partition(amrs); - const auto n_parts = size(chrom_parts); + const std::uint32_t n_parts = std::size(chrom_parts); const auto parts_beg = std::cbegin(chrom_parts); - const std::uint32_t n_per = (n_parts + n_threads - 1) / n_threads; + const auto n_per = (n_parts + n_threads - 1) / n_threads; std::atomic_uint32_t conv_failure = 0; std::vector threads; - for (auto i = 0ul; i < std::min(n_threads, n_parts); ++i) { + for (auto i = 0; i < std::min(n_threads, n_parts); ++i) { const auto p_beg = parts_beg + std::min(i * n_per, n_parts); const auto p_end = parts_beg + std::min((i + 1) * n_per, n_parts); threads.emplace_back([&, p_beg, p_end] { @@ -258,10 +269,9 @@ clip_read(const std::size_t start_pos, const std::size_t end_pos, epi_r r) { get_current_epireads(const std::vector &epireads, const std::size_t max_len, const std::size_t cpg_window, const std::size_t start_pos, std::size_t &read_id) { - // assert(is_sorted(std::cbegin(epireads), 
std::cend(epireads), - // [](const epi_r &a, const epi_r &b) { - // return a.pos < b.pos; - // })); + assert(std::is_sorted( + std::cbegin(epireads), std::cend(epireads), + [](const epi_r &a, const epi_r &b) { return a.pos < b.pos; })); const auto n_epi = size(epireads); while (read_id < n_epi && epireads[read_id].pos + max_len <= start_pos) ++read_id; @@ -343,12 +353,6 @@ process_chrom(const bool verbose, const std::uint32_t n_threads, // ADS: need to do this by windows const auto n_blocks = std::min(lim, n_threads * blocks_per_thread); const auto blocks = get_block_bounds(lim, n_blocks); - if (!(n_blocks == size(blocks))) { - std::cerr << "n_blocks=" << n_blocks << '\t' - << "std::size(blocks)=" << size(blocks) << '\t' << "lim=" << lim - << '\t' << "lim/n_blocks=" << lim / n_blocks << std::endl; - exit(0); - } const auto blocks_beg = std::cbegin(blocks); const std::uint32_t n_per = (n_blocks + n_threads - 1) / n_threads; @@ -359,8 +363,7 @@ process_chrom(const bool verbose, const std::uint32_t n_threads, std::vector threads; for (auto i = 0u; i < std::min(n_threads, n_blocks); ++i) { - const auto b_beg = - blocks_beg + std::min(i * n_per, n_blocks); // i * n_per; + const auto b_beg = blocks_beg + std::min(i * n_per, n_blocks); const auto b_end = blocks_beg + std::min((i + 1) * n_per, n_blocks); if (b_beg == b_end) break; @@ -413,7 +416,7 @@ struct rename_amr { }; int -main_amrfinder(int argc, char *argv[]) { +main_amrfinder(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { const std::string description = "identify regions of allele-specific methylation"; @@ -445,7 +448,8 @@ main_amrfinder(int argc, char *argv[]) { bool correct_for_read_count = true; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), description, ""); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, ""); opt_parse.add_opt("output", 'o', "output file", true, outfile); opt_parse.add_opt("chrom", 'c', "reference genome fasta file", true, genome_file); @@ -665,3 +669,5 @@ main_amrfinder(int argc, char *argv[]) { } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From bdd438b6659f59ee8bed3d9e3441e4ed3671469d Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:05:56 -0800 Subject: [PATCH 089/106] src/amrfinder/amrtester.cpp: linting --- src/amrfinder/amrtester.cpp | 68 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index c4d3986a..0cb405c0 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -20,25 +20,32 @@ * along with this program. If not, see . 
*/ -#include -#include -#include -#include -#include +#include "Epiread.hpp" +#include "EpireadStats.hpp" #include #include #include #include -#include "Epiread.hpp" -#include "EpireadStats.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include using std::begin; using std::cerr; using std::cout; using std::end; -using std::endl; using std::runtime_error; using std::streampos; using std::string; @@ -47,6 +54,8 @@ using std::vector; using epi_r = small_epiread; +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions) + static void backup_to_start_of_current_record(std::ifstream &in) { static const size_t assumed_max_valid_line_width = 10000; @@ -64,17 +73,17 @@ find_first_epiread_ending_after_position(const string &query_chrom, const size_t query_pos, std::ifstream &in) { in.seekg(0, std::ios_base::end); - size_t high_pos = in.tellg(); + auto high_pos = in.tellg(); size_t eof = in.tellg(); in.seekg(0, std::ios_base::beg); - size_t low_pos = 0; + std::streamoff low_pos = 0; string chrom, seq; size_t start = 0ul; // This is just binary search on disk while (high_pos > low_pos + 1) { - const size_t mid_pos = (low_pos + high_pos) / 2; + const std::streamoff mid_pos = (low_pos + high_pos) / 2; in.seekg(mid_pos); backup_to_start_of_current_record(in); @@ -98,7 +107,6 @@ find_first_epiread_ending_after_position(const string &query_chrom, static void load_reads(const string &reads_file_name, const GenomicRegion ®ion, vector &the_reads) { - // open and check the file std::ifstream in(reads_file_name.c_str()); if (!in) @@ -178,10 +186,8 @@ ensure_regions_are_named(vector ®ions) { } int -main_amrtester(int argc, char *argv[]) { - +main_amrtester(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { - static constexpr double critical_value = 0.01; static const string fasta_suffix = "fa"; @@ -197,7 +203,8 @@ main_amrtester(int argc, char *argv[]) { double high_prob = 0.75, low_prob = 0.25; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "resolve epi-alleles", + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + "resolve epi-alleles", " "); opt_parse.add_opt("output", 'o', "output file", false, outfile); opt_parse.add_opt("chrom", 'c', "reference genome fasta file", true, @@ -212,21 +219,21 @@ main_amrtester(int argc, char *argv[]) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } if (leftover_args.size() != 2) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } const string regions_file(leftover_args.front()); @@ -261,7 +268,7 @@ main_amrtester(int argc, char *argv[]) { auto n_regions = size(regions); if (verbose) - cerr << "number of regions: " << n_regions << endl; + cerr << "number of regions: " << n_regions << '\n'; string chrom_name; vector 
cpg_positions; @@ -277,7 +284,6 @@ main_amrtester(int argc, char *argv[]) { auto progress_idx = 0u; for (auto ®ion : regions) { - if (show_progress && progress.time_to_report(progress_idx)) progress.report(cerr, progress_idx); ++progress_idx; @@ -303,20 +309,18 @@ main_amrtester(int argc, char *argv[]) { const auto score = reads.empty() ? 1.0 : epistat.test_asm(reads, is_significant); - region.set_score(score); + region.set_score(static_cast(score)); region.set_name(region.get_name() + ":" + toa(reads.size())); out << region << '\n'; } if (show_progress) - cerr << "\r100%" << endl; + cerr << "\r100%\n"; } - catch (const runtime_error &e) { - cerr << e.what() << endl; - return EXIT_FAILURE; - } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; + catch (const std::exception &e) { + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } + +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions) From c88f69f1c0269897b81a01fb49362baf45c73264 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:06:30 -0800 Subject: [PATCH 090/106] Adding CMakeLists.txt files --- CMakeLists.txt | 163 ++++++++++++++++++++++++++++++++++++ src/CMakeLists.txt | 24 ++++++ src/analysis/CMakeLists.txt | 37 ++++++++ src/common/CMakeLists.txt | 42 ++++++++++ src/radmeth/CMakeLists.txt | 50 +++++++++++ src/utils/CMakeLists.txt | 37 ++++++++ 6 files changed, 353 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/analysis/CMakeLists.txt create mode 100644 src/common/CMakeLists.txt create mode 100644 src/radmeth/CMakeLists.txt create mode 100644 src/utils/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..aaf873bf --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,163 @@ +# This file is part of dnmtools +# +# Copyright (C) 2025 Andrew D. Smith +# +# Authors: Andrew D. Smith +# +# This is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
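+#
+# Example configuration (assumed usage; STATIC_ANALYSIS and
+# STATIC_ANALYSIS_CHECKS are read below and in cmake/static_analysis.cmake,
+# and the values shown are only illustrative):
+#   cmake -B build -DSTATIC_ANALYSIS=on \
+#     -DSTATIC_ANALYSIS_CHECKS="cppcheck;clang-tidy"
+#   cmake --build build -j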
+ +# to find the version of cmake do +# $ cmake --version +cmake_minimum_required(VERSION 3.28) +project( + dnmtools + VERSION 1.4.4 + DESCRIPTION + "Tools for analyzing DNA methylation data" + HOMEPAGE_URL https://github.com/smithlabcode/dnmtools + LANGUAGES CXX) + +# Set language version used +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED on) +set(CMAKE_CXX_EXTENSIONS off) # prevents std=gnu++17 +set(CMAKE_EXPORT_COMPILE_COMMANDS on) + +include(CheckIncludeFileCXX) +include(CheckFunctionExists) +include(CheckCXXCompilerFlag) + +include(GNUInstallDirs) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +configure_file(data/config.h.in config.h) + +# Collect any global linker options as needed and then apply them +# individually to targets +set(GLOBAL_COMPILE_OPTIONS "") +set(GLOBAL_LINKER_OPTIONS "") + +if(USE_STATIC_LIBS) + # This needs to come before finding any libraries so that the static + # versions are identified + message(STATUS "Enabling static linkage for all non-system libraries") + message(STATUS "Configuring to clone ZLib") + include(ExternalProject) + set(ZLIB_CMAKE_ARGS + -DZLIB_BUILD_EXAMPLES=off + -DSKIP_INSTALL_FILES=on + -DCMAKE_POSITION_INDEPENDENT_CODE=on + -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR}/src/zlib + -DCMAKE_BUILD_TYPE=Release + ) + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + list(APPEND ZLIB_CMAKE_ARGS + -DCMAKE_OSX_DEPLOYMENT_TARGET=${CMAKE_OSX_DEPLOYMENT_TARGET} + ) + endif() + ExternalProject_Add( + ZLIB + GIT_REPOSITORY https://github.com/madler/zlib.git + GIT_TAG master + CMAKE_ARGS ${ZLIB_CMAKE_ARGS} + ) + # Include the built zlib headers and link against the built zlib library + set(ZLIB_INCLUDE_DIR "${PROJECT_BINARY_DIR}/src/zlib/include") + set(ZLIB_LIBRARY "${PROJECT_BINARY_DIR}/src/zlib/lib/libz.a") + + # Create the CMake target for the built zlib + add_library(ZLIB_IMPORTED INTERFACE) + set_target_properties(ZLIB_IMPORTED PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${ZLIB_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES ${ZLIB_LIBRARY} + ) + + # This alias means we don't need to worry where zlib came from + add_library(ZLIB::ZLIB ALIAS ZLIB_IMPORTED) + + ExternalProject_Add( + HTSLIB + GIT_REPOSITORY https://github.com/samtools/htslib.git + GIT_TAG master + CONFIGURE_COMMAND "" + # "autoreconf -i ; /configure" + BUILD_COMMAND make -C lib-static -j + INSTALL_COMMAND "" + # make install + TEST_COMMAND "" + BUILD_BYPRODUCTS libhts.a + ) + # ExternalProject_Add_Step(HTSLIB + # bootstrap + # COMMAND autoreconf -i + # DEPENDEES download + # DEPENDERS configure + # ) + ExternalProject_Get_Property(HTSLIB SOURCE_DIR) + message(STATUS "SOURCE_DIR: ${SOURCE_DIR}") + + # Include the built zlib headers and link against the built zlib library + set(HTSLIB_INCLUDE_DIR "${SOURCE_DIR}/src/htslib/htslib") + set(HTSLIB_LIBRARY "${SOURCE_DIR}/htslib.a") + + # Create the CMake target for the built zlib + add_library(HTSLIB_IMPORTED INTERFACE) + set_target_properties(HTSLIB_IMPORTED PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES ${HTSLIB_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES ${HTSLIB_LIBRARY} + ) + set_target_properties(HTSLIB_IMPORTED PROPERTIES IMPORTED_LOCATION ${HTSLIB_LIBRARY}) + + # This alias means we don't need to worry where htslib came from + add_library(HTSLIB::HTSLIB ALIAS HTSLIB_IMPORTED) + + # Set static for the linker so the compiler's libraries will be static + ## ADS: using this instead of forcing -static for everything avoids the + ## static linkage that Aiso warns against, but also means it's not 100% + ## static linked ADS: can't 
do this if the compiler is AppleClang because + ## they don't have the libc++.a and the libgcc wouldn't make sense anyway. + if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") # macOS + list(APPEND GLOBAL_LINKER_OPTIONS -static-libgcc -static-libstdc++) + endif() + +endif() + +if(ENABLE_LTO) + # Turn on LTO if we are building for distribution + include(CheckIPOSupported) + check_ipo_supported(RESULT result OUTPUT output) + if(result) + set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) + else() + message(FATAL_ERROR "IPO is not supported: ${output}") + endif() +endif() + +if(STATIC_ANALYSIS) + include(cmake/static_analysis.cmake) +endif() + +# ADS: set the most stringent warnings we can +list(APPEND GLOBAL_COMPILE_OPTIONS + -Wall -Wextra -Wpedantic -Werror -Wfatal-errors +) + +if(STRIP_PATHS_FROM_BINARIES) + # This is set if we have configured to distribute + set(PREFIX_MAP_ARG -ffile-prefix-map=) + list(TRANSFORM STRIP_SUB_LIST PREPEND ${PREFIX_MAP_ARG}) + list(APPEND GLOBAL_COMPILE_OPTIONS ${STRIP_SUB_LIST}) +endif() + +message(STATUS "Finished global compile and linker configuration") + +add_subdirectory(src) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..e2dbf069 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,24 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +add_subdirectory(bamxx) +add_subdirectory(smithlab_cpp) +add_subdirectory(common) +add_subdirectory(radmeth) +add_subdirectory(utils) +add_subdirectory(analysis) +add_subdirectory(amrfinder) +# add_subdirectory(abismal) +# add_subdirectory(mlml) diff --git a/src/analysis/CMakeLists.txt b/src/analysis/CMakeLists.txt new file mode 100644 index 00000000..9365e13b --- /dev/null +++ b/src/analysis/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +find_package(HTSLIB REQUIRED) + +file(GLOB cpp_files "*.cpp") + +# Gather all the object files that will be put in the static library +# and prepare to compile them. 
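+# (Same per-file OBJECT-target layout as src/amrfinder/CMakeLists.txt.)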
+set(LIBRARY_OBJECTS "") +foreach(cpp_file ${cpp_files}) + get_filename_component(BASE_NAME ${cpp_file} NAME_WE) + add_library(${BASE_NAME} OBJECT ${cpp_file}) + target_link_libraries(${BASE_NAME} PUBLIC + dnmtools_objs + bamxx + smithlab_cpp + HTSLIB::HTSLIB + ) + ## Below is to make sure 'config.h' is visible for includes and any + ## of the headers for configured libraries + target_include_directories(${BASE_NAME} PUBLIC + ${PROJECT_BINARY_DIR} + ) +endforeach() diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt new file mode 100644 index 00000000..3d73a472 --- /dev/null +++ b/src/common/CMakeLists.txt @@ -0,0 +1,42 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +file(GLOB cpp_files "*.cpp") + +set(LIBRARY_OBJECTS "") +foreach(cpp_file ${cpp_files}) + get_filename_component(BASE_NAME ${cpp_file} NAME_WE) + add_library(${BASE_NAME} OBJECT ${cpp_file}) + target_link_libraries(${BASE_NAME} PRIVATE + bamxx + smithlab_cpp + ) + target_include_directories(${BASE_NAME} PUBLIC + ${CMAKE_BINARY_DIR} + ) + list(APPEND LIBRARY_OBJECTS ${BASE_NAME}) +endforeach() + +# Create static library linking the individual objects +add_library(dnmtools_objs STATIC) +target_include_directories(dnmtools_objs PUBLIC + ${CMAKE_BINARY_DIR} + ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(dnmtools_objs PUBLIC + ${LIBRARY_OBJECTS} + smithlab_cpp + bamxx +) diff --git a/src/radmeth/CMakeLists.txt b/src/radmeth/CMakeLists.txt new file mode 100644 index 00000000..99747442 --- /dev/null +++ b/src/radmeth/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . 
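+
+# Unlike the tool directories that create one OBJECT target per source file,
+# all radmeth sources are compiled into a single OBJECT library below, which
+# src/CMakeLists.txt links into the dnmtools executable as "radmeth".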
+ +find_package(HTSLIB REQUIRED) + +add_library(radmeth OBJECT + dmr.cpp + methdiff.cpp + radmeth-adjust.cpp + radmeth-merge.cpp + radmeth.cpp + radmeth_design.cpp + radmeth_nano.cpp + radmeth_optimize_gamma.cpp + radmeth_optimize_series.cpp + radmeth_utils.cpp +) + +target_include_directories(radmeth PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR} +) +target_link_libraries(radmeth PUBLIC + dnmtools_objs + bamxx + smithlab_cpp + HTSLIB::HTSLIB +) +# target_include_directories(radmeth PUBLIC ${CMAKE_BINARY_DIR}) + +# add_executable(abismal abismal_main.cpp) +# # ADS: below, for config.h +# target_include_directories(abismal PUBLIC ${CMAKE_BINARY_DIR}) +# target_link_libraries(abismal PUBLIC +# abismal_objs +# smithlab_cpp +# HTSLIB::HTSLIB +# ) diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt new file mode 100644 index 00000000..9365e13b --- /dev/null +++ b/src/utils/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +find_package(HTSLIB REQUIRED) + +file(GLOB cpp_files "*.cpp") + +# Gather all the object files that will be put in the static library +# and prepare to compile them. +set(LIBRARY_OBJECTS "") +foreach(cpp_file ${cpp_files}) + get_filename_component(BASE_NAME ${cpp_file} NAME_WE) + add_library(${BASE_NAME} OBJECT ${cpp_file}) + target_link_libraries(${BASE_NAME} PUBLIC + dnmtools_objs + bamxx + smithlab_cpp + HTSLIB::HTSLIB + ) + ## Below is to make sure 'config.h' is visible for includes and any + ## of the headers for configured libraries + target_include_directories(${BASE_NAME} PUBLIC + ${PROJECT_BINARY_DIR} + ) +endforeach() From 94238307242e332e126b62b36b58dac496a6170a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:07:03 -0800 Subject: [PATCH 091/106] data/config.h.in: adding file --- data/config.h.in | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 data/config.h.in diff --git a/data/config.h.in b/data/config.h.in new file mode 100644 index 00000000..b3d64bcf --- /dev/null +++ b/data/config.h.in @@ -0,0 +1,15 @@ +/* Copyright (C) 2025 Andrew D Smith + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#define PROJECT_NAME "@PROJECT_NAME@" +#define VERSION "@PROJECT_VERSION@" From e9e0b237c246bedd818db1b07b07c92afcdbda1f Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:30:24 -0800 Subject: [PATCH 092/106] src/amrfinder/amrfinder.cpp: more linting --- src/amrfinder/amrfinder.cpp | 49 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index 8d29d57a..e5b8ac3c 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -53,7 +53,8 @@ struct amr_summary { // NOLINTBEGIN(*-prefer-member-initializer) - amr_summary(const std::vector &amrs) : amr_count{std::size(amrs)} { + explicit amr_summary(const std::vector &amrs) : + amr_count{std::size(amrs)} { amr_total_size = std::accumulate( std::cbegin(amrs), std::cend(amrs), 0ul, [](const std::size_t t, const Interval6 &p) { return t + size(p); }); @@ -207,7 +208,6 @@ get_chrom_partition(const std::vector &r) { convert_coordinates(const std::uint32_t n_threads, const std::string &genome_file, std::vector &amrs) { - std::unordered_map chrom_lookup; { std::vector c_name, c_seq; @@ -215,8 +215,8 @@ convert_coordinates(const std::uint32_t n_threads, const std::size_t n_chroms = size(c_seq); for (auto &s : c_seq) - for (auto &c : s) - c = std::toupper(c); + std::transform(std::cbegin(s), std::cend(s), std::begin(s), + [](const auto x) { return std::toupper(x); }); for (auto i = 0u; i < n_chroms; ++i) chrom_lookup.emplace(c_name[i], c_seq[i]); @@ -230,7 +230,7 @@ convert_coordinates(const std::uint32_t n_threads, std::atomic_uint32_t conv_failure = 0; std::vector threads; - for (auto i = 0; i < std::min(n_threads, n_parts); ++i) { + for (auto i = 0u; i < std::min(n_threads, n_parts); ++i) { const auto p_beg = parts_beg + std::min(i * n_per, n_parts); const auto p_end = parts_beg + std::min((i + 1) * n_per, n_parts); threads.emplace_back([&, p_beg, p_end] { @@ -287,10 +287,9 @@ get_current_epireads(const std::vector &epireads, static inline std::size_t total_states(const std::vector &epireads) { - std::size_t tot = 0; - for (auto &e : epireads) - tot += e.length(); - return tot; + return std::accumulate( + std::cbegin(epireads), std::cend(epireads), 0ul, + [](const auto a, const auto &e) { return a + std::size(e); }); } static inline void @@ -304,10 +303,10 @@ add_amr(const std::string &chrom_name, const std::size_t start_cpg, static inline std::size_t get_n_cpgs(const std::vector &reads) { - std::size_t n_cpgs = 0; - for (auto &r : reads) - n_cpgs = std::max(n_cpgs, static_cast(r.end())); - return n_cpgs; + return std::accumulate( + std::cbegin(reads), std::cend(reads), 0ul, [](const auto a, const auto &r) { + return std::max(a, static_cast(r.end())); + }); } template @@ -333,11 +332,12 @@ process_chrom(const bool verbose, const std::uint32_t n_threads, const EpireadStats &epistat, const std::string &chrom_name, const std::vector &epireads, std::vector &amrs) { - constexpr auto blocks_per_thread = 1u; - - auto max_epiread_len = 0u; - for (auto &e : epireads) - max_epiread_len = std::max(max_epiread_len, e.length()); + const auto max_epiread_len = std::accumulate( + std::cbegin(epireads), std::cend(epireads), 0ul, + [](const auto a, const auto &e) { return std::max(a, std::size(e)); }); + // auto max_epiread_len = 0u; + // for (auto &e : epireads) + // max_epiread_len = std::max(max_epiread_len, e.length()); const std::size_t min_obs_per_window = window_size * min_obs_per_cpg; const std::uint32_t n_cpgs 
= get_n_cpgs(epireads); @@ -351,7 +351,7 @@ process_chrom(const bool verbose, const std::uint32_t n_threads, return 0; // ADS: need to do this by windows - const auto n_blocks = std::min(lim, n_threads * blocks_per_thread); + const auto n_blocks = std::min(lim, n_threads); const auto blocks = get_block_bounds(lim, n_blocks); const auto blocks_beg = std::cbegin(blocks); const std::uint32_t n_per = (n_blocks + n_threads - 1) / n_threads; @@ -394,13 +394,16 @@ process_chrom(const bool verbose, const std::uint32_t n_threads, for (auto &thread : threads) thread.join(); - auto total_amrs = 0u; - for (const auto &a : all_amrs) - total_amrs += size(a); + const auto total_amrs = std::accumulate( + std::cbegin(all_amrs), std::cend(all_amrs), 0ul, + [](const auto a, const auto &amr) { return a + std::size(amr); }); + // for (const auto &a : all_amrs) + // total_amrs += size(a); amrs.reserve(total_amrs); for (auto &v : all_amrs) for (auto &a : v) + // cppcheck-suppress useStlAlgorithm amrs.push_back(std::move(a)); return windows_tested; @@ -523,7 +526,7 @@ main_amrfinder(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) std::size_t windows_tested = 0; epiread er; std::vector epireads; - std::string prev_chrom, curr_chrom, tmp_states; + std::string prev_chrom; while (read_epiread(in, er)) { if (!epireads.empty() && er.chr != prev_chrom) { From 40132be4df81a9f0134ce0c860fe778bda9b1d8e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 18:30:30 -0800 Subject: [PATCH 093/106] src/amrfinder/amrtester.cpp: more linting --- src/amrfinder/amrtester.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index 0cb405c0..a9880589 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -178,7 +178,9 @@ clip_reads(const size_t start_pos, const size_t end_pos, vector &r) { // give names to regions if they do not exist static void -ensure_regions_are_named(vector ®ions) { +ensure_regions_are_named( + vector ®ions // cppcheck-suppress constParameterReference +) { auto region_name_idx = 0u; for (auto region : regions) if (region.get_name().empty()) @@ -189,7 +191,6 @@ int main_amrtester(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { static constexpr double critical_value = 0.01; - static const string fasta_suffix = "fa"; bool verbose = false; bool show_progress = false; From cbac2124e248741466b3ece57a6438c54f81e8be Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:12:21 -0800 Subject: [PATCH 094/106] src/mlml/CMakeLists.txt: adding --- src/mlml/CMakeLists.txt | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 src/mlml/CMakeLists.txt diff --git a/src/mlml/CMakeLists.txt b/src/mlml/CMakeLists.txt new file mode 100644 index 00000000..9365e13b --- /dev/null +++ b/src/mlml/CMakeLists.txt @@ -0,0 +1,37 @@ +# Copyright (C) 2025 Andrew D Smith +# +# This program is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. 
+# +# You should have received a copy of the GNU General Public License along with +# this program. If not, see . + +find_package(HTSLIB REQUIRED) + +file(GLOB cpp_files "*.cpp") + +# Gather all the object files that will be put in the static library +# and prepare to compile them. +set(LIBRARY_OBJECTS "") +foreach(cpp_file ${cpp_files}) + get_filename_component(BASE_NAME ${cpp_file} NAME_WE) + add_library(${BASE_NAME} OBJECT ${cpp_file}) + target_link_libraries(${BASE_NAME} PUBLIC + dnmtools_objs + bamxx + smithlab_cpp + HTSLIB::HTSLIB + ) + ## Below is to make sure 'config.h' is visible for includes and any + ## of the headers for configured libraries + target_include_directories(${BASE_NAME} PUBLIC + ${PROJECT_BINARY_DIR} + ) +endforeach() From 03473c38716eecac1f6d57bf670067bbf8d3f76e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:13:31 -0800 Subject: [PATCH 095/106] src/common/CMakeLists.txt: adding GSL dependency in cmake --- src/common/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 3d73a472..8e0d7bec 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -13,6 +13,8 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see . +find_package(GSL REQUIRED) + file(GLOB cpp_files "*.cpp") set(LIBRARY_OBJECTS "") @@ -22,6 +24,7 @@ foreach(cpp_file ${cpp_files}) target_link_libraries(${BASE_NAME} PRIVATE bamxx smithlab_cpp + GSL::gsl ) target_include_directories(${BASE_NAME} PUBLIC ${CMAKE_BINARY_DIR} From 72f848d9c8445c7fd1a7a62242cd1f59daac5aeb Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:14:07 -0800 Subject: [PATCH 096/106] src/CMakeLists.txt: Adding all the deps dnmtools binary --- src/CMakeLists.txt | 57 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e2dbf069..360aaf4c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,12 +13,61 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see . 
-add_subdirectory(bamxx) -add_subdirectory(smithlab_cpp) +if(NOT TARGET smithlab_cpp) + add_subdirectory(bamxx) +endif() +if(NOT TARGET smithlab_cpp) + add_subdirectory(smithlab_cpp) +endif() add_subdirectory(common) add_subdirectory(radmeth) add_subdirectory(utils) add_subdirectory(analysis) add_subdirectory(amrfinder) -# add_subdirectory(abismal) -# add_subdirectory(mlml) +add_subdirectory(abismal) +add_subdirectory(mlml) + +add_executable(dnmtools dnmtools.cpp) +# ADS: below, for config.h +target_include_directories(dnmtools PUBLIC ${CMAKE_BINARY_DIR}) +target_link_libraries(dnmtools PUBLIC + dnmtools_objs + smithlab_cpp + bamxx + abismal_objs + radmeth + format-reads + uniq + bsrate + methcounts + nanopore + symmetric-cpgs + levels + hmr + hmr-rep + hypermr + methentropy + pmd + roimethstat + autocorr + cpgbins + multimethstat + mlml + methstates + allelicmeth + amrfinder + amrtester + fast-liftover + metagene + lift-filter + clean-hairpins + guessprotocol + merge-bsrate + merge-methcounts + covered + recovered + xcounts + unxcounts + selectsites + kmersites +) From e681b867a0b41cb73aba9dc753eebb18df652e91 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:14:25 -0800 Subject: [PATCH 097/106] src/mlml/mlml.cpp: linting --- src/mlml/mlml.cpp | 693 ++++++++++++++++++++++------------------------ 1 file changed, 327 insertions(+), 366 deletions(-) diff --git a/src/mlml/mlml.cpp b/src/mlml/mlml.cpp index d67e2ace..8c598416 100644 --- a/src/mlml/mlml.cpp +++ b/src/mlml/mlml.cpp @@ -14,58 +14,61 @@ * General Public License for more details. */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "MSite.hpp" +#include "dnmtools_gaussinv.hpp" #include "OptionParser.hpp" -#include "smithlab_os.hpp" -#include "MSite.hpp" -#include "dnmt_error.hpp" -#include "dnmtools_gaussinv.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -using std::string; -using std::vector; using std::cerr; using std::cout; -using std::endl; +using std::ifstream; using std::max; using std::min; -using std::round; using std::ofstream; -using std::ifstream; +using std::round; using std::runtime_error; +using std::string; +using std::vector; static void -wilson_ci_for_binomial(const double alpha, const double n, - const double p_hat, double &lower, double &upper) { - const double z = dnmt_gsl_cdf_ugaussian_Pinv(1 - alpha/2); - const double denom = 1 + z*z/n; - const double first_term = p_hat + z*z/(2*n); - const double discriminant = p_hat*(1 - p_hat)/n + z*z/(4*n*n); - lower = std::max(0.0, (first_term - z*std::sqrt(discriminant))/denom); - upper = std::min(1.0, (first_term + z*std::sqrt(discriminant))/denom); +wilson_ci_for_binomial(const double alpha, const double n, const double p_hat, + double &lower, double &upper) { + const double z = dnmt_gsl_cdf_ugaussian_Pinv(1 - alpha / 2); + const double denom = 1 + z * z / n; + const double first_term = p_hat + z * z / (2 * n); + const double discriminant = p_hat * (1 - p_hat) / n + z * z / (4 * n * n); + lower = std::max(0.0, (first_term - z * std::sqrt(discriminant)) / denom); + upper = std::min(1.0, (first_term + z * std::sqrt(discriminant)) / denom); } static int -binom_null(const double alpha, const double n, - const double p_hat, const double p) { +binom_null(const double alpha, const std::size_t n, const double p_hat, + const double p) { // ADS: function tests if final argument p is 
outside the (1-alpha) // CI for p_hat given counts n. - double lower = 0.0; - double upper = 0.0; - wilson_ci_for_binomial(alpha, n, p_hat, lower, upper); - if (p < upper && p >lower) return 0; - else return 1; + double lower{}; + double upper{}; + wilson_ci_for_binomial(alpha, static_cast(n), p_hat, lower, upper); + if (p < upper && p > lower) + return 0; + else + return 1; } /* NOTATION: @@ -89,7 +92,6 @@ binom_null(const double alpha, const double n, * g1 (j in expectation): Ts that are from 5mC in TAB-seq */ - ////////////////////////// /// All 3 input files //// ////////////////////////// @@ -98,7 +100,8 @@ static inline double lnchoose(const unsigned int n, unsigned int m) { if (m == n || m == 0) return 0; - if (m*2 > n) m = n - m; + if (m * 2 > n) + m = n - m; using std::lgamma; return lgamma(n + 1.0) - lgamma(m + 1.0) - lgamma((n - m) + 1.0); } @@ -106,42 +109,43 @@ lnchoose(const unsigned int n, unsigned int m) { static double log_L(const size_t h, const size_t g, const size_t m, const size_t l, const size_t u, const size_t t, const double p_h, const double p_m) { + double log_lkhd = + lnchoose(h + g, h) + lnchoose(m + l, m) + lnchoose(u + t, u); - double log_lkhd = lnchoose(h+g, h) + lnchoose(m+l, m) + lnchoose(u+t, u); - - if (p_h > 0) log_lkhd += h*log(p_h); - if (p_h < 1) log_lkhd += g*log(1-p_h); + if (p_h > 0) + log_lkhd += h * log(p_h); + if (p_h < 1) + log_lkhd += g * log(1 - p_h); - if (p_m > 0) log_lkhd += m*log(p_m); - if (p_m < 1) log_lkhd += l*log(1-p_m); + if (p_m > 0) + log_lkhd += m * log(p_m); + if (p_m < 1) + log_lkhd += l * log(1 - p_m); - if (p_h + p_m < 1) log_lkhd += u*log(1-p_h-p_m); - if (p_h + p_m > 0)log_lkhd += t*log(p_h+p_m); + if (p_h + p_m < 1) + log_lkhd += u * log(1 - p_h - p_m); + if (p_h + p_m > 0) + log_lkhd += t * log(p_h + p_m); return log_lkhd; } -//get start point if all 3 inputs are available +// get start point if all 3 inputs are available static void -get_start_point(const size_t t, const size_t u, - const size_t m, const size_t l, - const size_t h, const size_t g, - const double tolerance, +get_start_point(const size_t t, const size_t u, const size_t m, const size_t l, + const size_t h, const size_t g, const double tolerance, double &p_m, double &p_h) { - - p_m = static_cast(m)/(m + l + tolerance) + tolerance; - p_h = static_cast(h)/(h + g + tolerance) + tolerance; - double p_u = static_cast(u)/(t + u + tolerance) + tolerance; + p_m = static_cast(m) / (m + l + tolerance) + tolerance; + p_h = static_cast(h) / (h + g + tolerance) + tolerance; + double p_u = static_cast(u) / (t + u + tolerance) + tolerance; double sum = p_m + p_h + p_u; - p_m = p_m/sum; - p_h = p_h/sum; + p_m = p_m / sum; + p_h = p_h / sum; } static void -expectation(const size_t a, const size_t x, - const double p, const double q, +expectation(const size_t a, const size_t x, const double p, const double q, vector> &coeff) { - assert(p > 0.0 && q > 0.0); assert(p + q <= 1.0); @@ -153,61 +157,55 @@ expectation(const size_t a, const size_t x, vector a_c_j(a + 1, 0.0); for (size_t j = 0; j <= a; ++j) - a_c_j[j] = lnchoose(a, j) + log_q*(a - j) + log_p*j - log_p_q*a; + a_c_j[j] = lnchoose(a, j) + log_q * (a - j) + log_p * j - log_p_q * a; coeff = vector>(x + 1, vector(a + 1, 0.0)); for (size_t k = 0; k <= x; ++k) { - const double x_c_k = lnchoose(x, k) + log_p*k + log_1mpq*(x-k) - log_1mq*x; + const double x_c_k = + lnchoose(x, k) + log_p * k + log_1mpq * (x - k) - log_1mq * x; for (size_t j = 0; j <= a; ++j) coeff[k][j] = exp(a_c_j[j] + x_c_k); } } static double 
-maximization(const size_t x, const size_t y, - const size_t a, const size_t b, - const vector > &coeff) { +maximization(const size_t x, const size_t y, const size_t a, const size_t b, + const vector> &coeff) { double num = y, denom = y + b; for (size_t k = 0; k <= x; ++k) { auto c = begin(coeff[k]); for (size_t j = 0; j <= a; ++j) { - num += (*c)*(a - j); - denom += (*c)*(a + x - k - j); + num += (*c) * (a - j); + denom += (*c) * (a + x - k - j); ++c; } } - return num/denom; + return num / denom; } - static double -update_p_m(const size_t x, const size_t y, - const size_t z, const size_t w, +update_p_m(const size_t x, const size_t y, const size_t z, const size_t w, const size_t a, const size_t b, - const vector > &coeff) { + const vector> &coeff) { double num = z; for (size_t k = 0; k <= x; ++k) for (size_t j = 0; j <= a; ++j) - num += coeff[k][j]*(k + j); - return num/(a + b + x + y + z + w); + num += coeff[k][j] * (k + j); + return num / (a + b + x + y + z + w); } - static void -expectation_maximization(const bool DEBUG, - const size_t x, const size_t y, - const size_t z, const size_t w, - const size_t a, const size_t b, - const double tolerance, - double &p, double &q) { +expectation_maximization(const bool DEBUG, const size_t x, const size_t y, + const size_t z, const size_t w, const size_t a, + const size_t b, const double tolerance, double &p, + double &q) { constexpr auto max_iterations = 500; if (DEBUG) { - cerr << "t:" << a << ", u:" << b - << ", m:" << z << ", l:" << w - << ", h:" << y << ", g:" << x << endl - << "p:" << p << ", q:" << q << endl; + cerr << "t:" << a << ", u:" << b << ", m:" << z << ", l:" << w + << ", h:" << y << ", g:" << x << '\n' + << "p:" << p << ", q:" << q << '\n'; } uint32_t iter = 0u; @@ -222,18 +220,15 @@ expectation_maximization(const bool DEBUG, const double p_old = p; const double q_old = q; p = update_p_m(x, y, z, w, a, b, coeff); - q = M*(1 - p); - p = max(tolerance, min(p, 1-2*tolerance)); - q = max(tolerance, min(q, 1-tolerance-p)); + q = M * (1 - p); + p = max(tolerance, min(p, 1 - 2 * tolerance)); + q = max(tolerance, min(q, 1 - tolerance - p)); delta = max(fabs(p_old - p), fabs(q_old - q)); - } while (delta > tolerance && iter++ < max_iterations); if (DEBUG) { - cerr << iter << '\t' - << "p_m=" << p << '\t' - << "p_h=" << q << '\t' - << "log-likelihood=" << log_L(y,x,z,w,b,a,q, p) << endl; + cerr << iter << '\t' << "p_m=" << p << '\t' << "p_h=" << q << '\t' + << "log-likelihood=" << log_L(y, x, z, w, b, a, q, p) << '\n'; } } @@ -245,11 +240,15 @@ log_L(const size_t x, const size_t y, const size_t z, const size_t w, const double p, const double q) { assert(p + q <= 1); double log_lkhd = lnchoose(x + y, x) + lnchoose(z + w, z); - if (p > 0) log_lkhd += x * log(p); - if (p < 1) log_lkhd += y * log(1 - p); + if (p > 0) + log_lkhd += x * log(p); + if (p < 1) + log_lkhd += y * log(1 - p); - if (q > 0) log_lkhd += z * log(q); - if (q < 1) log_lkhd += w * log(1 - q); + if (q > 0) + log_lkhd += z * log(q); + if (q < 1) + log_lkhd += w * log(1 - q); return (log_lkhd); } @@ -283,7 +282,9 @@ expectation(const size_t y, const double p, const double q, static inline double maximization(const size_t x, const size_t y, const vector &coeff) { double num = x, denom = x + y; - for (size_t j = 0; j <= y; ++j) { denom -= coeff[j] * j; } + for (size_t j = 0; j <= y; ++j) { + denom -= coeff[j] * j; + } return num / denom; } @@ -292,15 +293,14 @@ update_q(const size_t x, const size_t y, const size_t z, const size_t w, const vector &coeff) { double num = z; const 
double denom = x + y + z + w; - for (size_t j = 0; j <= y; ++j) num += coeff[j] * j; + for (size_t j = 0; j <= y; ++j) + num += coeff[j] * j; return num / denom; } static void -expectation_maximization(const bool DEBUG, - const size_t x, const size_t y, - const size_t z, const size_t w, - const double tolerance, +expectation_maximization(const bool DEBUG, const size_t x, const size_t y, + const size_t z, const size_t w, const double tolerance, double &p, double &q) { size_t iter = 0; double delta = std::numeric_limits::max(); @@ -311,19 +311,16 @@ expectation_maximization(const bool DEBUG, const double M = maximization(x, y, coeff); const double p_old = p, q_old = q; q = update_q(x, y, z, w, coeff); - p = M*(1 - q); - p = max(tolerance, min(p, 1-2*tolerance)); - q = max(tolerance, min(q, 1-tolerance-p)); + p = M * (1 - q); + p = max(tolerance, min(p, 1 - 2 * tolerance)); + q = max(tolerance, min(q, 1 - tolerance - p)); delta = max(fabs(p_old - p), fabs(q_old - q)); - iter ++; - } - while (delta > tolerance && iter <= 500); + iter++; + } while (delta > tolerance && iter <= 500); if (DEBUG) { - cerr << iter << '\t' - << "p=" << p << '\t' - << "q=" << q << '\t' - <<"log-likelihood=" << log_L(x, y, z, w, p, q) << endl; + cerr << iter << '\t' << "p=" << p << '\t' << "q=" << q << '\t' + << "log-likelihood=" << log_L(x, y, z, w, p, q) << '\n'; } } @@ -333,42 +330,40 @@ check_file_non_empty(const string &filename) { } static void -process_three_types(const double alpha, - const double tolerance, - const bool FLAG, - const string &hydroxy_file, - const string &bs_seq_file, - const string &oxbs_file, - const string &outfile, +process_three_types(const double alpha, const double tolerance, const bool FLAG, + const string &hydroxy_file, const string &bs_seq_file, + const string &oxbs_file, const string &outfile, const string &out_methcount_pseudo_h, - const string &out_methcount_pseudo_m, - size_t &total_sites, - size_t &overshoot_sites, - size_t &conflict_sites) { + const string &out_methcount_pseudo_m, size_t &total_sites, + size_t &overshoot_sites, size_t &conflict_sites) { constexpr auto max_read_count = 500; if (!check_file_non_empty(hydroxy_file)) - throw dnmt_error("input file empty: " + hydroxy_file); + throw std::runtime_error("input file empty: " + hydroxy_file); ifstream h_in(hydroxy_file); - if (!h_in) throw dnmt_error("failed to open input file: " + hydroxy_file); + if (!h_in) + throw std::runtime_error("failed to open input file: " + hydroxy_file); if (!check_file_non_empty(bs_seq_file)) - throw dnmt_error("input file empty: " + bs_seq_file); + throw std::runtime_error("input file empty: " + bs_seq_file); ifstream b_in(bs_seq_file); - if (!b_in) throw dnmt_error("failed to open input file: " + bs_seq_file); + if (!b_in) + throw std::runtime_error("failed to open input file: " + bs_seq_file); if (!check_file_non_empty(oxbs_file)) - throw dnmt_error("input file empty: " + oxbs_file); + throw std::runtime_error("input file empty: " + oxbs_file); ifstream o_in(oxbs_file); - if (!o_in) throw dnmt_error("failed to open input file: " + oxbs_file); + if (!o_in) + throw std::runtime_error("failed to open input file: " + oxbs_file); ofstream of; if (!outfile.empty()) { of.open(outfile); - if (!of) throw dnmt_error("failed to open output file: " + outfile); + if (!of) + throw std::runtime_error("failed to open output file: " + outfile); } std::ostream out(outfile.empty() ? 
cout.rdbuf() : of.rdbuf()); @@ -376,182 +371,152 @@ process_three_types(const double alpha, if (!out_methcount_pseudo_m.empty()) { out_m.open(out_methcount_pseudo_m); if (!out_m) - throw dnmt_error("failed to open output file: " + out_methcount_pseudo_m); + throw std::runtime_error("failed to open output file: " + + out_methcount_pseudo_m); } if (!out_methcount_pseudo_h.empty()) { out_h.open(out_methcount_pseudo_h); if (!out_h) - throw dnmt_error("failed to open output file: " + out_methcount_pseudo_h); + throw std::runtime_error("failed to open output file: " + + out_methcount_pseudo_h); } MSite r, s, o; while (h_in >> r && b_in >> s && o_in >> o) { - - if (r.n_reads > max_read_count) r.n_reads = max_read_count; + if (r.n_reads > max_read_count) + r.n_reads = max_read_count; const size_t h = r.n_meth(); const size_t g = r.n_unmeth(); - if (s.n_reads > max_read_count) s.n_reads = max_read_count; + if (s.n_reads > max_read_count) + s.n_reads = max_read_count; const size_t t = s.n_meth(); const size_t u = s.n_unmeth(); - if (o.n_reads > max_read_count) o.n_reads = max_read_count; + if (o.n_reads > max_read_count) + o.n_reads = max_read_count; const size_t m = o.n_meth(); const size_t l = o.n_unmeth(); - assert(r.chrom == s.chrom && r.chrom == o.chrom && - r.pos == o.pos && r.pos == s.pos); + assert(r.chrom == s.chrom && r.chrom == o.chrom && r.pos == o.pos && + r.pos == s.pos); total_sites++; - double p_m, p_h, p_u; - int CONFLICT = 0, cflt_m, cflt_h, cflt_u; - double p_m_hat, p_h_hat, p_u_hat; - - size_t x = 0, y = 0, z = 0, w = 0, a = 0, b= 0; - if ((h + g > 0 && u + t > 0) || - (h + g > 0 && m + l > 0) || + if ((h + g > 0 && u + t > 0) || (h + g > 0 && m + l > 0) || (m + l > 0 && u + t > 0)) { + auto x = g; + auto y = h; + auto z = m; + auto w = l; + auto a = t; + auto b = u; - x = g; y = h; - z = m; w = l; - a = t; b = u; - - p_h_hat = static_cast(y)/(x + y); - p_m_hat = static_cast(z)/(z + w); - p_u_hat = static_cast(b)/(a + b); + auto p_h_hat = static_cast(y) / (x + y); + auto p_m_hat = static_cast(z) / (z + w); + auto p_u_hat = static_cast(b) / (a + b); // use frequent method result if no overshoot if (p_h_hat + p_m_hat + p_u_hat == 1.0) { - out << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_m_hat << '\t' - << p_h_hat << '\t' - << p_u_hat << '\t' - << 0 << endl; + out << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' << r.context + << '\t' << p_m_hat << '\t' << p_h_hat << '\t' << p_u_hat << '\t' + << 0 << '\n'; // write out pseudo methcount files for mC and hmC if (!out_methcount_pseudo_m.empty()) - out_m << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_m_hat << '\t' - << (a + b + x + y + z + w) << endl; + out_m << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << p_m_hat << '\t' + << (a + b + x + y + z + w) << '\n'; if (!out_methcount_pseudo_h.empty()) - out_h << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_h_hat << '\t' - << (a + b + x + y + z + w) << endl; + out_h << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << p_h_hat << '\t' + << (a + b + x + y + z + w) << '\n'; } else { overshoot_sites++; + double p_m{}; + double p_h{}; + double p_u{}; get_start_point(t, u, m, l, h, g, tolerance, p_m, p_h); expectation_maximization(false, x, y, z, w, a, b, tolerance, p_m, p_h); p_u = 1 - p_m - p_h; - if (p_h <= 2.0*tolerance) p_h = 0.0; - if (p_m <= 2.0*tolerance) p_m = 0.0; - if (p_u <= 2.0*tolerance) 
p_u = 0.0; - if (p_m >= 1.0-2.0*tolerance) p_m = 1.0; - if (p_h >= 1.0-2.0*tolerance) p_h = 1.0; - if (p_u >= 1.0-2.0*tolerance) p_u = 1.0; - + if (p_h <= 2.0 * tolerance) + p_h = 0.0; + if (p_m <= 2.0 * tolerance) + p_m = 0.0; + if (p_u <= 2.0 * tolerance) + p_u = 0.0; + if (p_m >= 1.0 - 2.0 * tolerance) + p_m = 1.0; + if (p_h >= 1.0 - 2.0 * tolerance) + p_h = 1.0; + if (p_u >= 1.0 - 2.0 * tolerance) + p_u = 1.0; + + int CONFLICT{}; if (p_h_hat + p_m_hat + p_u_hat != 1 && FLAG) { - cflt_h = binom_null(alpha, static_cast(x+y), p_h_hat, p_h); - cflt_m = binom_null(alpha, static_cast(z+w), p_m_hat, p_m); - cflt_u = binom_null(alpha, static_cast(a+b), p_u_hat, p_u); + auto cflt_h = binom_null(alpha, x + y, p_h_hat, p_h); + auto cflt_m = binom_null(alpha, z + w, p_m_hat, p_m); + auto cflt_u = binom_null(alpha, a + b, p_u_hat, p_u); CONFLICT = cflt_m + cflt_u + cflt_h; } - out << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_m << '\t' - << p_h << '\t' - << p_u << '\t' - << CONFLICT << endl; + out << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' << r.context + << '\t' << p_m << '\t' << p_h << '\t' << p_u << '\t' << CONFLICT + << '\n'; if (CONFLICT > 1) conflict_sites++; // write out pseudo methcount files for mC and hmC if (!out_methcount_pseudo_m.empty()) - out_m << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_m << '\t' - << (a + b + x + y + z + w) << endl; + out_m << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << p_m << '\t' << (a + b + x + y + z + w) + << '\n'; if (!out_methcount_pseudo_h.empty()) - out_h << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << p_h << '\t' - << (a + b +x + y + z + w) << endl; + out_h << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << p_h << '\t' << (a + b + x + y + z + w) + << '\n'; } } - else { //observation from only one experiment - out << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << "NA" << '\t' - << "NA" << '\t' - << "NA" << '\t' - << "NA" << endl; + else { // observation from only one experiment + out << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' << r.context + << '\t' << "NA" << '\t' << "NA" << '\t' << "NA" << '\t' << "NA\n"; // write out pseudo methcount files for mC and hmC if (!out_methcount_pseudo_m.empty()) { int coverage = m + l; - const double level = coverage > 0 ? static_cast(m)/coverage : 0.0; - out_m << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << level << "\t" << coverage << endl; + const double level = + coverage > 0 ? static_cast(m) / coverage : 0.0; + out_m << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << level << "\t" << coverage << '\n'; } if (!out_methcount_pseudo_h.empty()) { const int coverage = h + g; - const double level = coverage > 0 ? - static_cast(h)/coverage : 0.0; - out_h << r.chrom << '\t' - << r.pos << '\t' - << r.strand << '\t' - << r.context << '\t' - << level << '\t' - << coverage << endl; + const double level = + coverage > 0 ? 
static_cast(h) / coverage : 0.0; + out_h << r.chrom << '\t' << r.pos << '\t' << r.strand << '\t' + << r.context << '\t' << level << '\t' << coverage << '\n'; } } } } static void -process_two_types(const double alpha, - const double tolerance, - const bool FLAG, - const string &hydroxy_file, - const string &bs_seq_file, - const string &oxbs_file, - const string &outfile, +process_two_types(const double alpha, const double tolerance, const bool FLAG, + const string &hydroxy_file, const string &bs_seq_file, + const string &oxbs_file, const string &outfile, const string &out_methcount_pseudo_h, - const string &out_methcount_pseudo_m, - size_t &total_sites, - size_t &overshoot_sites, - size_t &conflict_sites) { - + const string &out_methcount_pseudo_m, size_t &total_sites, + size_t &overshoot_sites, size_t &conflict_sites) { constexpr auto max_read_count = 500u; ofstream of; if (!outfile.empty()) { of.open(outfile); - if (!of) throw dnmt_error("failed to open output file: " + outfile); + if (!of) + throw std::runtime_error("failed to open output file: " + outfile); } std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); @@ -559,12 +524,14 @@ process_two_types(const double alpha, if (!out_methcount_pseudo_m.empty()) { out_m.open(out_methcount_pseudo_m); if (!out_m) - throw dnmt_error("failed to open output file: " + out_methcount_pseudo_m); + throw std::runtime_error("failed to open output file: " + + out_methcount_pseudo_m); } if (!out_methcount_pseudo_h.empty()) { out_h.open(out_methcount_pseudo_h); if (!out_h) - throw dnmt_error("failed to open output file: " + out_methcount_pseudo_h); + throw std::runtime_error("failed to open output file: " + + out_methcount_pseudo_h); } std::ifstream f_in, s_in; @@ -572,48 +539,57 @@ process_two_types(const double alpha, if (oxbs_file.empty()) { s_rev = true; f_in.open(hydroxy_file); - if (!f_in) throw dnmt_error("failed to open file: " + hydroxy_file); + if (!f_in) + throw std::runtime_error("failed to open file: " + hydroxy_file); s_in.open(bs_seq_file); - if (!s_in) throw dnmt_error("failed to open file: " + bs_seq_file); + if (!s_in) + throw std::runtime_error("failed to open file: " + bs_seq_file); } else if (hydroxy_file.empty()) { f_rev = true; f_in.open(bs_seq_file); - if (!f_in) throw dnmt_error("failed to open file: " + bs_seq_file); + if (!f_in) + throw std::runtime_error("failed to open file: " + bs_seq_file); s_in.open(oxbs_file); - if (!s_in) throw dnmt_error("failed to open file: " + oxbs_file); + if (!s_in) + throw std::runtime_error("failed to open file: " + oxbs_file); } else { f_in.open(oxbs_file); - if (!f_in) throw dnmt_error("failed to open file: " + oxbs_file); + if (!f_in) + throw std::runtime_error("failed to open file: " + oxbs_file); s_in.open(hydroxy_file); - if (!s_in) throw dnmt_error("failed to open file: " + hydroxy_file); + if (!s_in) + throw std::runtime_error("failed to open file: " + hydroxy_file); } - MSite f, s; while (f_in >> f && s_in >> s) { - if (f.chrom != s.chrom || f.pos != s.pos) - throw dnmt_error("error: sites not synchronized between files"); + throw std::runtime_error("error: sites not synchronized between files"); - if (f.n_reads > max_read_count) f.n_reads = max_read_count; + if (f.n_reads > max_read_count) + f.n_reads = max_read_count; size_t x = f.n_meth(); size_t y = f.n_unmeth(); - if (f_rev) std::swap(x, y); + if (f_rev) + std::swap(x, y); - if (s.n_reads > max_read_count) s.n_reads = max_read_count; + if (s.n_reads > max_read_count) + s.n_reads = max_read_count; size_t z = s.n_meth(); 
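// ---------------------------------------------------------------------------
// Editor's illustration, not part of the patch: the loop below first tries the
// naive frequentist estimates from the two inputs and only falls back to EM
// when those estimates "overshoot" (sum above 1). A minimal sketch of that
// decision under the same assumption as the surrounding code (both inputs have
// non-zero coverage); the function name is hypothetical and (x, y), (z, w) are
// the meth/unmeth counts from the two experiments.
static bool naive_estimates_ok(const unsigned x, const unsigned y,
                               const unsigned z, const unsigned w,
                               double &p, double &q) {
  p = static_cast<double>(x) / (x + y);  // proportion from the first input
  q = static_cast<double>(z) / (z + w);  // proportion from the second input
  // consistent if p + q <= 1: report p, q, and 1 - p - q directly;
  // otherwise the caller re-estimates p and q by EM under the constraint.
  return p + q <= 1.0;
}
// ---------------------------------------------------------------------------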
size_t w = s.n_unmeth();
-    if (s_rev) std::swap(z, w);
+    if (s_rev)
+      std::swap(z, w);
     total_sites++;
-    double p = 0.0, q = 0.0, r = 0.0;
-    int CONFLICT = 0, cflt1, cflt2;
     if (x + y > 0 && z + w > 0) {
-      if (x/static_cast<double>(x + y) + z/static_cast<double>(z + w) <= 1.0) {
-        p = x/static_cast<double>(x + y);
-        q = z/static_cast<double>(z + w);
+      int CONFLICT{};
+      double p = 0.0, q = 0.0, r = 0.0;
+      if (x / static_cast<double>(x + y) + z / static_cast<double>(z + w) <=
+          1.0) {
+        p = x / static_cast<double>(x + y);
+        q = z / static_cast<double>(z + w);
         r = 1.0 - p - q;
       }
       else {
@@ -622,60 +598,57 @@ process_two_types(const double alpha,
         expectation_maximization(false, x, y, z, w, tolerance, p, q);
         r = 1.0 - p - q;

-        if (p <= 2.0*tolerance) p = 0.0;
-        if (q <= 2.0*tolerance) q = 0.0;
-        if (r <= 2.0*tolerance) r = 0.0;
+        if (p <= 2.0 * tolerance)
+          p = 0.0;
+        if (q <= 2.0 * tolerance)
+          q = 0.0;
+        if (r <= 2.0 * tolerance)
+          r = 0.0;

-        if (p >= 1.0 - 2.0*tolerance) p = 1.0;
-        if (q >= 1.0 - 2.0*tolerance) q = 1.0;
-        if (r >= 1.0 - 2.0*tolerance) r = 1.0;
+        if (p >= 1.0 - 2.0 * tolerance)
+          p = 1.0;
+        if (q >= 1.0 - 2.0 * tolerance)
+          q = 1.0;
+        if (r >= 1.0 - 2.0 * tolerance)
+          r = 1.0;

         if (FLAG) {
-          const double p_hat1 = static_cast<double>(x)/(x+y);
-          cflt1 = binom_null(alpha, static_cast<double>(x+y), p_hat1, p);
-          const double p_hat2 = static_cast<double>(z)/(z+w);
-          cflt2 = binom_null(alpha, static_cast<double>(z+w), p_hat2, q);
+          const double p_hat1 = static_cast<double>(x) / (x + y);
+          auto cflt1 = binom_null(alpha, x + y, p_hat1, p);
+          const double p_hat2 = static_cast<double>(z) / (z + w);
+          auto cflt2 = binom_null(alpha, z + w, p_hat2, q);
           CONFLICT = cflt1 + cflt2;
         }
       }
-      out << f.chrom << '\t'
-          << f.pos << '\t'
-          << f.strand << '\t'
-          << f.context << '\t';
+      out << f.chrom << '\t' << f.pos << '\t' << f.strand << '\t' << f.context
+          << '\t';
       if (oxbs_file.empty())
-        out << r << '\t'
-            << p << '\t'
-            << q << '\t';
+        out << r << '\t' << p << '\t' << q << '\t';
       else if (hydroxy_file.empty())
-        out << q << '\t'
-            << r << '\t'
-            << p << '\t';
+        out << q << '\t' << r << '\t' << p << '\t';
       else
-        out << p << '\t'
-            << q << '\t'
-            << r << '\t';
-      out << CONFLICT << endl;
+        out << p << '\t' << q << '\t' << r << '\t';
+      out << CONFLICT << '\n';

       if (CONFLICT > 1)
         conflict_sites++;
       // write out pseudo methcount files for mC and hmC
       if (!out_methcount_pseudo_h.empty()) {
-        out_h << f.chrom << '\t'
-              << f.pos << '\t'
-              << f.strand << '\t'
+        out_h << f.chrom << '\t' << f.pos << '\t' << f.strand << '\t'
               << f.context << '\t';
-        if (oxbs_file.empty()) out_h << p;
-        else if (hydroxy_file.empty()) out_h << r;
-        else out_h << q;
-        out_h << '\t' << x + y + z+ w << endl;
+        if (oxbs_file.empty())
+          out_h << p;
+        else if (hydroxy_file.empty())
+          out_h << r;
+        else
+          out_h << q;
+        out_h << '\t' << x + y + z + w << '\n';
       }
       if (!out_methcount_pseudo_m.empty()) {
-        out_m << f.chrom << '\t'
-              << f.pos << '\t'
-              << f.strand << '\t'
+        out_m << f.chrom << '\t' << f.pos << '\t' << f.strand << '\t'
               << f.context << '\t';
         if (oxbs_file.empty())
           out_m << r;
@@ -683,16 +656,12 @@ process_two_types(const double alpha,
           out_m << q;
         else
           out_m << p;
-        out_m << '\t' << (x + y + z + w) << endl;
+        out_m << '\t' << (x + y + z + w) << '\n';
       }
     }
-    else { // at most one input file has non-zero coverage
-
-      out << f.chrom << '\t'
-          << f.pos << '\t'
-          << f.strand << '\t'
-          << f.context << '\t'
-          << "NA\tNA\tNA\tNA" << endl;
+    else {  // at most one input file has non-zero coverage
+      out << f.chrom << '\t' << f.pos << '\t' << f.strand << '\t' << f.context
+          << '\t' << "NA\tNA\tNA\tNA\n";

       // write out pseudo methcount files for mC and hmC
       if
(!out_methcount_pseudo_h.empty()) {
@@ -700,18 +669,14 @@ process_two_types(const double alpha,
         double level = 0.0;
         if (oxbs_file.empty() && x + y > 0) {
           coverage = x + y;
-          level = static_cast<double>(x)/coverage;
+          level = static_cast<double>(x) / coverage;
         }
-        else if (bs_seq_file.empty() && z+w > 0) {
+        else if (bs_seq_file.empty() && z + w > 0) {
           coverage = z + w;
-          level = static_cast<double>(z)/coverage;
+          level = static_cast<double>(z) / coverage;
         }
-        out_h << s.chrom << '\t'
-              << s.pos << '\t'
-              << s.strand << '\t'
-              << s.context << '\t'
-              << level << '\t'
-              << coverage << endl;
+        out_h << s.chrom << '\t' << s.pos << '\t' << s.strand << '\t'
+              << s.context << '\t' << level << '\t' << coverage << '\n';
       }

       if (!out_methcount_pseudo_m.empty()) {
@@ -719,28 +684,22 @@ process_two_types(const double alpha,
         double level = 0.0;
         if (bs_seq_file.empty() && x + y > 0) {
           coverage = x + y;
-          level = static_cast<double>(x)/coverage;
+          level = static_cast<double>(x) / coverage;
         }
         else if (hydroxy_file.empty() && z + w > 0) {
           coverage = z + w;
-          level = static_cast<double>(z)/coverage;
+          level = static_cast<double>(z) / coverage;
         }
-        out_m << s.chrom << '\t'
-              << s.pos << '\t'
-              << s.strand << '\t'
-              << s.context << '\t'
-              << level << '\t'
-              << coverage << endl;
+        out_m << s.chrom << '\t' << s.pos << '\t' << s.strand << '\t'
+              << s.context << '\t' << level << '\t' << coverage << '\n';
       }
     }
   }
 }

 int
-main_mlml(int argc, char *argv[]) {
-
+main_mlml(int argc, char *argv[]) {  // NOLINT(*-avoid-c-arrays)
   try {
-
     bool VERBOSE = false;
     bool FLAG = true;
     string oxbs_file;
@@ -753,97 +712,99 @@ main_mlml(int argc, char *argv[]) {
     static double tolerance = 1e-10;

     /****************** COMMAND LINE OPTIONS ********************/
-    OptionParser opt_parse(strip_path(argv[0]), "program to estimate "
-                           "methylation levels", "at least two input files");
-    opt_parse.add_opt("output", 'o', "output file (default: stdout)",
-                      false, outfile);
-    opt_parse.add_opt("bsseq", 'u', "input BS-seq methcounts file",
-                      false , bs_seq_file);
-    opt_parse.add_opt("tabseq", 'h', "input TAB-seq methcounts file",
-                      false , hydroxy_file);
-    opt_parse.add_opt("oxbsseq", 'm', "input oxBS-seq methcounts file",
-                      false , oxbs_file);
-    opt_parse.add_opt("tolerance", 't', "EM convergence threshold",
-                      false , tolerance);
+    OptionParser opt_parse(argv[0],  // NOLINT(*-pointer-arithmetic)
+                           "program to estimate "
+                           "methylation levels",
+                           "at least two input files");
+    opt_parse.add_opt("output", 'o', "output file (default: stdout)", false,
+                      outfile);
+    opt_parse.add_opt("bsseq", 'u', "input BS-seq methcounts file", false,
+                      bs_seq_file);
+    opt_parse.add_opt("tabseq", 'h', "input TAB-seq methcounts file", false,
+                      hydroxy_file);
+    opt_parse.add_opt("oxbsseq", 'm', "input oxBS-seq methcounts file", false,
+                      oxbs_file);
+    opt_parse.add_opt("tolerance", 't', "EM convergence threshold", false,
+                      tolerance);
     opt_parse.add_opt("alpha", 'a',
                       "significance level of binomial test for each site",
                       false, alpha);
     opt_parse.add_opt("outh", 'H',
-                      "hmC pseudo methcount output file (default: null)",
-                      false, out_methcount_pseudo_h);
+                      "hmC pseudo methcount output file (default: null)", false,
+                      out_methcount_pseudo_h);
     opt_parse.add_opt("outm", 'M',
-                      "mC pseudo methcount output file (default: null)",
-                      false, out_methcount_pseudo_m);
+                      "mC pseudo methcount output file (default: null)", false,
+                      out_methcount_pseudo_m);
     opt_parse.add_opt("verbose", 'v', "print run statistics", false, VERBOSE);
     opt_parse.set_show_defaults();
     vector<string> leftover_args;
     opt_parse.parse(argc, argv, leftover_args);
     if (argc == 1 ||
opt_parse.help_requested()) { - cerr << opt_parse.help_message() << endl - << opt_parse.about_message() << endl; + cerr << opt_parse.help_message() << '\n' + << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << opt_parse.about_message() << endl; + cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << opt_parse.option_missing_message() << endl; + cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (leftover_args.size() >0) { - cerr << opt_parse.help_message() << endl; + if (leftover_args.size() > 0) { + cerr << opt_parse.help_message() << '\n'; return EXIT_SUCCESS; } if (alpha <= 0.0 || alpha >= 1.0) { - cerr << "Please specify a value in (0, 1) for -a option." << endl; + cerr << "Please specify a value in (0, 1) for -a option.\n"; return EXIT_SUCCESS; } if ((oxbs_file.empty() && hydroxy_file.empty()) || (oxbs_file.empty() && bs_seq_file.empty()) || (bs_seq_file.empty() && hydroxy_file.empty())) { - cerr << "Please specify at least 2 bed files as input" << endl; + cerr << "Please specify at least 2 bed files as input\n"; return EXIT_SUCCESS; } tolerance = max(1e-15, min(tolerance, 0.1)); /****************** END COMMAND LINE OPTIONS *****************/ if (VERBOSE) - cerr << "output columns:" << endl - << "1. chrom" << endl - << "2. position" << endl - << "3. strand" << endl - << "4. label" << endl - << "5. probability of 5mC" << endl - << "6. probability of 5hmC" << endl - << "7. probability of neither" << endl - << "8. number of conflicting sites" << endl; + cerr << "output columns:\n" + << "1. chrom\n" + << "2. position\n" + << "3. strand\n" + << "4. label\n" + << "5. probability of 5mC\n" + << "6. probability of 5hmC\n" + << "7. probability of neither\n" + << "8. 
number of conflicting sites\n"; size_t total_sites = 0; size_t overshoot_sites = 0; size_t conflict_sites = 0; if (!hydroxy_file.empty() && !bs_seq_file.empty() && !oxbs_file.empty()) - process_three_types(alpha, tolerance, FLAG, - hydroxy_file, bs_seq_file, oxbs_file, outfile, - out_methcount_pseudo_h, out_methcount_pseudo_m, - total_sites, overshoot_sites, conflict_sites); + process_three_types(alpha, tolerance, FLAG, hydroxy_file, bs_seq_file, + oxbs_file, outfile, out_methcount_pseudo_h, + out_methcount_pseudo_m, total_sites, overshoot_sites, + conflict_sites); else - process_two_types(alpha, tolerance, FLAG, - hydroxy_file, bs_seq_file, oxbs_file, outfile, - out_methcount_pseudo_h, out_methcount_pseudo_m, - total_sites, overshoot_sites, conflict_sites); + process_two_types(alpha, tolerance, FLAG, hydroxy_file, bs_seq_file, + oxbs_file, outfile, out_methcount_pseudo_h, + out_methcount_pseudo_m, total_sites, overshoot_sites, + conflict_sites); if (VERBOSE) - cerr << "total sites: " << total_sites << endl - << "sites with overshoot: " << overshoot_sites - << " (" << 1.0*overshoot_sites/total_sites*100 << "%)" << endl + cerr << "total sites: " << total_sites << '\n' + << "sites with overshoot: " << overshoot_sites << " (" + << 1.0 * overshoot_sites / total_sites * 100 << "%)\n" << "sites conflicting to at least two input: " << conflict_sites - << " (" << 1.0*conflict_sites/total_sites*100 << "%)" << endl; + << " (" << 1.0 * conflict_sites / total_sites * 100 << "%)\n"; } catch (const std::exception &e) { - cerr << e.what() << endl; + cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 3808dcccf69de382c94ac050bb7ee61f2f80b93e Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:19:55 -0800 Subject: [PATCH 098/106] src/mlml/mlml.cpp: more linting --- src/mlml/mlml.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/mlml/mlml.cpp b/src/mlml/mlml.cpp index 8c598416..ab34e35a 100644 --- a/src/mlml/mlml.cpp +++ b/src/mlml/mlml.cpp @@ -46,6 +46,8 @@ using std::runtime_error; using std::string; using std::vector; +// NOLINTBEGIN(*-narrowing-conversions,*-avoid-magic-numbers) + static void wilson_ci_for_binomial(const double alpha, const double n, const double p_hat, double &lower, double &upper) { @@ -210,9 +212,9 @@ expectation_maximization(const bool DEBUG, const size_t x, const size_t y, uint32_t iter = 0u; double delta = std::numeric_limits::max(); + // NOLINTBEGIN(*-avoid-do-while) do { vector> coeff; - assert(p > 0 && q > 0); expectation(a, x, p, q, coeff); @@ -225,6 +227,7 @@ expectation_maximization(const bool DEBUG, const size_t x, const size_t y, q = max(tolerance, min(q, 1 - tolerance - p)); delta = max(fabs(p_old - p), fabs(q_old - q)); } while (delta > tolerance && iter++ < max_iterations); + // NOLINTEND(*-avoid-do-while) if (DEBUG) { cerr << iter << '\t' << "p_m=" << p << '\t' << "p_h=" << q << '\t' @@ -302,9 +305,9 @@ static void expectation_maximization(const bool DEBUG, const size_t x, const size_t y, const size_t z, const size_t w, const double tolerance, double &p, double &q) { - size_t iter = 0; + size_t iter{}; double delta = std::numeric_limits::max(); - + // NOLINTBEGIN(*-avoid-do-while) do { vector coeff; expectation(y, p, q, coeff); @@ -317,6 +320,7 @@ expectation_maximization(const bool DEBUG, const size_t x, const size_t y, delta = max(fabs(p_old - p), fabs(q_old - q)); iter++; } while (delta > tolerance && iter <= 500); + // NOLINTEND(*-avoid-do-while) if (DEBUG) { cerr << iter << '\t' << 
"p=" << p << '\t' << "q=" << q << '\t' @@ -809,3 +813,5 @@ main_mlml(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) } return EXIT_SUCCESS; } + +// NOLINTEND(*-narrowing-conversions,*-avoid-magic-numbers) From 3b6569a0491afc955c90b5f8f3e186d03faea20a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:26:16 -0800 Subject: [PATCH 099/106] src/dnmtools.cpp: more linting --- src/dnmtools.cpp | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/dnmtools.cpp b/src/dnmtools.cpp index 137d1824..48d14da3 100644 --- a/src/dnmtools.cpp +++ b/src/dnmtools.cpp @@ -16,12 +16,14 @@ #include #include +#include +#include #include #include #include #include -#include #include +#include #include #ifdef INCLUDE_FULL_LICENSE_INFO @@ -36,7 +38,9 @@ using std::string; using std::to_string; using std::vector; -static const string PROGRAM_NAME = "dnmtools"; +static constexpr auto PROGRAM_NAME = "dnmtools"; + +// NOLINTBEGIN(*-avoid-c-arrays) struct dnmtools_command { string tag; @@ -60,6 +64,7 @@ operator<<(std::ostream &out, const dnmtools_command &cmd) -> std::ostream & { // ADS: not sure of best way to acquire these below beyond simply // declaring them here + int abismal(int argc, char *argv[]); int @@ -146,6 +151,7 @@ int main_recovered(int argc, char *argv[]); int kmersites(int argc, char *argv[]); +// NOLINTEND(*-avoid-c-arrays) void print_help( @@ -156,20 +162,20 @@ print_help( << "License: use --license for full license info\n" #endif << "Usage: " << PROGRAM_NAME << " [options]\n" - << "Commands:" << endl; - for (auto &&g : command_groups) { - cout << " " << g.first << ":" << endl; - for (auto &&c : g.second) - cout << c << endl; - cout << endl; + << "Commands:\n"; + for (const auto &g : command_groups) { + cout << " " << g.first << ":\n"; + for (const auto &c : g.second) + cout << c << '\n'; + cout << '\n'; } } int -main(int argc, char *argv[]) { +main(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) try { + // clang-format off vector>> command_groups = { - // clang-format off {{"mapping", {{{"abismal", "map FASTQ reads to a FASTA reference genome or an index", abismal}, {"abismalidx", "convert a FASTA reference genome to an abismal index", abismalidx}, @@ -243,20 +249,22 @@ main(int argc, char *argv[]) { #endif const auto has_tag = [&](const dnmtools_command &a) { - return a.tag == argv[1]; + return a.tag == argv[1]; // NOLINT(*-avoid-c-arrays,*-pointer-arithmetic) }; - for (auto &g : command_groups) { + for (const auto &g : command_groups) { const auto the_cmd = std::find_if(std::cbegin(g.second), std::cend(g.second), has_tag); if (the_cmd != std::cend(g.second)) return (*the_cmd)(argc, argv); } - std::cerr << "ERROR: invalid command " << argv[1] << std::endl; + std::cerr << "ERROR: invalid command " + << argv[1] // NOLINT(*-avoid-c-arrays,*-pointer-arithmetic) + << '\n'; } catch (const std::exception &e) { - std::cerr << "ERROR:\t" << e.what() << endl; + std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; From 792757c3d81be8ea7e21ab1b60e7a7a768509237 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:43:08 -0800 Subject: [PATCH 100/106] src/amrfinder/amrfinder.cpp: removing dead code --- src/amrfinder/amrfinder.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/amrfinder/amrfinder.cpp b/src/amrfinder/amrfinder.cpp index e5b8ac3c..2492792e 100644 --- a/src/amrfinder/amrfinder.cpp +++ b/src/amrfinder/amrfinder.cpp @@ -335,9 +335,6 @@ process_chrom(const bool 
verbose, const std::uint32_t n_threads, const auto max_epiread_len = std::accumulate( std::cbegin(epireads), std::cend(epireads), 0ul, [](const auto a, const auto &e) { return std::max(a, std::size(e)); }); - // auto max_epiread_len = 0u; - // for (auto &e : epireads) - // max_epiread_len = std::max(max_epiread_len, e.length()); const std::size_t min_obs_per_window = window_size * min_obs_per_cpg; const std::uint32_t n_cpgs = get_n_cpgs(epireads); From 04cf61bde1b0b37e3ee5ee0d9d66fbe2e1377bc7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:43:33 -0800 Subject: [PATCH 101/106] src/common/EpireadStats.cpp: removing commented deadcode --- src/common/EpireadStats.cpp | 44 ++++--------------------------------- 1 file changed, 4 insertions(+), 40 deletions(-) diff --git a/src/common/EpireadStats.cpp b/src/common/EpireadStats.cpp index 759e6c4c..a1900a37 100644 --- a/src/common/EpireadStats.cpp +++ b/src/common/EpireadStats.cpp @@ -41,11 +41,9 @@ static const double PSEUDOCOUNT = 1e-10; static inline uint32_t adjust_read_offsets(vector &reads) { - auto first_read_offset = std::accumulate( + const auto first_read_offset = std::accumulate( std::cbegin(reads), std::cend(reads), num_lim::max(), [](const std::uint32_t a, const auto &r) { return std::min(a, r.pos); }); - // for (const auto &r : reads) - // first_read_offset = min(r.pos, first_read_offset); for (auto &r : reads) r.pos -= first_read_offset; return first_read_offset; @@ -56,28 +54,15 @@ get_n_cpgs(const vector &reads) { return std::accumulate( std::cbegin(reads), std::cend(reads), 0u, [](const std::uint32_t a, const auto &r) { return std::max(a, r.end()); }); - // auto n_cpgs = 0u; - // for (const auto &r : reads) - // n_cpgs = std::max(n_cpgs, r.end()); - // return n_cpgs; } -// static inline bool -// is_meth(const epi_r &r, const uint32_t pos) {return (r.seq[pos] == 'C');} - -// static inline bool -// un_meth(const epi_r &r, const uint32_t pos) {return (r.seq[pos] == 'T');} - -inline double +static inline double log_likelihood(const epi_r &r, const vector &a) { double ll = 0.0; for (size_t i = 0; i < r.seq.length(); ++i) - // if (is_meth(r.seq[i]) || un_meth(r.seq[i])) { - if (r.seq[i] == 'C' || r.seq[i] == 'T') { - // const double val = (is_meth(r, i) ? a[r.pos + i] : (1.0 - a[r.pos + - // i])); assert(isfinite(log(val))); + if (r.seq[i] == 'C' || r.seq[i] == 'T') ll += log(r.seq[i] == 'C' ? a[r.pos + i] : (1.0 - a[r.pos + i])); - } + assert(std::isfinite(ll)); return ll; } @@ -156,23 +141,6 @@ expectation_step(const vector &reads, const double mixing, return score; } -// template void -// fit_epiallele(const double pseudo, const vector &reads, -// vector::const_iterator indic_itr, vector &a) { -// vector total(a.size(), 2 * pseudo); -// fill_n(begin(a), size(a), pseudo); -// for (auto &r : reads) { -// const double weight = inverse ? 
1.0 - *indic_itr++ : *indic_itr++; -// auto m_itr = begin(a) + r.pos; -// auto t_itr = begin(total) + r.pos; -// for (auto s : r.seq) { -// *m_itr++ += weight * (s == 'C'); -// *t_itr++ += weight * (s != 'N'); -// } -// } -// for (uint32_t i = 0; i < n_cpgs; ++i) a[i] /= total[i]; -// } - template void fit_epiallele(const double pseudo, const vector &reads, @@ -225,15 +193,11 @@ rescale_indicators( const double adjustment = mixing / ratio; std::transform(std::cbegin(indic), std::cend(indic), std::begin(indic), [&](const auto x) { return x * adjustment; }); - // for (auto &i : indic) - // i *= adjustment; } else { const double adjustment = mixing / (1.0 - ratio); std::transform(std::cbegin(indic), std::cend(indic), std::begin(indic), [&](const auto x) { return 1.0 - (1.0 - x) * adjustment; }); - // for (auto &i : indic) - // i = 1.0 - (1.0 - i) * adjustment; } } From 4d9fce3bc58304fbe6fbbec6fd53a8358bdcb508 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:43:49 -0800 Subject: [PATCH 102/106] src/radmeth/radmeth_design.cpp: removing commented deadcode --- src/radmeth/radmeth_design.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/radmeth/radmeth_design.cpp b/src/radmeth/radmeth_design.cpp index 3c849727..7729ebfd 100644 --- a/src/radmeth/radmeth_design.cpp +++ b/src/radmeth/radmeth_design.cpp @@ -257,8 +257,4 @@ Design::has_two_values(const std::size_t test_factor) const { const auto &tcol = tmatrix[test_factor]; return std::any_of(std::cbegin(tcol), std::cend(tcol), [&](const auto x) { return x != tcol[0]; }); - // for (const auto x : tcol) - // if (x != tcol[0]) - // return true; - // return false; } From e2109544707f1bfc2058ddb80d57002aaab42459 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:44:14 -0800 Subject: [PATCH 103/106] src/utils/merge-methcounts.cpp: removing commented deadcode --- src/utils/merge-methcounts.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/utils/merge-methcounts.cpp b/src/utils/merge-methcounts.cpp index 1f1e7bec..c3d8ce58 100644 --- a/src/utils/merge-methcounts.cpp +++ b/src/utils/merge-methcounts.cpp @@ -181,10 +181,6 @@ write_line_for_tabular(std::array &buffer, bytes_left -= n_bytes; } - // out << min_site.chrom << ':' << min_site.pos << ':' << min_site.strand << - // ':' - // << min_site.context; - if (write_fractional) { for (std::size_t i = 0; i < n_files; ++i) { const std::size_t r = sites[i].n_reads; @@ -211,10 +207,6 @@ write_line_for_tabular(std::array &buffer, throw std::runtime_error("failed to write output line"); cursor += n_bytes; bytes_left -= n_bytes; - // if (to_print[i]) - // out << '\t' << sites[i].n_reads << '\t' << sites[i].n_meth(); - // else - // out << '\t' << 0 << '\t' << 0; } if (std::distance(buffer.data(), cursor) + 1 < static_cast(buffer_size)) @@ -223,8 +215,7 @@ write_line_for_tabular(std::array &buffer, if (std::distance(buffer.data(), cursor) < static_cast(buffer_size)) { *cursor++ = '\0'; - out.write(buffer.data(), - std::distance(buffer.data(), cursor)); // "\n"); // out << '\n'; + out.write(buffer.data(), std::distance(buffer.data(), cursor)); } } From 4c6ae5dee7c673efac68f15c6dc13f9eafa4d7e7 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Mon, 24 Nov 2025 19:44:32 -0800 Subject: [PATCH 104/106] src/radmeth/CMakeLists.txt: removing commented deadcode --- src/radmeth/CMakeLists.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/radmeth/CMakeLists.txt b/src/radmeth/CMakeLists.txt index 99747442..553a8c35 100644 
--- a/src/radmeth/CMakeLists.txt
+++ b/src/radmeth/CMakeLists.txt
@@ -38,13 +38,3 @@ target_link_libraries(radmeth PUBLIC
   smithlab_cpp
   HTSLIB::HTSLIB
 )
-# target_include_directories(radmeth PUBLIC ${CMAKE_BINARY_DIR})
-
-# add_executable(abismal abismal_main.cpp)
-# # ADS: below, for config.h
-# target_include_directories(abismal PUBLIC ${CMAKE_BINARY_DIR})
-# target_link_libraries(abismal PUBLIC
-#   abismal_objs
-#   smithlab_cpp
-#   HTSLIB::HTSLIB
-# )

From ac787cea9917891852bfaf4de9cc59279f21deb7 Mon Sep 17 00:00:00 2001
From: Andrew D Smith
Date: Mon, 24 Nov 2025 19:44:44 -0800
Subject: [PATCH 105/106] src/analysis/pmd.cpp: removing commented deadcode

---
 src/analysis/pmd.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp
index 1c237b3f..e0756b80 100644
--- a/src/analysis/pmd.cpp
+++ b/src/analysis/pmd.cpp
@@ -1181,10 +1181,6 @@ main_pmd(int argc, char *argv[]) {  // NOLINT(*-avoid-c-arrays)
       if (!is_msite_file(x))
         throw runtime_error("malformed counts file: " + x);
     });
-    // return !is_msite_file(x); }))
-    // // for (const auto &filename : cpgs_file)
-    // //   if (!is_msite_file(filename))
-    //        throw runtime_error("malformed counts file: " + filename);

     bool insufficient_data = false;  // ADS: this is used now to detect
                                      // when the counts files have

From 326626cc298681a18ecf37b2885a5e1a5446d5e6 Mon Sep 17 00:00:00 2001
From: Andrew D Smith
Date: Mon, 24 Nov 2025 19:47:15 -0800
Subject: [PATCH 106/106] src/common/EpireadStats.cpp: fixing bug with
 log_likelihood function signature

---
 src/common/EpireadStats.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/EpireadStats.cpp b/src/common/EpireadStats.cpp
index a1900a37..32947a37 100644
--- a/src/common/EpireadStats.cpp
+++ b/src/common/EpireadStats.cpp
@@ -56,7 +56,7 @@ get_n_cpgs(const vector<epi_r> &reads) {
     [](const std::uint32_t a, const auto &r) { return std::max(a, r.end()); });
 }

-static inline double
+inline double
 log_likelihood(const epi_r &r, const vector<double> &a) {
   double ll = 0.0;
   for (size_t i = 0; i < r.seq.length(); ++i)
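// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the commit message above only says the
// log_likelihood signature had a bug. A likely reading (an assumption, not
// stated in the source) is that the function is declared in a header and
// called from other translation units, so the earlier `static inline` gave the
// definition internal linkage and external callers could no longer link
// against it; dropping `static` while keeping `inline` restores that linkage.
// Hypothetical sketch of the failing pattern:
//
//   // EpireadStats.hpp (hypothetical excerpt)
//   double log_likelihood(const epi_r &r, const std::vector<double> &a);
//
//   // EpireadStats.cpp
//   static inline double log_likelihood(const epi_r &r,
//                                        const std::vector<double> &a) { ... }
//   // internal linkage: other .cpp files that call log_likelihood would fail
//   // at link time with an undefined reference.
// ---------------------------------------------------------------------------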