From 631396a5e0c643ba6db6e6d74c63eebd7ae42c76 Mon Sep 17 00:00:00 2001 From: tbrekalo Date: Mon, 27 May 2024 09:42:23 +0200 Subject: [PATCH] tbrekalo-build: update CMake and create source file for algorithms --- CMakeLists.txt | 56 ++++++++------- include/algorithms.hpp | 150 +++++++++++++++-------------------------- src/algorithms.cpp | 80 ++++++++++++++++++++++ tool/density.cpp | 1 + 4 files changed, 169 insertions(+), 118 deletions(-) create mode 100644 src/algorithms.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ab02af6..a097316 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,42 +2,50 @@ cmake_minimum_required(VERSION 3.5) project(MINIMIZERS) set(CMAKE_CXX_STANDARD 20) -if (NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release") -endif () - -MESSAGE(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE}) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) -MESSAGE(STATUS "Compiling for processor: " ${CMAKE_HOST_SYSTEM_PROCESSOR}) +option(MINIMIZERS_USE_SANITIZERS "Build Debug and RelWithDebInfo with ASAN" ON) -if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")) - MESSAGE(STATUS "Compiling with flags: -march=native -mbmi2 -msse4.2") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") endif() -if (UNIX) +message(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE}) +message(STATUS "Compiling for processor: " ${CMAKE_HOST_SYSTEM_PROCESSOR}) +if(UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64")) + message(STATUS "Compiling with flags: -march=native -mbmi2 -msse4.2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") +endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") +if(UNIX AND CMAKE_BUILD_TYPE STREQUAL "Release") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function") - - MESSAGE(STATUS "Compiling with flags: ${CMAKE_CXX_FLAGS}") - - if (MINIMIZERS_USE_SANITIZERS) - MESSAGE(STATUS "Using sanitizers. Compiling with flags: -fsanitize=address -fno-omit-frame-pointer") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") - endif() - + message(STATUS "Compiling with flags: ${CMAKE_CXX_FLAGS}") endif() +add_compile_options(-Wall -Wextra -Wno-missing-braces -Wno-unused-function) + include_directories(.) include_directories(include) +add_library(minimizers STATIC src/algorithms.cpp) +target_include_directories(minimizers PUBLIC include) add_executable(density tool/density.cpp) -target_link_libraries(density - z -) +target_compile_options(density PRIVATE -Wall -Wextra -Wno-missing-braces + -Wno-unused-function) + +if(MINIMIZERS_USE_SANITIZERS) + target_compile_options( + minimizers PRIVATE $<$:-fsanitize=address + -fsanitize=undefined -fno-omit-frame-pointer>) + target_link_options( + minimizers PRIVATE $<$:-fsanitize=address + -fsanitize=undefined>) + message( + STATUS + "Using sanitizers. Compiling with flags: -fsanitize=address -fno-omit-frame-pointer" + ) +endif() + +target_link_libraries(density z minimizers) add_executable(generate_random_fasta tool/generate_random_fasta.cpp) diff --git a/include/algorithms.hpp b/include/algorithms.hpp index e5fb22d..241f154 100644 --- a/include/algorithms.hpp +++ b/include/algorithms.hpp @@ -3,58 +3,31 @@ #include #include #include -#include +#include +#include +#include #include "external/fastmod/fastmod.h" -#include "util.hpp" #include "enumerator.hpp" namespace minimizers { -double redundancy_in_density_in_perc(const double density, const double lower_bound) { - return (density / lower_bound - 1) * 100.0; -} - -double redundancy_in_density_as_factor(const double density, const double lower_bound) { - return density / lower_bound; -} - -bool is_not_forward(const uint64_t k, const uint64_t w, const uint64_t t) { - assert(w >= 2); - assert(t <= k); - /* - We know a scheme is *not* foward when there exist x and y - such that x mod w + 1 < y mod w, where x and y are the - positions of the smallest t-mer in window i and i-1 respectively, - for some i > 0. So we derive: x mod w < w - 2. - Since x is in [0..l - t] = [0..w + k - 1 - t], then x is at most - w + k - t - 1, i.e., w + k - t - 1 mod w < w - 2. - - All possible backward jumps (y mod w, x mod w), of length y-x-1, - are for y in [x+1..w-1]. - - Note: in math, we would write (k - t - 1) mod w < w - 2, - but here we always sum w to avoid having to take the - modulo of negative integers when t = k. - */ - return (w + k - t - 1) % w < w - 2; -} - -/* This ignores (asymptotic) lower order terms. */ -double closed_form_density(std::string const& scheme_name, // - const uint64_t k, const uint64_t w, const uint64_t t) // -{ - if (scheme_name == "miniception") { - return 1.67 / w; - } else if (scheme_name == "mod_sampling") { - bool ok = (w + k - 1 - t) % w == w - 1; - double correction = ok ? 0 : floor(1.0 + double(k - 1.0 - t) / w) / (w + k - t); - return double(floor(1.0 + double(k - t - 1.0) / w) + 2.0 - correction) / (w + k - t + 1.0); - } else { - throw std::runtime_error("unknown scheme name"); - } -} +namespace detail { + +struct globals_t { + std::vector sines; + std::vector> roots; + long double pi = std::numbers::pi_v; +}; + +globals_t& get_globals(); + +} // namespace detail + +double redundancy_in_density_in_perc(double density, double lower_bound); +double redundancy_in_density_as_factor(double density, double lower_bound); +bool is_not_forward(const uint64_t k, const uint64_t w, const uint64_t t); /* Each algorithm returns a position p in [0..w-1], corresponding @@ -62,6 +35,9 @@ double closed_form_density(std::string const& scheme_name, Note: in case of ties, we return the *leftmost* kmer. */ +double closed_form_density(std::string const& scheme_name, const uint64_t k, const uint64_t w, + const uint64_t t); + template struct mod_sampling { static std::string name() { return "mod_sampling"; } @@ -241,8 +217,6 @@ struct rotational_alt { template using uhs_hash = std::pair; -uint8_t char_remap[256]; - /// Proof that for all j: sumj <= sum0 + sigma/2 is sufficient to guarantee the /// existence of a sum0. /// @@ -263,11 +237,11 @@ uint8_t char_remap[256]; /// z p>=z+sigma/2+1 (implied by y below) /// . y=z+v /// .. -/// x y>=x+sigma/2+1 y = leftmost max in row of x; must be created as some previous z+v, -/// v<=sigma-1. (z cannot be s0.) +/// x y>=x+sigma/2+1 y = leftmost max in row of x; must be created as some previous +/// z+v, v<=sigma-1. (z cannot be s0.) /// -/// p >= z+sigma/2+1>=y-(sigma-1)+sigma/2+1>=x+sigma/2+1-(sigma-1)+sigma/2+1=x+3 => must be created. -/// cannot be created by s0. +/// p >= z+sigma/2+1>=y-(sigma-1)+sigma/2+1>=x+sigma/2+1-(sigma-1)+sigma/2+1=x+3 => must be +/// created. cannot be created by s0. /// Return whether the kmer is in the UHS, and the random kmer order. template @@ -276,14 +250,25 @@ struct rotational_orig_hasher { static hash_type hash(char const* kmer, const uint64_t w, const uint64_t k, const uint64_t seed) { - constexpr uint64_t sigma = 4; + static constexpr uint64_t sigma = 4; + static constexpr auto char_remap = []() -> std::array { + std::array dst; + + dst[int('A')] = 0; + dst[int('C')] = 1; + dst[int('T')] = 2; + dst[int('G')] = 3; + + return dst; + }(); + bool in_uhs = true; uint64_t sum0 = 0; - for (uint64_t pos = 0; pos < k; pos += w) sum0 += char_remap[int(kmer[pos])]; + for (uint64_t pos = 0; pos < k; pos += w) sum0 += char_remap.at(int(kmer[pos])); for (uint64_t j = 1; j != w; ++j) { uint64_t sumj = 0; - for (uint64_t pos = j; pos < k; pos += w) sumj += char_remap[int(kmer[pos])]; + for (uint64_t pos = j; pos < k; pos += w) sumj += char_remap.at(int(kmer[pos])); // Assume alphabet size 4. // Instead of <=+sigma, we do <=+sigma-1, // since the max difference between two characters is actually @@ -309,10 +294,6 @@ struct rotational_orig { rotational_orig(uint64_t w, uint64_t k, uint64_t /*t*/, uint64_t seed) : m_w(w), m_k(k), m_seed(seed), m_enum_kmers(w, k, seed) { assert(m_k % m_w == 0); - char_remap[int('A')] = 0; - char_remap[int('C')] = 1; - char_remap[int('T')] = 2; - char_remap[int('G')] = 3; } uint64_t sample(char const* window) { @@ -344,34 +325,14 @@ struct rotational_orig { enumerator> m_enum_kmers; }; -// TODO: Global variables are ugly and ideally should be replaced by member variables on the Hasher -// objects. -vector sines; -vector> roots; -long double pi = std::numbers::pi_v; - // The pseudocode from the original paper. // We intentionally ignore the 0 case. -bool is_decycling_original(char const* kmer, const uint64_t k) { - long double im = 0; - for (uint64_t i = 0; i != k; ++i) { im += sines[i] * kmer[i]; } - long double im_rot = 0; - for (uint64_t i = 0; i != k; ++i) { im_rot += sines[i + 1] * kmer[i]; } - // std::cerr << "Im: " << im << " im_rot: " << im_rot << - // std::endl; - return im > 0 and im_rot <= 0; -} +bool is_decycling_original(char const* kmer, const uint64_t k); // Same method, but using complex numbers. // This is only different due to floating point rounding errors, e.g. when imag(x)=0. // The original method has ever so slightly better density. -bool is_decycling_arg_pos(char const* kmer, const uint64_t k) { - std::complex x = 0; - for (uint64_t i = 0; i != k; ++i) { x += roots[i] * (long double)kmer[i]; } - auto a = std::arg(x); - // std::cerr << "arg: " << a << "\nthresh: " << pi - 2 * pi / k << endl; - return pi - 2 * pi / k < a; -} +bool is_decycling_arg_pos(char const* kmer, const uint64_t k); // Use angle around 0 instead of around pi. // @@ -379,12 +340,7 @@ bool is_decycling_arg_pos(char const* kmer, const uint64_t k) { // That should be equivalent since it's basically using the D-tilde. // // FIXME: This is around 1% worse than the versions above. I do not understand why. -bool is_decycling_arg_neg(char const* kmer, const uint64_t k) { - std::complex x = 0; - for (uint64_t i = 0; i != k; ++i) { x += roots[i] * (long double)kmer[i]; } - auto a = std::arg(x); - return -2 * pi / k < a and a <= 0; -} +bool is_decycling_arg_neg(char const* kmer, const uint64_t k); template struct decycling_hasher { @@ -408,11 +364,14 @@ struct decycling { decycling(uint64_t w, uint64_t k, uint64_t /*t*/, uint64_t seed) : m_w(w), m_k(k), m_seed(seed), m_enum_kmers(w, k, seed) { - sines.reserve(m_k + 1); - for (uint64_t i = 0; i != m_k; ++i) { sines.push_back(std::sin(2 * pi * i / m_k)); } - roots.reserve(m_k + 1); + detail::get_globals().sines.reserve(m_k + 1); for (uint64_t i = 0; i != m_k; ++i) { - roots.push_back(std::exp(std::complex(0, 2 * pi * i / m_k))); + detail::get_globals().sines.push_back(std::sin(2 * detail::get_globals().pi * i / m_k)); + } + detail::get_globals().roots.reserve(m_k + 1); + for (uint64_t i = 0; i != m_k; ++i) { + detail::get_globals().roots.push_back( + std::exp(std::complex(0, 2 * detail::get_globals().pi * i / m_k))); } } @@ -464,11 +423,14 @@ struct double_decycling { double_decycling(uint64_t w, uint64_t k, uint64_t /*t*/, uint64_t seed) : m_w(w), m_k(k), m_seed(seed), m_enum_kmers(w, k, seed) { - sines.reserve(m_k + 1); - for (uint64_t i = 0; i != m_k; ++i) { sines.push_back(std::sin(2 * pi * i / m_k)); } - roots.reserve(m_k + 1); + detail::get_globals().sines.reserve(m_k + 1); + for (uint64_t i = 0; i != m_k; ++i) { + detail::get_globals().sines.push_back(std::sin(2 * detail::get_globals().pi * i / m_k)); + } + detail::get_globals().roots.reserve(m_k + 1); for (uint64_t i = 0; i != m_k; ++i) { - roots.push_back(std::exp(std::complex(0, 2 * pi * i / m_k))); + detail::get_globals().roots.push_back( + std::exp(std::complex(0, 2 * detail::get_globals().pi * i / m_k))); } } diff --git a/src/algorithms.cpp b/src/algorithms.cpp new file mode 100644 index 0000000..3cc36a8 --- /dev/null +++ b/src/algorithms.cpp @@ -0,0 +1,80 @@ +#include "algorithms.hpp" + +namespace minimizers { + +namespace detail { + +static globals_t globals; +globals_t& get_globals() { return globals; } +} // namespace detail + +double redundancy_in_density_in_perc(const double density, const double lower_bound) { + return (density / lower_bound - 1) * 100.0; +} + +double redundancy_in_density_as_factor(const double density, const double lower_bound) { + return density / lower_bound; +} + +bool is_not_forward(const uint64_t k, const uint64_t w, const uint64_t t) { + assert(w >= 2); + assert(t <= k); + /* + We know a scheme is *not* foward when there exist x and y + such that x mod w + 1 < y mod w, where x and y are the + positions of the smallest t-mer in window i and i-1 respectively, + for some i > 0. So we derive: x mod w < w - 2. + Since x is in [0..l - t] = [0..w + k - 1 - t], then x is at most + w + k - t - 1, i.e., w + k - t - 1 mod w < w - 2. + + All possible backward jumps (y mod w, x mod w), of length y-x-1, + are for y in [x+1..w-1]. + + Note: in math, we would write (k - t - 1) mod w < w - 2, + but here we always sum w to avoid having to take the + modulo of negative integers when t = k. + */ + return (w + k - t - 1) % w < w - 2; +} + +/* This ignores (asymptotic) lower order terms. */ +double closed_form_density(std::string const& scheme_name, // + const uint64_t k, const uint64_t w, const uint64_t t) // +{ + if (scheme_name == "miniception") { + return 1.67 / w; + } else if (scheme_name == "mod_sampling") { + bool ok = (w + k - 1 - t) % w == w - 1; + double correction = ok ? 0 : floor(1.0 + double(k - 1.0 - t) / w) / (w + k - t); + return double(floor(1.0 + double(k - t - 1.0) / w) + 2.0 - correction) / (w + k - t + 1.0); + } else { + throw std::runtime_error("unknown scheme name"); + } +} + +bool is_decycling_original(char const* kmer, const uint64_t k) { + long double im = 0; + for (uint64_t i = 0; i != k; ++i) { im += detail::globals.sines[i] * kmer[i]; } + long double im_rot = 0; + for (uint64_t i = 0; i != k; ++i) { im_rot += detail::globals.sines[i + 1] * kmer[i]; } + // std::cerr << "Im: " << im << " im_rot: " << im_rot << + // std::endl; + return im > 0 and im_rot <= 0; +} + +bool is_decycling_arg_pos(char const* kmer, const uint64_t k) { + std::complex x = 0; + for (uint64_t i = 0; i != k; ++i) { x += detail::globals.roots[i] * (long double)kmer[i]; } + auto a = std::arg(x); + // std::cerr << "arg: " << a << "\nthresh: " << pi - 2 * pi / k << endl; + return detail::globals.pi - 2 * detail::globals.pi / k < a; +} + +bool is_decycling_arg_neg(char const* kmer, const uint64_t k) { + std::complex x = 0; + for (uint64_t i = 0; i != k; ++i) { x += detail::globals.roots[i] * (long double)kmer[i]; } + auto a = std::arg(x); + return -2 * detail::globals.pi / k < a and a <= 0; +} + +} // namespace minimizers diff --git a/tool/density.cpp b/tool/density.cpp index f1964cd..1cf7d37 100644 --- a/tool/density.cpp +++ b/tool/density.cpp @@ -8,6 +8,7 @@ #include "external/gz/zip_stream.hpp" #include "external/gz/zip_stream.cpp" +#include "util.hpp" #include "hasher.hpp" #include "algorithms.hpp"