diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..ee1bee2 --- /dev/null +++ b/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: Google +ColumnLimit: 100 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1798a54..3bc761e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,8 +2,9 @@ name: CI on: push: - branches: ['**'] + branches: [master] pull_request: + branches: [master] jobs: build: diff --git a/.gitignore b/.gitignore index d3643ad..f446f2e 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ gprof2dot.py bin/ cryfa keygen +results/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9274219..80b6d96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,54 @@ cmake_minimum_required(VERSION 4.0.0) project(cryfa) +set(CRYFA_VERSION_OVERRIDE "" CACHE STRING "Override the version embedded in Cryfa") +set(CRYFA_VERSION "0.0.0-dev") + +if(CRYFA_VERSION_OVERRIDE) + set(CRYFA_VERSION "${CRYFA_VERSION_OVERRIDE}") +elseif(DEFINED ENV{GITHUB_REF_TYPE} AND "$ENV{GITHUB_REF_TYPE}" STREQUAL "tag") + set(CRYFA_VERSION "$ENV{GITHUB_REF_NAME}") + string(REGEX REPLACE "^v" "" CRYFA_VERSION "${CRYFA_VERSION}") +else() + find_package(Git QUIET) + + if(Git_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git") + execute_process( + COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match + --match "v[0-9]*" --match "[0-9]*" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE CRYFA_GIT_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE CRYFA_GIT_VERSION_RESULT + ) + + if(NOT CRYFA_GIT_VERSION_RESULT EQUAL 0) + execute_process( + COMMAND "${GIT_EXECUTABLE}" describe --tags --dirty --always + --match "v[0-9]*" --match "[0-9]*" + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" + OUTPUT_VARIABLE CRYFA_GIT_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + RESULT_VARIABLE CRYFA_GIT_VERSION_RESULT + ) + endif() + + if(CRYFA_GIT_VERSION_RESULT EQUAL 0 AND CRYFA_GIT_VERSION) + string(REGEX 
REPLACE "^v" "" CRYFA_VERSION "${CRYFA_GIT_VERSION}") + endif() + endif() +endif() + +set(CRYFA_GENERATED_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated") +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in" + "${CRYFA_GENERATED_INCLUDE_DIR}/cryfa/version.hpp" + @ONLY +) +message(STATUS "Cryfa version: ${CRYFA_VERSION}") + # On MSVC x64 the cryptopp runtime uses hand-coded MASM routines # (CPUID64, XGETBV64, Baseline_Add/Sub, SHA256/GCM/Rijndael_*_SSE2). # Enable the MASM language so CMake can assemble x64dll.asm. @@ -169,13 +217,19 @@ add_library(libCryfaCommon OBJECT src/fastq.cpp src/security.cpp ) -target_include_directories(libCryfaCommon PRIVATE src/include) +target_include_directories(libCryfaCommon PRIVATE + src/include + "${CRYFA_GENERATED_INCLUDE_DIR}" +) target_link_libraries(libCryfaCommon PRIVATE cryptopp-dep) add_executable(cryfa src/cryfa.cpp ) -target_include_directories(cryfa PRIVATE src/include) +target_include_directories(cryfa PRIVATE + src/include + "${CRYFA_GENERATED_INCLUDE_DIR}" +) target_link_libraries(cryfa PRIVATE Threads::Threads libCryfaCommon @@ -185,7 +239,10 @@ target_link_libraries(cryfa PRIVATE add_executable(keygen src/keygen.cpp ) -target_include_directories(keygen PRIVATE src/include) +target_include_directories(keygen PRIVATE + src/include + "${CRYFA_GENERATED_INCLUDE_DIR}" +) # ── CTest round-trip integration test ──────────────────────────────────────── enable_testing() diff --git a/Dockerfile b/Dockerfile index 766148e..5b381bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,20 @@ -# ── Stage 1: builder ─────────────────────────────────────────────────────────- -FROM ubuntu:22.04 AS builder -LABEL maintainer="Morteza Hosseini" - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update \ - && apt-get install -y --no-install-recommends build-essential git python3-pip \ - && pip3 install cmake \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /src -COPY . . 
-RUN cmake -B build -DCMAKE_BUILD_TYPE=Release \ - && cmake --build build --parallel "$(nproc)" --config Release - -# ── Stage 2: runtime ─────────────────────────────────────────────────────────- -FROM debian:bookworm-slim - -COPY --from=builder /src/build/cryfa /usr/local/bin/cryfa -COPY --from=builder /src/build/keygen /usr/local/bin/keygen - -ENTRYPOINT ["cryfa"] +FROM ubuntu:22.04 AS builder +LABEL maintainer="Morteza Hosseini" + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential git python3-pip \ + && pip3 install cmake \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /src +COPY . . +RUN cmake -B build -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build --parallel "$(nproc)" --config Release + +FROM debian:bookworm-slim AS runtime + +COPY --from=builder /src/build/cryfa /usr/local/bin/cryfa +COPY --from=builder /src/build/keygen /usr/local/bin/keygen + +ENTRYPOINT ["cryfa"] diff --git a/README.md b/README.md index 7791a2c..c15cb1f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@

Cryfa

-[![Conda Version](https://anaconda.org/bioconda/cryfa/badges/version.svg)](https://anaconda.org/bioconda/cryfa) +[![Anaconda version](https://anaconda.org/bioconda/cryfa/badges/version.svg)](https://anaconda.org/bioconda/cryfa) +[![Anaconda downloads](https://anaconda.org/bioconda/cryfa/badges/downloads.svg)](https://anaconda.org/bioconda/cryfa) [![CI](https://github.com/cobilab/cryfa/actions/workflows/ci.yml/badge.svg)](https://github.com/cobilab/cryfa/actions/workflows/ci.yml) -[![Platforms](https://anaconda.org/bioconda/cryfa/badges/platforms.svg)](https://anaconda.org/bioconda/cryfa) -[![License: GPL v3](https://img.shields.io/badge/License-GPL%20v3-blue.svg)](LICENSE) +[![License](https://img.shields.io/badge/License-GPL%20v3-blue.svg)](LICENSE) Cryfa is an ultrafast encryption tool specifically designed for genomic data. Besides providing robust security, it also compresses FASTA/FASTQ sequences by a factor of three, making it an efficient solution for managing genomic data. -# Installation +## Installation -## Conda +### Conda ```sh conda install -y bioconda::cryfa ``` -## Docker +### Docker The image is available for **linux/amd64** and **linux/arm64** (Apple Silicon, AWS Graviton). @@ -32,9 +32,9 @@ docker run --rm -v /path/to/data:/data smortezah/cryfa \ -k /data/pass.txt -d /data/out.crf > restored.fq ``` -## Build from source +### Build from source -### Linux +#### Linux ```sh # Install git and cmake (≥ 4.0) @@ -48,7 +48,7 @@ cd cryfa; sh install.sh; ``` -### macOS +#### macOS ```sh # Install Homebrew, git and cmake @@ -61,7 +61,7 @@ cd cryfa; sh install.sh; ``` -### Windows +#### Windows ```powershell # Install CMake and Visual Studio Build Tools (requires winget) @@ -77,7 +77,7 @@ cd cryfa > [!NOTE] > Pre-compiled binaries for 64-bit Linux, macOS, and Windows are available as assets on the [Releases](https://github.com/cobilab/cryfa/releases) page. 
-# Usage +## Usage Run Cryfa with: @@ -102,7 +102,7 @@ A sample file, `in.fq`, is available in the `example/` directory. > [!NOTE] > Cryfa supports a maximum file size of 64 GB. For larger files, consider splitting them into smaller chunks, e.g. using the `split` command in Linux, and then encrypt each chunk separately. After decryption, you can reassemble the chunks using the `cat` command. -## Input file format +### Input file format Cryfa identifies the format of a genomic data file by examining its content, not its extension. For instance, a FASTA file named `test` can be provided with any extension — `test`, `test.fa`, `test.fasta`, `test.fas`, `test.fsa`, etc. So, running @@ -119,7 +119,7 @@ is equivalent to running > [!NOTE] > The password file can have any extension or none at all -- `pass`, `pass.txt`, `pass.dat`, etc. are all valid and yield the same result. -## Options +### Options Cryfa supports the following options: @@ -139,7 +139,7 @@ Cryfa supports the following options: Cryfa leverages the standard output stream, allowing seamless integration with existing data processing pipelines. -## Creating a Key File +### Creating a Key File There are two ways to create a `KEY_FILE` for use with `-k` / `--key`: save a raw password in a file, or use the `keygen` program to generate a strong one. The latter is strongly recommended. @@ -182,7 +182,7 @@ The generated key will be saved to the file you specify (e.g., `key.txt`). Note To learn more about key management (generation, exchange, storage, usage, and replacement of keys), see [[1]](https://en.wikipedia.org/wiki/Key_management), [[2]](https://info.townsendsecurity.com/definitive-guide-to-encryption-key-management-fundamentals), [[3]](https://csrc.nist.gov/projects/key-management/cryptographic-key-management-systems) and [[4]](https://www.cryptomathic.com/news-events/blog/what-is-key-management-a-ciso-perspective). 
-## Benchmarking Cryfa Against Other Methods +### Benchmarking Cryfa Against Other Methods To benchmark Cryfa against other methods, configure the parameters in the **bench_cryfa.sh** bash script and execute it: @@ -192,13 +192,21 @@ To benchmark Cryfa against other methods, configure the parameters in the **benc This script automates the process of downloading datasets, installing dependencies, setting up compression and encryption tools, executing these tools, and finally, displaying the results. -# Citation +For quick local performance and correctness checks, use the local harness: + +```sh +bash scripts/runtime/run_local_perf.sh --label local-check --input example/in.fq --target-mb 200 --threads "1 4 8" --runs 1 --modes both --no-prompt +``` + +The local harness expands the seed input to the requested size, measures compression and decompression, verifies every round trip with `cmp`, and writes CSV/Markdown reports under `results/local_perf/`. + +## Citation If you use Cryfa in your research, please cite the following references: - M. Hosseini, D. Pratas and A.J. Pinho, "Cryfa: a secure encryption tool for genomic data," _Bioinformatics_, vol. 35, no. 1, pp. 146--148, 2018. [DOI: 10.1093/bioinformatics/bty645](https://doi.org/10.1093/bioinformatics/bty645) - **[OPTIONAL]** D. Pratas, M. Hosseini and A.J. Pinho, "Cryfa: a tool to compact and encrypt FASTA files," _11'th International Conference on Practical Applications of Computational Biology & Bioinformatics_ (PACBB), Springer, June 2017. [DOI: 10.1007/978-3-319-60816-7_37](https://doi.org/10.1007/978-3-319-60816-7_37) -# License +## License Cryfa is licensed under the [GPLv3](http://www.gnu.org/licenses/gpl-3.0.html). 
diff --git a/bench_cryfa.sh b/bench_cryfa.sh index 6e135ce..d39689a 100644 --- a/bench_cryfa.sh +++ b/bench_cryfa.sh @@ -99,6 +99,20 @@ RUN_CRYFA_THR=1 # Results RESULTS_CRYFA_THR=0 +# Run lightweight local performance harness +RUN_LOCAL_PERF=0 +LOCAL_PERF_LABEL="baseline" +LOCAL_PERF_COMPARE_TO="" +LOCAL_PERF_INPUT="example/in.fq" +LOCAL_PERF_TARGET_MB=200 +LOCAL_PERF_THREADS="1 4 8" +LOCAL_PERF_RUNS=1 +LOCAL_PERF_MODES="default stop-shuffle" +LOCAL_PERF_INTERACTIVE="auto" +LOCAL_PERF_BIN="build/cryfa" +LOCAL_PERF_KEY_FILE="pass.txt" +LOCAL_PERF_OUT_DIR="results/local_perf" + # Run different methods to explore redundancy RUN_REDUNDANCY=0 # Cryfa, DELIMINATE, MFCompress # Dataset (FASTA) -- archaea, bacteria, fungi, plants, viruses diff --git a/cmake/version.hpp.in b/cmake/version.hpp.in new file mode 100644 index 0000000..93db071 --- /dev/null +++ b/cmake/version.hpp.in @@ -0,0 +1,10 @@ +#ifndef CRYFA_VERSION_HPP +#define CRYFA_VERSION_HPP + +#include <string> + +namespace cryfa { +inline const std::string VERSION = "@CRYFA_VERSION@"; +} // namespace cryfa + +#endif // CRYFA_VERSION_HPP diff --git a/install.ps1 b/install.ps1 index 008370b..e8f05ad 100644 --- a/install.ps1 +++ b/install.ps1 @@ -1,14 +1,179 @@ # Parameters -$BUILD_TYPE = "Release" -$BUILD = "build" -$PARALLEL = 8 +$Root = Split-Path -Parent $MyInvocation.MyCommand.Path +$BUILD_TYPE = if ($env:BUILD_TYPE) { $env:BUILD_TYPE } else { "Release" } +$BUILD = if ($env:BUILD) { $env:BUILD } else { "build" } +$PARALLEL = if ($env:PARALLEL) { $env:PARALLEL } else { 8 } +$ConfirmSettings = $false +$PromptOnStaleCache = $true -# Configure CMake -cmake -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE +function Write-Log { + param([string]$Message) + Write-Host "[install] $Message" +} -# Build +function Test-InteractiveHost { + return $Host.Name -notmatch 'ServerRemoteHost|Visual Studio Code Host' +} + +function Confirm-Choice { + param( + [string]$Prompt, + [string]$Default = "Y" + ) + + while ($true) { + $reply = Read-Host 
"$Prompt [$Default]" + if ([string]::IsNullOrWhiteSpace($reply)) { + $reply = $Default + } + + switch ($reply.ToLowerInvariant()) { + "y" { return $true } + "yes" { return $true } + "n" { return $false } + "no" { return $false } + } + + Write-Host "Please answer yes or no." + } +} + +function Get-CacheSourceDirectory { + param([string]$CacheFile) + + if (-not (Test-Path $CacheFile)) { + return $null + } + + $line = Select-String -Path $CacheFile -Pattern '^CMAKE_HOME_DIRECTORY:INTERNAL=' | Select-Object -Last 1 + if ($null -eq $line) { + return $null + } + + return $line.Line -replace '^CMAKE_HOME_DIRECTORY:INTERNAL=', '' +} + +function Ensure-FreshBuildDirectory { + param([string]$BuildDir) + + $cacheFile = Join-Path $BuildDir "CMakeCache.txt" + $cacheSource = Get-CacheSourceDirectory -CacheFile $cacheFile + if ([string]::IsNullOrWhiteSpace($cacheSource) -or ([System.IO.Path]::GetFullPath($cacheSource) -eq [System.IO.Path]::GetFullPath($Root))) { # CMake stores this path with forward slashes, so normalize both sides before comparing + return + } + + Write-Log "Detected a build cache from a different source directory." + Write-Log "Current repo: $Root" + Write-Log "Cached source: $cacheSource" + + if ($PromptOnStaleCache -and (Test-InteractiveHost)) { + if (-not (Confirm-Choice "Reset the build directory and reconfigure?" "Y")) { + throw "Installation cancelled." 
+ } + } + + if (-not (Test-Path $BuildDir)) { + return + } + + Write-Log "Resetting stale CMake state in: $BuildDir" + Write-Log "Keeping reusable downloaded dependencies under $BuildDir\_deps when present" + + $pathsToRemove = @( + (Join-Path $BuildDir "CMakeCache.txt"), + (Join-Path $BuildDir "CTestTestfile.cmake"), + (Join-Path $BuildDir "Makefile"), + (Join-Path $BuildDir "cmake_install.cmake"), + (Join-Path $BuildDir "compile_commands.json"), + (Join-Path $BuildDir "build.ninja"), + (Join-Path $BuildDir ".ninja_deps"), + (Join-Path $BuildDir ".ninja_log"), + (Join-Path $BuildDir "CMakeFiles") + ) + + foreach ($path in $pathsToRemove) { + if (Test-Path $path) { + Remove-Item -Recurse -Force $path + } + } +} + +function Get-BuiltExecutable { + param([string]$Name) + + $exeName = "$Name.exe" + $candidates = @( + (Join-Path $BUILD $exeName), + (Join-Path (Join-Path $BUILD $BUILD_TYPE) $exeName) + ) + + foreach ($candidate in $candidates) { + if (Test-Path $candidate) { + return $candidate + } + } + + throw "Unable to find built executable '$exeName' in '$BUILD' or '$BUILD\$BUILD_TYPE'." +} + +for ($i = 0; $i -lt $args.Length; $i++) { + switch ($args[$i]) { + "--build-dir" { + $i++ + $BUILD = $args[$i] + } + "--build-type" { + $i++ + $BUILD_TYPE = $args[$i] + } + "--parallel" { + $i++ + $PARALLEL = $args[$i] + } + "--interactive" { + $ConfirmSettings = $true + } + "--no-prompt" { + $ConfirmSettings = $false + $PromptOnStaleCache = $false + } + "--help" { + Write-Host "Usage: .\install.ps1 [--build-dir DIR] [--build-type TYPE] [--parallel N] [--interactive] [--no-prompt]" + exit 0 + } + default { + throw "Unknown argument: $($args[$i])" + } + } +} + +Set-Location $Root + +if ($ConfirmSettings -and (Test-InteractiveHost)) { + Write-Log "Planned install settings:" + Write-Log " Build directory: $BUILD" + Write-Log " Build type: $BUILD_TYPE" + Write-Log " Parallel jobs: $PARALLEL" + + if (-not (Confirm-Choice "Continue with these settings?" 
"Y")) { + throw "Installation cancelled." + } +} + +if (-not ($PARALLEL -as [int]) -or [int]$PARALLEL -le 0) { + throw "Parallel build jobs must be a positive integer." +} + +Ensure-FreshBuildDirectory -BuildDir $BUILD + +Write-Log "Configuring CMake in `"$BUILD`"" +cmake -S $Root -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE + +Write-Log "Building targets with $PARALLEL parallel jobs" cmake --build $BUILD --parallel $PARALLEL --config $BUILD_TYPE -# Move executables to the main directory -Move-Item "$BUILD\$BUILD_TYPE\cryfa.exe" -Destination . -Move-Item "$BUILD\$BUILD_TYPE\keygen.exe" -Destination . +Write-Log "Copying executables to the repository root" +Copy-Item (Get-BuiltExecutable "cryfa") -Destination $Root -Force +Copy-Item (Get-BuiltExecutable "keygen") -Destination $Root -Force + +Write-Log "Install complete" diff --git a/install.sh b/install.sh index 0fc3deb..242bb23 100644 --- a/install.sh +++ b/install.sh @@ -1,16 +1,196 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh -# Parameters -BUILD_TYPE=Release -BUILD=build -PARALLEL=8 +set -eu -# Configure CMake -cmake -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE +ROOT_DIR=$(CDPATH= cd -- "$(dirname "$0")" && pwd) +BUILD_TYPE=${BUILD_TYPE:-Release} +BUILD=${BUILD:-build} +PARALLEL=${PARALLEL:-8} +INTERACTIVE=${INTERACTIVE:-no} +NO_PROMPT=0 -# Build -cmake --build $BUILD --parallel $PARALLEL --config $BUILD_TYPE +usage() { + cat <<'EOF' +Usage: sh install.sh [options] -# Move executables to the main directory -mv $BUILD/cryfa . -mv $BUILD/keygen . 
+Options: + --build-dir DIR Build directory (default: build) + --build-type TYPE CMake build type (default: Release) + --parallel N Parallel build jobs (default: 8) + --interactive Confirm the resolved settings before building + --no-prompt Run non-interactively with the current values + --help Show this help text +EOF +} + +log() { + printf '[install] %s\n' "$*" +} + +fail() { + log "Error: $*" >&2 # stderr, so callers running inside $(...) do not capture the message as output + exit 1 +} + +is_tty() { + [ -t 0 ] && [ -t 1 ] +} + +confirm_yes_no() { + prompt_text=$1 + default_value=$2 + while true; do + printf '%s [%s]: ' "$prompt_text" "$default_value" >&2 + IFS= read -r reply || exit 1 + if [ -z "$reply" ]; then + reply=$default_value + fi + case $(printf '%s' "$reply" | tr '[:upper:]' '[:lower:]') in + y | yes) return 0 ;; + n | no) return 1 ;; + esac + printf 'Please answer yes or no.\n' >&2 + done +} + +validate_options() { + case $BUILD_TYPE in + "" ) + fail "build type must not be empty." + ;; + esac + + case $PARALLEL in + '' | *[!0-9]*) + fail "parallel build jobs must be a positive integer." + ;; + 0) + fail "parallel build jobs must be greater than zero." + ;; + esac +} + +cache_source_dir() { + cache_file=$1 + sed -n 's/^CMAKE_HOME_DIRECTORY:INTERNAL=//p' "$cache_file" | tail -n 1 +} + +clear_stale_cmake_state() { + build_dir=$1 + if [ ! -d "$build_dir" ]; then + return + fi + + log "Resetting stale CMake state in: $build_dir" + log "Keeping reusable downloaded dependencies under $build_dir/_deps when present" + + rm -f \ + "$build_dir/CMakeCache.txt" \ + "$build_dir/CTestTestfile.cmake" \ + "$build_dir/Makefile" \ + "$build_dir/cmake_install.cmake" \ + "$build_dir/compile_commands.json" \ + "$build_dir/build.ninja" \ + "$build_dir/.ninja_deps" \ + "$build_dir/.ninja_log" + + rm -rf "$build_dir/CMakeFiles" +} + +handle_existing_cache() { + build_dir=$1 + cache_file=$build_dir/CMakeCache.txt + + if [ ! 
-f "$cache_file" ]; then + return + fi + + cache_source=$(cache_source_dir "$cache_file") + if [ "$cache_source" = "$ROOT_DIR" ]; then + return + fi + + log "Detected a build cache from a different source directory." + log "Current repo: $ROOT_DIR" + log "Cached source: ${cache_source:-unknown}" + + if [ "$NO_PROMPT" -eq 0 ] && is_tty; then + if ! confirm_yes_no "Reset the build directory and reconfigure?" "yes"; then + fail "installation cancelled." + fi + fi + + clear_stale_cmake_state "$build_dir" +} + +built_executable() { + name=$1 + for candidate in "$BUILD/$name" "$BUILD/$BUILD_TYPE/$name"; do + if [ -f "$candidate" ]; then + printf '%s\n' "$candidate" + return + fi + done + + fail "unable to find built executable \"$name\" in \"$BUILD\" or \"$BUILD/$BUILD_TYPE\"." +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --build-dir) + BUILD=$2 + shift 2 + ;; + --build-type) + BUILD_TYPE=$2 + shift 2 + ;; + --parallel) + PARALLEL=$2 + shift 2 + ;; + --interactive) + INTERACTIVE=yes + shift + ;; + --no-prompt) + INTERACTIVE=no + NO_PROMPT=1 + shift + ;; + -h | --help) + usage + exit 0 + ;; + *) + fail "unknown argument \"$1\"." + ;; + esac +done + +cd "$ROOT_DIR" + +if [ "$INTERACTIVE" = "yes" ] && [ "$NO_PROMPT" -eq 0 ] && is_tty; then + log "Planned install settings:" + log " Build directory: $BUILD" + log " Build type: $BUILD_TYPE" + log " Parallel jobs: $PARALLEL" + if ! confirm_yes_no "Continue with these settings?" "yes"; then + fail "installation cancelled." 
+ fi +fi + +validate_options +handle_existing_cache "$BUILD" + +log "Configuring CMake in \"$BUILD\"" +cmake -S "$ROOT_DIR" -B "$BUILD" -DCMAKE_BUILD_TYPE="$BUILD_TYPE" + +log "Building targets with $PARALLEL parallel jobs" +cmake --build "$BUILD" --parallel "$PARALLEL" --config "$BUILD_TYPE" + +log "Copying executables to the repository root" +cp "$(built_executable cryfa)" "$ROOT_DIR/cryfa" +cp "$(built_executable keygen)" "$ROOT_DIR/keygen" + +log "Install complete" diff --git a/scripts/config/par.sh b/scripts/config/par.sh index 70e4f2b..4f5043a 100644 --- a/scripts/config/par.sh +++ b/scripts/config/par.sh @@ -18,7 +18,7 @@ scripts_results="$scripts/results" dataset="dataset" redun="Redundancy" progs="progs" -result="result" +result="results" details="details" FA="FA" FQ="FQ" diff --git a/scripts/orchestration/benchmark_orchestrator.sh b/scripts/orchestration/benchmark_orchestrator.sh index 782b48f..b5c6ac4 100644 --- a/scripts/orchestration/benchmark_orchestrator.sh +++ b/scripts/orchestration/benchmark_orchestrator.sh @@ -253,6 +253,56 @@ function run_benchmark_cryfa_threads { fi } +function run_benchmark_local_perf { + if [[ $RUN_LOCAL_PERF -ne 1 ]]; then + return + fi + + ensureDir $result + + local label=${LOCAL_PERF_LABEL:-baseline} + local compare_to=${LOCAL_PERF_COMPARE_TO:-} + local input=${LOCAL_PERF_INPUT:-example/in.fq} + local target_mb=${LOCAL_PERF_TARGET_MB:-200} + local threads=${LOCAL_PERF_THREADS:-1\ 4\ 8} + local runs=${LOCAL_PERF_RUNS:-1} + local modes=${LOCAL_PERF_MODES:-default\ stop-shuffle} + local interactive_mode=${LOCAL_PERF_INTERACTIVE:-auto} + local bin=${LOCAL_PERF_BIN:-build/cryfa} + local key_file=${LOCAL_PERF_KEY_FILE:-pass.txt} + local out_dir=${LOCAL_PERF_OUT_DIR:-results/local_perf} + local compare_args=() + local prompt_args=() + if [[ -n $compare_to ]]; then + compare_args=(--compare-to "$compare_to") + fi + case "$interactive_mode" in + yes) + prompt_args=(--interactive) + ;; + no) + prompt_args=(--no-prompt) + ;; + esac 
+ + echo "[local_perf] Starting local performance harness..." + echo "[local_perf] Base label: $label" + echo "[local_perf] Output dir: $out_dir" + + bash "$scripts_runtime/run_local_perf.sh" \ + --label "$label" \ + "${compare_args[@]}" \ + "${prompt_args[@]}" \ + --bin "$bin" \ + --key-file "$key_file" \ + --input "$input" \ + --out-dir "$out_dir" \ + --target-mb "$target_mb" \ + --threads "$threads" \ + --runs "$runs" \ + --modes "$modes" +} + function run_benchmark_redundancy { if [[ $RUN_REDUNDANCY -ne 1 ]]; then return @@ -278,5 +328,6 @@ function run_benchmark { run_benchmark_encryption run_benchmark_compression_encryption run_benchmark_cryfa_threads || return 1 + run_benchmark_local_perf || return 1 run_benchmark_redundancy } diff --git a/scripts/runtime/run_local_perf.sh b/scripts/runtime/run_local_perf.sh new file mode 100644 index 0000000..14a0ef1 --- /dev/null +++ b/scripts/runtime/run_local_perf.sh @@ -0,0 +1,770 @@ +#!/usr/bin/env bash + +set -euo pipefail + +ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd) + +LABEL=${LOCAL_PERF_LABEL:-} +COMPARE_TO=${LOCAL_PERF_COMPARE_TO:-} +BIN=${LOCAL_PERF_BIN:-build/cryfa} +KEY_FILE=${LOCAL_PERF_KEY_FILE:-pass.txt} +INPUT=${LOCAL_PERF_INPUT:-example/in.fq} +OUT_DIR=${LOCAL_PERF_OUT_DIR:-results/local_perf} +TARGET_MB=${LOCAL_PERF_TARGET_MB:-200} +THREADS=${LOCAL_PERF_THREADS:-1 4 8} +RUNS=${LOCAL_PERF_RUNS:-1} +MODES=${LOCAL_PERF_MODES:-default stop-shuffle} +INTERACTIVE=${LOCAL_PERF_INTERACTIVE:-auto} + +RUN_TIMESTAMP="" +LABEL_BASE="" +RUN_LABEL="" +LABEL_SAFE="" +DATASET="" +INPUT_BYTES=0 +COMPARE_DIR="" +COMPARE_LABEL="" +RAW_CSV="" +SUMMARY_CSV="" +SUMMARY_MD="" +COMPARE_CSV="" +COMPARE_MD="" + +function usage { + cat <<'EOF' +Usage: bash scripts/runtime/run_local_perf.sh [options] + +Options: + --label NAME Base label for this run; a timestamp is always appended + --compare-to NAME Compare against an exact prior run label or a label prefix + --bin PATH Cryfa binary path (default: build/cryfa) + --key-file PATH Key file path (default: pass.txt) + --input PATH Seed input or dataset path (default: example/in.fq) + --out-dir PATH Output folder (default: results/local_perf) + --target-mb N Expand the seed input to at least N MiB (default: 200) + --threads "LIST" Thread counts to test (default: "1 4 8") + --runs N Runs per configuration (default: 1) + --modes "LIST" Modes: default, stop-shuffle, or both + --interactive Prompt for the main run options before starting + --no-prompt Run non-interactively with the current values + --help Show this help text +EOF +} + +function timestamp { + date '+%Y-%m-%d %H:%M:%S' +} + +function log { + printf '[%s] %s\n' "$(timestamp)" "$*" +} + +function log_section { + printf '\n[%s] %s\n' "$(timestamp)" "$*" +} + +function fail { + log "Error: $*" >&2 # stderr, so helpers invoked through $(...) still surface the error instead of capturing it + exit 1 +} + +function is_tty { + [[ -t 0 && -t 1 ]] +} + +function prompt_with_default { + local prompt_text=$1 + local default_value=$2 + local reply + printf '%s [%s]: ' "$prompt_text" "$default_value" >&2 + IFS= read -r reply || exit 1 + 
if [[ -n $reply ]]; then + printf '%s\n' "$reply" + else + printf '%s\n' "$default_value" + fi +} + +function confirm_yes_no { + local prompt_text=$1 + local default_value=$2 + local reply + local normalized + + while true; do + printf '%s [%s]: ' "$prompt_text" "$default_value" >&2 + IFS= read -r reply || exit 1 + if [[ -z $reply ]]; then + reply=$default_value + fi + normalized=$(printf '%s' "$reply" | tr '[:upper:]' '[:lower:]') + case "$normalized" in + y | yes) + return 0 + ;; + n | no) + return 1 + ;; + esac + printf 'Please answer yes or no.\n' >&2 + done +} + +function sanitize_name { + printf '%s' "$1" | tr -c 'A-Za-z0-9._-' '_' +} + +function resolve_path { + local path=$1 + if [[ $path = /* ]]; then + printf '%s\n' "$path" + else + printf '%s/%s\n' "$ROOT_DIR" "$path" + fi +} + +function file_size_bytes { + wc -c <"$1" | awk '{print $1}' +} + +function file_checksum { + cksum "$1" | awk '{print $1 ":" $2}' +} + +function cache_key { + printf '%s' "$1" | cksum | awk '{print $1}' +} + +function format_mib { + awk -v bytes="$1" 'BEGIN { printf "%.2f", bytes / 1048576.0 }' +} + +function mib_per_second { + awk -v bytes="$1" -v seconds="$2" 'BEGIN { + if (seconds + 0 <= 0) { + printf "0.00" + } else { + printf "%.2f", (bytes / 1048576.0) / seconds + } + }' +} + +function parse_time_value { + local key=$1 + local log_file=$2 + awk -v key="$key" '$1 == key { print $2 }' "$log_file" | tail -n 1 +} + +function require_file { + local path=$1 + local label=$2 + if [[ ! -e $path ]]; then + fail "$label \"$path\" does not exist." + fi +} + +function is_probably_script_executable { + local path=$1 + if ! command -v file >/dev/null 2>&1; then + return 1 + fi + + file "$path" | grep -qi 'script' +} + +function resolve_existing_binary { + local requested=$1 + local candidate + + if [[ -x $requested ]] && ! 
is_probably_script_executable "$requested"; then + printf '%s\n' "$requested" + return + fi + + for candidate in "$ROOT_DIR/build/cryfa" "$ROOT_DIR/cryfa"; do + if [[ -x $candidate ]] && ! is_probably_script_executable "$candidate"; then + printf '%s\n' "$candidate" + return + fi + done + + fail "Cryfa binary \"$requested\" does not exist." +} + +function csv_escape { + local value=$1 + value=${value//$'\r'/} + if [[ $value == *','* || $value == *'"'* || $value == *$'\n'* ]]; then + value=${value//\"/\"\"} + printf '"%s"' "$value" + else + printf '%s' "$value" + fi +} + +function append_csv_row { + local output_file=$1 + shift + local first=1 + local field + + { + for field in "$@"; do + if ((first)); then + first=0 + else + printf ',' + fi + csv_escape "$field" + done + printf '\n' + } >>"$output_file" +} + +function convert_tsv_file_to_csv { + local input_file=$1 + local output_file=$2 + awk -F '\t' ' + { + for (i = 1; i <= NF; ++i) { + gsub(/\r/, "", $i) + gsub(/"/, "\"\"", $i) + if ($i ~ /[",\n]/) { + $i = "\"" $i "\"" + } + } + for (i = 1; i <= NF; ++i) { + printf "%s", $i + if (i < NF) { + printf "," + } + } + printf "\n" + } + ' "$input_file" >"$output_file" +} + +function ask_for_options { + log "Interactive benchmark setup" + LABEL=$(prompt_with_default "Base label" "${LABEL:-baseline}") + INPUT=$(prompt_with_default "Input file" "$INPUT") + TARGET_MB=$(prompt_with_default "Target dataset size in MiB (0 keeps the input as-is)" "$TARGET_MB") + THREADS=$(prompt_with_default "Thread counts (space or comma separated)" "$THREADS") + RUNS=$(prompt_with_default "Runs per case" "$RUNS") + MODES=$(prompt_with_default "Modes (default, stop-shuffle, both)" "$MODES") + COMPARE_TO=$(prompt_with_default "Compare to previous label (leave empty to skip)" "$COMPARE_TO") +} + +function build_run_label { + RUN_TIMESTAMP=$(date '+%Y%m%d_%H%M%S') + LABEL_BASE=${LABEL:-baseline} + LABEL_BASE=${LABEL_BASE//$'\r'/} + LABEL_BASE=${LABEL_BASE//$'\n'/ } + LABEL_BASE=${LABEL_BASE//,/ 
_} + LABEL_BASE=${LABEL_BASE//\"/_} + RUN_LABEL="${LABEL_BASE}_${RUN_TIMESTAMP}" + LABEL_SAFE=$(sanitize_name "$RUN_LABEL") +} + +function build_dataset { + local seed_bytes + local source_path + local source_checksum + local source_key + seed_bytes=$(file_size_bytes "$INPUT") + source_path=$(resolve_path "$INPUT") + + if (( TARGET_MB <= 0 )) || (( seed_bytes >= TARGET_MB * 1024 * 1024 )); then + DATASET=$INPUT + log "Using input file directly as benchmark dataset: $DATASET ($(format_mib "$seed_bytes") MiB)" + return + fi + + local target_bytes=$((TARGET_MB * 1024 * 1024)) + local copies=$(((target_bytes + seed_bytes - 1) / seed_bytes)) + local dataset_dir="$OUT_DIR/datasets" + local base_name + local ext + local tmp + + mkdir -p "$dataset_dir" + base_name=$(basename "${INPUT%.*}") + ext=${INPUT##*.} + if [[ $ext == "$INPUT" ]]; then + ext="dat" + fi + + source_checksum=$(file_checksum "$INPUT") + source_key=$(cache_key "${source_path}:${source_checksum}") + DATASET="$dataset_dir/${base_name}_${source_key}_${TARGET_MB}mb_x${copies}.${ext}" + + if [[ -f $DATASET && -f $DATASET.meta ]] && + grep -Fqx "dataset_format=2" "$DATASET.meta" && + grep -Fqx "source=$source_path" "$DATASET.meta" && + grep -Fqx "source_checksum=$source_checksum" "$DATASET.meta" && + grep -Fqx "seed_bytes=$seed_bytes" "$DATASET.meta" && + grep -Fqx "copies=$copies" "$DATASET.meta" && + grep -Fqx "target_mb=$TARGET_MB" "$DATASET.meta"; then + log "Reusing cached generated dataset: $DATASET ($(format_mib "$(file_size_bytes "$DATASET")") MiB)" + return + fi + + if (( copies > 1 )) && ! command -v perl >/dev/null 2>&1; then + fail "perl is required to expand \"$INPUT\" into a local dataset." 
+ fi + + tmp="$DATASET.tmp" + rm -f "$tmp" + + log "Generating synthetic benchmark dataset from $INPUT" + log "Seed size: $(format_mib "$seed_bytes") MiB | Copies: $copies | Target: ${TARGET_MB} MiB+" + + if (( copies == 1 )); then + cp "$INPUT" "$tmp" + else + perl -e ' + use strict; + use warnings; + + my ($input, $copies, $output) = @ARGV; + open my $in, "<", $input or die "Unable to open $input: $!"; + binmode $in; + local $/; + my $chunk = <$in>; + close $in; + my $terminator = length($chunk) && substr($chunk, -1) eq "\n" ? "" : "\n"; + + open my $out, ">", $output or die "Unable to open $output: $!"; + binmode $out; + for (1 .. $copies) { + print {$out} $chunk; + print {$out} $terminator if length($terminator); + } + close $out or die "Unable to write $output: $!"; + ' "$INPUT" "$copies" "$tmp" + fi + + mv "$tmp" "$DATASET" + cat >"$DATASET.meta" <"$compressed"; } 2>"$c_log"; then + fail "compression failed for mode=$mode thread=$thread run=$run_index. See $c_log" + fi + + decompress_cmd=("$BIN" -k "$KEY_FILE" -t "$thread" -d "$compressed") + log "Decompress command: ${decompress_cmd[*]}" + if ! { time -p "${decompress_cmd[@]}" >"$decompressed"; } 2>"$d_log"; then + fail "decompression failed for mode=$mode thread=$thread run=$run_index. See $d_log" + fi + + if ! cmp -s "$DATASET" "$decompressed"; then + fail "round-trip mismatch for mode=$mode thread=$thread run=$run_index." 
+ fi + + local compressed_bytes + local c_real + local c_user + local c_sys + local d_real + local d_user + local d_sys + local c_rate + local d_rate + + compressed_bytes=$(file_size_bytes "$compressed") + c_real=$(parse_time_value real "$c_log") + c_user=$(parse_time_value user "$c_log") + c_sys=$(parse_time_value sys "$c_log") + d_real=$(parse_time_value real "$d_log") + d_user=$(parse_time_value user "$d_log") + d_sys=$(parse_time_value sys "$d_log") + c_rate=$(mib_per_second "$INPUT_BYTES" "$c_real") + d_rate=$(mib_per_second "$INPUT_BYTES" "$d_real") + + append_csv_row "$RAW_CSV" \ + "$RUN_LABEL" "$mode" "$thread" "$run_index" "$INPUT_BYTES" "$compressed_bytes" \ + "$c_real" "$c_user" "$c_sys" "$c_rate" \ + "$d_real" "$d_user" "$d_sys" "$d_rate" "ok" + + log "Case complete: compressed $(format_mib "$compressed_bytes") MiB | c=${c_real}s (${c_rate} MiB/s) | d=${d_real}s (${d_rate} MiB/s) | verified=ok" + + rm -f "$compressed" "$decompressed" + done +} + +function write_summary_csv { + append_csv_row "$SUMMARY_CSV" \ + label mode threads runs input_bytes compressed_bytes avg_c_real avg_c_user avg_c_sys avg_c_mib_s avg_d_real avg_d_user avg_d_sys avg_d_mib_s verified + + local mode + local thread + for mode in $MODES; do + for thread in $THREADS; do + awk -F ',' -v label="$RUN_LABEL" -v mode="$mode" -v thread="$thread" ' + NR > 1 && $2 == mode && $3 == thread { + count++ + input_bytes = $5 + compressed_bytes = $6 + c_real += $7 + c_user += $8 + c_sys += $9 + c_mib += $10 + d_real += $11 + d_user += $12 + d_sys += $13 + d_mib += $14 + if ($15 != "ok") verified = "FAIL" + } + END { + if (count > 0) { + if (verified == "") verified = "ok" + printf "%s,%s,%s,%d,%s,%s,%.6f,%.6f,%.6f,%.2f,%.6f,%.6f,%.6f,%.2f,%s\n", + label, mode, thread, count, input_bytes, compressed_bytes, + c_real / count, c_user / count, c_sys / count, c_mib / count, + d_real / count, d_user / count, d_sys / count, d_mib / count, + verified + } + } + ' "$RAW_CSV" >>"$SUMMARY_CSV" + done + done +} + 
+function write_summary_markdown { + { + echo "# Local Performance Summary" + echo + echo "- Label: $RUN_LABEL" + echo "- Dataset: $DATASET" + echo "- Dataset size: $(format_mib "$INPUT_BYTES") MiB" + echo "- Runs per case: $RUNS" + echo "- Threads: $THREADS" + echo "- Modes: $MODES" + echo "- Raw data: $(basename "$RAW_CSV")" + echo "- Summary data: $(basename "$SUMMARY_CSV")" + echo + echo "| Mode | Threads | Runs | Compressed Size (MiB) | Ratio | Compress Time (s) | Compress MiB/s | Decompress Time (s) | Decompress MiB/s | Verified |" + echo "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |" + awk -F ',' ' + NR > 1 { + printf "| %s | %s | %s | %.2f | %.3f | %.3f | %.2f | %.3f | %.2f | %s |\n", + $2, $3, $4, $6 / 1048576.0, $6 / $5, $7, $10, $11, $14, $15 + } + ' "$SUMMARY_CSV" + } >"$SUMMARY_MD" +} + +function summary_input_for_compare { + local dir=$1 + if [[ -f "$dir/summary.csv" ]]; then + printf '%s\n' "$dir/summary.csv" + return + fi + if [[ -f "$dir/summary.tsv" ]]; then + printf '%s\n' "$dir/summary.tsv" + return + fi + fail "comparison summary was not found in \"$dir\"." +} + +function write_compare_report { + [[ -z $COMPARE_DIR ]] && return + + local baseline_summary + local baseline_csv + baseline_summary=$(summary_input_for_compare "$COMPARE_DIR") + + append_csv_row "$COMPARE_CSV" \ + mode threads compress_speedup decompress_speedup roundtrip_speedup size_ratio_delta baseline_verified current_verified + + if [[ $baseline_summary == *.tsv ]]; then + baseline_csv=$(mktemp "${TMPDIR:-/tmp}/cryfa-compare-baseline.XXXXXX.csv") + convert_tsv_file_to_csv "$baseline_summary" "$baseline_csv" + else + baseline_csv=$baseline_summary + fi + + awk -F ',' ' + NR == FNR { + if (FNR == 1) next + key = $2 SUBSEP $3 + base_c[key] = $7 + base_d[key] = $11 + base_ratio[key] = $6 / $5 + base_verified[key] = $15 + next + } + FNR == 1 { + next + } + { + key = $2 SUBSEP $3 + if (!(key in base_c)) next + compress_speedup = ($7 + 0 > 0) ? 
base_c[key] / $7 : 0 + decompress_speedup = ($11 + 0 > 0) ? base_d[key] / $11 : 0 + roundtrip_speedup = (($7 + $11) + 0 > 0) ? (base_c[key] + base_d[key]) / ($7 + $11) : 0 + ratio_delta = ($6 / $5) - base_ratio[key] + printf "%s,%s,%.3f,%.3f,%.3f,%.6f,%s,%s\n", + $2, $3, compress_speedup, decompress_speedup, roundtrip_speedup, ratio_delta, + base_verified[key], $15 + } + ' "$baseline_csv" "$SUMMARY_CSV" >>"$COMPARE_CSV" + + if [[ $baseline_summary == *.tsv ]]; then + rm -f "$baseline_csv" + fi + + { + echo "# Before/After Comparison" + echo + echo "- Before: $COMPARE_LABEL" + echo "- After: $RUN_LABEL" + echo "- Speedup values above 1.00 are faster than the baseline." + echo "- Negative size deltas mean the new run produced smaller output." + echo "- Comparison data: $(basename "$COMPARE_CSV")" + echo + echo "| Mode | Threads | Compress Speedup | Decompress Speedup | Round-Trip Speedup | Size Ratio Delta | Baseline Verified | Current Verified |" + echo "| --- | ---: | ---: | ---: | ---: | ---: | --- | --- |" + awk -F ',' ' + NR > 1 { + printf "| %s | %s | %.3f | %.3f | %.3f | %.6f | %s | %s |\n", + $1, $2, $3, $4, $5, $6, $7, $8 + } + ' "$COMPARE_CSV" + } >"$COMPARE_MD" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --label) + LABEL=$2 + shift 2 + ;; + --compare-to) + COMPARE_TO=$2 + shift 2 + ;; + --bin) + BIN=$2 + shift 2 + ;; + --key-file) + KEY_FILE=$2 + shift 2 + ;; + --input) + INPUT=$2 + shift 2 + ;; + --out-dir) + OUT_DIR=$2 + shift 2 + ;; + --target-mb) + TARGET_MB=$2 + shift 2 + ;; + --threads) + THREADS=$2 + shift 2 + ;; + --runs) + RUNS=$2 + shift 2 + ;; + --modes) + MODES=$2 + shift 2 + ;; + --interactive) + INTERACTIVE=yes + shift + ;; + --no-prompt) + INTERACTIVE=no + shift + ;; + -h | --help) + usage + exit 0 + ;; + *) + echo "Error: unknown argument \"$1\"." 
>&2 + usage >&2 + exit 1 + ;; + esac +done + +if [[ $INTERACTIVE == auto ]] && is_tty; then + INTERACTIVE=yes +fi + +if [[ $INTERACTIVE == yes ]] && is_tty; then + ask_for_options +fi + +THREADS=${THREADS//,/ } +case "$MODES" in +both) + MODES="default stop-shuffle" + ;; +*) + MODES=${MODES//,/ } + ;; +esac + +if [[ ! $RUNS =~ ^[1-9][0-9]*$ ]]; then + fail "--runs must be a positive integer." +fi + +if [[ ! $TARGET_MB =~ ^[0-9]+$ ]]; then + fail "--target-mb must be a non-negative integer." +fi + +BIN=$(resolve_existing_binary "$(resolve_path "$BIN")") +KEY_FILE=$(resolve_path "$KEY_FILE") +INPUT=$(resolve_path "$INPUT") +OUT_DIR=$(resolve_path "$OUT_DIR") + +require_file "$KEY_FILE" "key file" +require_file "$INPUT" "input file" + +if [[ $INTERACTIVE == yes ]] && is_tty; then + log "Planned benchmark settings:" + log " Base label: ${LABEL:-baseline}" + log " Input file: $INPUT" + log " Target size: $TARGET_MB MiB" + log " Threads: $THREADS" + log " Runs per case: $RUNS" + log " Modes: $MODES" + if [[ -n $COMPARE_TO ]]; then + log " Compare to: $COMPARE_TO" + else + log " Compare to: none" + fi + if ! confirm_yes_no "Start this benchmark run?" "yes"; then + fail "benchmark cancelled." 
+ fi +fi + +build_run_label + +mkdir -p "$OUT_DIR" +RUN_DIR="$OUT_DIR/$LABEL_SAFE" +DETAILS_DIR="$RUN_DIR/details" +RAW_CSV="$RUN_DIR/raw.csv" +SUMMARY_CSV="$RUN_DIR/summary.csv" +SUMMARY_MD="$RUN_DIR/summary.md" +COMPARE_CSV="$RUN_DIR/compare.csv" +COMPARE_MD="$RUN_DIR/compare.md" + +rm -rf "$RUN_DIR" +mkdir -p "$DETAILS_DIR" + +log_section "Local performance benchmark" +log "Run label: $RUN_LABEL" +log "Binary: $BIN" +log "Key file: $KEY_FILE" +log "Input: $INPUT" +log "Output directory: $RUN_DIR" +log "Requested threads: $THREADS" +log "Runs per case: $RUNS" +log "Modes: $MODES" + +build_dataset +INPUT_BYTES=$(file_size_bytes "$DATASET") +log "Measured dataset size: $(format_mib "$INPUT_BYTES") MiB" + +resolve_compare_target + +append_csv_row "$RAW_CSV" \ + label mode threads run input_bytes compressed_bytes c_real c_user c_sys c_mib_s d_real d_user d_sys d_mib_s verified + +for mode in $MODES; do + for thread in $THREADS; do + run_case "$mode" "$thread" + done +done + +log_section "Writing reports" +write_summary_csv +write_summary_markdown +write_compare_report + +log "Raw CSV: $RAW_CSV" +log "Summary CSV: $SUMMARY_CSV" +log "Summary markdown: $SUMMARY_MD" +if [[ -f $COMPARE_MD ]]; then + log "Comparison CSV: $COMPARE_CSV" + log "Comparison markdown: $COMPARE_MD" +fi + +log_section "Benchmark complete" diff --git a/src/application.cpp b/src/application.cpp index 7742bad..507e15f 100644 --- a/src/application.cpp +++ b/src/application.cpp @@ -1,14 +1,14 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file application.cpp - * @brief Application - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file application.cpp + * @brief Application */ #include "application.hpp" -#include +#include #include "assert.hpp" #include "numeric.hpp" @@ -39,7 +39,7 @@ void application::exe_compress_encrypt() { crypt.shuffle_file(); 
break; default: - error("\"" + par.in_file + "\" is not a valid FASTA or FASTQ file."); + error(std::format("\"{}\" is not a valid FASTA or FASTQ file.", par.in_file)); } } @@ -47,9 +47,7 @@ void application::exe_compress_encrypt() { * @brief Decrypt and/or unshuffle + decompress */ void application::exe_decrypt_decompress() { - crypt.decrypt(); - std::ifstream in(DEC_FNAME); - switch (in.peek()) { + switch (crypt.peek_decrypted_type()) { case (char)127: fa.decompress(); break; @@ -57,18 +55,18 @@ void application::exe_decrypt_decompress() { fq.decompress(); break; case (char)125: + crypt.decrypt(); crypt.unshuffle_file(); break; default: error("corrupted file."); } - in.close(); } /** * @brief Execute Cryfa - * @param argc number of command line arguments - * @param argv command line arguments + * @param argc Number of command line arguments + * @param argv Command line arguments */ void application::exe(int argc, char* argv[]) { const char action = parse(par, argc, argv); diff --git a/src/application.hpp b/src/application.hpp index c44e438..a27e077 100644 --- a/src/application.hpp +++ b/src/application.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file application.hpp - * @brief Application - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file application.hpp + * @brief Application */ #ifndef CRYFA_APPLICATION_H @@ -32,4 +32,4 @@ class application { } // namespace cryfa -#endif // CRYFA_APPLICATION_H \ No newline at end of file +#endif // CRYFA_APPLICATION_H diff --git a/src/cryfa.cpp b/src/cryfa.cpp index b79050b..394ea00 100644 --- a/src/cryfa.cpp +++ b/src/cryfa.cpp @@ -1,16 +1,9 @@ -/***************************************************** - Cryfa :: A secure encryption tool for genomic data -****************************************************** - Morteza Hosseini seyedmorteza@ua.pt - Diogo 
Pratas pratas@ua.pt -******************************************************/ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only /** - * @file cryfa.cpp - * @brief Main - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file cryfa.cpp + * @brief Main file - Cryfa: a secure encryption tool for genomic data */ #include // std::exception @@ -19,9 +12,9 @@ using namespace cryfa; /** - * @brief Run Cryfa - * @param argc number of command line arguments - * @param argv command line arguments + * @brief Run Cryfa + * @param argc Number of command line arguments + * @param argv Command line arguments * @return SUCCESS or FAILURE */ int main(int argc, char* argv[]) { @@ -34,4 +27,4 @@ int main(int argc, char* argv[]) { } return 0; -} \ No newline at end of file +} diff --git a/src/def.hpp b/src/def.hpp index e9b2863..a077091 100644 --- a/src/def.hpp +++ b/src/def.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file def.hpp - * @brief Definitions - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file def.hpp + * @brief Definitions */ #ifndef CRYFA_DEF_H @@ -14,12 +14,9 @@ #include // std::mt19937 #include // Hash table -namespace cryfa { -// Version -static const std::string MONTH = "04"; -static const std::string YEAR = "26"; -static const std::string VERSION = YEAR + "." 
+ MONTH; +#include "cryfa/version.hpp" +namespace cryfa { // Typedefs using byte = unsigned char; using u16 = unsigned short; @@ -28,13 +25,13 @@ using u64 = unsigned long long; using i64 = long long; using rng_t = std::mt19937; using htbl_t = std::unordered_map; -using pos_t = std::char_traits::pos_type; /**< @brief tellg(), tellp() */ +using pos_t = std::char_traits::pos_type; // Position type for tellg() and tellp() // Metaprograms /** - * Power (B^E). Usage: "cerr << POWER<3,2>::val;" which yields 9 - * @tparam B Base - * @tparam E Exponent + * @brief Power (B^E). Usage: "cerr << POWER<3,2>::val;" which yields 9 + * @tparam B Base + * @tparam E Exponent * @warning Base (B) and exponent (E) MUST be known at compile time. */ template @@ -56,51 +53,45 @@ struct POWER { #define LOOP4(i, j, k, l, S) LOOP(i, S) LOOP(j, S) LOOP2(k, l, S) #define LOOP5(i, j, k, l, m, S) LOOP(i, S) LOOP(j, S) LOOP3(k, l, m, S) #define LOOP6(i, j, k, l, m, n, S) LOOP(i, S) LOOP(j, S) LOOP4(k, l, m, n, S) -#define LOOP7(i, j, k, l, m, n, o, S) \ - LOOP(i, S) LOOP(j, S) LOOP5(k, l, m, n, o, S) -#define LOOP8(i, j, k, l, m, n, o, p, S) \ - LOOP(i, S) LOOP(j, S) LOOP6(k, l, m, n, o, p, S) -#define IGNORE_THIS_LINE(in) \ - (in).ignore(std::numeric_limits::max(), '\n') +#define LOOP7(i, j, k, l, m, n, o, S) LOOP(i, S) LOOP(j, S) LOOP5(k, l, m, n, o, S) +#define LOOP8(i, j, k, l, m, n, o, p, S) LOOP(i, S) LOOP(j, S) LOOP6(k, l, m, n, o, p, S) +#define IGNORE_THIS_LINE(in) (in).ignore(std::numeric_limits::max(), '\n') // Constants -static const std::string THR_ID_HDR = "THRD="; /**< @brief Thread ID header */ -static const std::string PK_FNAME = "CRYFA_PK"; /**< @brief Packed file name */ -static const std::string PCKD_FNAME = - "CRYFA_PCKD"; /**< @brief Pckd f name - joined*/ -static const std::string SH_FNAME = "CRYFA_SH"; /**< @brief Shuffed file name */ -static const std::string DEC_FNAME = - "CRYFA_DEC"; /**< @brief Decrypted file name */ -static const std::string UPK_FNAME = - "CRYFA_UPK"; 
/**< @brief Unpacked file name */ -static const std::string USH_FNAME = - "CRYFA_USH"; /**< @brief Unshuffled file name*/ -constexpr byte DEF_N_THR = 8; /**< @brief Default number of threads */ -constexpr u64 BLOCK_SIZE = 8 * 1024; /**< @brief To read/write from/to file */ -constexpr byte C1 = 2; /**< @brief Cat 1 = 2 */ -constexpr byte C2 = 3; /**< @brief Cat 2 = 3 */ -constexpr byte MIN_C3 = 4; /**< @brief 4 <= Cat 3 <= 6 */ +static const std::string THR_ID_HDR = "THRD="; // Thread ID header +static const std::string PK_FNAME = "CRYFA_PK"; // Packed file name +static const std::string PCKD_FNAME = "CRYFA_PCKD"; // Packed file name - joined +static const std::string SH_FNAME = "CRYFA_SH"; // Shuffled file name +static const std::string DEC_FNAME = "CRYFA_DEC"; // Decrypted file name +static const std::string UPK_FNAME = "CRYFA_UPK"; // Unpacked file name +static const std::string USH_FNAME = "CRYFA_USH"; // Unshuffled file name +constexpr byte DEF_N_THR = 8; // Default number of threads +constexpr u64 IO_BUFFER_SIZE = 8ULL * 1024ULL; // Buffered output writes +constexpr u64 CHUNK_TARGET_SIZE = 1024ULL * 1024ULL; // Internal worker chunk target +constexpr byte C1 = 2; // Cat 1 = 2 +constexpr byte C2 = 3; // Cat 2 = 3 +constexpr byte MIN_C3 = 4; // 4 <= Cat 3 <= 6 constexpr byte MID_C3 = 5; constexpr byte MAX_C3 = 6; -constexpr byte MIN_C4 = 7; /**< @brief 7 <= Cat 4 <= 15 */ +constexpr byte MIN_C4 = 7; // 7 <= Cat 4 <= 15 constexpr byte MAX_C4 = 15; -constexpr byte MIN_C5 = 16; /**< @brief 16 <= Cat 5 <= 39 */ +constexpr byte MIN_C5 = 16; // 16 <= Cat 5 <= 39 constexpr byte MAX_C5 = 39; -constexpr byte KEYLEN_C1 = 7; /**< @brief 7 to 1 byte. 
Build hash table*/ -constexpr byte KEYLEN_C2 = 5; /**< @brief 5 to 1 byte */ -constexpr byte KEYLEN_C3 = 3; /**< @brief 3 to 1 byte */ -constexpr byte KEYLEN_C4 = 2; /**< @brief 2 to 1 byte */ -constexpr byte KEYLEN_C5 = 3; /**< @brief 3 to 2 byte */ -constexpr int TAG_SIZE = 12; /**< @brief GCC mode auth enc */ +constexpr byte KEYLEN_C1 = 7; // 7 to 1 byte. Build hash table +constexpr byte KEYLEN_C2 = 5; // 5 to 1 byte +constexpr byte KEYLEN_C3 = 3; // 3 to 1 byte +constexpr byte KEYLEN_C4 = 2; // 2 to 1 byte +constexpr byte KEYLEN_C5 = 3; // 3 to 2 byte +constexpr int TAG_SIZE = 12; // GCC mode auth enc /** @brief Command line input arguments */ struct Param { - static bool verbose; /**< @brief Verbose mode */ - static bool stop_shuffle; /**< @brief Disable shuffling */ - static byte n_threads; /**< @brief Number of threads */ - static std::string in_file; /**< @brief Input file name */ - static std::string key_file; /**< @brief Password file name */ - static char format; /**< @brief Format of the input file */ + static bool verbose; // Verbose mode + static bool stop_shuffle; // Disable shuffling + static byte n_threads; // Number of threads + static std::string in_file; // Input file name + static std::string key_file; // Password file name + static char format; // Format of the input file }; } // namespace cryfa diff --git a/src/endecrypto.cpp b/src/endecrypto.cpp index b9b3241..0359af8 100644 --- a/src/endecrypto.cpp +++ b/src/endecrypto.cpp @@ -1,18 +1,21 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file endecrypto.cpp - * @brief Encryption/Decryption - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file endecrypto.cpp + * @brief Encryption/Decryption */ #include "endecrypto.hpp" #include +#include #include // std::pow +#include #include #include #include // setw, std::setprecision +#include #include 
#include #include @@ -22,16 +25,113 @@ #include "time.hpp" using namespace cryfa; -std::mutex mutxEnDe; /**< @brief Mutex */ +std::mutex mutxEnDe; + +namespace { +constexpr u16 INVALID_RANK = std::numeric_limits::max(); + +struct DenseLookup { + std::string alphabet; + bool with_extra = false; + u16 base = 0; + u16 extra_rank = 0; + std::array rank{}; +}; + +auto build_dense_lookup(const std::string& alphabet, bool with_extra) -> DenseLookup { + DenseLookup lookup; + lookup.alphabet = alphabet; + lookup.with_extra = with_extra; + lookup.base = static_cast(alphabet.size() + (with_extra ? 1 : 0)); + lookup.extra_rank = static_cast(alphabet.size()); + lookup.rank.fill(INVALID_RANK); + + for (u16 i = 0; i != alphabet.size(); ++i) { + lookup.rank[(byte)alphabet[i]] = i; + } + if (with_extra) { + lookup.rank[(byte)(alphabet.back() + 1)] = lookup.extra_rank; + } + + return lookup; +} + +auto dense_lookup(const std::string& alphabet, bool with_extra = false) -> const DenseLookup& { + thread_local std::vector cache; + for (const DenseLookup& lookup : cache) { + if (lookup.with_extra == with_extra && lookup.alphabet == alphabet) { + return lookup; + } + } + + cache.push_back(build_dense_lookup(alphabet, with_extra)); + return cache.back(); +} + +auto checked_rank(const DenseLookup& lookup, char c) -> u16 { + const u16 rank = lookup.rank[(byte)c]; + if (rank == INVALID_RANK) { + error(std::format("symbol \"{}\" not found!", c)); + } + return rank; +} + +auto tuple_index(const DenseLookup& lookup, const char* tuple, size_t width) -> u16 { + u64 index = 0; + for (size_t i = 0; i != width; ++i) { + index = index * lookup.base + checked_rank(lookup, tuple[i]); + } + return static_cast(index); +} + +auto large_tuple_index(const DenseLookup& lookup, char s0, char s1, char s2, bool& first_not_in, + bool& second_not_in, bool& third_not_in) -> u16 { + auto rank_or_extra = [&](char c, bool& not_in) { + const u16 rank = lookup.rank[(byte)c]; + not_in = (rank == INVALID_RANK); + return 
not_in ? lookup.extra_rank : rank; + }; + + const u16 r0 = rank_or_extra(s0, first_not_in); + const u16 r1 = rank_or_extra(s1, second_not_in); + const u16 r2 = rank_or_extra(s2, third_not_in); + return static_cast((r0 * lookup.base + r1) * lookup.base + r2); +} + +auto dna_rank_or_x(char c, bool& not_in) -> byte { + not_in = false; + switch (c) { + case 'A': + return 0; + case 'C': + return 1; + case 'G': + return 2; + case 'T': + return 3; + case 'N': + return 4; + default: + not_in = true; + return 5; + } +} + +void append_penalty_tail(std::string& packed, const std::string& input, size_t pos) { + for (; pos != input.size(); ++pos) { + packed += (char)255; + packed += input[pos]; + } +} +} // namespace /** * @brief Build a hash table - * @param[out] map Hash table - * @param[in] strIn The string including the keys - * @param[in] keyLen Length of the keys + * @param[out] map Hash table + * @param strIn The string including the keys + * @param keyLen Length of the keys */ -void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn, - short keyLen) { +void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn, short keyLen) { u64 elementNo = 0; std::string element; element.reserve((unsigned long)keyLen); @@ -132,12 +232,12 @@ void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn, /** * @brief Build a table for unpacking - * @param[out] unpack Table (vector of strings) - * @param[in] strIn The string including the keys - * @param[in] keyLen Length of the keys + * @param[out] unpack Table (vector of strings) + * @param strIn The string including the keys + * @param keyLen Length of the keys */ -void EnDecrypto::build_unpack_tbl(std::vector& unpack, - const std::string& strIn, u16 keyLen) { +void EnDecrypto::build_unpack_tbl(std::vector& unpack, const std::string& strIn, + u16 keyLen) { std::string element; element.reserve(keyLen); unpack.clear(); @@ -233,432 +333,210 @@ void EnDecrypto::build_unpack_tbl(std::vector& unpack, } } 
-/** - * @brief Index of each DNA bases pack - * @param key Key - * @return Value (based on the idea of key-value in a hash table) - */ -byte EnDecrypto::dna_pack_idx(const std::string& key) { - const auto found = DNA_MAP.find(key); - if (found == DNA_MAP.end()) error("key \"" + key + "\" not found!"); - - return (byte)found->second; -} - -/** - * @brief Index of each pack, when # > 39 - * @param key Key - * @param map Hash table - * @return Value (based on the idea of key-value in a hash table) - */ -u16 EnDecrypto::large_pack_idx(const std::string& key, const htbl_t& map) { - const auto found = map.find(key); - if (found == map.end()) error("key \"" + key + "\" not found!"); - - return (u16)found->second; -} - /** * @brief Encapsulate each 3 DNA bases in 1 byte. Reduction: ~2/3 - * @param[out] packedSeq Packed sequence - * @param[in] seq Sequence + * @param[out] packedSeq Packed sequence + * @param seq Sequence */ void EnDecrypto::pack_seq(std::string& packedSeq, const std::string& seq) { - auto i = seq.begin(); - - for (auto iEnd = seq.end() - 2; i < iEnd; i += 3) { - char s0 = *i, s1 = *(i + 1), s2 = *(i + 2); + size_t pos = 0; + const size_t tuple_limit = seq.size() - (seq.size() % 3); - std::string tuple; - tuple.reserve(3); + for (; pos != tuple_limit; pos += 3) { + const char s0 = seq[pos]; + const char s1 = seq[pos + 1]; + const char s2 = seq[pos + 2]; bool firstNotIn, secondNotIn, thirdNotIn; - tuple += (firstNotIn = (s0 != 'A' && s0 != 'C' && s0 != 'G' && s0 != 'T' && - s0 != 'N')) - ? 'X' - : s0; - tuple += (secondNotIn = (s1 != 'A' && s1 != 'C' && s1 != 'G' && s1 != 'T' && - s1 != 'N')) - ? 'X' - : s1; - tuple += (thirdNotIn = (s2 != 'A' && s2 != 'C' && s2 != 'G' && s2 != 'T' && - s2 != 'N')) - ? 
'X' - : s2; - - packedSeq += dna_pack_idx(tuple); - if (firstNotIn) packedSeq += s0; - if (secondNotIn) packedSeq += s1; - if (thirdNotIn) packedSeq += s2; - } - // If seq len isn't multiple of 3, add (char) 255 before each sym - switch (seq.length() % 3) { - case 1: - packedSeq += (char)255; - packedSeq += *i; - break; + const byte r0 = dna_rank_or_x(s0, firstNotIn); + const byte r1 = dna_rank_or_x(s1, secondNotIn); + const byte r2 = dna_rank_or_x(s2, thirdNotIn); + packedSeq += static_cast((r0 * 6 + r1) * 6 + r2); - case 2: - packedSeq += (char)255; - packedSeq += *i; - packedSeq += (char)255; - packedSeq += *(i + 1); - break; - - default: - break; + if (firstNotIn) { + packedSeq += s0; + } + if (secondNotIn) { + packedSeq += s1; + } + if (thirdNotIn) { + packedSeq += s2; + } } + + append_penalty_tail(packedSeq, seq, pos); } /** * @brief Encapsulate 3 header symbols in 2 bytes, when # >= 40. * -- FASTA/FASTQ. Reduction ~1/3 - * @param[out] packed Packed header - * @param[in] strIn Header - * @param[in] map Hash table + * @param[out] packed Packed header + * @param strIn Header + * @param map Hash table */ -void EnDecrypto::pack_hL_fa_fq(std::string& packed, const std::string& strIn, - const htbl_t& map) { +void EnDecrypto::pack_hL_fa_fq(std::string& packed, const std::string& strIn, const htbl_t& map) { pack_large(packed, strIn, Hdrs, map); } /** * @brief Encapsulate 3 quality score symbols in 2 bytes, when # >= 40. - * -- FASTQ. Reduction ~1/3 - * @param[out] packed Packed qulity scores - * @param[in] strIn Quality scores - * @param[in] map Hash table + * -- FASTQ. 
Reduction ~1/3 + * @param[out] packed Packed qulity scores + * @param strIn Quality scores + * @param map Hash table */ -void EnDecrypto::pack_qL_fq(std::string& packed, const std::string& strIn, - const htbl_t& map) { +void EnDecrypto::pack_qL_fq(std::string& packed, const std::string& strIn, const htbl_t& map) { pack_large(packed, strIn, QSs, map); } /** * @brief Encapsulate 3 header/quality score symbols in 2 bytes, when # >= 40 * -- FASTA/FASTQ. Reduction ~1/3 - * @param[out] packed Packed qulity scores - * @param[in] strIn Input header/quality score - * @param[in] hdrQs Collection of headers/quality scores - * @param[in] map Hash table + * @param[out] packed Packed qulity scores + * @param strIn Input header/quality score + * @param hdrQs Collection of headers/quality scores + * @param map Hash table */ -inline void EnDecrypto::pack_large(std::string& packed, - const std::string& strIn, - const std::string& hdrQs, - const htbl_t& map) { - // ASCII char after the last char in QUALITY_SCORES std::string - const auto XChar = (char)(hdrQs.back() + 1); - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) { - char s0 = *i, s1 = *(i + 1), s2 = *(i + 2); - - std::string tuple; - tuple.reserve(3); +inline void EnDecrypto::pack_large(std::string& packed, const std::string& strIn, + const std::string& hdrQs, const htbl_t& map) { + (void)map; + const DenseLookup& lookup = dense_lookup(hdrQs, true); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 3); + + for (; pos != tuple_limit; pos += 3) { + const char s0 = strIn[pos]; + const char s1 = strIn[pos + 1]; + const char s2 = strIn[pos + 2]; bool firstNotIn, secondNotIn, thirdNotIn; - tuple = (firstNotIn = (hdrQs.find(s0) == std::string::npos)) ? XChar : s0; - tuple += (secondNotIn = (hdrQs.find(s1) == std::string::npos)) ? XChar : s1; - tuple += (thirdNotIn = (hdrQs.find(s2) == std::string::npos)) ? 
XChar : s2; - u16 shortTuple = large_pack_idx(tuple, map); + const u16 shortTuple = + large_tuple_index(lookup, s0, s1, s2, firstNotIn, secondNotIn, thirdNotIn); packed += (unsigned char)(shortTuple >> 8); // Left byte packed += (unsigned char)(shortTuple & 0xFF); // Right byte - if (firstNotIn) packed += s0; - if (secondNotIn) packed += s1; - if (thirdNotIn) packed += s2; + if (firstNotIn) { + packed += s0; + } + if (secondNotIn) { + packed += s1; + } + if (thirdNotIn) { + packed += s2; + } } - // If len isn't multiple of 3, add (char) 255 before each sym - switch (strIn.length() % 3) { - case 1: - packed += (char)255; - packed += *i; - break; - - case 2: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - break; - - default: - break; - } + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 3 symbols in 2 bytes, when 16 <= # <= 39. Reduction ~1/3 - * @param[out] packed Packed string - * @param[in] strIn Input string - * @param[in] map Hash table + * @param[out] packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_3to2(std::string& packed, const std::string& strIn, - const htbl_t& map) { - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) { - std::string tuple; - tuple.reserve(3); - tuple = *i; - tuple += *(i + 1); - tuple += *(i + 2); - u16 shortTuple = (u16)map.find(tuple)->second; +void EnDecrypto::pack_3to2(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? 
QSs : Hdrs); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 3); + + for (; pos != tuple_limit; pos += 3) { + const u16 shortTuple = tuple_index(lookup, strIn.data() + pos, 3); packed += (byte)(shortTuple >> 8); // Left byte packed += (byte)(shortTuple & 0xFF); // Right byte } - // If len isn't multiple of 3, add (char) 255 before each sym - switch (strIn.length() % 3) { - case 1: - packed += (char)255; - packed += *i; - break; - - case 2: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - break; - - default: - break; - } + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 2 symbols in 1 byte, when 7 <= # <= 15. Reduction ~1/2 - * @param[out] packed Packed string - * @param[in] strIn Input string - * @param[in] map Hash table + * @param[out] packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_2to1(std::string& packed, const std::string& strIn, - const htbl_t& map) { - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 1; i < iEnd; i += 2) { - std::string tuple; - tuple.reserve(2); - tuple = *i; - tuple += *(i + 1); - packed += (char)map.find(tuple)->second; - } +void EnDecrypto::pack_2to1(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 2); - // If len isn't multiple of 2 (it's odd), add (char) 255 before each sym - if (strIn.length() & 1) { - packed += (char)255; - packed += *i; + for (; pos != tuple_limit; pos += 2) { + packed += static_cast(tuple_index(lookup, strIn.data() + pos, 2)); } + + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 3 symbols in 1 byte, when # = 4, 5, 6. 
Reduction ~2/3 - * @param packed Packed string - * @param strIn Input string - * @param map Hash table + * @param packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_3to1(std::string& packed, const std::string& strIn, - const htbl_t& map) { - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) { - std::string tuple; - tuple.reserve(3); - tuple = *i; - tuple += *(i + 1); - tuple += *(i + 2); - packed += (char)map.find(tuple)->second; - } - - // If len isn't multiple of 3, add (char) 255 before each sym - switch (strIn.length() % 3) { - case 1: - packed += (char)255; - packed += *i; - break; - - case 2: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - break; +void EnDecrypto::pack_3to1(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 3); - default: - break; + for (; pos != tuple_limit; pos += 3) { + packed += static_cast(tuple_index(lookup, strIn.data() + pos, 3)); } + + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 5 symbols in 1 byte, when # = 3. 
Reduction ~4/5 - * @param[out] packed Packed string - * @param[in] strIn Input string - * @param[in] map Hash table + * @param[out] packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_5to1(std::string& packed, const std::string& strIn, - const htbl_t& map) { - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 4; i < iEnd; i += 5) { - std::string tuple; - tuple.reserve(5); - tuple = *i; - tuple += *(i + 1); - tuple += *(i + 2); - tuple += *(i + 3); - tuple += *(i + 4); - packed += (char)map.find(tuple)->second; - } +void EnDecrypto::pack_5to1(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 5); - // If len isn't multiple of 5, add (char) 255 before each sym - switch (strIn.length() % 5) { - case 1: - packed += (char)255; - packed += *i; - break; - - case 2: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - break; - - case 3: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - break; - - case 4: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - packed += (char)255; - packed += *(i + 3); - break; - - default: - break; + for (; pos != tuple_limit; pos += 5) { + packed += static_cast(tuple_index(lookup, strIn.data() + pos, 5)); } + + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 7 symbols in 1 byte, when # = 2. 
Reduction ~6/7 - * @param[out] packed Packed string - * @param[in] strIn Input string - * @param[in] map Hash table + * @param[out] packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_7to1(std::string& packed, const std::string& strIn, - const htbl_t& map) { - auto i = strIn.begin(); - - for (auto iEnd = strIn.end() - 6; i < iEnd; i += 7) { - std::string tuple; - tuple.reserve(7); - tuple = *i; - tuple += *(i + 1); - tuple += *(i + 2); - tuple += *(i + 3); - tuple += *(i + 4); - tuple += *(i + 5); - tuple += *(i + 6); - packed += (char)map.find(tuple)->second; - } - - // If len isn't multiple of 7, add (char) 255 before each sym - switch (strIn.length() % 7) { - case 1: - packed += (char)255; - packed += *i; - break; - - case 2: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - break; - - case 3: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - break; - - case 4: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - packed += (char)255; - packed += *(i + 3); - break; +void EnDecrypto::pack_7to1(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? 
QSs : Hdrs); + size_t pos = 0; + const size_t tuple_limit = strIn.size() - (strIn.size() % 7); - case 5: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - packed += (char)255; - packed += *(i + 3); - packed += (char)255; - packed += *(i + 4); - break; - - case 6: - packed += (char)255; - packed += *i; - packed += (char)255; - packed += *(i + 1); - packed += (char)255; - packed += *(i + 2); - packed += (char)255; - packed += *(i + 3); - packed += (char)255; - packed += *(i + 4); - packed += (char)255; - packed += *(i + 5); - break; - - default: - break; + for (; pos != tuple_limit; pos += 7) { + packed += static_cast(tuple_index(lookup, strIn.data() + pos, 7)); } + + append_penalty_tail(packed, strIn, pos); } /** * @brief Encapsulate 1 symbol in 1 byte, when # = 1. - * @param[out] packed Packed string - * @param[in] strIn Input string - * @param[in] map Hash table + * @param[out] packed Packed string + * @param strIn Input string + * @param map Hash table */ -void EnDecrypto::pack_1to1(std::string& packed, const std::string& strIn, - const htbl_t& map) { - for (auto i = strIn.begin(), iEnd = strIn.end(); i < iEnd; ++i) { - std::string single; - single = *i; - packed += (char)map.find(single)->second; +void EnDecrypto::pack_1to1(std::string& packed, const std::string& strIn, const htbl_t& map) { + const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? 
QSs : Hdrs); + for (char c : strIn) { + packed += static_cast(checked_rank(lookup, c)); } } /** - * @brief Penalty symbol - * @param c Input char + * @brief Penalty symbol + * @param c Input char * @return Input char or (char)10='\\n' */ char EnDecrypto::penalty_sym(char c) const { @@ -671,13 +549,12 @@ char EnDecrypto::penalty_sym(char c) const { /** * @brief Unpack by reading 2 byte by 2 byte, when # > 39 - * @param[out] out Unpacked string - * @param[in] i Input string iterator - * @param[in] XChar Extra character for unpacking - * @param[in] unpack Table for unpacking + * @param[out] out Unpacked string + * @param i Input string iterator + * @param XChar Extra character for unpacking + * @param unpack Table for unpacking */ -void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i, - char XChar, +void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i, char XChar, const std::vector& unpack) { out.clear(); @@ -691,63 +568,42 @@ void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i, const auto rightB = (byte) * (i + 1); const u16 doubleB = leftB << 8 | rightB; // Join two bytes - const std::string tpl = unpack[doubleB]; + const std::string& tpl = unpack[doubleB]; - if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] != XChar) // ... - { + if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] != XChar) { // ... out += tpl; i += 2; - } - - else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] != XChar) // X.. - { + } else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] != XChar) { // X.. out += penalty_sym(*(i + 2)); out += tpl[1]; out += tpl[2]; i += 3; - } - - else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] != XChar) // .X. - { + } else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] != XChar) { // .X. out += tpl[0]; out += penalty_sym(*(i + 2)); out += tpl[2]; i += 3; - } - - else if (tpl[0] == XChar && tpl[1] == XChar && tpl[2] != XChar) // XX. 
- { + } else if (tpl[0] == XChar && tpl[1] == XChar && tpl[2] != XChar) { // XX. out += penalty_sym(*(i + 2)); out += penalty_sym(*(i + 3)); out += tpl[2]; i += 4; - } - - else if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] == XChar) // ..X - { + } else if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] == XChar) { // ..X out += tpl[0]; out += tpl[1]; out += penalty_sym(*(i + 2)); i += 3; - } - - else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] == XChar) // X.X - { + } else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] == XChar) { // X.X out += penalty_sym(*(i + 2)); out += tpl[1]; out += penalty_sym(*(i + 3)); i += 4; - } - - else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] == XChar) // .XX - { + } else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] == XChar) { // .XX out += tpl[0]; out += penalty_sym(*(i + 2)); out += penalty_sym(*(i + 3)); i += 4; - } - - else { + } else { out += penalty_sym(*(i + 2)); out += penalty_sym(*(i + 3)); // XXX out += penalty_sym(*(i + 4)); @@ -759,9 +615,9 @@ void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i, /** * @brief Unpack by reading 2 byte by 2 byte - * @param[out] out Unpacked string - * @param[in] i Input string iterator - * @param[in] unpack Table for unpacking + * @param[out] out Unpacked string + * @param i Input string iterator + * @param unpack Table for unpacking */ void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i, const std::vector& unpack) { @@ -769,9 +625,9 @@ void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i, for (; *i != (char)254; i += 2) { // Hdr len not multiple of keyLen - if (*i == (char)255) + if (*i == (char)255) { out += penalty_sym(*(i + 1)); - else { + } else { const auto leftB = (byte)*i; const auto rightB = (byte) * (i + 1); const u16 doubleB = leftB << 8 | rightB; // Join two bytes @@ -783,9 +639,9 @@ void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i, /** * @brief Unpack by reading 1 byte by 1 byte - * 
@param[out] out Unpacked string - * @param[in] i Input string iterator - * @param[in] unpack Table for unpacking + * @param[out] out Unpacked string + * @param i Input string iterator + * @param unpack Table for unpacking */ void EnDecrypto::unpack_1B(std::string& out, std::string::iterator& i, const std::vector& unpack) { @@ -793,75 +649,57 @@ void EnDecrypto::unpack_1B(std::string& out, std::string::iterator& i, for (; *i != (char)254; ++i) { // Hdr len not multiple of keyLen - if (*i == (char)255) + if (*i == (char)255) { out += penalty_sym(*(++i)); - else + } else { out += unpack[(byte)*i]; + } } } /** * @brief Unpack 1 byte to 3 DNA bases - * @param[out] out DNA bases - * @param[in] i Input string iterator + * @param[out] out DNA bases + * @param i Input string iterator */ void EnDecrypto::unpack_seq(std::string& out, std::string::iterator& i) { out.clear(); for (; *i != (char)254; ++i) { - if (*i == (char)255) // Seq len not multiple of 3 + if (*i == (char)255) { // Seq len not multiple of 3 out += penalty_sym(*(++i)); - else { - const std::string tpl = DNA_UNPACK[(byte)*i]; + } else { + const std::string& tpl = DNA_UNPACK[(byte)*i]; - if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] != 'X') // ... - { + if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] != 'X') { // ... out += tpl; } // Using just one 'out' makes trouble - else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] != 'X') // X.. - { + else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] != 'X') { // X.. out += penalty_sym(*(++i)); out += tpl[1]; out += tpl[2]; - } - - else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] != 'X') // .X. - { + } else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] != 'X') { // .X. out += tpl[0]; out += penalty_sym(*(++i)); out += tpl[2]; - } - - else if (tpl[0] == 'X' && tpl[1] == 'X' && tpl[2] != 'X') // XX. - { + } else if (tpl[0] == 'X' && tpl[1] == 'X' && tpl[2] != 'X') { // XX. 
out += penalty_sym(*(++i)); out += penalty_sym(*(++i)); out += tpl[2]; - } - - else if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] == 'X') // ..X - { + } else if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] == 'X') { // ..X out += tpl[0]; out += tpl[1]; out += penalty_sym(*(++i)); - } - - else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] == 'X') // X.X - { + } else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] == 'X') { // X.X out += penalty_sym(*(++i)); out += tpl[1]; out += penalty_sym(*(++i)); - } - - else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] == 'X') // .XX - { + } else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] == 'X') { // .XX out += tpl[0]; out += penalty_sym(*(++i)); out += penalty_sym(*(++i)); - } - - else { + } else { out += penalty_sym(*(++i)); out += penalty_sym(*(++i)); // XXX out += penalty_sym(*(++i)); @@ -874,25 +712,25 @@ void EnDecrypto::unpack_seq(std::string& out, std::string::iterator& i) { * @brief Shuffle a file (not FASTA/FASTQ) */ void EnDecrypto::shuffle_file() { - std::cerr << "\"" << file_name(in_file) - << "\" isn't FASTA/FASTQ. We just encrypt it.\n"; + std::cerr << "\"" << file_name(in_file) << "\" isn't FASTA/FASTQ. 
We just encrypt it.\n"; if (!stop_shuffle) { const auto start = now(); // Start timer std::vector arrThread(n_threads); // Distribute file among threads, for shuffling - for (byte t = 0; t != n_threads; ++t) + for (byte t = 0; t != n_threads; ++t) { arrThread[t] = std::thread(&EnDecrypto::shuffle_block, this, t); - for (auto& thr : arrThread) + } + for (auto& thr : arrThread) { if (thr.joinable()) thr.join(); + } // Join partially shuffled files join_shuffled_files(); const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Shuffling done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(finish - start); } else { std::ifstream inFile(in_file); std::ofstream pckdFile(PCKD_FNAME); @@ -910,18 +748,23 @@ void EnDecrypto::shuffle_file() { /** * @brief Shuffle a block of file - * @param threadID Thread ID + * @param threadID Thread ID */ void EnDecrypto::shuffle_block(byte threadID) { std::ifstream in(in_file); - std::ofstream shfile(SH_FNAME + std::to_string(threadID), std::ios_base::app); + std::ofstream shfile(std::format("{}{}", SH_FNAME, static_cast(threadID)), + std::ios_base::app); // Characters ignored at the beginning - in.ignore((std::streamsize)(threadID * BLOCK_SIZE)); + in.ignore((std::streamsize)(threadID * CHUNK_TARGET_SIZE)); for (char c; in.peek() != EOF;) { std::string context; - for (u64 bs = BLOCK_SIZE; bs--;) - if (in.get(c)) context += c; + context.reserve(CHUNK_TARGET_SIZE); + for (u64 bs = CHUNK_TARGET_SIZE; bs--;) { + if (in.get(c)) { + context += c; + } + } // Shuffle if (!stop_shuffle) { @@ -937,11 +780,11 @@ void EnDecrypto::shuffle_block(byte threadID) { } // Write header containing threadID for each partially shuffled file - shfile << THR_ID_HDR << std::to_string(threadID) << '\n'; + shfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); shfile << context << '\n'; // Ignore to go to the next related chunk - in.ignore((std::streamsize)((n_threads - 1) * 
BLOCK_SIZE)); + in.ignore((std::streamsize)((n_threads - 1) * CHUNK_TARGET_SIZE)); } shfile.close(); } @@ -961,10 +804,14 @@ void EnDecrypto::unshuffle_file() { std::vector arrThread(n_threads); // Distribute file among threads, for unshuffling - for (byte t = 0; t != n_threads; ++t) + for (byte t = 0; t != n_threads; ++t) { arrThread[t] = std::thread(&EnDecrypto::unshuffle_block, this, t); - for (auto& thr : arrThread) - if (thr.joinable()) thr.join(); + } + for (auto& thr : arrThread) { + if (thr.joinable()) { + thr.join(); + } + } // Delete decrypted file std::remove(DEC_FNAME.c_str()); @@ -973,8 +820,7 @@ void EnDecrypto::unshuffle_file() { join_unshuffled_files(); const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Unshuffling done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(finish - start); } else if (c == (char)129) { std::cout << in.rdbuf(); @@ -988,20 +834,24 @@ void EnDecrypto::unshuffle_file() { /** * @brief Unshuffle a block of file - * @param threadID Thread ID + * @param threadID Thread ID */ void EnDecrypto::unshuffle_block(byte threadID) { std::ifstream in(DEC_FNAME); - std::ofstream ushfile(USH_FNAME + std::to_string(threadID), + std::ofstream ushfile(std::format("{}{}", USH_FNAME, static_cast(threadID)), std::ios_base::app); // filetype char (125) + shuffed (128) + characters ignored at the beginning - in.ignore((std::streamsize)(2 + threadID * BLOCK_SIZE)); + in.ignore((std::streamsize)(2 + threadID * CHUNK_TARGET_SIZE)); for (char c; in.peek() != EOF;) { std::string unshText; - for (u64 bs = BLOCK_SIZE; bs--;) - if (in.get(c)) unshText += c; + unshText.reserve(CHUNK_TARGET_SIZE); + for (u64 bs = CHUNK_TARGET_SIZE; bs--;) { + if (in.get(c)) { + unshText += c; + } + } auto i = unshText.begin(); @@ -1019,11 +869,11 @@ void EnDecrypto::unshuffle_block(byte threadID) { } // Write header containing threadID for each partially unshuffled file - ushfile << THR_ID_HDR + 
std::to_string(threadID) << '\n'; + ushfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); ushfile << unshText << '\n'; // Ignore to go to the next related chunk - in.ignore((std::streamsize)((n_threads - 1) * BLOCK_SIZE)); + in.ignore((std::streamsize)((n_threads - 1) * CHUNK_TARGET_SIZE)); } ushfile.close(); @@ -1032,19 +882,18 @@ void EnDecrypto::unshuffle_block(byte threadID) { /** * @brief Join partially packed files - * @param headers Headers - * @param qscores Quality scores - * @param fT File type - * @param justPlus If the third line of FASTQ contains only the '+' char + * @param headers Headers + * @param qscores Quality scores + * @param fT File type + * @param justPlus If the third line of FASTQ contains only the '+' char */ -void EnDecrypto::join_packed_files(const std::string& headers, - const std::string& qscores, char fT, +void EnDecrypto::join_packed_files(const std::string& headers, const std::string& qscores, char fT, bool justPlus) const { byte t; // For threads std::vector pkFile(n_threads); std::ofstream pckdFile(PCKD_FNAME); // Packed file std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { pckdFile << content; }; switch (fT) { @@ -1066,7 +915,9 @@ void EnDecrypto::join_packed_files(const std::string& headers, } // Input files - for (t = n_threads; t--;) pkFile[t].open(PK_FNAME + std::to_string(t)); + for (t = n_threads; t--;) { + pkFile[t].open(std::format("{}{}", PK_FNAME, static_cast(t))); + } std::string line; bool prevLineNotThrID; // If previous line was "THR=" or not @@ -1075,14 +926,16 @@ void EnDecrypto::join_packed_files(const std::string& headers, prevLineNotThrID = false; while (std::getline(pkFile[t], line).good() && - line != THR_ID_HDR + std::to_string(t)) { - if (prevLineNotThrID) content += '\n'; + line != std::format("{}{}", THR_ID_HDR, static_cast(t))) { + if (prevLineNotThrID) { + content += '\n'; + } content += line; - if (content.size() >= 
BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } prevLineNotThrID = true; @@ -1096,8 +949,7 @@ void EnDecrypto::join_packed_files(const std::string& headers, pckdFile.close(); for (t = n_threads; t--;) { pkFile[t].close(); - std::string pkFileName = PK_FNAME; - pkFileName += std::to_string(t); + std::string pkFileName = std::format("{}{}", PK_FNAME, static_cast(t)); std::remove(pkFileName.c_str()); } } @@ -1108,9 +960,11 @@ void EnDecrypto::join_packed_files(const std::string& headers, void EnDecrypto::join_unpacked_files() const { byte t; // For threads std::vector upkdFile(n_threads); - for (t = n_threads; t--;) upkdFile[t].open(UPK_FNAME + std::to_string(t)); + for (t = n_threads; t--;) { + upkdFile[t].open(std::format("{}{}", UPK_FNAME, static_cast(t))); + } std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { std::cout << content; }; bool prevLineNotThrID; // If previous line was "THRD=" or not @@ -1119,20 +973,24 @@ void EnDecrypto::join_unpacked_files() const { prevLineNotThrID = false; for (std::string line; std::getline(upkdFile[t], line).good() && - line != THR_ID_HDR + std::to_string(t);) { - if (prevLineNotThrID) content += '\n'; + line != std::format("{}{}", THR_ID_HDR, static_cast(t));) { + if (prevLineNotThrID) { + content += '\n'; + } content += line; - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } prevLineNotThrID = true; } - if (prevLineNotThrID) content += '\n'; + if (prevLineNotThrID) { + content += '\n'; + } } } write_content(); @@ -1140,8 +998,7 @@ void EnDecrypto::join_unpacked_files() const { // Close/delete input/output files for (t = n_threads; t--;) { upkdFile[t].close(); - std::string upkdFileName = UPK_FNAME; - upkdFileName += 
std::to_string(t); + std::string upkdFileName = std::format("{}{}", UPK_FNAME, static_cast(t)); std::remove(upkdFileName.c_str()); } } @@ -1153,28 +1010,32 @@ void EnDecrypto::join_shuffled_files() const { std::vector shFile(n_threads); std::ofstream shdFile(PCKD_FNAME); // Output Shuffled file std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { shdFile << content; }; content += (char)125; content += (!stop_shuffle ? (char)128 : (char)129); // Input files - for (byte t = n_threads; t--;) shFile[t].open(SH_FNAME + std::to_string(t)); + for (byte t = n_threads; t--;) { + shFile[t].open(std::format("{}{}", SH_FNAME, static_cast(t))); + } while (!shFile[0].eof()) { for (byte t = 0; t != n_threads; ++t) { bool prevLineNotThrID = false; // If previous line was "THR=" or not for (std::string line; std::getline(shFile[t], line).good() && - line != THR_ID_HDR + std::to_string(t);) { - if (prevLineNotThrID) content += '\n'; + line != std::format("{}{}", THR_ID_HDR, static_cast(t));) { + if (prevLineNotThrID) { + content += '\n'; + } content += line; - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } prevLineNotThrID = true; @@ -1187,8 +1048,7 @@ void EnDecrypto::join_shuffled_files() const { shdFile.close(); for (byte t = n_threads; t--;) { shFile[t].close(); - std::string shFileName = SH_FNAME; - shFileName += std::to_string(t); + std::string shFileName = std::format("{}{}", SH_FNAME, static_cast(t)); std::remove(shFileName.c_str()); } } @@ -1199,9 +1059,11 @@ void EnDecrypto::join_shuffled_files() const { void EnDecrypto::join_unshuffled_files() const { byte t; // For threads std::vector ushdFile(n_threads); - for (t = n_threads; t--;) ushdFile[t].open(USH_FNAME + std::to_string(t)); + for (t = n_threads; t--;) { + ushdFile[t].open(std::format("{}{}", USH_FNAME, static_cast(t))); + 
} std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { std::cout << content; }; while (!ushdFile[0].eof()) { @@ -1209,14 +1071,16 @@ void EnDecrypto::join_unshuffled_files() const { bool prevLineNotThrID = false; // If previous line was "THR=" or not for (std::string line; std::getline(ushdFile[t], line).good() && - line != THR_ID_HDR + std::to_string(t);) { - if (prevLineNotThrID) content += '\n'; + line != std::format("{}{}", THR_ID_HDR, static_cast(t));) { + if (prevLineNotThrID) { + content += '\n'; + } content += line; - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } prevLineNotThrID = true; @@ -1228,8 +1092,7 @@ void EnDecrypto::join_unshuffled_files() const { // Close/delete input/output files for (t = n_threads; t--;) { ushdFile[t].close(); - std::string ushdFileName = USH_FNAME; - ushdFileName += std::to_string(t); + std::string ushdFileName = std::format("{}{}", USH_FNAME, static_cast(t)); std::remove(ushdFileName.c_str()); } -} \ No newline at end of file +} diff --git a/src/endecrypto.hpp b/src/endecrypto.hpp index c345abf..4366155 100644 --- a/src/endecrypto.hpp +++ b/src/endecrypto.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file endecrypto.hpp - * @brief Encryption/Decryption - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file endecrypto.hpp + * @brief Encryption/Decryption */ #ifndef CRYFA_ENDECRYPTO_H @@ -18,8 +18,7 @@ namespace cryfa { class EnDecrypto; // Type define -typedef void (EnDecrypto::*packFP_t)(std::string&, const std::string&, - const htbl_t&); +typedef void (EnDecrypto::*packFP_t)(std::string&, const std::string&, const htbl_t&); typedef void 
(EnDecrypto::*unpackFP_t)(std::string&, std::string::iterator&, const std::vector&); @@ -38,10 +37,8 @@ class EnDecrypto : public Security { void pack_5to1(std::string&, const std::string&, const htbl_t&); void pack_7to1(std::string&, const std::string&, const htbl_t&); void pack_1to1(std::string&, const std::string&, const htbl_t&); - void unpack_2B(std::string&, std::string::iterator&, - const std::vector&); - void unpack_1B(std::string&, std::string::iterator&, - const std::vector&); + void unpack_2B(std::string&, std::string::iterator&, const std::vector&); + void unpack_1B(std::string&, std::string::iterator&, const std::vector&); void shuffle_file(); void unshuffle_file(); @@ -57,21 +54,16 @@ class EnDecrypto : public Security { void build_hash_tbl(htbl_t&, const std::string&, short); void build_unpack_tbl(std::vector&, const std::string&, u16); - auto dna_pack_idx(const std::string&) -> byte; - auto large_pack_idx(const std::string&, const htbl_t&) -> u16; void pack_seq(std::string&, const std::string&); void unpack_seq(std::string&, std::string::iterator&); - void unpack_large(std::string&, std::string::iterator&, char, - const std::vector&); - void join_packed_files(const std::string&, const std::string&, char, - bool) const; + void unpack_large(std::string&, std::string::iterator&, char, const std::vector&); + void join_packed_files(const std::string&, const std::string&, char, bool) const; void join_unpacked_files() const; void join_shuffled_files() const; void join_unshuffled_files() const; private: - void pack_large(std::string&, const std::string&, const std::string&, - const htbl_t&); + void pack_large(std::string&, const std::string&, const std::string&, const htbl_t&); auto penalty_sym(char) const -> char; void shuffle_block(byte); void unshuffle_block(byte); @@ -82,76 +74,65 @@ class EnDecrypto : public Security { * @hideinitializer */ static const htbl_t DNA_MAP{ - {"AAA", 0}, {"AAC", 1}, {"AAG", 2}, {"AAT", 3}, {"AAN", 4}, - {"AAX", 5}, {"ACA", 
6}, {"ACC", 7}, {"ACG", 8}, {"ACT", 9}, - {"ACN", 10}, {"ACX", 11}, {"AGA", 12}, {"AGC", 13}, {"AGG", 14}, - {"AGT", 15}, {"AGN", 16}, {"AGX", 17}, {"ATA", 18}, {"ATC", 19}, - {"ATG", 20}, {"ATT", 21}, {"ATN", 22}, {"ATX", 23}, {"ANA", 24}, - {"ANC", 25}, {"ANG", 26}, {"ANT", 27}, {"ANN", 28}, {"ANX", 29}, - {"AXA", 30}, {"AXC", 31}, {"AXG", 32}, {"AXT", 33}, {"AXN", 34}, - {"AXX", 35}, {"CAA", 36}, {"CAC", 37}, {"CAG", 38}, {"CAT", 39}, - {"CAN", 40}, {"CAX", 41}, {"CCA", 42}, {"CCC", 43}, {"CCG", 44}, - {"CCT", 45}, {"CCN", 46}, {"CCX", 47}, {"CGA", 48}, {"CGC", 49}, - {"CGG", 50}, {"CGT", 51}, {"CGN", 52}, {"CGX", 53}, {"CTA", 54}, - {"CTC", 55}, {"CTG", 56}, {"CTT", 57}, {"CTN", 58}, {"CTX", 59}, - {"CNA", 60}, {"CNC", 61}, {"CNG", 62}, {"CNT", 63}, {"CNN", 64}, - {"CNX", 65}, {"CXA", 66}, {"CXC", 67}, {"CXG", 68}, {"CXT", 69}, - {"CXN", 70}, {"CXX", 71}, {"GAA", 72}, {"GAC", 73}, {"GAG", 74}, - {"GAT", 75}, {"GAN", 76}, {"GAX", 77}, {"GCA", 78}, {"GCC", 79}, - {"GCG", 80}, {"GCT", 81}, {"GCN", 82}, {"GCX", 83}, {"GGA", 84}, - {"GGC", 85}, {"GGG", 86}, {"GGT", 87}, {"GGN", 88}, {"GGX", 89}, - {"GTA", 90}, {"GTC", 91}, {"GTG", 92}, {"GTT", 93}, {"GTN", 94}, - {"GTX", 95}, {"GNA", 96}, {"GNC", 97}, {"GNG", 98}, {"GNT", 99}, - {"GNN", 100}, {"GNX", 101}, {"GXA", 102}, {"GXC", 103}, {"GXG", 104}, - {"GXT", 105}, {"GXN", 106}, {"GXX", 107}, {"TAA", 108}, {"TAC", 109}, - {"TAG", 110}, {"TAT", 111}, {"TAN", 112}, {"TAX", 113}, {"TCA", 114}, - {"TCC", 115}, {"TCG", 116}, {"TCT", 117}, {"TCN", 118}, {"TCX", 119}, - {"TGA", 120}, {"TGC", 121}, {"TGG", 122}, {"TGT", 123}, {"TGN", 124}, - {"TGX", 125}, {"TTA", 126}, {"TTC", 127}, {"TTG", 128}, {"TTT", 129}, - {"TTN", 130}, {"TTX", 131}, {"TNA", 132}, {"TNC", 133}, {"TNG", 134}, - {"TNT", 135}, {"TNN", 136}, {"TNX", 137}, {"TXA", 138}, {"TXC", 139}, - {"TXG", 140}, {"TXT", 141}, {"TXN", 142}, {"TXX", 143}, {"NAA", 144}, - {"NAC", 145}, {"NAG", 146}, {"NAT", 147}, {"NAN", 148}, {"NAX", 149}, - {"NCA", 150}, {"NCC", 151}, 
{"NCG", 152}, {"NCT", 153}, {"NCN", 154}, - {"NCX", 155}, {"NGA", 156}, {"NGC", 157}, {"NGG", 158}, {"NGT", 159}, - {"NGN", 160}, {"NGX", 161}, {"NTA", 162}, {"NTC", 163}, {"NTG", 164}, - {"NTT", 165}, {"NTN", 166}, {"NTX", 167}, {"NNA", 168}, {"NNC", 169}, - {"NNG", 170}, {"NNT", 171}, {"NNN", 172}, {"NNX", 173}, {"NXA", 174}, - {"NXC", 175}, {"NXG", 176}, {"NXT", 177}, {"NXN", 178}, {"NXX", 179}, - {"XAA", 180}, {"XAC", 181}, {"XAG", 182}, {"XAT", 183}, {"XAN", 184}, - {"XAX", 185}, {"XCA", 186}, {"XCC", 187}, {"XCG", 188}, {"XCT", 189}, - {"XCN", 190}, {"XCX", 191}, {"XGA", 192}, {"XGC", 193}, {"XGG", 194}, - {"XGT", 195}, {"XGN", 196}, {"XGX", 197}, {"XTA", 198}, {"XTC", 199}, - {"XTG", 200}, {"XTT", 201}, {"XTN", 202}, {"XTX", 203}, {"XNA", 204}, - {"XNC", 205}, {"XNG", 206}, {"XNT", 207}, {"XNN", 208}, {"XNX", 209}, - {"XXA", 210}, {"XXC", 211}, {"XXG", 212}, {"XXT", 213}, {"XXN", 214}, - {"XXX", 215}}; + {"AAA", 0}, {"AAC", 1}, {"AAG", 2}, {"AAT", 3}, {"AAN", 4}, {"AAX", 5}, + {"ACA", 6}, {"ACC", 7}, {"ACG", 8}, {"ACT", 9}, {"ACN", 10}, {"ACX", 11}, + {"AGA", 12}, {"AGC", 13}, {"AGG", 14}, {"AGT", 15}, {"AGN", 16}, {"AGX", 17}, + {"ATA", 18}, {"ATC", 19}, {"ATG", 20}, {"ATT", 21}, {"ATN", 22}, {"ATX", 23}, + {"ANA", 24}, {"ANC", 25}, {"ANG", 26}, {"ANT", 27}, {"ANN", 28}, {"ANX", 29}, + {"AXA", 30}, {"AXC", 31}, {"AXG", 32}, {"AXT", 33}, {"AXN", 34}, {"AXX", 35}, + {"CAA", 36}, {"CAC", 37}, {"CAG", 38}, {"CAT", 39}, {"CAN", 40}, {"CAX", 41}, + {"CCA", 42}, {"CCC", 43}, {"CCG", 44}, {"CCT", 45}, {"CCN", 46}, {"CCX", 47}, + {"CGA", 48}, {"CGC", 49}, {"CGG", 50}, {"CGT", 51}, {"CGN", 52}, {"CGX", 53}, + {"CTA", 54}, {"CTC", 55}, {"CTG", 56}, {"CTT", 57}, {"CTN", 58}, {"CTX", 59}, + {"CNA", 60}, {"CNC", 61}, {"CNG", 62}, {"CNT", 63}, {"CNN", 64}, {"CNX", 65}, + {"CXA", 66}, {"CXC", 67}, {"CXG", 68}, {"CXT", 69}, {"CXN", 70}, {"CXX", 71}, + {"GAA", 72}, {"GAC", 73}, {"GAG", 74}, {"GAT", 75}, {"GAN", 76}, {"GAX", 77}, + {"GCA", 78}, {"GCC", 79}, {"GCG", 80}, 
{"GCT", 81}, {"GCN", 82}, {"GCX", 83}, + {"GGA", 84}, {"GGC", 85}, {"GGG", 86}, {"GGT", 87}, {"GGN", 88}, {"GGX", 89}, + {"GTA", 90}, {"GTC", 91}, {"GTG", 92}, {"GTT", 93}, {"GTN", 94}, {"GTX", 95}, + {"GNA", 96}, {"GNC", 97}, {"GNG", 98}, {"GNT", 99}, {"GNN", 100}, {"GNX", 101}, + {"GXA", 102}, {"GXC", 103}, {"GXG", 104}, {"GXT", 105}, {"GXN", 106}, {"GXX", 107}, + {"TAA", 108}, {"TAC", 109}, {"TAG", 110}, {"TAT", 111}, {"TAN", 112}, {"TAX", 113}, + {"TCA", 114}, {"TCC", 115}, {"TCG", 116}, {"TCT", 117}, {"TCN", 118}, {"TCX", 119}, + {"TGA", 120}, {"TGC", 121}, {"TGG", 122}, {"TGT", 123}, {"TGN", 124}, {"TGX", 125}, + {"TTA", 126}, {"TTC", 127}, {"TTG", 128}, {"TTT", 129}, {"TTN", 130}, {"TTX", 131}, + {"TNA", 132}, {"TNC", 133}, {"TNG", 134}, {"TNT", 135}, {"TNN", 136}, {"TNX", 137}, + {"TXA", 138}, {"TXC", 139}, {"TXG", 140}, {"TXT", 141}, {"TXN", 142}, {"TXX", 143}, + {"NAA", 144}, {"NAC", 145}, {"NAG", 146}, {"NAT", 147}, {"NAN", 148}, {"NAX", 149}, + {"NCA", 150}, {"NCC", 151}, {"NCG", 152}, {"NCT", 153}, {"NCN", 154}, {"NCX", 155}, + {"NGA", 156}, {"NGC", 157}, {"NGG", 158}, {"NGT", 159}, {"NGN", 160}, {"NGX", 161}, + {"NTA", 162}, {"NTC", 163}, {"NTG", 164}, {"NTT", 165}, {"NTN", 166}, {"NTX", 167}, + {"NNA", 168}, {"NNC", 169}, {"NNG", 170}, {"NNT", 171}, {"NNN", 172}, {"NNX", 173}, + {"NXA", 174}, {"NXC", 175}, {"NXG", 176}, {"NXT", 177}, {"NXN", 178}, {"NXX", 179}, + {"XAA", 180}, {"XAC", 181}, {"XAG", 182}, {"XAT", 183}, {"XAN", 184}, {"XAX", 185}, + {"XCA", 186}, {"XCC", 187}, {"XCG", 188}, {"XCT", 189}, {"XCN", 190}, {"XCX", 191}, + {"XGA", 192}, {"XGC", 193}, {"XGG", 194}, {"XGT", 195}, {"XGN", 196}, {"XGX", 197}, + {"XTA", 198}, {"XTC", 199}, {"XTG", 200}, {"XTT", 201}, {"XTN", 202}, {"XTX", 203}, + {"XNA", 204}, {"XNC", 205}, {"XNG", 206}, {"XNT", 207}, {"XNN", 208}, {"XNX", 209}, + {"XXA", 210}, {"XXC", 211}, {"XXG", 212}, {"XXT", 213}, {"XXN", 214}, {"XXX", 215}}; /** * @brief Lookup table for unpacking -- 216 elements * @hideinitializer */ 
static const std::vector DNA_UNPACK{ - "AAA", "AAC", "AAG", "AAT", "AAN", "AAX", "ACA", "ACC", "ACG", "ACT", "ACN", - "ACX", "AGA", "AGC", "AGG", "AGT", "AGN", "AGX", "ATA", "ATC", "ATG", "ATT", - "ATN", "ATX", "ANA", "ANC", "ANG", "ANT", "ANN", "ANX", "AXA", "AXC", "AXG", - "AXT", "AXN", "AXX", "CAA", "CAC", "CAG", "CAT", "CAN", "CAX", "CCA", "CCC", - "CCG", "CCT", "CCN", "CCX", "CGA", "CGC", "CGG", "CGT", "CGN", "CGX", "CTA", - "CTC", "CTG", "CTT", "CTN", "CTX", "CNA", "CNC", "CNG", "CNT", "CNN", "CNX", - "CXA", "CXC", "CXG", "CXT", "CXN", "CXX", "GAA", "GAC", "GAG", "GAT", "GAN", - "GAX", "GCA", "GCC", "GCG", "GCT", "GCN", "GCX", "GGA", "GGC", "GGG", "GGT", - "GGN", "GGX", "GTA", "GTC", "GTG", "GTT", "GTN", "GTX", "GNA", "GNC", "GNG", - "GNT", "GNN", "GNX", "GXA", "GXC", "GXG", "GXT", "GXN", "GXX", "TAA", "TAC", - "TAG", "TAT", "TAN", "TAX", "TCA", "TCC", "TCG", "TCT", "TCN", "TCX", "TGA", - "TGC", "TGG", "TGT", "TGN", "TGX", "TTA", "TTC", "TTG", "TTT", "TTN", "TTX", - "TNA", "TNC", "TNG", "TNT", "TNN", "TNX", "TXA", "TXC", "TXG", "TXT", "TXN", - "TXX", "NAA", "NAC", "NAG", "NAT", "NAN", "NAX", "NCA", "NCC", "NCG", "NCT", - "NCN", "NCX", "NGA", "NGC", "NGG", "NGT", "NGN", "NGX", "NTA", "NTC", "NTG", - "NTT", "NTN", "NTX", "NNA", "NNC", "NNG", "NNT", "NNN", "NNX", "NXA", "NXC", - "NXG", "NXT", "NXN", "NXX", "XAA", "XAC", "XAG", "XAT", "XAN", "XAX", "XCA", - "XCC", "XCG", "XCT", "XCN", "XCX", "XGA", "XGC", "XGG", "XGT", "XGN", "XGX", - "XTA", "XTC", "XTG", "XTT", "XTN", "XTX", "XNA", "XNC", "XNG", "XNT", "XNN", - "XNX", "XXA", "XXC", "XXG", "XXT", "XXN", "XXX"}; + "AAA", "AAC", "AAG", "AAT", "AAN", "AAX", "ACA", "ACC", "ACG", "ACT", "ACN", "ACX", "AGA", + "AGC", "AGG", "AGT", "AGN", "AGX", "ATA", "ATC", "ATG", "ATT", "ATN", "ATX", "ANA", "ANC", + "ANG", "ANT", "ANN", "ANX", "AXA", "AXC", "AXG", "AXT", "AXN", "AXX", "CAA", "CAC", "CAG", + "CAT", "CAN", "CAX", "CCA", "CCC", "CCG", "CCT", "CCN", "CCX", "CGA", "CGC", "CGG", "CGT", + "CGN", "CGX", "CTA", "CTC", "CTG", 
"CTT", "CTN", "CTX", "CNA", "CNC", "CNG", "CNT", "CNN", + "CNX", "CXA", "CXC", "CXG", "CXT", "CXN", "CXX", "GAA", "GAC", "GAG", "GAT", "GAN", "GAX", + "GCA", "GCC", "GCG", "GCT", "GCN", "GCX", "GGA", "GGC", "GGG", "GGT", "GGN", "GGX", "GTA", + "GTC", "GTG", "GTT", "GTN", "GTX", "GNA", "GNC", "GNG", "GNT", "GNN", "GNX", "GXA", "GXC", + "GXG", "GXT", "GXN", "GXX", "TAA", "TAC", "TAG", "TAT", "TAN", "TAX", "TCA", "TCC", "TCG", + "TCT", "TCN", "TCX", "TGA", "TGC", "TGG", "TGT", "TGN", "TGX", "TTA", "TTC", "TTG", "TTT", + "TTN", "TTX", "TNA", "TNC", "TNG", "TNT", "TNN", "TNX", "TXA", "TXC", "TXG", "TXT", "TXN", + "TXX", "NAA", "NAC", "NAG", "NAT", "NAN", "NAX", "NCA", "NCC", "NCG", "NCT", "NCN", "NCX", + "NGA", "NGC", "NGG", "NGT", "NGN", "NGX", "NTA", "NTC", "NTG", "NTT", "NTN", "NTX", "NNA", + "NNC", "NNG", "NNT", "NNN", "NNX", "NXA", "NXC", "NXG", "NXT", "NXN", "NXX", "XAA", "XAC", + "XAG", "XAT", "XAN", "XAX", "XCA", "XCC", "XCG", "XCT", "XCN", "XCX", "XGA", "XGC", "XGG", + "XGT", "XGN", "XGX", "XTA", "XTC", "XTG", "XTT", "XTN", "XTX", "XNA", "XNC", "XNG", "XNT", + "XNN", "XNX", "XXA", "XXC", "XXG", "XXT", "XXN", "XXX"}; } // namespace cryfa -#endif // CRYFA_ENDECRYPTO_H \ No newline at end of file +#endif // CRYFA_ENDECRYPTO_H diff --git a/src/fasta.cpp b/src/fasta.cpp index b7eaf88..e4e0d98 100644 --- a/src/fasta.cpp +++ b/src/fasta.cpp @@ -1,75 +1,180 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file fasta.cpp - * @brief Compression/Decompression of FASTA - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file fasta.cpp + * @brief Compression/Decompression of FASTA */ #include "fasta.hpp" #include +#include +#include #include #include // setw, std::setprecision #include +#include +#include #include +#include +#include "ordered_pipeline.hpp" +#include "plaintext_stream.hpp" #include "string.hpp" #include 
"time.hpp" using namespace cryfa; -std::mutex mutxFA; /**< @brief Mutex */ +std::mutex mutxFA; + +namespace { +struct FastaRecord { + std::string header; + std::vector sequence_lines; +}; + +struct FastaChunk { + std::vector records; +}; +} // namespace /** * @brief Compress */ void Fasta::compress() { - if (!verbose) std::cerr << bold("[+]") << " Compacting ..."; + if (!verbose) { + std::cerr << bold("[+]") << " Compacting ..."; + } const auto start = now(); // Start timer - std::vector arrThr(n_threads); std::string headers; packfa_s pkStruct; // Collection of inputs to pass to pack... - if (verbose) + if (verbose) { std::cerr << bold("[+]") << " Calculating no. unique characters ..."; + } // Gather different chars in all headers and max length in all bases gather_h_bs(headers); // Show number of different chars in headers -- ignore '>'=62 - if (verbose) - std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " - << headers.length() << " \n"; + if (verbose) { + std::cerr << "\r" << bold("[+]") << " No. 
unique characters: headers => " << headers.length() + << " \n"; + } // Set Hash table and pack function set_hashTbl_packFn(pkStruct, headers); - // Distribute file among threads, for reading and packing - for (byte t = 0; t != n_threads; ++t) - arrThr[t] = std::thread(&Fasta::pack, this, pkStruct, t); - for (auto& thr : arrThr) - if (thr.joinable()) thr.join(); + auto read_chunk = [this, in = std::ifstream(in_file), + pending_header = std::string{}]() mutable -> std::optional { + FastaChunk chunk; + std::string line; + u64 chunk_bytes = 0; + + if (pending_header.empty()) { + while (std::getline(in, line)) { + if (!line.empty() && line.front() == '>') { + pending_header = std::move(line); + break; + } + } + } - if (verbose) { - std::cerr << "\r" << bold("[+]") << " Shuffling done in " - << hms(now() - shuffle_timer); - std::cerr << bold("[+]") << " Compacting ..."; - } + if (pending_header.empty()) { + return std::nullopt; + } - // Join partially packed and/or shuffled files - join_packed_files(headers, "", 'A', false); + while (!pending_header.empty()) { + FastaRecord record; + record.header = std::move(pending_header); + pending_header.clear(); + chunk_bytes += record.header.size() + 1; - const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Compacting done in " - << hms(finish - start); + while (std::getline(in, line)) { + if (!line.empty() && line.front() == '>') { + pending_header = std::move(line); + break; + } + + chunk_bytes += line.size() + 1; + record.sequence_lines.push_back(std::move(line)); + } + + chunk.records.push_back(std::move(record)); + if (chunk_bytes >= CHUNK_TARGET_SIZE) { + break; + } + } + + return chunk; + }; + + auto pack_chunk = [this, pkStruct](FastaChunk chunk) { + packFP_t packHdr = pkStruct.packHdrFP; + std::string context; + context.reserve(CHUNK_TARGET_SIZE); + std::string seq; + seq.reserve(CHUNK_TARGET_SIZE); + + for (const FastaRecord& record : chunk.records) { + context += (char)253; + 
(this->*packHdr)(context, record.header.substr(1), HdrMap); + context += (char)254; + + seq.clear(); + for (const std::string& line : record.sequence_lines) { + seq += line; + seq += (char)252; + } + if (!seq.empty()) { + seq.pop_back(); + pack_seq(context, seq); + context += (char)254; + } + } + + if (!stop_shuffle) { + mutxFA.lock(); //---------------------------------------------------- + if (verbose && shuffInProg) { + std::cerr << bold("[+]") << " Shuffling ..."; + shuffle_timer = now(); + } + shuffInProg = false; + mutxFA.unlock(); //-------------------------------------------------- + + shuffle(context); + } - // Cout encrypted content - encrypt(); + std::string packed = std::format("{}{}{}", (char)253, context.size(), (char)254); + packed += context; + return packed; + }; + + encrypt_stream([&](const PlaintextSink& emit) { + std::string header; + header.reserve(headers.size() + 3); + header += (char)127; + header += (!stop_shuffle ? (char)128 : (char)129); + header += headers; + header += (char)254; + emit(header); + + run_ordered_pipeline(n_threads, read_chunk, pack_chunk, emit); + emit(std::string(1, (char)252)); + + if (verbose && !stop_shuffle) { + std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(now() - shuffle_timer); + std::cerr << bold("[+]") << " Compacting ..."; + } + + const auto finish = now(); // Stop timer + std::cerr << "\r" << bold("[+]") << " Compacting done in " << hms(finish - start); + }); } /** * @brief Set hash table and pack function - * @param[out] pkStruct Pack structure - * @param[in] headers Headers + * @param[out] pkStruct Pack structure + * @param headers Headers */ void Fasta::set_hashTbl_packFn(packfa_s& pkStruct, const std::string& headers) { const size_t headersLen = headers.length(); @@ -110,16 +215,21 @@ void Fasta::set_hashTbl_packFn(packfa_s& pkStruct, const std::string& headers) { /** * @brief Pack. 
'>' at the beginning of headers is not packed - * @param pkStruct Pack structure - * @param threadID Thread ID + * @param pkStruct Pack structure + * @param threadID Thread ID */ void Fasta::pack(const packfa_s& pkStruct, byte threadID) { packFP_t packHdr = pkStruct.packHdrFP; // Function pointer std::ifstream in(in_file); std::string line, context, seq; - std::ofstream pkfile(PK_FNAME + std::to_string(threadID), std::ios_base::app); + context.reserve(CHUNK_TARGET_SIZE); + seq.reserve(CHUNK_TARGET_SIZE); + std::ofstream pkfile(std::format("{}{}", PK_FNAME, static_cast(threadID)), + std::ios_base::app); // Lines ignored at the beginning - for (u64 l = (u64)threadID * BlockLine; l--;) IGNORE_THIS_LINE(in); + for (u64 l = (u64)threadID * BlockLine; l--;) { + IGNORE_THIS_LINE(in); + } while (in.peek() != EOF) { context.clear(); @@ -176,18 +286,17 @@ void Fasta::pack(const packfa_s& pkStruct, byte threadID) { } // For unshuffling: insert the size of packed context in the beginning - std::string contextSize; - contextSize += (char)253; - contextSize += std::to_string(context.size()); - contextSize += (char)254; + std::string contextSize = std::format("{}{}{}", (char)253, context.size(), (char)254); context.insert(0, contextSize); // Write header containing threadID for each partially packed file - pkfile << THR_ID_HDR << std::to_string(threadID) << '\n'; + pkfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); pkfile << context << '\n'; // Ignore to go to the next related chunk - for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) IGNORE_THIS_LINE(in); + for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) { + IGNORE_THIS_LINE(in); + } } pkfile.close(); @@ -195,9 +304,8 @@ void Fasta::pack(const packfa_s& pkStruct, byte threadID) { } /** - * @brief Gather chars of all headers & max length of DNA bases lines, - * excluding '>' - * @param[out] headers Chars of all headers + * @brief Gather chars of all headers & max length of DNA bases lines, excluding '>' + 
* @param[out] headers Chars of all headers */ void Fasta::gather_h_bs(std::string& headers) { u32 maxBLen = 0; // Max length of each line of bases @@ -207,132 +315,211 @@ void Fasta::gather_h_bs(std::string& headers) { std::ifstream in(in_file); std::string line; while (getline(in, line).good()) { - if (line[0] == '>') - for (char c : line) hChars[c] = true; - else if (line.size() > maxBLen) + if (line[0] == '>') { + for (char c : line) { + hChars[c] = true; + } + } else if (line.size() > maxBLen) { maxBLen = (u32)line.size(); + } } in.close(); // Number of lines read from input file while compression - BlockLine = (u32)(BLOCK_SIZE / maxBLen); - if (!BlockLine) BlockLine = 2; + BlockLine = (u32)(CHUNK_TARGET_SIZE / maxBLen); + if (!BlockLine) { + BlockLine = 2; + } // Gather the characters -- Ignore '>'=62 for headers - for (byte i = 32; i != 62; ++i) - if (*(hChars + i)) headers += i; - for (byte i = 63; i != 127; ++i) - if (*(hChars + i)) headers += i; + for (byte i = 32; i != 62; ++i) { + if (*(hChars + i)) { + headers += i; + } + } + for (byte i = 63; i != 127; ++i) { + if (*(hChars + i)) { + headers += i; + } + } } /** * @brief Decompress */ void Fasta::decompress() { - if (!verbose) std::cerr << bold("[+]") << " Decompressing ..."; + if (!verbose) { + std::cerr << bold("[+]") << " Decompressing ..."; + } const auto start = now(); // Start timer - char c; // Chars in file + PlaintextStream plaintext; + std::exception_ptr decrypt_error; + std::thread decrypt_thread([&]() { + try { + decrypt_stream([&](std::string_view decrypted) { plaintext.push(decrypted); }); + plaintext.close(); + } catch (...) { + decrypt_error = std::current_exception(); + plaintext.fail(decrypt_error); + } + }); + + auto join_decrypt = [&]() { + if (decrypt_thread.joinable()) { + decrypt_thread.join(); + } + if (decrypt_error) { + std::rethrow_exception(decrypt_error); + } + }; + std::string headers; unpackfa_s upkStruct; // Collection of inputs to pass to unpack... 
- std::vector arrThread(n_threads); // Array of threads - std::ifstream in(DEC_FNAME); - in.ignore(1); // Jump over decText[0]==(char) 127 - in.get(c); - shuffled = (c == (char)128); // Check if file had been shuffled - if (verbose) - std::cerr << bold("[+]") << " Extracting no. unique characters ..."; - while (in.get(c) && c != (char)254) headers += c; - if (verbose) // Show number of different chars in headers -- Ignore '>'=62 - std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " - << headers.length() << " \n"; - - // Header -- Set unpack table and unpack function - set_unpackTbl_unpackFn(upkStruct, headers); - - // Distribute file among threads, for reading and unpacking - using unpackHFP = void (Fasta::*)(const unpackfa_s&, byte); - unpackHFP unpackH = - (headers.length() <= MAX_C5) ? &Fasta::unpack_hS : &Fasta::unpack_hL; - - for (byte t = 0; t != n_threads; ++t) { - in.get(c); - if (c == (char)253) { - std::string chunkSizeStr; // Chunk size (std::string) -- For unshuffling - while (in.get(c) && c != (char)254) chunkSizeStr += c; - const auto offset = - stoull(chunkSizeStr); // To traverse decompressed file - - upkStruct.begPos = in.tellg(); - upkStruct.chunkSize = offset; - - arrThread[t] = std::thread(unpackH, this, upkStruct, t); - - // Jump to the beginning of the next chunk - in.seekg((std::streamoff)offset, std::ios_base::cur); - } - // End of file - if (in.peek() == 252) break; - } - // Join threads - for (auto& thr : arrThread) - if (thr.joinable()) thr.join(); + try { + const auto file_type = plaintext.get(); + if (!file_type || *file_type != (char)127) { + throw std::runtime_error("corrupted file."); + } - if (verbose) { - std::cerr << "\r" << bold("[+]") << " Unshuffling done in " - << hms(now() - shuffle_timer); - std::cerr << bold("[+]") << " Decompressing ..."; - } + const auto shuffle_flag = plaintext.get(); + if (!shuffle_flag || (*shuffle_flag != (char)128 && *shuffle_flag != (char)129)) { + throw 
std::runtime_error("corrupted file."); + } - // Close/delete decrypted file - in.close(); - const std::string decFileName = DEC_FNAME; - std::remove(decFileName.c_str()); + shuffled = (*shuffle_flag == (char)128); // Check if file had been shuffled + if (verbose) { + std::cerr << bold("[+]") << " Extracting no. unique characters ..."; + } + if (!plaintext.read_until((char)254, headers)) { + throw std::runtime_error("corrupted file."); + } + if (verbose) { // Show number of different chars in headers -- Ignore '>'=62 + std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " << headers.length() + << " \n"; + } - // Join partially unpacked files - join_unpacked_files(); + // Header -- Set unpack table and unpack function + set_unpackTbl_unpackFn(upkStruct, headers); + const bool has_small_header = headers.length() <= MAX_C5; + + auto read_chunk = [&]() -> std::optional { + const auto marker = plaintext.get(); + if (!marker || *marker == (char)252) { + return std::nullopt; + } + if (*marker != (char)253) { + throw std::runtime_error("corrupted file."); + } + + std::string chunk_size_str; + if (!plaintext.read_until((char)254, chunk_size_str) || chunk_size_str.empty()) { + throw std::runtime_error("corrupted file."); + } + + std::string chunk; + if (!plaintext.read_bytes(std::stoull(chunk_size_str), chunk)) { + throw std::runtime_error("corrupted file."); + } + return chunk; + }; + + auto unpack_chunk = [this, upkStruct, has_small_header](std::string decText) mutable { + if (decText.empty()) { + return std::string{}; + } + + auto i = decText.begin(); + + // Unshuffle + if (shuffled) { + mutxFA.lock(); //-------------------------------------------------- + if (verbose && shuffInProg) { + std::cerr << bold("[+]") << " Unshuffling ..."; + shuffle_timer = now(); + } + shuffInProg = false; + mutxFA.unlock(); //------------------------------------------------ + + unshuffle(i, decText.size()); + } + + std::string upkhdrOut, upkSeqOut; + std::string content; + 
content.reserve(decText.size() * 2); + do { + if (*i == (char)253) { // Hdr + if (has_small_header) { + (this->*upkStruct.unpackHdrFP)(upkhdrOut, ++i, upkStruct.hdrUnpack); + } else { + unpack_large(upkhdrOut, ++i, upkStruct.XChar_hdr, upkStruct.hdrUnpack); + } + content += std::format(">{}\n", upkhdrOut); + } else { // Seq + unpack_seq(upkSeqOut, i); + content += std::format("{}\n", upkSeqOut); + } + } while (++i != decText.end()); + + return content; + }; + + run_ordered_pipeline(n_threads, read_chunk, unpack_chunk, + [](const std::string& output) { std::cout << output; }); + + if (verbose && shuffled) { + std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(now() - shuffle_timer); + std::cerr << bold("[+]") << " Decompressing ..."; + } + } catch (...) { + plaintext.fail(std::current_exception()); + if (decrypt_thread.joinable()) { + decrypt_thread.join(); + } + throw; + } + + join_decrypt(); const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Decompressing done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Decompressing done in " << hms(finish - start); } /** - * @brief Set unpack table and unpack function - * @param[out] upkStruct Unpack structure - * @param[in] headers Headers + * @brief Set unpack table and unpack function + * @param[out] upkStruct Unpack structure + * @param headers Headers */ -void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct, - const std::string& headers) { +void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct, const std::string& headers) { const size_t headersLen = headers.length(); u16 keyLen_hdr = 0; - if (headersLen > MAX_C5) + if (headersLen > MAX_C5) { keyLen_hdr = KEYLEN_C5; - else if (headersLen > MAX_C4) { // Cat 5 + } else if (headersLen > MAX_C4) { // Cat 5 upkStruct.unpackHdrFP = &EnDecrypto::unpack_2B; keyLen_hdr = KEYLEN_C5; } else { upkStruct.unpackHdrFP = &EnDecrypto::unpack_1B; - if (headersLen > MAX_C3) + if (headersLen > MAX_C3) { keyLen_hdr = 
KEYLEN_C4; // Cat 4 - else if (headersLen == MAX_C3 || headersLen == MID_C3 || - headersLen == MIN_C3) + } else if (headersLen == MAX_C3 || headersLen == MID_C3 || headersLen == MIN_C3) { keyLen_hdr = KEYLEN_C3; // Cat 3 - else if (headersLen == C2) + } else if (headersLen == C2) { keyLen_hdr = KEYLEN_C2; // Cat 2 - else if (headersLen == C1) + } else if (headersLen == C1) { keyLen_hdr = KEYLEN_C1; // Cat 1 - else + } else { keyLen_hdr = 1; // = 1 + } } // Build unpacking tables - if (headersLen <= MAX_C5) + if (headersLen <= MAX_C5) { build_unpack_tbl(upkStruct.hdrUnpack, headers, keyLen_hdr); - else { + } else { const std::string decHeaders = headers.substr(headersLen - MAX_C5); // ASCII char after the last char in headers std::string std::string decHeadersX = decHeaders; @@ -344,19 +531,19 @@ void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct, /** * @brief Unpack: small header - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) { unpackFP_t unpackHdr = upkStruct.unpackHdrFP; // Function pointer pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkhdrOut, upkSeqOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -364,6 +551,7 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) { in.seekg(begPos); // Read the file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -384,14 +572,14 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte 
threadID) { unshuffle(i, chunkSize); } // todo - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { if (*i == (char)253) { // Hdr (this->*unpackHdr)(upkhdrOut, ++i, upkStruct.hdrUnpack); - content += ">" + upkhdrOut + "\n"; + content += std::format(">{}\n", upkhdrOut); } else { // Seq unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; + content += std::format("{}\n", upkSeqOut); } } while (++i != decText.end()); // If trouble: change "!=" to "<" @@ -401,7 +589,9 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); @@ -409,10 +599,10 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) { } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); @@ -423,18 +613,18 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) { /** * @brief Unpack: large header - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) { pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkHdrOut, upkSeqOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -442,6 +632,7 @@ void Fasta::unpack_hL(const 
unpackfa_s& upkStruct, byte threadID) { in.seekg(begPos); // Read the file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -462,14 +653,14 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) { unshuffle(i, chunkSize); } - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { if (*i == (char)253) { // Hdr unpack_large(upkHdrOut, ++i, upkStruct.XChar_hdr, upkStruct.hdrUnpack); - content += ">" + upkHdrOut + "\n"; + content += std::format(">{}\n", upkHdrOut); } else { // Seq unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; + content += std::format("{}\n", upkSeqOut); } } while (++i != decText.end()); // If trouble: change "!=" to "<" @@ -479,7 +670,9 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); @@ -487,14 +680,14 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) { } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); upkfile.close(); in.close(); -} \ No newline at end of file +} diff --git a/src/fasta.hpp b/src/fasta.hpp index 6037ba2..0b56568 100644 --- a/src/fasta.hpp +++ b/src/fasta.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file fasta.hpp - * @brief Compression/Decompression of FASTA - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file fasta.hpp + * 
@brief Compression/Decompression of FASTA */ #ifndef CRYFA_FASTA_H @@ -24,12 +24,11 @@ struct packfa_s { * @brief Unpakcing FASTA */ struct unpackfa_s { - char XChar_hdr; /**< @brief Extra char if header's length > 39 */ - pos_t begPos; /**< @brief Begining position for each thread */ - u64 chunkSize; /**< @brief Chunk size */ - std::vector - hdrUnpack; /**< @brief Lookup table for unpacking headers */ - unpackFP_t unpackHdrFP; /**< @brief Points to a header unpacking fn */ + char XChar_hdr; /**< @brief Extra char if header's length > 39 */ + pos_t begPos; /**< @brief Begining position for each thread */ + u64 chunkSize; /**< @brief Chunk size */ + std::vector hdrUnpack; /**< @brief Lookup table for unpacking headers */ + unpackFP_t unpackHdrFP; /**< @brief Points to a header unpacking fn */ }; /** @@ -50,4 +49,4 @@ class Fasta : public EnDecrypto { }; } // namespace cryfa -#endif // CRYFA_FASTA_H \ No newline at end of file +#endif // CRYFA_FASTA_H diff --git a/src/fastq.cpp b/src/fastq.cpp index 47ceb8e..f1cff46 100644 --- a/src/fastq.cpp +++ b/src/fastq.cpp @@ -1,27 +1,46 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file fastq.cpp - * @brief Compression/Decompression of FASTQ - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file fastq.cpp + * @brief Compression/Decompression of FASTQ */ #include "fastq.hpp" #include +#include +#include #include #include // setw, std::setprecision #include +#include +#include #include +#include +#include "ordered_pipeline.hpp" +#include "plaintext_stream.hpp" #include "string.hpp" #include "time.hpp" using namespace cryfa; std::mutex mutxFQ; /**< @brief Mutex */ +namespace { +struct FastqRecord { + std::string header; + std::string sequence; + std::string quality; +}; + +struct FastqChunk { + std::vector records; +}; +} // namespace + /** - * @brief Check if the third line 
contains only + + * @brief Check if the third line contains only + * @return True or false */ bool Fastq::has_just_plus() const { @@ -48,53 +67,121 @@ bool Fastq::has_just_plus() const { * @brief Compress */ void Fastq::compress() { - if (!verbose) std::cerr << bold("[+]") << " Compacting ..."; + if (!verbose) { + std::cerr << bold("[+]") << " Compacting ..."; + } const auto start = now(); // Start timer - std::vector arrThread(n_threads); std::string headers, qscores; packfq_s pkStruct; // Collection of inputs to pass to pack... - if (verbose) + if (verbose) { std::cerr << bold("[+]") << " Calculating no. unique characters ..."; + } // Gather different chars and max length in all headers and quality scores gather_h_q(headers, qscores); // Show number of different chars in headers and qs -- Ignore '@'=64 in hdr - if (verbose) - std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " - << headers.length() << ", qscores => " << qscores.length() - << "\n"; + if (verbose) { + std::cerr << "\r" << bold("[+]") << " No. 
unique characters: headers => " << headers.length() + << ", qscores => " << qscores.length() << "\n"; + } // Set Hash table and pack function set_hashTbl_packFn(pkStruct, headers, qscores); - // Distribute file among threads, for reading and packing - for (byte t = 0; t != n_threads; ++t) - arrThread[t] = std::thread(&Fastq::pack, this, pkStruct, t); - for (auto& thr : arrThread) - if (thr.joinable()) thr.join(); + const bool plus_is_plain = has_just_plus(); - if (verbose) { - std::cerr << "\r" << bold("[+]") << " Shuffling done in " - << hms(now() - shuffle_timer); - std::cerr << bold("[+]") << " Compacting ..."; - } + auto read_chunk = [this, in = std::ifstream(in_file)]() mutable -> std::optional { + FastqChunk chunk; + std::string plus; + u64 chunk_bytes = 0; - // Join partially packed and/or shuffled files - join_packed_files(headers, qscores, 'Q', has_just_plus()); + while (chunk_bytes < CHUNK_TARGET_SIZE) { + FastqRecord record; + if (!std::getline(in, record.header)) { + break; + } + if (!std::getline(in, record.sequence)) { + break; + } + if (!std::getline(in, plus)) { + break; + } + if (!std::getline(in, record.quality)) { + break; + } - const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Compacting done in " - << hms(finish - start); + chunk_bytes += + record.header.size() + record.sequence.size() + plus.size() + record.quality.size() + 4; + chunk.records.push_back(std::move(record)); + } + + if (chunk.records.empty()) { + return std::nullopt; + } + return chunk; + }; + + auto pack_chunk = [this, pkStruct](FastqChunk chunk) { + packFP_t packHdr = pkStruct.packHdrFPtr; + packFP_t packQS = pkStruct.packQSFPtr; + std::string context; + context.reserve(CHUNK_TARGET_SIZE); + + for (const FastqRecord& record : chunk.records) { + (this->*packHdr)(context, record.header.substr(1), HdrMap); + context += (char)254; + pack_seq(context, record.sequence); + context += (char)254; + (this->*packQS)(context, record.quality, QsMap); + context 
+= (char)254; + } + + if (!stop_shuffle) { + mutxFQ.lock(); //---------------------------------------------------- + if (verbose && shuffInProg) { + std::cerr << bold("[+]") << " Shuffling ..."; + shuffle_timer = now(); + } + shuffInProg = false; + mutxFQ.unlock(); //-------------------------------------------------- - // Cout encrypted content - encrypt(); + shuffle(context); + } + + std::string packed = std::format("{}{}{}", (char)253, context.size(), (char)254); + packed += context; + return packed; + }; + + encrypt_stream([&](const PlaintextSink& emit) { + std::string header; + header.reserve(headers.size() + qscores.size() + 3); + header += (char)126; + header += (!stop_shuffle ? (char)128 : (char)129); + header += headers; + header += (char)254; + header += qscores; + header += (plus_is_plain ? (char)253 : '\n'); + emit(header); + + run_ordered_pipeline(n_threads, read_chunk, pack_chunk, emit); + emit(std::string(1, (char)252)); + + if (verbose && !stop_shuffle) { + std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(now() - shuffle_timer); + std::cerr << bold("[+]") << " Compacting ..."; + } + + const auto finish = now(); // Stop timer + std::cerr << "\r" << bold("[+]") << " Compacting done in " << hms(finish - start); + }); } /** * @brief Set hash table and pack function - * @param[out] pkStruct Pack structure - * @param[in] headers Headers - * @param[in] qscores Quality scores + * @param[out] pkStruct Pack structure + * @param headers Headers + * @param qscores Quality scores */ void Fastq::set_hashTbl_packFn(packfq_s& pkStruct, const std::string& headers, const std::string& qscores) { @@ -170,20 +257,24 @@ void Fastq::set_hashTbl_packFn(packfq_s& pkStruct, const std::string& headers, /** * @brief Pack. 
'@' at the beginning of headers is not packed - * @param pkStruct Pack structure - * @param threadID Thread ID + * @param pkStruct Pack structure + * @param threadID Thread ID */ void Fastq::pack(const packfq_s& pkStruct, byte threadID) { packFP_t packHdr = pkStruct.packHdrFPtr; // Function pointer packFP_t packQS = pkStruct.packQSFPtr; // Function pointer std::ifstream in(in_file); - std::ofstream pkfile(PK_FNAME + std::to_string(threadID), std::ios_base::app); + std::ofstream pkfile(std::format("{}{}", PK_FNAME, static_cast(threadID)), + std::ios_base::app); // Lines ignored at the beginning - for (u64 l = (u64)threadID * BlockLine; l--;) IGNORE_THIS_LINE(in); + for (u64 l = (u64)threadID * BlockLine; l--;) { + IGNORE_THIS_LINE(in); + } while (in.peek() != EOF) { std::string context; // Output std::string + context.reserve(CHUNK_TARGET_SIZE); std::string line; for (u64 l = 0; l != BlockLine; l += 4) { // Process 4 lines by 4 lines @@ -216,18 +307,17 @@ void Fastq::pack(const packfq_s& pkStruct, byte threadID) { } // For unshuffling: insert the size of packed context in the beginning - std::string contextSize; - contextSize += (char)253; - contextSize += std::to_string(context.size()); - contextSize += (char)254; + std::string contextSize = std::format("{}{}{}", (char)253, context.size(), (char)254); context.insert(0, contextSize); // Write header containing threadID for each - pkfile << THR_ID_HDR << std::to_string(threadID) << '\n'; + pkfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); pkfile << context << '\n'; // Ignore to go to the next related chunk - for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) IGNORE_THIS_LINE(in); + for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) { + IGNORE_THIS_LINE(in); + } } pkfile.close(); @@ -236,8 +326,8 @@ void Fastq::pack(const packfq_s& pkStruct, byte threadID) { /** * @brief Gather chars of all headers & quality scores, excluding '@' in headers - * @param[out] headers Chars of all headers - * 
@param[out] qscores Chars of all quality scores + * @param[out] headers Chars of all headers + * @param[out] qscores Chars of all quality scores */ void Fastq::gather_h_q(std::string& headers, std::string& qscores) { u32 maxHLen = 0, maxQLen = 0; // Max length of headers & quality scores @@ -248,168 +338,275 @@ void Fastq::gather_h_q(std::string& headers, std::string& qscores) { std::ifstream in(in_file); for (std::string line; !in.eof();) { if (getline(in, line).good()) { - for (char c : line) hChars[c] = true; - if (line.size() > maxHLen) maxHLen = (u32)line.size(); + for (char c : line) { + hChars[c] = true; + } + if (line.size() > maxHLen) { + maxHLen = (u32)line.size(); + } } IGNORE_THIS_LINE(in); // Ignore sequence IGNORE_THIS_LINE(in); // Ignore + if (getline(in, line).good()) { - for (char c : line) qChars[c] = true; - if (line.size() > maxQLen) maxQLen = (u32)line.size(); + for (char c : line) { + qChars[c] = true; + } + if (line.size() > maxQLen) { + maxQLen = (u32)line.size(); + } } } in.close(); // Number of lines read from input file while compression - BlockLine = (u32)(4 * (BLOCK_SIZE / (maxHLen + 2 * maxQLen))); + BlockLine = (u32)(4 * (CHUNK_TARGET_SIZE / (maxHLen + 2 * maxQLen))); if (!BlockLine) BlockLine = 4; // Gather the characters -- ignore '@'=64 for headers - for (byte i = 32; i != 64; ++i) - if (*(hChars + i)) headers += i; - for (byte i = 65; i != 127; ++i) - if (*(hChars + i)) headers += i; - for (byte i = 32; i != 127; ++i) - if (*(qChars + i)) qscores += i; + for (byte i = 32; i != 64; ++i) { + if (*(hChars + i)) { + headers += i; + } + } + for (byte i = 65; i != 127; ++i) { + if (*(hChars + i)) { + headers += i; + } + } + for (byte i = 32; i != 127; ++i) { + if (*(qChars + i)) { + qscores += i; + } + } } /** * @brief Decompress */ void Fastq::decompress() { - if (!verbose) std::cerr << bold("[+]") << " Decompressing ..."; + if (!verbose) { + std::cerr << bold("[+]") << " Decompressing ..."; + } const auto start = now(); // Start timer 
- char c; // Chars in file + PlaintextStream plaintext; + std::exception_ptr decrypt_error; + std::thread decrypt_thread([&]() { + try { + decrypt_stream([&](std::string_view decrypted) { plaintext.push(decrypted); }); + plaintext.close(); + } catch (...) { + decrypt_error = std::current_exception(); + plaintext.fail(decrypt_error); + } + }); + + auto join_decrypt = [&]() { + if (decrypt_thread.joinable()) { + decrypt_thread.join(); + } + if (decrypt_error) { + std::rethrow_exception(decrypt_error); + } + }; + std::string headers, qscores; unpackfq_s upkStruct; // Collection of inputs to pass to unpack... - std::vector arrThread(n_threads); // Array of threads - std::ifstream in(DEC_FNAME); - in.ignore(1); // Jump over decText[0]==(char) 126 - in.get(c); - shuffled = (c == (char)128); // Check if file had been shuffled - if (verbose) - std::cerr << bold("[+]") << " Extracting no. unique characters ..."; - while (in.get(c) && c != (char)254) headers += c; - while (in.get(c) && c != '\n' && c != (char)253) qscores += c; - // Show number of different chars in headers and qs -- ignore '@'=64 - if (verbose) - std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " - << headers.length() << ", qscores => " << qscores.length() - << "\n"; - if (c == '\n') justPlus = false; // If 3rd line is just + - - // Header -- Set unpack table and unpack function - set_unpackTbl_unpackFn(upkStruct, headers, qscores); - - // Distribute file among threads, for reading and unpacking - using unpackHQFP = void (Fastq::*)(const unpackfq_s&, byte); - unpackHQFP unpackHQ = - (headers.length() <= MAX_C5) - ? (qscores.length() <= MAX_C5 ? &Fastq::unpack_hS_qS - : &Fastq::unpack_hS_qL) - : (qscores.length() > MAX_C5 ? 
&Fastq::unpack_hL_qL - : &Fastq::unpack_hL_qS); - - for (byte t = 0; t != n_threads; ++t) { - in.get(c); - if (c == (char)253) { - std::string chunkSizeStr; // Chunk size (std::string) -- For unshuffling - while (in.get(c) && c != (char)254) chunkSizeStr += c; - const u64 offset = stoull(chunkSizeStr); // To traverse decompressed file - - upkStruct.begPos = in.tellg(); - upkStruct.chunkSize = offset; - - arrThread[t] = std::thread(unpackHQ, this, upkStruct, t); - - // Jump to the beginning of the next chunk - in.seekg((std::streamoff)offset, std::ios_base::cur); - } - // End of file - if (in.peek() == 252) break; - } - // Join threads - for (auto& thr : arrThread) - if (thr.joinable()) thr.join(); + try { + const auto file_type = plaintext.get(); + if (!file_type || *file_type != (char)126) { + throw std::runtime_error("corrupted file."); + } - if (verbose) { - std::cerr << "\r" << bold("[+]") << " Unshuffling done in " - << hms(now() - shuffle_timer); - std::cerr << bold("[+]") << " Decompressing ..."; - } + const auto shuffle_flag = plaintext.get(); + if (!shuffle_flag || (*shuffle_flag != (char)128 && *shuffle_flag != (char)129)) { + throw std::runtime_error("corrupted file."); + } - // Close/delete decrypted file - in.close(); - const std::string decFileName = DEC_FNAME; - std::remove(decFileName.c_str()); + shuffled = (*shuffle_flag == (char)128); // Check if file had been shuffled + if (verbose) { + std::cerr << bold("[+]") << " Extracting no. unique characters ..."; + } + if (!plaintext.read_until((char)254, headers)) { + throw std::runtime_error("corrupted file."); + } + + char c = 0; + while (std::optional next = plaintext.get()) { + c = *next; + if (c == '\n' || c == (char)253) { + break; + } + qscores += c; + } + if (c != '\n' && c != (char)253) { + throw std::runtime_error("corrupted file."); + } + // Show number of different chars in headers and qs -- ignore '@'=64 + if (verbose) { + std::cerr << "\r" << bold("[+]") << " No. 
unique characters: headers => " << headers.length() + << ", qscores => " << qscores.length() << "\n"; + } + justPlus = (c != '\n'); // If 3rd line is just + + + // Header -- Set unpack table and unpack function + set_unpackTbl_unpackFn(upkStruct, headers, qscores); + const bool has_small_header = headers.length() <= MAX_C5; + const bool has_small_qscore = qscores.length() <= MAX_C5; + + auto read_chunk = [&]() -> std::optional { + const auto marker = plaintext.get(); + if (!marker || *marker == (char)252) { + return std::nullopt; + } + if (*marker != (char)253) { + throw std::runtime_error("corrupted file."); + } + + std::string chunk_size_str; + if (!plaintext.read_until((char)254, chunk_size_str) || chunk_size_str.empty()) { + throw std::runtime_error("corrupted file."); + } + + std::string chunk; + if (!plaintext.read_bytes(std::stoull(chunk_size_str), chunk)) { + throw std::runtime_error("corrupted file."); + } + return chunk; + }; + + auto unpack_chunk = [this, upkStruct, has_small_header, + has_small_qscore](std::string decText) mutable { + if (decText.empty()) { + return std::string{}; + } + + auto i = decText.begin(); + + // Unshuffle + if (shuffled) { + mutxFQ.lock(); //-------------------------------------------------- + if (verbose && shuffInProg) { + std::cerr << bold("[+]") << " Unshuffling ..."; + shuffle_timer = now(); + } + shuffInProg = false; + mutxFQ.unlock(); //------------------------------------------------ + + unshuffle(i, decText.size()); + } - // Join partially unpacked files - join_unpacked_files(); + std::string upkHdrOut, upkSeqOut, upkQsOut; + std::string content; + content.reserve(decText.size() * 2); + do { + content += '@'; + std::string plusMore; + + if (has_small_header) { + (this->*upkStruct.unpackHdrFPtr)(upkHdrOut, i, upkStruct.hdrUnpack); + } else { + unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack); + } + plusMore = upkHdrOut; + content += std::format("{}\n", upkHdrOut); + ++i; // Hdr + + 
unpack_seq(upkSeqOut, i); + content += std::format("{}\n", upkSeqOut); // Seq + + content += justPlus ? "+\n" : std::format("+{}\n", plusMore); + ++i; // + + + if (has_small_qscore) { + (this->*upkStruct.unpackQSFPtr)(upkQsOut, i, upkStruct.qsUnpack); + } else { + unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack); + } + content += std::format("{}\n", upkQsOut); // Qs + } while (++i != decText.end()); + + return content; + }; + + run_ordered_pipeline(n_threads, read_chunk, unpack_chunk, + [](const std::string& output) { std::cout << output; }); + + if (verbose && shuffled) { + std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(now() - shuffle_timer); + std::cerr << bold("[+]") << " Decompressing ..."; + } + } catch (...) { + plaintext.fail(std::current_exception()); + if (decrypt_thread.joinable()) { + decrypt_thread.join(); + } + throw; + } + + join_decrypt(); const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Decompressing done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Decompressing done in " << hms(finish - start); } /** * @brief Set unpack table and unpack function - * @param[out] upkStruct Unpack structure - * @param[in] headers Headers - * @param[in] qscores Quality scores + * @param[out] upkStruct Unpack structure + * @param headers Headers + * @param qscores Quality scores */ -void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct, - const std::string& headers, +void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct, const std::string& headers, const std::string& qscores) { const auto headersLen = headers.length(); const auto qscoresLen = qscores.length(); u16 keyLen_hdr = 0, keyLen_qs = 0; // Header - if (headersLen > MAX_C5) + if (headersLen > MAX_C5) { keyLen_hdr = KEYLEN_C5; - else if (headersLen > MAX_C4) { // Cat 5 + } else if (headersLen > MAX_C4) { // Cat 5 upkStruct.unpackHdrFPtr = &EnDecrypto::unpack_2B; keyLen_hdr = KEYLEN_C5; } else { 
upkStruct.unpackHdrFPtr = &EnDecrypto::unpack_1B; - if (headersLen > MAX_C3) + if (headersLen > MAX_C3) { keyLen_hdr = KEYLEN_C4; // Cat 4 - else if (headersLen == MAX_C3 || headersLen == MID_C3 || - headersLen == MIN_C3) + } else if (headersLen == MAX_C3 || headersLen == MID_C3 || headersLen == MIN_C3) { keyLen_hdr = KEYLEN_C3; // Cat 3 - else if (headersLen == C2) + } else if (headersLen == C2) { keyLen_hdr = KEYLEN_C2; // Cat 2 - else if (headersLen == C1) + } else if (headersLen == C1) { keyLen_hdr = KEYLEN_C1; // Cat 1 - else + } else { keyLen_hdr = 1; // = 1 + } } // Quality score - if (qscoresLen > MAX_C5) + if (qscoresLen > MAX_C5) { keyLen_qs = KEYLEN_C5; - else if (qscoresLen > MAX_C4) { // Cat 5 + } else if (qscoresLen > MAX_C4) { // Cat 5 upkStruct.unpackQSFPtr = &EnDecrypto::unpack_2B; keyLen_qs = KEYLEN_C5; } else { upkStruct.unpackQSFPtr = &EnDecrypto::unpack_1B; - if (qscoresLen > MAX_C3) + if (qscoresLen > MAX_C3) { keyLen_qs = KEYLEN_C4; // Cat 4 - else if (qscoresLen == MAX_C3 || qscoresLen == MID_C3 || - qscoresLen == MIN_C3) + } else if (qscoresLen == MAX_C3 || qscoresLen == MID_C3 || qscoresLen == MIN_C3) { keyLen_qs = KEYLEN_C3; // Cat 3 - else if (qscoresLen == C2) + } else if (qscoresLen == C2) { keyLen_qs = KEYLEN_C2; // Cat 2 - else if (qscoresLen == C1) + } else if (qscoresLen == C1) { keyLen_qs = KEYLEN_C1; // Cat 1 - else + } else { keyLen_qs = 1; // = 1 + } } // Build unpacking tables @@ -449,8 +646,8 @@ void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct, /** * @brief Unpack: small header, small quality score. 
* '@' at the beginning of headers not packed - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { unpackFP_t unpackHdr = upkStruct.unpackHdrFPtr; // Function pointer @@ -458,11 +655,11 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkHdrOut, upkSeqOut, upkQsOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -470,6 +667,7 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { in.seekg(begPos); // Read the file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -490,24 +688,24 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { unshuffle(i, chunkSize); } - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { content += '@'; std::string plusMore; (this->*unpackHdr)(upkHdrOut, i, upkStruct.hdrUnpack); plusMore = upkHdrOut; - content += upkHdrOut + "\n"; + content += std::format("{}\n", upkHdrOut); ++i; // Hdr unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; // Seq + content += std::format("{}\n", upkSeqOut); // Seq - content += (justPlus ? "+" : "+" + plusMore) + "\n"; + content += justPlus ? 
"+\n" : std::format("+{}\n", plusMore); ++i; // + (this->*unpackQS)(upkQsOut, i, upkStruct.qsUnpack); - content += upkQsOut + "\n"; // Qs + content += std::format("{}\n", upkQsOut); // Qs } while (++i != decText.end()); // If trouble: change "!=" to "<" // Update the chunk size and positions (beg & end) @@ -516,7 +714,9 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); @@ -524,10 +724,10 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); @@ -539,19 +739,19 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) { /** * @brief Unpack: small header, large quality score. 
* '@' at the beginning of headers not packed - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { unpackFP_t unpackHdr = upkStruct.unpackHdrFPtr; // Function pointer pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkHdrOut, upkSeqOut, upkQsOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -559,6 +759,7 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { in.seekg(begPos); // Read file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -579,24 +780,24 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { unshuffle(i, chunkSize); } - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { content += '@'; std::string plusMore; (this->*unpackHdr)(upkHdrOut, i, upkStruct.hdrUnpack); plusMore = upkHdrOut; - content += upkHdrOut + "\n"; + content += std::format("{}\n", upkHdrOut); ++i; // Hdr unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; // Seq + content += std::format("{}\n", upkSeqOut); // Seq - content += (justPlus ? "+" : "+" + plusMore) + "\n"; + content += justPlus ? 
"+\n" : std::format("+{}\n", plusMore); ++i; // + unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack); - content += upkQsOut + "\n"; // Qs + content += std::format("{}\n", upkQsOut); // Qs } while (++i != decText.end()); // If trouble: change "!=" to "<" // Update the chunk size and positions (beg & end) @@ -605,7 +806,9 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); @@ -613,10 +816,10 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); @@ -628,19 +831,19 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) { /** * @brief Unpack: large header, small quality score. 
* '@' at the beginning of headers not packed - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { unpackFP_t unpackQS = upkStruct.unpackQSFPtr; // Function pointer pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkHdrOut, upkSeqOut, upkQsOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -648,6 +851,7 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { in.seekg(begPos); // Read file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -668,24 +872,24 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { unshuffle(i, chunkSize); } - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { content += "@"; std::string plusMore; unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack); plusMore = upkHdrOut; - content += upkHdrOut + "\n"; + content += std::format("{}\n", upkHdrOut); ++i; // Hdr unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; // Seq + content += std::format("{}\n", upkSeqOut); // Seq - content += (justPlus ? "+" : "+" + plusMore) + "\n"; + content += justPlus ? 
"+\n" : std::format("+{}\n", plusMore); ++i; // + (this->*unpackQS)(upkQsOut, i, upkStruct.qsUnpack); - content += upkQsOut + "\n"; // Qs + content += std::format("{}\n", upkQsOut); // Qs } while (++i != decText.end()); // If trouble: change "!=" to "<" // Update the chunk size and positions (beg & end) @@ -694,7 +898,9 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); @@ -702,10 +908,10 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); @@ -717,18 +923,18 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) { /** * @brief Unpack: large header, large quality score. 
* '@' at the beginning of headers not packed - * @param upkStruct Unpack structure - * @param threadID Thread ID + * @param upkStruct Unpack structure + * @param threadID Thread ID */ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) { pos_t begPos = upkStruct.begPos; u64 chunkSize = upkStruct.chunkSize; std::ifstream in(DEC_FNAME); - std::ofstream upkfile(UPK_FNAME + std::to_string(threadID), + std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)), std::ios_base::app); std::string upkHdrOut, upkSeqOut, upkQsOut; std::string content; - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); auto write_content = [&]() { upkfile << content; }; while (in.peek() != EOF) { @@ -736,6 +942,7 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) { in.seekg(begPos); // Read file from this position // Take a chunk of decrypted file std::string decText; + decText.reserve(chunkSize); for (u64 u = chunkSize; u--;) { in.get(c); decText += c; @@ -756,24 +963,24 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) { unshuffle(i, chunkSize); } - content += THR_ID_HDR + std::to_string(threadID) + "\n"; + content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID)); do { content += "@"; std::string plusMore; unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack); plusMore = upkHdrOut; - content += upkHdrOut + "\n"; + content += std::format("{}\n", upkHdrOut); ++i; // Hdr unpack_seq(upkSeqOut, i); - content += upkSeqOut + "\n"; // Seq + content += std::format("{}\n", upkSeqOut); // Seq - content += (justPlus ? "+" : "+" + plusMore) + "\n"; + content += justPlus ? 
"+\n" : std::format("+{}\n", plusMore); ++i; // + unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack); - content += upkQsOut + "\n"; // Qs + content += std::format("{}\n", upkQsOut); // Qs } while (++i != decText.end()); // If trouble: change "!=" to "<" // Update the chunk size and positions (beg & end) @@ -782,22 +989,23 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) { in.get(c); if (c == (char)253) { std::string chunkSizeStr; - while (in.get(c) && c != (char)254) chunkSizeStr += c; - + while (in.get(c) && c != (char)254) { + chunkSizeStr += c; + } chunkSize = stoull(chunkSizeStr); begPos = in.tellg(); endPos = begPos + (pos_t)chunkSize; } } - if (content.size() >= BLOCK_SIZE) { + if (content.size() >= IO_BUFFER_SIZE) { write_content(); content.clear(); - content.reserve(BLOCK_SIZE); + content.reserve(IO_BUFFER_SIZE); } } write_content(); upkfile.close(); in.close(); -} \ No newline at end of file +} diff --git a/src/fastq.hpp b/src/fastq.hpp index 407634b..2314812 100644 --- a/src/fastq.hpp +++ b/src/fastq.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file fastq.hpp - * @brief Compression/Decompression of FASTQ - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file fastq.hpp + * @brief Compression/Decompression of FASTQ */ #ifndef CRYFA_FASTQ_H @@ -21,16 +21,14 @@ struct packfq_s { /** @brief Unpakcing FASTQ */ struct unpackfq_s { - char XChar_hdr; /**< @brief Extra char if header's length > 39 */ - char XChar_qs; /**< @brief Extra char if q scores length > 39 */ - pos_t begPos; /**< @brief Begining position for each thread */ - u64 chunkSize; /**< @brief Chunk size */ - std::vector - hdrUnpack; /**< @brief Lookup table for unpacking headers */ - std::vector - qsUnpack; /**< @brief Lookup table for unpacking q scores */ - unpackFP_t unpackHdrFPtr; /**< 
@brief Points to a hdr unpacking function */ - unpackFP_t unpackQSFPtr; /**< @brief Points to a qs unpacking function */ + char XChar_hdr; /**< @brief Extra char if header's length > 39 */ + char XChar_qs; /**< @brief Extra char if q scores length > 39 */ + pos_t begPos; /**< @brief Begining position for each thread */ + u64 chunkSize; /**< @brief Chunk size */ + std::vector hdrUnpack; /**< @brief Lookup table for unpacking headers */ + std::vector qsUnpack; /**< @brief Lookup table for unpacking q scores */ + unpackFP_t unpackHdrFPtr; /**< @brief Points to a hdr unpacking function */ + unpackFP_t unpackQSFPtr; /**< @brief Points to a qs unpacking function */ }; /** @@ -48,8 +46,7 @@ class Fastq : public EnDecrypto { void gather_h_q(std::string&, std::string&); void set_hashTbl_packFn(packfq_s&, const std::string&, const std::string&); void pack(const packfq_s&, byte); - void set_unpackTbl_unpackFn(unpackfq_s&, const std::string&, - const std::string&); + void set_unpackTbl_unpackFn(unpackfq_s&, const std::string&, const std::string&); void unpack_hS_qS(const unpackfq_s&, byte); void unpack_hS_qL(const unpackfq_s&, byte); void unpack_hL_qS(const unpackfq_s&, byte); @@ -57,4 +54,4 @@ class Fastq : public EnDecrypto { }; } // namespace cryfa -#endif // CRYFA_FASTQ_H \ No newline at end of file +#endif // CRYFA_FASTQ_H diff --git a/src/include/assert.hpp b/src/include/assert.hpp index 029299a..510afd4 100644 --- a/src/include/assert.hpp +++ b/src/include/assert.hpp @@ -1,74 +1,74 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file assert.hpp - * @brief Assertions - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file assert.hpp + * @brief Assertion and diagnostic helpers */ #ifndef CRYFA_ASSERT_H #define CRYFA_ASSERT_H +#include #include #include // std::runtime_error #include "string.hpp" /** - * @brief Show error 
- * @param message the message to be shown - * @param width width of the message shown on terminal + * @brief Throw a formatted runtime error + * @param message Message shown after the error prefix + * @param width Maximum terminal width used when wrapping the message + * @throws std::runtime_error Always throws with the formatted error message */ inline void error(std::string const& message, int width = 65) { - std::string msg = wrap_text("Error: " + message, "", width); - msg = bold(msg.substr(0, 6), "red") + msg.substr(6) + "\n"; + std::string msg = wrap_text(std::format("Error: {}", message), "", width); + msg = std::format("{}{}\n", bold(msg.substr(0, 6), "red"), msg.substr(6)); throw std::runtime_error(msg); } /** - * @brief Show warning - * @param message the message to be shown - * @param width width of the message shown on terminal + * @brief Print a formatted warning message + * @param message Message shown after the warning prefix + * @param width Maximum terminal width used when wrapping the message */ inline void warning(std::string const& message, int width = 65) { - std::string msg = wrap_text("Warning: " + message, "", width); - msg = bold(msg.substr(0, 8), "magenta") + msg.substr(8) + "\n"; + std::string msg = wrap_text(std::format("Warning: {}", message), "", width); + msg = std::format("{}{}\n", bold(msg.substr(0, 8), "magenta"), msg.substr(8)); std::cerr << msg; } /** - * @brief Assert a condition - * @param cond the condition to be checked - * @param msg the message shown when the condition is true + * @brief Throw an error when a condition is true + * @param cond Condition that triggers the error + * @param msg Message shown when the condition is true */ inline void assert_single(bool cond, const std::string& msg) { if (cond) error(msg); } /** - * @brief Assert a condition - * @param cond the condition which will be checked - * @param msgT the message shown when the condition is true - * @param msgF the message shown when the condition is 
false + * @brief Throw one of two errors based on a condition + * @param cond Condition used to select the error message + * @param msgT Message shown when the condition is true + * @param msgF Message shown when the condition is false */ -inline void assert_dual(bool cond, const std::string& msgT, - const std::string& msgF) { +inline void assert_dual(bool cond, const std::string& msgT, const std::string& msgF) { error(cond ? msgT : msgF); } /** - * @brief Check if file is good - * @param fname the file name - * @param msg the error message + * @brief Check that a file can be opened and is not empty + * @param fname File name to check + * @param msg Optional custom error message */ -inline void assert_file_good(const std::string& fname, - const std::string& msg = "") { +inline void assert_file_good(const std::string& fname, const std::string& msg = "") { std::ifstream in(fname); if (!in.good() || in.peek() == EOF) { in.close(); - assert_dual(msg.empty(), "failed opening \"" + fname + "\".", msg); + assert_dual(msg.empty(), std::format("failed opening \"{}\".", fname), msg); } in.close(); } -#endif // CRYFA_ASSERT_H \ No newline at end of file +#endif // CRYFA_ASSERT_H diff --git a/src/include/file.hpp b/src/include/file.hpp index 6863074..5c14299 100644 --- a/src/include/file.hpp +++ b/src/include/file.hpp @@ -1,40 +1,44 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file file.hpp - * @brief file handling - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file file.hpp + * @brief File handling */ #ifndef CRYFA_FILE_HPP #define CRYFA_FILE_HPP +#include #include /** * @brief Check if file can be opened correctly - * @param name name of the file + * @param name Name of the file */ inline static void check_file(std::string name) { // Must be inline std::ifstream f(name); if (!f) { f.close(); - error("the file \"" + 
name + "\" cannot be opened or is empty."); + error(std::format("the file \"{}\" cannot be opened or is empty.", name)); } else { bool foundChar{false}; - for (char c; f.get(c) && !foundChar;) - if (c != ' ' && c != '\n' && c != '\t') foundChar = true; + for (char c; f.get(c) && !foundChar;) { + if (c != ' ' && c != '\n' && c != '\t') { + foundChar = true; + } + } if (!foundChar) { f.close(); - error("the file \"" + name + "\" is empty."); + error(std::format("the file \"{}\" is empty.", name)); } f.close(); } } /** - * @brief Extract file name - * @param path path including the file name + * @brief Extract file name + * @param path Path including the file name * @return File name */ inline static std::string file_name(std::string path) { @@ -43,8 +47,8 @@ inline static std::string file_name(std::string path) { } /** - * @brief Find file size - * @param name name of the file + * @brief Find file size + * @param name Name of the file * @return File size */ inline static uint64_t file_size(std::string name) { @@ -53,4 +57,4 @@ inline static uint64_t file_size(std::string name) { return static_cast(f.tellg()); } -#endif // CRYFA_FILE_HPP \ No newline at end of file +#endif // CRYFA_FILE_HPP diff --git a/src/include/numeric.hpp b/src/include/numeric.hpp index a38552b..52ff64b 100644 --- a/src/include/numeric.hpp +++ b/src/include/numeric.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file numeric.hpp - * @brief numerical functions - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file numeric.hpp + * @brief Numerical functions */ #ifndef CRYFA_NUMERIC_HPP @@ -17,24 +17,26 @@ #include "string.hpp" /** - * @brief Accumulate hop index values in a range - * @param first beginning of the range - * @param last end of the range - * @param init initial value - * @param h hop value + * @brief Accumulate hop 
index values in a range + * @param first Beginning of the range + * @param last End of the range + * @param init Initial value + * @param h Hop value * @return A number */ template T accum_hops(Iter first, Iter last, T init, Hop h) { - for (; first < last; first += h) init += *first; + for (; first < last; first += h) { + init += *first; + } return init; } /** - * @brief Accumulate even index values in a range - * @param first beginning of the range - * @param last end of the range - * @param init initial value + * @brief Accumulate even index values in a range + * @param first Beginning of the range + * @param last End of the range + * @param init Initial value * @return A number */ template @@ -43,10 +45,10 @@ T accum_even(Iter first, Iter last, T init) { } /** - * @brief Accumulate odd index values in a range - * @param first beginning of the range - * @param last end of the range - * @param init initial value + * @brief Accumulate odd index values in a range + * @param first Beginning of the range + * @param last End of the range + * @param init Initial value * @return A number */ template @@ -55,14 +57,13 @@ T accum_odd(Iter first, Iter last, T init) { } /** - * @brief Check if a string is a number - * @param s the input string + * @brief Check if a string is a number + * @param s The input string * @return Yes, if it is a number */ inline bool is_number(const std::string& s) { assert_single(s.empty(), "the string is empty."); - return std::find_if(s.begin(), s.end(), - [](char c) { return !std::isdigit(c); }) == s.end(); + return std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); } -#endif // CRYFA_NUMERIC_HPP \ No newline at end of file +#endif // CRYFA_NUMERIC_HPP diff --git a/src/include/ordered_pipeline.hpp b/src/include/ordered_pipeline.hpp new file mode 100644 index 0000000..9e33b2c --- /dev/null +++ b/src/include/ordered_pipeline.hpp @@ -0,0 +1,162 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// 
SPDX-License-Identifier: GPL-3.0-only + +/** + * @file ordered_pipeline.hpp + * @brief Ordered pipeline functions + */ + +#ifndef CRYFA_ORDERED_PIPELINE_HPP +#define CRYFA_ORDERED_PIPELINE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../def.hpp" + +namespace cryfa { + +template +void run_ordered_pipeline(size_t worker_count, ReadChunk&& read_chunk, PackChunk&& pack_chunk, + Emit&& emit) { + worker_count = std::max(1, worker_count); + + struct WorkItem { + u64 index = 0; + Chunk chunk; + }; + + const size_t max_in_flight = std::max(1, worker_count * 2); + std::mutex mutex; + std::condition_variable work_ready; + std::condition_variable space_ready; + std::condition_variable result_ready; + std::deque work_queue; + std::map results; + std::exception_ptr error; + bool reader_done = false; + u64 chunks_read = 0; + u64 next_to_write = 0; + size_t in_flight = 0; + + auto set_error = [&](std::exception_ptr ptr) { + std::lock_guard lock(mutex); + if (!error) { + error = ptr; + } + reader_done = true; + work_ready.notify_all(); + space_ready.notify_all(); + result_ready.notify_all(); + }; + + std::thread reader([&]() { + try { + while (std::optional chunk = read_chunk()) { + std::unique_lock lock(mutex); + space_ready.wait(lock, [&]() { return error || in_flight < max_in_flight; }); + if (error) { + return; + } + + work_queue.push_back(WorkItem{chunks_read++, std::move(*chunk)}); + ++in_flight; + work_ready.notify_one(); + } + + std::lock_guard lock(mutex); + reader_done = true; + work_ready.notify_all(); + result_ready.notify_all(); + } catch (...) 
{ + set_error(std::current_exception()); + } + }); + + std::vector workers; + workers.reserve(worker_count); + for (size_t i = 0; i != worker_count; ++i) { + workers.emplace_back([&]() { + try { + while (true) { + WorkItem item; + { + std::unique_lock lock(mutex); + work_ready.wait(lock, [&]() { return error || !work_queue.empty() || reader_done; }); + if (error || (work_queue.empty() && reader_done)) { + return; + } + item = std::move(work_queue.front()); + work_queue.pop_front(); + } + + std::string packed = pack_chunk(std::move(item.chunk)); + + std::lock_guard lock(mutex); + results.emplace(item.index, std::move(packed)); + result_ready.notify_all(); + } + } catch (...) { + set_error(std::current_exception()); + } + }); + } + + try { + while (true) { + std::string packed; + { + std::unique_lock lock(mutex); + result_ready.wait(lock, [&]() { + return error || results.contains(next_to_write) || + (reader_done && next_to_write == chunks_read); + }); + + if (error) { + std::rethrow_exception(error); + } + if (reader_done && next_to_write == chunks_read) { + break; + } + + auto result = results.find(next_to_write); + packed = std::move(result->second); + results.erase(result); + --in_flight; + ++next_to_write; + space_ready.notify_one(); + } + + emit(packed); + } + } catch (...) 
{ + set_error(std::current_exception()); + } + + if (reader.joinable()) { + reader.join(); + } + for (auto& worker : workers) { + if (worker.joinable()) { + worker.join(); + } + } + + if (error) { + std::rethrow_exception(error); + } +} + +} // namespace cryfa + +#endif // CRYFA_ORDERED_PIPELINE_HPP diff --git a/src/include/plaintext_stream.hpp b/src/include/plaintext_stream.hpp new file mode 100644 index 0000000..bf822de --- /dev/null +++ b/src/include/plaintext_stream.hpp @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + +/** + * @file plaintext_stream.hpp + * @brief Plaintext stream functions + */ + +#ifndef CRYFA_PLAINTEXT_STREAM_HPP +#define CRYFA_PLAINTEXT_STREAM_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../def.hpp" + +namespace cryfa { + +class PlaintextStream { + public: + explicit PlaintextStream(size_t max_buffered = std::max(CHUNK_TARGET_SIZE * 4, + IO_BUFFER_SIZE * 4)) + : max_buffered_(max_buffered) {} + + void push(std::string_view plaintext) { + if (plaintext.empty()) { + return; + } + + std::unique_lock lock(mutex_); + space_ready_.wait(lock, [&]() { + return error_ || buffered_bytes_ + plaintext.size() <= max_buffered_ || buffered_bytes_ == 0; + }); + if (error_) { + std::rethrow_exception(error_); + } + + chunks_.emplace_back(plaintext); + buffered_bytes_ += plaintext.size(); + data_ready_.notify_all(); + } + + void close() { + std::lock_guard lock(mutex_); + done_ = true; + data_ready_.notify_all(); + } + + void fail(std::exception_ptr error) { + std::lock_guard lock(mutex_); + if (!error_) { + error_ = error; + } + done_ = true; + data_ready_.notify_all(); + space_ready_.notify_all(); + } + + auto get() -> std::optional { + std::unique_lock lock(mutex_); + data_ready_.wait(lock, [&]() { return error_ || !chunks_.empty() || done_; }); + if (error_) { + std::rethrow_exception(error_); + } + if (chunks_.empty()) { + 
return std::nullopt; + } + + return pop_front(lock); + } + + auto read_until(char delimiter, std::string& out) -> bool { + out.clear(); + while (std::optional c = get()) { + if (*c == delimiter) { + return true; + } + out += *c; + } + return false; + } + + auto read_bytes(size_t size, std::string& out) -> bool { + out.clear(); + out.reserve(size); + + size_t remaining = size; + while (remaining != 0) { + std::unique_lock lock(mutex_); + data_ready_.wait(lock, [&]() { return error_ || !chunks_.empty() || done_; }); + if (error_) { + std::rethrow_exception(error_); + } + if (chunks_.empty()) { + return false; + } + + std::string& front = chunks_.front(); + const size_t available = front.size() - front_offset_; + const size_t take = std::min(remaining, available); + out.append(front.data() + front_offset_, take); + front_offset_ += take; + buffered_bytes_ -= take; + remaining -= take; + + if (front_offset_ == front.size()) { + chunks_.pop_front(); + front_offset_ = 0; + } + + lock.unlock(); + space_ready_.notify_all(); + } + return true; + } + + private: + auto pop_front(std::unique_lock& lock) -> char { + std::string& front = chunks_.front(); + const char c = front[front_offset_++]; + --buffered_bytes_; + + if (front_offset_ == front.size()) { + chunks_.pop_front(); + front_offset_ = 0; + } + + lock.unlock(); + space_ready_.notify_all(); + return c; + } + + const size_t max_buffered_; + std::mutex mutex_; + std::condition_variable data_ready_; + std::condition_variable space_ready_; + std::deque chunks_; + std::exception_ptr error_; + size_t buffered_bytes_ = 0; + size_t front_offset_ = 0; + bool done_ = false; +}; + +} // namespace cryfa + +#endif // CRYFA_PLAINTEXT_STREAM_HPP diff --git a/src/include/string.hpp b/src/include/string.hpp index 43cb45b..d884414 100644 --- a/src/include/string.hpp +++ b/src/include/string.hpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file string.hpp - * @brief 
String format - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file string.hpp + * @brief String format */ #ifndef CRYFA_STRING_HPP @@ -11,26 +11,26 @@ #include #include +#include #include #include extern void assert_single(bool, const std::string&); /** - * @brief Convert a string to lower case - * @param s the input string + * @brief Convert a string to lower case + * @param s The input string * @return A string */ inline std::string lower_case(std::string s) { - std::transform(s.begin(), s.end(), s.begin(), - [](unsigned char c) { return std::tolower(c); }); + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); }); return s; } /** - * @brief Make a bold string, supporting different colors - * @param text the input text - * @param color the color name + * @brief Make a bold string, supporting different colors + * @param text The input text + * @param color The color name * @return A string */ inline std::string bold(const std::string& text, std::string color = "") { @@ -40,74 +40,87 @@ inline std::string bold(const std::string& text, std::string color = "") { const std::string pre = "\033[1m"; const std::string post = "\033[0m"; - if (color == "") return pre + text + post; - - std::string code = "\033[38;5;"; - if (lower_case(color) == "black") code += "0"; - if (lower_case(color) == "red") code += "1"; - if (lower_case(color) == "green") code += "2"; - if (lower_case(color) == "yellow") code += "3"; - if (lower_case(color) == "blue") code += "4"; - if (lower_case(color) == "magenta") code += "5"; - if (lower_case(color) == "cyan") code += "6"; - if (lower_case(color) == "white") code += "7"; - code += "m"; - return pre + code + text + post; + if (color.empty()) { + return std::format("{}{}{}", pre, text, post); + } + + const std::string color_l = lower_case(color); + std::string code = ""; + + if (color_l == "black") { + code = 
"\033[38;5;0m"; + } else if (color_l == "red") { + code = "\033[38;5;1m"; + } else if (color_l == "green") { + code = "\033[38;5;2m"; + } else if (color_l == "yellow") { + code = "\033[38;5;3m"; + } else if (color_l == "blue") { + code = "\033[38;5;4m"; + } else if (color_l == "magenta") { + code = "\033[38;5;5m"; + } else if (color_l == "cyan") { + code = "\033[38;5;6m"; + } else if (color_l == "white") { + code = "\033[38;5;7m"; + } else { + return std::format("{}{}{}", pre, text, post); + } + + return std::format("{}{}{}{}", pre, code, text, post); + #endif } /** - * @brief Make an italic string - * @param text the input text + * @brief Make an italic string + * @param text The input text * @return A string */ inline std::string italic(const std::string& text) { #ifdef _WIN32 return text; #else - return "\033[3m" + text + "\033[0m"; + return std::format("\033[3m{}\033[0m", text); #endif } /** - * @brief Make an underline string - * @param text the input text + * @brief Make an underline string + * @param text The input text * @return A string */ inline std::string underline(const std::string& text) { #ifdef _WIN32 return text; #else - return "\033[4m" + text + "\033[0m"; + return std::format("\033[4m{}\033[0m", text); #endif } /** - * @brief Format a string - * @param format intended format - * @param args string(s) + * @brief Format a string + * @param format Intended format + * @param args String(s) * @return A string */ template -inline static std::string string_format(const std::string& format, - Args... args) { +inline static std::string string_format(const std::string& format, Args... args) { // Extra space for '\0' auto size{size_t(snprintf(nullptr, 0, format.c_str(), args...) 
+ 1)}; std::unique_ptr buf(new char[size]); std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), - buf.get() + size - 1); // Doesn't want the '\0' inside + return std::string(buf.get(), size - 1); // Exclude the terminating '\0' } /** - * @brief Wrap a text - * @param text the text - * @param pre_str the string preceding the text - * @param width width of the text + * @brief Wrap a text + * @param text The text + * @param pre_str The string preceding the text + * @param width Width of the text * @return A string */ -inline std::string wrap_text(std::string text, std::string pre_str = "", - int width = 57) { +inline std::string wrap_text(std::string text, std::string pre_str = "", int width = 57) { std::string out = pre_str; std::string word; char last{'\0'}; @@ -115,13 +128,17 @@ inline std::string wrap_text(std::string text, std::string pre_str = "", for (auto c : text) { if (++pos == width) { - if (word.empty()) return ""; + if (word.empty()) { + return ""; + } auto p = std::end(word); while (p != std::begin(word) && *--p != ' '); - if (*p == ' ') word = std::string(++p, std::end(word)); + if (*p == ' ') { + word = std::string(++p, std::end(word)); + } - out += "\n" + pre_str + word; + out += std::format("\n{}{}", pre_str, word); pos = word.length(); word.clear(); } else if (c == ' ' && last != ' ') { @@ -138,10 +155,10 @@ inline std::string wrap_text(std::string text, std::string pre_str = "", } /** - * @brief Check if a std::string exists in a range - * @param first begin iterator of the range - * @param last end iterator of the range - * @param value the value to be found in the range + * @brief Check if a std::string exists in a range + * @param first Begin iterator of the range + * @param last End iterator of the range + * @param value The value to be found in the range * @return Yes, if it exists */ template @@ -151,16 +168,18 @@ bool exist(const Iter first, const Iter last, const T& value) { } /** - * @brief Save the 
contents of a file into a std::string - * @param fname the password file name + * @brief Save the contents of a file into a std::string + * @param fname The password file name * @return A string */ inline std::string file_to_string(const std::string& fname) { std::ifstream in(fname); std::string pass; - for (char c; in.get(c);) pass += c; + for (char c; in.get(c);) { + pass += c; + } in.close(); return pass; } -#endif // CRYFA_STRING_HPP \ No newline at end of file +#endif // CRYFA_STRING_HPP diff --git a/src/include/time.hpp b/src/include/time.hpp index c3136b5..8f83d55 100644 --- a/src/include/time.hpp +++ b/src/include/time.hpp @@ -1,46 +1,45 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file time.hpp - * @brief time-related functions - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file time.hpp + * @brief Time-related functions */ #ifndef CRYFA_TIME_HPP #define CRYFA_TIME_HPP #include +#include #include /** - * @brief Accumulate hop index values in a range - * @param first beginning of the range - * @param last end of the range - * @param init initial value - * @param h hop value - * @return A number + * @brief Get the current high-resolution time point + * @return Current time point */ - -inline static std::chrono::time_point -now() noexcept { +inline static std::chrono::time_point now() noexcept { return std::chrono::high_resolution_clock::now(); } +/** + * @brief Format an elapsed duration as a human-readable string + * @tparam Time Duration type accepted by std::chrono::duration_cast + * @param elapsed Elapsed duration + * @return Formatted elapsed time + */ template inline static std::string hms(Time elapsed) { - const auto durSec = - std::chrono::duration_cast(elapsed).count(); + const auto durSec = std::chrono::duration_cast(elapsed).count(); const auto h = durSec / 3600; const auto m = (durSec % 
3600) / 60; const auto s = durSec % 60; if (m < 1) { - return (s == 0 ? "< 1" : std::to_string(s)) + " sec.\n"; + return (s == 0) ? "< 1 sec.\n" : std::format("{} sec.\n", s); } else if (h < 1) { - return std::to_string(m) + ":" + std::to_string(s) + " min:sec.\n"; + return std::format("{}:{} min:sec.\n", m, s); } else { - return std::to_string(h) + ":" + std::to_string(m) + ":" + - std::to_string(s) + " hour:min:sec.\n"; + return std::format("{}:{}:{} hour:min:sec.\n", h, m, s); } } diff --git a/src/keygen.cpp b/src/keygen.cpp index d418e8d..82eefba 100644 --- a/src/keygen.cpp +++ b/src/keygen.cpp @@ -1,9 +1,9 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file keygen.cpp - * @brief Key generator - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file keygen.cpp + * @brief Key generator */ #include @@ -16,9 +16,9 @@ using namespace cryfa; /** - * @brief Generate a key and save in a file - * @param argc number of command line arguments - * @param argv command line arguments + * @brief Generate a key and save in a file + * @param argc Number of command line arguments + * @param argv Command line arguments * @return SUCCESS or FAILURE */ int main(int argc, char* argv[]) { @@ -29,10 +29,11 @@ int main(int argc, char* argv[]) { std::cerr << "Enter a password, then press 'Enter':\n"; std::string pass; - for (char c; std::cin.get(c) && c != '\n';) pass += c; + for (char c; std::cin.get(c) && c != '\n';) { + pass += c; + } - std::cerr << "Enter a file name to save the generated key, " - "then press 'Enter':\n"; + std::cerr << "Enter a file name to save the generated key, then press 'Enter':\n"; std::string target; for (char c; std::cin.get(c) && c != '\n';) { assert_single(c == ' ', "the file name has a space character."); @@ -42,20 +43,20 @@ int main(int argc, char* argv[]) { std::uniform_int_distribution udist(0, 255); 
rng_t rng; // Using old rand to generate the new random initSeed - srand(static_cast( - 36721 * (94583 * accum_even(pass.begin(), pass.end(), 0ul) + - 279431 * accum_odd(pass.begin(), pass.end(), 0ul)) + - 623681)); + srand(static_cast(36721 * (94583 * accum_even(pass.begin(), pass.end(), 0ul) + + 279431 * accum_odd(pass.begin(), pass.end(), 0ul)) + + 623681)); u64 initSeed = 0; - for (char c : pass) initSeed += c * rand() + rand(); + for (char c : pass) { + initSeed += c * rand() + rand(); + } rng.seed(static_cast(initSeed)); std::string key; - for (auto i = keyLen; i--;) + for (auto i = keyLen; i--;) { key += static_cast( - ((udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % - nShowable) + - firstC); + ((udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % nShowable) + firstC); + } std::ofstream keyFile(target); keyFile << key; @@ -67,4 +68,4 @@ int main(int argc, char* argv[]) { } return 0; -} \ No newline at end of file +} diff --git a/src/parser.hpp b/src/parser.hpp index 6724d48..c904148 100644 --- a/src/parser.hpp +++ b/src/parser.hpp @@ -1,15 +1,16 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file parser.hpp - * @brief Parser for command line options - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file parser.hpp + * @brief Parser for command line options */ #ifndef CRYFA_PARSER_H #define CRYFA_PARSER_H #include +#include #include #include "def.hpp" @@ -19,10 +20,10 @@ namespace cryfa { /** - * @brief Argument of a command line option - * @param first begin iterator of the range - * @param last end iterator of the range - * @param value the value to be found in the range + * @brief Argument of a command line option + * @param first Begin iterator of the range + * @param last End iterator of the range + * @param value The value to be found in the range * @return A string */ 
template @@ -32,33 +33,39 @@ inline std::string argument(Iter first, Iter last, const T& value) { /** * @brief Check password file - * @param fname the password file name + * @param fname The password file name */ inline void check_pass(const std::string& fname) { - assert_file_good(fname, - "Error opening the password file \"" + fname + "\".\n"); + assert_file_good(fname, std::format("Error opening the password file \"{}\".\n", fname)); const std::string pass = file_to_string(fname); assert_single(pass.size() < 8, "the password size must be at least 8."); } /** - * @brief Check input file format (FASTA/FASTQ/other) - * @param inFileName the file name + * @brief Check input file format (FASTA/FASTQ/other) + * @param inFileName The file name * @return A character */ inline char frmt(const std::string& inFileName) { wchar_t c; std::wifstream in(inFileName); - assert_single(!in.good(), "failed opening \"" + inFileName + "\"."); + assert_single(!in.good(), std::format("failed opening \"{}\".", inFileName)); // Skip leading blank lines or spaces - while (in.peek() == '\n' || in.peek() == ' ') in.get(c); + while (in.peek() == '\n' || in.peek() == ' ') { + in.get(c); + } // Fastq - while (in.peek() == '@') IGNORE_THIS_LINE(in); + while (in.peek() == '@') { + IGNORE_THIS_LINE(in); + } byte nTabs = 0; - while (in.get(c) && c != '\n') - if (c == '\t') ++nTabs; + while (in.get(c) && c != '\n') { + if (c == '\t') { + ++nTabs; + } + } if (in.peek() == '+') { in.close(); @@ -68,7 +75,9 @@ inline char frmt(const std::string& inFileName) { // Fasta or Not Fasta/Fastq in.clear(); in.seekg(0, std::ios::beg); // Return to beginning of the file - while (in.peek() != '>' && in.peek() != EOF) IGNORE_THIS_LINE(in); + while (in.peek() != '>' && in.peek() != EOF) { + IGNORE_THIS_LINE(in); + } if (in.peek() == '>') { in.close(); @@ -85,83 +94,79 @@ inline char frmt(const std::string& inFileName) { */ inline void show_help() { const std::string init_space = " "; - const std::string opt_space 
= init_space + " "; - std::cerr - << bold("NAME") << '\n' - << init_space << "Cryfa - a secure encryption tool for genomic data \n" - << '\n' - << bold("SYNOPSIS") << '\n' - << init_space << "./cryfa [" << underline("OPTION") << "]... -k [" - << underline("KEY_FILE") << "] [-d] [" << underline("IN_FILE") << "] > [" - << underline("OUT_FILE") << "] \n" - << '\n' - << bold("SAMPLE") << '\n' - << init_space << italic("Encrypt and Compact") - << ": ./cryfa -k pass.txt in.fq > comp \n" - << init_space << italic("Decrypt and Unpack") - << ": ./cryfa -k pass.txt -d comp > orig.fq \n" - << '\n' - << init_space << italic("Encrypt") - << ": ./cryfa -k pass.txt in > enc \n" - << init_space << italic("Decrypt") - << ": ./cryfa -k pass.txt -d enc > orig \n" - << '\n' - << bold("OPTIONS") << '\n' - << init_space << "Compact & encrypt FASTA/FASTQ files. \n" - << init_space - << "Encrypt any text-based genomic data, e.g., VCF/SAM/BAM. \n" - << '\n' - << init_space << bold("-k") << " [" << underline("KEY_FILE") << "], " - << bold("--key") << " [" << underline("KEY_FILE") << "] \n" - << opt_space << "key file name -- " << italic("MANDATORY") << '\n' - << opt_space << "The KEY_FILE should contain a password. \n" - << wrap_text( - "To make a strong password, the \"keygen\" program can be used " - "via the command \"./keygen\".", - opt_space) - << '\n' - << '\n' - << init_space << bold("-d") << ", " << bold("--dec") << '\n' - << opt_space << "decrypt & unpack \n" - << '\n' - << init_space << bold("-f") << ", " << bold("--force") << '\n' - << opt_space << "force to consider input as non-FASTA/FASTQ \n" - << wrap_text( - "Forces Cryfa not to compact, but shuffle and encrypt. 
If the " - "input is FASTA/FASTQ, it is considered as non-FASTA/FASTQ; so, " - "compaction will be ignored, but shuffling and encryption will be " - "performed.", - opt_space) - << '\n' - << '\n' - << init_space << bold("-s") << ", " << bold("--stop_shuffle") << '\n' - << opt_space << "stop shuffling the input \n" - << '\n' - << init_space << bold("-t") << " [" << underline("NUMBER") << "], " - << bold("--thread") << " [" << underline("NUMBER") << "] \n" - << opt_space << "number of threads \n" - << '\n' - << init_space << bold("-v") << ", " << bold("--verbose") << '\n' - << opt_space << "verbose mode (more information) \n" - << '\n' - << init_space << bold("-h") << ", " << bold("--help") << '\n' - << opt_space << "usage guide \n" - << '\n' - << init_space << bold("--version") << '\n' - << opt_space << "version information \n" - << '\n' - << bold("AUTHORS") << '\n' - << " Morteza Hosseini seyedmorteza@ua.pt \n" - << " Diogo Pratas pratas@ua.pt \n" - << '\n' - << bold("Warning:") << ' ' - << wrap_text( - "the maximum file size supported is 64 GB. For larger files, you " - "can split them, e.g. by \"split\" command, and encrypt each " - "chunk. After the decryption, you can concatenate the chunks, " - "e.g. by \"cat\" command.", - "", 62) - << std::endl; + const std::string opt_space = std::format("{} ", init_space); + + std::cerr << bold("NAME") << '\n' + << init_space << bold("Cryfa") << " - a secure encryption tool for genomic data \n" + << '\n' + << bold("SYNOPSIS") << '\n' + << init_space << "./cryfa [" << underline("OPTION") << "]... 
-k [" + << underline("KEY_FILE") << "] [-d] [" << underline("IN_FILE") << "] > [" + << underline("OUT_FILE") << "] \n" + << '\n' + << bold("SAMPLE") << '\n' + << init_space << italic("Encrypt and Compact") + << ": ./cryfa -k pass.txt in.fq > comp \n" + << init_space << italic("Decrypt and Unpack") + << ": ./cryfa -k pass.txt -d comp > orig.fq \n" + << '\n' + << init_space << italic("Encrypt") << ": ./cryfa -k pass.txt in > enc \n" + << init_space << italic("Decrypt") + << ": ./cryfa -k pass.txt -d enc > orig \n" + << '\n' + << bold("OPTIONS") << '\n' + << init_space << "Compact & encrypt FASTA/FASTQ files. \n" + << init_space << "Encrypt any text-based genomic data, e.g., VCF/SAM/BAM. \n" + << '\n' + << init_space << bold("-k") << " [" << underline("KEY_FILE") << "], " << bold("--key") + << " [" << underline("KEY_FILE") << "] \n" + << opt_space << "key file name -- " << italic("MANDATORY") << '\n' + << opt_space << "The KEY_FILE should contain a password. \n" + << wrap_text( + "To make a strong password, the \"keygen\" program can be used via the command " + "\"./keygen\".", + opt_space) + << '\n' + << '\n' + << init_space << bold("-d") << ", " << bold("--dec") << '\n' + << opt_space << "decrypt & unpack \n" + << '\n' + << init_space << bold("-f") << ", " << bold("--force") << '\n' + << opt_space << "force to consider input as non-FASTA/FASTQ \n" + << wrap_text( + "Forces Cryfa not to compact, but shuffle and encrypt. 
If the input is " + "FASTA/FASTQ, it is considered as non-FASTA/FASTQ; so, compaction will be " + "ignored, but shuffling and encryption will be performed.", + opt_space) + << '\n' + << '\n' + << init_space << bold("-s") << ", " << bold("--stop_shuffle") << '\n' + << opt_space << "stop shuffling the input \n" + << '\n' + << init_space << bold("-t") << " [" << underline("NUMBER") << "], " << bold("--thread") + << " [" << underline("NUMBER") << "] \n" + << opt_space << "number of threads \n" + << '\n' + << init_space << bold("-v") << ", " << bold("--verbose") << '\n' + << opt_space << "verbose mode (more information) \n" + << '\n' + << init_space << bold("-h") << ", " << bold("--help") << '\n' + << opt_space << "usage guide \n" + << '\n' + << init_space << bold("--version") << '\n' + << opt_space << "version information \n" + << '\n' + << bold("AUTHORS") << '\n' + << " Morteza Hosseini seyedmorteza.hosseini@manchester.ac.uk\n" + << " Diogo Pratas pratas@ua.pt \n" + << '\n' + << bold("Warning:") << ' ' + << wrap_text( + "the maximum file size supported is 64 GB. For larger files, you can split " + "them, e.g. by \"split\" command, and encrypt each chunk. After the decryption, " + "you can concatenate the chunks, e.g. 
by \"cat\" command.", + "", 62) + << std::endl; throw EXIT_SUCCESS; } @@ -175,11 +180,11 @@ inline void show_version() { } /** - * @brief Parse the command line options - * @param par An object to hold parameters - * @param argc Number of command line options - * @param argv Array of command line options - * @return 'c': compress+encrypt or 'd': decrypt+decompress + * @brief Parse the command line options + * @param par An object to hold parameters + * @param argc Number of command line options + * @param argv Array of command line options + * @return Operation mode: 'c' for compress+encrypt or 'd' for decrypt+decompress */ char parse(Param& par, int argc, char** argv) { if (argc < 2) show_help(); @@ -187,32 +192,34 @@ char parse(Param& par, int argc, char** argv) { par.in_file = *(argv + argc - 1); // Not standard input std::vector vArgs; vArgs.reserve(static_cast(argc)); - for (auto a = argv; a != argv + argc; ++a) + for (auto a = argv; a != argv + argc; ++a) { vArgs.emplace_back(std::string(*a)); + } // Help - if (exist(vArgs.begin(), vArgs.end(), "-h") || - exist(vArgs.begin(), vArgs.end(), "--help")) + if (exist(vArgs.begin(), vArgs.end(), "-h") || exist(vArgs.begin(), vArgs.end(), "--help")) { show_help(); + } // Version - if (exist(vArgs.begin(), vArgs.end(), "--version")) show_version(); + if (exist(vArgs.begin(), vArgs.end(), "--version")) { + show_version(); + } // Check file size for > 64 GB if (file_size(par.in_file) > (1ull << 36)) { - const std::string message = - "Size of \"" + file_name(par.in_file) + - "\" is larger than 64 GB. You can split it, e.g. by \"split\" command, " - "and encrypt each chunk. " - "After the decryption, you can concatenate the chunks, e.g. by \"cat\" " - "command."; + const std::string message = std::format( + "Size of \"{}\" is larger than 64 GB. You can split it, e.g. by \"split\" command, and " + "encrypt each chunk. After the decryption, you can concatenate the chunks, e.g. 
by \"cat\" " + "command.", + file_name(par.in_file)); error(message); } // key -- MANDATORY - assert_single(!exist(vArgs.begin(), vArgs.end(), "-k") && - !exist(vArgs.begin(), vArgs.end(), "--key"), - "no password file has been set."); + assert_single( + !exist(vArgs.begin(), vArgs.end(), "-k") && !exist(vArgs.begin(), vArgs.end(), "--key"), + "no password file has been set."); for (auto i = vArgs.begin(); i != vArgs.end(); ++i) { if (*i == "-k" || *i == "--key") { if (i + 1 != vArgs.end() && (*(i + 1))[0] != '-') { @@ -228,30 +235,31 @@ char parse(Param& par, int argc, char** argv) { for (auto i = vArgs.begin(); i != vArgs.end(); ++i) { if (*i == "-v" || *i == "--verbose") { par.verbose = true; - } else if ((*i == "-t" || *i == "--thread") && i + 1 != vArgs.end() && - (*(i + 1))[0] != '-' && is_number(*(i + 1))) + } else if ((*i == "-t" || *i == "--thread") && i + 1 != vArgs.end() && (*(i + 1))[0] != '-' && + is_number(*(i + 1))) par.n_threads = static_cast(stoi(*++i)); } // Decrypt+decompress - if (exist(vArgs.begin(), vArgs.end(), "-d") || - exist(vArgs.begin(), vArgs.end(), "--dec")) + if (exist(vArgs.begin(), vArgs.end(), "-d") || exist(vArgs.begin(), vArgs.end(), "--dec")) { return 'd'; + } // stop_shuffle, frmt for (auto i = vArgs.begin(); i != vArgs.end(); ++i) { - if (*i == "-s" || *i == "--stop_shuffle") + if (*i == "-s" || *i == "--stop_shuffle") { par.stop_shuffle = true; - else if (*i == "-f" || *i == "--force") + } else if (*i == "-f" || *i == "--force") { par.format = 'n'; + } } - if (!exist(vArgs.begin(), vArgs.end(), "-f") && - !exist(vArgs.begin(), vArgs.end(), "--force")) + if (!exist(vArgs.begin(), vArgs.end(), "-f") && !exist(vArgs.begin(), vArgs.end(), "--force")) { par.format = frmt(par.in_file); // Not standard input file + } // Compress+encrypt return 'c'; } } // namespace cryfa -#endif // CRYFA_PARSER_H \ No newline at end of file +#endif // CRYFA_PARSER_H diff --git a/src/security.cpp b/src/security.cpp index 58e11a7..af1a437 100644 --- 
a/src/security.cpp +++ b/src/security.cpp @@ -1,14 +1,13 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file security.cpp - * @brief Security - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file security.cpp + * @brief Security */ #include "security.hpp" -#include #include #include // setw, std::setprecision #include @@ -19,18 +18,43 @@ #include "cryptopp/eax.h" #include "cryptopp/files.h" #include "cryptopp/gcm.h" +#include "cryptopp/simple.h" #include "numeric.hpp" #include "string.hpp" #include "time.hpp" using namespace cryfa; std::mutex mutxSec; /**< @brief Mutex */ +std::mutex Security::derived_state_mutex; +std::unordered_map> + Security::derived_state_cache; + +namespace { +class FunctionSink : public CryptoPP::Bufferless { + public: + explicit FunctionSink(const std::function& sink) : sink_(sink) {} + + std::string AlgorithmName() const override { return "FunctionSink"; } + + size_t Put2(const CryptoPP::byte* inString, size_t length, int messageEnd, + bool blocking) override { + (void)messageEnd; + (void)blocking; + if (length != 0) { + sink_(std::string_view(reinterpret_cast(inString), length)); + } + return 0; + } + + private: + std::function sink_; +}; +} // namespace /** - * @brief Encrypt - * @details AES encryption uses a secret key of a variable length (128, 196 or - * 256 bit). This key is secretly exchanged between two parties before - * communication begins. + * @brief Encrypt + * @details AES encryption uses a secret key of a variable length (128, 196 or 256 bit). + * This key is secretly exchanged between two parties before communication begins. * * DEFAULT_KEYLENGTH = 16 bytes. 
*/ @@ -38,23 +62,15 @@ void Security::encrypt() { std::cerr << bold("[+]") << " Encrypting ..."; const auto start = now(); // Start timer - byte key[CryptoPP::AES::DEFAULT_KEYLENGTH], iv[CryptoPP::AES::BLOCKSIZE]; - std::memset(key, 0x00, (size_t)CryptoPP::AES::DEFAULT_KEYLENGTH); // AES key - std::memset(iv, 0x00, - (size_t)CryptoPP::AES::BLOCKSIZE); // Initialization Vector - - const std::string pass = file_to_string(key_file); - build_key(key, pass); - build_iv(iv, pass); + const auto state = derived_state(); try { CryptoPP::GCM::Encryption e; - e.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv)); + e.SetKeyWithIV(state->key.data(), state->key.size(), state->iv.data(), state->iv.size()); - CryptoPP::FileSource( - PCKD_FNAME.c_str(), true, - new CryptoPP::AuthenticatedEncryptionFilter( - e, new CryptoPP::FileSink(std::cout), false, TAG_SIZE)); + CryptoPP::FileSource(PCKD_FNAME.c_str(), true, + new CryptoPP::AuthenticatedEncryptionFilter( + e, new CryptoPP::FileSink(std::cout), false, TAG_SIZE)); } catch (CryptoPP::InvalidArgument& e) { std::cerr << "Caught InvalidArgument...\n" << e.what() << "\n"; } catch (CryptoPP::Exception& e) { @@ -62,19 +78,45 @@ void Security::encrypt() { } const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Encrypting done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Encrypting done in " << hms(finish - start); // Delete packed file const std::string pkdFileName = PCKD_FNAME; std::remove(pkdFileName.c_str()); } +void Security::encrypt_stream(const PlaintextProducer& produce_plaintext) { + std::cerr << bold("[+]") << " Encrypting ..."; + const auto start = now(); // Start timer + + const auto state = derived_state(); + + try { + CryptoPP::GCM::Encryption e; + e.SetKeyWithIV(state->key.data(), state->key.size(), state->iv.data(), state->iv.size()); + + CryptoPP::AuthenticatedEncryptionFilter filter(e, new CryptoPP::FileSink(std::cout), false, + TAG_SIZE); + const PlaintextSink sink = 
[&](std::string_view plaintext) { + filter.Put(reinterpret_cast(plaintext.data()), plaintext.size()); + }; + + produce_plaintext(sink); + filter.MessageEnd(); + } catch (CryptoPP::InvalidArgument& e) { + std::cerr << "Caught InvalidArgument...\n" << e.what() << "\n"; + } catch (CryptoPP::Exception& e) { + std::cerr << "Caught Exception...\n" << e.what() << "\n"; + } + + const auto finish = now(); // Stop timer + std::cerr << "\r" << bold("[+]") << " Encrypting done in " << hms(finish - start); +} + /** - * @brief Decrypt - * @details AES encryption uses a secret key of a variable length (128, 196 - * or 256 bit). This key is secretly exchanged between two parties - * before communication begins. + * @brief Decrypt + * @details AES encryption uses a secret key of a variable length (128, 192 or 256 bit). + * This key is secretly exchanged between two parties before communication begins. * * DEFAULT_KEYLENGTH = 16 bytes. */ @@ -84,57 +126,98 @@ void Security::decrypt() { std::cerr << bold("[+]") << " Decrypting ..."; const auto start = now(); // Start timer - byte key[CryptoPP::AES::DEFAULT_KEYLENGTH], iv[CryptoPP::AES::BLOCKSIZE]; - std::memset(key, 0x00, (size_t)CryptoPP::AES::DEFAULT_KEYLENGTH); // AES key - std::memset(iv, 0x00, - (size_t)CryptoPP::AES::BLOCKSIZE); // Initialization Vector - - const std::string pass = file_to_string(key_file); - build_key(key, pass); - build_iv(iv, pass); + const auto state = derived_state(); try { std::ifstream in(in_file); const char* outFile = DEC_FNAME.c_str(); CryptoPP::GCM::Decryption d; - d.SetKeyWithIV(key, sizeof(key), iv, sizeof(iv)); + d.SetKeyWithIV(state->key.data(), state->key.size(), state->iv.data(), state->iv.size()); CryptoPP::AuthenticatedDecryptionFilter df( - d, new CryptoPP::FileSink(outFile), + d, new CryptoPP::FileSink(outFile), CryptoPP::AuthenticatedDecryptionFilter::DEFAULT_FLAGS, + TAG_SIZE); + CryptoPP::FileSource(in, true, new CryptoPP::Redirector(df /*, PASS_EVERYTHING */)); + in.close(); + } catch
(CryptoPP::HashVerificationFilter::HashVerificationFailed& e) { + std::cerr << "Caught HashVerificationFailed...\n" << e.what() << "\n"; + } catch (CryptoPP::InvalidArgument& e) { + std::cerr << "Caught InvalidArgument...\n" << e.what() << "\n"; + } catch (CryptoPP::Exception& e) { + std::cerr << "Caught Exception...\n" << e.what() << "\n"; + } + + const auto finish = now(); // Stop timer + std::cerr << "\r" << bold("[+]") << " Decrypting done in " << hms(finish - start); +} + +char Security::peek_decrypted_type() { + assert_file_good(in_file); + + const auto state = derived_state(); + std::ifstream in(in_file, std::ios::binary); + char encrypted_type = 0; + if (!in.get(encrypted_type)) { + error("corrupted file."); + } + + CryptoPP::GCM::Decryption d; + d.SetKeyWithIV(state->key.data(), state->key.size(), state->iv.data(), state->iv.size()); + + CryptoPP::byte decrypted_type = 0; + d.ProcessData(&decrypted_type, reinterpret_cast(&encrypted_type), 1); + return static_cast(decrypted_type); +} + +void Security::decrypt_stream(const PlaintextSink& consume_plaintext) { + assert_file_good(in_file); + + std::cerr << bold("[+]") << " Decrypting ..."; + const auto start = now(); // Start timer + + const auto state = derived_state(); + + try { + std::ifstream in(in_file); + + CryptoPP::GCM::Decryption d; + d.SetKeyWithIV(state->key.data(), state->key.size(), state->iv.data(), state->iv.size()); + + CryptoPP::AuthenticatedDecryptionFilter df( + d, new FunctionSink(consume_plaintext), CryptoPP::AuthenticatedDecryptionFilter::DEFAULT_FLAGS, TAG_SIZE); - CryptoPP::FileSource(in, true, - new CryptoPP::Redirector(df /*, PASS_EVERYTHING */)); + CryptoPP::FileSource(in, true, new CryptoPP::Redirector(df /*, PASS_EVERYTHING */)); in.close(); } catch (CryptoPP::HashVerificationFilter::HashVerificationFailed& e) { std::cerr << "Caught HashVerificationFailed...\n" << e.what() << "\n"; + throw; } catch (CryptoPP::InvalidArgument& e) { std::cerr << "Caught InvalidArgument...\n" << 
e.what() << "\n"; + throw; } catch (CryptoPP::Exception& e) { std::cerr << "Caught Exception...\n" << e.what() << "\n"; + throw; } const auto finish = now(); // Stop timer - std::cerr << "\r" << bold("[+]") << " Decrypting done in " - << hms(finish - start); + std::cerr << "\r" << bold("[+]") << " Decrypting done in " << hms(finish - start); } /** * @brief Random number seed -- Emulate C srand() - * @param s Seed + * @param s Seed */ void Security::srandom(u32 s) { random_engine().seed(s); } /** - * @brief Random number generate -- Emulate C rand() + * @brief Random number generate -- Emulate C rand() * @return Random number */ -int Security::random() { - return (int)(random_engine()() - random_engine().min()); -} +int Security::random() { return (int)(random_engine()() - random_engine().min()); } /** - * @brief Random number engine + * @brief Random number engine * @return The classic Minimum Standard rand0 */ std::minstd_rand0& Security::random_engine() { @@ -142,121 +225,161 @@ std::minstd_rand0& Security::random_engine() { return e; } -/** - * @brief Shuffle/unshuffle seed generator -- For each chunk - */ -void Security::gen_shuff_seed() { +std::shared_ptr Security::derived_state() { + std::lock_guard cache_lock(derived_state_mutex); + const auto found = derived_state_cache.find(key_file); + if (found != derived_state_cache.end()) { + return found->second; + } + + auto state = std::make_shared(); const std::string pass = file_to_string(key_file); + std::lock_guard random_lock(mutxSec); + build_key(state->key.data(), pass); + build_iv(state->iv.data(), pass); + state->shuffle_seed = build_shuff_seed(pass); + + return derived_state_cache.emplace(key_file, state).first->second; +} + +/** + * @brief Shuffle/unshuffle seed generator + * @param pass Password + * @return Shuffle seed + */ +u64 Security::build_shuff_seed(const std::string& pass) { // Using old rand to generate the new random seed u64 seed = 0; - mutxSec.lock(); 
//----------------------------------------------------------- srandom(681493 * std::accumulate(pass.begin(), pass.end(), u32(0)) + 9148693); - for (char c : pass) seed += (u64)(c * random()); - mutxSec.unlock(); //--------------------------------------------------------- + for (char c : pass) { + seed += (u64)(c * random()); + } - seed_shared = seed; + return seed; +} + +std::shared_ptr> Security::unshuffle_positions(u64 size) { + { + std::lock_guard cache_lock(unshuffle_cache_mutex); + const auto found = unshuffle_cache.find(size); + if (found != unshuffle_cache.end()) { + return found->second; + } + } + + auto positions = std::make_shared>(size); + std::iota(positions->begin(), positions->end(), 0); + std::shuffle(positions->begin(), positions->end(), rng_t(derived_state()->shuffle_seed)); + + std::lock_guard cache_lock(unshuffle_cache_mutex); + return unshuffle_cache.emplace(size, positions).first->second; } /** * @brief Shuffle - * @param[in, out] str String to be shuffled + * @param[in,out] str String to be shuffled */ void Security::shuffle(std::string& str) { - gen_shuff_seed(); // shuffling seed - std::shuffle(str.begin(), str.end(), rng_t(seed_shared)); + std::shuffle(str.begin(), str.end(), rng_t(derived_state()->shuffle_seed)); } /** * @brief Unshuffle - * @param i Shuffled string iterator - * @param size Size of shuffled string + * @param i Shuffled string iterator + * @param size Size of shuffled string */ void Security::unshuffle(std::string::iterator& i, u64 size) { std::string shuffledStr; // Copy of shuffled std::string - for (u64 j = 0; j != size; ++j, ++i) shuffledStr += *i; + shuffledStr.reserve(size); + for (u64 j = 0; j != size; ++j, ++i) { + shuffledStr += *i; + } auto shIt = shuffledStr.begin(); i -= size; - // Shuffle vector of positions - std::vector vPos(size); - std::iota(vPos.begin(), vPos.end(), 0); // Insert 0 .. 
N-1 - gen_shuff_seed(); - std::shuffle(vPos.begin(), vPos.end(), rng_t(seed_shared)); + const auto positions = unshuffle_positions(size); // Insert unshuffled data - for (const u64& vI : vPos) *(i + vI) = *shIt++; // *shIt, then ++shIt + for (const u64& vI : *positions) { + *(i + vI) = *shIt++; // *shIt, then ++shIt + } } /** * @brief Build initialization vector (IV) for cryption - * @param iv IV - * @param pass Password + * @param iv IV + * @param pass Password */ void Security::build_iv(byte* iv, const std::string& pass) { std::uniform_int_distribution udist(0, 255); rng_t rng; // Using old rand to generate the new random seed - srandom(static_cast( - 44701 * (459229 * accum_even(pass.begin(), pass.end(), 0ul) + - 3175661 * accum_odd(pass.begin(), pass.end(), 0ul)) + - 499397)); + srandom(static_cast(44701 * (459229 * accum_even(pass.begin(), pass.end(), 0ul) + + 3175661 * accum_odd(pass.begin(), pass.end(), 0ul)) + + 499397)); u64 seed = 0; - for (char c : pass) seed += c * random() + random(); + for (char c : pass) { + seed += c * random() + random(); + } rng.seed(static_cast(seed)); - for (int i = CryptoPP::AES::BLOCKSIZE; i--;) - iv[i] = static_cast( - (udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % 255); + for (int i = CryptoPP::AES::BLOCKSIZE; i--;) { + iv[i] = static_cast((udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % 255); + } } /** * @brief Build key for cryption - * @param key Key - * @param pass password + * @param key Key + * @param pass Password */ void Security::build_key(byte* key, const std::string& pass) { std::uniform_int_distribution udist(0, 255); rng_t rng; // Using old rand to generate the new random seed - srandom(static_cast( - 24593 * (9819241 * accum_even(pass.begin(), pass.end(), 0ul) + - 2597591 * accum_odd(pass.begin(), pass.end(), 0ul)) + - 648649)); + srandom(static_cast(24593 * (9819241 * accum_even(pass.begin(), pass.end(), 0ul) + + 2597591 * accum_odd(pass.begin(), pass.end(), 0ul)) + + 
648649)); u64 seed = 0; - for (char c : pass) seed += c * random() + random(); + for (char c : pass) { + seed += c * random() + random(); + } rng.seed(static_cast(seed)); - for (int i = CryptoPP::AES::DEFAULT_KEYLENGTH; i--;) - key[i] = static_cast( - (udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % 255); + for (int i = CryptoPP::AES::DEFAULT_KEYLENGTH; i--;) { + key[i] = + static_cast((udist(rng) * accum_hops(pass.begin(), pass.end(), 0u, i + 1)) % 255); + } } #ifdef DEBUG /** * @brief Print IV - * @param iv IV + * @param iv IV */ void Security::print_iv(byte* iv) const { std::cerr << "IV = [" << (int)*iv++; - for (auto i = CryptoPP::AES::BLOCKSIZE - 1; i--;) + for (auto i = CryptoPP::AES::BLOCKSIZE - 1; i--;) { std::cerr << " " << (int)*iv++; + } std::cerr << "]\n"; } /** * @brief Print key - * @param key Key + * @param key Key */ void Security::print_key(byte* key) const { std::cerr << "Key: [" << (int)*key++; - for (auto i = CryptoPP::AES::DEFAULT_KEYLENGTH - 1; i--;) + for (auto i = CryptoPP::AES::DEFAULT_KEYLENGTH - 1; i--;) { std::cerr << " " << (int)*key++; + } std::cerr << "]\n"; } -#endif \ No newline at end of file +#endif diff --git a/src/security.hpp b/src/security.hpp index 206d001..827d29e 100644 --- a/src/security.hpp +++ b/src/security.hpp @@ -1,14 +1,23 @@ +// SPDX-FileCopyrightText: 2026 Morteza Hosseini +// SPDX-License-Identifier: GPL-3.0-only + /** - * @file security.hpp - * @brief Security - * @author Morteza Hosseini (seyedmorteza@ua.pt) - * @author Diogo Pratas (pratas@ua.pt) - * @copyright The GNU General Public License v3.0 + * @file security.hpp + * @brief Security */ #ifndef CRYFA_SECURITY_H #define CRYFA_SECURITY_H +#include +#include +#include +#include +#include +#include +#include +#include + #include "def.hpp" namespace cryfa { @@ -19,23 +28,43 @@ class Security : public Param { public: Security() = default; void decrypt(); + auto peek_decrypted_type() -> char; protected: + using PlaintextSink = std::function; + 
using PlaintextProducer = std::function; + bool shuffInProg = true; /**< @brief Shuffle in progress @hideinitializer */ bool shuffled = true; /**< @hideinitializer */ void encrypt(); + void encrypt_stream(const PlaintextProducer&); + void decrypt_stream(const PlaintextSink&); void shuffle(std::string&); void unshuffle(std::string::iterator&, u64); private: - u64 seed_shared; /**< @brief Shared seed */ - // const int TAG_SIZE = 12; /**< @brief Tag size used in GCC mode auth enc */ + static constexpr size_t AES_KEY_SIZE = 16; + static constexpr size_t AES_IV_SIZE = 16; + + struct DerivedState { + std::array key{}; + std::array iv{}; + u64 shuffle_seed = 0; + }; + + static std::mutex derived_state_mutex; + static std::unordered_map> derived_state_cache; + + std::mutex unshuffle_cache_mutex; + std::unordered_map>> unshuffle_cache; void srandom(u32); auto random() -> int; auto random_engine() -> std::minstd_rand0&; - void gen_shuff_seed(); + auto derived_state() -> std::shared_ptr; + auto build_shuff_seed(const std::string&) -> u64; + auto unshuffle_positions(u64) -> std::shared_ptr>; void build_iv(byte*, const std::string&); void build_key(byte*, const std::string&); @@ -46,4 +75,4 @@ class Security : public Param { }; } // namespace cryfa -#endif // CRYFA_SECURITY_H \ No newline at end of file +#endif // CRYFA_SECURITY_H