diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..ee1bee2
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,2 @@
+BasedOnStyle: Google
+ColumnLimit: 100
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1798a54..3bc761e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,8 +2,9 @@ name: CI
on:
push:
- branches: ['**']
+ branches: [master]
pull_request:
+ branches: [master]
jobs:
build:
diff --git a/.gitignore b/.gitignore
index d3643ad..f446f2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,3 +51,4 @@ gprof2dot.py
bin/
cryfa
keygen
+results/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9274219..80b6d96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,6 +2,54 @@ cmake_minimum_required(VERSION 4.0.0)
project(cryfa)
+# Resolve the version string that is written into cryfa/version.hpp.
+# Precedence (first match wins):
+#   1. CRYFA_VERSION_OVERRIDE, when it evaluates to true in if() (non-empty
+#      and not a CMake false constant such as 0 or OFF)
+#   2. GITHUB_REF_NAME, when GITHUB_REF_TYPE is "tag"
+#   3. `git describe`, when Git is found and ${CMAKE_SOURCE_DIR}/.git exists
+#   4. the "0.0.0-dev" default set below
+# A leading "v" is stripped from values taken from tags or `git describe`.
+set(CRYFA_VERSION_OVERRIDE "" CACHE STRING "Override the version embedded in Cryfa")
+set(CRYFA_VERSION "0.0.0-dev")
+
+if(CRYFA_VERSION_OVERRIDE)
+ set(CRYFA_VERSION "${CRYFA_VERSION_OVERRIDE}")
+elseif(DEFINED ENV{GITHUB_REF_TYPE} AND "$ENV{GITHUB_REF_TYPE}" STREQUAL "tag")
+ set(CRYFA_VERSION "$ENV{GITHUB_REF_NAME}")
+ string(REGEX REPLACE "^v" "" CRYFA_VERSION "${CRYFA_VERSION}")
+else()
+ find_package(Git QUIET)
+
+ if(Git_FOUND AND EXISTS "${CMAKE_SOURCE_DIR}/.git")
+ # First try: HEAD sits exactly on a version-like tag.
+ execute_process(
+ COMMAND "${GIT_EXECUTABLE}" describe --tags --exact-match
+ --match "v[0-9]*" --match "[0-9]*"
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE CRYFA_GIT_VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+ RESULT_VARIABLE CRYFA_GIT_VERSION_RESULT
+ )
+
+ # Fallback: describe relative to the nearest matching tag; --always falls
+ # back to an abbreviated commit id and --dirty marks a modified work tree.
+ if(NOT CRYFA_GIT_VERSION_RESULT EQUAL 0)
+ execute_process(
+ COMMAND "${GIT_EXECUTABLE}" describe --tags --dirty --always
+ --match "v[0-9]*" --match "[0-9]*"
+ WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+ OUTPUT_VARIABLE CRYFA_GIT_VERSION
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ ERROR_QUIET
+ RESULT_VARIABLE CRYFA_GIT_VERSION_RESULT
+ )
+ endif()
+
+ # Keep the default version when git failed or printed nothing.
+ if(CRYFA_GIT_VERSION_RESULT EQUAL 0 AND CRYFA_GIT_VERSION)
+ string(REGEX REPLACE "^v" "" CRYFA_VERSION "${CRYFA_GIT_VERSION}")
+ endif()
+ endif()
+endif()
+
+# Generate <build>/generated/cryfa/version.hpp from the template; @ONLY limits
+# substitution to @VAR@ references.
+set(CRYFA_GENERATED_INCLUDE_DIR "${CMAKE_CURRENT_BINARY_DIR}/generated")
+configure_file(
+ "${CMAKE_CURRENT_SOURCE_DIR}/cmake/version.hpp.in"
+ "${CRYFA_GENERATED_INCLUDE_DIR}/cryfa/version.hpp"
+ @ONLY
+)
+message(STATUS "Cryfa version: ${CRYFA_VERSION}")
+
# On MSVC x64 the cryptopp runtime uses hand-coded MASM routines
# (CPUID64, XGETBV64, Baseline_Add/Sub, SHA256/GCM/Rijndael_*_SSE2).
# Enable the MASM language so CMake can assemble x64dll.asm.
@@ -169,13 +217,19 @@ add_library(libCryfaCommon OBJECT
src/fastq.cpp
src/security.cpp
)
-target_include_directories(libCryfaCommon PRIVATE src/include)
+# CRYFA_GENERATED_INCLUDE_DIR holds the configured cryfa/version.hpp.
+target_include_directories(libCryfaCommon PRIVATE
+ src/include
+ "${CRYFA_GENERATED_INCLUDE_DIR}"
+)
target_link_libraries(libCryfaCommon PRIVATE cryptopp-dep)
add_executable(cryfa
src/cryfa.cpp
)
-target_include_directories(cryfa PRIVATE src/include)
+# CRYFA_GENERATED_INCLUDE_DIR holds the configured cryfa/version.hpp.
+target_include_directories(cryfa PRIVATE
+ src/include
+ "${CRYFA_GENERATED_INCLUDE_DIR}"
+)
target_link_libraries(cryfa PRIVATE
Threads::Threads
libCryfaCommon
@@ -185,7 +239,10 @@ target_link_libraries(cryfa PRIVATE
add_executable(keygen
src/keygen.cpp
)
-target_include_directories(keygen PRIVATE src/include)
+# CRYFA_GENERATED_INCLUDE_DIR holds the configured cryfa/version.hpp.
+target_include_directories(keygen PRIVATE
+ src/include
+ "${CRYFA_GENERATED_INCLUDE_DIR}"
+)
# ── CTest round-trip integration test ────────────────────────────────────────
enable_testing()
diff --git a/Dockerfile b/Dockerfile
index 766148e..5b381bc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,22 +1,20 @@
-# ── Stage 1: builder ─────────────────────────────────────────────────────────-
-FROM ubuntu:22.04 AS builder
-LABEL maintainer="Morteza Hosseini"
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update \
- && apt-get install -y --no-install-recommends build-essential git python3-pip \
- && pip3 install cmake \
- && rm -rf /var/lib/apt/lists/*
-
-WORKDIR /src
-COPY . .
-RUN cmake -B build -DCMAKE_BUILD_TYPE=Release \
- && cmake --build build --parallel "$(nproc)" --config Release
-
-# ── Stage 2: runtime ─────────────────────────────────────────────────────────-
-FROM debian:bookworm-slim
-
-COPY --from=builder /src/build/cryfa /usr/local/bin/cryfa
-COPY --from=builder /src/build/keygen /usr/local/bin/keygen
-
-ENTRYPOINT ["cryfa"]
+# Stage 1 (builder): install the toolchain and build cryfa and keygen in Release mode.
+FROM ubuntu:22.04 AS builder
+LABEL maintainer="Morteza Hosseini"
+
+ENV DEBIAN_FRONTEND=noninteractive
+# CMake comes from pip rather than apt; the apt package lists are removed in
+# the same layer.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends build-essential git python3-pip \
+ && pip3 install cmake \
+ && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+COPY . .
+RUN cmake -B build -DCMAKE_BUILD_TYPE=Release \
+ && cmake --build build --parallel "$(nproc)" --config Release
+
+# Stage 2 (runtime): copy only the two built binaries into a slim Debian image.
+FROM debian:bookworm-slim AS runtime
+
+COPY --from=builder /src/build/cryfa /usr/local/bin/cryfa
+COPY --from=builder /src/build/keygen /usr/local/bin/keygen
+
+ENTRYPOINT ["cryfa"]
diff --git a/README.md b/README.md
index 7791a2c..c15cb1f 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,21 @@

-[](https://anaconda.org/bioconda/cryfa)
+[](https://anaconda.org/bioconda/cryfa)
+[](https://anaconda.org/bioconda/cryfa)
[](https://github.com/cobilab/cryfa/actions/workflows/ci.yml)
-[](https://anaconda.org/bioconda/cryfa)
-[](LICENSE)
+[](LICENSE)
Cryfa is an ultrafast encryption tool specifically designed for genomic data. Besides providing robust security, it also compresses FASTA/FASTQ sequences by a factor of three, making it an efficient solution for managing genomic data.
-# Installation
+## Installation
-## Conda
+### Conda
```sh
conda install -y bioconda::cryfa
```
-## Docker
+### Docker
The image is available for **linux/amd64** and **linux/arm64** (Apple Silicon, AWS Graviton).
@@ -32,9 +32,9 @@ docker run --rm -v /path/to/data:/data smortezah/cryfa \
-k /data/pass.txt -d /data/out.crf > restored.fq
```
-## Build from source
+### Build from source
-### Linux
+#### Linux
```sh
# Install git and cmake (≥ 4.0)
@@ -48,7 +48,7 @@ cd cryfa;
sh install.sh;
```
-### macOS
+#### macOS
```sh
# Install Homebrew, git and cmake
@@ -61,7 +61,7 @@ cd cryfa;
sh install.sh;
```
-### Windows
+#### Windows
```powershell
# Install CMake and Visual Studio Build Tools (requires winget)
@@ -77,7 +77,7 @@ cd cryfa
> [!NOTE]
> Pre-compiled binaries for 64-bit Linux, macOS, and Windows are available as assets on the [Releases](https://github.com/cobilab/cryfa/releases) page.
-# Usage
+## Usage
Run Cryfa with:
@@ -102,7 +102,7 @@ A sample file, `in.fq`, is available in the `example/` directory.
> [!NOTE]
> Cryfa supports a maximum file size of 64 GB. For larger files, consider splitting them into smaller chunks, e.g. using the `split` command in Linux, and then encrypt each chunk separately. After decryption, you can reassemble the chunks using the `cat` command.
-## Input file format
+### Input file format
Cryfa identifies the format of a genomic data file by examining its content, not its extension. For instance, a FASTA file named `test` can be provided with any extension — `test`, `test.fa`, `test.fasta`, `test.fas`, `test.fsa`, etc. So, running
@@ -119,7 +119,7 @@ is equivalent to running
> [!NOTE]
> The password file can have any extension or none at all -- `pass`, `pass.txt`, `pass.dat`, etc. are all valid and yield the same result.
-## Options
+### Options
Cryfa supports the following options:
@@ -139,7 +139,7 @@ Cryfa supports the following options:
Cryfa leverages the standard output stream, allowing seamless integration with existing data processing pipelines.
-## Creating a Key File
+### Creating a Key File
There are two ways to create a `KEY_FILE` for use with `-k` / `--key`: save a raw password in a file, or use the `keygen` program to generate a strong one. The latter is strongly recommended.
@@ -182,7 +182,7 @@ The generated key will be saved to the file you specify (e.g., `key.txt`). Note
To learn more about key management (generation, exchange, storage, usage, and replacement of keys), see [[1]](https://en.wikipedia.org/wiki/Key_management), [[2]](https://info.townsendsecurity.com/definitive-guide-to-encryption-key-management-fundamentals), [[3]](https://csrc.nist.gov/projects/key-management/cryptographic-key-management-systems) and [[4]](https://www.cryptomathic.com/news-events/blog/what-is-key-management-a-ciso-perspective).
-## Benchmarking Cryfa Against Other Methods
+### Benchmarking Cryfa Against Other Methods
To benchmark Cryfa against other methods, configure the parameters in the **bench_cryfa.sh** bash script and execute it:
@@ -192,13 +192,21 @@ To benchmark Cryfa against other methods, configure the parameters in the **benc
This script automates the process of downloading datasets, installing dependencies, setting up compression and encryption tools, executing these tools, and finally, displaying the results.
-# Citation
+For quick local performance and correctness checks, use the local harness:
+
+```sh
+bash scripts/runtime/run_local_perf.sh --label local-check --input example/in.fq --target-mb 200 --threads "1 4 8" --runs 1 --modes both --no-prompt
+```
+
+The local harness expands the seed input to the requested size, measures compression and decompression, verifies every round trip with `cmp`, and writes CSV/Markdown reports under `results/local_perf/`.
+
+## Citation
If you use Cryfa in your research, please cite the following references:
- M. Hosseini, D. Pratas and A.J. Pinho, "Cryfa: a secure encryption tool for genomic data," _Bioinformatics_, vol. 35, no. 1, pp. 146--148, 2018. [DOI: 10.1093/bioinformatics/bty645](https://doi.org/10.1093/bioinformatics/bty645)
- **[OPTIONAL]** D. Pratas, M. Hosseini and A.J. Pinho, "Cryfa: a tool to compact and encrypt FASTA files," _11'th International Conference on Practical Applications of Computational Biology & Bioinformatics_ (PACBB), Springer, June 2017. [DOI: 10.1007/978-3-319-60816-7_37](https://doi.org/10.1007/978-3-319-60816-7_37)
-# License
+## License
Cryfa is licensed under the [GPLv3](http://www.gnu.org/licenses/gpl-3.0.html).
diff --git a/bench_cryfa.sh b/bench_cryfa.sh
index 6e135ce..d39689a 100644
--- a/bench_cryfa.sh
+++ b/bench_cryfa.sh
@@ -99,6 +99,20 @@ RUN_CRYFA_THR=1
# Results
RESULTS_CRYFA_THR=0
+# Run lightweight local performance harness
+RUN_LOCAL_PERF=0
+LOCAL_PERF_LABEL="baseline"
+LOCAL_PERF_COMPARE_TO=""
+LOCAL_PERF_INPUT="example/in.fq"
+LOCAL_PERF_TARGET_MB=200
+LOCAL_PERF_THREADS="1 4 8"
+LOCAL_PERF_RUNS=1
+LOCAL_PERF_MODES="default stop-shuffle"
+LOCAL_PERF_INTERACTIVE="auto"
+LOCAL_PERF_BIN="build/cryfa"
+LOCAL_PERF_KEY_FILE="pass.txt"
+LOCAL_PERF_OUT_DIR="results/local_perf"
+
# Run different methods to explore redundancy
RUN_REDUNDANCY=0 # Cryfa, DELIMINATE, MFCompress
# Dataset (FASTA) -- archaea, bacteria, fungi, plants, viruses
diff --git a/cmake/version.hpp.in b/cmake/version.hpp.in
new file mode 100644
index 0000000..93db071
--- /dev/null
+++ b/cmake/version.hpp.in
@@ -0,0 +1,10 @@
+#ifndef CRYFA_VERSION_HPP
+#define CRYFA_VERSION_HPP
+
+#include <string>
+
+namespace cryfa {
+/// Version string embedded at configure time; the CRYFA_VERSION placeholder
+/// below is substituted by CMake's configure_file().
+inline const std::string VERSION = "@CRYFA_VERSION@";
+} // namespace cryfa
+
+#endif // CRYFA_VERSION_HPP
diff --git a/install.ps1 b/install.ps1
index 008370b..e8f05ad 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1,14 +1,179 @@
# Parameters
-$BUILD_TYPE = "Release"
-$BUILD = "build"
-$PARALLEL = 8
+# $Root is the directory that contains this script.
+$Root = Split-Path -Parent $MyInvocation.MyCommand.Path
+# BUILD_TYPE, BUILD and PARALLEL may be preset through environment variables;
+# command-line options parsed further down override them.
+$BUILD_TYPE = if ($env:BUILD_TYPE) { $env:BUILD_TYPE } else { "Release" }
+$BUILD = if ($env:BUILD) { $env:BUILD } else { "build" }
+$PARALLEL = if ($env:PARALLEL) { $env:PARALLEL } else { 8 }
+# $ConfirmSettings: ask before building (set by --interactive).
+# $PromptOnStaleCache: ask before resetting a foreign build cache (cleared by --no-prompt).
+$ConfirmSettings = $false
+$PromptOnStaleCache = $true
-# Configure CMake
-cmake -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE
+# Writes a message to the host, prefixed with "[install] ".
+function Write-Log {
+    param([string]$Message)
+
+    $line = "[install] " + $Message
+    Write-Host $line
+}
-# Build
+# Returns $false when the host name matches a host this script treats as
+# non-interactive, $true otherwise.
+function Test-InteractiveHost {
+    $nonInteractiveHosts = 'ServerRemoteHost|Visual Studio Code Host'
+    $isNonInteractive = $Host.Name -match $nonInteractiveHosts
+    return -not $isNonInteractive
+}
+
+# Asks a yes/no question until a recognised answer is given.
+#   Prompt  - question text shown to the user
+#   Default - answer used when the reply is blank
+# Returns $true for y/yes and $false for n/no (case-insensitive).
+function Confirm-Choice {
+    param(
+        [string]$Prompt,
+        [string]$Default = "Y"
+    )
+
+    $affirmative = @("y", "yes")
+    $negative = @("n", "no")
+
+    while ($true) {
+        $answer = Read-Host "$Prompt [$Default]"
+        if ([string]::IsNullOrWhiteSpace($answer)) {
+            $answer = $Default
+        }
+
+        $normalized = $answer.ToLowerInvariant()
+        if ($affirmative -contains $normalized) {
+            return $true
+        }
+        if ($negative -contains $normalized) {
+            return $false
+        }
+
+        Write-Host "Please answer yes or no."
+    }
+}
+
+# Reads the source directory recorded in a CMakeCache.txt.
+#   CacheFile - path to the cache file
+# Returns the value of the last CMAKE_HOME_DIRECTORY:INTERNAL entry, or $null
+# when the file or the entry is missing.
+function Get-CacheSourceDirectory {
+    param([string]$CacheFile)
+
+    $entryPrefix = '^CMAKE_HOME_DIRECTORY:INTERNAL='
+
+    if (Test-Path $CacheFile) {
+        $lastEntry = Select-String -Path $CacheFile -Pattern $entryPrefix | Select-Object -Last 1
+        if ($null -ne $lastEntry) {
+            return $lastEntry.Line -replace $entryPrefix, ''
+        }
+    }
+
+    return $null
+}
+
+# Detects a CMakeCache.txt that was generated for a different source tree and
+# removes the stale generator state so the next configure starts clean.
+#   BuildDir - build directory to inspect
+# Reads the script-scope $Root and $PromptOnStaleCache values. Throws
+# "Installation cancelled." when the user declines the reset.
+function Ensure-FreshBuildDirectory {
+    param([string]$BuildDir)
+
+    $cacheFile = Join-Path $BuildDir "CMakeCache.txt"
+    $cacheSource = Get-CacheSourceDirectory -CacheFile $cacheFile
+    if ([string]::IsNullOrWhiteSpace($cacheSource)) {
+        return
+    }
+
+    # CMake records cache paths with forward slashes while $Root uses the
+    # native separator, so compare canonical full paths instead of raw strings.
+    $separators = [char[]]@('\', '/')
+    $cachedPath = [System.IO.Path]::GetFullPath($cacheSource).TrimEnd($separators)
+    $currentPath = [System.IO.Path]::GetFullPath($Root).TrimEnd($separators)
+    if ($cachedPath -eq $currentPath) {
+        return
+    }
+
+    Write-Log "Detected a build cache from a different source directory."
+    Write-Log "Current repo: $Root"
+    Write-Log "Cached source: $cacheSource"
+
+    if ($PromptOnStaleCache -and (Test-InteractiveHost)) {
+        if (-not (Confirm-Choice "Reset the build directory and reconfigure?" "Y")) {
+            throw "Installation cancelled."
+        }
+    }
+
+    if (-not (Test-Path $BuildDir)) {
+        return
+    }
+
+    Write-Log "Resetting stale CMake state in: $BuildDir"
+    Write-Log "Keeping reusable downloaded dependencies under $BuildDir\_deps when present"
+
+    # Only generator state is removed; everything else in the build directory stays.
+    $pathsToRemove = @(
+        (Join-Path $BuildDir "CMakeCache.txt"),
+        (Join-Path $BuildDir "CTestTestfile.cmake"),
+        (Join-Path $BuildDir "Makefile"),
+        (Join-Path $BuildDir "cmake_install.cmake"),
+        (Join-Path $BuildDir "compile_commands.json"),
+        (Join-Path $BuildDir "build.ninja"),
+        (Join-Path $BuildDir ".ninja_deps"),
+        (Join-Path $BuildDir ".ninja_log"),
+        (Join-Path $BuildDir "CMakeFiles")
+    )
+
+    foreach ($path in $pathsToRemove) {
+        if (Test-Path $path) {
+            Remove-Item -Recurse -Force $path
+        }
+    }
+}
+
+# Locates a built executable.
+#   Name - executable name without the .exe suffix
+# Looks in $BUILD first, then in $BUILD\$BUILD_TYPE, and returns the first path
+# that exists. Throws when neither location contains the file.
+function Get-BuiltExecutable {
+    param([string]$Name)
+
+    $exeName = "$Name.exe"
+    $searchDirs = @($BUILD, (Join-Path $BUILD $BUILD_TYPE))
+
+    foreach ($dir in $searchDirs) {
+        $candidate = Join-Path $dir $exeName
+        if (Test-Path $candidate) {
+            return $candidate
+        }
+    }
+
+    throw "Unable to find built executable '$exeName' in '$BUILD' or '$BUILD\$BUILD_TYPE'."
+}
+
+# Parse command-line options. --build-dir, --build-type and --parallel consume
+# the following argument as their value.
+for ($i = 0; $i -lt $args.Length; $i++) {
+    $option = $args[$i]
+
+    # Reject a value-taking option that is the last argument instead of
+    # silently assigning $null to the setting.
+    $takesValue = $option -in @("--build-dir", "--build-type", "--parallel")
+    if ($takesValue -and ($i + 1 -ge $args.Length)) {
+        throw "Option $option requires a value."
+    }
+
+    switch ($option) {
+        "--build-dir" {
+            $i++
+            $BUILD = $args[$i]
+        }
+        "--build-type" {
+            $i++
+            $BUILD_TYPE = $args[$i]
+        }
+        "--parallel" {
+            $i++
+            $PARALLEL = $args[$i]
+        }
+        "--interactive" {
+            $ConfirmSettings = $true
+        }
+        "--no-prompt" {
+            $ConfirmSettings = $false
+            $PromptOnStaleCache = $false
+        }
+        "--help" {
+            Write-Host "Usage: .\install.ps1 [--build-dir DIR] [--build-type TYPE] [--parallel N] [--interactive] [--no-prompt]"
+            exit 0
+        }
+        default {
+            throw "Unknown argument: $option"
+        }
+    }
+}
+
+# Work from the repository root so a relative build directory resolves there.
+Set-Location $Root
+
+# Optional confirmation step; skipped on hosts Test-InteractiveHost rejects.
+if ($ConfirmSettings -and (Test-InteractiveHost)) {
+ Write-Log "Planned install settings:"
+ Write-Log " Build directory: $BUILD"
+ Write-Log " Build type: $BUILD_TYPE"
+ Write-Log " Parallel jobs: $PARALLEL"
+
+ if (-not (Confirm-Choice "Continue with these settings?" "Y")) {
+ throw "Installation cancelled."
+ }
+}
+
+# "-as [int]" yields $null for non-numeric text and 0 for "0"; both are falsy
+# and therefore rejected here before the explicit range check runs.
+if (-not ($PARALLEL -as [int]) -or [int]$PARALLEL -le 0) {
+ throw "Parallel build jobs must be a positive integer."
+}
+
+Ensure-FreshBuildDirectory -BuildDir $BUILD
+
+Write-Log "Configuring CMake in `"$BUILD`""
+cmake -S $Root -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE
+# A failing native command does not stop the script by itself; check the exit
+# code so a failed configure is not followed by a build.
+if ($LASTEXITCODE -ne 0) {
+    throw "CMake configuration failed with exit code $LASTEXITCODE."
+}
+
+Write-Log "Building targets with $PARALLEL parallel jobs"
cmake --build $BUILD --parallel $PARALLEL --config $BUILD_TYPE
+# Without this check, executables left over from an earlier build could be
+# copied and reported as a successful install.
+if ($LASTEXITCODE -ne 0) {
+    throw "CMake build failed with exit code $LASTEXITCODE."
+}
-# Move executables to the main directory
-Move-Item "$BUILD\$BUILD_TYPE\cryfa.exe" -Destination .
-Move-Item "$BUILD\$BUILD_TYPE\keygen.exe" -Destination .
+Write-Log "Copying executables to the repository root"
+Copy-Item (Get-BuiltExecutable "cryfa") -Destination $Root -Force
+Copy-Item (Get-BuiltExecutable "keygen") -Destination $Root -Force
+
+Write-Log "Install complete"
diff --git a/install.sh b/install.sh
index 0fc3deb..242bb23 100644
--- a/install.sh
+++ b/install.sh
@@ -1,16 +1,196 @@
-#!/usr/bin/env bash
+#!/usr/bin/env sh
-# Parameters
-BUILD_TYPE=Release
-BUILD=build
-PARALLEL=8
+set -eu
-# Configure CMake
-cmake -B $BUILD -DCMAKE_BUILD_TYPE=$BUILD_TYPE
+ROOT_DIR=$(CDPATH= cd -- "$(dirname "$0")" && pwd)
+BUILD_TYPE=${BUILD_TYPE:-Release}
+BUILD=${BUILD:-build}
+PARALLEL=${PARALLEL:-8}
+INTERACTIVE=${INTERACTIVE:-no}
+NO_PROMPT=0
-# Build
-cmake --build $BUILD --parallel $PARALLEL --config $BUILD_TYPE
+# Prints the command-line help text to standard output.
+usage() {
+ cat <<'EOF'
+Usage: sh install.sh [options]
-# Move executables to the main directory
-mv $BUILD/cryfa .
-mv $BUILD/keygen .
+Options:
+ --build-dir DIR Build directory (default: build)
+ --build-type TYPE CMake build type (default: Release)
+ --parallel N Parallel build jobs (default: 8)
+ --interactive Confirm the resolved settings before building
+ --no-prompt Run non-interactively with the current values
+ --help Show this help text
+EOF
+}
+
+# Prints all arguments, joined by spaces, on one line prefixed with "[install]".
+log() {
+  printf '%s %s\n' '[install]' "$*"
+}
+
+fail() {
+ log "Error: $*"
+ exit 1
+}
+
+is_tty() {
+ [ -t 0 ] && [ -t 1 ]
+}
+
+confirm_yes_no() {
+ prompt_text=$1
+ default_value=$2
+ while true; do
+ printf '%s [%s]: ' "$prompt_text" "$default_value" >&2
+ IFS= read -r reply || exit 1
+ if [ -z "$reply" ]; then
+ reply=$default_value
+ fi
+ case $(printf '%s' "$reply" | tr '[:upper:]' '[:lower:]') in
+ y | yes) return 0 ;;
+ n | no) return 1 ;;
+ esac
+ printf 'Please answer yes or no.\n' >&2
+ done
+}
+
+# Validates BUILD, BUILD_TYPE and PARALLEL; calls fail on the first problem.
+validate_options() {
+  # An empty build directory would be passed to cmake as -B "".
+  [ -n "$BUILD" ] || fail "build directory must not be empty."
+  [ -n "$BUILD_TYPE" ] || fail "build type must not be empty."
+
+  case $PARALLEL in
+    '' | *[!0-9]*)
+      fail "parallel build jobs must be a positive integer."
+      ;;
+  esac
+
+  # Digits only from here on: require a non-zero digit so that "0", "00", ...
+  # are all rejected.
+  case $PARALLEL in
+    *[1-9]*) ;;
+    *)
+      fail "parallel build jobs must be greater than zero."
+      ;;
+  esac
+}
+
+# Prints the value of the last CMAKE_HOME_DIRECTORY:INTERNAL entry found in
+# the cache file given as $1; prints nothing when there is no such entry.
+cache_source_dir() {
+ cache_file=$1
+ sed -n 's/^CMAKE_HOME_DIRECTORY:INTERNAL=//p' "$cache_file" | tail -n 1
+}
+
+clear_stale_cmake_state() {
+ build_dir=$1
+ if [ ! -d "$build_dir" ]; then
+ return
+ fi
+
+ log "Resetting stale CMake state in: $build_dir"
+ log "Keeping reusable downloaded dependencies under $build_dir/_deps when present"
+
+ rm -f \
+ "$build_dir/CMakeCache.txt" \
+ "$build_dir/CTestTestfile.cmake" \
+ "$build_dir/Makefile" \
+ "$build_dir/cmake_install.cmake" \
+ "$build_dir/compile_commands.json" \
+ "$build_dir/build.ninja" \
+ "$build_dir/.ninja_deps" \
+ "$build_dir/.ninja_log"
+
+ rm -rf "$build_dir/CMakeFiles"
+}
+
+# Checks whether the CMakeCache.txt in the build directory given as $1 was
+# generated for this repository and resets the generator state when it was not.
+handle_existing_cache() {
+ build_dir=$1
+ cache_file=$build_dir/CMakeCache.txt
+
+ # Nothing to check before the first configure.
+ if [ ! -f "$cache_file" ]; then
+ return
+ fi
+
+ # A cache without a CMAKE_HOME_DIRECTORY entry yields an empty value here
+ # and is therefore treated as foreign as well.
+ cache_source=$(cache_source_dir "$cache_file")
+ if [ "$cache_source" = "$ROOT_DIR" ]; then
+ return
+ fi
+
+ log "Detected a build cache from a different source directory."
+ log "Current repo: $ROOT_DIR"
+ log "Cached source: ${cache_source:-unknown}"
+
+ # Ask only when prompting is allowed and a terminal is attached; otherwise
+ # the reset proceeds without confirmation.
+ if [ "$NO_PROMPT" -eq 0 ] && is_tty; then
+ if ! confirm_yes_no "Reset the build directory and reconfigure?" "yes"; then
+ fail "installation cancelled."
+ fi
+ fi
+
+ clear_stale_cmake_state "$build_dir"
+}
+
+# Prints the path of the built executable named $1, looking in $BUILD first
+# and then in $BUILD/$BUILD_TYPE. Calls fail when neither file exists.
+# NOTE(review): when this runs inside "$(...)", the exit performed by fail
+# ends only that subshell, so the caller has to check the result itself.
+built_executable() {
+ name=$1
+ for candidate in "$BUILD/$name" "$BUILD/$BUILD_TYPE/$name"; do
+ if [ -f "$candidate" ]; then
+ printf '%s\n' "$candidate"
+ return
+ fi
+ done
+
+ fail "unable to find built executable \"$name\" in \"$BUILD\" or \"$BUILD/$BUILD_TYPE\"."
+}
+
+# Parse command-line options. --build-dir, --build-type and --parallel take
+# the following argument as their value.
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --build-dir | --build-type | --parallel)
+      # Under `set -u`, reading a missing $2 aborts with a cryptic shell
+      # error; report the missing value explicitly instead.
+      [ "$#" -ge 2 ] || fail "option \"$1\" requires a value."
+      case "$1" in
+        --build-dir) BUILD=$2 ;;
+        --build-type) BUILD_TYPE=$2 ;;
+        --parallel) PARALLEL=$2 ;;
+      esac
+      shift 2
+      ;;
+    --interactive)
+      INTERACTIVE=yes
+      shift
+      ;;
+    --no-prompt)
+      INTERACTIVE=no
+      NO_PROMPT=1
+      shift
+      ;;
+    -h | --help)
+      usage
+      exit 0
+      ;;
+    *)
+      fail "unknown argument \"$1\"."
+      ;;
+  esac
+done
+
+# Work from the repository root so a relative build directory resolves there.
+cd "$ROOT_DIR"
+
+# Optional confirmation step; needs --interactive, no --no-prompt, and a terminal.
+if [ "$INTERACTIVE" = "yes" ] && [ "$NO_PROMPT" -eq 0 ] && is_tty; then
+  log "Planned install settings:"
+  log " Build directory: $BUILD"
+  log " Build type: $BUILD_TYPE"
+  log " Parallel jobs: $PARALLEL"
+  if ! confirm_yes_no "Continue with these settings?" "yes"; then
+    fail "installation cancelled."
+  fi
+fi
+
+validate_options
+handle_existing_cache "$BUILD"
+
+log "Configuring CMake in \"$BUILD\""
+cmake -S "$ROOT_DIR" -B "$BUILD" -DCMAKE_BUILD_TYPE="$BUILD_TYPE"
+
+log "Building targets with $PARALLEL parallel jobs"
+cmake --build "$BUILD" --parallel "$PARALLEL" --config "$BUILD_TYPE"
+
+log "Copying executables to the repository root"
+# Resolve each binary before copying: a failure inside "$(...)" used directly
+# as a cp operand is not seen by `set -e`, and anything the helper printed
+# would become the operand.
+for exe_name in cryfa keygen; do
+  if ! exe_path=$(built_executable "$exe_name"); then
+    # Relay whatever the helper wrote to stdout before giving up.
+    [ -z "$exe_path" ] || printf '%s\n' "$exe_path" >&2
+    exit 1
+  fi
+  cp "$exe_path" "$ROOT_DIR/$exe_name"
+done
+
+log "Install complete"
diff --git a/scripts/config/par.sh b/scripts/config/par.sh
index 70e4f2b..4f5043a 100644
--- a/scripts/config/par.sh
+++ b/scripts/config/par.sh
@@ -18,7 +18,7 @@ scripts_results="$scripts/results"
dataset="dataset"
redun="Redundancy"
progs="progs"
-result="result"
+result="results"
details="details"
FA="FA"
FQ="FQ"
diff --git a/scripts/orchestration/benchmark_orchestrator.sh b/scripts/orchestration/benchmark_orchestrator.sh
index 782b48f..b5c6ac4 100644
--- a/scripts/orchestration/benchmark_orchestrator.sh
+++ b/scripts/orchestration/benchmark_orchestrator.sh
@@ -253,6 +253,56 @@ function run_benchmark_cryfa_threads {
fi
}
+function run_benchmark_local_perf {
+ if [[ $RUN_LOCAL_PERF -ne 1 ]]; then
+ return
+ fi
+
+ ensureDir $result
+
+ local label=${LOCAL_PERF_LABEL:-baseline}
+ local compare_to=${LOCAL_PERF_COMPARE_TO:-}
+ local input=${LOCAL_PERF_INPUT:-example/in.fq}
+ local target_mb=${LOCAL_PERF_TARGET_MB:-200}
+ local threads=${LOCAL_PERF_THREADS:-1\ 4\ 8}
+ local runs=${LOCAL_PERF_RUNS:-1}
+ local modes=${LOCAL_PERF_MODES:-default\ stop-shuffle}
+ local interactive_mode=${LOCAL_PERF_INTERACTIVE:-auto}
+ local bin=${LOCAL_PERF_BIN:-build/cryfa}
+ local key_file=${LOCAL_PERF_KEY_FILE:-pass.txt}
+ local out_dir=${LOCAL_PERF_OUT_DIR:-results/local_perf}
+ local compare_args=()
+ local prompt_args=()
+ if [[ -n $compare_to ]]; then
+ compare_args=(--compare-to "$compare_to")
+ fi
+ case "$interactive_mode" in
+ yes)
+ prompt_args=(--interactive)
+ ;;
+ no)
+ prompt_args=(--no-prompt)
+ ;;
+ esac
+
+ echo "[local_perf] Starting local performance harness..."
+ echo "[local_perf] Base label: $label"
+ echo "[local_perf] Output dir: $out_dir"
+
+ bash "$scripts_runtime/run_local_perf.sh" \
+ --label "$label" \
+ "${compare_args[@]}" \
+ "${prompt_args[@]}" \
+ --bin "$bin" \
+ --key-file "$key_file" \
+ --input "$input" \
+ --out-dir "$out_dir" \
+ --target-mb "$target_mb" \
+ --threads "$threads" \
+ --runs "$runs" \
+ --modes "$modes"
+}
+
function run_benchmark_redundancy {
if [[ $RUN_REDUNDANCY -ne 1 ]]; then
return
@@ -278,5 +328,6 @@ function run_benchmark {
run_benchmark_encryption
run_benchmark_compression_encryption
run_benchmark_cryfa_threads || return 1
+ run_benchmark_local_perf || return 1
run_benchmark_redundancy
}
diff --git a/scripts/runtime/run_local_perf.sh b/scripts/runtime/run_local_perf.sh
new file mode 100644
index 0000000..14a0ef1
--- /dev/null
+++ b/scripts/runtime/run_local_perf.sh
@@ -0,0 +1,770 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)
+
+LABEL=${LOCAL_PERF_LABEL:-}
+COMPARE_TO=${LOCAL_PERF_COMPARE_TO:-}
+BIN=${LOCAL_PERF_BIN:-build/cryfa}
+KEY_FILE=${LOCAL_PERF_KEY_FILE:-pass.txt}
+INPUT=${LOCAL_PERF_INPUT:-example/in.fq}
+OUT_DIR=${LOCAL_PERF_OUT_DIR:-results/local_perf}
+TARGET_MB=${LOCAL_PERF_TARGET_MB:-200}
+THREADS=${LOCAL_PERF_THREADS:-1 4 8}
+RUNS=${LOCAL_PERF_RUNS:-1}
+MODES=${LOCAL_PERF_MODES:-default stop-shuffle}
+INTERACTIVE=${LOCAL_PERF_INTERACTIVE:-auto}
+
+RUN_TIMESTAMP=""
+LABEL_BASE=""
+RUN_LABEL=""
+LABEL_SAFE=""
+DATASET=""
+INPUT_BYTES=0
+COMPARE_DIR=""
+COMPARE_LABEL=""
+RAW_CSV=""
+SUMMARY_CSV=""
+SUMMARY_MD=""
+COMPARE_CSV=""
+COMPARE_MD=""
+
+function usage {
+ cat <<'EOF'
+Usage: bash scripts/runtime/run_local_perf.sh [options]
+
+Options:
+ --label NAME Base label for this run; a timestamp is always appended
+ --compare-to NAME Compare against an exact prior run label or a label prefix
+ --bin PATH Cryfa binary path (default: build/cryfa)
+ --key-file PATH Key file path (default: pass.txt)
+ --input PATH Seed input or dataset path (default: example/in.fq)
+ --out-dir PATH Output folder (default: results/local_perf)
+ --target-mb N Expand the seed input to at least N MiB (default: 200)
+ --threads "LIST" Thread counts to test (default: "1 4 8")
+ --runs N Runs per configuration (default: 1)
+ --modes "LIST" Modes: default, stop-shuffle, or both
+ --interactive Prompt for the main run options before starting
+ --no-prompt Run non-interactively with the current values
+ --help Show this help text
+EOF
+}
+
+function timestamp {
+ date '+%Y-%m-%d %H:%M:%S'
+}
+
+# Prints all arguments, joined by spaces, on one line prefixed with the
+# current timestamp in brackets.
+function log {
+  local message="$*"
+  local now
+  now=$(timestamp)
+  printf '[%s] %s\n' "$now" "$message"
+}
+
+# Same output as a timestamped log line, preceded by one blank line.
+function log_section {
+  local heading="$*"
+  local now
+  now=$(timestamp)
+  printf '\n'
+  printf '[%s] %s\n' "$now" "$heading"
+}
+
+# Reports an error and terminates with exit status 1.
+# The message goes to stderr so it is not captured when a helper that prints
+# its result on stdout is run inside a command substitution.
+function fail {
+  log "Error: $*" >&2
+  exit 1
+}
+
+# Succeeds only when both standard input and standard output are terminals.
+function is_tty {
+  [[ -t 0 ]] && [[ -t 1 ]]
+}
+
+function prompt_with_default {
+ local prompt_text=$1
+ local default_value=$2
+ local reply
+ printf '%s [%s]: ' "$prompt_text" "$default_value" >&2
+ IFS= read -r reply || exit 1
+ if [[ -n $reply ]]; then
+ printf '%s\n' "$reply"
+ else
+ printf '%s\n' "$default_value"
+ fi
+}
+
+function confirm_yes_no {
+ local prompt_text=$1
+ local default_value=$2
+ local reply
+ local normalized
+
+ while true; do
+ printf '%s [%s]: ' "$prompt_text" "$default_value" >&2
+ IFS= read -r reply || exit 1
+ if [[ -z $reply ]]; then
+ reply=$default_value
+ fi
+ normalized=$(printf '%s' "$reply" | tr '[:upper:]' '[:lower:]')
+ case "$normalized" in
+ y | yes)
+ return 0
+ ;;
+ n | no)
+ return 1
+ ;;
+ esac
+ printf 'Please answer yes or no.\n' >&2
+ done
+}
+
+# Prints $1 with every character outside A-Z, a-z, 0-9, '.', '_' and '-'
+# replaced by an underscore. No trailing newline is added.
+function sanitize_name {
+  local raw_name=$1
+  local allowed='A-Za-z0-9._-'
+  printf '%s' "$raw_name" | tr -c "$allowed" '_'
+}
+
+function resolve_path {
+ local path=$1
+ if [[ $path = /* ]]; then
+ printf '%s\n' "$path"
+ else
+ printf '%s/%s\n' "$ROOT_DIR" "$path"
+ fi
+}
+
+# Prints the size in bytes of the file named by $1.
+function file_size_bytes {
+  local target=$1
+  # awk strips the padding some wc implementations put before the count.
+  wc -c <"$target" |
+    awk '{print $1}'
+}
+
+# Prints "<first>:<second>" built from the first two fields of the cksum
+# output for the file named by $1.
+function file_checksum {
+  local target=$1
+  cksum "$target" |
+    awk '{print $1 ":" $2}'
+}
+
+function cache_key {
+ printf '%s' "$1" | cksum | awk '{print $1}'
+}
+
+function format_mib {
+ awk -v bytes="$1" 'BEGIN { printf "%.2f", bytes / 1048576.0 }'
+}
+
+function mib_per_second {
+ awk -v bytes="$1" -v seconds="$2" 'BEGIN {
+ if (seconds + 0 <= 0) {
+ printf "0.00"
+ } else {
+ printf "%.2f", (bytes / 1048576.0) / seconds
+ }
+ }'
+}
+
+# Prints the second field of the last line in file $2 whose first field
+# equals $1.
+function parse_time_value {
+  local field_name=$1
+  local timing_log=$2
+  awk -v key="$field_name" '$1 == key { print $2 }' "$timing_log" |
+    tail -n 1
+}
+
+# Calls fail unless the path $1 exists; $2 names the item in the error message.
+function require_file {
+  local target=$1
+  local description=$2
+  [[ -e $target ]] || fail "$description \"$target\" does not exist."
+}
+
+function is_probably_script_executable {
+ local path=$1
+ if ! command -v file >/dev/null 2>&1; then
+ return 1
+ fi
+
+ file "$path" | grep -qi 'script'
+}
+
+function resolve_existing_binary {
+ local requested=$1
+ local candidate
+
+ if [[ -x $requested ]] && ! is_probably_script_executable "$requested"; then
+ printf '%s\n' "$requested"
+ return
+ fi
+
+ for candidate in "$ROOT_DIR/build/cryfa" "$ROOT_DIR/cryfa"; do
+ if [[ -x $candidate ]] && ! is_probably_script_executable "$candidate"; then
+ printf '%s\n' "$candidate"
+ return
+ fi
+ done
+
+ fail "Cryfa binary \"$requested\" does not exist."
+}
+
+function csv_escape {
+ local value=$1
+ value=${value//$'\r'/}
+ if [[ $value == *','* || $value == *'"'* || $value == *$'\n'* ]]; then
+ value=${value//\"/\"\"}
+ printf '"%s"' "$value"
+ else
+ printf '%s' "$value"
+ fi
+}
+
+# Appends one CSV row to the file $1; every remaining argument becomes a field
+# escaped with csv_escape.
+function append_csv_row {
+  local destination=$1
+  shift
+  local separator=''
+  local cell
+
+  {
+    for cell in "$@"; do
+      # Empty before the first field, a comma before every later one.
+      printf '%s' "$separator"
+      csv_escape "$cell"
+      separator=','
+    done
+    printf '\n'
+  } >>"$destination"
+}
+
+function convert_tsv_file_to_csv {
+ local input_file=$1
+ local output_file=$2
+ awk -F '\t' '
+ {
+ for (i = 1; i <= NF; ++i) {
+ gsub(/\r/, "", $i)
+ gsub(/"/, "\"\"", $i)
+ if ($i ~ /[",\n]/) {
+ $i = "\"" $i "\""
+ }
+ }
+ for (i = 1; i <= NF; ++i) {
+ printf "%s", $i
+ if (i < NF) {
+ printf ","
+ }
+ }
+ printf "\n"
+ }
+ ' "$input_file" >"$output_file"
+}
+
+function ask_for_options {
+ log "Interactive benchmark setup"
+ LABEL=$(prompt_with_default "Base label" "${LABEL:-baseline}")
+ INPUT=$(prompt_with_default "Input file" "$INPUT")
+ TARGET_MB=$(prompt_with_default "Target dataset size in MiB (0 keeps the input as-is)" "$TARGET_MB")
+ THREADS=$(prompt_with_default "Thread counts (space or comma separated)" "$THREADS")
+ RUNS=$(prompt_with_default "Runs per case" "$RUNS")
+ MODES=$(prompt_with_default "Modes (default, stop-shuffle, both)" "$MODES")
+ COMPARE_TO=$(prompt_with_default "Compare to previous label (leave empty to skip)" "$COMPARE_TO")
+}
+
+function build_run_label {
+ RUN_TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
+ LABEL_BASE=${LABEL:-baseline}
+ LABEL_BASE=${LABEL_BASE//$'\r'/}
+ LABEL_BASE=${LABEL_BASE//$'\n'/ }
+ LABEL_BASE=${LABEL_BASE//,/ _}
+ LABEL_BASE=${LABEL_BASE//\"/_}
+ RUN_LABEL="${LABEL_BASE}_${RUN_TIMESTAMP}"
+ LABEL_SAFE=$(sanitize_name "$RUN_LABEL")
+}
+
+function build_dataset {
+ local seed_bytes
+ local source_path
+ local source_checksum
+ local source_key
+ seed_bytes=$(file_size_bytes "$INPUT")
+ source_path=$(resolve_path "$INPUT")
+
+ if (( TARGET_MB <= 0 )) || (( seed_bytes >= TARGET_MB * 1024 * 1024 )); then
+ DATASET=$INPUT
+ log "Using input file directly as benchmark dataset: $DATASET ($(format_mib "$seed_bytes") MiB)"
+ return
+ fi
+
+ local target_bytes=$((TARGET_MB * 1024 * 1024))
+ local copies=$(((target_bytes + seed_bytes - 1) / seed_bytes))
+ local dataset_dir="$OUT_DIR/datasets"
+ local base_name
+ local ext
+ local tmp
+
+ mkdir -p "$dataset_dir"
+ base_name=$(basename "${INPUT%.*}")
+ ext=${INPUT##*.}
+ if [[ $ext == "$INPUT" ]]; then
+ ext="dat"
+ fi
+
+ source_checksum=$(file_checksum "$INPUT")
+ source_key=$(cache_key "${source_path}:${source_checksum}")
+ DATASET="$dataset_dir/${base_name}_${source_key}_${TARGET_MB}mb_x${copies}.${ext}"
+
+ if [[ -f $DATASET && -f $DATASET.meta ]] &&
+ grep -Fqx "dataset_format=2" "$DATASET.meta" &&
+ grep -Fqx "source=$source_path" "$DATASET.meta" &&
+ grep -Fqx "source_checksum=$source_checksum" "$DATASET.meta" &&
+ grep -Fqx "seed_bytes=$seed_bytes" "$DATASET.meta" &&
+ grep -Fqx "copies=$copies" "$DATASET.meta" &&
+ grep -Fqx "target_mb=$TARGET_MB" "$DATASET.meta"; then
+ log "Reusing cached generated dataset: $DATASET ($(format_mib "$(file_size_bytes "$DATASET")") MiB)"
+ return
+ fi
+
+ if (( copies > 1 )) && ! command -v perl >/dev/null 2>&1; then
+ fail "perl is required to expand \"$INPUT\" into a local dataset."
+ fi
+
+ tmp="$DATASET.tmp"
+ rm -f "$tmp"
+
+ log "Generating synthetic benchmark dataset from $INPUT"
+ log "Seed size: $(format_mib "$seed_bytes") MiB | Copies: $copies | Target: ${TARGET_MB} MiB+"
+
+ if (( copies == 1 )); then
+ cp "$INPUT" "$tmp"
+ else
+ perl -e '
+ use strict;
+ use warnings;
+
+ my ($input, $copies, $output) = @ARGV;
+ open my $in, "<", $input or die "Unable to open $input: $!";
+ binmode $in;
+ local $/;
+ my $chunk = <$in>;
+ close $in;
+ my $terminator = length($chunk) && substr($chunk, -1) eq "\n" ? "" : "\n";
+
+ open my $out, ">", $output or die "Unable to open $output: $!";
+ binmode $out;
+ for (1 .. $copies) {
+ print {$out} $chunk;
+ print {$out} $terminator if length($terminator);
+ }
+ close $out or die "Unable to write $output: $!";
+ ' "$INPUT" "$copies" "$tmp"
+ fi
+
+ mv "$tmp" "$DATASET"
+ cat >"$DATASET.meta" <"$compressed"; } 2>"$c_log"; then
+ fail "compression failed for mode=$mode thread=$thread run=$run_index. See $c_log"
+ fi
+
+ decompress_cmd=("$BIN" -k "$KEY_FILE" -t "$thread" -d "$compressed")
+ log "Decompress command: ${decompress_cmd[*]}"
+ if ! { time -p "${decompress_cmd[@]}" >"$decompressed"; } 2>"$d_log"; then
+ fail "decompression failed for mode=$mode thread=$thread run=$run_index. See $d_log"
+ fi
+
+ if ! cmp -s "$DATASET" "$decompressed"; then
+ fail "round-trip mismatch for mode=$mode thread=$thread run=$run_index."
+ fi
+
+ local compressed_bytes
+ local c_real
+ local c_user
+ local c_sys
+ local d_real
+ local d_user
+ local d_sys
+ local c_rate
+ local d_rate
+
+ compressed_bytes=$(file_size_bytes "$compressed")
+ c_real=$(parse_time_value real "$c_log")
+ c_user=$(parse_time_value user "$c_log")
+ c_sys=$(parse_time_value sys "$c_log")
+ d_real=$(parse_time_value real "$d_log")
+ d_user=$(parse_time_value user "$d_log")
+ d_sys=$(parse_time_value sys "$d_log")
+ c_rate=$(mib_per_second "$INPUT_BYTES" "$c_real")
+ d_rate=$(mib_per_second "$INPUT_BYTES" "$d_real")
+
+ append_csv_row "$RAW_CSV" \
+ "$RUN_LABEL" "$mode" "$thread" "$run_index" "$INPUT_BYTES" "$compressed_bytes" \
+ "$c_real" "$c_user" "$c_sys" "$c_rate" \
+ "$d_real" "$d_user" "$d_sys" "$d_rate" "ok"
+
+ log "Case complete: compressed $(format_mib "$compressed_bytes") MiB | c=${c_real}s (${c_rate} MiB/s) | d=${d_real}s (${d_rate} MiB/s) | verified=ok"
+
+ rm -f "$compressed" "$decompressed"
+ done
+}
+
+# Aggregates the raw per-run CSV into the summary CSV: a header row followed by
+# one averaged row per (mode, thread) case, in $MODES / $THREADS order.
+function write_summary_csv {
+  local -a header=(
+    label mode threads runs input_bytes compressed_bytes
+    avg_c_real avg_c_user avg_c_sys avg_c_mib_s
+    avg_d_real avg_d_user avg_d_sys avg_d_mib_s verified
+  )
+  append_csv_row "$SUMMARY_CSV" "${header[@]}"
+
+  # Raw columns 7-14 are the timings/rates to average; column 15 is the
+  # per-run verification flag. Byte counts are taken from the last matching row.
+  local averager='
+    NR > 1 && $2 == mode && $3 == thread {
+      runs++
+      in_bytes = $5
+      out_bytes = $6
+      for (col = 7; col <= 14; col++) total[col] += $col
+      if ($15 != "ok") failed = 1
+    }
+    END {
+      if (runs > 0) {
+        printf "%s,%s,%s,%d,%s,%s,%.6f,%.6f,%.6f,%.2f,%.6f,%.6f,%.6f,%.2f,%s\n",
+          label, mode, thread, runs, in_bytes, out_bytes,
+          total[7] / runs, total[8] / runs, total[9] / runs, total[10] / runs,
+          total[11] / runs, total[12] / runs, total[13] / runs, total[14] / runs,
+          (failed ? "FAIL" : "ok")
+      }
+    }
+  '
+
+  local mode_name
+  local thread_count
+  for mode_name in $MODES; do
+    for thread_count in $THREADS; do
+      awk -F ',' -v label="$RUN_LABEL" -v mode="$mode_name" -v thread="$thread_count" \
+        "$averager" "$RAW_CSV" >>"$SUMMARY_CSV"
+    done
+  done
+}
+
+# Renders the summary CSV as a Markdown report: run metadata as a bullet list,
+# then one table row per summary line (header row skipped).
+function write_summary_markdown {
+  # Columns: 2=mode 3=threads 4=runs 5=input bytes 6=compressed bytes
+  # 7/11=avg compress/decompress seconds, 10/14=avg MiB/s, 15=verified.
+  local table_rows='
+    NR > 1 {
+      printf "| %s | %s | %s | %.2f | %.3f | %.3f | %.2f | %.3f | %.2f | %s |\n",
+        $2, $3, $4, $6 / 1048576.0, $6 / $5, $7, $10, $11, $14, $15
+    }
+  '
+
+  {
+    printf '%s\n' \
+      "# Local Performance Summary" \
+      "" \
+      "- Label: $RUN_LABEL" \
+      "- Dataset: $DATASET" \
+      "- Dataset size: $(format_mib "$INPUT_BYTES") MiB" \
+      "- Runs per case: $RUNS" \
+      "- Threads: $THREADS" \
+      "- Modes: $MODES" \
+      "- Raw data: $(basename "$RAW_CSV")" \
+      "- Summary data: $(basename "$SUMMARY_CSV")" \
+      "" \
+      "| Mode | Threads | Runs | Compressed Size (MiB) | Ratio | Compress Time (s) | Compress MiB/s | Decompress Time (s) | Decompress MiB/s | Verified |" \
+      "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |"
+    awk -F ',' "$table_rows" "$SUMMARY_CSV"
+  } >"$SUMMARY_MD"
+}
+
+# Prints the path of the summary file inside a baseline run directory,
+# preferring summary.csv over summary.tsv; fails when neither exists.
+function summary_input_for_compare {
+  local baseline_dir=$1
+  local candidate
+
+  for candidate in "$baseline_dir/summary.csv" "$baseline_dir/summary.tsv"; do
+    if [[ -f $candidate ]]; then
+      printf '%s\n' "$candidate"
+      return
+    fi
+  done
+
+  fail "comparison summary was not found in \"$baseline_dir\"."
+}
+
+# Writes compare.csv and compare.md relating this run's summary to a baseline
+# run's summary. Does nothing when no baseline directory ($COMPARE_DIR) is set.
+# Rows are matched on (mode, threads); cases missing from the baseline are
+# skipped. A legacy summary.tsv baseline is converted to CSV in a temp file.
+function write_compare_report {
+  if [[ -z $COMPARE_DIR ]]; then
+    return
+  fi
+
+  local baseline_summary
+  local baseline_csv
+  # summary_input_for_compare runs in a subshell, so its `fail` cannot stop this
+  # shell on its own; propagate the failure explicitly.
+  baseline_summary=$(summary_input_for_compare "$COMPARE_DIR") || exit 1
+
+  append_csv_row "$COMPARE_CSV" \
+    mode threads compress_speedup decompress_speedup roundtrip_speedup size_ratio_delta baseline_verified current_verified
+
+  if [[ $baseline_summary == *.tsv ]]; then
+    # Keep the X's at the very end of the template: BSD/macOS mktemp only
+    # substitutes trailing X's, so a ".csv" suffix would yield a fixed file name.
+    baseline_csv=$(mktemp "${TMPDIR:-/tmp}/cryfa-compare-baseline.XXXXXX") ||
+      fail "unable to create a temporary file for the baseline summary."
+    convert_tsv_file_to_csv "$baseline_summary" "$baseline_csv"
+  else
+    baseline_csv=$baseline_summary
+  fi
+
+  # First file = baseline, second file = current summary. Every divisor is
+  # guarded so a zero time or a zero input size yields 0 instead of aborting awk.
+  awk -F ',' '
+    NR == FNR {
+      if (FNR == 1) next
+      key = $2 SUBSEP $3
+      base_c[key] = $7
+      base_d[key] = $11
+      base_ratio[key] = ($5 + 0 > 0) ? $6 / $5 : 0
+      base_verified[key] = $15
+      next
+    }
+    FNR == 1 {
+      next
+    }
+    {
+      key = $2 SUBSEP $3
+      if (!(key in base_c)) next
+      compress_speedup = ($7 + 0 > 0) ? base_c[key] / $7 : 0
+      decompress_speedup = ($11 + 0 > 0) ? base_d[key] / $11 : 0
+      roundtrip_speedup = (($7 + $11) + 0 > 0) ? (base_c[key] + base_d[key]) / ($7 + $11) : 0
+      current_ratio = ($5 + 0 > 0) ? $6 / $5 : 0
+      ratio_delta = current_ratio - base_ratio[key]
+      printf "%s,%s,%.3f,%.3f,%.3f,%.6f,%s,%s\n",
+        $2, $3, compress_speedup, decompress_speedup, roundtrip_speedup, ratio_delta,
+        base_verified[key], $15
+    }
+  ' "$baseline_csv" "$SUMMARY_CSV" >>"$COMPARE_CSV"
+
+  # Only the converted copy is ours to delete; never remove the baseline itself.
+  if [[ $baseline_summary == *.tsv ]]; then
+    rm -f "$baseline_csv"
+  fi
+
+  {
+    echo "# Before/After Comparison"
+    echo
+    echo "- Before: $COMPARE_LABEL"
+    echo "- After: $RUN_LABEL"
+    echo "- Speedup values above 1.00 are faster than the baseline."
+    echo "- Negative size deltas mean the new run produced smaller output."
+    echo "- Comparison data: $(basename "$COMPARE_CSV")"
+    echo
+    echo "| Mode | Threads | Compress Speedup | Decompress Speedup | Round-Trip Speedup | Size Ratio Delta | Baseline Verified | Current Verified |"
+    echo "| --- | ---: | ---: | ---: | ---: | ---: | --- | --- |"
+    awk -F ',' '
+      NR > 1 {
+        printf "| %s | %s | %.3f | %.3f | %.3f | %.6f | %s | %s |\n",
+          $1, $2, $3, $4, $5, $6, $7, $8
+      }
+    ' "$COMPARE_CSV"
+  } >"$COMPARE_MD"
+}
+
+# Command-line parsing. Options that take a value consume two arguments;
+# flags consume one. Unknown arguments print the usage text and exit 1.
+while [[ $# -gt 0 ]]; do
+  # A value-taking option given as the last argument has nothing to consume:
+  # `shift 2` would then fail without shifting and the loop would never end.
+  # Reject that case up front with a clear message.
+  case "$1" in
+  --label | --compare-to | --bin | --key-file | --input | --out-dir | --target-mb | --threads | --runs | --modes)
+    if [[ $# -lt 2 ]]; then
+      echo "Error: option \"$1\" requires a value." >&2
+      usage >&2
+      exit 1
+    fi
+    ;;
+  esac
+
+  case "$1" in
+  --label)
+    LABEL=$2
+    shift 2
+    ;;
+  --compare-to)
+    COMPARE_TO=$2
+    shift 2
+    ;;
+  --bin)
+    BIN=$2
+    shift 2
+    ;;
+  --key-file)
+    KEY_FILE=$2
+    shift 2
+    ;;
+  --input)
+    INPUT=$2
+    shift 2
+    ;;
+  --out-dir)
+    OUT_DIR=$2
+    shift 2
+    ;;
+  --target-mb)
+    TARGET_MB=$2
+    shift 2
+    ;;
+  --threads)
+    THREADS=$2
+    shift 2
+    ;;
+  --runs)
+    RUNS=$2
+    shift 2
+    ;;
+  --modes)
+    MODES=$2
+    shift 2
+    ;;
+  --interactive)
+    INTERACTIVE=yes
+    shift
+    ;;
+  --no-prompt)
+    INTERACTIVE=no
+    shift
+    ;;
+  -h | --help)
+    usage
+    exit 0
+    ;;
+  *)
+    echo "Error: unknown argument \"$1\"." >&2
+    usage >&2
+    exit 1
+    ;;
+  esac
+done
+
+# "auto" becomes interactive only when attached to a terminal.
+if [[ $INTERACTIVE == auto ]] && is_tty; then
+  INTERACTIVE=yes
+fi
+
+# Let the user adjust options before they are normalised and validated.
+if [[ $INTERACTIVE == yes ]] && is_tty; then
+  ask_for_options
+fi
+
+# Lists may be comma-separated; "both" is shorthand for the two modes below.
+THREADS=${THREADS//,/ }
+if [[ $MODES == both ]]; then
+  MODES="default stop-shuffle"
+else
+  MODES=${MODES//,/ }
+fi
+
+[[ $RUNS =~ ^[1-9][0-9]*$ ]] || fail "--runs must be a positive integer."
+[[ $TARGET_MB =~ ^[0-9]+$ ]] || fail "--target-mb must be a non-negative integer."
+
+# Resolve every path option, then make sure the required inputs exist.
+BIN=$(resolve_existing_binary "$(resolve_path "$BIN")")
+KEY_FILE=$(resolve_path "$KEY_FILE")
+INPUT=$(resolve_path "$INPUT")
+OUT_DIR=$(resolve_path "$OUT_DIR")
+
+require_file "$KEY_FILE" "key file"
+require_file "$INPUT" "input file"
+
+# In interactive terminal sessions, show the resolved plan and ask for a final
+# confirmation before any benchmark work starts.
+if [[ $INTERACTIVE == yes ]] && is_tty; then
+  log "Planned benchmark settings:"
+  log " Base label: ${LABEL:-baseline}"
+  log " Input file: $INPUT"
+  log " Target size: $TARGET_MB MiB"
+  log " Threads: $THREADS"
+  log " Runs per case: $RUNS"
+  log " Modes: $MODES"
+  log " Compare to: ${COMPARE_TO:-none}"
+  confirm_yes_no "Start this benchmark run?" "yes" || fail "benchmark cancelled."
+fi
+
+# Main flow: derive the run label, prepare a clean run directory, build the
+# dataset, run every (mode, thread) case, then write the reports.
+build_run_label
+
+# RUN_DIR is removed recursively below. Refuse a label that would make it
+# resolve to OUT_DIR itself (or its parent) so earlier results are never wiped.
+case "${LABEL_SAFE:-}" in
+"" | . | ..)
+  fail "run label resolved to an unusable directory name."
+  ;;
+esac
+
+mkdir -p "$OUT_DIR"
+RUN_DIR="$OUT_DIR/$LABEL_SAFE"
+DETAILS_DIR="$RUN_DIR/details"
+RAW_CSV="$RUN_DIR/raw.csv"
+SUMMARY_CSV="$RUN_DIR/summary.csv"
+SUMMARY_MD="$RUN_DIR/summary.md"
+COMPARE_CSV="$RUN_DIR/compare.csv"
+COMPARE_MD="$RUN_DIR/compare.md"
+
+# Start from an empty run directory so stale files never leak into the reports.
+rm -rf "$RUN_DIR"
+mkdir -p "$DETAILS_DIR"
+
+log_section "Local performance benchmark"
+log "Run label: $RUN_LABEL"
+log "Binary: $BIN"
+log "Key file: $KEY_FILE"
+log "Input: $INPUT"
+log "Output directory: $RUN_DIR"
+log "Requested threads: $THREADS"
+log "Runs per case: $RUNS"
+log "Modes: $MODES"
+
+build_dataset
+INPUT_BYTES=$(file_size_bytes "$DATASET")
+log "Measured dataset size: $(format_mib "$INPUT_BYTES") MiB"
+
+resolve_compare_target
+
+# Header row of the raw CSV; run_case appends one row per measured run.
+append_csv_row "$RAW_CSV" \
+  label mode threads run input_bytes compressed_bytes c_real c_user c_sys c_mib_s d_real d_user d_sys d_mib_s verified
+
+for mode in $MODES; do
+  for thread in $THREADS; do
+    run_case "$mode" "$thread"
+  done
+done
+
+log_section "Writing reports"
+write_summary_csv
+write_summary_markdown
+write_compare_report
+
+log "Raw CSV: $RAW_CSV"
+log "Summary CSV: $SUMMARY_CSV"
+log "Summary markdown: $SUMMARY_MD"
+# The comparison report only exists when a baseline was given.
+if [[ -f $COMPARE_MD ]]; then
+  log "Comparison CSV: $COMPARE_CSV"
+  log "Comparison markdown: $COMPARE_MD"
+fi
+
+log_section "Benchmark complete"
diff --git a/src/application.cpp b/src/application.cpp
index 7742bad..507e15f 100644
--- a/src/application.cpp
+++ b/src/application.cpp
@@ -1,14 +1,14 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file application.cpp
- * @brief Application
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file application.cpp
+ * @brief Application
*/
#include "application.hpp"
-#include
+#include
#include "assert.hpp"
#include "numeric.hpp"
@@ -39,7 +39,7 @@ void application::exe_compress_encrypt() {
crypt.shuffle_file();
break;
default:
- error("\"" + par.in_file + "\" is not a valid FASTA or FASTQ file.");
+ error(std::format("\"{}\" is not a valid FASTA or FASTQ file.", par.in_file));
}
}
@@ -47,9 +47,7 @@ void application::exe_compress_encrypt() {
* @brief Decrypt and/or unshuffle + decompress
*/
void application::exe_decrypt_decompress() {
- crypt.decrypt();
- std::ifstream in(DEC_FNAME);
- switch (in.peek()) {
+ switch (crypt.peek_decrypted_type()) {
case (char)127:
fa.decompress();
break;
@@ -57,18 +55,18 @@ void application::exe_decrypt_decompress() {
fq.decompress();
break;
case (char)125:
+ crypt.decrypt();
crypt.unshuffle_file();
break;
default:
error("corrupted file.");
}
- in.close();
}
/**
* @brief Execute Cryfa
- * @param argc number of command line arguments
- * @param argv command line arguments
+ * @param argc Number of command line arguments
+ * @param argv Command line arguments
*/
void application::exe(int argc, char* argv[]) {
const char action = parse(par, argc, argv);
diff --git a/src/application.hpp b/src/application.hpp
index c44e438..a27e077 100644
--- a/src/application.hpp
+++ b/src/application.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file application.hpp
- * @brief Application
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file application.hpp
+ * @brief Application
*/
#ifndef CRYFA_APPLICATION_H
@@ -32,4 +32,4 @@ class application {
} // namespace cryfa
-#endif // CRYFA_APPLICATION_H
\ No newline at end of file
+#endif // CRYFA_APPLICATION_H
diff --git a/src/cryfa.cpp b/src/cryfa.cpp
index b79050b..394ea00 100644
--- a/src/cryfa.cpp
+++ b/src/cryfa.cpp
@@ -1,16 +1,9 @@
-/*****************************************************
- Cryfa :: A secure encryption tool for genomic data
-******************************************************
- Morteza Hosseini seyedmorteza@ua.pt
- Diogo Pratas pratas@ua.pt
-******************************************************/
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
/**
- * @file cryfa.cpp
- * @brief Main
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file cryfa.cpp
+ * @brief Main file - Cryfa: a secure encryption tool for genomic data
*/
#include // std::exception
@@ -19,9 +12,9 @@
using namespace cryfa;
/**
- * @brief Run Cryfa
- * @param argc number of command line arguments
- * @param argv command line arguments
+ * @brief Run Cryfa
+ * @param argc Number of command line arguments
+ * @param argv Command line arguments
* @return SUCCESS or FAILURE
*/
int main(int argc, char* argv[]) {
@@ -34,4 +27,4 @@ int main(int argc, char* argv[]) {
}
return 0;
-}
\ No newline at end of file
+}
diff --git a/src/def.hpp b/src/def.hpp
index e9b2863..a077091 100644
--- a/src/def.hpp
+++ b/src/def.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file def.hpp
- * @brief Definitions
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file def.hpp
+ * @brief Definitions
*/
#ifndef CRYFA_DEF_H
@@ -14,12 +14,9 @@
#include // std::mt19937
#include // Hash table
-namespace cryfa {
-// Version
-static const std::string MONTH = "04";
-static const std::string YEAR = "26";
-static const std::string VERSION = YEAR + "." + MONTH;
+#include "cryfa/version.hpp"
+namespace cryfa {
// Typedefs
using byte = unsigned char;
using u16 = unsigned short;
@@ -28,13 +25,13 @@ using u64 = unsigned long long;
using i64 = long long;
using rng_t = std::mt19937;
using htbl_t = std::unordered_map;
-using pos_t = std::char_traits::pos_type; /**< @brief tellg(), tellp() */
+using pos_t = std::char_traits::pos_type; // Position type for tellg() and tellp()
// Metaprograms
/**
- * Power (B^E). Usage: "cerr << POWER<3,2>::val;" which yields 9
- * @tparam B Base
- * @tparam E Exponent
+ * @brief Power (B^E). Usage: "cerr << POWER<3,2>::val;" which yields 9
+ * @tparam B Base
+ * @tparam E Exponent
* @warning Base (B) and exponent (E) MUST be known at compile time.
*/
template
@@ -56,51 +53,45 @@ struct POWER {
#define LOOP4(i, j, k, l, S) LOOP(i, S) LOOP(j, S) LOOP2(k, l, S)
#define LOOP5(i, j, k, l, m, S) LOOP(i, S) LOOP(j, S) LOOP3(k, l, m, S)
#define LOOP6(i, j, k, l, m, n, S) LOOP(i, S) LOOP(j, S) LOOP4(k, l, m, n, S)
-#define LOOP7(i, j, k, l, m, n, o, S) \
- LOOP(i, S) LOOP(j, S) LOOP5(k, l, m, n, o, S)
-#define LOOP8(i, j, k, l, m, n, o, p, S) \
- LOOP(i, S) LOOP(j, S) LOOP6(k, l, m, n, o, p, S)
-#define IGNORE_THIS_LINE(in) \
- (in).ignore(std::numeric_limits::max(), '\n')
+#define LOOP7(i, j, k, l, m, n, o, S) LOOP(i, S) LOOP(j, S) LOOP5(k, l, m, n, o, S)
+#define LOOP8(i, j, k, l, m, n, o, p, S) LOOP(i, S) LOOP(j, S) LOOP6(k, l, m, n, o, p, S)
+#define IGNORE_THIS_LINE(in) (in).ignore(std::numeric_limits::max(), '\n')
// Constants
-static const std::string THR_ID_HDR = "THRD="; /**< @brief Thread ID header */
-static const std::string PK_FNAME = "CRYFA_PK"; /**< @brief Packed file name */
-static const std::string PCKD_FNAME =
- "CRYFA_PCKD"; /**< @brief Pckd f name - joined*/
-static const std::string SH_FNAME = "CRYFA_SH"; /**< @brief Shuffed file name */
-static const std::string DEC_FNAME =
- "CRYFA_DEC"; /**< @brief Decrypted file name */
-static const std::string UPK_FNAME =
- "CRYFA_UPK"; /**< @brief Unpacked file name */
-static const std::string USH_FNAME =
- "CRYFA_USH"; /**< @brief Unshuffled file name*/
-constexpr byte DEF_N_THR = 8; /**< @brief Default number of threads */
-constexpr u64 BLOCK_SIZE = 8 * 1024; /**< @brief To read/write from/to file */
-constexpr byte C1 = 2; /**< @brief Cat 1 = 2 */
-constexpr byte C2 = 3; /**< @brief Cat 2 = 3 */
-constexpr byte MIN_C3 = 4; /**< @brief 4 <= Cat 3 <= 6 */
+static const std::string THR_ID_HDR = "THRD="; // Thread ID header
+static const std::string PK_FNAME = "CRYFA_PK"; // Packed file name
+static const std::string PCKD_FNAME = "CRYFA_PCKD"; // Packed file name - joined
+static const std::string SH_FNAME = "CRYFA_SH"; // Shuffled file name
+static const std::string DEC_FNAME = "CRYFA_DEC"; // Decrypted file name
+static const std::string UPK_FNAME = "CRYFA_UPK"; // Unpacked file name
+static const std::string USH_FNAME = "CRYFA_USH"; // Unshuffled file name
+constexpr byte DEF_N_THR = 8; // Default number of threads
+constexpr u64 IO_BUFFER_SIZE = 8ULL * 1024ULL; // Buffered output writes
+constexpr u64 CHUNK_TARGET_SIZE = 1024ULL * 1024ULL; // Internal worker chunk target
+constexpr byte C1 = 2; // Cat 1 = 2
+constexpr byte C2 = 3; // Cat 2 = 3
+constexpr byte MIN_C3 = 4; // 4 <= Cat 3 <= 6
constexpr byte MID_C3 = 5;
constexpr byte MAX_C3 = 6;
-constexpr byte MIN_C4 = 7; /**< @brief 7 <= Cat 4 <= 15 */
+constexpr byte MIN_C4 = 7; // 7 <= Cat 4 <= 15
constexpr byte MAX_C4 = 15;
-constexpr byte MIN_C5 = 16; /**< @brief 16 <= Cat 5 <= 39 */
+constexpr byte MIN_C5 = 16; // 16 <= Cat 5 <= 39
constexpr byte MAX_C5 = 39;
-constexpr byte KEYLEN_C1 = 7; /**< @brief 7 to 1 byte. Build hash table*/
-constexpr byte KEYLEN_C2 = 5; /**< @brief 5 to 1 byte */
-constexpr byte KEYLEN_C3 = 3; /**< @brief 3 to 1 byte */
-constexpr byte KEYLEN_C4 = 2; /**< @brief 2 to 1 byte */
-constexpr byte KEYLEN_C5 = 3; /**< @brief 3 to 2 byte */
-constexpr int TAG_SIZE = 12; /**< @brief GCC mode auth enc */
+constexpr byte KEYLEN_C1 = 7; // 7 to 1 byte. Build hash table
+constexpr byte KEYLEN_C2 = 5; // 5 to 1 byte
+constexpr byte KEYLEN_C3 = 3; // 3 to 1 byte
+constexpr byte KEYLEN_C4 = 2; // 2 to 1 byte
+constexpr byte KEYLEN_C5 = 3; // 3 to 2 byte
+constexpr int TAG_SIZE = 12; // GCM mode authenticated-encryption tag size -- NOTE(review): comment read "GCC", presumably a typo; confirm
/** @brief Command line input arguments */
struct Param {
- static bool verbose; /**< @brief Verbose mode */
- static bool stop_shuffle; /**< @brief Disable shuffling */
- static byte n_threads; /**< @brief Number of threads */
- static std::string in_file; /**< @brief Input file name */
- static std::string key_file; /**< @brief Password file name */
- static char format; /**< @brief Format of the input file */
+ static bool verbose; // Verbose mode
+ static bool stop_shuffle; // Disable shuffling
+ static byte n_threads; // Number of threads
+ static std::string in_file; // Input file name
+ static std::string key_file; // Password file name
+ static char format; // Format of the input file
};
} // namespace cryfa
diff --git a/src/endecrypto.cpp b/src/endecrypto.cpp
index b9b3241..0359af8 100644
--- a/src/endecrypto.cpp
+++ b/src/endecrypto.cpp
@@ -1,18 +1,21 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file endecrypto.cpp
- * @brief Encryption/Decryption
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file endecrypto.cpp
+ * @brief Encryption/Decryption
*/
#include "endecrypto.hpp"
#include
+#include
#include // std::pow
+#include
#include
#include
#include // setw, std::setprecision
+#include
#include
#include
#include
@@ -22,16 +25,113 @@
#include "time.hpp"
using namespace cryfa;
-std::mutex mutxEnDe; /**< @brief Mutex */
+std::mutex mutxEnDe;
+
+namespace {
+constexpr u16 INVALID_RANK = std::numeric_limits::max();
+
+/** @brief Symbol-to-rank table for one alphabet; filled in by build_dense_lookup() */
+struct DenseLookup {
+ // Symbols of the alphabet; the symbol at position i is given rank i
+ std::string alphabet;
+ // True when one additional rank past the alphabet is reserved
+ bool with_extra = false;
+ // Radix used when combining ranks: alphabet size, plus one if with_extra
+ u16 base = 0;
+ // The reserved additional rank; always set to the alphabet size
+ u16 extra_rank = 0;
+ // Rank per byte value; INVALID_RANK marks symbols that have no rank
+ std::array rank{};
+};
+
+/**
+ * @brief Build the rank table for an alphabet
+ * @param alphabet Symbols to rank; the symbol at position i gets rank i
+ * @param with_extra Also reserve one rank for the character following the
+ * last alphabet symbol
+ * @return The populated lookup
+ */
+auto build_dense_lookup(const std::string& alphabet, bool with_extra) -> DenseLookup {
+ DenseLookup lookup;
+ lookup.alphabet = alphabet;
+ lookup.with_extra = with_extra;
+ lookup.base = static_cast(alphabet.size() + (with_extra ? 1 : 0));
+ lookup.extra_rank = static_cast(alphabet.size());
+ // Every byte value starts out as "no rank"
+ lookup.rank.fill(INVALID_RANK);
+
+ // If a symbol occurs more than once, the later position overwrites the earlier rank
+ for (u16 i = 0; i != alphabet.size(); ++i) {
+ lookup.rank[(byte)alphabet[i]] = i;
+ }
+ // The placeholder character is the one right after the last alphabet symbol.
+ // NOTE(review): alphabet.back() requires a non-empty alphabet -- presumably
+ // guaranteed by the callers; confirm
+ if (with_extra) {
+ lookup.rank[(byte)(alphabet.back() + 1)] = lookup.extra_rank;
+ }
+
+ return lookup;
+}
+
+/**
+ * @brief Get the rank table for an alphabet, building it on first use
+ * @param alphabet Symbols to rank
+ * @param with_extra Whether the table reserves the additional rank
+ * @return Reference to the table held in a thread_local cache
+ * @warning NOTE(review): the cache is a std::vector and the returned reference
+ * points at one of its elements; a later call that appends a new
+ * table may reallocate the vector. Presumably callers do not keep
+ * the reference across another dense_lookup() call -- confirm
+ */
+auto dense_lookup(const std::string& alphabet, bool with_extra = false) -> const DenseLookup& {
+ thread_local std::vector cache;
+ // Linear search: a table matches when both the flag and the alphabet are equal
+ for (const DenseLookup& lookup : cache) {
+ if (lookup.with_extra == with_extra && lookup.alphabet == alphabet) {
+ return lookup;
+ }
+ }
+
+ // Cache miss: build the table once and keep it for later calls on this thread
+ cache.push_back(build_dense_lookup(alphabet, with_extra));
+ return cache.back();
+}
+
+/**
+ * @brief Rank of a symbol, reporting symbols that have no rank
+ * @param lookup Rank table
+ * @param c Symbol
+ * @return The value stored in the table for the symbol; error() is called
+ * first when that value is INVALID_RANK
+ */
+auto checked_rank(const DenseLookup& lookup, char c) -> u16 {
+  const auto slot = static_cast<byte>(c);
+  const u16 found = lookup.rank[slot];
+  if (found != INVALID_RANK) {
+    return found;
+  }
+
+  error(std::format("symbol \"{}\" not found!", c));
+  return found;
+}
+
+/**
+ * @brief Index of a tuple of symbols, read as a number in base lookup.base
+ * @param lookup Rank table
+ * @param tuple Pointer to the first symbol of the tuple
+ * @param width Number of symbols in the tuple
+ * @return Index of the tuple, first symbol most significant
+ */
+auto tuple_index(const DenseLookup& lookup, const char* tuple, size_t width) -> u16 {
+ u64 index = 0;
+ // checked_rank() reports any symbol that has no rank in the table
+ for (size_t i = 0; i != width; ++i) {
+ index = index * lookup.base + checked_rank(lookup, tuple[i]);
+ }
+ // NOTE(review): the accumulated value is narrowed here; presumably callers
+ // keep base^width within the u16 range -- confirm
+ return static_cast(index);
+}
+
+/**
+ * @brief Index of a 3-symbol tuple in which every out-of-alphabet symbol is
+ * replaced by the reserved extra rank
+ * @param lookup Rank table (expected to be built with with_extra)
+ * @param s0 First symbol
+ * @param s1 Second symbol
+ * @param s2 Third symbol
+ * @param[out] first_not_in Set to true when s0 is not an alphabet symbol
+ * @param[out] second_not_in Set to true when s1 is not an alphabet symbol
+ * @param[out] third_not_in Set to true when s2 is not an alphabet symbol
+ * @return Index of the tuple in base lookup.base, first symbol most significant
+ */
+auto large_tuple_index(const DenseLookup& lookup, char s0, char s1, char s2, bool& first_not_in,
+                       bool& second_not_in, bool& third_not_in) -> u16 {
+  auto rank_or_extra = [&](char c, bool& not_in) -> u16 {
+    const u16 rank = lookup.rank[static_cast<byte>(c)];
+    // A symbol is outside the alphabet when it has no rank at all, or when it
+    // is the placeholder character itself (its slot holds extra_rank). The
+    // second case must be flagged too, so the caller still emits the literal
+    // byte that goes with a placeholder in the packed stream.
+    not_in = (rank == INVALID_RANK || rank == lookup.extra_rank);
+    return not_in ? lookup.extra_rank : rank;
+  };
+
+  const u16 r0 = rank_or_extra(s0, first_not_in);
+  const u16 r1 = rank_or_extra(s1, second_not_in);
+  const u16 r2 = rank_or_extra(s2, third_not_in);
+  return static_cast<u16>((r0 * lookup.base + r1) * lookup.base + r2);
+}
+
+/**
+ * @brief Rank of a DNA base: A=0, C=1, G=2, T=3, N=4, anything else 5
+ * @param c Symbol
+ * @param[out] not_in Set to true when the symbol is none of A, C, G, T, N;
+ * false otherwise
+ * @return Rank of the symbol
+ */
+auto dna_rank_or_x(char c, bool& not_in) -> byte {
+  not_in = false;
+  if (c == 'A') {
+    return 0;
+  }
+  if (c == 'C') {
+    return 1;
+  }
+  if (c == 'G') {
+    return 2;
+  }
+  if (c == 'T') {
+    return 3;
+  }
+  if (c == 'N') {
+    return 4;
+  }
+
+  // Any other symbol shares the last rank and is flagged for the caller
+  not_in = true;
+  return 5;
+}
+
+/**
+ * @brief Append the symbols left over after the last full tuple, each one
+ * preceded by the marker byte (char)255
+ * @param[out] packed Packed string to append to
+ * @param input Input string
+ * @param pos Position of the first leftover symbol in the input
+ */
+void append_penalty_tail(std::string& packed, const std::string& input, size_t pos) {
+  const char marker = static_cast<char>(255);
+  while (pos != input.size()) {
+    packed += marker;
+    packed += input[pos];
+    ++pos;
+  }
+}
+} // namespace
/**
* @brief Build a hash table
- * @param[out] map Hash table
- * @param[in] strIn The string including the keys
- * @param[in] keyLen Length of the keys
+ * @param[out] map Hash table
+ * @param strIn The string including the keys
+ * @param keyLen Length of the keys
*/
-void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn,
- short keyLen) {
+void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn, short keyLen) {
u64 elementNo = 0;
std::string element;
element.reserve((unsigned long)keyLen);
@@ -132,12 +232,12 @@ void EnDecrypto::build_hash_tbl(htbl_t& map, const std::string& strIn,
/**
* @brief Build a table for unpacking
- * @param[out] unpack Table (vector of strings)
- * @param[in] strIn The string including the keys
- * @param[in] keyLen Length of the keys
+ * @param[out] unpack Table (vector of strings)
+ * @param strIn The string including the keys
+ * @param keyLen Length of the keys
*/
-void EnDecrypto::build_unpack_tbl(std::vector& unpack,
- const std::string& strIn, u16 keyLen) {
+void EnDecrypto::build_unpack_tbl(std::vector& unpack, const std::string& strIn,
+ u16 keyLen) {
std::string element;
element.reserve(keyLen);
unpack.clear();
@@ -233,432 +333,210 @@ void EnDecrypto::build_unpack_tbl(std::vector& unpack,
}
}
-/**
- * @brief Index of each DNA bases pack
- * @param key Key
- * @return Value (based on the idea of key-value in a hash table)
- */
-byte EnDecrypto::dna_pack_idx(const std::string& key) {
- const auto found = DNA_MAP.find(key);
- if (found == DNA_MAP.end()) error("key \"" + key + "\" not found!");
-
- return (byte)found->second;
-}
-
-/**
- * @brief Index of each pack, when # > 39
- * @param key Key
- * @param map Hash table
- * @return Value (based on the idea of key-value in a hash table)
- */
-u16 EnDecrypto::large_pack_idx(const std::string& key, const htbl_t& map) {
- const auto found = map.find(key);
- if (found == map.end()) error("key \"" + key + "\" not found!");
-
- return (u16)found->second;
-}
-
/**
* @brief Encapsulate each 3 DNA bases in 1 byte. Reduction: ~2/3
- * @param[out] packedSeq Packed sequence
- * @param[in] seq Sequence
+ * @param[out] packedSeq Packed sequence
+ * @param seq Sequence
*/
void EnDecrypto::pack_seq(std::string& packedSeq, const std::string& seq) {
- auto i = seq.begin();
-
- for (auto iEnd = seq.end() - 2; i < iEnd; i += 3) {
- char s0 = *i, s1 = *(i + 1), s2 = *(i + 2);
+ size_t pos = 0;
+ const size_t tuple_limit = seq.size() - (seq.size() % 3);
- std::string tuple;
- tuple.reserve(3);
+ for (; pos != tuple_limit; pos += 3) {
+ const char s0 = seq[pos];
+ const char s1 = seq[pos + 1];
+ const char s2 = seq[pos + 2];
bool firstNotIn, secondNotIn, thirdNotIn;
- tuple += (firstNotIn = (s0 != 'A' && s0 != 'C' && s0 != 'G' && s0 != 'T' &&
- s0 != 'N'))
- ? 'X'
- : s0;
- tuple += (secondNotIn = (s1 != 'A' && s1 != 'C' && s1 != 'G' && s1 != 'T' &&
- s1 != 'N'))
- ? 'X'
- : s1;
- tuple += (thirdNotIn = (s2 != 'A' && s2 != 'C' && s2 != 'G' && s2 != 'T' &&
- s2 != 'N'))
- ? 'X'
- : s2;
-
- packedSeq += dna_pack_idx(tuple);
- if (firstNotIn) packedSeq += s0;
- if (secondNotIn) packedSeq += s1;
- if (thirdNotIn) packedSeq += s2;
- }
- // If seq len isn't multiple of 3, add (char) 255 before each sym
- switch (seq.length() % 3) {
- case 1:
- packedSeq += (char)255;
- packedSeq += *i;
- break;
+ const byte r0 = dna_rank_or_x(s0, firstNotIn);
+ const byte r1 = dna_rank_or_x(s1, secondNotIn);
+ const byte r2 = dna_rank_or_x(s2, thirdNotIn);
+ packedSeq += static_cast((r0 * 6 + r1) * 6 + r2);
- case 2:
- packedSeq += (char)255;
- packedSeq += *i;
- packedSeq += (char)255;
- packedSeq += *(i + 1);
- break;
-
- default:
- break;
+ if (firstNotIn) {
+ packedSeq += s0;
+ }
+ if (secondNotIn) {
+ packedSeq += s1;
+ }
+ if (thirdNotIn) {
+ packedSeq += s2;
+ }
}
+
+ append_penalty_tail(packedSeq, seq, pos);
}
/**
* @brief Encapsulate 3 header symbols in 2 bytes, when # >= 40.
* -- FASTA/FASTQ. Reduction ~1/3
- * @param[out] packed Packed header
- * @param[in] strIn Header
- * @param[in] map Hash table
+ * @param[out] packed Packed header
+ * @param strIn Header
+ * @param map Hash table
*/
-void EnDecrypto::pack_hL_fa_fq(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
+void EnDecrypto::pack_hL_fa_fq(std::string& packed, const std::string& strIn, const htbl_t& map) {
pack_large(packed, strIn, Hdrs, map);
}
/**
* @brief Encapsulate 3 quality score symbols in 2 bytes, when # >= 40.
- * -- FASTQ. Reduction ~1/3
- * @param[out] packed Packed qulity scores
- * @param[in] strIn Quality scores
- * @param[in] map Hash table
+ * -- FASTQ. Reduction ~1/3
+ * @param[out] packed Packed quality scores
+ * @param strIn Quality scores
+ * @param map Hash table
*/
-void EnDecrypto::pack_qL_fq(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
+void EnDecrypto::pack_qL_fq(std::string& packed, const std::string& strIn, const htbl_t& map) {
pack_large(packed, strIn, QSs, map);
}
/**
* @brief Encapsulate 3 header/quality score symbols in 2 bytes, when # >= 40
* -- FASTA/FASTQ. Reduction ~1/3
- * @param[out] packed Packed qulity scores
- * @param[in] strIn Input header/quality score
- * @param[in] hdrQs Collection of headers/quality scores
- * @param[in] map Hash table
+ * @param[out] packed Packed header/quality scores
+ * @param strIn Input header/quality score
+ * @param hdrQs Collection of headers/quality scores
+ * @param map Hash table
*/
-inline void EnDecrypto::pack_large(std::string& packed,
- const std::string& strIn,
- const std::string& hdrQs,
- const htbl_t& map) {
- // ASCII char after the last char in QUALITY_SCORES std::string
- const auto XChar = (char)(hdrQs.back() + 1);
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) {
- char s0 = *i, s1 = *(i + 1), s2 = *(i + 2);
-
- std::string tuple;
- tuple.reserve(3);
+inline void EnDecrypto::pack_large(std::string& packed, const std::string& strIn,
+ const std::string& hdrQs, const htbl_t& map) {
+ (void)map;
+ const DenseLookup& lookup = dense_lookup(hdrQs, true);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 3);
+
+ for (; pos != tuple_limit; pos += 3) {
+ const char s0 = strIn[pos];
+ const char s1 = strIn[pos + 1];
+ const char s2 = strIn[pos + 2];
bool firstNotIn, secondNotIn, thirdNotIn;
- tuple = (firstNotIn = (hdrQs.find(s0) == std::string::npos)) ? XChar : s0;
- tuple += (secondNotIn = (hdrQs.find(s1) == std::string::npos)) ? XChar : s1;
- tuple += (thirdNotIn = (hdrQs.find(s2) == std::string::npos)) ? XChar : s2;
- u16 shortTuple = large_pack_idx(tuple, map);
+ const u16 shortTuple =
+ large_tuple_index(lookup, s0, s1, s2, firstNotIn, secondNotIn, thirdNotIn);
packed += (unsigned char)(shortTuple >> 8); // Left byte
packed += (unsigned char)(shortTuple & 0xFF); // Right byte
- if (firstNotIn) packed += s0;
- if (secondNotIn) packed += s1;
- if (thirdNotIn) packed += s2;
+ if (firstNotIn) {
+ packed += s0;
+ }
+ if (secondNotIn) {
+ packed += s1;
+ }
+ if (thirdNotIn) {
+ packed += s2;
+ }
}
- // If len isn't multiple of 3, add (char) 255 before each sym
- switch (strIn.length() % 3) {
- case 1:
- packed += (char)255;
- packed += *i;
- break;
-
- case 2:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- break;
-
- default:
- break;
- }
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 3 symbols in 2 bytes, when 16 <= # <= 39. Reduction ~1/3
- * @param[out] packed Packed string
- * @param[in] strIn Input string
- * @param[in] map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_3to2(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) {
- std::string tuple;
- tuple.reserve(3);
- tuple = *i;
- tuple += *(i + 1);
- tuple += *(i + 2);
- u16 shortTuple = (u16)map.find(tuple)->second;
+void EnDecrypto::pack_3to2(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 3);
+
+ for (; pos != tuple_limit; pos += 3) {
+ const u16 shortTuple = tuple_index(lookup, strIn.data() + pos, 3);
packed += (byte)(shortTuple >> 8); // Left byte
packed += (byte)(shortTuple & 0xFF); // Right byte
}
- // If len isn't multiple of 3, add (char) 255 before each sym
- switch (strIn.length() % 3) {
- case 1:
- packed += (char)255;
- packed += *i;
- break;
-
- case 2:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- break;
-
- default:
- break;
- }
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 2 symbols in 1 byte, when 7 <= # <= 15. Reduction ~1/2
- * @param[out] packed Packed string
- * @param[in] strIn Input string
- * @param[in] map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_2to1(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 1; i < iEnd; i += 2) {
- std::string tuple;
- tuple.reserve(2);
- tuple = *i;
- tuple += *(i + 1);
- packed += (char)map.find(tuple)->second;
- }
+void EnDecrypto::pack_2to1(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 2);
- // If len isn't multiple of 2 (it's odd), add (char) 255 before each sym
- if (strIn.length() & 1) {
- packed += (char)255;
- packed += *i;
+ for (; pos != tuple_limit; pos += 2) {
+ packed += static_cast(tuple_index(lookup, strIn.data() + pos, 2));
}
+
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 3 symbols in 1 byte, when # = 4, 5, 6. Reduction ~2/3
- * @param packed Packed string
- * @param strIn Input string
- * @param map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_3to1(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 2; i < iEnd; i += 3) {
- std::string tuple;
- tuple.reserve(3);
- tuple = *i;
- tuple += *(i + 1);
- tuple += *(i + 2);
- packed += (char)map.find(tuple)->second;
- }
-
- // If len isn't multiple of 3, add (char) 255 before each sym
- switch (strIn.length() % 3) {
- case 1:
- packed += (char)255;
- packed += *i;
- break;
-
- case 2:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- break;
+void EnDecrypto::pack_3to1(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 3);
- default:
- break;
+ for (; pos != tuple_limit; pos += 3) {
+ packed += static_cast(tuple_index(lookup, strIn.data() + pos, 3));
}
+
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 5 symbols in 1 byte, when # = 3. Reduction ~4/5
- * @param[out] packed Packed string
- * @param[in] strIn Input string
- * @param[in] map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_5to1(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 4; i < iEnd; i += 5) {
- std::string tuple;
- tuple.reserve(5);
- tuple = *i;
- tuple += *(i + 1);
- tuple += *(i + 2);
- tuple += *(i + 3);
- tuple += *(i + 4);
- packed += (char)map.find(tuple)->second;
- }
+void EnDecrypto::pack_5to1(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 5);
- // If len isn't multiple of 5, add (char) 255 before each sym
- switch (strIn.length() % 5) {
- case 1:
- packed += (char)255;
- packed += *i;
- break;
-
- case 2:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- break;
-
- case 3:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- break;
-
- case 4:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- packed += (char)255;
- packed += *(i + 3);
- break;
-
- default:
- break;
+ for (; pos != tuple_limit; pos += 5) {
+ packed += static_cast(tuple_index(lookup, strIn.data() + pos, 5));
}
+
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 7 symbols in 1 byte, when # = 2. Reduction ~6/7
- * @param[out] packed Packed string
- * @param[in] strIn Input string
- * @param[in] map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_7to1(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- auto i = strIn.begin();
-
- for (auto iEnd = strIn.end() - 6; i < iEnd; i += 7) {
- std::string tuple;
- tuple.reserve(7);
- tuple = *i;
- tuple += *(i + 1);
- tuple += *(i + 2);
- tuple += *(i + 3);
- tuple += *(i + 4);
- tuple += *(i + 5);
- tuple += *(i + 6);
- packed += (char)map.find(tuple)->second;
- }
-
- // If len isn't multiple of 7, add (char) 255 before each sym
- switch (strIn.length() % 7) {
- case 1:
- packed += (char)255;
- packed += *i;
- break;
-
- case 2:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- break;
-
- case 3:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- break;
-
- case 4:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- packed += (char)255;
- packed += *(i + 3);
- break;
+void EnDecrypto::pack_7to1(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ size_t pos = 0;
+ const size_t tuple_limit = strIn.size() - (strIn.size() % 7);
- case 5:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- packed += (char)255;
- packed += *(i + 3);
- packed += (char)255;
- packed += *(i + 4);
- break;
-
- case 6:
- packed += (char)255;
- packed += *i;
- packed += (char)255;
- packed += *(i + 1);
- packed += (char)255;
- packed += *(i + 2);
- packed += (char)255;
- packed += *(i + 3);
- packed += (char)255;
- packed += *(i + 4);
- packed += (char)255;
- packed += *(i + 5);
- break;
-
- default:
- break;
+ for (; pos != tuple_limit; pos += 7) {
+ packed += static_cast(tuple_index(lookup, strIn.data() + pos, 7));
}
+
+ append_penalty_tail(packed, strIn, pos);
}
/**
* @brief Encapsulate 1 symbol in 1 byte, when # = 1.
- * @param[out] packed Packed string
- * @param[in] strIn Input string
- * @param[in] map Hash table
+ * @param[out] packed Packed string
+ * @param strIn Input string
+ * @param map Hash table
*/
-void EnDecrypto::pack_1to1(std::string& packed, const std::string& strIn,
- const htbl_t& map) {
- for (auto i = strIn.begin(), iEnd = strIn.end(); i < iEnd; ++i) {
- std::string single;
- single = *i;
- packed += (char)map.find(single)->second;
+void EnDecrypto::pack_1to1(std::string& packed, const std::string& strIn, const htbl_t& map) {
+ const DenseLookup& lookup = dense_lookup((&map == &QsMap) ? QSs : Hdrs);
+ for (char c : strIn) {
+ packed += static_cast(checked_rank(lookup, c));
}
}
/**
- * @brief Penalty symbol
- * @param c Input char
+ * @brief Penalty symbol
+ * @param c Input char
* @return Input char or (char)10='\\n'
*/
char EnDecrypto::penalty_sym(char c) const {
@@ -671,13 +549,12 @@ char EnDecrypto::penalty_sym(char c) const {
/**
* @brief Unpack by reading 2 byte by 2 byte, when # > 39
- * @param[out] out Unpacked string
- * @param[in] i Input string iterator
- * @param[in] XChar Extra character for unpacking
- * @param[in] unpack Table for unpacking
+ * @param[out] out Unpacked string
+ * @param i Input string iterator
+ * @param XChar Extra character for unpacking
+ * @param unpack Table for unpacking
*/
-void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i,
- char XChar,
+void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i, char XChar,
const std::vector& unpack) {
out.clear();
@@ -691,63 +568,42 @@ void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i,
const auto rightB = (byte) * (i + 1);
const u16 doubleB = leftB << 8 | rightB; // Join two bytes
- const std::string tpl = unpack[doubleB];
+ const std::string& tpl = unpack[doubleB];
- if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] != XChar) // ...
- {
+ if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] != XChar) { // ...
out += tpl;
i += 2;
- }
-
- else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] != XChar) // X..
- {
+ } else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] != XChar) { // X..
out += penalty_sym(*(i + 2));
out += tpl[1];
out += tpl[2];
i += 3;
- }
-
- else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] != XChar) // .X.
- {
+ } else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] != XChar) { // .X.
out += tpl[0];
out += penalty_sym(*(i + 2));
out += tpl[2];
i += 3;
- }
-
- else if (tpl[0] == XChar && tpl[1] == XChar && tpl[2] != XChar) // XX.
- {
+ } else if (tpl[0] == XChar && tpl[1] == XChar && tpl[2] != XChar) { // XX.
out += penalty_sym(*(i + 2));
out += penalty_sym(*(i + 3));
out += tpl[2];
i += 4;
- }
-
- else if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] == XChar) // ..X
- {
+ } else if (tpl[0] != XChar && tpl[1] != XChar && tpl[2] == XChar) { // ..X
out += tpl[0];
out += tpl[1];
out += penalty_sym(*(i + 2));
i += 3;
- }
-
- else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] == XChar) // X.X
- {
+ } else if (tpl[0] == XChar && tpl[1] != XChar && tpl[2] == XChar) { // X.X
out += penalty_sym(*(i + 2));
out += tpl[1];
out += penalty_sym(*(i + 3));
i += 4;
- }
-
- else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] == XChar) // .XX
- {
+ } else if (tpl[0] != XChar && tpl[1] == XChar && tpl[2] == XChar) { // .XX
out += tpl[0];
out += penalty_sym(*(i + 2));
out += penalty_sym(*(i + 3));
i += 4;
- }
-
- else {
+ } else {
out += penalty_sym(*(i + 2));
out += penalty_sym(*(i + 3)); // XXX
out += penalty_sym(*(i + 4));
@@ -759,9 +615,9 @@ void EnDecrypto::unpack_large(std::string& out, std::string::iterator& i,
/**
* @brief Unpack by reading 2 byte by 2 byte
- * @param[out] out Unpacked string
- * @param[in] i Input string iterator
- * @param[in] unpack Table for unpacking
+ * @param[out] out Unpacked string
+ * @param i Input string iterator
+ * @param unpack Table for unpacking
*/
void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i,
const std::vector& unpack) {
@@ -769,9 +625,9 @@ void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i,
for (; *i != (char)254; i += 2) {
// Hdr len not multiple of keyLen
- if (*i == (char)255)
+ if (*i == (char)255) {
out += penalty_sym(*(i + 1));
- else {
+ } else {
const auto leftB = (byte)*i;
const auto rightB = (byte) * (i + 1);
const u16 doubleB = leftB << 8 | rightB; // Join two bytes
@@ -783,9 +639,9 @@ void EnDecrypto::unpack_2B(std::string& out, std::string::iterator& i,
/**
* @brief Unpack by reading 1 byte by 1 byte
- * @param[out] out Unpacked string
- * @param[in] i Input string iterator
- * @param[in] unpack Table for unpacking
+ * @param[out] out Unpacked string
+ * @param i Input string iterator
+ * @param unpack Table for unpacking
*/
void EnDecrypto::unpack_1B(std::string& out, std::string::iterator& i,
const std::vector& unpack) {
@@ -793,75 +649,57 @@ void EnDecrypto::unpack_1B(std::string& out, std::string::iterator& i,
for (; *i != (char)254; ++i) {
// Hdr len not multiple of keyLen
- if (*i == (char)255)
+ if (*i == (char)255) {
out += penalty_sym(*(++i));
- else
+ } else {
out += unpack[(byte)*i];
+ }
}
}
/**
* @brief Unpack 1 byte to 3 DNA bases
- * @param[out] out DNA bases
- * @param[in] i Input string iterator
+ * @param[out] out DNA bases
+ * @param i Input string iterator
*/
void EnDecrypto::unpack_seq(std::string& out, std::string::iterator& i) {
out.clear();
for (; *i != (char)254; ++i) {
- if (*i == (char)255) // Seq len not multiple of 3
+ if (*i == (char)255) { // Seq len not multiple of 3
out += penalty_sym(*(++i));
- else {
- const std::string tpl = DNA_UNPACK[(byte)*i];
+ } else {
+ const std::string& tpl = DNA_UNPACK[(byte)*i];
- if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] != 'X') // ...
- {
+ if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] != 'X') { // ...
out += tpl;
}
// Using just one 'out' makes trouble
- else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] != 'X') // X..
- {
+ else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] != 'X') { // X..
out += penalty_sym(*(++i));
out += tpl[1];
out += tpl[2];
- }
-
- else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] != 'X') // .X.
- {
+ } else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] != 'X') { // .X.
out += tpl[0];
out += penalty_sym(*(++i));
out += tpl[2];
- }
-
- else if (tpl[0] == 'X' && tpl[1] == 'X' && tpl[2] != 'X') // XX.
- {
+ } else if (tpl[0] == 'X' && tpl[1] == 'X' && tpl[2] != 'X') { // XX.
out += penalty_sym(*(++i));
out += penalty_sym(*(++i));
out += tpl[2];
- }
-
- else if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] == 'X') // ..X
- {
+ } else if (tpl[0] != 'X' && tpl[1] != 'X' && tpl[2] == 'X') { // ..X
out += tpl[0];
out += tpl[1];
out += penalty_sym(*(++i));
- }
-
- else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] == 'X') // X.X
- {
+ } else if (tpl[0] == 'X' && tpl[1] != 'X' && tpl[2] == 'X') { // X.X
out += penalty_sym(*(++i));
out += tpl[1];
out += penalty_sym(*(++i));
- }
-
- else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] == 'X') // .XX
- {
+ } else if (tpl[0] != 'X' && tpl[1] == 'X' && tpl[2] == 'X') { // .XX
out += tpl[0];
out += penalty_sym(*(++i));
out += penalty_sym(*(++i));
- }
-
- else {
+ } else {
out += penalty_sym(*(++i));
out += penalty_sym(*(++i)); // XXX
out += penalty_sym(*(++i));
@@ -874,25 +712,25 @@ void EnDecrypto::unpack_seq(std::string& out, std::string::iterator& i) {
* @brief Shuffle a file (not FASTA/FASTQ)
*/
void EnDecrypto::shuffle_file() {
- std::cerr << "\"" << file_name(in_file)
- << "\" isn't FASTA/FASTQ. We just encrypt it.\n";
+ std::cerr << "\"" << file_name(in_file) << "\" isn't FASTA/FASTQ. We just encrypt it.\n";
if (!stop_shuffle) {
const auto start = now(); // Start timer
std::vector arrThread(n_threads);
// Distribute file among threads, for shuffling
- for (byte t = 0; t != n_threads; ++t)
+ for (byte t = 0; t != n_threads; ++t) {
arrThread[t] = std::thread(&EnDecrypto::shuffle_block, this, t);
- for (auto& thr : arrThread)
+ }
+ for (auto& thr : arrThread) {
if (thr.joinable()) thr.join();
+ }
// Join partially shuffled files
join_shuffled_files();
const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Shuffling done in "
- << hms(finish - start);
+ std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(finish - start);
} else {
std::ifstream inFile(in_file);
std::ofstream pckdFile(PCKD_FNAME);
@@ -910,18 +748,23 @@ void EnDecrypto::shuffle_file() {
/**
* @brief Shuffle a block of file
- * @param threadID Thread ID
+ * @param threadID Thread ID
*/
void EnDecrypto::shuffle_block(byte threadID) {
std::ifstream in(in_file);
- std::ofstream shfile(SH_FNAME + std::to_string(threadID), std::ios_base::app);
+ std::ofstream shfile(std::format("{}{}", SH_FNAME, static_cast(threadID)),
+ std::ios_base::app);
// Characters ignored at the beginning
- in.ignore((std::streamsize)(threadID * BLOCK_SIZE));
+ in.ignore((std::streamsize)(threadID * CHUNK_TARGET_SIZE));
for (char c; in.peek() != EOF;) {
std::string context;
- for (u64 bs = BLOCK_SIZE; bs--;)
- if (in.get(c)) context += c;
+ context.reserve(CHUNK_TARGET_SIZE);
+ for (u64 bs = CHUNK_TARGET_SIZE; bs--;) {
+ if (in.get(c)) {
+ context += c;
+ }
+ }
// Shuffle
if (!stop_shuffle) {
@@ -937,11 +780,11 @@ void EnDecrypto::shuffle_block(byte threadID) {
}
// Write header containing threadID for each partially shuffled file
- shfile << THR_ID_HDR << std::to_string(threadID) << '\n';
+ shfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
shfile << context << '\n';
// Ignore to go to the next related chunk
- in.ignore((std::streamsize)((n_threads - 1) * BLOCK_SIZE));
+ in.ignore((std::streamsize)((n_threads - 1) * CHUNK_TARGET_SIZE));
}
shfile.close();
}
@@ -961,10 +804,14 @@ void EnDecrypto::unshuffle_file() {
std::vector arrThread(n_threads);
// Distribute file among threads, for unshuffling
- for (byte t = 0; t != n_threads; ++t)
+ for (byte t = 0; t != n_threads; ++t) {
arrThread[t] = std::thread(&EnDecrypto::unshuffle_block, this, t);
- for (auto& thr : arrThread)
- if (thr.joinable()) thr.join();
+ }
+ for (auto& thr : arrThread) {
+ if (thr.joinable()) {
+ thr.join();
+ }
+ }
// Delete decrypted file
std::remove(DEC_FNAME.c_str());
@@ -973,8 +820,7 @@ void EnDecrypto::unshuffle_file() {
join_unshuffled_files();
const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Unshuffling done in "
- << hms(finish - start);
+ std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(finish - start);
} else if (c == (char)129) {
std::cout << in.rdbuf();
@@ -988,20 +834,24 @@ void EnDecrypto::unshuffle_file() {
/**
* @brief Unshuffle a block of file
- * @param threadID Thread ID
+ * @param threadID Thread ID
*/
void EnDecrypto::unshuffle_block(byte threadID) {
std::ifstream in(DEC_FNAME);
- std::ofstream ushfile(USH_FNAME + std::to_string(threadID),
+ std::ofstream ushfile(std::format("{}{}", USH_FNAME, static_cast(threadID)),
std::ios_base::app);
// filetype char (125) + shuffed (128) + characters ignored at the beginning
- in.ignore((std::streamsize)(2 + threadID * BLOCK_SIZE));
+ in.ignore((std::streamsize)(2 + threadID * CHUNK_TARGET_SIZE));
for (char c; in.peek() != EOF;) {
std::string unshText;
- for (u64 bs = BLOCK_SIZE; bs--;)
- if (in.get(c)) unshText += c;
+ unshText.reserve(CHUNK_TARGET_SIZE);
+ for (u64 bs = CHUNK_TARGET_SIZE; bs--;) {
+ if (in.get(c)) {
+ unshText += c;
+ }
+ }
auto i = unshText.begin();
@@ -1019,11 +869,11 @@ void EnDecrypto::unshuffle_block(byte threadID) {
}
// Write header containing threadID for each partially unshuffled file
- ushfile << THR_ID_HDR + std::to_string(threadID) << '\n';
+ ushfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
ushfile << unshText << '\n';
// Ignore to go to the next related chunk
- in.ignore((std::streamsize)((n_threads - 1) * BLOCK_SIZE));
+ in.ignore((std::streamsize)((n_threads - 1) * CHUNK_TARGET_SIZE));
}
ushfile.close();
@@ -1032,19 +882,18 @@ void EnDecrypto::unshuffle_block(byte threadID) {
/**
* @brief Join partially packed files
- * @param headers Headers
- * @param qscores Quality scores
- * @param fT File type
- * @param justPlus If the third line of FASTQ contains only the '+' char
+ * @param headers Headers
+ * @param qscores Quality scores
+ * @param fT File type
+ * @param justPlus If the third line of FASTQ contains only the '+' char
*/
-void EnDecrypto::join_packed_files(const std::string& headers,
- const std::string& qscores, char fT,
+void EnDecrypto::join_packed_files(const std::string& headers, const std::string& qscores, char fT,
bool justPlus) const {
byte t; // For threads
std::vector pkFile(n_threads);
std::ofstream pckdFile(PCKD_FNAME); // Packed file
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { pckdFile << content; };
switch (fT) {
@@ -1066,7 +915,9 @@ void EnDecrypto::join_packed_files(const std::string& headers,
}
// Input files
- for (t = n_threads; t--;) pkFile[t].open(PK_FNAME + std::to_string(t));
+ for (t = n_threads; t--;) {
+ pkFile[t].open(std::format("{}{}", PK_FNAME, static_cast(t)));
+ }
std::string line;
bool prevLineNotThrID; // If previous line was "THR=" or not
@@ -1075,14 +926,16 @@ void EnDecrypto::join_packed_files(const std::string& headers,
prevLineNotThrID = false;
while (std::getline(pkFile[t], line).good() &&
- line != THR_ID_HDR + std::to_string(t)) {
- if (prevLineNotThrID) content += '\n';
+ line != std::format("{}{}", THR_ID_HDR, static_cast(t))) {
+ if (prevLineNotThrID) {
+ content += '\n';
+ }
content += line;
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
prevLineNotThrID = true;
@@ -1096,8 +949,7 @@ void EnDecrypto::join_packed_files(const std::string& headers,
pckdFile.close();
for (t = n_threads; t--;) {
pkFile[t].close();
- std::string pkFileName = PK_FNAME;
- pkFileName += std::to_string(t);
+ std::string pkFileName = std::format("{}{}", PK_FNAME, static_cast(t));
std::remove(pkFileName.c_str());
}
}
@@ -1108,9 +960,11 @@ void EnDecrypto::join_packed_files(const std::string& headers,
void EnDecrypto::join_unpacked_files() const {
byte t; // For threads
std::vector upkdFile(n_threads);
- for (t = n_threads; t--;) upkdFile[t].open(UPK_FNAME + std::to_string(t));
+ for (t = n_threads; t--;) {
+ upkdFile[t].open(std::format("{}{}", UPK_FNAME, static_cast(t)));
+ }
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { std::cout << content; };
bool prevLineNotThrID; // If previous line was "THRD=" or not
@@ -1119,20 +973,24 @@ void EnDecrypto::join_unpacked_files() const {
prevLineNotThrID = false;
for (std::string line; std::getline(upkdFile[t], line).good() &&
- line != THR_ID_HDR + std::to_string(t);) {
- if (prevLineNotThrID) content += '\n';
+ line != std::format("{}{}", THR_ID_HDR, static_cast(t));) {
+ if (prevLineNotThrID) {
+ content += '\n';
+ }
content += line;
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
prevLineNotThrID = true;
}
- if (prevLineNotThrID) content += '\n';
+ if (prevLineNotThrID) {
+ content += '\n';
+ }
}
}
write_content();
@@ -1140,8 +998,7 @@ void EnDecrypto::join_unpacked_files() const {
// Close/delete input/output files
for (t = n_threads; t--;) {
upkdFile[t].close();
- std::string upkdFileName = UPK_FNAME;
- upkdFileName += std::to_string(t);
+ std::string upkdFileName = std::format("{}{}", UPK_FNAME, static_cast(t));
std::remove(upkdFileName.c_str());
}
}
@@ -1153,28 +1010,32 @@ void EnDecrypto::join_shuffled_files() const {
std::vector shFile(n_threads);
std::ofstream shdFile(PCKD_FNAME); // Output Shuffled file
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { shdFile << content; };
content += (char)125;
content += (!stop_shuffle ? (char)128 : (char)129);
// Input files
- for (byte t = n_threads; t--;) shFile[t].open(SH_FNAME + std::to_string(t));
+ for (byte t = n_threads; t--;) {
+ shFile[t].open(std::format("{}{}", SH_FNAME, static_cast(t)));
+ }
while (!shFile[0].eof()) {
for (byte t = 0; t != n_threads; ++t) {
bool prevLineNotThrID = false; // If previous line was "THR=" or not
for (std::string line; std::getline(shFile[t], line).good() &&
- line != THR_ID_HDR + std::to_string(t);) {
- if (prevLineNotThrID) content += '\n';
+ line != std::format("{}{}", THR_ID_HDR, static_cast(t));) {
+ if (prevLineNotThrID) {
+ content += '\n';
+ }
content += line;
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
prevLineNotThrID = true;
@@ -1187,8 +1048,7 @@ void EnDecrypto::join_shuffled_files() const {
shdFile.close();
for (byte t = n_threads; t--;) {
shFile[t].close();
- std::string shFileName = SH_FNAME;
- shFileName += std::to_string(t);
+ std::string shFileName = std::format("{}{}", SH_FNAME, static_cast(t));
std::remove(shFileName.c_str());
}
}
@@ -1199,9 +1059,11 @@ void EnDecrypto::join_shuffled_files() const {
void EnDecrypto::join_unshuffled_files() const {
byte t; // For threads
std::vector ushdFile(n_threads);
- for (t = n_threads; t--;) ushdFile[t].open(USH_FNAME + std::to_string(t));
+ for (t = n_threads; t--;) {
+ ushdFile[t].open(std::format("{}{}", USH_FNAME, static_cast(t)));
+ }
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { std::cout << content; };
while (!ushdFile[0].eof()) {
@@ -1209,14 +1071,16 @@ void EnDecrypto::join_unshuffled_files() const {
bool prevLineNotThrID = false; // If previous line was "THR=" or not
for (std::string line; std::getline(ushdFile[t], line).good() &&
- line != THR_ID_HDR + std::to_string(t);) {
- if (prevLineNotThrID) content += '\n';
+ line != std::format("{}{}", THR_ID_HDR, static_cast(t));) {
+ if (prevLineNotThrID) {
+ content += '\n';
+ }
content += line;
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
prevLineNotThrID = true;
@@ -1228,8 +1092,7 @@ void EnDecrypto::join_unshuffled_files() const {
// Close/delete input/output files
for (t = n_threads; t--;) {
ushdFile[t].close();
- std::string ushdFileName = USH_FNAME;
- ushdFileName += std::to_string(t);
+ std::string ushdFileName = std::format("{}{}", USH_FNAME, static_cast(t));
std::remove(ushdFileName.c_str());
}
-}
\ No newline at end of file
+}
diff --git a/src/endecrypto.hpp b/src/endecrypto.hpp
index c345abf..4366155 100644
--- a/src/endecrypto.hpp
+++ b/src/endecrypto.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file endecrypto.hpp
- * @brief Encryption/Decryption
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file endecrypto.hpp
+ * @brief Encryption/Decryption
*/
#ifndef CRYFA_ENDECRYPTO_H
@@ -18,8 +18,7 @@ namespace cryfa {
class EnDecrypto;
// Type define
-typedef void (EnDecrypto::*packFP_t)(std::string&, const std::string&,
- const htbl_t&);
+typedef void (EnDecrypto::*packFP_t)(std::string&, const std::string&, const htbl_t&);
typedef void (EnDecrypto::*unpackFP_t)(std::string&, std::string::iterator&,
const std::vector&);
@@ -38,10 +37,8 @@ class EnDecrypto : public Security {
void pack_5to1(std::string&, const std::string&, const htbl_t&);
void pack_7to1(std::string&, const std::string&, const htbl_t&);
void pack_1to1(std::string&, const std::string&, const htbl_t&);
- void unpack_2B(std::string&, std::string::iterator&,
- const std::vector&);
- void unpack_1B(std::string&, std::string::iterator&,
- const std::vector&);
+ void unpack_2B(std::string&, std::string::iterator&, const std::vector&);
+ void unpack_1B(std::string&, std::string::iterator&, const std::vector&);
void shuffle_file();
void unshuffle_file();
@@ -57,21 +54,16 @@ class EnDecrypto : public Security {
void build_hash_tbl(htbl_t&, const std::string&, short);
void build_unpack_tbl(std::vector&, const std::string&, u16);
- auto dna_pack_idx(const std::string&) -> byte;
- auto large_pack_idx(const std::string&, const htbl_t&) -> u16;
void pack_seq(std::string&, const std::string&);
void unpack_seq(std::string&, std::string::iterator&);
- void unpack_large(std::string&, std::string::iterator&, char,
- const std::vector&);
- void join_packed_files(const std::string&, const std::string&, char,
- bool) const;
+ void unpack_large(std::string&, std::string::iterator&, char, const std::vector&);
+ void join_packed_files(const std::string&, const std::string&, char, bool) const;
void join_unpacked_files() const;
void join_shuffled_files() const;
void join_unshuffled_files() const;
private:
- void pack_large(std::string&, const std::string&, const std::string&,
- const htbl_t&);
+ void pack_large(std::string&, const std::string&, const std::string&, const htbl_t&);
auto penalty_sym(char) const -> char;
void shuffle_block(byte);
void unshuffle_block(byte);
@@ -82,76 +74,65 @@ class EnDecrypto : public Security {
* @hideinitializer
*/
static const htbl_t DNA_MAP{
- {"AAA", 0}, {"AAC", 1}, {"AAG", 2}, {"AAT", 3}, {"AAN", 4},
- {"AAX", 5}, {"ACA", 6}, {"ACC", 7}, {"ACG", 8}, {"ACT", 9},
- {"ACN", 10}, {"ACX", 11}, {"AGA", 12}, {"AGC", 13}, {"AGG", 14},
- {"AGT", 15}, {"AGN", 16}, {"AGX", 17}, {"ATA", 18}, {"ATC", 19},
- {"ATG", 20}, {"ATT", 21}, {"ATN", 22}, {"ATX", 23}, {"ANA", 24},
- {"ANC", 25}, {"ANG", 26}, {"ANT", 27}, {"ANN", 28}, {"ANX", 29},
- {"AXA", 30}, {"AXC", 31}, {"AXG", 32}, {"AXT", 33}, {"AXN", 34},
- {"AXX", 35}, {"CAA", 36}, {"CAC", 37}, {"CAG", 38}, {"CAT", 39},
- {"CAN", 40}, {"CAX", 41}, {"CCA", 42}, {"CCC", 43}, {"CCG", 44},
- {"CCT", 45}, {"CCN", 46}, {"CCX", 47}, {"CGA", 48}, {"CGC", 49},
- {"CGG", 50}, {"CGT", 51}, {"CGN", 52}, {"CGX", 53}, {"CTA", 54},
- {"CTC", 55}, {"CTG", 56}, {"CTT", 57}, {"CTN", 58}, {"CTX", 59},
- {"CNA", 60}, {"CNC", 61}, {"CNG", 62}, {"CNT", 63}, {"CNN", 64},
- {"CNX", 65}, {"CXA", 66}, {"CXC", 67}, {"CXG", 68}, {"CXT", 69},
- {"CXN", 70}, {"CXX", 71}, {"GAA", 72}, {"GAC", 73}, {"GAG", 74},
- {"GAT", 75}, {"GAN", 76}, {"GAX", 77}, {"GCA", 78}, {"GCC", 79},
- {"GCG", 80}, {"GCT", 81}, {"GCN", 82}, {"GCX", 83}, {"GGA", 84},
- {"GGC", 85}, {"GGG", 86}, {"GGT", 87}, {"GGN", 88}, {"GGX", 89},
- {"GTA", 90}, {"GTC", 91}, {"GTG", 92}, {"GTT", 93}, {"GTN", 94},
- {"GTX", 95}, {"GNA", 96}, {"GNC", 97}, {"GNG", 98}, {"GNT", 99},
- {"GNN", 100}, {"GNX", 101}, {"GXA", 102}, {"GXC", 103}, {"GXG", 104},
- {"GXT", 105}, {"GXN", 106}, {"GXX", 107}, {"TAA", 108}, {"TAC", 109},
- {"TAG", 110}, {"TAT", 111}, {"TAN", 112}, {"TAX", 113}, {"TCA", 114},
- {"TCC", 115}, {"TCG", 116}, {"TCT", 117}, {"TCN", 118}, {"TCX", 119},
- {"TGA", 120}, {"TGC", 121}, {"TGG", 122}, {"TGT", 123}, {"TGN", 124},
- {"TGX", 125}, {"TTA", 126}, {"TTC", 127}, {"TTG", 128}, {"TTT", 129},
- {"TTN", 130}, {"TTX", 131}, {"TNA", 132}, {"TNC", 133}, {"TNG", 134},
- {"TNT", 135}, {"TNN", 136}, {"TNX", 137}, {"TXA", 138}, {"TXC", 139},
- {"TXG", 140}, {"TXT", 141}, {"TXN", 142}, {"TXX", 143}, {"NAA", 144},
- {"NAC", 145}, {"NAG", 146}, {"NAT", 147}, {"NAN", 148}, {"NAX", 149},
- {"NCA", 150}, {"NCC", 151}, {"NCG", 152}, {"NCT", 153}, {"NCN", 154},
- {"NCX", 155}, {"NGA", 156}, {"NGC", 157}, {"NGG", 158}, {"NGT", 159},
- {"NGN", 160}, {"NGX", 161}, {"NTA", 162}, {"NTC", 163}, {"NTG", 164},
- {"NTT", 165}, {"NTN", 166}, {"NTX", 167}, {"NNA", 168}, {"NNC", 169},
- {"NNG", 170}, {"NNT", 171}, {"NNN", 172}, {"NNX", 173}, {"NXA", 174},
- {"NXC", 175}, {"NXG", 176}, {"NXT", 177}, {"NXN", 178}, {"NXX", 179},
- {"XAA", 180}, {"XAC", 181}, {"XAG", 182}, {"XAT", 183}, {"XAN", 184},
- {"XAX", 185}, {"XCA", 186}, {"XCC", 187}, {"XCG", 188}, {"XCT", 189},
- {"XCN", 190}, {"XCX", 191}, {"XGA", 192}, {"XGC", 193}, {"XGG", 194},
- {"XGT", 195}, {"XGN", 196}, {"XGX", 197}, {"XTA", 198}, {"XTC", 199},
- {"XTG", 200}, {"XTT", 201}, {"XTN", 202}, {"XTX", 203}, {"XNA", 204},
- {"XNC", 205}, {"XNG", 206}, {"XNT", 207}, {"XNN", 208}, {"XNX", 209},
- {"XXA", 210}, {"XXC", 211}, {"XXG", 212}, {"XXT", 213}, {"XXN", 214},
- {"XXX", 215}};
+ {"AAA", 0}, {"AAC", 1}, {"AAG", 2}, {"AAT", 3}, {"AAN", 4}, {"AAX", 5},
+ {"ACA", 6}, {"ACC", 7}, {"ACG", 8}, {"ACT", 9}, {"ACN", 10}, {"ACX", 11},
+ {"AGA", 12}, {"AGC", 13}, {"AGG", 14}, {"AGT", 15}, {"AGN", 16}, {"AGX", 17},
+ {"ATA", 18}, {"ATC", 19}, {"ATG", 20}, {"ATT", 21}, {"ATN", 22}, {"ATX", 23},
+ {"ANA", 24}, {"ANC", 25}, {"ANG", 26}, {"ANT", 27}, {"ANN", 28}, {"ANX", 29},
+ {"AXA", 30}, {"AXC", 31}, {"AXG", 32}, {"AXT", 33}, {"AXN", 34}, {"AXX", 35},
+ {"CAA", 36}, {"CAC", 37}, {"CAG", 38}, {"CAT", 39}, {"CAN", 40}, {"CAX", 41},
+ {"CCA", 42}, {"CCC", 43}, {"CCG", 44}, {"CCT", 45}, {"CCN", 46}, {"CCX", 47},
+ {"CGA", 48}, {"CGC", 49}, {"CGG", 50}, {"CGT", 51}, {"CGN", 52}, {"CGX", 53},
+ {"CTA", 54}, {"CTC", 55}, {"CTG", 56}, {"CTT", 57}, {"CTN", 58}, {"CTX", 59},
+ {"CNA", 60}, {"CNC", 61}, {"CNG", 62}, {"CNT", 63}, {"CNN", 64}, {"CNX", 65},
+ {"CXA", 66}, {"CXC", 67}, {"CXG", 68}, {"CXT", 69}, {"CXN", 70}, {"CXX", 71},
+ {"GAA", 72}, {"GAC", 73}, {"GAG", 74}, {"GAT", 75}, {"GAN", 76}, {"GAX", 77},
+ {"GCA", 78}, {"GCC", 79}, {"GCG", 80}, {"GCT", 81}, {"GCN", 82}, {"GCX", 83},
+ {"GGA", 84}, {"GGC", 85}, {"GGG", 86}, {"GGT", 87}, {"GGN", 88}, {"GGX", 89},
+ {"GTA", 90}, {"GTC", 91}, {"GTG", 92}, {"GTT", 93}, {"GTN", 94}, {"GTX", 95},
+ {"GNA", 96}, {"GNC", 97}, {"GNG", 98}, {"GNT", 99}, {"GNN", 100}, {"GNX", 101},
+ {"GXA", 102}, {"GXC", 103}, {"GXG", 104}, {"GXT", 105}, {"GXN", 106}, {"GXX", 107},
+ {"TAA", 108}, {"TAC", 109}, {"TAG", 110}, {"TAT", 111}, {"TAN", 112}, {"TAX", 113},
+ {"TCA", 114}, {"TCC", 115}, {"TCG", 116}, {"TCT", 117}, {"TCN", 118}, {"TCX", 119},
+ {"TGA", 120}, {"TGC", 121}, {"TGG", 122}, {"TGT", 123}, {"TGN", 124}, {"TGX", 125},
+ {"TTA", 126}, {"TTC", 127}, {"TTG", 128}, {"TTT", 129}, {"TTN", 130}, {"TTX", 131},
+ {"TNA", 132}, {"TNC", 133}, {"TNG", 134}, {"TNT", 135}, {"TNN", 136}, {"TNX", 137},
+ {"TXA", 138}, {"TXC", 139}, {"TXG", 140}, {"TXT", 141}, {"TXN", 142}, {"TXX", 143},
+ {"NAA", 144}, {"NAC", 145}, {"NAG", 146}, {"NAT", 147}, {"NAN", 148}, {"NAX", 149},
+ {"NCA", 150}, {"NCC", 151}, {"NCG", 152}, {"NCT", 153}, {"NCN", 154}, {"NCX", 155},
+ {"NGA", 156}, {"NGC", 157}, {"NGG", 158}, {"NGT", 159}, {"NGN", 160}, {"NGX", 161},
+ {"NTA", 162}, {"NTC", 163}, {"NTG", 164}, {"NTT", 165}, {"NTN", 166}, {"NTX", 167},
+ {"NNA", 168}, {"NNC", 169}, {"NNG", 170}, {"NNT", 171}, {"NNN", 172}, {"NNX", 173},
+ {"NXA", 174}, {"NXC", 175}, {"NXG", 176}, {"NXT", 177}, {"NXN", 178}, {"NXX", 179},
+ {"XAA", 180}, {"XAC", 181}, {"XAG", 182}, {"XAT", 183}, {"XAN", 184}, {"XAX", 185},
+ {"XCA", 186}, {"XCC", 187}, {"XCG", 188}, {"XCT", 189}, {"XCN", 190}, {"XCX", 191},
+ {"XGA", 192}, {"XGC", 193}, {"XGG", 194}, {"XGT", 195}, {"XGN", 196}, {"XGX", 197},
+ {"XTA", 198}, {"XTC", 199}, {"XTG", 200}, {"XTT", 201}, {"XTN", 202}, {"XTX", 203},
+ {"XNA", 204}, {"XNC", 205}, {"XNG", 206}, {"XNT", 207}, {"XNN", 208}, {"XNX", 209},
+ {"XXA", 210}, {"XXC", 211}, {"XXG", 212}, {"XXT", 213}, {"XXN", 214}, {"XXX", 215}};
/**
* @brief Lookup table for unpacking -- 216 elements
* @hideinitializer
*/
static const std::vector DNA_UNPACK{
- "AAA", "AAC", "AAG", "AAT", "AAN", "AAX", "ACA", "ACC", "ACG", "ACT", "ACN",
- "ACX", "AGA", "AGC", "AGG", "AGT", "AGN", "AGX", "ATA", "ATC", "ATG", "ATT",
- "ATN", "ATX", "ANA", "ANC", "ANG", "ANT", "ANN", "ANX", "AXA", "AXC", "AXG",
- "AXT", "AXN", "AXX", "CAA", "CAC", "CAG", "CAT", "CAN", "CAX", "CCA", "CCC",
- "CCG", "CCT", "CCN", "CCX", "CGA", "CGC", "CGG", "CGT", "CGN", "CGX", "CTA",
- "CTC", "CTG", "CTT", "CTN", "CTX", "CNA", "CNC", "CNG", "CNT", "CNN", "CNX",
- "CXA", "CXC", "CXG", "CXT", "CXN", "CXX", "GAA", "GAC", "GAG", "GAT", "GAN",
- "GAX", "GCA", "GCC", "GCG", "GCT", "GCN", "GCX", "GGA", "GGC", "GGG", "GGT",
- "GGN", "GGX", "GTA", "GTC", "GTG", "GTT", "GTN", "GTX", "GNA", "GNC", "GNG",
- "GNT", "GNN", "GNX", "GXA", "GXC", "GXG", "GXT", "GXN", "GXX", "TAA", "TAC",
- "TAG", "TAT", "TAN", "TAX", "TCA", "TCC", "TCG", "TCT", "TCN", "TCX", "TGA",
- "TGC", "TGG", "TGT", "TGN", "TGX", "TTA", "TTC", "TTG", "TTT", "TTN", "TTX",
- "TNA", "TNC", "TNG", "TNT", "TNN", "TNX", "TXA", "TXC", "TXG", "TXT", "TXN",
- "TXX", "NAA", "NAC", "NAG", "NAT", "NAN", "NAX", "NCA", "NCC", "NCG", "NCT",
- "NCN", "NCX", "NGA", "NGC", "NGG", "NGT", "NGN", "NGX", "NTA", "NTC", "NTG",
- "NTT", "NTN", "NTX", "NNA", "NNC", "NNG", "NNT", "NNN", "NNX", "NXA", "NXC",
- "NXG", "NXT", "NXN", "NXX", "XAA", "XAC", "XAG", "XAT", "XAN", "XAX", "XCA",
- "XCC", "XCG", "XCT", "XCN", "XCX", "XGA", "XGC", "XGG", "XGT", "XGN", "XGX",
- "XTA", "XTC", "XTG", "XTT", "XTN", "XTX", "XNA", "XNC", "XNG", "XNT", "XNN",
- "XNX", "XXA", "XXC", "XXG", "XXT", "XXN", "XXX"};
+ "AAA", "AAC", "AAG", "AAT", "AAN", "AAX", "ACA", "ACC", "ACG", "ACT", "ACN", "ACX", "AGA",
+ "AGC", "AGG", "AGT", "AGN", "AGX", "ATA", "ATC", "ATG", "ATT", "ATN", "ATX", "ANA", "ANC",
+ "ANG", "ANT", "ANN", "ANX", "AXA", "AXC", "AXG", "AXT", "AXN", "AXX", "CAA", "CAC", "CAG",
+ "CAT", "CAN", "CAX", "CCA", "CCC", "CCG", "CCT", "CCN", "CCX", "CGA", "CGC", "CGG", "CGT",
+ "CGN", "CGX", "CTA", "CTC", "CTG", "CTT", "CTN", "CTX", "CNA", "CNC", "CNG", "CNT", "CNN",
+ "CNX", "CXA", "CXC", "CXG", "CXT", "CXN", "CXX", "GAA", "GAC", "GAG", "GAT", "GAN", "GAX",
+ "GCA", "GCC", "GCG", "GCT", "GCN", "GCX", "GGA", "GGC", "GGG", "GGT", "GGN", "GGX", "GTA",
+ "GTC", "GTG", "GTT", "GTN", "GTX", "GNA", "GNC", "GNG", "GNT", "GNN", "GNX", "GXA", "GXC",
+ "GXG", "GXT", "GXN", "GXX", "TAA", "TAC", "TAG", "TAT", "TAN", "TAX", "TCA", "TCC", "TCG",
+ "TCT", "TCN", "TCX", "TGA", "TGC", "TGG", "TGT", "TGN", "TGX", "TTA", "TTC", "TTG", "TTT",
+ "TTN", "TTX", "TNA", "TNC", "TNG", "TNT", "TNN", "TNX", "TXA", "TXC", "TXG", "TXT", "TXN",
+ "TXX", "NAA", "NAC", "NAG", "NAT", "NAN", "NAX", "NCA", "NCC", "NCG", "NCT", "NCN", "NCX",
+ "NGA", "NGC", "NGG", "NGT", "NGN", "NGX", "NTA", "NTC", "NTG", "NTT", "NTN", "NTX", "NNA",
+ "NNC", "NNG", "NNT", "NNN", "NNX", "NXA", "NXC", "NXG", "NXT", "NXN", "NXX", "XAA", "XAC",
+ "XAG", "XAT", "XAN", "XAX", "XCA", "XCC", "XCG", "XCT", "XCN", "XCX", "XGA", "XGC", "XGG",
+ "XGT", "XGN", "XGX", "XTA", "XTC", "XTG", "XTT", "XTN", "XTX", "XNA", "XNC", "XNG", "XNT",
+ "XNN", "XNX", "XXA", "XXC", "XXG", "XXT", "XXN", "XXX"};
} // namespace cryfa
-#endif // CRYFA_ENDECRYPTO_H
\ No newline at end of file
+#endif // CRYFA_ENDECRYPTO_H
diff --git a/src/fasta.cpp b/src/fasta.cpp
index b7eaf88..e4e0d98 100644
--- a/src/fasta.cpp
+++ b/src/fasta.cpp
@@ -1,75 +1,180 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file fasta.cpp
- * @brief Compression/Decompression of FASTA
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file fasta.cpp
+ * @brief Compression/Decompression of FASTA
*/
#include "fasta.hpp"
#include
+#include
+#include
#include
#include // setw, std::setprecision
#include
+#include
+#include
#include
+#include
+#include "ordered_pipeline.hpp"
+#include "plaintext_stream.hpp"
#include "string.hpp"
#include "time.hpp"
using namespace cryfa;
-std::mutex mutxFA; /**< @brief Mutex */
+std::mutex mutxFA;
+
+namespace {
+struct FastaRecord {
+ std::string header;
+ std::vector sequence_lines;
+};
+
+struct FastaChunk {
+ std::vector records;
+};
+} // namespace
/**
* @brief Compress
*/
void Fasta::compress() {
- if (!verbose) std::cerr << bold("[+]") << " Compacting ...";
+ if (!verbose) {
+ std::cerr << bold("[+]") << " Compacting ...";
+ }
const auto start = now(); // Start timer
- std::vector arrThr(n_threads);
std::string headers;
packfa_s pkStruct; // Collection of inputs to pass to pack...
- if (verbose)
+ if (verbose) {
std::cerr << bold("[+]") << " Calculating no. unique characters ...";
+ }
// Gather different chars in all headers and max length in all bases
gather_h_bs(headers);
// Show number of different chars in headers -- ignore '>'=62
- if (verbose)
- std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => "
- << headers.length() << " \n";
+ if (verbose) {
+ std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " << headers.length()
+ << " \n";
+ }
// Set Hash table and pack function
set_hashTbl_packFn(pkStruct, headers);
- // Distribute file among threads, for reading and packing
- for (byte t = 0; t != n_threads; ++t)
- arrThr[t] = std::thread(&Fasta::pack, this, pkStruct, t);
- for (auto& thr : arrThr)
- if (thr.joinable()) thr.join();
+ auto read_chunk = [this, in = std::ifstream(in_file),
+ pending_header = std::string{}]() mutable -> std::optional {
+ FastaChunk chunk;
+ std::string line;
+ u64 chunk_bytes = 0;
+
+ if (pending_header.empty()) {
+ while (std::getline(in, line)) {
+ if (!line.empty() && line.front() == '>') {
+ pending_header = std::move(line);
+ break;
+ }
+ }
+ }
- if (verbose) {
- std::cerr << "\r" << bold("[+]") << " Shuffling done in "
- << hms(now() - shuffle_timer);
- std::cerr << bold("[+]") << " Compacting ...";
- }
+ if (pending_header.empty()) {
+ return std::nullopt;
+ }
- // Join partially packed and/or shuffled files
- join_packed_files(headers, "", 'A', false);
+ while (!pending_header.empty()) {
+ FastaRecord record;
+ record.header = std::move(pending_header);
+ pending_header.clear();
+ chunk_bytes += record.header.size() + 1;
- const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Compacting done in "
- << hms(finish - start);
+ while (std::getline(in, line)) {
+ if (!line.empty() && line.front() == '>') {
+ pending_header = std::move(line);
+ break;
+ }
+
+ chunk_bytes += line.size() + 1;
+ record.sequence_lines.push_back(std::move(line));
+ }
+
+ chunk.records.push_back(std::move(record));
+ if (chunk_bytes >= CHUNK_TARGET_SIZE) {
+ break;
+ }
+ }
+
+ return chunk;
+ };
+
+ auto pack_chunk = [this, pkStruct](FastaChunk chunk) {
+ packFP_t packHdr = pkStruct.packHdrFP;
+ std::string context;
+ context.reserve(CHUNK_TARGET_SIZE);
+ std::string seq;
+ seq.reserve(CHUNK_TARGET_SIZE);
+
+ for (const FastaRecord& record : chunk.records) {
+ context += (char)253;
+ (this->*packHdr)(context, record.header.substr(1), HdrMap);
+ context += (char)254;
+
+ seq.clear();
+ for (const std::string& line : record.sequence_lines) {
+ seq += line;
+ seq += (char)252;
+ }
+ if (!seq.empty()) {
+ seq.pop_back();
+ pack_seq(context, seq);
+ context += (char)254;
+ }
+ }
+
+ if (!stop_shuffle) {
+ mutxFA.lock(); //----------------------------------------------------
+ if (verbose && shuffInProg) {
+ std::cerr << bold("[+]") << " Shuffling ...";
+ shuffle_timer = now();
+ }
+ shuffInProg = false;
+ mutxFA.unlock(); //--------------------------------------------------
+
+ shuffle(context);
+ }
- // Cout encrypted content
- encrypt();
+ std::string packed = std::format("{}{}{}", (char)253, context.size(), (char)254);
+ packed += context;
+ return packed;
+ };
+
+ encrypt_stream([&](const PlaintextSink& emit) {
+ std::string header;
+ header.reserve(headers.size() + 3);
+ header += (char)127;
+ header += (!stop_shuffle ? (char)128 : (char)129);
+ header += headers;
+ header += (char)254;
+ emit(header);
+
+ run_ordered_pipeline(n_threads, read_chunk, pack_chunk, emit);
+ emit(std::string(1, (char)252));
+
+ if (verbose && !stop_shuffle) {
+ std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(now() - shuffle_timer);
+ std::cerr << bold("[+]") << " Compacting ...";
+ }
+
+ const auto finish = now(); // Stop timer
+ std::cerr << "\r" << bold("[+]") << " Compacting done in " << hms(finish - start);
+ });
}
/**
* @brief Set hash table and pack function
- * @param[out] pkStruct Pack structure
- * @param[in] headers Headers
+ * @param[out] pkStruct Pack structure
+ * @param headers Headers
*/
void Fasta::set_hashTbl_packFn(packfa_s& pkStruct, const std::string& headers) {
const size_t headersLen = headers.length();
@@ -110,16 +215,21 @@ void Fasta::set_hashTbl_packFn(packfa_s& pkStruct, const std::string& headers) {
/**
* @brief Pack. '>' at the beginning of headers is not packed
- * @param pkStruct Pack structure
- * @param threadID Thread ID
+ * @param pkStruct Pack structure
+ * @param threadID Thread ID
*/
void Fasta::pack(const packfa_s& pkStruct, byte threadID) {
packFP_t packHdr = pkStruct.packHdrFP; // Function pointer
std::ifstream in(in_file);
std::string line, context, seq;
- std::ofstream pkfile(PK_FNAME + std::to_string(threadID), std::ios_base::app);
+ context.reserve(CHUNK_TARGET_SIZE);
+ seq.reserve(CHUNK_TARGET_SIZE);
+ std::ofstream pkfile(std::format("{}{}", PK_FNAME, static_cast(threadID)),
+ std::ios_base::app);
// Lines ignored at the beginning
- for (u64 l = (u64)threadID * BlockLine; l--;) IGNORE_THIS_LINE(in);
+ for (u64 l = (u64)threadID * BlockLine; l--;) {
+ IGNORE_THIS_LINE(in);
+ }
while (in.peek() != EOF) {
context.clear();
@@ -176,18 +286,17 @@ void Fasta::pack(const packfa_s& pkStruct, byte threadID) {
}
// For unshuffling: insert the size of packed context in the beginning
- std::string contextSize;
- contextSize += (char)253;
- contextSize += std::to_string(context.size());
- contextSize += (char)254;
+ std::string contextSize = std::format("{}{}{}", (char)253, context.size(), (char)254);
context.insert(0, contextSize);
// Write header containing threadID for each partially packed file
- pkfile << THR_ID_HDR << std::to_string(threadID) << '\n';
+ pkfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
pkfile << context << '\n';
// Ignore to go to the next related chunk
- for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) IGNORE_THIS_LINE(in);
+ for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) {
+ IGNORE_THIS_LINE(in);
+ }
}
pkfile.close();
@@ -195,9 +304,8 @@ void Fasta::pack(const packfa_s& pkStruct, byte threadID) {
}
/**
- * @brief Gather chars of all headers & max length of DNA bases lines,
- * excluding '>'
- * @param[out] headers Chars of all headers
+ * @brief Gather chars of all headers & max length of DNA bases lines, excluding '>'
+ * @param[out] headers Chars of all headers
*/
void Fasta::gather_h_bs(std::string& headers) {
u32 maxBLen = 0; // Max length of each line of bases
@@ -207,132 +315,211 @@ void Fasta::gather_h_bs(std::string& headers) {
std::ifstream in(in_file);
std::string line;
while (getline(in, line).good()) {
- if (line[0] == '>')
- for (char c : line) hChars[c] = true;
- else if (line.size() > maxBLen)
+ if (line[0] == '>') {
+ for (char c : line) {
+ hChars[c] = true;
+ }
+ } else if (line.size() > maxBLen) {
maxBLen = (u32)line.size();
+ }
}
in.close();
// Number of lines read from input file while compression
- BlockLine = (u32)(BLOCK_SIZE / maxBLen);
- if (!BlockLine) BlockLine = 2;
+ BlockLine = (u32)(CHUNK_TARGET_SIZE / maxBLen);
+ if (!BlockLine) {
+ BlockLine = 2;
+ }
// Gather the characters -- Ignore '>'=62 for headers
- for (byte i = 32; i != 62; ++i)
- if (*(hChars + i)) headers += i;
- for (byte i = 63; i != 127; ++i)
- if (*(hChars + i)) headers += i;
+ for (byte i = 32; i != 62; ++i) {
+ if (*(hChars + i)) {
+ headers += i;
+ }
+ }
+ for (byte i = 63; i != 127; ++i) {
+ if (*(hChars + i)) {
+ headers += i;
+ }
+ }
}
/**
* @brief Decompress
*/
void Fasta::decompress() {
- if (!verbose) std::cerr << bold("[+]") << " Decompressing ...";
+ if (!verbose) {
+ std::cerr << bold("[+]") << " Decompressing ...";
+ }
const auto start = now(); // Start timer
- char c; // Chars in file
+ PlaintextStream plaintext;
+ std::exception_ptr decrypt_error;
+ std::thread decrypt_thread([&]() {
+ try {
+ decrypt_stream([&](std::string_view decrypted) { plaintext.push(decrypted); });
+ plaintext.close();
+ } catch (...) {
+ decrypt_error = std::current_exception();
+ plaintext.fail(decrypt_error);
+ }
+ });
+
+ auto join_decrypt = [&]() {
+ if (decrypt_thread.joinable()) {
+ decrypt_thread.join();
+ }
+ if (decrypt_error) {
+ std::rethrow_exception(decrypt_error);
+ }
+ };
+
std::string headers;
unpackfa_s upkStruct; // Collection of inputs to pass to unpack...
- std::vector arrThread(n_threads); // Array of threads
- std::ifstream in(DEC_FNAME);
- in.ignore(1); // Jump over decText[0]==(char) 127
- in.get(c);
- shuffled = (c == (char)128); // Check if file had been shuffled
- if (verbose)
- std::cerr << bold("[+]") << " Extracting no. unique characters ...";
- while (in.get(c) && c != (char)254) headers += c;
- if (verbose) // Show number of different chars in headers -- Ignore '>'=62
- std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => "
- << headers.length() << " \n";
-
- // Header -- Set unpack table and unpack function
- set_unpackTbl_unpackFn(upkStruct, headers);
-
- // Distribute file among threads, for reading and unpacking
- using unpackHFP = void (Fasta::*)(const unpackfa_s&, byte);
- unpackHFP unpackH =
- (headers.length() <= MAX_C5) ? &Fasta::unpack_hS : &Fasta::unpack_hL;
-
- for (byte t = 0; t != n_threads; ++t) {
- in.get(c);
- if (c == (char)253) {
- std::string chunkSizeStr; // Chunk size (std::string) -- For unshuffling
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
- const auto offset =
- stoull(chunkSizeStr); // To traverse decompressed file
-
- upkStruct.begPos = in.tellg();
- upkStruct.chunkSize = offset;
-
- arrThread[t] = std::thread(unpackH, this, upkStruct, t);
-
- // Jump to the beginning of the next chunk
- in.seekg((std::streamoff)offset, std::ios_base::cur);
- }
- // End of file
- if (in.peek() == 252) break;
- }
- // Join threads
- for (auto& thr : arrThread)
- if (thr.joinable()) thr.join();
+ try {
+ const auto file_type = plaintext.get();
+ if (!file_type || *file_type != (char)127) {
+ throw std::runtime_error("corrupted file.");
+ }
- if (verbose) {
- std::cerr << "\r" << bold("[+]") << " Unshuffling done in "
- << hms(now() - shuffle_timer);
- std::cerr << bold("[+]") << " Decompressing ...";
- }
+ const auto shuffle_flag = plaintext.get();
+ if (!shuffle_flag || (*shuffle_flag != (char)128 && *shuffle_flag != (char)129)) {
+ throw std::runtime_error("corrupted file.");
+ }
- // Close/delete decrypted file
- in.close();
- const std::string decFileName = DEC_FNAME;
- std::remove(decFileName.c_str());
+ shuffled = (*shuffle_flag == (char)128); // Check if file had been shuffled
+ if (verbose) {
+ std::cerr << bold("[+]") << " Extracting no. unique characters ...";
+ }
+ if (!plaintext.read_until((char)254, headers)) {
+ throw std::runtime_error("corrupted file.");
+ }
+ if (verbose) { // Show number of different chars in headers -- Ignore '>'=62
+ std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " << headers.length()
+ << " \n";
+ }
- // Join partially unpacked files
- join_unpacked_files();
+ // Header -- Set unpack table and unpack function
+ set_unpackTbl_unpackFn(upkStruct, headers);
+ const bool has_small_header = headers.length() <= MAX_C5;
+
+ auto read_chunk = [&]() -> std::optional {
+ const auto marker = plaintext.get();
+ if (!marker || *marker == (char)252) {
+ return std::nullopt;
+ }
+ if (*marker != (char)253) {
+ throw std::runtime_error("corrupted file.");
+ }
+
+ std::string chunk_size_str;
+ if (!plaintext.read_until((char)254, chunk_size_str) || chunk_size_str.empty()) {
+ throw std::runtime_error("corrupted file.");
+ }
+
+ std::string chunk;
+ if (!plaintext.read_bytes(std::stoull(chunk_size_str), chunk)) {
+ throw std::runtime_error("corrupted file.");
+ }
+ return chunk;
+ };
+
+ auto unpack_chunk = [this, upkStruct, has_small_header](std::string decText) mutable {
+ if (decText.empty()) {
+ return std::string{};
+ }
+
+ auto i = decText.begin();
+
+ // Unshuffle
+ if (shuffled) {
+ mutxFA.lock(); //--------------------------------------------------
+ if (verbose && shuffInProg) {
+ std::cerr << bold("[+]") << " Unshuffling ...";
+ shuffle_timer = now();
+ }
+ shuffInProg = false;
+ mutxFA.unlock(); //------------------------------------------------
+
+ unshuffle(i, decText.size());
+ }
+
+ std::string upkhdrOut, upkSeqOut;
+ std::string content;
+ content.reserve(decText.size() * 2);
+ do {
+ if (*i == (char)253) { // Hdr
+ if (has_small_header) {
+ (this->*upkStruct.unpackHdrFP)(upkhdrOut, ++i, upkStruct.hdrUnpack);
+ } else {
+ unpack_large(upkhdrOut, ++i, upkStruct.XChar_hdr, upkStruct.hdrUnpack);
+ }
+ content += std::format(">{}\n", upkhdrOut);
+ } else { // Seq
+ unpack_seq(upkSeqOut, i);
+ content += std::format("{}\n", upkSeqOut);
+ }
+ } while (++i != decText.end());
+
+ return content;
+ };
+
+ run_ordered_pipeline(n_threads, read_chunk, unpack_chunk,
+ [](const std::string& output) { std::cout << output; });
+
+ if (verbose && shuffled) {
+ std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(now() - shuffle_timer);
+ std::cerr << bold("[+]") << " Decompressing ...";
+ }
+ } catch (...) {
+ plaintext.fail(std::current_exception());
+ if (decrypt_thread.joinable()) {
+ decrypt_thread.join();
+ }
+ throw;
+ }
+
+ join_decrypt();
const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Decompressing done in "
- << hms(finish - start);
+ std::cerr << "\r" << bold("[+]") << " Decompressing done in " << hms(finish - start);
}
/**
- * @brief Set unpack table and unpack function
- * @param[out] upkStruct Unpack structure
- * @param[in] headers Headers
+ * @brief Set unpack table and unpack function
+ * @param[out] upkStruct Unpack structure
+ * @param headers Headers
*/
-void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct,
- const std::string& headers) {
+void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct, const std::string& headers) {
const size_t headersLen = headers.length();
u16 keyLen_hdr = 0;
- if (headersLen > MAX_C5)
+ if (headersLen > MAX_C5) {
keyLen_hdr = KEYLEN_C5;
- else if (headersLen > MAX_C4) { // Cat 5
+ } else if (headersLen > MAX_C4) { // Cat 5
upkStruct.unpackHdrFP = &EnDecrypto::unpack_2B;
keyLen_hdr = KEYLEN_C5;
} else {
upkStruct.unpackHdrFP = &EnDecrypto::unpack_1B;
- if (headersLen > MAX_C3)
+ if (headersLen > MAX_C3) {
keyLen_hdr = KEYLEN_C4; // Cat 4
- else if (headersLen == MAX_C3 || headersLen == MID_C3 ||
- headersLen == MIN_C3)
+ } else if (headersLen == MAX_C3 || headersLen == MID_C3 || headersLen == MIN_C3) {
keyLen_hdr = KEYLEN_C3; // Cat 3
- else if (headersLen == C2)
+ } else if (headersLen == C2) {
keyLen_hdr = KEYLEN_C2; // Cat 2
- else if (headersLen == C1)
+ } else if (headersLen == C1) {
keyLen_hdr = KEYLEN_C1; // Cat 1
- else
+ } else {
keyLen_hdr = 1; // = 1
+ }
}
// Build unpacking tables
- if (headersLen <= MAX_C5)
+ if (headersLen <= MAX_C5) {
build_unpack_tbl(upkStruct.hdrUnpack, headers, keyLen_hdr);
- else {
+ } else {
const std::string decHeaders = headers.substr(headersLen - MAX_C5);
// ASCII char after the last char in headers std::string
std::string decHeadersX = decHeaders;
@@ -344,19 +531,19 @@ void Fasta::set_unpackTbl_unpackFn(unpackfa_s& upkStruct,
/**
* @brief Unpack: small header
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
unpackFP_t unpackHdr = upkStruct.unpackHdrFP; // Function pointer
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkhdrOut, upkSeqOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -364,6 +551,7 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read the file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -384,14 +572,14 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
// todo
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
if (*i == (char)253) { // Hdr
(this->*unpackHdr)(upkhdrOut, ++i, upkStruct.hdrUnpack);
- content += ">" + upkhdrOut + "\n";
+ content += std::format(">{}\n", upkhdrOut);
} else { // Seq
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n";
+ content += std::format("{}\n", upkSeqOut);
}
} while (++i != decText.end()); // If trouble: change "!=" to "<"
@@ -401,7 +589,9 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
@@ -409,10 +599,10 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
@@ -423,18 +613,18 @@ void Fasta::unpack_hS(const unpackfa_s& upkStruct, byte threadID) {
/**
* @brief Unpack: large header
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) {
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkHdrOut, upkSeqOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -442,6 +632,7 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read the file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -462,14 +653,14 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
if (*i == (char)253) { // Hdr
unpack_large(upkHdrOut, ++i, upkStruct.XChar_hdr, upkStruct.hdrUnpack);
- content += ">" + upkHdrOut + "\n";
+ content += std::format(">{}\n", upkHdrOut);
} else { // Seq
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n";
+ content += std::format("{}\n", upkSeqOut);
}
} while (++i != decText.end()); // If trouble: change "!=" to "<"
@@ -479,7 +670,9 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
@@ -487,14 +680,14 @@ void Fasta::unpack_hL(const unpackfa_s& upkStruct, byte threadID) {
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
upkfile.close();
in.close();
-}
\ No newline at end of file
+}
diff --git a/src/fasta.hpp b/src/fasta.hpp
index 6037ba2..0b56568 100644
--- a/src/fasta.hpp
+++ b/src/fasta.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file fasta.hpp
- * @brief Compression/Decompression of FASTA
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file fasta.hpp
+ * @brief Compression/Decompression of FASTA
*/
#ifndef CRYFA_FASTA_H
@@ -24,12 +24,11 @@ struct packfa_s {
* @brief Unpakcing FASTA
*/
struct unpackfa_s {
- char XChar_hdr; /**< @brief Extra char if header's length > 39 */
- pos_t begPos; /**< @brief Begining position for each thread */
- u64 chunkSize; /**< @brief Chunk size */
- std::vector
- hdrUnpack; /**< @brief Lookup table for unpacking headers */
- unpackFP_t unpackHdrFP; /**< @brief Points to a header unpacking fn */
+ char XChar_hdr; /**< @brief Extra char if header's length > 39 */
+ pos_t begPos; /**< @brief Beginning position for each thread */
+ u64 chunkSize; /**< @brief Chunk size */
+ std::vector hdrUnpack; /**< @brief Lookup table for unpacking headers */
+ unpackFP_t unpackHdrFP; /**< @brief Points to a header unpacking fn */
};
/**
@@ -50,4 +49,4 @@ class Fasta : public EnDecrypto {
};
} // namespace cryfa
-#endif // CRYFA_FASTA_H
\ No newline at end of file
+#endif // CRYFA_FASTA_H
diff --git a/src/fastq.cpp b/src/fastq.cpp
index 47ceb8e..f1cff46 100644
--- a/src/fastq.cpp
+++ b/src/fastq.cpp
@@ -1,27 +1,46 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file fastq.cpp
- * @brief Compression/Decompression of FASTQ
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file fastq.cpp
+ * @brief Compression/Decompression of FASTQ
*/
#include "fastq.hpp"
#include
+#include
+#include
#include
#include // setw, std::setprecision
#include
+#include
+#include
#include
+#include
+#include "ordered_pipeline.hpp"
+#include "plaintext_stream.hpp"
#include "string.hpp"
#include "time.hpp"
using namespace cryfa;
std::mutex mutxFQ; /**< @brief Mutex */
+namespace {
+struct FastqRecord {
+ std::string header;
+ std::string sequence;
+ std::string quality;
+};
+
+struct FastqChunk {
+ std::vector<FastqRecord> records;
+};
+} // namespace
+
/**
- * @brief Check if the third line contains only +
+ * @brief Check if the third line contains only +
* @return True or false
*/
bool Fastq::has_just_plus() const {
@@ -48,53 +67,121 @@ bool Fastq::has_just_plus() const {
* @brief Compress
*/
void Fastq::compress() {
- if (!verbose) std::cerr << bold("[+]") << " Compacting ...";
+ if (!verbose) {
+ std::cerr << bold("[+]") << " Compacting ...";
+ }
const auto start = now(); // Start timer
- std::vector arrThread(n_threads);
std::string headers, qscores;
packfq_s pkStruct; // Collection of inputs to pass to pack...
- if (verbose)
+ if (verbose) {
std::cerr << bold("[+]") << " Calculating no. unique characters ...";
+ }
// Gather different chars and max length in all headers and quality scores
gather_h_q(headers, qscores);
// Show number of different chars in headers and qs -- Ignore '@'=64 in hdr
- if (verbose)
- std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => "
- << headers.length() << ", qscores => " << qscores.length()
- << "\n";
+ if (verbose) {
+ std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " << headers.length()
+ << ", qscores => " << qscores.length() << "\n";
+ }
// Set Hash table and pack function
set_hashTbl_packFn(pkStruct, headers, qscores);
- // Distribute file among threads, for reading and packing
- for (byte t = 0; t != n_threads; ++t)
- arrThread[t] = std::thread(&Fastq::pack, this, pkStruct, t);
- for (auto& thr : arrThread)
- if (thr.joinable()) thr.join();
+ const bool plus_is_plain = has_just_plus();
- if (verbose) {
- std::cerr << "\r" << bold("[+]") << " Shuffling done in "
- << hms(now() - shuffle_timer);
- std::cerr << bold("[+]") << " Compacting ...";
- }
+ auto read_chunk = [this, in = std::ifstream(in_file)]() mutable -> std::optional<FastqChunk> {
+ FastqChunk chunk;
+ std::string plus;
+ u64 chunk_bytes = 0;
- // Join partially packed and/or shuffled files
- join_packed_files(headers, qscores, 'Q', has_just_plus());
+ while (chunk_bytes < CHUNK_TARGET_SIZE) {
+ FastqRecord record;
+ if (!std::getline(in, record.header)) {
+ break;
+ }
+ if (!std::getline(in, record.sequence)) {
+ break;
+ }
+ if (!std::getline(in, plus)) {
+ break;
+ }
+ if (!std::getline(in, record.quality)) {
+ break;
+ }
- const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Compacting done in "
- << hms(finish - start);
+ chunk_bytes +=
+ record.header.size() + record.sequence.size() + plus.size() + record.quality.size() + 4;
+ chunk.records.push_back(std::move(record));
+ }
+
+ if (chunk.records.empty()) {
+ return std::nullopt;
+ }
+ return chunk;
+ };
+
+ auto pack_chunk = [this, pkStruct](FastqChunk chunk) {
+ packFP_t packHdr = pkStruct.packHdrFPtr;
+ packFP_t packQS = pkStruct.packQSFPtr;
+ std::string context;
+ context.reserve(CHUNK_TARGET_SIZE);
+
+ for (const FastqRecord& record : chunk.records) {
+ (this->*packHdr)(context, record.header.substr(1), HdrMap);
+ context += (char)254;
+ pack_seq(context, record.sequence);
+ context += (char)254;
+ (this->*packQS)(context, record.quality, QsMap);
+ context += (char)254;
+ }
+
+ if (!stop_shuffle) {
+ mutxFQ.lock(); //----------------------------------------------------
+ if (verbose && shuffInProg) {
+ std::cerr << bold("[+]") << " Shuffling ...";
+ shuffle_timer = now();
+ }
+ shuffInProg = false;
+ mutxFQ.unlock(); //--------------------------------------------------
- // Cout encrypted content
- encrypt();
+ shuffle(context);
+ }
+
+ std::string packed = std::format("{}{}{}", (char)253, context.size(), (char)254);
+ packed += context;
+ return packed;
+ };
+
+ encrypt_stream([&](const PlaintextSink& emit) {
+ std::string header;
+ header.reserve(headers.size() + qscores.size() + 3);
+ header += (char)126;
+ header += (!stop_shuffle ? (char)128 : (char)129);
+ header += headers;
+ header += (char)254;
+ header += qscores;
+ header += (plus_is_plain ? (char)253 : '\n');
+ emit(header);
+
+ run_ordered_pipeline(n_threads, read_chunk, pack_chunk, emit);
+ emit(std::string(1, (char)252));
+
+ if (verbose && !stop_shuffle) {
+ std::cerr << "\r" << bold("[+]") << " Shuffling done in " << hms(now() - shuffle_timer);
+ std::cerr << bold("[+]") << " Compacting ...";
+ }
+
+ const auto finish = now(); // Stop timer
+ std::cerr << "\r" << bold("[+]") << " Compacting done in " << hms(finish - start);
+ });
}
/**
* @brief Set hash table and pack function
- * @param[out] pkStruct Pack structure
- * @param[in] headers Headers
- * @param[in] qscores Quality scores
+ * @param[out] pkStruct Pack structure
+ * @param headers Headers
+ * @param qscores Quality scores
*/
void Fastq::set_hashTbl_packFn(packfq_s& pkStruct, const std::string& headers,
const std::string& qscores) {
@@ -170,20 +257,24 @@ void Fastq::set_hashTbl_packFn(packfq_s& pkStruct, const std::string& headers,
/**
* @brief Pack. '@' at the beginning of headers is not packed
- * @param pkStruct Pack structure
- * @param threadID Thread ID
+ * @param pkStruct Pack structure
+ * @param threadID Thread ID
*/
void Fastq::pack(const packfq_s& pkStruct, byte threadID) {
packFP_t packHdr = pkStruct.packHdrFPtr; // Function pointer
packFP_t packQS = pkStruct.packQSFPtr; // Function pointer
std::ifstream in(in_file);
- std::ofstream pkfile(PK_FNAME + std::to_string(threadID), std::ios_base::app);
+ std::ofstream pkfile(std::format("{}{}", PK_FNAME, static_cast(threadID)),
+ std::ios_base::app);
// Lines ignored at the beginning
- for (u64 l = (u64)threadID * BlockLine; l--;) IGNORE_THIS_LINE(in);
+ for (u64 l = (u64)threadID * BlockLine; l--;) {
+ IGNORE_THIS_LINE(in);
+ }
while (in.peek() != EOF) {
std::string context; // Output std::string
+ context.reserve(CHUNK_TARGET_SIZE);
std::string line;
for (u64 l = 0; l != BlockLine; l += 4) { // Process 4 lines by 4 lines
@@ -216,18 +307,17 @@ void Fastq::pack(const packfq_s& pkStruct, byte threadID) {
}
// For unshuffling: insert the size of packed context in the beginning
- std::string contextSize;
- contextSize += (char)253;
- contextSize += std::to_string(context.size());
- contextSize += (char)254;
+ std::string contextSize = std::format("{}{}{}", (char)253, context.size(), (char)254);
context.insert(0, contextSize);
// Write header containing threadID for each
- pkfile << THR_ID_HDR << std::to_string(threadID) << '\n';
+ pkfile << std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
pkfile << context << '\n';
// Ignore to go to the next related chunk
- for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) IGNORE_THIS_LINE(in);
+ for (u64 l = (u64)(n_threads - 1) * BlockLine; l--;) {
+ IGNORE_THIS_LINE(in);
+ }
}
pkfile.close();
@@ -236,8 +326,8 @@ void Fastq::pack(const packfq_s& pkStruct, byte threadID) {
/**
* @brief Gather chars of all headers & quality scores, excluding '@' in headers
- * @param[out] headers Chars of all headers
- * @param[out] qscores Chars of all quality scores
+ * @param[out] headers Chars of all headers
+ * @param[out] qscores Chars of all quality scores
*/
void Fastq::gather_h_q(std::string& headers, std::string& qscores) {
u32 maxHLen = 0, maxQLen = 0; // Max length of headers & quality scores
@@ -248,168 +338,275 @@ void Fastq::gather_h_q(std::string& headers, std::string& qscores) {
std::ifstream in(in_file);
for (std::string line; !in.eof();) {
if (getline(in, line).good()) {
- for (char c : line) hChars[c] = true;
- if (line.size() > maxHLen) maxHLen = (u32)line.size();
+ for (char c : line) {
+ hChars[c] = true;
+ }
+ if (line.size() > maxHLen) {
+ maxHLen = (u32)line.size();
+ }
}
IGNORE_THIS_LINE(in); // Ignore sequence
IGNORE_THIS_LINE(in); // Ignore +
if (getline(in, line).good()) {
- for (char c : line) qChars[c] = true;
- if (line.size() > maxQLen) maxQLen = (u32)line.size();
+ for (char c : line) {
+ qChars[c] = true;
+ }
+ if (line.size() > maxQLen) {
+ maxQLen = (u32)line.size();
+ }
}
}
in.close();
// Number of lines read from input file while compression
- BlockLine = (u32)(4 * (BLOCK_SIZE / (maxHLen + 2 * maxQLen)));
+ BlockLine = (u32)(4 * (CHUNK_TARGET_SIZE / (maxHLen + 2 * maxQLen)));
if (!BlockLine) BlockLine = 4;
// Gather the characters -- ignore '@'=64 for headers
- for (byte i = 32; i != 64; ++i)
- if (*(hChars + i)) headers += i;
- for (byte i = 65; i != 127; ++i)
- if (*(hChars + i)) headers += i;
- for (byte i = 32; i != 127; ++i)
- if (*(qChars + i)) qscores += i;
+ for (byte i = 32; i != 64; ++i) {
+ if (*(hChars + i)) {
+ headers += i;
+ }
+ }
+ for (byte i = 65; i != 127; ++i) {
+ if (*(hChars + i)) {
+ headers += i;
+ }
+ }
+ for (byte i = 32; i != 127; ++i) {
+ if (*(qChars + i)) {
+ qscores += i;
+ }
+ }
}
/**
* @brief Decompress
*/
void Fastq::decompress() {
- if (!verbose) std::cerr << bold("[+]") << " Decompressing ...";
+ if (!verbose) {
+ std::cerr << bold("[+]") << " Decompressing ...";
+ }
const auto start = now(); // Start timer
- char c; // Chars in file
+ PlaintextStream plaintext;
+ std::exception_ptr decrypt_error;
+ std::thread decrypt_thread([&]() {
+ try {
+ decrypt_stream([&](std::string_view decrypted) { plaintext.push(decrypted); });
+ plaintext.close();
+ } catch (...) {
+ decrypt_error = std::current_exception();
+ plaintext.fail(decrypt_error);
+ }
+ });
+
+ auto join_decrypt = [&]() {
+ if (decrypt_thread.joinable()) {
+ decrypt_thread.join();
+ }
+ if (decrypt_error) {
+ std::rethrow_exception(decrypt_error);
+ }
+ };
+
std::string headers, qscores;
unpackfq_s upkStruct; // Collection of inputs to pass to unpack...
- std::vector arrThread(n_threads); // Array of threads
- std::ifstream in(DEC_FNAME);
- in.ignore(1); // Jump over decText[0]==(char) 126
- in.get(c);
- shuffled = (c == (char)128); // Check if file had been shuffled
- if (verbose)
- std::cerr << bold("[+]") << " Extracting no. unique characters ...";
- while (in.get(c) && c != (char)254) headers += c;
- while (in.get(c) && c != '\n' && c != (char)253) qscores += c;
- // Show number of different chars in headers and qs -- ignore '@'=64
- if (verbose)
- std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => "
- << headers.length() << ", qscores => " << qscores.length()
- << "\n";
- if (c == '\n') justPlus = false; // If 3rd line is just +
-
- // Header -- Set unpack table and unpack function
- set_unpackTbl_unpackFn(upkStruct, headers, qscores);
-
- // Distribute file among threads, for reading and unpacking
- using unpackHQFP = void (Fastq::*)(const unpackfq_s&, byte);
- unpackHQFP unpackHQ =
- (headers.length() <= MAX_C5)
- ? (qscores.length() <= MAX_C5 ? &Fastq::unpack_hS_qS
- : &Fastq::unpack_hS_qL)
- : (qscores.length() > MAX_C5 ? &Fastq::unpack_hL_qL
- : &Fastq::unpack_hL_qS);
-
- for (byte t = 0; t != n_threads; ++t) {
- in.get(c);
- if (c == (char)253) {
- std::string chunkSizeStr; // Chunk size (std::string) -- For unshuffling
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
- const u64 offset = stoull(chunkSizeStr); // To traverse decompressed file
-
- upkStruct.begPos = in.tellg();
- upkStruct.chunkSize = offset;
-
- arrThread[t] = std::thread(unpackHQ, this, upkStruct, t);
-
- // Jump to the beginning of the next chunk
- in.seekg((std::streamoff)offset, std::ios_base::cur);
- }
- // End of file
- if (in.peek() == 252) break;
- }
- // Join threads
- for (auto& thr : arrThread)
- if (thr.joinable()) thr.join();
+ try {
+ const auto file_type = plaintext.get();
+ if (!file_type || *file_type != (char)126) {
+ throw std::runtime_error("corrupted file.");
+ }
- if (verbose) {
- std::cerr << "\r" << bold("[+]") << " Unshuffling done in "
- << hms(now() - shuffle_timer);
- std::cerr << bold("[+]") << " Decompressing ...";
- }
+ const auto shuffle_flag = plaintext.get();
+ if (!shuffle_flag || (*shuffle_flag != (char)128 && *shuffle_flag != (char)129)) {
+ throw std::runtime_error("corrupted file.");
+ }
- // Close/delete decrypted file
- in.close();
- const std::string decFileName = DEC_FNAME;
- std::remove(decFileName.c_str());
+ shuffled = (*shuffle_flag == (char)128); // Check if file had been shuffled
+ if (verbose) {
+ std::cerr << bold("[+]") << " Extracting no. unique characters ...";
+ }
+ if (!plaintext.read_until((char)254, headers)) {
+ throw std::runtime_error("corrupted file.");
+ }
+
+ char c = 0;
+ while (std::optional next = plaintext.get()) {
+ c = *next;
+ if (c == '\n' || c == (char)253) {
+ break;
+ }
+ qscores += c;
+ }
+ if (c != '\n' && c != (char)253) {
+ throw std::runtime_error("corrupted file.");
+ }
+ // Show number of different chars in headers and qs -- ignore '@'=64
+ if (verbose) {
+ std::cerr << "\r" << bold("[+]") << " No. unique characters: headers => " << headers.length()
+ << ", qscores => " << qscores.length() << "\n";
+ }
+ justPlus = (c != '\n'); // If 3rd line is just +
+
+ // Header -- Set unpack table and unpack function
+ set_unpackTbl_unpackFn(upkStruct, headers, qscores);
+ const bool has_small_header = headers.length() <= MAX_C5;
+ const bool has_small_qscore = qscores.length() <= MAX_C5;
+
+ auto read_chunk = [&]() -> std::optional<std::string> {
+ const auto marker = plaintext.get();
+ if (!marker || *marker == (char)252) {
+ return std::nullopt;
+ }
+ if (*marker != (char)253) {
+ throw std::runtime_error("corrupted file.");
+ }
+
+ std::string chunk_size_str;
+ if (!plaintext.read_until((char)254, chunk_size_str) || chunk_size_str.empty()) {
+ throw std::runtime_error("corrupted file.");
+ }
+
+ std::string chunk;
+ if (!plaintext.read_bytes(std::stoull(chunk_size_str), chunk)) {
+ throw std::runtime_error("corrupted file.");
+ }
+ return chunk;
+ };
+
+ auto unpack_chunk = [this, upkStruct, has_small_header,
+ has_small_qscore](std::string decText) mutable {
+ if (decText.empty()) {
+ return std::string{};
+ }
+
+ auto i = decText.begin();
+
+ // Unshuffle
+ if (shuffled) {
+ mutxFQ.lock(); //--------------------------------------------------
+ if (verbose && shuffInProg) {
+ std::cerr << bold("[+]") << " Unshuffling ...";
+ shuffle_timer = now();
+ }
+ shuffInProg = false;
+ mutxFQ.unlock(); //------------------------------------------------
+
+ unshuffle(i, decText.size());
+ }
- // Join partially unpacked files
- join_unpacked_files();
+ std::string upkHdrOut, upkSeqOut, upkQsOut;
+ std::string content;
+ content.reserve(decText.size() * 2);
+ do {
+ content += '@';
+ std::string plusMore;
+
+ if (has_small_header) {
+ (this->*upkStruct.unpackHdrFPtr)(upkHdrOut, i, upkStruct.hdrUnpack);
+ } else {
+ unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack);
+ }
+ plusMore = upkHdrOut;
+ content += std::format("{}\n", upkHdrOut);
+ ++i; // Hdr
+
+ unpack_seq(upkSeqOut, i);
+ content += std::format("{}\n", upkSeqOut); // Seq
+
+ content += justPlus ? "+\n" : std::format("+{}\n", plusMore);
+ ++i; // +
+
+ if (has_small_qscore) {
+ (this->*upkStruct.unpackQSFPtr)(upkQsOut, i, upkStruct.qsUnpack);
+ } else {
+ unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack);
+ }
+ content += std::format("{}\n", upkQsOut); // Qs
+ } while (++i != decText.end());
+
+ return content;
+ };
+
+ run_ordered_pipeline(n_threads, read_chunk, unpack_chunk,
+ [](const std::string& output) { std::cout << output; });
+
+ if (verbose && shuffled) {
+ std::cerr << "\r" << bold("[+]") << " Unshuffling done in " << hms(now() - shuffle_timer);
+ std::cerr << bold("[+]") << " Decompressing ...";
+ }
+ } catch (...) {
+ plaintext.fail(std::current_exception());
+ if (decrypt_thread.joinable()) {
+ decrypt_thread.join();
+ }
+ throw;
+ }
+
+ join_decrypt();
const auto finish = now(); // Stop timer
- std::cerr << "\r" << bold("[+]") << " Decompressing done in "
- << hms(finish - start);
+ std::cerr << "\r" << bold("[+]") << " Decompressing done in " << hms(finish - start);
}
/**
* @brief Set unpack table and unpack function
- * @param[out] upkStruct Unpack structure
- * @param[in] headers Headers
- * @param[in] qscores Quality scores
+ * @param[out] upkStruct Unpack structure
+ * @param headers Headers
+ * @param qscores Quality scores
*/
-void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct,
- const std::string& headers,
+void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct, const std::string& headers,
const std::string& qscores) {
const auto headersLen = headers.length();
const auto qscoresLen = qscores.length();
u16 keyLen_hdr = 0, keyLen_qs = 0;
// Header
- if (headersLen > MAX_C5)
+ if (headersLen > MAX_C5) {
keyLen_hdr = KEYLEN_C5;
- else if (headersLen > MAX_C4) { // Cat 5
+ } else if (headersLen > MAX_C4) { // Cat 5
upkStruct.unpackHdrFPtr = &EnDecrypto::unpack_2B;
keyLen_hdr = KEYLEN_C5;
} else {
upkStruct.unpackHdrFPtr = &EnDecrypto::unpack_1B;
- if (headersLen > MAX_C3)
+ if (headersLen > MAX_C3) {
keyLen_hdr = KEYLEN_C4; // Cat 4
- else if (headersLen == MAX_C3 || headersLen == MID_C3 ||
- headersLen == MIN_C3)
+ } else if (headersLen == MAX_C3 || headersLen == MID_C3 || headersLen == MIN_C3) {
keyLen_hdr = KEYLEN_C3; // Cat 3
- else if (headersLen == C2)
+ } else if (headersLen == C2) {
keyLen_hdr = KEYLEN_C2; // Cat 2
- else if (headersLen == C1)
+ } else if (headersLen == C1) {
keyLen_hdr = KEYLEN_C1; // Cat 1
- else
+ } else {
keyLen_hdr = 1; // = 1
+ }
}
// Quality score
- if (qscoresLen > MAX_C5)
+ if (qscoresLen > MAX_C5) {
keyLen_qs = KEYLEN_C5;
- else if (qscoresLen > MAX_C4) { // Cat 5
+ } else if (qscoresLen > MAX_C4) { // Cat 5
upkStruct.unpackQSFPtr = &EnDecrypto::unpack_2B;
keyLen_qs = KEYLEN_C5;
} else {
upkStruct.unpackQSFPtr = &EnDecrypto::unpack_1B;
- if (qscoresLen > MAX_C3)
+ if (qscoresLen > MAX_C3) {
keyLen_qs = KEYLEN_C4; // Cat 4
- else if (qscoresLen == MAX_C3 || qscoresLen == MID_C3 ||
- qscoresLen == MIN_C3)
+ } else if (qscoresLen == MAX_C3 || qscoresLen == MID_C3 || qscoresLen == MIN_C3) {
keyLen_qs = KEYLEN_C3; // Cat 3
- else if (qscoresLen == C2)
+ } else if (qscoresLen == C2) {
keyLen_qs = KEYLEN_C2; // Cat 2
- else if (qscoresLen == C1)
+ } else if (qscoresLen == C1) {
keyLen_qs = KEYLEN_C1; // Cat 1
- else
+ } else {
keyLen_qs = 1; // = 1
+ }
}
// Build unpacking tables
@@ -449,8 +646,8 @@ void Fastq::set_unpackTbl_unpackFn(unpackfq_s& upkStruct,
/**
* @brief Unpack: small header, small quality score.
* '@' at the beginning of headers not packed
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
unpackFP_t unpackHdr = upkStruct.unpackHdrFPtr; // Function pointer
@@ -458,11 +655,11 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkHdrOut, upkSeqOut, upkQsOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -470,6 +667,7 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read the file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -490,24 +688,24 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
content += '@';
std::string plusMore;
(this->*unpackHdr)(upkHdrOut, i, upkStruct.hdrUnpack);
plusMore = upkHdrOut;
- content += upkHdrOut + "\n";
+ content += std::format("{}\n", upkHdrOut);
++i; // Hdr
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n"; // Seq
+ content += std::format("{}\n", upkSeqOut); // Seq
- content += (justPlus ? "+" : "+" + plusMore) + "\n";
+ content += justPlus ? "+\n" : std::format("+{}\n", plusMore);
++i; // +
(this->*unpackQS)(upkQsOut, i, upkStruct.qsUnpack);
- content += upkQsOut + "\n"; // Qs
+ content += std::format("{}\n", upkQsOut); // Qs
} while (++i != decText.end()); // If trouble: change "!=" to "<"
// Update the chunk size and positions (beg & end)
@@ -516,7 +714,9 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
@@ -524,10 +724,10 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
@@ -539,19 +739,19 @@ void Fastq::unpack_hS_qS(const unpackfq_s& upkStruct, byte threadID) {
/**
* @brief Unpack: small header, large quality score.
* '@' at the beginning of headers not packed
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
unpackFP_t unpackHdr = upkStruct.unpackHdrFPtr; // Function pointer
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkHdrOut, upkSeqOut, upkQsOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -559,6 +759,7 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -579,24 +780,24 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
content += '@';
std::string plusMore;
(this->*unpackHdr)(upkHdrOut, i, upkStruct.hdrUnpack);
plusMore = upkHdrOut;
- content += upkHdrOut + "\n";
+ content += std::format("{}\n", upkHdrOut);
++i; // Hdr
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n"; // Seq
+ content += std::format("{}\n", upkSeqOut); // Seq
- content += (justPlus ? "+" : "+" + plusMore) + "\n";
+ content += justPlus ? "+\n" : std::format("+{}\n", plusMore);
++i; // +
unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack);
- content += upkQsOut + "\n"; // Qs
+ content += std::format("{}\n", upkQsOut); // Qs
} while (++i != decText.end()); // If trouble: change "!=" to "<"
// Update the chunk size and positions (beg & end)
@@ -605,7 +806,9 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
@@ -613,10 +816,10 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
@@ -628,19 +831,19 @@ void Fastq::unpack_hS_qL(const unpackfq_s& upkStruct, byte threadID) {
/**
* @brief Unpack: large header, small quality score.
* '@' at the beginning of headers not packed
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
unpackFP_t unpackQS = upkStruct.unpackQSFPtr; // Function pointer
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkHdrOut, upkSeqOut, upkQsOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -648,6 +851,7 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -668,24 +872,24 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
content += "@";
std::string plusMore;
unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack);
plusMore = upkHdrOut;
- content += upkHdrOut + "\n";
+ content += std::format("{}\n", upkHdrOut);
++i; // Hdr
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n"; // Seq
+ content += std::format("{}\n", upkSeqOut); // Seq
- content += (justPlus ? "+" : "+" + plusMore) + "\n";
+ content += justPlus ? "+\n" : std::format("+{}\n", plusMore);
++i; // +
(this->*unpackQS)(upkQsOut, i, upkStruct.qsUnpack);
- content += upkQsOut + "\n"; // Qs
+ content += std::format("{}\n", upkQsOut); // Qs
} while (++i != decText.end()); // If trouble: change "!=" to "<"
// Update the chunk size and positions (beg & end)
@@ -694,7 +898,9 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
@@ -702,10 +908,10 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
@@ -717,18 +923,18 @@ void Fastq::unpack_hL_qS(const unpackfq_s& upkStruct, byte threadID) {
/**
* @brief Unpack: large header, large quality score.
* '@' at the beginning of headers not packed
- * @param upkStruct Unpack structure
- * @param threadID Thread ID
+ * @param upkStruct Unpack structure
+ * @param threadID Thread ID
*/
void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) {
pos_t begPos = upkStruct.begPos;
u64 chunkSize = upkStruct.chunkSize;
std::ifstream in(DEC_FNAME);
- std::ofstream upkfile(UPK_FNAME + std::to_string(threadID),
+ std::ofstream upkfile(std::format("{}{}", UPK_FNAME, static_cast(threadID)),
std::ios_base::app);
std::string upkHdrOut, upkSeqOut, upkQsOut;
std::string content;
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
auto write_content = [&]() { upkfile << content; };
while (in.peek() != EOF) {
@@ -736,6 +942,7 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) {
in.seekg(begPos); // Read file from this position
// Take a chunk of decrypted file
std::string decText;
+ decText.reserve(chunkSize);
for (u64 u = chunkSize; u--;) {
in.get(c);
decText += c;
@@ -756,24 +963,24 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) {
unshuffle(i, chunkSize);
}
- content += THR_ID_HDR + std::to_string(threadID) + "\n";
+ content += std::format("{}{}\n", THR_ID_HDR, static_cast(threadID));
do {
content += "@";
std::string plusMore;
unpack_large(upkHdrOut, i, upkStruct.XChar_hdr, upkStruct.hdrUnpack);
plusMore = upkHdrOut;
- content += upkHdrOut + "\n";
+ content += std::format("{}\n", upkHdrOut);
++i; // Hdr
unpack_seq(upkSeqOut, i);
- content += upkSeqOut + "\n"; // Seq
+ content += std::format("{}\n", upkSeqOut); // Seq
- content += (justPlus ? "+" : "+" + plusMore) + "\n";
+ content += justPlus ? "+\n" : std::format("+{}\n", plusMore);
++i; // +
unpack_large(upkQsOut, i, upkStruct.XChar_qs, upkStruct.qsUnpack);
- content += upkQsOut + "\n"; // Qs
+ content += std::format("{}\n", upkQsOut); // Qs
} while (++i != decText.end()); // If trouble: change "!=" to "<"
// Update the chunk size and positions (beg & end)
@@ -782,22 +989,23 @@ void Fastq::unpack_hL_qL(const unpackfq_s& upkStruct, byte threadID) {
in.get(c);
if (c == (char)253) {
std::string chunkSizeStr;
- while (in.get(c) && c != (char)254) chunkSizeStr += c;
-
+ while (in.get(c) && c != (char)254) {
+ chunkSizeStr += c;
+ }
chunkSize = stoull(chunkSizeStr);
begPos = in.tellg();
endPos = begPos + (pos_t)chunkSize;
}
}
- if (content.size() >= BLOCK_SIZE) {
+ if (content.size() >= IO_BUFFER_SIZE) {
write_content();
content.clear();
- content.reserve(BLOCK_SIZE);
+ content.reserve(IO_BUFFER_SIZE);
}
}
write_content();
upkfile.close();
in.close();
-}
\ No newline at end of file
+}
diff --git a/src/fastq.hpp b/src/fastq.hpp
index 407634b..2314812 100644
--- a/src/fastq.hpp
+++ b/src/fastq.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file fastq.hpp
- * @brief Compression/Decompression of FASTQ
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file fastq.hpp
+ * @brief Compression/Decompression of FASTQ
*/
#ifndef CRYFA_FASTQ_H
@@ -21,16 +21,14 @@ struct packfq_s {
/** @brief Unpakcing FASTQ */
struct unpackfq_s {
- char XChar_hdr; /**< @brief Extra char if header's length > 39 */
- char XChar_qs; /**< @brief Extra char if q scores length > 39 */
- pos_t begPos; /**< @brief Begining position for each thread */
- u64 chunkSize; /**< @brief Chunk size */
- std::vector
- hdrUnpack; /**< @brief Lookup table for unpacking headers */
- std::vector
- qsUnpack; /**< @brief Lookup table for unpacking q scores */
- unpackFP_t unpackHdrFPtr; /**< @brief Points to a hdr unpacking function */
- unpackFP_t unpackQSFPtr; /**< @brief Points to a qs unpacking function */
+ char XChar_hdr; /**< @brief Extra char if header's length > 39 */
+ char XChar_qs; /**< @brief Extra char if q scores length > 39 */
+ pos_t begPos; /**< @brief Beginning position for each thread */
+ u64 chunkSize; /**< @brief Chunk size */
+ std::vector hdrUnpack; /**< @brief Lookup table for unpacking headers */
+ std::vector qsUnpack; /**< @brief Lookup table for unpacking q scores */
+ unpackFP_t unpackHdrFPtr; /**< @brief Points to a hdr unpacking function */
+ unpackFP_t unpackQSFPtr; /**< @brief Points to a qs unpacking function */
};
/**
@@ -48,8 +46,7 @@ class Fastq : public EnDecrypto {
void gather_h_q(std::string&, std::string&);
void set_hashTbl_packFn(packfq_s&, const std::string&, const std::string&);
void pack(const packfq_s&, byte);
- void set_unpackTbl_unpackFn(unpackfq_s&, const std::string&,
- const std::string&);
+ void set_unpackTbl_unpackFn(unpackfq_s&, const std::string&, const std::string&);
void unpack_hS_qS(const unpackfq_s&, byte);
void unpack_hS_qL(const unpackfq_s&, byte);
void unpack_hL_qS(const unpackfq_s&, byte);
@@ -57,4 +54,4 @@ class Fastq : public EnDecrypto {
};
} // namespace cryfa
-#endif // CRYFA_FASTQ_H
\ No newline at end of file
+#endif // CRYFA_FASTQ_H
diff --git a/src/include/assert.hpp b/src/include/assert.hpp
index 029299a..510afd4 100644
--- a/src/include/assert.hpp
+++ b/src/include/assert.hpp
@@ -1,74 +1,74 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file assert.hpp
- * @brief Assertions
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file assert.hpp
+ * @brief Assertion and diagnostic helpers
*/
#ifndef CRYFA_ASSERT_H
#define CRYFA_ASSERT_H
+#include
#include
#include // std::runtime_error
#include "string.hpp"
/**
- * @brief Show error
- * @param message the message to be shown
- * @param width width of the message shown on terminal
+ * @brief Throw a formatted runtime error
+ * @param message Message appended after the "Error: " prefix
+ * @param width Maximum terminal width used when wrapping the message
+ * @throws std::runtime_error Always throws with the formatted error message
*/
inline void error(std::string const& message, int width = 65) {
- std::string msg = wrap_text("Error: " + message, "", width);
- msg = bold(msg.substr(0, 6), "red") + msg.substr(6) + "\n";
+ std::string msg = wrap_text(std::format("Error: {}", message), "", width);
+ msg = std::format("{}{}\n", bold(msg.substr(0, 6), "red"), msg.substr(6));
throw std::runtime_error(msg);
}
/**
- * @brief Show warning
- * @param message the message to be shown
- * @param width width of the message shown on terminal
+ * @brief Print a formatted warning message to std::cerr (does not throw)
+ * @param message Message appended after the "Warning: " prefix
+ * @param width Maximum terminal width used when wrapping the message
*/
inline void warning(std::string const& message, int width = 65) {
- std::string msg = wrap_text("Warning: " + message, "", width);
- msg = bold(msg.substr(0, 8), "magenta") + msg.substr(8) + "\n";
+ std::string msg = wrap_text(std::format("Warning: {}", message), "", width);
+ msg = std::format("{}{}\n", bold(msg.substr(0, 8), "magenta"), msg.substr(8));
std::cerr << msg;
}
/**
- * @brief Assert a condition
- * @param cond the condition to be checked
- * @param msg the message shown when the condition is true
+ * @brief Throw an error when a condition is true (the inverse of the standard assert)
+ * @param cond Condition that triggers the error
+ * @param msg Message passed to error() when the condition is true
*/
inline void assert_single(bool cond, const std::string& msg) {
if (cond) error(msg);
}
/**
- * @brief Assert a condition
- * @param cond the condition which will be checked
- * @param msgT the message shown when the condition is true
- * @param msgF the message shown when the condition is false
+ * @brief Always raise an error; the condition only selects which message is used
+ * @param cond Condition used to select the error message
+ * @param msgT Message shown when the condition is true
+ * @param msgF Message shown when the condition is false
*/
-inline void assert_dual(bool cond, const std::string& msgT,
- const std::string& msgF) {
+inline void assert_dual(bool cond, const std::string& msgT, const std::string& msgF) {
error(cond ? msgT : msgF);
}
/**
- * @brief Check if file is good
- * @param fname the file name
- * @param msg the error message
+ * @brief Check that a file can be opened and is not empty
+ * @param fname File name to check
+ * @param msg Custom error message; when empty, a default "failed opening" message is used
*/
-inline void assert_file_good(const std::string& fname,
- const std::string& msg = "") {
+inline void assert_file_good(const std::string& fname, const std::string& msg = "") {
std::ifstream in(fname);
if (!in.good() || in.peek() == EOF) {
in.close();
- assert_dual(msg.empty(), "failed opening \"" + fname + "\".", msg);
+ assert_dual(msg.empty(), std::format("failed opening \"{}\".", fname), msg);
}
in.close();
}
-#endif // CRYFA_ASSERT_H
\ No newline at end of file
+#endif // CRYFA_ASSERT_H
diff --git a/src/include/file.hpp b/src/include/file.hpp
index 6863074..5c14299 100644
--- a/src/include/file.hpp
+++ b/src/include/file.hpp
@@ -1,40 +1,44 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file file.hpp
- * @brief file handling
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file file.hpp
+ * @brief File handling
*/
#ifndef CRYFA_FILE_HPP
#define CRYFA_FILE_HPP
+#include
#include
/**
* @brief Check if file can be opened correctly
- * @param name name of the file
+ * @param name Name of the file to check; files holding only spaces/tabs/newlines count as empty
*/
inline static void check_file(std::string name) { // Must be inline
std::ifstream f(name);
if (!f) {
f.close();
- error("the file \"" + name + "\" cannot be opened or is empty.");
+ error(std::format("the file \"{}\" cannot be opened or is empty.", name));
} else {
bool foundChar{false};
- for (char c; f.get(c) && !foundChar;)
- if (c != ' ' && c != '\n' && c != '\t') foundChar = true;
+ for (char c; f.get(c) && !foundChar;) {
+ if (c != ' ' && c != '\n' && c != '\t') {
+ foundChar = true;
+ }
+ }
if (!foundChar) {
f.close();
- error("the file \"" + name + "\" is empty.");
+ error(std::format("the file \"{}\" is empty.", name));
}
f.close();
}
}
/**
- * @brief Extract file name
- * @param path path including the file name
+ * @brief Extract file name
+ * @param path Path including the file name
* @return File name
*/
inline static std::string file_name(std::string path) {
@@ -43,8 +47,8 @@ inline static std::string file_name(std::string path) {
}
/**
- * @brief Find file size
- * @param name name of the file
+ * @brief Find file size
+ * @param name Name of the file
* @return File size
*/
inline static uint64_t file_size(std::string name) {
@@ -53,4 +57,4 @@ inline static uint64_t file_size(std::string name) {
return static_cast(f.tellg());
}
-#endif // CRYFA_FILE_HPP
\ No newline at end of file
+#endif // CRYFA_FILE_HPP
diff --git a/src/include/numeric.hpp b/src/include/numeric.hpp
index a38552b..52ff64b 100644
--- a/src/include/numeric.hpp
+++ b/src/include/numeric.hpp
@@ -1,9 +1,9 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
/**
- * @file numeric.hpp
- * @brief numerical functions
- * @author Morteza Hosseini (seyedmorteza@ua.pt)
- * @author Diogo Pratas (pratas@ua.pt)
- * @copyright The GNU General Public License v3.0
+ * @file numeric.hpp
+ * @brief Numerical functions
*/
#ifndef CRYFA_NUMERIC_HPP
@@ -17,24 +17,26 @@
#include "string.hpp"
/**
- * @brief Accumulate hop index values in a range
- * @param first beginning of the range
- * @param last end of the range
- * @param init initial value
- * @param h hop value
+ * @brief Accumulate every h-th value in a range, starting with the element at first
+ * @param first Beginning of the range (the iterator must support < and +=)
+ * @param last End of the range; iteration stops once first is no longer < last
+ * @param init Initial value the elements are added to
+ * @param h Hop value, i.e. the step added to the iterator after each element
* @return A number
*/
template
T accum_hops(Iter first, Iter last, T init, Hop h) {
- for (; first < last; first += h) init += *first;
+ for (; first < last; first += h) {
+ init += *first;
+ }
return init;
}
/**
- * @brief Accumulate even index values in a range
- * @param first beginning of the range
- * @param last end of the range
- * @param init initial value
+ * @brief Accumulate even index values in a range
+ * @param first Beginning of the range
+ * @param last End of the range
+ * @param init Initial value
* @return A number
*/
template
@@ -43,10 +45,10 @@ T accum_even(Iter first, Iter last, T init) {
}
/**
- * @brief Accumulate odd index values in a range
- * @param first beginning of the range
- * @param last end of the range
- * @param init initial value
+ * @brief Accumulate odd index values in a range
+ * @param first Beginning of the range
+ * @param last End of the range
+ * @param init Initial value
* @return A number
*/
template
@@ -55,14 +57,13 @@ T accum_odd(Iter first, Iter last, T init) {
}
/**
- * @brief Check if a string is a number
- * @param s the input string
+ * @brief Check if a string consists only of decimal digits (bytes are read as unsigned char)
+ * @param s The input string; an empty string is reported through assert_single()
* @return Yes, if it is a number
*/
inline bool is_number(const std::string& s) {
assert_single(s.empty(), "the string is empty.");
- return std::find_if(s.begin(), s.end(),
- [](char c) { return !std::isdigit(c); }) == s.end();
+ return std::all_of(s.begin(), s.end(), [](unsigned char c) { return std::isdigit(c) != 0; });
}
-#endif // CRYFA_NUMERIC_HPP
\ No newline at end of file
+#endif // CRYFA_NUMERIC_HPP
diff --git a/src/include/ordered_pipeline.hpp b/src/include/ordered_pipeline.hpp
new file mode 100644
index 0000000..9e33b2c
--- /dev/null
+++ b/src/include/ordered_pipeline.hpp
@@ -0,0 +1,162 @@
+// SPDX-FileCopyrightText: 2026 Morteza Hosseini
+// SPDX-License-Identifier: GPL-3.0-only
+
+/**
+ * @file ordered_pipeline.hpp
+ * @brief Ordered pipeline functions
+ */
+
+#ifndef CRYFA_ORDERED_PIPELINE_HPP
+#define CRYFA_ORDERED_PIPELINE_HPP
+
+#include
+#include
+#include
+#include
+#include
+#include