haddocking · AnnaKravchenko · May 1, 2026 · May 1, 2026 · VGPReys · May 2, 2026
diff --git a/src/haddock/libs/libpdb.py b/src/haddock/libs/libpdb.py
@@ -1,5 +1,6 @@
 """Parse molecular structures in PDB format."""
 
+from math import log
 import os
 from copy import deepcopy
 from functools import partial
@@ -9,6 +10,7 @@
 from pdbtools.pdb_splitchain import run as split_chain
 from pdbtools.pdb_splitmodel import run as split_model
 from pdbtools.pdb_tidy import run as tidy_pdbfile
+from pdbtools.pdb_wc import run as pdb_wc
 
 from haddock.core.exceptions import SetupError
 from haddock.core.supported_molecules import supported_residues
@@ -21,6 +23,7 @@
     TypeVar,
     Union,
 )
+from haddock import log as haddock_log
 from haddock.libs.libio import PDBFile, working_directory
 from haddock.libs.libutil import get_result_or_same_in_list, sort_numbered_paths
 
@@ -523,3 +526,67 @@ def check_mol_shape(input_mol: Path) -> bool:
         if any('SHA SHA ' in line for line in input_file_mol):
             shape = True
     return shape
+
+def handle_input_reference(reference: Path) -> list[Path]:
+    """Validate the reference file by returning only one model.
+
+    Parameters
+    ----------
+    reference : Path
+        Path to the input reference structure, possibly containing
+        an ensemble.
+
+    Returns
+    -------
+    reference or first_model_path : Path
+        Path to the reference structure to be used downstream.
+    """
+    if reference.stat().st_size == 0:
+        raise ValueError(f"Reference file is empty: {reference}")
+
+    # Extremly complicated stuff to manage the gathering of the sys.stdout,
+    # as the pdb_tools.pdb_wc is basically writing on it.
+    import sys
+    from io import TextIOWrapper, BytesIO
+    # Memorize previous sys.stdout
+    original_stdout = sys.stdout
+    # setup the new stdout environment
+    sys.stdout = TextIOWrapper(BytesIO(), sys.stdout.encoding)
+
+    # Count number of models
+    with open(reference, "r") as fh:
+        pdb_wc(fh, "m")
+    # Get output
+    sys.stdout.seek(0)  # Jump to the start
+    wc_return = sys.stdout.read()  # Read output
+    # Restore original stdout
+    sys.stdout.close()
+    sys.stdout = original_stdout
+    # Parse output
+    # using here `\n` (and not os.linesep) as it is the output for pdb_wc
+    nb_models = 1  # pdb_wc treats a file without MODEL records as a single model
+    for line in wc_return.split("\n"):
+        if "No. models" in line:
+            nb_models = int(line.strip().split()[-1])
+            break
+    # Return reference as only one structure present
+    if nb_models == 1:
+        return [reference]
+    # If more than one model in reference
+    haddock_log.info(
+        f"Multiple structures ({nb_models}) found in reference file. "
+        "Using all conformations as reference."
+        )
+    # Split models
+    with open(reference, "r") as ref_in:
+        split_model(ref_in, "reference_model")
+    # Gather individual references and sort them
+    references = sorted(
+        list(Path(".").glob("reference_model_*.pdb")),
+        key=lambda k: int(k.stem.split("_")[-1]),
+        )
+    assert len(references) == nb_models, (
+            "Issue while splitting references conformation: "
+            f"{nb_models} detected, {len(references)} generated"
+        )
+    return references
diff --git a/src/haddock/libs/libstructure.py b/src/haddock/libs/libstructure.py
@@ -2,6 +2,7 @@
 from functools import partial
 from pathlib import Path
 from typing import Any, Iterable, Optional
+from haddock.libs.libontology import PDBFile
 
 
 class Molecule:
@@ -42,3 +43,33 @@ def __init__(self,
 def make_molecules(paths: Iterable[Path], **kwargs: Any) -> list[Molecule]:
     """Get input molecules from the data stream."""
     return list(map(partial(Molecule, **kwargs), paths))
+
+
+def find_ff(models: list[PDBFile]) -> str:
+    """Finds the force-field information (all-atom or martini) from the topology 
+    associated to the first model. Used in caprieval and rmsdfilter.
+
+    The assumption is that the force-fields will be identical between models.
+
+    Parameters
+    -----------
+    models : list[PDBFile]
+        List of models where to find the topology
+
+    Return
+    -------
+    ff : str
+        The force-field used in those models.
+    """
+    try:
+        ff = Path(models[0].topology[0].rel_path).stem.split("_")[-1]
+    except TypeError:
+        try:
+            ff = Path(models[0].topology.rel_path).stem.split("_")[-1]
+        except AttributeError:
+            ff = "aa"
+    # In case of issue, fall back to all-atom
+    if "martini" not in ff:
+        ff = "aa"
+
+    return ff
diff --git a/src/haddock/modules/analysis/caprieval/__init__.py b/src/haddock/modules/analysis/caprieval/__init__.py
@@ -35,6 +35,9 @@
 from haddock.core.typing import FilePath, Union
 from haddock.libs.libontology import PDBFile
 from haddock.libs.libparallel import Scheduler
+from haddock.libs.libaa2cg import martinize
+from haddock.libs.libstructure import find_ff
+from haddock.libs.libpdb import handle_input_reference
 from haddock.modules import BaseHaddockModule
 from haddock.modules.analysis.caprieval.capri import (
     CAPRI,
@@ -43,9 +46,6 @@
     extract_data_from_capri_class,
     extract_models_best_references,
     )
-from haddock.libs.libaa2cg import martinize
-from pdbtools.pdb_wc import run as pdb_wc
-from pdbtools.pdb_splitmodel import run as pdb_splitmodel
 
 
 RECIPE_PATH = Path(__file__).resolve().parent
@@ -77,96 +77,6 @@ def is_nested(models: list[Union[PDBFile, list[PDBFile]]]) -> bool:
                 return True
         return False
 
-    @staticmethod
-    def find_ff(models: list[PDBFile]) -> str:
-        """Finds the force-field information (all-atom or martini) from the topology 
-        associated to the first model.
-
-        The assumption is that the force-fields will be identical between models.
-
-        Parameters
-        -----------
-        models : list[PDBFile]
-            List of models where to find the topology
-
-        Return
-        -------
-        ff : str
-            The force-field used in those models.
-        """
-        try:
-            ff = Path(models[0].topology[0].rel_path).stem.split("_")[-1]
-        except TypeError:
-            try:
-                ff = Path(models[0].topology.rel_path).stem.split("_")[-1]
-            except AttributeError:
-                ff = "aa"
-        # In case of issue, fall back to all-atom
-        if "martini" not in ff:
-            ff = "aa"
-
-        return ff
-
-    def handle_input_reference(self, reference: Path) -> list[Path]:
-        """Validate the reference file by returning only one model.
-
-        Parameters
-        ----------
-        reference : Path
-            Path to the input reference structure, possibly containing
-            an ensemble.
-
-        Returns
-        -------
-        reference or first_model_path : Path
-            Path to the reference structure to be used downstream.
-        """
-        # Extremly complicated stuff to manage the gathering of the sys.stdout,
-        # as the pdb_tools.pdb_wc is basically writing on it.
-        import sys
-        from io import TextIOWrapper, BytesIO
-        # Memorize previous sys.stdout
-        original_stdout = sys.stdout
-        # setup the new stdout environment
-        sys.stdout = TextIOWrapper(BytesIO(), sys.stdout.encoding)
-
-        # Count number of models
-        with open(reference, "r") as fh:
-            pdb_wc(fh, "m")
-        # Get output
-        sys.stdout.seek(0)  # Jump to the start
-        wc_return = sys.stdout.read()  # Read output
-        # Restore original stdout
-        sys.stdout.close()
-        sys.stdout = original_stdout
-        # Parse output
-        # using here `\n` (and not os.linesep) as it is the output for pdb_wc
-        for line in wc_return.split("\n"):
-            if "No. models" in line:
-                sline = line.strip().split()
-                nb_models = int(sline[-1])
-                break
-        # Return reference as only one structure present
-        if nb_models == 1:
-            return [reference]
-
-        self.log(
-            f"Multiple structures ({nb_models}) found in reference file. "
-            "Using all conformations as reference."
-            )
-        # Split models
-        with open(reference, "r") as ref_in:
-            pdb_splitmodel(ref_in, "reference_model")
-        # Gather individual references and sort them
-        references = sorted(
-            list(Path(".").glob("reference_model_*.pdb")),
-            key=lambda k: int(k.stem.split("_")[-1]),
-            )
-        assert len(references) == nb_models, (
-                "Issue while splitting references conformation: "
-                f"{nb_models} detected, {len(references)} generated"
-            )
-        return references
 
     def get_reference(self, models: list[PDBFile]) -> list[Path]:
         """Manage to obtain the reference structure to be used downstream.
@@ -184,7 +94,7 @@ def get_reference(self, models: list[PDBFile]) -> list[Path]:
         """
         if self.params["reference_fname"]:
             _reference = Path(self.params["reference_fname"])
-            references = self.handle_input_reference(_reference)
+            references = handle_input_reference(_reference)
         else:
             self.log(
                 "No reference structure provided. "
@@ -216,7 +126,7 @@ def _run(self) -> None:
         dump_weights(self.order)
 
         # Find force-field
-        ff = self.find_ff(models)
+        ff = find_ff(models)
         # Get reference file
         if ff == "martini2":
             references = [