Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions src/haddock/libs/libpdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Parse molecular structures in PDB format."""

from math import log
import os
from copy import deepcopy
from functools import partial
Expand All @@ -9,6 +10,7 @@
from pdbtools.pdb_splitchain import run as split_chain
from pdbtools.pdb_splitmodel import run as split_model
from pdbtools.pdb_tidy import run as tidy_pdbfile
from pdbtools.pdb_wc import run as pdb_wc

from haddock.core.exceptions import SetupError
from haddock.core.supported_molecules import supported_residues
Expand All @@ -21,6 +23,7 @@
TypeVar,
Union,
)
from haddock import log as haddock_log
from haddock.libs.libio import PDBFile, working_directory
from haddock.libs.libutil import get_result_or_same_in_list, sort_numbered_paths

Expand Down Expand Up @@ -523,3 +526,67 @@ def check_mol_shape(input_mol: Path) -> bool:
if any('SHA SHA ' in line for line in input_file_mol):
shape = True
return shape

def handle_input_reference(reference: Path) -> list[Path]:
"""Validate the reference file by returning only one model.

Parameters
----------
reference : Path
Path to the input reference structure, possibly containing
an ensemble.

Returns
-------
reference or first_model_path : Path
Path to the reference structure to be used downstream.
"""
if reference.stat().st_size == 0:
raise ValueError(f"Reference file is empty: {reference}")
Comment on lines +544 to +545
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it checks that the file is indeed empty, but not that it contains coordinates. Could be composed of REMARKS or not even be a pdb file

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just checked that case - libalign already handles it with error message "Please check the input file and queried filterings.”

But also now that I look in the code - may-be it’s better to remove this check entirely and instead in libpdb.py > handle_input_reference > line 570

    for line in wc_return.split("\n"):
        if "No. models" in line:
            nb_models = int(line.strip().split()[-1])
            break

add lines to read the number in line No. atoms and if 0 - exit with message like “No atoms found in the reference file, please check”. What do you think?


# Extremly complicated stuff to manage the gathering of the sys.stdout,
# as the pdb_tools.pdb_wc is basically writing on it.
import sys
from io import TextIOWrapper, BytesIO
# Memorize previous sys.stdout
original_stdout = sys.stdout
# setup the new stdout environment
sys.stdout = TextIOWrapper(BytesIO(), sys.stdout.encoding)

# Count number of models
with open(reference, "r") as fh:
pdb_wc(fh, "m")
# Get output
sys.stdout.seek(0) # Jump to the start
wc_return = sys.stdout.read() # Read output
# Restore original stdout
sys.stdout.close()
sys.stdout = original_stdout
# Parse output
# using here `\n` (and not os.linesep) as it is the output for pdb_wc
nb_models = 1 # pdb_wc treats a file without MODEL records as a single model
for line in wc_return.split("\n"):
if "No. models" in line:
nb_models = int(line.strip().split()[-1])
break
# Return reference as only one structure present
if nb_models == 1:
return [reference]
# If more than one model in reference
haddock_log.info(
f"Multiple structures ({nb_models}) found in reference file. "
"Using all conformations as reference."
)
# Split models
with open(reference, "r") as ref_in:
split_model(ref_in, "reference_model")
# Gather individual references and sort them
references = sorted(
list(Path(".").glob("reference_model_*.pdb")),
key=lambda k: int(k.stem.split("_")[-1]),
)
assert len(references) == nb_models, (
"Issue while splitting references conformation: "
f"{nb_models} detected, {len(references)} generated"
)
return references
31 changes: 31 additions & 0 deletions src/haddock/libs/libstructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
from pathlib import Path
from typing import Any, Iterable, Optional
from haddock.libs.libontology import PDBFile


class Molecule:
Expand Down Expand Up @@ -42,3 +43,33 @@ def __init__(self,
def make_molecules(paths: Iterable[Path], **kwargs: Any) -> list[Molecule]:
"""Get input molecules from the data stream."""
return list(map(partial(Molecule, **kwargs), paths))


def find_ff(models: list[PDBFile]) -> str:
"""Finds the force-field information (all-atom or martini) from the topology
associated to the first model. Used in caprieval and rmsdfilter.

The assumption is that the force-fields will be identical between models.

Parameters
-----------
models : list[PDBFile]
List of models where to find the topology

Return
-------
ff : str
The force-field used in those models.
"""
try:
ff = Path(models[0].topology[0].rel_path).stem.split("_")[-1]
except TypeError:
try:
ff = Path(models[0].topology.rel_path).stem.split("_")[-1]
except AttributeError:
ff = "aa"
# In case of issue, fall back to all-atom
if "martini" not in ff:
ff = "aa"

return ff
100 changes: 5 additions & 95 deletions src/haddock/modules/analysis/caprieval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
from haddock.core.typing import FilePath, Union
from haddock.libs.libontology import PDBFile
from haddock.libs.libparallel import Scheduler
from haddock.libs.libaa2cg import martinize
from haddock.libs.libstructure import find_ff
from haddock.libs.libpdb import handle_input_reference
from haddock.modules import BaseHaddockModule
from haddock.modules.analysis.caprieval.capri import (
CAPRI,
Expand All @@ -43,9 +46,6 @@
extract_data_from_capri_class,
extract_models_best_references,
)
from haddock.libs.libaa2cg import martinize
from pdbtools.pdb_wc import run as pdb_wc
from pdbtools.pdb_splitmodel import run as pdb_splitmodel


RECIPE_PATH = Path(__file__).resolve().parent
Expand Down Expand Up @@ -77,96 +77,6 @@ def is_nested(models: list[Union[PDBFile, list[PDBFile]]]) -> bool:
return True
return False

@staticmethod
def find_ff(models: list[PDBFile]) -> str:
"""Finds the force-field information (all-atom or martini) from the topology
associated to the first model.

The assumption is that the force-fields will be identical between models.

Parameters
-----------
models : list[PDBFile]
List of models where to find the topology

Return
-------
ff : str
The force-field used in those models.
"""
try:
ff = Path(models[0].topology[0].rel_path).stem.split("_")[-1]
except TypeError:
try:
ff = Path(models[0].topology.rel_path).stem.split("_")[-1]
except AttributeError:
ff = "aa"
# In case of issue, fall back to all-atom
if "martini" not in ff:
ff = "aa"

return ff

def handle_input_reference(self, reference: Path) -> list[Path]:
"""Validate the reference file by returning only one model.

Parameters
----------
reference : Path
Path to the input reference structure, possibly containing
an ensemble.

Returns
-------
reference or first_model_path : Path
Path to the reference structure to be used downstream.
"""
# Extremly complicated stuff to manage the gathering of the sys.stdout,
# as the pdb_tools.pdb_wc is basically writing on it.
import sys
from io import TextIOWrapper, BytesIO
# Memorize previous sys.stdout
original_stdout = sys.stdout
# setup the new stdout environment
sys.stdout = TextIOWrapper(BytesIO(), sys.stdout.encoding)

# Count number of models
with open(reference, "r") as fh:
pdb_wc(fh, "m")
# Get output
sys.stdout.seek(0) # Jump to the start
wc_return = sys.stdout.read() # Read output
# Restore original stdout
sys.stdout.close()
sys.stdout = original_stdout
# Parse output
# using here `\n` (and not os.linesep) as it is the output for pdb_wc
for line in wc_return.split("\n"):
if "No. models" in line:
sline = line.strip().split()
nb_models = int(sline[-1])
break
# Return reference as only one structure present
if nb_models == 1:
return [reference]

self.log(
f"Multiple structures ({nb_models}) found in reference file. "
"Using all conformations as reference."
)
# Split models
with open(reference, "r") as ref_in:
pdb_splitmodel(ref_in, "reference_model")
# Gather individual references and sort them
references = sorted(
list(Path(".").glob("reference_model_*.pdb")),
key=lambda k: int(k.stem.split("_")[-1]),
)
assert len(references) == nb_models, (
"Issue while splitting references conformation: "
f"{nb_models} detected, {len(references)} generated"
)
return references

def get_reference(self, models: list[PDBFile]) -> list[Path]:
"""Manage to obtain the reference structure to be used downstream.
Expand All @@ -184,7 +94,7 @@ def get_reference(self, models: list[PDBFile]) -> list[Path]:
"""
if self.params["reference_fname"]:
_reference = Path(self.params["reference_fname"])
references = self.handle_input_reference(_reference)
references = handle_input_reference(_reference)
else:
self.log(
"No reference structure provided. "
Expand Down Expand Up @@ -216,7 +126,7 @@ def _run(self) -> None:
dump_weights(self.order)

# Find force-field
ff = self.find_ff(models)
ff = find_ff(models)
# Get reference file
if ff == "martini2":
references = [
Expand Down
Loading
Loading