"""
Module defines basic interfaces to interact with macromolecular structures.
"""
from __future__ import annotations
import logging
import operator as op
import typing as t
from collections import abc, defaultdict
from dataclasses import dataclass
from functools import reduce
from io import IOBase
from os import PathLike
from pathlib import Path
import biotite.structure as bst
import numpy as np
import numpy.typing as npt
import rustworkx as rx
from more_itertools import unique_everseen
from toolz import keyfilter
import lXtractor.core.segment as lxs
from lXtractor.core.base import ResNameDict
from lXtractor.core.config import AtomMark, DefaultConfig, EMPTY_ALTLOC
from lXtractor.core.exceptions import NoOverlap, InitError, LengthMismatch, MissingData
from lXtractor.core.ligand import Ligand, make_ligand, ligands_from_atom_marks
from lXtractor.util import get_files
from lXtractor.util.misc import json_to_molgraph, graph_reindex_nodes, molgraph_to_json
from lXtractor.util.structure import (
filter_selection,
load_structure,
save_structure,
filter_solvent_extended,
iter_residue_masks,
mark_polymer_type,
to_graph,
find_first_polymer_type,
extend_residue_mask,
compare_coord,
)
LOGGER = logging.getLogger(__name__)
RES_DICT = ResNameDict()
_POL_MARKS = {
"c": AtomMark.CARB,
"n": AtomMark.NUC,
"p": AtomMark.PEP,
"x": AtomMark.UNK,
}
[docs]
@dataclass(frozen=True)
class Masks:
primary_polymer: npt.NDArray[np.bool_]
primary_polymer_ptm: npt.NDArray[np.bool_]
primary_polymer_modified: npt.NDArray[np.bool_]
solvent: npt.NDArray[np.bool_]
ligand: npt.NDArray[np.bool_]
ligand_covalent: npt.NDArray[np.bool_]
ligand_poly: npt.NDArray[np.bool_]
ligand_nonpoly: npt.NDArray[np.bool_]
ligand_pep: npt.NDArray[np.bool_]
ligand_nuc: npt.NDArray[np.bool_]
ligand_carb: npt.NDArray[np.bool_]
unk: npt.NDArray[np.bool_]
[docs]
class GenericStructure:
"""
A generic macromolecular structure with possibly many chains holding
a single :class:`biotite.structure.AtomArray` instance.
This object is a core data structure in `lXtractor` for structural data.
The object is considered immutable: atoms of a structure can't change their
location or properties, as well as other protected attributes.
While atoms are stored as :class:`biotite.structure.AtomArray`,
`GenericStructure` defines additional annotations for each atom and
operations crucial for other objects such as
:class:`lXtractor.core.chain.ChainStructure`.
Upon initialization, atom array attains graph representation (:meth:`graph`)
using :func:`lXtractor.util.structure.to_graph` function. Using this
representation, atom annotations are attained via :func``mark_atoms_g`.
These annotations can be accessed via :meth:`atom_marks`. For convenience,
boolean masks are stored and can be applied to the :meth:`array` as follows:
.. code-block:: python
# Assume ``s`` is a :class:`GenericStructure` object.
s[s.mask.`mask_name`]
To view available mask names, see :class:`Masks`.
One of the most crucial annotations is the so-called "primary_polymer".
These atoms serve as a frame of reference for all other atoms in a structure.
The rest of the atoms are categorized as either ligand or solvent. Sometimes
the annotation process fails to identify certain atoms. In such cases, a
warning is logged. To view uncategorized atoms, one can use the following mask:
.. code-block:: python
s[s.mask.unk]
.. note::
Using ``__getitem__(item)`` like in ``s[s.mask.unk`` will return an
atom array. Use :meth:`subset` to obtain a new generic structure or
initialize a new ``GenericStructure(s[s.mask.unk] instance; it will be
equivalent.
Methods ``__repr__`` and ``__str__`` output a string in the format:
``{_name}:{polymer_chain_ids};{ligand_chain_ids}|{altloc_ids}``
where ``*ids`` are ","-separated.
"""
__slots__ = (
"_atom_marks",
"_array",
"_graph",
"_name",
"_ligands",
"_mask",
"_id",
)
[docs]
def __init__(
self,
array: bst.AtomArray,
name: str,
ligands: abc.Sequence[Ligand] | None = None,
atom_marks: npt.NDArray[int] | PathLike | None = None,
graph: rx.PyGraph | dict | PathLike | None = None,
):
"""
:param array: Atom array object.
:param name: ID of a structure in `array`.
:param ligands: A list of ligands or flag indicating to extract ligands
during initialization.
"""
#: Atom array object.
self._array: bst.AtomArray = array
#: ID of a structure in `array`.
self._name: str = name
if isinstance(graph, rx.PyGraph):
self._graph = graph
elif isinstance(graph, dict | PathLike):
self._graph = json_to_molgraph(graph)
else:
self._graph = to_graph(array, True)
if len(self._graph) != len(self._array):
raise LengthMismatch(
f"The number of nodes in a graph ({len(self._graph)}) does not match "
f"the number of atoms in the array ({len(self._array)})."
)
if atom_marks is None:
atom_marks, primary_pol_type, _ligands = mark_atoms_g(self)
atom_marks.flags.writeable = False
self._atom_marks = atom_marks
if isinstance(ligands, list):
_ligands = ligands
else:
if isinstance(atom_marks, PathLike):
atom_marks = np.load(atom_marks)
if not isinstance(atom_marks, np.ndarray):
raise TypeError(
f"Expected `atom_marks` to be an array, got {type(atom_marks)}"
)
if len(atom_marks) != len(array):
raise LengthMismatch(
"The lengths of `atom_marks` and `array` must match. "
f"Got {len(atom_marks)} and {len(array)}."
)
self._atom_marks = atom_marks
# determine primary polymer type
primary_pol_type = find_first_polymer_type(atom_marks)
if not isinstance(ligands, list):
_ligands = list(ligands_from_atom_marks(self))
else:
_ligands = ligands
if any(not isinstance(x, Ligand) for x in _ligands):
raise TypeError(
"Some entries in supplied `ligands` are not of the `Ligand` type"
)
#: A tuple of ligands
self._ligands: tuple[Ligand, ...] = tuple(
sorted(_ligands, key=lambda x: (x.res_name, x.res_id, x.chain_id))
)
if np.any(atom_marks == AtomMark.UNK):
num_unk = np.sum(atom_marks == AtomMark.UNK)
LOGGER.warning(f"Structure {name} has {num_unk} uncategorized atoms.")
ligand_pep = atom_marks == (AtomMark.PEP | AtomMark.LIGAND)
ligand_nuc = atom_marks == (AtomMark.NUC | AtomMark.LIGAND)
ligand_carb = atom_marks == (AtomMark.CARB | AtomMark.LIGAND)
ligand_poly = ligand_pep | ligand_nuc | ligand_carb
ligand_nonpoly = atom_marks == AtomMark.LIGAND
ligand_covalent = atom_marks == (AtomMark.LIGAND | AtomMark.COVALENT)
if primary_pol_type == "x":
primary_polymer = np.full_like(atom_marks, False, bool)
else:
primary_polymer = atom_marks == _POL_MARKS[primary_pol_type]
primary_polymer_ptm = primary_polymer & array.hetero
primary_polymer_mod = (
atom_marks == (AtomMark.SOLVENT | AtomMark.COVALENT)
) | primary_polymer_ptm
self._mask = Masks(
primary_polymer=primary_polymer,
primary_polymer_ptm=primary_polymer_ptm,
primary_polymer_modified=primary_polymer_mod,
solvent=(atom_marks == AtomMark.SOLVENT),
ligand=ligand_nonpoly | ligand_poly,
ligand_covalent=ligand_covalent,
ligand_nonpoly=ligand_nonpoly,
ligand_poly=ligand_poly,
ligand_pep=ligand_pep,
ligand_nuc=ligand_nuc,
ligand_carb=ligand_carb,
unk=(atom_marks == AtomMark.UNK),
)
self._id = self._make_id()
def __len__(self) -> int:
return len(self.array)
def __eq__(self, other: t.Any) -> bool:
if isinstance(other, GenericStructure):
return (
self._name == other._name
and len(self) == len(other)
and np.all(self.atom_marks == other.atom_marks)
and compare_coord(self.array, other.array)
and self.ligands == other.ligands
)
return False
def __hash__(self) -> int:
atoms = tuple(
(a.chain_id, a.res_id, a.res_name, a.atom_name, tuple(a.coord))
for a in self.array
)
return hash(self._name) + hash(atoms)
def __str__(self) -> str:
return self.id
def __repr__(self) -> str:
return self.id
def __getitem__(self, item: t.Any) -> bst.AtomArray:
return self._array.__getitem__(item)
def _make_id(self) -> str:
chains_pol = ",".join(sorted(self.chain_ids_polymer))
chains_lig = ",".join(sorted(self.chain_ids_ligand))
altloc_ids = ",".join(filter(lambda x: x not in EMPTY_ALTLOC, self.altloc_ids))
return f"{self._name}:{chains_pol};{chains_lig}|{altloc_ids}"
@property
def atom_marks(self) -> npt.NDArray[np.int_]:
"""
:return: An array of :class:`lXtractor.core.config.AtomMark` marks,
categorizing each atom in this structure.
"""
return self._atom_marks
@property
def graph(self) -> rx.PyGraph:
"""
:return: A structure's graph representation.
"""
return self._graph
@property
def id(self) -> str:
"""
:return: An identifier of this structure. It's composed once upon
initialization and has the following format:
``{_name}:{polymer_chain_ids};{ligand_chain_ids}|{altloc_ids}``.
It should uniquely identify a structure, i.e., one should expect
two structures with the same ID to be identical.
"""
return self._id
@property
def name(self) -> str:
"""
:return: A name of the structure.
"""
return self._name
@name.setter
def name(self, value: str):
if not isinstance(value, str):
raise TypeError(f"New ID must have the `str` type. Got {type(value)}")
self._name = value
self._id = self._make_id()
@property
def array(self) -> bst.AtomArray:
"""
:return: Atom array object.
"""
return self._array
@array.setter
def array(self, _: t.Any) -> None:
raise RuntimeError("Array cannot be set. Please initialize a new structure")
@property
def mask(self) -> Masks:
return self._mask
@property
def altloc_ids(self) -> list[str]:
"""
:return: A sorted list of altloc IDs. If none found, will output ``[""]``.
"""
if hasattr(self.array, "altloc_id"):
return sorted(set(self.array.altloc_id))
return [""]
@property
def chain_ids(self) -> list[str]:
"""
:return: A list of chain IDs this structure encompasses.
"""
return list(unique_everseen(self.array.chain_id))
@property
def chain_ids_polymer(self) -> list[str]:
"""
:return: A list of polymer chain IDs.
"""
return list(unique_everseen(self.array[self.mask.primary_polymer].chain_id))
@property
def chain_ids_ligand(self) -> list[str]:
"""
:return: A set of ligand chain IDs.
"""
return list(unique_everseen(lig.chain_id for lig in self.ligands))
@property
def ligands(self) -> tuple[Ligand, ...]:
"""
:return: A list of ligands.
"""
return self._ligands
@ligands.setter
def ligands(self, _):
raise RuntimeError("Cannot set ligands")
@property
def is_empty(self) -> bool:
"""
:return: ``True`` if the :meth:`array` is empty.
"""
return len(self) == 0
@property
def is_empty_polymer(self) -> bool:
"""
Check if there are any polymer atoms.
:return: ``True`` if there are >=1 polymer atoms and ``False``
otherwise.
"""
return self.mask.primary_polymer.sum() == 0
@property
def is_singleton(self) -> bool:
"""
:return: ``True`` if the structure contains a single residue.
"""
return bst.get_residue_count(self.array) == 1
[docs]
@classmethod
def make_empty(
cls, structure_id: str = DefaultConfig["unknowns"]["structure_id"]
) -> t.Self:
"""
:param structure_id: (Optional) ID of the created array.
:return: An instance with empty :meth:`array`.
"""
return cls(bst.AtomArray(0), structure_id, [])
[docs]
@classmethod
def read(
cls,
inp: IOBase | Path | str | bytes,
path2id: abc.Callable[[Path], str] = lambda p: p.name.split(".")[0],
structure_id: str = DefaultConfig["unknowns"]["structure_id"],
altloc: bool | str = False,
**kwargs,
) -> t.Self:
"""
Parse the atom array from the provided input and wrap it into the
:class:`GenericStructure` object.
.. seealso::
:func:`lXtractor.util.structure.load_structure`
.. note::
If `inp` is not a ``Path``, ``kwargs`` must contain the correct
``fmt`` (e.g., ``fmt=cif``).
:param inp: Path to a structure in supported format.
:param path2id: A callable obtaining a PDB ID from the file path.
By default, it's a ``Path.stem``.
:param structure_id: A structure unique identifier (e.g., PDB ID). If
not provided and the input is ``Path``, will use ``path2id`` to
infer the ID. Otherwise, will use a constant placeholder.
:param altloc: Parse alternative locations and populate
``array.altloc_id`` attribute.
:param kwargs: Passed to ``load_structure``.
:return: Parsed structure.
"""
if isinstance(altloc, bool) and altloc:
altloc = "all"
if isinstance(altloc, str):
kwargs["altloc"] = altloc
array = load_structure(inp, **kwargs)
empty_id = DefaultConfig["unknowns"]["structure_id"]
if hasattr(array, "altloc_id"):
array.altloc_id[np.isin(array.altloc_id, EMPTY_ALTLOC)] = ""
if isinstance(inp, Path) and structure_id == empty_id:
structure_id = path2id(inp)
if isinstance(array, bst.AtomArrayStack):
raise InitError(
f"{inp} is likely an NMR structure. "
f"NMR structures are not supported."
)
if isinstance(inp, Path):
files = get_files(inp.parent)
name = inp.stem.split(".")[0]
marks_name, graph_name = f"{name}.npy", f"{name}.json"
marks = files.get(marks_name, None)
graph = files.get(graph_name, None)
else:
marks, graph = None, None
return cls(array, structure_id, ligands=None, atom_marks=marks, graph=graph)
[docs]
def write(
self, path: PathLike | str, atom_marks: bool = True, graph: bool = True
) -> Path:
"""
Save this structure to a file. The format is automatically determined
from the given path.
Additional files are saved using the same filename alongside the
structure file. The filename will resolve to "structure" in all the
following cases and result in "structure.npy" and "structure.json"
files saved to the same dir::
path="/path/to/structure.pdb"
path="/path/to/structure.mmtf.gz"
path="/path/to/structure.with.many.dots.pdb.gz"
.. seealso::
:func:`lXtractor.util.structure.save_structure`.
:param path: A path or a path-like object compatible with :func:`open`.
Must not point to an existing directory. Must provide the structure
format as an extension.
:param atom_marks: Save an array of atom marks in the `npy` format.
:param graph: Save molecular connectivity graph in the `json` format.
:return: Path to the saved structure if writing was successful.
"""
if not isinstance(path, Path):
path = Path(path)
saved_path = save_structure(self.array, path)
stem = path.stem.split(".")[0]
if atom_marks:
np.save(path.parent / f"{stem}.npy", self.atom_marks)
if graph:
json_path = path.parent / f"{stem}.json"
molgraph_to_json(self.graph, path=json_path)
return saved_path
[docs]
def get_sequence(self) -> abc.Generator[tuple[str, str, int]]:
"""
:return: A generator over tuples, where each residue is described by:
(1) one-letter code, (2) three-letter code, (3) residue number.
"""
if self.is_empty:
return []
m = self.mask.primary_polymer
a = self.array if not np.any(m) else self.array[m]
first_atoms = (r[0] for r in bst.residue_iter(a))
for i, a in enumerate(first_atoms, start=1):
try:
res3 = RES_DICT[a.res_name]
except KeyError:
LOGGER.warning(
f"Failed to obtain a single-letter code for residue {a.res_name} "
f"at {i}-th position ({a.res_id + 1}-th first atom position in "
f"protein) for a structure {self}."
)
res3 = "X"
yield res3, a.res_name, a.res_id
[docs]
def subset(
self,
mask: np.ndarray,
ligands: bool = True,
reinit_ligands: bool = False,
copy: bool = False,
) -> t.Self:
"""
Create a sub-structure potentially preserving connected :meth:`ligands`.
.. warning::
If ``DefaultConfig["structure"]["primary_pol_type"]`` is set to auto,
and `mask` points to a polymer that is shorter than some existing
ligand polymer, this ligand polymer will become a primary polymer
in the substructure.
:param mask: Boolean mask, ``True`` for atoms in :meth:`array`, used
to create a sub-structure.
:param ligands: Keep ligands that are connected to atoms specified by
`mask`.
:param reinit_ligands: Reinitialize ligands upon creating a sub-structure,
rather than filtering existing ligands connected to atoms specified
by `mask`. Takes precedence over the `ligands` option. This option
is used in :meth:`split_altloc`.
:param copy: Copy the atom array resulting from subsetting the original
one.
:return: A new instance with atoms defined by `mask` and connected
ligands.
"""
if reinit_ligands:
ligands = False
if ligands:
# Filter connected ligands
ligands = list(
filter(
lambda lig: lig.is_locally_connected(mask),
self.ligands,
)
)
# Extend mask by atoms from the connected ligands
ligand_mask = reduce(
op.or_,
(lig.mask for lig in ligands),
np.zeros_like(self.array.res_id, dtype=bool),
)
else:
ligand_mask = mask
_mask = mask | ligand_mask
a = self.array[_mask]
g = graph_reindex_nodes(self.graph.subgraph(np.where(_mask)[0]))
m = self.atom_marks[_mask]
if copy:
a = a.copy()
m = m.copy()
s = self.__class__(
a, self.name, ligands=True if reinit_ligands else [], atom_marks=m, graph=g
)
if ligands:
new_ligands = []
names = DefaultConfig["metadata"]
retain_names = [
names["res_name"],
names["res_id"],
names["structure_chain_id"],
]
for lig in ligands:
meta = keyfilter(lambda x: x in retain_names, lig.meta)
new_ligands.append(
Ligand(
s,
lig.mask[_mask],
lig.contact_mask[_mask],
lig.ligand_idx[_mask],
lig.dist[_mask],
meta,
)
)
s._ligands = tuple(
sorted(new_ligands, key=lambda x: (x.res_name, x.res_id, x.chain_id))
)
s._id = s._make_id()
return s
[docs]
def rm_solvent(self, copy: bool = False):
"""
:param copy: Copy the resulting substructure.
:return: A substructure with solvent molecules removed.
"""
return self.subset(~self.mask.solvent, ligands=True, copy=copy)
[docs]
def split_chains(self, polymer: bool = False, **kwargs) -> abc.Iterator[t.Self]:
"""
Split into separate chains. Splitting is done using
:func:`biotite.structure.get_chain_starts`.
.. note::
Preserved ligands may have a different ``chain_id``.
.. note::
If there is a single chain, this method will return ``self``.
:param polymer: Use only primary polymer chains for splitting.
:param kwargs: Passed to :meth:`subset`.
:return: An iterable over chains found in :attr:`array`.
"""
if polymer:
chain_ids = self.chain_ids_polymer
else:
chain_ids = self.chain_ids
# a = self.array.copy() if copy else self.array
a = self.array
if len(chain_ids) == 1:
yield self
return
for chain_id in sorted(chain_ids):
mask = a.chain_id == chain_id
yield self.subset(mask, **kwargs)
[docs]
def split_altloc(self, **kwargs) -> abc.Iterator[t.Self]:
"""
Split into substructures based on altloc IDs. Atoms missing altloc
annotations are distributed into every substructure. Thus, even if
a structure contains a single atom having altlocs (say, A and B),
this method will produce two substructed identical except for this
atom.
.. note::
If :meth:`array` does not specify any altloc ID, the method yields
``self``.
:param kwargs: Passed to :meth:`subset`.
:return: An iterator over objects of the same type initialized by
atoms having altloc annotations.
"""
ids = self.altloc_ids
if len(ids) == 1:
yield self
return
if "reinit_ligands" not in kwargs:
kwargs["reinit_ligands"] = True
no_alt_mask = np.isin(self.array.altloc_id, EMPTY_ALTLOC)
for altloc in ids[1:]:
m = no_alt_mask | (self.array.altloc_id == altloc)
yield self.subset(m, **kwargs)
[docs]
def superpose(
self,
other: GenericStructure | bst.AtomArray,
res_id_self: abc.Iterable[int] | None = None,
res_id_other: abc.Iterable[int] | None = None,
atom_names_self: abc.Iterable[abc.Sequence[str]]
| abc.Sequence[str]
| None = None,
atom_names_other: abc.Iterable[abc.Sequence[str]]
| abc.Sequence[str]
| None = None,
mask_self: np.ndarray | None = None,
mask_other: np.ndarray | None = None,
) -> tuple[GenericStructure, float, tuple[np.ndarray, np.ndarray, np.ndarray]]:
"""
Superpose other structure to this one.
Arguments to this function all serve a single purpose: to correctly
subset both structures so the resulting selections have the same number
of atoms.
The subsetting achieved either by specifying residue numbers and atom
names or by supplying a binary mask of the same length as the number
of atoms in the structure.
:param other: Other :class:`GenericStructure` or atom array.
:param res_id_self: Residue numbers to select in this structure.
:param res_id_other: Residue numbers to select in other structure.
:param atom_names_self: Atom names to select in this structure given
either per-residue or as a single sequence broadcasted to selected
residues.
:param atom_names_other: Same as `self`.
:param mask_self: Binary mask to select atoms. Takes precedence over
other selection arguments.
:param mask_other: Same as `self`.
:return: A tuple of (1) an `other` structure superposed onto this one,
(2) an RMSD of the superposition, and (3) a transformation that
had been used with :func:`biotite.structure.superimpose_apply`.
"""
def _get_mask(a, res_id, atom_names):
if res_id:
m = filter_selection(a, res_id, atom_names)
else:
if atom_names:
m = np.isin(a.atom_name, atom_names)
else:
m = np.ones_like(a, bool)
return m
if self.is_empty or other.is_empty:
raise MissingData("Superposing empty structures is not supported")
if mask_self is None:
mask_self = _get_mask(self.array, res_id_self, atom_names_self)
if mask_other is None:
mask_other = _get_mask(other.array, res_id_other, atom_names_other)
num_self, num_other = mask_self.sum(), mask_other.sum()
if num_self != num_other:
raise LengthMismatch(
f"To superpose, the number of atoms must match. "
f"Got {num_self} in self and {num_other} in other."
)
if num_self == num_other == 0:
raise MissingData("No atoms selected")
superposed, transformation = bst.superimpose(
self.array[mask_self], other.array[mask_other]
)
other_transformed = bst.superimpose_apply(other.array, transformation)
rmsd_target = bst.rmsd(self.array[mask_self], superposed)
return (
GenericStructure(other_transformed, other._name),
rmsd_target,
transformation,
)
[docs]
class ProteinStructure(GenericStructure):
"""
A structure type where primary polymer is peptide.
.. seealso::
:class:`GenericStructure` for general-purpose documentation.
"""
[docs]
def __init__(
self,
array: bst.AtomArray,
structure_id: str,
ligands: bool | list[Ligand] = True,
atom_marks: npt.NDArray[int] | PathLike | None = None,
graph: rx.PyGraph | dict | PathLike | None = None,
):
with DefaultConfig.temporary_namespace():
DefaultConfig["structure"]["primary_pol_type"] = "p"
super().__init__(array, structure_id, ligands, atom_marks, graph)
[docs]
class NucleotideStructure(GenericStructure):
"""
A structure type where primary polymer is nucleotide.
.. seealso::
:class:`GenericStructure` for general-purpose documentation.
"""
[docs]
def __init__(
self,
array: bst.AtomArray,
structure_id: str,
ligands: bool | list[Ligand] = True,
atom_marks: npt.NDArray[int] | PathLike | None = None,
graph: rx.PyGraph | dict | PathLike | None = None,
):
with DefaultConfig.temporary_namespace():
DefaultConfig["structure"]["primary_pol_type"] = "n"
super().__init__(array, structure_id, ligands, atom_marks, graph)
[docs]
class CarbohydrateStructure(GenericStructure):
"""
A structure type where primary polymer is carbohydrate.
.. seealso::
:class:`GenericStructure` for general-purpose documentation.
"""
[docs]
def __init__(
self,
array: bst.AtomArray,
structure_id: str,
ligands: bool | list[Ligand] = True,
atom_marks: npt.NDArray[int] | PathLike | None = None,
graph: rx.PyGraph | dict | PathLike | None = None,
):
with DefaultConfig.temporary_namespace():
DefaultConfig["structure"]["primary_pol_type"] = "c"
super().__init__(array, structure_id, ligands, atom_marks, graph)
[docs]
def mark_atoms(
structure: GenericStructure,
) -> tuple[npt.NDArray[np.int_], list[Ligand]]:
"""
Mark each atom in structure according to
:class:`lXtractor.core.config.AtomMark`.
This function is used upon initializing :class:`GenericStructure` and its
subclasses, storing the output under :attr:`GenericStructure.atom_marks`.
:param structure: An arbitrary structure.
:return: An array of atom marks (equivalently, classes or types).
"""
a = structure.array
is_solv = filter_solvent_extended(a)
pol_types = mark_polymer_type(a, DefaultConfig["structure"]["n_monomers"])
is_nuc, is_pep, is_carb = (pol_types == p for p in ["n", "p", "c"])
is_any_pol = pol_types != "x"
is_solv[is_any_pol] = False
marks = np.full(len(a), AtomMark.UNK)
marks[is_solv] = AtomMark.SOLVENT
match DefaultConfig["structure"]["primary_pol_type"][0]:
case "p":
is_pol, pol_type = is_pep, "p"
case "n":
is_pol, pol_type = is_nuc, "n"
case "c":
is_pol, pol_type = is_carb, "c"
case _:
is_pol, pol_type = max(
[(is_carb, "c"), (is_nuc, "n"), (is_pep, "p")], key=lambda x: x[0].sum()
)
if not any(is_pol):
LOGGER.warning("No polymer atoms identified. Marking as SOLVENT+UNK.")
return marks, []
marks[is_carb] = AtomMark.CARB
marks[is_nuc] = AtomMark.NUC
marks[is_pep] = AtomMark.PEP
ligands = []
# Annotate small-molecule ligands
is_putative_lig = ~(is_any_pol | is_solv)
if np.any(is_putative_lig):
for m_res in iter_residue_masks(a):
# A mask that is a single ligand residue
m_lig = is_putative_lig & m_res
lig = make_ligand(m_lig, is_pol, structure)
if lig is not None:
marks[m_lig] = AtomMark.LIGAND
ligands.append(lig)
# Annotate polymer ligands
is_putative_lig = ~(is_pol | is_solv | (marks == AtomMark.LIGAND)) & is_any_pol
pol_marks = {"c": AtomMark.CARB, "n": AtomMark.NUC, "p": AtomMark.PEP}
if np.any(is_putative_lig):
for c in bst.get_chains(a[is_putative_lig]):
m_lig = is_putative_lig & (a.chain_id == c)
lig = make_ligand(m_lig, is_pol, structure)
if (
lig is not None
and lig.res_name[0] in DefaultConfig["structure"]["ligand_pol_types"]
):
marks[m_lig] = AtomMark.LIGAND | pol_marks[lig.res_name[0]]
ligands.append(lig)
return marks, ligands
def _get_one_chain(a: bst.AtomArray) -> str:
chain_ids = set(a.chain_id)
if len(chain_ids) != 1:
raise ValueError(f"Expected single chain, found {chain_ids}")
return chain_ids.pop()
def _to_single_poly_mask(
a: bst.AtomArray,
poly_masks: abc.Sequence[npt.NDArray[np.bool_]],
):
largest_idx = np.argmax([x.sum() for x in poly_masks])
pol_mask = poly_masks[largest_idx]
pol_chain = _get_one_chain(a[pol_mask])
for i, m in enumerate(poly_masks):
if i == largest_idx:
continue
a_sub = a[m]
# If chains match and not all atoms are hetero
if _get_one_chain(a_sub) == pol_chain and np.any(~a_sub.hetero):
pol_mask[m] = True
return pol_mask
def _combine_poly_masks(
a: bst.AtomArray,
poly_masks: abc.Sequence[npt.NDArray[np.bool_]],
):
poly_mask = np.full_like(a, False, dtype=np.bool_)
for m in poly_masks:
if np.sum(a[m].hetero) < m.sum() / 2:
poly_mask[m] = True
return poly_mask
[docs]
def mark_atoms_g(
s: GenericStructure, single_poly_chain: bool = False
) -> (npt.NDArray[np.int_], str, list[Ligand]):
"""
Mark structure atoms based on a molecular graph's representation by of
the :class:`lXtractor.core.config.AtomMark` categories.
Atoms are classified into five categories::
#. primary polymer: corresponds to ``PEP``, ``NUC`` or ``CARB``
categories.
#. solvent: ``SOLVENT``.
#. non polymer ligand: ``LIGAND``.
#. polymer ligand: A combination of ``LIGAND`` with one of the primary
polymer types, eg. ``AtomMark.LIGAND | AtomMark.NUC``.
#. unknown: ``UNK`` for atoms that couldn't be categorized.
The classification process depends on groups of atoms forming covalent bonds
with each other, or connected components in the molecular graph representation.
Each such component is assessed separately and its atoms are classified
as polymer, ligand, or solvent. If the primary polymer is set to "auto" in
config (``DefaultConfig["structure"]["primary_pol_type"]``), the polymer
with the largest number of monomers will be selected. The rest of the polymers
will become polymer ligands: special kind of ligand that can have multiple
residues. See :class:`lXtractore.core.ligand.Ligand` for details.
:param s:
:param single_poly_chain:
:return:
"""
a = s.array
g = s.graph
marks = np.full(len(a), AtomMark.UNK)
n_monomers = DefaultConfig["structure"]["n_monomers"]
lig_pol_types = DefaultConfig["structure"]["ligand_pol_types"]
# Keep track of polymer masks and sizes for each polymer type
polymers = defaultdict(list)
polymer_sizes = defaultdict(int)
# Iterate over connected components (molecules) in a graph.
for cc_idx in map(list, rx.connected_components(g)):
# Make residue mask corresponding to all atoms from residues
# of the connected component atom indices
r_mask = extend_residue_mask(a, cc_idx)
n_resi = bst.get_residue_count(a[r_mask])
# Check if the single residue CC is a solvent.
# Otherwise, it is a ligand candidate.
if n_resi == 1:
res_name = a[r_mask].res_name[0]
if res_name in DefaultConfig["residues"]["solvents"]:
marks[r_mask] = AtomMark.SOLVENT
continue
# If not solvent or polymer: continue
if n_resi < n_monomers:
continue
pol_type = find_first_polymer_type(a[r_mask], n_monomers)
if pol_type != "x":
polymers[pol_type].append(r_mask)
polymer_sizes[pol_type] += n_resi
if not polymers or all(len(x) == 0 for x in polymers.values()):
LOGGER.warning("No polymer atoms identified. Marking as SOLVENT+UNK.")
return marks, "x", []
# Determine primary polymer type
cfg_pol_type = DefaultConfig["structure"]["primary_pol_type"][0]
if cfg_pol_type == "a":
prim_pol_type = max(polymer_sizes.items(), key=lambda x: x[1])[0]
else:
prim_pol_type = cfg_pol_type
if not polymers[prim_pol_type]:
LOGGER.warning(
f"No primary polymer '{prim_pol_type}' atoms. Marking as SOLVENT+UNK."
)
return marks, "x", []
# Force a single polymer chain or allow multiple ones
fn = _to_single_poly_mask if single_poly_chain else _combine_poly_masks
is_pol = fn(a, polymers[prim_pol_type])
if not np.any(is_pol):
LOGGER.warning(
f"No primary polymer '{prim_pol_type}' atoms. Marking as SOLVENT+UNK."
)
return marks, "x", []
# Mark determined primary polymer atoms. Ligand polymers will be marked below.
marks[is_pol] = _POL_MARKS[prim_pol_type]
ligands = []
# Detect covalently bound solvent or ligand atoms.
het_pol_idx = np.where(a.hetero & is_pol)[0]
if len(het_pol_idx) > 0:
starts_het_pol = np.unique(bst.get_residue_starts_for(a, het_pol_idx))
for r_mask in bst.get_residue_masks(a, starts_het_pol):
het_pol_type = find_first_polymer_type(a[r_mask], min_size=1)
if het_pol_type == prim_pol_type:
continue
lig = make_ligand(r_mask, is_pol & ~r_mask, s)
if lig is None:
marks[r_mask] = AtomMark.SOLVENT | AtomMark.COVALENT
else:
marks[r_mask] = AtomMark.LIGAND | AtomMark.COVALENT
ligands.append(lig)
# Find unmarked atom indices and impose a subgraph based on them
remaining_idx = np.where(marks == AtomMark.UNK)[0]
sg = g.subgraph(remaining_idx)
cc_idx_viewed = set()
# Iterate over the subgraph's connected components
for cc_idx in map(list, rx.connected_components(sg)):
# Obtain original indices; they are reset upon a subgraph creation
cc_idx = sg.subgraph(cc_idx).nodes()
# Obtain a mask pointing to CC residues that were not previously
# marked as polymer or solvent
r_mask = extend_residue_mask(a, cc_idx)
# Avoid creating duplicated ligands by storing already assessed atom
# indices
r_mask[list(cc_idx_viewed)] = False
if not np.any(r_mask):
continue
cc_idx_viewed |= set(np.where(r_mask)[0])
n_resi = bst.get_residue_count(a[r_mask])
# Attempt making a ligand
lig = make_ligand(r_mask, is_pol, s)
if lig is None:
continue
multiples_flag = False # indicates whether ligand is split below
if n_resi > 1: # Handle polymer ligand
if lig.res_name[0] in lig_pol_types:
marks[r_mask] = AtomMark.LIGAND | _POL_MARKS[lig.res_name[0]]
else:
# Ligand is non-polymer but still has multiple residues.
# Users are warned about such cases.
# Split into separate residues if polymer type is undetermined.
# It's likely that a ligand appears as a single CC due to
# structure artifacts, eg, molecules with different altloc
# occupying the same spatial region.
if lig.res_name[0] == "x":
multiples_flag = True
starts = np.unique(bst.get_residue_starts_for(a, cc_idx))
for r_mask in bst.get_residue_masks(a, starts):
lig = make_ligand(r_mask, is_pol & ~r_mask, s)
if lig is not None:
marks[r_mask] = AtomMark.LIGAND
ligands.append(lig)
# Otherwise, despite polymeric nature of a ligand, it's
# unsupported by the config and will be marked as a regular
# ligand.
else:
marks[r_mask] = AtomMark.LIGAND
else: # Handle a non-polymer ligand
marks[r_mask] = AtomMark.LIGAND
if not multiples_flag:
ligands.append(lig)
ligand_mask = reduce(
op.or_,
(lig.mask for lig in ligands),
np.zeros_like(a.res_id, dtype=bool),
)
for lig in ligands:
lig.contact_mask[ligand_mask] = False
lig.dist[ligand_mask] = -1
return marks, prim_pol_type, ligands
if __name__ == "__main__":
raise RuntimeError