Source code for lXtractor.core.structure

"""
Module defines basic interfaces to interact with macromolecular structures.
"""
from __future__ import annotations

import logging
import operator as op
import typing as t
from collections import abc, defaultdict
from dataclasses import dataclass
from functools import reduce
from io import IOBase
from os import PathLike
from pathlib import Path

import biotite.structure as bst
import numpy as np
import numpy.typing as npt
import rustworkx as rx
from more_itertools import unique_everseen
from toolz import keyfilter

import lXtractor.core.segment as lxs
from lXtractor.core.base import ResNameDict
from lXtractor.core.config import AtomMark, DefaultConfig, EMPTY_ALTLOC
from lXtractor.core.exceptions import NoOverlap, InitError, LengthMismatch, MissingData
from lXtractor.core.ligand import Ligand, make_ligand, ligands_from_atom_marks
from lXtractor.util import get_files
from lXtractor.util.misc import json_to_molgraph, graph_reindex_nodes, molgraph_to_json
from lXtractor.util.structure import (
    filter_selection,
    load_structure,
    save_structure,
    filter_solvent_extended,
    iter_residue_masks,
    mark_polymer_type,
    to_graph,
    find_first_polymer_type,
    extend_residue_mask,
    compare_coord,
)

LOGGER = logging.getLogger(__name__)
RES_DICT = ResNameDict()
_POL_MARKS = {
    "c": AtomMark.CARB,
    "n": AtomMark.NUC,
    "p": AtomMark.PEP,
    "x": AtomMark.UNK,
}



[docs]
@dataclass(frozen=True)
class Masks:
    primary_polymer: npt.NDArray[np.bool_]
    primary_polymer_ptm: npt.NDArray[np.bool_]
    primary_polymer_modified: npt.NDArray[np.bool_]
    solvent: npt.NDArray[np.bool_]
    ligand: npt.NDArray[np.bool_]
    ligand_covalent: npt.NDArray[np.bool_]
    ligand_poly: npt.NDArray[np.bool_]
    ligand_nonpoly: npt.NDArray[np.bool_]
    ligand_pep: npt.NDArray[np.bool_]
    ligand_nuc: npt.NDArray[np.bool_]
    ligand_carb: npt.NDArray[np.bool_]
    unk: npt.NDArray[np.bool_]




[docs]
class GenericStructure:
    """
    A generic macromolecular structure with possibly many chains holding
    a single :class:`biotite.structure.AtomArray` instance.

    This object is a core data structure in `lXtractor` for structural data.

    The object is considered immutable: atoms of a structure can't change their
    location or properties, as well as other protected attributes.

    While atoms are stored as :class:`biotite.structure.AtomArray`,
    `GenericStructure` defines additional annotations for each atom and
    operations crucial for other objects such as
    :class:`lXtractor.core.chain.ChainStructure`.

    Upon initialization, atom array attains graph representation (:meth:`graph`)
    using :func:`lXtractor.util.structure.to_graph` function. Using this
    representation, atom annotations are attained via :func``mark_atoms_g`.
    These annotations can be accessed via :meth:`atom_marks`. For convenience,
    boolean masks are stored and can be applied to the :meth:`array` as follows:

    .. code-block:: python

        # Assume ``s`` is a :class:`GenericStructure` object.
        s[s.mask.`mask_name`]

    To view available mask names, see :class:`Masks`.

    One of the most crucial annotations is the so-called "primary_polymer".
    These atoms serve as a frame of reference for all other atoms in a structure.
    The rest of the atoms are categorized as either ligand or solvent. Sometimes
    the annotation process fails to identify certain atoms. In such cases, a
    warning is logged. To view uncategorized atoms, one can use the following mask:

    .. code-block:: python

        s[s.mask.unk]

    .. note::
        Using ``__getitem__(item)`` like in ``s[s.mask.unk`` will return an
        atom array. Use :meth:`subset` to obtain a new generic structure or
        initialize a new ``GenericStructure(s[s.mask.unk] instance; it will be
        equivalent.

    Methods ``__repr__`` and ``__str__`` output a string in the format:
    ``{_name}:{polymer_chain_ids};{ligand_chain_ids}|{altloc_ids}``
    where ``*ids`` are ","-separated.
    """

    __slots__ = (
        "_atom_marks",
        "_array",
        "_graph",
        "_name",
        "_ligands",
        "_mask",
        "_id",
    )


[docs]
    def __init__(
        self,
        array: bst.AtomArray,
        name: str,
        ligands: abc.Sequence[Ligand] | None = None,
        atom_marks: npt.NDArray[int] | PathLike | None = None,
        graph: rx.PyGraph | dict | PathLike | None = None,
    ):
        """
        :param array: Atom array object.
        :param name: ID of a structure in `array`.
        :param ligands: A list of ligands or flag indicating to extract ligands
            during initialization.
        """
        #: Atom array object.
        self._array: bst.AtomArray = array
        #: ID of a structure in `array`.
        self._name: str = name

        if isinstance(graph, rx.PyGraph):
            self._graph = graph
        elif isinstance(graph, dict | PathLike):
            self._graph = json_to_molgraph(graph)
        else:
            self._graph = to_graph(array, True)

        if len(self._graph) != len(self._array):
            raise LengthMismatch(
                f"The number of nodes in a graph ({len(self._graph)}) does not match "
                f"the number of atoms in the array ({len(self._array)})."
            )

        if atom_marks is None:
            atom_marks, primary_pol_type, _ligands = mark_atoms_g(self)
            atom_marks.flags.writeable = False
            self._atom_marks = atom_marks

            if isinstance(ligands, list):
                _ligands = ligands
        else:
            if isinstance(atom_marks, PathLike):
                atom_marks = np.load(atom_marks)
            if not isinstance(atom_marks, np.ndarray):
                raise TypeError(
                    f"Expected `atom_marks` to be an array, got {type(atom_marks)}"
                )
            if len(atom_marks) != len(array):
                raise LengthMismatch(
                    "The lengths of `atom_marks` and `array` must match. "
                    f"Got {len(atom_marks)} and {len(array)}."
                )
            self._atom_marks = atom_marks
            # determine primary polymer type
            primary_pol_type = find_first_polymer_type(atom_marks)
            if not isinstance(ligands, list):
                _ligands = list(ligands_from_atom_marks(self))
            else:
                _ligands = ligands

        if any(not isinstance(x, Ligand) for x in _ligands):
            raise TypeError(
                "Some entries in supplied `ligands` are not of the `Ligand` type"
            )
        #: A tuple of ligands
        self._ligands: tuple[Ligand, ...] = tuple(
            sorted(_ligands, key=lambda x: (x.res_name, x.res_id, x.chain_id))
        )

        if np.any(atom_marks == AtomMark.UNK):
            num_unk = np.sum(atom_marks == AtomMark.UNK)
            LOGGER.warning(f"Structure {name} has {num_unk} uncategorized atoms.")

        ligand_pep = atom_marks == (AtomMark.PEP | AtomMark.LIGAND)
        ligand_nuc = atom_marks == (AtomMark.NUC | AtomMark.LIGAND)
        ligand_carb = atom_marks == (AtomMark.CARB | AtomMark.LIGAND)
        ligand_poly = ligand_pep | ligand_nuc | ligand_carb
        ligand_nonpoly = atom_marks == AtomMark.LIGAND
        ligand_covalent = atom_marks == (AtomMark.LIGAND | AtomMark.COVALENT)

        if primary_pol_type == "x":
            primary_polymer = np.full_like(atom_marks, False, bool)
        else:
            primary_polymer = atom_marks == _POL_MARKS[primary_pol_type]

        primary_polymer_ptm = primary_polymer & array.hetero
        primary_polymer_mod = (
            atom_marks == (AtomMark.SOLVENT | AtomMark.COVALENT)
        ) | primary_polymer_ptm

        self._mask = Masks(
            primary_polymer=primary_polymer,
            primary_polymer_ptm=primary_polymer_ptm,
            primary_polymer_modified=primary_polymer_mod,
            solvent=(atom_marks == AtomMark.SOLVENT),
            ligand=ligand_nonpoly | ligand_poly,
            ligand_covalent=ligand_covalent,
            ligand_nonpoly=ligand_nonpoly,
            ligand_poly=ligand_poly,
            ligand_pep=ligand_pep,
            ligand_nuc=ligand_nuc,
            ligand_carb=ligand_carb,
            unk=(atom_marks == AtomMark.UNK),
        )

        self._id = self._make_id()


    def __len__(self) -> int:
        return len(self.array)

    def __eq__(self, other: t.Any) -> bool:
        if isinstance(other, GenericStructure):
            return (
                    self._name == other._name
                    and len(self) == len(other)
                    and np.all(self.atom_marks == other.atom_marks)
                    and compare_coord(self.array, other.array)
                    and self.ligands == other.ligands
            )
        return False

    def __hash__(self) -> int:
        atoms = tuple(
            (a.chain_id, a.res_id, a.res_name, a.atom_name, tuple(a.coord))
            for a in self.array
        )
        return hash(self._name) + hash(atoms)

    def __str__(self) -> str:
        return self.id

    def __repr__(self) -> str:
        return self.id

    def __getitem__(self, item: t.Any) -> bst.AtomArray:
        return self._array.__getitem__(item)

    def _make_id(self) -> str:
        chains_pol = ",".join(sorted(self.chain_ids_polymer))
        chains_lig = ",".join(sorted(self.chain_ids_ligand))
        altloc_ids = ",".join(filter(lambda x: x not in EMPTY_ALTLOC, self.altloc_ids))
        return f"{self._name}:{chains_pol};{chains_lig}|{altloc_ids}"

    @property
    def atom_marks(self) -> npt.NDArray[np.int_]:
        """
        :return: An array of :class:`lXtractor.core.config.AtomMark` marks,
            categorizing each atom in this structure.
        """
        return self._atom_marks

    @property
    def graph(self) -> rx.PyGraph:
        """
        :return: A structure's graph representation.
        """
        return self._graph

    @property
    def id(self) -> str:
        """
        :return: An identifier of this structure. It's composed once upon
            initialization and has the following format:
            ``{_name}:{polymer_chain_ids};{ligand_chain_ids}|{altloc_ids}``.
            It should uniquely identify a structure, i.e., one should expect
            two structures with the same ID to be identical.
        """
        return self._id

    @property
    def name(self) -> str:
        """
        :return: A name of the structure.
        """
        return self._name

    @name.setter
    def name(self, value: str):
        if not isinstance(value, str):
            raise TypeError(f"New ID must have the `str` type. Got {type(value)}")
        self._name = value
        self._id = self._make_id()

    @property
    def array(self) -> bst.AtomArray:
        """
        :return: Atom array object.
        """
        return self._array

    @array.setter
    def array(self, _: t.Any) -> None:
        raise RuntimeError("Array cannot be set. Please initialize a new structure")

    @property
    def mask(self) -> Masks:
        return self._mask

    @property
    def altloc_ids(self) -> list[str]:
        """
        :return: A sorted list of altloc IDs. If none found, will output ``[""]``.
        """
        if hasattr(self.array, "altloc_id"):
            return sorted(set(self.array.altloc_id))
        return [""]

    @property
    def chain_ids(self) -> list[str]:
        """
        :return: A list of chain IDs this structure encompasses.
        """
        return list(unique_everseen(self.array.chain_id))

    @property
    def chain_ids_polymer(self) -> list[str]:
        """
        :return: A list of polymer chain IDs.
        """
        return list(unique_everseen(self.array[self.mask.primary_polymer].chain_id))

    @property
    def chain_ids_ligand(self) -> list[str]:
        """
        :return: A set of ligand chain IDs.
        """
        return list(unique_everseen(lig.chain_id for lig in self.ligands))

    @property
    def ligands(self) -> tuple[Ligand, ...]:
        """
        :return: A list of ligands.
        """
        return self._ligands

    @ligands.setter
    def ligands(self, _):
        raise RuntimeError("Cannot set ligands")

    @property
    def is_empty(self) -> bool:
        """
        :return: ``True`` if the :meth:`array` is empty.
        """
        return len(self) == 0

    @property
    def is_empty_polymer(self) -> bool:
        """
        Check if there are any polymer atoms.

        :return: ``True`` if there are >=1 polymer atoms and ``False``
            otherwise.
        """
        return self.mask.primary_polymer.sum() == 0

    @property
    def is_singleton(self) -> bool:
        """
        :return: ``True`` if the structure contains a single residue.
        """
        return bst.get_residue_count(self.array) == 1


[docs]
    @classmethod
    def make_empty(
        cls, structure_id: str = DefaultConfig["unknowns"]["structure_id"]
    ) -> t.Self:
        """
        :param structure_id: (Optional) ID of the created array.
        :return: An instance with empty :meth:`array`.
        """
        return cls(bst.AtomArray(0), structure_id, [])



[docs]
    @classmethod
    def read(
        cls,
        inp: IOBase | Path | str | bytes,
        path2id: abc.Callable[[Path], str] = lambda p: p.name.split(".")[0],
        structure_id: str = DefaultConfig["unknowns"]["structure_id"],
        altloc: bool | str = False,
        **kwargs,
    ) -> t.Self:
        """
        Parse the atom array from the provided input and wrap it into the
        :class:`GenericStructure` object.

        .. seealso::
            :func:`lXtractor.util.structure.load_structure`

        .. note::
            If `inp` is not a ``Path``, ``kwargs`` must contain the correct
            ``fmt`` (e.g., ``fmt=cif``).

        :param inp: Path to a structure in supported format.
        :param path2id: A callable obtaining a PDB ID from the file path.
            By default, it's a ``Path.stem``.
        :param structure_id: A structure unique identifier (e.g., PDB ID). If
            not provided and the input is ``Path``, will use ``path2id`` to
            infer the ID. Otherwise, will use a constant placeholder.
        :param altloc: Parse alternative locations and populate
            ``array.altloc_id`` attribute.
        :param kwargs: Passed to ``load_structure``.
        :return: Parsed structure.
        """
        if isinstance(altloc, bool) and altloc:
            altloc = "all"
        if isinstance(altloc, str):
            kwargs["altloc"] = altloc
        array = load_structure(inp, **kwargs)
        empty_id = DefaultConfig["unknowns"]["structure_id"]
        if hasattr(array, "altloc_id"):
            array.altloc_id[np.isin(array.altloc_id, EMPTY_ALTLOC)] = ""
        if isinstance(inp, Path) and structure_id == empty_id:
            structure_id = path2id(inp)
        if isinstance(array, bst.AtomArrayStack):
            raise InitError(
                f"{inp} is likely an NMR structure. "
                f"NMR structures are not supported."
            )
        if isinstance(inp, Path):
            files = get_files(inp.parent)
            name = inp.stem.split(".")[0]
            marks_name, graph_name = f"{name}.npy", f"{name}.json"
            marks = files.get(marks_name, None)
            graph = files.get(graph_name, None)
        else:
            marks, graph = None, None

        return cls(array, structure_id, ligands=None, atom_marks=marks, graph=graph)



[docs]
    def write(
        self, path: PathLike | str, atom_marks: bool = True, graph: bool = True
    ) -> Path:
        """
        Save this structure to a file. The format is automatically determined
        from the given path.

        Additional files are saved using the same filename alongside the
        structure file. The filename will resolve to "structure" in all the
        following cases and result in "structure.npy" and "structure.json"
        files saved to the same dir::

            path="/path/to/structure.pdb"
            path="/path/to/structure.mmtf.gz"
            path="/path/to/structure.with.many.dots.pdb.gz"

        .. seealso::
            :func:`lXtractor.util.structure.save_structure`.

        :param path: A path or a path-like object compatible with :func:`open`.
            Must not point to an existing directory. Must provide the structure
            format as an extension.
        :param atom_marks: Save an array of atom marks in the `npy` format.
        :param graph: Save molecular connectivity graph in the `json` format.
        :return: Path to the saved structure if writing was successful.
        """
        if not isinstance(path, Path):
            path = Path(path)
        saved_path = save_structure(self.array, path)
        stem = path.stem.split(".")[0]
        if atom_marks:
            np.save(path.parent / f"{stem}.npy", self.atom_marks)
        if graph:
            json_path = path.parent / f"{stem}.json"
            molgraph_to_json(self.graph, path=json_path)
        return saved_path



[docs]
    def get_sequence(self) -> abc.Generator[tuple[str, str, int]]:
        """
        :return: A generator over tuples, where each residue is described by:
            (1) one-letter code, (2) three-letter code, (3) residue number.
        """
        if self.is_empty:
            return []

        m = self.mask.primary_polymer
        a = self.array if not np.any(m) else self.array[m]
        first_atoms = (r[0] for r in bst.residue_iter(a))
        for i, a in enumerate(first_atoms, start=1):
            try:
                res3 = RES_DICT[a.res_name]
            except KeyError:
                LOGGER.warning(
                    f"Failed to obtain a single-letter code for residue {a.res_name} "
                    f"at {i}-th position ({a.res_id + 1}-th first atom position in "
                    f"protein) for a structure {self}."
                )
                res3 = "X"
            yield res3, a.res_name, a.res_id



[docs]
    def subset(
        self,
        mask: np.ndarray,
        ligands: bool = True,
        reinit_ligands: bool = False,
        copy: bool = False,
    ) -> t.Self:
        """
        Create a sub-structure potentially preserving connected :meth:`ligands`.

        .. warning::

            If ``DefaultConfig["structure"]["primary_pol_type"]`` is set to auto,
            and `mask` points to a polymer that is shorter than some existing
            ligand polymer, this ligand polymer will become a primary polymer
            in the substructure.

        :param mask: Boolean mask, ``True`` for atoms in :meth:`array`, used
            to create a sub-structure.
        :param ligands: Keep ligands that are connected to atoms specified by
            `mask`.
        :param reinit_ligands: Reinitialize ligands upon creating a sub-structure,
            rather than filtering existing ligands connected to atoms specified
            by `mask`. Takes precedence over the `ligands` option. This option
            is used in :meth:`split_altloc`.
        :param copy: Copy the atom array resulting from subsetting the original
            one.
        :return: A new instance with atoms defined by `mask` and connected
            ligands.
        """

        if reinit_ligands:
            ligands = False

        if ligands:
            # Filter connected ligands
            ligands = list(
                filter(
                    lambda lig: lig.is_locally_connected(mask),
                    self.ligands,
                )
            )
            # Extend mask by atoms from the connected ligands
            ligand_mask = reduce(
                op.or_,
                (lig.mask for lig in ligands),
                np.zeros_like(self.array.res_id, dtype=bool),
            )
        else:
            ligand_mask = mask

        _mask = mask | ligand_mask
        a = self.array[_mask]
        g = graph_reindex_nodes(self.graph.subgraph(np.where(_mask)[0]))
        m = self.atom_marks[_mask]
        if copy:
            a = a.copy()
            m = m.copy()

        s = self.__class__(
            a, self.name, ligands=True if reinit_ligands else [], atom_marks=m, graph=g
        )

        if ligands:
            new_ligands = []
            names = DefaultConfig["metadata"]
            retain_names = [
                names["res_name"],
                names["res_id"],
                names["structure_chain_id"],
            ]
            for lig in ligands:
                meta = keyfilter(lambda x: x in retain_names, lig.meta)
                new_ligands.append(
                    Ligand(
                        s,
                        lig.mask[_mask],
                        lig.contact_mask[_mask],
                        lig.ligand_idx[_mask],
                        lig.dist[_mask],
                        meta,
                    )
                )
            s._ligands = tuple(
                sorted(new_ligands, key=lambda x: (x.res_name, x.res_id, x.chain_id))
            )
            s._id = s._make_id()

        return s



[docs]
    def rm_solvent(self, copy: bool = False):
        """
        :param copy: Copy the resulting substructure.
        :return: A substructure with solvent molecules removed.
        """
        return self.subset(~self.mask.solvent, ligands=True, copy=copy)



[docs]
    def split_chains(self, polymer: bool = False, **kwargs) -> abc.Iterator[t.Self]:
        """
        Split into separate chains. Splitting is done using
        :func:`biotite.structure.get_chain_starts`.

        .. note::
            Preserved ligands may have a different ``chain_id``.

        .. note::
            If there is a single chain, this method will return ``self``.

        :param polymer: Use only primary polymer chains for splitting.
        :param kwargs: Passed to :meth:`subset`.
        :return: An iterable over chains found in :attr:`array`.
        """

        if polymer:
            chain_ids = self.chain_ids_polymer
        else:
            chain_ids = self.chain_ids

        # a = self.array.copy() if copy else self.array
        a = self.array
        if len(chain_ids) == 1:
            yield self
            return

        for chain_id in sorted(chain_ids):
            mask = a.chain_id == chain_id
            yield self.subset(mask, **kwargs)



[docs]
    def split_altloc(self, **kwargs) -> abc.Iterator[t.Self]:
        """
        Split into substructures based on altloc IDs. Atoms missing altloc
        annotations are distributed into every substructure. Thus, even if
        a structure contains a single atom having altlocs (say, A and B),
        this method will produce two substructed identical except for this
        atom.

        .. note::
            If :meth:`array` does not specify any altloc ID, the method yields
            ``self``.

        :param kwargs: Passed to :meth:`subset`.
        :return: An iterator over objects of the same type initialized by
            atoms having altloc annotations.
        """
        ids = self.altloc_ids
        if len(ids) == 1:
            yield self
            return

        if "reinit_ligands" not in kwargs:
            kwargs["reinit_ligands"] = True

        no_alt_mask = np.isin(self.array.altloc_id, EMPTY_ALTLOC)
        for altloc in ids[1:]:
            m = no_alt_mask | (self.array.altloc_id == altloc)
            yield self.subset(m, **kwargs)



[docs]
    def extract_segment(self, start: int, end: int, chain_id: str, **kwargs) -> t.Self:
        """
        Create a sub-structure encompassing some continuous segment bounded by
        existing position boundaries.

        :param start: Residue number to start from (inclusive).
        :param end: Residue number to stop at (inclusive).
        :param chain_id: Chain to extract a segment from.
        :param kwargs: Passed to :meth:`subset`.
        :return: A new Generic structure with residues in ``[start, end]``.
        """
        if self.is_empty:
            raise NoOverlap("Attempting to sub an empty structure")

        chain_mask = self.array.chain_id == chain_id
        a = self.array[chain_mask]
        self_start, self_end = a.res_id.min(), a.res_id.max()

        # This is needed when some positions are <= 0 which can occur
        # in PDB structures but unsupported for a Segment.
        offset_self = abs(self_start) + 1 if self_start <= 0 else 0
        offset_start = abs(start) + 1 if start <= 0 else 0
        offset = max(offset_start, offset_self)
        self_start_, self_end_, start_, end_ = map(
            lambda x: x + offset, [self_start, self_end, start, end]
        )
        if self_start_ > self_end_:
            raise NoOverlap(
                f"Invalid boundaries ({self_start}+{offset}, {self_end}+{offset}) "
                f"derived for sequence being subsetted"
            )
        if start_ > end_:
            raise NoOverlap(
                f"Invalid boundaries ({start}+{offset}, {end}+{offset}) "
                f"derived for subsetting"
            )
        seg_self = lxs.Segment(self_start_, self_end_)
        seg_sub = lxs.Segment(start_, end_)
        if not seg_self.bounds(seg_sub):
            raise NoOverlap(
                f"Provided positions {start, end} lie outside "
                f"of the structure positions {self_start, self_end}"
            )
        mask = chain_mask & (self.array.res_id >= start) & (self.array.res_id <= end)
        return self.subset(mask, **kwargs)



[docs]
    def extract_positions(
        self,
        pos: abc.Sequence[int],
        chain_ids: abc.Sequence[str] | str | None = None,
        **kwargs,
    ) -> t.Self:
        """
        Extract specific positions from this structure.

        :param pos: A sequence of positions (res_id) to extract.
        :param chain_ids: Optionally, a single chain ID or a sequence of such.
        :param kwargs: Passed to :meth:`subset`.
        :return: A new instance with extracted residues.
        """

        if self.is_empty:
            return self.make_empty(self._name)

        a = self.array

        mask = np.isin(a.res_id, pos)
        if chain_ids is not None:
            if isinstance(chain_ids, str):
                chain_ids = [chain_ids]
            mask &= np.isin(a.chain_id, chain_ids)
        return self.subset(mask, **kwargs)



[docs]
    def superpose(
        self,
        other: GenericStructure | bst.AtomArray,
        res_id_self: abc.Iterable[int] | None = None,
        res_id_other: abc.Iterable[int] | None = None,
        atom_names_self: abc.Iterable[abc.Sequence[str]]
        | abc.Sequence[str]
        | None = None,
        atom_names_other: abc.Iterable[abc.Sequence[str]]
        | abc.Sequence[str]
        | None = None,
        mask_self: np.ndarray | None = None,
        mask_other: np.ndarray | None = None,
    ) -> tuple[GenericStructure, float, tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """
        Superpose other structure to this one.
        Arguments to this function all serve a single purpose: to correctly
        subset both structures so the resulting selections have the same number
        of atoms.

        The subsetting achieved either by specifying residue numbers and atom
        names or by supplying a binary mask of the same length as the number
        of atoms in the structure.

        :param other: Other :class:`GenericStructure` or atom array.
        :param res_id_self: Residue numbers to select in this structure.
        :param res_id_other: Residue numbers to select in other structure.
        :param atom_names_self: Atom names to select in this structure given
            either per-residue or as a single sequence broadcasted to selected
            residues.
        :param atom_names_other: Same as `self`.
        :param mask_self: Binary mask to select atoms. Takes precedence over
            other selection arguments.
        :param mask_other: Same as `self`.
        :return: A tuple of (1) an `other` structure superposed onto this one,
            (2) an RMSD of the superposition, and (3) a transformation that
            had been used with :func:`biotite.structure.superimpose_apply`.
        """

        def _get_mask(a, res_id, atom_names):
            if res_id:
                m = filter_selection(a, res_id, atom_names)
            else:
                if atom_names:
                    m = np.isin(a.atom_name, atom_names)
                else:
                    m = np.ones_like(a, bool)
            return m

        if self.is_empty or other.is_empty:
            raise MissingData("Superposing empty structures is not supported")

        if mask_self is None:
            mask_self = _get_mask(self.array, res_id_self, atom_names_self)
        if mask_other is None:
            mask_other = _get_mask(other.array, res_id_other, atom_names_other)

        num_self, num_other = mask_self.sum(), mask_other.sum()
        if num_self != num_other:
            raise LengthMismatch(
                f"To superpose, the number of atoms must match. "
                f"Got {num_self} in self and {num_other} in other."
            )

        if num_self == num_other == 0:
            raise MissingData("No atoms selected")

        superposed, transformation = bst.superimpose(
            self.array[mask_self], other.array[mask_other]
        )
        other_transformed = bst.superimpose_apply(other.array, transformation)

        rmsd_target = bst.rmsd(self.array[mask_self], superposed)

        return (
            GenericStructure(other_transformed, other._name),
            rmsd_target,
            transformation,
        )





[docs]
class ProteinStructure(GenericStructure):
    """
    A structure type where primary polymer is peptide.

    .. seealso::
        :class:`GenericStructure` for general-purpose documentation.
    """


[docs]
    def __init__(
        self,
        array: bst.AtomArray,
        structure_id: str,
        ligands: bool | list[Ligand] = True,
        atom_marks: npt.NDArray[int] | PathLike | None = None,
        graph: rx.PyGraph | dict | PathLike | None = None,
    ):
        with DefaultConfig.temporary_namespace():
            DefaultConfig["structure"]["primary_pol_type"] = "p"
            super().__init__(array, structure_id, ligands, atom_marks, graph)





[docs]
class NucleotideStructure(GenericStructure):
    """
    A structure type where primary polymer is nucleotide.

    .. seealso::
        :class:`GenericStructure` for general-purpose documentation.
    """


[docs]
    def __init__(
        self,
        array: bst.AtomArray,
        structure_id: str,
        ligands: bool | list[Ligand] = True,
        atom_marks: npt.NDArray[int] | PathLike | None = None,
        graph: rx.PyGraph | dict | PathLike | None = None,
    ):
        with DefaultConfig.temporary_namespace():
            DefaultConfig["structure"]["primary_pol_type"] = "n"
            super().__init__(array, structure_id, ligands, atom_marks, graph)





[docs]
class CarbohydrateStructure(GenericStructure):
    """
    A structure type where primary polymer is carbohydrate.

    .. seealso::
        :class:`GenericStructure` for general-purpose documentation.
    """


[docs]
    def __init__(
        self,
        array: bst.AtomArray,
        structure_id: str,
        ligands: bool | list[Ligand] = True,
        atom_marks: npt.NDArray[int] | PathLike | None = None,
        graph: rx.PyGraph | dict | PathLike | None = None,
    ):
        with DefaultConfig.temporary_namespace():
            DefaultConfig["structure"]["primary_pol_type"] = "c"
            super().__init__(array, structure_id, ligands, atom_marks, graph)





[docs]
def mark_atoms(
    structure: GenericStructure,
) -> tuple[npt.NDArray[np.int_], list[Ligand]]:
    """
    Mark each atom in structure according to
    :class:`lXtractor.core.config.AtomMark`.

    This function is used upon initializing :class:`GenericStructure` and its
    subclasses, storing the output under :attr:`GenericStructure.atom_marks`.

    :param structure: An arbitrary structure.
    :return: An array of atom marks (equivalently, classes or types).
    """
    a = structure.array

    is_solv = filter_solvent_extended(a)
    pol_types = mark_polymer_type(a, DefaultConfig["structure"]["n_monomers"])
    is_nuc, is_pep, is_carb = (pol_types == p for p in ["n", "p", "c"])

    is_any_pol = pol_types != "x"
    is_solv[is_any_pol] = False

    marks = np.full(len(a), AtomMark.UNK)
    marks[is_solv] = AtomMark.SOLVENT

    match DefaultConfig["structure"]["primary_pol_type"][0]:
        case "p":
            is_pol, pol_type = is_pep, "p"
        case "n":
            is_pol, pol_type = is_nuc, "n"
        case "c":
            is_pol, pol_type = is_carb, "c"
        case _:
            is_pol, pol_type = max(
                [(is_carb, "c"), (is_nuc, "n"), (is_pep, "p")], key=lambda x: x[0].sum()
            )

    if not any(is_pol):
        LOGGER.warning("No polymer atoms identified. Marking as SOLVENT+UNK.")
        return marks, []

    marks[is_carb] = AtomMark.CARB
    marks[is_nuc] = AtomMark.NUC
    marks[is_pep] = AtomMark.PEP

    ligands = []

    # Annotate small-molecule ligands
    is_putative_lig = ~(is_any_pol | is_solv)
    if np.any(is_putative_lig):
        for m_res in iter_residue_masks(a):
            # A mask that is a single ligand residue
            m_lig = is_putative_lig & m_res
            lig = make_ligand(m_lig, is_pol, structure)
            if lig is not None:
                marks[m_lig] = AtomMark.LIGAND
                ligands.append(lig)

    # Annotate polymer ligands
    is_putative_lig = ~(is_pol | is_solv | (marks == AtomMark.LIGAND)) & is_any_pol
    pol_marks = {"c": AtomMark.CARB, "n": AtomMark.NUC, "p": AtomMark.PEP}
    if np.any(is_putative_lig):
        for c in bst.get_chains(a[is_putative_lig]):
            m_lig = is_putative_lig & (a.chain_id == c)
            lig = make_ligand(m_lig, is_pol, structure)
            if (
                lig is not None
                and lig.res_name[0] in DefaultConfig["structure"]["ligand_pol_types"]
            ):
                marks[m_lig] = AtomMark.LIGAND | pol_marks[lig.res_name[0]]
                ligands.append(lig)

    return marks, ligands



def _get_one_chain(a: bst.AtomArray) -> str:
    chain_ids = set(a.chain_id)
    if len(chain_ids) != 1:
        raise ValueError(f"Expected single chain, found {chain_ids}")
    return chain_ids.pop()


def _to_single_poly_mask(
    a: bst.AtomArray,
    poly_masks: abc.Sequence[npt.NDArray[np.bool_]],
):
    largest_idx = np.argmax([x.sum() for x in poly_masks])
    pol_mask = poly_masks[largest_idx]
    pol_chain = _get_one_chain(a[pol_mask])

    for i, m in enumerate(poly_masks):
        if i == largest_idx:
            continue
        a_sub = a[m]
        # If chains match and not all atoms are hetero
        if _get_one_chain(a_sub) == pol_chain and np.any(~a_sub.hetero):
            pol_mask[m] = True

    return pol_mask


def _combine_poly_masks(
    a: bst.AtomArray,
    poly_masks: abc.Sequence[npt.NDArray[np.bool_]],
):
    poly_mask = np.full_like(a, False, dtype=np.bool_)
    for m in poly_masks:
        if np.sum(a[m].hetero) < m.sum() / 2:
            poly_mask[m] = True

    return poly_mask



[docs]
def mark_atoms_g(
    s: GenericStructure, single_poly_chain: bool = False
) -> (npt.NDArray[np.int_], str, list[Ligand]):
    """
    Mark structure atoms based on a molecular graph's representation by of
    the :class:`lXtractor.core.config.AtomMark` categories.

    Atoms are classified into five categories::

        #. primary polymer: corresponds to ``PEP``, ``NUC`` or ``CARB``
        categories.
        #. solvent: ``SOLVENT``.
        #. non polymer ligand: ``LIGAND``.
        #. polymer ligand: A combination of ``LIGAND`` with one of the primary
        polymer types, eg. ``AtomMark.LIGAND | AtomMark.NUC``.
        #. unknown: ``UNK`` for atoms that couldn't be categorized.

    The classification process depends on groups of atoms forming covalent bonds
    with each other, or connected components in the molecular graph representation.
    Each such component is assessed separately and its atoms are classified
    as polymer, ligand, or solvent. If the primary polymer is set to "auto" in
    config (``DefaultConfig["structure"]["primary_pol_type"]``), the polymer
    with the largest number of monomers will be selected. The rest of the polymers
    will become polymer ligands: special kind of ligand that can have multiple
    residues. See :class:`lXtractore.core.ligand.Ligand` for details.

    :param s:
    :param single_poly_chain:
    :return:
    """
    a = s.array
    g = s.graph
    marks = np.full(len(a), AtomMark.UNK)
    n_monomers = DefaultConfig["structure"]["n_monomers"]
    lig_pol_types = DefaultConfig["structure"]["ligand_pol_types"]
    # Keep track of polymer masks and sizes for each polymer type
    polymers = defaultdict(list)
    polymer_sizes = defaultdict(int)

    # Iterate over connected components (molecules) in a graph.
    for cc_idx in map(list, rx.connected_components(g)):
        # Make residue mask corresponding to all atoms from residues
        # of the connected component atom indices
        r_mask = extend_residue_mask(a, cc_idx)
        n_resi = bst.get_residue_count(a[r_mask])

        # Check if the single residue CC is a solvent.
        # Otherwise, it is a ligand candidate.
        if n_resi == 1:
            res_name = a[r_mask].res_name[0]
            if res_name in DefaultConfig["residues"]["solvents"]:
                marks[r_mask] = AtomMark.SOLVENT
            continue

        # If not solvent or polymer: continue
        if n_resi < n_monomers:
            continue

        pol_type = find_first_polymer_type(a[r_mask], n_monomers)
        if pol_type != "x":
            polymers[pol_type].append(r_mask)
            polymer_sizes[pol_type] += n_resi

    if not polymers or all(len(x) == 0 for x in polymers.values()):
        LOGGER.warning("No polymer atoms identified. Marking as SOLVENT+UNK.")
        return marks, "x", []

    # Determine primary polymer type
    cfg_pol_type = DefaultConfig["structure"]["primary_pol_type"][0]
    if cfg_pol_type == "a":
        prim_pol_type = max(polymer_sizes.items(), key=lambda x: x[1])[0]
    else:
        prim_pol_type = cfg_pol_type

    if not polymers[prim_pol_type]:
        LOGGER.warning(
            f"No primary polymer '{prim_pol_type}' atoms. Marking as SOLVENT+UNK."
        )
        return marks, "x", []

    # Force a single polymer chain or allow multiple ones
    fn = _to_single_poly_mask if single_poly_chain else _combine_poly_masks
    is_pol = fn(a, polymers[prim_pol_type])

    if not np.any(is_pol):
        LOGGER.warning(
            f"No primary polymer '{prim_pol_type}' atoms. Marking as SOLVENT+UNK."
        )
        return marks, "x", []

    # Mark determined primary polymer atoms. Ligand polymers will be marked below.
    marks[is_pol] = _POL_MARKS[prim_pol_type]

    ligands = []

    # Detect covalently bound solvent or ligand atoms.
    het_pol_idx = np.where(a.hetero & is_pol)[0]
    if len(het_pol_idx) > 0:
        starts_het_pol = np.unique(bst.get_residue_starts_for(a, het_pol_idx))
        for r_mask in bst.get_residue_masks(a, starts_het_pol):
            het_pol_type = find_first_polymer_type(a[r_mask], min_size=1)
            if het_pol_type == prim_pol_type:
                continue
            lig = make_ligand(r_mask, is_pol & ~r_mask, s)
            if lig is None:
                marks[r_mask] = AtomMark.SOLVENT | AtomMark.COVALENT
            else:
                marks[r_mask] = AtomMark.LIGAND | AtomMark.COVALENT
                ligands.append(lig)

    # Find unmarked atom indices and impose a subgraph based on them
    remaining_idx = np.where(marks == AtomMark.UNK)[0]
    sg = g.subgraph(remaining_idx)
    cc_idx_viewed = set()

    # Iterate over the subgraph's connected components
    for cc_idx in map(list, rx.connected_components(sg)):
        # Obtain original indices; they are reset upon a subgraph creation
        cc_idx = sg.subgraph(cc_idx).nodes()
        # Obtain a mask pointing to CC residues that were not previously
        # marked as polymer or solvent
        r_mask = extend_residue_mask(a, cc_idx)
        # Avoid creating duplicated ligands by storing already assessed atom
        # indices
        r_mask[list(cc_idx_viewed)] = False
        if not np.any(r_mask):
            continue
        cc_idx_viewed |= set(np.where(r_mask)[0])

        n_resi = bst.get_residue_count(a[r_mask])

        # Attempt making a ligand
        lig = make_ligand(r_mask, is_pol, s)
        if lig is None:
            continue

        multiples_flag = False  # indicates whether ligand is split below

        if n_resi > 1:  # Handle polymer ligand
            if lig.res_name[0] in lig_pol_types:
                marks[r_mask] = AtomMark.LIGAND | _POL_MARKS[lig.res_name[0]]
            else:
                # Ligand is non-polymer but still has multiple residues.
                # Users are warned about such cases.

                # Split into separate residues if polymer type is undetermined.
                # It's likely that a ligand appears as a single CC due to
                # structure artifacts, eg, molecules with different altloc
                # occupying the same spatial region.
                if lig.res_name[0] == "x":
                    multiples_flag = True
                    starts = np.unique(bst.get_residue_starts_for(a, cc_idx))
                    for r_mask in bst.get_residue_masks(a, starts):
                        lig = make_ligand(r_mask, is_pol & ~r_mask, s)
                        if lig is not None:
                            marks[r_mask] = AtomMark.LIGAND
                            ligands.append(lig)
                # Otherwise, despite polymeric nature of a ligand, it's
                # unsupported by the config and will be marked as a regular
                # ligand.
                else:
                    marks[r_mask] = AtomMark.LIGAND
        else:  # Handle a non-polymer ligand
            marks[r_mask] = AtomMark.LIGAND

        if not multiples_flag:
            ligands.append(lig)

    ligand_mask = reduce(
        op.or_,
        (lig.mask for lig in ligands),
        np.zeros_like(a.res_id, dtype=bool),
    )
    for lig in ligands:
        lig.contact_mask[ligand_mask] = False
        lig.dist[ligand_mask] = -1

    return marks, prim_pol_type, ligands



if __name__ == "__main__":
    raise RuntimeError