from __future__ import annotations
import logging
import typing as t
from collections import abc
from io import TextIOBase
from pathlib import Path
import numpy as np
import pandas as pd
from lXtractor.chain import ChainSequence, ChainStructure
from lXtractor.chain.base import topo_iter
from lXtractor.chain.list import ChainList, _wrap_children
from lXtractor.core.base import SeqReader, ApplyT, FilterT
from lXtractor.core.config import DefaultConfig
from lXtractor.core.exceptions import (
AmbiguousMapping,
MissingData,
NoOverlap,
FormatError,
InitError,
)
from lXtractor.util.seq import read_fasta
# Module-level logger; `Chain.spawn_child` uses it to warn about tolerated
# structure-subsetting failures.
LOGGER = logging.getLogger(__name__)
# Public API of this module: only the `Chain` class is exported.
__all__ = ("Chain",)
# TODO: support for empty chain in methods
[docs]
class Chain:
"""
A container, encompassing a :class:`ChainSequence` and possibly many
:class:`ChainStructure`'s corresponding to a single protein chain.
A typical use case is when one wants to benefit from the connection
of structural and sequential data, e.g., using single full canonical
sequence as :attr:`_seq` and all the associated structures within
:attr:`structures`. In this case, this data structure makes it easier
to extract, annotate, and calculate variables using canonical sequence
mapped to the sequence of a structure.
Typical workflow:
#. Initialize from some canonical sequence.
#. Add structures and map their sequences.
#. ???
#. Do something useful, like calculate variables using canonical
sequence's positions.
.. code-block:: python
c = Chain.from_seq((header, _seq))
for s in structures:
c.add_structure(s)
"""
__slots__ = ("_seq", "_id", "structures", "_parent", "children")
[docs]
def __init__(
    self,
    seq: ChainSequence,
    structures: abc.Iterable[ChainStructure] | None = None,
    parent: Chain | None = None,
    children: abc.Iterable[Chain] | None = None,
):
    """
    Assemble a chain from a sequence and optional structures/relatives.

    :param seq: A chain sequence.
    :param structures: Chain structures corresponding to a single protein
        chain specified by `seq`. Anything that is not already
        a :class:`ChainList` is wrapped into one; ``None`` gives an empty
        list.
    :param parent: A parent chain this chain had descended from.
    :param children: A collection of children. Each child gets its
        ``parent`` re-pointed to the new instance.
    """
    # Normalize the structures container before touching any attribute.
    if structures is None:
        wrapped = ChainList([])
    elif isinstance(structures, ChainList):
        wrapped = structures
    else:
        wrapped = ChainList(structures)

    self._seq: ChainSequence = seq
    #: A collection of structures corresponding to :attr:`_seq`.
    self.structures: ChainList[ChainStructure] = wrapped
    #: A parent chain this chain had descended from.
    self._parent: Chain | None = parent
    #: A collection of children preferably obtained using
    #: :meth:`spawn_child`.
    self.children: ChainList[Chain] = _wrap_children(children)
    # The ID depends on both the sequence and the parent set above.
    self._id = self._make_id()
    for child in self.children:
        child.parent = self
def __eq__(self, other: t.Any) -> bool:
    """
    Compare two chains by ID, sequence and structures.

    :param other: Any object; anything that is not a :class:`Chain`
        compares unequal.
    :return: ``True`` if `other` is a chain with matching :attr:`id`,
        :attr:`seq` and :attr:`structures`.
    """
    if not isinstance(other, Chain):
        return False
    same_id = self.id == other.id
    return same_id and self.seq == other.seq and self.structures == other.structures
def __hash__(self) -> int:
return hash(self.id)
def _make_id(self) -> str:
parent = "" if self.parent is None else f"<-({self.parent.id})"
return f"Chain({self.seq.id_strip_parents()}){parent}"
@property
def id(self) -> str:
    """
    :return: The cached chain identifier. It is recomputed whenever
        :attr:`seq` or :attr:`parent` is reassigned.
    """
    identifier = self._id
    return identifier
@property
def seq(self) -> ChainSequence:
    """
    :return: The chain's sequence.
    """
    return self._seq


@seq.setter
def seq(self, value: ChainSequence) -> None:
    """
    Replace the sequence and refresh the chain :attr:`id`.

    :param value: A new chain sequence.
    :raise TypeError: If `value` is not a :class:`ChainSequence`.
    """
    if isinstance(value, ChainSequence):
        self._seq = value
        # The ID embeds the sequence ID, so it must be rebuilt.
        self._id = self._make_id()
    else:
        raise TypeError(f"Invalid type {type(value)}")
@property
def parent(self) -> t.Self | None:
    """
    :return: The parent chain or ``None`` if there is none.
    """
    return self._parent


@parent.setter
def parent(self, value: t.Self | None):
    """
    Replace the parent and refresh the chain :attr:`id`.

    :param value: A chain of the same type as this one or ``None``.
    :raise TypeError: If `value` is neither ``None`` nor an instance of
        this chain's type.
    """
    own_type = type(self)
    if value is not None and not isinstance(value, own_type):
        raise TypeError(
            f"Invalid type {type(value)}. "
            f"Parent must be of the same type {type(self)}"
        )
    self._parent = value
    # The ID embeds the parent's ID, so it must be rebuilt.
    self._id = self._make_id()
@property
def meta(self) -> dict[str, str]:
    """
    :return: The ``meta`` attribute of this chain's :attr:`seq`.
    """
    sequence = self.seq
    return sequence.meta
@property
def start(self) -> int:
    """
    :return: The ``start`` attribute of this chain's :attr:`seq`.
    """
    sequence = self.seq
    return sequence.start
@property
def end(self) -> int:
    """
    :return: The ``end`` attribute of this chain's :attr:`seq`.
    """
    sequence = self.seq
    return sequence.end
@property
def name(self) -> str | None:
    """
    :return: The ``name`` attribute of this chain's :attr:`seq`.
    """
    sequence = self.seq
    return sequence.name
@property
def categories(self) -> list[str]:
    """
    :return: The ``categories`` attribute of this chain's :attr:`seq`.
    """
    sequence = self.seq
    return sequence.categories
def __repr__(self) -> str:
return self.id
def __str__(self) -> str:
return self.id
[docs]
def iter_children(self) -> abc.Generator[list[Chain], None, None]:
    """
    Walk the tree of :attr:`children` via :func:`topo_iter`.

    See :meth:`ChainSequence.iter_children` and :func:`topo_iter`.

    :return: Iterator over levels of a child tree.
    """

    def children_of(node):
        return node.children

    return topo_iter(self, children_of)
[docs]
def filter_children(self, pred: FilterT[Chain], inplace: bool = False) -> t.Self:
    """
    Keep only the children satisfying a predicate.

    :param pred: Some callable accepting chain and returning bool.
    :param inplace: If ``True``, replace :attr:`children` of this chain
        and return the chain itself. Otherwise, build a new chain with
        the same sequence, structures and parent and the filtered
        children.
    :return: A chain with filtered children.
    """
    kept = self.children.filter(pred)
    if not inplace:
        return self.__class__(
            seq=self.seq,
            structures=self.structures,
            children=kept,
            parent=self.parent,
        )
    self.children = kept
    return self
[docs]
def apply_children(self, fn: ApplyT[Chain], inplace: bool = False) -> t.Self:
    """
    Transform every child with a function.

    :param fn: A callable accepting and returning the chain type instance.
    :param inplace: If ``True``, replace :attr:`children` of this chain
        and return the chain itself. Otherwise, build a new chain with
        the same sequence, structures and parent and the transformed
        children.
    :return: A chain with transformed children.
    """
    transformed = self.children.apply(fn)
    if not inplace:
        return self.__class__(
            seq=self.seq,
            structures=self.structures,
            children=transformed,
            parent=self.parent,
        )
    self.children = transformed
    return self
[docs]
def filter_structures(
    self, pred: FilterT[ChainStructure], inplace: bool = False
) -> t.Self:
    """
    Keep only the :attr:`structures` satisfying a predicate.

    :param pred: A callable accepting a chain structure and returning bool.
    :param inplace: If ``True``, replace :attr:`structures` of this chain
        and return the chain itself. Otherwise, build a new chain with
        the same sequence, children and parent and the filtered
        structures.
    :return: A chain with filtered structures.
    """
    kept = self.structures.filter(pred)
    if not inplace:
        return self.__class__(
            seq=self.seq,
            structures=kept,
            children=self.children,
            parent=self.parent,
        )
    self.structures = kept
    return self
[docs]
def apply_structures(
    self, fn: ApplyT[ChainStructure], inplace: bool = False
) -> t.Self:
    """
    Apply some function to :attr:`structures`.

    :param fn: A callable accepting and returning a chain structure.
    :param inplace: If ``True``, replace :attr:`structures` of this chain
        and return the chain itself. Otherwise, build a new chain with
        the same sequence, children and parent and the transformed
        structures.
    :return: A chain with transformed structures.
    """
    structures = self.structures.apply(fn)
    if inplace:
        self.structures = structures
        return self
    # Keyword arguments, like the sibling filter/apply methods, so the copy
    # does not depend on the positional order of the constructor.
    return self.__class__(
        seq=self.seq,
        structures=structures,
        children=self.children,
        parent=self.parent,
    )
@classmethod
@t.overload
def from_seq(
cls, inp: Path | TextIOBase, read_method: SeqReader = read_fasta
) -> ChainList[t.Self]:
...
@classmethod
@t.overload
def from_seq(
cls, inp: str | ChainSequence, read_method: SeqReader = read_fasta
) -> t.Self:
...
@classmethod
@t.overload
def from_seq(
cls,
inp: abc.Iterable[str] | tuple[str, str],
read_method: SeqReader = read_fasta,
) -> t.Self | ChainList[t.Self]:
...
@classmethod
def from_seq(
cls,
inp: str
| tuple[str, str]
| ChainSequence
| Path
| TextIOBase
| abc.Iterable[str],
read_method: SeqReader = read_fasta,
) -> t.Self | ChainList[t.Self]:
# TODO: consider removing and passing the functionality to init
"""
:param inp: A string of with a sequence or a pair (header, _seq).
Otherwise, something that the `read_method` accepts.
:param read_method: A callable accepting a path to a file or opened
file or an iterable over the file lines and returning pairs
(header, _seq).
:return: If a single sequence is provided as a string or pair,
return an initialized chain. Otherwise, use `read_method` to parse
the input and embed the resulting :class:`Chain`'s into
a :class:`ChainList`.
"""
match inp:
case str():
return cls(ChainSequence.from_string(inp))
case ChainSequence():
return cls(inp)
case [str(), str()]:
return cls(ChainSequence.from_string(inp[1], name=inp[0]))
case _:
return ChainList(
cls(ChainSequence.from_string(seq, name=name))
for name, seq in read_method(inp)
)
[docs]
@classmethod
def read(
    cls,
    path: Path,
    *,
    search_children: bool = False,
) -> t.Self:
    """
    Initialize a chain from a directory dump.

    :param path: A path to a directory with at least sequence and
        metadata files.
    :param search_children: Recursively search for child segments and
        populate :attr:`children`.
    :return: An initialized chain of the class this method was called on.
    """
    fnames = DefaultConfig["filenames"]
    # Sequence children are not loaded here (``search_children=False``).
    seq = ChainSequence.read(path, search_children=False)
    structures = [
        ChainStructure.read(p) for p in (path / fnames["structures_dir"]).glob("*")
    ]
    # Use ``cls`` rather than a hard-coded ``Chain`` so that subclasses
    # get instances of their own type, including nested children.
    chain = cls(seq, structures)
    if search_children:
        for child_path in (path / fnames["segments_dir"]).glob("*"):
            child = cls.read(child_path, search_children=True)
            child.parent = chain
            chain.children.append(child)
    return chain
[docs]
@classmethod
def make_empty(cls) -> t.Self:
    """
    :return: A chain wrapping the result of :meth:`ChainSequence.make_empty`,
        with no structures, parent or children.
    """
    empty_seq = ChainSequence.make_empty()
    return cls(empty_seq)
[docs]
def write(
    self,
    dest: Path,
    *,
    str_fmt: str = "mmtf.gz",
    write_children: bool = True,
) -> Path:
    """
    Create a disk dump of this chain data.
    Created dumps can be reinitialized via :meth:`read`.

    :param dest: A writable dir to hold the data. Created if missing.
    :param str_fmt: A format to write :attr:`structures` in.
    :param write_children: Recursively write :attr:`children`. If
        ``False``, no child chain is written.
    :return: Path to the directory where the files are written.
    """
    dest.mkdir(parents=True, exist_ok=True)
    fnames = DefaultConfig["filenames"]

    # Children of the sequence itself are never written from here.
    self.seq.write(dest, write_children=False)

    if self.structures:
        str_dir = dest / fnames["structures_dir"]
        str_dir.mkdir(exist_ok=True)
        for s in self.structures:
            s.write(str_dir / s.id, str_fmt, write_children=False)

    # Honor the flag at this level too: previously direct children were
    # written even when ``write_children`` was ``False``.
    if write_children:
        segments_dir = dest / fnames["segments_dir"]
        for c in self.children:
            c.write(segments_dir / c.id, str_fmt=str_fmt, write_children=True)

    return dest
[docs]
def add_structure(
    self,
    structure: ChainStructure,
    *,
    check_ids: bool = True,
    map_to_seq: bool = True,
    map_name: str = DefaultConfig["mapnames"]["map_canonical"],
    add_to_children: bool = False,
    **kwargs,
) -> None:
    """
    Add a structure to :attr:`structures`.

    :param structure: A structure of a single chain corresponding to
        :attr:`_seq`.
    :param check_ids: Check that existing :attr:`structures`
        don't encompass the structure with the same :meth:`id`.
    :param map_to_seq: Align the structure sequence to the :attr:`_seq` and
        create a mapping within the former.
    :param map_name: If `map_to_seq` is ``True``, use this map name.
        The same name is used to locate child boundaries when
        `add_to_children` is ``True``.
    :param add_to_children: If ``True``, will recursively add structure to
        existing children according to their boundaries mapped to the
        structure's numbering. Consequently, this requires mapping, i.e.,
        ``map_to_seq=True``.
    :param kwargs: Passed to :meth:`ChainSequence.map_numbering`.
    :return: Mutates :attr:`structures` and returns nothing.
    :raise ValueError: If `check_ids` is ``True`` and the structure
        id clashes with the existing ones.
    """
    if check_ids and any(s.id == structure.id for s in self.structures):
        raise ValueError(
            f"Protein already contains structure {structure.id}. "
            f"Remove it first or disable `check_ids`"
        )
    if map_to_seq:
        structure.seq.map_numbering(self.seq, name=map_name, **kwargs)
    self.structures.append(structure)

    if not add_to_children:
        return
    for c in self.children:
        sub = structure.spawn_child(
            c.seq.start,
            c.seq.end,
            c.name,
            map_from=map_name,
        )
        # The sub-structure is not re-mapped (``map_to_seq=False``), so
        # deeper levels must look up the very same map name; ``check_ids``
        # is forwarded so that the caller's choice applies to the whole
        # subtree.
        c.add_structure(
            sub,
            check_ids=check_ids,
            map_to_seq=False,
            map_name=map_name,
            add_to_children=True,
        )
[docs]
def transfer_seq_mapping(
    self,
    map_name: str,
    link_map: str = DefaultConfig["mapnames"]["map_canonical"],
    link_map_points_to: str = "i",
    **kwargs,
):
    """
    Transfer a sequence map from :attr:`seq` to the sequence of every
    structure in :attr:`structures`.

    For each structure, this calls :meth:`ChainSequence.relate` on
    :attr:`seq` with the structure's sequence. Check
    :meth:`ChainSequence.relate` for an explanation.

    :param map_name: The name of the map to transfer.
    :param link_map: A name of the map existing within the sequence of
        each structure in :attr:`structures`.
    :param link_map_points_to: Which sequence values of the `link_map`
        point to.
    :param kwargs: Passed to :meth:`ChainSequence.relate`
    :return: Nothing.
    """
    canonical = self.seq
    for structure in self.structures:
        canonical.relate(
            structure.seq, map_name, link_map, link_map_points_to, **kwargs
        )
[docs]
def generate_patched_seqs(
    self,
    numbering: str = DefaultConfig["mapnames"]["enum"],
    link_name: str = DefaultConfig["mapnames"]["map_canonical"],
    link_points_to: str = "i",
    **kwargs,
) -> abc.Generator[ChainSequence, None, None]:
    """
    Lazily patch each structure sequence using :attr:`seq`.

    For explanation of the patching process see
    :meth:`lXtractor.chain.sequence.ChainSequence.patch`.

    :param numbering: Map name referring to a numbering scheme to infer
        gaps from.
    :param link_name: Map name linking structure sequence to the canonical
        sequence.
    :param link_points_to: Map name in the canonical sequence that
        `link_name` refers to.
    :param kwargs: Passed to
        :meth:`lXtractor.chain.sequence.ChainSequence.patch`.
    :return: A generator over patched structure sequences.
    """
    yield from (
        self.seq.patch(str_seq, numbering, link_name, link_points_to, **kwargs)
        for str_seq in self.structures.sequences
    )
[docs]
def spawn_child(
    self,
    start: int,
    end: int,
    name: str | None = None,
    category: str | None = None,
    *,
    subset_structures: bool = True,
    tolerate_failure: bool = False,
    silent: bool = False,
    keep: bool = True,
    seq_deep_copy: bool = False,
    seq_map_from: str | None = None,
    seq_map_closest: bool = True,
    seq_keep_child: bool = False,
    str_deep_copy: bool = False,
    str_map_from: str | None = None,
    str_map_closest: bool = True,
    str_keep_child: bool = True,
    str_seq_keep_child: bool = False,
    str_min_size: int | float = 1,
    str_accept_fn: abc.Callable[[ChainStructure], bool] = lambda _: True,
) -> t.Self:
    """
    Subset a :attr:`_seq` and (optionally) each structure in
    :attr:`structures` using the provided :attr:`_seq` boundaries
    (inclusive).

    :param start: Start coordinate.
    :param end: End coordinate.
    :param name: Name of a new chain. Falls back to the name of
        :attr:`_seq` if empty.
    :param category: Spawned child category. Any meaningful tag string that
        could be used later to group similar children.
    :param subset_structures: If ``True``, subset each structure in
        :attr:`structures`. If ``False``, structures are not inherited.
    :param tolerate_failure: If ``True``, a failure to subset a structure
        doesn't raise an error.
    :param silent: Suppress warnings for errors when `tolerate_failure` is
        ``True``.
    :param keep: Save created child to :attr:`children`.
    :param seq_deep_copy: Deep copy potentially mutable sequences within
        :attr:`_seq`.
    :param seq_map_from: Use this map to obtain coordinates within
        :attr:`_seq`.
    :param seq_map_closest: Map to the closest matching coordinates of
        a :attr:`_seq`. See :meth:`ChainSequence.map_boundaries`
        and :meth:`ChainSequence.find_closest`.
    :param seq_keep_child: Keep a spawned :class:`ChainSequence` as a child
        within :attr:`_seq`. Should be ``False`` if `keep` is ``True`` to
        avoid data duplication.
    :param str_deep_copy: Deep copy each sub-structure.
    :param str_map_from: Use this map to obtain coordinates within
        :attr:`ChainStructure._seq` of each structure.
    :param str_map_closest: Map to the closest matching coordinates of
        a :attr:`_seq`. See :meth:`ChainSequence.map_boundaries`
        and :meth:`ChainSequence.find_closest`.
    :param str_keep_child: Keep a spawned sub-structure as a child in
        :attr:`ChainStructure.children`. Should be ``False`` if `keep` is
        ``True`` to avoid data duplication.
    :param str_seq_keep_child: Keep a sub-sequence of a spawned structure
        within the :attr:`ChainSequence.children` of
        :attr:`ChainStructure._seq` of a spawned structure. Should be
        ``False`` if `keep` or `str_keep_child` is ``True`` to avoid
        data duplication.
    :param str_min_size: A minimum number of residues in a structure to be
        accepted after subsetting.
    :param str_accept_fn: A filter function accepting a
        :class:`~lXtractor.chain.structure.ChainStructure` and returning
        a boolean value indicating whether this structure should be
        retained in :attr:`structures`.
    :return: A sub-chain with sub-sequence and (optionally) sub-structures.
    :raise InitError: If subsetting a structure fails and
        `tolerate_failure` is ``False``.
    """

    def subset_structure(structure: ChainStructure) -> ChainStructure | None:
        # Spawn a sub-structure; return ``None`` if it is rejected or if
        # spawning failed and failures are tolerated.
        try:
            sub = structure.spawn_child(
                start,
                end,
                name,
                category,
                map_from=str_map_from,
                map_closest=str_map_closest,
                deep_copy=str_deep_copy,
                keep=str_keep_child,
                keep_seq_child=str_seq_keep_child,
            )
            if len(sub) >= str_min_size and str_accept_fn(sub):
                return sub
            # The rejected child was attached to ``structure.children`` only
            # when ``str_keep_child`` is set. Removing unconditionally would
            # fail on a missing element or drop an unrelated equal child.
            if str_keep_child:
                structure.children.remove(sub)
            return None
        except (
            AmbiguousMapping,
            MissingData,
            NoOverlap,
            FormatError,
            InitError,
        ) as e:
            msg = (
                f"Cannot spawn child structure from {structure} "
                f"using boundaries {start, end}."
            )
            if tolerate_failure:
                if not silent:
                    LOGGER.warning(msg)
            else:
                raise InitError(msg) from e
        return None

    # Resolved before any structure is subset; the helper above reads the
    # updated value through its closure.
    name = name or self.seq.name

    seq = self.seq.spawn_child(
        start,
        end,
        name,
        category,
        map_from=seq_map_from,
        map_closest=seq_map_closest,
        deep_copy=seq_deep_copy,
        keep=seq_keep_child,
    )

    structures = None
    if subset_structures:
        structures = [
            s for s in map(subset_structure, self.structures) if s is not None
        ]

    # Same type as ``self``, matching the ``t.Self`` return annotation.
    child = self.__class__(seq, structures, self)
    if keep:
        self.children.append(child)
    return child
[docs]
def summary(
    self, meta: bool = True, children: bool = False, structures: bool = True
) -> pd.DataFrame:
    """
    Summarize the chain as a table.

    :param meta: Passed as ``meta`` to the ``summary`` method of the
        sequence, of each structure and of each child chain.
    :param children: If ``True``, append summaries of :attr:`children`,
        obtained by calling this method on each child.
    :param structures: If ``True``, append summaries of :attr:`structures`;
        these rows get ``Structure=True`` and a ``ParentChain`` column
        holding this chain's :attr:`id`.
    :return: A data frame starting with the sequence summary, with this
        chain's ID and parent ID columns filled in.
    """
    colnames = DefaultConfig["colnames"]
    table = self.seq.summary(meta=meta, children=False)
    table[colnames["id"]] = [self.id]
    # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works in all versions.
    parent_id = np.nan if self.parent is None else self.parent.id
    table[colnames["parent_id"]] = [parent_id]

    if structures and self.structures:
        str_summaries = pd.concat(
            structure.summary(meta=meta, children=False)
            for structure in self.structures
        )
        str_summaries["Structure"] = True
        str_summaries["ParentChain"] = self.id
        table = pd.concat([table, str_summaries])

    if children and self.children:
        child_summaries = pd.concat(
            child.summary(meta=meta, children=children) for child in self.children
        )
        table = pd.concat([table, child_summaries])

    return table
# This module is import-only: running it directly as a script raises
# ``RuntimeError``.
if __name__ == "__main__":
    raise RuntimeError