Source code for lXtractor.variables.base

"""
Base classes, common types and functions for the `variables` module.
"""
from __future__ import annotations

import inspect
import logging
import typing as t
from abc import abstractmethod, ABCMeta
from collections import UserDict, abc
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd

import lXtractor.resources as resources
from lXtractor.core import Ligand
from lXtractor.core.structure import GenericStructure
from lXtractor.util.io import read_n_col_table


class AggFn(t.Protocol):
    """
    Structural type of an aggregation function.

    Any callable that accepts an array-like object (plus arbitrary keyword
    arguments) and returns either an array or a float conforms to it.
    """

    def __call__(self, a: npt.ArrayLike, **kwargs) -> np.ndarray | float:
        """
        :param a: Array-like input to aggregate.
        :param kwargs: Extra keyword arguments accepted by the callable.
        :return: An array or a single float.
        """
        ...
# Aggregation functions addressable by their short names.
AggFns: dict[str, AggFn] = {
    "min": np.min, "max": np.max, "mean": np.mean, "median": np.median  # type: ignore
}

LOGGER = logging.getLogger(__name__)

# Unconstrained generic placeholders.
T = t.TypeVar("T")
V = t.TypeVar("V")

# Maps an integer position to another integer position or to `None`.
MappingT: t.TypeAlias = abc.Mapping[int, t.Optional[int]]

OT = t.TypeVar("OT", GenericStructure, abc.Sequence)  # object type
RT = t.TypeVar("RT")  # return type
ERT: t.TypeAlias = tuple[bool, RT]  # extended return type
class AbstractVariable(t.Generic[OT, RT], metaclass=ABCMeta):
    """
    Abstract base class for variables.

    A variable is identified by its :attr:`id`: the string representation,
    equality and hash are all derived from it.
    """

    __slots__ = ()

    def __str__(self):
        return self.id

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # Two variables are equal only when `other` is an instance of this
        # variable's type AND the identifiers match. Objects of unrelated
        # types must never compare equal; otherwise dict/set lookups keyed by
        # variables would match arbitrary objects on a hash collision.
        return isinstance(other, type(self)) and self.id == other.id

    def __hash__(self):
        return hash(self.id)

    @property
    def id(self) -> str:
        """
        Variable identifier such that eval(x.id) produces another instance.

        It is composed of the class name and the ``name=value`` pairs of the
        ``__init__`` parameters, where each value is read from the instance
        attribute having the same name as the parameter.
        """

        def parse_value(v):
            # Quote strings so that they remain string literals in the ID.
            if isinstance(v, str):
                return f"'{v}'"
            return v

        # Complains about accessing init of an instance since it can change
        # which is exactly the purpose of this method!
        init_params = inspect.signature(self.__init__).parameters  # type: ignore
        args = ",".join(
            f"{name}={parse_value(getattr(self, name))}" for name in init_params
        )
        return f"{self.__class__.__name__}({args})"

    @property
    @abstractmethod
    def rtype(self) -> t.Type[RT]:
        """
        Variable's return type, such that `rtype("result")` converts to the
        relevant type.
        """

    @abstractmethod
    def calculate(self, obj: OT, mapping: MappingT | None = None) -> RT:
        """
        Calculate variable. Each variable defines its own calculation strategy.

        :param obj: An object used for variable's calculation.
        :param mapping: Mapping from generalizable positions of
            MSA/reference/etc. to the `obj`'s positions.
        :return: Calculation result.
        :raises: :class:`FailedCalculation` if the calculation fails.
        """
class StructureVariable(AbstractVariable[GenericStructure, RT], t.Generic[RT]):
    """
    A variable calculated on a protein structure: its :meth:`calculate`
    method accepts a :class:`GenericStructure`.
    """

    __slots__ = ()

    @abstractmethod
    def calculate(
        self, obj: GenericStructure, mapping: MappingT | None = None
    ) -> RT:
        """
        :param obj: A structure to calculate the variable on.
        :param mapping: Optional mapping between structure and some reference
            object numbering schemes.
        :return: A calculation result of some sensible non-sequence type,
            such as string, float, int, etc.
        """
class SequenceVariable(AbstractVariable[abc.Sequence[T], RT], t.Generic[T, RT]):
    """
    A variable calculated on a protein sequence: its :meth:`calculate`
    method accepts a sequence of elements.
    """

    __slots__ = ()

    @abstractmethod
    def calculate(
        self, obj: abc.Sequence[T], mapping: MappingT | None = None
    ) -> RT:
        """
        :param obj: A sequence to calculate the variable on.
        :param mapping: Optional mapping between sequence and some reference
            object numbering schemes.
        :return: A calculation result of some sensible non-sequence type,
            such as string, float, int, etc.
        """
class LigandVariable(AbstractVariable[Ligand, RT], t.Generic[T, RT]):
    """
    A type of variable whose :meth:`calculate` method requires a ligand
    (a :class:`Ligand` instance).
    """

    __slots__ = ()

    @abstractmethod
    def calculate(self, obj: Ligand, mapping: MappingT | None = None) -> RT:
        """
        :param obj: A ligand to calculate the variable on.
        :param mapping: Optional mapping between the object's and some
            reference object's numbering schemes.
        :return: A calculation result of some sensible non-sequence type,
            such as string, float, int, etc.
        """
# Variable type: bound to structure- or sequence-based variables.
VT = t.TypeVar("VT", bound=t.Union[StructureVariable, SequenceVariable])
class Variables(UserDict):  # TODO: Proper generic type?
    """
    A dict-like container (:class:`collections.UserDict` subclass) holding
    variables (:class:`AbstractVariable` subclasses).

    The keys are the :class:`AbstractVariable` subclasses' instances
    (hashed by :attr:`id <AbstractVariable.id>`), and values are calculation
    results.
    """

    def __getitem__(self, item: str | AbstractVariable):
        """
        :param item: A variable or a variable's ID.
        :return: The value stored for the variable.
        :raises KeyError: If no stored variable matches `item`.
        """
        if isinstance(item, str):
            # Keys are variable objects, so a string can't be looked up
            # directly: match it against the IDs of the stored variables.
            for var, value in self.data.items():
                if getattr(var, "id", None) == item:
                    return value
            raise KeyError(item)
        return super().__getitem__(item)

    @property
    def structure(self) -> Variables:
        """
        :return: values that are :class:`StructureVariable` instances.
        """
        return Variables(
            {k: v for k, v in self.items() if isinstance(k, StructureVariable)}
        )

    @property
    def sequence(self) -> Variables:
        """
        :return: values that are :class:`SequenceVariable` instances.
        """
        return Variables(
            {k: v for k, v in self.items() if isinstance(k, SequenceVariable)}
        )

    @classmethod
    def read(cls, path: Path) -> Variables:  # TODO: does it still need the dynamic imports?
        """
        Read and initialize variables.

        Rows whose variable fails to import or to initialize are logged and
        skipped. A value that fails to evaluate is kept as read.

        :param path: Path to a two-column .tsv file holding pairs
            (var_id, var_value). Will use `var_id` to initialize variable,
            importing dynamically a relevant class from :mod:`variables`.
        :return: A dict mapping variable object to its value.
        """
        try:
            vs = read_n_col_table(path, 2)
        except pd.errors.EmptyDataError:
            vs = None
        # A data frame has no unambiguous truth value, hence the explicit
        # comparison with `None` rather than `vs or pd.DataFrame()`.
        if vs is None:
            vs = pd.DataFrame()

        # NOTE(security): IDs and values from the file are passed to
        # `exec`/`eval`. Only read files from trusted sources.
        #
        # Use an explicit namespace (seeded with the module's globals):
        # names bound by `exec` within a function's implicit locals are not
        # guaranteed to be visible to a subsequent `eval` (PEP 667).
        namespace = dict(globals())

        variables = cls()

        for v_id, v_val in vs.itertuples(index=False):
            v_name = v_id.split("(")[0]
            import_statement = f"from lXtractor.variables import {v_name}"
            try:
                exec(import_statement, namespace)
            except ImportError:
                LOGGER.exception(
                    f"Failed to exec {import_statement} for variable {v_name} "
                    f"causing variable's init to fail"
                )
                continue
            try:
                v = eval(v_id, namespace)
            except Exception as e:
                LOGGER.exception(f"Failed to eval variable {v_id} due to {e}")
                continue
            try:
                v_val = eval(v_val, namespace)
            except Exception as e:
                # Deliberately best-effort: keep the raw value.
                LOGGER.debug(f"Failed to eval {v_val} for variable {v_name} due to {e}")
            variables[v] = v_val

        return variables

    def write(self, path: Path) -> None:
        """
        Write tab-separated pairs (variable ID, value), one pair per line.

        :param path: Path to a file.
        """
        items = (f"{v.id}\t{r}" for v, r in self.items())
        path.write_text("\n".join(items))

    def as_df(self) -> pd.DataFrame:
        """
        :return: A table with two columns: VariableID and VariableResult.
            An empty table (without columns) if there are no variables.
        """
        if not self:
            return pd.DataFrame()
        return pd.DataFrame(
            {"VariableID": [k.id for k in self], "VariableResult": list(self.values())}
        )
class AbstractCalculator(t.Generic[OT], metaclass=ABCMeta):
    """
    Interface of a calculator, i.e., an object defining how variables are
    calculated on objects.
    """

    __slots__ = ()

    @t.overload
    def __call__(self, o: OT, v: VT, m: MappingT | None) -> ERT:
        ...

    @t.overload
    def __call__(
        self,
        o: abc.Iterable[OT],
        v: abc.Iterable[VT] | abc.Iterable[abc.Iterable[VT]],
        m: abc.Iterable[MappingT | None] | None,
    ) -> abc.Iterable[abc.Iterable[ERT]]:
        ...

    @abstractmethod
    def __call__(
        self,
        o: OT | abc.Iterable[OT],
        v: VT | abc.Iterable[VT] | abc.Iterable[abc.Iterable[VT]],
        m: MappingT | abc.Iterable[MappingT | None] | None,
    ) -> ERT | abc.Iterable[abc.Iterable[ERT]]:
        """
        Calculate a single variable on a single object, or iterables of
        variables on an iterable of objects.

        :param o: An object (or an iterable over objects) to calculate on.
        :param v: A variable (or an iterable over variables / iterables of
            variables) whose `calculate` method accepts `o`-type instances.
        :param m: Optional mapping (or an iterable over optional mappings)
            between object and some reference object numbering schemes.
        :return: Calculation result(s).
        """

    @abstractmethod
    def map(
        self, o: OT, v: abc.Iterable[VT], m: MappingT | None
    ) -> abc.Iterable[ERT]:
        """
        Apply several variables to one object.

        :param o: Object to calculate on.
        :param v: An iterable over variables whose `calculate` method accepts
            `o`-type instances.
        :param m: Optional mapping between object and some reference object
            numbering schemes.
        :return: An iterable over calculation results.
        """

    @abstractmethod
    def vmap(
        self, o: abc.Iterable[OT], v: VT, m: abc.Iterable[MappingT | None]
    ) -> abc.Iterable[ERT]:
        """
        Apply one variable to several objects.

        :param o: An iterable over objects to calculate on.
        :param v: Some variable whose `calculate` method accepts `o`-type
            instances.
        :param m: An iterable over optional mappings between an object and
            some reference object numbering schemes.
        :return: An iterable over calculation results.
        """
# class CalculatorProtocol(t.Protocol[OT, VT, RT]): # """ # An interface of a calculator definition for typing. # """ # # @t.overload # def __call__(self, o: OT, v: VT, m: MappingT | None, *args, **kwargs) -> RT: # ... # # @t.overload # def __call__( # self, # o: abc.Iterable[OT], # v: abc.Iterable[abc.Iterable[VT]], # m: abc.Iterable[MappingT | None] | None, # *args, # **kwargs, # ) -> abc.Iterable[abc.Iterable[RT]]: # ... # # def __call__( # self, # o: OT | abc.Iterable[OT], # v: VT | abc.Iterable[abc.Iterable[VT]], # m: MappingT | abc.Iterable[MappingT | None] | None, # *args, # **kwargs, # ) -> RT | abc.Iterable[abc.Iterable[RT]]: # ...
[docs] class ProtFP: """ ProtFP embeddings for amino acid residues. ProtFP is a coding scheme derived from the PCA analysis of the AAIndex database :cite:`pfp1,pfp2`. >>> pfp = ProtFP() >>> pfp[('G', 1)] -5.7 >>> list(pfp['G']) [-5.7, -8.72, 4.18, -1.35, -0.31] >>> comp1 = pfp[1] >>> assert len(comp1) == 20 >>> comp1[0] -5.7 >>> comp1.index[0] 'G' .. bibliography:: """
[docs] def __init__(self, path: Path = Path(resources.__file__).parent / "PFP.csv"): self._df = pd.read_csv(path).set_index("AA")
@t.overload def __getitem__(self, item: tuple[str, int]) -> float: ... @t.overload def __getitem__(self, item: str) -> np.ndarray: ... @t.overload def __getitem__(self, item: int) -> pd.Series: ... def __getitem__( self, item: tuple[str, int] | str | int ) -> float | np.ndarray | pd.Series: match item: case [c, i]: # 1. Fails to infer types of c and i # 2. Fails to infer return type (thinks it's a data frame) return self._df.loc[c, str(i - 1)] # type: ignore case str(): # Pandas always puts "ExtensionArray | ndarray" as a return # type, although only ndarray is actually returned here return self._df.loc[item].values # type: ignore case int(): return self._df[str(item - 1)] case _: raise TypeError(f"Invalid index type {item}")
# Running this module as a script is an error.
if __name__ == "__main__":
    raise RuntimeError()