Source code for lXtractor.variables.base

"""
Base classes, common types and functions for the `variables` module.
"""
from __future__ import annotations

import inspect
import logging
import typing as t
from abc import abstractmethod, ABCMeta
from collections import UserDict, abc
from pathlib import Path

import numpy as np
import numpy.typing as npt
import pandas as pd

import lXtractor.resources as resources
from lXtractor.core import Ligand
from lXtractor.core.structure import GenericStructure
from lXtractor.util.io import read_n_col_table



[docs]
class AggFn(t.Protocol):
    """
    Structural type of an aggregation callable.

    Any callable accepting an array-like as its first argument (plus optional
    keyword arguments) and returning either an array or a single float
    conforms to this protocol.
    """

    def __call__(self, a: npt.ArrayLike, **kwargs) -> np.ndarray | float:
        ...




# Registry of the aggregation functions addressable by name; every entry is
# the NumPy reduction of the same name.
AggFns: dict[str, AggFn] = {
    "min": np.min,  # type: ignore
    "max": np.max,  # type: ignore
    "mean": np.mean,  # type: ignore
    "median": np.median,  # type: ignore
}

# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)
# Unconstrained generic placeholders.
T = t.TypeVar("T")
V = t.TypeVar("V")

# Mapping between integer positions; a position may map to None.
MappingT: t.TypeAlias = abc.Mapping[int, t.Optional[int]]
# Constrained to either a structure or a sequence.
OT = t.TypeVar("OT", GenericStructure, abc.Sequence)  # object type
RT = t.TypeVar("RT")  # return type
# A (flag, result) pair -- presumably the flag marks a successful
# calculation; TODO confirm against the calculator implementations.
ERT: t.TypeAlias = tuple[bool, RT]  # extended return type



[docs]
class AbstractVariable(t.Generic[OT, RT], metaclass=ABCMeta):
    """
    Abstract base class for variables.

    A variable is identified by its :attr:`id`, a string assembled from the
    class name and the ``__init__`` parameters. String conversion, equality
    and hashing are all derived from this identifier.
    """

    __slots__ = ()

    def __str__(self):
        return self.id

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # Two variables are equal only if `other` is an instance of this
        # variable's type AND carries the same identifier. Objects of any
        # other type never compare equal.
        return isinstance(other, type(self)) and self.id == other.id

    def __hash__(self):
        # Kept consistent with `__eq__`: equal variables share the same id.
        return hash(self.id)

    @property
    def id(self) -> str:
        """
        Variable identifier such that eval(x.id) produces another instance.

        The identifier has the form ``ClassName(p1=v1,p2=v2,...)`` where the
        parameter names come from the ``__init__`` signature and the values
        are read from the same-named instance attributes.
        """

        def parse_value(v):
            # Quote strings so they are rendered as string literals.
            if isinstance(v, str):
                return f"'{v}'"
            return v

        # Complains about accessing init of an instance since it can change
        # which is exactly the purpose of this method!
        init_params = inspect.signature(self.__init__).parameters  # type: ignore
        args = ",".join(
            f"{name}={parse_value(getattr(self, name))}" for name in init_params
        )
        return f"{self.__class__.__name__}({args})"

    @property
    @abstractmethod
    def rtype(self) -> t.Type[RT]:
        """
        Variable's return type, such that `rtype("result")` converts to
        the relevant type.
        """

    @abstractmethod
    def calculate(self, obj: OT, mapping: MappingT | None = None) -> RT:
        """
        Calculate variable. Each variable defines its own calculation strategy.

        :param obj: An object used for variable's calculation.
        :param mapping: Mapping from generalizable positions of
            MSA/reference/etc. to the `obj`'s positions.
        :return: Calculation result.
        :raises: :class:`FailedCalculation` if the calculation fails.
        """





[docs]
class StructureVariable(AbstractVariable[GenericStructure, RT], t.Generic[RT]):
    """
    A type of variable whose :meth:`calculate` method requires protein
    structure.
    """

    __slots__ = ()

    @abstractmethod
    def calculate(self, obj: GenericStructure, mapping: MappingT | None = None) -> RT:
        """
        Calculate the variable on a structure.

        :param obj: A structure to calculate the variable on.
        :param mapping: Optional mapping between structure and some reference
            object numbering schemes.
        :return: A calculation result of some sensible non-sequence type, such
            as string, float, int, etc.
        """





[docs]
class SequenceVariable(AbstractVariable[abc.Sequence[T], RT], t.Generic[T, RT]):
    """
    A type of variable whose :meth:`calculate` method requires protein
    sequence.
    """

    __slots__ = ()

    @abstractmethod
    def calculate(self, obj: abc.Sequence[T], mapping: MappingT | None = None) -> RT:
        """
        Calculate the variable on a sequence.

        :param obj: Some sequence.
        :param mapping: Optional mapping between sequence and some reference
            object numbering schemes.
        :return: A calculation result of some sensible non-sequence type, such
            as string, float, int, etc.
        """





[docs]
class LigandVariable(AbstractVariable[Ligand, RT], t.Generic[T, RT]):
    """
    A type of variable whose :meth:`calculate` method requires a ligand.
    """

    __slots__ = ()

    @abstractmethod
    def calculate(self, obj: Ligand, mapping: MappingT | None = None) -> RT:
        """
        Calculate the variable on a ligand.

        :param obj: A ligand to calculate the variable on.
        :param mapping: Optional mapping between some reference object
            numbering scheme and the numbering used by the calculation.
        :return: A calculation result of some sensible non-sequence type, such
            as string, float, int, etc.
        """




# Generic placeholder for a variable bound to structure or sequence variables.
# NOTE(review): LigandVariable is not part of the bound -- confirm whether
# that is intentional.
VT = t.TypeVar(
    "VT", bound=t.Union[StructureVariable, SequenceVariable]
)  # variable type



[docs]
class Variables(UserDict):
    # TODO: Proper generic type?
    """
    A dict-like container (a :class:`collections.UserDict` subclass) holding
    variables (:class:`AbstractVariable` subclasses).

    The keys are the :class:`AbstractVariable` subclasses' instances
    (hashed by :attr:`id <AbstractVariable.id>`), and values are
    calculation results.
    """

    def __getitem__(self, item: str | AbstractVariable):
        """
        Fetch a calculation result by a variable or by a variable's ID.

        :param item: A variable instance or a variable ID string.
        :return: The stored value.
        :raises KeyError: If no matching variable is stored.
        """
        if isinstance(item, str):
            # Keys are variable objects, so a string can never match a key
            # directly: find the key whose identifier equals the string.
            for key in self.data:
                if getattr(key, "id", None) == item:
                    return self.data[key]
            raise KeyError(item)
        return super().__getitem__(item)

    @property
    def structure(self) -> Variables:
        """
        :return: values that are :class:`StructureVariable` instances.
        """
        return Variables(
            {k: v for k, v in self.items() if isinstance(k, StructureVariable)}
        )

    @property
    def sequence(self) -> Variables:
        """
        :return: values that are :class:`SequenceVariable` instances.
        """
        return Variables(
            {k: v for k, v in self.items() if isinstance(k, SequenceVariable)}
        )

    @classmethod
    def read(cls, path: Path) -> Variables:
        # TODO: does it still need the dynamic imports?
        """
        Read and initialize variables.

        Rows whose variable class cannot be imported or whose ID cannot be
        evaluated are logged and skipped. If a value cannot be evaluated,
        it is stored as read from the table.

        .. warning::
            Variable IDs and values are passed to :func:`exec`/:func:`eval`.
            Read only files from trusted sources.

        :param path: Path to a two-column .tsv file holding pairs
            (var_id, var_value). Will use `var_id` to initialize variable,
            importing dynamically a relevant class from :mod:`variables`.
        :return: A dict mapping variable object to its value.
        """

        try:
            vs = read_n_col_table(path, 2)
        except pd.errors.EmptyDataError:
            vs = None
        if vs is None:
            # The truth value of a DataFrame is ambiguous (raises ValueError),
            # hence the explicit comparison to None instead of `or`.
            vs = pd.DataFrame()
        variables = cls()

        # An explicit namespace shared between `exec` and `eval`: names
        # created by `exec` in implicit function locals are not reliably
        # visible to a subsequent `eval` (not at all since Python 3.13).
        namespace = dict(globals())

        for v_id, v_val in vs.itertuples(index=False):
            v_name = v_id.split("(")[0]

            import_statement = f"from lXtractor.variables import {v_name}"
            try:
                exec(import_statement, namespace)
            except ImportError:
                LOGGER.exception(
                    f"Failed to exec {import_statement} for variable {v_name} "
                    f"causing variable's init to fail"
                )
                continue

            try:
                v = eval(v_id, namespace)
            except Exception as e:
                LOGGER.exception(f"Failed to eval variable {v_id} due to {e}")
                continue
            try:
                v_val = eval(v_val, namespace)
            except Exception as e:
                # Best effort: keep the raw value if it is not an expression.
                LOGGER.debug(f"Failed to eval {v_val} for variable {v_name} due to {e}")
            variables[v] = v_val

        return variables

    def write(self, path: Path) -> None:
        """
        Write variables as tab-separated (variable ID, value) lines.

        :param path: Path to a file.
        """
        items = (f"{v.id}\t{r}" for v, r in self.items())
        path.write_text("\n".join(items))

    def as_df(self) -> pd.DataFrame:
        """
        :return: A table with two columns: VariableID and VariableResult.
            An empty table (without columns) if no variables are stored.
        """
        if len(self) == 0:
            return pd.DataFrame()
        return pd.DataFrame(
            {"VariableID": [k.id for k in self], "VariableResult": list(self.values())}
        )





[docs]
class AbstractCalculator(t.Generic[OT], metaclass=ABCMeta):
    """
    Class defining variables' calculation strategy.

    Subclasses implement three entry points: a direct call, :meth:`map`
    (many variables, one object) and :meth:`vmap` (many objects, one
    variable).
    """

    __slots__ = ()

    @t.overload
    def __call__(self, o: OT, v: VT, m: MappingT | None) -> ERT:
        ...

    @t.overload
    def __call__(
        self,
        o: abc.Iterable[OT],
        v: abc.Iterable[VT] | abc.Iterable[abc.Iterable[VT]],
        m: abc.Iterable[MappingT | None] | None,
    ) -> abc.Iterable[abc.Iterable[ERT]]:
        ...

    @abstractmethod
    def __call__(
        self,
        o: OT | abc.Iterable[OT],
        v: VT | abc.Iterable[VT] | abc.Iterable[abc.Iterable[VT]],
        m: MappingT | abc.Iterable[MappingT | None] | None,
    ) -> ERT | abc.Iterable[abc.Iterable[ERT]]:
        """
        Calculate either a single variable on a single object or iterables
        of variables on an iterable of objects.

        :param o: Object (or an iterable over objects) to calculate on.
        :param v: Some variable whose `calculate` method accepts `o`-type
            instances, or an iterable (possibly nested) over such variables.
        :param m: Optional mapping (or an iterable over mappings) between
            object and some reference object numbering schemes.
        :return: Calculation result.
        """

    @abstractmethod
    def map(self, o: OT, v: abc.Iterable[VT], m: MappingT | None) -> abc.Iterable[ERT]:
        """
        Map variables to a single object.

        :param o: Object to calculate on.
        :param v: An iterable over variables whose `calculate` method accepts
            `o`-type instances.
        :param m: Optional mapping between object and some reference object
            numbering schemes.
        :return: An iterator (generator) over calculation result.
        """

    @abstractmethod
    def vmap(
        self, o: abc.Iterable[OT], v: VT, m: abc.Iterable[MappingT | None]
    ) -> abc.Iterable[ERT]:
        """
        Map objects to a single variable.

        :param o: An iterable over objects to calculate on.
        :param v: Some variable whose `calculate` method accepts `o`-type
            instances.
        :param m: An iterable over optional mappings between object and some
            reference object numbering schemes.
        :return: An iterator (generator) over calculation result.
        """




# class CalculatorProtocol(t.Protocol[OT, VT, RT]):
#     """
#     An interface of a calculator definition for typing.
#     """
#
#     @t.overload
#     def __call__(self, o: OT, v: VT, m: MappingT | None, *args, **kwargs) -> RT:
#         ...
#
#     @t.overload
#     def __call__(
#         self,
#         o: abc.Iterable[OT],
#         v: abc.Iterable[abc.Iterable[VT]],
#         m: abc.Iterable[MappingT | None] | None,
#         *args,
#         **kwargs,
#     ) -> abc.Iterable[abc.Iterable[RT]]:
#         ...
#
#     def __call__(
#         self,
#         o: OT | abc.Iterable[OT],
#         v: VT | abc.Iterable[abc.Iterable[VT]],
#         m: MappingT | abc.Iterable[MappingT | None] | None,
#         *args,
#         **kwargs,
#     ) -> RT | abc.Iterable[abc.Iterable[RT]]:
#         ...



[docs]
class ProtFP:
    """
    ProtFP embeddings for amino acid residues.

    ProtFP is a coding scheme derived from
    the PCA analysis of the AAIndex database :cite:`pfp1,pfp2`.

    >>> pfp = ProtFP()
    >>> pfp[('G', 1)]
    -5.7
    >>> list(pfp['G'])
    [-5.7, -8.72, 4.18, -1.35, -0.31]
    >>> comp1 = pfp[1]
    >>> assert len(comp1) == 20
    >>> comp1[0]
    -5.7
    >>> comp1.index[0]
    'G'

    .. bibliography::

    """


[docs]
    def __init__(self, path: Path = Path(resources.__file__).parent / "PFP.csv"):
        self._df = pd.read_csv(path).set_index("AA")


    @t.overload
    def __getitem__(self, item: tuple[str, int]) -> float:
        ...

    @t.overload
    def __getitem__(self, item: str) -> np.ndarray:
        ...

    @t.overload
    def __getitem__(self, item: int) -> pd.Series:
        ...

    def __getitem__(
        self, item: tuple[str, int] | str | int
    ) -> float | np.ndarray | pd.Series:
        match item:
            case [c, i]:
                # 1. Fails to infer types of c and i
                # 2. Fails to infer return type (thinks it's a data frame)
                return self._df.loc[c, str(i - 1)]  # type: ignore
            case str():
                # Pandas always puts "ExtensionArray | ndarray" as a return
                # type, although only ndarray is actually returned here
                return self._df.loc[item].values  # type: ignore
            case int():
                return self._df[str(item - 1)]
            case _:
                raise TypeError(f"Invalid index type {item}")



# This module only provides definitions: executing it as a script is an error.
if __name__ == "__main__":
    raise RuntimeError