Source code for lXtractor.ext.dssp

from collections import abc
from itertools import dropwhile
from pathlib import Path
from tempfile import NamedTemporaryFile

import biotite.structure as bst
import numpy as np
import pandas as pd
from biotite.structure.io.pdb import PDBFile
from more_itertools import unzip

import lXtractor.util as util
from lXtractor.core import GenericStructure, ProteinStructure

__all__ = ["dssp_to_df", "dssp_run", "dssp_set_ss_annotation"]

DSSP_TO_SS8 = {"H": "H", "G": "G", "I": "I", "E": "E", "B": "B", "T": "T", "S": "S"}
DSSP_TO_SS3 = {"H": "H", "G": "H", "I": "H", "E": "E", "B": "E"}
DSSP_COLUMNS = (
    "res_id",
    "chain_id",
    "res_name",
    "structure",
    "ss8",
    "ss3",
    "bp1",
    "bp2",
    "acc",
    "n_h_o_1",
    "o_h_n_1",
    "n_h_o_2",
    "o_h_n_2",
    "tco",
    "kappa",
    "alpha",
    "phi",
    "psi",
    "x_ca",
    "y_ca",
    "z_ca",
)


def _parse_dssp_line(line):
    ss = line[16]
    return {
        "res_id": int(line[0:5].strip()),
        "chain_id": line[11],
        "res_name": line[13],
        "structure": line[16:25].strip(),
        "bp1": int(line[25:29]),
        "bp2": int(line[29:33]),
        "acc": int(line[34:38]),
        "n_h_o_1": line[39:50].strip(),
        "o_h_n_1": line[50:61].strip(),
        "n_h_o_2": line[61:72].strip(),
        "o_h_n_2": line[72:83].strip(),
        "tco": float(line[83:91]),
        "kappa": float(line[91:97]),
        "alpha": float(line[97:103]),
        "phi": float(line[103:109]),
        "psi": float(line[109:115]),
        "x_ca": float(line[115:122]),
        "y_ca": float(line[122:129]),
        "z_ca": float(line[129:136]),
        "ss8": DSSP_TO_SS8.get(ss, "C"),
        "ss3": DSSP_TO_SS3.get(ss, "C"),
    }



[docs]
def dssp_to_df(output: str | Path) -> pd.DataFrame:
    """
    Parse "classical" DSSP output and convert it into a pandas dataframe.

    :param output: Full output as a str or a path to an output file.
    :return: A dataframe the same columns as in the DSSP output plust two
        additional columns: "ss8" and "ss3" with secondary structure
        designations (original 8-state and converted 3-state).
    """

    def parse(lines: abc.Iterable[str]) -> abc.Iterable[str]:
        lines = filter(lambda x: bool(x.strip()), lines)
        lines = dropwhile(lambda x: not x.startswith("  #  RESIDUE AA "), lines)
        next(lines)  # consume header
        yield from map(_parse_dssp_line, lines)

    if isinstance(output, str):
        lines = output.split("\n")
    elif isinstance(output, Path):
        lines = output.read_text().splitlines()
    else:
        raise TypeError(f"Invalid output type {type(output)}. Must be str or Path.")

    df = pd.DataFrame(parse(lines))
    df = df[df["chain_id"] != " "]  # remove polymer line breaks
    return df[list(DSSP_COLUMNS)]




[docs]
def dssp_set_ss_annotation(a: bst.AtomArray, df: pd.DataFrame) -> None:
    """
    Set secondary structure annotations for a given atom array.
    Modifies the provided array in-place to incorporate "ss8" and "ss3"
    annotation categories with ``np.nan`` designating missing annotations.

    :param a: Atom array.
    :param df: Dataframe with parsed DSSP output.
    :return: Nothing.

    .. seealso::
        :func:`dssp_to_df` :func:`dssp_run`
    """
    sub = df[["res_id", "chain_id", "ss3", "ss8"]]
    ss_map = {
        (res_id, chain_id): (ss3, ss8)
        for res_id, chain_id, ss3, ss8 in sub.itertuples(index=False)
    }
    starts = bst.get_residue_starts(a)
    annotations = (
        ss_map.get((atom.res_id, atom.chain_id), (np.nan, np.nan)) for atom in a[starts]
    )
    ss3, ss8 = map(
        lambda x: bst.spread_residue_wise(a, np.array(list(x))), unzip(annotations)
    )
    a.set_annotation("ss3", ss3)
    a.set_annotation("ss8", ss8)




[docs]
def dssp_run(
    structure: GenericStructure | ProteinStructure | bst.AtomArray | Path,
    exec_name: str = "mkdssp",
    set_ss_annotation: bool = False,
) -> pd.DataFrame:
    """
    Run DSSP and on a given structure and parse the ouptut.

    :param structure: Parsed structure or an atom array or path to a structure.
    :param exec_name: Name of the DSSP executable.
    :param set_ss_annotation: Set secondary structure annotations. Only works
        if `structure` is not a ``Path``.
    :return: Parsed DSSP output.

    .. seealso::
        :func:`dssp_set_ss_annotation` :func:`dssp_to_df`
    """
    if isinstance(structure, (GenericStructure, ProteinStructure)):
        a = structure.array
    elif isinstance(structure, Path):
        a = util.load_structure(structure)
    elif isinstance(structure, bst.AtomArray):
        a = structure
    else:
        raise TypeError(f"Invalid structure type {type(structure)}")

    # New DSSP versions work primarily with mmCIF. However, mmCIF written by
    # biotite is missing some important header info, and DSSP doesn't work with
    # it. Hence, we write the array in PDB format and prepend a dummy header
    # so DSSP is happy with the PURITY of the PDB format and the input not
    # being GARBAGE (https://github.com/PDB-REDO/dssp/issues/10)
    with NamedTemporaryFile("w", suffix=".pdb") as tmp:
        file = PDBFile()
        file.set_structure(a)
        tmp.write("HEADER empty\n")
        file.write(tmp)
        tmp.seek(0)
        output = util.run_sp(f"{exec_name} --output-format dssp {tmp.name}").stdout
        df = dssp_to_df(output)

    if set_ss_annotation and not isinstance(structure, Path):
        dssp_set_ss_annotation(a, df)

    return df



if __name__ == "__main__":
    raise RuntimeError