Source code for hippo.iset

"""Classes for working with sets of interactions"""

import mcol
import mrich



[docs]
class InteractionTable:
    """Class representing all :class:`.Interaction` objects in the 'interaction' table of the :class:`.Database`.

    .. attention::

            :class:`.InteractionTable` objects should not be created directly. Instead use the :meth:`.HIPPO.interactions` property.

    """

    def __init__(self, db: "Database", table: str = "interaction") -> None:
        """InteractionTable initialisation"""

        self._db = db
        self._df = None
        self._table = table

    ### PROPERTIES

    @property
    def db(self) -> "Database":
        """Returns the associated :class:`.Database`

        :returns: the database object stored at initialisation
        """
        return self._db

    @property
    def table(self) -> str:
        """Returns the name of the :class:`.Database` table

        :returns: the table name stored at initialisation
        """
        return self._table

    @property
    def df(self) -> "pandas.DataFrame":
        """DataFrame representation of the interactions

        :returns: a ``pandas.Dataframe`` of the interactions

        """

        if self._df is None:
            records = self.db.select_all_where(
                table="interaction", key=f"interaction_id > 0", multiple=True
            )
            df = df_from_interaction_records(self.db, records)
            self._df = df

        return self._df

    ### DUNDERS


[docs]
    def __len__(self) -> int:
        """The total number of interactions

        :returns: the result of ``db.count`` for this object's table
        """
        return self.db.count(self.table)



[docs]
    def __str__(self) -> str:
        """Unformatted command-line representation"""
        return "{" f"I × {len(self)}" "}"



[docs]
    def __repr__(self) -> str:
        """ANSI formatted command-line representation"""
        return f"{mcol.bold}{mcol.underline}{self}{mcol.unbold}{mcol.ununderline}"


    def __rich__(self) -> str:
        """Rich formatted command-line representation"""
        return f"[bold underline]{self}"




[docs]
class InteractionSet:
    """Class representing a subset of the :class:`.Interaction` objects in the 'interaction' table of the :class:`.Database`.

    .. attention::

            :class:`.InteractionSet` objects should not be created directly. Instead use :meth:`.Pose.interactions`, or :meth:`.PoseSet.interactions` methods.

    """

    def __init__(
        self,
        db: "Database",
        indices: list = None,
        table: str = "interaction",
    ) -> None:
        """InteractionSet initialisation"""

        self._db = db
        self._table = table

        indices = indices or []

        if not isinstance(indices, list):
            indices = list(indices)

        indices = [int(i) for i in indices]

        self._indices = sorted(list(set(indices)))
        self._df = None

    ### FACTORIES


[docs]
    @classmethod
    def from_pose(
        cls, pose: "Pose | PoseSet", table: str = "interaction"
    ) -> "InteractionSet":
        """Build the :class:`.InteractionSet` belonging to one or more poses.

        For a :class:`.PoseSet` a warning is logged when any of its poses has
        ``pose_fingerprint = 0`` in the 'pose' table.

        :param pose: a :class:`.Pose` or :class:`.PoseSet` object
        :param table: name of the interaction table (Default value = "interaction")
        :returns: an :class:`.InteractionSet`

        """

        from .pset import PoseSet

        if isinstance(pose, PoseSet):

            # count the poses that have not been fingerprinted yet
            (has_invalid_fps,) = pose.db.select_where(
                query="COUNT(1)",
                table="pose",
                key=f"pose_id IN {pose.str_ids} AND pose_fingerprint = 0",
            )

            if has_invalid_fps:
                mrich.warning(f"{has_invalid_fps} Poses have not been fingerprinted")

            condition = f"IN {pose.str_ids}"

        else:
            condition = f"= {pose.id}"

        sql = f"""
            SELECT interaction_id FROM {table}
            WHERE interaction_pose {condition}
            """

        rows = pose.db.execute(sql).fetchall()

        return cls(pose.db, [row_id for (row_id,) in rows], table=table)



[docs]
    @classmethod
    def all(
        cls,
        db: "Database",
        table: str = "interaction",
    ) -> "InteractionSet":
        """Construct a :class:`.InteractionSet` for all interactions in the table.

        :returns: an :class:`.InteractionSet`

        """

        sql = f"SELECT interaction_id FROM {table}"

        ids = db.execute(sql).fetchall()

        ids = [i for i, in ids]

        self = cls.__new__(cls)
        self.__init__(db, ids, table=table)

        return self



[docs]
    @classmethod
    def from_residue(
        cls,
        db: "Database",
        residue_number: int,
        chain: None | str = None,
        target: "Target | int" = 1,
        table: str = "interaction",
    ) -> "InteractionSet":
        """Get the set of interactions for a given residue number (and chain)

        :param db: HIPPO :class:`.Database`
        :param residue_number: the residue number
        :param chain: the chain name / letter, defaults to any chain
        :param target: the protein :class:`.Target` object or ID, defaults to ID 1
        :param table: name of the interaction table (Default value = "interaction")
        :returns: a :class:`.InteractionSet` object
        """

        from .target import Target

        if isinstance(target, Target):
            target = target.id

        sql = f"""
        SELECT interaction_id FROM {table}
        INNER JOIN feature
        ON interaction_feature = feature_id
        WHERE feature_target = {target}
        AND feature_residue_number = {residue_number}
        """

        if chain:
            # Single quotes mark an SQL string literal; double quotes denote an
            # identifier and are only accepted as strings by SQLite as a legacy
            # quirk. Embedded single quotes are escaped by doubling them.
            escaped_chain = chain.replace("'", "''")
            sql += f" AND feature_chain_name = '{escaped_chain}'"

        rows = db.execute(sql).fetchall()

        return cls(db, [row_id for (row_id,) in rows], table=table)


    ### PROPERTIES

    @property
    def indices(self) -> list[int]:
        """Returns the ids of interactions in this set

        :returns: the list of interaction IDs stored on this object
        """
        return self._indices

    @property
    def ids(self) -> list[int]:
        """Returns the ids of interactions in this set

        :returns: the list of interaction IDs stored on this object
        """
        return self._indices

    @property
    def types(self) -> list[str]:
        """Returns the ids of interactions in this set"""
        records = self.db.select_where(
            query="interaction_type",
            table="interaction",
            key=f"interaction_id IN {self.str_ids}",
            multiple=True,
        )
        return [r for r, in records]

    @property
    def db(self) -> "Database":
        """The associated HIPPO :class:`.Database`

        :returns: the database object stored at initialisation
        """
        return self._db

    @property
    def table(self) -> str:
        """Get the name of the database table

        :returns: the table name stored at initialisation
        """
        return self._table

    @property
    def str_ids(self) -> str:
        """Return an SQL formatted tuple string of the :class:`.Interaction` IDs"""
        return str(tuple(self.ids)).replace(",)", ")")

    @property
    def classic_fingerprint(self) -> dict:
        """Classic HIPPO fingerprint dictionary, mapping protein :class:`.Feature` ID's to the number of corresponding ligand features (from any :class:`.Pose`)

        :returns: the result of :meth:`get_classic_fingerprint`
        """
        return self.get_classic_fingerprint()

    @property
    def df(self) -> "pandas.DataFrame":
        """DataFrame representation of the interactions

        :returns: a ``pandas.Dataframe`` of the interactions

        """

        if self._df is None:
            records = self.db.select_all_where(
                table=self.table,
                key=f"interaction_id IN {self.str_ids}",
                multiple=True,
            )
            df = df_from_interaction_records(self.db, records)
            self._df = df

        return self._df

    @property
    def residue_number_chain_pairs(self) -> list[tuple]:
        """Get a list of ``(residue_number, chain_name)`` tuples"""

        sql = f"""
        SELECT DISTINCT feature_residue_number, feature_chain_name FROM {self.table}
        INNER JOIN feature
        ON feature_id = interaction_feature
        WHERE interaction_id IN {self.str_ids}
        """

        return self.db.execute(sql).fetchall()

    @property
    def avg_num_residues_per_pose(self) -> list[tuple]:
        """Get a list of ``(residue_number, chain_name)`` tuples"""

        sql = f"""
        SELECT DISTINCT interaction_pose, feature_residue_number, feature_chain_name FROM {self.table}
        INNER JOIN feature
        ON feature_id = interaction_feature
        WHERE interaction_id IN {self.str_ids}
        """

        records = self.db.execute(sql).fetchall()

        from collections import defaultdict
        from numpy import mean

        d = defaultdict(set)

        for pose_id, res_num, chain_name in records:
            d[pose_id].add((res_num, chain_name))

        return mean(list(len(v) for v in d.values()))

    @property
    def avg_num_interactions_per_pose(self) -> list[tuple]:
        """Get a list of ``(residue_number, chain_name)`` tuples"""

        sql = f"""
        SELECT interaction_pose FROM {self.table}
        WHERE interaction_id IN {self.str_ids}
        """

        records = self.db.execute(sql).fetchall()

        from collections import defaultdict
        from numpy import mean

        d = defaultdict(int)

        for (pose_id,) in records:
            d[pose_id] += 1

        return mean(list(d.values()))

    @property
    def avg_num_interaction_type_residue_pairs_per_pose(self) -> list[tuple]:
        """Get a list of ``(residue_number, chain_name)`` tuples"""

        sql = f"""
        SELECT DISTINCT interaction_pose, interaction_type, feature_residue_number, feature_chain_name FROM {self.table}
        INNER JOIN feature
        ON feature_id = interaction_feature
        WHERE interaction_id IN {self.str_ids}
        """

        records = self.db.execute(sql).fetchall()

        from collections import defaultdict
        from numpy import mean

        d = defaultdict(set)

        for pose_id, type, res_num, chain_name in records:
            d[pose_id].add((res_num, type, chain_name))

        return mean(list(len(v) for v in d.values()))

    @property
    def type_residue_number_chain_triples(self) -> list[tuple]:
        """Get a list of ``(interaction_type, residue_number, chain_name)`` tuples"""

        sql = f"""
        SELECT DISTINCT interaction_type, feature_residue_number, feature_chain_name FROM {self.table}
        INNER JOIN feature
        ON feature_id = interaction_feature
        WHERE interaction_id IN {self.str_ids}
        """

        return self.db.execute(sql).fetchall()

    @property
    def num_features(self) -> int:
        """Count the funmber of protein :class:`.Feature`s with which interactions are formed"""

        (count,) = self.db.execute(
            f"""
        SELECT COUNT(DISTINCT interaction_feature) FROM {self.table}
        WHERE interaction_id IN {self.str_ids}
        """
        ).fetchone()

        return count

    @property
    def avg_num_interactions_per_feature(self) -> float:
        """Average number of interactions formed with each protein :class:`.Feature`"""

        (count,) = self.db.execute(
            f"""
        WITH counts AS
        (
            SELECT interaction_feature, COUNT(1) AS count FROM {self.table}
            WHERE interaction_id IN {self.str_ids}
            GROUP BY interaction_feature
        )

        SELECT AVG(count) FROM counts
        """
        ).fetchone()

        return count

    @property
    def per_feature_count_hirsch(self) -> float:
        """A measure for how evenly protein :class:`.Feature`s are being interacted with"""

        counts = self.db.execute(
            f"""
        SELECT interaction_feature, COUNT(1) AS count FROM interaction
        WHERE interaction_id IN {self.str_ids}
        GROUP BY interaction_feature
        """
        ).fetchall()

        counts = [count for f_id, count in counts]

        from numpy import std

        # return -std(counts)

        from hirsch import hirsch

        if not counts:
            return 0

        return hirsch(counts)

    ### METHODS


[docs]
    def summary(
        self,
        families: bool = False,
    ) -> None:
        """Print a summary of this :class:`.InteractionSet`"""

        mrich.header(self)

        for interaction in self:
            # print(interaction)

            # mrich.var(f'{interaction.family_str}', f'{interaction.distance:.1f}')
            s = f"{interaction.description}"

            if families:
                s += f" {interaction.feature.family} ~ {interaction.family}"

            mrich.var(s, f"{interaction.distance:.1f}", "Å")



[docs]
    def get_classic_fingerprint(self) -> dict:
        """Classic HIPPO fingerprint dictionary, mapping protein :class:`.Feature` ID's to the number of corresponding ligand features (from any :class:`.Pose`)"""

        pairs = self.db.execute(
            f"""
        SELECT interaction_feature, COUNT(1) FROM {self.table}
        WHERE interaction_id IN {self.str_ids}
        GROUP BY interaction_feature
        """
        ).fetchall()

        return {f: c for f, c in pairs}



[docs]
    def resolve(
        self,
        debug: bool = False,
        commit: bool = True,
        # table: str = 'interaction',
    ) -> "InteractionSet":
        """Resolve into predicted key interactions. In place modification.

        :param debug: Increased verbosity for debugging (Default value = False)
        :param commit: commit the changes (Default value = True)
        :returns: a filtered :class:`.InteractionSet`
        """

        keep_list = []

        table = self.table

        ### H-Bonds (closest)

        sql = f"""
        SELECT interaction_id, MIN(interaction_distance)
        FROM {table}
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "Hydrogen Bond"
        GROUP BY interaction_atom_ids
        """

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]
        keep_list += ids

        ### pi-stacking (closest)

        sql = f"""
        SELECT interaction_id, MIN(interaction_distance)
        FROM {table}
        INNER JOIN feature
        ON feature_id = interaction_feature
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "π-stacking"
        GROUP BY feature_atom_names
        """
        # GROUP BY interaction_atom_ids

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]
        keep_list += ids

        ### pi-cation (closest)

        sql = f"""
        SELECT interaction_id, MIN(interaction_distance)
        FROM {table}
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "π-cation"
        GROUP BY interaction_atom_ids
        """
        # GROUP BY interaction_atom_ids

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]
        keep_list += ids

        ### electrostatic (closest)

        sql = f"""
        SELECT interaction_id, MIN(interaction_distance)
        FROM {table}
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "Electrostatic"
        GROUP BY interaction_atom_ids
        """
        # GROUP BY interaction_atom_ids

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]
        keep_list += ids

        ### sulfur-sulfur (all)

        sql = f"""
        SELECT interaction_id
        FROM {table}
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "Sulfur-Sulfur"
        """

        records = self.db.execute(sql).fetchall()
        ids = [a for a, in records]
        keep_list += ids

        ### hydrophobic

        sql = f"""
        SELECT interaction_id, interaction_distance
        FROM {table}
        WHERE interaction_id IN {self.str_ids}
        AND interaction_type = "Hydrophobic"
        """

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]
        subset = InteractionSet(self.db, ids, table=table)

        # aggregate lumped

        hydrophobic_interactions_in_lumped = {}
        lumped_hydrophobic_in_lumped_lumped = {}

        for interaction in subset:
            families = (interaction.feature.family, interaction.family)

            if families == ("LumpedHydrophobe", "Hydrophobe"):
                for name in interaction.feature.atom_names.split():
                    key = (name, interaction.atom_ids[0])
                    if key not in hydrophobic_interactions_in_lumped:
                        hydrophobic_interactions_in_lumped[key] = []
                    hydrophobic_interactions_in_lumped[key].append(interaction.id)

            elif families == ("Hydrophobe", "LumpedHydrophobe"):
                for atom_id in interaction.atom_ids:
                    key = (interaction.feature.atom_names, atom_id)
                    if key not in hydrophobic_interactions_in_lumped:
                        hydrophobic_interactions_in_lumped[key] = []
                    hydrophobic_interactions_in_lumped[key].append(interaction.id)

            elif families == ("LumpedHydrophobe", "LumpedHydrophobe"):
                for name in interaction.feature.atom_names.split():
                    for atom_id in interaction.atom_ids:
                        key = (name, atom_id)
                        if key not in hydrophobic_interactions_in_lumped:
                            hydrophobic_interactions_in_lumped[key] = []
                        hydrophobic_interactions_in_lumped[key].append(interaction.id)

                key = interaction.feature.atom_names
                lumped_hydrophobic_in_lumped_lumped[key] = tuple(interaction.atom_ids)

        keep_hydrophobic_ids = set(subset.ids)
        rev_hydrophobic_in_lumped_lumped = {
            v: k for k, v in lumped_hydrophobic_in_lumped_lumped.items()
        }

        # modify keep list by those covered in lumped

        for interaction in subset:
            families = (interaction.feature.family, interaction.family)

            if families == ("Hydrophobe", "Hydrophobe"):
                key = (interaction.feature.atom_names, interaction.atom_ids[0])

                if key in hydrophobic_interactions_in_lumped:
                    keep_hydrophobic_ids -= set([interaction.id])

            elif families == ("LumpedHydrophobe", "Hydrophobe"):

                key = interaction.feature.atom_names

                if key in lumped_hydrophobic_in_lumped_lumped:
                    atom_id = interaction.atom_ids[0]
                    value = lumped_hydrophobic_in_lumped_lumped[key]
                    if atom_id in value:
                        keep_hydrophobic_ids -= set([interaction.id])

            elif families == ("Hydrophobe", "LumpedHydrophobe"):

                key = tuple(interaction.atom_ids)

                if key in rev_hydrophobic_in_lumped_lumped:

                    atom_name = interaction.feature.atom_names
                    value = rev_hydrophobic_in_lumped_lumped[key]

                    if atom_name in value:
                        keep_hydrophobic_ids -= set([interaction.id])

        keep_list += list(keep_hydrophobic_ids)

        ### cull non-keepers

        cull_list = set(self.ids) - set(keep_list)
        cull_iset = InteractionSet(self.db, cull_list)
        self.db.delete_where(
            table=table,
            key=f"interaction_id IN {cull_iset.str_ids}",
            commit=commit,
        )
        self._indices = sorted(list(set(keep_list)))

        ### revisit hydrophobes

        # for a given protein feature, choose the closest interaction

        cull_list = []

        hydrophobic_keeper_iset = InteractionSet(self.db, keep_hydrophobic_ids)

        sql = f"""
        SELECT interaction_id, MIN(interaction_distance)
        FROM {table}
        WHERE interaction_id IN {hydrophobic_keeper_iset.str_ids}
        GROUP BY interaction_feature
        """

        records = self.db.execute(sql).fetchall()
        ids = [a for a, b in records]

        cull_list = set(hydrophobic_keeper_iset.ids) - set(ids)
        cull_iset = InteractionSet(self.db, cull_list)
        self.db.delete_where(
            table=table,
            key=f"interaction_id IN {cull_iset.str_ids}",
            commit=commit,
        )
        self._indices = sorted(list(set(keep_list) - cull_list))

        ### Summary

        if debug:
            self.summary()


    ### DUNDERS


[docs]
    def __len__(self) -> int:
        """The number of interactions in this set

        :returns: the length of the stored list of interaction IDs
        """
        return len(self.indices)



[docs]
    def __str__(self) -> str:
        """Unformatted command-line representation"""
        return "{" f"I × {len(self)}" "}"



[docs]
    def __repr__(self) -> str:
        """ANSI formatted command-line representation"""
        return f"{mcol.bold}{mcol.underline}{self}{mcol.unbold}{mcol.ununderline}"


    def __rich__(self) -> str:
        """Rich formatted command-line representation"""
        return f"[bold underline]{self}"


[docs]
    def __iter__(self):
        """Iterate through interactions in this set"""
        return iter(
            self.db.get_interaction(id=i, table=self.table) for i in self.indices
        )



[docs]
    def __getitem__(self, key) -> "Interaction | InteractionSet":
        """Get interaction or subsets thereof from this set"""
        match key:
            case int():
                index = self.indices[key]
                return self.db.get_interaction(id=index, table=self.table)

            case slice():
                indices = self.indices[key]
                return InteractionSet(self.db, indices, table=self.table)

            case _:
                raise NotImplementedError




def df_from_interaction_records(
    db: "Database",
    records: list[tuple],
) -> "pandas.DataFrame":
    """Construct a dataframe from the 'interaction' table records

    Every record is unpacked into eleven values: interaction ID, feature ID,
    pose ID, type, ligand family, ligand atom IDs, protein coordinate (JSON),
    ligand coordinate (JSON), distance, angle and energy. The protein-side
    columns are filled in from the feature returned by ``db.get_feature``.

    :param db: the :class:`.Database` used to look up features
    :param records: rows of the interaction table
    :returns: a ``pandas.DataFrame`` with one row per record
    """

    import json
    from pandas import DataFrame

    # many interactions share a protein feature: look each one up only once
    features = {}

    data = []
    for record in records:

        (
            interaction_id,
            feature_id,
            pose_id,
            interaction_type,
            lig_family,
            atom_ids,
            prot_coord,
            lig_coord,
            distance,
            angle,
            energy,
        ) = record

        feature = features.get(feature_id)
        if feature is None:
            feature = features[feature_id] = db.get_feature(id=feature_id)

        data.append(
            {
                "id": interaction_id,
                "feature_id": feature_id,
                "pose_id": pose_id,
                "target_id": feature.target,
                "type": interaction_type,
                "prot_family": feature.family,
                "lig_family": lig_family,
                "residue_name": feature.residue_name,
                "residue_number": feature.residue_number,
                "chain_name": feature.chain_name,
                "distance": distance,
                "angle": angle,
                "energy": energy,
                "prot_coord": json.loads(prot_coord),
                "lig_coord": json.loads(lig_coord),
                "prot_atoms": feature.atom_names,
                "lig_atoms": atom_ids,
                "backbone": feature.backbone,
                "sidechain": feature.sidechain,
            }
        )

    return DataFrame.from_records(data=data)