Source code for kg_topology_toolbox.topology_toolbox

# -*- coding: utf-8 -*-
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.

"""
Topology toolbox main functionalities
"""

from functools import cache

import numpy as np
import pandas as pd
from scipy.sparse import coo_array

from kg_topology_toolbox.utils import (
    aggregate_by_relation,
    check_kg_df_structure,
    composition_count,
    jaccard_similarity,
    node_degrees_and_rels,
)


class KGTopologyToolbox:
    """
    Toolbox class to compute Knowledge Graph topology statistics.
    """

    def __init__(
        self,
        kg_df: pd.DataFrame,
        head_column: str = "h",
        relation_column: str = "r",
        tail_column: str = "t",
    ):
        """
        Instantiate the Topology Toolbox for a Knowledge Graph defined by the
        list of its edges (h,r,t).

        :param kg_df: A Knowledge Graph represented as a pd.DataFrame.
            Must contain at least three columns, which specify the IDs of head
            entity, relation type and tail entity for each edge.
        :param head_column: The name of the column with the IDs of head
            entities. Default: "h".
        :param relation_column: The name of the column with the IDs of
            relation types. Default: "r".
        :param tail_column: The name of the column with the IDs of tail
            entities. Default: "t".
        """
        check_kg_df_structure(kg_df, head_column, relation_column, tail_column)
        self.df = kg_df[[head_column, relation_column, tail_column]].rename(
            columns={head_column: "h", relation_column: "r", tail_column: "t"}
        )
        self.n_entity = self.df[["h", "t"]].max().max() + 1
        self.n_rel = self.df.r.max() + 1

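    # Illustrative usage sketch (the toy graph below is an assumption for
    # demonstration purposes, not part of the library):
    #
    #   >>> import pandas as pd
    #   >>> from kg_topology_toolbox.topology_toolbox import KGTopologyToolbox
    #   >>> df = pd.DataFrame(
    #   ...     {"h": [0, 1, 1, 2, 0], "r": [0, 0, 1, 1, 2], "t": [1, 0, 2, 0, 0]}
    #   ... )
    #   >>> kgtt = KGTopologyToolbox(df)
    #
    # Note that n_entity and n_rel are inferred from the maximum IDs, so
    # entity and relation IDs are assumed to be zero-based integers.
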
    def loop_count(self) -> pd.DataFrame:
        """
        For each entity in the KG, compute the number of loops around the
        entity (i.e., the number of edges having the entity as both head
        and tail).

        :return:
            Loop count DataFrame, indexed on the IDs of the graph entities.
        """
        n_loops = (
            self.df[self.df.h == self.df.t].groupby("h").agg(n_loops=("r", "count"))
        )
        return (
            pd.DataFrame(n_loops, index=np.arange(self.n_entity)).fillna(0).astype(int)
        )

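    # Illustrative sketch, on the toy graph from the constructor example
    # above: the only loop is (0, 2, 0), so entity 0 gets n_loops = 1.
    #
    #   >>> kgtt.loop_count()
    #      n_loops
    #   0        1
    #   1        0
    #   2        0
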
    @cache
    def node_head_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
        """
        For each entity in the KG, compute the number of edges having it as
        head (head-degree, or out-degree of the head node). The relation types
        going out of the head node are also identified.

        :param return_relation_list: If True, return the list of unique
            relations going out of the head node. WARNING: expensive for
            large graphs. Default: False.

        :return:
            The result DataFrame, indexed on the IDs `e` of the graph
            entities, with columns:

            - **h_degree** (int): Number of triples with head entity `e`.
            - **h_unique_rel** (int): Number of distinct relation types
              among edges with head entity `e`.
            - **h_rel_list** (Optional[list]): List of unique relation types
              among edges with head entity `e`.
              Only returned if `return_relation_list = True`.
        """
        node_df = node_degrees_and_rels(
            self.df, "h", self.n_entity, return_relation_list
        )
        return node_df.rename(columns={n: "h_" + n for n in node_df.columns})

    @cache
    def node_tail_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
        """
        For each entity in the KG, compute the number of edges having it as
        tail (tail-degree, or in-degree of the tail node). The relation types
        going into the tail node are also identified.

        :param return_relation_list: If True, return the list of unique
            relation types going into the tail node. WARNING: expensive for
            large graphs. Default: False.

        :return:
            The result DataFrame, indexed on the IDs `e` of the graph
            entities, with columns:

            - **t_degree** (int): Number of triples with tail entity `e`.
            - **t_unique_rel** (int): Number of distinct relation types
              among edges with tail entity `e`.
            - **t_rel_list** (Optional[list]): List of unique relation types
              among edges with tail entity `e`.
              Only returned if `return_relation_list = True`.
        """
        node_df = node_degrees_and_rels(
            self.df, "t", self.n_entity, return_relation_list
        )
        return node_df.rename(columns={n: "t_" + n for n in node_df.columns})

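    # Illustrative sketch, on the toy graph from the constructor example:
    # entity 0 is the tail of the three edges (1, 0, 0), (2, 1, 0) and
    # (0, 2, 0), each with a distinct relation type, so it gets
    # t_degree = 3 and t_unique_rel = 3.
    #
    #   >>> kgtt.node_tail_degree().loc[0, ["t_degree", "t_unique_rel"]]
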
    def node_degree_summary(self, return_relation_list: bool = False) -> pd.DataFrame:
        """
        For each entity in the KG, compute the number of edges having it as a
        head (head-degree, or out-degree), as a tail (tail-degree, or
        in-degree) or one of the two (total-degree).
        The in-going and out-going relation types are also identified.

        The output dataframe is indexed on the IDs of the graph entities.

        :param return_relation_list: If True, return the list of unique
            relations going in/out of an entity. WARNING: expensive for
            large graphs.

        :return:
            The results dataframe, indexed on the IDs `e` of the graph
            entities, with columns:

            - **h_degree** (int): Number of triples with head entity `e`.
            - **t_degree** (int): Number of triples with tail entity `e`.
            - **tot_degree** (int): Number of triples with head entity `e`
              or tail entity `e`.
            - **h_unique_rel** (int): Number of distinct relation types
              among edges with head entity `e`.
            - **h_rel_list** (Optional[list]): List of unique relation types
              among edges with head entity `e`.
              Only returned if `return_relation_list = True`.
            - **t_unique_rel** (int): Number of distinct relation types
              among edges with tail entity `e`.
            - **t_rel_list** (Optional[list]): List of unique relation types
              among edges with tail entity `e`.
              Only returned if `return_relation_list = True`.
            - **n_loops** (int): Number of loops around entity `e`.
        """
        nodes_df = pd.merge(
            self.node_head_degree(return_relation_list),
            self.node_tail_degree(return_relation_list),
            left_index=True,
            right_index=True,
        )
        nodes_df = pd.merge(
            nodes_df,
            self.loop_count(),
            left_index=True,
            right_index=True,
        )
        nodes_df["tot_degree"] = (
            nodes_df["h_degree"] + nodes_df["t_degree"] - nodes_df["n_loops"]
        )
        return nodes_df[
            ["h_degree", "t_degree", "tot_degree", "h_unique_rel"]
            + (["h_rel_list"] if return_relation_list else [])
            + ["t_unique_rel"]
            + (["t_rel_list"] if return_relation_list else [])
            + ["n_loops"]
        ]

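    # Illustrative sketch, on the toy graph: entity 0 has h_degree = 2,
    # t_degree = 3 and n_loops = 1, so tot_degree = 2 + 3 - 1 = 4 (the loop
    # (0, 2, 0) is counted only once in the total).
    #
    #   >>> kgtt.node_degree_summary().loc[0, "tot_degree"]
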
    @cache
    def edge_head_degree(self) -> pd.DataFrame:
        """
        For each edge in the KG, compute the number of edges (in total or
        of the same relation type) with the same head node.

        :return:
            The result DataFrame, with the same indexing and ordering of
            triples as the original KG DataFrame, with columns
            (in addition to `h`, `r`, `t`):

            - **h_unique_rel** (int): Number of distinct relation types
              among edges with head entity `h`.
            - **h_degree** (int): Number of triples with head entity `h`.
            - **h_degree_same_rel** (int): Number of triples with head
              entity `h` and relation type `r`.
        """
        edge_by_hr_count = self.df.groupby(["h", "r"], as_index=False).agg(
            h_degree_same_rel=("t", "count")
        )
        df_res = self.df.merge(
            self.node_head_degree(), left_on=["h"], right_index=True, how="left"
        )
        return df_res.merge(edge_by_hr_count, on=["h", "r"], how="left")

    @cache
    def edge_tail_degree(self) -> pd.DataFrame:
        """
        For each edge in the KG, compute the number of edges (in total or
        of the same relation type) with the same tail node.

        :return:
            The result DataFrame, with the same indexing and ordering of
            triples as the original KG DataFrame, with columns
            (in addition to `h`, `r`, `t`):

            - **t_unique_rel** (int): Number of distinct relation types
              among edges with tail entity `t`.
            - **t_degree** (int): Number of triples with tail entity `t`.
            - **t_degree_same_rel** (int): Number of triples with tail
              entity `t` and relation type `r`.
        """
        edge_by_rt_count = self.df.groupby(["r", "t"], as_index=False).agg(
            t_degree_same_rel=("h", "count")
        )
        df_res = self.df.merge(
            self.node_tail_degree(), left_on=["t"], right_index=True, how="left"
        )
        return df_res.merge(edge_by_rt_count, on=["r", "t"], how="left")

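    # Illustrative sketch, on the toy graph: the first edge (0, 0, 1) shares
    # its head with (0, 2, 0), so h_degree = 2, but it is the only edge with
    # head 0 and relation 0, so h_degree_same_rel = 1.
    #
    #   >>> kgtt.edge_head_degree().loc[0, ["h_degree", "h_degree_same_rel"]]
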
    def edge_cardinality(self) -> pd.DataFrame:
        """
        Classify the cardinality of each edge in the KG: one-to-one
        (out-degree = in-degree = 1), one-to-many (out-degree > 1,
        in-degree = 1), many-to-one (out-degree = 1, in-degree > 1) or
        many-to-many (out-degree > 1, in-degree > 1).

        :return:
            The result DataFrame, with the same indexing and ordering of
            triples as the original KG DataFrame, with columns
            (in addition to `h`, `r`, `t`):

            - **triple_cardinality** (str): cardinality type of the edge.
            - **triple_cardinality_same_rel** (str): cardinality type of the
              edge in the subgraph of edges with relation type `r`.
        """
        head_degree = self.edge_head_degree()
        tail_degree = self.edge_tail_degree()
        df_res = pd.DataFrame(
            {"h": head_degree.h, "r": head_degree.r, "t": head_degree.t}
        )
        for suffix in ["", "_same_rel"]:
            # check if the values in the pair (h_degree, t_degree) are =1 or >1
            # to determine the edge cardinality
            edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + (
                tail_degree["t_degree" + suffix] == 1
            )
            df_res["triple_cardinality" + suffix] = pd.cut(
                edge_type,
                bins=[0, 1, 2, 3, 4],
                right=False,
                labels=["M:M", "1:M", "M:1", "1:1"],
            ).astype(str)
        return df_res

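    # Illustrative sketch, on the toy graph: the edge (0, 0, 1) has
    # h_degree = 2 > 1 and t_degree = 1 (entity 1 is the tail of one edge
    # only), so the triple is classified as one-to-many.
    #
    #   >>> kgtt.edge_cardinality().loc[0, "triple_cardinality"]
    #   '1:M'
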
    def edge_degree_cardinality_summary(
        self, aggregate_by_r: bool = False
    ) -> pd.DataFrame:
        """
        For each edge in the KG, compute the number of edges with the same
        head (head-degree, or out-degree), the same tail (tail-degree, or
        in-degree) or one of the two (total-degree).
        Based on entity degrees, each triple is classified as either
        one-to-one (out-degree = in-degree = 1), one-to-many (out-degree > 1,
        in-degree = 1), many-to-one (out-degree = 1, in-degree > 1) or
        many-to-many (out-degree > 1, in-degree > 1).

        The output dataframe maintains the same indexing and ordering of
        triples as the original Knowledge Graph dataframe.

        :param aggregate_by_r: If True, return metrics aggregated by relation
            type (the output DataFrame will be indexed over relation IDs).

        :return:
            The results dataframe. Contains the following columns
            (in addition to `h`, `r`, `t`):

            - **h_unique_rel** (int): Number of distinct relation types
              among edges with head entity h.
            - **h_degree** (int): Number of triples with head entity h.
            - **h_degree_same_rel** (int): Number of triples with head
              entity h and relation type r.
            - **t_unique_rel** (int): Number of distinct relation types
              among edges with tail entity t.
            - **t_degree** (int): Number of triples with tail entity t.
            - **t_degree_same_rel** (int): Number of triples with tail
              entity t and relation type r.
            - **tot_degree** (int): Number of triples with head entity h or
              tail entity t.
            - **tot_degree_same_rel** (int): Number of triples with head
              entity h or tail entity t, and relation type r.
            - **triple_cardinality** (str): cardinality type of the edge.
            - **triple_cardinality_same_rel** (str): cardinality type of the
              edge in the subgraph of edges with relation type r.
        """
        df_res = pd.concat(
            [
                self.edge_head_degree(),
                self.edge_tail_degree().drop(columns=["h", "r", "t"]),
            ],
            axis=1,
        )
        # compute number of parallel edges to avoid double-counting them
        # in total degree
        num_parallel = df_res.merge(
            self.df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")),
            on=["h", "t"],
            how="left",
        )
        df_res["tot_degree"] = (
            df_res.h_degree + df_res.t_degree - num_parallel.n_parallel
        )
        # when restricting to the relation type, there is only one edge
        # (the edge itself) that is double-counted
        df_res["tot_degree_same_rel"] = (
            df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1
        )
        edge_cardinality = self.edge_cardinality()
        df_res["triple_cardinality"] = edge_cardinality["triple_cardinality"]
        df_res["triple_cardinality_same_rel"] = edge_cardinality[
            "triple_cardinality_same_rel"
        ]
        return aggregate_by_relation(df_res) if aggregate_by_r else df_res

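    # Illustrative sketch, on the toy graph: for the edge (0, 0, 1) there is
    # a single edge between head 0 and tail 1 (n_parallel = 1), so
    # tot_degree = h_degree + t_degree - 1 = 2 + 1 - 1 = 2. Passing
    # aggregate_by_r=True instead returns one row of aggregated metrics per
    # relation type:
    #
    #   >>> edcs = kgtt.edge_degree_cardinality_summary()
    #   >>> edcs_by_rel = kgtt.edge_degree_cardinality_summary(aggregate_by_r=True)
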
    def edge_pattern_summary(
        self,
        return_metapath_list: bool = False,
        composition_chunk_size: int = 2**8,
        composition_workers: int = 32,
        aggregate_by_r: bool = False,
    ) -> pd.DataFrame:
        """
        Analyse structural properties of each edge in the KG: symmetry,
        presence of inverse/inference (i.e., parallel) edges and of triangles
        supported on the edge.

        The output dataframe maintains the same indexing and ordering of
        triples as the original Knowledge Graph dataframe.

        :param return_metapath_list: If True, return the list of unique
            metapaths for all triangles supported over one edge.
            WARNING: very expensive for large graphs.
        :param composition_chunk_size: Size of the column chunks of the
            sparse adjacency matrix used to compute the triangle count.
        :param composition_workers: Number of workers to compute the
            triangle count.
        :param aggregate_by_r: If True, return metrics aggregated by relation
            type (the output DataFrame will be indexed over relation IDs).

        :return:
            The results dataframe. Contains the following columns
            (in addition to `h`, `r`, `t`):

            - **is_loop** (bool): True if the triple is a loop (``h == t``).
            - **is_symmetric** (bool): True if the triple (t, r, h) is also
              contained in the graph (assuming t and h are different).
            - **has_inverse** (bool): True if the graph contains one or more
              triples (t, r', h) with ``r' != r``.
            - **n_inverse_relations** (int): The number of inverse
              relations r'.
            - **inverse_edge_types** (list): All relations r' (including r
              if the edge is symmetric) such that (t, r', h) is in the graph.
            - **has_inference** (bool): True if the graph contains one or
              more triples (h, r', t) with ``r' != r``.
            - **n_inference_relations** (int): The number of inference
              relations r'.
            - **inference_edge_types** (list): All relations r' (including r)
              such that (h, r', t) is in the graph.
            - **has_composition** (bool): True if the graph contains one or
              more triangles supported on the edge: (h, r1, x) + (x, r2, t).
            - **n_triangles** (int): The number of triangles.
            - **has_undirected_composition** (bool): True if the graph
              contains one or more undirected triangles supported on the edge.
            - **n_undirected_triangles** (int): The number of undirected
              triangles (considering all edges as bidirectional).
            - **metapath_list** (list): The list of unique metapaths "r1-r2"
              for the directed triangles.
              Only returned if `return_metapath_list = True`.
        """
        # symmetry-asymmetry: edges with h/t switched
        df_inv = self.df.reindex(columns=["t", "r", "h"]).rename(
            columns={"t": "h", "r": "r", "h": "t"}
        )
        df_res = pd.DataFrame(
            {"h": self.df.h, "r": self.df.r, "t": self.df.t, "is_symmetric": False}
        )
        df_res.loc[
            self.df.reset_index().merge(df_inv)["index"],
            "is_symmetric",
        ] = True
        # loops are treated separately
        df_res["is_loop"] = df_res.h == df_res.t
        df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False
        # inverse
        unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg(
            inverse_edge_types=("r", list),
        )
        df_res = df_res.merge(
            unique_inv_r_by_ht, left_on=["h", "t"], right_on=["h", "t"], how="left"
        )
        df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply(
            lambda agg: agg if isinstance(agg, list) else []
        )
        # if the edge (h,r,t) is symmetric or a loop, we do not consider the
        # relation r as a proper inverse
        df_res["n_inverse_relations"] = (
            df_res.inverse_edge_types.str.len() - df_res.is_symmetric - df_res.is_loop
        )
        df_res["n_inverse_relations"] = (
            df_res["n_inverse_relations"].fillna(0).astype(int)
        )
        df_res["has_inverse"] = df_res["n_inverse_relations"] > 0
        # inference
        edges_between_ht = unique_inv_r_by_ht.reindex(
            columns=["t", "h", "inverse_edge_types"]
        ).rename(
            columns={"t": "h", "h": "t", "inverse_edge_types": "inference_edge_types"}
        )
        df_res = df_res.merge(
            edges_between_ht, left_on=["h", "t"], right_on=["h", "t"], how="left"
        )
        # inference_edge_types always contains the edge itself, which we need to drop
        df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1
        df_res["has_inference"] = df_res["n_inference_relations"] > 0
        # composition & metapaths
        # discard loops as edges of a triangle
        df_wo_loops = self.df[self.df.h != self.df.t]
        if return_metapath_list:
            # 2-hop paths
            df_bridges = df_wo_loops.merge(
                df_wo_loops, left_on="t", right_on="h", how="inner"
            )
            df_triangles = df_wo_loops.merge(
                df_bridges, left_on=["h", "t"], right_on=["h_x", "t_y"], how="inner"
            )
            df_triangles["metapath"] = (
                df_triangles["r_x"].astype(str) + "-" + df_triangles["r_y"].astype(str)
            )
            grouped_triangles = df_triangles.groupby(
                ["h", "r", "t"], as_index=False
            ).agg(
                n_triangles=("metapath", "count"), metapath_list=("metapath", "unique")
            )
            df_res = df_res.merge(
                grouped_triangles,
                left_on=["h", "r", "t"],
                right_on=["h", "r", "t"],
                how="left",
            )
            df_res["metapath_list"] = df_res["metapath_list"].apply(
                lambda agg: agg.tolist() if isinstance(agg, np.ndarray) else []
            )
            df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
        else:
            counts = composition_count(
                df_wo_loops,
                chunk_size=composition_chunk_size,
                workers=composition_workers,
                directed=True,
            )
            df_res = df_res.merge(
                counts,
                on=["h", "t"],
                how="left",
            )
            df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
        df_res["has_composition"] = df_res["n_triangles"] > 0
        counts = composition_count(
            df_wo_loops,
            chunk_size=composition_chunk_size,
            workers=composition_workers,
            directed=False,
        )
        df_res = df_res.merge(
            counts.rename(columns={"n_triangles": "n_undirected_triangles"}),
            on=["h", "t"],
            how="left",
        )
        df_res["n_undirected_triangles"] = (
            df_res["n_undirected_triangles"].fillna(0).astype(int)
        )
        df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0
        df_res = df_res[
            [
                "h",
                "r",
                "t",
                "is_loop",
                "is_symmetric",
                "has_inverse",
                "n_inverse_relations",
                "inverse_edge_types",
                "has_inference",
                "n_inference_relations",
                "inference_edge_types",
                "has_composition",
                "has_undirected_composition",
                "n_triangles",
                "n_undirected_triangles",
            ]
            + (["metapath_list"] if return_metapath_list else [])
        ]
        return aggregate_by_relation(df_res) if aggregate_by_r else df_res

    def jaccard_similarity_relation_sets(self) -> pd.DataFrame:
        """
        Compute the similarity between relations, defined as the Jaccard
        similarity between the sets of entities (heads and tails), for all
        pairs of relations in the graph.

        :return:
            The results dataframe. Contains the following columns:

            - **r1** (int): Index of the first relation.
            - **r2** (int): Index of the second relation.
            - **num_triples_both** (int): Number of triples with relation
              r1/r2.
            - **frac_triples_both** (float): Fraction of triples with
              relation r1/r2.
            - **num_entities_both** (int): Number of unique entities (h or t)
              for triples with relation r1/r2.
            - **num_h_r1** (int): Number of unique head entities for
              relation r1.
            - **num_h_r2** (int): Number of unique head entities for
              relation r2.
            - **num_t_r1** (int): Number of unique tail entities for
              relation r1.
            - **num_t_r2** (int): Number of unique tail entities for
              relation r2.
            - **jaccard_head_head** (float): Jaccard similarity between the
              head set of r1 and the head set of r2.
            - **jaccard_tail_tail** (float): Jaccard similarity between the
              tail set of r1 and the tail set of r2.
            - **jaccard_head_tail** (float): Jaccard similarity between the
              head set of r1 and the tail set of r2.
            - **jaccard_tail_head** (float): Jaccard similarity between the
              tail set of r1 and the head set of r2.
            - **jaccard_both** (float): Jaccard similarity between the full
              entity set of r1 and r2.
        """
        ent_unique = self.df.groupby("r", as_index=False).agg(
            num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique")
        )
        ent_unique["both"] = ent_unique.apply(
            lambda x: np.unique(np.concatenate([x["head"], x["tail"]])), axis=1
        )
        ent_unique["num_h"] = ent_unique["head"].str.len()
        ent_unique["num_t"] = ent_unique["tail"].str.len()
        r_num = ent_unique[["r", "num_h", "num_t", "num_triples"]]
        # combinations of relations
        df_res = pd.merge(
            r_num.rename(columns={"r": "r1"}),
            r_num.rename(columns={"r": "r2"}),
            suffixes=["_r1", "_r2"],
            how="cross",
        )
        df_res = df_res[df_res.r1 < df_res.r2]
        df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"]
        df_res["frac_triples_both"] = df_res["num_triples_both"] / self.df.shape[0]
        df_res["num_entities_both"] = df_res.apply(
            lambda x: len(
                np.unique(
                    np.concatenate(
                        [
                            ent_unique.loc[x["r1"], "both"],
                            ent_unique.loc[x["r2"], "both"],
                        ]
                    )
                )
            ),
            axis=1,
        )
        df_res = df_res[
            [
                "r1",
                "r2",
                "num_triples_both",
                "frac_triples_both",
                "num_entities_both",
                "num_h_r1",
                "num_h_r2",
                "num_t_r1",
                "num_t_r2",
            ]
        ]
        for r1_ent in ["head", "tail"]:
            for r2_ent in ["head", "tail"]:
                df_res[f"jaccard_{r1_ent}_{r2_ent}"] = [
                    jaccard_similarity(a, b)
                    for a, b in zip(
                        ent_unique.loc[df_res.r1, r1_ent],
                        ent_unique.loc[df_res.r2, r2_ent],
                    )
                ]
        df_res["jaccard_both"] = [
            jaccard_similarity(a, b)
            for a, b in zip(
                ent_unique.loc[df_res.r1, "both"], ent_unique.loc[df_res.r2, "both"]
            )
        ]
        return df_res

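    # Illustrative sketch, on the toy graph: relation 0 has head set {0, 1}
    # and relation 1 has head set {1, 2}, so for the pair (r1=0, r2=1)
    # jaccard_head_head = |{1}| / |{0, 1, 2}| = 1/3.
    #
    #   >>> jac = kgtt.jaccard_similarity_relation_sets()
    #   >>> jac.loc[(jac.r1 == 0) & (jac.r2 == 1), "jaccard_head_head"]
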
    def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame:
        """
        Compute the similarity between relations based on the approach
        proposed in InGram: Inductive Knowledge Graph Embedding via Relation
        Graphs, https://arxiv.org/abs/2305.19987.

        Only the pairs of relations with ``affinity > 0`` are shown in the
        returned dataframe.

        :param min_max_norm: If True, apply min-max normalization to the
            edge weights. Default: False.

        :return:
            The results dataframe. Contains the following columns:

            - **h_relation** (int): Index of the head relation.
            - **t_relation** (int): Index of the tail relation.
            - **edge_weight** (float): Weight for the affinity between the
              head and the tail relation.
        """
        hr_freqs = self.df.groupby(["h", "r"], as_index=False).count()
        # normalize by global h frequency
        hr_freqs["t"] = hr_freqs["t"] / hr_freqs.groupby("h")["t"].transform("sum")
        rt_freqs = self.df.groupby(["t", "r"], as_index=False).count()
        # normalize by global t frequency
        rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum")
        E_h = coo_array(
            (hr_freqs.t, (hr_freqs.h, hr_freqs.r)),
            shape=[self.n_entity, self.n_rel],
        )
        E_t = coo_array(
            (rt_freqs.h, (rt_freqs.t, rt_freqs.r)),
            shape=[self.n_entity, self.n_rel],
        )
        A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray()
        A[np.diag_indices_from(A)] = 0

        if min_max_norm:
            A = (A - np.min(A)) / (np.max(A) - np.min(A))

        h_rels, t_rels = np.nonzero(A)
        return pd.DataFrame(
            {
                "h_relation": h_rels,
                "t_relation": t_rels,
                "edge_weight": A[h_rels, t_rels],
            }
        )

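    # Illustrative sketch: relation pairs that are incident on many common
    # entities (as heads or tails) receive higher affinity; pairs with zero
    # affinity are omitted from the output.
    #
    #   >>> aff = kgtt.relational_affinity_ingram()
    #   >>> aff.columns.tolist()
    #   ['h_relation', 't_relation', 'edge_weight']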