# -*- coding: utf-8 -*-
# Copyright (c) 2023 Graphcore Ltd. All rights reserved.
"""
Topology toolbox main functionalities
"""
import multiprocessing as mp
from functools import cache
import numpy as np
import pandas as pd
from scipy.sparse import coo_array
from kg_topology_toolbox.utils import (
aggregate_by_relation,
check_kg_df_structure,
composition_count,
jaccard_similarity,
node_degrees_and_rels,
)
class KGTopologyToolbox:
"""
Toolbox class to compute Knowledge Graph topology statistics.
"""
def __init__(
self,
kg_df: pd.DataFrame,
head_column: str = "h",
relation_column: str = "r",
tail_column: str = "t",
):
"""
Instantiate the Topology Toolbox for a Knowledge Graph defined
by the list of its edges (h,r,t).
:param kg_df:
A Knowledge Graph represented as a pd.DataFrame.
Must contain at least three columns, which specify the IDs of
head entity, relation type and tail entity for each edge.
:param head_column:
The name of the column with the IDs of head entities. Default: "h".
:param relation_column:
The name of the column with the IDs of relation types. Default: "r".
:param tail_column:
The name of the column with the IDs of tail entities. Default: "t".
"""
check_kg_df_structure(kg_df, head_column, relation_column, tail_column)
self.df = kg_df[[head_column, relation_column, tail_column]].rename(
columns={head_column: "h", relation_column: "r", tail_column: "t"}
)
self.n_entity = self.df[["h", "t"]].max().max() + 1
self.n_rel = self.df.r.max() + 1
def loop_count(self) -> pd.DataFrame:
"""
For each entity in the KG, compute the number of loops around the entity
(i.e., the number of edges having the entity as both head and tail).
:return:
Loop count DataFrame, indexed on the IDs of the graph entities.
"""
n_loops = (
self.df[self.df.h == self.df.t].groupby("h").agg(n_loops=("r", "count"))
)
return (
pd.DataFrame(n_loops, index=np.arange(self.n_entity)).fillna(0).astype(int)
)
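    # A minimal usage sketch (hypothetical toy graph, not part of the library):
    # only entity 1 appears as both head and tail of the same edge.
    #
    #   >>> df = pd.DataFrame({"h": [0, 1, 1], "r": [0, 0, 1], "t": [1, 1, 2]})
    #   >>> KGTopologyToolbox(df).loop_count()["n_loops"].tolist()
    #   [0, 1, 0]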
@cache
def node_head_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
"""
For each entity in the KG, compute the number of edges having it as head
(head-degree, or out-degree of the head node).
The relation types going out of the head node are also identified.
:param return_relation_list:
If True, return the list of unique relations going
out of the head node. WARNING: expensive for large graphs.
Default: False.
:return:
The result DataFrame, indexed on the IDs `e` of the graph entities,
with columns:
- **h_degree** (int): Number of triples with head entity `e`.
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity `e`.
- **h_rel_list** (Optional[list]): List of unique relation types
among edges with head entity `e`.
Only returned if `return_relation_list = True`.
"""
node_df = node_degrees_and_rels(
self.df, "h", self.n_entity, return_relation_list
)
return node_df.rename(columns={n: "h_" + n for n in node_df.columns})
@cache
def node_tail_degree(self, return_relation_list: bool = False) -> pd.DataFrame:
"""
For each entity in the KG, compute the number of edges having it as tail
(tail-degree, or in-degree of the tail node).
The relation types going into the tail node are also identified.
:param return_relation_list:
If True, return the list of unique relation types going
into the tail node. WARNING: expensive for large graphs.
Default: False.
:return:
The result DataFrame, indexed on the IDs `e` of the graph entities,
with columns:
- **t_degree** (int): Number of triples with tail entity `e`.
- **t_unique_rel** (int): Number of distinct relation types
among edges with tail entity `e`.
- **t_rel_list** (Optional[list]): List of unique relation types
among edges with tail entity `e`.
Only returned if `return_relation_list = True`.
"""
node_df = node_degrees_and_rels(
self.df, "t", self.n_entity, return_relation_list
)
return node_df.rename(columns={n: "t_" + n for n in node_df.columns})
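    # A usage sketch for the two node-degree methods above (hypothetical toy
    # graph; column names follow the docstrings):
    #
    #   >>> df = pd.DataFrame({"h": [0, 0, 1], "r": [0, 1, 0], "t": [1, 2, 2]})
    #   >>> kgtt = KGTopologyToolbox(df)
    #   >>> kgtt.node_head_degree().loc[0, "h_degree"]  # entity 0 heads 2 edges
    #   2
    #   >>> kgtt.node_tail_degree().loc[2, "t_unique_rel"]  # rels 0, 1 enter 2
    #   2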
def node_degree_summary(self, return_relation_list: bool = False) -> pd.DataFrame:
"""
For each entity in the KG, compute the number of edges having it as a head
(head-degree, or out-degree), as a tail (tail-degree, or in-degree)
or one of the two (total-degree).
The in-going and out-going relation types are also identified.
The output dataframe is indexed on the IDs of the graph entities.
:param return_relation_list:
If True, return the list of unique relations going
in/out of an entity. WARNING: expensive for large graphs.
:return:
The results dataframe, indexed on the IDs `e` of the graph entities,
with columns:
- **h_degree** (int): Number of triples with head entity `e`.
- **t_degree** (int): Number of triples with tail entity `e`.
- **tot_degree** (int): Number of triples with head entity `e` or tail entity `e`.
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity `e`.
- **h_rel_list** (Optional[list]): List of unique relation types among edges
with head entity `e`.
Only returned if `return_relation_list = True`.
- **t_unique_rel** (int): Number of distinct relation types
among edges with tail entity `e`.
- **t_rel_list** (Optional[list]): List of unique relation types among edges
with tail entity `e`.
Only returned if `return_relation_list = True`.
        - **n_loops** (int): Number of loops around entity `e`.
"""
nodes_df = pd.merge(
self.node_head_degree(return_relation_list),
self.node_tail_degree(return_relation_list),
left_index=True,
right_index=True,
)
nodes_df = pd.merge(
nodes_df,
self.loop_count(),
left_index=True,
right_index=True,
)
nodes_df["tot_degree"] = (
nodes_df["h_degree"] + nodes_df["t_degree"] - nodes_df["n_loops"]
)
return nodes_df[
["h_degree", "t_degree", "tot_degree", "h_unique_rel"]
+ (["h_rel_list"] if return_relation_list else [])
+ ["t_unique_rel"]
+ (["t_rel_list"] if return_relation_list else [])
+ ["n_loops"]
]
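    # tot_degree counts each loop only once, since a loop contributes 1 to both
    # h_degree and t_degree. Worked example (hypothetical toy graph):
    #
    #   >>> df = pd.DataFrame({"h": [0, 0], "r": [0, 0], "t": [0, 1]})
    #   >>> KGTopologyToolbox(df).node_degree_summary().loc[
    #   ...     0, ["h_degree", "t_degree", "n_loops", "tot_degree"]].tolist()
    #   [2, 1, 1, 2]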
@cache
def edge_head_degree(self) -> pd.DataFrame:
"""
For each edge in the KG, compute the number of edges
(in total or of the same relation type) with the same head node.
:return:
The result DataFrame, with the same indexing and ordering of
triples as the original KG DataFrame, with columns
(in addition to `h`, `r`, `t`):
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity `h`.
- **h_degree** (int): Number of triples with head entity `h`.
- **h_degree_same_rel** (int): Number of triples with head entity `h`
and relation type `r`.
"""
edge_by_hr_count = self.df.groupby(["h", "r"], as_index=False).agg(
h_degree_same_rel=("t", "count")
)
df_res = self.df.merge(
self.node_head_degree(), left_on=["h"], right_index=True, how="left"
)
return df_res.merge(edge_by_hr_count, on=["h", "r"], how="left")
@cache
def edge_tail_degree(self) -> pd.DataFrame:
"""
For each edge in the KG, compute the number of edges
(in total or of the same relation type) with the same tail node.
:return:
The result DataFrame, with the same indexing and ordering of
triples as the original KG DataFrame, with columns
(in addition to `h`, `r`, `t`):
- **t_unique_rel** (int): Number of distinct relation types
among edges with tail entity `t`.
- **t_degree** (int): Number of triples with tail entity `t`.
- **t_degree_same_rel** (int): Number of triples with tail entity `t`
and relation type `r`.
"""
edge_by_rt_count = self.df.groupby(["r", "t"], as_index=False).agg(
t_degree_same_rel=("h", "count")
)
df_res = self.df.merge(
self.node_tail_degree(), left_on=["t"], right_index=True, how="left"
)
return df_res.merge(edge_by_rt_count, on=["r", "t"], how="left")
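    # A usage sketch for the two edge-degree methods above (hypothetical toy
    # graph): output rows align with the triples of the input KG.
    #
    #   >>> df = pd.DataFrame({"h": [0, 0, 1], "r": [0, 0, 1], "t": [1, 2, 2]})
    #   >>> KGTopologyToolbox(df).edge_head_degree()["h_degree_same_rel"].tolist()
    #   [2, 2, 1]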
def edge_cardinality(self) -> pd.DataFrame:
"""
        Classify the cardinality of each edge in the KG: one-to-one
        (out-degree = in-degree = 1), one-to-many (out-degree > 1, in-degree = 1),
        many-to-one (out-degree = 1, in-degree > 1) or many-to-many
        (out-degree > 1, in-degree > 1).
:return:
The result DataFrame, with the same indexing and ordering of
triples as the original KG DataFrame, with columns
(in addition to `h`, `r`, `t`):
        - **triple_cardinality** (str): Cardinality type of the edge.
        - **triple_cardinality_same_rel** (str): Cardinality type of the edge in
the subgraph of edges with relation type `r`.
"""
head_degree = self.edge_head_degree()
tail_degree = self.edge_tail_degree()
df_res = pd.DataFrame(
{"h": head_degree.h, "r": head_degree.r, "t": head_degree.t}
)
# check if the values in the pair (h_degree, t_degree) are =1 or >1
# to determine the edge cardinality
for suffix in ["", "_same_rel"]:
edge_type = 2 * (head_degree["h_degree" + suffix] == 1) + (
tail_degree["t_degree" + suffix] == 1
)
df_res["triple_cardinality" + suffix] = pd.cut(
edge_type,
bins=[0, 1, 2, 3, 4],
right=False,
labels=["M:M", "1:M", "M:1", "1:1"],
).astype(str)
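        # resulting encoding (edge_type value -> cardinality label):
        #   3: h_degree == 1 and t_degree == 1  ->  "1:1"
        #   2: h_degree == 1 and t_degree  > 1  ->  "M:1"
        #   1: h_degree  > 1 and t_degree == 1  ->  "1:M"
        #   0: h_degree  > 1 and t_degree  > 1  ->  "M:M"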
return df_res
def edge_metapath_count(
self,
filter_relations: list[int] = [],
composition_chunk_size: int = 2**8,
composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
) -> pd.DataFrame:
"""
For each edge in the KG, compute the number of triangles supported on it
distinguishing between different metapaths (i.e., the unique ordered tuples
(r1, r2) of relation types of the two additional edges of the triangle).
:param filter_relations:
If not empty, compute the output only for the edges with relation
in this list of relation IDs.
:param composition_chunk_size:
Size of column chunks of sparse adjacency matrix
to compute the triangle count. Reduce the parameter if running OOM.
Default: 2**8.
:param composition_workers:
            Number of workers used to compute the triangle count. By default,
            assigned based on the number of available CPUs (max: 32).
:return:
The output dataframe has one row for each (h, r, t, r1, r2) such that
there exists at least one triangle of metapath (r1, r2) over (h, r, t).
The number of metapath triangles is given in the column **n_triangles**.
The column **index** provides the index of the edge (h, r, t) in the
original Knowledge Graph dataframe.
"""
# discard loops as edges of a triangle
df_wo_loops = self.df[self.df.h != self.df.t]
if len(filter_relations) > 0:
rel_df = self.df[self.df.r.isin(filter_relations)]
# unique heads and tails used by filtered edges
filter_heads = rel_df.h.unique()
filter_tails = rel_df.t.unique()
# the only relevant edges for triangles are the ones with head in the
# set of filtered heads, or tail in the set of filtered tails
df_triangles = df_wo_loops[
np.logical_or(
df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
)
]
else:
rel_df = self.df
df_triangles = df_wo_loops
counts = composition_count(
df_triangles,
chunk_size=composition_chunk_size,
workers=composition_workers,
metapaths=True,
directed=True,
)
return rel_df.reset_index().merge(counts, on=["h", "t"], how="inner")
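    # A sketch of the expected output (hypothetical toy graph): the single
    # triangle (0, r1=1, 2) + (2, r2=2, 1) is supported on the edge (0, 0, 1).
    #
    #   >>> df = pd.DataFrame({"h": [0, 0, 2], "r": [0, 1, 2], "t": [1, 2, 1]})
    #   >>> out = KGTopologyToolbox(df).edge_metapath_count()
    #   >>> out[["h", "r", "t", "r1", "r2", "n_triangles"]].values.tolist()
    #   [[0, 0, 1, 1, 2, 1]]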
def edge_degree_cardinality_summary(
self, filter_relations: list[int] = [], aggregate_by_r: bool = False
) -> pd.DataFrame:
"""
For each edge in the KG, compute the number of edges with the same head
(head-degree, or out-degree), the same tail (tail-degree, or in-degree)
or one of the two (total-degree).
        Based on entity degrees, each triple is classified as either one-to-one
        (out-degree = in-degree = 1), one-to-many (out-degree > 1, in-degree = 1),
        many-to-one (out-degree = 1, in-degree > 1) or many-to-many
        (out-degree > 1, in-degree > 1).
The output dataframe maintains the same indexing and ordering of triples
as the original Knowledge Graph dataframe.
:param filter_relations:
If not empty, compute the output only for the edges with relation
in this list of relation IDs.
:param aggregate_by_r:
If True, return metrics aggregated by relation type
(the output DataFrame will be indexed over relation IDs).
:return:
The results dataframe. Contains the following columns
(in addition to `h`, `r`, `t`):
- **h_unique_rel** (int): Number of distinct relation types
among edges with head entity h.
- **h_degree** (int): Number of triples with head entity h.
- **h_degree_same_rel** (int): Number of triples with head entity h
and relation type r.
- **t_unique_rel** (int): Number of distinct relation types
among edges with tail entity t.
- **t_degree** (int): Number of triples with tail entity t.
- **t_degree_same_rel** (int): Number of triples with tail entity t
and relation type r.
- **tot_degree** (int): Number of triples with head entity h or
tail entity t.
- **tot_degree_same_rel** (int): Number of triples with head entity h or
tail entity t, and relation type r.
        - **triple_cardinality** (str): Cardinality type of the edge.
        - **triple_cardinality_same_rel** (str): Cardinality type of the edge in
the subgraph of edges with relation type r.
"""
df_res = pd.concat(
[
self.edge_head_degree(),
self.edge_tail_degree().drop(columns=["h", "r", "t"]),
],
axis=1,
)
if len(filter_relations) > 0:
df_res = df_res[df_res.r.isin(filter_relations)]
# compute number of parallel edges to avoid double-counting them
# in total degree
num_parallel = df_res.merge(
self.df.groupby(["h", "t"], as_index=False).agg(n_parallel=("r", "count")),
on=["h", "t"],
how="left",
)
df_res["tot_degree"] = (
df_res.h_degree + df_res.t_degree - num_parallel.n_parallel.values
)
# when restricting to the same relation type, there is only one edge
# (the edge itself) that is double-counted
df_res["tot_degree_same_rel"] = (
df_res.h_degree_same_rel + df_res.t_degree_same_rel - 1
)
edge_cardinality = self.edge_cardinality()
df_res["triple_cardinality"] = edge_cardinality["triple_cardinality"]
df_res["triple_cardinality_same_rel"] = edge_cardinality[
"triple_cardinality_same_rel"
]
return aggregate_by_relation(df_res) if aggregate_by_r else df_res
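    # Worked example of the parallel-edge correction (hypothetical toy graph):
    # the two parallel edges between entities 0 and 1 are each counted once in
    # tot_degree (h_degree + t_degree - n_parallel = 2 + 2 - 2 = 2).
    #
    #   >>> df = pd.DataFrame({"h": [0, 0], "r": [0, 1], "t": [1, 1]})
    #   >>> KGTopologyToolbox(df).edge_degree_cardinality_summary()[
    #   ...     ["h_degree", "t_degree", "tot_degree"]].values.tolist()
    #   [[2, 2, 2], [2, 2, 2]]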
def edge_pattern_summary(
self,
return_metapath_list: bool = False,
filter_relations: list[int] = [],
aggregate_by_r: bool = False,
composition_chunk_size: int = 2**8,
composition_workers: int = min(32, mp.cpu_count() - 1 or 1),
) -> pd.DataFrame:
"""
Analyse structural properties of each edge in the KG:
        symmetry, presence of inverse/inference (i.e., parallel) edges, and
triangles supported on the edge.
The output dataframe maintains the same indexing and ordering of triples
as the original Knowledge Graph dataframe.
:param return_metapath_list:
If True, return the list of unique metapaths for all
triangles supported over each edge. WARNING: very expensive for large graphs.
:param filter_relations:
If not empty, compute the output only for the edges with relation
in this list of relation IDs.
:param aggregate_by_r:
If True, return metrics aggregated by relation type
(the output DataFrame will be indexed over relation IDs).
:param composition_chunk_size:
Size of column chunks of sparse adjacency matrix
to compute the triangle count. Reduce the parameter if running OOM.
Default: 2**8.
:param composition_workers:
            Number of workers used to compute the triangle count. By default,
            assigned based on the number of available CPUs (max: 32).
:return:
The results dataframe. Contains the following columns
(in addition to `h`, `r`, `t`):
- **is_loop** (bool): True if the triple is a loop (``h == t``).
- **is_symmetric** (bool): True if the triple (t, r, h) is also contained
in the graph (assuming t and h are different).
- **has_inverse** (bool): True if the graph contains one or more triples
(t, r', h) with ``r' != r``.
- **n_inverse_relations** (int): The number of inverse relations r'.
- **inverse_edge_types** (list): All relations r' (including r if the edge
is symmetric) such that (t, r', h) is in the graph.
- **has_inference** (bool): True if the graph contains one or more triples
(h, r', t) with ``r' != r``.
- **n_inference_relations** (int): The number of inference relations r'.
- **inference_edge_types** (list): All relations r' (including r) such that
(h, r', t) is in the graph.
- **has_composition** (bool): True if the graph contains one or more triangles
supported on the edge: (h, r1, x) + (x, r2, t).
- **n_triangles** (int): The number of triangles.
- **has_undirected_composition** (bool): True if the graph contains one or more
undirected triangles supported on the edge.
- **n_undirected_triangles** (int): The number of undirected triangles
(considering all edges as bidirectional).
- **metapath_list** (list): The list of unique metapaths "r1-r2"
for the directed triangles.
"""
# discard loops as edges of a triangle
df_wo_loops = self.df[self.df.h != self.df.t]
if len(filter_relations) > 0:
rel_df = self.df[self.df.r.isin(filter_relations)]
# unique heads and tails used by filtered edges
filter_heads = rel_df.h.unique()
filter_tails = rel_df.t.unique()
filter_entities = np.union1d(filter_heads, filter_tails)
# restrict relevant edges to count inference/inverse patterns
inference_df = self.df[
np.logical_and(
self.df.h.isin(filter_heads), self.df.t.isin(filter_tails)
)
]
inverse_df = self.df[
np.logical_and(
self.df.h.isin(filter_tails), self.df.t.isin(filter_heads)
)
]
# the only relevant edges for triangles are the ones with head in the
# set of filtered heads, or tail in the set of filtered tails
df_triangles = df_wo_loops[
np.logical_or(
df_wo_loops.h.isin(filter_heads), df_wo_loops.t.isin(filter_tails)
)
]
# for undirected triangles, heads and tails can be any of the
# filtered entities
df_triangles_und = df_wo_loops[
np.logical_or(
df_wo_loops.h.isin(filter_entities),
df_wo_loops.t.isin(filter_entities),
)
]
else:
rel_df = inference_df = inverse_df = self.df
df_triangles = df_triangles_und = df_wo_loops
df_res = pd.DataFrame(
{"h": rel_df.h, "r": rel_df.r, "t": rel_df.t, "is_symmetric": False}
)
# symmetry-asymmetry
# edges with h/t switched
df_inv = inverse_df.reindex(columns=["t", "r", "h"]).rename(
columns={"t": "h", "r": "r", "h": "t"}
)
df_res.loc[
df_res.reset_index().merge(df_inv)["index"],
"is_symmetric",
] = True
# loops are treated separately
df_res["is_loop"] = df_res.h == df_res.t
df_res.loc[df_res.h == df_res.t, "is_symmetric"] = False
df_res = df_res.reset_index()
# inverse
unique_inv_r_by_ht = df_inv.groupby(["h", "t"], as_index=False).agg(
inverse_edge_types=("r", list),
)
df_res = df_res.merge(unique_inv_r_by_ht, on=["h", "t"], how="left")
df_res["inverse_edge_types"] = df_res["inverse_edge_types"].apply(
lambda agg: agg if isinstance(agg, list) else []
)
        # if the edge (h,r,t) is symmetric or a loop, we do not consider the
        # relation r as a proper inverse
df_res["n_inverse_relations"] = (
df_res.inverse_edge_types.str.len() - df_res.is_symmetric - df_res.is_loop
)
df_res["n_inverse_relations"] = (
df_res["n_inverse_relations"].fillna(0).astype(int)
)
df_res["has_inverse"] = df_res["n_inverse_relations"] > 0
# inference
if len(filter_relations) > 0:
edges_between_ht = inference_df.groupby(["h", "t"], as_index=False).agg(
inference_edge_types=("r", list),
)
else:
edges_between_ht = unique_inv_r_by_ht.reindex(
columns=["t", "h", "inverse_edge_types"]
).rename(
columns={
"t": "h",
"h": "t",
"inverse_edge_types": "inference_edge_types",
}
)
df_res = df_res.merge(edges_between_ht, on=["h", "t"], how="left")
# inference_edge_types always contains the edge itself, which we need to drop
df_res["n_inference_relations"] = df_res.inference_edge_types.str.len() - 1
df_res["has_inference"] = df_res["n_inference_relations"] > 0
# composition & metapaths
counts = composition_count(
df_triangles,
chunk_size=composition_chunk_size,
workers=composition_workers,
metapaths=return_metapath_list,
directed=True,
)
if return_metapath_list:
# turn (r1, r2) into "r1-r2" string for metapaths
counts["metapath"] = (
counts["r1"].astype(str) + "-" + counts["r2"].astype(str)
)
# count triangles (summing over all metapaths between two nodes)
# and list unique metapaths for each head and tail node pair
grouped_triangles = counts.groupby(["h", "t"], as_index=False).agg(
n_triangles=("n_triangles", "sum"), metapath_list=("metapath", list)
)
df_res = df_res.merge(
grouped_triangles,
on=["h", "t"],
how="left",
)
# if no triangles are present over an edge, set metapath list to []
df_res["metapath_list"] = df_res["metapath_list"].apply(
lambda agg: agg if isinstance(agg, list) else []
)
else:
df_res = df_res.merge(
counts,
on=["h", "t"],
how="left",
)
df_res["n_triangles"] = df_res["n_triangles"].fillna(0).astype(int)
df_res["has_composition"] = df_res["n_triangles"] > 0
# undirected composition
counts = composition_count(
df_triangles_und,
chunk_size=composition_chunk_size,
workers=composition_workers,
directed=False,
)
df_res = df_res.merge(
counts.rename(columns={"n_triangles": "n_undirected_triangles"}),
on=["h", "t"],
how="left",
)
df_res["n_undirected_triangles"] = (
df_res["n_undirected_triangles"].fillna(0).astype(int)
)
df_res["has_undirected_composition"] = df_res["n_undirected_triangles"] > 0
df_res = df_res.set_index("index")[
[
"h",
"r",
"t",
"is_loop",
"is_symmetric",
"has_inverse",
"n_inverse_relations",
"inverse_edge_types",
"has_inference",
"n_inference_relations",
"inference_edge_types",
"has_composition",
"has_undirected_composition",
"n_triangles",
"n_undirected_triangles",
]
+ (["metapath_list"] if return_metapath_list else [])
]
df_res.index.name = None
return aggregate_by_relation(df_res) if aggregate_by_r else df_res
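    # A sketch of the pattern flags (hypothetical toy graph): (0, 0, 1) and
    # (1, 0, 0) form a symmetric pair; (0, 1, 1) has (1, 0, 0) as a proper
    # inverse, and (1, 0, 0) in turn has (0, 1, 1) as one.
    #
    #   >>> df = pd.DataFrame({"h": [0, 1, 0], "r": [0, 0, 1], "t": [1, 0, 1]})
    #   >>> out = KGTopologyToolbox(df).edge_pattern_summary()
    #   >>> out["is_symmetric"].tolist()
    #   [True, True, False]
    #   >>> out["n_inverse_relations"].tolist()
    #   [0, 1, 1]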
def jaccard_similarity_relation_sets(self) -> pd.DataFrame:
"""
Compute the similarity between relations defined as the Jaccard Similarity
between sets of entities (heads and tails) for all pairs
of relations in the graph.
:return:
The results dataframe. Contains the following columns:
- **r1** (int): Index of the first relation.
- **r2** (int): Index of the second relation.
        - **num_triples_both** (int): Total number of triples with relation
          r1 or r2.
        - **frac_triples_both** (float): Fraction of all triples in the graph
          having relation r1 or r2.
        - **num_entities_both** (int): Number of unique entities (heads or
          tails) appearing in triples with relation r1 or r2.
- **num_h_r1** (int): Number of unique head entities for relation r1.
- **num_h_r2** (int): Number of unique head entities for relation r2.
- **num_t_r1** (int): Number of unique tail entities for relation r1.
- **num_t_r2** (int): Number of unique tail entities for relation r2.
- **jaccard_head_head** (float): Jaccard similarity between the head set of r1
and the head set of r2.
- **jaccard_tail_tail** (float): Jaccard similarity between the tail set of r1
and the tail set of r2.
- **jaccard_head_tail** (float): Jaccard similarity between the head set of r1
and the tail set of r2.
- **jaccard_tail_head** (float): Jaccard similarity between the tail set of r1
and the head set of r2.
- **jaccard_both** (float): Jaccard similarity between the full entity set
of r1 and r2.
"""
        # sets of unique heads / tails / all entities for each relation
ent_unique = self.df.groupby("r", as_index=False).agg(
num_triples=("r", "count"), head=("h", "unique"), tail=("t", "unique")
)
ent_unique["both"] = ent_unique.apply(
lambda x: np.unique(np.concatenate([x["head"], x["tail"]])), axis=1
)
ent_unique["num_h"] = ent_unique["head"].str.len()
ent_unique["num_t"] = ent_unique["tail"].str.len()
r_num = ent_unique[["r", "num_h", "num_t", "num_triples"]]
# combinations of relations
df_res = pd.merge(
r_num.rename(columns={"r": "r1"}),
r_num.rename(columns={"r": "r2"}),
suffixes=["_r1", "_r2"],
how="cross",
)
# order doesn't matter
df_res = df_res[df_res.r1 < df_res.r2]
df_res["num_triples_both"] = df_res["num_triples_r1"] + df_res["num_triples_r2"]
df_res["frac_triples_both"] = df_res["num_triples_both"] / self.df.shape[0]
df_res["num_entities_both"] = df_res.apply(
lambda x: len(
np.unique(
np.concatenate(
[
ent_unique.loc[x["r1"], "both"],
ent_unique.loc[x["r2"], "both"],
]
)
)
),
axis=1,
)
df_res = df_res[
[
"r1",
"r2",
"num_triples_both",
"frac_triples_both",
"num_entities_both",
"num_h_r1",
"num_h_r2",
"num_t_r1",
"num_t_r2",
]
]
for r1_ent in ["head", "tail"]:
for r2_ent in ["head", "tail"]:
df_res[f"jaccard_{r1_ent}_{r2_ent}"] = [
jaccard_similarity(a, b)
for a, b in zip(
ent_unique.loc[df_res.r1, r1_ent],
ent_unique.loc[df_res.r2, r2_ent],
)
]
df_res["jaccard_both"] = [
jaccard_similarity(a, b)
for a, b in zip(
ent_unique.loc[df_res.r1, "both"], ent_unique.loc[df_res.r2, "both"]
)
]
return df_res
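    # The measure used above is the Jaccard similarity |A & B| / |A | B| of two
    # entity sets; e.g., head sets {0, 1} and {1, 2} share one of three
    # distinct entities, giving 1/3 (a sketch, reusing the helper exactly as
    # it is called in the loop above):
    #
    #   >>> jaccard_similarity(np.array([0, 1]), np.array([1, 2]))  # -> 1/3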
def relational_affinity_ingram(self, min_max_norm: bool = False) -> pd.DataFrame:
"""
Compute the similarity between relations based on the approach proposed in
InGram: Inductive Knowledge Graph Embedding via Relation Graphs,
https://arxiv.org/abs/2305.19987.
        Only the pairs of relations with ``affinity > 0`` are shown in the
        returned dataframe.
:param min_max_norm:
            If True, apply min-max normalization to the edge weights.
            Default: False.
:return:
The results dataframe. Contains the following columns:
- **h_relation** (int): Index of the head relation.
- **t_relation** (int): Index of the tail relation.
- **edge_weight** (float): Weight for the affinity between
the head and the tail relation.
"""
hr_freqs = self.df.groupby(["h", "r"], as_index=False).count()
        # normalize each (h, r) count by the total number of edges with head h
hr_freqs["t"] = hr_freqs["t"] / hr_freqs.groupby("h")["t"].transform("sum")
rt_freqs = self.df.groupby(["t", "r"], as_index=False).count()
        # normalize each (t, r) count by the total number of edges with tail t
rt_freqs["h"] = rt_freqs["h"] / rt_freqs.groupby("t")["h"].transform("sum")
        # sparse matrix of normalized (h, r) pair frequencies
E_h = coo_array(
(hr_freqs.t, (hr_freqs.h, hr_freqs.r)),
shape=[self.n_entity, self.n_rel],
)
        # sparse matrix of normalized (t, r) pair frequencies
E_t = coo_array(
(rt_freqs.h, (rt_freqs.t, rt_freqs.r)),
shape=[self.n_entity, self.n_rel],
)
# adjacency matrix of relation graph
A = (E_h.T @ E_h).toarray() + (E_t.T @ E_t).toarray()
A[np.diag_indices_from(A)] = 0
if min_max_norm:
A = (A - np.min(A)) / (np.max(A) - np.min(A))
h_rels, t_rels = np.nonzero(A)
return pd.DataFrame(
{
"h_relation": h_rels,
"t_relation": t_rels,
"edge_weight": A[h_rels, t_rels],
}
)
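# A minimal end-to-end sketch (hypothetical usage, not part of the library):
# build a toy KG with contiguous integer IDs and print a few of the statistics
# computed above.
if __name__ == "__main__":
    toy_kg = pd.DataFrame(
        {"h": [0, 1, 1, 2], "r": [0, 0, 1, 1], "t": [1, 2, 0, 0]}
    )
    kgtt = KGTopologyToolbox(toy_kg)
    print(kgtt.node_degree_summary())
    print(kgtt.edge_degree_cardinality_summary())
    # pairs of relations with positive InGram affinity
    print(kgtt.relational_affinity_ingram())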