%pip install relationalai matplotlib

import relationalai as rai
from relationalai.std.graphs import Graph
from relationalai.std import alias, rel, strings
from relationalai.std import aggregates
from typing import Tuple
import matplotlib as mpl
import pandas as pd

# Provider handle used below to run SQL and to create the data stream.
provider = rai.Provider()

# (Re)create the demo USERS table and load six sample records.
# Records 1 and 6 are identical in every column except ID; records 1/2 and
# 4/5 differ only in some columns, which is what the matching rules later
# in the notebook are meant to detect.
provider.sql("""
begin
	create database if not exists RAI_DEMO;
	create schema if not exists RAI_DEMO.IDENTITY_RESOLUTION;

	create or replace table RAI_DEMO.IDENTITY_RESOLUTION.USERS (
		ID NUMBER(18,0),
		NAME VARCHAR(16777216),
		DATE_OF_BIRTH VARCHAR(16777216),
		EMAIL VARCHAR(16777216),
		PHONE_NUMBER VARCHAR(16777216),
		CREDIT_CARD_NUMBER VARCHAR(16777216)
	);

	insert into RAI_DEMO.IDENTITY_RESOLUTION.USERS (ID, NAME, DATE_OF_BIRTH, EMAIL, PHONE_NUMBER, CREDIT_CARD_NUMBER)
	values
	(1,'John Smith','1990-05-15','john.smith@example.com','+1234567890','1234 5678 9012 3456'),
	(2,'John Smyth','1990-05-16','john.smyth@example.com','+1234567809','1234 5678 9012 3456'),
	(3,'Jane Doe','1985-10-20','jane.doe@example.com','+1987654321','2345 6789 0123 4567'),
	(4,'Michael Johnson','1982-07-08','michael.johnson@example.com','+1654321879','3456 7890 1234 5678'),
	(5,'Michael Jackson','1982-07-08','michael.jackson@example.com','+1654321879','3456 7890 1234 5678'),
	(6,'John Smith','1990-05-15','john.smith@example.com','+1234567890','1234 5678 9012 3456');
end;
""")

# Register the USERS table as a stream for the "EntityResolution" model.
# NOTE(review): presumably this is what makes the table usable as a Type
# source in that model -- confirm against the provider API docs.
provider.create_streams(["RAI_DEMO.IDENTITY_RESOLUTION.USERS"], "EntityResolution")

# The model that owns every type, rule and query defined below.
model = rai.Model(
    "EntityResolution",
    ensure_change_tracking=True,
)

# User entities are sourced from the USERS table.
User = model.Type(
    "User",
    source="rai_demo.identity_resolution.users",
)

# Show which properties are known for User.
User.known_properties()

['snowflake_id',
 'email',
 'credit_card_number',
 'date_of_birth',
 'name',
 'id',
 'phone_number']

# Fetch every column of every User record to inspect the raw input.
with model.query() as select:
    person = User()
    response = select(
        person.id,
        person.name,
        person.email,
        person.date_of_birth,
        person.phone_number,
        person.credit_card_number,
    )

response

# An ExactMatch links two User records that agree on every attribute
# listed in exact_match_attributes.
ExactMatch = model.Type("ExactMatch")

# Attributes compared by the rule below (ID is intentionally not listed).
exact_match_attributes = ['name', 'date_of_birth', 'email', 'phone_number', 'credit_card_number']

# NOTE(review): dynamic=True is presumably needed because the rule body
# contains a Python for-loop -- confirm against the rule() documentation.
with model.rule(dynamic=True):
    user1 = User()
    user2 = User()
    # NOTE(review): presumably orders the pair so that each unordered pair
    # is produced once and a record is never paired with itself -- confirm
    # how entity comparison is defined.
    user1 < user2
    # One equality condition per attribute.
    for attribute in exact_match_attributes:
        getattr(user1, attribute) == getattr(user2, attribute)
    ExactMatch.add(user1=user1, user2=user2)

# Show both sides of every exact match, one "<attribute> (1)" and one
# "<attribute> (2)" column per compared attribute.
with model.query() as select:
    em = ExactMatch()
    # NOTE(review): kept as a comprehension (not a for statement) because
    # this query is not created with dynamic=True -- confirm whether loop
    # statements are permitted here.
    columns = [
        alias(getattr(getattr(em, side), name), f"{name} ({position})")
        for name in exact_match_attributes
        for position, side in enumerate(("user1", "user2"), start=1)
    ]
    response = select(*columns)

response

# Undirected graph whose edges are the exact matches.
exact_match_graph = Graph(model, undirected=True)
Node = exact_match_graph.Node
Edge = exact_match_graph.Edge

# Every User is a node, labelled by name and carrying its id.
Node.extend(User, label=User.name, id=User.id)

# One edge per exact match.
with model.rule():
    pair = ExactMatch()
    Edge.add(pair.user1, pair.user2)

def _node_hover(node):
    """Return the hover text for a node: its ``id`` entry."""
    return f"Id: {node['id']}"


def _edge_weight(edge):
    """Return the edge's ``weight`` entry, or 1.0 when it has none."""
    return edge.get('weight', 1.0)


def _edge_size(edge):
    """Return the drawn edge size: three times the edge weight."""
    return _edge_weight(edge) * 3


def _edge_hover(edge):
    """Return the hover text for an edge: its weight with two decimals."""
    return f"Weight: {_edge_weight(edge):.2f}"


# Style shared by the match graphs: fixed colours, with hover text and
# edge size computed from each element's properties.
graph_style = {
    "node": {"color": "slategray", "hover": _node_hover},
    "edge": {"color": "coral", "size": _edge_size, "hover": _edge_hover},
}

# Render the exact-match graph inline with the shared style.
exact_match_figure = exact_match_graph.visualize(style=graph_style)
exact_match_figure.display(inline=True)

# Weight given to each attribute when scoring a candidate pair of users.
# Insertion order is kept: name, email, credit_card_number, phone_number.
attribute_weights = dict(
    # Lower weight due to commonality and potential for variations or typos
    name=0.2,
    # Moderate weight as email is unique but can be changed or have multiple versions
    email=0.5,
    # High weight since credit card numbers are unique and less likely to be shared
    credit_card_number=0.85,
    # Highest weight as phone numbers are highly unique and rarely shared among different users
    phone_number=0.9,
)

# An ExactAttributeMatch records that two users have the same value for
# one named attribute, together with that attribute's weight.
ExactAttributeMatch = model.Type("ExactAttributeMatch")

with model.rule(dynamic=True):
    user1 = User()
    user2 = User()
    user1 < user2
    for (attribute, weight) in attribute_weights.items():
        # NOTE(review): each equality is used as its own nested condition,
        # presumably so that a mismatch on one attribute does not prevent
        # matches on the others -- confirm the semantics of `with <expr>`.
        with getattr(user1, attribute) == getattr(user2, attribute):
            ExactAttributeMatch.add(user1=user1, user2=user2, attribute=attribute).set(weight=weight)

# Aggregate the scores considering the weights of all the exact attribute matches.
# The result is stored on a per-pair ExactAttributeMatch (no `attribute`)
# as `combined_similarity_score`, normalized by the sum of all weights.
with model.rule():
    match = ExactAttributeMatch()
    # `match` is passed together with `match.weight` so that every
    # per-attribute match contributes its own term. Summing the bare weight
    # aggregates over distinct values, so two attributes configured with
    # the same weight would be counted only once.
    matched_weight = aggregates.sum(match, match.weight, per=[match.user1, match.user2])
    norm_score = matched_weight / sum(attribute_weights.values())
    ExactAttributeMatch.add(user1=match.user1, user2=match.user2).set(combined_similarity_score=norm_score)

# List every user pair that has a combined exact-attribute score.
with model.query() as select:
    scored = ExactAttributeMatch()
    # Only the per-pair entities carry a combined score.
    scored.combined_similarity_score.has_value()
    result = select(
        alias(scored.user1.id, "id (user1)"),
        alias(scored.user1.name, "name (user1)"),
        alias(scored.user2.id, "id (user2)"),
        alias(scored.user2.name, "name (user2)"),
        alias(scored.combined_similarity_score, "combined similarity score"),
    )
result

# Weighted, undirected graph of the weighted exact-attribute matches.
exact_attribute_match = Graph(model, undirected=True, weighted=True)
Node = exact_attribute_match.Node
Edge = exact_attribute_match.Edge

# Every User is a node, labelled by name and carrying its id.
Node.extend(User, label=User.name, id=User.id)

# Only pairs scoring above 0.7 get an edge; the score is the edge weight.
with model.rule():
    candidate = ExactAttributeMatch()
    candidate.combined_similarity_score > 0.7
    Edge.add(
        candidate.user1,
        candidate.user2,
        weight=candidate.combined_similarity_score,
    )

exact_attribute_match.visualize(style=graph_style).display(inline=True)

# A FuzzyMatch records, for one attribute of a user pair, a similarity
# score derived from the edit distance between the two values.
FuzzyMatch = model.Type("FuzzyMatch")

# One rule per weighted attribute; the Python loop runs outside the rule
# context, so each iteration defines a separate rule.
for attribute, weight in attribute_weights.items():
    with model.rule():
        user1 = User()
        user2 = User()
        user1 < user2
        attr1 = getattr(user1, attribute)
        attr2 = getattr(user2, attribute)
        lev = rel.levenshtein(attr1, attr2)
        # Similarity = 1 - distance / length of the longer value.
        # NOTE(review): if both values are empty the denominator is 0 --
        # confirm how the engine treats that case.
        score = 1 - (lev / rel.maximum(strings.length(attr1), strings.length(attr2)))
        FuzzyMatch.add(user1=user1, user2=user2, attribute=attribute).set(similarity_score=score, weight=weight)

# Show, per attribute, the two compared values and their similarity score.
with model.query(dynamic=True) as select:
    fm = FuzzyMatch()
    for attr_name in attribute_weights:
        with fm.attribute == attr_name:
            result = select(
                alias(fm.user1.id, "id (user1)"),
                alias(fm.user2.id, "id (user2)"),
                fm.attribute,
                alias(getattr(fm.user1, attr_name), "value (user1)"),
                alias(getattr(fm.user2, attr_name), "value (user2)"),
                fm.similarity_score,
            )
result.results.head(10)

# Combine the per-attribute fuzzy scores of each user pair into a single
# weighted score, normalized by the sum of all attribute weights, and
# store it on a per-pair FuzzyMatch (no `attribute`).
with model.rule():
    fm = FuzzyMatch()
    weighted_score = fm.similarity_score * fm.weight
    # `fm` is passed together with the value so that every per-attribute
    # match contributes its own term. Summing the bare value aggregates
    # over distinct values, so two attributes that happen to produce the
    # same weighted score for a pair would be counted only once.
    total_score = aggregates.sum(fm, weighted_score, per=[fm.user1, fm.user2])
    norm_score = total_score / sum(attribute_weights.values())
    FuzzyMatch.add(user1=fm.user1, user2=fm.user2).set(combined_similarity_score=norm_score)

# List every user pair together with its combined fuzzy score.
with model.query() as select:
    scored = FuzzyMatch()
    # Only the per-pair entities carry a combined score.
    scored.combined_similarity_score.has_value()
    result = select(
        alias(scored.user1.id, "id (user1)"),
        alias(scored.user1.name, "name (user1)"),
        alias(scored.user2.id, "id (user2)"),
        alias(scored.user2.name, "name (user2)"),
        alias(scored.combined_similarity_score, "combined fuzzy attribute match score"),
    )

result

# Weighted, undirected graph of the fuzzy matches.
fuzzy_match_graph = Graph(model, undirected=True, weighted=True)
Node = fuzzy_match_graph.Node
Edge = fuzzy_match_graph.Edge

# Every User is a node, labelled by name and carrying its id.
Node.extend(User, label=User.name, id=User.id)

# Only pairs scoring above 0.9 get an edge; the score is the edge weight.
with model.rule():
    candidate = FuzzyMatch()
    candidate.combined_similarity_score > 0.9
    Edge.add(
        candidate.user1,
        candidate.user2,
        weight=candidate.combined_similarity_score,
    )

fuzzy_match_graph.visualize(style=graph_style).display(inline=True)

# Match is the union of the matching approaches; each Match records the
# pair, its similarity score and which approach produced it.
Match = model.Type("Match")

# Every ExactMatch is a Match with similarity 1.0.
Match.extend(
    ExactMatch,
    similarity_score=1.0,
    type="Exact All Attributes Match",
)

# Weighted exact-attribute matches qualify above a score of 0.7.
with model.rule():
    candidate = ExactAttributeMatch()
    candidate.combined_similarity_score > 0.7
    Match.add(
        user1=candidate.user1,
        user2=candidate.user2,
        similarity_score=candidate.combined_similarity_score,
        type="Weighted Attributes Exact Match",
    )

# Add fuzzy attribute matches with a similarity greater than 0.9.
# The `type` label names the approach; it is spelled "Levenshtein" to
# match the name of the distance function used to compute the scores.
with model.rule():
    fm = FuzzyMatch()
    fm.combined_similarity_score > 0.9
    Match.add(
        user1=fm.user1,
        user2=fm.user2,
        similarity_score=fm.combined_similarity_score,
        type="Levenshtein-Distance Weighted Attribute Match",
    )

# List every combined match with its score and the approach that found it.
with model.query() as select:
    found = Match()
    result = select(
        alias(found.user1.id, "id (user1)"),
        alias(found.user1.name, "name (user1)"),
        alias(found.user2.id, "id (user2)"),
        alias(found.user2.name, "name (user2)"),
        found.similarity_score,
        found.type,
    )

result

# Undirected graph over all combined matches, used to group users.
match_wcc = Graph(model, undirected=True)
Node = match_wcc.Node
Edge = match_wcc.Edge

# Every User is a node, labelled by name and carrying its id.
Node.extend(User, label=User.name, id=User.id)

# One edge per Match, whichever approach produced it.
with model.rule():
    found = Match()
    Edge.add(found.user1, found.user2)

# Compute weakly connected component for every user
# The component is stored both on the User entity (read by the grouping
# query and the ResolvedUser rule) and on the graph node (read by the
# colouring rule).
with model.rule():
    user = User()
    community = match_wcc.compute.weakly_connected_component(user)
    user.set(community=community)
    Node(user).set(community=community)

# Fetch the users whose community contains more than one member.
with model.query() as select:
    member = User()
    aggregates.count(member, per=[member.community]) > 1
    result = select(member.id, member.community)

# Collect the user ids of each community and print one line per group.
groups = result.results.groupby("community").id.apply(list)
for number, members in enumerate(groups, start=1):
    print(f"Group {number} with {len(members)} similar users: {members}")

Group 1 with 2 similar users: [4, 5]
Group 2 with 3 similar users: [1, 2, 6]

# Normalize graph node community property to set a color
# The community is ranked, the rank is min-max scaled, and the result is
# stored as `color` on the node and on the edges that start at it.
with model.rule():
    n = Node()
    prop = aggregates.rank_asc(n.community)
    min_ = aggregates.min(prop)
    # NOTE(review): if max(prop) equals min(prop) the denominator is 0 --
    # confirm how the engine treats that case; the style callbacks for
    # this graph read 'color' unconditionally.
    normalized = (prop - min_) / (aggregates.max(prop) - min_)
    n.set(color=normalized)
    Edge(from_=n).set(color=normalized)

def _rainbow_hex(element):
    """Map an element's ``color`` entry through the rainbow colormap to a hex string."""
    rgba = mpl.cm.rainbow(element['color'])
    return mpl.colors.to_hex(rgba)


# Nodes and edges are coloured from their `color` property; nodes also
# show their id on hover.
community_graph_style = {
    "node": {
        "color": _rainbow_hex,
        "hover": lambda n: f"Id: {n['id']}",
    },
    "edge": {"color": _rainbow_hex},
}
match_wcc.visualize(style=community_graph_style).display(inline=True)

# A ResolvedUser is keyed by a community id and collects the User records
# that belong to that community in `matching_users`.
ResolvedUser = model.Type("ResolvedUser")

with model.rule():
    u = User()
    # NOTE(review): several users can share one community, so the same
    # ResolvedUser may be given differing name/email/... values here --
    # confirm which value is kept in that case.
    ru = ResolvedUser.add(id=u.community).set(name=u.name, email=u.email, date_of_birth=u.date_of_birth, phone_number=u.phone_number, credit_card_number=u.credit_card_number)
    ru.matching_users.add(u)

@model.export("rai_demo.identity_resolution")
def get_resolved_users() -> Tuple[str, int]:
    # Returns pairs of (resolved user id, id of a matching user record).
    resolved = ResolvedUser()
    resolved_id = resolved.id
    member_id = resolved.matching_users.id
    return resolved_id, member_id

# Call the exported procedure and show its rows as a two-column DataFrame.
# The statement is a plain string: it has no placeholders, so the
# f-string prefix was unnecessary.
# NOTE(review): `model.resources._exec` is underscore-prefixed -- confirm
# whether a public equivalent should be used instead.
pd.DataFrame(
    model.resources._exec("call rai_demo.identity_resolution.get_resolved_users();"),
    columns=["Resolved User ID", "Matching User Record ID"],
)

	Resolved User ID	Matching User Record ID
0	a6cd0a90-ed1f-7b88-2bc7-0dc24f0ac4d9	4
1	a6cd0a90-ed1f-7b88-2bc7-0dc24f0ac4d9	5
2	b909f771-3097-6fe7-e824-36b82cadbb5d	2
3	b909f771-3097-6fe7-e824-36b82cadbb5d	6
4	b909f771-3097-6fe7-e824-36b82cadbb5d	1
5	0e209c2f-c3e8-b8ae-3039-0671e2ca8ff6	3

Entity Resolution

Overview

Let's get started!

Importing Packages

Importing the Data from Snowflake

Getting to know the raw input data

Approaches for matching entities

Exact Match

Visualizing Graph

Matching based on weighted attributes

Visualizing Graph

Fuzzy Attribute Match

Visualizing Graph

Summarizing the matches

Grouping matching users

Visualizing the results

Writing results back to Snowflake

id	name	email	date_of_birth	phone_number	credit_card_number
1	John Smith	john.smith@example.com	1990-05-15	+1234567890	1234 5678 9012 3456
2	John Smyth	john.smyth@example.com	1990-05-16	+1234567809	1234 5678 9012 3456
3	Jane Doe	jane.doe@example.com	1985-10-20	+1987654321	2345 6789 0123 4567
4	Michael Johnson	michael.johnson@example.com	1982-07-08	+1654321879	3456 7890 1234 5678
5	Michael Jackson	michael.jackson@example.com	1982-07-08	+1654321879	3456 7890 1234 5678
6	John Smith	john.smith@example.com	1990-05-15	+1234567890	1234 5678 9012 3456

id (user1)	name (user1)	id (user2)	name (user2)	combined similarity score
2	John Smyth	1	John Smith	0.346939
2	John Smyth	6	John Smith	0.346939
4	Michael Johnson	5	Michael Jackson	0.714286
6	John Smith	1	John Smith	1.000000

	id (user1)	id (user2)	attribute	value (user1)	value (user2)	similarity_score
0	1	5	credit_card_number	1234 5678 9012 3456	3456 7890 1234 5678	0.473684
1	1	5	email	john.smith@example.com	michael.jackson@example.com	0.518519
2	1	5	name	John Smith	Michael Jackson	0.133333
3	1	5	phone_number	+1234567890	+1654321879	0.363636
4	2	1	credit_card_number	1234 5678 9012 3456	1234 5678 9012 3456	1.000000
5	2	1	email	john.smyth@example.com	john.smith@example.com	0.954545
6	2	1	name	John Smyth	John Smith	0.900000
7	2	1	phone_number	+1234567809	+1234567890	0.818182
8	2	5	credit_card_number	1234 5678 9012 3456	3456 7890 1234 5678	0.473684
9	2	5	email	john.smyth@example.com	michael.jackson@example.com	0.518519

id (user1)	name (user1)	id (user2)	name (user2)	combined fuzzy attribute match score
1	John Smith	5	Michael Jackson	0.414625
2	John Smyth	1	John Smith	0.915770
2	John Smyth	5	Michael Jackson	0.448020
2	John Smyth	6	John Smith	0.915770
3	Jane Doe	1	John Smith	0.413846
3	Jane Doe	2	John Smyth	0.413846
3	Jane Doe	4	Michael Johnson	0.497540
3	Jane Doe	5	Michael Jackson	0.497540
3	Jane Doe	6	John Smith	0.413846
4	Michael Johnson	1	John Smith	0.414625
4	Michael Johnson	2	John Smyth	0.448020
4	Michael Johnson	5	Michael Jackson	0.960998
4	Michael Johnson	6	John Smith	0.414625
6	John Smith	1	John Smith	1.000000
6	John Smith	5	Michael Jackson	0.414625