Patent Deduplication
# uncomment and run if mismo is not installed
# %pip install -q git+https://github.com/NickCrews/mismo@main
from __future__ import annotations
import ibis
from ibis import _
from ibis.expr import types as ir
import mismo
ibis.options.interactive = True
We are going to dedupe the PATSTAT dataset. Every record represents a patent filed,
and our task is to determine which one came from the same inventor.
The end goal is to add a column called inventor_id
to each patent.
This dataset contains a ground truth label, so we can evaluate how well we did.
Mismo
contains this as an included dataset so it is easy to get started.
The returned dataset is an Ibis table, which is a lazy representation of a SQL table.
It is similar to a pandas dataframe, but has a few properties that make it much
better for the record linkage use case:
- Since it is SQL backed, it can handle datasets that are larger than memory, in the many millions of rows.
- Computation is performed by the powerful SQL backend of your choice: Google BigQuery, Apache Spark, Snowflake, etc. For this demo, we use DuckDB, which is a state-of-the-art SQL engine based around a columnar data model (ie oriented towards the bulk operations of record linkage)
- Ibis is strongly typed, has a full API, is well-documented, and has good integration with the rest of the python data science ecosystem.
patents = mismo.playdata.load_patents()
print(patents.count())
patents
2379
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ record_id ┃ label_true ┃ name_true ┃ name ┃ latitude ┃ longitude ┃ coauthors ┃ classes ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ string │ string │ float64 │ float64 │ string │ string │ ├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────┼───────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────┤ │ 2909 │ 402600 │ AGILENT TECHNOLOGIES │ * AGILENT TECHNOLOGIES, INC. │ 0.00 │ 0.000000 │ KONINK PHILIPS ELECTRONICS N V**DAVID E SNYDER**THOMAS D LYSTER │ A61N**A61B │ │ 3574 │ 569309 │ AKZO NOBEL │ * AKZO NOBEL N.V. │ 0.00 │ 0.000000 │ TSJERK HOEKSTRA**ANDRESS K JOHNSON**TERESA MARIE CHERON**ALBERTO SLIKTA**JA… │ G01N**B01L**C11D**G02F**F16L │ │ 3575 │ 569309 │ AKZO NOBEL │ * AKZO NOBEL NV │ 0.00 │ 0.000000 │ WILLIAM JOHN ERNEST PARR**HANS OSKARSSON**MARTIN HELLSTEN**KORNELIS OVERKEM… │ C09K**F17D**B01F**C23F │ │ 3779 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ 52.35 │ 4.916667 │ GUENTER KOCHSMEIER**ZBIGNIEW WIEGOLASKI**EVAN JOHN STANBURY**PETER GRANT JE… │ G02B**G04G**H02G**G06F │ │ 3780 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ 52.35 │ 4.916667 │ ZILAN MANFRED**JOSIANE RAMOS**DUANE LYNN MORTENSEN**CHRISTIAN LE SERGENT │ H03G**B05D**H04L**H04B**C03B**C03C**G02B**H01B │ │ 3782 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ 0.00 │ 0.000000 │ OLIVIER AUDOUIN**MICHEL SOTOM**JEAN MICHEL GABRIAGUES │ H04B**H01S**H04J │ │ 15041 │ 4333661 │ CANON EUROPA │ * CANON EUROPA N.V │ 0.00 │ 0.000000 │ LEE RICKLER**SIMON PARKER**CANON RES CENT EURO **RAKEFET SAGMAN**TIMOTHY FRA… │ G06F │ │ 15042 │ 4333661 │ CANON EUROPA │ * CANON EUROPA N.V. │ 0.00 │ 0.000000 │ QI HE HONG**ADAM MICHAEL BAUMBERG**ALEXANDER RALPH LYONS │ G06T**G01B │ │ 15043 │ 4333661 │ CANON EUROPA │ * CANON EUROPA NV │ 0.00 │ 0.000000 │ NILESH PATHAK**MASAMICHI MASUDA** CANON TECHNOLOGY EURO **PATRICK WILLIAM MO… │ H04B**G06T**G06F**H04M**H04N**H04Q**G03B**B41J**G01B**G06Q │ │ 25387 │ 7650783 │ DSM │ * DSM N.V. │ 0.00 │ 0.000000 │ GABRIEL MARINUS MEESTERS**RUDOLF CAROLUS BARENDSE**ARIE KARST KIES**ALEXANDE… │ C12N**A61K**A23L**A23J**A23K**A01H**B01J**C12R**C07D**A61P**B01D │ │ … │ … │ … │ … │ … │ … │ … │ … │ └───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────┴───────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────┘
mismo.eda.distribution_dashboard(patents, column="name")
VBox(children=(Dropdown(description='Column:', index=3, options=('record_id', 'label_true', 'name_true', 'name…
Let's clean this up a bit:
- clean up whitespace
- convert the
classes
column to actual arrays (they really represent sets). Each element inclasses
is a 4-character IPC technical code that is like a tag for the patent. Similar patents will have similar tags. - convert
coauthors
into a sorted list of tokens.
Of course, this sort of cleaning only happens after a lot of exploring the data manually. This isn't anything in particular to record linkage, but this needs to happen for any sort of data analysis.
If you're not familiar with Ibis, this is useful to give you a sense of what it's API is like. Mismo tries to just use Ibis's API whenever possible, and doesn't re-invent the wheel.
from mismo.text import norm_whitespace
def clean_names(names: ir.StringValue) -> ir.StringValue:
names = norm_whitespace(names)
names = names.upper()
# Only want to keep letters, numbers, and spaces
names = names.re_replace("[^0-9A-Z ]", "")
# Some company names have "NV" or "BV" in them, which is a Dutch
# classification of businesses sort of like LLC vs Corporation in the US.
# These seem to just be noise and don't help us distinguish between
# different companies, so we want to remove them.
# However, sometimes the names are ill-formed or have other junk in them
# like "philipsN V" or "philipsN.V."
names = names.re_replace(r"B\s*V", "")
names = names.re_replace(r"N\s*V", "")
# Also remove other common stopwords that usually just add noise
names = names.re_replace(r"\bCO\b", "")
names = names.re_replace(r"\bCORP\b", "")
names = names.re_replace(r"\bLTD\b", "")
names = names.re_replace(r"\bINC\b", "")
names = names.re_replace(r"\bLLC\b", "")
names = names.re_replace(r"\bNL\b", "")
# Now have to do whitespace fixup again
names = norm_whitespace(names)
return names
def clean_coauthors(s: ir.StringValue) -> ir.ArrayValue:
"""Turn a 'list' of coauthors into a sorted array of cleaned name tokens.
The raw string looks like:
"DAVID E SNYDER**THOMAS D LYSTER"
With "**" as the separator. However, entire names are not always useful
to differentiate between people: there are some records where a coauthor is
listed as "ALKEMA DUURT PIETER WILLEM" and another as "ALKEMA DUURT".
These are probably the same person, but if we compared entire names with each other
that would be a false negative. So instead we turn this into a sorted array of
name tokens, eg for the example above:
["DAVID", "E", "SNYDER", "THOMAS", "D", "LYSTER"].
Except we also want to remove initials and other stopwords like "VAN"
so scrub all tokens less than 3 characters long.
"""
s = s.upper().nullif("NONE").replace("**", " ")
s = norm_whitespace(s)
return s.split(" ").unique().sort().filter(lambda x: x.length() >= 3)
def clean_classes(s: ir.StringValue) -> ir.ArrayValue:
return s.upper().split("**").map(norm_whitespace).sort()
cleaned_names = clean_names(_.name)
featured = patents.select(
"record_id",
"label_true",
"name_true",
"name",
name_cleaned=cleaned_names,
name_tokens=cleaned_names.split(" ").map(norm_whitespace).sort(),
latitude=_.latitude.nullif(0),
longitude=_.longitude.nullif(0),
coauthors=clean_coauthors(patents.coauthors),
classes=clean_classes(patents.classes),
)
featured
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ record_id ┃ label_true ┃ name_true ┃ name ┃ name_cleaned ┃ name_tokens ┃ latitude ┃ longitude ┃ coauthors ┃ classes ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ string │ string │ string │ array<string> │ float64 │ float64 │ array<string> │ array<string> │ ├───────────┼────────────┼──────────────────────┼──────────────────────────────┼──────────────────────┼─────────────────────────────┼──────────┼───────────┼───────────────────────────────────┼──────────────────────────┤ │ 2909 │ 402600 │ AGILENT TECHNOLOGIES │ * AGILENT TECHNOLOGIES, INC. │ AGILENT TECHNOLOGIES │ ['AGILENT', 'TECHNOLOGIES'] │ NULL │ NULL │ ['DAVID', 'ELECTRONICS', ... +5] │ ['A61B', 'A61N'] │ │ 3574 │ 569309 │ AKZO NOBEL │ * AKZO NOBEL N.V. │ AKZO NOBEL │ ['AKZO', 'NOBEL'] │ NULL │ NULL │ ['ALBERT', 'ALBERTO', ... +31] │ ['B01L', 'C11D', ... +3] │ │ 3575 │ 569309 │ AKZO NOBEL │ * AKZO NOBEL NV │ AKZO NOBEL │ ['AKZO', 'NOBEL'] │ NULL │ NULL │ ['CHRISTINA', 'ERNEST', ... +11] │ ['B01F', 'C09K', ... +2] │ │ 3779 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ ALCATEL │ ['ALCATEL'] │ 52.35 │ 4.916667 │ ['EVAN', 'GRANT', ... +8] │ ['G02B', 'G04G', ... +2] │ │ 3780 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ ALCATEL │ ['ALCATEL'] │ 52.35 │ 4.916667 │ ['CHRISTIAN', 'DUANE', ... +7] │ ['B05D', 'C03B', ... +6] │ │ 3782 │ 656303 │ ALCATEL │ * ALCATEL N.V. │ ALCATEL │ ['ALCATEL'] │ NULL │ NULL │ ['AUDOUIN', 'GABRIAGUES', ... +4] │ ['H01S', 'H04B', ... +1] │ │ 15041 │ 4333661 │ CANON EUROPA │ * CANON EUROPA N.V │ CANON EUROPA │ ['CANON', 'EUROPA'] │ NULL │ NULL │ ['CANON', 'CENT', ... +11] │ ['G06F'] │ │ 15042 │ 4333661 │ CANON EUROPA │ * CANON EUROPA N.V. │ CANON EUROPA │ ['CANON', 'EUROPA'] │ NULL │ NULL │ ['ADAM', 'ALEXANDER', ... +5] │ ['G01B', 'G06T'] │ │ 15043 │ 4333661 │ CANON EUROPA │ * CANON EUROPA NV │ CANON EUROPA │ ['CANON', 'EUROPA'] │ NULL │ NULL │ ['BARRY', 'BROCARD', ... +24] │ ['B41J', 'G01B', ... +8] │ │ 25387 │ 7650783 │ DSM │ * DSM N.V. │ DSM │ ['DSM'] │ NULL │ NULL │ ['ALEXANDER', 'ANDELA', ... +39] │ ['A01H', 'A23J', ... +9] │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └───────────┴────────────┴──────────────────────┴──────────────────────────────┴──────────────────────┴─────────────────────────────┴──────────┴───────────┴───────────────────────────────────┴──────────────────────────┘
OK, now it's time to block! This is where we generate comparisons between records. If we were naive and generated all possible comparisons from N record, you would end up with N^2 comparisons. For our small dataset of ~2000 records we would be able to get away with this, but for datasets much larger than this it would be infeasible.
For blocking, we specify an collection of Blocker
s. A Blocker is just
a callable of the form Table, Table, **kwargs -> Table
that takes in two tables
and returns a table of pairs of records for further processing.
name_blocker = mismo.block.ConditionBlocker(
lambda left, right, **_kwargs: left.name_cleaned[0:3] == right.name_cleaned[0:3],
name="Name First 3",
)
name_blocker(featured, featured)
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ record_id_l ┃ record_id_r ┃ classes_l ┃ classes_r ┃ coauthors_l ┃ coauthors_r ┃ label_true_l ┃ label_true_r ┃ latitude_l ┃ latitude_r ┃ longitude_l ┃ longitude_r ┃ name_cleaned_l ┃ name_cleaned_r ┃ name_l ┃ name_r ┃ name_tokens_l ┃ name_tokens_r ┃ name_true_l ┃ name_true_r ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ array<string> │ array<string> │ array<string> │ array<string> │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ array<string> │ array<string> │ string │ string │ ├─────────────┼─────────────┼──────────────────────────┼──────────────────────────┼───────────────────────────────────┼───────────────────────────────────┼──────────────┼──────────────┼────────────┼────────────┼─────────────┼─────────────┼────────────────────────────┼────────────────────────────────────────────┼───────────────────────────────┼─────────────────────────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────────┼───────────────────────────────────────┤ │ 170407 │ 170408 │ ['B23B', 'F16L', ... +3] │ ['H02G'] │ ['ADRIAAN', 'ADRIANUS', ... +48] │ ['ADRIAAN', 'MARINUS', ... +1] │ 173039 │ 173039 │ NULL │ NULL │ NULL │ NULL │ ABB │ ABB │ ABB B.V. │ ABB B.V., │ ['ABB'] │ ['ABB'] │ ABB │ ABB V │ │ 317981 │ 317982 │ ['A43B'] │ ['A43B'] │ ['ANDRU', 'ANGERS', ... +49] │ ['ALLEN', 'GERD', ... +4] │ 319886 │ 319886 │ NULL │ 52.350000 │ NULL │ 4.916667 │ ADIDAS INTERNATL MARKETING │ ADIDAS IUMLAUT OVER NTERNATIONAL MARKETING │ ADIDAS INTERNATL MARKETING BV │ adidas I{umlaut over (n)}ternational Marketing B.V. │ ['ADIDAS', 'INTERNATL', ... +1] │ ['ADIDAS', 'IUMLAUT', ... +3] │ ADIDAS INTERNATL MARKETING │ ADIDAS IUMLAUT MARKETING NTERNAT OVER │ │ 579028 │ 579029 │ ['A61K', 'A61P', ... +2] │ ['A61K', 'A61P', ... +1] │ ['AVENTIS', 'DUCHAUSSOY', ... +9] │ ['BUMA', 'BURSI', ... +10] │ 569309 │ 569309 │ NULL │ NULL │ NULL │ NULL │ AKZONOBEL │ AKZONOBEL │ AKZO-NOBEL N.V. │ Akzo-Nobel, N.V. │ ['AKZONOBEL'] │ ['AKZONOBEL'] │ AKZO NOBEL │ AKZO NOBEL │ │ 665774 │ 665781 │ ['B21C', 'G06F', ... +1] │ ['A01G'] │ ['BAS', 'EMILE', ... +3] │ ['CORNELIS', 'LUDOVICUS', ... +1] │ 867556 │ 867556 │ NULL │ NULL │ NULL │ NULL │ ALCOA NEDERLAND │ ALCOA NEDERLAND │ Alcoa Nederland B.V. │ ALCOA NEDERLAND NV │ ['ALCOA', 'NEDERLAND'] │ ['ALCOA', 'NEDERLAND'] │ ALCOA NEDERLAND │ ALCOA NEDERLAND │ │ 777218 │ 777219 │ ['C03B', 'C03C', ... +9] │ ['G02B', 'G02F', ... +5] │ ['ALAN', 'ANDRE', ... +48] │ ['ALEN', 'ANDRE', ... +14] │ 656303 │ 656303 │ NULL │ NULL │ NULL │ NULL │ ALKATEL │ ALKATEL │ AL'KATEL' N.V. │ ALKATEL N.V. │ ['ALKATEL'] │ ['ALKATEL'] │ ALKATEL │ ALKATEL │ │ 956583 │ 956587 │ ['H01R'] │ ['G01R', 'H01R'] │ ['DER', 'HEIJDEN', ... +4] │ ['ALOYSIUS', 'ANTONIUS', ... +28] │ 945661 │ 945661 │ NULL │ 51.700196 │ NULL │ 5.306806 │ AMPHOLLAND │ AMPHOLLAND HERTOGENBOSCH │ AMP-HOLLAND B.V. │ AMP-HOLLAND B.V., HERTOGENBOSCH, NL │ ['AMPHOLLAND'] │ ['AMPHOLLAND', 'HERTOGENBOSCH'] │ AMP HOLLAND │ AMP HOLLAND V │ │ 1598988 │ 1598998 │ ['G03F', 'H01L'] │ ['B05C', 'B05D', ... +4] │ ['HERMANUS', 'JACOBUS', ... +4] │ ['ANDREW', 'NGUYEN'] │ 1584805 │ 1584805 │ 51.416667 │ NULL │ 5.416667 │ NULL │ ASML NETHERLANDS │ ASML US │ ASML Netherlands-B.V. │ ASML US, Inc. │ ['ASML', 'NETHERLANDS'] │ ['ASML', 'US'] │ ASML NETHERLANDS │ ASML US │ │ 2228706 │ 2231128 │ ['B01J', 'C07F', ... +1] │ ['C11D'] │ ['ANGEL', 'BOHNEN', ... +12] │ ['AXEL', 'GUZMANN', ... +6] │ 2219999 │ 2220720 │ NULL │ NULL │ NULL │ NULL │ BASELL POLYOLEFINE GMBH │ BASF SE │ BASELL POLYOLEFINE GMBH │ BASF SE │ ['BASELL', 'GMBH', ... +1] │ ['BASF', 'SE'] │ BASELL POLYOLEFINE │ BASF SE │ │ 2332107 │ 2332108 │ ['A61K'] │ ['A23G', 'A23K', ... +1] │ ['COFFEE', 'JOHNSON', ... +4] │ ['COFFEE', 'JOHNSON', ... +4] │ 2322698 │ 2322698 │ 52.200000 │ 52.200000 │ 4.866667 │ 4.866667 │ BAYER │ BAYER │ Bayer B.V. │ Bayer B.V. │ ['BAYER'] │ ['BAYER'] │ BAYER │ BAYER │ │ 2962592 │ 2962593 │ ['A61K', 'C07K', ... +3] │ ['A61K', 'A61P', ... +7] │ ['ALBERTA', 'ARNOLDINA', ... +32] │ ['ADRIAANSE', 'AKIO', ... +48] │ 2943561 │ 2943561 │ 51.583333 │ NULL │ 5.316667 │ NULL │ BIOMRIEUX │ BIOMERIEUX BOXTEL │ bioMérieux BV │ BIOMERIEUX B.V., BOXTEL │ ['BIOMRIEUX'] │ ['BIOMERIEUX', 'BOXTEL'] │ BIOMARIEUX │ BIOMERIEUX BOXTEL V │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────┴─────────────┴──────────────────────────┴──────────────────────────┴───────────────────────────────────┴───────────────────────────────────┴──────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴────────────────────────────┴────────────────────────────────────────────┴───────────────────────────────┴─────────────────────────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────────┴───────────────────────────────────────┘
The result of that was the two tables joined together, with a _l
added
to all the columns from the left table, and a _r
added to all the columns
from the right table.
By using this simple API, Blockers are easily composable and extendable. For example, we can make an ensemble Blocker that takes multiple other blockers, and returns the union of all their pairs.
sub_blockers = [
name_blocker,
# Often, you want to look at pairs where records share some key.
# There is KeyBlocker for this. This is equivalent to
# `lambda left, right: left.coauthors == right.coauthors`
mismo.block.KeyBlocker("coauthors", name="Coauthors Exact"),
# You can also pass in deferreds for more complex logic.
# And notice that the .unnest() actually changes the cardinality of the
# key column, but the KeyBlocker can handle this.
# Here, we are finding pairs where the arrays of classes have at least
# one in common.
mismo.block.KeyBlocker(_.classes.unnest(), name="Any Class"),
# CoordinateBlocker is merely a class that implements a __call__ method,
# which makes it a callable similar to the lambda above.
# You can implement your own blockers by implementing __call__ for
# any class. You could do complex logic in there, like
# - Dropping any null records
# - Using a UDF to computing the doublemetaphone of the name on the fly.
# - Using a UDF to do an approximate-nearest-neighbor search in a vector database
# - whatever you want! As long as it follows the required signature.
mismo.lib.geo.CoordinateBlocker(
distance_km=10, name="Coordinates Close", lat="latitude", lon="longitude"
),
]
# If you didn't care about the names of the rules, you could also just pass
# in the blockers directly, eg `rules=["classes", _.name_cleaned[0:3], ...]`.
featured = featured.cache()
# by passing in `labels=True`, an additional column `blocking_rules` is added
# to the output that shows which rules were used to generate each pair.
# This is slower, but later we want to see how many pairs each rule generated.
blocker = mismo.block.UnionBlocker(*sub_blockers, labels=True)
blocked = blocker(featured, featured)
blocked = blocked.cache()
blocked
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ record_id_l ┃ record_id_r ┃ blocking_rules ┃ classes_l ┃ classes_r ┃ coauthors_l ┃ coauthors_r ┃ label_true_l ┃ label_true_r ┃ latitude_l ┃ latitude_r ┃ longitude_l ┃ longitude_r ┃ name_cleaned_l ┃ name_cleaned_r ┃ name_l ┃ name_r ┃ name_tokens_l ┃ name_tokens_r ┃ name_true_l ┃ name_true_r ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ array<string> │ array<string> │ array<string> │ array<string> │ array<string> │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ array<string> │ array<string> │ string │ string │ ├─────────────┼─────────────┼────────────────┼──────────────────────────┼───────────────────────────┼───────────────────────────────────┼─────────────────────────────────────┼──────────────┼──────────────┼────────────┼────────────┼─────────────┼─────────────┼──────────────────────┼────────────────────┼──────────────────────────────┼──────────────────────────────┼─────────────────────────────┼───────────────────────────┼──────────────────────┼──────────────────────┤ │ 2909 │ 35244256 │ ['Any Class'] │ ['A61B', 'A61N'] │ ['A01B', 'A01D', ... +98] │ ['DAVID', 'ELECTRONICS', ... +5] │ ['ADRIAAN', 'ALAN', ... +240] │ 402600 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AGILENT TECHNOLOGIES │ UNILEVER │ * AGILENT TECHNOLOGIES, INC. │ UNILEVER N.V. │ ['AGILENT', 'TECHNOLOGIES'] │ ['UNILEVER'] │ AGILENT TECHNOLOGIES │ UNILEVER │ │ 3574 │ 35244375 │ ['Any Class'] │ ['B01L', 'C11D', ... +3] │ ['A01H', 'A01J', ... +98] │ ['ALBERT', 'ALBERTO', ... +31] │ ['1JD', '3133', ... +209] │ 569309 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ AKZO NOBEL │ UNILEVER ROTTERDAM │ * AKZO NOBEL N.V. │ UNILEVER N.V., ROTTERDAM │ ['AKZO', 'NOBEL'] │ ['ROTTERDAM', 'UNILEVER'] │ AKZO NOBEL │ ROTTERDAM UNILEVER V │ │ 3575 │ 35244256 │ ['Any Class'] │ ['B01F', 'C09K', ... +2] │ ['A01B', 'A01D', ... +98] │ ['CHRISTINA', 'ERNEST', ... +11] │ ['ADRIAAN', 'ALAN', ... +240] │ 569309 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AKZO NOBEL │ UNILEVER │ * AKZO NOBEL NV │ UNILEVER N.V. │ ['AKZO', 'NOBEL'] │ ['UNILEVER'] │ AKZO NOBEL │ UNILEVER │ │ 3779 │ 35244256 │ ['Any Class'] │ ['G02B', 'G04G', ... +2] │ ['A01B', 'A01D', ... +98] │ ['EVAN', 'GRANT', ... +8] │ ['ADRIAAN', 'ALAN', ... +240] │ 656303 │ 35272490 │ 52.35 │ 51.916667 │ 4.916667 │ 4.500000 │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER N.V. │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ │ 3780 │ 35244304 │ ['Any Class'] │ ['B05D', 'C03B', ... +6] │ ['A01B', 'A01F', ... +98] │ ['CHRISTIAN', 'DUANE', ... +7] │ ['ADRIANUS', 'AKKERMANS', ... +228] │ 656303 │ 35272490 │ 52.35 │ NULL │ 4.916667 │ NULL │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER NV │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ │ 3782 │ 32621596 │ ['Any Class'] │ ['H01S', 'H04B', ... +1] │ ['G01R', 'G01S', ... +17] │ ['AUDOUIN', 'GABRIAGUES', ... +4] │ ['ADLING', 'AKVIST', ... +130] │ 656303 │ 32634657 │ NULL │ 52.350000 │ NULL │ 4.916667 │ ALCATEL │ STMICROELECTRONICS │ * ALCATEL N.V. │ STMicroelectronics N.V. │ ['ALCATEL'] │ ['STMICROELECTRONICS'] │ ALCATEL │ STMICROELECTRONICS │ │ 15041 │ 35244318 │ ['Any Class'] │ ['G06F'] │ ['A01H', 'A01N', ... +98] │ ['CANON', 'CENT', ... +11] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ │ 15042 │ 35244259 │ ['Any Class'] │ ['G01B', 'G06T'] │ ['A01B', 'A01C', ... +98] │ ['ADAM', 'ALEXANDER', ... +5] │ ['ABDEL', 'ABRAHAN', ... +219] │ 4333661 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V. │ UNILEVER N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ │ 15043 │ 35244318 │ ['Any Class'] │ ['B41J', 'G01B', ... +8] │ ['A01H', 'A01N', ... +98] │ ['BARRY', 'BROCARD', ... +24] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA NV │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ │ 25387 │ 35244383 │ ['Any Class'] │ ['A01H', 'A23J', ... +9] │ ['A01H', 'A01J', ... +98] │ ['ALEXANDER', 'ANDELA', ... +39] │ ['00041', '06470', ... +405] │ 7650783 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ DSM │ UNILEVER ROTTERDAM │ * DSM N.V. │ UNILEVER N.V., ROTTERDAM, NL │ ['DSM'] │ ['ROTTERDAM', 'UNILEVER'] │ DSM │ UNILEVER V │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────┴─────────────┴────────────────┴──────────────────────────┴───────────────────────────┴───────────────────────────────────┴─────────────────────────────────────┴──────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴──────────────────────┴────────────────────┴──────────────────────────────┴──────────────────────────────┴─────────────────────────────┴───────────────────────────┴──────────────────────┴──────────────────────┘
blocked.blocking_rules.unnest().value_counts()
┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓ ┃ blocking_rules ┃ blocking_rules_count ┃ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ int64 │ ├───────────────────┼──────────────────────┤ │ Any Class │ 569034 │ │ Name First 3 │ 215968 │ │ Coordinates Close │ 274831 │ │ Coauthors Exact │ 98 │ └───────────────────┴──────────────────────┘
By blocking, we reduced the number of needed pairs by a large factor. In larger datasets, and with better blocking rules, this would be even more!
n_comparisons = blocked.count().execute()
n_naive = mismo.block.n_naive_comparisons(featured)
reduction_ratio = n_comparisons / n_naive
n_naive, n_comparisons, reduction_ratio
(2828631, 862446, 0.30489873016310715)
We can also inspect which blocking rules were most to blame for the generated pairs. If some rules generate a huge amount of comparisons, it might be worth trying to make them more restrictive so we get better performance. Or, if some blocking rules aren't generating any comparisons, that might be an indication that we have a bug in there somewhere.
mismo.block.upset_chart(blocked)
OK, now that we have our candidate pairs generated, let's actually do the comparing of pairs.
In Mismo, similar to blocking, we use a single PComparer
protocol to do all
comparisons. A PComparer
is simply a callable of the form Table, **kwargs -> Table
that takes a table of blocked record pairs and returns a modified version.
Typical things a PComparer
might do include
- Dropping some comparisons that are definitely not matches.
- Adding a score column for how much some feature matches, eg how well the address matches.
- Combining these individual per-feature scores into an overall score for the pair.
Here, we use one of the most common methods using LevelComparers, each of which represents a measurement of similarity based on some dimension (eg "location", "name", "price", etc). Each LevelComparer is composed of MatchLevels, which represent discrete levels of agreement (eg "exactly", "within 100km", "one or both values null", etc)
from mismo.compare import LevelComparer, MatchLevel
from mismo.lib.geo import distance_km
from mismo.sets import jaccard
class NameMatchLevel(MatchLevel):
EXACT = 0
SHARE_1_TOKEN = 1
SHARE_2_PLUS_TOKENS = 2
ELSE = 3
name_comparison = LevelComparer(
name="Name",
levels=NameMatchLevel,
cases=[
(_.name_cleaned_l == _.name_cleaned_r, NameMatchLevel.EXACT),
(
_.name_tokens_l.intersect(_.name_tokens_r).length() == 1,
NameMatchLevel.SHARE_1_TOKEN,
),
(
_.name_tokens_l.intersect(_.name_tokens_r).length() >= 2,
NameMatchLevel.SHARE_2_PLUS_TOKENS,
),
(True, NameMatchLevel.ELSE),
],
)
class ClassMatchLevel(MatchLevel):
EXACT = 0
SHARE_1_CLASS = 1
SHARE_2_PLUS_CLASSES = 2
ELSE = 3
classes_comparison = LevelComparer(
name="Classes",
levels=ClassMatchLevel,
cases=[
(_.classes_l == _.classes_r, ClassMatchLevel.EXACT),
(
_.classes_l.intersect(_.classes_r).length() == 1,
ClassMatchLevel.SHARE_1_CLASS,
),
(
_.classes_l.intersect(_.classes_r).length() >= 2,
ClassMatchLevel.SHARE_2_PLUS_CLASSES,
),
(True, ClassMatchLevel.ELSE),
],
)
class CoauthorMatchLevel(MatchLevel):
JACCARD_50 = 0
JACCARD_25 = 1
JACCARD_10 = 2
JACCARD_02 = 3
ELSE = 4
coauthors_comparison = LevelComparer(
name="Coauthors",
levels=CoauthorMatchLevel,
cases=[
(jaccard(_.coauthors_l, _.coauthors_r) >= 0.5, CoauthorMatchLevel.JACCARD_50),
(jaccard(_.coauthors_l, _.coauthors_r) >= 0.25, CoauthorMatchLevel.JACCARD_25),
(jaccard(_.coauthors_l, _.coauthors_r) >= 0.1, CoauthorMatchLevel.JACCARD_10),
(jaccard(_.coauthors_l, _.coauthors_r) >= 0.02, CoauthorMatchLevel.JACCARD_02),
(True, CoauthorMatchLevel.ELSE),
],
)
class CoordMatchLevel(MatchLevel):
EXACT = 0
WITHIN_10KM = 1
WITHIN_100KM = 2
BOTH_MISSING = 3
ONE_MISSING = 4
ELSE = 5
coords_comparison = LevelComparer(
name="Coords",
levels=CoordMatchLevel,
cases=[
(_.latitude_l == _.latitude_r, CoordMatchLevel.EXACT),
(
distance_km(
lat1=_.latitude_l,
lon1=_.longitude_l,
lat2=_.latitude_r,
lon2=_.longitude_r,
)
<= 10,
CoordMatchLevel.WITHIN_10KM,
),
(
distance_km(
lat1=_.latitude_l,
lon1=_.longitude_l,
lat2=_.latitude_r,
lon2=_.longitude_r,
)
<= 100,
CoordMatchLevel.WITHIN_100KM,
),
(
ibis.and_(
_.latitude_l.isnull() | _.longitude_l.isnull(),
_.latitude_r.isnull() | _.longitude_r.isnull(),
),
CoordMatchLevel.BOTH_MISSING,
),
(
ibis.or_(
_.latitude_l.isnull() | _.longitude_l.isnull(),
_.latitude_r.isnull() | _.longitude_r.isnull(),
),
CoordMatchLevel.ONE_MISSING,
),
(True, CoordMatchLevel.ELSE),
],
)
comparers = [
name_comparison,
classes_comparison,
coauthors_comparison,
coords_comparison,
]
compared = blocked
for comp in comparers:
compared = comp(compared)
compared = compared.cache()
compared
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━┓ ┃ record_id_l ┃ record_id_r ┃ blocking_rules ┃ classes_l ┃ classes_r ┃ coauthors_l ┃ coauthors_r ┃ label_true_l ┃ label_true_r ┃ latitude_l ┃ latitude_r ┃ longitude_l ┃ longitude_r ┃ name_cleaned_l ┃ name_cleaned_r ┃ name_l ┃ name_r ┃ name_tokens_l ┃ name_tokens_r ┃ name_true_l ┃ name_true_r ┃ Name ┃ Classes ┃ Coauthors ┃ Coords ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━┩ │ int64 │ int64 │ array<string> │ array<string> │ array<string> │ array<string> │ array<string> │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ array<string> │ array<string> │ string │ string │ int8 │ int8 │ int8 │ int8 │ ├─────────────┼─────────────┼────────────────┼──────────────────────────┼───────────────────────────┼───────────────────────────────────┼─────────────────────────────────────┼──────────────┼──────────────┼────────────┼────────────┼─────────────┼─────────────┼──────────────────────┼────────────────────┼──────────────────────────────┼──────────────────────────────┼─────────────────────────────┼───────────────────────────┼──────────────────────┼──────────────────────┼──────┼─────────┼───────────┼────────┤ │ 2909 │ 35244256 │ ['Any Class'] │ ['A61B', 'A61N'] │ ['A01B', 'A01D', ... +98] │ ['DAVID', 'ELECTRONICS', ... +5] │ ['ADRIAAN', 'ALAN', ... +240] │ 402600 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AGILENT TECHNOLOGIES │ UNILEVER │ * AGILENT TECHNOLOGIES, INC. │ UNILEVER N.V. │ ['AGILENT', 'TECHNOLOGIES'] │ ['UNILEVER'] │ AGILENT TECHNOLOGIES │ UNILEVER │ 3 │ 1 │ 4 │ 4 │ │ 3574 │ 35244375 │ ['Any Class'] │ ['B01L', 'C11D', ... +3] │ ['A01H', 'A01J', ... +98] │ ['ALBERT', 'ALBERTO', ... +31] │ ['1JD', '3133', ... +209] │ 569309 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ AKZO NOBEL │ UNILEVER ROTTERDAM │ * AKZO NOBEL N.V. │ UNILEVER N.V., ROTTERDAM │ ['AKZO', 'NOBEL'] │ ['ROTTERDAM', 'UNILEVER'] │ AKZO NOBEL │ ROTTERDAM UNILEVER V │ 3 │ 2 │ 4 │ 3 │ │ 3575 │ 35244256 │ ['Any Class'] │ ['B01F', 'C09K', ... +2] │ ['A01B', 'A01D', ... +98] │ ['CHRISTINA', 'ERNEST', ... +11] │ ['ADRIAAN', 'ALAN', ... +240] │ 569309 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AKZO NOBEL │ UNILEVER │ * AKZO NOBEL NV │ UNILEVER N.V. │ ['AKZO', 'NOBEL'] │ ['UNILEVER'] │ AKZO NOBEL │ UNILEVER │ 3 │ 2 │ 3 │ 4 │ │ 3779 │ 35244256 │ ['Any Class'] │ ['G02B', 'G04G', ... +2] │ ['A01B', 'A01D', ... +98] │ ['EVAN', 'GRANT', ... +8] │ ['ADRIAAN', 'ALAN', ... +240] │ 656303 │ 35272490 │ 52.35 │ 51.916667 │ 4.916667 │ 4.500000 │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER N.V. │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ 3 │ 1 │ 4 │ 2 │ │ 3780 │ 35244304 │ ['Any Class'] │ ['B05D', 'C03B', ... +6] │ ['A01B', 'A01F', ... +98] │ ['CHRISTIAN', 'DUANE', ... +7] │ ['ADRIANUS', 'AKKERMANS', ... +228] │ 656303 │ 35272490 │ 52.35 │ NULL │ 4.916667 │ NULL │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER NV │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ 3 │ 1 │ 4 │ 4 │ │ 3782 │ 32621596 │ ['Any Class'] │ ['H01S', 'H04B', ... +1] │ ['G01R', 'G01S', ... +17] │ ['AUDOUIN', 'GABRIAGUES', ... +4] │ ['ADLING', 'AKVIST', ... +130] │ 656303 │ 32634657 │ NULL │ 52.350000 │ NULL │ 4.916667 │ ALCATEL │ STMICROELECTRONICS │ * ALCATEL N.V. │ STMicroelectronics N.V. │ ['ALCATEL'] │ ['STMICROELECTRONICS'] │ ALCATEL │ STMICROELECTRONICS │ 3 │ 2 │ 4 │ 4 │ │ 15041 │ 35244318 │ ['Any Class'] │ ['G06F'] │ ['A01H', 'A01N', ... +98] │ ['CANON', 'CENT', ... +11] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 3 │ 1 │ 4 │ 4 │ │ 15042 │ 35244259 │ ['Any Class'] │ ['G01B', 'G06T'] │ ['A01B', 'A01C', ... +98] │ ['ADAM', 'ALEXANDER', ... +5] │ ['ABDEL', 'ABRAHAN', ... +219] │ 4333661 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V. │ UNILEVER N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 3 │ 2 │ 4 │ 3 │ │ 15043 │ 35244318 │ ['Any Class'] │ ['B41J', 'G01B', ... +8] │ ['A01H', 'A01N', ... +98] │ ['BARRY', 'BROCARD', ... +24] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA NV │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 3 │ 2 │ 4 │ 4 │ │ 25387 │ 35244383 │ ['Any Class'] │ ['A01H', 'A23J', ... +9] │ ['A01H', 'A01J', ... +98] │ ['ALEXANDER', 'ANDELA', ... +39] │ ['00041', '06470', ... +405] │ 7650783 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ DSM │ UNILEVER ROTTERDAM │ * DSM N.V. │ UNILEVER N.V., ROTTERDAM, NL │ ['DSM'] │ ['ROTTERDAM', 'UNILEVER'] │ DSM │ UNILEVER V │ 3 │ 2 │ 4 │ 4 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────┴─────────────┴────────────────┴──────────────────────────┴───────────────────────────┴───────────────────────────────────┴─────────────────────────────────────┴──────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴──────────────────────┴────────────────────┴──────────────────────────────┴──────────────────────────────┴─────────────────────────────┴───────────────────────────┴──────────────────────┴──────────────────────┴──────┴─────────┴───────────┴────────┘
The result above is the blocked table, with a column added for every Comparer
.
The value of each column is the level that the record pair matched at.
For example, there is now a "Name" column,
filled with values like "exact", "share 1 token", etc.
Now that we have our features, we can use the Fellegi-Sunter model to train weights for each of these features. This is a probabilistic model that is based on the concept of odds. When you see an exact match on name, that increases the odds of a match by some amount, maybe 10x. When you see a non-match on name, that decreases the odds of a match by some amount, maybe 0.1x. We can either train this from labeled data, or we can use unlabeled data using an algorithm called "Expectation Maximization".
weights = mismo.fs.train_using_em(comparers, featured, featured, max_pairs=100_000)
# Can save and load weights
# weights.to_json("weights.json")
# weights = ComparisonWeights.from_json("weights.json")
weights.plot()
FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))
In the above plot, you can see the odds for "share 1 class" and "share 2+ classes" are nearly the same. That doesn't quite make sense, and might be an indicator that we should look more into this to see if there is a problem with the features that we are using. But for now let's just move on.
Use the weights to score the record pairs, finding the odds for each
Comparer
, and then combining them into an overall odds for the record pair.
scored = weights.score_compared(compared)
scored = scored.cache()
scored
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ record_id_l ┃ record_id_r ┃ blocking_rules ┃ classes_l ┃ classes_r ┃ coauthors_l ┃ coauthors_r ┃ label_true_l ┃ label_true_r ┃ latitude_l ┃ latitude_r ┃ longitude_l ┃ longitude_r ┃ name_cleaned_l ┃ name_cleaned_r ┃ name_l ┃ name_r ┃ name_tokens_l ┃ name_tokens_r ┃ name_true_l ┃ name_true_r ┃ odds ┃ Name ┃ Name_odds ┃ Classes ┃ Classes_odds ┃ Coauthors ┃ Coauthors_odds ┃ Coords ┃ Coords_odds ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ int64 │ array<string> │ array<string> │ array<string> │ array<string> │ array<string> │ int64 │ int64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ array<string> │ array<string> │ string │ string │ float64 │ int8 │ float64 │ int8 │ float64 │ int8 │ float64 │ int8 │ float64 │ ├─────────────┼─────────────┼────────────────┼──────────────────────────┼───────────────────────────┼───────────────────────────────────┼─────────────────────────────────────┼──────────────┼──────────────┼────────────┼────────────┼─────────────┼─────────────┼──────────────────────┼────────────────────┼──────────────────────────────┼──────────────────────────────┼─────────────────────────────┼───────────────────────────┼──────────────────────┼──────────────────────┼──────────┼──────┼───────────┼─────────┼──────────────┼───────────┼────────────────┼────────┼─────────────┤ │ 2909 │ 35244256 │ ['Any Class'] │ ['A61B', 'A61N'] │ ['A01B', 'A01D', ... +98] │ ['DAVID', 'ELECTRONICS', ... +5] │ ['ADRIAAN', 'ALAN', ... +240] │ 402600 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AGILENT TECHNOLOGIES │ UNILEVER │ * AGILENT TECHNOLOGIES, INC. │ UNILEVER N.V. │ ['AGILENT', 'TECHNOLOGIES'] │ ['UNILEVER'] │ AGILENT TECHNOLOGIES │ UNILEVER │ 0.002051 │ 3 │ 0.000458 │ 1 │ 5.517122 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ 3574 │ 35244375 │ ['Any Class'] │ ['B01L', 'C11D', ... +3] │ ['A01H', 'A01J', ... +98] │ ['ALBERT', 'ALBERTO', ... +31] │ ['1JD', '3133', ... +209] │ 569309 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ AKZO NOBEL │ UNILEVER ROTTERDAM │ * AKZO NOBEL N.V. │ UNILEVER N.V., ROTTERDAM │ ['AKZO', 'NOBEL'] │ ['ROTTERDAM', 'UNILEVER'] │ AKZO NOBEL │ ROTTERDAM UNILEVER V │ 0.002427 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 4 │ 0.830421 │ 3 │ 0.986388 │ │ 3575 │ 35244256 │ ['Any Class'] │ ['B01F', 'C09K', ... +2] │ ['A01B', 'A01D', ... +98] │ ['CHRISTINA', 'ERNEST', ... +11] │ ['ADRIAAN', 'ALAN', ... +240] │ 569309 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ AKZO NOBEL │ UNILEVER │ * AKZO NOBEL NV │ UNILEVER N.V. │ ['AKZO', 'NOBEL'] │ ['UNILEVER'] │ AKZO NOBEL │ UNILEVER │ 0.004372 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 3 │ 1.510955 │ 4 │ 0.976704 │ │ 3779 │ 35244256 │ ['Any Class'] │ ['G02B', 'G04G', ... +2] │ ['A01B', 'A01D', ... +98] │ ['EVAN', 'GRANT', ... +8] │ ['ADRIAAN', 'ALAN', ... +240] │ 656303 │ 35272490 │ 52.35 │ 51.916667 │ 4.916667 │ 4.500000 │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER N.V. │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ 0.000089 │ 3 │ 0.000458 │ 1 │ 5.517122 │ 4 │ 0.830421 │ 2 │ 0.042623 │ │ 3780 │ 35244304 │ ['Any Class'] │ ['B05D', 'C03B', ... +6] │ ['A01B', 'A01F', ... +98] │ ['CHRISTIAN', 'DUANE', ... +7] │ ['ADRIANUS', 'AKKERMANS', ... +228] │ 656303 │ 35272490 │ 52.35 │ NULL │ 4.916667 │ NULL │ ALCATEL │ UNILEVER │ * ALCATEL N.V. │ UNILEVER NV │ ['ALCATEL'] │ ['UNILEVER'] │ ALCATEL │ UNILEVER │ 0.002051 │ 3 │ 0.000458 │ 1 │ 5.517122 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ 3782 │ 32621596 │ ['Any Class'] │ ['H01S', 'H04B', ... +1] │ ['G01R', 'G01S', ... +17] │ ['AUDOUIN', 'GABRIAGUES', ... +4] │ ['ADLING', 'AKVIST', ... +130] │ 656303 │ 32634657 │ NULL │ 52.350000 │ NULL │ 4.916667 │ ALCATEL │ STMICROELECTRONICS │ * ALCATEL N.V. │ STMicroelectronics N.V. │ ['ALCATEL'] │ ['STMICROELECTRONICS'] │ ALCATEL │ STMICROELECTRONICS │ 0.002403 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ 15041 │ 35244318 │ ['Any Class'] │ ['G06F'] │ ['A01H', 'A01N', ... +98] │ ['CANON', 'CENT', ... +11] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 0.002051 │ 3 │ 0.000458 │ 1 │ 5.517122 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ 15042 │ 35244259 │ ['Any Class'] │ ['G01B', 'G06T'] │ ['A01B', 'A01C', ... +98] │ ['ADAM', 'ALEXANDER', ... +5] │ ['ABDEL', 'ABRAHAN', ... +219] │ 4333661 │ 35272490 │ NULL │ NULL │ NULL │ NULL │ CANON EUROPA │ UNILEVER │ * CANON EUROPA N.V. │ UNILEVER N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 0.002427 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 4 │ 0.830421 │ 3 │ 0.986388 │ │ 15043 │ 35244318 │ ['Any Class'] │ ['B41J', 'G01B', ... +8] │ ['A01H', 'A01N', ... +98] │ ['BARRY', 'BROCARD', ... +24] │ ['ADAMS', 'ADRIAAN', ... +266] │ 4333661 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ CANON EUROPA │ UNILEVER │ * CANON EUROPA NV │ Unilever N.V. │ ['CANON', 'EUROPA'] │ ['UNILEVER'] │ CANON EUROPA │ UNILEVER │ 0.002403 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ 25387 │ 35244383 │ ['Any Class'] │ ['A01H', 'A23J', ... +9] │ ['A01H', 'A01J', ... +98] │ ['ALEXANDER', 'ANDELA', ... +39] │ ['00041', '06470', ... +405] │ 7650783 │ 35272490 │ NULL │ 51.916667 │ NULL │ 4.500000 │ DSM │ UNILEVER ROTTERDAM │ * DSM N.V. │ UNILEVER N.V., ROTTERDAM, NL │ ['DSM'] │ ['ROTTERDAM', 'UNILEVER'] │ DSM │ UNILEVER V │ 0.002403 │ 3 │ 0.000458 │ 2 │ 6.465203 │ 4 │ 0.830421 │ 4 │ 0.976704 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────┴─────────────┴────────────────┴──────────────────────────┴───────────────────────────┴───────────────────────────────────┴─────────────────────────────────────┴──────────────┴──────────────┴────────────┴────────────┴─────────────┴─────────────┴──────────────────────┴────────────────────┴──────────────────────────────┴──────────────────────────────┴─────────────────────────────┴───────────────────────────┴──────────────────────┴──────────────────────┴──────────┴──────┴───────────┴─────────┴──────────────┴───────────┴────────────────┴────────┴─────────────┘
We can plot these compared pairs. We can see which comparison levels are most common, which occur together, which lead to matches, and which lead to non-matches.
The exact match levels have the highest odds, and the else levels have the lowest. The other levels are somewhere in between.
mismo.compare.compared_dashboard(compared, comparers=comparers, weights=weights)
VBox(children=(JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'vcon…
It looks like an odds of 50 seems to separate the pairs between non-matches and matches. If I hover over the above chart, I can see that pretty much all the "else" comparisons are in the low cluster, and all the "exact" comparisons are in the high cluster.
odds_threshold = 50
(scored.odds >= odds_threshold).value_counts()
┏━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ GreaterEqual(odds, 50) ┃ GreaterEqual(odds, 50)_count ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ boolean │ int64 │ ├────────────────────────┼──────────────────────────────┤ │ False │ 733134 │ │ True │ 129312 │ └────────────────────────┴──────────────────────────────┘
Let's be really picky and only take the most likely matches as true matches, and then perform connected components to label each patent with its inventor:
links = scored[_.odds >= odds_threshold]
links = links.cache()
print(links.count().execute())
labeled = mismo.cluster.connected_components(links=links, records=featured)
labeled = labeled.relocate("component", after="label_true").order_by("component")
print(labeled.count().execute())
labeled
129312 2379
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ record_id ┃ label_true ┃ component ┃ name_true ┃ name ┃ name_cleaned ┃ name_tokens ┃ latitude ┃ longitude ┃ coauthors ┃ classes ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ uint64 │ string │ string │ string │ array<string> │ float64 │ float64 │ array<string> │ array<string> │ ├───────────┼────────────┼───────────┼─────────────────────────────────┼──────────────────────────────────────────────────┼────────────────────────────────────────────┼────────────────────────────────────────────┼───────────┼───────────┼──────────────────────────────────────┼───────────────────────────┤ │ 30957671 │ 30982984 │ 0 │ INTERNAT MATTSCHAPPIJ RES SHELL │ SHELL INTERNATIONALE RESEARCH MATTSCHAPPIJ B. V. │ SHELL INTERNATIONALE RESEARCH MATTSCHAPPIJ │ ['INTERNATIONALE', 'MATTSCHAPPIJ', ... +2] │ NULL │ NULL │ ['DRENT', 'EIT'] │ ['B01J', 'C08G'] │ │ 30957791 │ 30982984 │ 0 │ INTERNATTONALE MIJ RES SHELL │ SHELL INTERNATTONALE RESEARCH MAATSCHAPPIJ B. V. │ SHELL INTERNATTONALE RESEARCH MAATSCHAPPIJ │ ['INTERNATTONALE', 'MAATSCHAPPIJ', ... +2] │ NULL │ NULL │ ['ALAIN', 'BART', ... +44] │ ['B01J', 'B29B', ... +11] │ │ 30957992 │ 30984013 │ 0 │ OIL SHELL │ SHELL OIL CO │ SHELL OIL │ ['OIL', 'SHELL'] │ NULL │ NULL │ ['BENING', 'CHARLES', ... +1] │ ['C08C', 'C08F'] │ │ 32621589 │ 32634657 │ 0 │ STMICROELECTRONICS │ STMicroelectronics N.V. │ STMICROELECTRONICS │ ['STMICROELECTRONICS'] │ 52.500000 │ 5.750000 │ ['BENTHIEN', 'STEPHAN'] │ ['H01L', 'H04N'] │ │ 32621604 │ 32634657 │ 0 │ STMICROELECTRONICS │ STMicroelectronics N.V. │ STMICROELECTRONICS │ ['STMICROELECTRONICS'] │ 52.350000 │ 4.916667 │ ['ALBERTO', 'ALTERNATIVES', ... +31] │ ['G11C', 'H01G', ... +2] │ │ 32621606 │ 32634657 │ 0 │ STMICROELECTRONICS │ STMicroelectronics N.V. │ STMICROELECTRONICS │ ['STMICROELECTRONICS'] │ 52.350000 │ 4.916667 │ ['ANTONIO', 'BAUDIN', ... +34] │ ['G06F', 'H03L', ... +3] │ │ 32621608 │ 32634657 │ 0 │ STMICROELECTRONICS │ STMicroelectronics N.V. │ STMICROELECTRONICS │ ['STMICROELECTRONICS'] │ 52.500000 │ 5.750000 │ ['ARASH', 'JENS', ... +8] │ ['H01L'] │ │ 32621615 │ 32634657 │ 0 │ STMICROELECTRONICS │ STMicroelectronics NV. │ STMICROELECTRONICS │ ['STMICROELECTRONICS'] │ 52.350000 │ 4.916667 │ ['BASTIAAN', 'BENTHIEN', ... +14] │ ['H01L', 'H01M', ... +3] │ │ 35243863 │ 35272490 │ 0 │ UNILEVER │ UNILEVER │ UNILEVER │ ['UNILEVER'] │ NULL │ NULL │ ['BARBARA', 'BEN', ... +39] │ ['A45D', 'A47G', ... +11] │ │ 35244158 │ 35272490 │ 0 │ UNILEVER V │ UNILEVER N. V., │ UNILEVER │ ['UNILEVER'] │ 51.916667 │ 4.500000 │ ['ANTHONY', 'GLYN', ... +13] │ ['A23C', 'A23L'] │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └───────────┴────────────┴───────────┴─────────────────────────────────┴──────────────────────────────────────────────────┴────────────────────────────────────────────┴────────────────────────────────────────────┴───────────┴───────────┴──────────────────────────────────────┴───────────────────────────┘
Now let's evaluate how good our labeling is. Mismo wraps all of the evaluation metrics from sklearn, so we can use them with Ibis Tables.
labels_true = labeled.select("record_id", label=_.label_true)
labels_pred = labeled.select("record_id", label=_.component)
print(mismo.cluster.adjusted_rand_score(labels_true, labels_pred))
print(mismo.cluster.homogeneity_score(labels_true, labels_pred))
print(mismo.cluster.completeness_score(labels_true, labels_pred))
0.10221328294557343 0.32421809375569566 0.9053393388568964