#!/usr/bin/env python3 # -*- coding: utf-8 -*- """Demo for k-Nearest Neighbors with Strings This demonstrates how to implement the k-Nearest neighbors algorithm with string data. Some different distance metrics can be tried out and customized. In addition a directed neighbors graph is created and visualized. Prerequisites can be installed with: mamba install networkx pyvis pip install strsimpy """ # %% imports import numpy as np import pandas as pd # try out some different string distance metrics # 1. Edit distance: # from strsimpy.levenshtein import Levenshtein # levenshtein = Levenshtein() # distance = levenshtein.distance # 2. Some experimental "perceived" distance # from strsimpy import SIFT4 # sift = SIFT4() # distance = sift.distance # 3. Weighted edit distance, with custom costs from strsimpy.weighted_levenshtein import WeightedLevenshtein def insertion_deletion_cost(char): # additional spaces, dashes or underscores should not make a large difference # e.g. "1-123" ←→ "1123" has a distance of 0.1 # "1A123" ←→ "1123" has a distance of 0.75 if char.isspace() or char in '-_': return 0.1 return 0.75 def substitution_cost(a, b): # same type of letter should not make a large difference # e.g. "123QA7" ←→ "123QB8" has a distance of 0.2 # "123QA7" ←→ "123QBX" has a distance of 1.1 if (a.isdigit() and b.isdigit()) or \ (a.isalpha() and b.isalpha()) or \ (a.isspace() and b.isspace()): return 0.1 return 1.0 weighted_levenshtein = WeightedLevenshtein( substitution_cost_fn=substitution_cost, insertion_cost_fn=insertion_deletion_cost, deletion_cost_fn=insertion_deletion_cost) distance = weighted_levenshtein.distance # 4. n-Grams # from strsimpy.qgram import QGram # qgram = QGram(2) # distance = qgram.distance # 5. Overlap Coefficient, similar to n-Grams # from strsimpy.overlap_coefficient import OverlapCoefficient # overlapCoefficient = OverlapCoefficient() # distance = overlapCoefficient.distance # %% some test data data = [ # Hyundai part numbers '99490-ADE50', '1KF40-AK700', '66700-1P001', '66797-1P001', '65110-1P700', '65230-1P500', '56900-1K000EQ', '11254-06161', '85020-1P000', '96550-1K100', '96200-1P500', 'DP370-APU044CH0U', 'DP370-APU060SH1U', 'LP370-APE060SH1', '11290-08206B', '49580-2H366K', '11200-06453', '91568-45000', '12291-04103', '11250-08203', # Smart part numbers 'A 4537513400', 'A 4537513000', 'A 4537512500', 'A 4532920800', 'Q0012473V001000000', 'A 2711420172', 'N910143006000', # Chery part numbers 'Q1840880', '481H-1011030BA', '481H-1002041', '481FC-BJ1002001AA', 'A11-3810010BB', 'M11-8107010BA', 'M11-6101350', # DOIs '10.1093/ajae/aaq063', '10.1371/journal.pgen.1001111', '10.3139/9783446463554', '10.5555/3378999', # ISBNs '978-3-16-148410-0', '978-1-4920-3264-9', '9781492032649', '1-56619-909-3', '1566199093', ] # %% simple k-NN test k = 3 test = 'Q0012473V001000000' raw_dists = [distance(test, s) for s in data] dists = pd.Series(raw_dists, index=data).sort_values() # select k neighbors (or more if they have the same distances) neighbors = dists[dists <= dists.iloc[k-1]] neighbors # %% create a neighbors graph import networkx as nx graph = nx.DiGraph() graph.add_nodes_from(data) k = 2 normalize = False for i, sample in enumerate(data): # distances to other samples other_data = data[:i] + data[i+1:] raw_dists = [distance(sample, s) for s in other_data] dists = pd.Series(raw_dists, index=other_data).sort_values() if normalize: lengths = np.maximum(dists.index.str.len(), len(sample)) dists /= lengths * 10 # select k neighbors (or more if they have the same distances) dist_at_k = dists.iloc[k-1] if len(dists) >= k else dists.iloc[-1] neighbors = dists[dists <= dist_at_k] # add weighted neighbors as edges in a graph for n, d in neighbors.items(): graph.add_edge(sample, n, weight=1/d) # %% interactive visualization of the neighbors graph from pyvis.network import Network # notebook=True should work in Jupyter Notebooks (but not in VS Code) nt = Network(width='1800px', height='900px', notebook=False, cdn_resources='remote', directed=True) nt.from_nx(graph.copy()) nt.toggle_physics(True) nt.show('nx.html')