Using affinity propagation clustering (not perfect, but perhaps a starting point):
import pandas as pd
import numpy as np
import io
from fuzzywuzzy import fuzz
from scipy import spatial
import sklearn.cluster
# Sample data. Fields are separated by two (or more) spaces because some
# names ("James 2", "James Marsh", "Mike K", ...) contain single spaces
# themselves; a single-space delimiter would make the rows ambiguous.
# The header has three fields while every row has four, so pandas uses the
# leading field of each row as the index.
s = """Name  ID  Value
0  James  1  10
1  James 2  2  142
2  Bike  3  1
3  Bicycle  4  1197
4  James Marsh  5  12
5  Ants  6  54
6  Job  7  6
7  Michael  8  80007
8  Arm  9  47
9  Mike K  10  9
10  Michael k  11  1"""

# A regex separator requires the Python parsing engine; the raw string keeps
# the backslashes from being treated as (invalid) string escapes.
df = pd.read_csv(io.StringIO(s), sep=r'\s\s+', engine='python')

# Plain NumPy object array of the names, so that ndarray methods such as
# .reshape are available regardless of the pandas string dtype in use.
names = df["Name"].to_numpy(dtype=object)
def _name_similarity(x, y):
    """Return the fuzzywuzzy WRatio score of two one-element rows.

    pdist hands the metric one row of the input matrix per argument; each
    row here holds a single name, so unwrap it and compare the actual
    strings instead of relying on the array being coerced to text.
    """
    return fuzz.WRatio(x[0], y[0])


# Pairwise name similarities. pdist is used only as a convenient "apply this
# function to every pair" helper: the values are similarity scores (higher
# means more alike), not distances. It needs a 2-D input, hence the reshape
# to a single-column matrix.
sim = spatial.distance.pdist(names.reshape((-1, 1)), _name_similarity)

# affinity="precomputed" tells scikit-learn that the matrix passed to fit()
# already contains similarities. squareform expands the condensed pair
# vector into the full symmetric matrix.
# NOTE(review): random_state=None leaves the run unseeded; pass an int if
# reproducible cluster labels are required.
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", random_state=None)
affprop.fit(spatial.distance.squareform(sim))

# One output row per cluster label: the member names joined with commas, the
# ID of the first member, the summed Value and the number of members.
res = df.groupby(affprop.labels_).agg(
    Names=('Name', ','.join),
    First_ID=('ID', 'first'),
    Total_Value=('Value', 'sum'),
    Group_Size=('Value', 'count'),
)
Result
Names First_ID Total_Value Group_Size
0 James,James 2,James Marsh,Ants,Arm 1 265 5
1 Bike,Bicycle 3 1198 2
2 Job 7 6 1
3 Michael,Mike K,Michael k 8 80017 3