dist(X, Y) = \sqrt{\sum_{i=1}^{n}(X_i - Y_i)^2} = \sqrt{(X_1 - Y_1)^2 + (X_2 - Y_2)^2 + (X_3 - Y_3)^2 + \cdots + (X_n - Y_n)^2}
### Imports and data loading
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
# model_selection: model selection
# cross_val_score — "cross": cross, "validation": validation (testing)
# i.e. cross-validation
from sklearn.model_selection import cross_val_score

# Use the square root of the number of training samples as a reference
# range for the k value.
# NOTE: `load_iris(True)` (positional) is deprecated/removed in modern
# scikit-learn; the keyword form is required.
X, y = datasets.load_iris(return_X_y=True)
X.shape[0] ** 0.5
结果约为 12.247(150 个样本的开平方),因此将 k 的候选取值范围定为 1 到 13。

应用 cross_val_score 筛选最合适的邻居数量(k 值)
# Evaluate each candidate k with 6-fold cross-validation and record
# the error rate (1 - mean accuracy) for each.
errors = []  # fixed typo: was `erros`
for k in range(1, 14):
    knn = KNeighborsClassifier(n_neighbors=k)
    # Mean accuracy over 6 folds; a smaller error means a better k.
    score = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
    errors.append(1 - score)

import matplotlib.pyplot as plt
# %matplotlib inline  # Jupyter magic — not valid in a plain .py file
# The plot shows the error is smallest at k = 11, so k = 11 is the
# most suitable k for the iris data set.
plt.plot(np.arange(1, 14), errors)
通过上面代码绘制的误差曲线可以看出,当 k = 11 时误差最小,即 k = 11 是最合适的取值。

多参数组合:使用 cross_val_score 筛选最合适的参数组合
# Grid-search over (weights, n_neighbors) combinations, scoring each
# with 6-fold cross-validated mean accuracy.
result = {}
weights = ['distance', 'uniform']
for k in range(1, 14):
    for w in weights:
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        sm = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
        # Key like "distance11" / "uniform3" -> mean accuracy.
        result[w + str(k)] = sm
result