knn属于数据挖掘的分类算法,2的矩阵作为两
最近在看knn算法,顺便敲敲代码。
邻近算法,或者说K最近邻(kNN,k-NearestNeighbor)分类算法是数据挖掘分类技术中最简单的方法之一。所谓K最近邻,就是k个最近的邻居的意思,说的是每个样本都可以用它最接近的k个邻居来代表。
kNN算法的核心思想是如果一个样本在特征空间中的k个最相邻的样本中的大多数属于某一个类别,则该样本也属于这个类别,并具有这个类别上样本的特性。
knn属于数据挖掘的分类算法。基本思想是在距离空间里,如果一个样本的最接近的k个邻居里,绝大多数属于某个类别,则该样本也属于这个类别。俗话叫,“随大流”。
数据预备,这里使用random函数生成10*2的矩阵作为两列特征值,1个10个元素数组作为类别值
简单来说,KNN可以看成:有那么一堆你已经知道分类的数据,然后当一个新的数据进入的时候,就开始跟训练里的每个点求距离,然后挑出离这个数据最近的K个点,看看这K个点属于什么类型,然后用少数服从多数的原则,给新数据归类。
import numpy as np
import matplotlib.pyplot as plt
x_train = np.random.rand(10,2)*8
y_train = np.random.randint(0,2,10)
x = np.array([3,4])
k=3
plt.scatter(x_train[y_train==1,0],x_train[y_train==1,1],color="red")
plt.scatter(x_train[y_train==0,0],x_train[y_train==0,1],color="green")
plt.scatter(x[0],x[1],marker='+',color="blue")
plt.show()
该算法的示意图,简单明了:
绿点为类别0,红点为类别1
下面的算法步骤取自于百度文库(文库是一个好东西),代码是参照这个思路实现的:
X_train = np.array(x_train)
Y_train = np.array(y_train)
from math import sqrt
distances = []
for x_train in X_train:
d = sqrt(np.sum((x-x_train)**2))
distances.append(d)
distances = [sqrt(np.sum((x-x_train)**2)) for x_train in X_train]
argindex = np.argsort(distances)
from collections import Counter
topK_Y = [Y_train[i] for i in argindex[:k]]
votes = Counter(topK_Y)
votes.most_common(1)[0][0]
执行结果为判断x点大概率为类别0(绿点)
code:
使用sklearn中封装的knn算法
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train,Y_train)
knn_clf.predict(x.reshape(1,-1))[0]
1 #include<stdio.h>
2 #include<math.h>
3 #include<stdlib.h>
4
5 #define K 3 //近邻数k
6 typedef float type;
7
8 //动态创建二维数组
9 type **createarray(int n,int m)
10 {
11 int i;
12 type **array;
13 array=(type **)malloc(n*sizeof(type *));
14 array[0]=(type *)malloc(n*m*sizeof(type));
15 for(i=1;i<n;i++) array[i]=array[i-1]+m;
16 return array;
17 }
18 //读取数据,要求首行格式为 N=数据量,D=维数
19 void loaddata(int *n,int *d,type ***array,type ***karray)
20 {
21 int i,j;
22 FILE *fp;
23 if((fp=fopen("data.txt","r"))==NULL) fprintf(stderr,"can not open data.txt!n");
24 if(fscanf(fp,"N=%d,D=%d",n,d)!=2) fprintf(stderr,"reading error!n");
25 *array=createarray(*n,*d);
26 *karray=createarray(2,K);
27
28 for(i=0;i<*n;i++)
29 for(j=0;j<*d;j++)
30 fscanf(fp,"%f",&(*array)[i][j]); //读取数据
31
32 for(i=0;i<2;i++)
33 for(j=0;j<K;j++)
34 (*karray)[i][j]=9999.0; //默认的最大值
35 if(fclose(fp)) fprintf(stderr,"can not close data.txt");
36 }
37 //计算欧氏距离
38 type computedistance(int n,type *avector,type *bvector)
39 {
40 int i;
41 type dist=0.0;
42 for(i=0;i<n;i++)
43 dist+=pow(avector[i]-bvector[i],2);
44 return sqrt(dist);
45 }
46 //冒泡排序
47 void bublesort(int n,type **a,int choice)
48 {
49 int i,j;
50 type k;
51 for(j=0;j<n;j++)
52 for(i=0;i<n-j-1;i++){
53 if(0==choice){
54 if(a[0][i]>a[0][i+1]){
55 k=a[0][i];
56 a[0][i]=a[0][i+1];
57 a[0][i+1]=k;
58 k=a[1][i];
59 a[1][i]=a[1][i+1];
60 a[1][i+1]=k;
61 }
62 }
63 else if(1==choice){
64 if(a[1][i]>a[1][i+1]){
65 k=a[0][i];
66 a[0][i]=a[0][i+1];
67 a[0][i+1]=k;
68 k=a[1][i];
69 a[1][i]=a[1][i+1];
70 a[1][i+1]=k;
71 }
72 }
73 }
74 }
75 //统计有序表中的元素个数
76 type orderedlist(int n,type *list)
77 {
78 int i,count=1,maxcount=1;
79 type value;
80 for(i=0;i<(n-1);i++) {
81 if(list[i]!=list[i+1]) {
82 //printf("count of %d is value %dn",list[i],count);
83 if(count>maxcount){
84 maxcount=count;
85 value=list[i];
86 count=1;
87 }
88 }
89 else
90 count++;
91 }
92 if(count>maxcount){
93 maxcount=count;
94 value=list[n-1];
95 }
96 //printf("value %f has a Maxcount:%dn",value,maxcount);
97 return value;
98 }
99
100 int main()
101 {
102 int i,j,k;
103 int D,N; //维度,数据量
104 type **array=NULL; //数据数组
105 type **karray=NULL; // K近邻点的距离及其标签
106 type *testdata; //测试数据
107 type dist,maxdist;
108
109 loaddata(&N,&D,&array,&karray);
110 testdata=(type *)malloc((D-1)*sizeof(type));
111 printf("input test data containing %d numbers:n",D-1);
112 for(i=0;i<(D-1);i++) scanf("%f",&testdata[i]);
113
114 while(1){
115 for(i=0;i<K;i++){
116 if(K>N) exit(-1);
117 karray[0][i]=computedistance(D-1,testdata,array[i]);
118 karray[1][i]=array[i][D-1];
119 //printf("first karray:%6.2f %6.0fn",karray[0][i],karray[1][i]);
120 }
121
122 bublesort(K,karray,0);
123 //for(i=0;i<K;i++) printf("after bublesort in first karray:%6.2f %6.0fn",karray[0][i],karray[1][i]);
124 maxdist=karray[0][K-1]; //初始化k近邻数组的距离最大值
125
126 for(i=K;i<N;i++){
127 dist=computedistance(D-1,testdata,array[i]);
128 if(dist<maxdist)
129 for(j=0;j<K;j++){
130 if(dist<karray[0][j]){
131 for(k=K-1;k>j;k--){ //j后元素复制到后一位,为插入做准备
132 karray[0][k]=karray[0][k-1];
133 karray[1][k]=karray[1][k-1];
134 }
135 karray[0][j]=dist; //插入到j位置
136 karray[1][j]=array[i][D-1];
137 //printf("i:%d karray:%6.2f %6.0fn",i,karray[0][j],karray[1][j]);
138 break; //不比较karray后续元素
139 }
140 }
141 maxdist=karray[0][K-1];
142 //printf("i:%d maxdist:%6.2fn",i,maxdist);
143 }
144 //for(i=0;i<K;i++) printf("karray:%6.2f %6.0fn",karray[0][i],karray[1][i]);
145 bublesort(K,karray,1);
146 //for(i=0;i<K;i++) printf("after bublesort in karray:%6.2f %6.0fn",karray[0][i],karray[1][i]);
147 printf("nThe data has a tag: %.0fnn",orderedlist(K,karray[1]));
148
149 printf("input test data containing %d numbers:n",D-1);
150 for(i=0;i<(D-1);i++) scanf("%f",&testdata[i]);
151 }
152 return 0;
153 }
9159.com ,封装自己的knn算法
# _*_ encoding:utf-8 _*_
import numpy as np
from math import sqrt
from collections import Counter
class KNNClassifier:
def __init__(self,k):
assert k>=1, "k must be valid"
self.k = k
self._X_train = None
self._Y_train = None
def fit(self,X_train,Y_train):
assert X_train.shape[0] == Y_train.shape[0],
"The size of X_train must be equals to the size of Y-Train"
assert self.k <= X_train.shape[0]
self._X_train = X_train
self._Y_train = Y_train
return self
def predict(self,x_predict):
return np.array([self._predict(x) for x in x_predict])
def _predict(self,x):
distances = [ sqrt(np.sum((x_train-x)**2)) for x_train in self._X_train]
nearest = np.argsort(distances)
votes = [i for i in self._Y_train[nearest[:self.k]]]
return Counter(votes).most_common(1)[0][0]
def __repr__(self):
return "knn(k=%d)" %self.k
测试与训练数据集分类
为了能够确认模型的准确性,我们需要将已有数据集按一定比例分类为测试数据集和训练数据集
# _*_ encoding:utf-8 _*_
import numpy as np
def train_test_split(X,y,test_radio=0.2,seed=None):
assert X.shape[0]==y.shape[0],"The size of X and y must be equal"
assert 0.0<=test_radio<=1.0,"test radio must be valid"
if(seed):
np.random.seed(seed)
shuffled_indexes = np.random.permutation(len(X))
test_size = int(X.shape[0]*test_radio)
test_indexes = shuffled_indexes[:test_size]
train_indexes = shuffled_indexes[test_size:]
X_test = X[test_indexes]
y_test = y[test_indexes]
X_train = X[train_indexes]
y_train = y[train_indexes]
return X_train,X_test,y_train,y_test
实验:
使用knn算法测试数据集digits
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib
%run MyScripts/KNN.py
%run MyScripts/metrics.py
%run MyScripts/model_selection.py
digits = datasets.load_digits()
X = digits.data
y = digits.target
some_digit = X[666]
some_digit_image = some_digit.reshape(8,8)
plt.imshow(some_digit_image,cmap=matplotlib.cm.binary)
画出第666个数据对应的数字图片
knn_clf = KNNClassifier(k=6)
X_train,X_test,y_train,y_test = train_test_split(X,y)
knn_clf.fit(X_train,y_train)
knn_clf.score(X_test,y_test)
训练数据data.txt:
超参数
超参数是模型运行前必须要决定的参数,例如k近邻算法中的k值和距离
确定超参数一般使用的方法:领域知识
经验数值
实验探索
N=6,D=9
1.0 1.1 1.2 2.1 0.3 2.3 1.4 0.5 1
1.7 1.2 1.4 2.0 0.2 2.5 1.2 0.8 1
1.2 1.8 1.6 2.5 0.1 2.2 1.8 0.2 1
1.9 2.1 6.2 1.1 0.9 3.3 2.4 5.5 0
1.0 0.8 1.6 2.1 0.2 2.3 1.6 0.5 1
1.6 2.1 5.2 1.1 0.8 3.6 2.4 4.5 0
确定knn算法用于digits数据集的最佳超参数
//使用网格搜索法确定weights和k超参数
best_k = -1
best_score = -1
methods = ["uniform","distance"]
best_method = ""
for method in methods:
for k in range(1,11):
knn_clf = KNeighborsClassifier(n_neighbors=k,weights=method)
knn_clf.fit(X_train,y_train)
score = knn_clf.score(X_test,y_test)
if(score>best_score):
best_k = k
best_score = score
best_method = method
print("best_k = ",best_k)
print("best_score = ",best_score)
print("best_method = ",best_method)
best_k = 3
best_score = 0.9888888888888889
best_method = uniform
best_k = -1
best_score = -1
best_p=-1
for p in range(1,6):
for k in range(1,11):
knn_clf = KNeighborsClassifier(n_neighbors=k,weights="distance",p=p)
knn_clf.fit(X_train,y_train)
score = knn_clf.score(X_test,y_test)
if(score>best_score):
best_k = k
best_score = score
best_p = p
print("best_k = ",best_k)
print("best_score = ",best_score)
print("best_p = ",best_p)
best_k = 3
best_score = 0.9888888888888889
best_p = 2
预测数据:
1.0 1.1 1.2 2.1 0.3 2.3 1.4 0.5
1.7 1.2 1.4 2.0 0.2 2.5 1.2 0.8
1.2 1.8 1.6 2.5 0.1 2.2 1.8 0.2
1.9 2.1 6.2 1.1 0.9 3.3 2.4 5.5
1.0 0.8 1.6 2.1 0.2 2.3 1.6 0.5
1.6 2.1 5.2 1.1 0.8 3.6 2.4 4.5
程序测试的结果:
1.0 1.1 1.2 2.1 0.3 2.3 1.4 0.5 类别为: 1
1.7 1.2 1.4 2.0 0.2 2.5 1.2 0.8 类别为: 1
1.2 1.8 1.6 2.5 0.1 2.2 1.8 0.2 类别为: 1
1.9 2.1 6.2 1.1 0.9 3.3 2.4 5.5 类别为: 0
1.0 0.8 1.6 2.1 0.2 2.3 1.6 0.5 类别为: 1
1.6 2.1 5.2 1.1 0.8 3.6 2.4 4.5 类别为: 0
实验截图:
本文由9159.com发布于编程,转载请注明出处: knn属于数据挖掘的分类算法,2的矩阵作为两
关键词:
下一篇:没有了