# 机器学习(周志华) 西瓜书 第九章课后习题9.10—— Python实现

• ## 实验原理

K均值算法原理

K均值算法

E值越小则簇内样本相似度越高，簇间样本相似度越低，且k值保证是较小的值，即簇类尽可能保证是大型簇类（这里考虑样本只有两种类别，所以k值应趋近于2）；

• ## 实验过程

### 数据集获取

```            ```
编号,密度,含糖率
1,0.697,0.460
2,0.774,0.376
3, 0.634,0.264
4,0.608,0.318
5,0.556,0.215
6,0.403,0.237
7,0.481,0.149
8,0.437,0.211
9,0.666,0.091
10,0.243,0.267
11,0.245,0.057
12,0.343,0.099
13,0.639,0.161
14,0.657,0.198
15,0.360,0.370
16,0.593,0.042
17,0.719,0.103
18,0.359,0.188
19,0.339,0.241
20,0.282,0.257
21,0.748,0.232
22,0.714,0.346
23,0.483,0.312
24,0.478,0.437
25,0.525,0.369
26,0.751,0.489
27,0.532,0.472
28,0.473,0.376
29,0.725,0.445
30,0.446,0.459
```
```

### 算法实现

main函数，调用上述函数，输出自动确定k值后的划分结果

• ## 程序清单：

```            ```
import random as rd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataSet.drop(columns=['编号'], inplace=True)
return dataSet

#计算每个向量和均值向量之间的距离
def calc_distance(x, mu):
distance = 0
for xi, yi in zip(x, mu):
distance += (xi-yi)**2

return distance**(.5)

#根据目前的簇类计算出均值向量
def calc_mu(dataSet, indexs):
Ci = dataSet.loc[indexs]
return np.array(Ci.mean())

def k_means(dataSet, k, iterate=100):
Mu_indexs = rd.sample(range(dataSet.shape), k)
Mu = [np.array(dataSet.loc[index]) for index in Mu_indexs]
now, flag = 0, True

while flag and now < iterate:
C = [[] for _ in range(k)]
for index, row in dataSet.iterrows():
distances = []
for mu in Mu:
# x = np.array(dataSet.loc[index])
distance = calc_distance(row, mu)
distances.append(distance)

label = np.argmin(distances)
C[label].append(index)

flag = False
for i in range(len(Mu)):
new_mu = calc_mu(dataSet, C[i])
if (new_mu!=Mu[i]).any():
flag = True
Mu[i] = new_mu

now += 1

return C, Mu

def calc_E(dataSet, C, Mu, k):
E_inside, E_outside, size= 0, 0, dataSet.shape
#簇内
for Ci, mu in zip(C, Mu):
for index in Ci:
distance = calc_distance(dataSet.loc[index], mu)
E_inside += distance**2
# 正则化保持权重
E_inside /= size

# 簇间
for a in range(k):
for b in range(k):
if a == b:
continue
distance = calc_distance(Mu[a], Mu[b])
E_outside += distance**2

E_outside /= k
return E_inside - E_outside + 2*k/size

def Dynamic_K_means(dataSet):
size, before_E = len(dataSet), 9999
for k in range(2, size):
Es = []
# 计算多次k均值，取平方误差平均值
for time in range(10):
C, Mu = k_means(dataSet, k)
E = calc_E(dataSet, C, Mu, k)
Es.append(E)
Best_E = sum(Es)/len(Es)

if before_E <= Best_E:
return k-1
else:
before_E = Best_E

return 1

if __name__=='__main__':
filename = 'data_4.txt'
k = Dynamic_K_means(dataSet)

Best_E = 9999
# 多次计算，取最好结果
for _ in range(10):
C, Mu= k_means(dataSet, k)
E = calc_E(dataSet, C, Mu, k)
if E < Best_E:
Best_E = E
Best_C = C

print('k =', k)
for Ci in Best_C:
print(Ci)

```
``` QQ号联系： 360901061

【本文对您有帮助就好】  