0%

Part1-Kmeans和PCA练习

Part1-Kmeans和PCA练习

K-means

1.1 可视化数据

1
2
3
4
5
6
7
8
from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the two-column sample matrix stored under 'X' and wrap it in a
# DataFrame so the columns can be addressed by name (X1, X2).
data = pd.DataFrame(loadmat('ex7data2.mat')['X'], columns=['X1', 'X2'])
# Scatter plot of the raw, unlabelled samples.
data.plot(x='X1', y='X2', kind='scatter')
plt.show()

1.2 随机初始化

1
2
def random_init(X, K):
    """Pick K rows of X at random to serve as initial cluster centres.

    X must offer a pandas-style ``sample`` method (a DataFrame in this
    file). The sampled rows are returned as a plain array via ``.values``.
    """
    picked_rows = X.sample(n=K)
    return picked_rows.values

1.3 寻找距离X最近的聚类中心

1
2
3
4
5
6
7
8
9
10
11
12
def find_closest_center(X, u):
    """Assign every sample in X to its nearest cluster centre.

    Parameters
    ----------
    X : DataFrame or array-like of shape (m, n)
        Samples, one per row.
    u : array-like of shape (K, n)
        Current cluster centres, one per row.

    Returns
    -------
    numpy.ndarray of shape (m,)
        Float array whose i-th entry is the index of the centre with the
        smallest squared Euclidean distance to sample i. Ties go to the
        lowest index. A centre containing NaN is never selected, and a
        sample that matches no centre keeps index 0.
    """
    samples = np.asarray(X, dtype=float)
    centers = np.asarray(u, dtype=float)
    m = samples.shape[0]
    c = np.zeros(m)
    # Start from +inf rather than a fixed cap such as 1e6, so samples that
    # lie far away from every centre are still assigned correctly.
    best = np.full(m, np.inf)
    # Loop over the (few) centres and vectorise over the (many) samples.
    for j, center in enumerate(centers):
        distance = np.sum((samples - center) ** 2, axis=1)
        closer = distance < best  # strict '<' keeps the first centre on ties
        best[closer] = distance[closer]
        c[closer] = j
    return c

1.4 计算聚类中心

1
2
3
4
5
6
7
def compute_center(X, c, k):
    """Recompute each cluster centre as the mean of its assigned samples.

    Parameters
    ----------
    X : pandas.DataFrame of shape (m, n)
        Samples, one per row.
    c : numpy.ndarray of shape (m,)
        Cluster index assigned to each sample.
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray of shape (k, n)
        Row i is the column-wise mean of the samples with ``c == i``.
        A cluster without any member yields a row of NaN.
    """
    n_features = X.shape[1]
    u = np.zeros((k, n_features))
    for i in range(k):
        member_rows = np.where(c == i)[0]
        # Average the rows of the X that was passed in, not the
        # module-level ``data``, so the function works for any data set.
        u[i] = X.iloc[member_rows, :].mean()
    return u

1.5 kmeans聚类

1
2
3
4
5
6
7
8
9
def kmeans(X, k, epoch=10):
    """Cluster the rows of X into k groups using a fixed number of rounds.

    Centres are seeded with ``random_init`` and then refined ``epoch``
    times by alternating ``find_closest_center`` (assignment step) and
    ``compute_center`` (update step).

    Returns a tuple ``(c, u)``: the assignment of every sample to a centre
    and the final centres themselves.
    """
    centers = random_init(X, k)
    for _ in range(epoch):
        assignment = find_closest_center(X, centers)
        centers = compute_center(X, assignment, k)
    # One more assignment pass so the returned labels match the centres
    # produced by the last update step (also covers epoch == 0).
    return find_closest_center(X, centers), centers

1.6 结果可视化显示

1
2
3
4
5
6
7
8
# Run K-means first: the plot below needs the assignment vector ``c``,
# which is not computed anywhere earlier. K=3 matches the three groups drawn.
c, u = kmeans(data, 3)
cluster1 = data[c == 0]
cluster2 = data[c == 1]
cluster3 = data[c == 2]
_, ax = plt.subplots()
# Draw every cluster in its own colour on the same axes.
for members, color in zip((cluster1, cluster2, cluster3), ('r', 'g', 'b')):
    ax.scatter(members.iloc[:, 0], members.iloc[:, 1], c=color)
plt.show()

使用K-means进行图像压缩

2.1 图像读取

1
2
3
4
from skimage import io
# Read the image file into an array; later sections reuse it as ``pic``.
pic = io.imread('bird_small.png')
# Display the unmodified image through skimage's io helpers.
io.imshow(pic)
io.show()

2.2 图像压缩

1
2
3
# Scale the channel values into [0, 1]. Assumes an 8-bit image, whose
# largest channel value is 255 (the same divisor the sklearn section uses).
pic = pic / 255
# Flatten to one row per pixel, one column per colour channel.
new_pic = pic.reshape(-1, pic.shape[2])
# Cluster all pixels into 16 representative colours.
c, u = kmeans(pd.DataFrame(new_pic), 16)

2.3 图像恢复

1
2
3
4
# Replace every pixel by the colour of the centre it was assigned to.
# The labels are cast to int so they can be used as row indices into ``u``.
X_recovered = u[c.astype(int)]
# Restore the original image layout before displaying the result.
pic_recovered = X_recovered.reshape(pic.shape)
plt.imshow(pic_recovered)
plt.show()

2.4 使用sklearn中的KMeans实现

1
2
3
4
5
6
7
8
9
10
11
pic = io.imread('bird_small.png')
io.imshow(pic)
# One row per pixel with three colour channels, scaled into [0, 1].
data = pic.reshape(-1, 3)/255
from sklearn.cluster import KMeans
# ``n_jobs`` is no longer a KMeans parameter (deprecated in scikit-learn
# 0.23, removed in 1.0); passing it raises TypeError on current releases.
model = KMeans(n_clusters=16, n_init=100)
model.fit(data)
center = model.cluster_centers_
# Index of the nearest centre for every pixel.
C = model.predict(data)
# Paint each pixel with its centre colour and restore the image layout.
pic_recovered = center[C].reshape(pic.shape)
plt.imshow(pic_recovered)
plt.show()

PCA

3.1 数据可视化

1
2
3
4
5
6
7
from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the .mat file; the samples are read from its 'X' entry below.
data = loadmat('ex7data1.mat')
# Scatter the first column of X against the second column.
plt.scatter(data['X'][:,0], data['X'][:,1])
plt.show()

3.2 PCA算法数据压缩与还原

1
2
3
4
5
6
7
def pca(X, k):
    """Project X onto the first k left-singular vectors of ``X.T @ X / m``.

    Parameters
    ----------
    X : array of shape (m, n)
        Samples, one per row.
    k : int
        Number of components to keep.

    Returns
    -------
    (Z, U) : tuple
        ``Z`` is the (m, k) projection ``X @ U[:, :k]``; ``U`` is the full
        (n, n) matrix of singular vectors, returned untruncated.

    NOTE(review): X is used as-is, no mean is subtracted here; presumably
    the caller is expected to centre the data first - confirm.
    """
    sample_count = X.shape[0]
    scatter = (X.T @ X) / sample_count
    # Only the left singular vectors are needed; the rest is discarded.
    U, _, _ = np.linalg.svd(scatter)
    projected = X @ U[:, :k]
    return projected, U
def recover_from_pca(Z, U, k):
    """Map a k-dimensional projection Z back into the original space.

    Multiplies Z by the transpose of the first k columns of U, i.e. the
    same columns ``pca`` used to produce the projection.
    """
    basis = U[:, :k]
    return Z @ basis.T
1
2
3
4
5
X = data['X']
# pca() builds X.T @ X / m, which equals the covariance matrix only when
# every column has zero mean, so centre the data before projecting.
mu = X.mean(axis=0)
Z, U = pca(X - mu, 1)
# Map the 1-D projection back to 2-D and undo the centring.
X_recovered = recover_from_pca(Z, U, 1) + mu
plt.scatter(X_recovered[:, 0], X_recovered[:, 1])
plt.show()

PCA人脸图片压缩与还原

4.1 数据读取与可视化

1
2
3
4
5
6
from scipy.io import loadmat
import matplotlib.pyplot as plt
import numpy as np
# Keep only the 'X' entry of the .mat file.
data = loadmat('ex7faces.mat')['X']
# Show row 3 as a 32x32 grayscale image.
plt.imshow(data[3,:].reshape(32, 32), cmap='gray')
plt.show()

4.2 PCA压缩与恢复

1
2
3
4
5
X = data
# pca() builds X.T @ X / m, which equals the covariance matrix only when
# every column has zero mean, so centre the data before projecting.
mu = X.mean(axis=0)
Z, U = pca(X - mu, 100)
# Rebuild the rows from 100 components and undo the centring.
X_recovered = recover_from_pca(Z, U, 100) + mu
# Show the same row (3) as the original-image plot for comparison.
plt.imshow(X_recovered[3, :].reshape(32, 32), cmap='gray')
plt.show()
image-20210718174018954
image-20210718174018954

4.3 使用sklearn中的PCA实现

1
2
3
4
5
6
7
8
9
from scipy.io import loadmat
from sklearn.decomposition import PCA
data = loadmat('ex7faces.mat')['X']
sk_pca = PCA(n_components=100)
X = data
# Project onto 100 components, then map back to the original space.
Z = sk_pca.fit_transform(X)
# Store the reconstruction under the name that is actually plotted;
# previously it went to ``X_recover`` while the plot read ``X_recovered``.
X_recovered = sk_pca.inverse_transform(Z)
plt.imshow(X_recovered[3, :].reshape(32, 32), cmap='gray')
plt.show()