Machine Learning: An Ordinary Path (Part 10)

  1. Unsupervised learning: clustering and dimensionality reduction
  2. Concepts of semi-supervised, self-supervised, and generative learning

Clustering

Clustering lets the machine group the samples in a dataset according to the similarity of their features, without using any labels.

The K-means algorithm

  1. Choose the value of K (the number of clusters).
  2. Randomly pick K data points from the dataset to serve as the initial centroids.
  3. Iterate over every data point, compute its distance to each centroid, and assign it to the nearest one.
  4. Recompute each centroid as the mean of the points assigned to it, then repeat steps 3-4 until the centroids stop moving (convergence). A from-scratch sketch follows this list.
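
As a complement to the sklearn version used later, here is a minimal from-scratch sketch of these four steps in NumPy. The function name kmeans_naive and the convergence tolerance are my own choices, not from the original post:

import numpy as np

def kmeans_naive(X, k, n_iters=100, tol=1e-4, seed=0):
    rng = np.random.default_rng(seed)
    # Step 2: pick K random points as the initial centroids
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iters):
        # Step 3: distance from every point to every centroid, then assign
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Step 4: new centroid = mean of the points in each cluster
        # (assumes no cluster goes empty, which a production version should handle)
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.linalg.norm(new_centroids - centroids) < tol:  # converged
            break
        centroids = new_centroids
    return labels, centroids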

Choosing K

The elbow method is based on a measure of clustering quality: the total within-cluster sum of squared distances (what sklearn calls inertia). When K is very small the overall loss is large; as K grows the loss curve drops and then bends at an "elbow". The K at that bend is a good choice.
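
For concreteness, a minimal sketch of what that cost is: the sum of squared distances from each point to its assigned centroid, which is exactly what sklearn exposes as inertia_. The helper name wcss is my own:

import numpy as np

def wcss(X, labels, centroids):
    # within-cluster sum of squares: the quantity the elbow plot tracks
    return sum(((X[labels == j] - c) ** 2).sum()
               for j, c in enumerate(centroids))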

Clustering in practice: segmenting customers

import numpy as np
import pandas as pd
dataset = pd.read_csv('Customers Cluster.csv')
dataset.head()

[Figure: first rows of the customer dataset]

X = dataset.iloc[:, [3, 4]].values  # cluster on two features (income and spending score)

from sklearn.cluster import KMeans

cost = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X)
    cost.append(kmeans.inertia_)  # within-cluster sum of squares for this K

import matplotlib.pyplot as plt
import seaborn as sns
plt.plot(range(1, 11), cost)
plt.title('The elbow method')
plt.xlabel('No of clusters')
plt.ylabel('cost')
plt.show()

[Figure: elbow plot of cost versus number of clusters]

The curve bends around K = 3 to 4, so 3 or 4 clusters is optimal; we proceed with 4 below.
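
The plotting code below references y_kmeans, which the post never defines; presumably the final model was fitted like this (a sketch assuming K = 4, matching the four clusters plotted):

kmeans = KMeans(n_clusters=4, init='k-means++', random_state=0)
y_kmeans = kmeans.fit_predict(X)  # cluster index (0-3) for each customer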

# Visualize the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1],
            s=100, c='cyan', label='Cluster 1')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1],
            s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1],
            s=100, c='green', label='Cluster 3')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1],
            s=100, c='red', label='Cluster 4')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=200, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Income')
plt.ylabel('Spending Score')
plt.legend()
plt.show()

[Figure: scatter plot of the four customer clusters and their centroids]

The PCA algorithm

The most common dimensionality-reduction algorithm is Principal Component Analysis (PCA). An orthogonal transformation converts the original, possibly correlated variables into a set of linearly uncorrelated components. It extracts the dominant feature directions of the data, in order to compress it or make it easier to visualize.
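
A minimal sketch of that orthogonal transformation using NumPy's SVD (the function name pca_svd is my own; it keeps the top k components of the centered data):

import numpy as np

def pca_svd(X, k):
    X_centered = X - X.mean(axis=0)  # center each feature
    U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
    components = Vt[:k]  # the top-k orthogonal directions (principal axes)
    return X_centered @ components.T, components  # projected data, axes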

Dimensionality reduction on the handwritten-digit dataset

from keras.datasets import mnist
(X_train_image, y_train_label), (X_test_image, y_test_label) = mnist.load_data()

for each in range(10):
    plt.subplot(2, 5, each + 1)
    plt.imshow(X_train_image[each].reshape(28, 28))
plt.show()

[Figure: the first ten MNIST training digits]

from sklearn.decomposition import PCA

X = X_train_image.reshape(len(X_train_image), -1)  # flatten 28x28 images to 784-dim vectors
n_components = 5
(n_samples, n_features) = X.shape
pca = PCA(
    n_components=n_components,
    svd_solver='randomized', whiten=True
)
X_pca = pca.fit_transform(X)
components_ = pca.components_  # the principal axes of the reduced model
images = components_[:n_components]
plt.figure(figsize=(6, 5))
for i, comp in enumerate(images):
    plt.subplot(1, n_components, i + 1)  # one panel per component
    vmax = max(comp.max(), -comp.min())
    plt.imshow(comp.reshape((28, 28)),
               interpolation='nearest', vmin=-vmax, vmax=vmax)
    plt.xticks(())
    plt.yticks(())
plt.savefig('graph.png')
plt.show()

[Figure: the five principal components rendered as 28x28 images]

You can see that the information used to recognize digits is concentrated in these few central regions of the image.


The features are reduced from 784 dimensions down to 5.
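
To get a feel for how much those 5 components preserve, you can project a digit back into pixel space with sklearn's inverse_transform, reusing the pca and X_pca fitted above (this reconstruction step is my addition, not part of the original post):

X_restored = pca.inverse_transform(X_pca)  # back from 5 to 784 dimensions
plt.subplot(1, 2, 1)
plt.imshow(X[0].reshape(28, 28))           # the original digit
plt.subplot(1, 2, 2)
plt.imshow(X_restored[0].reshape(28, 28))  # reconstruction from 5 components
plt.show()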

Implementing a GAN

The full implementation below follows a reference article.

from __future__ import print_function, division
import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, Conv2D
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt

import sys
import os
import numpy as np

class GAN():
    def __init__(self):
        # --------------------------------------- #
        #  28 rows by 28 columns: the MNIST shape
        # --------------------------------------- #
        self.img_rows = 28
        self.img_cols = 28
        self.channels = 1
        # (28, 28, 1)
        self.img_shape = (self.img_rows, self.img_cols, self.channels)
        self.latent_dim = 100  # dimension of the noise vector
        # Adam optimizer
        optimizer = Adam(0.0002, 0.5)

        # ---------------------- the discriminator ---------------------- #
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=optimizer,
                                   metrics=['accuracy'])

        # ------------------------ the generator ------------------------ #
        self.generator = self.build_generator()
        gan_input = Input(shape=(self.latent_dim,))
        img = self.generator(gan_input)  # Model(noise, img)
        # Freeze the discriminator while training the generator
        self.discriminator.trainable = False
        # Predict real/fake on the generated images
        validity = self.discriminator(img)  # Model(img, validity)
        self.combined = Model(gan_input, validity)  # chain generator and discriminator
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)

    def build_generator(self):
        # ---------------------------------------------------- #
        #  Generator: takes a random vector, outputs an image
        # ---------------------------------------------------- #
        model = Sequential()

        model.add(Dense(256, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))

        # Output a flat vector of image size, 28*28*1
        model.add(Dense(np.prod(self.img_shape), activation='tanh'))
        # Then reshape it to (28, 28, 1)
        model.add(Reshape(self.img_shape))

        noise = Input(shape=(self.latent_dim,))
        img = model(noise)

        return Model(noise, img)

    def build_discriminator(self):
        # ----------------------------------------------------------- #
        #  Discriminator: judges whether an input image is real or fake
        # ----------------------------------------------------------- #
        model = Sequential()
        # Input: one image
        model.add(Flatten(input_shape=self.img_shape))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        # Output a probability: 1 for real, 0 for fake
        model.add(Dense(1, activation='sigmoid'))

        img = Input(shape=self.img_shape)
        validity = model(img)

        return Model(img, validity)

    # The training loop
    def train(self, epochs, batch_size=128, sample_interval=50):
        # Load the data
        (X_train, _), (_, _) = mnist.load_data()

        # Normalize to the range (-1, 1)
        X_train = X_train / 127.5 - 1.
        X_train = np.expand_dims(X_train, axis=3)  # (28, 28) --> (28, 28, 1)

        # Create the labels
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):

            # --------------------------------------- #
            #  Randomly pick batch_size images and
            #  train the discriminator on them
            # --------------------------------------- #
            idx = np.random.randint(0, X_train.shape[0], batch_size)  # batch_size random indices into the training set
            imgs = X_train[idx]  # the randomly selected images

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))  # batch_size noise vectors of dimension latent_dim, drawn from N(0, 1)

            gen_imgs = self.generator.predict(noise)  # generate fake images from the noise
            # Discriminator loss = 0.5 * (loss on real images + loss on fake images)
            d_loss_real = self.discriminator.train_on_batch(imgs, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_imgs, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # --------------------------- #
            #  Train the generator
            # --------------------------- #
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))  # fresh noise for extra randomness; the trained weights carry over
            g_loss = self.combined.train_on_batch(noise, valid)
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))

            if epoch % sample_interval == 0:
                self.sample_images(epoch)

    def sample_images(self, epoch):
        # Sample 25 images
        r, c = 5, 5
        noise = np.random.normal(0, 1, (r * c, self.latent_dim))
        gen_imgs = self.generator.predict(noise)

        gen_imgs = 0.5 * gen_imgs + 0.5  # rescale from (-1, 1) to (0, 1)

        fig, axs = plt.subplots(r, c)
        cnt = 0
        for i in range(r):
            for j in range(c):
                axs[i, j].imshow(gen_imgs[cnt, :, :, 0], cmap='gray')
                axs[i, j].axis('off')
                cnt += 1
        fig.savefig("images/%d.png" % epoch)
        plt.close()


if __name__ == '__main__':
    if not os.path.exists("./images"):
        os.makedirs("./images")
    gan = GAN()
    gan.train(epochs=30000, batch_size=256, sample_interval=200)

[Figure: a 5x5 grid of generated digits]

These images were generated after 5,400 training iterations; they are still some way from passing for real handwriting.

The idea

1. Randomly select batch_size real images.
2. Randomly generate batch_size N-dimensional noise vectors and feed them through the Generator to produce batch_size fake images.
3. Label the real images 1 and the fake images 0, then train the Discriminator on both sets together; the training loss is binary cross-entropy.
4. Train the Generator by comparing the Discriminator's predictions on the fake images against the label 1 (the intuition: if the Discriminator scores a fake image as 1, the generated image looks "real"); this loss is likewise binary cross-entropy.

Reinforcement Learning