
TensorFlow in Practice: Classification Problems

This section gives a brief introduction to classification, from binary to multi-class problems, together with example implementations, and also covers the basics of handling two-dimensional data with Pandas.

Implementing Univariate Logistic Regression (Binary Classification)

  • Problem: classify houses as ordinary or high-end based on a single attribute, the floor area
  • Idea: linear regression + the sigmoid function, i.e. map the linear-regression output into the range 0-1 and apply a threshold to obtain a binary (0/1) decision. My own reading: treat the mapped value as a probability, so the higher the price predicted by linear regression, the higher the probability that the house is high-end (see the brief sketch after this list).
  • Steps:
    • Load the data
    • Preprocess the data
    • Set the hyperparameters: learning rate and number of iterations
    • Initialize the model parameters
    • Train the model
    • Predict: apply a threshold for the binary decision
    • Visualize the output
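The following minimal sketch (toy scores, not the housing data used below) illustrates just this mapping: a linear score is squashed into (0, 1) by the sigmoid and then thresholded at 0.5.

import tensorflow as tf
scores = tf.constant([-3.0, -0.5, 0.0, 2.0])  # hypothetical linear outputs w*x + b
probs = tf.sigmoid(scores)                    # map each score into (0, 1); read as P(high-end)
labels = tf.where(probs < 0.5, 0, 1)          # threshold at 0.5 to get the 0/1 class
print(probs.numpy())                          # approximately [0.047 0.378 0.5   0.881]
print(labels.numpy())                         # [0 0 1 1]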
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
#1. Load the data
x=np.array([137.97,104.50,100.00,126.32,79.20,99.00,124.00,114.00,106.69,140.05,53.75,46.91,68.00,63.02,81.26,86.21])
y=np.array([1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0])
#2. Preprocess the data: centering (subtract the mean from every sample)
x_train=x-np.mean(x)
y_train=y
#3. Set the hyperparameters: learning rate and number of iterations
learn_rate=0.005
iter=5
display_step=1 #print the results once per iteration
#4. Initialize the model parameters w, b
np.random.seed(612)
w=tf.Variable(np.random.randn())
b=tf.Variable(np.random.randn())
x_=range(-80,80)
y_=1/(1+tf.exp(-(w*x_+b)))
plt.figure(figsize=(12,4))
plt.subplot(121)
plt.scatter(x_train,y_train)
plt.plot(x_,y_,color="red",linewidth=3) #sigmoid curve for the initial parameters
#5. Train the model
cross_train=[] #cross-entropy loss on the training set
acc_train=[]   #accuracy on the training set
for i in range(0,iter+1):
    with tf.GradientTape() as tape:
        pred_train=1/(1+tf.exp(-(w*x_train+b)))
        Loss_train=-tf.reduce_mean(y_train*tf.math.log(pred_train)+(1-y_train)*tf.math.log(1-pred_train))
        Accuracy_train=tf.reduce_mean(tf.cast(tf.equal(tf.where(pred_train<0.5,0,1),y_train),tf.float32)) #tf.equal returns booleans, so cast to float32 before averaging
    cross_train.append(Loss_train)
    acc_train.append(Accuracy_train)
    dL_dw,dL_db=tape.gradient(Loss_train,[w,b])
    w.assign_sub(learn_rate*dL_dw)
    b.assign_sub(learn_rate*dL_db)
    if i % display_step==0:
        print("i:%i\tTrain Loss:%f\tAccuracy Train:%f" % (i,Loss_train,Accuracy_train))
        y_=1/(1+tf.exp(-(w*x_+b)))
        plt.plot(x_,y_) #plot the sigmoid curve as it changes over the iterations
print(80*"-")
#6. Predict
x_test=[128.15,45.00,141.43,106.27,99.00,53.84,85.36,70.00,162.00,114.60]
pred_test=1/(1+tf.exp(-(w*(x_test-np.mean(x))+b)))
y_test=tf.where(pred_test<0.5,0,1) #threshold at 0.5 to get the binary class
for i in range(len(x_test)):
    print(x_test[i],'\t',pred_test[i].numpy(),'\t',y_test[i].numpy(),'\t')
#Visualize the classified test data
plt.subplot(122)
plt.scatter(x_test,y_test)
x_=np.array(range(-80,80))
y_=1/(1+tf.exp(-(w*x_+b)))
plt.plot(x_+np.mean(x),y_)
plt.show()

The results are as follows:

i:0    Train Loss:0.852807    Accuracy Train:0.625000
i:1    Train Loss:0.400259    Accuracy Train:0.875000
i:2    Train Loss:0.341504    Accuracy Train:0.812500
i:3    Train Loss:0.322571    Accuracy Train:0.812500
i:4    Train Loss:0.313972    Accuracy Train:0.812500
i:5    Train Loss:0.309411    Accuracy Train:0.812500
--------------------------------------------------------------------------------
128.15      0.8610252      1     
45.0      0.0029561974      0     
141.43      0.9545566      1     
106.27      0.45318928      0     
99.0      0.29813623      0     
53.84      0.00663888      0     
85.36      0.108105935      0     
70.0      0.028681064      0     
162.0      0.9928677      1     
114.6      0.6406205      1     

[Figure output_1_1: sigmoid curves over the training samples as training progresses (left) and the classified test samples (right)]


Implementing Multivariate Logistic Regression (Binary Classification)

  • Problem: classify Iris setosa and Iris versicolor using two attributes, sepal length and sepal width
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
#1. Download the iris dataset
#Training set
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
train_path = tf.keras.utils.get_file("iris_train.csv",TRAIN_URL,cache_dir="D:\App_Data_File\Anaconda_data\jupyter\TensorFlow")
#Test set
TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
test_path = tf.keras.utils.get_file("iris_test.csv",TEST_URL,cache_dir="D:\App_Data_File\Anaconda_data\jupyter\TensorFlow")
df_iris_train=pd.read_csv(train_path,header=0)
df_iris_test=pd.read_csv(test_path,header=0)
#2. Preprocess the data
#Convert to numpy arrays
iris_train=np.array(df_iris_train)
iris_test=np.array(df_iris_test)
#Take the first two attributes (sepal length and sepal width)
train_x=iris_train[:,0:2]
test_x=iris_test[:,0:2]
#Take the labels
train_y=iris_train[:,4]
test_y=iris_test[:,4]
#Keep only setosa and versicolor (labels 0 and 1)
x_train=train_x[train_y<2]
y_train=train_y[train_y<2]
x_test=test_x[test_y<2]
y_test=test_y[test_y<2]
#Record the sample counts
num_train=len(x_train)
num_test=len(x_test)
#Visualize the samples
cm_pt=mpl.colors.ListedColormap(["blue","red"])
#plt.scatter(x_train[:,0],x_train[:,1],c=y_train,cmap=cm_pt) #sepal length and width as the x and y axes
#Center the attributes
x_train=x_train-np.mean(x_train,axis=0)
x_test=x_test-np.mean(x_test,axis=0)
#Build the attribute matrices and label column vectors for the multivariate model
x0_train=np.ones(num_train).reshape(-1,1)
X_train=tf.cast(tf.concat((x0_train,x_train),axis=1),tf.float32)
Y_train=y_train.reshape(-1,1)
x0_test=np.ones(num_test).reshape(-1,1)
X_test=tf.cast(tf.concat((x0_test,x_test),axis=1),tf.float32)
Y_test=y_test.reshape(-1,1)
#4. Set the hyperparameters: learning rate and number of iterations
learn_rate=0.2
iter=120
display_step=30 #print the results every 30 iterations
#5. Initialize the model parameters W
np.random.seed(612)
W=tf.Variable(np.random.randn(3,1),dtype=tf.float32)
x_=[-1.5,1.5]
y_=-(W[0]+W[1]*x_)/W[2]
plt.figure(figsize=(12,4))
plt.subplot(121)
plt.scatter(x_train[:,0],x_train[:,1],c=y_train,cmap=cm_pt)
plt.plot(x_,y_,color="red",linewidth=3)
plt.xlim(-1.5,1.5)
plt.ylim(-1.5,1.5)
plt.subplot(122)
plt.scatter(x_test[:,0],x_test[:,1],c=y_test,cmap=cm_pt)
plt.plot(x_,y_,color="red",linewidth=3)
plt.xlim(-1.5,1.5)
plt.ylim(-1.5,1.5)
#6. Train the model
ce_train=[]  #cross-entropy loss on the training set
acc_train=[] #accuracy on the training set
ce_test=[]   #cross-entropy loss on the test set
acc_test=[]  #accuracy on the test set
for i in range(0,iter+1):
    with tf.GradientTape() as tape:
        PRED_train=1/(1+tf.exp(-tf.matmul(X_train,W)))
        Loss_train=-tf.reduce_mean(Y_train*tf.math.log(PRED_train)+(1-Y_train)*tf.math.log(1-PRED_train))
        PRED_test=1/(1+tf.exp(-tf.matmul(X_test,W)))
        Loss_test=-tf.reduce_mean(Y_test*tf.math.log(PRED_test)+(1-Y_test)*tf.math.log(1-PRED_test))
        Accuracy_train=tf.reduce_mean(tf.cast(tf.equal(tf.where(PRED_train<0.5,0,1),Y_train),tf.float32))
        Accuracy_test=tf.reduce_mean(tf.cast(tf.equal(tf.where(PRED_test<0.5,0,1),Y_test),tf.float32))
    ce_train.append(Loss_train)
    acc_train.append(Accuracy_train)
    ce_test.append(Loss_test)
    acc_test.append(Accuracy_test)
    dL_dW=tape.gradient(Loss_train,W)
    W.assign_sub(learn_rate*dL_dW)
    if i % display_step==0:
        print("i:%i\tTrain Loss:%f\tAccuracy Train:%f\t\tTest Loss:%f\tAccuracy Test:%f" % (i,Loss_train,Accuracy_train,Loss_test,Accuracy_test))
        y_=-(W[0]+W[1]*x_)/W[2]
        plt.subplot(121)
        plt.plot(x_,y_)
        plt.subplot(122)
        plt.plot(x_,y_)
#7. Visualize the results
plt.figure(figsize=(12,4))
plt.subplot(121)
plt.plot(ce_train,color="blue",label="Loss_train")
plt.plot(ce_test,color="red",label="Loss_test")
plt.legend()
plt.subplot(122)
plt.plot(acc_train,color="blue",label="Accuracy_train")
plt.plot(acc_test,color="red",label="Accuracy_test")
plt.legend()

The results are as follows:

i:0    Train Loss:0.994269    Accuracy Train:0.230769        Test Loss:0.939684    Accuracy Test:0.272727
i:30    Train Loss:0.481892    Accuracy Train:0.961538        Test Loss:0.505456    Accuracy Test:0.863636
i:60    Train Loss:0.319128    Accuracy Train:0.987179        Test Loss:0.362112    Accuracy Test:0.863636
i:90    Train Loss:0.246626    Accuracy Train:0.987179        Test Loss:0.295611    Accuracy Test:0.863636
i:120    Train Loss:0.204982    Accuracy Train:1.000000        Test Loss:0.256212    Accuracy Test:0.863636

[Figure output_3_2: decision boundaries during training, over the training set (left) and the test set (right)]

[Figure output_3_3: loss curves (left) and accuracy curves (right) for the training and test sets]

  • Principle of drawing the decision boundary: the boundary is where the predicted probability equals 0.5, i.e. sigmoid(W0 + W1*x1 + W2*x2) = 0.5, which is equivalent to W0 + W1*x1 + W2*x2 = 0, so x2 = -(W0 + W1*x1)/W2. This is the straight line plotted in the code above. [Figure: image-20221107113341906]

Drawing a Classification Map

1. Drawing filled patches

  • Generate grid coordinate matrices: np.meshgrid()
  • Fill the grid: plt.pcolormesh()
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
n=200
x=np.linspace(-10,10,n)
y=np.linspace(-10,10,n)
X,Y=np.meshgrid(x,y)
Z=X+Y
plt.pcolormesh(X,Y,Z,cmap="rainbow")
plt.show()

The results are as follows:

[Figure output_7_0: pcolormesh of Z = X + Y with the rainbow colormap]
#Custom background colors
cm_bg=mpl.colors.ListedColormap(["#FFA0A0","#A0FFA0"])
plt.pcolormesh(X,Y,Z,cmap=cm_bg)
plt.show()

The results are as follows:

[Figure output_8_0: the same grid filled with the custom two-color background]

2. Drawing contour lines

  • plt.contour() # draws contour lines
  • plt.contourf() # draws filled contours
n=200
x=np.linspace(-10,10,n)
y=np.linspace(-10,10,n)
X,Y=np.meshgrid(x,y)
Z=X**2+Y**2
plt.figure(figsize=(12,5))
plt.subplot(121)
plt.contour(X,Y,Z,cmap="rainbow")
plt.subplot(122)
plt.contourf(X,Y,Z,20,cmap="rainbow")
plt.show()

The results are as follows:

[Figure output_10_0: contour lines (left) and 20 filled contour levels (right) of Z = X^2 + Y^2]

3. Drawing the classification map for the iris classification model

M=300
x1_min,x2_min=x_train.min(axis=0)
x1_max,x2_max=x_train.max(axis=0)
t1=np.linspace(x1_min,x1_max,M)
t2=np.linspace(x2_min,x2_max,M)
m1,m2=np.meshgrid(t1,t2)
m0=np.ones(M*M)
X_mesh=tf.cast(np.stack((m0,m1.reshape(-1),m2.reshape(-1)),axis=1),dtype=tf.float32)
Y_mesh=tf.cast(1/(1+tf.exp(-tf.matmul(X_mesh,W))),dtype=tf.float32)
Y_mesh=tf.where(Y_mesh<0.5,0,1)
n=tf.reshape(Y_mesh,m1.shape)
cm_pt=mpl.colors.ListedColormap(["blue","red"])
cm_bg=mpl.colors.ListedColormap(["#FFA0A0","#A0FFA0"])
plt.pcolormesh(m1,m2,n,cmap=cm_bg)
plt.scatter(x_train[:,0],x_train[:,1],c=y_train,cmap=cm_pt)
plt.xlim(-1.2,1.7)
plt.ylim(-1.2,1.4)
plt.show()

The results are as follows:

[Figure output_12_0: background colored by the model's predicted class, with the training samples overlaid]

Basic Pandas Operations on Two-Dimensional Data

import tensorflow as tf
import numpy as np
import pandas as pd

1. Reading data and setting column names

  • pd.read_csv(filepath_or_buffer,header,names)
  • The header parameter selects the column names; by default header=0 uses the first row of the file as the column names, while header=None means the file has no header row
  • The names parameter supplies custom column names, overriding those chosen via header
#pd.read_csv(filepath_or_buffer,header,names)
df_iris=pd.read_csv(train_path,header=0)
df_iris.head()
   120    4  setosa  versicolor  virginica
0  6.4  2.8     5.6         2.2          2
1  5.0  2.3     3.3         1.0          1
2  4.9  2.5     4.5         1.7          2
3  4.9  3.1     1.5         0.1          0
4  5.7  3.8     1.7         0.3          0

(The odd column names come from the first line of the CSV file, which stores the sample count, the feature count, and the class names; the next example replaces them with meaningful names.)
#Custom column names
COLUMN_NAMES=["SepalLength","SepalWidth","PetalLength","PetalWidth","Species"]
df_iris=pd.read_csv(train_path,header=0,names=COLUMN_NAMES)
df_iris.head()
   SepalLength  SepalWidth  PetalLength  PetalWidth  Species
0          6.4         2.8          5.6         2.2        2
1          5.0         2.3          3.3         1.0        1
2          4.9         2.5          4.5         1.7        2
3          4.9         3.1          1.5         0.1        0
4          5.7         3.8          1.7         0.3        0

2. Accessing data

  • head(n) # returns the first 5 rows by default; pass n to get the first n rows
  • tail(n) # returns the last 5 rows by default; pass n to get the last n rows
  • Rows can also be read with indexing and slicing
df_iris.head(8)
   SepalLength  SepalWidth  PetalLength  PetalWidth  Species
0          6.4         2.8          5.6         2.2        2
1          5.0         2.3          3.3         1.0        1
2          4.9         2.5          4.5         1.7        2
3          4.9         3.1          1.5         0.1        0
4          5.7         3.8          1.7         0.3        0
5          4.4         3.2          1.3         0.2        0
6          5.4         3.4          1.5         0.4        0
7          6.9         3.1          5.1         2.3        2
df_iris.tail(8)
     SepalLength  SepalWidth  PetalLength  PetalWidth  Species
112          5.0         3.0          1.6         0.2        0
113          6.3         3.3          6.0         2.5        2
114          5.0         3.5          1.6         0.6        0
115          5.5         2.6          4.4         1.2        1
116          5.7         3.0          4.2         1.2        1
117          4.4         2.9          1.4         0.2        0
118          4.8         3.0          1.4         0.1        0
119          5.5         2.4          3.7         1.0        1
#Read rows using indexing and slicing
df_iris[10:20:2]
    SepalLength  SepalWidth  PetalLength  PetalWidth  Species
10          5.2         2.7          3.9         1.4        1
12          5.8         4.0          1.2         0.2        0
14          7.7         3.8          6.7         2.2        2
16          6.8         3.2          5.9         2.3        2
18          6.4         3.2          5.3         2.3        2
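Position-based selection with iloc works the same way and also lets you pick columns (a small sketch, not part of the original notebook):

#Same rows as above, but keeping only the first and last columns
df_iris.iloc[10:20:2, [0, 4]]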

3. Displaying summary statistics

  • describe() # computes summary statistics for each column
df_iris.describe()
       SepalLength  SepalWidth  PetalLength  PetalWidth     Species
count   120.000000  120.000000   120.000000  120.000000  120.000000
mean      5.845000    3.065000     3.739167    1.196667    1.000000
std       0.868578    0.427156     1.822100    0.782039    0.840168
min       4.400000    2.000000     1.000000    0.100000    0.000000
25%       5.075000    2.800000     1.500000    0.300000    0.000000
50%       5.800000    3.000000     4.400000    1.300000    1.000000
75%       6.425000    3.300000     5.100000    1.800000    2.000000
max       7.900000    4.400000     6.900000    2.500000    2.000000

4. Common DataFrame attributes

  • .ndim # number of dimensions of the table
  • .shape # shape of the table (rows, columns)
  • .size # total number of elements in the table
print(df_iris.ndim)
print(df_iris.shape)
print(df_iris.size)

The results are as follows:

2
(120, 5)
600

Implementing a Multi-Class Classification Task

1. One-hot encoding

  • tf.one_hot(indices,depth)
  • indices is a one-dimensional integer array or an integer tensor
  • depth is the encoding depth, i.e. the number of classes
import tensorflow as tf
import numpy as np
a=[0,2,3,5,2]
b=tf.one_hot(a,6)
print(b)

The results are as follows:

tf.Tensor(
[[1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]], shape=(5, 6), dtype=float32)
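
Going the other way (a small sketch, not in the original), tf.argmax recovers the class indices from the one-hot rows, which is exactly how predictions are decoded in the accuracy computation below.

import tensorflow as tf
b = tf.one_hot([0, 2, 3, 5, 2], 6)
print(tf.argmax(b, axis=1))  # tf.Tensor([0 2 3 5 2], shape=(5,), dtype=int64)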

2. The softmax function

  • tf.nn.softmax()
import tensorflow as tf
import numpy as np
tf.nn.softmax([1.0,2.0,5.0])
np.round(tf.nn.softmax([1.0,2.0,5.0]),2)

The results are as follows:

array([0.02, 0.05, 0.94], dtype=float32)
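
As a sanity check (a minimal sketch, not in the original post), the same values follow directly from the definition softmax(z_i) = exp(z_i) / sum_j exp(z_j):

import numpy as np
z = np.array([1.0, 2.0, 5.0])
p = np.exp(z) / np.sum(np.exp(z))  # softmax by definition
print(np.round(p, 2))              # [0.02 0.05 0.94], matching tf.nn.softmax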

3. Computing accuracy

import tensorflow as tf
import numpy as np
y=np.array([2,1,0])                                         #true class labels
y_onehot=np.array([[0,0,1],[0,1,0],[1,0,0]])                #one-hot form of the labels (not used below)
pred=np.array([[0.1,0.2,0.7],[0.1,0.7,0.2],[0.3,0.4,0.3]])  #predicted class probabilities, one row per sample
answer_tof=tf.equal(tf.argmax(pred,axis=1),y)               #compare the predicted class (argmax of each row) with the true label
answer=tf.cast(answer_tof,tf.float32)                       #bool -> float32
acc=tf.reduce_mean(answer)                                  #fraction of correct predictions
print(acc)

The results are as follows:

tf.Tensor(0.6666667, shape=(), dtype=float32)

4. Example: distinguishing the three iris species using petal length and petal width

#1. Import libraries and load the data
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib as mpl
#Training set
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
train_path = tf.keras.utils.get_file("iris_train.csv",TRAIN_URL,cache_dir="D:\App_Data_File\Anaconda_data\jupyter\TensorFlow")
df_iris_train=pd.read_csv(train_path,header=0)
#2. Preprocess the data
#Convert to a numpy array
iris_train=np.array(df_iris_train)
#Take petal length and petal width (columns 2 and 3)
x_train=iris_train[:,2:4]
#Take the labels
y_train=iris_train[:,4]
#Record the sample count
num_train=len(x_train)
#Build the attribute matrix and one-hot label matrix for the multivariate model
x0_train=np.ones(num_train).reshape(-1,1)
X_train=tf.cast(tf.concat((x0_train,x_train),axis=1),tf.float32)
Y_train=tf.one_hot(tf.constant(y_train,dtype=tf.int32),3) #Y_train is 120x3
#3. Set the hyperparameters: learning rate and number of iterations
learn_rate=0.2
iter=500
display_step=100 #print the results every 100 iterations
#4. Initialize the model parameters W
np.random.seed(612)
W=tf.Variable(np.random.randn(3,3),dtype=tf.float32) #W is 3x3: one row per attribute (plus the bias term), one column per class
#5. Train the model
ce_train=[]  #cross-entropy loss on the training set
acc_train=[] #accuracy on the training set
for i in range(0,iter+1):
    with tf.GradientTape() as tape:
        PRED_train=tf.nn.softmax(tf.matmul(X_train,W)) #PRED_train is a 120x3 matrix
        Loss_train=-tf.reduce_sum(Y_train*tf.math.log(PRED_train))/num_train
    Accuracy_train=tf.reduce_mean(tf.cast(tf.equal(tf.argmax(PRED_train.numpy(),axis=1),y_train),tf.float32))
    ce_train.append(Loss_train)
    acc_train.append(Accuracy_train)
    dL_dW=tape.gradient(Loss_train,W)
    W.assign_sub(learn_rate*dL_dW)
    if i % display_step==0:
        print("i:%i\tAcc:%f\tLoss:%f"%(i,Accuracy_train,Loss_train))
print(tf.argmax(PRED_train.numpy(),axis=1)) #final classification result
print(Loss_train)

The results are as follows:

i:0    Acc:0.350000    Loss:4.510763
i:100    Acc:0.808333    Loss:0.503537
i:200    Acc:0.883333    Loss:0.402912
i:300    Acc:0.891667    Loss:0.352650
i:400    Acc:0.941667    Loss:0.319779
i:500    Acc:0.941667    Loss:0.295599
tf.Tensor(
[2 1 2 0 0 0 0 2 1 0 1 1 0 0 2 2 2 2 2 0 2 2 0 1 1 0 1 2 1 2 1 1 1 2 2 2 2
 2 0 0 2 2 2 0 0 1 0 2 0 2 0 1 1 0 1 2 2 2 2 1 1 2 2 2 1 2 0 2 2 0 0 1 0 2
 2 0 1 1 1 2 0 1 1 1 2 0 1 1 2 0 2 1 0 0 2 0 0 2 2 0 0 1 0 1 0 0 0 0 1 0 2
 1 0 2 0 1 1 0 0 1], shape=(120,), dtype=int64)
tf.Tensor(0.295599, shape=(), dtype=float32)
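
As a possible extension (a sketch, not in the original post), the classification-map technique from the earlier binary example carries over directly to the three-class model; it assumes x_train, y_train, and the trained W from the block above are still in scope.

import numpy as np
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
M=300
x1_min,x2_min=x_train.min(axis=0)
x1_max,x2_max=x_train.max(axis=0)
t1=np.linspace(x1_min,x1_max,M)
t2=np.linspace(x2_min,x2_max,M)
m1,m2=np.meshgrid(t1,t2)
m0=np.ones(M*M)
X_mesh=tf.cast(np.stack((m0,m1.reshape(-1),m2.reshape(-1)),axis=1),dtype=tf.float32)
Y_mesh=tf.argmax(tf.nn.softmax(tf.matmul(X_mesh,W)),axis=1) #predicted class for every grid point
n=tf.reshape(Y_mesh,m1.shape)
cm_pt=mpl.colors.ListedColormap(["blue","red","green"])
cm_bg=mpl.colors.ListedColormap(["#FFA0A0","#A0FFA0","#A0A0FF"])
plt.pcolormesh(m1,m2,n,cmap=cm_bg) #background colored by predicted class
plt.scatter(x_train[:,0],x_train[:,1],c=y_train,cmap=cm_pt) #training samples on top
plt.show()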