数据
输入 32 维的向量 输出一个值
有151组这样的数据
目的
用这样一组数据建立一个预测模型
输入32维的向量就能预测一个值
代码部分
1 导入工具包
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import r2_score
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import History
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
""" 显示中文 """
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
2 读取数据
# Load the spreadsheet (index_col=0 uses the FIRST column as the row index)
# and transpose so each row is one sample and each column one feature.
df = pd.read_excel('data.xls', index_col = 0).T
3 归一化
# Min-max scale every feature column to [0, 1]; the target column 'soot'
# keeps its original scale so the label stays interpretable.
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split below, which leaks test statistics into training —
# confirm whether this is intentional.
model_M = MinMaxScaler()
for col in df.columns:
    if col == 'soot':
        continue
    # fit_transform on a one-column DataFrame, written back as that column
    df[col] = model_M.fit_transform(df[[col]])
4 数据集划分
数据先随机打乱,再取前 100 个作为训练集、后 51 个作为测试集
# Randomly permute the samples, then split positionally:
# first 100 rows -> training portion, remaining 51 -> held-out test set.
df = shuffle(df)
df1 = df.iloc[:100]   # training portion
df2 = df.iloc[100:]   # test portion
5 相关性分析
# Feature selection: Spearman rank correlation (absolute value) of every
# column against the target 'soot'; keep features with |rho| > 0.2.
corr_num = df1.corr(method = 'spearman').abs()
corr_ratio = corr_num.loc[corr_num['soot'] > 0.2, 'soot']

# Pearson-correlation heatmap of the surviving features.
plt.figure(figsize = (10, 6))
sns.heatmap(df1[corr_ratio.index].corr(), annot = True, cmap = 'RdYlGn')

# Order kept features by descending correlation with the target; the
# target itself (correlation 1.0) therefore ends up first.
corr_top = corr_ratio.sort_values(ascending = False).reset_index()
corr_top.columns = ['特征', '相关性']
df_top = list(corr_top['特征'])   # selected column names, most relevant first
df1 = df1[df_top]                 # training set restricted to kept columns
df2 = df2[df1.columns]            # test set with the same column order
#
6 划分训练集、验证集
# Split the training portion into inputs and label, then carve out a
# validation set (70 train / 30 validation out of the 100 rows).
# NOTE(review): despite the names, train_data holds the FEATURES and
# test_data holds the LABELS of the training portion.
train_data = df1[df1.columns[1:]]   # all columns except the first ('soot')
test_data = df1['soot']             # target vector
X_train, X_val, y_train, y_val = train_test_split(
    train_data,
    test_data,
    test_size = 0.3,       # 30% of the 100 rows become the validation set
    random_state = 2020,   # fixed seed -> reproducible split
)
#test_size=0.3 这里训练集70组 验证集30组 random_state=2020保证每次划分是相同的
7 分离测试集特征、预测值
# Separate the held-out test rows into features and label; column 0 is the
# 'soot' target after the relevance sort above.
X_test = df2.iloc[:, 1:]   # feature columns
y_test = df2['soot']       # target column
8 KNN
8.1 网格调参
# Grid-search KNN hyper-parameters with 5-fold cross-validation.
model_KNN = KNeighborsRegressor()
parameters = {
    'p': [1],                           # p=1 -> Manhattan distance
    'n_neighbors': list(range(3, 11)),  # neighbourhood sizes to try
}
model_GSCV = GridSearchCV(model_KNN, parameters, cv = 5)
model_GSCV.fit(X_train, y_train)

# Score the best estimator on the validation set.
score_MSE = mean_squared_error(y_val, model_GSCV.predict(X_val))
print('验证集均方误差:', score_MSE)
print('最优参数:', model_GSCV.best_params_)
8.2 R2分数
# Refit KNN with the hyper-parameters found by the grid search above,
# instead of hard-coding p=1 / n_neighbors=3 — this keeps the cell
# consistent with section 8.1 even if the search result changes on a re-run.
model_KNN = KNeighborsRegressor(**model_GSCV.best_params_)
model_KNN.fit(X_train, y_train)           # train on the training split
predict_KNN = model_KNN.predict(X_val)    # evaluate on the validation split
# Print the R2 score: a bare expression is silently discarded when this
# runs as a script instead of a notebook cell.
print('KNN 验证集 R2:', r2_score(y_val, predict_KNN))
8.3 测试集预测
# Predict the held-out test set and plot prediction vs. ground truth.
predict_KNN = model_KNN.predict(X_test)
x = range(len(y_test))       # derive the length instead of hard-coding 51
y1 = list(y_test)            # actual values
y2 = list(predict_KNN)       # predicted values
plt.figure(figsize = (20, 6))
plt.plot(x, y1, label = '真实值')
plt.plot(x, y2, label = '预测值')
plt.title('KNN')
plt.legend()
8.4 R2分数
# Print the test-set R2: a bare expression result is discarded in a script.
print('KNN 测试集 R2:', r2_score(y_test, predict_KNN))
9 ANN
9.1搭建初级模型
# Baseline network: one hidden layer of 32 ReLU units feeding a single
# linear output (regression head).
model_1 = Sequential()
# Derive input_dim from the data instead of hard-coding 7, so the model
# still builds if the correlation filter keeps a different feature count.
model_1.add(Dense(32, input_dim = X_train.shape[1], activation = 'relu'))
model_1.add(Dense(1, activation = 'linear'))
# Adam optimizer with mean-absolute-error loss.
model_1.compile(optimizer = 'adam', loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])
# Keep the returned History so the loss curves can be plotted below.
history = model_1.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 200, batch_size = 16)

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('模型的训练和验证损失曲线')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc = 'upper right')
9.2 R2分数
predict_ANN = model_1.predict(X_val)  # validation-set predictions
# Print the R2 score: a bare expression is discarded outside a notebook.
print('model_1 验证集 R2:', r2_score(y_val, predict_ANN))
9.3再增加四个隐层
# Deeper network: five hidden layers of 32 ReLU units each.
model_2 = Sequential()
# Derive input_dim from the data instead of hard-coding 7.
model_2.add(Dense(32, input_dim = X_train.shape[1], activation = 'relu'))
for _ in range(4):  # four more identical hidden layers
    model_2.add(Dense(32, activation = 'relu'))
model_2.add(Dense(1, activation = 'linear'))
model_2.compile(optimizer = 'adam', loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])
history = model_2.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 200, batch_size = 16)

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('模型的训练和验证损失曲线')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc = 'upper right')
9.4 R2分数
predict_ANN = model_2.predict(X_val)  # validation-set predictions
# Print the R2 score: a bare expression is discarded outside a notebook.
print('model_2 验证集 R2:', r2_score(y_val, predict_ANN))
9.5增加神经元
# Wider network: five hidden layers of 64 ReLU units each.
model_3 = Sequential()
# Derive input_dim from the data instead of hard-coding 7.
model_3.add(Dense(64, input_dim = X_train.shape[1], activation = 'relu'))
for _ in range(4):  # four more identical hidden layers
    model_3.add(Dense(64, activation = 'relu'))
model_3.add(Dense(1, activation = 'linear'))
model_3.compile(optimizer = 'adam', loss = 'mean_absolute_error', metrics = ['mean_absolute_error'])
# Fewer epochs (100) than the smaller models, as in the original run.
history = model_3.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 100, batch_size = 16)

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('模型的训练和验证损失曲线')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc = 'upper right')
9.6 R2分数
predict_ANN = model_3.predict(X_val)  # validation-set predictions
# Print the R2 score: a bare expression is discarded outside a notebook.
print('model_3 验证集 R2:', r2_score(y_val, predict_ANN))
9.7 测试集预测
# Of the three ANNs, model_3 gave the best validation R2 here, so it is
# used for the final test-set prediction.
predict_ANN = model_3.predict(X_test).ravel()  # flatten Keras (N, 1) output
x = range(len(y_test))   # derive the length instead of hard-coding 51
y1 = list(y_test)        # actual values
y2 = list(predict_ANN)   # predicted values
plt.figure(figsize = (20, 6))
plt.plot(x, y1, label = '真实值')
plt.plot(x, y2, label = '预测值')
plt.title('ANN')
plt.legend()
9.8 R2分数
# Print the test-set R2: a bare expression result is discarded in a script.
print('ANN 测试集 R2:', r2_score(y_test, predict_ANN))