第4章 特征选择
4.0 特征选择简述
请参考《数据准备和特征工程》中的相关章节,调试如下代码。
import pandas as pd
# Load the wine dataset from the AI Studio data directory into a DataFrame.
df_wine = pd.read_csv(
    filepath_or_buffer="/home/aistudio/data/data20527/wine_data.csv",
)
# Preview the first five rows (the default row count of head()).
df_wine.head(n=5)
| | Class_label | Alcohol | Malic_acid | Ash | Alcalinity_of_ash | Magnesium | Total_phenols | Flavanoids | Nonflavanoid_phenols | Proanthocyanins | Color_intensity | Hue | OD280/OD315_of_diluted_wines | Proline |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
| 1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
| 2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
| 3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
| 4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# X holds the feature columns (every column after the first);
# y holds the class labels (the first column) as a NumPy array.
X, y = df_wine.iloc[:, 1:], df_wine.iloc[:, 0].values

# Hold out part of the data for testing.
#   test_size=0.3  -> 30% of the samples go to the test set
#   random_state=0 -> fixed seed so the split is reproducible
#   stratify=y     -> both splits keep the class proportions found in y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Standardize the features. The scaler is fitted on the training set only and
# the same fitted statistics are then applied to the test set; re-fitting on
# the test set would scale the two sets differently and leak test-set
# information into preprocessing.
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)
from sklearn.linear_model import LogisticRegression
# Build a logistic regression classifier with an L1 penalty term, which
# regularizes the model to reduce overfitting, and train it on the
# standardized training features.
lr = LogisticRegression(
    penalty="l1",
    C=1.0,
    solver="liblinear",
)
lr.fit(X_train_std, y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
# Show the fitted coefficients (weights) of the 13 features, one row per class.
# A coefficient of exactly 0 means the L1 penalty removed that feature from
# that class's decision function; on its own this does not prove the feature
# is unrelated to the label.
lr.coef_
array([[ 1.24625685, 0.18107053, 0.74257832, -1.16001118, 0. ,
0. , 1.17611757, 0. , 0. , 0. ,
0. , 0.54232728, 2.51117025],
[-1.53720803, -0.387145 , -0.99522705, 0.36479669, -0.05946812,
0. , 0.66779999, 0. , 0. , -1.93405254,
1.23412954, 0. , -2.2316079 ],
[ 0.1355303 , 0.1687654 , 0.3572857 , 0. , 0. ,
0. , -2.43734423, 0. , 0. , 1.5634205 ,
-0.81896512, -0.49331848, 0. ]])
# Show the fitted intercepts: one per class, hence three values for the three
# distinct values of Class_label.
# NOTE(review): the original note listed the classes as 0, 1, 2, but the data
# preview shows Class_label = 1, so the labels are presumably 1, 2, 3 —
# confirm with lr.classes_.
lr.intercept_
array([-1.26341218, -1.21591946, -2.37057917])