4-2 过滤器法

2023-11-19

4.2 过滤器法

请参考《数据准备和特征工程》中的相关章节,调试如下代码。


注意:本节内容因为要耗费比较大的内存,在线平台有可能无法支持,可以下载到本地执行

基础知识

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest   
from sklearn.feature_selection import chi2    

# Load the iris data set and separate the feature matrix from the labels.
iris = load_iris()
X = iris.data
y = iris.target

# SelectKBest is a filter-style selector: every feature is scored with
# `score_func` (here chi2, the chi-squared test against the label) and the
# k best-scoring features are kept (k=2 here).
skb = SelectKBest(k=2, score_func=chi2)

# Fitting computes the per-feature statistics; `result` is what fit() returns.
result = skb.fit(X, y)

# One chi-squared statistic and one p-value per feature:
# - a larger chi-squared score means the feature deviates more from what
#   independence between feature and label would predict;
# - a smaller p-value means a score this large would be less likely if the
#   null hypothesis (feature and label are independent) were true.
print("X^2 is: ", result.scores_)
print("P-values is: ", result.pvalues_)
X^2 is:  [ 10.81782088   3.7107283  116.31261309  67.0483602 ]
P-values is:  [4.47651499e-03 1.56395980e-01 5.53397228e-26 2.75824965e-15]
# Apply the already-fitted selector to X (supervised feature selection:
# the selector was fitted with the labels y), keeping only the chosen columns.
X_new = skb.transform(X)
# Displayed as the cell result: (n_samples, n_selected_features).
X_new.shape
(150, 2)
# Fit the selector and reduce X in a single call, then show the first
# 5 rows of the reduced feature matrix.
X_new = skb.fit_transform(X, y)
X_new[:5, :]
array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2]])
import numpy as np

# Names of the features kept by the selector.
# get_support(indices=True) returns the column indices of the selected
# features, so the names are looked up by position. This replaces matching
# float values from the first sample against X_new, which returns the wrong
# name whenever two features happen to share a value in that row.
[iris.feature_names[idx] for idx in skb.get_support(indices=True)]
['petal length (cm)', 'petal width (cm)']
# Full list of feature names in the iris data set.
iris.feature_names
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
# Toy binary data set: 6 samples x 3 features. The first column is 1 in only
# one of the six samples, so it has the smallest variance of the three and is
# the column the variance filter below removes.
X = np.array([
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
])

from sklearn.feature_selection import VarianceThreshold

# For a 0/1 (Bernoulli) feature the variance is p * (1 - p). Using
# 0.8 * (1 - 0.8) as the cut-off removes features that take the same value
# in more than 80% of the samples. No labels are involved, so this is an
# unsupervised feature selection.
bernoulli_cutoff = 0.8 * (1 - 0.8)
vt = VarianceThreshold(threshold=bernoulli_cutoff)
vt.fit_transform(X)
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

项目案例

import pandas as pd

# Load the project data set from a CSV file into a DataFrame.
data = pd.read_csv("data/data20531/santandar.csv")
# Displayed as the cell result: (number of rows, number of columns).
data.shape
(76020, 371)
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# Every column except 'TARGET' is a feature; 'TARGET' is the label.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.2,
    random_state=41,
)

# Quasi-constant filter: learn the per-feature variances from the training
# set only; features whose variance does not exceed 0.01 are dropped.
qconstant_filter = VarianceThreshold(threshold=0.01).fit(train_features)

# Use transform() here rather than fit_transform(): the filter fitted on the
# training set must be applied unchanged to both splits, so that the test set
# is reduced to exactly the same columns without being used for fitting.
train_features = qconstant_filter.transform(train_features)
test_features = qconstant_filter.transform(test_features)

train_features.shape, test_features.shape
((60816, 269), (15204, 269))

动手练习

# Exercise 1: remove constant features.
import pandas as pd

data = pd.read_csv("data/data20531/santandar.csv")

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# 80/20 train/test split with a fixed seed; 'TARGET' is the label column.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(columns=['TARGET']),
    data['TARGET'],
    test_size=0.2,
    random_state=41,
)

# A "constant feature" has the same value in every sample, i.e. zero
# variance. With threshold=0 only those features are removed. The filter is
# fitted on the training set only.
constant_filter = VarianceThreshold(threshold=0).fit(train_features)

# Apply the fitted filter to both splits so they keep the same columns.
train_features = constant_filter.transform(train_features)
test_features = constant_filter.transform(test_features)

train_features.shape, test_features.shape
((60816, 332), (15204, 332))
# Exercise 2: find and remove duplicated features (columns whose values are
# identical to those of an earlier column).
import pandas as pd

data = pd.read_csv("data/data20531/santandar.csv")

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

# 80/20 train/test split with a fixed seed; 'TARGET' is the label column.
train_features, test_features, train_labels, test_labels = train_test_split(
    data.drop(labels=['TARGET'], axis=1),
    data['TARGET'],
    test_size=0.2,
    random_state=41)

# pandas detects duplicated *rows*, so transpose the frame to turn every
# feature into a row.
train_features_T = train_features.T
print(train_features_T.shape)

# Boolean mask over the features: True for each feature that repeats an
# earlier one (the first occurrence is not marked). The duplicate detection
# on the large transposed frame is done once and the mask is reused below.
is_duplicate = train_features_T.duplicated(keep='first').to_numpy()

# Number of duplicated features.
print(is_duplicate.sum())

# Keep only the first occurrence of each feature. Selecting the columns
# directly from the original frame avoids running the duplicate detection a
# second time and transposing back; it also keeps each column's original
# dtype, which a transpose round-trip may not.
unique_features = train_features.loc[:, ~is_duplicate]
print(unique_features.shape)

# Names of the duplicated (dropped) features, in original column order.
duplicated_features = train_features.columns[is_duplicate].tolist()
duplicated_features
(370, 60816)
65
(60816, 305)

['ind_var2',
 'ind_var13_medio',
 'ind_var18',
 'ind_var26',
 'ind_var25',
 'ind_var27_0',
 'ind_var28_0',
 'ind_var28',
 'ind_var27',
 'ind_var29_0',
 'ind_var29',
 'ind_var32',
 'ind_var34',
 'ind_var37',
 'ind_var41',
 'ind_var39',
 'ind_var46_0',
 'ind_var46',
 'num_var13_medio',
 'num_var18',
 'num_var26',
 'num_var25',
 'num_var27_0',
 'num_var28_0',
 'num_var28',
 'num_var27',
 'num_var29_0',
 'num_var29',
 'num_var32',
 'num_var34',
 'num_var37',
 'num_var41',
 'num_var39',
 'num_var46_0',
 'num_var46',
 'saldo_var28',
 'saldo_var27',
 'saldo_var29',
 'saldo_var41',
 'saldo_var46',
 'delta_imp_trasp_var33_out_1y3',
 'delta_num_reemb_var13_1y3',
 'delta_num_reemb_var17_1y3',
 'delta_num_reemb_var33_1y3',
 'delta_num_trasp_var17_in_1y3',
 'delta_num_trasp_var17_out_1y3',
 'delta_num_trasp_var33_in_1y3',
 'delta_num_trasp_var33_out_1y3',
 'imp_amort_var18_hace3',
 'imp_amort_var34_hace3',
 'imp_reemb_var13_hace3',
 'imp_reemb_var33_hace3',
 'imp_trasp_var17_out_hace3',
 'imp_trasp_var33_out_hace3',
 'imp_trasp_var33_out_ult1',
 'num_var2_0_ult1',
 'num_var2_ult1',
 'num_reemb_var13_hace3',
 'num_reemb_var33_hace3',
 'num_trasp_var17_out_hace3',
 'num_trasp_var33_out_hace3',
 'num_trasp_var33_out_ult1',
 'saldo_var2_ult1',
 'saldo_medio_var13_medio_hace3',
 'saldo_medio_var13_medio_ult1']
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

4-2 过滤器法 的相关文章

随机推荐