策略思想:
- 使用能够评估特征重要性的模型(训练后一般带有 feature_importances_ 或 coef_ 属性)在全部特征上进行训练
- 如果某个特征的重要性得分小于阈值,就会被认为是不重要的特征并被剔除;阈值也可以取相对值,比如 0.1*mean(重要性)
示例代码
import pandas as pd
def load_data(n_samples: int = 1000, n_classes: int = 4, n_features: int = 10,
              n_informative: int = 8, random_state=None):
    """Generate a synthetic classification dataset for the demo.

    The defaults reproduce the original hard-coded configuration
    (1000 samples, 4 classes, 10 features of which 8 are informative).

    Args:
        n_samples: Number of rows to generate.
        n_classes: Number of distinct target classes.
        n_features: Total number of feature columns.
        n_informative: Number of informative features.
        random_state: Forwarded to ``make_classification``; pass an int to
            get the same dataset on every call. ``None`` keeps the
            original non-reproducible behaviour.

    Returns:
        A ``(df_x, df_y)`` tuple: ``df_x`` is a DataFrame whose columns are
        named ``f_1`` .. ``f_<n_features>``; ``df_y`` is a Series of labels.
    """
    from sklearn.datasets import make_classification

    # NOTE(review): the remaining make_classification arguments keep their
    # library defaults, so it may reject n_features values that are too
    # small for n_informative -- confirm against the scikit-learn docs.
    data_x, data_y = make_classification(
        n_samples=n_samples,
        n_classes=n_classes,
        n_features=n_features,
        n_informative=n_informative,
        random_state=random_state,
    )
    # Derive the column names from n_features so they can never drift out
    # of sync with the width of the generated matrix.
    columns = [f"f_{i}" for i in range(1, n_features + 1)]
    df_x = pd.DataFrame(data_x, columns=columns)
    df_y = pd.Series(data_y)
    return df_x, df_y
def select_from_model(x_data, y_data):
    """Select features with an L1-penalised logistic regression.

    Fits a ``SelectFromModel`` wrapper around the base estimator, prints the
    names of the columns that were kept, and returns the reduced data.

    Args:
        x_data: Feature table; must expose ``.columns`` (e.g. a DataFrame).
        y_data: Target labels aligned with ``x_data``.

    Returns:
        The result of ``SelectFromModel.transform(x_data)``, i.e. only the
        selected features.
    """
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression

    # Base model: logistic regression with an L1 penalty.
    # NOTE(review): recent scikit-learn releases appear to deprecate the
    # liblinear solver for multiclass targets -- confirm against the
    # installed version before relying on this configuration.
    base_estimator = LogisticRegression(C=1, penalty='l1', solver='liblinear')
    selector: SelectFromModel = SelectFromModel(base_estimator)
    selector.fit(x_data, y_data)

    # Boolean mask telling which columns were selected.
    kept_mask = selector.get_support()
    print("select feature: ", x_data.columns[kept_mask])

    # Handy for inspection while debugging:
    #   selector.estimator_.coef_
    #   selector.threshold_
    return selector.transform(x_data)
if __name__ == '__main__':
    # Script entry point: build the demo dataset, then run the
    # model-based feature selection on it.
    features, labels = load_data()
    select_from_model(features, labels)