数据来源于信贷用户,数据量约为 2 万条。
首先读取数据
# 忽略警告
import warnings
warnings.filterwarnings('ignore')
# 导入常用库pandas/Numpy/matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# jupyter图形界面显示图片
%matplotlib inline
# jupyter显示所有特征
pd.set_option('display.max_columns',None)
# 显示全部输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# 导入sklearn常用库
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation,metrics
# 设置本ipynb的工作目录
import os
os.chdir(r'E:\model\7379')
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100,默认为50
pd.set_option('max_colwidth',100)
导入数据
# Load the "bill" sheet of the source workbook into a DataFrame.
data_bill = pd.read_excel(io='data_20191008.xlsx', sheet_name='bill')
# 定义年龄计算函数
from datetime import date
def calculate_age(born, today=None):
    """Return the age in completed years of someone born on *born*.

    Args:
        born: Birth date; any object exposing ``year``, ``month`` and ``day``
            attributes (e.g. ``datetime.date``).
        today: Reference date with the same attributes.  Defaults to
            ``date.today()``; pass an explicit date to get reproducible ages.

    Returns:
        ``today.year - born.year``, minus one when the birthday has not yet
        occurred in the reference year.
    """
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) when the birthday is still
    # ahead of the reference date within the year.
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending
就不一一导入数据了
# Join the per-source tables onto the bill table, keyed by `userid`.
# NOTE(review): the second join (data_btongdun) never specified `how`, so it
# has always been pandas' default inner join, unlike the left joins around
# it.  It is spelled out here without changing behaviour; confirm whether
# dropping users that have no data_btongdun row is intended.
data_bill = (
    data_bill
    .merge(data_bxinyan, how='left', on='userid')    # join 1
    .merge(data_btongdun, how='inner', on='userid')  # join 2
    .merge(data_btc5, how='left', on='userid')       # join 3
)
raw_data = data_bill.merge(data_btc15, how='left', on='userid')  # join 4
数据探索
# Use `userid` as the row index, in place.  The other arguments the original
# call spelled out (drop=True, append=False, verify_integrity=False) are the
# pandas defaults, so they are omitted here.
raw_data.set_index(keys='userid', inplace=True)
1、查看数据大致分布
2、查看缺失值
# Drop three features in place.  The original note says these are the ones
# with more than 70% missing values; the list is hard-coded, not computed.
raw_data.drop(
    columns=['overdue_date', 'max_overdue_amt', 'result_code'],
    inplace=True,
)
定义一个输出行缺失个数和缺失率的函数
def miss_row(data):
    """Count missing values per row and the fraction of columns they cover.

    Args:
        data: DataFrame to inspect.

    Returns:
        A ``(row_total, row_percent)`` tuple of Series, both indexed by
        ``data.index`` and both sorted in descending order:

        * ``row_total``   - number of null cells in each row.
        * ``row_percent`` - ``row_total`` divided by the number of columns.
    """
    n_cols = data.shape[1]
    # Vectorized null count per row; the result keeps data's own index, so
    # both returned Series can be looked up by the same row labels.
    row_total = data.isnull().sum(axis=1)
    # A stable sort keeps tied rows in their original relative order.
    row_total = row_total.sort_values(ascending=False, kind='mergesort')
    # Dividing the already-sorted counts by a constant preserves the order,
    # so the two Series list the rows in exactly the same sequence.
    row_percent = row_total / n_cols
    return row_total, row_percent
# Drop rows in place, keeping only those with at least 80% non-null values
# (`thresh` is the minimum number of non-NA cells a row needs to survive).
# NOTE(review): the original comment described this as removing rows with
# more than 80% missing, which would need a threshold of 20% of the columns.
# The code is left as it was; confirm which behaviour is intended.
raw_data.dropna(axis='index', thresh=raw_data.shape[1] * 0.8, inplace=True)
# Visual inspection of missing values in raw_data.
import seaborn as sns # plotting helpers; only used for the theme below
import missingno as msno # missing-value visualisation
# Render figures inline in the notebook (IPython magic).
%matplotlib inline
# Apply seaborn's "ticks" theme, then draw missingno's matrix view of raw_data.
sns.set(style = "ticks")
msno.matrix(raw_data)
# Split raw_data into the label (`overdue`) and the feature matrix.
# NOTE(review): X is taken positionally as "every column except the first",
# which only excludes the label if `overdue` is the first column of
# raw_data.  Confirm the column order.
y = raw_data.loc[:, 'overdue']
X = raw_data.iloc[:, slice(1, None)]
# Distribution plots for the features.
# Apply the style sheet first: a style sheet overwrites every rcParam it
# defines (the seaborn sheets define the sans-serif font list), so the font
# settings below must come after it or they are lost.
try:
    plt.style.use('seaborn')
except OSError:
    # Newer matplotlib releases renamed the bundled seaborn styles.
    plt.style.use('seaborn-v0_8')
# Use a font with Chinese glyphs, and an ASCII minus sign so negative tick
# labels do not render as missing-glyph boxes with that font.
plt.rcParams['font.sans-serif'] = ['Simhei']
plt.rcParams['axes.unicode_minus'] = False
# One histogram per column of X.
X.hist(bins=100, figsize=(20, 16))
plt.show(