# ==================== RandomForestClassifier ====================
# Random forest: train on one CSV, score on another, dump class probabilities.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import csv
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer  # `preprocessing.Imputer` was removed in sklearn 0.22
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import label_binarize
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics  # `sklearn.cross_validation` was removed in 0.20; model_selection replaces it
from sklearn.metrics import roc_auc_score

# Allow Chinese glyphs (and a proper minus sign) in matplotlib figures.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

path1 = 'path'  # training CSV: no header, 8 feature columns, label in column 8
path2 = 'path'  # test CSV: no header, 8 feature columns
df_train = pd.read_csv(path1, header=None)
df_test = pd.read_csv(path2, header=None)
feature = u'****'

X_train, Y_train = df_train[list(range(8))], df_train[8]
X_test = df_test[list(range(8))]

# Scale features to [0, 1]; fit on the training set only, then apply to test.
ss = MinMaxScaler()
X_train = ss.fit_transform(X_train)  # MinMaxScaler ignores y, so don't pass it
X_test = ss.transform(X_test)

forest = RandomForestClassifier(n_estimators=1000, max_features=6, max_depth=25,
                                oob_score=True, random_state=10)
forest.fit(X_train, Y_train)
score1 = forest.score(X_train, Y_train)
print('train准确率:%.2f%%' % (score1 * 100))

forest_y_score = forest.predict_proba(X_test)    # (n_test, 2) class probabilities
forest_y_score1 = forest.predict_proba(X_train)  # (n_train, 2) class probabilities
p = forest.predict(X_test)

# Copy the two probability columns into a fresh matrix (size taken from the
# data rather than hard-coding 20000 rows) and write them to CSV.
n_rows = forest_y_score.shape[0]
returnMat = np.zeros((n_rows, 2))
for i in range(n_rows):
    returnMat[i][0] = forest_y_score[i][0]
    returnMat[i][1] = forest_y_score[i][1]
with open('C:\\\\test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(returnMat)
# ==================== SVM ====================
# SVM: RBF-kernel classifier trained/evaluated on the same 8-feature CSV data.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Allow Chinese glyphs (and a proper minus sign) in matplotlib figures.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# ChangedBehaviorWarning was removed from sklearn.exceptions in 0.24;
# import it defensively so this script still runs on modern versions.
try:
    from sklearn.exceptions import ChangedBehaviorWarning
    warnings.filterwarnings('ignore', category=ChangedBehaviorWarning)
except ImportError:
    pass

path1 = 'path'  # training CSV: no header, 8 feature columns, label in column 8
path2 = 'path'  # test CSV, same layout
df_train = pd.read_csv(path1, header=None)
df_test = pd.read_csv(path2, header=None)
feature = u'***'
X_train, Y_train = df_train[list(range(8))], df_train[8]
X_test, Y_test = df_test[list(range(8))], df_test[8]

# svm.SVC parameter notes (see the scikit-learn SVC reference):
#   C: misclassification penalty, default 1.0; larger C fits the training set
#      harder at the risk of overfitting.
#   kernel: 'linear' | 'poly' | 'rbf' | 'sigmoid' | 'precomputed'
#      ('precomputed' requires a square kernel matrix); default 'rbf'.
#   degree: polynomial degree when kernel='poly', default 3.
#   gamma: kernel coefficient for 'poly'/'rbf'/'sigmoid'; 'auto' means 1/n_features.
#   coef0: independent term for 'poly'/'sigmoid', default 0.
#   probability: enable probability estimates (slower), default False.
#   shrinking: use the shrinking heuristic, default True.
#   tol: stopping tolerance, default 1e-3.
#   cache_size: kernel cache size in MB.
#   class_weight: per-class weights, default None.
#   max_iter: iteration cap; -1 means unlimited.
#   decision_function_shape: 'ovo' or 'ovr' ('ovr' recommended).
clf = svm.SVC(C=1, kernel='rbf', gamma=0.05)
clf.fit(X_train, Y_train)
print ('训练集准确率:', accuracy_score(Y_train, clf.predict(X_train)))
print ('训练集准确率X_test:', accuracy_score(Y_test, clf.predict(X_test)))
print ('decision_function:\n', clf.decision_function(X_train))
print ('\npredict:\n', clf.predict(X_train))
# ==================== xgboost ====================
# XGBoost: train a small regressor, persist it, reload it, and evaluate both.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
# Public location: the private sklearn.linear_model.coordinate_descent module
# that used to re-export this warning was removed.
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Allow Chinese glyphs (and a proper minus sign) in matplotlib figures.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

iris_feature = u'***'
path = 'path'  # CSV: no header, 8 feature columns, label in column 8
data = pd.read_csv(path, header=None)
X, Y = data[list(range(8))], data[8]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=0)

dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)
params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'reg:linear'}
num_round = 2
bst = xgb.train(params, dtrain, num_round)
bst.save_model('xgb.model')
y_pred = bst.predict(dtest)
print("均方误差为:", mean_squared_error(Y_test, y_pred))

# Reload the persisted model and evaluate it.  The original passed the Booster
# object itself to accuracy_score (a TypeError) and never used bst2; predict
# with the reloaded model and round the regression outputs to class labels.
bst2 = xgb.Booster()
bst2.load_model('xgb.model')
y_pred2 = np.rint(bst2.predict(dtest))
print ('测试集准确率:', accuracy_score(Y_test, y_pred2))
# 本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)