请问为什么XGBoost用训练集当作测试集来看一下学习的模型准确率和训练不一样

我拿全部的训练集来训练,然后拿训练集的一部分来测试,为什么训练的准确率是100%,而测试的准确率却不是呢(两张图片分别是取了两次训练集的不同部分得到的结果)?也想问:拿训练集来测试,得到的准确率和之前训练时的准确率一定是一样的吗?我的代码如下
# coding=utf-8
# Author:by Lilywei

import numpy as np
from numpy import genfromtxt
from sklearn.preprocessing import StandardScaler
import pickle

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Load the feature matrix and label vector.  genfromtxt accepts a file path
# directly, so we pass the filename instead of a manually opened handle
# (the original opened the files and never closed them).
train_data1 = genfromtxt("3ts_feature_60.csv", delimiter=',')
train_label = genfromtxt("3ts_label_60.csv", delimiter=',')
# test_data1 = genfromtxt("test3_ts_feature_60.csv", delimiter=',')
# test_label = genfromtxt("test3_ts_label_60.csv", delimiter=',')

# NOTE(review): the "test" set below is a slice of the TRAINING data
# (rows 20000:30000).  It is not an independent hold-out set — the model
# has already seen these rows during fit, so accuracy on it overstates
# generalization.
test_data = train_data1[20000:30000]
test_label = train_label[20000:30000]

# Wrap in DataFrames so NaN columns can be dropped below.
train_data = pd.DataFrame(train_data1)
test_data = pd.DataFrame(test_data)

# Drop NaN-containing columns based on the TRAINING frame only, then keep the
# very same columns in the test frame.  The original called dropna on each
# frame independently, which can leave the two frames with different column
# sets and silently misalign the features.
train_data = train_data.dropna(axis=1)
test_data = test_data[train_data.columns]

# Fit the scaler on the training data only and reuse it for the test data.
# The original fit a second StandardScaler on the test slice, so the two sets
# were standardized with different means/variances — identical rows no longer
# mapped to identical feature vectors, which alone can depress test accuracy.
scaler1 = StandardScaler().fit(train_data)

train_data = scaler1.transform(train_data)
test_data = scaler1.transform(test_data)


# Aliases used by the training / evaluation code below.  (The commented-out
# preprocessing.scale experiments were superseded by the StandardScaler above;
# the original also contained a no-op `test_label = test_label`, now dropped.)
data = train_data
label = train_label

test_xx = test_data

# Map the raw float class codes onto integer class ids 0..10:
#   0.001 -> 0, 0.01 -> 1, 0.02 -> 2, ..., 0.1 -> 10
# The original did this with 44 near-identical statements using exact float
# equality (`label == 0.07`), which is fragile for values parsed from text.
# np.isclose makes the match robust while preserving the same mapping; the
# replacement order is safe because no produced id (0..10) is close to any
# remaining raw code (0.001..0.1).
_CLASS_CODES = [(0.001, 0)] + [(i / 100.0, i) for i in range(1, 11)]


def _remap_labels(values):
    """Replace each known float code in *values* (in place) by its class id.

    Values matching none of the known codes are left untouched, exactly as in
    the original replacement chain.  Returns the mutated array.
    """
    for raw, cls in _CLASS_CODES:
        values[np.isclose(values, raw)] = cls
    return values


label = _remap_labels(label)
test_label = _remap_labels(test_label)

# Persist the (now integer) test labels for later inspection.  A context
# manager replaces the manual open/close of the original.  Note the file is
# opened in append mode, so repeated runs keep appending to it.
test_label = [round(value) for value in test_label]
with open("test2_slice_pre_3ts_label_60.csv", "a") as test_save_label_file:
    # ", ".join reproduces the "v0, v1, v2" layout the original obtained by
    # stringifying the whole list and stripping the surrounding brackets.
    test_save_label_file.write(", ".join(str(v) for v in test_label))


print("类别替换完成。开始训练。。。。。。")




# Multi-class XGBoost classifier over the 11 integer class ids produced above.
# NOTE(review): `silent` was deprecated and later removed from XGBoost in
# favor of `verbosity` — confirm the installed version still accepts it.
bst =XGBClassifier(max_depth=10, learning_rate=0.05, n_estimators=500, reg_alpha=0.1, reg_lambda=0.5,
                   silent=0, objective='multi:softmax' ,num_class=11,scale_pos_weight=1,  seed=1000)
# Fit on the FULL training set — including rows 20000:30000 that are reused
# below as the "test" slice.  That overlap is why both accuracies can be high.
bst.fit(data, label)
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
#        max_depth=2, min_child_weight=1, missing=None, n_estimators=2,
#        n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
#        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#        silent=True, subsample=1)

# With objective='multi:softmax' predict is expected to return class labels
# already, so the round() below should be a no-op guard — TODO confirm.
train_preds = bst.predict(data)
train_predictions = [round(value) for value in train_preds]

# Accuracy measured on the same data the model was fit on (resubstitution
# accuracy), not an estimate of generalization.
train_accuracy = accuracy_score(label, train_predictions)
print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))

# Predict on the held-out slice (which, again, was drawn from the training
# data, so this accuracy is not a true generalization estimate).
preds = bst.predict(test_xx)
predictions = [round(value) for value in preds]


# Persist the predictions.  A context manager replaces the manual open/close;
# the original also rounded `predictions` a second time here, which was
# redundant and has been dropped.
with open("pre2_slice_3ts_label_60.csv", "a") as pre_label_file:
    # ", ".join reproduces the "v0, v1, v2" layout the original obtained by
    # stringifying the whole list and stripping the surrounding brackets.
    pre_label_file.write(", ".join(str(v) for v in predictions))

test_accuracy = accuracy_score(test_label, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))




# --- Confusion matrices -----------------------------------------------------
from sklearn.metrics import confusion_matrix

# Confusion matrix on the TRAINING set (the original comments had the two
# dataset labels swapped).
print (confusion_matrix(label, train_preds))

# Confusion matrix on the test slice.
print (confusion_matrix(test_label, preds))

# Per-class precision/recall report.  Note classification_report expects
# (y_true, y_pred); the original commented-out call passed the feature matrix:
# from sklearn.metrics import classification_report
# print (classification_report(test_label, predictions))

# Save the trained model.  The context manager guarantees the handle is
# closed; the original passed an open() result straight to pickle.dump and
# never closed it.
with open("bst2.pickle.dat", "wb") as model_file:
    pickle.dump(bst, model_file)
（附图：test_data_in_train5000-15000.png、test_data_in_train10000-20000.png）

fish - Hadooper

赞同来自:

之所以将数据分成训练集和测试集,就是为了模拟算法在用已知数据学出模型之后,遇到未知样本(测试集)时的泛化能力如何。模型本来就是朝着在训练集上最优的方向训练的,所以再拿训练集(或它的一个子集)来"测试",得到的准确率自然会和训练时的准确率接近甚至完全一致——这并不能说明模型的真实泛化性能;想评估泛化能力,必须用模型没见过的数据。

要回复问题请先登录注册