# Doing basic preprocessing

import shap
import pandas as pd

import pyreadstat

# read the SPSS file; the column names are encoded in GBK
path = 'ding_ning_all_origin.sav'
df, meta = pyreadstat.read_sav(path, encoding='gbk')
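The second return value carries the SPSS metadata; a quick check (using pyreadstat's standard metadata attributes) can confirm the GBK decoding worked before any slicing:

```python
# Inspect variable names against their SPSS labels to verify the encoding
for col, label in zip(meta.column_names, meta.column_labels):
    print(col, '->', label)
```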

Y = df['group'][0:270]   # the first 270 rows form the analysed sample
df.columns

Index(['group', 'Zphonedeletion', 'Zonsetrime', 'ZSpoonerism', 'ZRANdigit',
       'ZRANpic', 'ZRANdice', 'ZRANcolor', 'ZDigitSpan', 'ZMA', 'Z假字', 'Z部件错误',
       'Z位置错误', 'Z线条图', 'ZNumberACC', 'ZSymbolACC', 'ZColorACC', 'ZChineseACC',
       'age', 'IQper', 'Gender', '父亲最高学历(问卷星)', '母亲最高学历(问卷星)', '姓名', '学号',
       'Zphonologicalaccuracy', 'Zphonologicalspeed', 'ZOaverage', '认知缺陷1.5SD',
       'Z正字法', 'SESnew', 'SES分组', 'Zacc', 'paper', 'SESQ', 'expriment2',
       'VAR00001', 'Zphonologicalskills13', 'ZPA11', '父亲', '母亲', '亚类型认知缺陷',
       'PA3', 'RAN', 'RAN数字平均时间', 'RAN筛子平均时间', 'RAN图片平均时间', 'RAN颜色平均时间',
       '语音环境交互', 'SES交互(PA)', 'ZVAS', '汉字识别任务得分', '学校', 'Grade', '语素', 'Z语素重评',
       'Spoonerism', '声母、韵母删除测验', '音位删除测验得分', '正字法', '假字', '部件错误', '位置错误',
       '线条图', 'DS', '顺序', '倒序', '表征', '智力分数', '智力等级', '障碍1.5SD', 'ZPA3_RAN',
       '认知缺陷', '词表朗读时间', 'Z词表朗读时间', 'Z词表朗读反', 'ZVAS_verbal', 'ZVAS_nonverbal',
       '词表朗读得分', '顺序得分', '倒序得分', 'PA2', 'Zrepresentation', '语素产生得分原来',
       'Z语素产生得分', 'ZPA声母韵母音位删除', '亚类型8.7', '亚类型', 'VAS_verbal',
       'VAS_nonverbal', 'VAS', 'filter_$', 'ChineseACC', 'NumberACC',
       'SymbolACC', 'ColorACC', 'RAN环境交互', '语素环境交互'],
      dtype='object')
name = df['姓名'][0:270]    # 姓名 = name
number = df['学号'][0:270]  # 学号 = student ID


# Twelve-predictor version: the four orthographic subtests kept separate.
needed_col = ['Zphonedeletion','Zonsetrime','ZSpoonerism','ZRANdigit','ZRANpic','ZRANcolor','ZDigitSpan',
             'ZMA','Z假字','Z部件错误','Z位置错误','Z线条图']
# Nine-predictor version (used below): the orthographic subtests replaced
# by the composite ZOaverage. This assignment overwrites the list above.
needed_col = ['Zphonedeletion','Zonsetrime','ZSpoonerism','ZRANdigit','ZRANpic','ZRANcolor','ZDigitSpan',
             'ZMA','ZOaverage']
# map the Chinese-named orthographic columns to English display names
dict_to_replace = {
    'Z假字' : 'ZPseudoC',
    'Z部件错误' : 'ZIll-formed component',
    'Z位置错误' : 'ZIllegal position',
    'Z线条图': 'ZBW_drawings',
}
df_need = df.loc[:, needed_col][0:270]
# fill missing scores by second-order polynomial interpolation down each column
qq = df_need.interpolate(method='polynomial', order=2, axis=0)
df2 = qq.rename(dict_to_replace, axis=1)  # rename via the mapping above
X = df2
# Display names for the twelve-predictor version.
new_colums = ['Phoneme Deletion', 'Onset rime Deletion', 'Spoonerism', 'RAN Digits', 'RAN Pictures',
       'RAN Color', 'Digit Span', 'Morphological Awareness', 'Pseudo-character', 'Ill-formed Component',
       'Illegal Position', 'Black-and-white Drawings']
# Display names for the nine-predictor version (overwrites the list above).
new_colums = ['Phoneme Deletion', 'Onset and Rime Deletion', 'Spoonerism', 'RAN Digits', 'RAN Pictures',
       'RAN Color', 'Digit Span', 'Morphological Awareness', 'Orthographic Awareness']

X.columns = new_colums
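Because both `needed_col` and `new_colums` exist in two versions, a guard (hypothetical, not in the original) makes any mismatch fail loudly instead of raising deep inside pandas:

```python
# Fail fast if the selected columns and display names drift out of sync
assert len(new_colums) == X.shape[1], \
    f'{X.shape[1]} columns but {len(new_colums)} names'
```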

# Doing classification

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, 
         test_size = 0.2, random_state = 1234)
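If the two groups are unbalanced, a stratified split keeps the class ratio equal across train and test (an alternative sketch; the original split is unstratified):

```python
# Hypothetical stratified variant of the split above
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1234, stratify=Y)
```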
xgb_model = xgb.XGBRFClassifier()   # random-forest-style XGBoost ensemble
#xgb_model = xgb.XGBRegressor()
#xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_train, Y_train)
[16:16:51] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.


C:\ProgramData\Anaconda3\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
  warnings.warn(label_encoder_deprecation_msg, UserWarning)
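The message spells out its own fix; constructing the model with the options it names (xgboost 1.4-era parameters, taken from the warnings above) silences both warnings:

```python
# As the warnings suggest: disable the deprecated label encoder and
# pin the evaluation metric explicitly
xgb_model = xgb.XGBRFClassifier(use_label_encoder=False, eval_metric='logloss')
```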





XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
                interaction_constraints='', max_delta_step=0, max_depth=6,
                min_child_weight=1, missing=nan, monotone_constraints='()',
                n_estimators=100, n_jobs=16, num_parallel_tree=100,
                objective='binary:logistic', random_state=0, reg_alpha=0,
                scale_pos_weight=1, tree_method='exact', validate_parameters=1,
                verbosity=None)
# predict on X
from sklearn import metrics
y_pred = xgb_model.predict(X_train)

# Note: predict() returns hard 0/1 labels, so this ROC has a single
# operating point; a probability-based variant follows below.
fpr, tpr, thresholds = metrics.roc_curve(Y_train, y_pred)
metrics.auc(fpr, tpr)
C:\ProgramData\Anaconda3\lib\site-packages\xgboost\data.py:112: UserWarning: Use subset (sliced data) of np.ndarray is not recommended because it will generate extra copies and increase memory consumption
  warnings.warn(





0.930839495432797
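For a threshold-independent AUC, the positive-class probabilities can be scored instead (a sketch; the value above came from the hard labels):

```python
# Hypothetical variant: score the class-1 probabilities for a smooth ROC
proba = xgb_model.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_train, proba)
print(metrics.auc(fpr, tpr))
```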
from sklearn.metrics import accuracy_score


y_pred = xgb_model.predict(X_test)
print(accuracy_score(Y_test , y_pred)*100)
83.33333333333334


import seaborn as sns
from sklearn.metrics import confusion_matrix

# confusion matrix over the full sample, annotated with counts and percentages

y_pred = xgb_model.predict(X)
cf_matrix = confusion_matrix(Y, y_pred)

group_counts = cf_matrix.flatten()
group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]

labels = [f'{v1}\n{v2}\n' for v1, v2 in zip(group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=labels, fmt='', cmap='Blues')
plt.savefig('all.pdf')

(figure: confusion-matrix heatmap with counts and percentages, full sample)
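From the same 2×2 matrix, the screening-relevant rates fall out directly (a hypothetical follow-up, assuming group 1 is the positive class):

```python
# sklearn orders the binary confusion matrix [[tn, fp], [fn, tp]]
tn, fp, fn, tp = cf_matrix.ravel()
print('sensitivity = {:.2%}'.format(tp / (tp + fn)))
print('specificity = {:.2%}'.format(tn / (tn + fp)))
```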


import seaborn as sns
from sklearn.metrics import confusion_matrix

# confusion matrix on the training set, annotated with percentages

y_pred = xgb_model.predict(X_train)
cf_matrix = confusion_matrix(Y_train, y_pred)

sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,
            fmt='.2%', cmap='Blues')





<AxesSubplot:>

(figure: confusion-matrix heatmap with percentages, training set)

from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# good point 88

# set point 8 

# confusion matrix via ConfusionMatrixDisplay, training set

y_pred = xgb_model.predict(X_train)
cm = confusion_matrix(Y_train, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot(cmap='Blues')


(figure: ConfusionMatrixDisplay, training set)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
# 5-fold stratified cross-validation

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# ROC_5_fold
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, Y)):
    xgb_model.fit(X.iloc[train], Y.iloc[train])
    viz = RocCurveDisplay.from_estimator(
        xgb_model,
        X.iloc[test],
        Y.iloc[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# the fold summaries belong after the loop; computing them inside it
# just overwrites the same variables on every iteration
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
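The summary arrays are computed but never drawn; a minimal sketch of the usual overlay, following the standard scikit-learn cross-validated-ROC recipe:

```python
# Sketch: overlay the mean ROC and a +/- 1 SD band on the same axes
ax.plot(mean_fpr, mean_tpr, color='b', lw=2,
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc))
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                label=r'$\pm$ 1 std. dev.')
ax.legend(loc='lower right')
plt.show()
```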

(figure: ROC curves for the five cross-validation folds)

import shap
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X)
shap.plots.bar(shap_values, max_display=20,show=False) # default is max_display=12
#plt.savefig('importance_xgb.pdf', format='pdf', dpi=1200, bbox_inches='tight')
#plt.close()
ntree_limit is deprecated, use `iteration_range` or model slicing instead.

(figure: SHAP bar plot of mean |SHAP value| per feature)

qq = X[X.index == 80]
# grab the row at index 80
print('model prediction is {}'.format(xgb_model.predict(qq)))
print('probability is ', xgb_model.predict_proba(qq))

shap_values = explainer(qq)
shap.plots.force(shap_values, show=False, link='logit')
# observation 80
model prediction is [1.]
probability is  [[0.15268368 0.8473163 ]]
Visualization omitted, Javascript library not loaded! Have you run `initjs()` in this notebook?
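shap prints that fallback whenever its JavaScript bundle is missing from the notebook; loading it once per session fixes all the force plots below:

```python
import shap
shap.initjs()  # injects shap's JS/D3 bundle so force plots render inline
```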
## double-bracket indexing selects a column as a DataFrame
import pandas as pd

# the Excel sheet lists the IDs whose force plots are exported below
ding = pd.read_excel('longlong.xlsx')
needed = ding['ID'].tolist()
Unknown extension is not supported and will be removed
Conditional Formatting extension is not supported and will be removed
len(needed)
21
X['group'] = Y
all_dyxia = X[X['group'] == 1]   # all cases with group == 1 (the dyslexia group)

len(all_dyxia)
123
name = df['姓名'][df['group'] == 1]
number = df['学号'][df['group'] == 1]
prediction = []
prediction_prob_0 = []
prediction_prob_1 = []


for i in range(len(all_dyxia)):
    qq = all_dyxia.iloc[[i]].drop(['group'], axis=1)
    # double brackets keep the row as a one-row DataFrame
    prediction.append(xgb_model.predict(qq))
    prediction_prob_0.append(xgb_model.predict_proba(qq)[0][0])
    prediction_prob_1.append(xgb_model.predict_proba(qq)[0][1])
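The loop calls the model three times per row; a vectorised equivalent (a sketch over the same columns) gives identical numbers in one pass:

```python
# Hypothetical vectorised version of the per-row loop above
feats = all_dyxia.drop(columns=['group'])
probs = xgb_model.predict_proba(feats)   # shape: (n_rows, 2)
prediction = list(xgb_model.predict(feats))
prediction_prob_0 = list(probs[:, 0])
prediction_prob_1 = list(probs[:, 1])
```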
import pandas as pd

df = pd.DataFrame(all_dyxia)

df['prediction'] = prediction
df['prediction_prob_0'] = prediction_prob_0
df['prediction_prob_1'] = prediction_prob_1
df['name'] = name 
df['number'] = number
df.to_excel('预测和概率_全部.xlsx', encoding='UTF-8')  # filename: 'predictions and probabilities, all cases'
import pdfkit

# shap_values currently holds the single-row explanation from above;
# recompute it over the full feature matrix (minus the attached label
# column) before indexing by position
shap_values = explainer(X.drop(columns=['group']))

for i in range(len(all_dyxia)):
    f = shap.plots.force(shap_values[i], show=False, link='logit')
    shap.save_html("all/index{}.html".format(i), f)
    #pdfkit.from_file("index{}.html".format(i),"index{}.pdf".format(i),configuration=config,options={'javascript-delay':'5000'})
import pdfkit

# export force plots only for the cases whose IDs are listed in longlong.xlsx
for i in needed:
    f = shap.plots.force(shap_values[i], show=False, link='logit')
    shap.save_html("index{}.html".format(i), f)
    #pdfkit.from_file("index{}.html".format(i),"index{}.pdf".format(i),configuration=config,options={'javascript-delay':'5000'})

shap.plots.force(shap_values[1],link='logit')
shap.plots.force(shap_values[7],link='identity')
import pdfkit

path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
#pdfkit.from_url("http://google.com", "out.pdf", configuration=config)

import imgkit

# convert the last exported HTML to PDF (i still holds the final loop value)
pdfkit.from_file("index{}.html".format(i), "index{}.pdf".format(i), configuration=config)
True
import matplotlib.pyplot as plt
shap.plots.waterfall(shap_values[66], show=True)  # case study: observation 66
#plt.savefig('case1.pdf', format='pdf', dpi=1200, bbox_inches='tight')
#plt.close()

(figure: SHAP waterfall plot, observation 66)

shap.plots.waterfall(shap_values[136], show=True)  # case study: observation 136
#plt.savefig('case2.pdf', format='pdf', dpi=1200, bbox_inches='tight')
#plt.close()

(figure: SHAP waterfall plot, observation 136)

shap.plots.waterfall(shap_values[199])  # case study: observation 199
# savefig here grabs a fresh figure because the waterfall was already
# rendered, hence the empty-Figure repr in the output below
plt.savefig('case3.pdf', format='pdf', dpi=1200, bbox_inches='tight')
#plt.close()

(figure: SHAP waterfall plot, observation 199)

<Figure size 432x288 with 0 Axes>
import pandas as pd

df = pd.DataFrame(shap_values.values)
df.columns = new_colums
df.to_excel('dingning_excel.xlsx')   # note: saved before name/number are attached
df['name'] = name
df['number'] = number
# 'group' was appended to X above; drop it again so the label cannot
# leak into training as a feature
Xf = X.drop(columns=['group'])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)
# 5-fold stratified cross-validation, re-run

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# ROC_5_fold
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(Xf, Y)):
    xgb_model.fit(Xf.iloc[train], Y.iloc[train])
    viz = RocCurveDisplay.from_estimator(
        xgb_model,
        Xf.iloc[test],
        Y.iloc[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# as above, the fold summaries belong after the loop
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
# The SHAP values
import shap
explainer = shap.Explainer(xgb_model)
shap_values = explainer(Xf)   # explain the label-free feature matrix
shap.plots.bar(shap_values, max_display=20,show=False) # default is max_display=12

(figure: SHAP bar plot of mean |SHAP value| per feature)


dn_df = pd.DataFrame(shap_values.values)
dn_df.columns = needed_col   # original Z-score variable names, not the English display names
dn_df.to_csv('dingning-all.csv')
shap.plots.waterfall(shap_values[88])  # case study: observation 88

(figure: SHAP waterfall plot, observation 88)

shap.summary_plot(shap_values)

(figure: SHAP summary beeswarm plot)

shap.force_plot(shap_values[0:10])
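Passing a slice of the Explanation object stacks the per-sample force plots into one interactive plot; like the single-sample version, it only renders after `shap.initjs()` has been run in the session (see above).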