# Doing basic preprocessing


# Load the SPSS data set and extract the classification target.
# (The two imports below were fused onto a single line, which is a syntax error.)
import shap
import pandas as pd

import pyreadstat

# The .sav file is decoded as GBK when it is read.
path = 'ding_ning_all_origin.sav'
df, meta = pyreadstat.read_sav(path, encoding='gbk')

# Target labels: the 'group' column, restricted to the first 270 rows.
Y = df['group'][0:270]

# Notebook output of the column listing of `df`, kept for reference
# (the end of the repr, including the closing bracket, was lost in the export):
# Index(['group', 'Zphonedeletion', 'Zonsetrime', 'ZSpoonerism', 'ZRANdigit',
#        'ZRANpic', 'ZRANdice', 'ZRANcolor', 'ZDigitSpan', 'ZMA', 'Z假字', 'Z部件错误',
#        'Z位置错误', 'Z线条图', 'ZNumberACC', 'ZSymbolACC', 'ZColorACC', 'ZChineseACC',
#        'age', 'IQper', 'Gender', '父亲最高学历(问卷星)', '母亲最高学历(问卷星)', '姓名', '学号',
#        'Zphonologicalaccuracy', 'Zphonologicalspeed', 'ZOaverage', '认知缺陷1.5SD',
#        'Z正字法', 'SESnew', 'SES分组', 'Zacc', 'paper', 'SESQ', 'expriment2',
#        'VAR00001', 'Zphonologicalskills13', 'ZPA11', '父亲', '母亲', '亚类型认知缺陷',
#        'PA3', 'RAN', 'RAN数字平均时间', 'RAN筛子平均时间', 'RAN图片平均时间', 'RAN颜色平均时间',
#        '语音环境交互', 'SES交互(PA)', 'ZVAS', '汉字识别任务得分', '学校', 'Grade', '语素', 'Z语素重评',
#        'Spoonerism', '声母、韵母删除测验', '音位删除测验得分', '正字法', '假字', '部件错误', '位置错误',
#        '线条图', 'DS', '顺序', '倒序', '表征', '智力分数', '智力等级', '障碍1.5SD', 'ZPA3_RAN',
#        '认知缺陷', '词表朗读时间', 'Z词表朗读时间', 'Z词表朗读反', 'ZVAS_verbal', 'ZVAS_nonverbal',
#        '词表朗读得分', '顺序得分', '倒序得分', 'PA2', 'Zrepresentation', '语素产生得分原来',
#        'Z语素产生得分', 'ZPA声母韵母音位删除', '亚类型8.7', '亚类型', 'VAS_verbal',
#        'VAS_nonverbal', 'VAS', 'filter_$', 'ChineseACC', 'NumberACC',
#        'SymbolACC', 'ColorACC', 'RAN环境交互', '语素环境交互'],
# Participant name ('姓名') and student number ('学号') columns, first 270 rows each.
name, number = (df[column][0:270] for column in ('姓名', '学号'))

# Select the feature columns, interpolate over missing values, and rename
# the Chinese-labelled columns to English labels.
#
# NOTE(review): both `needed_col` assignments below are cut off in this
# export -- the continuation line(s) with the remaining column names and the
# closing `]` are missing. Restore them from the original notebook; the
# file does not parse as it stands.
needed_col = ['Zphonedeletion','Zonsetrime','ZSpoonerism','ZRANdigit','ZRANpic','ZRANcolor','ZDigitSpan',
needed_col = ['Zphonedeletion','Zonsetrime','ZSpoonerism','ZRANdigit','ZRANpic','ZRANcolor','ZDigitSpan',
# Earlier variant of the selection (also includes 'ZRANdice' and the four
# separate orthographic columns), kept commented out:
# needed_col = ['Zphonedeletion','Zonsetrime','ZSpoonerism','ZRANdigit','ZRANpic','ZRANdice','ZRANcolor','ZDigitSpan',
#              'ZMA','Z假字','Z部件错误','Z位置错误','Z线条图'
#              ]
# Old-label -> new-label mapping applied by the rename() call below.
# NOTE(review): the closing `}` of this dict is missing in the export.
dict_to_replace = {
    'Z假字' : 'ZPseudoC',
    'Z部件错误' : 'ZIll-formed component',
    'Z位置错误' : 'ZIllegal position',
    'Z线条图': 'ZBW_drawings',
# Keep only the selected columns and the first 270 rows.
df_need = df.loc[:, needed_col][0:270]
# Second-order polynomial interpolation down each column (axis=0).
# NOTE(review): this interpolates between neighbouring rows of the table --
# confirm that this is the intended way to impute missing scores.
qq = df_need.interpolate(method='polynomial', order=2,axis=0)
# Rename columns (axis=1) using the mapping above.
df2 = qq.rename(dict_to_replace, axis=1)  # new method
# X is another name for df2 (no copy is made here).
X = df2
# Display names for the feature columns of X. They are assigned positionally,
# so they must be listed in the same order as the columns of X.
#
# This is the nine-name variant ending in a single 'Orthographic Awareness'
# entry. A 12-name list that used to be assigned just before it was
# overwritten immediately without being read, so it has been dropped; it
# ended with 'Pseudo-character', 'Ill-formed Component', 'Illegal Position'
# and 'Black-and-white Drawings' instead of 'Orthographic Awareness'.
new_colums = ['Phoneme Deleltion', 'Onset And Rime Deletion', 'Spoonerism', 'RAN Digits', 'RAN Pictures',
       'RAN Color', 'Digit Span', 'Morphological Awareness','Orthographic Awareness']

# pandas raises ValueError here if the number of names differs from the
# number of columns in X.
X.columns = new_colums

# Doing classification

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1234)

# Random-forest variant of XGBoost. Alternatives that were tried:
#xgb_model = xgb.XGBRegressor()
#xgb_model = xgb.XGBClassifier()
xgb_model = xgb.XGBRFClassifier()

# Train on the training split. This call was lost in the export (only its
# tail ", Y_train)" survived, fused into the comment line above); without it
# every predict()/predict_proba() call further down fails on an unfitted model.
xgb_model.fit(X_train, Y_train)
# Notebook output (repr of the model; the end of the repr was lost in the export):
# XGBRFClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                 colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain',
#                 interaction_constraints='', max_delta_step=0, max_depth=6,
#                 min_child_weight=1, missing=nan, monotone_constraints='()',
#                 n_estimators=100, n_jobs=16, num_parallel_tree=100,
#                 objective='binary:logistic', random_state=0, reg_alpha=0,
#                 scale_pos_weight=1, tree_method='exact', validate_parameters=1,
# Evaluate the model: ROC/AUC on the training split, accuracy on the test split.
from sklearn import metrics
y_pred = xgb_model.predict(X_train)

# The ROC curve is built from the predicted probability of the positive
# class instead of the hard class predictions: with hard labels the curve
# has a single operating point and the AUC no longer measures ranking quality.
# Column 1 of predict_proba is the second entry of xgb_model.classes_
# (assumed to be the positive label 1 -- confirm against the coding of 'group').
train_scores = xgb_model.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(Y_train, train_scores)
# Keep the AUC in a name; as a bare expression it is discarded when the
# file runs as a script.
train_auc = metrics.auc(fpr, tpr)
from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split, printed as a percentage.
y_pred = xgb_model.predict(X_test)
print(accuracy_score(Y_test , y_pred)*100)

import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of the model's predictions over all rows of X.
y_pred = xgb_model.predict(X)
cf_matrix = confusion_matrix(Y, y_pred)

# Each cell is annotated with its raw count on the first line and its share
# of all samples on the second.
cf_share = cf_matrix / np.sum(cf_matrix)
group_counts = list(cf_matrix.flatten())
group_percentages = list(map('{0:.2%}'.format, cf_share.flatten()))

# reshape(2, 2) matches a two-class confusion matrix.
labels = np.asarray(
    [f'{v1}\n{v2}\n' for v1, v2 in zip(group_counts, group_percentages)]
).reshape(2, 2)
sns.heatmap(cf_share, annot=labels, fmt='', cmap='Blues')
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix on the training split, drawn as fractions of all
# training samples (cells formatted as percentages).
y_pred = xgb_model.predict(X_train)
cf_matrix = confusion_matrix(Y_train, y_pred)

train_share = cf_matrix / cf_matrix.sum()
sns.heatmap(train_share, annot=True, fmt='.2%', cmap='Blues')
from sklearn.metrics import auc
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# good point 88
# set point 8
# NOTE(review): the two notes above presumably record random_state values
# tried for the cross-validation split (8 is the value passed to
# StratifiedKFold in this file) -- confirm.

# Confusion matrix on the training split, drawn with scikit-learn's
# ConfusionMatrixDisplay.
y_pred = xgb_model.predict(X_train)
cm = confusion_matrix(Y_train, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot(cmap='Blues')

# 5-fold stratified cross-validation with shuffling; random_state fixes the
# fold assignment.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state = 8)
# Cross-validation: number of folds

# Accumulators for the per-fold interpolated TPR curves and AUC values, and
# the common FPR grid (100 points on [0, 1]) the curves are interpolated onto.
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# ROC_5_fold
# NOTE(review): this loop is damaged in the export and does not parse:
#   * the `for` header is fused with the tail of another statement
#     (`[train], Y.iloc[train])` -- presumably a
#     `.fit(X.iloc[train], Y.iloc[train])` call on the estimator);
#   * the `RocCurveDisplay.from_estimator(` call has lost its other
#     arguments and its closing parenthesis;
#   * nothing visible appends to `tprs` / `aucs`, and no plotting of the
#     mean curve or its band is visible.
# The structure resembles scikit-learn's "ROC with cross-validation"
# example; restore the missing lines from the original notebook.
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, Y)):[train], Y.iloc[train])
    viz = RocCurveDisplay.from_estimator(
        name="ROC fold {}".format(i),
    # Resample this fold's ROC curve onto the common FPR grid and pin its start at 0.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    # Mean ROC curve across folds (end pinned at 1) and its AUC.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    # Band of one standard deviation around the mean TPR, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
import shap

# Build a SHAP tree explainer for the model and compute SHAP values for
# every row of X.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer(X)
# NOTE(review): in the export the line above was fused with the tail of a
# lost plotting call ("..., max_display=20,show=False) # default is
# max_display=12"), which made it a syntax error. Only the assignment could
# be restored with certainty; re-add the plotting call from the original
# notebook if the figure is still needed.
#plt.savefig('importance_xgb.pdf', format='pdf', dpi=1200, bbox_inches='tight')
# Notebook warning output: ntree_limit is deprecated, use `iteration_range` or model slicing instead.


# Inspect one sample: the row(s) of X whose index label equals 80.
qq = X.loc[X.index == 80]
print('model prediction is {}'.format(xgb_model.predict(qq)))
print('probability is ', xgb_model.predict_proba(qq))

# From here on shap_values holds the explanation of this selection only.
shap_values = explainer(qq)
# Notebook output:
# model prediction is [1.]
# probability is  [[0.15268368 0.8473163 ]]
## Double brackets select columns
import pandas as pd

# IDs read from the 'ID' column of an external spreadsheet.
ding = pd.read_excel('longlong.xlsx')
needed = ding['ID'].tolist()

# Rows of X whose group label is 1, with the label attached as an extra
# 'group' column. The label is attached to a copy (DataFrame.assign) rather
# than written into X: X is the feature matrix handed to the model, the SHAP
# explainer and the cross-validation split elsewhere in this file, and a
# label column stored there would leak the target and change the number of
# features the fitted model sees.
all_dyxia = X.assign(group=Y)
all_dyxia = all_dyxia[all_dyxia['group'] == 1]

# Name and student-number entries of every row of df whose group is 1.
name = df['姓名'][df['group'] == 1]
number = df['学号'][df['group'] == 1]
prediction = []
prediction_prob_0 = []
prediction_prob_1 = []

for i in range(len(all_dyxia)):
    # One-row DataFrame (the double brackets keep it two-dimensional)
    # without the label column.
    qq = all_dyxia.iloc[[i]].drop(['group'],axis=1)
    # NOTE(review): the rest of the loop body is missing from this export.
    # Presumably it appended the model's prediction and class probabilities
    # for qq to the three lists above -- restore it from the original notebook.
import pandas as pd

# Build a results table from the group-1 rows and attach predictions and
# identifiers to it.
# NOTE(review): this rebinds `df`, the name used earlier in the file for the
# full loaded data set.
df = pd.DataFrame(all_dyxia)

# Plain lists are assigned positionally, so each must have exactly one entry
# per row of df.
df['prediction'] = prediction
df['prediction_prob_0'] = prediction_prob_0
df['prediction_prob_1'] = prediction_prob_1
# NOTE(review): if `name` / `number` are pandas Series (as in the visible
# code), they are matched to rows by index label, not by position -- confirm
# that the labels line up.
df['name'] = name 
df['number'] = number
import pdfkit

# Write interactive SHAP force plots to HTML in two passes: first one file
# per row position of all_dyxia under all/, then one file per entry of
# `needed` in the working directory.
# NOTE(review): shap_values[i] indexes whatever shap_values was last computed
# on -- confirm that those positions correspond to the rows of all_dyxia
# and to the values stored in `needed`.
for template, indices in (
    ("all/index{}.html", range(len(all_dyxia))),
    ("index{}.html", needed),
):
    for i in indices:
        f = shap.plots.force(shap_values[i], show=False, link='logit')
        shap.save_html(template.format(i), f)

import pdfkit
# Path of the wkhtmltopdf executable handed to pdfkit (a Windows install path).
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
# pdfkit configuration object pointing at that executable.
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
# Example conversion call, currently disabled:
#pdfkit.from_url("", "out.pdf", configuration=config)

import imgkit

import matplotlib.pyplot as plt

# Waterfall plots for three individual observations, taken at positions
# 66, 136 and 199 of shap_values.
shap.plots.waterfall(shap_values[66],show=True)
#plt.savefig('case1.pdf', format='pdf', dpi=1200, bbox_inches='tight')


shap.plots.waterfall(shap_values[136],show=True)
#plt.savefig('case2.pdf', format='pdf', dpi=1200, bbox_inches='tight')


# show=False leaves the figure open so that savefig writes the actual plot.
# With the default show=True the figure is displayed first, and the savefig
# call that follows saves an empty figure (the notebook output
# "<Figure size 432x288 with 0 Axes>" is that empty figure).
shap.plots.waterfall(shap_values[199], show=False)
plt.savefig('case3.pdf', format='pdf', dpi=1200, bbox_inches='tight')
# Display the figure after it has been written, as the original cell did.
plt.show()


# Notebook output: <Figure size 432x288 with 0 Axes>
import pandas as pd

# Table of the raw SHAP values with the display names as column labels
# (presumably one row per explained sample and one column per feature --
# confirm the shape of shap_values.values).
df = pd.DataFrame(shap_values.values, columns=new_colums)
# NOTE(review): if name / number are pandas Series they are matched to rows
# by index label; df has a fresh 0..n-1 index here, so confirm the labels
# line up.
df['name'] = name
df['number'] = number
# 5-fold stratified cross-validation with shuffling; random_state fixes the
# fold assignment.
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state = 8)
# Cross-validation: number of folds

# Accumulators for the per-fold interpolated TPR curves and AUC values, and
# the common FPR grid (100 points on [0, 1]) the curves are interpolated onto.
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

# ROC_5_fold
# NOTE(review): this loop is damaged in the export and does not parse:
#   * the `for` header is fused with the tail of another statement
#     (`[train], Y.iloc[train])` -- presumably a
#     `.fit(X.iloc[train], Y.iloc[train])` call on the estimator);
#   * the `RocCurveDisplay.from_estimator(` call has lost its other
#     arguments and its closing parenthesis;
#   * nothing visible appends to `tprs` / `aucs`, and no plotting of the
#     mean curve or its band is visible.
# The structure resembles scikit-learn's "ROC with cross-validation"
# example; restore the missing lines from the original notebook.
fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, Y)):[train], Y.iloc[train])
    viz = RocCurveDisplay.from_estimator(
        name="ROC fold {}".format(i),
    # Resample this fold's ROC curve onto the common FPR grid and pin its start at 0.
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    # Mean ROC curve across folds (end pinned at 1) and its AUC.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    # Band of one standard deviation around the mean TPR, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
import shap

# The SHAP Values
# Compute SHAP values for every row of X with SHAP's generic Explainer.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X)
# NOTE(review): in the export this cell appeared twice in a row, and each
# time the assignment above was fused with the tail of a lost plotting call
# ("..., max_display=20,show=False) # default is max_display=12"), which made
# it a syntax error. The identical recomputation has been collapsed into one;
# re-add the plotting call(s) from the original notebook if still needed.

# Raw SHAP values as a plain DataFrame.
dn_df = pd.DataFrame(shap_values.values)
# Waterfall plot for the observation at position 88 of shap_values.
shap.plots.waterfall(shap_values[88])




