import os
import feather
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, plot_roc_curve, roc_auc_score
from sle.modeling import generate_data, prep_data, eval_model, calc_roc_cv, plot_roc_cv
from sle.penalization import regularization_range, choose_C
%load_ext autoreload
%autoreload 2
Main Results
Setup
= generate_data('imid')
data_all = generate_data('rest') X_test_df
Code for loading original data
= os.path.join('..', 'data', 'processed')
data_dir = feather.read_dataframe(os.path.join(data_dir, 'imid.feather'))
data_all = feather.read_dataframe(os.path.join(data_dir,'rest.feather')) X_test_df
Projection
See the projection.ipynb
notebook for extended results and analyses
= PowerTransformer(method='box-cox') trf
= pd.concat([data_all, X_test_df[X_test_df.Class.isin(['preSLE', 'rest_large'])]])
df_combined 'Class'] = pd.Categorical(df_combined.Class, categories = ['SLE','IMID','nonIMID','rest_large','BBD','preSLE']) # order for plotting: largest groups first
df_combined[='Class', inplace=True)
df_combined.sort_values(by= np.array(df_combined.loc[:, ~df_combined.columns.isin(["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ['Class'] + ['dsDNA1'])])
X_combined = trf.fit_transform(X_combined+1) X_combined_scaled
X_combined.shape
(1887, 57)
df_combined.Class.value_counts()
SLE 483
rest_large 462
BBD 361
IMID 346
nonIMID 218
preSLE 17
Name: Class, dtype: int64
= umap.UMAP(random_state=42)
umap_obj = umap_obj.fit_transform(X_combined_scaled) Y_umap_combined
'umap_1'] = Y_umap_combined[:,0]
df_combined['umap_2'] = Y_umap_combined[:,1] df_combined[
Figure 1
with sns.plotting_context("paper"):
set(font="Arial")
sns.'white')
sns.set_style(= plt.subplots(figsize=(5.5, 5.5))
f, ax = sns.scatterplot(x='umap_1',y='umap_2',
g ='Class', style='Class', size='Class',
hue= {"SLE": "o", "rest_large": "o", "BBD": "o", "IMID": "o", "nonIMID": "o", "preSLE": "o"},
markers = {"SLE": 20, "rest_large": 20, "BBD": 20, "IMID": 20, "nonIMID": 20, "preSLE": 20},
sizes ={"SLE": '#0072B2', "rest_large": '#D55E00', "BBD": '#CC79A7', "IMID": '#009E73', "nonIMID": '#E69F00', "preSLE": '#000000'},
palette=.7,
alpha=df_combined,
data=ax)
ax= ax.legend(frameon=False, loc = 'upper left', bbox_to_anchor=(0,1.05))
legend 'Dimension 1'); ax.set_ylabel('Dimension 2');
ax.set_xlabel(-6,0)
ax.set_ylim(= ['','SLE','IMID','Non-IMID','Rest','BBD','Pre-SLE']; # remove legend title and change group spelling
new_labels for t, l in zip(legend.texts, new_labels): t.set_text(l)
=f,ax=ax) sns.despine(fig
Prediction models
= prep_data(data_all, 'SLE', 'nonIMID', drop_cols = ["Arthritis","Pleurisy","Pericarditis","Nefritis","dsDNA1"])
X_nonIMID, y_nonIMID = X_nonIMID.dsDNA2.values.reshape(-1,1) dsDNA_nonIMID
= LogisticRegression(penalty = 'none', max_iter = 10000) clf
= RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=40) cv
SLE vs. non-IMID
See the SLE Versus nonIMID.ipynb
notebook for extended analyses and results
Logistic regression: Only dsDNA from microarray
= clf.fit(dsDNA_nonIMID, y_nonIMID) lr_dsDNA_nonIMID
=cv, scoring = 'roc_auc')) np.mean(cross_val_score(clf, dsDNA_nonIMID, y_nonIMID, cv
0.8006217102395325
Logistic regression: Whole microarray
= X_nonIMID + 1 # Some < 0 values > -1. Because negative fluorescence isn't possible, and Box-Cox requires strictly positive values, add ofset Xp1_nonIMID
= Pipeline([
pipe_trf 'transform', trf),
('clf', clf)]) (
=cv, scoring = 'roc_auc')) np.mean(cross_val_score(pipe_trf, Xp1_nonIMID, y_nonIMID, cv
0.8337294463757692
LASSO
= LogisticRegression(penalty='l1', max_iter = 10000, solver = 'liblinear') clf_lasso
= 100
K = regularization_range(Xp1_nonIMID,y_nonIMID,trf)
lambda_min, lambda_max = np.logspace(np.log10(1/lambda_min),np.log10(1/lambda_max), K)
Cs_lasso_nonIMID = Pipeline([
pipe 'trf', trf),
('clf', clf_lasso)
(
])= [{
params "clf__C": Cs_lasso_nonIMID
}]
= GridSearchCV(pipe, params, cv = cv, scoring = 'roc_auc', refit=choose_C) lasso_nonIMID
%%time
lasso_nonIMID.fit(Xp1_nonIMID,y_nonIMID)
CPU times: user 3min 33s, sys: 64.1 ms, total: 3min 33s
Wall time: 3min 33s
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=40),
estimator=Pipeline(steps=[('trf',
PowerTransformer(method='box-cox')),
('clf',
LogisticRegression(max_iter=10000,
penalty='l1',
solver='liblinear'))]),
param_grid=[{'clf__C': array([6.52012573, 6.08069108, 5.67087285, 5.288675 , 4.932236 ,
4.5998198 , 4.28980735, 4.00068869, 3.73105566, 3...
0.04932236, 0.0459982 , 0.04289807, 0.04000689, 0.03731056,
0.03479595, 0.03245082, 0.03026374, 0.02822407, 0.02632186,
0.02454785, 0.02289341, 0.02135047, 0.01991152, 0.01856955,
0.01731803, 0.01615085, 0.01506234, 0.01404719, 0.01310045,
0.01221753, 0.0113941 , 0.01062618, 0.00991001, 0.00924211,
0.00861922, 0.00803832, 0.00749656, 0.00699132, 0.00652013])}],
refit=<function choose_C at 0x7f80d43e74d0>, scoring='roc_auc')
Best model AUC:
'mean_test_score'].max() lasso_nonIMID.cv_results_[
0.8373932541974528
AUC with lambda selected through 1 SE rule:
'mean_test_score'][lasso_nonIMID.best_index_] lasso_nonIMID.cv_results_[
0.8334140652811983
Number of non-zero coefficients in this model:
= (pd.Series(lasso_nonIMID.best_estimator_.named_steps.clf.coef_.squeeze(),
coefs_final_nonimid = X_nonIMID.columns)[lambda x: x!=0].sort_values())
index len(coefs_final_nonimid)
29
Figure 2 (top panel)
sns.reset_defaults()with sns.plotting_context("paper"):
= plt.subplots(figsize=(4.5, 4.5))
fig, ax = calc_roc_cv(lasso_nonIMID.best_estimator_,cv,Xp1_nonIMID,y_nonIMID)
tprs, aucs = plot_roc_cv(tprs, aucs, fig, ax, fig_title='SLE vs. non-IMID', line_color='#E69F00', legend_label='LASSO')
fig, ax = calc_roc_cv(lr_dsDNA_nonIMID,cv,dsDNA_nonIMID,y_nonIMID)
tprs, aucs = plot_roc_cv(tprs, aucs, fig, ax, reuse=True, line_color='gray', legend_label='dsDNA (from chip) only')
fig, ax =fig,ax=ax, trim=True)
sns.despine(fig=False)
plt.legend(frameon plt.show()
Supplemental Figure 1a
sns.reset_defaults()= sns.diverging_palette(250, 15, s=75, l=40, center="dark", as_cmap=True)
pal with sns.plotting_context("paper"):
with sns.axes_style("whitegrid"):
= plt.subplots(figsize=(2.5, 5))
fig, ax = sns.scatterplot(y=coefs_final_nonimid.index, x=coefs_final_nonimid.values,
g =coefs_final_nonimid.values, palette=pal, legend=False)
hueset(title='SLE vs.non-IMID',
ax.=np.linspace(-1.0,1.0,5), xlim=[-1.2,1.2], xlabel=r'$\beta$')
xticks=fig,ax=ax, left=True, bottom=True) sns.despine(fig
SLE vs. BBD (Blood Bank Donors)
See the SLE Versus BBD.ipynb
notebook for extended analyses and results
Logistic regression: Only one feature
= prep_data(data_all, 'SLE', 'BBD', drop_cols = ["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"])
X, y = X.dsDNA2.values.reshape(-1,1)
dsDNA = X.Ro60.values.reshape(-1,1) Ro60
dsDNA from microarray
= clf.fit(dsDNA, y) lr_dsDNA
=cv, scoring = 'roc_auc')) np.mean(cross_val_score(clf, dsDNA, y, cv
0.7956906204723647
Ro60 from microarray
=cv, scoring = 'roc_auc')) np.mean(cross_val_score(clf, Ro60, y, cv
0.8980197798817389
LASSO
= X + 1 # Some values are between -1 and 0. Because negative fluorescence isn't possible, and Box-Cox requires strictly positive values, add ofset Xp1
= regularization_range(Xp1,y,trf) lambda_min, lambda_max
= 100
K = np.logspace(np.log10(1/lambda_min),np.log10(1/lambda_max), K)
Cs_lasso = Pipeline([
pipe 'trf', trf),
('clf', clf_lasso)
(
])= [{
params "clf__C": Cs_lasso
}]
= GridSearchCV(pipe, params, cv = cv, scoring = 'roc_auc', refit=choose_C) search_lasso
%%time
search_lasso.fit(Xp1,y)
CPU times: user 9min 2s, sys: 17min 14s, total: 26min 16s
Wall time: 4min 36s
GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=5, n_splits=5, random_state=40),
estimator=Pipeline(steps=[('trf',
PowerTransformer(method='box-cox')),
('clf',
LogisticRegression(max_iter=10000,
penalty='l1',
solver='liblinear'))]),
param_grid=[{'clf__C': array([3.42342671, 3.19269921, 2.97752197, 2.77684695, 2.58969676,
2.41515987, 2.25238618, 2.10058289, 1.95901...
0.02589697, 0.0241516 , 0.02252386, 0.02100583, 0.01959011,
0.0182698 , 0.01703848, 0.01589014, 0.0148192 , 0.01382043,
0.01288898, 0.01202031, 0.01121018, 0.01045465, 0.00975004,
0.00909292, 0.00848009, 0.00790856, 0.00737555, 0.00687846,
0.00641488, 0.00598254, 0.00557933, 0.0052033 , 0.00485262,
0.00452557, 0.00422056, 0.00393611, 0.00367083, 0.00342343])}],
refit=<function choose_C at 0x7f80d43e74d0>, scoring='roc_auc')
Best model AUC:
'mean_test_score'].max() search_lasso.cv_results_[
0.9829972773711079
AUC with lambda selected through 1 SE rule:
'mean_test_score'][search_lasso.best_index_] search_lasso.cv_results_[
0.9813334155499589
Number of non-zero coefficients in this model:
= (pd.Series(search_lasso.best_estimator_.named_steps.clf.coef_.squeeze(),
coefs_final = X_nonIMID.columns)[lambda x: x!=0].sort_values())
index len(coefs_final)
15
Figure 2 (bottom panel)
sns.reset_defaults()with sns.plotting_context("paper"):
= plt.subplots(figsize=(4.5, 4.5))
fig, ax = calc_roc_cv(search_lasso.best_estimator_,cv,Xp1,y)
tprs, aucs = plot_roc_cv(tprs, aucs, fig, ax, fig_title='SLE vs. BBD', line_color='#CC79A7', legend_label='LASSO')
fig, ax = calc_roc_cv(lr_dsDNA,cv,dsDNA,y)
tprs, aucs = plot_roc_cv(tprs, aucs, fig, ax, reuse=True, line_color='gray', legend_label='dsDNA only')
fig, ax =fig,ax=ax, trim=True)
sns.despine(fig=False)
plt.legend(frameon plt.show()
Supplemental Figure 1b
sns.reset_defaults()= sns.diverging_palette(250, 15, s=75, l=40, center="dark", as_cmap=True)
pal with sns.plotting_context("paper"):
with sns.axes_style("whitegrid"):
= plt.subplots(figsize=(2.5, 2.5))
fig, ax = sns.scatterplot(y=coefs_final.index, x=coefs_final.values,
g =coefs_final.values, palette=pal, legend=False)
hueset(title='SLE vs. BBD',
ax.=np.linspace(-1.0,1.0,5), xlim=[-1.2,1.2], xlabel=r'$\beta$')
xticks=fig,ax=ax, left=True, bottom=True) sns.despine(fig
Prediction of the development of SLE
ROC AUC of SLE vs. BBD model
= prep_data(X_test_df, 'preSLE', 'rest_large', ["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"])
X_test, y_test +1)[:,1]) roc_auc_score(y_test, search_lasso.predict_proba(X_test
0.5679908326967151
Figure 3
= prep_data(X_test_df, 'preSLE', 'rest_large', ["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"])
X_test, y_test # ROC curve
=0.5
threshold1=0.84
threshold2= roc_curve(y_test, lasso_nonIMID.predict_proba(X_test+1)[:,1])
fpr, tpr, thresholds = (np.abs(thresholds - threshold1)).argmin() # find index of value closest to chosen threshold
thr_idx1 = (np.abs(thresholds - threshold2)).argmin() # find index of value closest to chosen threshold
thr_idx2
sns.reset_defaults()with sns.plotting_context("paper"):
= plt.subplots(figsize=(4.5, 4.5))
fig, ax 0, 1], [0, 1], linestyle='--', lw=1.5, color='k', alpha=.25)
ax.plot([set(title="Pre-SLE vs. Rest",
ax.=[-0.05, 1.05], xlabel='False positive rate',
xlim=[-0.05, 1.05], ylabel='True positive rate')
ylim= ax.get_ylim(); xmin, xmax = ax.get_xlim()
ymin, ymax =fpr[thr_idx1], ymin=ymin, ymax=tpr[thr_idx1], color='k', alpha=.6, linestyle='--', axes=ax) # plot line for fpr at threshold
plt.vlines(x=tpr[thr_idx1], xmin=xmin, xmax=fpr[thr_idx1], color='k', alpha=.6, linestyle='--', axes=ax) # plot line for tpr at threshold
plt.hlines(y=fpr[thr_idx2], ymin=ymin, ymax=tpr[thr_idx2], color='k', alpha=.6, linestyle='--', axes=ax) # plot line for fpr at threshold
plt.vlines(x=tpr[thr_idx2], xmin=xmin, xmax=fpr[thr_idx2], color='k', alpha=.6, linestyle='--', axes=ax) # plot line for tpr at threshold
plt.hlines(y+1, y_test, name = "LASSO", ax=ax, color='#D55E00', lw=2, alpha=.8)
plot_roc_curve(lasso_nonIMID, X_test0.56,0.81, '1', fontsize='large')
plt.text(0.08,0.45, '2', fontsize='large')
plt.text(=fig,ax=ax, trim=True) sns.despine(fig
Classification
= prep_data(X_test_df, 'preSLE', 'rest_large', ["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"])
X_test, y_test +1, y_test, 'preSLE', 'rest_large') eval_model(lasso_nonIMID, X_test
Threshold for classification: 0.5
precision recall f1-score support
rest_large 0.99 0.51 0.67 462
preSLE 0.06 0.88 0.12 17
accuracy 0.52 479
macro avg 0.53 0.70 0.39 479
weighted avg 0.96 0.52 0.65 479
N.B.: "recall" = sensitivity for the group in this row (e.g. preSLE); specificity for the other group (rest_large)
N.B.: "precision" = PPV for the group in this row (e.g. preSLE); NPV for the other group (rest_large)
= prep_data(X_test_df, 'preSLE', 'rest_large', ["Arthritis","Pleurisy","Pericarditis","Nefritis"] + ["dsDNA1"])
X_test, y_test +1, y_test, 'preSLE', 'rest_large', threshold=0.84) eval_model(lasso_nonIMID, X_test
Threshold for classification: 0.84
precision recall f1-score support
rest_large 0.98 0.94 0.96 462
preSLE 0.25 0.53 0.34 17
accuracy 0.93 479
macro avg 0.62 0.74 0.65 479
weighted avg 0.96 0.93 0.94 479
N.B.: "recall" = sensitivity for the group in this row (e.g. preSLE); specificity for the other group (rest_large)
N.B.: "precision" = PPV for the group in this row (e.g. preSLE); NPV for the other group (rest_large)