import os
from time import time
import feather
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
%matplotlib inline
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.datasets import make_circles, load_digits
from sklearn.preprocessing import StandardScaler, PowerTransformer
import umap
import umap.plot
from sle.modeling import generate_data
%load_ext autoreload
%autoreload 2
Dimensionality reduction / projections
Setup
= generate_data('imid')
data_all = generate_data('rest') X_test_df
Code for loading original data
= os.path.join('..', 'data', 'processed')
data_dir = feather.read_dataframe(os.path.join(data_dir, 'imid.feather'))
data_all = feather.read_dataframe(os.path.join(data_dir,'rest.feather')) X_test_df
TSNE
Toy data
From https://scikit-learn.org/stable/auto_examples/manifold/plot_t_sne_perplexity.html#sphx-glr-auto-examples-manifold-plot-t-sne-perplexity-py
= 300
n_samples = 2
n_components = [5, 30, 50, 100] perplexities
= make_circles(n_samples=n_samples, factor=.5, noise=.05)
X, y
= y == 0
red = y == 1 green
= plt.subplots(1, 5, figsize=(25, 5))
(fig, subplots) = subplots[0]
ax 0], X[red, 1], c="r")
ax.scatter(X[red, 0], X[green, 1], c="g")
ax.scatter(X[green,
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())"Original data")
ax.set_title('tight')
plt.axis(
for i, perplexity in enumerate(perplexities):
= subplots[i + 1]
ax
= time()
t0 = TSNE(n_components=n_components, init='random',
tsne =0, perplexity=perplexity)
random_state= tsne.fit_transform(X)
Y = time()
t1 print("circles, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
"Perplexity=%d" % perplexity)
ax.set_title(0], Y[red, 1], c="r")
ax.scatter(Y[red, 0], Y[green, 1], c="g")
ax.scatter(Y[green,
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())'tight') ax.axis(
circles, perplexity=5 in 0.88 sec
circles, perplexity=30 in 1.1 sec
circles, perplexity=50 in 0.95 sec
circles, perplexity=100 in 1.3 sec
Real data
data_all.head()
Actinin | ASCA | Beta2GP1 | C1q | C3b | Cardiolipin | CCP1arg | CCP1cit | CENP | CMV | ... | SMP | TIF1gamma | TPO | tTG | Arthritis | Pleurisy | Pericarditis | Nefritis | dsDNA1 | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 94.8911 | 1117.530 | 1328.0800 | 88.0225 | 115.4250 | 55.1872 | 42.1010 | 40.3490 | 65.5990 | 1177.9000 | ... | 157.0750 | 283.8950 | 1011.080 | 170.611 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | SLE |
1 | 99.9188 | 1295.260 | 119.1230 | 133.0480 | 59.4884 | 39.9630 | 39.0714 | 39.0714 | 35.5006 | 1023.6600 | ... | 114.7650 | 84.1488 | 1111.850 | 146.075 | 0.0 | 0.0 | 0.0 | 1.0 | 63.0 | SLE |
2 | 121.3530 | 2636.220 | 38.4903 | 85.9066 | 117.8180 | 38.4903 | 42.0952 | 40.2934 | 53.7753 | 76.1163 | ... | 113.3960 | 154.8510 | 109.857 | 128.418 | 1.0 | 0.0 | 1.0 | 0.0 | 2.6 | SLE |
3 | 145.0990 | 995.634 | 509.1220 | 171.8770 | 179.0070 | 60.6069 | 67.8459 | 52.4473 | 203.9280 | 8717.1000 | ... | 148.6730 | 4777.2800 | 765.190 | 211.928 | 1.0 | 0.0 | 0.0 | 1.0 | 1.6 | SLE |
4 | 66.0117 | 994.225 | 40.8654 | 184.3840 | 85.9921 | 44.1397 | 42.5038 | 43.3220 | 33.4571 | 3849.9300 | ... | 66.8153 | 103.4140 | 716.172 | 237.993 | 1.0 | 0.0 | 0.0 | 1.0 | 22.0 | SLE |
5 rows × 63 columns
data_all.shape
(1408, 63)
'Class'].value_counts() data_all[
SLE 483
BBD 361
IMID 346
nonIMID 218
Name: Class, dtype: int64
= ['Arthritis', 'Pleurisy', 'Pericarditis', 'Nefritis']
symptoms for col in symptoms: # convert to boolean, so 0 or NaN become false
= data_all[col].fillna(False).astype('bool') data_all[col]
'Symptoms'] = 'more_than_one' # new column for symptoms with default value for patients with more than one symptom
data_all['Arthritis'] & ~data_all[['Pleurisy','Pericarditis','Nefritis']].any(1),'Symptoms'] = 'arthritis_only' # patients with only arthritis
data_all.loc[data_all['Nefritis'] & ~data_all[['Pleurisy','Pericarditis','Arthritis']].any(1),'Symptoms'] = 'nefritis_only' # patients with only nefritis
data_all.loc[data_all['Pleurisy','Pericarditis']].any(1) & ~data_all[['Nefritis','Arthritis']].any(1),'Symptoms'] = 'pleurisy_or_pericarditis_only' # patients with either pleuritis or pericardits and nothing else
data_all.loc[data_all[[~data_all[symptoms].any(1),'Symptoms'] = 'none' # patients with no symptoms data_all.loc[
data_all.Symptoms.value_counts()
none 1040
nefritis_only 108
more_than_one 102
arthritis_only 101
pleurisy_or_pericarditis_only 57
Name: Symptoms, dtype: int64
= data_all.copy()
df = data_all["Class"]
y_class = data_all['Symptoms']
y_symptom = np.array(data_all.loc[:, ~data_all.columns.isin(symptoms + ['Class'] + ['Symptoms'] + ['dsDNA1'])]) X
df_symps = (sle .groupby(symptoms) .size() #.reset_index(name=‘counts’) #.sort_values(by=‘counts’,ascending=False) ) df_symps
Default parameters
= TSNE(random_state=0, verbose=1)
tsne = tsne.fit_transform(X) Y_tsne
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1408 samples in 0.014s...
[t-SNE] Computed neighbors for 1408 samples in 0.230s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1408
[t-SNE] Computed conditional probabilities for sample 1408 / 1408
[t-SNE] Mean sigma: 1413.796264
[t-SNE] KL divergence after 250 iterations with early exaggeration: 68.268234
[t-SNE] KL divergence after 1000 iterations: 1.017738
# add tsne results back to pandas dataframe
'tsne_1'] = Y_tsne[:,0]
df['tsne_2'] = Y_tsne[:,1] df[
=(7,7))
plt.figure(figsize
sns.scatterplot(="tsne_1", y="tsne_2",
x="Class",
hue=df,
data=0.5)
alpha'off') plt.axis(
(-52.50039465973089, 40.55504019806097, -62.39752480691256, 49.14921090310397)
- The blood bank controls (
BBD
) are the most prominent cluster - SLE patients also seem to cluster together at the bottom
- IMIDs seem a bit closer to SLE than the nonIMIDs are
- There could be a 2nd cluster smaller that is a mix of all 4 groups. Check if we also see this with different perplexities/UMAP
Different perplexities
= np.linspace(5,50,10) perplexities
= y_class.values == "SLE"
SLE = y_class.values == "IMID"
IMID = y_class.values == "nonIMID"
nonIMID = y_class.values == "BBD"
BBD
= plt.subplots(2, round(len(perplexities)/2), figsize=(25, 10))
(fig, subplots) = 0
i = 0
j for perplexity in perplexities:
if i == round(len(perplexities)/2):
= 0
i = 1
j = subplots[j][i]
ax
= time()
t0 = TSNE(random_state=0, perplexity=perplexity)
tsne = tsne.fit_transform(X)
Y_tsne = time()
t1 print("perplexity=%d in %.2g sec" % (perplexity, t1 - t0))
"Perplexity=%d" % perplexity)
ax.set_title(0], Y_tsne[SLE, 1], c="r")
ax.scatter(Y_tsne[SLE, 0], Y_tsne[IMID, 1], c="g")
ax.scatter(Y_tsne[IMID, 0], Y_tsne[nonIMID, 1], c="b")
ax.scatter(Y_tsne[nonIMID, 0], Y_tsne[BBD, 1], c="c")
ax.scatter(Y_tsne[BBD,
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())'tight')
ax.axis(+= 1
i
'SLE', 'IMID', 'non-IMID', 'Blood bank']) fig.legend([
perplexity=5 in 3.2 sec
perplexity=10 in 3.4 sec
perplexity=15 in 3.8 sec
perplexity=20 in 3.9 sec
perplexity=25 in 4.2 sec
perplexity=30 in 4.5 sec
perplexity=35 in 4.6 sec
perplexity=40 in 4.8 sec
perplexity=45 in 4.7 sec
perplexity=50 in 5.1 sec
<matplotlib.legend.Legend at 0x7fee314a8950>
Different perplexities don’t seem to change the overall picture too much
UMAP
Toy data
Default parameters
= load_digits() digits
= plt.subplots(20, 20)
fig, ax_array = ax_array.flatten()
axes for i, ax in enumerate(axes):
='gray_r')
ax.imshow(digits.images[i], cmap=[], yticks=[], frame_on=False)
plt.setp(axes, xticks=0.5, w_pad=0.01) plt.tight_layout(h_pad
= umap.UMAP(random_state=42) reducer
= reducer.fit_transform(digits.data)
embedding embedding.shape
(1797, 2)
0], embedding[:, 1], c=digits.target, cmap='Spectral', s=5)
plt.scatter(embedding[:, 'equal', 'datalim')
plt.gca().set_aspect(=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.colorbar(boundaries'UMAP projection of the Digits dataset', fontsize=24); plt.title(
Play with parameters
42)
np.random.seed(= np.random.rand(800, 4) # sample from 4 dimensions data
= umap.UMAP()
fit %time u = fit.fit_transform(data)
CPU times: user 6.46 s, sys: 522 ms, total: 6.98 s
Wall time: 2.91 s
0], u[:,1], c=data) # plot 4D data as: R,G,B,alpha
plt.scatter(u[:,'UMAP embedding of random colours'); plt.title(
def draw_umap(n_neighbors=15, min_dist=0.1, n_components=2, metric='euclidean', title=''):
= umap.UMAP(
fit =n_neighbors,
n_neighbors=min_dist,
min_dist=n_components,
n_components=metric
metric
)= fit.fit_transform(data);
u = plt.figure()
fig if n_components == 1:
= fig.add_subplot(111)
ax 0], range(len(u)), c=data)
ax.scatter(u[:,if n_components == 2:
= fig.add_subplot(111)
ax 0], u[:,1], c=data)
ax.scatter(u[:,if n_components == 3:
= fig.add_subplot(111, projection='3d')
ax 0], u[:,1], u[:,2], c=data, s=100)
ax.scatter(u[:,=18) plt.title(title, fontsize
n_neighbors
for n in (2, 5, 10, 20, 50, 100, 200):
=n, title='n_neighbors = {}'.format(n)) draw_umap(n_neighbors
min_dist
for d in (0.0, 0.1, 0.25, 0.5, 0.8, 0.99):
=d, title='min_dist = {}'.format(d)) draw_umap(min_dist
Real data
= StandardScaler()
sc = PowerTransformer(method='box-cox') trf
= pd.concat([data_all, rest[rest.Class.isin(['preSLE', 'rest_large'])]])
df_combined 'Class'] = pd.Categorical(df_combined.Class, categories = ['SLE','IMID','nonIMID','rest_large','BBD','preSLE']) # order for plotting: largest groups first
df_combined[='Class', inplace=True)
df_combined.sort_values(by= np.array(df_combined.loc[:, ~df_combined.columns.isin(symptoms + ['Class'] + ['Symptoms'] + ['dsDNA1'])])
X_combined = trf.fit_transform(X_combined+1) X_combined_scaled
Default parameters
= umap.UMAP(random_state=42)
umap_obj = umap_obj.fit_transform(X) Y_umap
= umap_obj.fit_transform(X_combined_scaled) Y_umap_combined
# add umap results back to pandas dataframes
'umap_1'] = Y_umap[:,0]
df['umap_2'] = Y_umap[:,1] df[
# add umap results back to pandas dataframes
'umap_1'] = Y_umap_combined[:,0]
df_combined['umap_2'] = Y_umap_combined[:,1] df_combined[
df_combined.Class.value_counts()
SLE 483
rest_large 462
BBD 361
IMID 346
nonIMID 218
preSLE 17
Name: Class, dtype: int64
with sns.plotting_context("paper"):
set(font="Arial")
sns.'white')
sns.set_style(= plt.subplots(figsize=(5.5, 5.5))
f, ax = sns.scatterplot(x='umap_1',y='umap_2',
g ='Class', style='Class', size='Class',
hue= {"SLE": "o", "rest_large": "o", "BBD": "o", "IMID": "o", "nonIMID": "o", "preSLE": "o"},
markers = {"SLE": 20, "rest_large": 20, "BBD": 20, "IMID": 20, "nonIMID": 20, "preSLE": 20},
sizes ={"SLE": '#0072B2', "rest_large": '#D55E00', "BBD": '#CC79A7', "IMID": '#009E73', "nonIMID": '#E69F00', "preSLE": '#000000'},
palette=.7,
alpha=df_combined,
data=ax)
ax= ax.legend(frameon=False, loc = 'upper left', bbox_to_anchor=(0,1.05)) # remove legend box; push legend up from upper left
legend 'Dimension 1'); ax.set_ylabel('Dimension 2');
ax.set_xlabel(-6,0)
ax.set_ylim(= ['','SLE','IMID','Non-IMID','Rest','BBD','Pre-SLE']; # remove legend title and change group spelling
new_labels for t, l in zip(legend.texts, new_labels): t.set_text(l)
=f,ax=ax)
sns.despine(fig#f.savefig('umap.png', bbox_inches='tight', dpi=300, transparent=True)
#sf.savefig('umap.pdf', bbox_inches='tight', transparent=True)
def umap_vs_tsne(df,label):
=(14,7))
plt.figure(figsize
= plt.subplot(1, 2, 1)
ax1
sns.scatterplot(="tsne_1", y="tsne_2",
x=label,
hue=df,
data=0.5,
alpha=False,
legend=ax1)
ax"t-SNE (default)")
ax1.set_title('off')
plt.axis(= plt.subplot(1, 2, 2)
ax2
sns.scatterplot(="umap_1", y="umap_2",
x=label,
hue=df,
data=0.5,
alpha=ax2)
ax"UMAP (default)")
ax2.set_title('off') plt.axis(
'Class') umap_vs_tsne(df,
Looks fairly similar to t-SNE results. Clustering of blood bank controls is even clearer, while perhaps SLE is less clear
# filter patients with no symptoms
'Symptoms'].isin(['nefritis_only','arthritis_only','pleurisy_or_pericarditis_only', 'nefritis'])],'Symptoms') umap_vs_tsne(df[df[
Interactive (with all points)
from bokeh.plotting import show, save, output_notebook, output_file
output_notebook()
= df[['Class','Symptoms']].reset_index() hover_data
= umap_obj.fit(X) mapper
= umap.plot.interactive(mapper, labels=y_class, hover_data=hover_data, point_size=4, theme='fire')
p show(p)
= umap.plot.interactive(mapper, labels=y_symptom, hover_data=hover_data, point_size=4, theme='fire')
p show(p)
Different parameters
= (2, 5, 10, 20, 50, 100, 200)
n_neighborss = (0.0, 0.1, 0.25, 0.5, 0.8, 0.99) min_dists
= plt.subplots(len(min_dists), len(n_neighborss), figsize=(30,30))
(fig, subplots)
for i, n_neighbors in enumerate(n_neighborss):
for j, min_dist in enumerate(min_dists):
= subplots[j][i]
ax = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist).fit_transform(X);
u = pd.DataFrame({'umap_1': u[:,0], 'umap_2': u[:,1], 'Class': y_class.values})
df_params ='umap_1', y='umap_2', hue='Class', data=df_params, alpha=0.5, ax=ax, legend=False)
sns.scatterplot(x'n_neighbors = {}, min_dist = {}'.format(n_neighbors, min_dist))
ax.set_title('off') ax.axis(
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
/home/lcreteig/miniconda3/envs/SLE/lib/python3.7/site-packages/sklearn/manifold/_spectral_embedding.py:236: UserWarning: Graph is not fully connected, spectral embedding may not work as expected.
warnings.warn("Graph is not fully connected, spectral embedding"
Overall pattern remains the same. Could consider slightly higher than default parameters (for both) if we want a less “pinched” shape