import pandas as pd
import feather
import os
Preprocessing
This notebook contains code for the following project:
Brunekreef TE, Reteig LC, Limper M, Haitjema S, Dias J, Mathsson-Alm L, van Laar JM, Otten HG. Microarray analysis of autoantibodies can identify future Systemic Lupus Erythematosus patients. Human Immunology. 2022 Apr 11. doi:10.1016/j.humimm.2022.03.010
Load data
# Directories for the raw inputs, the processed outputs and the reference tables.
read_dir = os.path.join("..", "data", "interim")
write_dir = os.path.join("..", "data", "processed")
ref_dir = os.path.join("..", "references")

# Load only the sample numbers from this file, as something's wrong with
# its formatting.
other_imid_smps = pd.read_spss(
    os.path.join(read_dir, "OtherIMID.sav"),
    usecols=["samplenr"],
)

# SLE patients (also has non- and other-IMID data); indexed by sample number.
all_first = pd.read_spss(
    os.path.join(read_dir, "Alles 1e sample.sav")
).set_index("samplenr")

# Non-IMID control set; indexed by sample number as well.
non_imid = pd.read_spss(
    os.path.join(read_dir, "Non-Imid control set.sav")
).set_index("samplenr")

# Blood bank controls (also has data from SLE patients).
tmo = pd.read_csv(os.path.join(read_dir, "TMO.csv"))

# Sample numbers of the rest group.
rest_smps = pd.read_excel(
    os.path.join(read_dir, "Restgroep voor vergelijking.xlsx"),
    engine="openpyxl",
    usecols=["samplenr"],
)

# Translation table of the columns in TMO.csv vs. the .sav files.
df_cols = pd.read_csv(os.path.join(ref_dir, "chip_colnames.csv"), sep=";")

# The all_first and tmo datasets contain data from more than one group
# Keep only the rows of each mixed data set that belong to a single group.
sle = all_first[all_first["SLE"] == 1]  # SLE patients only
blood_bank = tmo[tmo["Class"] == "nonSLE"]  # blood bank controls only
# fix other-imids
# Rebuild the other-IMID sample numbers: each value is converted to a number,
# formatted zero-padded to a width of 4, and suffixed with "A".
smps_fixed = [
    f"{number:04}A"
    for number in pd.to_numeric(other_imid_smps["samplenr"], downcast="unsigned")
]
# Look the other-IMID samples up in the full data set.
other_imid = all_first.loc[smps_fixed]
# samples from patients who had no diagnosis at the time, but were later diagnosed with SLE
# Sample numbers of the pre-SLE samples (listed in their original order).
pre_smps = [
    '0039A', '0159A', '0222A', '0228A', '0575A', '0633A',
    '1080A', '1117A', '1158A', '1160A', '1166A', '1193A',
    '1223A', '1305A', '1451A', '1981A', '0972A',
]
pre_sle = all_first.loc[pre_smps]
# samples from patients who had no diagnosis at the time, or later, and are not in the non-imid group
# Rest group: the listed rest-group sample numbers, minus any sample that is
# already part of the non-IMID control set.
rest_set = set(rest_smps["samplenr"]) - set(non_imid.index)
# .loc must be given a list: a set indexer was deprecated in pandas 1.5 and
# raises TypeError from pandas 2.0 on. Sorting additionally makes the row order
# reproducible, since the iteration order of a set of strings can differ
# between interpreter runs.
rest = all_first.loc[sorted(rest_set)]
# we can also define the rest group less strictly, as "all the remaining samples":
# Less strict rest group: every first sample that is left after removing all
# the groups below.
rest_set_large = (
    set(all_first.index)  # all patient data from their first samples
    - set(non_imid.index)  # take out the non-IMIDs
    - set(other_imid.index)  # take out the other-IMIDs
    - set(sle.index)  # take out the SLE patients
    # take out those patients that weren't run on the chip (dsDNA2 is empty)
    - set(all_first.index[all_first.dsDNA2.isna()])
    # take out the pre-SLE samples, because we want to compare them to this group
    - set(pre_sle.index)
)
# .loc must be given a list: a set indexer was deprecated in pandas 1.5 and
# raises TypeError from pandas 2.0 on. Sorting additionally makes the row order
# reproducible, since the iteration order of a set of strings can differ
# between interpreter runs.
rest_large = all_first.loc[sorted(rest_set_large)]
# samples from patients with lupus-like disease. N.B this includes 2 pre-sle patients
# Lupus-like disease samples: the other-IMID rows whose LLD column equals 1.
lld = other_imid[other_imid["LLD"] == 1]

# Number of first samples that remain when the pre-SLE samples are NOT removed.
len(
    set(all_first.index)  # all patient data from their first samples
    - set(non_imid.index)  # take out the non-IMIDs
    - set(other_imid.index)  # take out the other-IMIDs
    - set(sle.index)  # take out the SLE patients
    - set(all_first.index[all_first.dsDNA2.isna()])  # not run on the chip
)
# Output: 472
Process
The chip columns are called differently in the blood_bank dataset than in the others.
df_cols
| | TF | TB_all | TB_selection |
|---|---|---|---|
| 0 | Actinin | Actinin | Actinin |
| 1 | anti-IgE | antiIgE | NaN |
| 2 | ASCA | ASCA | ASCA |
| 3 | Beta2GP1 | Beta2GP1 | Beta2GP1 |
| 4 | C1q | C1q | C1q |
| ... | ... | ... | ... |
| 96 | NaN | Strep15 | NaN |
| 97 | NaN | Strep16 | NaN |
| 98 | TIF1gamma | TIF1gamma | TIF1gamma |
| 99 | TPO | TPO | TPO |
| 100 | tTG | tTG | tTG |
101 rows × 3 columns
- `TF` are the names in `blood_bank`
- `TB_all` are the names in the other dfs
- `TB_selection` are names of the variables that should be most interesting (e.g. excluding control spots on the chip).
Each row corresponds to the same variable, but it might have a different name in each column!
Rename the columns in blood_bank as in the other data sets:
# New names for the blood bank columns: the TB_all entry of every row of the
# translation table that has a TF (blood bank) name.
new_colnames = df_cols.loc[df_cols["TF"].notna(), "TB_all"].tolist()

# The Class column is in blood_bank but not in the list of names, so it is
# dropped before renaming (it is added back later). The remaining columns are
# then renamed positionally, as in the other data sets.
blood_bank = (
    blood_bank
    .drop(columns="Class")
    .set_axis(new_colnames, axis="columns")
)
# We want only the rows that have an entry in all three columns: these are the variables we want to use
# Names of the variables that exist in both data sets and that are of
# interest: rows of the translation table without any missing entry.
keep_cols = df_cols.dropna()["TB_all"].tolist()
keep_cols
['Actinin',
'ASCA',
'Beta2GP1',
'C1q',
'C3b',
'Cardiolipin',
'CCP1arg',
'CCP1cit',
'CENP',
'CMV',
'CollagenII',
'CpGmot',
'CRP1',
'DFS70',
'dsDNA2',
'Enolasearg',
'Enolasecit',
'EphB2',
'FcER',
'Fibrillarin',
'Ficolin',
'GAPDH',
'GBM',
'H2Bp',
'H2Bpac',
'H4p',
'H4pac',
'Histones',
'IFNLambda',
'IFNOmega',
'Jo1',
'Ku',
'LaSSB',
'MBL2',
'Mi2',
'Nucleosome',
'PCNA',
'Pentraxin3',
'PmScl100',
'RA33',
'RipP0',
'RipP0peptide',
'RipP1',
'RipP2',
'RNAPolIII',
'RNP70',
'RNPA',
'RNPC',
'Ro52',
'Ro60',
'RPP25ThTo',
'Scl70',
'SmBB',
'SMP',
'TIF1gamma',
'TPO',
'tTG']
Aside from diagnosis, we’re also interested in discriminating between patients depending on the presence of these 4 symptoms:
# Symptom columns used, aside from diagnosis, to discriminate between patients.
symptoms = ['Arthritis', 'Pleurisy', 'Pericarditis', 'Nefritis']

# Likewise, we also want to compare anti-dsDNA measured on the microchip array
# (dsDNA2) to the standard measurements taken in the clinic (dsDNA1)
lab = ['dsDNA1']

# In all datasets, keep only columns of interest
# We don't have symptoms info or lab dsDNA for the blood bank group, so it
# keeps the chip variables only.
blood_bank = blood_bank.loc[:, keep_cols]

# All other groups keep the chip variables plus the symptom and lab columns.
_clinical_cols = keep_cols + symptoms + lab
other_imid = other_imid.loc[:, _clinical_cols]
non_imid = non_imid.loc[:, _clinical_cols]
sle = sle.loc[:, _clinical_cols]
pre_sle = pre_sle.loc[:, _clinical_cols]
rest = rest.loc[:, _clinical_cols]
rest_large = rest_large.loc[:, _clinical_cols]
lld = lld.loc[:, _clinical_cols]

# Discard one SLE patient with missing data: serum from one SLE patient was
# not run on the chip.
sle = sle.dropna(subset=keep_cols)

# And row-bind all the data frames together
# Give every group a Class label so the groups remain distinguishable once
# their rows are stacked into one frame.
blood_bank = blood_bank.assign(Class="BBD")
other_imid = other_imid.assign(Class="IMID")
non_imid = non_imid.assign(Class="nonIMID")
sle = sle.assign(Class="SLE")

# Join all data frames together by binding rows.
df_all = pd.concat([sle, other_imid, non_imid, blood_bank])

# Same labelling and row-binding for the evaluation groups.
rest = rest.assign(Class="rest")
rest_large = rest_large.assign(Class="rest_large")
pre_sle = pre_sle.assign(Class="preSLE")
lld = lld.assign(Class="LLD")
df_eval = pd.concat([pre_sle, rest, rest_large, lld])

df_all['Class'].value_counts()
# Output:
# SLE 483
# BBD 361
# IMID 346
# nonIMID 218
# Name: Class, dtype: int64

df_eval['Class'].value_counts()
# Output:
# rest_large 462
# rest 415
# LLD 28
# preSLE 17
# Name: Class, dtype: int64
Write data
# Write both combined frames to the processed-data directory as feather files.
for frame, filename in ((df_all, "imid.feather"), (df_eval, "rest.feather")):
    feather.write_dataframe(frame, os.path.join(write_dir, filename))