import pandas as pd
import feather
import os
Preprocessing
Load data
# Directory layout, relative to the notebook's working directory
read_dir = os.path.join("..", "data", "interim")      # input files are read from here
write_dir = os.path.join("..", "data", "processed")   # processed output is written here
ref_dir = os.path.join("..", "references")            # reference / translation tables

# load only sample numbers from this file, as something's wrong with the formatting
other_imid_smps = pd.read_spss(os.path.join(read_dir, "OtherIMID.sav"), usecols=['samplenr'])
# SLE patients (also has non- and other-IMID data)
all_first = pd.read_spss(os.path.join(read_dir, "Alles 1e sample.sav"))
non_imid = pd.read_spss(os.path.join(read_dir, "Non-Imid control set.sav"))
# blood bank controls (also has data from SLE patients)
tmo = pd.read_csv(os.path.join(read_dir, "TMO.csv"))
# contains sample numbers from rest group
rest_smps = pd.read_excel(
    os.path.join(read_dir, "Restgroep voor vergelijking.xlsx"),
    engine='openpyxl',
    usecols=['samplenr'],
)

# load translation table of columns in TMO.csv vs. the .sav files
df_cols = pd.read_csv(os.path.join(ref_dir, "chip_colnames.csv"), sep=";")

# Index by sample number so that groups can be selected with .loc[<sample numbers>]
all_first = all_first.set_index('samplenr')
non_imid = non_imid.set_index('samplenr')
The `all_first` and `tmo` datasets contain data from more than one group:
sle = all_first[all_first.SLE == 1]  # subset with only SLE patients
blood_bank = tmo[tmo.Class == "nonSLE"]  # subset with only blood bank controls

# fix other-imids
# Rebuild the sample numbers as zero-padded 4-digit strings with an 'A' suffix,
# the same format as the pre-SLE sample IDs listed below (e.g. '0039A').
smps_fixed = [f'{_:04}A' for _ in pd.to_numeric(other_imid_smps['samplenr'], downcast='unsigned')]  # fix other_imid sample numbers
other_imid = all_first.loc[smps_fixed]  # get other-IMIDs from the full dataset

# samples from patients who had no diagnosis at the time, but were later diagnosed with SLE
pre_smps = ['0039A', '0159A', '0222A', '0228A', '0575A', '0633A', '1080A', '1117A', '1158A',
            '1160A', '1166A', '1193A', '1223A', '1305A', '1451A', '1981A', '0972A']
pre_sle = all_first.loc[pre_smps]

# samples from patients who had no diagnosis at the time, or later, and are not in the non-imid group
rest_set = set(rest_smps['samplenr']) - set(non_imid.index)
# NOTE: .loc is given a list rather than the set itself; pandas >= 2.0 raises
# TypeError when a set is used as an indexer.
rest = all_first.loc[list(rest_set)]

# we can also define the rest group less strictly, as "all the remaining samples":
rest_set_large = (set(all_first.index) -  # set with all patient data from their first samples
                  set(non_imid.index) -  # take out the non-IMIDS
                  set(other_imid.index) -  # take out the other-IMIDS
                  set(sle.index) -  # take out the SLE patients
                  set(all_first.index[all_first.dsDNA2.isna()]) -  # take out those patients that weren't run on the chip (dsDNA2 column is empty)
                  set(pre_sle.index))  # take out the pre-SLE samples, because we want to compare them to this group
rest_large = all_first.loc[list(rest_set_large)]  # list, not set: see note above

# samples from patients with lupus-like disease. N.B this includes 2 pre-sle patients
lld = other_imid[other_imid.LLD == 1]

# Sanity check: size of the "remaining samples" set *before* removing the pre-SLE samples
len(set(all_first.index) -  # set with all patient data from their first samples
    set(non_imid.index) -  # take out the non-IMIDS
    set(other_imid.index) -  # take out the other-IMIDS
    set(sle.index) -  # take out the SLE patients
    set(all_first.index[all_first.dsDNA2.isna()]))
472
Process
The chip columns are named differently in the `blood_bank` dataset than in the others.
df_cols
 | TF | TB_all | TB_selection |
---|---|---|---|
0 | Actinin | Actinin | Actinin |
1 | anti-IgE | antiIgE | NaN |
2 | ASCA | ASCA | ASCA |
3 | Beta2GP1 | Beta2GP1 | Beta2GP1 |
4 | C1q | C1q | C1q |
... | ... | ... | ... |
96 | NaN | Strep15 | NaN |
97 | NaN | Strep16 | NaN |
98 | TIF1gamma | TIF1gamma | TIF1gamma |
99 | TPO | TPO | TPO |
100 | tTG | tTG | tTG |
101 rows × 3 columns
- `TF` are the names in `blood_bank`
- `TB_all` are the names in the other data frames
- `TB_selection` are the names of the variables that should be most interesting (e.g. excluding control spots on the chip).
Each row corresponds to the same variable, but it might have a different name in each column!
Rename the columns in `blood_bank` as in the other data sets:
# For every chip variable that has a name in the blood bank file (TF not null),
# look up the corresponding name used in the other datasets (TB_all).
new_colnames = df_cols.TB_all[df_cols.TF.notnull()].tolist()  # list of new names for blood bank columns
blood_bank = blood_bank.drop(columns='Class')  # this column is in blood_bank, but not in the list (we'll add it back later)
# Positional rename: assumes the remaining blood_bank columns are in the same
# order as the rows of df_cols -- TODO confirm against chip_colnames.csv
blood_bank.columns = new_colnames  # rename columns as in other datasets
We want only the rows that have an entry in all three columns: these are the variables we want to use
# dropna() keeps only the rows of the translation table with an entry in every column
keep_cols = df_cols.dropna().TB_all.tolist()  # names of variables that exist in both datasets, and that are of interest
keep_cols  # display the selection
['Actinin',
'ASCA',
'Beta2GP1',
'C1q',
'C3b',
'Cardiolipin',
'CCP1arg',
'CCP1cit',
'CENP',
'CMV',
'CollagenII',
'CpGmot',
'CRP1',
'DFS70',
'dsDNA2',
'Enolasearg',
'Enolasecit',
'EphB2',
'FcER',
'Fibrillarin',
'Ficolin',
'GAPDH',
'GBM',
'H2Bp',
'H2Bpac',
'H4p',
'H4pac',
'Histones',
'IFNLambda',
'IFNOmega',
'Jo1',
'Ku',
'LaSSB',
'MBL2',
'Mi2',
'Nucleosome',
'PCNA',
'Pentraxin3',
'PmScl100',
'RA33',
'RipP0',
'RipP0peptide',
'RipP1',
'RipP2',
'RNAPolIII',
'RNP70',
'RNPA',
'RNPC',
'Ro52',
'Ro60',
'RPP25ThTo',
'Scl70',
'SmBB',
'SMP',
'TIF1gamma',
'TPO',
'tTG']
Aside from diagnosis, we’re also interested in discriminating between patients depending on the presence of these 4 symptoms:
# Symptom columns kept alongside the chip variables (names as spelled in the source data)
symptoms = ['Arthritis', 'Pleurisy', 'Pericarditis', 'Nefritis']
Likewise, we also want to compare anti-dsDNA measured on the microchip array (`dsDNA2`) to the standard measurements taken in the clinic (`dsDNA1`):
# Clinic-measured anti-dsDNA column, kept for comparison with the chip's dsDNA2
lab = ['dsDNA1']
In all datasets, keep only columns of interest
# Training groups
blood_bank = blood_bank.loc[:, keep_cols]  # we don't have symptoms info or lab dsDNA for this group
other_imid = other_imid.loc[:, keep_cols + symptoms + lab]
non_imid = non_imid.loc[:, keep_cols + symptoms + lab]
sle = sle.loc[:, keep_cols + symptoms + lab]

# Evaluation groups
pre_sle = pre_sle.loc[:, keep_cols + symptoms + lab]
rest = rest.loc[:, keep_cols + symptoms + lab]
rest_large = rest_large.loc[:, keep_cols + symptoms + lab]
lld = lld.loc[:, keep_cols + symptoms + lab]
Discard one SLE patient with missing data
# Drop SLE rows that are missing any of the chip variables
sle = sle.dropna(subset=keep_cols)  # serum from one SLE patient was not run on chip
And row-bind all the data frames together
# add class to distinguish from others
blood_bank['Class'] = "BBD"
other_imid['Class'] = "IMID"
non_imid['Class'] = "nonIMID"
sle['Class'] = "SLE"
# join all data frames together by binding rows
# (blood_bank lacks the symptom and lab columns, so concat fills those with NaN for BBD rows)
df_all = pd.concat([sle, other_imid, non_imid, blood_bank])

# Same labelling for the evaluation groups
rest['Class'] = "rest"
rest_large['Class'] = "rest_large"
pre_sle['Class'] = "preSLE"
lld['Class'] = "LLD"
df_eval = pd.concat([pre_sle, rest, rest_large, lld])

df_all['Class'].value_counts()  # display group sizes
SLE 483
BBD 361
IMID 346
nonIMID 218
Name: Class, dtype: int64
df_eval['Class'].value_counts()  # display group sizes of the evaluation set
rest_large 462
rest 415
LLD 28
preSLE 17
Name: Class, dtype: int64
# Write data: store both combined data frames as Feather files in write_dir
feather.write_dataframe(df_all, os.path.join(write_dir, "imid.feather"))
feather.write_dataframe(df_eval, os.path.join(write_dir, "rest.feather"))