-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_Data_forCox.py
127 lines (95 loc) · 7.02 KB
/
load_Data_forCox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def load_Data_forCox():
""" Function that loads data for the three matrix needed:
X containing the clinical, images, Immunoscore DATA
Y the survival time
delta the censure status
open the file for precision about the clinical data available,drop the not-needed ones in the notebook
cleaning of data consists of deleting the clinical data having only one level when categorical
"""
import os
import pandas
import numpy
#pathway to the folder containing all the datas
SAVE_CLI_PATH=os.path.join('..')
#List of usable patients
data_ID=pandas.read_csv(os.path.join('..','List_Patient_HEGP.csv'),sep=",")
#### GENERATING THE X MATRIX ####
# Data theoreticaly available can be found in file : Immunoscore Clinical Data and Formating Requirements 10.2.2015.xlsx
# In practice not all the columns are filled, for different reason, one is that the information does not apply to the cohort, so the whole of the column is empty.
# And for ethical reason, the column race is always ignored.
# Other columns ignored are the ones pertaining to:
# the dates of possible undergone surgeries
# the Survival data
# the additional markers, text not usable in a survival study
# Some of the data is categorical, from 2 to 6 different categories (not counting missing information)
# Missing data is represented by an empty cell in the original file
#the immunoscore
Data_IS_MDT=pandas.read_csv(os.path.join('../Data_output','HEGP_Immunoscore_mean.csv'),sep=',')
#Clinical data, only requested columns
data_cl=pandas.read_csv(os.path.join('../Data_output','data_HEGP_Clinical.txt'),sep=';')
col_names=['OfficialID','gender','age','t_stage_mod_6is4_7is1','n_stage','plnode','m_stage','UICC_TNM','preop_chemo','postop_chemo','postop_biotherapy','surg_pt_type','nlnode','plnode','colon','sidedness','cecum','ascending','splenflex','transverse','hepflex','descending','sigmoid','immuno_tx','differentiation','mucinous_colloide','occlusion','perforation','venous_emboli','lymphatic_invasion','perineural_invasion','msi_ihc','msi_gen','lynch_syndrome','fap','ibs','tumor_budding','cimp','p53_status','kras_status','apc_status','braf_status','pi3k_status','MSI_NBP','CD3_Tumor_Density','CD3_IM_Density','CD8_Tumor_Density','CD8_IM_Density','Immunoscore']
df_final=pandas.DataFrame(data=data_cl[['OfficialID','gender','age','t_stage_mod_6is4_7is1','n_stage','plnode','m_stage','UICC_TNM','preop_chemo','postop_chemo','postop_biotherapy',
'surg_pt_type','nlnode','plnode','colon','sidedness','cecum','ascending','splenflex','transverse','hepflex','descending',
'sigmoid','immuno_tx','differentiation','mucinous_colloide','occlusion','perforation','venous_emboli','lymphatic_invasion','perineural_invasion',
'msi_ihc','msi_gen','lynch_syndrome','fap','ibs','tumor_budding','cimp','p53_status','kras_status','apc_status','braf_status','pi3k_status','MSI_NBP']])
df_final.set_index('OfficialID',inplace=True,drop=False)
Data_IS_MDT.set_index('OfficialID',drop=False,inplace=True)
#Clinical data for the usable patient, other are droped
data_ID['OfficialID']
df_final=df_final.merge(data_ID,on='OfficialID')
df_final.set_index('OfficialID',inplace=True,drop=False)
#getting the density and immunoscore of the usable patients in their own dataFrame, with always the OfficialID for safety
df_CD3_CT=pandas.DataFrame(columns=['OfficialID','CD3_Tumor_Density'])
df_CD3_IM=pandas.DataFrame(columns=['OfficialID','CD3_IM_Density'])
df_CD8_CT=pandas.DataFrame(columns=['OfficialID','CD8_Tumor_Density'])
df_CD8_IM=pandas.DataFrame(columns=['OfficialID','CD8_IM_Density'])
df_IS=pandas.DataFrame(columns=['OfficialID','Immunoscore'])
for patient_id in list(data_ID['OfficialID']):
CD3PATH=os.path.join(SAVE_CLI_PATH,'Data_output','cache_HEGP','cd3',patient_id)
CD8PATH=os.path.join(SAVE_CLI_PATH,'Data_output','cache_HEGP','cd8',patient_id)
df=pandas.DataFrame(data=[[patient_id,[pandas.read_csv(os.path.join(CD3PATH,'Ratio_Cells_per_Tumor'),sep=' ',header=None)]]],columns=['OfficialID','CD3_Tumor_Density'])
df_CD3_CT=df_CD3_CT.append(df)
df=pandas.DataFrame(data=[[patient_id,[pandas.read_csv(os.path.join(CD3PATH,'Ratio_Cells_per_InvasiveFront'),sep=' ',header=None)]]],columns=['OfficialID','CD3_IM_Density'])
df_CD3_IM=df_CD3_IM.append(df)
df=pandas.DataFrame(data=[[patient_id,[pandas.read_csv(os.path.join(CD8PATH,'Ratio_Cells_per_Tumor'),sep=' ',header=None)]]],columns=['OfficialID','CD8_Tumor_Density'])
df_CD8_CT=df_CD8_CT.append(df)
df=pandas.DataFrame(data=[[patient_id,[pandas.read_csv(os.path.join(CD8PATH,'Ratio_Cells_per_InvasiveFront'),sep=' ',header=None)]]],columns=['OfficialID','CD8_IM_Density'])
df_CD8_IM=df_CD8_IM.append(df)
df=pandas.DataFrame(data=[[patient_id,Data_IS_MDT.at[patient_id,'IS_MDT']]],columns=['OfficialID','Immunoscore'])
df_IS=df_IS.append(df)
# Merging the dataFrame, based on OfficialID
df_final=df_final.merge(df_CD3_CT,on='OfficialID')
df_final=df_final.merge(df_CD3_IM,on='OfficialID')
df_final=df_final.merge(df_CD8_CT,on='OfficialID')
df_final=df_final.merge(df_CD8_IM,on='OfficialID')
df_final=df_final.merge(df_IS,on='OfficialID')
#Setting the index on the final DataFrame, patient id.
df_final.set_index('OfficialID',inplace=True,drop=True)
colTypes = pandas.read_csv('datatypes.csv',sep=',')
#Iterate through each row and assign variable type.
#Note: astype is used to assign types
for i, row in colTypes.iterrows(): #i: dataframe index; row: each row in series format
if row['type']=="categorical":
df_final[row['feature']]=df_final[row['feature']].astype(numpy.object)
elif row['type']=="continuous":
df_final[row['feature']]=df_final[row['feature']].astype(numpy.float)
#### GENERATING Y and Delta ####
#Survival data
data_Survival=pandas.read_csv(os.path.join('../Data_output','data_HEGP_Survival.txt'),sep=";")
# We are looking here at the TTR, Time To Relapse.
# The event is the relapse.
# In the file,rc_stat: 1 means relapse has occured
# time in TTR_2012_Days.
# output: time dataframe index on OfficialID time in TTR_2012_Days.
# output: censure dataframe index on Official ID
data_TTR=pandas.DataFrame(data=data_Survival[['OfficialID','TTR_2012_Days','rc_stat']])
data_TTR.set_index('OfficialID',inplace=True,drop=False)
#Selecting only usable patients
data_ID['OfficialID']
data_TTR=data_TTR.merge(data_ID,on='OfficialID')
data_TTR.set_index('OfficialID',inplace=True,drop=False)
data_time=pandas.DataFrame(data=data_TTR[['OfficialID','TTR_2012_Days']])
data_time.set_index('OfficialID',inplace=True,drop=True)
data_cens=pandas.DataFrame(data=data_TTR[['OfficialID','rc_stat']])
data_cens.set_index('OfficialID',inplace=True,drop=True)
return df_final,data_time,data_cens