Refactor final scripts

yossawat-suw · Oct 31, 2023 · ae97df5 · ae97df5
1 parent 9430e05
commit ae97df5
Show file tree

Hide file tree

Showing 14 changed files with 836 additions and 517 deletions.
diff --git a/env/NSEA_env.yml b/env/NSEA_env.yml
@@ -26,4 +26,5 @@ dependencies:
   - jax
   - numpyro
   - blackjax
-  - nutpie
+  - nutpie
+  - multiprocess
diff --git a/scripts_datasets/make_wave1.py b/scripts_datasets/make_wave1.py
@@ -0,0 +1,53 @@
+# %% ########################
+# IMPORTING (the two % lines are IPython magics: run this file as cells in an IPython session, not with plain python)
+%load_ext autoreload 
+%autoreload 2
+
+import sys
+sys.path.append("..")   # lets `src` be imported from the repository root; ".." is resolved against the current working directory, not this file
+from src.general_imports import *  # NOTE(review): presumably supplies pd, np and paths used below -- confirm
+
+# %% ########################
+# LOAD DATA: three raw CSV files for this wave, each indexed by its 'Basename' column
+
+DATASET_NAME = 'wave1'  # prefix of the output file name in the SAVE RESULTS cell
+
+pheno = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave1/phenotypes_and_prevalent_disease.csv', index_col='Basename')
+survival = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave1/survival.csv', index_col='Basename')
+clock_results = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave1/DNAmAge_output.csv', index_col='Basename')
+
+# %% ########################
+# ADD PARTICIPANT META DATA (derived columns are added to pheno)
+
+# Create weighted_smoke phenotype
+# Log-transform pack_years: norm_pack_years = log(1 + pack_years)
+pheno['norm_pack_years'] = np.log(1+pheno.pack_years)
+
+# Combine ever_smoke with pack_years: norm_pack_years / exp(ever_smoke); NOTE(review): the coding of ever_smoke is not visible here
+pheno['weighted_smoke'] = pheno['norm_pack_years']/np.exp(pheno['ever_smoke'])
+
+pheno['log_bmi'] = np.log(pheno.bmi)  # natural log, no +1 offset
+pheno['log_pack_1'] = np.log(pheno.pack_years+1)  # same values as norm_pack_years, stored under a second name
+pheno['log_units_1'] = np.log(pheno.units+1)  # the +1 offset keeps units == 0 finite (log(1) = 0)
+
+# Add the age accelerations assigned to the genscot participants by other clocks
+####################
+# rename the clock output columns to short, manageable names
+clock_results = clock_results.rename(columns={
+    'AgeAccelerationResidualHannum': 'Hannum',
+    'EEAA': 'Horvath',  # NOTE(review): confirm that EEAA is the column meant to be labelled 'Horvath'
+    'AgeAccelGrim': 'GrimAge',
+    'AgeAccelPheno': 'PhenoAge',
+    })
+clock_columns = ['Hannum','Horvath','GrimAge','PhenoAge']
+pheno[clock_columns] = clock_results[clock_columns]  # aligned on the index; participants missing from clock_results get NaN
+
+### Add survival columns, aligned on the index like the clock columns (tte: presumably time to event -- confirm)
+pheno[['Event', 'tte']] = survival[['Event', 'tte']]
+
+# %% ########################
+# SAVE RESULTS: one CSV per dataset, with the index written as the first column
+
+pheno.to_csv(f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv')
+
+# %%
diff --git a/scripts_datasets/make_wave3.py b/scripts_datasets/make_wave3.py
@@ -0,0 +1,53 @@
+# %% ########################
+# IMPORTING (the two % lines are IPython magics: run this file as cells in an IPython session, not with plain python)
+%load_ext autoreload 
+%autoreload 2
+
+import sys
+sys.path.append("..")   # lets `src` be imported from the repository root; ".." is resolved against the current working directory, not this file
+from src.general_imports import *  # NOTE(review): presumably supplies pd, np and paths used below -- confirm
+
+# %% ########################
+# LOAD DATA: three raw CSV files for this wave, each indexed by its 'Basename' column
+
+DATASET_NAME = 'wave3'  # prefix of the output file name in the SAVE RESULTS cell
+
+pheno = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave3/phenotypes_and_prevalent_disease.csv', index_col='Basename')
+survival = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave3/survival.csv', index_col='Basename')
+clock_results = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave3/DNAmAge_output.csv', index_col='Basename')
+
+# %% ########################
+# ADD PARTICIPANT META DATA (derived columns are added to pheno)
+
+# Create weighted_smoke phenotype
+# Log-transform pack_years: norm_pack_years = log(1 + pack_years)
+pheno['norm_pack_years'] = np.log(1+pheno.pack_years)
+
+# Combine ever_smoke with pack_years: norm_pack_years / exp(ever_smoke); NOTE(review): the coding of ever_smoke is not visible here
+pheno['weighted_smoke'] = pheno['norm_pack_years']/np.exp(pheno['ever_smoke'])
+
+pheno['log_bmi'] = np.log(pheno.bmi)  # natural log, no +1 offset
+pheno['log_pack_1'] = np.log(pheno.pack_years+1)  # same values as norm_pack_years, stored under a second name
+pheno['log_units_1'] = np.log(pheno.units+1)  # the +1 offset keeps units == 0 finite (log(1) = 0)
+
+# Add the age accelerations assigned to the genscot participants by other clocks
+####################
+# rename the clock output columns to short, manageable names
+clock_results = clock_results.rename(columns={
+    'AgeAccelerationResidualHannum': 'Hannum',
+    'EEAA': 'Horvath',  # NOTE(review): confirm that EEAA is the column meant to be labelled 'Horvath'
+    'AgeAccelGrim': 'GrimAge',
+    'AgeAccelPheno': 'PhenoAge',
+    })
+clock_columns = ['Hannum','Horvath','GrimAge','PhenoAge']
+pheno[clock_columns] = clock_results[clock_columns]  # aligned on the index; participants missing from clock_results get NaN
+
+### Add survival columns, aligned on the index like the clock columns (tte: presumably time to event -- confirm)
+pheno[['Event', 'tte']] = survival[['Event', 'tte']]
+
+# %% ########################
+# SAVE RESULTS: one CSV per dataset, with the index written as the first column
+
+pheno.to_csv(f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv')
+
+# %%
diff --git a/scripts_datasets/make_wave4.py b/scripts_datasets/make_wave4.py
@@ -0,0 +1,44 @@
+# %% ########################
+# IMPORTING (the two % lines are IPython magics: run this file as cells in an IPython session, not with plain python)
+%load_ext autoreload 
+%autoreload 2
+
+import sys
+sys.path.append("..")   # lets `src` be imported from the repository root; ".." is resolved against the current working directory, not this file
+from src.general_imports import *  # NOTE(review): presumably supplies pd, np and paths used below -- confirm
+
+# %% ########################
+# LOAD DATA: phenotypes indexed by 'id', sample metadata indexed by 'Sample_Sentrix_ID'
+
+DATASET_NAME = 'wave4'  # prefix of the output file name written in the last cell
+
+pheno = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave4/2023-08-02_w4_phenotypes.csv', index_col='id')
+# survival = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave4/2023-08-02_w4_deaths.csv', index_col='Basename')
+
+sample_meta = pd.read_csv(f'{paths.DATA_RAW_DIR}/wave4/sample_meta.csv', index_col='Sample_Sentrix_ID')
+
+# fix index naming to fit with other waves: the index becomes 'Basename' and Sample_Name becomes 'id'
+sample_meta = sample_meta[['Sample_Name', 'age', 'sex']]  # keep only these three metadata columns
+sample_meta.index.name = 'Basename'
+sample_meta = sample_meta.rename({'Sample_Name': 'id'}, axis='columns')
+pheno = sample_meta.join(pheno, on='id')  # left join (pandas default): sample_meta's 'id' column is matched against pheno's index
+# NOTE(review): the join raises if pheno also has an 'age' or 'sex' column, as no suffixes are given -- confirm it does not
+# %% ########################
+# ADD PARTICIPANT META DATA (derived columns are added to pheno)
+
+# Create weighted_smoke phenotype
+# Log-transform pack_years: norm_pack_years = log(1 + pack_years)
+pheno['norm_pack_years'] = np.log(1+pheno.pack_years)
+
+# Combine ever_smoke with pack_years: norm_pack_years / exp(ever_smoke); NOTE(review): the coding of ever_smoke is not visible here
+pheno['weighted_smoke'] = pheno['norm_pack_years']/np.exp(pheno['ever_smoke'])
+
+pheno['log_bmi'] = np.log(pheno.bmi)  # natural log, no +1 offset
+pheno['log_pack_1'] = np.log(pheno.pack_years+1)  # same values as norm_pack_years, stored under a second name
+pheno['log_units_1'] = np.log(pheno.units+1)  # the +1 offset keeps units == 0 finite (log(1) = 0)
+
+### survival columns are not merged for this wave (the load above and the assignment below are commented out)
+# pheno[['dob_ym', 'dod_ym']] = survival[['dob_ym', 'dod_ym']]
+
+# %%
+pheno.to_csv(f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv')  # SAVE RESULTS: index written as the first column
-Original file line number
+Diff line change
@@ -26,4 +26,5 @@ dependencies:
       - jax
       - numpyro
       - blackjax
-      - nutpie
+      - nutpie
+      - multiprocess