Skip to content

Commit

Permalink
Refactor final scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
zuberek committed Oct 31, 2023
1 parent 9430e05 commit ae97df5
Show file tree
Hide file tree
Showing 14 changed files with 836 additions and 517 deletions.
3 changes: 2 additions & 1 deletion env/NSEA_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ dependencies:
- jax
- numpyro
- blackjax
- nutpie
- nutpie
- multiprocess
53 changes: 53 additions & 0 deletions scripts_datasets/make_wave1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# %% ########################
# IMPORTING
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..") # fix to import modules from root
from src.general_imports import *

# %% ########################
# LOAD DATA

DATASET_NAME = 'wave1'

# Raw inputs live in a per-dataset sub-directory. Derive it from DATASET_NAME
# so the dataset is named exactly once and the input paths cannot drift away
# from the name used for the output file.
RAW_DATASET_DIR = f'{paths.DATA_RAW_DIR}/{DATASET_NAME}'

# All three tables use the 'Basename' column as their index.
pheno = pd.read_csv(
    f'{RAW_DATASET_DIR}/phenotypes_and_prevalent_disease.csv',
    index_col='Basename',
)
survival = pd.read_csv(
    f'{RAW_DATASET_DIR}/survival.csv',
    index_col='Basename',
)
clock_results = pd.read_csv(
    f'{RAW_DATASET_DIR}/DNAmAge_output.csv',
    index_col='Basename',
)

# %% ########################
# ADD PARTICIPANT META DATA

# Smoking exposure: log(1 + pack_years), then divided by exp(ever_smoke) to
# give the combined 'weighted_smoke' phenotype.
pheno['norm_pack_years'] = np.log(pheno['pack_years'] + 1)
pheno['weighted_smoke'] = pheno['norm_pack_years'] / np.exp(pheno['ever_smoke'])

# Log-transformed covariates; the "+ 1" keeps zero pack-years / units finite.
pheno['log_bmi'] = np.log(pheno['bmi'])
pheno['log_pack_1'] = np.log(pheno['pack_years'] + 1)
pheno['log_units_1'] = np.log(pheno['units'] + 1)

# Attach the age accelerations assigned to each participant by the other
# clocks, stored under shorter column names.
# NOTE(review): 'EEAA' is exported under the label 'Horvath' -- confirm that
# this is the intended source column.
clock_name_map = {
    'AgeAccelerationResidualHannum': 'Hannum',
    'EEAA': 'Horvath',
    'AgeAccelGrim': 'GrimAge',
    'AgeAccelPheno': 'PhenoAge',
}
clock_results = clock_results.rename(clock_name_map, axis='columns')
clock_columns = ['Hannum', 'Horvath', 'GrimAge', 'PhenoAge']
pheno[clock_columns] = clock_results[clock_columns]

# Copy the 'Event' and 'tte' columns over from the survival table.
survival_columns = ['Event', 'tte']
pheno[survival_columns] = survival[survival_columns]

# %% ########################
# SAVE RESULTS

output_csv = f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv'
pheno.to_csv(output_csv)

# %%
53 changes: 53 additions & 0 deletions scripts_datasets/make_wave3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# %% ########################
# IMPORTING
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..") # fix to import modules from root
from src.general_imports import *

# %% ########################
# LOAD DATA

DATASET_NAME = 'wave3'

# Raw inputs live in a per-dataset sub-directory. Derive it from DATASET_NAME
# so the dataset is named exactly once and the input paths cannot drift away
# from the name used for the output file.
RAW_DATASET_DIR = f'{paths.DATA_RAW_DIR}/{DATASET_NAME}'

# All three tables use the 'Basename' column as their index.
pheno = pd.read_csv(
    f'{RAW_DATASET_DIR}/phenotypes_and_prevalent_disease.csv',
    index_col='Basename',
)
survival = pd.read_csv(
    f'{RAW_DATASET_DIR}/survival.csv',
    index_col='Basename',
)
clock_results = pd.read_csv(
    f'{RAW_DATASET_DIR}/DNAmAge_output.csv',
    index_col='Basename',
)

# %% ########################
# ADD PARTICIPANT META DATA

# Smoking exposure: log(1 + pack_years), then divided by exp(ever_smoke) to
# give the combined 'weighted_smoke' phenotype.
pheno['norm_pack_years'] = np.log(pheno['pack_years'] + 1)
pheno['weighted_smoke'] = pheno['norm_pack_years'] / np.exp(pheno['ever_smoke'])

# Log-transformed covariates; the "+ 1" keeps zero pack-years / units finite.
pheno['log_bmi'] = np.log(pheno['bmi'])
pheno['log_pack_1'] = np.log(pheno['pack_years'] + 1)
pheno['log_units_1'] = np.log(pheno['units'] + 1)

# Attach the age accelerations assigned to each participant by the other
# clocks, stored under shorter column names.
# NOTE(review): 'EEAA' is exported under the label 'Horvath' -- confirm that
# this is the intended source column.
clock_name_map = {
    'AgeAccelerationResidualHannum': 'Hannum',
    'EEAA': 'Horvath',
    'AgeAccelGrim': 'GrimAge',
    'AgeAccelPheno': 'PhenoAge',
}
clock_results = clock_results.rename(clock_name_map, axis='columns')
clock_columns = ['Hannum', 'Horvath', 'GrimAge', 'PhenoAge']
pheno[clock_columns] = clock_results[clock_columns]

# Copy the 'Event' and 'tte' columns over from the survival table.
survival_columns = ['Event', 'tte']
pheno[survival_columns] = survival[survival_columns]

# %% ########################
# SAVE RESULTS

output_csv = f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv'
pheno.to_csv(output_csv)

# %%
44 changes: 44 additions & 0 deletions scripts_datasets/make_wave4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# %% ########################
# IMPORTING
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..") # fix to import modules from root
from src.general_imports import *

# %% ########################
# LOAD DATA

DATASET_NAME = 'wave4'

# Raw inputs live in a per-dataset sub-directory. Derive it from DATASET_NAME
# so the dataset is named exactly once and the input paths cannot drift away
# from the name used for the output file.
RAW_DATASET_DIR = f'{paths.DATA_RAW_DIR}/{DATASET_NAME}'

# Phenotypes are indexed by participant 'id'.
pheno = pd.read_csv(
    f'{RAW_DATASET_DIR}/2023-08-02_w4_phenotypes.csv',
    index_col='id',
)
# Survival data is not loaded for this dataset yet (kept disabled):
# survival = pd.read_csv(f'{RAW_DATASET_DIR}/2023-08-02_w4_deaths.csv', index_col='Basename')

# Sample sheet, indexed by 'Sample_Sentrix_ID'.
sample_meta = pd.read_csv(
    f'{RAW_DATASET_DIR}/sample_meta.csv',
    index_col='Sample_Sentrix_ID',
)

# Fix naming to fit with the other waves: keep only the columns needed,
# expose 'Sample_Name' as 'id', and call the index 'Basename'.
sample_meta = (
    sample_meta[['Sample_Name', 'age', 'sex']]
    .rename(columns={'Sample_Name': 'id'})
    .rename_axis('Basename')
)

# Look up each sample's phenotypes through its 'id' column; the result keeps
# sample_meta's rows and its 'Basename' index.
pheno = sample_meta.join(pheno, on='id')

# %% ########################
# ADD PARTICIPANT META DATA

# Smoking exposure: log(1 + pack_years), then divided by exp(ever_smoke) to
# give the combined 'weighted_smoke' phenotype.
pheno['norm_pack_years'] = np.log(pheno['pack_years'] + 1)
pheno['weighted_smoke'] = pheno['norm_pack_years'] / np.exp(pheno['ever_smoke'])

# Log-transformed covariates; the "+ 1" keeps zero pack-years / units finite.
pheno['log_bmi'] = np.log(pheno['bmi'])
pheno['log_pack_1'] = np.log(pheno['pack_years'] + 1)
pheno['log_units_1'] = np.log(pheno['units'] + 1)

# Survival columns are not attached for this dataset yet (kept disabled):
# pheno[['dob_ym', 'dod_ym']] = survival[['dob_ym', 'dod_ym']]

# %%
output_csv = f'{paths.DATA_PROCESSED_DIR}/{DATASET_NAME}_participants.csv'
pheno.to_csv(output_csv)
Loading

0 comments on commit ae97df5

Please sign in to comment.