Skip to content

Commit

Permalink
Merge branch 'release-v0.4.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
brainfood committed Mar 12, 2024
2 parents 3e34429 + a9610b5 commit 4db5a80
Show file tree
Hide file tree
Showing 11 changed files with 72 additions and 39 deletions.
10 changes: 5 additions & 5 deletions camparee/beagle.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class BeagleStep(AbstractCampareeStep):

#Beagle takes input from the VariantsCompilationStep. This will make sure the
#input filename matches up with the name used by the VariantsCompilationStep.
BEAGLE_INTPUT_FILENAME = CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME
BEAGLE_INPUT_FILENAME = CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME

#Name of file where script logging stored.
BEAGLE_LOG_FILENAME = CAMPAREE_CONSTANTS.BEAGLE_LOG_FILENAME
Expand Down Expand Up @@ -57,7 +57,7 @@ def execute(self, beagle_jar_path, seed=None):
the same results.
"""
input_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_INTPUT_FILENAME)
input_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_INPUT_FILENAME)
output_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_OUTPUT_FILENAME)
log_file_path = os.path.join(self.log_directory_path, BeagleStep.BEAGLE_LOG_FILENAME)
command = f"java -jar {beagle_jar_path} gt={input_file_path} out={output_file_path}"
Expand All @@ -78,7 +78,7 @@ def execute(self, beagle_jar_path, seed=None):
try:
beagle_result = subprocess.run(command, shell=True, check=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, #Redirect stderr to stdout.
stderr=subprocess.STDOUT, # Redirect stderr to stdout.
encoding="ascii")
except subprocess.CalledProcessError as beagle_run_exception:
log_file.write("\n*****ERROR: Beagle command failed:\n")
Expand All @@ -90,9 +90,9 @@ def execute(self, beagle_jar_path, seed=None):
raise CampareeException(f"\nBeagle process failed. "
f"For full details see {log_file_path}\n")

print(f"Finished running Beagle.\n")
print("Finished running Beagle.\n")
log_file.write(f"{beagle_result.stdout}\n")
log_file.write(f"\nFinished running Beagle.\n")
log_file.write("\nFinished running Beagle.\n")
log_file.write("ALL DONE!\n")

def get_commandline_call(self, beagle_jar_path, seed=None):
Expand Down
2 changes: 1 addition & 1 deletion camparee/camparee_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,4 @@
MOLECULE_MAKER_LOG_FILENAME="MoleculeMakerStep.log"
)

CAMPAREE_VERSION="0.4.0"
CAMPAREE_VERSION="0.4.1"
48 changes: 30 additions & 18 deletions camparee/expression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,35 +349,35 @@ def validate_and_set_resources(self, resources):
resources_directory_path = resources.get('directory_path', None)
if not resources_directory_path:
resources_directory_path = os.path.join(CAMPAREE_CONSTANTS.CAMPAREE_ROOT_DIR, "resources")
elif not(os.path.exists(resources_directory_path) and os.path.isdir(resources_directory_path)):
elif not (os.path.exists(resources_directory_path) and os.path.isdir(resources_directory_path)):
print(f"The given resources directory, {resources_directory_path}, must exist as a directory.",
file=sys.stderr)
return False

# Insure that the species model directory exists. No point in continuing util this problem is
# resolved.
species_model_directory_path = os.path.join(resources_directory_path, resources['species_model'])
if not(os.path.exists(species_model_directory_path) and os.path.isdir(species_model_directory_path)):
if not (os.path.exists(species_model_directory_path) and os.path.isdir(species_model_directory_path)):
print(f"The species model directory, {species_model_directory_path}, must exist as a directory",
file=sys.stderr)
return False

# Insure that the reference genome file path exists and points to a file.
self.reference_genome_file_path = os.path.join(species_model_directory_path, resources['reference_genome_filename'])
if not(os.path.exists(self.reference_genome_file_path) and os.path.isfile(self.reference_genome_file_path)):
if not (os.path.exists(self.reference_genome_file_path) and os.path.isfile(self.reference_genome_file_path)):
print(f"The reference genome file path, {self.reference_genome_file_path}, must exist as"
f" a file.", file=sys.stderr)
valid = False

# Insure that the chromosome ploidy file path exists and points to a file.
self.chr_ploidy_file_path = os.path.join(species_model_directory_path, resources['chr_ploidy_filename'])
if not(os.path.exists(self.chr_ploidy_file_path) and os.path.isfile(self.chr_ploidy_file_path)):
if not (os.path.exists(self.chr_ploidy_file_path) and os.path.isfile(self.chr_ploidy_file_path)):
print(f"The chr ploidy file path, {self.chr_ploidy_file_path} must exist as a file", file=sys.stderr)
valid = False

# Insure that the annotations file path exists and points to a file.
self.annotation_file_path = os.path.join(species_model_directory_path, resources['annotation_filename'])
if not(os.path.exists(self.annotation_file_path) and os.path.isfile(self.annotation_file_path)):
if not (os.path.exists(self.annotation_file_path) and os.path.isfile(self.annotation_file_path)):
print(f"The annotation file path, {self.annotation_file_path} must exist as a file", file=sys.stderr)
valid = False

Expand Down Expand Up @@ -453,21 +453,34 @@ def execute(self):
# on the same BAM file at the same time, though I don't know why this would be a problem.

seed = seeds["VariantsCompilationStep"]
phased_output = False
# Can't phase variants with only one sample. Need VariantsCompilationStep
# output (alleles separated by "|", instead of "/") to follow phased format,
# since that's what the GenomeBuilderStep scripts can process.
if len(self.samples) == 1:
phased_output = True
self.run_step(step_name='VariantsCompilationStep',
sample=None,
cmd_line_args=[[sample.sample_id for sample in self.samples],
self.chr_ploidy_file_path,
self.reference_genome_file_path, seed],
self.reference_genome_file_path,
phased_output,
seed],
dependency_list=[f"VariantsFinderStep_{sample.sample_id}" for sample in self.samples])

phased_vcf_file = self.optional_inputs['phased_vcf_file']
# If user did not provide phased vcf file
if phased_vcf_file is None:
seed = seeds["BeagleStep"]
self.run_step(step_name='BeagleStep',
sample=None,
cmd_line_args=[self.beagle_file_path, seed],
dependency_list=["VariantsCompilationStep"])
# Can't phase variants with only one sample
if len(self.samples) == 1:
phased_vcf_file = os.path.join(self.data_directory_path,
CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME)
else:
seed = seeds["BeagleStep"]
self.run_step(step_name='BeagleStep',
sample=None,
cmd_line_args=[self.beagle_file_path, seed],
dependency_list=["VariantsCompilationStep"])

#TODO: We could load all of the steps in the entire pipeline into the queue
# and then just have the queue keep running until everything finishes.
Expand Down Expand Up @@ -704,13 +717,12 @@ def run_step(self, step_name, sample, cmd_line_args, dependency_list=None,
f"{step_name}{f'-{jobname_suffix}' if jobname_suffix else ''}.{self.scheduler_mode}.err")
scheduler_job_name = (f"{step_name}{f'_sample{sample.sample_id}_{sample.sample_name}' if sample else ''}"
f"{f'-{jobname_suffix}' if jobname_suffix else ''}")
scheduler_args = {'job_name' : scheduler_job_name,
'stdout_logfile' : stdout_log,
'stderr_logfile' : stderr_log,
'num_processors' : scheduler_num_processors,
'memory_in_mb' : scheduler_memory_in_mb,
'additional_args' : scheduler_submission_args
}
scheduler_args = {'job_name': scheduler_job_name,
'stdout_logfile': stdout_log,
'stderr_logfile': stderr_log,
'num_processors': scheduler_num_processors,
'memory_in_mb': scheduler_memory_in_mb,
'additional_args': scheduler_submission_args}
command = step_class.get_commandline_call(*cmd_line_args)
validation_attributes = step_class.get_validation_attributes(*cmd_line_args)
output_directory = os.path.join(step_class.data_directory_path,
Expand Down
2 changes: 1 addition & 1 deletion camparee/genome_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def locate_sample(self):
elif line[0:2] == "##":
continue
else:
raise CampareeException(f"No sample data found.")
raise CampareeException("No sample data found.")
return sample_index

def execute(self, sample, phased_vcf_file_path, chr_ploidy_data, reference_genome, chromosome_list=None):
Expand Down
37 changes: 29 additions & 8 deletions camparee/variants_compilation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def __init__(self, log_directory_path, data_directory_path, parameters=None):
def validate(self):
return True

def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None):
def execute(self, sample_id_list, chr_ploidy_data, reference_genome, phased_output=False, seed=None):
"""
Entry point into variants_compilation.
Expand All @@ -33,6 +33,10 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None):
ploidy as values.
reference_genome : dict
Dictionary representation of the reference genome
phased_output : bool
Change character used to separate alleles in VCF output.
True - "|" = alleles have known phases
False - "/" = alleles have unknown phases [default]
seed : int
Seed for random number generator. Used so repeated runs will produce
the same results.
Expand All @@ -43,6 +47,10 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None):
log_file_path = os.path.join(self.log_directory_path,
CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_LOG_FILENAME)

allele_sep = "/"
if phased_output is True:
allele_sep = "|"

# The common_variant() method defined below uses a random number when
# choosing which of two equally prevalent variants to keep.
if seed is not None:
Expand All @@ -51,6 +59,8 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None):
with open(log_file_path, "w") as log_file:
print("Converting variants into vcf file")
log_file.write("Converting variants into vcf file\n")
if phased_output is True:
log_file.write("Output formatted as phased alleles.\n")
contigs_so_far = []
last_chromosome = None
# Open then process all the files
Expand Down Expand Up @@ -127,7 +137,7 @@ def common_variant(variants):
most_common_variants = [common_variant(vars) for vars in variants]

# Output nothing if we've now discarded all the variants
if all(var == None for var in most_common_variants):
if all(var is None for var in most_common_variants):
next_lines = [line if not used else sample_file.readline()
for line, used, sample_file in zip(next_lines, used_samples, variant_files)]
continue
Expand Down Expand Up @@ -156,7 +166,7 @@ def variant_length(variant):
alts = []
for variant, num_vars in zip(most_common_variants, num_variants):
if variant is None:
sample_descriptions.append("0/0") # ref-ref if no variants for this sample
sample_descriptions.append(f"0{allele_sep}0") # ref-ref if no variants for this sample
continue

# With deletions, include one extra base
Expand Down Expand Up @@ -189,9 +199,9 @@ def variant_length(variant):
# and the most common alt as the other
# TODO: should we ever use alt-alt with two different alts? we don't currently
if num_vars == 1:
sample_descriptions.append(f"{idx}/{idx}") # alt-alt
sample_descriptions.append(f"{idx}{allele_sep}{idx}") # alt-alt
else:
sample_descriptions.append(f"0/{idx}") # ref-alt
sample_descriptions.append(f"0{allele_sep}{idx}") # ref-alt

line = "\t".join([chromosome,
str(position),
Expand All @@ -214,7 +224,7 @@ def variant_length(variant):
log_file.write(f"Finished creating VCF file for Beagle with {i} variant entries\n")
log_file.write("ALL DONE!\n")

def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_file_path, seed=None):
def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_file_path, phased_output=False, seed=None):
"""
Prepare command to execute the VariantsCompilationStep from the command
line, given all of the arugments used to run the execute() function.
Expand All @@ -228,6 +238,10 @@ def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_f
File that maps chromosome names to their male/female ploidy.
reference_genome_file_path : string
File that maps chromosome names in reference to nucleotide sequence.
phased_output : bool
Change character used to separate alleles in VCF output.
True - "|" = alleles have known phases
False - "/" = alleles have unknown phases [default]
seed : integer
Seed for random number generator. Used so repeated runs will produce
the same results.
Expand All @@ -250,14 +264,15 @@ def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_f
f" --data_directory_path {self.data_directory_path}"
f" --sample_ids '{json.dumps(samples)}'"
f" --chr_ploidy_file_path {chr_ploidy_file_path}"
f" --reference_genome_file_path {reference_genome_file_path}")
f" --reference_genome_file_path {reference_genome_file_path}"
f" --phased_output {phased_output}")

if seed is not None:
command += f" --seed {seed}"

return command

def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_genome_file_path, seed=None):
def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_genome_file_path, phased_output=False, seed=None):
"""
Prepare attributes required by is_output_valid() function to validate
output generated the VariantsCompilationStep job.
Expand All @@ -278,6 +293,10 @@ def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_gen
[Note: this parameter is captured just so get_validation_attributes()
accepts the same arguments as get_commandline_call(). It is not used
here.]
phased_output : bool
Change character used to separate alleles in VCF output. [Note: this
parameter is captured just so get_validation_attributes() accepts
the same arguments as get_commandline_call(). It is not used here.]
seed : integer
Seed for random number generator. Used so repeated runs will produce
the same results. [Note: this parameter is captured just so
Expand Down Expand Up @@ -309,6 +328,7 @@ def main():
parser.add_argument('--sample_ids')
parser.add_argument('--chr_ploidy_file_path')
parser.add_argument('--reference_genome_file_path')
parser.add_argument('--phased_output', type=bool, default=False)
parser.add_argument('--seed', type=int, default=None)
args = parser.parse_args()

Expand All @@ -320,6 +340,7 @@ def main():
variants_compiler.execute(sample_id_list,
chr_ploidy_data,
reference_genome,
args.phased_output,
args.seed)

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion config/baby.config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Configuration File for CAMPAREE (v0.4.0)
# Configuration File for CAMPAREE (v0.4.1)
# This config file is prepared to work with the "baby genome" included with
# the CAMPAREE install. The baby genome and test data are derived from short
# (280KB-1MB) segments of mouse (GRCm38/mm10) chromosomes 1, 2, 3, X, and Y,
Expand Down
2 changes: 1 addition & 1 deletion config/template.config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Configuration File for CAMPAREE (v0.4.0)
# Configuration File for CAMPAREE (v0.4.1)
# See baby.config.yaml for an example of a fully specified config file.
# Everything listed below is required, except for those labeled [OPTIONAL].
# To remove optional arguments, they can be commented out by placing a "#"
Expand Down
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.4.0'
release = '0.4.1'


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion doc/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
scipy==1.1.0
scipy==1.10.0
roman==3.0
pandas==0.23.3
matplotlib==3.0.1
Expand Down
2 changes: 1 addition & 1 deletion doc/running_camparee.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Command Line Options
usage: run_camparee.py [-h] -c CONFIG [-r RUN_ID] [-d] [-m {lsf,serial,sge}]
[-s SEED]
CAMPAREE - RNA molecule simulator (v0.4.0)
CAMPAREE - RNA molecule simulator (v0.4.1)
optional arguments:
-h, --help show this help message and exit
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy==1.22.1
scipy==1.7.3
scipy==1.10.0
roman==3.3
pysam==0.18.0
pandas==1.4.0
Expand Down

0 comments on commit 4db5a80

Please sign in to comment.