diff --git a/camparee/beagle.py b/camparee/beagle.py index 1600780..7fc4eae 100644 --- a/camparee/beagle.py +++ b/camparee/beagle.py @@ -23,7 +23,7 @@ class BeagleStep(AbstractCampareeStep): #Beagle takes input from the VariantsCompilationStep. This will make sure the #input filename matches up with the name used by the VariantsCompilationStep. - BEAGLE_INTPUT_FILENAME = CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME + BEAGLE_INPUT_FILENAME = CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME #Name of file where script logging stored. BEAGLE_LOG_FILENAME = CAMPAREE_CONSTANTS.BEAGLE_LOG_FILENAME @@ -57,7 +57,7 @@ def execute(self, beagle_jar_path, seed=None): the same results. """ - input_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_INTPUT_FILENAME) + input_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_INPUT_FILENAME) output_file_path = os.path.join(self.data_directory_path, BeagleStep.BEAGLE_OUTPUT_FILENAME) log_file_path = os.path.join(self.log_directory_path, BeagleStep.BEAGLE_LOG_FILENAME) command = f"java -jar {beagle_jar_path} gt={input_file_path} out={output_file_path}" @@ -78,7 +78,7 @@ def execute(self, beagle_jar_path, seed=None): try: beagle_result = subprocess.run(command, shell=True, check=True, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, #Redirect stderr to stdout. + stderr=subprocess.STDOUT, # Redirect stderr to stdout. encoding="ascii") except subprocess.CalledProcessError as beagle_run_exception: log_file.write("\n*****ERROR: Beagle command failed:\n") @@ -90,9 +90,9 @@ def execute(self, beagle_jar_path, seed=None): raise CampareeException(f"\nBeagle process failed. " f"For full details see {log_file_path}\n") - print(f"Finished running Beagle.\n") + print("Finished running Beagle.\n") log_file.write(f"{beagle_result.stdout}\n") - log_file.write(f"\nFinished running Beagle.\n") + log_file.write("\nFinished running Beagle.\n") log_file.write("ALL DONE!\n") def get_commandline_call(self, beagle_jar_path, seed=None): diff --git a/camparee/camparee_constants.py b/camparee/camparee_constants.py index d067f45..51d6950 100644 --- a/camparee/camparee_constants.py +++ b/camparee/camparee_constants.py @@ -147,4 +147,4 @@ MOLECULE_MAKER_LOG_FILENAME="MoleculeMakerStep.log" ) -CAMPAREE_VERSION="0.4.0" +CAMPAREE_VERSION="0.4.1" diff --git a/camparee/expression_pipeline.py b/camparee/expression_pipeline.py index 9f4a751..4cc8505 100644 --- a/camparee/expression_pipeline.py +++ b/camparee/expression_pipeline.py @@ -349,7 +349,7 @@ def validate_and_set_resources(self, resources): resources_directory_path = resources.get('directory_path', None) if not resources_directory_path: resources_directory_path = os.path.join(CAMPAREE_CONSTANTS.CAMPAREE_ROOT_DIR, "resources") - elif not(os.path.exists(resources_directory_path) and os.path.isdir(resources_directory_path)): + elif not (os.path.exists(resources_directory_path) and os.path.isdir(resources_directory_path)): print(f"The given resources directory, {resources_directory_path}, must exist as a directory.", file=sys.stderr) return False @@ -357,27 +357,27 @@ def validate_and_set_resources(self, resources): # Insure that the species model directory exists. No point in continuing util this problem is # resolved. species_model_directory_path = os.path.join(resources_directory_path, resources['species_model']) - if not(os.path.exists(species_model_directory_path) and os.path.isdir(species_model_directory_path)): + if not (os.path.exists(species_model_directory_path) and os.path.isdir(species_model_directory_path)): print(f"The species model directory, {species_model_directory_path}, must exist as a directory", file=sys.stderr) return False # Insure that the reference genome file path exists and points to a file. self.reference_genome_file_path = os.path.join(species_model_directory_path, resources['reference_genome_filename']) - if not(os.path.exists(self.reference_genome_file_path) and os.path.isfile(self.reference_genome_file_path)): + if not (os.path.exists(self.reference_genome_file_path) and os.path.isfile(self.reference_genome_file_path)): print(f"The reference genome file path, {self.reference_genome_file_path}, must exist as" f" a file.", file=sys.stderr) valid = False # Insure that the chromosome ploidy file path exists and points to a file. self.chr_ploidy_file_path = os.path.join(species_model_directory_path, resources['chr_ploidy_filename']) - if not(os.path.exists(self.chr_ploidy_file_path) and os.path.isfile(self.chr_ploidy_file_path)): + if not (os.path.exists(self.chr_ploidy_file_path) and os.path.isfile(self.chr_ploidy_file_path)): print(f"The chr ploidy file path, {self.chr_ploidy_file_path} must exist as a file", file=sys.stderr) valid = False # Insure that the annotations file path exists and points to a file. self.annotation_file_path = os.path.join(species_model_directory_path, resources['annotation_filename']) - if not(os.path.exists(self.annotation_file_path) and os.path.isfile(self.annotation_file_path)): + if not (os.path.exists(self.annotation_file_path) and os.path.isfile(self.annotation_file_path)): print(f"The annotation file path, {self.annotation_file_path} must exist as a file", file=sys.stderr) valid = False @@ -453,21 +453,34 @@ def execute(self): # on the same BAM file at the same time, though I don't know why this would be a problem. seed = seeds["VariantsCompilationStep"] + phased_output = False + # Can't phase variants with only one sample. Need VariantsCompilationStep + # output (alleles separated by "|", instead of "/") to follow phased format, + # since that's what the GenomeBuilderStep scripts can process. + if len(self.samples) == 1: + phased_output = True self.run_step(step_name='VariantsCompilationStep', sample=None, cmd_line_args=[[sample.sample_id for sample in self.samples], self.chr_ploidy_file_path, - self.reference_genome_file_path, seed], + self.reference_genome_file_path, + phased_output, + seed], dependency_list=[f"VariantsFinderStep_{sample.sample_id}" for sample in self.samples]) phased_vcf_file = self.optional_inputs['phased_vcf_file'] # If user did not provide phased vcf file if phased_vcf_file is None: - seed = seeds["BeagleStep"] - self.run_step(step_name='BeagleStep', - sample=None, - cmd_line_args=[self.beagle_file_path, seed], - dependency_list=["VariantsCompilationStep"]) + # Can't phase variants with only one sample + if len(self.samples) == 1: + phased_vcf_file = os.path.join(self.data_directory_path, + CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_OUTPUT_FILENAME) + else: + seed = seeds["BeagleStep"] + self.run_step(step_name='BeagleStep', + sample=None, + cmd_line_args=[self.beagle_file_path, seed], + dependency_list=["VariantsCompilationStep"]) #TODO: We could load all of the steps in the entire pipeline into the queue # and then just have the queue keep running until everything finishes. @@ -704,13 +717,12 @@ def run_step(self, step_name, sample, cmd_line_args, dependency_list=None, f"{step_name}{f'-{jobname_suffix}' if jobname_suffix else ''}.{self.scheduler_mode}.err") scheduler_job_name = (f"{step_name}{f'_sample{sample.sample_id}_{sample.sample_name}' if sample else ''}" f"{f'-{jobname_suffix}' if jobname_suffix else ''}") - scheduler_args = {'job_name' : scheduler_job_name, - 'stdout_logfile' : stdout_log, - 'stderr_logfile' : stderr_log, - 'num_processors' : scheduler_num_processors, - 'memory_in_mb' : scheduler_memory_in_mb, - 'additional_args' : scheduler_submission_args - } + scheduler_args = {'job_name': scheduler_job_name, + 'stdout_logfile': stdout_log, + 'stderr_logfile': stderr_log, + 'num_processors': scheduler_num_processors, + 'memory_in_mb': scheduler_memory_in_mb, + 'additional_args': scheduler_submission_args} command = step_class.get_commandline_call(*cmd_line_args) validation_attributes = step_class.get_validation_attributes(*cmd_line_args) output_directory = os.path.join(step_class.data_directory_path, diff --git a/camparee/genome_builder.py b/camparee/genome_builder.py index 527e1e2..8860731 100644 --- a/camparee/genome_builder.py +++ b/camparee/genome_builder.py @@ -148,7 +148,7 @@ def locate_sample(self): elif line[0:2] == "##": continue else: - raise CampareeException(f"No sample data found.") + raise CampareeException("No sample data found.") return sample_index def execute(self, sample, phased_vcf_file_path, chr_ploidy_data, reference_genome, chromosome_list=None): diff --git a/camparee/variants_compilation.py b/camparee/variants_compilation.py index 3d421c7..3f18e65 100644 --- a/camparee/variants_compilation.py +++ b/camparee/variants_compilation.py @@ -20,7 +20,7 @@ def __init__(self, log_directory_path, data_directory_path, parameters=None): def validate(self): return True - def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None): + def execute(self, sample_id_list, chr_ploidy_data, reference_genome, phased_output=False, seed=None): """ Entry point into variants_compilation. @@ -33,6 +33,10 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None): ploidy as values. reference_genome : dict Dictionary representation of the reference genome + phased_output : bool + Change character used to separate alleles in VCF output. + True - "|" = alleles have known phases + False - "/" = alleles have unknown phases [default] seed : int Seed for random number generator. Used so repeated runs will produce the same results. @@ -43,6 +47,10 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None): log_file_path = os.path.join(self.log_directory_path, CAMPAREE_CONSTANTS.VARIANTS_COMPILATION_LOG_FILENAME) + allele_sep = "/" + if phased_output is True: + allele_sep = "|" + # The common_variant() method defined below uses a random number when # choosing which of two equally prevalent variants to keep. if seed is not None: @@ -51,6 +59,8 @@ def execute(self, sample_id_list, chr_ploidy_data, reference_genome, seed=None): with open(log_file_path, "w") as log_file: print("Converting variants into vcf file") log_file.write("Converting variants into vcf file\n") + if phased_output is True: + log_file.write("Output formatted as phased alleles.\n") contigs_so_far = [] last_chromosome = None # Open then process all the files @@ -127,7 +137,7 @@ def common_variant(variants): most_common_variants = [common_variant(vars) for vars in variants] # Output nothing if we've now discarded all the variants - if all(var == None for var in most_common_variants): + if all(var is None for var in most_common_variants): next_lines = [line if not used else sample_file.readline() for line, used, sample_file in zip(next_lines, used_samples, variant_files)] continue @@ -156,7 +166,7 @@ def variant_length(variant): alts = [] for variant, num_vars in zip(most_common_variants, num_variants): if variant is None: - sample_descriptions.append("0/0") # ref-ref if no variants for this sample + sample_descriptions.append(f"0{allele_sep}0") # ref-ref if no variants for this sample continue # With deletions, include one extra base @@ -189,9 +199,9 @@ def variant_length(variant): # and the most common alt as the other # TODO: should we ever use alt-alt with two different alts? we don't currently if num_vars == 1: - sample_descriptions.append(f"{idx}/{idx}") # alt-alt + sample_descriptions.append(f"{idx}{allele_sep}{idx}") # alt-alt else: - sample_descriptions.append(f"0/{idx}") # ref-alt + sample_descriptions.append(f"0{allele_sep}{idx}") # ref-alt line = "\t".join([chromosome, str(position), @@ -214,7 +224,7 @@ def variant_length(variant): log_file.write(f"Finished creating VCF file for Beagle with {i} variant entries\n") log_file.write("ALL DONE!\n") - def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_file_path, seed=None): + def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_file_path, phased_output=False, seed=None): """ Prepare command to execute the VariantsCompilationStep from the command line, given all of the arugments used to run the execute() function. @@ -228,6 +238,10 @@ def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_f File that maps chromosome names to their male/female ploidy. reference_genome_file_path : string File that maps chromosome names in reference to nucleotide sequence. + phased_output : bool + Change character used to separate alleles in VCF output. + True - "|" = alleles have known phases + False - "/" = alleles have unknown phases [default] seed : integer Seed for random number generator. Used so repeated runs will produce the same results. @@ -250,14 +264,15 @@ def get_commandline_call(self, samples, chr_ploidy_file_path, reference_genome_f f" --data_directory_path {self.data_directory_path}" f" --sample_ids '{json.dumps(samples)}'" f" --chr_ploidy_file_path {chr_ploidy_file_path}" - f" --reference_genome_file_path {reference_genome_file_path}") + f" --reference_genome_file_path {reference_genome_file_path}" + f" --phased_output {phased_output}") if seed is not None: command += f" --seed {seed}" return command - def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_genome_file_path, seed=None): + def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_genome_file_path, phased_output=False, seed=None): """ Prepare attributes required by is_output_valid() function to validate output generated the VariantsCompilationStep job. @@ -278,6 +293,10 @@ def get_validation_attributes(self, samples, chr_ploidy_file_path, reference_gen [Note: this parameter is captured just so get_validation_attributes() accepts the same arguments as get_commandline_call(). It is not used here.] + phased_output : bool + Change character used to separate alleles in VCF output. [Note: this + parameter is captured just so get_validation_attributes() accepts + the same arguments as get_commandline_call(). It is not used here.] seed : integer Seed for random number generator. Used so repeated runs will produce the same results. [Note: this parameter is captured just so @@ -309,6 +328,7 @@ def main(): parser.add_argument('--sample_ids') parser.add_argument('--chr_ploidy_file_path') parser.add_argument('--reference_genome_file_path') + parser.add_argument('--phased_output', type=bool, default=False) parser.add_argument('--seed', type=int, default=None) args = parser.parse_args() @@ -320,6 +340,7 @@ def main(): variants_compiler.execute(sample_id_list, chr_ploidy_data, reference_genome, + args.phased_output, args.seed) @staticmethod diff --git a/config/baby.config.yaml b/config/baby.config.yaml index 2a85d67..d6f5bb9 100644 --- a/config/baby.config.yaml +++ b/config/baby.config.yaml @@ -1,4 +1,4 @@ -# Configuration File for CAMPAREE (v0.4.0) +# Configuration File for CAMPAREE (v0.4.1) # This config file is prepared to work with the "baby genome" included with # the CAMPAREE install. The baby genome and test data are derived from short # (280KB-1MB) segments of mouse (GRCm38/mm10) chromosomes 1, 2, 3, X, and Y, diff --git a/config/template.config.yaml b/config/template.config.yaml index 25bf5b7..e5cddbb 100644 --- a/config/template.config.yaml +++ b/config/template.config.yaml @@ -1,4 +1,4 @@ -# Configuration File for CAMPAREE (v0.4.0) +# Configuration File for CAMPAREE (v0.4.1) # See baby.config.yaml for an example of a fully specified config file. # Everything listed below is required, except for those labeled [OPTIONAL]. # To remove optional arguments, they can be commented out by placing a "#" diff --git a/doc/conf.py b/doc/conf.py index 9b293b5..57be075 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = '' # The full version, including alpha/beta/rc tags -release = '0.4.0' +release = '0.4.1' # -- General configuration --------------------------------------------------- diff --git a/doc/requirements.txt b/doc/requirements.txt index 8b33305..cfb59b4 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -scipy==1.1.0 +scipy==1.10.0 roman==3.0 pandas==0.23.3 matplotlib==3.0.1 diff --git a/doc/running_camparee.rst b/doc/running_camparee.rst index 37ba768..e1a0905 100644 --- a/doc/running_camparee.rst +++ b/doc/running_camparee.rst @@ -12,7 +12,7 @@ Command Line Options usage: run_camparee.py [-h] -c CONFIG [-r RUN_ID] [-d] [-m {lsf,serial,sge}] [-s SEED] - CAMPAREE - RNA molecule simulator (v0.4.0) + CAMPAREE - RNA molecule simulator (v0.4.1) optional arguments: -h, --help show this help message and exit diff --git a/requirements.txt b/requirements.txt index 60b6aef..2a5e543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy==1.22.1 -scipy==1.7.3 +scipy==1.10.0 roman==3.3 pysam==0.18.0 pandas==1.4.0