From 77724e470e1363c0fc5ee8dc1f445b270ffddd23 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Thu, 31 Aug 2023 17:11:10 +0100 Subject: [PATCH 01/10] add missing requirements --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a5e0bc..ae275be 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,9 @@ Toolkit for annotation | Repository name | branch | URL| |-----------------|--------|----| | ensembl | default | https://github.com/Ensembl/ensembl.git | -| ensembl-analysis | experimental/gbiab | https://github.com/Ensembl/ensembl-analysis.git | (need to make sure depencies are on main and update this to main/default for branch) +| ensembl-analysis | experimental/gbiab | https://github.com/Ensembl/ensembl-analysis.git | (need to make sure dependencies are on main and update this to main/default for branch) +| ensembl-io | default | https://github.com/Ensembl/ensembl-io.git | +| ensembl-taxonomy | default | https://github.com/Ensembl/ensembl-taxonomy.git | | ensembl-variation | default | https://github.com/Ensembl/ensembl-variation.git | From 6e5b515f709e89f15be0a809cb793be9656468e7 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 1 Sep 2023 12:56:52 +0100 Subject: [PATCH 02/10] bugfix: cover reachable elements via $PATH --- utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 580295a..c7b2958 100644 --- a/utils.py +++ b/utils.py @@ -397,4 +397,7 @@ def check_file(file_path: pathlib.Path): FileNotFoundError """ if not file_path.is_file(): - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) + # Check if the given file path needs to be resolved, e.g. which EukHighConfidenceFilter + file_path = shutil.which(file_path) + if not pathlib.Path(file_path).is_file(): + raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) From ed1d86c78daa62bd50703862cbf2c02125133637 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 20 Sep 2023 15:35:22 +0100 Subject: [PATCH 03/10] rely on software environment configuration --- config.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config.json b/config.json index b03d7b1..6f6e467 100644 --- a/config.json +++ b/config.json @@ -71,7 +71,7 @@ "maxperiod" : 500 }, "trnascan": { - "software": "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/tRNAscan-SE", - "filter_path" : "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/EukHighConfidenceFilter" + "software": "tRNAscan-SE", + "filter_path" : "EukHighConfidenceFilter" } } From e332902ff092ebbd9863b5511deb1aee784fd3c2 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 20 Sep 2023 15:37:29 +0100 Subject: [PATCH 04/10] do not rely on ENSCODE but the location of the file instead --- ensembl_anno.py | 22 +++++++++------------- repeatmasking_utils.py | 7 ++++++- simple_feature_utils.py | 7 ++++++- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index dee5b45..d059d08 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -37,7 +37,12 @@ import simple_feature_utils import utils -with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f: + +_REPO_ROOT = pathlib.Path(__file__).parent + + +config_file = _REPO_ROOT / "config.json" +with config_file.open("r") as f: config = json.load(f) @@ -1445,17 +1450,8 @@ def run_genblast_align( asnb_file = masked_genome_file + ".asnb" logger.info("ASNB file: %s" % asnb_file) - if not os.path.exists("alignscore.txt"): - shutil.copy( - os.environ["ENSCODE"] + "/ensembl-anno/support_files/alignscore.txt", "./" - ) - # subprocess.run( - # [ - # "cp", - # os.environ["ENSCODE"] + "/ensembl-anno/support_files/alignscore.txt", - # "./", - # ] - # ) + if not Path(f"{genblast_dir}/alignscore.txt").exists(): + shutil.copy(_REPO_ROOT / "support_files" / "alignscore.txt", genblast_dir) if not os.path.exists(masked_genome_file): raise IOError("Masked genome file does not exist: %s" % masked_genome_file) @@ -4565,7 +4561,7 @@ def coallate_results(main_output_dir): # set up logger log_file_path = pathlib.Path(work_dir) / "ensembl_anno.log" - loginipath = pathlib.Path(os.environ["ENSCODE"] + "/ensembl-anno/logging.conf") + loginipath = _REPO_ROOT / "logging.conf" logging.config.fileConfig( loginipath, defaults={"logfilename": log_file_path}, diff --git a/repeatmasking_utils.py b/repeatmasking_utils.py index 4c599be..deb75d3 100644 --- a/repeatmasking_utils.py +++ b/repeatmasking_utils.py @@ -24,8 +24,13 @@ import utils + +_REPO_ROOT = pathlib.Path(__file__).parent + + logger = logging.getLogger(__name__) -with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f: +config_file = _REPO_ROOT / "config.json" +with config_file.open("r") as f: config = json.load(f) diff --git a/simple_feature_utils.py b/simple_feature_utils.py index 75d3671..65a9af3 100644 --- a/simple_feature_utils.py +++ b/simple_feature_utils.py @@ -24,8 +24,13 @@ import utils + +_REPO_ROOT = pathlib.Path(__file__).parent + + logger = logging.getLogger(__name__) -with open(os.environ["ENSCODE"] + "/ensembl-anno/config.json", "r") as f: +config_file = _REPO_ROOT / "config.json" +with config_file.open("r") as f: config = json.load(f) From 1380356d91d309182246f6512c69d807628c71ab Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Wed, 20 Sep 2023 15:38:07 +0100 Subject: [PATCH 05/10] generalise and use utils.check_file() everywhere --- ensembl_anno.py | 12 +++--------- utils.py | 11 ++++++----- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index d059d08..2bf7fb9 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -529,7 +529,7 @@ def run_trnascan_regions( utils.check_exe(trnascan_path) logger.info(trnascan_path) # check_exe(trnascan_filter_path) - check_file(trnascan_filter_path) + utils.check_file(trnascan_filter_path) logger.info(trnascan_filter_path) trnascan_output_dir = utils.create_dir(main_output_dir, "trnascan_output") @@ -3540,8 +3540,8 @@ def validate_coding_transcripts( subprocess.run(cpc2_cmd) cpc2_output_path = cpc2_output_path + ".txt" - check_file(rnasamba_output_path) - check_file(cpc2_output_path) + utils.check_file(rnasamba_output_path) + utils.check_file(cpc2_output_path) logger.info("diamond validation") diamond_results = None @@ -4188,12 +4188,6 @@ def create_paired_paths(fastq_file_paths): return final_list -def check_file(file_path): - - if not os.path.exists(file_path): - raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) - - def coallate_results(main_output_dir): results_dir = utils.create_dir(main_output_dir, "results") diff --git a/utils.py b/utils.py index c7b2958..3554e75 100644 --- a/utils.py +++ b/utils.py @@ -388,16 +388,17 @@ def reverse_complement(sequence): return sequence.translate(rev_matrix)[::-1] -def check_file(file_path: pathlib.Path): +def check_file(file_path: os.PathLike): """ Raise an error when the file doesn't exist Args: - file_path: pathlib.Path + file_path: File to path Returns: FileNotFoundError """ - if not file_path.is_file(): + fpath = pathlib.Path(file_path) + if not fpath.is_file(): # Check if the given file path needs to be resolved, e.g. which EukHighConfidenceFilter - file_path = shutil.which(file_path) - if not pathlib.Path(file_path).is_file(): + fpath = shutil.which(file_path) + if not fpath or not pathlib.Path(fpath).is_file(): raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path) From 115dde8e4fd2ab44da53d6a41b4b23f8e4d3717b Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 29 Sep 2023 11:18:46 +0100 Subject: [PATCH 06/10] update Path and PathLike signatures --- ensembl_anno.py | 33 ++++++++++----------------------- repeatmasking_utils.py | 14 +++++++------- simple_feature_utils.py | 8 ++++---- 3 files changed, 21 insertions(+), 34 deletions(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index 2bf7fb9..80d054f 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -4217,6 +4217,7 @@ def coallate_results(main_output_dir): parser.add_argument( "--output_dir", type=str, + default="", help="Path where the output and temp files will write to. \ Uses current dir by default", ) @@ -4480,7 +4481,7 @@ def coallate_results(main_output_dir): ) args = parser.parse_args() - work_dir = args.output_dir + work_dir = pathlib.Path(args.output_dir).absolute() genome_file = args.genome_file num_threads = args.num_threads # masked_genome_file = genome_file # This will be updated later if Red is run @@ -4545,16 +4546,11 @@ def coallate_results(main_output_dir): species = args.repeatmasker_species main_script_dir = os.path.dirname(os.path.realpath(__file__)) - # work_dir=glob.glob(work_dir) if not os.path.exists(genome_file): raise IOError("File does not exist: %s" % genome_file) - if not work_dir: - work_dir = os.getcwd() - # work_dir=glob.glob(work_dir) - # set up logger - log_file_path = pathlib.Path(work_dir) / "ensembl_anno.log" + log_file_path = work_dir / "ensembl_anno.log" loginipath = _REPO_ROOT / "logging.conf" logging.config.fileConfig( loginipath, @@ -4565,25 +4561,16 @@ def coallate_results(main_output_dir): logger.propagate = False logger.info("work directory: %s" % work_dir) - if not os.path.exists(work_dir): - logger.info("Work dir does not exist, will create") - utils.create_dir(work_dir, None) + work_dir.mkdir(parents=True, exist_ok=True) if num_threads == 1: logger.info("Thread count is set to the default value 1; this might be slow.") - if os.path.exists( - os.path.join(work_dir, "red_output", "mask_output") - ) or os.path.join(work_dir, "red_output", "mask_output").endswith(".msk"): - red_genome_file = [ - f - for f in os.listdir(os.path.join(work_dir, "red_output", "mask_output")) - if f.endswith(".msk") - ] + mask_output_path = work_dir / "red_output" / "mask_output" + if mask_output_path.exists() or (mask_output_path.suffix == ".msk"): + red_genome_file = [f for f in mask_output_path.iterdir() if f.endswith(".msk")] logger.info("red_genome_file %s", red_genome_file) - masked_genome_file = os.path.join( - work_dir, "red_output", "mask_output", red_genome_file[0] - ) + masked_genome_file = mask_output_path / red_genome_file[0] else: masked_genome_file = genome_file logger.info("Masked genome file %s", masked_genome_file) @@ -4773,7 +4760,7 @@ def coallate_results(main_output_dir): genblast_path, convert2blastmask_path, makeblastdb_path, - os.path.join(work_dir, "genblast_output"), + work_dir / "genblast_output", protein_file, masked_genome_file, max_intron_length, @@ -4790,7 +4777,7 @@ def coallate_results(main_output_dir): genblast_path, convert2blastmask_path, makeblastdb_path, - os.path.join(work_dir, "busco_output"), + work_dir / "busco_output", busco_protein_file, masked_genome_file, max_intron_length, diff --git a/repeatmasking_utils.py b/repeatmasking_utils.py index deb75d3..5739015 100644 --- a/repeatmasking_utils.py +++ b/repeatmasking_utils.py @@ -35,11 +35,11 @@ def run_repeatmasker_regions( # pylint: disable=too-many-arguments - genome_file: typing.Union[pathlib.Path, str], + genome_file: os.PathLike, repeatmasker_path: str, library: str, species: str, - main_output_dir: str, + main_output_dir: os.PathLike, num_threads: int, ): """ @@ -233,9 +233,9 @@ def create_repeatmasker_gtf( # pylint: disable=too-many-locals def run_dust_regions( - genome_file: typing.Union[pathlib.Path, str], + genome_file: os.PathLike, dust_path: str, - main_output_dir: str, + main_output_dir: os.PathLike, num_threads: int, ): """ @@ -371,9 +371,9 @@ def create_dust_gtf( def run_trf_repeats( # pylint: disable=too-many-locals - genome_file: typing.Union[pathlib.Path, str], + genome_file: os.PathLike, trf_path: str, - main_output_dir: str, + main_output_dir: os.PathLike, num_threads: int, ): """ @@ -568,7 +568,7 @@ def create_trf_gtf( def run_red( - red_path: str, main_output_dir: str, genome_file: typing.Union[pathlib.Path, str] + red_path: str, main_output_dir: os.PathLike, genome_file: os.PathLike ): """ Run Red on genome file diff --git a/simple_feature_utils.py b/simple_feature_utils.py index 65a9af3..60c45db 100644 --- a/simple_feature_utils.py +++ b/simple_feature_utils.py @@ -35,10 +35,10 @@ def run_eponine_regions( # pylint: disable=too-many-locals - genome_file: typing.Union[pathlib.Path, str], + genome_file: os.PathLike, java_path: str, eponine_path: str, - main_output_dir: str, + main_output_dir: os.PathLike, num_threads: int, ): """ @@ -198,9 +198,9 @@ def create_eponine_gtf( def run_cpg_regions( - genome_file: typing.Union[pathlib.Path, str], + genome_file: os.PathLike, cpg_path: str, - main_output_dir: str, + main_output_dir: os.PathLike, num_threads: int, ): """ From 89fd0ee3a4f87e6130b807bbdd64d611bad1059b Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 29 Sep 2023 11:19:38 +0100 Subject: [PATCH 07/10] simplify utils.create_dir() method --- utils.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/utils.py b/utils.py index 3554e75..3ef641d 100644 --- a/utils.py +++ b/utils.py @@ -37,25 +37,20 @@ def create_dir(main_output_dir, dir_name): Returns: str Path to the created directory """ + target_dir = pathlib.Path(main_output_dir) if dir_name: - target_dir = os.path.join(main_output_dir, dir_name) - else: - target_dir = main_output_dir - - if os.path.exists(target_dir): + target_dir = target_dir / dir_name + if target_dir.exists(): logger.warning("Directory already exists, will not create again") - return target_dir - - logger.info("Attempting to create target dir: %s", target_dir) - - try: - os.mkdir(target_dir) - except OSError: - logger.error("Creation of the dir failed, path used: %s", target_dir) else: - logger.info("Successfully created the dir on the following path: %s", target_dir) - - return target_dir + logger.info("Attempting to create target dir: %s", target_dir) + try: + target_dir.mkdir(parents=True) + except OSError: + logger.error("Creation of the dir failed, path used: %s", target_dir) + else: + logger.info("Successfully created the dir on the following path: %s", target_dir) + return str(target_dir) def check_exe(exe_path): From 3d168559ffefcf46e714eb5569c9bb24e185625f Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 29 Sep 2023 11:26:06 +0100 Subject: [PATCH 08/10] copy alignscore.txt file to CWD --- ensembl_anno.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index 80d054f..36890e6 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -1445,13 +1445,12 @@ def run_genblast_align( else: logger.info("No gtf file, go on with the analysis") - genblast_output_file = os.path.join(genblast_dir, "genblast") - asnb_file = masked_genome_file + ".asnb" logger.info("ASNB file: %s" % asnb_file) - if not Path(f"{genblast_dir}/alignscore.txt").exists(): - shutil.copy(_REPO_ROOT / "support_files" / "alignscore.txt", genblast_dir) + alignscore_path = pathlib.Path().absolute() / "alignscore.txt" + if not alignscore_path.exists(): + shutil.copyfile(_REPO_ROOT / "support_files" / "alignscore.txt", alignscore_path) if not os.path.exists(masked_genome_file): raise IOError("Masked genome file does not exist: %s" % masked_genome_file) From 8309170e43750b5eeeab141a20e87c42a7a48dc1 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 29 Sep 2023 11:26:29 +0100 Subject: [PATCH 09/10] adapt parameter to avoid error message --- ensembl_anno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index 36890e6..527ef29 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -1730,7 +1730,7 @@ def run_makeblastdb(makeblastdb_path, masked_genome_file, asnb_file): "-mask_data", asnb_file, "-max_file_sz", - "10000000000", + "4000000000", ] ) logger.info("Completed running makeblastdb") From 8f7c6053bae1d3e9a3cde277fee3efdfcfbed530 Mon Sep 17 00:00:00 2001 From: Jorge Alvarez Jarreta Date: Fri, 6 Oct 2023 16:15:50 +0100 Subject: [PATCH 10/10] bugfix: use suffix for PosixPaths --- ensembl_anno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ensembl_anno.py b/ensembl_anno.py index 527ef29..c46c203 100644 --- a/ensembl_anno.py +++ b/ensembl_anno.py @@ -4567,7 +4567,7 @@ def coallate_results(main_output_dir): mask_output_path = work_dir / "red_output" / "mask_output" if mask_output_path.exists() or (mask_output_path.suffix == ".msk"): - red_genome_file = [f for f in mask_output_path.iterdir() if f.endswith(".msk")] + red_genome_file = [f for f in mask_output_path.iterdir() if f.suffix == ".msk"] logger.info("red_genome_file %s", red_genome_file) masked_genome_file = mask_output_path / red_genome_file[0] else: