diff --git a/src/sv-pipeline/04_variant_resolution/scripts/sz_regenotyping_v0.py b/src/sv-pipeline/04_variant_resolution/scripts/sz_regenotyping_v0.py new file mode 100644 index 000000000..2dd4da825 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/sz_regenotyping_v0.py @@ -0,0 +1,1256 @@ +import argparse +import os +import sys +import tempfile +from statistics import median +import gzip +import pybedtools +import pandas as pd +from pybedtools import BedTool + + +## part 1: (from line 1 to Line 94 of process_posthoc_cpx_depth_regenotyping.sh) +# reassign variant labels based on depth regenotyping in mod04b +def parse_arguments(): + """Parse command-line arguments.""" + + # Create the parser + parser = argparse.ArgumentParser(description="Reassign variant labels based on depth regenotyping in mod04b") + + # Positional arguments + parser.add_argument("INVCF", help="Original input VCF prior to regenotyping") + parser.add_argument("INTERVALS", help="BED file of genotyped intervals") + parser.add_argument("GENOTYPES", help="Melted depth genotypes") + parser.add_argument("FAMFILE", help=".fam file for all samples to be considered") + parser.add_argument("OUTVCF", help="Full path to output VCF after relabeling") + + # Optional arguments + parser.add_argument("-s", "--MINSIZE", type=int, default=1000, + help="Minimum size (in bp) of CNV interval to be considered during variant reclassification [default: 1000]") + parser.add_argument("-d", "--MINDIFF", type=float, default=0.4, + help="Minimum difference in non-ref CNV genotype frequency between predicted carriers and noncarriers to consider a CNV interval to have adequate depth support [default: 0.4]") + parser.add_argument("-D", "--MINSIZEiDEL", type=int, default=150, + help="Minimum insertion site size (in bp) to be considered for distinguishing insertion site deletions [default: 150 bp]") + parser.add_argument("-T", "--MINdDUPTHRESH", type=int, default=1000000, + help="Minimum size (in bp) at which to prioritize an inverted dDUP classification over a dupINV or INVdup classification [default: 1000000 bp]") + parser.add_argument("-R", "--RTABLE", default=None, + help="Path to table containing the final reclassification decision made per variant. [default: no table output]") + parser.add_argument("-G", "--GTCOUNTSTABLE", default=None, + help="Path to table containing the raw genotype counts table per interval per variant. 
[default: no table output]")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    return args
+
+
+## part 2: (from line 94 to line 172 of process_posthoc_cpx_depth_regenotyping.sh)
+
+def validate_inputs(args):
+    """Validate input arguments and files, exiting non-zero on any failure."""
+
+    # Validate INVCF
+    if not args.INVCF or not os.path.exists(args.INVCF):
+        print("\nERROR: input VCF either empty or not found\n")
+        sys.exit(1)
+    if not args.INVCF.endswith('.gz'):
+        print("\nERROR: input VCF must be bgzipped\n")
+        sys.exit(1)
+
+    # Validate INTERVALS
+    if not args.INTERVALS or not os.path.exists(args.INTERVALS):
+        print("\nERROR: intervals file either empty or not found\n")
+        sys.exit(1)
+    if not args.INTERVALS.endswith('.gz'):
+        print("\nERROR: intervals file must be bgzipped\n")
+        sys.exit(1)
+
+    # Validate GENOTYPES
+    if not args.GENOTYPES or not os.path.exists(args.GENOTYPES):
+        print("\nERROR: melted genotypes file either empty or not found\n")
+        sys.exit(1)
+    if not args.GENOTYPES.endswith('.gz'):
+        print("\nERROR: melted genotypes file must be bgzipped\n")
+        sys.exit(1)
+
+    # Validate FAMFILE
+    if not args.FAMFILE or not os.path.exists(args.FAMFILE):
+        print("\nERROR: input .fam file either empty or not found\n")
+        sys.exit(1)
+
+    # Validate OUTVCF
+    if not args.OUTVCF:
+        print("\nERROR: path to output VCF not specified\n")
+        sys.exit(1)
+
+    # Validate MINSIZE
+    if args.MINSIZE is None:
+        print("\nERROR: minimum CNV size not set\n")
+        sys.exit(1)
+
+    # Validate MINDIFF
+    if args.MINDIFF is None:
+        print("\nERROR: minimum CNV carrier freq difference not set\n")
+        sys.exit(1)
+
+
+def get_execution_directory():
+    """Return the path to the execution directory."""
+    return os.path.dirname(os.path.realpath(__file__))
+
+
+def create_temp_directory():
+    """Create and return a temporary directory."""
+    return tempfile.mkdtemp()
+
+
+## part 3: (from line 172 to line 223 of process_posthoc_cpx_depth_regenotyping.sh)
+### PREPARE SAMPLE LISTS
+
+def prepare_sample_lists(fam_file, output_dir):
+    """Prepare sample lists based on the .fam file."""
+
+    # Extract sample IDs (column 2) whose sex code (column 5) matches gender_code
+    def extract_samples_by_gender(lines, gender_code):
+        return [line.split()[1] for line in lines if line.split()[4] == gender_code]
+
+    # Check if the file is gzipped and read accordingly
+    if fam_file.endswith('.gz'):
+        with gzip.open(fam_file, 'rt') as f:
+            lines = [line for line in f if not line.startswith("#")]
+    else:
+        with open(fam_file, 'r') as f:
+            lines = [line for line in f if not line.startswith("#")]
+
+    # All samples
+    with open(os.path.join(output_dir, 'all.samples.list'), 'w') as f:
+        for sample in set(line.split()[1] for line in lines):
+            f.write(sample + '\n')
+
+    # Male samples
+    with open(os.path.join(output_dir, 'male.samples.list'), 'w') as f:
+        for sample in extract_samples_by_gender(lines, '1'):
+            f.write(sample + '\n')
+
+    # Female samples
+    with open(os.path.join(output_dir, 'female.samples.list'), 'w') as f:
+        for sample in extract_samples_by_gender(lines, '2'):
+            f.write(sample + '\n')
+
+### GATHER DATA PER VARIANT
+def setup_genotype_counts_header(output_dir):
+    """Set up header for genotype_counts_per_variant.bed."""
+    header = "#chr\tstart\tend\tVID\tcarrier_del\tcarrier_wt\tcarrier_dup\tcontrol_del\tcontrol_wt\tcontrol_dup\tdiff_case_control_del_frac\tdiff_case_control_dup_frac"
+    with open(os.path.join(output_dir, 'genotype_counts_per_variant.bed'), 'w') as f:
+        f.write(header + '\n')
+
+
+## part 4: (from line 223 to line 312 of
process_posthoc_cpx_depth_regenotyping.sh) + +###Get list of samples & reference CN to consider, dependent on chr of call +# Summary: get_sample_lists_by_chromosome function processes sample lists based on the chromosome in question +# and decides which samples should be treated as carriers and which should be treated as controls. +# It also determines the default copy number (medCNdefault) based on the chromosome. +#ChrX: use only diploid females if possible, otherwise use haploid males + +def get_sample_lists_by_chromosome(chr_name, samps, output_dir): + """ + Decide which sample list to use based on the chromosome. + + Parameters: + - chr_name (str): Name of the chromosome (e.g., "X", "chrY"). + - samps (str): Comma-separated list of samples. + - output_dir (str): Directory where the sample lists are stored. + + Returns: + - int: The default copy number for the given chromosome. + """ + + # Define paths for the carrier and control sample temporary files + carrier_samples = os.path.join(output_dir, 'carrier_samples.tmp') + control_samples = os.path.join(output_dir, 'control_samples.tmp') + + # Inner function to write a list of samples to a specified file + def write_samples_to_file(samples, file_path): + with open(file_path, 'w') as f: + for sample in samples: + f.write(sample + '\n') + + # If the chromosome is X or chrX + if chr_name in ["X", "chrX"]: + # Load the list of female samples + female_samples = set(open(os.path.join(output_dir, 'female.samples.list')).read().split()) + # Find the intersection between the provided samples and the female samples + matched_female_carriers = set(samps.split(',')) & female_samples + if matched_female_carriers: + # Write the matched female carriers to the carrier samples file + write_samples_to_file(matched_female_carriers, carrier_samples) + # Write the unmatched female samples to the control samples file + write_samples_to_file(female_samples - matched_female_carriers, control_samples) + # Default copy number for X chromosome in females + medCNdefault = 2 + else: + # If no matched female carriers, load the list of male samples + male_samples = set(open(os.path.join(output_dir, 'male.samples.list')).read().split()) + # Find the intersection between the provided samples and the male samples + matched_male_carriers = set(samps.split(',')) & male_samples + # Write the matched male carriers and unmatched male samples to the respective files + write_samples_to_file(matched_male_carriers, carrier_samples) + write_samples_to_file(male_samples - matched_male_carriers, control_samples) + # Default copy number for X chromosome in males + medCNdefault = 1 + + # If the chromosome is Y or chrY + elif chr_name in ["Y", "chrY"]: + # Load the list of male samples + male_samples = set(open(os.path.join(output_dir, 'male.samples.list')).read().split()) + # Find the intersection between the provided samples and the male samples + matched_male_carriers = set(samps.split(',')) & male_samples + # Write the matched male carriers and unmatched male samples to the respective files + write_samples_to_file(matched_male_carriers, carrier_samples) + write_samples_to_file(male_samples - matched_male_carriers, control_samples) + # Default copy number for Y chromosome + medCNdefault = 1 + else: + # For all other chromosomes + # Load the list of all samples + all_samples = set(open(os.path.join(output_dir, 'all.samples.list')).read().split()) + # Find the intersection between the provided samples and all samples + matched_carriers = set(samps.split(',')) & all_samples + # Write 
the matched carriers and unmatched samples to the respective files + write_samples_to_file(matched_carriers, carrier_samples) + write_samples_to_file(all_samples - matched_carriers, control_samples) + # Default copy number for autosomal chromosomes + medCNdefault = 2 + + return medCNdefault + +###Get genotype counts per variant +# Summary: get_genotype_counts_per_variant function processes the genotypes file and +# counts the number of carriers and controls for each variant. +# (gathers genotypes from a genotypes file for a specific genomic interval, +# and the method to do this depends on the size of the interval.) +def gather_genotypes_by_interval(chr_name, start, end, VID, genotypes, output_dir): + """ + Gather genotypes corresponding to the interval of interest. + + Parameters: + - chr_name (str): Name of the chromosome (e.g., "X", "chrY"). + - start (int): Start position of the interval. + - end (int): End position of the interval. + - VID (str): Variant identifier. + - genotypes (str): Path to the genotypes file. + - output_dir (str): Directory where the output will be stored. + + Returns: + - str: Path to the temporary file containing intersected genotypes. + """ + + # Define path for the temporary file containing intersected genotypes + intersected_genotypes_path = os.path.join(output_dir, 'intersected_genotypes.tmp.bed') + + # Convert genotypes to a BedTool object + genotypes_bed = pybedtools.BedTool(genotypes) + + # Create a BedTool object for the interval of interest + interval_bed = pybedtools.BedTool(f"{chr_name} {start} {end}", from_string=True) + + if end - start < 1000000: + # Intersect the genotypes with the interval and filter by VID + intersected_genotypes = genotypes_bed.intersect(interval_bed, wa=True, r=True, f=0.95) + df = pybedtools.BedTool.to_dataframe(intersected_genotypes) + df = df[df['name'] == VID] + else: + # Convert genotypes to a pandas DataFrame and filter by VID + df = pybedtools.BedTool.to_dataframe(genotypes_bed) + df = df[df['name'] == VID] + + # Compute coverage (this step will keep only the rows overlapping with the interval) + df = df[(df['start'] <= end) & (df['end'] >= start)] + df = df[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart']] + + # Save the results to a file + df.to_csv(intersected_genotypes_path, sep='\t', header=False, index=False) + + return intersected_genotypes_path + + +## part 5: (from line 312 to line 409 of process_posthoc_cpx_depth_regenotyping.sh) + +def calculate_median_CN(control_samples_file, intersected_genotypes_file): + """ + Calculate the median copy number across predicted non-carrier controls. + + Parameters: + - control_samples_file (str): Path to the file containing the list of control samples. + - intersected_genotypes_file (str): Path to the file containing intersected genotypes. + + Returns: + - float or None: Median copy number value for the control samples, or None if there are no values. 
+ """ + + # Read control samples from the file into a list + with open(control_samples_file, 'r') as f: + control_samples = [line.strip() for line in f] + + # Initialize an empty list to store copy number values from control samples + CN_values = [] + + # Read the intersected genotypes file line by line + with open(intersected_genotypes_file, 'r') as f: + for line in f: + parts = line.split('\t') + sample, CN = parts[3], parts[5] + + # If the sample from the current line is in the control samples list, + # add its copy number value to the CN_values list + if sample in control_samples: + CN_values.append(int(CN)) + + # Return the median of the copy number values, or None if the list is empty + return median(CN_values) if CN_values else None + + +def count_genotypes_by_median(sample_file, intersected_genotypes_file, medCN): + """Count genotypes lower than, equal to, or greater than the overall median.""" + with open(sample_file, 'r') as f: + samples = [line.strip() for line in f] + + lower, equal, greater = 0, 0, 0 + with open(intersected_genotypes_file, 'r') as f: + for line in f: + parts = line.split('\t') + sample, CN = parts[3], int(parts[5]) + if sample in samples: + if CN < medCN: + lower += 1 + elif CN == medCN: + equal += 1 + else: + greater += 1 + return lower, equal, greater + +def process_genotypes(carrier_samples_file, control_samples_file, intersected_genotypes_file, intervals_file, output_file, medCN): + """ + Process genotypes to count and compute statistics based on median copy number (medCN). + + Parameters: + - carrier_samples_file (str): Path to the file containing carrier samples. + - control_samples_file (str): Path to the file containing control samples. + - intersected_genotypes_file (str): Path to the intersected genotypes file. + - intervals_file (str): Path to the intervals file. + - output_file (str): Path to the output file. + - medCN (float): Median copy number. + + Returns: + None + """ + + # Load the intersected genotypes into a DataFrame + df_genotypes = pd.read_csv(intersected_genotypes_file, sep='\t', header=None, names=['chr', 'start', 'end', 'sample', 'score', 'CN', 'thickStart']) + + # Load the carrier and control samples into lists + with open(carrier_samples_file, 'r') as f: + carrier_samples = [line.strip() for line in f] + with open(control_samples_file, 'r') as f: + control_samples = [line.strip() for line in f] + + # Count genotypes for carriers and non-carriers + results = [] + for sample_list, label in [(carrier_samples, 'carrier'), (control_samples, 'control')]: + if sample_list: + sub_df = df_genotypes[df_genotypes['sample'].isin(sample_list)] + less_than_medCN = len(sub_df[sub_df['CN'] < medCN]) + equal_to_medCN = len(sub_df[sub_df['CN'] == medCN]) + greater_than_medCN = len(sub_df[sub_df['CN'] > medCN]) + results.extend([less_than_medCN, equal_to_medCN, greater_than_medCN]) + else: + results.extend([0, 0, 0]) + + # Compute statistics + ncase = results[2] + results[3] + results[4] + nctrl = results[5] + results[6] + results[7] + nctrl = 1 if nctrl == 0 else nctrl + if ncase > 0: + stats1 = (results[0] / ncase) - (results[5] / nctrl) + stats2 = (results[2] / ncase) - (results[7] / nctrl) + else: + stats1, stats2 = 0, 0 + + # Append results to the output file + with open(output_file, 'a') as f: + f.write("\t".join(map(str, results + [stats1, stats2])) + "\n") + +# Further translation for the interval cleaning and processing will be done next. 
+
+
+def clean_intervals(output_dir, MINSIZE, MINDIFF, INTERVALS):
+    """
+    Clean up intervals, process intervals per variant, and write results.
+
+    Parameters:
+    - output_dir (str): Directory where the output will be stored.
+    - MINSIZE (int): Minimum size threshold for intervals.
+    - MINDIFF (float): Minimum carrier-vs-control genotype frequency difference
+      required to call depth support for an interval.
+    - INTERVALS (str): Path to the intervals file.
+
+    Returns:
+    None
+    """
+    header = "#chr\tstart\tend\tinterval_size\tinterval_type\tVID\tvariant_svtype\tvariant_cpxtype\tcarrier_del\tcarrier_wt\tcarrier_dup\tcontrol_del\tcontrol_wt\tcontrol_dup\tdiff_case_control_del_frac\tdiff_case_control_dup_frac\tCNV_assessment"
+
+    raw_counts_file = os.path.join(output_dir, 'genotype_counts_per_variant.bed')
+    cleaned_intervals_file = os.path.join(output_dir, 'genotype_counts_per_variant.cleaned_intervals.bed')
+
+    # Read the raw genotype counts per variant into a DataFrame; the first
+    # column is named "#chr" in the file header, so rename it to "chr"
+    df_genotypes = pd.read_csv(raw_counts_file, sep='\t')
+    df_genotypes = df_genotypes.rename(columns={'#chr': 'chr'})
+
+    # Extract unique mergedVIDs
+    unique_VIDs = df_genotypes['VID'].unique()
+
+    results = []
+
+    for mergedVID in unique_VIDs:
+        # Parse the mergedVID
+        VID, cpxtype, svtype, *rest_intervals = mergedVID.split(';')
+
+        if rest_intervals == ["NA"]:
+            # If intervals are "NA", get them from the INTERVALS file
+            # (assumes a header row with VID and interval columns)
+            df_intervals = pd.read_csv(INTERVALS, sep='\t')
+            intervals = df_intervals[df_intervals['VID'] == mergedVID]['interval'].tolist()
+        else:
+            intervals = rest_intervals
+
+        for interval in intervals:
+            # Interval tokens look like "DUP_chr1:100-200" (or "DUP_chr1_100-200")
+            interval_type, coords = interval.split('_', 1)
+            chr_name, start_end = coords.replace(':', '_').rsplit('_', 1)
+            start, end = map(int, start_end.split('-'))
+
+            # Filter rows from df_genotypes that overlap with the current interval
+            overlapping_rows = df_genotypes[
+                (df_genotypes['chr'] == chr_name) &
+                (df_genotypes['start'] <= end) &
+                (df_genotypes['end'] >= start)
+            ]
+
+            for _, row in overlapping_rows.iterrows():
+                # Assess depth support from the del/dup carrier-vs-control differences
+                diff_del = row['diff_case_control_del_frac']
+                diff_dup = row['diff_case_control_dup_frac']
+                if diff_del > MINDIFF and diff_dup > MINDIFF:
+                    assess = "DELDUP"
+                elif diff_del > MINDIFF:
+                    assess = "DEL"
+                elif diff_dup > MINDIFF:
+                    assess = "DUP"
+                elif end - start < MINSIZE:
+                    assess = "TOO_SMALL"
+                else:
+                    assess = "WT"
+
+                results.append(
+                    [chr_name, start, end, end - start, interval_type, VID, svtype, cpxtype] + row.values.tolist() + [
+                        assess])
+
+    # Convert the results list to a DataFrame
+    df_results = pd.DataFrame(results)
+
+    # Sort the DataFrame by chr, start, end, interval_type, interval_size
+    df_results = df_results.sort_values(by=[0, 1, 2, 4, 3])
+
+    # Write the header line, then append the sorted results
+    with open(cleaned_intervals_file, 'w') as f:
+        f.write(header + '\n')
+    df_results.to_csv(cleaned_intervals_file, sep='\t', header=False, index=False, mode='a')
+
+## part 6: (from line 409 to line 489 of process_posthoc_cpx_depth_regenotyping.sh)
+# Convert relevant variants from VCF to BED:
+#
+# 1- Extract variants from the VCF that are not single-ended inversions and convert them to BED format using svtk vcf2bed.
+# 2- Extract variants that are single-ended inversions and convert them to BED format.
+# 3- Then, use modified coordinates for the inversion single-enders (see the subprocess sketch below).
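+
+# Steps 1-2 above could also shell out to svtk itself rather than re-implementing
+# the conversion in pandas. A minimal sketch, assuming `svtk` is on the PATH and
+# that the positional `svtk vcf2bed <vcf> <bed>` form applies to the installed
+# version (check `svtk vcf2bed --help` for the exact arguments and flags):
+import subprocess
+
+def run_svtk_vcf2bed(vcf_path, bed_path):
+    """Convert a VCF to BED with svtk, raising CalledProcessError on failure."""
+    subprocess.run(["svtk", "vcf2bed", vcf_path, bed_path], check=True)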
+
+# Open and read a VCF, skipping the "##" meta lines
+import io
+import re
+
+def read_vcf(file_path):
+    with gzip.open(file_path, 'rt') as f:
+        lines = [l for l in f if not l.startswith('##')]
+    # The remaining "#CHROM" header row is dropped via comment='#'
+    return pd.read_csv(
+        filepath_or_buffer=io.StringIO(''.join(lines)),
+        header=None,
+        delimiter='\t',
+        comment='#'
+    )
+
+
+def convert_variants_to_bed(INVCF, output_dir):
+    """Convert relevant variants from VCF to BED format."""
+    variants_to_reclassify_file = os.path.join(output_dir, 'variants_to_reclassify.vcf2bed.bed')
+
+    header_lines = []
+    relevant_variants = []
+
+    with gzip.open(INVCF, 'rt') as f:
+        for line in f:
+            if line.startswith("#"):
+                header_lines.append(line.strip())
+            else:
+                fields = line.strip().split("\t")
+                info_field = fields[7]
+                if "SVTYPE=INV" not in info_field:
+                    # Not an INV single ender; keep the record as-is
+                    relevant_variants.append(line.strip())
+                else:
+                    # INV single enders: use modified coordinates (chrom, POS, INFO/END);
+                    # the anchored regex avoids matching the END inside CIEND
+                    chrom, start = fields[0], fields[1]
+                    end_match = re.search(r"(?:^|;)END=([^;]*)", info_field)
+                    end = end_match.group(1) if end_match else start
+                    relevant_variants.append(f"{chrom}\t{start}\t{end}")
+
+    with open(variants_to_reclassify_file, 'w') as f:
+        f.write("\n".join(header_lines + relevant_variants))
+
+    return variants_to_reclassify_file
+
+# Example call (placeholder paths):
+# convert_variants_to_bed("/path/to/input/INVCF.vcf.gz", "/path/to/output/directory")
+
+# NOTE: the real conversion is done by svtk vcf2bed and is more complex than
+# the sketch above; the call below is kept as a placeholder for now.
+# convert_variants_to_bed(INVCF, GTDIR)
+
+
+# Make Final Assessment for Each Variant (a dispatch-table sketch follows below):
+#
+# A header is printed for the final assessment table.
+# The script retrieves the index of the SOURCE column from the BED file.
+# Each structural variant (SV) is then evaluated. The classification of the SV is based on its type (svtype) and its predicted complex type (cpxtype).
+# Depending on the SV's classification, it is processed in different ways, including:
+#   - Complex variants (CPX) are assessed based on their predicted CNV intervals.
+#   - Insertion variants (INS) are classified based on their complex type, e.g., translocational insertions, inverted dispersed duplications, etc.
+#   - Single-ended inversions (BND) are checked for confirmation of palindromic inverted duplications.
+#   - Other SV types are kept as they are with a reason of "IRRELEVANT_SV_TYPE".
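+
+# The per-SV routing described above can be sketched as a small dispatch table.
+# Handler bodies here are stubs with hypothetical return values; the real
+# handlers are the examine_* functions defined further below:
+def _keep_irrelevant(vid):
+    return {"VID": vid, "MOD": "KEEP", "REASON": "IRRELEVANT_SV_TYPE"}
+
+ASSESSMENT_DISPATCH = {
+    "CPX": lambda vid: {"VID": vid, "MOD": "TODO"},  # assess predicted CNV intervals
+    "INS": lambda vid: {"VID": vid, "MOD": "TODO"},  # classify by complex type
+    "BND": lambda vid: {"VID": vid, "MOD": "TODO"},  # test for palindromic inverted dup
+}
+
+def route_variant(vid, svtype):
+    """Send a variant to the handler for its SV type, defaulting to KEEP."""
+    return ASSESSMENT_DISPATCH.get(svtype, _keep_irrelevant)(vid)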
+
+def final_assessment_for_variant(output_dir):
+    """Make a final assessment for each variant based on various criteria."""
+    cleaned_intervals_file = os.path.join(output_dir, 'genotype_counts_per_variant.cleaned_intervals.bed')
+    reassessment_table_file = os.path.join(output_dir, 'final_variant_reassessment_table.txt')
+
+    # Print header
+    with open(reassessment_table_file, 'w') as f:
+        f.write(
+            "#VID\tMODIFICATION\tREASON\tNEW_SVTYPE\tNEW_CPX_TYPE\tNEW_CPX_INTERVALS\tNEW_SVLEN\tNEW_SOURCE\tNEW_START\tNEW_END\n")
+
+    # Read unique VID, svtype, and cpxtype from cleaned_intervals_file; per the
+    # header written by clean_intervals, VID is column 6, variant_svtype
+    # column 7, and variant_cpxtype column 8
+    variant_data = {}
+    with open(cleaned_intervals_file, 'r') as f:
+        for line in f:
+            if not line.startswith("#"):
+                parts = line.strip().split("\t")
+                VID = parts[5]
+                svtype = parts[6]
+                cpxtype = parts[7]
+                variant_data[VID] = (svtype, cpxtype)
+
+    for VID, (svtype, cpxtype) in variant_data.items():
+        MOD = "None"
+        REASON = "None"
+        NEW_SVLEN = "."
+        if svtype == "CPX":
+            # Placeholder logic only: the real CNV-interval assessment lives in
+            # the examine_* functions below. This branch assumes a cpxtype of
+            # the form "TypeX_LENGTH" and guards against values without that suffix.
+            try:
+                NEW_SVLEN = int(cpxtype.split("_")[1])
+            except (IndexError, ValueError):
+                NEW_SVLEN = 0
+            if NEW_SVLEN > 1000:
+                MOD = "Significant"
+                REASON = "Length > 1000"
+            else:
+                MOD = "Minor"
+                REASON = "Length <= 1000"
+
+        NEW_SVTYPE = svtype
+        NEW_CPX_TYPE = cpxtype
+        NEW_CPX_INTERVALS = "INTERVALS_VALUE"  # Placeholder
+        NEW_SOURCE = "SOURCE_VALUE"  # Placeholder
+        NEW_START = "START_VALUE"  # Placeholder
+        NEW_END = "END_VALUE"  # Placeholder
+
+        # Write results to reassessment_table_file
+        with open(reassessment_table_file, 'a') as f:
+            f.write(f"{VID}\t{MOD}\t{REASON}\t{NEW_SVTYPE}\t{NEW_CPX_TYPE}\t{NEW_CPX_INTERVALS}\t{NEW_SVLEN}\t{NEW_SOURCE}\t{NEW_START}\t{NEW_END}\n")
+
+# Placeholder execution of the function above:
+# final_assessment_for_variant("/path/to/output/dir")
+
+
+## part 7: (from line 489 to line 571 of process_posthoc_cpx_depth_regenotyping.sh)
+# The next function provides the structure for examining CNV evidence for insertions.
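+
+# Interval strings of the form "DUP_chr1:1000-5000" recur throughout the
+# functions below. This tiny parser is illustrative only (the helper name is
+# ours, not from the original shell script), but it documents the string
+# surgery those functions perform inline:
+def parse_cpx_interval(token):
+    """Split e.g. 'DUP_chr1:1000-5000' into (type, chrom, start, end)."""
+    itype, coords = token.split("_", 1)
+    chrom, span = coords.split(":")
+    start, end = (int(x) for x in span.split("-"))
+    return itype, chrom, start, end
+
+assert parse_cpx_interval("DUP_chr1:1000-5000") == ("DUP", "chr1", 1000, 5000)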
+
+
+def examine_cnv_for_insertion(VID, output_dir, MINSIZEiDEL):
+    GTDIR = output_dir
+    svtype = ""
+    cpxtype = ""
+    MOD = ""
+    REASON = ""
+    cpxintervals = ""
+    SOURCE = ""
+    SVLEN = 0
+
+    def get_source_index(directory):
+        with open(os.path.join(directory, "variants_to_reclassify.vcf2bed.bed"), 'r') as f:
+            header = f.readline().strip().split('\t')
+        return header.index('SOURCE') if 'SOURCE' in header else None
+
+    def compute_svlen(cpx_intervals):
+        """Sum the spans of comma-separated tokens like 'DUP_chr1:100-200'."""
+        total_length = 0
+        for interval in cpx_intervals.split(','):
+            start, end = [int(x) for x in interval.split('_')[-1].split(':')[1].split('-')]
+            total_length += end - start
+        return total_length
+
+    # Assuming a TSV file with VID, svtype, and cpxtype columns
+    def get_svtype_and_cpxtype_from_file(VID, directory):
+        filepath = os.path.join(directory, "variant_data.tsv")
+        df = pd.read_csv(filepath, sep='\t')
+
+        # Filter based on VID
+        filtered_row = df[df['VID'] == VID]
+        if not filtered_row.empty:
+            return filtered_row['svtype'].values[0], filtered_row['cpxtype'].values[0]
+        return None, None
+
+    SOURCEidx = get_source_index(GTDIR)
+    svtype, cpxtype = get_svtype_and_cpxtype_from_file(VID, GTDIR)
+
+    if svtype == "INS":
+        # The literal "*" in the CTX labels comes from the shell glob patterns
+        if cpxtype in ["INS_A2B", "INS_B2A", "CTX_*INS_A2B", "CTX_*INS_B2A"]:
+            bed = pybedtools.BedTool(os.path.join(GTDIR, "variants_to_reclassify.vcf2bed.bed"))
+            sourcecoord_df = bed.filter(lambda b: b.name == VID).to_dataframe()
+            sourcecoord = (sourcecoord_df.iloc[0, SOURCEidx].replace("_", "\t").split("\t")[1]
+                           if not sourcecoord_df.empty and SOURCEidx is not None else None)
+
+            if sourcecoord:
+                cleaned_intervals_bed = pybedtools.BedTool(os.path.join(GTDIR, "genotype_counts_per_variant.cleaned_intervals.bed"))
+                # Build a one-interval BedTool for the source ("chrN:start-end")
+                source_interval = pybedtools.BedTool(sourcecoord.replace(":", "\t").replace("-", "\t"), from_string=True)
+                sourceisdup = len(cleaned_intervals_bed.intersect(source_interval, wa=True, r=True, f=0.95).filter(lambda b: b.fields[-1] == "DUP"))
+
+                sink_df = cleaned_intervals_bed.intersect(source_interval, wa=True, v=True, r=True, f=0.95).to_dataframe()
+                sinkcoord = f"{sink_df.iloc[0, 0]}:{sink_df.iloc[0, 1]}-{sink_df.iloc[0, 2]}" if not sink_df.empty else None
+                sinksize = int(sink_df.iloc[0, 3]) if not sink_df.empty else None
+
+                if sinksize and sinksize >= MINSIZEiDEL:
+                    sinkisdel = len(cleaned_intervals_bed.intersect(source_interval, wa=True, v=True, r=True, f=0.95).filter(lambda b: b.fields[-1] == "DEL"))
+                else:
+                    sinkisdel = 0
+
+                if sourceisdup > 0 and sinkisdel > 0:
+                    svtype = "CPX"
+                    cpxtype = "dDUP_iDEL"
+                    MOD = "RECLASSIFY"
+                    REASON = "DISPERSED_DUPLICATION_W_INSERT_SITE_DEL"
+                    cpxintervals = f"DUP_{sourcecoord},DEL_{sinkcoord}"
+                    SOURCE = f"DUP_{sourcecoord}"
+                    SVLEN = compute_svlen(cpxintervals)
+                elif sourceisdup > 0 and sinkisdel == 0:
+                    svtype = "CPX"
+                    cpxtype = "dDUP"
+                    MOD = "RECLASSIFY"
+                    REASON = "DISPERSED_DUPLICATION"
+                    cpxintervals = f"DUP_{sourcecoord}"
+                    SOURCE = f"DUP_{sourcecoord}"
+                elif sourceisdup == 0 and sinkisdel > 0:
+                    svtype = "CPX"
+                    cpxtype = "INS_iDEL"
+                    MOD = "RECLASSIFY"
+                    REASON = "INSERT_SITE_DEL"
+                    cpxintervals = f"DEL_{sinkcoord}"
+                    SOURCE = f"INS_{sourcecoord}"
+                    SVLEN = compute_svlen(f"{cpxintervals},{sourcecoord}")
+                else:
+                    MOD = "KEEP"
+                    REASON = "NON_DUPLICATED_INSERTION"
+            else:
+                MOD = "SKIP"
+                REASON = "NO_SOURCE_INTERVAL_IN_CURRENT_SHARD"
+
+    return {
+        'svtype': svtype,
+        'cpxtype': cpxtype,
+        'MOD': MOD,
+        'REASON': REASON,
+        'cpxintervals': cpxintervals,
+        'SOURCE': SOURCE,
+        'SVLEN': SVLEN
+    }
+
+# Example call (placeholder ID and path):
+# result = examine_cnv_for_insertion("VID_example", "/path/to/output/dir", 150)
+# print(result)
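+
+# The four-way branch above reduces to a lookup on the pair
+# (source genotyped as DUP, sink genotyped as DEL); spelled out here purely to
+# document the decision space (the tuples mirror the branch outcomes above):
+INS_RECLASS_DECISIONS = {
+    (True,  True):  ("dDUP_iDEL", "DISPERSED_DUPLICATION_W_INSERT_SITE_DEL"),
+    (True,  False): ("dDUP",      "DISPERSED_DUPLICATION"),
+    (False, True):  ("INS_iDEL",  "INSERT_SITE_DEL"),
+    (False, False): (None,        "NON_DUPLICATED_INSERTION"),  # MOD stays "KEEP"
+}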
"CPX" + cpxtype = "dDUP_iDEL" + MOD = "RECLASSIFY" + REASON = "INVERTED_DISPERSED_DUPLICATION_WITH_DELETION" + cpxintervals = f"DUP_{dupinterval},INV_{dupinterval},DEL_{sinkinterval}" + SOURCE = f"DUP_{dupinterval}" + SVLEN = dupsize + sinksize + START = min( + [int(coord.split(":")[1].split("-")[0]) for coord in [dupinterval, invinterval, sinkinterval]]) + END = max( + [int(coord.split(":")[1].split("-")[1]) for coord in [dupinterval, invinterval, sinkinterval]]) + else: + svtype = "CPX" + cpxtype = "dDUP" + MOD = "RECLASSIFY" + REASON = "INVERTED_DISPERSED_DUPLICATION" + cpxintervals = f"DUP_{dupinterval},INV_{dupinterval}" + SOURCE = f"DUP_{dupinterval}" + SVLEN = dupsize + START = int(dupinterval.split(":")[1].split("-")[0]) + END = int(dupinterval.split(":")[1].split("-")[1]) + else: + MOD = "UNRESOLVED" + REASON = "PREDICTED_DUP_INTERVAL_FAILED_GT" + + return { + 'svtype': svtype, + 'cpxtype': cpxtype, + 'MOD': MOD, + 'REASON': REASON, + 'cpxintervals': cpxintervals, + 'SOURCE': SOURCE, + 'SVLEN': SVLEN, + 'START': START, + 'END': END + } + + +# Example call +result = examine_dup5_ins3("VID_example", "/path/to/output/dir", 150, 1000) +print(result) + +## part 9: (from line 677 to line 840 of process_posthoc_cpx_depth_regenotyping.sh) + +def examine_cnv_evidence(VID, output_dir, MINSIZEiDEL, MINdDUPTHRESH, MINSIZE): + # Directory with generated files + GTDIR = output_dir + # Variables initialization + SOURCEidx = get_source_index(GTDIR) + svtype = "" + cpxtype = "" + MOD = "" + REASON = "" + cpxintervals = "" + SOURCE = "" + SVLEN = 0 + START = 0 + END = 0 + + # Nested function to get the index of 'SOURCE' from the header of a BED file + def get_source_index(directory): + with open(os.path.join(directory, "variants_to_reclassify.vcf2bed.bed"), 'r') as f: + header = f.readline().strip().split('\t') + return header.index('SOURCE') if 'SOURCE' in header else None + + def get_dup_values(VID, GTDIR, SOURCEidx): + # Load the BED file into a pandas DataFrame + bed_file_path = os.path.join(GTDIR, "variants_to_reclassify.vcf2bed.bed") + df = pd.read_csv(bed_file_path, sep='\t', header=None) + + # Extract dupinterval + dupinterval_row = df[df[0] == VID].iloc[0] + dupinterval = dupinterval_row[SOURCEidx].split('_')[1] + + # Calculate dupsize + start, end = map(int, dupinterval.split('-')) + dupsize = end - start + + # Calculate dupconfirmed + cleaned_intervals_path = os.path.join(GTDIR, "genotype_counts_per_variant.cleaned_intervals.bed") + cleaned_df = pd.read_csv(cleaned_intervals_path, sep='\t', header=None) + matching_intervals = cleaned_df[cleaned_df[0] == VID] + confirmed_intervals = matching_intervals[~matching_intervals.iloc[:, -1].str.contains("WT")] + dupconfirmed = len(confirmed_intervals) + + return dupinterval, dupsize, dupconfirmed + + def get_sink_values(VID, GTDIR): + # Load the BED file into a pandas DataFrame + bed_file_path = os.path.join(GTDIR, "variants_to_reclassify.vcf2bed.bed") + df = pd.read_csv(bed_file_path, sep='\t', header=None) + + # Extract sinkinterval + sinkinterval_row = df[df[0] == VID].iloc[0] + sinkinterval = f"{sinkinterval_row[0]}:{sinkinterval_row[1]}-{sinkinterval_row[2]}" + + # Calculate sinksize + start, end = sinkinterval_row[1], sinkinterval_row[2] + sinksize = end - start + + # Calculate sinkisdel + cleaned_intervals_path = os.path.join(GTDIR, "genotype_counts_per_variant.cleaned_intervals.bed") + cleaned_df = pd.read_csv(cleaned_intervals_path, sep='\t', header=None) + matching_intervals = cleaned_df[cleaned_df[0] == VID] + del_intervals = 
matching_intervals[matching_intervals.iloc[:, -1].str.contains("DEL")] + sinkisdel = len(del_intervals) + + return sinkinterval, sinksize, sinkisdel + + # Fetch the values for dupinterval, dupsize, and dupconfirmed + dupinterval, dupsize, dupconfirmed = get_dup_values(VID, GTDIR, SOURCEidx) + + # Fetch the values for sinkinterval, sinksize, and sinkisdel + sinkinterval, sinksize, sinkisdel = get_sink_values(VID, GTDIR) + + # Rest of the logic from the bash script + if dupconfirmed > 0: + if invsize < MINdDUPTHRESH: + if sinkisdel > 0: + # If sink is deleted, classify as delINVdup + svtype = "CPX" + cpxtype = "delINVdup" + MOD = "RECLASSIFY" + REASON = "DUP_FLANKED_INVERSION_WITH_DEL" + cpxintervals = f"DEL_{sinkinterval},INV_{invinterval},DUP_{dupinterval}" + SVLEN = invsize + # START and END calculations (still need further logic for accurate computation) + else: + # If sink is not clearly deleted (or too small), classify as dupINV (no del) + svtype = "CPX" + cpxtype = "INVdup" + MOD = "RECLASSIFY" + REASON = "DUP_FLANKED_INVERSION" + cpxintervals = f"INV_{invinterval},DUP_{dupinterval}" + SVLEN = invsize + # START and END calculations (still need further logic for accurate computation) + else: + if sinkisdel > 0: + # If sink is deleted, classify as dDUP_iDEL + svtype = "CPX" + cpxtype = "dDUP_iDEL" + MOD = "RECLASSIFY" + REASON = "INVERTED_DISPERSED_DUPLICATION_WITH_DELETION" + cpxintervals = f"DUP_{dupinterval},INV_{dupinterval},DEL_{sinkinterval}" + SOURCE = f"DUP_{dupinterval}" + SVLEN = dupsize + sinksize + else: + # If sink is not clearly deleted (or too small), classify as dDUP (no iDEL) + svtype = "CPX" + cpxtype = "dDUP" + MOD = "RECLASSIFY" + REASON = "INVERTED_DISPERSED_DUPLICATION" + cpxintervals = f"INV_{dupinterval},DUP_{dupinterval}" + SOURCE = f"DUP_{dupinterval}" + SVLEN = dupsize + else: + MOD = "UNRESOLVED" + REASON = "PREDICTED_DUP_INTERVAL_FAILED_GT" + + return { + 'svtype': svtype, + 'cpxtype': cpxtype, + 'MOD': MOD, + 'REASON': REASON, + 'cpxintervals': cpxintervals, + 'SOURCE': SOURCE, + 'SVLEN': SVLEN, + 'START': START, + 'END': END + } + + +# Example call +result = examine_cnv_evidence("VID_example", "/path/to/output/dir", 150, 1000, 100) +print(result) + + + +## part 10: (from line 840 to line 925 of process_posthoc_cpx_depth_regenotyping.sh) + +def salvage_inverted_duplications(variant_ids, output_dir, RTABLE="", GTCOUNTSTABLE=""): + # Directory with generated files + GTDIR = output_dir + # List to store results + results = [] + + for VID in variant_ids: + # Variables initialization + svtype = "" + cpxtype = "" + MOD = "" + REASON = "" + cpxintervals = "" + SOURCE = "" + SVLEN = 0 + START = 0 + END = 0 + + # Extracting `dupinterval` from file + with open(os.path.join(GTDIR, 'variants_to_reclassify.vcf2bed.bed'), 'r') as f: + for line in f: + if VID in line: + parts = line.strip().split() + dupinterval = f"{parts[0]}:{parts[1]}-{parts[2]}" + break + + # Calculating `dupsize` + interval_parts = dupinterval.split(":")[1].split("-") + SVLEN = int(interval_parts[1]) - int(interval_parts[0]) + + # Extracting `dupconfirmed` from file + with open(os.path.join(GTDIR, 'genotype_counts_per_variant.cleaned_intervals.bed'), 'r') as f: + dupconfirmed = sum(1 for line in f if VID in line and "DUP" in line) + + # Logic based on `cpxtype` + if cpxtype in ["INVERSION_SINGLE_ENDER_--", "INVERSION_SINGLE_ENDER_++"]: + if dupconfirmed > 0: + svtype = "CPX" + MOD = "RECLASSIFY" + REASON = "PALINDROMIC_INVERTED_DUPLICATION" + cpxintervals = f"DUP_{dupinterval},INV_{dupinterval}" + 
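+
+# The size rule shared by parts 8 and 9, isolated for clarity: below
+# MINdDUPTHRESH the dup-plus-inversion is called as a dupINV/INVdup-style
+# event, at or above it as an inverted dispersed duplication (toy spans only):
+def prefers_inverted_ddup(inv_span, min_ddup_thresh=1000000):
+    return inv_span >= min_ddup_thresh
+
+assert not prefers_inverted_ddup(250000)    # -> dupINV / INVdup classification
+assert prefers_inverted_ddup(2500000)       # -> inverted dDUP classification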
+
+
+## part 10: (from line 840 to line 925 of process_posthoc_cpx_depth_regenotyping.sh)
+
+import shutil
+
+def salvage_inverted_duplications(variant_ids, output_dir, RTABLE="", GTCOUNTSTABLE="", cpxtypes=None):
+    # Directory with generated files
+    GTDIR = output_dir
+    # VID -> predicted complex type (e.g. "INVERSION_SINGLE_ENDER_--"); assumed
+    # to be supplied by the caller, e.g. parsed from the input VCF records
+    cpxtypes = cpxtypes or {}
+    # List to store results
+    results = []
+
+    for VID in variant_ids:
+        # Variables initialization
+        svtype = ""
+        cpxtype = cpxtypes.get(VID, "")
+        MOD = ""
+        REASON = ""
+        cpxintervals = ""
+        SOURCE = ""
+        SVLEN = 0
+        START = 0
+        END = 0
+        dupinterval = None
+
+        # Extract `dupinterval` from file
+        with open(os.path.join(GTDIR, 'variants_to_reclassify.vcf2bed.bed'), 'r') as f:
+            for line in f:
+                if VID in line:
+                    parts = line.strip().split()
+                    dupinterval = f"{parts[0]}:{parts[1]}-{parts[2]}"
+                    break
+
+        if dupinterval is None:
+            # Variant record not present in this shard
+            MOD = "SKIP"
+            REASON = "NO_SOURCE_INTERVAL_IN_CURRENT_SHARD"
+        else:
+            # Calculate `dupsize`
+            interval_parts = dupinterval.split(":")[1].split("-")
+            SVLEN = int(interval_parts[1]) - int(interval_parts[0])
+
+            # Extract `dupconfirmed` from file
+            with open(os.path.join(GTDIR, 'genotype_counts_per_variant.cleaned_intervals.bed'), 'r') as f:
+                dupconfirmed = sum(1 for line in f if VID in line and "DUP" in line)
+
+            # Logic based on `cpxtype`
+            if cpxtype in ["INVERSION_SINGLE_ENDER_--", "INVERSION_SINGLE_ENDER_++"]:
+                if dupconfirmed > 0:
+                    svtype = "CPX"
+                    MOD = "RECLASSIFY"
+                    REASON = "PALINDROMIC_INVERTED_DUPLICATION"
+                    cpxintervals = f"DUP_{dupinterval},INV_{dupinterval}"
+                    START, END = sorted(int(x) for x in interval_parts)
+                    cpxtype = "piDUP_FR" if cpxtype == "INVERSION_SINGLE_ENDER_--" else "piDUP_RF"
+                else:
+                    MOD = "KEEP"
+                    REASON = "DID_NOT_RESOLVE_AS_piDUP"
+            else:
+                MOD = "KEEP"
+                REASON = "IRRELEVANT_SV_TYPE"
+
+        # Finalize unset variables
+        cpxintervals = cpxintervals if cpxintervals else "."
+        SVLEN = SVLEN if SVLEN else "."
+        SOURCE = SOURCE if SOURCE else "."
+        START = START if START else "."
+        END = END if END else "."
+
+        # Append result for this variant
+        results.append((VID, MOD, REASON, svtype, cpxtype, cpxintervals, SVLEN, SOURCE, START, END))
+
+    # Append below the header written by final_assessment_for_variant
+    with open(os.path.join(GTDIR, 'final_variant_reassessment_table.txt'), 'a') as f:
+        for result in results:
+            f.write('\t'.join(map(str, result)) + '\n')
+
+    # Copy final reclassification table, if optioned
+    if RTABLE:
+        shutil.copy(os.path.join(GTDIR, 'final_variant_reassessment_table.txt'), RTABLE)
+
+    # Copy genotyping counts, if optioned
+    if GTCOUNTSTABLE:
+        shutil.copy(os.path.join(GTDIR, 'genotype_counts_per_variant.cleaned_intervals.bed'), GTCOUNTSTABLE)
+
+# Example call (placeholder IDs and path):
+# salvage_inverted_duplications(["VID1", "VID2"], "/path/to/output/dir")
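+
+# The strand-to-class mapping used by salvage_inverted_duplications, spelled
+# out once for reference (the "--"/"++" suffixes come from the single-ender
+# cpxtype strings in the source data):
+PIDUP_CLASS_BY_STRANDS = {
+    "INVERSION_SINGLE_ENDER_--": "piDUP_FR",
+    "INVERSION_SINGLE_ENDER_++": "piDUP_RF",
+}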
+
+## part 11: (from line 925 to line 994 of process_posthoc_cpx_depth_regenotyping.sh)
+
+
+def generate_final_vcf(input_vcf, reassessment_table, output_dir):
+    GTDIR = output_dir
+    reassessed_variants = {}  # To store reassessed variants for later processing
+
+    # Prepare the CPX type descriptions
+    cpx_types = [
+        "##CPX_TYPE_delINV=\"Complex inversion with 5' flanking deletion.\"",
+        "##CPX_TYPE_INVdel=\"Complex inversion with 3' flanking deletion.\"",
+        # ... (other CPX type descriptions) ...
+    ]
+
+    with gzip.open(input_vcf, 'rt') as invcf, open(os.path.join(GTDIR, 'cleaned_output.vcf'), 'w') as outvcf:
+        # Extract the "##" meta header
+        for line in invcf:
+            if line.startswith("##"):
+                outvcf.write(line)
+            else:
+                break  # Stop when the meta-header section is over
+
+        # Add CPX type descriptions
+        for description in cpx_types:
+            outvcf.write(description + '\n')
+
+        # Add the column header line ("#CHROM ...", the first non-"##" line)
+        outvcf.write(line)
+
+        # Get variant IDs for reassessment
+        variant_ids = set()
+        with open(reassessment_table, 'r') as f:
+            for table_line in f:
+                if not table_line.startswith("#"):
+                    variant_ids.add(table_line.split('\t')[0])
+
+        # Write variants not in the reassessment table straight to the output VCF
+        for line in invcf:
+            variant_id = line.split('\t')[2]  # Get variant ID from the line
+            if variant_id not in variant_ids:
+                outvcf.write(line)
+            else:
+                reassessed_variants[variant_id] = line  # Store the variant for reassessment
+
+    # Process reassessed variants
+    with open(reassessment_table, 'r') as reassess_f:
+        for line in reassess_f:
+            if line.startswith("#"):
+                continue  # Skip header lines
+
+            VID, MOD, REASON, svtype, cpxtype, cpxintervals, SVLEN, SOURCE, START, END = line.strip().split('\t')
+
+            variant_line = reassessed_variants.get(VID)  # Get the original variant line
+            if not variant_line:
+                continue  # If the variant is not found, skip to the next
+
+            fields = variant_line.strip().split('\t')
+
+            if MOD in ["SKIP", "KEEP"]:
+                with open(os.path.join(GTDIR, 'variants_to_be_reassessed.vcf'), 'a') as varvcf:
+                    varvcf.write(variant_line)
+
+            elif MOD == "UNRESOLVED":
+                fields[6] = "UNRESOLVED"
+                fields[7] += ";UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION"
+                with open(os.path.join(GTDIR, 'variants_to_be_reassessed.vcf'), 'a') as varvcf:
+                    varvcf.write('\t'.join(fields) + '\n')
+
+            elif MOD == "RECLASSIFY":
+                if SVLEN == "." or START == "." or END == ".":
+                    info_fields = {kv.split('=')[0]: kv.split('=')[1] for kv in fields[7].split(';') if '=' in kv}
+                    SVLEN = info_fields.get('SVLEN', SVLEN)
+                    START = fields[1]
+                    END = info_fields.get('END', END)
+
+                fields[1] = START
+                # Minimal INFO rewrite here; the full key-by-key rewrite is
+                # done by refine_info_field() in part 12
+                new_info = ";".join([f"SVTYPE={svtype}", f"SVLEN={SVLEN}", f"END={END}"])
+                fields[7] = new_info
+
+                with open(os.path.join(GTDIR, 'variants_to_be_reassessed.vcf'), 'a') as varvcf:
+                    varvcf.write('\t'.join(fields) + '\n')
+
+    # Merge 'variants_to_be_reassessed.vcf' back into 'cleaned_output.vcf'
+    with open(os.path.join(GTDIR, 'cleaned_output.vcf'), 'a') as outvcf:
+        with open(os.path.join(GTDIR, 'variants_to_be_reassessed.vcf'), 'r') as varvcf:
+            for line in varvcf:
+                outvcf.write(line)
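+
+# INFO round-tripping of the kind the RECLASSIFY branch above performs, as a
+# standalone sketch (helper names are illustrative, not part of the original
+# script; flag-style keys without "=" are dropped by this simple version):
+def info_to_dict(info):
+    return dict(kv.split("=", 1) for kv in info.split(";") if "=" in kv)
+
+def dict_to_info(d):
+    return ";".join(f"{k}={v}" for k, v in d.items())
+
+demo_info = info_to_dict("SVTYPE=INS;SVLEN=480;END=1000480")
+demo_info["SVTYPE"] = "CPX"
+assert dict_to_info(demo_info) == "SVTYPE=CPX;SVLEN=480;END=1000480"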
+
+## part 12: (from line 925 to the end of process_posthoc_cpx_depth_regenotyping.sh)
+
+def refine_info_field(INFO, svtype, cpxtype, cpxintervals, SVLEN, SOURCE, END):
+    # Rewrite INFO key/value pairs with re.sub, mirroring the sed expressions
+    # of the Bash script (plain str.replace would treat these patterns as literals)
+    INFO = re.sub(r"(^|;)END=[^;]*", rf"\g<1>END={END}", INFO)
+    INFO = re.sub(r"(^|;)SVTYPE=[^;]*", rf"\g<1>SVTYPE={svtype}", INFO)
+    INFO = re.sub(r"(^|;)SVLEN=[^;]*", rf"\g<1>SVLEN={SVLEN}", INFO)
+
+    # Removing or modifying CPX_TYPE as needed
+    if svtype == "CPX":
+        if "CPX_TYPE=" not in INFO:
+            INFO += f";CPX_TYPE={cpxtype}"
+        else:
+            INFO = re.sub(r"(^|;)CPX_TYPE=[^;]*", rf"\g<1>CPX_TYPE={cpxtype}", INFO)
+    else:
+        INFO = re.sub(r";CPX_TYPE=[^;]*", "", INFO)
+
+    # Adding or modifying SOURCE as needed
+    if SOURCE == ".":
+        INFO = re.sub(r";SOURCE=[^;]*", "", INFO)
+    elif "SOURCE=" in INFO:
+        INFO = re.sub(r"(^|;)SOURCE=[^;]*", rf"\g<1>SOURCE={SOURCE}", INFO)
+    else:
+        INFO += f";SOURCE={SOURCE}"
+
+    # Adding or modifying CPX_INTERVALS as needed
+    if cpxintervals == ".":
+        INFO = re.sub(r";CPX_INTERVALS=[^;]*", "", INFO)
+    elif "CPX_INTERVALS=" in INFO:
+        INFO = re.sub(r"(^|;)CPX_INTERVALS=[^;]*", rf"\g<1>CPX_INTERVALS={cpxintervals}", INFO)
+    else:
+        INFO += f";CPX_INTERVALS={cpxintervals}"
+
+    return INFO
+
+
+def refine_vcf_records(reassessment_table, variants_to_be_reassessed, cleaned_output_vcf, bin_dir, outvcf, gtdir):
+    with open(reassessment_table, 'r') as reassess, open(variants_to_be_reassessed, 'r') as var_vcf, open(
+            cleaned_output_vcf, 'a') as out_vcf:
+        vcf_lines = var_vcf.readlines()
+
+        for line in reassess:
+            if not line.startswith("#"):
+                VID, MOD, REASON, svtype, cpxtype, cpxintervals, SVLEN, SOURCE, START, END = line.strip().split('\t')
+
+                # Match on the exact ID column rather than a substring of the whole record
+                variant_record = next((rec for rec in vcf_lines if rec.split('\t')[2] == VID), None)
+                if variant_record:
+                    fields = variant_record.strip().split('\t')
+                    INFO = refine_info_field(fields[7], svtype, cpxtype, cpxintervals, SVLEN, SOURCE, END)
+
+                    # Print the modified record to the output VCF
+                    out_vcf.write("\t".join(
+                        [fields[0], START, fields[2], fields[3], "<{}>".format(svtype), fields[5], fields[6],
+                         INFO] + fields[8:]) + "\n")
+
+    # Sort and compress the final output VCF
+    os.system(
+        "{} {} | vcf-sort | bgzip -c > {}".format(os.path.join(bin_dir, 'rm_cpx_info.py'), cleaned_output_vcf, outvcf))
+
+    # Clean up the temporary working directory
+    shutil.rmtree(gtdir)
+
+############################## end of part 12 of regenotyping pipeline
+# This would be the main entry point if you run the Python script
+if __name__ == "__main__":
+    args = parse_arguments()
+
+    # section 2
+    validate_inputs(args)
+
+    BIN = get_execution_directory()
+
+    GTDIR = create_temp_directory()
+
+    # Further processing can continue from here using the args object.
+    invcf = args.INVCF
+    INTERVALS = args.INTERVALS
+    GENOTYPES = args.GENOTYPES
+    FAMFILE = args.FAMFILE
+
+    outvcf = args.OUTVCF
+    # print("Input VCF: {}".format(invcf))
+    # print("Output VCF: {}".format(outvcf))
+
+    # sections 3-4: sample lists and the genotype-counts header
+    prepare_sample_lists(args.FAMFILE, GTDIR)
+    setup_genotype_counts_header(GTDIR)
+
+    # Placeholders: chrom, start, end, VID, and samps would normally come from
+    # the per-variant loop over the genotyped intervals
+    chrom = "X"  # Placeholder
+    start = 0  # Placeholder
+    end = 1000  # Placeholder
+    VID = "VID123"  # Placeholder
+    samps = "sample1,sample2"  # Placeholder
+
+    medCNdefault = get_sample_lists_by_chromosome(chrom, samps, GTDIR)
+
+    intersected_genotypes = gather_genotypes_by_interval(chrom, start, end, VID, args.GENOTYPES, GTDIR)
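+
+# Example invocation of the finished script (all paths below are placeholders):
+#   python sz_regenotyping_v0.py input.vcf.gz intervals.bed.gz genotypes.bed.gz \
+#       cohort.fam output.vcf.gz -s 1000 -d 0.4 -R reclass_table.tsv -G gt_counts.tsv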