diff --git a/python_code/ebi.py b/python_code/ebi.py new file mode 100644 index 00000000..d4d3e1dc --- /dev/null +++ b/python_code/ebi.py @@ -0,0 +1,798 @@ +#!/usr/bin/env python + +from os.path import basename, exists, join +from os import mkdir +from datetime import date, timedelta +from xml.etree import ElementTree as ET +from xml.dom import minidom +from xml.sax.saxutils import escape +from functools import partial + +from make_per_sample_fastq import split_helper + + +class InvalidMetadataError(Exception): + """Error that is raised when metadata is not representable as a string""" + pass + + +class SampleAlreadyExistsError(Exception): + """Error that is raised when a sample is added to a submission that already + has a sample by that name""" + pass + + +class NoXMLError(Exception): + """Error that is raised when the generation of one XML file cannot be + completed because it depends on another XML file that has not yet been + generated""" + pass + + +def clean_whitespace(s): + """Standardizes whitespace so that there is only ever one space separating + tokens""" + return ' '.join(s.split()) + + +def iter_file_via_list_of_dicts(input_file): + """Iterates over a TSV file, yielding dicts keyed by the column headers + + Parameters + ---------- + input_file : file + The file to iterate over + + Returns + ------- + generator + Yields dicts keyed by the column headers + + Notes + ----- + - Assumes the first line contains the column headers + - Assumes no duplicate column headers + - All column headers will be transformed to lowercase + - Skips lines containing only whitespace + """ + header_line = input_file.readline() + header_line = header_line.lstrip('#') + headers = [x.strip().lower() for x in header_line.split('\t')] + for line in input_file: + line = line.strip() + if not line: + continue + + line_elements = line.split('\t') + yield dict(zip(headers, line_elements)) + + +class EBISubmission(object): + """Define an EBI submission and facilitate generation of required XML files + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + 'metagenome', and 'mimarks-survey' are specially recognized and used to + set other attributes in the submission, but any string is valid + empty_value : str, optional + Defaults to "no_data". This is the value that will be used when data + for a particular metadata field is missing + """ + def __init__(self, study_id, study_title, study_abstract, + investigation_type, empty_value='no_data', **kwargs): + self.study_id = study_id + self.study_title = study_title + self.study_abstract = study_abstract + self.investigation_type = investigation_type + + self.study_xml_fp = None + self.sample_xml_fp = None + self.experiment_xml_fp = None + self.run_xml_fp = None + + # dicts that map investigation_type to library attributes + lib_strategies = {'metagenome': 'POOLCLONE', + 'mimarks-survey': 'AMPLICON'} + lib_selections = {'mimarks-survey': 'PCR'} + lib_sources = {} + + self.library_strategy = lib_strategies.get( + self.investigation_type, "OTHER") + self.library_source = lib_sources.get( + self.investigation_type, "METAGENOMIC") + self.library_selection = lib_selections.get( + self.investigation_type, "unspecififed") + + # This allows addition of other arbitrary study metadata + self.additional_metadata = self._stringify_kwargs(kwargs) + + # This will hold the submission's samples, keyed by the sample name + self.samples = {} + + def _stringify_kwargs(self, kwargs_dict): + """Turns values in a dictionay into strings, None, or self.empty_value + """ + try: + result = { + str(k): str(v) if v is not None else self.empty_value + for k, v in kwargs.iteritems()} + except ValueError: + raise InvalidMetadataError("All additional metadata passed via " + "kwargs to the EBISubmission " + "constructor must be representatable " + "as strings.") + + def _get_study_alias(self): + """Format alias using ``self.study_id``""" + return 'qiime_study_' + escape(clean_whitespace(str(self.study_id))) + + def _get_sample_alias(self, sample_name): + """Format alias using ``self.study_id``, `sample_name`""" + return "%s:%s" % (self._get_study_alias(), + escape(clean_whitespace(str(sample_name)))) + + def _get_experiment_alias(self, sample_name, row_number): + """Format alias using ``self.study_id``, `sample_name`, `row_number` + + `row_number` comes from the index of the prep in the sample's prep + list. + """ + return "%s:%d" % (self._get_sample_alias(sample_name), + row_number) + + def _get_submission_alias(self): + """Format alias using ``self.study_id``""" + safe_study_id = escape(clean_whitespace(str(self.study_id))) + return 'qiime_submission_' + safe_study_id + + def _get_library_name(self, sample_name, row_number): + """Format alias using `sample_name`, `row_number` + + `row_number` comes from the index of the prep in the sample's prep + list. + """ + return '%s:%d' % (escape(clean_whitespace(sample_name)), row_number) + + def _add_dict_as_tags_and_values(self, parent_node, attribute_element_name, + data_dict): + """Format key/value data using a common EBI XML motif""" + for attr, val in sorted(data_dict.items()): + attribute_element = ET.SubElement(parent_node, + attribute_element_name) + tag = ET.SubElement(attribute_element, 'TAG') + tag.text = escape(clean_whitespace(attr)) + value = ET.SubElement(attribute_element, 'VALUE') + value.text = escape(clean_whitespace(val)) + + def generate_study_xml(self): + """Generates the study XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + study_set = ET.Element('STUDY_SET', { + 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance", + 'xsi:noNamespaceSchemaLocation': "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.study.xsd"}) + + study = ET.SubElement(study_set, 'STUDY', { + 'alias': self._get_study_alias(), + 'center_name': "CCME-COLORADO"} + ) + + descriptor = ET.SubElement(study, 'DESCRIPTOR') + study_title = ET.SubElement(descriptor, 'STUDY_TITLE') + study_title.text = escape(clean_whitespace(self.study_title)) + study_type = ET.SubElement(descriptor, 'STUDY_TYPE', { + 'existing_study_type': escape(clean_whitespace( + self.investigation_type))} + ) + study_abstract = ET.SubElement(descriptor, 'STUDY_ABSTRACT') + study_abstract.text = clean_whitespace(escape(self.study_abstract)) + + if self.additional_metadata: + study_attributes = ET.SubElement(study, 'STUDY_ATTRIBUTES') + self._add_dict_as_tags_and_values(study_attributes, + 'STUDY_ATTRIBUTE', + self.additional_metadata) + + return study_set + + def add_sample(self, sample_name, taxon_id=None, description=None, + **kwargs): + """Adds sample information to the current submission + + Parameters + ---------- + sample_name : str + Unique identifier for the sample + taxon_id : str, optional + Defaults to ``None``. If not provided, the `empty_value` will be + used for the taxon ID + description : str, optional + Defaults to ``None``. If not provided, the `empty_value` will be + used for the description + + Raises + ------ + SampleAlreadyExistsError + If `sample_name` already exists in the ``samples`` dict + """ + if sample_name in self.samples: + raise SampleAlreadyExistsError("Two samples with the same " + "sample_name cannot be added to " + "the same submission. " + "(sample_name: %s)" % sample_name) + + self.samples[sample_name] = {} + + self.samples[sample_name]['taxon_id'] = self.empty_value if \ + taxon_id is None else taxon_id + self.samples[sample_name]['taxon_id'] = \ + escape(clean_whitespace(self.samples[sample_name]['taxon_id'])) + + self.samples[sample_name]['description'] = self.empty_value if \ + description is None else description + self.samples[sample_name]['description'] = \ + escape(clean_whitespace(self.samples[sample_name]['description'])) + + self.samples[sample_name]['attributes'] = self._stringify_kwargs( + kwargs) + + self.samples[sample_name]['preps'] = [] + + def generate_sample_xml(self): + """Generates the sample XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + sample_set = ET.Element('SAMPLE_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.sample.xsd"}) + + for sample_name, sample_info in sorted(self.samples.iteritems()): + sample = ET.SubElement(sample_set, 'SAMPLE', { + 'alias': self._get_sample_alias(sample_name), + 'center_name': 'CCME-COLORADO'} + ) + + sample_title = ET.SubElement(sample, 'TITLE') + sample_title.text = escape(clean_whitespace(sample_name)) + + sample_name_element = ET.SubElement(sample, 'SAMPLE_NAME') + taxon_id = ET.SubElement(sample_name_element, 'TAXON_ID') + taxon_id.text = escape(clean_whitespace(sample_info['taxon_id'])) + + description = ET.SubElement(sample, 'DESCRIPTION') + description.text = escape(clean_whitespace( + sample_info['description'])) + + if sample_info['attributes']: + sample_attributes = ET.SubElement(sample, 'SAMPLE_ATTRIBUTES') + self._add_dict_as_tags_and_values(sample_attributes, + 'SAMPLE_ATTRIBUTE', + sample_info['attributes']) + + return sample_set + + def add_sample_prep(self, sample_name, platform, file_type, file_path, + experiment_design_description, + library_construction_protocol, + **kwargs): + """Add prep info for an existing sample + + Parameters + ---------- + sample_name : str + The `sample_name` must exist in the ``samples`` list + platform : {'LS454', 'ILLUMINA', 'UNKNOWN'} + The sequencing platform + file_type : {'sff', 'fastq', 'fasta'} + The type of file + file_path : str + The path to the sequence file for this sample + experiment_design_description : str + library_construction_protocol : str + + Raises + ------ + KeyError + If `sample_name` is not in the list of samples in the + ``EBISubmission`` object + """ + prep_info = self._stringify_kwargs(kwargs) + + prep_info['platform'] = platform + prep_info['file_type'] = file_type + prep_info['file_path'] = file_path + prep_info['experiment_design_description'] = \ + experiment_design_description + prep_info['library_construction_protocol'] = \ + library_construction_protocol + + self.samples[sample_name]['preps'].append(prep_info) + + def _generate_library_descriptor(self, design, sample_name, row_number, + library_construction_protocol): + """This XML element (and its subelements) must be written for every + sample, but its generation depends on only study-level information. + Therefore, we can break it out into its own method. + """ + + library_descriptor = ET.SubElement(design, 'LIBRARY_DESCRIPTOR') + library_name = ET.SubElement(library_descriptor, 'LIBRARY_NAME') + library_name.text = self._get_library_name(sample_name, + row_number) + library_strategy = ET.SubElement(library_descriptor, + "LIBRARY_STRATEGY") + library_strategy.text = self.library_strategy + library_source = ET.SubElement(library_descriptor, + "LIBRARY_SOURCE") + library_source.text = self.library_source + library_selection = ET.SubElement(library_descriptor, + "LIBRARY_SELECTION") + library_selection.text = self.library_selection + library_layout = ET.SubElement(library_descriptor, + "LIBRARY_LAYOUT") + single = ET.SubElement(library_layout, "SINGLE") + library_construction_protocol_element = ET.SubElement( + library_descriptor, "LIBRARY_CONSTRUCTION_PROTOCOL") + library_construction_protocol_element.text = escape(clean_whitespace( + library_construction_protocol)) + + def _generate_spot_descriptor(self, design, platform): + """This XML element (and its subelements) must be written for every + sample, but its generation depends on only study-level information. + Therefore, we can break it out into its own method. + """ + # This section applies only to the LS454 platform + if platform is not 'LS454': + return + + # There is some hard-coded information in here, but this is what we + # have always done in the past... + spot_descriptor = ET.SubElement(design, 'SPOT_DESCRIPTOR') + spot_decode_spec = ET.SubElemenet(spot_descriptor, 'SPOT_DECODE_SPEC') + read_spec = ET.SubElemenet(spot_descriptor, 'READ_SPEC') + + read_index = ET.SubElemenet(read_spec, 'READ_INDEX') + read_index.text = '0' + read_class = ET.SubElement(read_spec, 'READ_CLASS') + read_class.text = 'Application Read' + read_type = ET.SubElement(read_spec, 'READ_TYPE') + read_type.text = 'Forward' + base_coord = ET.SubElement(read_spec, 'BASE_COORD') + base_coord.text = '1' + + def generate_experiment_xml(self): + """Generates the experiment XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + study_alias = self._get_study_alias() + experiment_set = ET.Element('EXPERIMENT_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.experiment.xsd"}) + for sample_name, sample_info in sorted(self.samples.items()): + sample_alias = self._get_sample_alias(sample_name) + for row_number, prep_info in enumerate(sample_info['preps']): + experiment_alias = self._get_experiment_alias(sample_name, + row_number) + platform = prep_info['platform'] + experiment = ET.SubElement(experiment_set, 'EXPERIMENT', { + 'alias': experiment_alias, + 'center_name': 'CCME-COLORADO'} + ) + title = ET.SubElement(experiment, 'TITLE') + title.text = experiment_alias + study_ref = ET.SubElement(experiment, 'STUDY_REF', { + 'refname': study_alias} + ) + + design = ET.SubElement(experiment, 'DESIGN') + design_description = ET.SubElement(design, + 'DESIGN_DESCRIPTION') + design_description.text = escape(clean_whitespace( + prep_info['experiment_design_description'])) + sample_descriptor = ET.SubElement( + design, 'SAMPLE_DESCRIPTOR', {'refname': sample_alias} + ) + + self._generate_library_descriptor( + design, sample_name, row_number, + prep_info['library_construction_protocol'] + ) + + self._generate_spot_descriptor(design, platform) + + platform_element = ET.SubElement(experiment, 'PLATFORM') + platform_info = ET.SubElement(platform_element, + platform.upper()) + instrument_model = ET.SubElement(platform_info, + 'INSTRUMENT_MODEL') + instrument_model.text = 'unspecified' + + if prep_info: + experiment_attributes = ET.SubElement( + experiment, 'EXPERIMENT_ATTRIBUTES') + self._add_dict_as_tags_and_values(experiment_attributes, + 'EXPERIMENT_ATTRIBUTE', + prep_info) + + return experiment_set + + def generate_run_xml(self): + """Generates the run XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + # TODO: md5 stuff + run_set = ET.Element('RUN_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.run.xsd"}) + for sample_name, sample_info in sorted(self.samples.items()): + sample_alias = self._get_sample_alias(sample_name) + for row_number, prep_info in enumerate(sample_info['preps']): + experiment_alias = self._get_experiment_alias(sample_name, + row_number) + platform = prep_info['platform'] + file_type = prep_info['file_type'] + file_path = prep_info['file_path'] + + run = ET.SubElement(run_set, 'RUN', { + 'alias': basename(file_path) + '_run', + 'center_name': 'CCME-COLORADO'} + ) + experiment_ref = ET.SubElement(run, 'EXPERIMENT_REF', { + 'refname': experiment_alias} + ) + data_block = ET.SubElement(run, 'DATA_BLOCK') + files = ET.SubElement(data_block, 'FILES') + file_element = ET.SubElement(files, 'FILE', { + 'filename': basename(file_path), + 'filetype': file_type, + 'quality_scring_system': 'phred', + 'checksum_method': 'MD5', + 'checksum': 'NONE'} # TODO: checksum + ) + + return run_set + + def generate_submission_xml(self, action): + """Generates the submission XML file + + Parameters + ---------- + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating with EBI + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + + Raises + ------ + NoXMLError + If one of the necessary XML files has not been generated + """ + if any([self.study_xml_fp is None, + self.sample_xml_fp is None, + self.experiment_xml_fp is None, + self.run_xml_fp is None]): + raise NoXMLError("One of the necessary XML files has not been " + "generated. Make sure you write out the other " + "XML files before attempting to write the " + "submission XML file.") + + submission_set = ET.Element('SUBMISSION_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.submission.xsd"}) + submission = ET.SubElement(submission_set, 'SUBMISSION', { + 'alias': self._get_submission_alias(), + 'center_name': 'CCME-COLORADO'} + ) + + actions = ET.SubElement(submission, 'ACTIONS') + + study_action = ET.SubElement(actions, 'ACTION') + study_action_2 = ET.SubElement(study_action, action, { + 'schema': 'study', + 'source': self.study_xml_fp} + ) + + sample_action = ET.SubElement(actions, 'ACTION') + sample_action_2 = ET.SubElement(sample_action, action, { + 'schema': 'sample', + 'source': self.sample_xml_fp} + ) + + experiment_action = ET.SubElement(actions, 'ACTION') + experiment_action_2 = ET.SubElement(experiment_action, action, { + 'schema': 'experiment', + 'source': self.experiment_xml_fp} + ) + + run_action = ET.SubElement(actions, 'ACTION') + run_action_2 = ET.SubElement(run_action, action, { + 'schema': 'run', 'source': self.run_xml_fp} + ) + + if action is 'ADD': + hold_action = ET.SubElement(actions, 'HOLD', { + 'HoldUntilDate': str(date.today() + timedelta(365))} + ) + + return submission_set + + def _write_xml_file(self, xml_gen_fn, attribute_name, fp, + xml_gen_fn_arg=None): + """Writes an XML file after calling one of the XML generation + functions + + Parameters + ---------- + xml_gen_fn : function + The function that will be called to generate the XML that will be + written + attribute_name : str + The name of the attribute in which to store the output filepath + fp : str + The filepath to which the XML will be written + xml_gen_fn_arg : str, optional + Defaults to None. If None, no arguments will be passed to + xml_gen_fn. Otherwise, this will be passed as the only argument to + xml_gen_fn + + Notes + ----- + xml_gen_fn_arg is needed for generating the submission XML + """ + if xml_gen_fn_arg is None: + xml_element = xml_gen_fn() + else: + xml_element = xml_gen_fn(xml_gen_fn_arg) + + xml = minidom.parseString(ET.tostring(xml_element)) + + with open(fp, 'w') as outfile: + outfile.write(xml.toprettyxml(indent=' ', encoding='UTF-8')) + + setattr(self, attribute_name, fp) + + def write_study_xml(self, fp): + """Write the study XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_study_xml, 'study_xml_fp', fp) + + def write_sample_xml(self, fp): + """Write the sample XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_sample_xml, 'sample_xml_fp', fp) + + def write_experiment_xml(self, fp): + """Write the experiment XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_experiment_xml, + 'experiment_xml_fp', fp) + + def write_run_xml(self, fp): + """Write the run XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_run_xml, 'run_xml_fp', fp) + + def write_submission_xml(self, fp, action): + """Write the submission XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating with EBI + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_submission_xml, 'submission_xml_fp', + fp, action) + + def write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp, + submission_fp, action): + """Write all XML files needed for an EBI submission using current data + + Parameters + ---------- + study_fp : str + The filepath to which the study XML will be written + sample_fp : str + The filepath to which the sample XML will be written + experiment_fp : str + The filepath to which the experiment XML will be written + run_fp : str + The filepath to which the run XML will be written + submission_fp : str + The filepath to which the submission XML will be written + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating with EBI + + Notes + ----- + If any of the filepaths point to an existing file, it will be + overwritten + """ + self.write_study_xml(study_fp) + self.write_sample_xml(sample_fp) + self.write_experiment_xml(experiment_fp) + self.write_run_xml(run_fp) + self.write_submission_xml(submission_fp, action) + + def add_samples_from_templates(self, sample_template, prep_templates, + per_sample_fastq_dir): + """ + Parameters + ---------- + sample_template : file + prep_templates : list of file + per_sample_fastq_dir : str + Path to the direcotry containing per-sample FASTQ files containing + The sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + """ + for sample in iter_file_via_list_of_dicts(sample_template): + sample_name = sample.pop('sample_name') + taxon_id = sample.pop('taxon_id', None) + description = sample.pop('description', None) + + self.add_sample(sample_name, taxon_id=taxon_id, + description=description, + **sample) + + for prep_template in prep_templates: + for prep in iter_file_via_list_of_dicts(prep_template): + sample_name = prep.pop('sample_name') + platform = prep.pop('platform') + experiment_design_description = prep.pop( + 'experiment_design_description') + library_construction_protocol = prep.pop( + 'library_construction_protocol') + + file_path = join(per_sample_fastq_dir, sample_name+'.fastq') + self.add_sample_prep(sample_name, platform, 'fastq', + file_path, experiment_design_description, + library_construction_protocol, + **prep) + + @classmethod + def from_templates_and_demux_fastq( + cls, study_id, study_title, study_abstract, investigation_type, + sample_template, prep_templates, demux_seqs_fp, output_dir, + **kwargs): + """Generate an ``EBISubmission`` from templates and a sequence file + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + sample_template : file + prep_templates : list of file + demux_seqs_fp : str + Path to FASTQ File containing the demultiplexed sequences in the + format output by QIIME. Namely, the sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + output_dir : str + The directory to output the per-sample FASTQ files. It will be + created it if does not already exist. The files will be named + .fastq + """ + if not exists(output_dir): + mkdir(output_dir) + + # generate the per-sample FASTQ files + with open(demux_seqs_fp, 'U') as demux_seqs: + split_helper(demux_seqs, output_dir, sequence_buffer_size=1000) + + # initialize the EBISubmission object + submission = cls(study_id, study_title, study_abstract, + investigation_type, **kwargs) + + submission.add_samples_from_templates( + sample_template, prep_templates, output_dir) + + return submission + + @classmethod + def from_templates_and_per_sample_fastqs(cls, study_id, study_title, + study_abstract, + investigation_type, + sample_template, prep_templates, + per_sample_fastq_dir, + **kwargs): + """Generate an ``EBISubmission`` from templates and FASTQ files + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + sample_template : file + prep_templates : list of file + per_sample_fastq_dir : str + Path to the direcotry containing per-sample FASTQ files containing + The sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + + Notes + ----- + - kwargs will be passed directly to the ``EBISubmission`` constructor, + which will add them as key-value pairs to the study attributes + section of the submission + """ + # initialize the EBISubmission object + submission = cls(study_id, study_title, study_abstract, + investigation_type, **kwargs) + + submission.add_samples_from_templates(sample_template, + prep_templates, + per_sample_fastq_dir) + + return submission diff --git a/python_code/make_per_sample_fastq.py b/python_code/make_per_sample_fastq.py new file mode 100644 index 00000000..b0b28367 --- /dev/null +++ b/python_code/make_per_sample_fastq.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +from collections import defaultdict +from os.path import join, exists +from os import mkdir +from gzip import open as gz_open + +import click +from skbio.parse.sequences.fastq import parse_fastq +from skbio.format.sequences.fastq import format_fastq_record + +def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, + gzip=True): + """Splits a demultiplexed FASTQ file into per-sample FASTQ files + + Parameters + ---------- + input_fastq : file + The input demultiplexed FASTQ file. + output_directory : str + Path to the output directory. It will be created if it does not already + exist. + sequence_buffer_size : int + The number of sequences to hold in memory for each sample before + writing them to disk. + gzip : bool, optional + If ``True``, then the generated FASTQ files will be gzipped + + Notes + ----- + The sequence identifiers in `input_fastq` should be of the form output by + QIIME's demultiplexing scripts; namely, they should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + """ + if not exists(output_directory): + mkdir(output_directory) + + per_sample_seqs = defaultdict(list) + per_sample_counts = defaultdict(int) + unique_filepaths = set() + for defline, seq, qual in parse_fastq(input_fastq, + enforce_qual_range=False): + label = defline.split()[0] + sample_name, sequence_number = label.rsplit('_', 1) + + per_sample_seqs[sample_name].append((defline, seq, qual)) + per_sample_counts[sample_name] += 1 + + if per_sample_counts[sample_name] > sequence_buffer_size: + output_filepath = join(output_directory, sample_name+'.fastq') + unique_filepaths.add(output_filepath) + with open(output_filepath, 'a') \ + as outfile: + outfile.write( + format_fastq_record(*per_sample_seqs[sample_name])) + + per_sample_seqs[sample_name] = [] + per_sample_counts[sample_name] = 0 + + for sample_name, entries in per_sample_seqs.iteritems(): + if not entries: + continue + + output_filepath = join(output_directory, sample_name+'.fastq') + unique_filepaths.add(output_filepath) + with open(output_filepath, 'a') \ + as outfile: + outfile.write( + format_fastq_record(*per_sample_seqs[sample_name])) + + if gzip: + pass + +@click.group() +def cli(): + pass + +@cli.command() +@click.option('--input-fastq', required=True, type=click.File('r')) +@click.option('--sequence-buffer-size', default=1000) +@click.option('--output-directory', required=True, type=str) +@click.option('--gzip/--no-gzip', default=True) +def split(input_fastq, output_directory, sequence_buffer_size, + gzip): + split_helper(input_fastq, output_directory, sequence_buffer_size, + gzip) + +if __name__ == '__main__': + cli() diff --git a/python_code/scripts/ebi.py b/python_code/scripts/ebi.py new file mode 100755 index 00000000..e80cd225 --- /dev/null +++ b/python_code/scripts/ebi.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from logging import logger + +import click + +from data_access_connections import data_access_factory +from enums import ServerConfig + +# check sequence files +# check md5s (actually correctly) +# generate metadata files +# send sequence files +# send metadata + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def validate_sample(study_id, sequence_file, mapping_file, validate_md5): + pass + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def validate_and_submit_sample(study_id, sequence_file, mapping_file, validate_md5): + pass + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def update_sample(study_id, sequence_file, mapping_file, validate_md5): + pass diff --git a/python_code/tests/test_make_per_sample_fastq.py b/python_code/tests/test_make_per_sample_fastq.py new file mode 100644 index 00000000..d5872cc4 --- /dev/null +++ b/python_code/tests/test_make_per_sample_fastq.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python + +from unittest import TestCase, main +from StringIO import StringIO +from tempfile import mkdtemp +from shutil import rmtree +from os.path import join, exists + +from make_per_sample_fastq import split_helper + +class MakePerSampleFastqTests(TestCase): + def setUp(self): + self.output_dir = mkdtemp() + self.test_fastq = StringIO(test_fastq) + + def tearDown(self): + rmtree(self.output_dir) + + def testSplit_4(self): + split_helper(self.test_fastq, self.output_dir, + sequence_buffer_size=4, ascii_increment=33) + self.assertTrue(exists(join(self.output_dir, 'S1.fastq'))) + self.assertTrue(exists(join(self.output_dir, 'S2.fastq'))) + s1_contents = open(join(self.output_dir, 'S1.fastq')).read() + s2_contents = open(join(self.output_dir, 'S2.fastq')).read() + self.assertEqual(s1_contents, s1_exp) + self.assertEqual(s2_contents, s2_exp) + + def testSplit_2(self): + split_helper(self.test_fastq, self.output_dir, + sequence_buffer_size=2, ascii_increment=33) + self.assertTrue(exists(join(self.output_dir, 'S1.fastq'))) + self.assertTrue(exists(join(self.output_dir, 'S2.fastq'))) + s1_contents = open(join(self.output_dir, 'S1.fastq')).read() + s2_contents = open(join(self.output_dir, 'S2.fastq')).read() + self.assertEqual(s1_contents, s1_exp) + self.assertEqual(s2_contents, s2_exp) + + + +test_fastq = """@S1_0 +ATGC ++ +#### +@S1_1 +ATGC ++ +#### +@S1_2 +ATGC ++ +#### +@S1_3 +ATGC ++ +#### +@S2_0 +ATGC ++ +#### +""" + +s1_exp = """@S1_0 +ATGC ++ +#### +@S1_1 +ATGC ++ +#### +@S1_2 +ATGC ++ +#### +@S1_3 +ATGC ++ +#### +""" +s2_exp = """@S2_0 +ATGC ++ +#### +""" + + +if __name__ == '__main__': + main()