From 44d9f2daebf38b6a42cfd98e627001aff72aea45 Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Mon, 19 May 2014 16:55:21 -0600 Subject: [PATCH 1/7] Add per-sample FASTQ generator --- python_code/make_per_sample_fastq.py | 89 +++++++++++++++++++ .../tests/test_make_per_sample_fastq.py | 88 ++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 python_code/make_per_sample_fastq.py create mode 100644 python_code/tests/test_make_per_sample_fastq.py diff --git a/python_code/make_per_sample_fastq.py b/python_code/make_per_sample_fastq.py new file mode 100644 index 00000000..42b318de --- /dev/null +++ b/python_code/make_per_sample_fastq.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +from collections import defaultdict +from os.path import join, exists +from os import mkdir + +import click +from skbio.parse.sequences.fastq import parse_fastq + +def write_fastq(output_file, fastq_data, ascii_increment=33): + """Writes tuples of (defline, seq, qual) to an output file + + Parameters + ---------- + output_file : file + Data in `fastq_data` will be written to this file + fastq_data : iterable of tuples of (str, str, str) + Each item in `fastq_data` is a FASTQ entry, each of which is + represented by a tuple of (defline, sequence, quality). + """ + for defline, seq, qual in fastq_data: + qual = ''.join([chr(x+ascii_increment) for x in qual]) + output_file.write('@%s\n%s\n+\n%s\n' % (defline, seq, qual)) + +def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, + ascii_increment=33): + """Splits a demultiplexed FASTQ file into per-sample FASTQ files + + Parameters + ---------- + input_fastq : file + The input demultiplexed FASTQ file. + output_directory : str + Path to the output directory. It will be created if it does not already + exist. + sequence_buffer_size : int + The number of sequences to hold in memory for each sample before + writing them to disk. 
+ + Notes + ----- + The sequence identifiers in `input_fastq` should be of the form output by + QIIME's demultiplexing scripts; namely, they should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + """ + if not exists(output_directory): + mkdir(output_directory) + + per_sample_seqs = defaultdict(list) + per_sample_counts = defaultdict(int) + for defline, seq, qual in parse_fastq(input_fastq): + label = defline.split()[0] + sample_name, sequence_number = label.rsplit('_', 1) + + per_sample_seqs[sample_name].append((defline, seq, qual)) + per_sample_counts[sample_name] += 1 + + if per_sample_counts[sample_name] > sequence_buffer_size: + with open(join(output_directory, sample_name+'.fastq'), 'a') \ + as outfile: + write_fastq(outfile, per_sample_seqs[sample_name]) + + per_sample_seqs[sample_name] = [] + per_sample_counts[sample_name] = 0 + + for sample_name, entries in per_sample_seqs.iteritems(): + if not entries: + continue + + with open(join(output_directory, sample_name+'.fastq'), 'a') \ + as outfile: + write_fastq(outfile, per_sample_seqs[sample_name]) + +@click.group() +def cli(): + pass + +@cli.command() +@click.option('--input-fastq', required=True, type=click.File('r')) +@click.option('--sequence-buffer-size', default=1000) +@click.option('--output-directory', required=True, type=str) +@click.option('--ascii-increment', default=33) +def split(input_fastq, output_directory, sequence_buffer_size, + ascii_increment): + split_helper(input_fastq, output_directory, sequence_buffer_size, + ascii_increment) + +if __name__ == '__main__': + cli() diff --git a/python_code/tests/test_make_per_sample_fastq.py b/python_code/tests/test_make_per_sample_fastq.py new file mode 100644 index 00000000..d5872cc4 --- /dev/null +++ b/python_code/tests/test_make_per_sample_fastq.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python + +from unittest import TestCase, main +from StringIO import StringIO +from tempfile import mkdtemp +from shutil import rmtree +from os.path 
import join, exists + +from make_per_sample_fastq import split_helper + +class MakePerSampleFastqTests(TestCase): + def setUp(self): + self.output_dir = mkdtemp() + self.test_fastq = StringIO(test_fastq) + + def tearDown(self): + rmtree(self.output_dir) + + def testSplit_4(self): + split_helper(self.test_fastq, self.output_dir, + sequence_buffer_size=4, ascii_increment=33) + self.assertTrue(exists(join(self.output_dir, 'S1.fastq'))) + self.assertTrue(exists(join(self.output_dir, 'S2.fastq'))) + s1_contents = open(join(self.output_dir, 'S1.fastq')).read() + s2_contents = open(join(self.output_dir, 'S2.fastq')).read() + self.assertEqual(s1_contents, s1_exp) + self.assertEqual(s2_contents, s2_exp) + + def testSplit_2(self): + split_helper(self.test_fastq, self.output_dir, + sequence_buffer_size=2, ascii_increment=33) + self.assertTrue(exists(join(self.output_dir, 'S1.fastq'))) + self.assertTrue(exists(join(self.output_dir, 'S2.fastq'))) + s1_contents = open(join(self.output_dir, 'S1.fastq')).read() + s2_contents = open(join(self.output_dir, 'S2.fastq')).read() + self.assertEqual(s1_contents, s1_exp) + self.assertEqual(s2_contents, s2_exp) + + + +test_fastq = """@S1_0 +ATGC ++ +#### +@S1_1 +ATGC ++ +#### +@S1_2 +ATGC ++ +#### +@S1_3 +ATGC ++ +#### +@S2_0 +ATGC ++ +#### +""" + +s1_exp = """@S1_0 +ATGC ++ +#### +@S1_1 +ATGC ++ +#### +@S1_2 +ATGC ++ +#### +@S1_3 +ATGC ++ +#### +""" +s2_exp = """@S2_0 +ATGC ++ +#### +""" + + +if __name__ == '__main__': + main() From 83a78c7b8449da43f6fc58e6fa8f5c0306d789d6 Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Tue, 27 May 2014 16:49:05 -0600 Subject: [PATCH 2/7] Initial commit for new EBI submission code --- python_code/ebi.py | 789 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 789 insertions(+) create mode 100644 python_code/ebi.py diff --git a/python_code/ebi.py b/python_code/ebi.py new file mode 100644 index 00000000..986423c8 --- /dev/null +++ b/python_code/ebi.py @@ -0,0 +1,789 @@ +#!/usr/bin/env 
python + +from subprocess import Popen, PIPE, STDOUT +from os.path import basename, exists, join +from os import mkdir +from datetime import date, timedelta +from xml.etree import ElementTree as ET +from xml.dom import minidom +from xml.sax.saxutils import escape + +from make_per_sample_fastq import split_helper + +class InvalidMetadataError(Exception): + pass + +class SampleAlreadyExistsError(Exception): + pass + +class NoXMLError(Exception): + pass + +def clean_whitespace(s): + return ' '.join(s.split()) + +def iter_file_via_list_of_dicts(input_file): + """Iterates over a TSV file, yielding dicts keyed by the column headers + + Parameters + ---------- + input_file : file + The file to iterate over + + Returns + ------- + generator + Yields dicts keyed by the column headers + + Notes + ----- + - Assumes the first line contains the column headers + - Assumes no duplicate column headers + - All column headers will be transformed to lowercase + - Skips lines containing only whitespace + """ + headers = [x.strip().lower() for x in input_file.readline().split('\t')] + for line in input_file: + line = line.strip() + if not line: + continue + + line_elements = line.split('\t') + yield dict(zip(headers, line_elements)) + + +class EBISubmission(object): + def __init__(self, study_id, study_title, study_abstract, + investigation_type, empty_value='unknown', **kwargs): + self.study_id = study_id + self.study_title = study_title + self.study_abstract = study_abstract + self.investigation_type = investigation_type + + self.study_xml_fp = None + self.sample_xml_fp = None + self.experiment_xml_fp = None + self.run_xml_fp = None + + # dicts that map investigation_type to library attributes + lib_strategies = {'metagenome': 'POOLCLONE', + 'mimarks-survey': 'AMPLICON'} + lib_selections = {'mimarks-survey': 'PCR'} + lib_sources = {} + + self.library_strategy = lib_strategies.get( + self.investigation_type, "OTHER") + self.library_source = lib_sources.get( + self.investigation_type, 
"METAGENOMIC") + self.library_selection = lib_selections.get( + self.investigation_type, "unspecififed") + + # This is allows addition of other arbitrary study metadata + try: + self.additional_metadata = { + str(k): str(v) if v is not None else empty_value + for k, v in kwargs.iteritems()} + except ValueError: + raise InvalidMetadataError("All additional metadata passed via " + "kwargs to the EBISubmission " + "constructor must be strings") + + # This will hold the submission's samples, keyed by the sample name + self.samples = {} + + def _get_study_alias(self): + """Format alias using ``self.study_id``""" + return 'qiime_study_' + escape(clean_whitespace(str(self.study_id))) + + def _get_sample_alias(self, sample_name): + """Format alias using ``self.study_id``, `sample_name`""" + return "%s:%s" % (self._get_study_alias(), + escape(clean_whitespace(str(sample_name)))) + + def _get_experiment_alias(self, sample_name, row_number): + """Format alias using ``self.study_id``, `sample_name`, `row_number` + + `row_number` comes from the index of the prep in the sample's prep + list. + """ + return "%s:%d" % (self._get_sample_alias(sample_name), + row_number) + + def _get_submission_alias(self): + """Format alias using ``self.study_id``""" + safe_study_id = escape(clean_whitespace(str(self.study_id))) + return 'qiime_submission_' + safe_study_id + + def _get_library_name(self, sample_name, row_number): + """Format alias using `sample_name`, `row_number` + + `row_number` comes from the index of the prep in the sample's prep + list. 
+ """ + return '%s:%d' % (escape(clean_whitespace(sample_name)), row_number) + + def _add_dict_as_tags_and_values(self, parent_node, attribute_element_name, + data_dict): + """Format key/value data using a common EBI XML motif""" + for attr, val in sorted(data_dict.iteritems()): + attribute_element = ET.SubElement(parent_node, + attribute_element_name) + tag = ET.SubElement(attribute_element, 'TAG') + tag.text = escape(clean_whitespace(attr)) + value = ET.SubElement(attribute_element, 'VALUE') + value.text = escape(clean_whitespace(val)) + + def generate_study_xml(self): + """Generates the study XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + study_set = ET.Element('STUDY_SET', { + 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance", + 'xsi:noNamespaceSchemaLocation': "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.study.xsd"} + ) + + study = ET.SubElement(study_set, 'STUDY', { + 'alias': self._get_study_alias(), + 'center_name': "CCME-COLORADO"} + ) + + descriptor = ET.SubElement(study, 'DESCRIPTOR') + study_title = ET.SubElement(descriptor, 'STUDY_TITLE') + study_title.text = escape(clean_whitespace(self.study_title)) + study_type = ET.SubElement(descriptor, 'STUDY_TYPE', { + 'existing_study_type': escape(clean_whitespace( + self.investigation_type))} + ) + study_abstract = ET.SubElement(descriptor, 'STUDY_ABSTRACT') + study_abstract.text = clean_whitespace(escape(self.study_abstract)) + + if self.additional_metadata: + study_attributes = ET.SubElement(study, 'STUDY_ATTRIBUTES') + self._add_dict_as_tags_and_values(study_attributes, + 'STUDY_ATTRIBUTE', + self.additional_metadata) + + return study_set + + def add_sample(self, sample_name, taxon_id=None, description=None, + empty_value='unknown', **kwargs): + """Adds sample information to the current submission + + Parameters + ---------- + sample_name : str + Unique identifier for the sample + taxon_id : str, optional + Defaults to ``None``. 
If not provided, the `empty_value` will be + used for the taxon ID + description : str, optional + Defaults to ``None``. If not provided, the `empty_value` will be + used for the description + empty_value : str, optional + Defaults to "unknown". This value will be used for `taxon_id` or + `description` when they are not supplied, and for all ``None`` + values in ``kwargs`` + + Raises + ------ + InvalidMetadataError + If metadata is passed via `kwargs` that is not interpretable as + a string + SampleAlreadyExistsError + If `sample_name` already exists in the ``samples`` dict + """ + if sample_name in self.samples: + raise SampleAlreadyExistsError("Two samples with the same " + "sample_name cannot be added to " + "the same submission. " + "(sample_name: %s)" % sample_name) + + self.samples[sample_name] = {} + + self.samples[sample_name]['taxon_id'] = empty_value if \ + taxon_id is None else taxon_id + self.samples[sample_name]['taxon_id'] = \ + escape(clean_whitespace(self.samples[sample_name]['taxon_id'])) + + self.samples[sample_name]['description'] = empty_value if \ + description is None else description + self.samples[sample_name]['description'] = \ + escape(clean_whitespace(self.samples[sample_name]['description'])) + + try: + self.samples[sample_name]['attributes'] = { + str(k): str(v) if v is not None else empty_value + for k, v in kwargs.iteritems()} + + except ValueError: + raise InvalidMetadataError("All metadata passed to add_sample " + "via kwargs must be strings.") + + self.samples[sample_name]['preps'] = [] + + def generate_sample_xml(self): + """Generates the sample XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + sample_set = ET.Element('SAMPLE_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.sample.xsd"} + ) + + for sample_name, sample_info in sorted(self.samples.iteritems()): + 
sample = ET.SubElement(sample_set, 'SAMPLE', { + 'alias': self._get_sample_alias(sample_name), + 'center_name': 'CCME-COLORADO'} + ) + + sample_title = ET.SubElement(sample, 'TITLE') + sample_title.text = escape(clean_whitespace(sample_name)) + + sample_name_element = ET.SubElement(sample, 'SAMPLE_NAME') + taxon_id = ET.SubElement(sample_name_element, 'TAXON_ID') + taxon_id.text = escape(clean_whitespace(sample_info['taxon_id'])) + + description = ET.SubElement(sample, 'DESCRIPTION') + description.text = escape(clean_whitespace( + sample_info['description'])) + + if sample_info['attributes']: + sample_attributes = ET.SubElement(sample, 'SAMPLE_ATTRIBUTES') + self._add_dict_as_tags_and_values(sample_attributes, + 'SAMPLE_ATTRIBUTE', + sample_info['attributes']) + + return sample_set + + def add_sample_prep(self, sample_name, platform, file_type, file_path, + experiment_design_description, + library_construction_protocol, + empty_value='unknown', **kwargs): + """Add prep info for an existing sample + + Parameters + ---------- + sample_name : str + The `sample_name` must exist in the ``samples`` list + platform : {'LS454', 'ILLUMINA', 'UNKNOWN'} + The sequencing platform + file_type : {'sff', 'fastq', 'fasta'} + The type of file + file_path : str + The path to the sequence file for this sample + experiment_design_description : str + library_construction_protocol : str + empty_value : str, optional + Defaults to "unknown". 
This value will be used for all ``None`` + values in ``kwargs`` + + Raises + ------ + InvalidMetadataError + If metadata is passed via `kwargs` that is not interpretable as + a string + KeyError + If `sample_name` is not in the list of samples in the + ``EBISubmission`` object + """ + try: + prep_info = {str(k): str(v) if v is not None else empty_value + for k, v in kwargs.iteritems()} + except ValueError: + raise InvalidMetadataError("All metadata passed to " + "add_sample_prep via kwargs must be " + "strings.") + + prep_info['platform'] = platform + prep_info['file_type'] = file_type + prep_info['file_path'] = file_path + prep_info['experiment_design_description'] = \ + experiment_design_description + prep_info['library_construction_protocol'] = \ + library_construction_protocol + + self.samples[sample_name]['preps'].append(prep_info) + + def _generate_library_descriptor(self, design, sample_name, row_number, + library_construction_protocol): + """This XML element (and its subelements) must be written for every + sample, but its generation depends on only study-level information. + Therefore, we can breka it out into its own method. 
+ """ + + library_descriptor = ET.SubElement(design, 'LIBRARY_DESCRIPTOR') + library_name = ET.SubElement(library_descriptor, 'LIBRARY_NAME') + library_name.text = self._get_library_name(sample_name, + row_number) + library_strategy = ET.SubElement(library_descriptor, + "LIBRARY_STRATEGY") + library_strategy.text = self.library_strategy + library_source = ET.SubElement(library_descriptor, + "LIBRARY_SOURCE") + library_source.text = self.library_source + library_selection = ET.SubElement(library_descriptor, + "LIBRARY_SELECTION") + library_selection.text = self.library_selection + library_layout = ET.SubElement(library_descriptor, + "LIBRARY_LAYOUT") + single = ET.SubElement(library_layout, "SINGLE") + library_construction_protocol_element = ET.SubElement( + library_descriptor, "LIBRARY_CONSTRUCTION_PROTOCOL") + library_construction_protocol_element.text = escape(clean_whitespace( + library_construction_protocol)) + + def _generate_spot_descriptor(self, design, platform): + """This XML element (and its subelements) must be written for every + sample, but its generation depends on only study-level information. + Therefore, we can breka it out into its own method. + """ + # This section applies only to the LS454 platform + if platform is not 'LS454': + return + + # There is some hard-coded information in here, but this is what we + # have always done in the past... 
+ spot_descriptor = ET.SubElement(design, 'SPOT_DESCRIPTOR') + spot_decode_spec = ET.SubElemenet(spot_descriptor, 'SPOT_DECODE_SPEC') + read_spec = ET.SubElemenet(spot_descriptor, 'READ_SPEC') + + read_index = ET.SubElemenet(read_spec, 'READ_INDEX') + read_index.text = '0' + read_class = ET.SubElement(read_spec, 'READ_CLASS') + read_class.text = 'Application Read' + read_type = ET.SubElement(read_spec, 'READ_TYPE') + read_type.text = 'Forward' + base_coord = ET.SubElement(read_spec, 'BASE_COORD') + base_coord.text = '1' + + def generate_experiment_xml(self): + """Generates the experiment XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + study_alias = self._get_study_alias() + experiment_set = ET.Element('EXPERIMENT_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.experiment.xsd"} + ) + for sample_name, sample_info in sorted(self.samples.iteritems()): + sample_alias = self._get_sample_alias(sample_name) + for row_number, prep_info in enumerate(sample_info['preps']): + experiment_alias = self._get_experiment_alias(sample_name, + row_number) + platform = prep_info['platform'] + experiment = ET.SubElement(experiment_set, 'EXPERIMENT', { + 'alias': experiment_alias, + 'center_name': 'CCME-COLORADO'} + ) + title = ET.SubElement(experiment, 'TITLE') + title.text = experiment_alias + study_ref = ET.SubElement(experiment, 'STUDY_REF', { + 'refname': study_alias} + ) + + design = ET.SubElement(experiment, 'DESIGN') + design_description = ET.SubElement(design, + 'DESIGN_DESCRIPTION') + design_description.text = escape(clean_whitespace( + prep_info['experiment_design_description'])) + sample_descriptor = ET.SubElement(design, 'SAMPLE_DESCRIPTOR', + {'refname': sample_alias} + ) + + self._generate_library_descriptor( + design, sample_name, row_number, + prep_info['library_construction_protocol'] + ) + + 
self._generate_spot_descriptor(design, platform) + + platform_element = ET.SubElement(experiment, 'PLATFORM') + platform_info = ET.SubElement(platform_element, + platform.upper()) + instrument_model = ET.SubElement(platform_info, + 'INSTRUMENT_MODEL') + instrument_model.text = 'unspecified' + + if prep_info: + experiment_attributes = ET.SubElement(experiment, + 'EXPERIMENT_ATTRIBUTES') + self._add_dict_as_tags_and_values(experiment_attributes, + 'EXPERIMENT_ATTRIBUTE', + prep_info) + + return experiment_set + + def generate_run_xml(self): + """Generates the run XML file + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + """ + #TODO: md5 stuff + run_set = ET.Element('RUN_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd/" + "sra_1_3/SRA.run.xsd"} + ) + for sample_name, sample_info in sorted(self.samples.iteritems()): + sample_alias = self._get_sample_alias(sample_name) + for row_number, prep_info in enumerate(sample_info['preps']): + experiment_alias = self._get_experiment_alias(sample_name, + row_number) + platform = prep_info['platform'] + file_type = prep_info['file_type'] + file_path = prep_info['file_path'] + + run = ET.SubElement(run_set, 'RUN', { + 'alias': basename(file_path) + '_run', + 'center_name': 'CCME-COLORADO'} + ) + experiment_ref = ET.SubElement(run, 'EXPERIMENT_REF', { + 'refname': experiment_alias} + ) + data_block = ET.SubElement(run, 'DATA_BLOCK') + files = ET.SubElement(data_block, 'FILES') + file_element = ET.SubElement(files, 'FILE', { + 'filename': basename(file_path), + 'filetype': file_type, + 'quality_scring_system': 'phred', + 'checksum_method': 'MD5', + 'checksum': 'NONE'} # TODO: checksum + ) + + return run_set + + def generate_submission_xml(self, action): + """Generates the submission XML file + + Parameters + ---------- + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating 
with EBI + + Returns + ------- + xml.etree.Element + The root elelement of the generated ``ElementTree`` + + Raises + ------ + NoXMLError + If one of the necessary XML files has not been generated + """ + if any([self.study_xml_fp is None, + self.sample_xml_fp is None, + self.experiment_xml_fp is None, + self.run_xml_fp is None]): + raise NoXMLError("One of the necessary XML files has not been " + "generated. Make sure you write out the other " + "XML files before attempting to write the " + "submission XML file.") + + submission_set = ET.Element('SUBMISSION_SET', { + "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.submission.xsd"} + ) + submission = ET.SubElement(submission_set, 'SUBMISSION', { + 'alias': self._get_submission_alias(), + 'center_name': 'CCME-COLORADO'} + ) + actions = ET.SubElement(submission, 'ACTIONS') + study_action = ET.SubElement(actions, 'ACTION', { + 'schema': 'study'} + ) + sample_action = ET.SubElement(actions, 'ACTION', { + 'schema': 'sample'} + ) + experiment_action = ET.SubElement(actions, 'ACTION', { + 'schema': 'experiment'} + ) + run_action = ET.SubElement(actions, 'ACTION', { + 'schema': 'run'} + ) + + if action is 'ADD': + hold_action = ET.SubElement(actions, 'HOLD', { + 'HoldUntilDate': str(date.today() + timedelta(365))} + ) + + return submission_set + + def write_study_xml(self, fp): + """Write the study XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + study_xml_element = self.generate_study_xml() + + study_xml = minidom.parseString(ET.tostring( + study_xml_element)) + + with open(fp, 'w') as outfile: + outfile.write(study_xml.toprettyxml(indent=' ', encoding='UTF-8')) + + self.study_xml_fp = fp + + def write_sample_xml(self, fp): + """Write the sample XML file using the current 
data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + sample_xml_element = self.generate_sample_xml() + + sample_xml = minidom.parseString(ET.tostring( + sample_xml_element)) + + with open(fp, 'w') as outfile: + outfile.write(sample_xml.toprettyxml(indent=' ', + encoding='UTF-8')) + + self.sample_xml_fp = fp + + def write_experiment_xml(self, fp): + """Write the experiment XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + experiment_xml_element = self.generate_experiment_xml() + + experiment_xml = minidom.parseString(ET.tostring( + experiment_xml_element)) + + with open(fp, 'w') as outfile: + outfile.write(experiment_xml.toprettyxml(indent=' ', + encoding='UTF-8')) + + self.experiment_xml_fp = fp + + def write_run_xml(self, fp): + """Write the run XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + run_xml_element = self.generate_run_xml() + + run_xml = minidom.parseString(ET.tostring( + run_xml_element)) + + with open(fp, 'w') as outfile: + outfile.write(run_xml.toprettyxml(indent=' ', encoding='UTF-8')) + + self.run_xml_fp = fp + + def write_submission_xml(self, fp, action): + """Write the submission XML file using the current data + + Parameters + ---------- + fp : str + The filepath to which the XML will be written + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating with EBI + + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + submission_xml_element = self.generate_submission_xml(action) + + submission_xml = minidom.parseString(ET.tostring( + submission_xml_element)) + + 
with open(fp, 'w') as outfile: + outfile.write(submission_xml.toprettyxml(indent=' ', + encoding='UTF-8')) + + self.submission_xml_fp = fp + + def write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp, + submission_fp, action): + """Write all XML files needed for an EBI submission using current data + + Parameters + ---------- + study_fp : str + The filepath to which the study XML will be written + sample_fp : str + The filepath to which the sample XML will be written + experiment_fp : str + The filepath to which the experiment XML will be written + run_fp : str + The filepath to which the run XML will be written + submission_fp : str + The filepath to which the submission XML will be written + action : {'ADD', 'VALIDATE', 'UPDATE'} + What action to take when communicating with EBI + + Notes + ----- + If any of the filepaths point to an existing file, it will be + overwritten + """ + self.write_study_xml(study_fp) + self.write_sample_xml(sample_fp) + self.write_experiment_xml(experiment_fp) + self.write_run_xml(run_fp) + self.write_submission_xml(submission_fp, action) + + def add_samples_from_templates(self, sample_template, prep_templates, + per_sample_fastq_dir): + """ + Parameters + ---------- + sample_template : file + prep_templates : list of file + per_sample_fastq_dir : str + Path to the direcotry containing per-sample FASTQ files containing + The sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + """ + for sample in iter_file_via_list_of_dicts(sample_template): + sample_name = sample.pop('sample_name') + taxon_id = sample.pop('taxon_id', None) + description = sample.pop('description', None) + + self.add_sample(sample_name, taxon_id=taxon_id, + description=description, + empty_value='unknown', **sample) + + for prep_template in prep_templates: + for prep in iter_file_via_list_of_dicts(prep_template): + sample_name = prep.pop('sample_name') + platform = prep.pop('platform') + experiment_design_description = 
prep.pop( + 'experiment_design_description') + library_construction_protocol = prep.pop( + 'library_construction_protocol') + + file_path = join(per_sample_fastq_dir, sample_name+'.fastq') + self.add_sample_prep(sample_name, platform, 'fastq', + file_path, experiment_design_description, + library_construction_protocol, + empty_value='unknown', **prep) + + + @classmethod + def from_templates_and_demux_fastq(cls, study_id, study_title, + study_abstract, investigation_type, + sample_template, prep_templates, + demux_seqs_fp, output_dir, + **kwargs): + """Generate an ``EBISubmission`` from templates and a sequence file + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + sample_template : file + prep_templates : list of file + demux_seqs_fp : str + Path to FASTQ File containing the demultiplexed sequences in the + format output by QIIME. Namely, the sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + output_dir : str + The directory to output the per-sample FASTQ files. It will be + created it if does not already exist. 
The files will be named + .fastq + """ + if not exists(output_dir): + mkdir(output_dir) + + # generate the per-sample FASTQ files + with open(demux_seqs_fp, 'U') as demux_seqs: + split_helper(demux_seqs, output_dir, sequence_buffer_size=1000) + + # initialize the EBISubmission object + submission = cls(study_id, study_title, study_abstract, + investigation_type, **kwargs) + + submission.add_samples_from_templates( + sample_template, prep_templates, output_dir) + + return submission + + @classmethod + def from_templates_and_per_sample_fastqs(cls, study_id, study_title, + study_abstract, + investigation_type, + sample_template, prep_templates, + per_sample_fastq_dir, + **kwargs): + """Generate an ``EBISubmission`` from templates and FASTQ files + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + sample_template : file + prep_templates : list of file + per_sample_fastq_dir : str + Path to the direcotry containing per-sample FASTQ files containing + The sequence labels should be: + ``SampleID_SequenceNumber And Additional Notes if Applicable`` + + Notes + ----- + - kwargs will be passed directly to the ``EBISubmission`` constructor, + which will add them as key-value pairs to the study attributes + section of the submission + """ + # initialize the EBISubmission object + submission = cls(study_id, study_title, study_abstract, + investigation_type, **kwargs) + + submission.add_samples_from_templates(sample_template, + prep_templates, + per_sample_fastq_dir) + + return submission From 1cd8b931ae14a801b397514fada5b87f641fb177 Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Thu, 29 May 2014 11:32:41 -0600 Subject: [PATCH 3/7] PEP8 Compatibility --- python_code/ebi.py | 60 ++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/python_code/ebi.py b/python_code/ebi.py index 986423c8..35e25f56 100644 --- a/python_code/ebi.py +++ b/python_code/ebi.py 
@@ -10,18 +10,23 @@ from make_per_sample_fastq import split_helper + class InvalidMetadataError(Exception): pass + class SampleAlreadyExistsError(Exception): pass + class NoXMLError(Exception): pass + def clean_whitespace(s): return ' '.join(s.split()) + def iter_file_via_list_of_dicts(input_file): """Iterates over a TSV file, yielding dicts keyed by the column headers @@ -144,8 +149,7 @@ def generate_study_xml(self): study_set = ET.Element('STUDY_SET', { 'xmlns:xsi': "http://www.w3.org/2001/XMLSchema-instance", 'xsi:noNamespaceSchemaLocation': "ftp://ftp.sra.ebi.ac.uk/meta/xsd" - "/sra_1_3/SRA.study.xsd"} - ) + "/sra_1_3/SRA.study.xsd"}) study = ET.SubElement(study_set, 'STUDY', { 'alias': self._get_study_alias(), @@ -237,8 +241,7 @@ def generate_sample_xml(self): sample_set = ET.Element('SAMPLE_SET', { "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" - "/sra_1_3/SRA.sample.xsd"} - ) + "/sra_1_3/SRA.sample.xsd"}) for sample_name, sample_info in sorted(self.samples.iteritems()): sample = ET.SubElement(sample_set, 'SAMPLE', { @@ -378,8 +381,7 @@ def generate_experiment_xml(self): experiment_set = ET.Element('EXPERIMENT_SET', { "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" - "/sra_1_3/SRA.experiment.xsd"} - ) + "/sra_1_3/SRA.experiment.xsd"}) for sample_name, sample_info in sorted(self.samples.iteritems()): sample_alias = self._get_sample_alias(sample_name) for row_number, prep_info in enumerate(sample_info['preps']): @@ -401,12 +403,12 @@ def generate_experiment_xml(self): 'DESIGN_DESCRIPTION') design_description.text = escape(clean_whitespace( prep_info['experiment_design_description'])) - sample_descriptor = ET.SubElement(design, 'SAMPLE_DESCRIPTOR', - {'refname': sample_alias} + sample_descriptor = ET.SubElement( + design, 'SAMPLE_DESCRIPTOR', {'refname': sample_alias} ) self._generate_library_descriptor( - 
design, sample_name, row_number, + design, sample_name, row_number, prep_info['library_construction_protocol'] ) @@ -420,8 +422,8 @@ def generate_experiment_xml(self): instrument_model.text = 'unspecified' if prep_info: - experiment_attributes = ET.SubElement(experiment, - 'EXPERIMENT_ATTRIBUTES') + experiment_attributes = ET.SubElement( + experiment, 'EXPERIMENT_ATTRIBUTES') self._add_dict_as_tags_and_values(experiment_attributes, 'EXPERIMENT_ATTRIBUTE', prep_info) @@ -436,12 +438,11 @@ def generate_run_xml(self): xml.etree.Element The root elelement of the generated ``ElementTree`` """ - #TODO: md5 stuff + # TODO: md5 stuff run_set = ET.Element('RUN_SET', { "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", - "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd/" - "sra_1_3/SRA.run.xsd"} - ) + "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" + "/sra_1_3/SRA.run.xsd"}) for sample_name, sample_info in sorted(self.samples.iteritems()): sample_alias = self._get_sample_alias(sample_name) for row_number, prep_info in enumerate(sample_info['preps']): @@ -465,7 +466,7 @@ def generate_run_xml(self): 'filetype': file_type, 'quality_scring_system': 'phred', 'checksum_method': 'MD5', - 'checksum': 'NONE'} # TODO: checksum + 'checksum': 'NONE'} # TODO: checksum ) return run_set @@ -500,8 +501,7 @@ def generate_submission_xml(self, action): submission_set = ET.Element('SUBMISSION_SET', { "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" - "/sra_1_3/SRA.submission.xsd"} - ) + "/sra_1_3/SRA.submission.xsd"}) submission = ET.SubElement(submission_set, 'SUBMISSION', { 'alias': self._get_submission_alias(), 'center_name': 'CCME-COLORADO'} @@ -701,20 +701,18 @@ def add_samples_from_templates(self, sample_template, prep_templates, 'experiment_design_description') library_construction_protocol = prep.pop( 'library_construction_protocol') - + file_path = 
join(per_sample_fastq_dir, sample_name+'.fastq') self.add_sample_prep(sample_name, platform, 'fastq', file_path, experiment_design_description, library_construction_protocol, empty_value='unknown', **prep) - @classmethod - def from_templates_and_demux_fastq(cls, study_id, study_title, - study_abstract, investigation_type, - sample_template, prep_templates, - demux_seqs_fp, output_dir, - **kwargs): + def from_templates_and_demux_fastq( + cls, study_id, study_title, study_abstract, investigation_type, + sample_template, prep_templates, demux_seqs_fp, output_dir, + **kwargs): """Generate an ``EBISubmission`` from templates and a sequence file Parameters @@ -740,14 +738,14 @@ def from_templates_and_demux_fastq(cls, study_id, study_title, # generate the per-sample FASTQ files with open(demux_seqs_fp, 'U') as demux_seqs: split_helper(demux_seqs, output_dir, sequence_buffer_size=1000) - + # initialize the EBISubmission object submission = cls(study_id, study_title, study_abstract, investigation_type, **kwargs) - + submission.add_samples_from_templates( sample_template, prep_templates, output_dir) - + return submission @classmethod @@ -755,7 +753,7 @@ def from_templates_and_per_sample_fastqs(cls, study_id, study_title, study_abstract, investigation_type, sample_template, prep_templates, - per_sample_fastq_dir, + per_sample_fastq_dir, **kwargs): """Generate an ``EBISubmission`` from templates and FASTQ files @@ -771,7 +769,7 @@ def from_templates_and_per_sample_fastqs(cls, study_id, study_title, Path to the direcotry containing per-sample FASTQ files containing The sequence labels should be: ``SampleID_SequenceNumber And Additional Notes if Applicable`` - + Notes ----- - kwargs will be passed directly to the ``EBISubmission`` constructor, @@ -781,7 +779,7 @@ def from_templates_and_per_sample_fastqs(cls, study_id, study_title, # initialize the EBISubmission object submission = cls(study_id, study_title, study_abstract, investigation_type, **kwargs) - + 
submission.add_samples_from_templates(sample_template, prep_templates, per_sample_fastq_dir) From 4b2a3ff05ee4ad360b39ba3ed94df82c8f2ac586 Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Mon, 9 Jun 2014 10:17:59 -0600 Subject: [PATCH 4/7] Fix XML structure on some elements --- python_code/ebi.py | 37 +++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/python_code/ebi.py b/python_code/ebi.py index 35e25f56..e16480b1 100644 --- a/python_code/ebi.py +++ b/python_code/ebi.py @@ -506,18 +506,39 @@ def generate_submission_xml(self, action): 'alias': self._get_submission_alias(), 'center_name': 'CCME-COLORADO'} ) + actions = ET.SubElement(submission, 'ACTIONS') - study_action = ET.SubElement(actions, 'ACTION', { - 'schema': 'study'} + + study_action = ET.SubElement(actions, 'ACTION') + study_action_2 = ET.SubElement(study_action, action, + { + 'schema': 'study', + 'source': self.study_xml_fp + } ) - sample_action = ET.SubElement(actions, 'ACTION', { - 'schema': 'sample'} + + sample_action = ET.SubElement(actions, 'ACTION') + sample_action_2 = ET.SubElement(sample_action, action, + { + 'schema': 'sample', + 'source': self.sample_xml_fp + } ) - experiment_action = ET.SubElement(actions, 'ACTION', { - 'schema': 'experiment'} + + experiment_action = ET.SubElement(actions, 'ACTION') + experiment_action_2 = ET.SubElement(experiment_action, action, + { + 'schema': 'experiment', + 'source': self.experiment_xml_fp + } ) - run_action = ET.SubElement(actions, 'ACTION', { - 'schema': 'run'} + + run_action = ET.SubElement(actions, 'ACTION') + run_action_2 = ET.SubElement(run_action, action, + { + 'schema': 'run', + 'source': self.run_xml_fp + } ) if action is 'ADD': From afcc1a39ffd971bb8115b3ac67c7a7b1d296b15c Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Tue, 1 Jul 2014 12:39:44 -0600 Subject: [PATCH 5/7] Fixup changes --- python_code/make_per_sample_fastq.py | 35 ++++++++++++++++++++-------- 1 file changed, 25 
insertions(+), 10 deletions(-) diff --git a/python_code/make_per_sample_fastq.py b/python_code/make_per_sample_fastq.py index 42b318de..d94796b6 100644 --- a/python_code/make_per_sample_fastq.py +++ b/python_code/make_per_sample_fastq.py @@ -3,11 +3,13 @@ from collections import defaultdict from os.path import join, exists from os import mkdir +from gzip import open as gz_open import click from skbio.parse.sequences.fastq import parse_fastq +from skbio.format.sequences.fastq import format_fastq_record -def write_fastq(output_file, fastq_data, ascii_increment=33): +def write_fastq(output_file, fastq_data): """Writes tuples of (defline, seq, qual) to an output file Parameters @@ -23,7 +25,7 @@ def write_fastq(output_file, fastq_data, ascii_increment=33): output_file.write('@%s\n%s\n+\n%s\n' % (defline, seq, qual)) def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, - ascii_increment=33): + gzip=True): """Splits a demultiplexed FASTQ file into per-sample FASTQ files Parameters @@ -36,6 +38,8 @@ def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, sequence_buffer_size : int The number of sequences to hold in memory for each sample before writing them to disk. 
+ gzip : bool, optional + If ``True``, then the generated FASTQ files will be gzipped Notes ----- @@ -48,7 +52,9 @@ def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, per_sample_seqs = defaultdict(list) per_sample_counts = defaultdict(int) - for defline, seq, qual in parse_fastq(input_fastq): + unique_filepaths = set() + for defline, seq, qual in parse_fastq(input_fastq, + enforce_qual_range=False): label = defline.split()[0] sample_name, sequence_number = label.rsplit('_', 1) @@ -56,9 +62,12 @@ def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, per_sample_counts[sample_name] += 1 if per_sample_counts[sample_name] > sequence_buffer_size: - with open(join(output_directory, sample_name+'.fastq'), 'a') \ + output_filepath = join(output_directory, sample_name+'.fastq') + unique_filepaths.add(output_filepath) + with open(output_filepath, 'a') \ as outfile: - write_fastq(outfile, per_sample_seqs[sample_name]) + outfile.write( + format_fastq_record(*per_sample_seqs[sample_name])) per_sample_seqs[sample_name] = [] per_sample_counts[sample_name] = 0 @@ -67,9 +76,15 @@ def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, if not entries: continue - with open(join(output_directory, sample_name+'.fastq'), 'a') \ + output_filepath = join(output_directory, sample_name+'.fastq') + unique_filepaths.add(output_filepath) + with open(output_filepath, 'a') \ as outfile: - write_fastq(outfile, per_sample_seqs[sample_name]) + outfile.write( + format_fastq_record(*per_sample_seqs[sample_name])) + + if gzip: + pass @click.group() def cli(): @@ -79,11 +94,11 @@ def cli(): @click.option('--input-fastq', required=True, type=click.File('r')) @click.option('--sequence-buffer-size', default=1000) @click.option('--output-directory', required=True, type=str) -@click.option('--ascii-increment', default=33) +@click.option('--gzip/--no-gzip', default=True) def split(input_fastq, output_directory, sequence_buffer_size, - 
ascii_increment): + gzip): split_helper(input_fastq, output_directory, sequence_buffer_size, - ascii_increment) + gzip) if __name__ == '__main__': cli() From 7d7d0316d3471cfcdd4b797cbd0551c423cda78d Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Tue, 1 Jul 2014 12:40:20 -0600 Subject: [PATCH 6/7] Adding ebi submit code --- python_code/scripts/ebi.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100755 python_code/scripts/ebi.py diff --git a/python_code/scripts/ebi.py b/python_code/scripts/ebi.py new file mode 100755 index 00000000..e80cd225 --- /dev/null +++ b/python_code/scripts/ebi.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from logging import logger + +import click + +from data_access_connections import data_access_factory +from enums import ServerConfig + +# check sequence files +# check md5s (actually correctly) +# generate metadata files +# send sequence files +# send metadata + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def validate_sample(study_id, sequence_file, mapping_file, validate_md5): + pass + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def validate_and_submit_sample(study_id, sequence_file, mapping_file, validate_md5): + pass + +@click.command() +@click.option('--study-id', required=True, type=int) +@click.option('--sequence-file', required=True, type=click.File('r')) +@click.option('--mapping-file', required=True, type=click.File('r')) +@click.option('--validate-md5/--no-validate-md5', required=False) +def 
update_sample(study_id, sequence_file, mapping_file, validate_md5): + pass From 19cf3027963cef583a6c9481c4a23ee80ad178a7 Mon Sep 17 00:00:00 2001 From: Adam Robbins-Pianka Date: Tue, 5 Aug 2014 16:46:51 -0600 Subject: [PATCH 7/7] Address comments from @wasade and @ElDeveloper --- python_code/ebi.py | 220 +++++++++++++-------------- python_code/make_per_sample_fastq.py | 15 -- 2 files changed, 105 insertions(+), 130 deletions(-) diff --git a/python_code/ebi.py b/python_code/ebi.py index e16480b1..d4d3e1dc 100644 --- a/python_code/ebi.py +++ b/python_code/ebi.py @@ -1,29 +1,37 @@ #!/usr/bin/env python -from subprocess import Popen, PIPE, STDOUT from os.path import basename, exists, join from os import mkdir from datetime import date, timedelta from xml.etree import ElementTree as ET from xml.dom import minidom from xml.sax.saxutils import escape +from functools import partial from make_per_sample_fastq import split_helper class InvalidMetadataError(Exception): + """Error that is raised when metadata is not representable as a string""" pass class SampleAlreadyExistsError(Exception): + """Error that is raised when a sample is added to a submission that already + has a sample by that name""" pass class NoXMLError(Exception): + """Error that is raised when the generation of one XML file cannot be + completed because it depends on another XML file that has not yet been + generated""" pass def clean_whitespace(s): + """Standardizes whitespace so that there is only ever one space separating + tokens""" return ' '.join(s.split()) @@ -47,7 +55,9 @@ def iter_file_via_list_of_dicts(input_file): - All column headers will be transformed to lowercase - Skips lines containing only whitespace """ - headers = [x.strip().lower() for x in input_file.readline().split('\t')] + header_line = input_file.readline() + header_line = header_line.lstrip('#') + headers = [x.strip().lower() for x in header_line.split('\t')] for line in input_file: line = line.strip() if not line: @@ -58,8 +68,22 
@@ def iter_file_via_list_of_dicts(input_file): class EBISubmission(object): + """Define an EBI submission and facilitate generation of required XML files + + Parameters + ---------- + study_id : str + study_title : str + study_abstract : str + investigation_type : str + 'metagenome', and 'mimarks-survey' are specially recognized and used to + set other attributes in the submission, but any string is valid + empty_value : str, optional + Defaults to "no_data". This is the value that will be used when data + for a particular metadata field is missing + """ def __init__(self, study_id, study_title, study_abstract, - investigation_type, empty_value='unknown', **kwargs): + investigation_type, empty_value='no_data', **kwargs): self.study_id = study_id self.study_title = study_title self.study_abstract = study_abstract @@ -83,18 +107,24 @@ def __init__(self, study_id, study_title, study_abstract, self.library_selection = lib_selections.get( self.investigation_type, "unspecififed") - # This is allows addition of other arbitrary study metadata + # This allows addition of other arbitrary study metadata + self.additional_metadata = self._stringify_kwargs(kwargs) + + # This will hold the submission's samples, keyed by the sample name + self.samples = {} + + def _stringify_kwargs(self, kwargs_dict): + """Turns values in a dictionary into strings, None, or self.empty_value + """ try: - self.additional_metadata = { - str(k): str(v) if v is not None else empty_value + result = { + str(k): str(v) if v is not None else self.empty_value for k, v in kwargs.iteritems()} except ValueError: raise InvalidMetadataError("All additional metadata passed via " "kwargs to the EBISubmission " - "constructor must be strings") - - # This will hold the submission's samples, keyed by the sample name - self.samples = {} + "constructor must be representatable " + "as strings.") def _get_study_alias(self): """Format alias using ``self.study_id``""" @@ -130,7 +160,7 @@ def _get_library_name(self, 
sample_name, row_number): def _add_dict_as_tags_and_values(self, parent_node, attribute_element_name, data_dict): """Format key/value data using a common EBI XML motif""" - for attr, val in sorted(data_dict.iteritems()): + for attr, val in sorted(data_dict.items()): attribute_element = ET.SubElement(parent_node, attribute_element_name) tag = ET.SubElement(attribute_element, 'TAG') @@ -175,7 +205,7 @@ def generate_study_xml(self): return study_set def add_sample(self, sample_name, taxon_id=None, description=None, - empty_value='unknown', **kwargs): + **kwargs): """Adds sample information to the current submission Parameters @@ -188,16 +218,9 @@ def add_sample(self, sample_name, taxon_id=None, description=None, description : str, optional Defaults to ``None``. If not provided, the `empty_value` will be used for the description - empty_value : str, optional - Defaults to "unknown". This value will be used for `taxon_id` or - `description` when they are not supplied, and for all ``None`` - values in ``kwargs`` Raises ------ - InvalidMetadataError - If metadata is passed via `kwargs` that is not interpretable as - a string SampleAlreadyExistsError If `sample_name` already exists in the ``samples`` dict """ @@ -209,24 +232,18 @@ def add_sample(self, sample_name, taxon_id=None, description=None, self.samples[sample_name] = {} - self.samples[sample_name]['taxon_id'] = empty_value if \ + self.samples[sample_name]['taxon_id'] = self.empty_value if \ taxon_id is None else taxon_id self.samples[sample_name]['taxon_id'] = \ escape(clean_whitespace(self.samples[sample_name]['taxon_id'])) - self.samples[sample_name]['description'] = empty_value if \ + self.samples[sample_name]['description'] = self.empty_value if \ description is None else description self.samples[sample_name]['description'] = \ escape(clean_whitespace(self.samples[sample_name]['description'])) - try: - self.samples[sample_name]['attributes'] = { - str(k): str(v) if v is not None else empty_value - for k, v in 
kwargs.iteritems()} - - except ValueError: - raise InvalidMetadataError("All metadata passed to add_sample " - "via kwargs must be strings.") + self.samples[sample_name]['attributes'] = self._stringify_kwargs( + kwargs) self.samples[sample_name]['preps'] = [] @@ -271,7 +288,7 @@ def generate_sample_xml(self): def add_sample_prep(self, sample_name, platform, file_type, file_path, experiment_design_description, library_construction_protocol, - empty_value='unknown', **kwargs): + **kwargs): """Add prep info for an existing sample Parameters @@ -286,26 +303,14 @@ def add_sample_prep(self, sample_name, platform, file_type, file_path, The path to the sequence file for this sample experiment_design_description : str library_construction_protocol : str - empty_value : str, optional - Defaults to "unknown". This value will be used for all ``None`` - values in ``kwargs`` Raises ------ - InvalidMetadataError - If metadata is passed via `kwargs` that is not interpretable as - a string KeyError If `sample_name` is not in the list of samples in the ``EBISubmission`` object """ - try: - prep_info = {str(k): str(v) if v is not None else empty_value - for k, v in kwargs.iteritems()} - except ValueError: - raise InvalidMetadataError("All metadata passed to " - "add_sample_prep via kwargs must be " - "strings.") + prep_info = self._stringify_kwargs(kwargs) prep_info['platform'] = platform prep_info['file_type'] = file_type @@ -321,7 +326,7 @@ def _generate_library_descriptor(self, design, sample_name, row_number, library_construction_protocol): """This XML element (and its subelements) must be written for every sample, but its generation depends on only study-level information. - Therefore, we can breka it out into its own method. + Therefore, we can break it out into its own method. 
""" library_descriptor = ET.SubElement(design, 'LIBRARY_DESCRIPTOR') @@ -348,7 +353,7 @@ def _generate_library_descriptor(self, design, sample_name, row_number, def _generate_spot_descriptor(self, design, platform): """This XML element (and its subelements) must be written for every sample, but its generation depends on only study-level information. - Therefore, we can breka it out into its own method. + Therefore, we can break it out into its own method. """ # This section applies only to the LS454 platform if platform is not 'LS454': @@ -382,7 +387,7 @@ def generate_experiment_xml(self): "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" "/sra_1_3/SRA.experiment.xsd"}) - for sample_name, sample_info in sorted(self.samples.iteritems()): + for sample_name, sample_info in sorted(self.samples.items()): sample_alias = self._get_sample_alias(sample_name) for row_number, prep_info in enumerate(sample_info['preps']): experiment_alias = self._get_experiment_alias(sample_name, @@ -443,7 +448,7 @@ def generate_run_xml(self): "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd" "/sra_1_3/SRA.run.xsd"}) - for sample_name, sample_info in sorted(self.samples.iteritems()): + for sample_name, sample_info in sorted(self.samples.items()): sample_alias = self._get_sample_alias(sample_name) for row_number, prep_info in enumerate(sample_info['preps']): experiment_alias = self._get_experiment_alias(sample_name, @@ -510,35 +515,26 @@ def generate_submission_xml(self, action): actions = ET.SubElement(submission, 'ACTIONS') study_action = ET.SubElement(actions, 'ACTION') - study_action_2 = ET.SubElement(study_action, action, - { - 'schema': 'study', - 'source': self.study_xml_fp - } + study_action_2 = ET.SubElement(study_action, action, { + 'schema': 'study', + 'source': self.study_xml_fp} ) sample_action = ET.SubElement(actions, 'ACTION') - 
sample_action_2 = ET.SubElement(sample_action, action, - { - 'schema': 'sample', - 'source': self.sample_xml_fp - } + sample_action_2 = ET.SubElement(sample_action, action, { + 'schema': 'sample', + 'source': self.sample_xml_fp} ) experiment_action = ET.SubElement(actions, 'ACTION') - experiment_action_2 = ET.SubElement(experiment_action, action, - { - 'schema': 'experiment', - 'source': self.experiment_xml_fp - } + experiment_action_2 = ET.SubElement(experiment_action, action, { + 'schema': 'experiment', + 'source': self.experiment_xml_fp} ) run_action = ET.SubElement(actions, 'ACTION') - run_action_2 = ET.SubElement(run_action, action, - { - 'schema': 'run', - 'source': self.run_xml_fp - } + run_action_2 = ET.SubElement(run_action, action, { + 'schema': 'run', 'source': self.run_xml_fp} ) if action is 'ADD': @@ -548,30 +544,43 @@ def generate_submission_xml(self, action): return submission_set - def write_study_xml(self, fp): - """Write the study XML file using the current data + def _write_xml_file(self, xml_gen_fn, attribute_name, fp, + xml_gen_fn_arg=None): + """Writes an XML file after calling one of the XML generation + functions Parameters ---------- + xml_gen_fn : function + The function that will be called to generate the XML that will be + written + attribute_name : str + The name of the attribute in which to store the output filepath fp : str The filepath to which the XML will be written + xml_gen_fn_arg : str, optional + Defaults to None. If None, no arguments will be passed to + xml_gen_fn. 
Otherwise, this will be passed as the only argument to + xml_gen_fn Notes ----- - If `fp` points to an existing file, it will be overwritten + xml_gen_fn_arg is needed for generating the submission XML """ - study_xml_element = self.generate_study_xml() + if xml_gen_fn_arg is None: + xml_element = xml_gen_fn() + else: + xml_element = xml_gen_fn(xml_gen_fn_arg) - study_xml = minidom.parseString(ET.tostring( - study_xml_element)) + xml = minidom.parseString(ET.tostring(xml_element)) with open(fp, 'w') as outfile: - outfile.write(study_xml.toprettyxml(indent=' ', encoding='UTF-8')) + outfile.write(xml.toprettyxml(indent=' ', encoding='UTF-8')) - self.study_xml_fp = fp + setattr(self, attribute_name, fp) - def write_sample_xml(self, fp): - """Write the sample XML file using the current data + def write_study_xml(self, fp): + """Write the study XML file using the current data Parameters ---------- @@ -582,16 +591,21 @@ def write_sample_xml(self, fp): ----- If `fp` points to an existing file, it will be overwritten """ - sample_xml_element = self.generate_sample_xml() + self._write_xml_file(self.generate_study_xml, 'study_xml_fp', fp) - sample_xml = minidom.parseString(ET.tostring( - sample_xml_element)) + def write_sample_xml(self, fp): + """Write the sample XML file using the current data - with open(fp, 'w') as outfile: - outfile.write(sample_xml.toprettyxml(indent=' ', - encoding='UTF-8')) + Parameters + ---------- + fp : str + The filepath to which the XML will be written - self.sample_xml_fp = fp + Notes + ----- + If `fp` points to an existing file, it will be overwritten + """ + self._write_xml_file(self.generate_sample_xml, 'sample_xml_fp', fp) def write_experiment_xml(self, fp): """Write the experiment XML file using the current data @@ -605,16 +619,8 @@ def write_experiment_xml(self, fp): ----- If `fp` points to an existing file, it will be overwritten """ - experiment_xml_element = self.generate_experiment_xml() - - experiment_xml = 
minidom.parseString(ET.tostring( - experiment_xml_element)) - - with open(fp, 'w') as outfile: - outfile.write(experiment_xml.toprettyxml(indent=' ', - encoding='UTF-8')) - - self.experiment_xml_fp = fp + self._write_xml_file(self.generate_experiment_xml, + 'experiment_xml_fp', fp) def write_run_xml(self, fp): """Write the run XML file using the current data @@ -628,15 +634,7 @@ def write_run_xml(self, fp): ----- If `fp` points to an existing file, it will be overwritten """ - run_xml_element = self.generate_run_xml() - - run_xml = minidom.parseString(ET.tostring( - run_xml_element)) - - with open(fp, 'w') as outfile: - outfile.write(run_xml.toprettyxml(indent=' ', encoding='UTF-8')) - - self.run_xml_fp = fp + self._write_xml_file(self.generate_run_xml, 'run_xml_fp', fp) def write_submission_xml(self, fp, action): """Write the submission XML file using the current data @@ -652,16 +650,8 @@ def write_submission_xml(self, fp, action): ----- If `fp` points to an existing file, it will be overwritten """ - submission_xml_element = self.generate_submission_xml(action) - - submission_xml = minidom.parseString(ET.tostring( - submission_xml_element)) - - with open(fp, 'w') as outfile: - outfile.write(submission_xml.toprettyxml(indent=' ', - encoding='UTF-8')) - - self.submission_xml_fp = fp + self._write_xml_file(self.generate_submission_xml, 'submission_xml_fp', + fp, action) def write_all_xml_files(study_fp, sample_fp, experiment_fp, run_fp, submission_fp, action): @@ -712,7 +702,7 @@ def add_samples_from_templates(self, sample_template, prep_templates, self.add_sample(sample_name, taxon_id=taxon_id, description=description, - empty_value='unknown', **sample) + **sample) for prep_template in prep_templates: for prep in iter_file_via_list_of_dicts(prep_template): @@ -727,7 +717,7 @@ def add_samples_from_templates(self, sample_template, prep_templates, self.add_sample_prep(sample_name, platform, 'fastq', file_path, experiment_design_description, 
library_construction_protocol, - empty_value='unknown', **prep) + **prep) @classmethod def from_templates_and_demux_fastq( diff --git a/python_code/make_per_sample_fastq.py b/python_code/make_per_sample_fastq.py index d94796b6..b0b28367 100644 --- a/python_code/make_per_sample_fastq.py +++ b/python_code/make_per_sample_fastq.py @@ -9,21 +9,6 @@ from skbio.parse.sequences.fastq import parse_fastq from skbio.format.sequences.fastq import format_fastq_record -def write_fastq(output_file, fastq_data): - """Writes tuples of (defline, seq, qual) to an output file - - Parameters - ---------- - output_file : file - Data in `fastq_data` will be written to this file - fastq_data : iterable of tuples of (str, str, str) - Each item in `fastq_data` is a FASTQ entry, each of which is - represented by a tuple of (defline, sequence, quality). - """ - for defline, seq, qual in fastq_data: - qual = ''.join([chr(x+ascii_increment) for x in qual]) - output_file.write('@%s\n%s\n+\n%s\n' % (defline, seq, qual)) - def split_helper(input_fastq, output_directory, sequence_buffer_size=1000, gzip=True): """Splits a demultiplexed FASTQ file into per-sample FASTQ files