diff --git a/CHANGELOG.md b/CHANGELOG.md index c2fe0c05..e2a4c4b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) - Added a parser and writer for the `pileup` sequence alignment format (#329) +- Created copy methods for Feature and Location to address concerns raised by [(#342)](https://github.com/TimothyStiles/poly/issues/342) +- Created new methods to convert polyjson -> genbank. +- Created new `Feature.StoreSequence` method to enable [(#388)](https://github.com/TimothyStiles/poly/issues/388) + +### Changed +- **Breaking**: Genbank parser uses new custom multimap for `Feature.Attributes`, which allows for duplicate keys. This changes the type of Features.Attributes from `map[string]string` to `MultiMap[string, string]`, an alias for `map[string][]string` defined in `multimap.go`. [(#383)](https://github.com/TimothyStiles/poly/issues/383) +- Improves error reporting for genbank parse errors via a new `ParseError` struct. ### Fixed - `fastq` parser no longer becomes de-aligned when reading (#325) - `fastq` now handles optionals correctly (#323) +- Adds functional test and fix for [(#313)](https://github.com/TimothyStiles/poly/issues/313). +- In addition to expanding the set of genbank files which can be validly parsed, the parser is more vocal when it encounters unusual syntax in the "feature" section. This "fail fast" approach is better as there were cases where inputs triggered a codepath which would neither return a valid Genbank object nor an error, and should help with debugging. ## [0.26.0] - 2023-07-22 Oops, we weren't keeping a changelog before this tag! 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4d4bcb86..b29dc2a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -93,6 +93,18 @@ In order to simplify the development experience, and environment setup, the poly Whether you're a beginner with Go or you're an experienced developer, You should see the suggestions popup automatically when you goto the *Plugins* tab in VSCode. Using these plugins can help accelerate the development experience and also allow you to work more collaboratively with other poly developers. +## Local Checks + +Poly runs numerous CI/CD checks via Github Actions before a PR can be merged. In order to make your PR mergeable, your PR must pass all of these checks. + +A quick way to check your PR will pass is to run: + +```sh +gofmt -s -w . && go test ./... +``` + +Additionally, you may want to [install](https://golangci-lint.run/usage/install/#local-installation) and run the linter. + # How to report a bug ### Security disclosures diff --git a/bio/example_test.go b/bio/example_test.go index 5dc43638..d125c095 100644 --- a/bio/example_test.go +++ b/bio/example_test.go @@ -253,7 +253,7 @@ ORIGIN records, _ := parser.Parse() fmt.Println(records[0].Features[2].Attributes["translation"]) - // Output: MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICSDAA + // Output: [MTMITPSLHACRSTLEDPRVPSSNSLAVVLQRRDWENPGVTQLNRLAAHPPFASWRNSEEARTDRPSQQLRSLNGEWRLMRYFLLTHLCGISHRIWCTLSTICSDAA] } func ExampleNewSlow5Parser() { diff --git a/bio/fasta/fasta.go b/bio/fasta/fasta.go index 28e72305..416cbf36 100644 --- a/bio/fasta/fasta.go +++ b/bio/fasta/fasta.go @@ -26,15 +26,15 @@ Fasta Parser begins here Many thanks to Jordan Campbell (https://github.com/0x106) for building the first parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles) for helping complete that PR. 
This work expands on the previous work by allowing -for concurrent parsing and giving Poly a specific parser subpackage, +for concurrent parsing and giving Poly a specific parser subpackage, as well as few bug fixes. Fasta is a very simple file format for working with DNA, RNA, or protein sequences. It was first released in 1985 and is still widely used in bioinformatics. -https://en.wikipedia.org/wiki/_format +https://en.wikipedia.org/wiki/FASTA_format -One interesting use of the concurrent parser is working with the Uniprot +One interesting use of the concurrent parser is working with the Uniprot fasta dump files, which are far too large to fit into RAM. This parser is able to easily handle those files by doing computation actively while the data dump is getting parsed. diff --git a/bio/genbank/data/NC_001141.2_redux.gb b/bio/genbank/data/NC_001141.2_redux.gb new file mode 100644 index 00000000..6a45765c --- /dev/null +++ b/bio/genbank/data/NC_001141.2_redux.gb @@ -0,0 +1,147 @@ +LOCUS NC_001141 439888 bp DNA linear CON 15-SEP-2023 +DEFINITION Saccharomyces cerevisiae S288C chromosome IX, complete sequence. +ACCESSION NC_001141 +VERSION NC_001141.2 +DBLINK BioProject: PRJNA128 + Assembly: GCF_000146045.2 +KEYWORDS RefSeq. +SOURCE Saccharomyces cerevisiae S288C + ORGANISM Saccharomyces cerevisiae S288C + Eukaryota; Fungi; Dikarya; Ascomycota; Saccharomycotina; + Saccharomycetes; Saccharomycetales; Saccharomycetaceae; + Saccharomyces. +REFERENCE 1 (bases 1 to 439888) + AUTHORS Engel,S.R., Wong,E.D., Nash,R.S., Aleksander,S., Alexander,M., + Douglass,E., Karra,K., Miyasato,S.R., Simison,M., Skrzypek,M.S., + Weng,S. and Cherry,J.M. 
+ TITLE New data and collaborations at the Saccharomyces Genome Database: + updated reference genome, alleles, and the Alliance of Genome + Resources + JOURNAL Genetics 220 (4) (2022) + PUBMED 34897464 +REFERENCE 2 (bases 1 to 439888) + AUTHORS Churcher,C., Bowman,S., Badcock,K., Bankier,A., Brown,D., + Chillingworth,T., Connor,R., Devlin,K., Gentles,S., Hamlin,N., + Harris,D., Horsnell,T., Hunt,S., Jagels,K., Jones,M., Lye,G., + Moule,S., Odell,C., Pearson,D., Rajandream,M., Rice,P., Rowley,N., + Skelton,J., Smith,V., Barrell,B. et al. + TITLE The nucleotide sequence of Saccharomyces cerevisiae chromosome IX + JOURNAL Nature 387 (6632 SUPPL), 84-87 (1997) + PUBMED 9169870 +REFERENCE 3 (bases 1 to 439888) + AUTHORS Goffeau,A., Barrell,B.G., Bussey,H., Davis,R.W., Dujon,B., + Feldmann,H., Galibert,F., Hoheisel,J.D., Jacq,C., Johnston,M., + Louis,E.J., Mewes,H.W., Murakami,Y., Philippsen,P., Tettelin,H. and + Oliver,S.G. + TITLE Life with 6000 genes + JOURNAL Science 274 (5287), 546 (1996) + PUBMED 8849441 +REFERENCE 4 (bases 1 to 439888) + CONSRTM NCBI Genome Project + TITLE Direct Submission + JOURNAL Submitted (14-SEP-2023) National Center for Biotechnology + Information, NIH, Bethesda, MD 20894, USA +REFERENCE 5 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (04-MAY-2012) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA + REMARK Protein update by submitter +REFERENCE 6 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (31-MAR-2011) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA + REMARK Sequence update by submitter +REFERENCE 7 (bases 1 to 439888) + CONSRTM Saccharomyces Genome Database + TITLE Direct Submission + JOURNAL Submitted (14-DEC-2009) Department of Genetics, Stanford + University, Stanford, CA 94305-5120, USA +COMMENT REVIEWED REFSEQ: This record has been curated by SGD. 
The reference + sequence is identical to BK006942. + + On Apr 26, 2011 this sequence version replaced NC_001141.1. + + ##Genome-Annotation-Data-START## + Annotation Provider :: SGD + Annotation Status :: Full Annotation + Annotation Version :: R64-4-1 + URL :: http://www.yeastgenome.org/ + ##Genome-Annotation-Data-END## + COMPLETENESS: full length. +FEATURES Location/Qualifiers + source 1..439888 + /organism="Saccharomyces cerevisiae S288C" + /mol_type="genomic DNA" + /strain="S288C" + /db_xref="taxon:559292" + /chromosome="IX" + telomere complement(1..7784) + /note="TEL09L; Telomeric region on the left arm of + Chromosome IX; composed of an X element core sequence, X + element combinatorial repeats, a long Y' element, and a + short terminal stretch of telomeric repeats" + /db_xref="SGD:S000028896" + gene complement(<483..>6147) + /locus_tag="YIL177C" + /db_xref="GeneID:854630" + mRNA complement(join(<483..4598,4987..>6147)) + /locus_tag="YIL177C" + /product="Y' element ATP-dependent helicase" + /transcript_id="NM_001179522.1" + /db_xref="GeneID:854630" + CDS complement(join(483..4598,4987..6147)) + /locus_tag="YIL177C" + /EC_number="3.6.4.12" + /note="Putative Y' element ATP-dependent helicase" + /codon_start=1 + /product="Y' element ATP-dependent helicase" + /protein_id="NP_012092.1" + /db_xref="GeneID:854630" + /db_xref="SGD:S000001439" + /translation="MKVSDRRKFEKANFDEFESALNNKNDLVHCPSITLFESIPTEVR + SFYEDEKSGLIKVVKFRTGAMDRKRSFEKVVISVMVGKNVKKFLTFVEDEPDFQGGPI + PSKYLIPKKINLMVYTLFQVHTLKFNRKDYDTLSLFYLNRGYYNELSFRVLERCHEIA + SARPNDSSTMRTFTDFVSGAPIVRSLQKSTIRKYGYNLAPYMFLLLHVDELSIFSAYQ + ASLPGEKKVDTERLKRDLCPRKPIEIKYFSQICNDMMNKKDRLGDILHIILRACALNF + GAGPRGGAGDEEDRSITNEEPIIPSVDEHGLKVCKLRSPNTPRRLRKTLDAVKALLVS + SCACTARDLDIFDDNNGVAMWKWIKILYHEVAQETTLKDSYRITLVPSSDGISLLAFA + GPQRNVYVDDTTRRIQLYTDYNKNGSSEPRLKTLDGLTSDYVFYFVTVLRQMQICALG + NSYDAFNHDPWMDVVGFEDPNQVTNRDISRIVLYSYMFLNTAKGCLVEYATFRQYMRE + LPKNAPQKLNFREMRQGLIALGRHCVGSRFETDLYESATSELMANHSVQTGRNIYGVD + 
SFSLTSVSGTTATLLQERASERWIQWLGLESDYHCSFSSTRNAEDVVAGEAASSNHHQ + KISRVTRKRPREPKSTNDILVAGQKLFGSSFEFRDLHQLRLCYEIYMADTPSVAVQAP + PGYGKTELFHLPLIALASKGDVEYVSFLFVPYTVLLANCMIRLGRCGCLNVAPVRNFI + EEGYDGVTDLYVGIYDDLASTNFTDRIAAWENIVECTFRTNNVKLGYLIVDEFHNFET + EVYRQSQFGGITNLDFDAFEKAIFLSGTAPEAVADAALQRIGLTGLAKKSMDINELKR + SEDLSRGLSSYPTRMFNLIKEKSEVPLGHVHKIRKKVESQPEEALKLLLALFESEPES + KAIVVASTTNEVEELACSWRKYFRVVWIHGKLGAAEKVSRTKEFVTDGSMQVLIGTKL + VTEGIDIKQLMMVIMLDNRLNIIELIQGVGRLRDGGLCYLLSRKNSWAARNRKGELPP + IKEGCITEQVREFYGLESKKGKKGQHVGCCGSRTDLSADTVELIERMDRLAEKQATAS + MSIVALPSSFQESNSSDRYRKYCSSDEDSNTCIHGSANASTNASTNAITTASTNVRTN + ATTNASTNATTNASTNASTNATTNASTNATTNSSTNATTTASTNVRTSATTTASINVR + TSATTTESTNSSTNATTTESTNSSTNATTTESTNSNTSATTTASINVRTSATTTESTN + SSTSATTTASINVRTSATTTKSINSSTNATTTESTNSNTNATTTESTNSSTNATTTES + TNSSTNATTTESTNSNTSAATTESTNSNTSATTTESTNASAKEDANKDGNAEDNRFHP + VTDINKESYKRKGSQMVLLERKKLKAQFPNTSENMNVLQFLGFRSDEIKHLFLYGIDI + YFCPEGVFTQYGLCKGCQKMFELCVCWAGQKVSYRRIAWEALAVERMLRNDEEYKEYL + EDIEPYHGDPVGYLKYFSVKRREIYSQIQRNYAWYLAITRRRETISVLDSTRGKQGSQ + VFRMSGRQIKELYFKVWSNLRESKTEVLQYFLNWDEKKCQEEWEAKDDTVVVEALEKG + GVFQRLRSMTSAGLQGPQYVKLQFSRHHRQLRSRYELSLGMHLRDQIALGVTPSKVPH + WTAFLSMLIGLFYNKTFRQKLEYLLEQISEVWLLPHWLDLANVEVLAADDTRVPLYML + MVAVHKELDSDDVPDGRFDILLCRDSSREVGE" + rep_origin 7470..8793 + /note="ARS902; Putative replication origin; identified in + multiple array studies, not yet confirmed by plasmid-based + assay" + /db_xref="SGD:S000130156" + mRNA join(<155222,155311..>155765) + /gene="COX5B" + /locus_tag="YIL111W" + /product="cytochrome c oxidase subunit Vb" + /transcript_id="NM_001179459.1" + /db_xref="GeneID:854695" +CONTIG join(BK006942.2:1..439888) +// + diff --git a/data/benchling.gb b/bio/genbank/data/benchling.gb similarity index 100% rename from data/benchling.gb rename to bio/genbank/data/benchling.gb diff --git a/data/bsub.gbk b/bio/genbank/data/bsub.gbk similarity index 100% rename from data/bsub.gbk rename to bio/genbank/data/bsub.gbk diff --git a/data/flatGbk_test.seq 
b/bio/genbank/data/flatGbk_test.seq similarity index 100% rename from data/flatGbk_test.seq rename to bio/genbank/data/flatGbk_test.seq diff --git a/data/flatGbk_test.seq.gz b/bio/genbank/data/flatGbk_test.seq.gz similarity index 100% rename from data/flatGbk_test.seq.gz rename to bio/genbank/data/flatGbk_test.seq.gz diff --git a/data/long_comment.seq b/bio/genbank/data/long_comment.seq similarity index 100% rename from data/long_comment.seq rename to bio/genbank/data/long_comment.seq diff --git a/data/malformed_read_test.gbk b/bio/genbank/data/malformed_read_test.gbk similarity index 100% rename from data/malformed_read_test.gbk rename to bio/genbank/data/malformed_read_test.gbk diff --git a/data/multiGbk_test.seq b/bio/genbank/data/multiGbk_test.seq similarity index 100% rename from data/multiGbk_test.seq rename to bio/genbank/data/multiGbk_test.seq diff --git a/data/phix174.gb b/bio/genbank/data/phix174.gb similarity index 100% rename from data/phix174.gb rename to bio/genbank/data/phix174.gb diff --git a/data/pichia_chr1_head.gb b/bio/genbank/data/pichia_chr1_head.gb similarity index 100% rename from data/pichia_chr1_head.gb rename to bio/genbank/data/pichia_chr1_head.gb diff --git a/data/puc19.gbk b/bio/genbank/data/puc19.gbk similarity index 100% rename from data/puc19.gbk rename to bio/genbank/data/puc19.gbk diff --git a/data/puc19_303_regression.gbk b/bio/genbank/data/puc19_303_regression.gbk similarity index 100% rename from data/puc19_303_regression.gbk rename to bio/genbank/data/puc19_303_regression.gbk diff --git a/data/puc19_consrtm.gbk b/bio/genbank/data/puc19_consrtm.gbk similarity index 100% rename from data/puc19_consrtm.gbk rename to bio/genbank/data/puc19_consrtm.gbk diff --git a/data/puc19_snapgene.gb b/bio/genbank/data/puc19_snapgene.gb similarity index 100% rename from data/puc19_snapgene.gb rename to bio/genbank/data/puc19_snapgene.gb diff --git a/data/sample.gbk b/bio/genbank/data/sample.gbk similarity index 100% rename from data/sample.gbk 
rename to bio/genbank/data/sample.gbk diff --git a/data/t4_intron.gb b/bio/genbank/data/t4_intron.gb similarity index 100% rename from data/t4_intron.gb rename to bio/genbank/data/t4_intron.gb diff --git a/bio/genbank/genbank.go b/bio/genbank/genbank.go index 55b3d61e..f2152b23 100644 --- a/bio/genbank/genbank.go +++ b/bio/genbank/genbank.go @@ -66,14 +66,14 @@ type Meta struct { // Feature holds the information for a feature in a Genbank file and other annotated sequence files. type Feature struct { - Type string `json:"type"` - Description string `json:"description"` - Attributes map[string]string `json:"attributes"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - Sequence string `json:"sequence"` - Location Location `json:"location"` - ParentSequence *Genbank `json:"-"` + Type string `json:"type"` + Description string `json:"description"` + Attributes map[string][]string `json:"attributes"` + SequenceHash string `json:"sequence_hash"` + SequenceHashFunction string `json:"hash_function"` + Sequence string `json:"sequence"` + Location Location `json:"location"` + ParentSequence *Genbank `json:"-"` } // Reference holds information for one reference in a Meta struct. @@ -125,7 +125,25 @@ var ( sequenceRegex = regexp.MustCompile("[^a-zA-Z]+") ) +// StoreFeatureSequences calls StoreSequence on all features. +// The resulting JSON is guaranteed to have useful Feature.Sequence values. +// Useful when exporting for downstream analysis, such as with json.Marshal. +func (sequence *Genbank) StoreFeatureSequences() error { + for i := range sequence.Features { + _, err := sequence.Features[i].StoreSequence() + if err != nil { + return err + } + } + return nil +} + // AddFeature adds a feature to a Genbank struct. +// NOTE: This method assumes feature is not referenced in another location +// as this only creates a shallow copy. 
+// If you intend to duplicate a feature from another Genbank and plan +// to modify in either location, it is recommended you first call feature.Copy() +// before passing as input to save yourself trouble. func (sequence *Genbank) AddFeature(feature *Feature) error { feature.ParentSequence = sequence sequence.Features = append(sequence.Features, *feature) @@ -137,27 +155,58 @@ func (feature Feature) GetSequence() (string, error) { return getFeatureSequence(feature, feature.Location) } +// StoreSequence infers and assigns the value of feature.Sequence +// if currently an empty string. +func (feature *Feature) StoreSequence() (string, error) { + if feature.Sequence != "" { + return feature.Sequence, nil + } + seq, err := getFeatureSequence(*feature, feature.Location) + if err == nil { + feature.Sequence = seq + } + return seq, err +} + +// Copy creates deep copy of Feature, which supports safe duplication. +func (feature *Feature) Copy() Feature { + copy := *feature + copy.Location = CopyLocation(feature.Location) + copy.Attributes = NewMultiMap[string, string]() + ForEachKey(feature.Attributes, func(k string, v []string) { + copy.Attributes[k] = MapSlice(v, identity[string]) + }) + return copy +} + +// CopyLocation creates deep copy of Location, which supports safe duplication +func CopyLocation(location Location) Location { + location.SubLocations = MapSlice(location.SubLocations, CopyLocation) + return location +} + // getFeatureSequence takes a feature and location object and returns a sequence string. 
func getFeatureSequence(feature Feature, location Location) (string, error) { var sequenceBuffer bytes.Buffer - var sequenceString string parentSequence := feature.ParentSequence.Sequence if len(location.SubLocations) == 0 { sequenceBuffer.WriteString(parentSequence[location.Start:location.End]) } else { for _, subLocation := range location.SubLocations { - sequence, _ := getFeatureSequence(feature, subLocation) + sequence, err := getFeatureSequence(feature, subLocation) + if err != nil { + return "", err + } sequenceBuffer.WriteString(sequence) } } // reverse complements resulting string if needed. + sequenceString := sequenceBuffer.String() if location.Complement { - sequenceString = transform.ReverseComplement(sequenceBuffer.String()) - } else { - sequenceString = sequenceBuffer.String() + sequenceString = transform.ReverseComplement(sequenceString) } return sequenceString, nil @@ -422,6 +471,44 @@ func (sequence *Genbank) WriteTo(w io.Writer) (int64, error) { return writtenBytes, nil } +// ParseError represents failures encountered while parsing, +// and pointers to it are fully compatible with the error interface.
+type ParseError struct { + file string // the file origin + line string // the offending line + before bool // whether the error occurred before or on this line + lineNo int // the line number, 0 indexed + info string `default:"syntax error"` // description of the error type + wraps error // stores the error that led to this, if any +} + +func (e ParseError) Error() string { + var out, loc string + if e.wraps == io.EOF { + out = "unexpected EOF" + if e.file != "" { + return fmt.Sprintf("%s in %s", out, e.file) + } else { + return out + } + } + if e.file == "" { + loc = fmt.Sprintf("line %d", e.lineNo) + } else { + loc = fmt.Sprintf("%s:%d", e.file, e.lineNo) + } + if e.before { + out = fmt.Sprintf("%s encountered before %s", e.info, loc) + } else { + out = fmt.Sprintf("%s encountered on %s: %s", e.info, loc, e.line) + } + if e.wraps != nil { + out = fmt.Sprintf("%s\nfrom %v", out, e.wraps) + } + return out +} + +// defines state for the parser, and utility methods to modify type parseLoopParameters struct { newLocation bool attribute string @@ -443,12 +530,31 @@ type parseLoopParameters struct { // method to init loop parameters func (params *parseLoopParameters) init() { params.newLocation = true - params.feature.Attributes = make(map[string]string) + params.feature.Attributes = NewMultiMap[string, string]() params.parseStep = "metadata" params.genbankStarted = false params.genbank.Meta.Other = make(map[string]string) } +// save our completed attribute / qualifier string to the current feature +// useful as a wrap-up step from multiple states +func (params *parseLoopParameters) saveLastAttribute() { + newValue := params.attributeValue != "" + emptyType := params.feature.Type != "" + if newValue || emptyType { + if newValue { + Put(params.feature.Attributes, params.attribute, params.attributeValue) + } + params.features = append(params.features, params.feature) + + // reset attribute state + params.attributeValue = "" + params.attribute = "" + params.feature = 
Feature{} + params.feature.Attributes = NewMultiMap[string, string]() + } +} + // Header is a blank struct, needed for compatibility with bio parsers. It contains nothing. type Header struct{} @@ -506,11 +612,12 @@ func (parser *Parser) Next() (*Genbank, error) { continue } + // define parser state machine switch parser.parameters.parseStep { case "metadata": // Handle empty lines if len(line) == 0 { - return &Genbank{}, fmt.Errorf("Empty metadata line on line %d", lineNum) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected empty metadata"} } // If we are currently reading a line, we need to figure out if it is a new meta line. @@ -532,7 +639,7 @@ func (parser *Parser) Next() (*Genbank, error) { // parseReferencesFn = parseReferences in genbank_test. We use Fn for testing purposes. reference, err := parseReferencesFn(parser.parameters.metadataData) if err != nil { - return &Genbank{}, fmt.Errorf("Failed in parsing reference above line %d. Got error: %s", lineNum, err) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, before: true, wraps: err, info: "failed in parsing reference"} } parser.parameters.genbank.Meta.References = append(parser.parameters.genbank.Meta.References, reference) @@ -564,7 +671,7 @@ func (parser *Parser) Next() (*Genbank, error) { for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields count, err := strconv.Atoi(fields[countIndex]) if err != nil { - return &Genbank{}, err + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid base count"} } baseCount := BaseCount{ @@ -577,66 +684,51 @@ func (parser *Parser) Next() (*Genbank, error) { } // Switch to sequence parsing originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN" - if originFlag { + contigFlag := strings.Contains(line, "CONTIG") + if originFlag || contigFlag { 
parser.parameters.parseStep = "sequence" - // save our completed attribute / qualifier string to the current feature - if parser.parameters.attributeValue != "" { - parser.parameters.feature.Attributes[parser.parameters.attribute] = parser.parameters.attributeValue - parser.parameters.features = append(parser.parameters.features, parser.parameters.feature) - parser.parameters.attributeValue = "" - parser.parameters.attribute = "" - parser.parameters.feature = Feature{} - parser.parameters.feature.Attributes = make(map[string]string) - } else { - parser.parameters.features = append(parser.parameters.features, parser.parameters.feature) - } + parser.parameters.saveLastAttribute() // add our features to the genbank for _, feature := range parser.parameters.features { + // TODO: parse location when line is read, or track line number so error is localized location, err := parseLocation(feature.Location.GbkLocationString) if err != nil { - return &Genbank{}, err + return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "invalid feature location"} } feature.Location = location err = parser.parameters.genbank.AddFeature(&feature) if err != nil { - return &Genbank{}, err + return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "problem adding feature"} } } + + if contigFlag { + parser.parameters.genbank.Meta.Other["CONTIG"] = parseMetadata(splitLine[1:]) + } continue - } // end sequence parsing flag logic + } // check if current line contains anything but whitespace trimmedLine := strings.TrimSpace(line) - if len(trimmedLine) < 1 { + if len(trimmedLine) == 0 { continue } + indent := countLeadingSpaces(parser.parameters.currentLine) // determine if current line is a new top level feature - if countLeadingSpaces(parser.parameters.currentLine) < countLeadingSpaces(parser.parameters.prevline) || parser.parameters.prevline == "FEATURES" { - // save our completed attribute / qualifier string to the current 
feature - if parser.parameters.attributeValue != "" { - parser.parameters.feature.Attributes[parser.parameters.attribute] = parser.parameters.attributeValue - parser.parameters.features = append(parser.parameters.features, parser.parameters.feature) - parser.parameters.attributeValue = "" - parser.parameters.attribute = "" - parser.parameters.feature = Feature{} - parser.parameters.feature.Attributes = make(map[string]string) - } - - // } - // checks for empty types - if parser.parameters.feature.Type != "" { - parser.parameters.features = append(parser.parameters.features, parser.parameters.feature) - } + if indent == 0 { + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected metadata when parsing feature"} + } else if indent < countLeadingSpaces(parser.parameters.prevline) || parser.parameters.prevline == "FEATURES" { + parser.parameters.saveLastAttribute() parser.parameters.feature = Feature{} - parser.parameters.feature.Attributes = make(map[string]string) + parser.parameters.feature.Attributes = NewMultiMap[string, string]() // An initial feature line looks like this: `source 1..2686` with a type separated by its location if len(splitLine) < 2 { - return &Genbank{}, fmt.Errorf("Feature line malformed on line %d. 
Got line: %s", lineNum, line) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "malformed feature"} } parser.parameters.feature.Type = strings.TrimSpace(splitLine[0]) parser.parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) @@ -659,7 +751,7 @@ func (parser *Parser) Next() (*Genbank, error) { } // save our completed attribute / qualifier string to the current feature if parser.parameters.attributeValue != "" || parser.parameters.emptyAttribute { - parser.parameters.feature.Attributes[parser.parameters.attribute] = parser.parameters.attributeValue + Put(parser.parameters.feature.Attributes, parser.parameters.attribute, parser.parameters.attributeValue) parser.parameters.emptyAttribute = false } parser.parameters.attributeValue = "" @@ -678,11 +770,13 @@ func (parser *Parser) Next() (*Genbank, error) { } parser.parameters.attributeValue = removeAttributeValueQuotes parser.parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier + } else { + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} } case "sequence": if len(line) < 2 { // throw error if line is malformed - return &Genbank{}, fmt.Errorf("Too short line found while parsing genbank sequence on line %d. 
Got line: %s", lineNum, line) + return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "too short line found while parsing genbank sequence"} } else if line[0:2] == "//" { // end of sequence parser.parameters.genbank.Sequence = parser.parameters.sequenceBuilder.String() @@ -904,7 +998,7 @@ func parseLocation(locationString string) (Location, error) { location.GbkLocationString = locationString if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x - position, err := strconv.Atoi(locationString) + position, err := strconv.Atoi(partialRegex.ReplaceAllString(locationString, "")) if err != nil { return Location{}, err } @@ -1060,13 +1154,10 @@ func BuildFeatureString(feature Feature) string { featureHeader := generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n" returnString := featureHeader - qualifierKeys := make([]string, 0, len(feature.Attributes)) - for key := range feature.Attributes { - qualifierKeys = append(qualifierKeys, key) - } - - for _, qualifier := range qualifierKeys { - returnString += generateWhiteSpace(qualifierIndex) + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" + if feature.Attributes != nil { + ForEachValue(feature.Attributes, func(key string, value string) { + returnString += generateWhiteSpace(qualifierIndex) + "/" + key + "=\"" + value + "\"\n" + }) } return returnString } @@ -1088,12 +1179,10 @@ GBK specific IO related things end here. ******************************************************************************/ /****************************************************************************** - Old functions for testing here. We used to have integrated read files, before we had the generic parser interface. These are still here because of the need to switch over our tests. 
- ******************************************************************************/ // read reads a GBK file from path and returns a Genbank struct. @@ -1118,21 +1207,28 @@ func readMultiNth(path string, count int) ([]Genbank, error) { return []Genbank{}, err } - sequence, err := parseMultiNthFn(file, count) - if err != nil { - return []Genbank{}, err + sequence, perr := parseMultiNthFn(file, count) + if perr != nil { + perr.file = path + return []Genbank{}, perr } return sequence, nil } -func parseMultiNth(r io.Reader, count int) ([]Genbank, error) { +func parseMultiNth(r io.Reader, count int) ([]Genbank, *ParseError) { parser := NewParser(r, bufio.MaxScanTokenSize) var genbanks []Genbank for i := 0; i < count; i++ { gb, err := parser.Next() if err != nil { - return genbanks, err + var perr *ParseError + if err == io.EOF { + perr = &ParseError{wraps: io.EOF} + } else if err != nil { + perr = err.(*ParseError) + } + return genbanks, perr } genbanks = append(genbanks, *gb) } diff --git a/bio/genbank/genbank_test.go b/bio/genbank/genbank_test.go index c74ed9ad..8e148c5c 100644 --- a/bio/genbank/genbank_test.go +++ b/bio/genbank/genbank_test.go @@ -25,13 +25,13 @@ Gbk/gb/genbank related benchmarks begin here. 
******************************************************************************/ var singleGbkPaths = []string{ - "../../data/t4_intron.gb", - "../../data/puc19.gbk", - "../../data/puc19_snapgene.gb", - "../../data/benchling.gb", - "../../data/phix174.gb", - "../../data/sample.gbk", - // "../../data/pichia_chr1_head.gb", + "data/t4_intron.gb", + "data/puc19.gbk", + "data/puc19_snapgene.gb", + "data/benchling.gb", + "data/phix174.gb", + "data/sample.gbk", + // "data/pichia_chr1_head.gb", } func TestGbkIO(t *testing.T) { @@ -56,7 +56,7 @@ func TestGbkIO(t *testing.T) { } func TestMultiLineFeatureParse(t *testing.T) { - pichia, _ := read("../../data/pichia_chr1_head.gb") + pichia, _ := read("data/pichia_chr1_head.gb") var multilineOutput string for _, feature := range pichia.Features { multilineOutput = feature.Location.GbkLocationString @@ -75,7 +75,7 @@ func TestMultiGenbankIO(t *testing.T) { defer os.RemoveAll(tmpDataDir) // Test multiline Genbank features - gbkPath := "../../data/multiGbk_test.seq" + gbkPath := "data/multiGbk_test.seq" multiGbk, _ := readMulti(gbkPath) tmpGbkFilePath := filepath.Join(tmpDataDir, filepath.Base(gbkPath)) _ = writeMulti(multiGbk, tmpGbkFilePath) @@ -94,7 +94,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { } defer os.RemoveAll(tmpDataDir) - scrubbedGbk, err := read("../../data/sample.gbk") + scrubbedGbk, err := read("data/sample.gbk") if err != nil { t.Error(err) } @@ -107,7 +107,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { tmpGbkFilePath := filepath.Join(tmpDataDir, "sample.gbk") _ = write(scrubbedGbk, tmpGbkFilePath) - testInputGbk, _ := read("../../data/sample.gbk") + testInputGbk, _ := read("data/sample.gbk") testOutputGbk, _ := read(tmpGbkFilePath) if diff := cmp.Diff(testInputGbk, testOutputGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { @@ -122,7 +122,7 @@ func TestGbLocationStringBuilder(t *testing.T) { } defer os.RemoveAll(tmpDataDir) - scrubbedGb, _ := 
read("../../data/t4_intron.gb") + scrubbedGb, _ := read("data/t4_intron.gb") // removing gbkLocationString from features to allow testing for gbkLocationBuilder for featureIndex := range scrubbedGb.Features { @@ -132,7 +132,7 @@ func TestGbLocationStringBuilder(t *testing.T) { tmpGbFilePath := filepath.Join(tmpDataDir, "t4_intron_test.gb") _ = write(scrubbedGb, tmpGbFilePath) - testInputGb, _ := read("../../data/t4_intron.gb") + testInputGb, _ := read("data/t4_intron.gb") testOutputGb, _ := read(tmpGbFilePath) if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { @@ -141,14 +141,14 @@ func TestGbLocationStringBuilder(t *testing.T) { } func TestPartialLocationParseRegression(t *testing.T) { - gbk, _ := read("../../data/sample.gbk") + gbk, _ := read("data/sample.gbk") for _, feature := range gbk.Features { if feature.Location.GbkLocationString == "687..3158>" && (feature.Location.Start != 686 || feature.Location.End != 3158) { t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with read()") } } - gbk, err := read("../../data/sample.gbk") + gbk, err := read("data/sample.gbk") if err != nil { t.Errorf("Failed to read sample.gbk. Got err: %s", err) } @@ -188,7 +188,7 @@ func TestSubLocationStringParseRegression(t *testing.T) { } func TestSnapgeneGenbankRegression(t *testing.T) { - snapgene, err := read("../../data/puc19_snapgene.gb") + snapgene, err := read("data/puc19_snapgene.gb") if snapgene.Sequence == "" { t.Errorf("Parsing snapgene returned an empty string. 
Got error: %s", err) @@ -196,7 +196,7 @@ func TestSnapgeneGenbankRegression(t *testing.T) { } func TestGetSequenceMethod(t *testing.T) { - gbk, _ := read("../../data/t4_intron.gb") + gbk, _ := read("data/t4_intron.gb") // Check to see if GetSequence method works on Features struct feature, _ := gbk.Features[1].GetSequence() @@ -207,7 +207,7 @@ func TestGetSequenceMethod(t *testing.T) { } func TestLocationParser(t *testing.T) { - gbk, _ := read("../../data/t4_intron.gb") + gbk, _ := read("data/t4_intron.gb") // read 1..243 feature, _ := gbk.Features[1].GetSequence() @@ -250,11 +250,11 @@ func TestLocationParser(t *testing.T) { } func TestGenbankNewlineParsingRegression(t *testing.T) { - gbk, _ := read("../../data/puc19.gbk") + gbk, _ := read("data/puc19.gbk") for _, feature := range gbk.Features { if feature.Location.Start == 410 && feature.Location.End == 1750 && feature.Type == "CDS" { - if feature.Attributes["product"] != "chromosomal replication initiator informational ATPase" { + if feature.Attributes["product"][0] != "chromosomal replication initiator informational ATPase" { t.Errorf("Newline parsing has failed.") } break @@ -264,7 +264,7 @@ func TestGenbankNewlineParsingRegression(t *testing.T) { func BenchmarkRead(b *testing.B) { for i := 0; i < b.N; i++ { - _, _ = read("../../data/bsub.gbk") + _, _ = read("data/bsub.gbk") } } @@ -281,7 +281,7 @@ Gbk/gb/genbank related benchmarks end here. 
******************************************************************************/ func TestBenchlingGenbank(t *testing.T) { - sequence, _ := read("../../data/benchling.gb") + sequence, _ := read("data/benchling.gb") if len(sequence.Features) != 17 { t.Errorf("Parsing benchling genbank file not returned the correct quantity of features") @@ -531,7 +531,7 @@ func TestRead(t *testing.T) { { name: "error on malformed file", args: args{ - path: "../../data/malformed_read_test.gbk", + path: "data/malformed_read_test.gbk", }, wantErr: true, }, @@ -689,23 +689,22 @@ func TestBuildFeatureString(t *testing.T) { } func TestParse_error(t *testing.T) { - parseMultiErr := io.EOF + parseMultiErr := &ParseError{wraps: io.EOF} oldParseMultiNthFn := parseMultiNthFn - parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, error) { + parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, *ParseError) { return nil, parseMultiErr } defer func() { parseMultiNthFn = oldParseMultiNthFn }() _, err := parse(strings.NewReader("")) - assert.EqualError(t, err, parseMultiErr.Error()) + assert.Equal(t, err, parseMultiErr.wraps) _, err = parseMultiNth(strings.NewReader(""), 10000) - assert.EqualError(t, err, parseMultiErr.Error()) + assert.Equal(t, err, parseMultiErr) } func TestParseReferences_error(t *testing.T) { - parseReferencesErr := errors.New("Failed in parsing reference above line 13. 
Got error: ") oldParseReferencesFn := parseReferencesFn parseReferencesFn = func(metadataData []string) (Reference, error) { return Reference{}, errors.New("") @@ -713,21 +712,21 @@ func TestParseReferences_error(t *testing.T) { defer func() { parseReferencesFn = oldParseReferencesFn }() - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open("data/puc19.gbk") _, err := parseMultiNthFn(file, 1) - assert.EqualError(t, err, parseReferencesErr.Error()) + assert.Equal(t, err.info, "failed in parsing reference") } func TestIssue303Regression(t *testing.T) { - seq, _ := read("../../data/puc19_303_regression.gbk") - expectedAttribute := "16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase" + seq, _ := read("data/puc19_303_regression.gbk") + expectedAttribute := []string{"16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase"} for _, feature := range seq.Features { - if feature.Attributes["locus_tag"] == "JCVISYN3A_0004" && feature.Type == "CDS" { - if feature.Attributes["product"] != expectedAttribute { + if cmp.Equal(feature.Attributes["locus_tag"], []string{"JCVISYN3A_0004"}) && feature.Type == "CDS" { + if !cmp.Equal(feature.Attributes["product"], expectedAttribute) { t.Errorf("Failed to get proper expected attribute. Got: %s Expected: %s", feature.Attributes["product"], expectedAttribute) } } - if feature.Attributes["locus_tag"] == "JCVISYN3A_0051" && feature.Type == "CDS" { + if cmp.Equal(feature.Attributes["locus_tag"], []string{"JCVISYN3A_0051"}) && feature.Type == "CDS" { if _, ok := feature.Attributes["pseudo"]; !ok { t.Errorf("pseudo should be in attributes") } @@ -736,7 +735,7 @@ func TestIssue303Regression(t *testing.T) { } func TestConsortiumRegression(t *testing.T) { - _, err := read("../../data/puc19_consrtm.gbk") + _, err := read("data/puc19_consrtm.gbk") if err != nil { t.Errorf("Failed to read consrtm. 
Got err: %s", err) } diff --git a/bio/genbank/multimap.go b/bio/genbank/multimap.go new file mode 100644 index 00000000..3f067272 --- /dev/null +++ b/bio/genbank/multimap.go @@ -0,0 +1,65 @@ +/* +This file provides utilities for working with a MultiMap, +which is simply a map which can store multiple values for a single key instead +of the usual one. + +Useful for when we expect to encounter repeated keys but we want to store all pairs, +not just the last-inserted one. This implementation has the advantage of being compatible with +json.Marshal, cmp.Diff, pretty printing, and bracket indexing out of the box. +Does not make uniqueness quarantees for key value pairs. +Currently only used in genbank.Feature.Attributes, however his may end up +being useful for other parsers which allow for repeated keys, in which case +this should be made into its own module. +*/ +package genbank + +// MultiMap is defined as a simple type alias over a map of slices. +type MultiMap[K, V comparable] map[K][]V + +// NewMultiMap creates a new empty multimap. +func NewMultiMap[K, V comparable]() MultiMap[K, V] { + return make(map[K][]V) +} + +// Put adds a key-value pair to the multimap. +func Put[K, V comparable](m MultiMap[K, V], k K, v ...V) { + if _, ok := m[k]; !ok { + m[k] = v + } else { + m[k] = append(m[k], v...) + } +} + +// ForEachKey iterates over the multimap, once for each key with all values passed as a slice. +// do is a callback that takes the key, values slice for that key +// This exists purely as a convenience function, if your use case would benefit from +// early break/return, it is recommended you do the usual range iteration operator. +func ForEachKey[K, V comparable](m MultiMap[K, V], do func(K, []V)) { + for k, values := range m { + do(k, values) + } +} + +// ForEachValue iterates over the multimap, once for each value +// do is a callback that takes and a key and value. 
+func ForEachValue[K, V comparable](m MultiMap[K, V], do func(K, V)) { + ForEachKey(m, func(k K, values []V) { + for _, v := range values { + do(k, v) + } + }) +} + +// MapSlice efficiently applies a transformation to each element of a slice to create a new slice +func MapSlice[X any, Y any](slice []X, mapper func(X) Y) []Y { + y := make([]Y, len(slice)) + for i, x := range slice { + y[i] = mapper(x) + } + return y +} + +// identity returns its input, useful for using MapSlice to do a shallow copy +func identity[X any](x X) X { + return x +} diff --git a/bio/polyjson/polyjson.go b/bio/polyjson/polyjson.go index 81370480..359370e5 100644 --- a/bio/polyjson/polyjson.go +++ b/bio/polyjson/polyjson.go @@ -13,6 +13,7 @@ import ( "os" "time" + "github.com/TimothyStiles/poly/bio/genbank" "github.com/TimothyStiles/poly/transform" ) @@ -153,6 +154,82 @@ func Write(sequence Poly, path string) error { return os.WriteFile(path, file, 0644) } +// Utilities to convert polyjson objects -> their genbank equivalents +// TODO add convert <- genbank methods, which is currently difficult as most +// genbank Meta values are discarded due to lack of support for wildcard metadata in polyjson. 
+ +func (sequence *Poly) ToGenbank() genbank.Genbank { + gb := genbank.Genbank{ + Meta: sequence.Meta.ToGenbank(), + Features: make([]genbank.Feature, len(sequence.Features)), + Sequence: sequence.Sequence, + } + for i, f := range sequence.Features { + gb.Features[i] = f.ToGenbank() + gb.Features[i].ParentSequence = &gb + } + return gb +} + +func (meta *Meta) ToGenbank() genbank.Meta { + other := make(map[string]string) + if meta.URL != "" { + other["URL"] = meta.URL + } + if meta.CreatedBy != "" { + other["CreatedBy"] = meta.CreatedBy + } + if meta.CreatedWith != "" { + other["CreatedWith"] = meta.CreatedWith + } + other["CreatedOn"] = meta.CreatedOn.String() + if meta.Schema != "" { + other["Schema"] = meta.Schema + } + return genbank.Meta{ + Definition: meta.Description, + Source: meta.CreatedBy, + Origin: meta.CreatedWith, + Name: meta.Name, + SequenceHash: meta.Hash, + Other: other, + } +} + +func (feature *Feature) ToGenbank() genbank.Feature { + attributes := genbank.NewMultiMap[string, string]() + for key, value := range feature.Tags { + genbank.Put(attributes, key, value) + } + genbank.Put(attributes, "Name", feature.Name) + + return genbank.Feature{ + Type: feature.Type, + Description: feature.Description, + Attributes: attributes, + SequenceHash: feature.Hash, + Sequence: feature.Sequence, + Location: feature.Location.ToGenbank(), + } +} + +func (location *Location) ToGenbank() genbank.Location { + loc := genbank.Location{ + Start: location.Start, + End: location.End, + Complement: location.Complement, + Join: location.Join, + FivePrimePartial: location.FivePrimePartial, + ThreePrimePartial: location.ThreePrimePartial, + SubLocations: genbank.MapSlice( + location.SubLocations, + func(s Location) genbank.Location { return s.ToGenbank() }, + ), + } + loc.GbkLocationString = genbank.BuildLocationString(loc) + return loc +} + /****************************************************************************** JSON specific IO related things end here. 
diff --git a/bio/polyjson/polyjson_test.go b/bio/polyjson/polyjson_test.go index eaee25b1..dedd7039 100644 --- a/bio/polyjson/polyjson_test.go +++ b/bio/polyjson/polyjson_test.go @@ -104,3 +104,9 @@ func TestWrite_error(t *testing.T) { err := Write(Poly{}, "/tmp/file") assert.EqualError(t, err, marshalIndentErr.Error()) } + +func TestConvert(t *testing.T) { + sequence, err := Read("../../data/cat.json") + assert.NoError(t, err) + sequence.ToGenbank() +} diff --git a/go.mod b/go.mod index cbe060b1..659c5439 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/mroth/weightedrand v0.4.1 github.com/pmezard/go-difflib v1.0.0 github.com/sergi/go-diff v1.2.0 + github.com/spaolacci/murmur3 v1.1.0 golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 lukechampine.com/blake3 v1.1.5 ) diff --git a/synthesis/codon/codon_test.go b/synthesis/codon/codon_test.go index 067e11a6..0374e799 100644 --- a/synthesis/codon/codon_test.go +++ b/synthesis/codon/codon_test.go @@ -12,6 +12,8 @@ import ( "github.com/stretchr/testify/assert" ) +const puc19path = "../../bio/genbank/data/puc19.gbk" + func TestTranslation(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" gfpDnaSequence := 
"ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" @@ -77,7 +79,7 @@ func TestTranslationLowerCase(t *testing.T) { func TestOptimize(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open("../../bio/genbank/data/puc19.gbk") defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -111,7 +113,7 @@ func TestOptimize(t *testing.T) { func TestOptimizeSameSeed(t *testing.T) { var gfpTranslation = "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -144,7 +146,7 @@ func TestOptimizeSameSeed(t *testing.T) { func TestOptimizeDifferentSeed(t *testing.T) { var gfpTranslation = 
"MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -303,7 +305,7 @@ Codon Compromise + Add related tests begin here. ***************************************************************************** */ func TestCompromiseCodonTable(t *testing.T) { - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -377,7 +379,7 @@ func TestCapitalizationRegression(t *testing.T) { // Tests to make sure that amino acids are capitalized gfpTranslation := "MaSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() diff --git a/synthesis/codon/example_test.go b/synthesis/codon/example_test.go index b794289f..213ce650 100644 --- a/synthesis/codon/example_test.go +++ b/synthesis/codon/example_test.go @@ -9,6 +9,9 @@ import ( "github.com/TimothyStiles/poly/synthesis/codon" ) +const puc19path = "../../bio/genbank/data/puc19.gbk" +const phix174path = "../../bio/genbank/data/phix174.gb" + func ExampleTranslate() { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" gfpDnaSequence := 
"ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" @@ -21,7 +24,7 @@ func ExampleTranslate() { func ExampleOptimize() { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -102,7 +105,7 @@ func ExampleWriteCodonJSON() { } func ExampleCompromiseCodonTable() { - file, _ := os.Open("../../data/puc19.gbk") + file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -125,7 +128,7 @@ func ExampleCompromiseCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codonTable.OptimizeTable(codingRegions) - file2, _ := os.Open("../../data/phix174.gb") + file2, _ := os.Open(phix174path) defer file2.Close() parser2, _ := bio.NewGenbankParser(file2) sequence2, _ := parser2.Next() @@ -160,7 +163,7 @@ func ExampleCompromiseCodonTable() { } func ExampleAddCodonTable() { - file, _ := os.Open("../../data/puc19.gbk") + 
file, _ := os.Open(puc19path) defer file.Close() parser, _ := bio.NewGenbankParser(file) sequence, _ := parser.Next() @@ -183,7 +186,7 @@ func ExampleAddCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codonTable.OptimizeTable(codingRegions) - file2, _ := os.Open("../../data/phix174.gb") + file2, _ := os.Open(phix174path) defer file2.Close() parser2, _ := bio.NewGenbankParser(file2) sequence2, _ := parser2.Next() diff --git a/tutorials/001_input_output_test.go b/tutorials/001_input_output_test.go index 08278295..b0d25200 100644 --- a/tutorials/001_input_output_test.go +++ b/tutorials/001_input_output_test.go @@ -47,6 +47,13 @@ TTFN, Tim ******************************************************************************/ +// Ignore ParentSequence as that's a pointer which can't be serialized. +func CmpOptions() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence"), + } +} + // if you're using VS-CODE you should see a DEBUG TEST button right below this // comment. Please set break points and use it early and often. func TestFileIOTutorial(t *testing.T) { @@ -54,7 +61,7 @@ func TestFileIOTutorial(t *testing.T) { // backbone puc19. Plasmids are super small rings of "Circular DNA" that are // between 1 and 10 kilobases in length. - file, _ := os.Open("../data/puc19.gbk") + file, _ := os.Open("../bio/genbank/data/puc19.gbk") defer file.Close() parser, _ := bio.NewGenbankParser(file) puc19, _ := parser.Next() @@ -175,8 +182,7 @@ func TestFileIOTutorial(t *testing.T) { } // compare our read-in plasmid to the the one we wrote out. - // Ignore ParentSequence as that's a pointer which can't be serialized. 
- if diff := cmp.Diff(puc19, puc19Copy, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, puc19Copy, CmpOptions()...); diff != "" { t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(puc19Path), diff) } @@ -199,7 +205,7 @@ func TestFileIOTutorial(t *testing.T) { if err := json.Unmarshal(jsonContent, &unmarshaledPuc19); err != nil { t.Error(err) } - if diff := cmp.Diff(puc19, &unmarshaledPuc19, []cmp.Option{cmpopts.IgnoreFields(genbank.Feature{}, "ParentSequence")}...); diff != "" { + if diff := cmp.Diff(puc19, &unmarshaledPuc19, CmpOptions()...); diff != "" { t.Errorf("Parsing the JSON does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(puc19Path), diff) }