From 446e0ea323fbe85dd8d5946c3c93b5014c0a6167 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Tue, 13 Aug 2024 13:43:20 -0700 Subject: [PATCH] Errgroup int (#84) * integrate errgroup errgroup doesn't change at all, and so I'm removing it as an external dependency. This makes dnadesign more self-contained, which is a goal. --- README.md | 3 + lib/bio/bio.go | 2 +- lib/bio/errgroup/LICENSE | 28 ++ lib/bio/errgroup/README.md | 3 + lib/bio/errgroup/errgroup.go | 135 +++++++++ .../errgroup/errgroup_example_md5all_test.go | 100 +++++++ lib/bio/errgroup/errgroup_test.go | 262 ++++++++++++++++++ lib/bio/errgroup/go120.go | 13 + lib/bio/errgroup/go120_test.go | 54 ++++ lib/bio/errgroup/pre_go120.go | 14 + lib/bio/example_test.go | 2 +- lib/bio/pileup/pileup.go | 130 +++++++++ lib/bio/slow5/slow5_test.go | 8 +- lib/go.mod | 6 +- lib/go.sum | 4 - 15 files changed, 749 insertions(+), 15 deletions(-) create mode 100644 lib/bio/errgroup/LICENSE create mode 100644 lib/bio/errgroup/README.md create mode 100644 lib/bio/errgroup/errgroup.go create mode 100644 lib/bio/errgroup/errgroup_example_md5all_test.go create mode 100644 lib/bio/errgroup/errgroup_test.go create mode 100644 lib/bio/errgroup/go120.go create mode 100644 lib/bio/errgroup/go120_test.go create mode 100644 lib/bio/errgroup/pre_go120.go diff --git a/README.md b/README.md index 07f6016d..4d5f456a 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ On the highest level: * [external](https://pkg.go.dev/github.com/koeng101/dnadesign/external) contains integrations with external bioinformatics software, usually operating on the command line. * [external/minimap2](https://pkg.go.dev/github.com/koeng101/dnadesign/external/minimap2) contains a function for working with [minimap2](https://github.com/lh3/minimap2) with Go. * [external/samtools](https://pkg.go.dev/github.com/koeng101/dnadesign/external/samtools) contains a function for generating pileup files using [samtools](https://github.com/samtools/samtools) with Go. + * [external/bcftools](https://pkg.go.dev/github.com/koeng101/dnadesign/external/bcftools) contains GenerateVCF to generate a VCF file from sam alignments using [bcftools](https://samtools.github.io/bcftools/) with Go. ## Python @@ -60,6 +61,7 @@ There are a few pieces of "complete" software that we have directly integrated i - [svb](https://github.com/rleiwang/svb) in `lib/bio/slow5/svb` - [intel-cpuid](https://github.com/aregm/cpuid) in `lib/bio/slow5/svb/intel-cpuid` - [wordwrap](https://github.com/mitchellh/go-wordwrap) in `lib/bio/genbank` +- [errgroup](https://cs.opensource.google/go/x/sync/+/master:errgroup/) in `lib/bio/` ## Other @@ -73,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84) - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83) - Added option for linear ligations [#82](https://github.com/Koeng101/dnadesign/pull/82) - Added minimal python packaging [#81](https://github.com/Koeng101/dnadesign/pull/81) diff --git a/lib/bio/bio.go b/lib/bio/bio.go index 09eb0ffd..4fc5e72f 100644 --- a/lib/bio/bio.go +++ b/lib/bio/bio.go @@ -16,6 +16,7 @@ import ( "io" "math" + "github.com/koeng101/dnadesign/lib/bio/errgroup" "github.com/koeng101/dnadesign/lib/bio/fasta" "github.com/koeng101/dnadesign/lib/bio/fastq" "github.com/koeng101/dnadesign/lib/bio/genbank" @@ -23,7 +24,6 @@ import ( "github.com/koeng101/dnadesign/lib/bio/sam" "github.com/koeng101/dnadesign/lib/bio/slow5" "github.com/koeng101/dnadesign/lib/bio/uniprot" - "golang.org/x/sync/errgroup" ) // Format is a enum of different parser formats. diff --git a/lib/bio/errgroup/LICENSE b/lib/bio/errgroup/LICENSE new file mode 100644 index 00000000..58c42916 --- /dev/null +++ b/lib/bio/errgroup/LICENSE @@ -0,0 +1,28 @@ +Copyright 2009 The Go Authors. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google LLC nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/lib/bio/errgroup/README.md b/lib/bio/errgroup/README.md new file mode 100644 index 00000000..9561ed77 --- /dev/null +++ b/lib/bio/errgroup/README.md @@ -0,0 +1,3 @@ +This is errgroup from `https://cs.opensource.google/go/x/sync`. It is integrated into the source tree because it doesn't change a lot and is one of the only external dependencies we use, especially for filtering of `bio` parser results. + +I modified the code to pass the Go linter requirements. diff --git a/lib/bio/errgroup/errgroup.go b/lib/bio/errgroup/errgroup.go new file mode 100644 index 00000000..948a3ee6 --- /dev/null +++ b/lib/bio/errgroup/errgroup.go @@ -0,0 +1,135 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package errgroup provides synchronization, error propagation, and Context +// cancelation for groups of goroutines working on subtasks of a common task. +// +// [errgroup.Group] is related to [sync.WaitGroup] but adds handling of tasks +// returning errors. +package errgroup + +import ( + "context" + "fmt" + "sync" +) + +type token struct{} + +// A Group is a collection of goroutines working on subtasks that are part of +// the same overall task. +// +// A zero Group is valid, has no limit on the number of active goroutines, +// and does not cancel on error. +type Group struct { + cancel func(error) + + wg sync.WaitGroup + + sem chan token + + errOnce sync.Once + err error +} + +func (g *Group) done() { + if g.sem != nil { + <-g.sem + } + g.wg.Done() +} + +// WithContext returns a new Group and an associated Context derived from ctx. +// +// The derived Context is canceled the first time a function passed to Go +// returns a non-nil error or the first time Wait returns, whichever occurs +// first. +func WithContext(ctx context.Context) (*Group, context.Context) { + ctx, cancel := withCancelCause(ctx) + return &Group{cancel: cancel}, ctx +} + +// Wait blocks until all function calls from the Go method have returned, then +// returns the first non-nil error (if any) from them. +func (g *Group) Wait() error { + g.wg.Wait() + if g.cancel != nil { + g.cancel(g.err) + } + return g.err +} + +// Go calls the given function in a new goroutine. +// It blocks until the new goroutine can be added without the number of +// active goroutines in the group exceeding the configured limit. +// +// The first call to return a non-nil error cancels the group's context, if the +// group was created by calling WithContext. The error will be returned by Wait. +func (g *Group) Go(f func() error) { + if g.sem != nil { + g.sem <- token{} + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel(g.err) + } + }) + } + }() +} + +// TryGo calls the given function in a new goroutine only if the number of +// active goroutines in the group is currently below the configured limit. +// +// The return value reports whether the goroutine was started. +func (g *Group) TryGo(f func() error) bool { + if g.sem != nil { + select { + case g.sem <- token{}: + // Note: this allows barging iff channels in general allow barging. + default: + return false + } + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel(g.err) + } + }) + } + }() + return true +} + +// SetLimit limits the number of active goroutines in this group to at most n. +// A negative value indicates no limit. +// +// Any subsequent call to the Go method will block until it can add an active +// goroutine without exceeding the configured limit. +// +// The limit must not be modified while any goroutines in the group are active. +func (g *Group) SetLimit(n int) { + if n < 0 { + g.sem = nil + return + } + if len(g.sem) != 0 { + panic(fmt.Errorf("errgroup: modify limit while %v goroutines in the group are still active", len(g.sem))) + } + g.sem = make(chan token, n) +} diff --git a/lib/bio/errgroup/errgroup_example_md5all_test.go b/lib/bio/errgroup/errgroup_example_md5all_test.go new file mode 100644 index 00000000..c2b89615 --- /dev/null +++ b/lib/bio/errgroup/errgroup_example_md5all_test.go @@ -0,0 +1,100 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package errgroup_test + +import ( + "context" + "crypto/md5" + "fmt" + "log" + "os" + "path/filepath" + + "github.com/koeng101/dnadesign/lib/bio/errgroup" +) + +// Pipeline demonstrates the use of a Group to implement a multi-stage +// pipeline: a version of the MD5All function with bounded parallelism from +// https://blog.golang.org/pipelines. +func ExampleGroup_pipeline() { + m, err := MD5All(context.Background(), ".") + if err != nil { + log.Fatal(err) + } + + for k, sum := range m { + fmt.Printf("%s:\t%x\n", k, sum) + } +} + +type result struct { + path string + sum [md5.Size]byte +} + +// MD5All reads all the files in the file tree rooted at root and returns a map +// from file path to the MD5 sum of the file's contents. If the directory walk +// fails or any read operation fails, MD5All returns an error. +func MD5All(ctx context.Context, root string) (map[string][md5.Size]byte, error) { + // ctx is canceled when g.Wait() returns. When this version of MD5All returns + // - even in case of error! - we know that all of the goroutines have finished + // and the memory they were using can be garbage-collected. + g, ctx := errgroup.WithContext(ctx) + paths := make(chan string) + + g.Go(func() error { + defer close(paths) + return filepath.Walk(root, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !info.Mode().IsRegular() { + return nil + } + select { + case paths <- path: + case <-ctx.Done(): + return ctx.Err() + } + return nil + }) + }) + + // Start a fixed number of goroutines to read and digest files. + c := make(chan result) + const numDigesters = 20 + for i := 0; i < numDigesters; i++ { + g.Go(func() error { + for path := range paths { + data, err := os.ReadFile(path) + if err != nil { + return err + } + select { + case c <- result{path, md5.Sum(data)}: + case <-ctx.Done(): + return ctx.Err() + } + } + return nil + }) + } + go func() { + _ = g.Wait() + close(c) + }() + + m := make(map[string][md5.Size]byte) + for r := range c { + m[r.path] = r.sum + } + // Check whether any of the goroutines failed. Since g is accumulating the + // errors, we don't need to send them (or check for them) in the individual + // results sent on the channel. + if err := g.Wait(); err != nil { + return nil, err + } + return m, nil +} diff --git a/lib/bio/errgroup/errgroup_test.go b/lib/bio/errgroup/errgroup_test.go new file mode 100644 index 00000000..ee727d9e --- /dev/null +++ b/lib/bio/errgroup/errgroup_test.go @@ -0,0 +1,262 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package errgroup_test + +import ( + "context" + "errors" + "fmt" + "os" + "sync/atomic" + "testing" + "time" + + "github.com/koeng101/dnadesign/lib/bio/errgroup" +) + +var ( + Web = fakeSearch("web") + Image = fakeSearch("image") + Video = fakeSearch("video") +) + +type Result string +type Search func(ctx context.Context, query string) (Result, error) + +func fakeSearch(kind string) Search { + return func(_ context.Context, query string) (Result, error) { + return Result(fmt.Sprintf("%s result for %q", kind, query)), nil + } +} + +// This calls the actual internet. Removing it because why would you do that. +//// JustErrors illustrates the use of a Group in place of a sync.WaitGroup to +//// simplify goroutine counting and error handling. This example is derived from +//// the sync.WaitGroup example at https://golang.org/pkg/sync/#example_WaitGroup. +//func ExampleGroup_justErrors() { +// g := new(errgroup.Group) +// var urls = []string{ +// "http://www.golang.org/", +// "http://www.google.com/", +// "http://www.somestupidname.com/", +// } +// for _, url := range urls { +// // Launch a goroutine to fetch the URL. +// url := url // https://golang.org/doc/faq#closures_and_goroutines +// g.Go(func() error { +// // Fetch the URL. +// resp, err := http.Get(url) //nolint:all +// if err == nil { +// resp.Body.Close() +// } +// return err +// }) +// } +// // Wait for all HTTP fetches to complete. +// if err := g.Wait(); err == nil { +// fmt.Println("Successfully fetched all URLs.") +// } +//} + +// Parallel illustrates the use of a Group for synchronizing a simple parallel +// task: the "Google Search 2.0" function from +// https://talks.golang.org/2012/concurrency.slide#46, augmented with a Context +// and error-handling. +func ExampleGroup_parallel() { + Google := func(ctx context.Context, query string) ([]Result, error) { + g, ctx := errgroup.WithContext(ctx) + + searches := []Search{Web, Image, Video} + results := make([]Result, len(searches)) + for i, search := range searches { + i, search := i, search // https://golang.org/doc/faq#closures_and_goroutines + g.Go(func() error { + result, err := search(ctx, query) + if err == nil { + results[i] = result + } + return err + }) + } + if err := g.Wait(); err != nil { + return nil, err + } + return results, nil + } + + results, err := Google(context.Background(), "golang") + if err != nil { + fmt.Fprintln(os.Stderr, err) + return + } + for _, result := range results { + fmt.Println(result) + } + + // Output: + // web result for "golang" + // image result for "golang" + // video result for "golang" +} + +func TestZeroGroup(t *testing.T) { + err1 := errors.New("errgroup_test: 1") + err2 := errors.New("errgroup_test: 2") + + cases := []struct { + errs []error + }{ + {errs: []error{}}, + {errs: []error{nil}}, + {errs: []error{err1}}, + {errs: []error{err1, nil}}, + {errs: []error{err1, nil, err2}}, + } + + for _, tc := range cases { + g := new(errgroup.Group) + + var firstErr error + for i, err := range tc.errs { + err := err + g.Go(func() error { return err }) + + if firstErr == nil && err != nil { + firstErr = err + } + + if gErr := g.Wait(); gErr != firstErr { + t.Errorf("after %T.Go(func() error { return err }) for err in %v\n"+ + "g.Wait() = %v; want %v", + g, tc.errs[:i+1], err, firstErr) + } + } + } +} + +func TestWithContext(t *testing.T) { + errDoom := errors.New("group_test: doomed") + + cases := []struct { + errs []error + want error + }{ + {want: nil}, + {errs: []error{nil}, want: nil}, + {errs: []error{errDoom}, want: errDoom}, + {errs: []error{errDoom, nil}, want: errDoom}, + } + + for _, tc := range cases { + g, ctx := errgroup.WithContext(context.Background()) + + for _, err := range tc.errs { + err := err + g.Go(func() error { return err }) + } + + if err := g.Wait(); err != tc.want { + t.Errorf("after %T.Go(func() error { return err }) for err in %v\n"+ + "g.Wait() = %v; want %v", + g, tc.errs, err, tc.want) + } + + canceled := false + select { + case <-ctx.Done(): + canceled = true + default: + } + if !canceled { + t.Errorf("after %T.Go(func() error { return err }) for err in %v\n"+ + "ctx.Done() was not closed", + g, tc.errs) + } + } +} + +func TestTryGo(t *testing.T) { + g := &errgroup.Group{} + n := 42 + g.SetLimit(42) + ch := make(chan struct{}) + fn := func() error { + ch <- struct{}{} + return nil + } + for i := 0; i < n; i++ { + if !g.TryGo(fn) { + t.Fatalf("TryGo should succeed but got fail at %d-th call.", i) + } + } + if g.TryGo(fn) { + t.Fatalf("TryGo is expected to fail but succeeded.") + } + go func() { + for i := 0; i < n; i++ { + <-ch + } + }() + _ = g.Wait() + + if !g.TryGo(fn) { + t.Fatalf("TryGo should success but got fail after all goroutines.") + } + go func() { <-ch }() + _ = g.Wait() + + // Switch limit. + g.SetLimit(1) + if !g.TryGo(fn) { + t.Fatalf("TryGo should success but got failed.") + } + if g.TryGo(fn) { + t.Fatalf("TryGo should fail but succeeded.") + } + go func() { <-ch }() + _ = g.Wait() + + // Block all calls. + g.SetLimit(0) + for i := 0; i < 1<<10; i++ { + if g.TryGo(fn) { + t.Fatalf("TryGo should fail but got succeeded.") + } + } + _ = g.Wait() +} + +func TestGoLimit(t *testing.T) { + const limit = 10 + + g := &errgroup.Group{} + g.SetLimit(limit) + var active int32 + for i := 0; i <= 1<<10; i++ { + g.Go(func() error { + n := atomic.AddInt32(&active, 1) + if n > limit { + return fmt.Errorf("saw %d active goroutines; want ≤ %d", n, limit) + } + time.Sleep(1 * time.Microsecond) // Give other goroutines a chance to increment active. + atomic.AddInt32(&active, -1) + return nil + }) + } + if err := g.Wait(); err != nil { + t.Fatal(err) + } +} + +func BenchmarkGo(b *testing.B) { + fn := func() {} + g := &errgroup.Group{} + b.ResetTimer() + b.ReportAllocs() + for i := 0; i < b.N; i++ { + g.Go(func() error { fn(); return nil }) + } + _ = g.Wait() +} diff --git a/lib/bio/errgroup/go120.go b/lib/bio/errgroup/go120.go new file mode 100644 index 00000000..f93c740b --- /dev/null +++ b/lib/bio/errgroup/go120.go @@ -0,0 +1,13 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.20 + +package errgroup + +import "context" + +func withCancelCause(parent context.Context) (context.Context, func(error)) { + return context.WithCancelCause(parent) +} diff --git a/lib/bio/errgroup/go120_test.go b/lib/bio/errgroup/go120_test.go new file mode 100644 index 00000000..e14102fa --- /dev/null +++ b/lib/bio/errgroup/go120_test.go @@ -0,0 +1,54 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.20 + +package errgroup_test + +import ( + "context" + "errors" + "testing" + + "github.com/koeng101/dnadesign/lib/bio/errgroup" +) + +func TestCancelCause(t *testing.T) { + errDoom := errors.New("group_test: doomed") + + cases := []struct { + errs []error + want error + }{ + {want: nil}, + {errs: []error{nil}, want: nil}, + {errs: []error{errDoom}, want: errDoom}, + {errs: []error{errDoom, nil}, want: errDoom}, + } + + for _, tc := range cases { + g, ctx := errgroup.WithContext(context.Background()) + + for _, err := range tc.errs { + err := err + g.TryGo(func() error { return err }) + } + + if err := g.Wait(); err != tc.want { + t.Errorf("after %T.TryGo(func() error { return err }) for err in %v\n"+ + "g.Wait() = %v; want %v", + g, tc.errs, err, tc.want) + } + + if tc.want == nil { + tc.want = context.Canceled + } + + if err := context.Cause(ctx); err != tc.want { + t.Errorf("after %T.TryGo(func() error { return err }) for err in %v\n"+ + "context.Cause(ctx) = %v; tc.want %v", + g, tc.errs, err, tc.want) + } + } +} diff --git a/lib/bio/errgroup/pre_go120.go b/lib/bio/errgroup/pre_go120.go new file mode 100644 index 00000000..88ce3343 --- /dev/null +++ b/lib/bio/errgroup/pre_go120.go @@ -0,0 +1,14 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !go1.20 + +package errgroup + +import "context" + +func withCancelCause(parent context.Context) (context.Context, func(error)) { + ctx, cancel := context.WithCancel(parent) + return ctx, func(error) { cancel() } +} diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index 3f7e4710..6fbaaaff 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -9,10 +9,10 @@ import ( "strings" "github.com/koeng101/dnadesign/lib/bio" + "github.com/koeng101/dnadesign/lib/bio/errgroup" "github.com/koeng101/dnadesign/lib/bio/fasta" "github.com/koeng101/dnadesign/lib/bio/fastq" "github.com/koeng101/dnadesign/lib/bio/sam" - "golang.org/x/sync/errgroup" ) // Example_read shows an example of reading a file from disk. diff --git a/lib/bio/pileup/pileup.go b/lib/bio/pileup/pileup.go index e87cfd97..211cc713 100644 --- a/lib/bio/pileup/pileup.go +++ b/lib/bio/pileup/pileup.go @@ -39,6 +39,7 @@ import ( "bufio" "fmt" "io" + "regexp" "strconv" "strings" "unicode" @@ -197,3 +198,132 @@ func (line *Line) WriteTo(w io.Writer) (int64, error) { written, err := fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\t%s\n", line.Sequence, strconv.FormatUint(uint64(line.Position), 10), line.ReferenceBase, strconv.FormatUint(uint64(line.ReadCount), 10), buffer.String(), line.Quality) return int64(written), err } + +/****************************************************************************** + +Sequencing + +The primary use of pileup files are to have a visual way of looking at +SNP/indels in alignment data. In particular, it can be used analyze plasmid +sequencing data to see if there any mutations in the DNA. + +******************************************************************************/ + +type MutationType string + +const ( + NoMutation MutationType = "no_mutation" + Point MutationType = "point" + PointIndel MutationType = "point_indel" + Indel MutationType = "indel" + Insertion MutationType = "insertion" + Noisy MutationType = "noisy" +) + +type Mutation struct { + Type MutationType + From string + To string + Length int // Only for Indel and Insertions + TotalCorrect int + TotalMutated int + TotalAligned int +} + +func CallMutations(readResults []string, referenceBase string, minimalRatio float64) Mutation { + reads := make(map[string]int) + for _, readResult := range readResults { + if len(readResult) == 1 { + // This will include simple point mutations and + // correct sequences. + reads[string(readResult[0])]++ + } else { + // This includes more complicated Sequences + switch { + case strings.Contains(readResult, "$"): + // An "end" will always start with the base called + reads[string(readResult[0])]++ + case strings.Contains(readResult, "^"): + // A "start" will end with the base called + reads[string(readResult[len(readResult)-1])]++ + default: + // The default case would be a deletion or an insertion. + // We handle those by inserting the entire thing. + reads[readResult]++ + } + } + } + // First, check how many correct alignments we have. + noMutation := reads["."] + reads[","] + + // ratioFunction is a function that returns "true" if a mutation + // has a greater than or equal ratio than the minimalRatio. + ratioFunction := func(mutation int) bool { + if len(readResults) == 0 { + return false + } + return float64(float64(mutation)/float64(len(readResults))) >= minimalRatio + } + + // Next, let's check for point mutations. + aMutation := reads["A"] + reads["a"] + tMutation := reads["T"] + reads["t"] + gMutation := reads["G"] + reads["g"] + cMutation := reads["C"] + reads["c"] + pointIndel := reads["*"] + + // First, let's check if the mutation is a point mutation + switch { + case ratioFunction(aMutation): + return Mutation{Type: Point, From: referenceBase, To: "A", TotalCorrect: noMutation, TotalMutated: aMutation, TotalAligned: len(readResults)} + case ratioFunction(tMutation): + return Mutation{Type: Point, From: referenceBase, To: "T", TotalCorrect: noMutation, TotalMutated: tMutation, TotalAligned: len(readResults)} + case ratioFunction(gMutation): + return Mutation{Type: Point, From: referenceBase, To: "G", TotalCorrect: noMutation, TotalMutated: gMutation, TotalAligned: len(readResults)} + case ratioFunction(cMutation): + return Mutation{Type: Point, From: referenceBase, To: "C", TotalCorrect: noMutation, TotalMutated: cMutation, TotalAligned: len(readResults)} + case ratioFunction(pointIndel): + return Mutation{Type: PointIndel, From: referenceBase, To: "*", TotalCorrect: noMutation, TotalMutated: pointIndel, TotalAligned: len(readResults)} + } + + // Ok, we know there is no point mutation. That means there is an insertion or indel. Let's call it. + for key, value := range reads { + if len(key) > 1 { + var mutationType MutationType + switch key[0] { + case '-': + mutationType = Indel + case '+': + mutationType = Insertion + default: + panic(fmt.Sprintf("Unknown readResult! got: %s", key)) + } + indelModification := float64(float64(value) / float64(len(readResults))) + if indelModification >= minimalRatio { + // Let's get the length of the insertion / indel + re := regexp.MustCompile(`-?\d+`) + match := re.FindString(key) + if match == "" { + panic("no num found!") + } + // Given the regex, this will never fail + length, _ := strconv.Atoi(match) + return Mutation{Type: mutationType, To: key, TotalCorrect: noMutation, TotalMutated: value, TotalAligned: len(readResults), Length: length} + } + } + } + + // Now we know it is not a point mutation, not an indel, and not an insertion. Maybe the read + // is just noisy though, so let's check that now + var noisyCount int + for key, value := range reads { + if key != "." && key != "," { + noisyCount += value + } + } + if ratioFunction(noisyCount) { + return Mutation{Type: Noisy, To: "?", TotalCorrect: noMutation, TotalMutated: noisyCount, TotalAligned: len(readResults)} + } + + return Mutation{Type: NoMutation, To: ".", TotalCorrect: noMutation, TotalMutated: noisyCount, TotalAligned: len(readResults)} +} diff --git a/lib/bio/slow5/slow5_test.go b/lib/bio/slow5/slow5_test.go index 65f3cddd..1030546a 100644 --- a/lib/bio/slow5/slow5_test.go +++ b/lib/bio/slow5/slow5_test.go @@ -6,8 +6,6 @@ import ( "os" "strings" "testing" - - "github.com/google/go-cmp/cmp" ) const maxLineSize = 2 * 32 * 1024 @@ -245,8 +243,10 @@ func TestSvb(t *testing.T) { rawSignal := read.RawSignal mask, data := SvbCompressRawSignal(rawSignal) rawSignalDecompressed := SvbDecompressRawSignal(len(rawSignal), mask, data) - if !cmp.Equal(rawSignal, rawSignalDecompressed) { - t.Errorf("Read signal at readNum %d (%v) didn't match decompressed signal %v", readNum, rawSignal, rawSignalDecompressed) + for i := range rawSignal { + if rawSignal[i] != rawSignalDecompressed[i] { + t.Errorf("Read signal at readNum %d (%v) didn't match decompressed signal %v", readNum, rawSignal, rawSignalDecompressed) + } } } } diff --git a/lib/go.mod b/lib/go.mod index 15e101f4..05a0c012 100644 --- a/lib/go.mod +++ b/lib/go.mod @@ -2,8 +2,4 @@ module github.com/koeng101/dnadesign/lib go 1.22.0 -require ( - github.com/google/go-cmp v0.6.0 - github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117 - golang.org/x/sync v0.5.0 -) +require github.com/google/go-cmp v0.6.0 diff --git a/lib/go.sum b/lib/go.sum index 440d22d5..5a8d551d 100644 --- a/lib/go.sum +++ b/lib/go.sum @@ -1,6 +1,2 @@ github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117 h1:MLWgADbigSsAmDP3yG93ESlN0Ek9QLtH5uHigmWVXwg= -github.com/koeng101/dnadesign/external v0.0.0-20240213205901-f4998ef84117/go.mod h1:nb80z/jm5HMCxfNZ50cBJa5TffkXxpY9okvqnBj8RrM= -golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE= -golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=