From 0cc4f7c06fba2f2335680c9909fbe7c91c6912b4 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Tue, 13 Aug 2024 11:11:13 -0700 Subject: [PATCH] added kmer functionality, removed enzymemanager (#83) * added kmer functionality, removed enzymemanager --- README.md | 1 + lib/align/megamash/megamash.go | 8 ++-- lib/clone/clone.go | 69 ++++++++++--------------------- lib/clone/clone_test.go | 70 +++++++++----------------------- lib/clone/data/ligations.fastq | 12 ++++++ lib/clone/example_test.go | 7 +--- lib/clone/kmer.go | 74 ++++++++++++++++++++++++++++++++++ lib/clone/kmer_test.go | 73 +++++++++++++++++++++++++++++++++ 8 files changed, 205 insertions(+), 109 deletions(-) create mode 100644 lib/clone/data/ligations.fastq create mode 100644 lib/clone/kmer.go create mode 100644 lib/clone/kmer_test.go diff --git a/README.md b/README.md index 55463d85..07f6016d 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83) - Added option for linear ligations [#82](https://github.com/Koeng101/dnadesign/pull/82) - Added minimal python packaging [#81](https://github.com/Koeng101/dnadesign/pull/81) - Greatly simplified the Ligate function [#77](https://github.com/Koeng101/dnadesign/pull/77) diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index d07ab4f0..bb8bc4d0 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -17,9 +17,9 @@ import ( "github.com/koeng101/dnadesign/lib/transform" ) -// StandardizedDNA returns the alphabetically lesser strand of a double +// StandardizeDNA returns the alphabetically lesser strand of a double // stranded DNA molecule. -func StandardizedDNA(sequence string) string { +func StandardizeDNA(sequence string) string { sequence = strings.ToUpper(sequence) var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) @@ -59,7 +59,7 @@ func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount in sequence := fastaRecord.Sequence sequenceSpecificKmers := make(map[string]bool) for i := 0; i <= len(sequence)-int(kmerSize); i++ { - kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)]) + kmerString := StandardizeDNA(sequence[i : i+int(kmerSize)]) kmerMap[kmerString] = fastaRecord.Identifier sequenceSpecificKmers[kmerString] = true } @@ -111,7 +111,7 @@ func (m *MegamashMap) Match(sequence string) []Match { identifierToCounts[identifier] = 0 } for i := 0; i <= len(sequence)-int(m.KmerSize); i++ { - kmerString := StandardizedDNA(sequence[i : i+int(m.KmerSize)]) + kmerString := StandardizeDNA(sequence[i : i+int(m.KmerSize)]) identifier, ok := m.Kmers[kmerString] if ok { identifierToCounts[identifier]++ diff --git a/lib/clone/clone.go b/lib/clone/clone.go index b8a47ddf..e1ac7384 100644 --- a/lib/clone/clone.go +++ b/lib/clone/clone.go @@ -90,23 +90,12 @@ type Enzyme struct { RecognitionSite string } -// EnzymeManager manager for Enzymes. Allows for management of enzymes throughout the lifecyle of your -// program. EnzymeManager is not safe for concurrent use. -type EnzymeManager struct { - // enzymeMap Map of enzymes that exist for the lifetime of the manager. Not safe for concurrent use. - enzymeMap map[string]Enzyme -} - -// NewEnzymeManager creates a new EnzymeManager given some enzymes. -func NewEnzymeManager(enzymes []Enzyme) EnzymeManager { - enzymeMap := make(map[string]Enzyme) - for enzymeIndex := range enzymes { - enzymeMap[enzymes[enzymeIndex].Name] = enzymes[enzymeIndex] - } - - return EnzymeManager{ - enzymeMap: enzymeMap, - } +var DefaultEnzymes = map[string]Enzyme{ + "BsaI": {"BsaI", regexp.MustCompile("GGTCTC"), regexp.MustCompile("GAGACC"), 1, 4, "GGTCTC"}, + "BbsI": {"BbsI", regexp.MustCompile("GAAGAC"), regexp.MustCompile("GTCTTC"), 2, 4, "GAAGAC"}, + "BtgZI": {"BtgZI", regexp.MustCompile("GCGATG"), regexp.MustCompile("CATCGC"), 10, 4, "GCGATG"}, + "PaqCI": {"PaqCI", regexp.MustCompile("CACCTGC"), regexp.MustCompile("GCAGGTG"), 4, 4, "CACCTGC"}, + "BsmBI": {"BsmBI", regexp.MustCompile("CGTCTC"), regexp.MustCompile("GAGACG"), 1, 4, "CGTCTC"}, } /****************************************************************************** @@ -119,26 +108,17 @@ Base cloning functions begin here. // enzyme's name. It is a convenience wrapper around CutWithEnzyme that // allows us to specify the enzyme by name. Set methylated flag to true if // there is lowercase methylated DNA as part of the sequence. -func (enzymeManager EnzymeManager) CutWithEnzymeByName(part Part, directional bool, name string, methylated bool) ([]Fragment, error) { +func CutWithEnzymeByName(part Part, directional bool, name string, methylated bool) ([]Fragment, error) { // Get the enzyme from the enzyme map - enzyme, err := enzymeManager.GetEnzymeByName(name) - if err != nil { + enzyme, ok := DefaultEnzymes[name] + if !ok { // Return an error if there was an error - return []Fragment{}, err + return []Fragment{}, errors.New("enzyme not found") } // Cut the sequence with the enzyme return CutWithEnzyme(part, directional, enzyme, methylated), nil } -// GetEnzymeByName gets the enzyme by it's name. If the enzyme manager does not -// contain an enzyme with the provided name, an error will be returned -func (enzymeManager EnzymeManager) GetEnzymeByName(name string) (Enzyme, error) { - if enzyme, ok := enzymeManager.enzymeMap[name]; ok { - return enzyme, nil - } - return Enzyme{}, errors.New("Enzyme " + name + " not found") -} - // CutWithEnzyme cuts a given sequence with an enzyme represented by an Enzyme struct. // If there is methylated parts of the target DNA, set the "methylated" flag to // true and lowercase ONLY methylated DNA. @@ -285,10 +265,14 @@ func CutWithEnzyme(part Part, directional bool, enzyme Enzyme, methylated bool) // the first fragment WILL be used in the ligation reaction. This function // is a massive simplification of the original ligation code which can do more. // If this does not fulfill your needs, please leave an issue in git. -func Ligate(fragments []Fragment, circular bool) (string, error) { +func Ligate(fragments []Fragment, circular bool) (string, []int, error) { if len(fragments) == 0 { - return "", errors.New("no fragments to ligate") + return "", []int{}, errors.New("no fragments to ligate") } + // Ligation pattern is used in downstream functions for analyzing + // ligation patterns. + var ligationPattern []int + ligationPattern = append(ligationPattern, 0) // first fragment is the first ligation site finalFragment := fragments[0] used := make(map[int]bool) @@ -303,6 +287,7 @@ func Ligate(fragments []Fragment, circular bool) (string, error) { finalFragment.ReverseOverhang = fragment.ReverseOverhang used[i] = true matchFound = true + ligationPattern = append(ligationPattern, i) break } if !used[i] && finalFragment.ReverseOverhang == transform.ReverseComplement(fragment.ReverseOverhang) { @@ -310,6 +295,7 @@ func Ligate(fragments []Fragment, circular bool) (string, error) { finalFragment.ReverseOverhang = transform.ReverseComplement(fragment.ForwardOverhang) used[i] = true matchFound = true + ligationPattern = append(ligationPattern, i) break } } @@ -318,11 +304,11 @@ func Ligate(fragments []Fragment, circular bool) (string, error) { // attempt circularization if circular { if finalFragment.ForwardOverhang != finalFragment.ReverseOverhang { - return "", errors.New("does not circularize") + return "", ligationPattern, errors.New("does not circularize") } - return finalFragment.ForwardOverhang + finalFragment.Sequence, nil + return finalFragment.ForwardOverhang + finalFragment.Sequence, ligationPattern, nil } - return finalFragment.ForwardOverhang + finalFragment.Sequence + finalFragment.ReverseOverhang, nil + return finalFragment.ForwardOverhang + finalFragment.Sequence + finalFragment.ReverseOverhang, ligationPattern, nil } /****************************************************************************** @@ -334,7 +320,7 @@ Specific cloning functions begin here. // GoldenGate simulates a GoldenGate cloning reaction. As of right now, we only // support BsaI, BbsI, BtgZI, and BsmBI. Set methylated flag to true if there // is lowercase methylated DNA as part of the sequence. -func GoldenGate(sequences []Part, cuttingEnzyme Enzyme, methylated bool) (string, error) { +func GoldenGate(sequences []Part, cuttingEnzyme Enzyme, methylated bool) (string, []int, error) { var fragments []Fragment for _, sequence := range sequences { newFragments := CutWithEnzyme(sequence, true, cuttingEnzyme, methylated) @@ -342,14 +328,3 @@ func GoldenGate(sequences []Part, cuttingEnzyme Enzyme, methylated bool) (string } return Ligate(fragments, true) } - -// GetBaseRestrictionEnzymes return a basic slice of common enzymes used in Golden Gate Assembly. Eventually, we want to get the data for this map from ftp://ftp.neb.com/pub/rebase -func GetBaseRestrictionEnzymes() []Enzyme { - return []Enzyme{ - {"BsaI", regexp.MustCompile("GGTCTC"), regexp.MustCompile("GAGACC"), 1, 4, "GGTCTC"}, - {"BbsI", regexp.MustCompile("GAAGAC"), regexp.MustCompile("GTCTTC"), 2, 4, "GAAGAC"}, - {"BtgZI", regexp.MustCompile("GCGATG"), regexp.MustCompile("CATCGC"), 10, 4, "GCGATG"}, - {"PaqCI", regexp.MustCompile("CACCTGC"), regexp.MustCompile("GCAGGTG"), 4, 4, "CACCTGC"}, - {"BsmBI", regexp.MustCompile("CGTCTC"), regexp.MustCompile("GAGACG"), 1, 4, "CGTCTC"}, - } -} diff --git a/lib/clone/clone_test.go b/lib/clone/clone_test.go index 5144ca5c..21f31b18 100644 --- a/lib/clone/clone_test.go +++ b/lib/clone/clone_test.go @@ -8,15 +8,13 @@ import ( var popen = Part{"TAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGGCCTACTATTAGCAACAACGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGACGGCCAGTAGTCAAAAGCCTCCGACCGGAGGCTTTTGACTTGGTTCAGGTGGAGTGGGAGTAgtcttcGCcatcgCtACTAAAagccagataacagtatgcgtatttgcgcgctgatttttgcggtataagaatatatactgatatgtatacccgaagtatgtcaaaaagaggtatgctatgaagcagcgtattacagtgacagttgacagcgacagctatcagttgctcaaggcatatatgatgtcaatatctccggtctggtaagcacaaccatgcagaatgaagcccgtcgtctgcgtgccgaacgctggaaagcggaaaatcaggaagggatggctgaggtcgcccggtttattgaaatgaacggctcttttgctgacgagaacagggGCTGGTGAAATGCAGTTTAAGGTTTACACCTATAAAAGAGAGAGCCGTTATCGTCTGTTTGTGGATGTACAGAGTGATATTATTGACACGCCCGGGCGACGGATGGTGATCCCCCTGGCCAGTGCACGTCTGCTGTCAGATAAAGTCTCCCGTGAACTTTACCCGGTGGTGCATATCGGGGATGAAAGCTGGCGCATGATGACCACCGATATGGCCAGTGTGCCGGTCTCCGTTATCGGGGAAGAAGTGGCTGATCTCAGCCACCGCGAAAATGACATCAAAAACGCCATTAACCTGATGTTCTGGGGAATATAAATGTCAGGCTCCCTTATACACAGgcgatgttgaagaccaCGCTGAGGTGTCAATCGTCGGAGCCGCTGAGCAATAACTAGCATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCATGGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACACACATTAACAAATTTCGTGAGGAGTCTCCAGAAGAATGCCATTAATTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGG", true} func TestCutWithEnzymeByName(t *testing.T) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) - _, err := enzymeManager.CutWithEnzymeByName(popen, true, "EcoFake", false) + _, err := CutWithEnzymeByName(popen, true, "EcoFake", false) if err == nil { t.Errorf("CutWithEnzymeByName should have failed when looking for fake restriction enzyme EcoFake") } } func TestCutWithEnzyme(t *testing.T) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) var sequence Part bsai := "GGTCTCAATGC" bsaiComplement := "ATGCAGAGACC" @@ -25,7 +23,7 @@ func TestCutWithEnzyme(t *testing.T) { // Test case of `<-bsaiComplement bsai-> <-bsaiComplement bsai->` where bsaI cuts off of a linear sequence. This tests the line: // if !sequence.Circular && (overhangSet[len(overhangSet)-1].Position+enzyme.EnzymeSkip+enzyme.EnzymeOverhangLen > len(sequence)) sequence = Part{"ATATATA" + bsaiComplement + bsai + "ATGCATCGATCGACTAGCATG" + bsaiComplement + bsai[:8], false} - fragment, err := enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI", false) + fragment, err := CutWithEnzymeByName(sequence, true, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(1). Got error: %s", err) } @@ -39,7 +37,7 @@ func TestCutWithEnzyme(t *testing.T) { // test(2) // Now if we take the same sequence and circularize it, we get a different result sequence.Circular = true - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, true, "BsaI", false) + fragment, err = CutWithEnzymeByName(sequence, true, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(2). Got error: %s", err) } @@ -57,7 +55,7 @@ func TestCutWithEnzyme(t *testing.T) { // directionality flag to false. This tests the line: // if len(overhangs) == 1 && !directional && !sequence.Circular sequence = Part{"ATATATATATATATAT" + bsai + "GCGCGCGCGCGCGCGCGCGC", false} - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI", false) + fragment, err = CutWithEnzymeByName(sequence, false, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(3). Got error: %s", err) } @@ -73,7 +71,7 @@ func TestCutWithEnzyme(t *testing.T) { // tests the line: // if len(overhangs) == 2 && !directional && sequence.Circular sequence.Circular = true - fragment, err = enzymeManager.CutWithEnzymeByName(sequence, false, "BsaI", false) + fragment, err = CutWithEnzymeByName(sequence, false, "BsaI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(4). Got error: %s", err) } @@ -87,7 +85,7 @@ func TestCutWithEnzyme(t *testing.T) { // test(5) // This tests if we have a fragment where we do not care about directionality // but have more than 1 cut site in our fragment. We can use pOpen for this. - fragment, err = enzymeManager.CutWithEnzymeByName(popen, false, "BbsI", false) + fragment, err = CutWithEnzymeByName(popen, false, "BbsI", false) if err != nil { t.Errorf("CutWithEnzyme should not have failed on test(5). Got error: %s", err) } @@ -100,16 +98,13 @@ func TestCutWithEnzymeRegression(t *testing.T) { sequence := "AGCTGCTGTTTAAAGCTATTACTTTGAGACC" // this is a real sequence I came across that was causing problems part := Part{sequence, false} - - // get enzymes with enzyme manager - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) - bsa1, err := enzymeManager.GetEnzymeByName("BsaI") - if err != nil { - t.Errorf("Error when getting Enzyme. Got error: %s", err) + bsaI, ok := DefaultEnzymes["BsaI"] + if ok != true { + t.Errorf("Error when getting Enzyme. Not found.") } // cut with BsaI - fragments := CutWithEnzyme(part, false, bsa1, false) + fragments := CutWithEnzyme(part, false, bsaI, false) // check that the fragments are correct if len(fragments) != 2 { @@ -142,7 +137,7 @@ func TestCutWithEnzymeRegression(t *testing.T) { func TestCircularLigate(t *testing.T) { fragment1 := Fragment{"AAAAAA", "GTTG", "CTAT"} fragment2 := Fragment{"AAAAAA", "CAAC", "ATAG"} - output, err := Ligate([]Fragment{fragment1, fragment2}, true) + output, _, err := Ligate([]Fragment{fragment1, fragment2}, true) if err != nil { t.Errorf("Got error on ligation: %s", err) } @@ -154,7 +149,7 @@ func TestCircularLigate(t *testing.T) { func TestLinearLigate(t *testing.T) { fragment1 := Fragment{"AAAAAA", "GTTG", "CTAT"} fragment2 := Fragment{"AAAAAA", "ATGC", "ATAG"} - output, err := Ligate([]Fragment{fragment1, fragment2}, false) + output, _, err := Ligate([]Fragment{fragment1, fragment2}, false) if err != nil { t.Errorf("Got error on ligation: %s", err) } @@ -163,17 +158,6 @@ func TestLinearLigate(t *testing.T) { } } -func TestEnzymeManage_GetEnzymeByName_NotFound(t *testing.T) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) - _, err := enzymeManager.GetEnzymeByName("EcoRFake") - if err == nil { - t.Errorf("GoldenGate should fail when using enzyme EcoRFake") - } - if err.Error() != "Enzyme EcoRFake not found" { - t.Errorf("Failure of GoldenGate on incorrect enzyme should follow the exact string `Enzyme EcoRFake not found in enzymeMap`. Got: %s", err.Error()) - } -} - // This was a previous regression test. However, now ligate only outputs a // single construct as an output. If we change in the future for multiple // ligations, we should revive this function. @@ -207,7 +191,6 @@ func TestEnzymeManage_GetEnzymeByName_NotFound(t *testing.T) { //} func TestPanicGoldenGate(t *testing.T) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) // This used to panic with the message: // panic: runtime error: slice bounds out of range [:-2] [recovered] // It was from the following sequence: GAAGACATAATGGTCTTC . There are 2 intercepting BbsI sites. @@ -217,21 +200,14 @@ func TestPanicGoldenGate(t *testing.T) { fragment4 := Part{"AAACCGGAGCCATACAGTACGAAGACATGACTTCGCAACCATCAACGAAGTCTACAAACAGTTCTTCGACGAACACCAGGCAACCTACCCGACCCGTTCATGCGTCCAGGTCGCACGTCTACTAGTCTTCGCACTTGGCTTAGATGCAAC", false} fragment5 := Part{"AAACCGGAGCCATACAGTACGAAGACATCTACCGAAAGACGTCAAACTAGAAATCGAAGCAATCGCAGTCCGTTCAGCAAGAGCTTAGAGACCCGCTTAGTCTTCGCACTTGGCTTAGATGCAAC", false} fragments := []Part{popen, fragment1, fragment2, fragment3, fragment4, fragment5} - - bbsI, err := enzymeManager.GetEnzymeByName("BbsI") - if err != nil { - t.Errorf("Error when getting Enzyme. Got error: %s", err) - } - - _, _ = GoldenGate(fragments, bbsI, false) + _, _, _ = GoldenGate(fragments, DefaultEnzymes["BbsI"], false) } func TestCircularCutRegression(t *testing.T) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) // This used to error with 0 fragments since the BsaI cut site is on the other // side of the origin from its recognition site. plasmid1 := Part{"AAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCCGAGaccaagtcgcggccgcgaggtgtcaatcgtcggagtagggataacagggtaatccgctgagcaataactagcataaccccttggggcctctaaacgggtcttgaggggttttttgcatggtcatagctgtttcctgttacgccccgccctgccactcgtcgcagtactgttgtaattcattaagcattctgccgacatggaagccatcacaaacggcatgatgaacctgaatcgccagcggcatcagcaccttgtcgccttgcgtataatatttgcccatggtgaaaacgggggcgaagaagttgtccatattggccacgtttaaatcaaaactggtgaaactcacccagggattggctgacacgaaaaacatattctcaataaaccctttagggaaataggccaggttttcaccgtaacacgccacatcttgcgaatatatgtgtagaaactgccggaaatcgtcgtggtattcactccagagggatgaaaacgtttcagtttgctcatggaaaacggtgtaacaagggtgaacactatcccatatcaccagctcaccatccttcattgccatacgaaattccggatgagcattcatcaggcgggcaagaatgtgaataaaggccggataaaacttgtgcttatttttctttacggtctttaaaaaggccgtaatatccagctgaacggtctggttataggtacattgagcaactgactgaaatgcctcaaaatgttctttacgatgccattgggatatatcaacggtggtatatccagtgatttttttctccattttagcttccttagctcctgaaaatctcgataactcaaaaaatacgcccggtagtgatcttatttcattatggtgaaagttggaacctcttacgtgccgatcatttccataggctccgcccccctgacgagcatcacaaaaatcgacgctcaagtcagaggtggcgaaacccgacaggactataaagataccaggcgtttccccctggaagctccctcgtgcgctctcctgttccgaccctgccgcttaccggatacctgtccgcctttctcccttcgggaagcgtggcgctttctcatagctcacgctgtaggtatctcagttcggtgtaggtcgttcgctccaagctgggctgtgtgcacgaaccccccgttcagcccgaccgctgcgccttatccggtaactatcgtcttgagtccaacccggtaagacacgacttatcgccactggcagcagccactggtaacaggattagcagagcgaggtatgtaggcggtgctacagagttcttgaagtggtggcctaactacggctacactagaaggacagtatttggtatctgcgctctgctgaagccagttaccttcggaaaaagagttggtagctcttgatccggcaaacaaaccaccgctggtagcggtggtttttttgtttgcaagcagcagattacgcgcagaaaaaaaggatctcaagtaaaacgacggccagtagtcaaaagcctccgaccggaggcttttgacttggttcaggtggagtggcggccgcgacttgGTCTC", true} - newFragments, err := enzymeManager.CutWithEnzymeByName(plasmid1, true, "BsaI", false) + newFragments, err := CutWithEnzymeByName(plasmid1, true, "BsaI", false) if err != nil { t.Errorf("Failed to cut: %s", err) } @@ -246,31 +222,21 @@ func TestMethylatedGoldenGate(t *testing.T) { frag2 := Part{"AGTTGCAGTATCTAACCCGCGGTCTCTGAGATAGGCTGATCAGGAGCAAGCTCGTACGAGAAGAAACAAAATGACAAAAAAAATCCTATACTATATAGGTTACAAATAAAAAAGTATCAAAAATGAAGCTGAGACCATACTGTAAGAACCACGCGGTAAAAGACGCTACG", false} frag3 := Part{"AGTTGCAGTATCTAACCCGCGGTCTCTAAGCCTGCATCTCTCAGGCAAATGGCATTCTGACATCCTCTTGATTACGAGTGAGACCATACTGTAAGAACCACGCGGCTGAACCTCCAGCGGACTCAGTCGCGAAAATACTTACCAAAGGACCGAATTCACCGATCGAACGG", false} - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) - bsai, err := enzymeManager.GetEnzymeByName("BsaI") - if err != nil { - t.Errorf("Error when getting Enzyme. Got error: %s", err) - } - _, err = GoldenGate([]Part{pOpenV3Methylated, frag1, frag2, frag3}, bsai, true) + _, _, err := GoldenGate([]Part{pOpenV3Methylated, frag1, frag2, frag3}, DefaultEnzymes["BsaI"], true) if err != nil { t.Errorf("Should have gotten a single clone") } } -func benchmarkGoldenGate(b *testing.B, enzymeManager EnzymeManager, parts []Part) { - bbsI, err := enzymeManager.GetEnzymeByName("BbsI") - if err != nil { - b.Errorf("Error when getting Enzyme. Got error: %s", err) - } +func benchmarkGoldenGate(b *testing.B, parts []Part) { for n := 0; n < b.N; n++ { - _, _ = GoldenGate(parts, bbsI, false) + _, _, _ = GoldenGate(parts, DefaultEnzymes["BbsI"], false) } } func BenchmarkGoldenGate3Parts(b *testing.B) { - enzymeManager := NewEnzymeManager(GetBaseRestrictionEnzymes()) fragment1 := Part{"GAAGTGCCATTCCGCCTGACCTGAAGACCAGGAGAAACACGTGGCAAACATTCCGGTCTCAAATGGAAAAGAGCAACGAAACCAACGGCTACCTTGACAGCGCTCAAGCCGGCCCTGCAGCTGGCCCGGGCGCTCCGGGTACCGCCGCGGGTCGTGCACGTCGTTGCGCGGGCTTCCTGCGGCGCCAAGCGCTGGTGCTGCTCACGGTGTCTGGTGTTCTGGCAGGCGCCGGTTTGGGCGCGGCACTGCGTGGGCTCAGCCTGAGCCGCACCCAGGTCACCTACCTGGCCTTCCCCGGCGAGATGCTGCTCCGCATGCTGCGCATGATCATCCTGCCGCTGGTGGTCTGCAGCCTGGTGTCGGGCGCCGCCTCCCTCGATGCCAGCTGCCTCGGGCGTCTGGGCGGTATCGCTGTCGCCTACTTTGGCCTCACCACACTGAGTGCCTCGGCGCTCGCCGTGGCCTTGGCGTTCATCATCAAGCCAGGATCCGGTGCGCAGACCCTTCAGTCCAGCGACCTGGGGCTGGAGGACTCGGGGCCTCCTCCTGTCCCCAAAGAAACGGTGGACTCTTTCCTCGACCTGGCCAGAAACCTGTTTCCCTCCAATCTTGTGGTTGCAGCTTTCCGTACGTATGCAACCGATTATAAAGTCGTGACCCAGAACAGCAGCTCTGGAAATGTAACCCATGAAAAGATCCCCATAGGCACTGAGATAGAAGGGATGAACATTTTAGGATTGGTCCTGTTTGCTCTGGTGTTAGGAGTGGCCTTAAAGAAACTAGGCTCCGAAGGAGAGGACCTCATCCGTTTCTTCAATTCCCTCAACGAGGCGACGATGGTGCTGGTGTCCTGGATTATGTGGTACGCGTCTTCAGGCTAGGTGGAGGCTCAGTG", false} fragment2 := Part{"GAAGTGCCATTCCGCCTGACCTGAAGACCAGTACGTACCTGTGGGCATCATGTTCCTTGTTGGAAGCAAGATCGTGGAAATGAAAGACATCATCGTGCTGGTGACCAGCCTGGGGAAATACATCTTCGCATCTATATTGGGCCACGTCATTCATGGTGGTATCGTCCTGCCGCTGATTTATTTTGTTTTCACACGAAAAAACCCATTCAGATTCCTCCTGGGCCTCCTCGCCCCATTTGCGACAGCATTTGCTACGTGCTCCAGCTCAGCGACCCTTCCCTCTATGATGAAGTGCATTGAAGAGAACAATGGTGTGGACAAGAGGATCTCCAGGTTTATTCTCCCCATCGGGGCCACCGTGAACATGGACGGAGCAGCCATCTTCCAGTGTGTGGCCGCGGTGTTCATTGCGCAACTCAACAACGTAGAGCTCAACGCAGGACAGATTTTCACCATTCTAGTGACTGCCACAGCGTCCAGTGTTGGAGCAGCAGGCGTGCCAGCTGGAGGGGTCCTCACCATTGCCATTATCCTGGAGGCCATTGGGCTGCCTACTCATGATCTGCCTCTGATCCTGGCTGTGGACTGGATTGTGGACCGGACCACCACGGTGGTGAATGTGGAAGGGGATGCCCTGGGTGCAGGCATTCTCCACCACCTGAATCAGAAGGCAACAAAGAAAGGCGAGCAGGAACTTGCTGAGGTGAAAGTGGAAGCCATCCCCAACTGCAAGTCTGAGGAGGAAACCTCGCCCCTGGTGACACACCAGAACCCCGCTGGCCCCGTGGCCAGTGCCCCAGAACTGGAATCCAAGGAGTCGGTTCTGTGAAGAGCTTAGAGACCGACGACTGCCTAAGGACATTCGCTGCGTCTTCAGGCTAGGTGGAGGCTCAGTG", false} - benchmarkGoldenGate(b, enzymeManager, []Part{fragment1, fragment2, popen}) + benchmarkGoldenGate(b, []Part{fragment1, fragment2, popen}) } diff --git a/lib/clone/data/ligations.fastq b/lib/clone/data/ligations.fastq new file mode 100644 index 00000000..a44884da --- /dev/null +++ b/lib/clone/data/ligations.fastq @@ -0,0 +1,12 @@ +@36fc097c-9e9a-432d-86ab-5125d7995977 runid=899a22d21fe6e90a422b41a1a8e49dc261d2e911 read=3216 ch=1 start_time=2024-08-07T22:47:21.049912-07:00 flow_cell_id=ASZ105 protocol_group_id=nseq42 sample_id=b9-2gg_b9-2post barcode=barcode01 barcode_alias=barcode01 parent_read_id=36fc097c-9e9a-432d-86ab-5125d7995977 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.3.0 +CTGCCAGTATTTACTCGTTCAGTTAGTATTGCTAAGGTTAACACAAAGACACCGACAACTTCTTCAGCACCTAGGGTAATGGTCTCTCGAGACAGAAGATCGTCATAGCTGTTTCCTGGCCGTGAAGAAGAGTTTCGGGATGGCGTAGCAGCAGCGATTCGTTATGCCCGTGCGCTGGGTAATAAAAAATTAACTGTCTGGTCGGTAAAACGACGGCCAGTGACTGGTCTCAGCACCGGCAGCAGCAGCACGCGCAGGTGGTCCGGACATCTCGGAGGCAGGTGCACTGGGTACGCGCGCACTGGCACGCGCATGGCTGGCATACGGTATCACGGCAGAGGACGTGAAGCGCATCCTGGACACGCTGGGTCTGGGTTCGGGTGAGTTCGGTTTCCGCCAGCAGGTGCTGGAGGACACGATCCAGCCGGCAATCGCATCGGGTGCAGGTCTGCACTTCGTGGACCCGGAGGCAGCAGCAGGTTACCTGGCAGAGCTGGTGAAGCGCTCGTACTCGAAGGCAGTGCAGCAGTGAAGGCAGAGGACTGGCTGAAGAAGATCGCACTGCTGGTGGCATCGGAGTCGACGTCGTCGTCGGGTCTGAAGATATTCCTGAAGTCGCCGCAGCCGGTGAAGTGGGTGTCGCCGGACGGTTTCCCGGTGATCCTGGACCCGAAGAAGCCGATCCAGATGAGACCGCCATCCTC ++ +##$$%%$%&(*/++/'%%%$$$$$#$%&&**+-(()257//333111236322.-.*)('(''+)*27566669>?AEEH===;::942,**))(())*((()++7769:IKEMAED;<<>?QLIIGOKLSOCCA>=CCHKJROPHKMNJKSLSSSKQKHSMKHHIICFAEFFGDF::::2221102220.-./18=;==?B@@AA?@@AA(''&&))'&$$$$%$$$%%(9IGGILPRIOKNIMKLLMLNMIABAAB;;;;>>@@LRMLPNLSHLSOJMSPSSOKLDDDCHECBCAA@>??=B78KIMOHKFGDDFCFJIMSNGGDEFF?=>=>GGD=AFCDMHIJGPKHSSJSSKSSLNSLSK?>>==H22222=?CB9875578BBBBAACDENSIJLISIKKLKNNMNOLMNRSHECCCDQKLJAA@>2)*;777789999765445OHEHILHHHGJJIOKINMHGHIGSJMLJSLKHIJKHGHHSISJFEA@@==<:8/,.85400;5*))))*334@BCBCDAAAAHIGHSIKLILHJHJLIHNIIGJHHFFA@?=>:13BCGIJJJHDBD89)5446>ACA>4447???>=;:888/.../320+ +@f45b8c4d-6038-4e32-9a8d-17654049f998 runid=899a22d21fe6e90a422b41a1a8e49dc261d2e911 read=63871 ch=13 start_time=2024-08-08T05:43:50.049912-07:00 flow_cell_id=ASZ105 protocol_group_id=nseq42 sample_id=b9-2gg_b9-2post barcode=barcode01 barcode_alias=barcode01 parent_read_id=f45b8c4d-6038-4e32-9a8d-17654049f998 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.3.0 +ATGTTATGTAGTCTTACCTGACTCACTACGTATTGCTAAGGTTAACACAAAGACACCGACAACTTCTTCGGCACCTTTAGGGAACCAGGCTTAACGGTCGTAACTCCCTCACCTACGGACTCGACAATACCTGCGGCTTCGTGCCCGGTCTGACCGGCAGCGGCATGCAGGTGCGATGAGTCGCCCCTAAGCAGCCACGGATCAAGGATGGCGGTCTCATCTGGATCGGCTTCTTCGGGTCCAGGATCACCGGGAAACCGTCCGGCGACACCCACTTCACCGGCTGCGGCGACTTCAGGATCTTCAGACCCGACGACGACATCAACTCCGATGCCACCAGCAGTGCGATCTTCTTCAGCCAGTCCTCTGCCTTCACTGCTGCCGGCACTGCCTTCGAGTACCAGCGCTTCGCCCGGCTCTGCCAGGTAACCTGCTGCTGCCTGCGGGTCCACGAAGTGCAGACCTGCACCCGATGCGATTCCCGGCTGGATCGTGTCCTCCAGCACCTGCTGGCGGAAACCGAACTCACCCGAACCCAGACGCGGCATGTCGGGATCGCTACCGCGTGTTCTGCCGTGATACCGTATGCCAGCCATGCGCGTGCCAGTGCGCGCGTCCCCGTGCACCTGCCTCCGAGATGTGCCGGACCACCTGCGCGTGCTGCTGCTGCCGGTGCTGAGACCAAGTCACTGGCCGTCGTTTTACCGATTTTGGCGTCTCACAAGGCAAGGCGACGAAAGAGAAAGTGATCGAACAGGAACGGGTCAGCGCCGCGCCGTTTTGCGAAAAGCTGAACATCCCCGGAAACAGCTATGACGACTTGGTCTCGAGAGACCATTACCCTAGGTCCTGAAGAAAGTTCTGGTGTCTTTGTGTC ++ +&''((%%%%$###"##%%%%&&&%&&&%%(((--./:;;@20/.....)+--3335;;@IIJIMLKDD))'''-.D?>>+***+''''*0+('''****+2//0010002899:10000111899;;<911111;;8.,+)&)*))455667>?@CAGC=;9911.,01@CBDJMDBEDEAAB=<<<8/(((*433./001457:51+*()+-).5477555&&&&((((,001FFIOLNKKLSNKKPMLSHGIMF<;989546656685//001;=?@BAHDB???AEPHAAA>=6688789644438766...-+,--+***,((///588:;87)('''(((0/0056876220.+++-779;==A666,()*+2877768758:;62224442...<<=7886776233,))'&&')*56667<=...,*,111+&'&&%&%%$$$&%$$$%%)&&&'''&&&()++((()343499;32200/////433334::9<;<;>8....00/--*)))*-)+,,()))*222140////48+++++-7:89:;:==DHHJIKI98876+*''&'(5=/..--201==>AAKNJPH>?=<===>@A?<>BDDE99988>=>CQHKKJGLSPCAAABCHFAAA;;;;CEFGGJIGGJFFHKIIE<;43334FJHAACFCC<;;<:BDDDG666666556/.--+'&'+,798962232100*))))3334222///..0.--,('''')++)))&('&$ +@7e47fc05-b782-4a42-8f11-38aa55ca68ab runid=899a22d21fe6e90a422b41a1a8e49dc261d2e911 read=76257 ch=1 start_time=2024-08-08T07:46:45.049912-07:00 flow_cell_id=ASZ105 protocol_group_id=nseq42 sample_id=b9-2gg_b9-2post barcode=barcode01 barcode_alias=barcode01 parent_read_id=7e47fc05-b782-4a42-8f11-38aa55ca68ab basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.3.0 +ATGTTTTTGTACGTACTCGTTCAGTTACGTATTGCTAAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACCTAAAGCACTCTTAGGCCTCTGCGTACTCCGGTGCCAGGATGATGGTCTCCGAGTCCTGGTCCACCACACCTGCGTTCTGGATGGCTTCTTCGGGTCCAGGTTCACCGGGAAACCGTCCGGCGACACCCACTTCACCGGCTGCGGCCGGTTTAGGATCTTCGAGCCCGACGACGACATCAACTCCGATGCCACCAGCAGTGCGATCTTCTTCAGCCAGTCCTCTGCCTTCACTGCTGCCGGCACTGCCTTCAAGTACGAGCGCTCACCAGCTCTGCCAGGTAACCTGCTGCTGCCTCCGGGTCCACGAAGTGCAGACCTGCACCCGATGCGATTGCCGGCTGGATCGTGTCCTCCAGCACCTGCTGGCGGAAACCGAACTCACCGAACCCAGACCCAGCGTGTCCAGGATGCGCTTCGCGTCCTCTGCCGTGATCCGTATGCCAGCCATGCGCGTGCCAGTGCGCGCGTACCCAGTGCACCTGCCTCCGAGATGTCCGGACCACCTGCGCGTGCTGCTGCTGCCGGTGCTAAAGCAAGTCACTGGCCGTCGTTTTACTGACTCAGGGTCGGAACAGGAGAGCGCACGAGGAGGAAGCTCCAGGGGGAAACGCCTCGTACTTTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCGGGGGACGAGCGCCTATGGAAATTAATGGCATTCTTCTGAAACCTTCACGAAATTTGTTAATGTGTGTCATCACCTAAGCTCTCGGGGAACAGGTTGTG ++ +$$$&'('(''$$####'&&&)()(((((0114445@???AAC:77--,,,+.-+,-**-./0060))434BB87774568954445:3467:BKJKLDG32225;99452,-185...,,///GGA@=:667??@@A?>>:987500766666-,,,*)))*730,+,0**,(()')+++58;ABMNEF33333@A99989@?77831124;+**))))))))++/5:;<111123266567433*))()++,42222-6))323336HLSFFGFGJDDD<;AAFFLMIMJLPKPIKCA@CF888EEFFEGIPKGFDBB.((((')((())*+)(*.----.@???>>??>@CBBBCHB><:;==<<79500555/..,,/.'&&&)0(((()-((++++++.,,,--AA>;:::8779;?AGJIJIIHFGHHOFGFBA<<;66,,,,,+*++)**'&'')))/268D66666<;<C7778:=ECDBBBCCGEGEFFDDAA;;9:-*)*''''))---..BFGIE<<9:)((((*+**++367866756*))))24+,,/*+>@@A:::;<::::=@@CBBADHJ>=<;<@??.--***)*+*('&%%$$$%**/./4=4922368878:..+&'(),+++++'+57899:AA9;:67:@BA@;889982222((((/++)+,0/122110(&%%$ diff --git a/lib/clone/example_test.go b/lib/clone/example_test.go index 1365abd3..c0f7fd4e 100644 --- a/lib/clone/example_test.go +++ b/lib/clone/example_test.go @@ -9,7 +9,6 @@ import ( ) func ExampleGoldenGate() { - enzymeManager := clone.NewEnzymeManager(clone.GetBaseRestrictionEnzymes()) // Fragment 1 has a palindrome at its start. This isn't very common but // can occur. These two fragments are real DNA fragments used in the // FreeGenes Project. They are used because they were on my computer @@ -20,11 +19,7 @@ func ExampleGoldenGate() { // pOpen plasmid series (https://stanford.freegenes.org/collections/open-genes/products/open-plasmids#description). I use it for essentially all my cloning. -Keoni var popen = clone.Part{"TAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGGCCTACTATTAGCAACAACGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGACGGCCAGTAGTCAAAAGCCTCCGACCGGAGGCTTTTGACTTGGTTCAGGTGGAGTGGGAGTAgtcttcGCcatcgCtACTAAAagccagataacagtatgcgtatttgcgcgctgatttttgcggtataagaatatatactgatatgtatacccgaagtatgtcaaaaagaggtatgctatgaagcagcgtattacagtgacagttgacagcgacagctatcagttgctcaaggcatatatgatgtcaatatctccggtctggtaagcacaaccatgcagaatgaagcccgtcgtctgcgtgccgaacgctggaaagcggaaaatcaggaagggatggctgaggtcgcccggtttattgaaatgaacggctcttttgctgacgagaacagggGCTGGTGAAATGCAGTTTAAGGTTTACACCTATAAAAGAGAGAGCCGTTATCGTCTGTTTGTGGATGTACAGAGTGATATTATTGACACGCCCGGGCGACGGATGGTGATCCCCCTGGCCAGTGCACGTCTGCTGTCAGATAAAGTCTCCCGTGAACTTTACCCGGTGGTGCATATCGGGGATGAAAGCTGGCGCATGATGACCACCGATATGGCCAGTGTGCCGGTCTCCGTTATCGGGGAAGAAGTGGCTGATCTCAGCCACCGCGAAAATGACATCAAAAACGCCATTAACCTGATGTTCTGGGGAATATAAATGTCAGGCTCCCTTATACACAGgcgatgttgaagaccaCGCTGAGGTGTCAATCGTCGGAGCCGCTGAGCAATAACTAGCATAACCCCTTGGGGCCTCTAAACGGGTCTTGAGGGGTTTTTTGCATGGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACACACATTAACAAATTTCGTGAGGAGTCTCCAGAAGAATGCCATTAATTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGG", true} - bbsI, err := enzymeManager.GetEnzymeByName("BbsI") - if err != nil { - log.Fatalf("Something went wrong when trying to get the enzyme. Got error: %s", err) - } - plasmid, err := clone.GoldenGate([]clone.Part{fragment1, fragment2, popen}, bbsI, false) + plasmid, _, err := clone.GoldenGate([]clone.Part{fragment1, fragment2, popen}, clone.DefaultEnzymes["BbsI"], false) if err != nil { log.Fatalf("Failed to GoldenGate. Got error: %s", err) } diff --git a/lib/clone/kmer.go b/lib/clone/kmer.go new file mode 100644 index 00000000..27d340fc --- /dev/null +++ b/lib/clone/kmer.go @@ -0,0 +1,74 @@ +package clone + +import ( + "errors" + "strings" + + "github.com/koeng101/dnadesign/lib/align/megamash" + "github.com/koeng101/dnadesign/lib/bio/fastq" + "github.com/koeng101/dnadesign/lib/transform" +) + +// KmerOverlap represents the overlap between two fragments indicative of a +// ligation event. +type KmerOverlap struct { + Kmer string + Fragment1 Fragment + Fragment2 Fragment +} + +// FindKmerOverlaps finds kmerOverlaps from ligation reactions. It can be used +// with FindKmers to find ligation events in sequence data. +// +// Here is the problem: You have sequenced a GoldenGate or ligation you've run +// to assemble some DNA. How do you quantify the efficiency of the ligation +// from this raw data? You may want to do this to make sure your ligations are +// working properly in a way that simple controls wouldn't: you can *directly* +// observe single molecules of interest that are ligated and ones which aren't. +// +// How would you do this? The simplest version, which we implement here, is to +// check whether or not kmers indicative of ligation exist within the sequenced +// fragments. This is simple and computationally inexpensive. +func FindKmerOverlaps(fragments []Fragment, ligationProduct string, ligationPattern []int, kmerSize int) ([]KmerOverlap, error) { + if len(fragments) < 2 { + return []KmerOverlap{}, errors.New("need at least two fragments to find overlaps") + } + bpFromEach := (kmerSize - 4) / 2 // bp needed from each side + if bpFromEach < 4 { + return []KmerOverlap{}, errors.New("need at least a kmer of 12") + } + + ligation := ligationProduct + ligationProduct // double for circularization + position := 0 // index position + var kmer string + var kmerOverlaps []KmerOverlap + + for i := 0; i < len(ligationPattern); i++ { + var frag1, frag2 Fragment + frag1 = fragments[ligationPattern[i]] + if len(fragments)-1 == i { // Account for last fragment + frag2 = fragments[0] + } else { + frag2 = fragments[ligationPattern[i+1]] + } + + position = position + len(frag1.ForwardOverhang) + len(frag1.Sequence) + kmer = megamash.StandardizeDNA(ligation[position-bpFromEach : position+4+bpFromEach]) + kmerOverlaps = append(kmerOverlaps, KmerOverlap{Kmer: kmer, Fragment1: frag1, Fragment2: frag2}) + } + return kmerOverlaps, nil +} + +// FindKmers finds kmers indicative of ligation events within fastq sequencing +// reads. If you need to just search raw sequence and not fastq reads, you can +// simply fake a fastq: we only use sequence data from it in this function. +func FindKmers(kmerOverlaps []KmerOverlap, read fastq.Read) []KmerOverlap { + var outputKmerOverlaps []KmerOverlap + sequence := strings.ToUpper(read.Sequence) + for _, kmerOverlap := range kmerOverlaps { + if strings.Contains(sequence, strings.ToUpper(kmerOverlap.Kmer)) || strings.Contains(sequence, strings.ToUpper(transform.ReverseComplement(kmerOverlap.Kmer))) { + outputKmerOverlaps = append(outputKmerOverlaps, KmerOverlap{Kmer: kmerOverlap.Kmer, Fragment1: kmerOverlap.Fragment1, Fragment2: kmerOverlap.Fragment2}) + } + } + return outputKmerOverlaps +} diff --git a/lib/clone/kmer_test.go b/lib/clone/kmer_test.go new file mode 100644 index 00000000..6ee6be55 --- /dev/null +++ b/lib/clone/kmer_test.go @@ -0,0 +1,73 @@ +package clone + +import ( + _ "embed" + "strings" + "testing" + + "github.com/koeng101/dnadesign/lib/bio" + "github.com/koeng101/dnadesign/lib/transform" +) + +//go:embed data/ligations.fastq +var ligationReads string + +func TestKmer(t *testing.T) { + parts := []Part{{Sequence: transform.ReverseComplement("CACTCGATAGGTACAACCGGGGTCTCTGTCTCAGCACCGGCAGCAGCAGCACGCGCAGGTGGTCCGGACATCTCGGAGGCAGGTGCACTGGGTACGCGCGCACTGGCACGCGCATGGCTGGCATACGGTATCACGGCAGAGGACGTGAAGCGCATCCTGGACACGCTGGGTCTGGGTTCGGGTGAGTTCGGTTTCCGCCAGCAGGTGCTGGAGGACACGATCCAGCCGGCAATCGCATCGGGTGCAGGTCTGCACTTCGTGGACCTGAGACCGCCATCCTCTTATCTCGTGGCATTGAGT"), Circular: false}, {Sequence: "CACTCGATAGGTACAACCGGGGTCTCTGACCCGGAGGCAGCAGCAGGTTACCTGGCAGAGCTGGTGAAGCGCTCGTACTCGAAGGCAGTGCCGGCAGCAGTGAAGGCAGAGGACTGGCTGAAGAAGATCGCACTGCTGGTGGCATCGGAGTCGACGTCGTCGTCGGGTCTGAAGATCCTGAAGTCGCCGCAGCCGGTGAAGTGGGTGTCGCCGGACGGTTTCCCGGTGATCCTGGACCCGAAGAAGCCGATCCAGATGAGACCGCCATCCTCTTATCTCGTGGCCCCGAGCATCAGCGGA", Circular: false}, {Sequence: "CACTCGATAGGTACAACCGGGGTCTCTCAGACGCGCCTGCAGCTGATGTTCCTGGGTCAGTTCACGCTGCAGCCGACGATCAACACGAACAAGGACTCGGAGATCGACAAGGAGAAGCTGGCATCGGGTATCGCACCGAACTTCGTGGCATCGCAGATCGCATCGCTGGTGCGCCGCACGGTGGTGCTGGCACACGAGAAGTACGGTATCAAGTCGTTCTGGATCGACAAGGACGAGTACGGTACGATCCCGGCACAGATGGACAATGAGACCGCCATCCTCTTATCTCGTGGTCTCAGC", Circular: false}, {Sequence: "CACTCGATAGGTACAACCGGGGTCTCTACAAGCTGAAGGCAGCAATCAAGGAGTCGCTGGTGGAGACGTACACGAACAACGACCTGCTGGAGAACCTGCGCGACCTGGTGGAGAAGGAGGGTTGCCCGGGTGACTGCTCGTCGGTGCCGGAGCTGCCGCCGAAGGGTGACCTGGACCTGAACGAGATCCTGAAGTCGGAGTACGGTGGTTTCTAAATCCCGAGTGAGACCGCCATCCTCTTATCTCGTGGGGGTTAATCTAGTCAGCGCCACGACTACTTCCGCTTCCCCACATAAGCAG", Circular: false}} + pOpenV3Methylated := Part{"AGGGTAATGGTCTCTCGAGACcAAGTCGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACACACATTAACAAATTTCGTGAGGAGTCTCCAGAAGAATGCCATTAATTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGGCCTACTATTAGCAACAACGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGAACCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGACGGCCAGTGACTTgGTCTCGAGACCTAGGGATA", false} + parts = append(parts, pOpenV3Methylated) + var fragments []Fragment + for _, part := range parts { + newFragments := CutWithEnzyme(part, true, DefaultEnzymes["BsaI"], true) + fragments = append(fragments, newFragments...) + } + + // First, ligate the fragments + ligation, ligationPattern, err := Ligate(fragments, true) + if err != nil { + t.Errorf("Failed to ligate: %s", err) + } + + // Now, find kmer overlaps + kmerOverlaps, err := FindKmerOverlaps(fragments, ligation, ligationPattern, 16) + if err != nil { + t.Errorf("Failed to find kmer overlaps: %s", err) + } + expectedOverlaps := []string{"GACTTGGTCTCAGCAC", "AAATCCCGAGACCAAG", "AGATGGACAAGCTGAA", "CCGATCCAGACGCGCC", "CCTCCGGGTCCACGAA"} + for i := range kmerOverlaps { + if kmerOverlaps[i].Kmer != expectedOverlaps[i] { + t.Errorf("Expected %s on kmerOverlap %d, Got: %s", expectedOverlaps[i], i, kmerOverlaps[i].Kmer) + } + } + + // Now, find kmers in example sequences + // These are from real reads from a real ligation + // + // Interestingly enough, read #2 (idx 1) is an example of where this fails. + // It definitely has a ligation event with kmer "CCTCCGGGTCCACGAA" that + // is mutated by a single base pair, so is not detected. It is actually + // a triple ligation, in that it ligated to part of a mal-PCRed vector + // in addition to the two fragments, and that ligation event is sequence + // correct. + parser := bio.NewFastqParser(strings.NewReader(ligationReads)) + reads, _ := parser.Parse() + expectedOverlapResults := []string{"CCTCCGGGTCCACGAA", "GACTTGGTCTCAGCAC", "CCTCCGGGTCCACGAA"} + for i, read := range reads { + overlaps := FindKmers(kmerOverlaps, read) + if overlaps[0].Kmer != expectedOverlapResults[i] { + t.Errorf("Failed to find correct overlap in read %d. Expected: %s Got: %s", i, expectedOverlapResults[i], overlaps[0].Kmer) + } + } +} + +func TestKmerFailure(t *testing.T) { + // Tests failure cases + _, err := FindKmerOverlaps([]Fragment{}, "", []int{}, 0) + if err == nil { + t.Errorf("Expected: need at least two fragments to find overlaps") + } + _, err = FindKmerOverlaps([]Fragment{{}, {}}, "", []int{}, 0) + if err == nil { + t.Errorf("Expected: need at least a kmer of 12") + } +}