-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta.go
219 lines (194 loc) · 6.23 KB
/
fasta.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
/*
Package fasta contains fasta parsers and writers.
Fasta is a flat text file format developed in 1985 to store nucleotide and
amino acid sequences. It is extremely simple and well-supported across many
languages. However, this simplicity means that annotation of genetic objects
is not supported.
This package provides a parser and writer for working with Fasta formatted
genetic sequences.
*/
package fasta
import (
"bufio"
"bytes"
"fmt"
"io"
)
/******************************************************************************
Apr 25, 2021
Fasta Parser begins here
Many thanks to Jordan Campbell (https://github.com/0x106) for building the first
parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles)
for helping complete that PR. This work expands on the previous work by allowing
for concurrent parsing and giving Poly a specific parser subpackage,
as well as a few bug fixes.
Fasta is a very simple file format for working with DNA, RNA, or protein sequences.
It was first released in 1985 and is still widely used in bioinformatics.
https://en.wikipedia.org/wiki/FASTA_format
One interesting use of the concurrent parser is working with the Uniprot
fasta dump files, which are far too large to fit into RAM. This parser is able
to easily handle those files by doing computation actively while the data dump
is getting parsed.
https://www.uniprot.org/downloads
I have removed the Parsers from the io.go file and moved them into this
subpackage.
Hack the Planet,
Keoni
******************************************************************************/
// Record is a single fasta entry: an identifier together with its sequence.
type Record struct {
// Identifier is the record's name. Records produced by Parser hold the
// identifier line's text with the leading '>' removed.
Identifier string `json:"identifier"`
// Sequence is the record's sequence data. Records produced by Parser hold
// the sequence lines joined together without line breaks.
Sequence string `json:"sequence"`
}
// Header is an empty placeholder type, needed for compatibility with bio
// parsers. It has no fields and carries no data.
type Header struct{}
// WriteTo is a no-op kept for compatibility with bio parsers. It writes
// nothing to w and always reports zero bytes written and a nil error.
func (h *Header) WriteTo(w io.Writer) (int64, error) {
	const written int64 = 0
	return written, nil
}
// Parser is a flexible parser that provides ample
// control over reading fasta-formatted sequences.
// It is initialized with NewParser.
type Parser struct {
// scanner keeps state of current reader.
scanner bufio.Scanner
// buff accumulates the sequence lines of the record currently being read.
buff bytes.Buffer
// identifier is the identifier line (without the leading '>') of the
// record currently being read.
identifier string
// start is true until the first identifier line has been seen.
start bool
// line counts the lines scanned so far; it is used in error messages.
line uint
// more is true until the scanner stops producing lines; once false,
// Next returns io.EOF.
more bool
}
// Header reads nothing from the input. It always returns an empty Header
// value together with a nil error.
func (p *Parser) Header() (Header, error) {
	var empty Header
	return empty, nil
}
// NewParser returns a Parser that uses r as the source
// from which to parse fasta formatted sequences.
//
// maxLineSize is the size in bytes of the scanner's line buffer. A line that
// does not fit into that buffer makes the scanner stop with
// bufio.ErrTooLong. A maxLineSize of zero or less selects
// bufio.MaxScanTokenSize; previously a negative value panicked while
// allocating the buffer and zero produced a parser unable to read any line.
func NewParser(r io.Reader, maxLineSize int) *Parser {
	if maxLineSize <= 0 {
		maxLineSize = bufio.MaxScanTokenSize
	}
	scanner := bufio.NewScanner(r)
	scanner.Buffer(make([]byte, maxLineSize), maxLineSize)
	return &Parser{
		scanner: *scanner,
		start:   true,
		more:    true,
	}
}
// Next reads the next fasta record from the underlying reader.
//
// Empty lines and comment lines (those starting with ';') are skipped. A
// line starting with '>' begins a new record, and every other line is
// appended to the sequence of the current record.
//
// Next returns an error when:
//   - sequence data appears before the first '>' identifier line,
//   - no sequence data was collected for the current identifier (this is
//     also what empty input produces),
//   - the underlying scanner fails, for example with bufio.ErrTooLong when a
//     line exceeds the buffer configured in NewParser; the returned error
//     wraps the scanner's error,
//   - the input has already been exhausted, in which case io.EOF is returned.
//
// The last record of the input is returned with a nil error; the call after
// that returns io.EOF.
func (p *Parser) Next() (Record, error) {
	if !p.more {
		return Record{}, io.EOF
	}
	for p.scanner.Scan() {
		line := p.scanner.Bytes()
		p.line++
		switch {
		case len(line) == 0 || line[0] == ';':
			// Blank line or comment: nothing to record.
			continue
		case line[0] == '>':
			identifier := string(line[1:])
			if p.start {
				// First identifier of the input: nothing to emit yet.
				p.identifier = identifier
				p.start = false
				continue
			}
			// A new identifier closes the record collected so far.
			record, err := p.newRecord()
			p.identifier = identifier
			return record, err
		case p.start:
			// Sequence data before any identifier line.
			return Record{}, fmt.Errorf("invalid input: missing sequence identifier for sequence starting at line %d", p.line)
		default:
			p.buff.Write(line)
		}
	}
	p.more = false
	// Scan also returns false on failure. Report it rather than returning a
	// silently truncated record as if the input had ended cleanly.
	if err := p.scanner.Err(); err != nil {
		return Record{}, fmt.Errorf("reading fasta input after line %d: %w", p.line, err)
	}
	// Emit the final record of the input.
	return p.newRecord()
}
// newRecord turns the identifier and the sequence collected so far into a
// Record and then empties the sequence buffer for the next record. When no
// sequence data has been collected it returns an empty Record and an error
// naming the current identifier.
func (p *Parser) newRecord() (Record, error) {
	if p.buff.Len() == 0 {
		return Record{}, fmt.Errorf("%s has no sequence", p.identifier)
	}
	rec := Record{Identifier: p.identifier, Sequence: p.buff.String()}
	// Clear the buffer so the next record starts from scratch.
	p.buff.Reset()
	return rec, nil
}
///******************************************************************************
//
//Start of Write functions
//
//******************************************************************************/
// WriteTo implements the io.WriterTo interface for fasta records.
//
// The record is written as a '>' line holding the identifier, followed by
// the sequence with a line break after every 80 bytes, followed by "\n\n".
// A sequence whose length is an exact multiple of 80 therefore ends with one
// additional empty line; this is kept so output stays byte-compatible with
// earlier versions.
//
// The sequence bytes are written unchanged, so multi-byte characters are not
// altered. WriteTo returns the number of bytes actually written to w,
// including those of a failed partial write, and the first error
// encountered.
func (record *Record) WriteTo(w io.Writer) (int64, error) {
	const lineWidth = 80

	var written int64
	// emit writes s to w and adds whatever reached w to the running total,
	// also when the write failed part-way.
	emit := func(s string) error {
		n, err := io.WriteString(w, s)
		written += int64(n)
		return err
	}

	for _, part := range []string{">", record.Identifier, "\n"} {
		if err := emit(part); err != nil {
			return written, err
		}
	}

	// Write the sequence one full line at a time rather than byte by byte.
	seq := record.Sequence
	for len(seq) >= lineWidth {
		if err := emit(seq[:lineWidth]); err != nil {
			return written, err
		}
		if err := emit("\n"); err != nil {
			return written, err
		}
		seq = seq[lineWidth:]
	}
	if len(seq) > 0 {
		if err := emit(seq); err != nil {
			return written, err
		}
	}

	err := emit("\n\n")
	return written, err
}