Skip to content

Commit f1fa69f

Browse files
author
Brian C. Thomas
committed
initial commit
0 parents  commit f1fa69f

18 files changed

+1045
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
*.swp
2+
*.o
3+
*.gch

Makefile

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Makefile for pullseq: extract sequences from a fasta/fastq file.
#
# Targets:
#   default / build  compile and link ./pullseq
#   debug            build with -DDEBUG and optimisation disabled
#   dist             create $(ARCHIVE).tar.gz from the sources
#   clean            remove objects and the binary
#   distclean        clean + remove release tarballs

# Name of the binary; also used for the archive name and by `clean`.
PROGRAM_NAME = pullseq
VERSION = 0.1
CC = gcc
# Mandatory preprocessor flags live in CPPFLAGS so that overriding CFLAGS
# (as the debug target does) cannot drop the VERSION definition.
CPPFLAGS = -DVERSION=$(VERSION)
CFLAGS = -g -Wall -pedantic
DEBUG = -g
OPT = -O3
ARCHIVE = $(PROGRAM_NAME)_$(VERSION)
LDFLAGS =
# Libraries go AFTER the objects on the link line; with --as-needed (the
# default on several distributions) a library listed first is discarded.
LDLIBS = -lz
SDIR = src
BDIR = build

OBJS = file_read.o linked_list.o pull_from_list.o pullseq.o

.PHONY: clean default build distclean dist debug

# Remove a half-written target when its recipe fails.
.DELETE_ON_ERROR:

default: build

build: $(PROGRAM_NAME)

# Link with $^ (every object), not $? (only the objects newer than the
# target), otherwise an incremental build links an incomplete program.
$(PROGRAM_NAME): $(OBJS)
	$(CC) $(CFLAGS) $(OPT) $(LDFLAGS) $^ $(LDLIBS) -o $@

# Objects are written to the top-level directory, one per source in $(SDIR).
$(OBJS): %.o: $(SDIR)/%.c
	$(CC) $(CPPFLAGS) $(CFLAGS) $(OPT) -c $< -o $@

# Extra header prerequisites for the objects above.
file_read.o: $(SDIR)/file_read.h
linked_list.o: $(SDIR)/linked_list.h
pull_from_list.o: $(SDIR)/pull_from_list.h $(SDIR)/kseq.h

clean:
	$(RM) *.o $(BDIR)/*.o $(SDIR)/*.gch $(PROGRAM_NAME)

distclean: clean
	$(RM) *.tar.gz

dist:
	tar -zcf $(ARCHIVE).tar.gz $(SDIR)/*.c $(SDIR)/*.h Makefile readme.txt

# Run `make clean` first when switching between a normal and a debug build;
# up-to-date objects are not recompiled just because the flags changed.
debug:
	$(MAKE) build "CFLAGS=-Wall -pedantic $(DEBUG) -DDEBUG" OPT=-O0
42+

code.txt

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
A) size filters
2+
Open fasta/fastq file
3+
Foreach fasta/fastq in file
4+
o if sequence length passes the size filters, print
5+
6+
B) contig list filters
7+
Open contig list file
8+
o read in first word on each line
9+
o store in a linked list
10+
11+
Open fasta/fastq file
12+
Foreach fasta/fastq in file
13+
o if sequence name is found in the contig list, print
14+
15+
main()
16+
17+

readme.txt

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
Summary:
2+
pullseq: extract sequences from a fasta/fastq file
3+
4+
Synopsis:
5+
# general extraction with a list of names
6+
pullseq --input=<input fasta/fastq file> --names=<fasta header ids file>
7+
8+
# general extraction with a minimum size requirement
9+
pullseq --input=<input fasta/fastq file> --min=<minimum size sequence to extract>
10+
11+
# only sequences with min 200 and max 500
12+
pullseq -i input.fasta -m 200 -a 500 > new.fasta
13+
14+
Options:
15+
--input, -i <s>: contig file (fasta/fastq format)
16+
--names, -n <s>: file containing single column list of contig_ids
17+
--min, -m <i>: minimum size contig to pull
18+
--max, -a <i>: maximum size contig to pull
19+
--exclude, -e: contigs list will be excluded
20+
--version, -v: Print version and exit
21+
--help, -h: Show this message
22+
23+
24+
25+
26+
# c parts
27+
1. sequence parts
28+
kseq.h:
29+
kseq_t
30+
kstring_t
31+
32+
2. filter file list
33+
34+
struct
35+
list
36+
37+
linked_list
38+
readline # read from input
39+
FILE
40+
parseline # grab 1st word
41+
42+
main
43+
filter # find matching header or minimum/maximum seq length
44+
print # dump to file or if -o option, open file and dump
45+
46+
Scenarios
47+
1) given a fasta file and a file of header names to select
48+
open header names file
49+
read each line and extract the first word
50+
create a linked_list node for the word
51+
close the file
52+
open the fasta file
53+
read each kseq_t entry
54+
search for kseq_t entry in linked list of header names
55+
if found, print
56+
close fasta file
57+
58+
2) given a fasta file and a minimum length
59+
open the fasta file
60+
read each kseq_t entry
61+
search for kseq_t entry where sequence length >= cutoff
62+
if found, print
63+
close fasta file

src/file_read.c

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <string.h>
4+
#include <errno.h>
5+
#include <limits.h>
6+
7+
#include "file_read.h"
8+
9+
/*
 * parse_name - isolate the first whitespace-delimited word of a line.
 *
 * line: modifiable, NUL-terminated string; may be NULL.
 *
 * Leading spaces, tabs and newlines are skipped and the delimiter that
 * follows the first word (if any) is overwritten with '\0', so the
 * returned pointer refers to a NUL-terminated word stored inside `line`.
 *
 * Returns a pointer into `line`, or NULL when `line` is NULL or contains
 * nothing but delimiters.
 *
 * Implemented with strspn/strcspn rather than strtok so the call does not
 * disturb strtok's hidden global state and is safe to use re-entrantly.
 */
char *parse_name(char *line)
{
    static const char delims[] = " \t\n"; /* space, tab, newline */
    char *word;
    char *end;

    if (line == NULL)
        return NULL;

    word = line + strspn(line, delims);   /* skip leading delimiters */
    if (*word == '\0')
        return NULL;                      /* empty or delimiter-only line */

    end = word + strcspn(word, delims);   /* first delimiter after the word */
    if (*end != '\0')
        *end = '\0';

    return word;
}
16+
17+
#define BUFSIZE 80

/*
 * getl - read one line from fp into a heap-allocated buffer.
 *
 * lineptr: address of the caller's buffer pointer. If *lineptr is NULL a
 *          buffer of BUFSIZE bytes is allocated. Otherwise *lineptr must
 *          point to malloc'd storage of at least BUFSIZE bytes: the
 *          interface carries no capacity, so BUFSIZE is assumed on entry.
 *          The buffer may be moved by realloc; *lineptr is updated. The
 *          caller owns the buffer and must free() it.
 * fp:      stream to read from.
 *
 * The line is stored including its trailing '\n' (when present) and is
 * always NUL-terminated, also on the error paths.
 *
 * Returns the number of characters stored (excluding the NUL), or -1 at
 * end of file with nothing read, on a read error, on invalid arguments
 * (errno = EINVAL), or when the line cannot be stored (errno = ENOMEM for
 * size overflow; realloc's errno when allocation fails).
 */
int getl(char **lineptr, FILE *fp)
{
    int ch;                 /* int, not char: EOF must differ from every byte value */
    size_t len = 0;         /* characters stored so far */
    size_t cap = BUFSIZE;   /* assumed capacity of *lineptr */
    char *grown;

    if (lineptr == NULL || fp == NULL) {
        errno = EINVAL;
        return -1;
    }

    errno = 0;

    if (*lineptr == NULL) {
        *lineptr = malloc(cap);
        if (*lineptr == NULL)
            return -1;      /* out of memory */
    }

    /* Invariant: len + 1 <= cap, so there is always room for the NUL. */
    while ((ch = getc(fp)) != EOF) {
        if (len == (size_t)INT_MAX) {
            /* One more character would overflow the int return value. */
            ungetc(ch, fp);
            (*lineptr)[len] = '\0';
            errno = ENOMEM;
            return -1;
        }

        if (len + 2 > cap) {        /* need room for ch and the NUL */
            if (cap > (size_t)-1 / 2) {
                /* Doubling would overflow size_t. */
                ungetc(ch, fp);
                (*lineptr)[len] = '\0';
                errno = ENOMEM;
                return -1;
            }
            grown = realloc(*lineptr, cap * 2);
            if (grown == NULL) {
                /* The old buffer is still valid; give the character back. */
                ungetc(ch, fp);
                (*lineptr)[len] = '\0';
                return -1;
            }
            *lineptr = grown;
            cap *= 2;
        }

        (*lineptr)[len++] = (char)ch;

        if (ch == '\n')             /* end of line */
            break;
    }

    (*lineptr)[len] = '\0';

    if (ch == EOF && ferror(fp))    /* read error rather than end of file */
        return -1;

    if (len == 0)                   /* nothing left in the file */
        return -1;

    return (int)len;
}
93+
94+
/*
 * getlx - fgets-based line reader; stores the line WITHOUT its newline.
 *
 * iline: address of the caller's buffer pointer. *iline must be NULL or
 *        point to malloc'd storage; it is resized with realloc to a known
 *        starting capacity and grown as needed. *iline is updated every
 *        time the buffer moves, so the caller never holds a stale pointer.
 *        The caller owns the buffer and must free() it.
 * fp:    stream to read from.
 *
 * Returns the length of the line (0 for an empty line), or -1 at end of
 * file with nothing read, on a read error, on allocation failure, or on
 * invalid arguments (errno = EINVAL). Read and allocation failures are
 * also reported on stderr.
 */
int getlx(char **iline, FILE *fp)
{
    char *line;
    char *grown;
    char *eol;
    size_t capacity = 128;  /* starting capacity, enforced by realloc below */
    size_t used = 0;        /* characters of the line stored so far */
    size_t room;
    int got_data = 0;       /* set once fgets delivers anything */

    if (iline == NULL || fp == NULL) {
        errno = EINVAL;
        return -1;
    }

    /* Normalise the buffer to a known size; realloc(NULL, n) allocates. */
    line = realloc(*iline, capacity);
    if (line == NULL) {
        fprintf(stderr, "getl - realloc: %s\n", strerror(errno));
        return -1;
    }
    *iline = line;
    line[0] = '\0';

    for (;;) {
        room = capacity - used;
        if (room > (size_t)INT_MAX)     /* fgets takes an int size */
            room = (size_t)INT_MAX;

        if (fgets(line + used, (int)room, fp) == NULL)
            break;                      /* end of file or read error */
        got_data = 1;

        eol = strchr(line + used, '\n');
        if (eol != NULL) {
            *eol = '\0';                /* strip the newline */
            used = (size_t)(eol - line);
            break;
        }

        used += strlen(line + used);

        if (used + 1 >= capacity) {
            /* Buffer is full and the line continues: double it. */
            if (capacity > (size_t)-1 / 2) {
                errno = ENOMEM;
                return -1;
            }
            grown = realloc(line, capacity * 2);
            if (grown == NULL) {
                fprintf(stderr, "getl - realloc: %s\n", strerror(errno));
                return -1;
            }
            line = grown;
            *iline = line;
            capacity *= 2;
        }
        /* Otherwise the final line had no newline; the next fgets hits EOF. */
    }

    if (ferror(fp)) {
        fprintf(stderr, "getl - fgets: %s\n", strerror(errno));
        return -1;
    }

    if (!got_data)                      /* nothing left in the file */
        return -1;

    if (used > (size_t)INT_MAX) {       /* length does not fit the return type */
        errno = ENOMEM;
        return -1;
    }

    return (int)used;
}

src/file_read.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
/*
 * file_read.h - line reading and name parsing helpers (see file_read.c).
 */
#ifndef __FILE_READ_H__
#define __FILE_READ_H__

#include <stdio.h>  /* FILE, so the header can be included on its own */

/* Read one line from fp into *line (allocated or grown as needed). */
int getl(char **line, FILE *fp);

/* Return the first whitespace-delimited word of line, terminated in place. */
char *parse_name(char *line);

#endif /* __FILE_READ_H__ */

src/global.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
/*
 * global.h - feature-test macro and program-wide globals.
 *
 * Include this header before any system header: _POSIX_C_SOURCE only has
 * an effect when it is defined ahead of the first system include.
 */
#ifndef __GLOBAL_H__
#define __GLOBAL_H__

#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

/* `static`: every translation unit that includes this header gets its own
 * private copy of the flag. */
static int verbose_flag;

/* NOTE(review): this is a (tentative) definition, not a declaration. When
 * more than one .c file includes this header the link fails under
 * -fno-common; consider `extern char const *progname;` here with a single
 * definition in one source file. */
char const *progname;

#endif /* __GLOBAL_H__ */

0 commit comments

Comments
 (0)