-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_db.py
69 lines (55 loc) · 1.93 KB
/
generate_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import urllib2
import StringIO
import gzip
import sqlite3
import os
# Generate a database of gene ids and orthologs.
#
# Running this script rebuilds "wb.db" from scratch: it downloads two gzipped
# WormBase annotation files and loads them into two FTS3 virtual tables,
# `idset` (gene identifiers) and `orthodb` (one ortholog row per gene).

DB_PATH = "wb.db"
GENE_IDS_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/geneIDs/c_elegans.PRJNA13758.current.geneIDs.txt.gz"
ORTHOLOGS_URL = "ftp://ftp.wormbase.org/pub/wormbase/species/c_elegans/annotation/orthologs/c_elegans.PRJNA13758.current.orthologs.txt.gz"


def _fetch_gzipped_text(url):
    """Download `url` and return its gunzipped contents as one string."""
    response = urllib2.urlopen(url)
    try:
        # GzipFile needs a seekable file object, so buffer the payload in memory.
        compressed = StringIO.StringIO(response.read())
    finally:
        # Release the network handle even if the read fails.
        response.close()
    return gzip.GzipFile(fileobj=compressed, mode='rb').read()


def _parse_gene_rows(text):
    """Turn the comma-separated gene ID listing into rows for `idset`.

    The first comma-separated field of every line is dropped. A trailing
    "match" value is appended to each row: the first non-empty value among
    fields 1 and 2, falling back to field 0 (the WBID column).
    Empty lines are skipped.
    """
    rows = []
    for line in text.splitlines():
        if not line.strip():
            continue
        row = line.split(",")[1:]
        row.append(row[1] or row[2] or row[0])
        rows.append(row)
    return rows


def _parse_ortholog_rows(text):
    """Turn the ortholog listing into rows for `orthodb`.

    Records are separated by a line containing only "="; everything before
    the first separator is discarded. Per the notes kept with this script,
    a record looks like:

        WBGeneID <TAB> PublicName
        Species <TAB> Ortholog <TAB> Public_name <TAB> MethodsUsedToAssignOrtholog

    Empty records are skipped.
    """
    rows = []
    for record in text.split("=\n")[1:]:
        lines = record.splitlines()
        if not lines:
            continue
        wbid, gene = lines[0].split("\t")
        # NOTE(review): index 2 is the third line of the record, so only one
        # ortholog line per gene is stored and records with fewer than three
        # lines raise IndexError. Kept as-is; confirm against the file format
        # whether index 1 (or every line after the first) was intended.
        rows.append([wbid, gene] + lines[2].split("\t"))
    return rows


# Remove database if it exists
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
try:
    genes = _parse_gene_rows(_fetch_gzipped_text(GENE_IDS_URL))

    c = conn.cursor()

    # Create table
    c.execute('''CREATE VIRTUAL TABLE idset using
                 fts3(WBID,sequence,gene,live,match);''')

    # Insert genes
    c.executemany('INSERT INTO idset VALUES (?,?,?,?,?);', genes)

    # Load Ortholog Database
    orthodb = _parse_ortholog_rows(_fetch_gzipped_text(ORTHOLOGS_URL))

    # Create table
    c.execute('''CREATE VIRTUAL TABLE orthodb using
                 fts3(WBID,sequence,species,ortholog, ortholog_name,method_to_assign);''')

    # Insert genes
    c.executemany('INSERT INTO orthodb VALUES (?,?,?,?,?,?);', orthodb)

    conn.commit()
    c.close()
finally:
    # Always release the database handle, including when a download,
    # parse, or insert step fails part-way through.
    conn.close()