"""Tokenizers for natural language SQL queries, and lambda calculus."""
import nltk
import sqlparse
def nl_tokenize(string):
"""Tokenizes a natural language string into tokens.
Inputs:
string: the string to tokenize.
Outputs:
a list of tokens.
Assumes data is space-separated (this is true of ZC07 data in ATIS2/3).
"""
return nltk.word_tokenize(string)
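
# Illustrative usage sketch (not part of the original module): nl_tokenize
# defers to NLTK's word tokenizer, which requires the "punkt" models to be
# available (e.g., via nltk.download("punkt")). The query below is a made-up
# ATIS-style example.
#
#   >>> nl_tokenize("show me flights from boston to denver")
#   ['show', 'me', 'flights', 'from', 'boston', 'to', 'denver']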
def sql_tokenize(string):
    """Tokenizes a SQL statement into tokens.

    Inputs:
        string: string to tokenize.

    Outputs:
        a list of tokens.
    """
    tokens = []
    statements = sqlparse.parse(string)

    # sqlparse gives you a list of statements.
    for statement in statements:
        # Flatten the tokens in each statement and add to the tokens list.
        flat_tokens = sqlparse.sql.TokenList(statement.tokens).flatten()
        for token in flat_tokens:
            strip_token = str(token).strip()
            if len(strip_token) > 0:
                tokens.append(strip_token)

    # Merge dotted identifiers (e.g., "table", ".", "column") back into a
    # single "table.column" token; the component after the dot is consumed
    # by the merge and skipped on the next iteration.
    newtokens = []
    keep = True
    for i, token in enumerate(tokens):
        if token == ".":
            newtoken = newtokens[-1] + "." + tokens[i + 1]
            newtokens = newtokens[:-1] + [newtoken]
            keep = False
        elif keep:
            newtokens.append(token)
        else:
            keep = True

    return newtokens
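
# Illustrative usage sketch (not part of the original module): sqlparse
# flattens the statement into leaf tokens, and the dot-merging pass above
# rejoins qualified column names. Exact token boundaries can vary with the
# sqlparse version, so treat the expected output as approximate.
#
#   >>> sql_tokenize("SELECT flight.flight_id FROM flight ;")
#   ['SELECT', 'flight.flight_id', 'FROM', 'flight', ';']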
def lambda_tokenize(string):
    """Tokenizes a lambda-calculus statement into tokens.

    Inputs:
        string: a lambda-calculus string

    Outputs:
        a list of tokens.
    """
    space_separated = string.split(" ")

    new_tokens = []

    # Split the string on spaces first, then break each piece apart around
    # any ( or ) characters so parentheses become their own tokens.
    for token in space_separated:
        tokens = []
        current_token = ""
        for char in token:
            if char == ")" or char == "(":
                tokens.append(current_token)
                tokens.append(char)
                current_token = ""
            else:
                current_token += char
        tokens.append(current_token)

        new_tokens.extend([tok for tok in tokens if tok])

    return new_tokens
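
# Illustrative usage sketch (not part of the original module): parentheses
# become their own tokens even when not space-separated. The expressions
# below are made-up lambda-calculus strings in the style of ATIS logical
# forms.
#
#   >>> lambda_tokenize("( lambda $0 e ( flight $0 ) )")
#   ['(', 'lambda', '$0', 'e', '(', 'flight', '$0', ')', ')']
#   >>> lambda_tokenize("(lambda $0 (flight $0))")
#   ['(', 'lambda', '$0', '(', 'flight', '$0', ')', ')']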