-
Notifications
You must be signed in to change notification settings - Fork 1
/
chunk_chink.py
51 lines (41 loc) · 1.61 KB
/
chunk_chink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# Load the 2006 State of the Union address from NLTK's state_union corpus.
# BUG FIX: the fileid is '2006-GWBush.txt' (no space); the original
# '2006- GWBush.txt' raises a missing-resource error at runtime.
text = state_union.raw('2006-GWBush.txt')

# Constructing PunktSentenceTokenizer with a text trains it (unsupervised)
# on that text, adapting sentence-boundary detection to this corpus.
custom_tnzr = PunktSentenceTokenizer(text)
tokenized = custom_tnzr.tokenize(text)  # list of sentence strings
def process_content():
    """POS-tag each tokenized sentence and chunk noun phrases.

    Grammar: a "Chunk" is zero or more adverbs (RB*), zero or more verbs
    (VB*), a proper noun (NNP), and an optional common noun (NN).  Angle
    brackets delimit regex patterns over POS-tag names.  Each chunk tree
    is printed and drawn (draw() opens a Tk window — requires a display).
    """
    # regex pattern to define the required parts of speech
    chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}'''
    # BUG FIX: the original assigned the RegexpParser *class* without
    # instantiating it with the grammar, so .parse() failed (and the
    # broad except silently printed the error).  Build the parser once,
    # outside the loop — the grammar is loop-invariant.
    chunk_parser = nltk.RegexpParser(chunk_gram)
    try:
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            chunked.draw()  # visual representation of the chunk tree
    except Exception as e:
        # Best-effort demo script: report and continue (e.g. no display
        # available for draw()).
        print(str(e))


process_content()
# chunking- Grouping things
#chinking- Removing some things from a chunk
#CHINKING
def process2_content():
    """POS-tag each tokenized sentence, chunk everything, then chink.

    Chunking groups tokens; chinking removes tokens from a chunk.  Here
    the grammar first chunks every token (``{<.*>+}``) and then chinks
    out verbs, prepositions, and determiners (``}<VB.?|IN|DT>+{``).
    Each resulting tree is printed and drawn (requires a display).
    """
    # BUG FIX (grammar): the original chink was '}< VB.?|IN|DT>+ {' — the
    # space inside the angle brackets means the pattern ' VB.?' can never
    # match the tag 'VB', so the chink silently matched nothing.
    chunk_gram = r'''Chunk: {<.*>+}
                           }<VB.?|IN|DT>+{'''
    # BUG FIX: instantiate RegexpParser with the grammar; the original
    # assigned the bare class, so .parse() failed.  Built once, outside
    # the loop, since the grammar never changes.
    chunk_parser = nltk.RegexpParser(chunk_gram)
    try:
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(tagged)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            chunked.draw()  # visual representation of the chunk tree
    except Exception as e:
        # Best-effort demo script: report and continue.
        print(str(e))


process2_content()