-
Notifications
You must be signed in to change notification settings - Fork 2
/
util_tag.py
106 lines (88 loc) · 2.9 KB
/
util_tag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Adapted from seqeval.metrics.sequence_labeling for more control
import warnings
def get_entities(seq):
    """
    Extract the labelled chunks from a sequence of IOB / IOBES style tags.

    Include start tag in return to later identify span starting with 'I' or 'E'.

    :param seq: iterable of tags, e.g. ['B-PER', 'I-PER', 'O', 'B-LOC'].
        Each tag is either a bare 'O', 'B', 'I', 'E' or 'S', or starts with
        'B-', 'I-', 'E-' or 'S-' followed by a type name.
    :return: [('PER', 0, 1, 'B'), ('LOC', 3, 3, 'B')] for the example above.
        Both offsets are inclusive. Tags without a type name get type '_'.
    :rtype: list of (chunk_type, chunk_start, chunk_end, start tag)
    :raises AssertionError: if a tag is not in one of the accepted forms.
    """
    def _validate_chunk(chunk):
        # Raised explicitly instead of through an ``assert`` statement so the
        # check is not stripped under ``python -O``. AssertionError is kept
        # so callers that already catch it keep working.
        if not (chunk in ['O', 'B', 'I', 'E', 'S']
                or chunk.startswith(('B-', 'I-', 'E-', 'S-'))):
            raise AssertionError(
                '{!r} is not a valid chunk tag'.format(chunk))

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    begin_tag = 'O'
    chunks = []
    # list(seq) accepts tuples and other iterables, not only lists.
    # The trailing 'O' is a sentinel that closes a chunk still open at the
    # end of the sequence.
    for i, chunk in enumerate(list(seq) + ['O']):
        _validate_chunk(chunk)
        tag = chunk[0]
        # 'B-PER' -> 'PER'; a bare tag such as 'O' or 'B' has no type -> '_'.
        type_ = chunk[1:].split('-', maxsplit=1)[-1] or '_'
        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1, begin_tag))  # Inclusive
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
            begin_tag = tag
        prev_tag = tag
        prev_type = type_
    return chunks
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk closes between the previous word and this one.

    Args:
        prev_tag: chunk tag of the previous word.
        tag: chunk tag of the current word.
        prev_type: chunk type of the previous word.
        type_: chunk type of the current word.

    Returns:
        bool: True if the previous word was the last word of a chunk.
    """
    # 'E' and 'S' always terminate the chunk they belong to.
    if prev_tag in ('E', 'S'):
        return True
    # A chunk still open on 'B'/'I' is cut off by a new start ('B'/'S')
    # or by an outside tag ('O').
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True
    # A type change closes the previous chunk, unless the previous word
    # was tagged 'O' or '.'.
    if prev_tag not in ('O', '.') and prev_type != type_:
        return True
    return False
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk opens between the previous word and this one.

    Args:
        prev_tag: chunk tag of the previous word.
        tag: chunk tag of the current word.
        prev_type: chunk type of the previous word.
        type_: chunk type of the current word.

    Returns:
        bool: True if the current word is the first word of a chunk.
    """
    # 'B' and 'S' always open a new chunk.
    if tag in ('B', 'S'):
        return True
    # 'E' or 'I' opens a chunk when no chunk is in progress, i.e. the
    # previous word closed one ('E'/'S') or was outside ('O').
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True
    # A type change opens a new chunk, unless the current word is tagged
    # 'O' or '.'.
    if tag not in ('O', '.') and prev_type != type_:
        return True
    return False