-
Notifications
You must be signed in to change notification settings - Fork 1
/
highlight.py
168 lines (131 loc) · 5.86 KB
/
highlight.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
def highlightDoc(doc, query):
"""
Select to display the most relevant snippet that contains all query words.
Highlight all query words in the snippet.
Args:
doc - String that is a document to be highlighted
query - String that contains the search query
Returns:
The most relevant snippet with the query terms highlighted.
"""
# We walk through doc and find the minimum snippet that contains all
# query words. In general, we consider the minimum snippet the most
# relevant one because it usually carries what the writer trying to say
# convert doc into a list of words
words = re.findall(r'\w+', doc)
# init a queue of words that represent the current snippet
sequence = list(words[0])
# init start(inclusive) and end(exclusive) indexes of the min snippet
start, end = 0, 1
# total word count of doc, only compute once
word_count = len(words)
# init the minimum word count as total word count
min = word_count
# the start and end index of the min snippet queue, in a tuple
snippet = (start, end)
# find the smallest snippet because they tend to be the most relevant
# complexity O(2n) where n is the number of words in doc
while end < word_count:
# move on to find next snippet if query matches the current snippet
if inSnippet(sequence, query) and start < end:
# maintian the first min sequence of words because usually
# earlier snippet tends to be more relevant
if len(sequence) < min:
min = len(sequence)
# assign snippet as the result queue indexes
snippet = (start, end)
sequence.pop(0)
start += 1
# enqueue the next word to snippet
else:
sequence.append(words[end])
end += 1
# find minimal sentence(s) that contains the shortest snippet
sentence = findSentence(doc, snippet)
# highlight the query words in our sentence
result = highlightSentence(sentence, query)
return result
def inSnippet(sequence, query):
"""
Check if all words in query are presented in a list of words
Args:
sequence - a list of words
query - strings to match the sequence, assuming they are separated by
whitespaces
Returns:
True if all the words in the query string match the strings in sequence,
case insensitive. False otherwise
"""
text = (' ').join(sequence)
terms = query.split()
match = True
# O(m) where is m is number of words in query
for term in terms:
pattern = '(^|\s+)%s(\s+|$)' % term
if not re.search(pattern, text, re.I):
match = False
return match
def findSentence(doc, snippet):
"""
Find minimal sentences that contains words given their start and end index.
Args:
doc - the document for scanning
snippet - tuple containing the start and end indexes of some words
Returns:
Minimal sentence(s) that contains words given their start and end index
"""
# build a list to keep track of the start and end indexes of each sentence
pattern = re.compile(r'([^\s][^\.!?]*[\.!?])', re.M)
sentences = pattern.finditer(doc)
sent_indexes = [sentence.span() for sentence in sentences]
# build a list to keep track of the start and end indexes of each word
words_match = re.finditer(r'[^\s]+(?=\s*)', doc)
word_indexes = [word.span() for word in words_match]
# start and end index of current sentence
start, end = 0, 0
# finds the minimum sentence or sentences that contains the snippet
# complexity is O(n'), where n' is number of sentences in doc
while end < len(sent_indexes):
# if the beginning of snippet is after the end of a sentence, move
# to next sentence
if word_indexes[snippet[0]][0] >= sent_indexes[end][1]:
start += 1
end += 1
# if the snippet is within one sentence, return
elif word_indexes[snippet[0]][0] >= sent_indexes[start][0] and \
word_indexes[snippet[1]][1] <= sent_indexes[end][1]:
return doc[sent_indexes[start][0]:sent_indexes[end][1]]
# if the beginning of snippet is larger than start of a sentence,
# and end of the snippet is larger than the end of the sentence,
# expand end to next sentence
elif word_indexes[snippet[0]][0] < sent_indexes[start][1] and \
word_indexes[snippet[1]][1] > sent_indexes[end][1]:
end += 1
else:
pass
return doc[sent_indexes[start][0]:sent_indexes[end][1]]
def highlightSentence(sentence, query):
"""
Wrap [[HIGHLIGHT]] and [[ENDHIGHLIGH]] tags around query strings
in a sentence.
Args:
sentence - sentence to be highlighted
query - string that contains the words to be highlighted
Returns:
sentence after wrapping highlight tags around given queries
"""
# change query string to a|b|c where a, b, c are query words
terms = re.sub('\s+', '|', query)
regex = re.compile(r'(\s*)((?:\b\s*(?:%s)\b)+)' % terms, re.I)
return regex.sub(r'\1[[HIGHLIGHT]]\2[[ENDHIGHLIGHT]]', sentence)
if __name__ == "__main__":
# Overall complexity is approximately O(2n * m) + O(n')
# where n is number of words in the documentm and m is number of words in
# query string. n' is number of sentences in document. m is close to a
# constant in a search, and 2 can be ignored as a constant. We can also drop O(n').
# Therefore, the overall complexity is O(n)
print highlightDoc("I like fish. Little star's deep dish pizza sure is fantastic. \
Dogs are funny.", "deep dish pizza")