forked from discourse/discourse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
word_watcher.rb
285 lines (233 loc) · 7.47 KB
/
word_watcher.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# frozen_string_literal: true
# Matches text against the site's watched words and applies the action they
# are registered for (censor, replace, link, flag, block, silence, require
# approval).
#
# Class-level methods load and compile the watched words; an instance wraps a
# piece of raw text and answers whether that text triggers a given action.
class WordWatcher
  # Character written in place of every character of a censored word.
  REPLACEMENT_LETTER ||= CGI.unescape_html("■")
  # Part of every cache key (see .cache_key).
  CACHE_VERSION ||= 3

  # Class-level switch read by .cached_words_for_action.
  @cache_enabled = true

  # @param raw [String] text inspected by the instance-level predicates
  def initialize(raw)
    @raw = raw
  end

  # Makes .cached_words_for_action bypass Discourse.cache from now on.
  def self.disable_cache
    @cache_enabled = false
  end

  # @return [Boolean] whether .cached_words_for_action goes through Discourse.cache
  def self.cache_enabled?
    @cache_enabled
  end

  # @param action [Symbol, String] watched word action
  # @return [String] cache key for the word list of +action+
  def self.cache_key(action)
    "watched-words-list:v#{CACHE_VERSION}:#{action}"
  end

  # Deletes the cached word list of every action in WatchedWord.actions.
  def self.clear_cache!
    WatchedWord.actions.each { |action, _| Discourse.cache.delete(cache_key(action)) }
  end

  # Loads the watched words registered for +action+ by querying WatchedWord.
  #
  # @param action [Symbol, String] watched word action
  # @return [Hash{String => Hash}] regexp source (without word boundaries)
  #   mapped to the word's attributes (:word, :replacement, :case_sensitive;
  #   nil attributes are dropped)
  def self.words_for_action(action)
    WatchedWord
      .where(action: WatchedWord.actions[action.to_sym])
      .limit(WatchedWord::MAX_WORDS_PER_ACTION)
      .order(:id)
      .pluck(:word, :replacement, :case_sensitive)
      .to_h do |w, r, c|
        [
          word_to_regexp(w, match_word: false),
          { word: w, replacement: r, case_sensitive: c }.compact,
        ]
      end
  end

  # @param action [Symbol, String] watched word action
  # @return [Boolean] whether at least one watched word exists for +action+
  def self.words_for_action_exist?(action)
    WatchedWord.where(action: WatchedWord.actions[action.to_sym]).exists?
  end

  # Same as .words_for_action, served from Discourse.cache (one day expiry)
  # unless caching has been disabled.
  #
  # @return [Hash, nil] nil when there are no words for +action+
  def self.cached_words_for_action(action)
    return words_for_action(action).presence unless cache_enabled?

    Discourse
      .cache
      .fetch(cache_key(action), expires_in: 1.day) { words_for_action(action).presence }
  end

  # @param action [Symbol, String] watched word action
  # @param engine [Symbol] :ruby or :js, selects the word boundary syntax
  # @return [Hash{String => Hash}, nil] one whole-word regexp source per
  #   watched word mapped to the word's attributes; nil when there are no words
  def self.regexps_for_action(action, engine: :ruby)
    cached_words_for_action(action)&.to_h do |_, attrs|
      [word_to_regexp(attrs[:word], engine: engine), attrs]
    end
  end

  # This regexp is run in miniracer, and the client JS app
  # Make sure it is compatible with major browsers when changing
  # hint: non-chrome browsers do not support 'lookbehind'
  #
  # Compiles the watched words of +action+ into at most two regular
  # expressions: one for the case-sensitive words and one for the
  # case-insensitive ones.
  #
  # @param action [Symbol, String] watched word action
  # @param engine [Symbol] :ruby or :js, selects the word boundary syntax
  # @param raise_errors [Boolean] re-raise RegexpError for an invalid word
  #   instead of silently skipping that word
  # @return [Array<Regexp>] empty when there is nothing to match
  def self.compiled_regexps_for_action(action, engine: :ruby, raise_errors: false)
    cached = cached_words_for_action(action)
    return [] if cached.blank?

    regexp_mode = SiteSetting.watched_words_regular_expressions?

    cached
      .values
      .group_by { |attrs| attrs[:case_sensitive] ? :case_sensitive : :case_insensitive }
      .map do |group_key, attrs_list|
        patterns =
          attrs_list.map do |attrs|
            pattern = word_to_regexp(attrs[:word], match_word: regexp_mode)
            begin
              # Regexp.new is only used to validate the pattern.
              pattern if Regexp.new(pattern)
            rescue RegexpError
              raise if raise_errors
            end
          end

        # Compile all watched words into a single regular expression
        source = patterns.select(&:present?).join("|")

        # An empty source would compile to a regexp that matches every
        # string, so a group without any usable pattern is dropped instead.
        next if source.empty?

        # Add word boundaries to the regexp for regular watched words
        source = match_word_regexp(source, engine: engine) unless regexp_mode

        # Add case insensitive flag if needed
        Regexp.new(source, group_key == :case_sensitive ? nil : Regexp::IGNORECASE)
      end
      .compact
  end

  # @return [Array<Hash>] one { source => { case_sensitive: Boolean } } entry
  #   per regexp returned by .compiled_regexps_for_action
  def self.serialized_regexps_for_action(action, engine: :ruby)
    compiled_regexps_for_action(action, engine: engine).map do |r|
      { r.source => { case_sensitive: !r.casefold? } }
    end
  end

  # Converts a watched word into regexp source.
  #
  # When watched words are regular expressions the word is used verbatim
  # (wrapped in a capture group if +match_word+). Otherwise special characters
  # are escaped, "*" becomes a wildcard and, if +match_word+, word boundaries
  # are added for +engine+.
  #
  # @param word [String] the watched word
  # @param engine [Symbol] :ruby or :js
  # @param match_word [Boolean] whether the result must match a whole word
  # @return [String] regexp source
  def self.word_to_regexp(word, engine: :ruby, match_word: true)
    if SiteSetting.watched_words_regular_expressions?
      return match_word ? "(#{word})" : word
    end

    # Convert word to regex by escaping special characters in a regexp.
    # Avoid using Regexp.escape because it escapes more characters than
    # it should (for example, whitespaces, dashes, etc)
    escaped = word.gsub(/([.*+?^${}()|\[\]\\])/, '\\\\\1')

    # Convert wildcards to regexp
    pattern = escaped.gsub("\\*", '\S*')

    match_word ? match_word_regexp(pattern, engine: engine) : pattern
  end

  # Censors watched words inside the text nodes of an HTML fragment.
  #
  # @param html [String] HTML fragment
  # @return [String] +html+ itself when there are no censored words, otherwise
  #   the re-serialized fragment
  def self.censor(html)
    regexps = compiled_regexps_for_action(:censor)
    return html if regexps.blank?

    doc = Nokogiri::HTML5.fragment(html)
    doc.traverse do |node|
      next unless node.text?

      regexps.each { |regexp| node.content = censor_text_with_regexp(node.content, regexp) }
    end
    doc.to_s
  end

  # Censors watched words in plain text.
  #
  # @param text [String, nil]
  # @return [String, nil] +text+ unchanged when blank or nothing is censored
  def self.censor_text(text)
    return text if text.blank?

    regexps = compiled_regexps_for_action(:censor)
    return text if regexps.blank?

    regexps.inject(text) { |txt, regexp| censor_text_with_regexp(txt, regexp) }
  end

  # Applies the :replace watched words to +text+ (blank text is returned as is).
  def self.replace_text(text)
    return text if text.blank?

    replace(text, :replace)
  end

  # Applies the :link watched words to +text+ (blank text is returned as is).
  def self.replace_link(text)
    return text if text.blank?

    replace(text, :link)
  end

  # Runs censor, replace and link, in that order, over +text+.
  def self.apply_to_text(text)
    replace_link(replace_text(censor_text(text)))
  end

  # @return [MatchData, nil] first match of a :require_approval word
  def requires_approval?
    word_matches_for_action?(:require_approval)
  end

  # @return [MatchData, nil] first match of a :flag word
  def should_flag?
    word_matches_for_action?(:flag)
  end

  # @return [Array<String>, nil] sorted unique :block words found in the text
  def should_block?
    word_matches_for_action?(:block, all_matches: true)
  end

  # @return [MatchData, nil] first match of a :silence word
  def should_silence?
    word_matches_for_action?(:silence)
  end

  # Checks the wrapped text against the watched words of +action+.
  #
  # @param action [Symbol] watched word action
  # @param all_matches [Boolean] collect every matched word instead of
  #   stopping at the first match
  # @return [MatchData, Array<String>, nil] the first MatchData when
  #   +all_matches+ is false, the sorted unique matched words when it is true,
  #   nil when nothing matches
  def word_matches_for_action?(action, all_matches: false)
    regexps = self.class.compiled_regexps_for_action(action)
    return if regexps.blank?

    unless all_matches
      regexps.each do |regexp|
        match = regexp.match(@raw)
        return match if match
      end
      return nil
    end

    regexp_mode = SiteSetting.watched_words_regular_expressions?

    found =
      regexps.flat_map do |regexp|
        # Regexp#match tolerates a nil @raw, String#scan would not.
        next [] unless regexp.match(@raw)

        hits = @raw.scan(regexp)
        if regexp_mode
          # Each word has its own capture group; keep the one that matched.
          hits.map { |hit| hit.is_a?(Array) ? hit.find(&:present?) : hit }
        else
          hits.flatten
        end
      end

    return if found.blank?

    found.compact.uniq.sort
  end

  # @param word [String] a single watched word
  # @param case_sensitive [Boolean]
  # @return [Boolean] whether +word+ matches the wrapped text
  def word_matches?(word, case_sensitive: false)
    options = case_sensitive ? nil : Regexp::IGNORECASE
    Regexp.new(WordWatcher.word_to_regexp(word), options).match?(@raw)
  end

  # Replaces every match of +regexp+ in +text+ with +replacement+.
  #
  # When watched words are not regular expressions the match may start with
  # the boundary character consumed by the non-capturing group; that
  # character is not part of the word and is kept. The boundary is located
  # through the offset of capture group 1 rather than by testing the first
  # character, because the word itself may start with a character that \W
  # matches (punctuation, or any non-ASCII letter).
  def self.replace_text_with_regexp(text, regexp, replacement)
    keep_boundary = !SiteSetting.watched_words_regular_expressions?

    text.gsub(regexp) do |match|
      prefix = ""

      if keep_boundary
        data = Regexp.last_match
        word_start = data.begin(1) if data.size > 1
        prefix = match[0, word_start - data.begin(0)] if word_start
      end

      "#{prefix}#{replacement}"
    end
  end
  private_class_method :replace_text_with_regexp

  # Replaces every match of +regexp+ in +text+ with REPLACEMENT_LETTERs.
  #
  # Leading whitespace of a match is kept, and so is anything that precedes
  # capture group 1 (the boundary character of a whole-word regexp), so
  # punctuation next to a censored word survives.
  def self.censor_text_with_regexp(text, regexp)
    text.gsub(regexp) do |match|
      data = Regexp.last_match

      whitespace = match.size - match.lstrip.size
      word_start = data.begin(1) if data.size > 1
      boundary = word_start ? word_start - data.begin(0) : 0

      kept = [whitespace, boundary].max
      "#{match[0, kept]}#{REPLACEMENT_LETTER * (match.size - kept)}"
    end
  end
  private_class_method :censor_text_with_regexp

  # Returns a regexp that transforms a regular expression into a regular
  # expression that matches a whole word.
  #
  # @param regexp [String] regexp source
  # @param engine [Symbol] :ruby or :js
  # @raise [RuntimeError] for any other engine
  def self.match_word_regexp(regexp, engine: :ruby)
    case engine
    when :js
      "(?:\\P{L}|^)(#{regexp})(?=\\P{L}|$)"
    when :ruby
      "(?:[^[:word:]]|^)(#{regexp})(?=[^[:word:]]|$)"
    else
      raise "unknown regexp engine: #{engine}"
    end
  end
  private_class_method :match_word_regexp

  # Applies, one after the other, the replacement of every watched word
  # registered for +watch_word_type+ to +text+.
  def self.replace(text, watch_word_type)
    regexps_for_action(watch_word_type)
      .to_a
      .reduce(text) do |result, (pattern, attrs)|
        flags = attrs[:case_sensitive] ? nil : Regexp::IGNORECASE
        replace_text_with_regexp(result, Regexp.new(pattern, flags), attrs[:replacement])
      end
  end
  private_class_method :replace
end