@@ -21,16 +21,22 @@ import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
21
21
import com .johnsnowlabs .nlp .annotators .param .ExternalResourceParam
22
22
import com .johnsnowlabs .nlp .util .io .{ExternalResource , ReadAs , ResourceHelper }
23
23
import org .apache .spark .ml .PipelineModel
24
- import org .apache .spark .ml .param .Param
24
+ import org .apache .spark .ml .param .{ Param , StringArrayParam }
25
25
import org .apache .spark .ml .util .{DefaultParamsReadable , Identifiable }
26
26
import org .apache .spark .sql .Dataset
27
27
28
- /** Uses a reference file to match a set of regular expressions and associate them with a provided
28
+ /** Uses rules to match a set of regular expressions and associate them with a provided
29
29
* identifier.
30
30
*
31
- * A dictionary of predefined regular expressions must be provided with `setExternalRules`. The
32
- * dictionary can be set in either in the form of a delimited text file or directly as an
33
- * [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource ]].
31
+ * A rule consists of a regex pattern and an identifier, delimited by a character of choice. An
32
+ * example could be `\d{4}\/\d\d\/\d\d,date` which will match strings like `"1970/01/01"` to the
33
+ * identifier `"date"`.
34
+ *
35
+ * Rules must be provided by either `setRules` (followed by `setDelimiter`) or an external file.
36
+ *
37
+ * To use an external file, a dictionary of predefined regular expressions must be provided with
38
+ * `setExternalRules`. The dictionary can be set either in the form of a delimited text file
39
+ * or directly as an [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource ]].
34
40
*
35
41
* Pretrained pipelines are available for this module, see
36
42
* [[https://nlp.johnsnowlabs.com/docs/en/pipelines Pipelines ]].
@@ -117,14 +123,26 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
117
123
*/
118
124
override val inputAnnotatorTypes : Array [AnnotatorType ] = Array (DOCUMENT )
119
125
120
- /** external resource to rules, needs 'delimiter' in options
126
+ /** Rules for matching: each rule is a regex pattern and its identifier, joined by `delimiter`.
127
+ *
+ * When no external rules are set, `train` splits every rule on the delimiter and expects
+ * exactly two parts (pattern, identifier).
+ *
+ * @group param
128
+ */
129
+ val rules : StringArrayParam =
130
+ new StringArrayParam (this , " rules" , " Rules with regex pattern and identifiers for matching" )
131
+
132
+ /** Delimiter separating the regex pattern from its identifier in rules provided with setRules
133
+ *
+ * NOTE(review): `train` passes this value to `String.split`, which interprets its argument as
+ * a regular expression, so metacharacters such as `|` or `.` presumably need escaping to be
+ * matched literally; confirm the intended contract.
+ *
134
+ * @group param
135
+ */
136
+ val delimiter : Param [String ] = new Param [String ](this , " delimiter" , " Delimiter for the rules" )
137
+
138
+ /** External resource holding the rules; its options must define 'delimiter'
121
139
*
122
140
* @group param
123
141
*/
124
142
val externalRules : ExternalResourceParam = new ExternalResourceParam (
125
143
this ,
126
144
" externalRules" ,
127
- " external resource to rules, needs 'delimiter' in options" )
145
+ " External resource to rules, needs 'delimiter' in options" )
128
146
129
147
/** Strategy for which to match the expressions (Default: `"MATCH_ALL"`). Possible values are:
130
148
* - MATCH_ALL brings one-to-many results
@@ -144,6 +162,9 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
144
162
145
163
/** External dictionary already in the form of [[ExternalResource ]], for which the Map member
146
164
* `options` has `"delimiter"` defined.
165
+ *
166
+ * Note that only one of externalRules or rules can be set at a time.
167
+ *
147
168
* ==Example==
148
169
* {{{
149
170
* val regexMatcher = new RegexMatcher()
@@ -163,20 +184,32 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
163
184
require(
164
185
value.options.contains(" delimiter" ),
165
186
" RegexMatcher requires 'delimiter' option to be set in ExternalResource" )
187
+ require(get(rules).isEmpty, " Only either parameter externalRules or rules should be set." )
188
+ require(
189
+ get(this .delimiter).isEmpty,
190
+ " Parameter delimiter should only be set with parameter rules. " +
191
+ " Please provide the delimiter in the ExternalResource." )
166
192
set(externalRules, value)
167
193
}
168
194
169
195
/** External dictionary of rules to be used by the matcher, which needs `delimiter` set for parsing
170
- * the resource
196
+ * the resource.
197
+ *
198
+ * Note that only one of externalRules or rules can be set at a time. This setter fails its
+ * `require` checks if either `rules` or the `delimiter` parameter has already been set.
171
199
*
172
200
* @group setParam
* @param path
*   First argument passed to the `ExternalResource` that is stored in `externalRules`
* @param delimiter
*   Delimiter of the rules; added to `options` under the `"delimiter"` key
* @param readAs
*   Read format passed to the `ExternalResource` (Default: `ReadAs.TEXT`)
* @param options
*   Further options for the `ExternalResource` (Default: `Map("format" -> "text")`)
173
201
*/
174
202
def setExternalRules (
175
203
path : String ,
176
204
delimiter : String ,
177
205
readAs : ReadAs .Format = ReadAs .TEXT ,
178
- options : Map [String , String ] = Map (" format" -> " text" )): this .type =
206
+ options : Map [String , String ] = Map (" format" -> " text" )): this .type = {
207
+ require(get(rules).isEmpty, " Only either parameter externalRules or rules should be set." )
208
+ require(
209
+ get(this .delimiter).isEmpty,
210
+ " Parameter delimiter should only be set with parameter rules." )
179
211
set(externalRules, ExternalResource (path, readAs, options ++ Map (" delimiter" -> delimiter)))
212
+ }
180
213
181
214
/** Strategy for which to match the expressions (Default: `"MATCH_ALL"`)
182
215
*
@@ -193,17 +226,73 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
193
226
*
194
227
* @group getParam
195
228
*/
196
- def getStrategy : String = $(strategy).toString
229
+ def getStrategy : String = $(strategy)
230
+
231
+ /** Sets the regex rules to match the identifier with.
232
+ *
233
+ * The rules must consist of a regex pattern and an identifier for that pattern. The regex
234
+ * pattern and the identifier must be delimited by a character that will also have to be set with
235
+ * `setDelimiter`.
236
+ *
237
+ * Only one of the parameters `rules` and `externalRules` may be set; this setter fails its
+ * `require` check if `externalRules` has already been set.
238
+ *
239
+ * ==Example==
240
+ * {{{
241
+ * val regexMatcher = new RegexMatcher()
242
+ * .setRules(Array("""\d{4}\/\d\d\/\d\d,date""", """\d{2}\/\d\d\/\d\d,date_short"""))
243
+ * .setDelimiter(",")
244
+ * .setInputCols("sentence")
245
+ * .setOutputCol("regex")
246
+ * .setStrategy("MATCH_ALL")
247
+ * }}}
248
+ *
249
+ * @group setParam
250
+ * @param value
251
+ * Array of rules
252
+ */
253
+ def setRules (value : Array [String ]): this .type = {
254
+ require(
255
+ get(externalRules).isEmpty,
256
+ " Only either parameter rules or externalRules should be set." )
257
+ set(rules, value)
258
+ }
259
+
260
+ /** Sets the delimiter that separates the regex pattern from its identifier in `rules`.
261
+ *
262
+ * Note that only one of externalRules or rules can be set at a time; this setter fails its
+ * `require` check if `externalRules` has already been set.
263
+ *
264
+ * @group setParam
265
+ * @param value
266
+ * Delimiter for the rules provided with `setRules`
267
+ */
268
+ def setDelimiter (value : String ): this .type = {
269
+ require(
270
+ get(externalRules).isEmpty,
271
+ " Only either parameter rules or externalRules should be set." )
272
+ set(delimiter, value)
273
+ }
197
274
198
275
/** Builds the [[RegexMatcherModel]] from the configured rules.
 *
 * If `externalRules` is set, the (pattern, identifier) pairs are read with
 * `ResourceHelper.parseTupleText`. Otherwise every entry of `rules` is split on `delimiter` and
 * must yield exactly two parts, or an `IllegalArgumentException` is thrown.
 *
 * NOTE(review): `String.split` interprets the delimiter as a regular expression, so a delimiter
 * such as `|` or `.` presumably does not split literally; confirm the intended contract.
 *
 * @param dataset
 *   Not referenced by this implementation
 * @param recursivePipeline
 *   Not referenced by this implementation
 * @return
 *   A new model with the parsed rules and the current `strategy`
 */
override def train (
199
276
dataset : Dataset [_],
200
277
recursivePipeline : Option [PipelineModel ]): RegexMatcherModel = {
201
- val processedRules = ResourceHelper .parseTupleText($(externalRules))
278
+ val processedRules : Array [(String , String )] =
279
+ if (get(externalRules).nonEmpty) ResourceHelper .parseTupleText($(externalRules))
280
+ else {
281
+ val delim = getOrDefault(delimiter)
282
+ getOrDefault(rules).map { rule =>
283
// Exactly two parts are accepted; any other count is reported together with the offending rule.
+ rule.split(delim) match {
284
+ case Array (pattern, identifier) => (pattern, identifier)
285
+ case a : Array [String ] =>
286
+ throw new IllegalArgumentException (
287
+ s " Expected 2-tuple after splitting, but got ${a.length} for ' $rule' " )
288
+ }
289
+ }
290
+ }
291
+
202
292
new RegexMatcherModel ()
203
293
.setExternalRules(processedRules)
204
294
.setStrategy($(strategy))
205
295
}
206
-
207
296
}
208
297
209
298
/** This is the companion object of [[RegexMatcher ]]. Please refer to that class for the
0 commit comments