Skip to content

Commit d7f561e

Browse files
authored
SPARKNLP-621: Add string support to RegexMatcher in addition to a file (#13060)
- new parameter rules where users can provide an array of rules - new parameter delimiter to split the rules
1 parent 2a382ce commit d7f561e

File tree

6 files changed

+295
-56
lines changed

6 files changed

+295
-56
lines changed

docs/en/annotator_entries/RegexMatcher.md

+10-3
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,17 @@ CHUNK
2828
{%- endcapture -%}
2929

3030
{%- capture approach_description -%}
31-
Uses a reference file to match a set of regular expressions and associate them with a provided identifier.
31+
Uses rules to match a set of regular expressions and associate them with a provided
32+
identifier.
3233

33-
A dictionary of predefined regular expressions must be provided with `setExternalRules`.
34-
The dictionary can be set as a delimited text file.
34+
A rule consists of a regex pattern and an identifier, delimited by a character of choice. An
35+
example could be `"\d{4}\/\d\d\/\d\d,date"` which will match strings like `"1970/01/01"` to the
36+
identifier `"date"`.
37+
38+
Rules must be provided by either `setRules` (followed by `setDelimiter`) or an external file.
39+
40+
To use an external file, a dictionary of predefined regular expressions must be provided with
41+
`setExternalRules`. The dictionary can be set as a delimited text file.
3542

3643
Pretrained pipelines are available for this module, see [Pipelines](https://nlp.johnsnowlabs.com/docs/en/pipelines).
3744

docs/en/annotators.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ There are two types of Annotators:
7979
{% include templates/anno_table_entry.md path="" name="NorvigSweeting Spellchecker" summary="Retrieves tokens and makes corrections automatically if not found in an English dictionary."%}
8080
{% include templates/anno_table_entry.md path="" name="POSTagger (Part of speech tagger)" summary="Averaged Perceptron model to tag words part-of-speech."%}
8181
{% include templates/anno_table_entry.md path="" name="RecursiveTokenizer" summary="Tokenizes raw text recursively based on a handful of definable rules."%}
82-
{% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses a reference file to match a set of regular expressions and associate them with a provided identifier."%}
82+
{% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses rules to match a set of regular expressions and associate them with a provided identifier."%}
8383
{% include templates/anno_table_entry.md path="" name="RegexTokenizer" summary="A tokenizer that splits text by a regex pattern."%}
8484
{% include templates/anno_table_entry.md path="" name="SentenceDetector" summary="Annotator that detects sentence boundaries using regular expressions."%}
8585
{% include templates/anno_table_entry.md path="" name="SentenceDetectorDL" summary="Detects sentence boundaries using a deep learning approach."%}

python/sparknlp/annotator/matcher/regex_matcher.py

+61-6
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,23 @@
1313
# limitations under the License.
1414
"""Contains classes for the RegexMatcher."""
1515

16-
1716
from sparknlp.common import *
1817
from sparknlp.common.annotator_type import AnnotatorType
1918

2019

2120
class RegexMatcher(AnnotatorApproach):
22-
"""Uses a reference file to match a set of regular expressions and associate
23-
them with a provided identifier.
21+
"""Uses rules to match a set of regular expressions and associate them with a
22+
provided identifier.
23+
24+
A rule consists of a regex pattern and an identifier, delimited by a character of
25+
choice. An example could be `"\\d{4}\\/\\d\\d\\/\\d\\d,date"` which will match
26+
strings like `"1970/01/01"` to the identifier `"date"`.
27+
28+
Rules must be provided by either :meth:`.setRules` (followed by
29+
:meth:`.setDelimiter`) or an external file.
2430
25-
A dictionary of predefined regular expressions must be provided with
26-
:meth:`.setExternalRules`. The dictionary can be set in the form of a
31+
To use an external file, a dictionary of predefined regular expressions must be
32+
provided with :meth:`.setExternalRules`. The dictionary can be set in the form of a
2733
delimited text file.
2834
2935
Pretrained pipelines are available for this module, see `Pipelines
@@ -43,6 +49,10 @@ class RegexMatcher(AnnotatorApproach):
4349
strategy
4450
Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE, by default
4551
"MATCH_ALL"
52+
rules
53+
Regex rules to match the identifier with
54+
delimiter
55+
Delimiter for rules provided with setRules
4656
externalRules
4757
external resource to rules, needs 'delimiter' in options
4858
@@ -91,6 +101,14 @@ class RegexMatcher(AnnotatorApproach):
91101
"externalRules",
92102
"external resource to rules, needs 'delimiter' in options",
93103
typeConverter=TypeConverters.identity)
104+
rules = Param(Params._dummy(),
105+
"rules",
106+
"Regex rules to match the identifier with",
107+
typeConverter=TypeConverters.toListString)
108+
delimiter = Param(Params._dummy(),
109+
"delimiter",
110+
"Delimiter for rules",
111+
typeConverter=TypeConverters.toString)
94112

95113
@keyword_only
96114
def __init__(self):
@@ -114,6 +132,9 @@ def setStrategy(self, value):
114132
def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
115133
"""Sets external resource to rules, needs 'delimiter' in options.
116134
135+
Exactly one of the parameters `rules` and `externalRules` must be set.
136+
137+
117138
Parameters
118139
----------
119140
path : str
@@ -130,6 +151,41 @@ def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"forma
130151
opts["delimiter"] = delimiter
131152
return self._set(externalRules=ExternalResource(path, read_as, opts))
132153

154+
def setRules(self, value):
155+
"""Sets the regex rules to match the identifier with.
156+
157+
The rules must consist of a regex pattern and an identifier for that pattern. The regex
158+
pattern and the identifier must be delimited by a character that will also have to be set with
159+
`setDelimiter`.
160+
161+
Exactly one of the parameters `rules` and `externalRules` must be set.
162+
163+
Examples
164+
--------
165+
>>> regexMatcher = RegexMatcher() \\
166+
... .setRules(["\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,short_date"]) \\
167+
... .setDelimiter(",") \\
168+
... .setInputCols(["sentence"]) \\
169+
... .setOutputCol("regex") \\
170+
... .setStrategy("MATCH_ALL")
171+
172+
Parameters
173+
----------
174+
value : List[str]
175+
List of rules
176+
"""
177+
return self._set(rules=value)
178+
179+
def setDelimiter(self, value):
180+
"""Sets the delimiter for rules.
181+
182+
Parameters
183+
----------
184+
value : str
185+
Delimiter for the rules
186+
"""
187+
return self._set(delimiter=value)
188+
133189
def _create_model(self, java_model):
134190
return RegexMatcherModel(java_model=java_model)
135191

@@ -160,4 +216,3 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel"
160216
)
161217

162218
name = "RegexMatcherModel"
163-

python/test/annotator/matcher/regex_matcher_test.py

+22
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,25 @@ def runTest(self):
4141
assembled = document_assembler.transform(self.data)
4242
regex_matcher.fit(assembled).transform(assembled).show()
4343

44+
45+
@pytest.mark.fast
46+
class RegexMatcherWithStringTestSpec(unittest.TestCase):
47+
def setUp(self):
48+
# This implicitly sets up py4j for us
49+
self.data = SparkContextForTest.spark.createDataFrame(
50+
[["My first sentence with the first rule. This is my second sentence with ceremonies rule."]], ["text"])
51+
52+
def runTest(self):
53+
document_assembler = DocumentAssembler() \
54+
.setInputCol("text") \
55+
.setOutputCol("document")
56+
57+
rules = ["the\\s\\w+,followed by 'the'", "ceremonies,ceremony"]
58+
regex_matcher = RegexMatcher() \
59+
.setInputCols(['document']) \
60+
.setStrategy("MATCH_ALL") \
61+
.setRules(rules) \
62+
.setDelimiter(",") \
63+
.setOutputCol("regex")
64+
assembled = document_assembler.transform(self.data)
65+
regex_matcher.fit(assembled).transform(assembled).select("regex").show()

src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcher.scala

+101-12
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,22 @@ import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
2121
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
2222
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
2323
import org.apache.spark.ml.PipelineModel
24-
import org.apache.spark.ml.param.Param
24+
import org.apache.spark.ml.param.{Param, StringArrayParam}
2525
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
2626
import org.apache.spark.sql.Dataset
2727

28-
/** Uses a reference file to match a set of regular expressions and associate them with a provided
28+
/** Uses rules to match a set of regular expressions and associate them with a provided
2929
* identifier.
3030
*
31-
* A dictionary of predefined regular expressions must be provided with `setExternalRules`. The
32-
* dictionary can be set in either in the form of a delimited text file or directly as an
33-
* [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]].
31+
* A rule consists of a regex pattern and an identifier, delimited by a character of choice. An
32+
* example could be `\d{4}\/\d\d\/\d\d,date` which will match strings like `"1970/01/01"` to the
33+
* identifier `"date"`.
34+
*
35+
* Rules must be provided by either `setRules` (followed by `setDelimiter`) or an external file.
36+
*
37+
* To use an external file, a dictionary of predefined regular expressions must be provided with
38+
* `setExternalRules`. The dictionary can be set either in the form of a delimited text file
39+
* or directly as an [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]].
3440
*
3541
* Pretrained pipelines are available for this module, see
3642
* [[https://nlp.johnsnowlabs.com/docs/en/pipelines Pipelines]].
@@ -117,14 +123,26 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
117123
*/
118124
override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
119125

120-
/** external resource to rules, needs 'delimiter' in options
126+
/** Rules with regex pattern and identifiers for matching
127+
* @group param
128+
*/
129+
val rules: StringArrayParam =
130+
new StringArrayParam(this, "rules", "Rules with regex pattern and identifiers for matching")
131+
132+
/** Delimiter for rules provided with setRules
133+
*
134+
* @group param
135+
*/
136+
val delimiter: Param[String] = new Param[String](this, "delimiter", "Delimiter for the rules")
137+
138+
/** External resource to rules, needs 'delimiter' in options
121139
*
122140
* @group param
123141
*/
124142
val externalRules: ExternalResourceParam = new ExternalResourceParam(
125143
this,
126144
"externalRules",
127-
"external resource to rules, needs 'delimiter' in options")
145+
"External resource to rules, needs 'delimiter' in options")
128146

129147
/** Strategy for which to match the expressions (Default: `"MATCH_ALL"`). Possible values are:
130148
* - MATCH_ALL brings one-to-many results
@@ -144,6 +162,9 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
144162

145163
/** External dictionary already in the form of [[ExternalResource]], for which the Map member
146164
* `options` has `"delimiter"` defined.
165+
*
166+
* Note that only one of externalRules and rules can be set at a time.
167+
*
147168
* ==Example==
148169
* {{{
149170
* val regexMatcher = new RegexMatcher()
@@ -163,20 +184,32 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
163184
require(
164185
value.options.contains("delimiter"),
165186
"RegexMatcher requires 'delimiter' option to be set in ExternalResource")
187+
require(get(rules).isEmpty, "Only either parameter externalRules or rules should be set.")
188+
require(
189+
get(this.delimiter).isEmpty,
190+
"Parameter delimiter should only be set with parameter rules. " +
191+
"Please provide the delimiter in the ExternalResource.")
166192
set(externalRules, value)
167193
}
168194

169195
/** External dictionary to be used by the RegexMatcher, which needs `delimiter` set for parsing
170-
* the resource
196+
* the resource.
197+
*
198+
* Note that only one of externalRules and rules can be set at a time.
171199
*
172200
* @group setParam
173201
*/
174202
def setExternalRules(
175203
path: String,
176204
delimiter: String,
177205
readAs: ReadAs.Format = ReadAs.TEXT,
178-
options: Map[String, String] = Map("format" -> "text")): this.type =
206+
options: Map[String, String] = Map("format" -> "text")): this.type = {
207+
require(get(rules).isEmpty, "Only either parameter externalRules or rules should be set.")
208+
require(
209+
get(this.delimiter).isEmpty,
210+
"Parameter delimiter should only be set with parameter rules.")
179211
set(externalRules, ExternalResource(path, readAs, options ++ Map("delimiter" -> delimiter)))
212+
}
180213

181214
/** Strategy for which to match the expressions (Default: `"MATCH_ALL"`)
182215
*
@@ -193,17 +226,73 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
193226
*
194227
* @group getParam
195228
*/
196-
def getStrategy: String = $(strategy).toString
229+
def getStrategy: String = $(strategy)
230+
231+
/** Sets the regex rules to match the identifier with.
232+
*
233+
* The rules must consist of a regex pattern and an identifier for that pattern. The regex
234+
* pattern and the identifier must be delimited by a character that will also have to be set with
235+
* `setDelimiter`.
236+
*
237+
* Exactly one of the parameters `rules` and `externalRules` must be set.
238+
*
239+
* ==Example==
240+
* {{{
241+
* val regexMatcher = new RegexMatcher()
242+
* .setRules(Array("\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,date_short"))
243+
* .setDelimiter(",")
244+
* .setInputCols("sentence")
245+
* .setOutputCol("regex")
246+
* .setStrategy("MATCH_ALL")
247+
* }}}
248+
*
249+
* @group setParam
250+
* @param value
251+
* Array of rules
252+
*/
253+
def setRules(value: Array[String]): this.type = {
254+
require(
255+
get(externalRules).isEmpty,
256+
"Only either parameter rules or externalRules should be set.")
257+
set(rules, value)
258+
}
259+
260+
/** Sets the delimiter used to split the rules provided with `setRules`.
261+
*
262+
* Note that only one of externalRules and rules can be set at a time.
263+
*
264+
* @group setParam
265+
* @param value
266+
* Delimiter that separates the regex pattern from the identifier in each rule
267+
*/
268+
def setDelimiter(value: String): this.type = {
269+
require(
270+
get(externalRules).isEmpty,
271+
"Only either parameter rules or externalRules should be set.")
272+
set(delimiter, value)
273+
}
197274

198275
override def train(
199276
dataset: Dataset[_],
200277
recursivePipeline: Option[PipelineModel]): RegexMatcherModel = {
201-
val processedRules = ResourceHelper.parseTupleText($(externalRules))
278+
val processedRules: Array[(String, String)] =
279+
if (get(externalRules).nonEmpty) ResourceHelper.parseTupleText($(externalRules))
280+
else {
281+
val delim = getOrDefault(delimiter)
282+
getOrDefault(rules).map { rule =>
283+
rule.split(delim) match {
284+
case Array(pattern, identifier) => (pattern, identifier)
285+
case a: Array[String] =>
286+
throw new IllegalArgumentException(
287+
s"Expected 2-tuple after splitting, but got ${a.length} for '$rule'")
288+
}
289+
}
290+
}
291+
202292
new RegexMatcherModel()
203293
.setExternalRules(processedRules)
204294
.setStrategy($(strategy))
205295
}
206-
207296
}
208297

209298
/** This is the companion object of [[RegexMatcher]]. Please refer to that class for the

0 commit comments

Comments
 (0)