SPARKNLP-621: Add string support to RegexMatcher in addition to a file (#13060)

DevinTDHa · web-flow · commit d7f561ec7fcf · 2022-11-10T11:30:05.000+01:00
- new parameter rules where users can provide an array of rules
- new parameter delimiter to split the rules
diff --git a/docs/en/annotator_entries/RegexMatcher.md b/docs/en/annotator_entries/RegexMatcher.md
@@ -28,10 +28,17 @@ CHUNK
 {%- endcapture -%}
 
 {%- capture approach_description -%}
-Uses a reference file to match a set of regular expressions and associate them with a provided identifier.
+Uses rules to match a set of regular expressions and associate them with a provided
+identifier.
 
-A dictionary of predefined regular expressions must be provided with `setExternalRules`.
-The dictionary can be set as a delimited text file.
+A rule consists of a regex pattern and an identifier, delimited by a character of choice. An
+example could be `"\d{4}\/\d\d\/\d\d,date"` which will match strings like `"1970/01/01"` to the
+identifier `"date"`.
+
+Rules must be provided by either `setRules` (followed by `setDelimiter`) or an external file.
+
+To use an external file, a dictionary of predefined regular expressions must be provided with
+`setExternalRules`. The dictionary can be set as a delimited text file.
 
 Pretrained pipelines are available for this module, see [Pipelines](https://nlp.johnsnowlabs.com/docs/en/pipelines).
 
diff --git a/docs/en/annotators.md b/docs/en/annotators.md
@@ -79,7 +79,7 @@ There are two types of Annotators:
 {% include templates/anno_table_entry.md path="" name="NorvigSweeting Spellchecker" summary="Retrieves tokens and makes corrections automatically if not found in an English dictionary."%}
 {% include templates/anno_table_entry.md path="" name="POSTagger (Part of speech tagger)" summary="Averaged Perceptron model to tag words part-of-speech."%}
 {% include templates/anno_table_entry.md path="" name="RecursiveTokenizer" summary="Tokenizes raw text recursively based on a handful of definable rules."%}
-{% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses a reference file to match a set of regular expressions and associate them with a provided identifier."%}
+{% include templates/anno_table_entry.md path="" name="RegexMatcher" summary="Uses rules to match a set of regular expressions and associate them with a provided identifier."%}
 {% include templates/anno_table_entry.md path="" name="RegexTokenizer" summary="A tokenizer that splits text by a regex pattern."%}
 {% include templates/anno_table_entry.md path="" name="SentenceDetector" summary="Annotator that detects sentence boundaries using regular expressions."%}
 {% include templates/anno_table_entry.md path="" name="SentenceDetectorDL" summary="Detects sentence boundaries using a deep learning approach."%}
diff --git a/python/sparknlp/annotator/matcher/regex_matcher.py b/python/sparknlp/annotator/matcher/regex_matcher.py
@@ -13,17 +13,23 @@
 #  limitations under the License.
 """Contains classes for the RegexMatcher."""
 
-
 from sparknlp.common import *
 from sparknlp.common.annotator_type import AnnotatorType
 
 
 class RegexMatcher(AnnotatorApproach):
-    """Uses a reference file to match a set of regular expressions and associate
-    them with a provided identifier.
+    """Uses rules to match a set of regular expressions and associate them with a
+    provided identifier.
+
+    A rule consists of a regex pattern and an identifier, delimited by a character of
+    choice. An example could be `"\\d{4}\\/\\d\\d\\/\\d\\d,date"` which will match
+    strings like `"1970/01/01"` to the identifier `"date"`.
+
+    Rules must be provided by either :meth:`.setRules` (followed by
+    :meth:`.setDelimiter`) or an external file.
 
-    A dictionary of predefined regular expressions must be provided with
-    :meth:`.setExternalRules`. The dictionary can be set in the form of a
+    To use an external file, a dictionary of predefined regular expressions must be
+    provided with :meth:`.setExternalRules`. The dictionary can be set in the form of a
     delimited text file.
 
     Pretrained pipelines are available for this module, see `Pipelines
@@ -43,6 +49,10 @@ class RegexMatcher(AnnotatorApproach):
     strategy
         Can be either MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE, by default
         "MATCH_ALL"
+    rules
+        Regex rules to match the identifier with
+    delimiter
+        Delimiter for rules provided with setRules
     externalRules
         external resource to rules, needs 'delimiter' in options
 
@@ -91,6 +101,14 @@ class RegexMatcher(AnnotatorApproach):
                           "externalRules",
                           "external resource to rules, needs 'delimiter' in options",
                           typeConverter=TypeConverters.identity)
+    rules = Param(Params._dummy(),
+                  "rules",
+                  "Regex rules to match the identifier with",
+                  typeConverter=TypeConverters.toListString)
+    delimiter = Param(Params._dummy(),
+                      "delimiter",
+                      "Delimiter for rules",
+                      typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self):
@@ -114,6 +132,9 @@ def setStrategy(self, value):
     def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"format": "text"}):
         """Sets external resource to rules, needs 'delimiter' in options.
 
+        Only one of either parameter `rules` or `externalRules` must be set.
+
+
         Parameters
         ----------
         path : str
@@ -130,6 +151,41 @@ def setExternalRules(self, path, delimiter, read_as=ReadAs.TEXT, options={"forma
             opts["delimiter"] = delimiter
         return self._set(externalRules=ExternalResource(path, read_as, opts))
 
+    def setRules(self, value):
+        """Sets the regex rules to match the identifier with.
+
+        The rules must consist of a regex pattern and an identifier for that pattern. The regex
+        pattern and the identifier must be delimited by a character that will also have to be set
+        with `setDelimiter`.
+
+        Only one of either parameter `rules` or `externalRules` must be set.
+
+        Examples
+        --------
+        >>> regexMatcher = RegexMatcher() \\
+        ...     .setRules(["\\d{4}\\/\\d\\d\\/\\d\\d,date", "\\d{2}\\/\\d\\d\\/\\d\\d,short_date"]) \\
+        ...     .setDelimiter(",") \\
+        ...     .setInputCols(["sentence"]) \\
+        ...     .setOutputCol("regex") \\
+        ...     .setStrategy("MATCH_ALL")
+
+        Parameters
+        ----------
+        value : List[str]
+             List of rules
+        """
+        return self._set(rules=value)
+
+    def setDelimiter(self, value):
+        """Sets the delimiter for rules.
+
+        Parameters
+        ----------
+        value : str
+             Delimiter for the rules
+        """
+        return self._set(delimiter=value)
+
     def _create_model(self, java_model):
         return RegexMatcherModel(java_model=java_model)
 
@@ -160,4 +216,3 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.RegexMatcherModel"
         )
 
     name = "RegexMatcherModel"
-
diff --git a/python/test/annotator/matcher/regex_matcher_test.py b/python/test/annotator/matcher/regex_matcher_test.py
@@ -41,3 +41,25 @@ def runTest(self):
         assembled = document_assembler.transform(self.data)
         regex_matcher.fit(assembled).transform(assembled).show()
 
+
+@pytest.mark.fast
+class RegexMatcherWithStringTestSpec(unittest.TestCase):
+    def setUp(self):
+        # This implicitly sets up py4j for us
+        self.data = SparkContextForTest.spark.createDataFrame(
+            [["My first sentence with the first rule. This is my second sentence with ceremonies rule."]], ["text"])
+
+    def runTest(self):
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("document")
+
+        rules = ["the\\s\\w+,followed by 'the'", "ceremonies,ceremony"]
+        regex_matcher = RegexMatcher() \
+            .setInputCols(['document']) \
+            .setStrategy("MATCH_ALL") \
+            .setRules(rules) \
+            .setDelimiter(",") \
+            .setOutputCol("regex")
+        assembled = document_assembler.transform(self.data)
+        regex_matcher.fit(assembled).transform(assembled).select("regex").show()
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcher.scala
@@ -21,16 +21,22 @@ import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
 import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
 import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
 import org.apache.spark.ml.PipelineModel
-import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.{Param, StringArrayParam}
 import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
 import org.apache.spark.sql.Dataset
 
-/** Uses a reference file to match a set of regular expressions and associate them with a provided
+/** Uses rules to match a set of regular expressions and associate them with a provided
   * identifier.
   *
-  * A dictionary of predefined regular expressions must be provided with `setExternalRules`. The
-  * dictionary can be set in either in the form of a delimited text file or directly as an
-  * [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]].
+  * A rule consists of a regex pattern and an identifier, delimited by a character of choice. An
+  * example could be `\d{4}\/\d\d\/\d\d,date` which will match strings like `"1970/01/01"` to the
+  * identifier `"date"`.
+  *
+  * Rules must be provided by either `setRules` (followed by `setDelimiter`) or an external file.
+  *
+  * To use an external file, a dictionary of predefined regular expressions must be provided with
+  * `setExternalRules`. The dictionary can be set either in the form of a delimited text file
+  * or directly as an [[com.johnsnowlabs.nlp.util.io.ExternalResource ExternalResource]].
   *
   * Pretrained pipelines are available for this module, see
   * [[https://nlp.johnsnowlabs.com/docs/en/pipelines Pipelines]].
@@ -117,14 +123,26 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
     */
   override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
 
-  /** external resource to rules, needs 'delimiter' in options
+  /** Rules with regex pattern and identifiers for matching
+    * @group param
+    */
+  val rules: StringArrayParam =
+    new StringArrayParam(this, "rules", "Rules with regex pattern and identifiers for matching")
+
+  /** Delimiter for rules provided with setRules
+    *
+    * @group param
+    */
+  val delimiter: Param[String] = new Param[String](this, "delimiter", "Delimiter for the rules")
+
+  /** External resource to rules, needs 'delimiter' in options
     *
     * @group param
     */
   val externalRules: ExternalResourceParam = new ExternalResourceParam(
     this,
     "externalRules",
-    "external resource to rules, needs 'delimiter' in options")
+    "External resource to rules, needs 'delimiter' in options")
 
   /** Strategy for which to match the expressions (Default: `"MATCH_ALL"`). Possible values are:
     *   - MATCH_ALL brings one-to-many results
@@ -144,6 +162,9 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
 
   /** External dictionary already in the form of [[ExternalResource]], for which the Map member
     * `options` has `"delimiter"` defined.
+    *
+    * Note that only one of externalRules and rules can be set at once.
+    *
     * ==Example==
     * {{{
     * val regexMatcher = new RegexMatcher()
@@ -163,20 +184,32 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
     require(
       value.options.contains("delimiter"),
       "RegexMatcher requires 'delimiter' option to be set in ExternalResource")
+    require(get(rules).isEmpty, "Only either parameter externalRules or rules should be set.")
+    require(
+      get(this.delimiter).isEmpty,
+      "Parameter delimiter should only be set with parameter rules. " +
+        "Please provide the delimiter in the ExternalResource.")
     set(externalRules, value)
   }
 
   /** External dictionary to be used by the lemmatizer, which needs `delimiter` set for parsing
-    * the resource
+    * the resource.
+    *
+    * Note that only one of externalRules and rules can be set at once.
     *
     * @group setParam
     */
   def setExternalRules(
       path: String,
       delimiter: String,
       readAs: ReadAs.Format = ReadAs.TEXT,
-      options: Map[String, String] = Map("format" -> "text")): this.type =
+      options: Map[String, String] = Map("format" -> "text")): this.type = {
+    require(get(rules).isEmpty, "Only either parameter externalRules or rules should be set.")
+    require(
+      get(this.delimiter).isEmpty,
+      "Parameter delimiter should only be set with parameter rules.")
     set(externalRules, ExternalResource(path, readAs, options ++ Map("delimiter" -> delimiter)))
+  }
 
   /** Strategy for which to match the expressions (Default: `"MATCH_ALL"`)
     *
@@ -193,17 +226,73 @@ class RegexMatcher(override val uid: String) extends AnnotatorApproach[RegexMatc
     *
     * @group getParam
     */
-  def getStrategy: String = $(strategy).toString
+  def getStrategy: String = $(strategy)
+
+  /** Sets the regex rules to match the identifier with.
+    *
+    * The rules must consist of a regex pattern and an identifier for that pattern. The regex
+    * pattern and the identifier must be delimited by a character that must also be set with
+    * `setDelimiter`.
+    *
+    * Only one of either parameter `rules` or `externalRules` must be set.
+    *
+    * ==Example==
+    * {{{
+    * val regexMatcher = new RegexMatcher()
+    *   .setRules(Array("\d{4}\/\d\d\/\d\d,date", "\d{2}\/\d\d\/\d\d,date_short"))
+    *   .setDelimiter(",")
+    *   .setInputCols("sentence")
+    *   .setOutputCol("regex")
+    *   .setStrategy("MATCH_ALL")
+    * }}}
+    *
+    * @group setParam
+    * @param value
+    *   Array of rules
+    */
+  def setRules(value: Array[String]): this.type = {
+    require(
+      get(externalRules).isEmpty,
+      "Only either parameter rules or externalRules should be set.")
+    set(rules, value)
+  }
+
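+  /** Sets the delimiter that separates the regex pattern from the identifier in each rule.
+    *
+    * Can only be set if externalRules is not set (use the ExternalResource delimiter instead).
+    *
+    * @group setParam
+    * @param value
+    *   Delimiter for the rules; it is passed to `String.split`, so it is treated as a regex
+    */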
+    */
+  def setDelimiter(value: String): this.type = {
+    require(
+      get(externalRules).isEmpty,
+      "Only either parameter rules or externalRules should be set.")
+    set(delimiter, value)
+  }
 
   override def train(
       dataset: Dataset[_],
       recursivePipeline: Option[PipelineModel]): RegexMatcherModel = {
-    val processedRules = ResourceHelper.parseTupleText($(externalRules))
+    val processedRules: Array[(String, String)] =
+      if (get(externalRules).nonEmpty) ResourceHelper.parseTupleText($(externalRules))
+      else {
+        val delim = getOrDefault(delimiter)
+        getOrDefault(rules).map { rule =>
+          rule.split(delim) match {
+            case Array(pattern, identifier) => (pattern, identifier)
+            case a: Array[String] =>
+              throw new IllegalArgumentException(
+                s"Expected 2-tuple after splitting, but got ${a.length} for '$rule'")
+          }
+        }
+      }
+
     new RegexMatcherModel()
       .setExternalRules(processedRules)
       .setStrategy($(strategy))
   }
-
 }
 
 /** This is the companion object of [[RegexMatcher]]. Please refer to that class for the
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherBehaviors.scala