Merge branch 'master' into removing-ner-duckling

RasaHQ · Jan 11, 2019 · 99e59d0
2 parents f202e66 + 2636bf7
commit 99e59d0
Show file tree

Hide file tree

Showing 4 changed files with 31 additions and 15 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -80,6 +80,20 @@ jobs:
     - cd rasa-website
     - git commit --allow-empty -m "trigger nlu docs update"
     - git push origin master
+  - stage: Test starter packs
+    name: "NLU starter pack"
+    python: 3.6
+    script:
+    - git clone https://github.com/RasaHQ/starter-pack-rasa-nlu.git
+    - cd starter-pack-rasa-nlu
+    - python -m pytest tests/test_nlu.py
+  - stage: Test starter packs
+    name: "Stack starter pack (NLU only)"
+    python: 3.6
+    script:
+    - git clone https://github.com/RasaHQ/starter-pack-rasa-stack.git
+    - cd starter-pack-rasa-stack
+    - python -m pytest tests/test_nlu.py
   - stage: deploy
     python: 3.6
     install: skip

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -14,6 +14,7 @@ Added
 - more documentation on how to run NLU with Docker
 - ``analyzer`` parameter to ``intent_featurizer_count_vectors`` featurizer to
   configure whether to use word or character n-grams
+- Travis script now clones and tests the Rasa NLU starter pack
 
 Changed
 -------

diff --git a/rasa_nlu/utils/__init__.py b/rasa_nlu/utils/__init__.py
@@ -24,11 +24,6 @@
 from future.utils import PY3
 from requests.auth import HTTPBasicAuth
 
-# Regular expression to test if string contains emoji code
-unicode_regex = re.compile(
-    u'[\u231A-\u231B\u2328\u23CF\23E9-\u23F3...\U0001F9C0]',
-    flags=re.UNICODE)
-
 
 def add_logging_option_arguments(parser, default=logging.WARNING):
     """Add options to an argument parser to configure logging levels."""
@@ -264,13 +259,18 @@ def read_yaml(content):
     yaml_parser.version = "1.2"
     yaml_parser.unicode_supplementary = True
 
-    if unicode_regex.match(content):
+    try:
+        return yaml_parser.load(content)
+    except yaml.scanner.ScannerError as _:
+        # A `ruamel.yaml.scanner.ScannerError` might happen due to escaped
+        # unicode sequences that form surrogate pairs. Try converting the input
+        # to a parsable format based on
+        # https://stackoverflow.com/a/52187065/3429596.
         content = (content.encode('utf-8')
-                   .decode('unicode_escape')
+                   .decode('raw_unicode_escape')
                    .encode("utf-16", 'surrogatepass')
                    .decode('utf-16'))
-
-    return yaml_parser.load(content)
+        return yaml_parser.load(content)
 
 
 def read_yaml_file(filename):

diff --git a/tests/base/test_utils.py b/tests/base/test_utils.py
@@ -201,38 +201,39 @@ def test_emojis_in_yaml():
     test_data = """
     data:
         - one 😁💯 👩🏿‍💻👨🏿‍💻
-        - two £
+        - two £ (?u)\\b\\w+\\b f\u00fcr
     """
     actual = utils.read_yaml(test_data)
 
     assert actual["data"][0] == "one 😁💯 👩🏿‍💻👨🏿‍💻"
-    assert actual["data"][1] == "two £"
+    assert actual["data"][1] == "two £ (?u)\\b\\w+\\b für"
 
 
 def test_emojis_in_tmp_file():
     test_data = """
         data:
             - one 😁💯 👩🏿‍💻👨🏿‍💻
-            - two £
+            - two £ (?u)\\b\\w+\\b f\u00fcr
         """
     test_file = utils.create_temporary_file(test_data)
     with io.open(test_file, mode='r', encoding="utf-8") as f:
         content = f.read()
     actual = utils.read_yaml(content)
 
     assert actual["data"][0] == "one 😁💯 👩🏿‍💻👨🏿‍💻"
-    assert actual["data"][1] == "two £"
+    assert actual["data"][1] == "two £ (?u)\\b\\w+\\b für"
 
 
 def test_read_emojis_from_json():
     import json
     from rasa_nlu.utils import read_yaml
-    d = {"text": "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️"}
+    d = {"text": "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️(?u)\\b\\w+\\b} f\u00fcr"}
     json_string = json.dumps(d, indent=2)
 
     s = read_yaml(json_string)
 
-    assert s.get('text') == "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️"
+    expected = "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️(?u)\\b\\w+\\b} für"
+    assert s.get('text') == expected
 
 
 def test_bool_str():