Skip to content

Commit

Permalink
Merge branch 'master' into removing-ner-duckling
Browse files (browse the repository at this point in the history)
  • Loading branch information
EPedrotti authored Jan 11, 2019
2 parents f202e66 + 2636bf7 commit 99e59d0
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 15 deletions.
14 changes: 14 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,20 @@ jobs:
- cd rasa-website
- git commit --allow-empty -m "trigger nlu docs update"
- git push origin master
- stage: Test starter packs
name: "NLU starter pack"
python: 3.6
script:
- git clone https://github.com/RasaHQ/starter-pack-rasa-nlu.git
- cd starter-pack-rasa-nlu
- python -m pytest tests/test_nlu.py
- stage: Test starter packs
name: "Stack starter pack (NLU only)"
python: 3.6
script:
- git clone https://github.com/RasaHQ/starter-pack-rasa-stack.git
- cd starter-pack-rasa-stack
- python -m pytest tests/test_nlu.py
- stage: deploy
python: 3.6
install: skip
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Added
- more documentation on how to run NLU with Docker
- ``analyzer`` parameter to ``intent_featurizer_count_vectors`` featurizer to
configure whether to use word or character n-grams
- Travis script now clones and tests the Rasa NLU and Rasa Stack (NLU only) starter packs

Changed
-------
Expand Down
18 changes: 9 additions & 9 deletions rasa_nlu/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@
from future.utils import PY3
from requests.auth import HTTPBasicAuth

# Regular expression to test if string contains emoji code
unicode_regex = re.compile(
u'[\u231A-\u231B\u2328\u23CF\23E9-\u23F3...\U0001F9C0]',
flags=re.UNICODE)


def add_logging_option_arguments(parser, default=logging.WARNING):
"""Add options to an argument parser to configure logging levels."""
Expand Down Expand Up @@ -264,13 +259,18 @@ def read_yaml(content):
yaml_parser.version = "1.2"
yaml_parser.unicode_supplementary = True

if unicode_regex.match(content):
try:
return yaml_parser.load(content)
except yaml.scanner.ScannerError as _:
# A `ruamel.yaml.scanner.ScannerError` might happen due to escaped
# unicode sequences that form surrogate pairs. Try converting the input
# to a parsable format based on
# https://stackoverflow.com/a/52187065/3429596.
content = (content.encode('utf-8')
.decode('unicode_escape')
.decode('raw_unicode_escape')
.encode("utf-16", 'surrogatepass')
.decode('utf-16'))

return yaml_parser.load(content)
return yaml_parser.load(content)


def read_yaml_file(filename):
Expand Down
13 changes: 7 additions & 6 deletions tests/base/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,38 +201,39 @@ def test_emojis_in_yaml():
test_data = """
data:
- one 😁💯 👩🏿‍💻👨🏿‍💻
- two £
- two £ (?u)\\b\\w+\\b f\u00fcr
"""
actual = utils.read_yaml(test_data)

assert actual["data"][0] == "one 😁💯 👩🏿‍💻👨🏿‍💻"
assert actual["data"][1] == "two £"
assert actual["data"][1] == "two £ (?u)\\b\\w+\\b für"


def test_emojis_in_tmp_file():
test_data = """
data:
- one 😁💯 👩🏿‍💻👨🏿‍💻
- two £
- two £ (?u)\\b\\w+\\b f\u00fcr
"""
test_file = utils.create_temporary_file(test_data)
with io.open(test_file, mode='r', encoding="utf-8") as f:
content = f.read()
actual = utils.read_yaml(content)

assert actual["data"][0] == "one 😁💯 👩🏿‍💻👨🏿‍💻"
assert actual["data"][1] == "two £"
assert actual["data"][1] == "two £ (?u)\\b\\w+\\b für"


def test_read_emojis_from_json():
import json
from rasa_nlu.utils import read_yaml
d = {"text": "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️"}
d = {"text": "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️(?u)\\b\\w+\\b} f\u00fcr"}
json_string = json.dumps(d, indent=2)

s = read_yaml(json_string)

assert s.get('text') == "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️"
expected = "hey 😁💯 👩🏿‍💻👨🏿‍💻🧜‍♂️(?u)\\b\\w+\\b} für"
assert s.get('text') == expected


def test_bool_str():
Expand Down

0 comments on commit 99e59d0

Please sign in to comment.