Skip to content

Commit

Permalink
fixed warnings encoding and line lengths
Browse files: browse the repository at this point in the history
  • Loading branch information
tmbo committed Dec 12, 2017
1 parent 98f5f27 commit 3625c61
Showing 1 changed file with 67 additions and 27 deletions.
94 changes: 67 additions & 27 deletions rasa_nlu/training_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def get(self, prop, default=None):

def as_dict(self, only_output_properties=False):
if only_output_properties:
d = {key: value for key, value in self.data.items() if key in self.output_properties}
d = {key: value
for key, value in self.data.items()
if key in self.output_properties}
else:
d = self.data
return dict(d, text=self.text)
Expand All @@ -54,7 +56,8 @@ def __eq__(self, other):
if not isinstance(other, Message):
return False
else:
return (other.text, ordered(other.data)) == (self.text, ordered(self.data))
return ((other.text, ordered(other.data)) ==
(self.text, ordered(self.data)))

def __hash__(self):
return hash((self.text, str(ordered(self.data))))
Expand All @@ -67,18 +70,25 @@ class TrainingData(object):
MIN_EXAMPLES_PER_INTENT = 2
MIN_EXAMPLES_PER_ENTITY = 2

def __init__(self, training_examples=None, entity_synonyms=None, regex_features=None):
def __init__(self,
training_examples=None,
entity_synonyms=None,
regex_features=None):
# type: (Optional[List[Message]], Optional[Dict[Text, Text]]) -> None

self.training_examples = self.sanitice_examples(training_examples) if training_examples else []
if training_examples:
self.training_examples = self.sanitice_examples(training_examples)
else:
self.training_examples = []
self.entity_synonyms = entity_synonyms if entity_synonyms else {}
self.regex_features = regex_features if regex_features else []

self.validate()

def sanitice_examples(self, examples):
# type: (List[Message]) -> List[Message]
"""Makes sure the training data is cleaned, e.q. removes trailing whitespaces from intent annotations."""
"""Makes sure the training data is cleaned, e.q. removes trailing
whitespaces from intent annotations."""

for e in examples:
if e.get("intent") is not None:
Expand All @@ -88,19 +98,26 @@ def sanitice_examples(self, examples):
@lazyproperty
def intent_examples(self):
# type: () -> List[Message]
return [e for e in self.training_examples if e.get("intent") is not None]
return [e
for e in self.training_examples
if e.get("intent") is not None]

@lazyproperty
def entity_examples(self):
# type: () -> List[Message]
return [e for e in self.training_examples if e.get("entities") is not None]
return [e
for e in self.training_examples
if e.get("entities") is not None]

@lazyproperty
def num_entity_examples(self):
# type: () -> int
"""Returns the number of proper entity training examples (containing at least one annotated entity)."""
"""Returns the number of proper entity training examples
(containing at least one annotated entity)."""

return len([e for e in self.training_examples if len(e.get("entities", [])) > 0])
return len([e
for e in self.training_examples
if len(e.get("entities", [])) > 0])

@lazyproperty
def num_intent_examples(self):
Expand All @@ -111,30 +128,40 @@ def num_intent_examples(self):

def as_json(self, **kwargs):
# type: (**Any) -> str
"""Represent this set of training examples as json adding the passed meta information."""
"""Represent this set of training examples as json adding
the passed meta information."""

js_entity_synonyms = defaultdict(list)
for k, v in self.entity_synonyms.items():
if k != v:
js_entity_synonyms[v].append(k)

formatted_synonyms = [{'value': value, 'synonyms': syns}
for value, syns in js_entity_synonyms.items()]

formatted_examples = [example.as_dict()
for example in self.training_examples]

return str(json.dumps({
"rasa_nlu_data": {
"common_examples": [example.as_dict() for example in self.training_examples],
"common_examples": formatted_examples,
"regex_features": self.regex_features,
"entity_synonyms": [{'value': value, 'synonyms': syns} for value, syns in js_entity_synonyms.items()]
"entity_synonyms": formatted_synonyms
}
}, **kwargs))

def as_markdown(self, **kwargs):
# type: (**Any) -> str
"""Represent this set of training examples as markdown adding the passed meta information."""
"""Represent this set of training examples as markdown adding
the passed meta information."""

return JsonToMd(self.training_examples, self.entity_synonyms).to_markdown()
return JsonToMd(self.training_examples,
self.entity_synonyms).to_markdown()

def persist(self, dir_name):
# type: (Text) -> Dict[Text, Any]
"""Persists this training data to disk and returns necessary information to load it again."""
"""Persists this training data to disk and returns necessary
information to load it again."""

data_file = os.path.join(dir_name, "training_data.json")
with io.open(data_file, 'w') as f:
Expand All @@ -148,8 +175,10 @@ def sorted_entity_examples(self):
# type: () -> List[Message]
"""Sorts the entity examples by the annotated entity."""

return sorted([entity for ex in self.entity_examples for entity in ex.get("entities")],
key=lambda e: e["entity"])
entity_examples = [entity
for ex in self.entity_examples
for entity in ex.get("entities")]
return sorted(entity_examples, key=lambda e: e["entity"])

def sorted_intent_examples(self):
# type: () -> List[Message]
Expand All @@ -159,7 +188,9 @@ def sorted_intent_examples(self):

def validate(self):
# type: () -> None
"""Ensures that the loaded training data is valid, e.g. has a minimum of certain training examples."""
"""Ensures that the loaded training data is valid, e.g.
has a minimum of certain training examples."""

logger.debug("Validating training data...")
examples = self.sorted_intent_examples()
Expand All @@ -168,24 +199,33 @@ def validate(self):
size = len(list(group))
different_intents.append(intent)
if intent == "":
warnings.warn("Found empty intent, please check your training data."
"This may result in wrong intent predictions.")
warnings.warn("Found empty intent, please check your "
"training data. This may result in wrong "
"intent predictions.")
if size < self.MIN_EXAMPLES_PER_INTENT:
template = "Intent '{}' has only {} training examples! minimum is {}, training may fail."
warnings.warn(template.format(intent, size, self.MIN_EXAMPLES_PER_INTENT))
template = ("Intent '{}' has only {} training examples! "
"minimum is {}, training may fail.")
warnings.warn(template.format(repr(intent),
size,
self.MIN_EXAMPLES_PER_INTENT))

different_entities = []
for entity, group in groupby(self.sorted_entity_examples(), lambda e: e["entity"]):
for entity, group in groupby(self.sorted_entity_examples(),
lambda e: e["entity"]):
size = len(list(group))
different_entities.append(entity)
if size < self.MIN_EXAMPLES_PER_ENTITY:
template = "Entity '{}' has only {} training examples! minimum is {}, training may fail."
warnings.warn(template.format(entity, size, self.MIN_EXAMPLES_PER_ENTITY))
template = ("Entity '{}' has only {} training examples! "
"minimum is {}, training may fail.")
warnings.warn(template.format(repr(entity), size,
self.MIN_EXAMPLES_PER_ENTITY))

logger.info("Training data stats: \n" +
"\t- intent examples: {} ({} distinct intents)\n".format(
self.num_intent_examples, len(different_intents)) +
"\t- found intents: {}\n".format(list_to_str(different_intents)) +
"\t- found intents: {}\n".format(
list_to_str(different_intents)) +
"\t- entity examples: {} ({} distinct entities)\n".format(
self.num_entity_examples, len(different_entities)) +
"\t- found entities: {}\n".format(list_to_str(different_entities)))
"\t- found entities: {}\n".format(
list_to_str(different_entities)))

0 comments on commit 3625c61

Please sign in to comment.