Merge branch 'main' of github.com:alteryx/nlp_primitives into fix_docs

Gaurav Sheni · Gaurav Sheni · commit 06c6d8ba281b · 2022-08-15T10:49:32.000-04:00
diff --git a/nlp_primitives/__init__.py b/nlp_primitives/__init__.py
@@ -10,11 +10,14 @@
 from .diversity_score import DiversityScore
 from .lsa import LSA
 from .mean_characters_per_word import MeanCharactersPerWord
+from .mean_characters_per_sentence import MeanCharactersPerSentence
 from .median_word_length import MedianWordLength
 from .num_unique_separators import NumUniqueSeparators
 from .number_of_common_words import NumberOfCommonWords
+from .number_of_unique_words import NumberOfUniqueWords
 from .number_of_hashtags import NumberOfHashtags
 from .number_of_mentions import NumberOfMentions
+from .number_of_sentences import NumberOfSentences
 from .part_of_speech_count import PartOfSpeechCount
 from .polarity_score import PolarityScore
 from .punctuation_count import PunctuationCount
diff --git a/nlp_primitives/mean_characters_per_sentence.py b/nlp_primitives/mean_characters_per_sentence.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+from featuretools.primitives.base import TransformPrimitive
+from nltk.tokenize import sent_tokenize
+from woodwork.column_schema import ColumnSchema
+from woodwork.logical_types import Double, NaturalLanguage
+
+
+class MeanCharactersPerSentence(TransformPrimitive):
+    """Determines the mean count of characters per sentence in a given string.
+
+    Description:
+        Given a list of strings, determine the mean count of characters per
+        sentence in each string.
+
+        If a string is missing, return `NaN`. If no sentences are found
+        (e.g. an empty string), return 0.
+
+    Examples:
+        >>> x = ['This.', 'Yay! Yay!', 'Dog cat.']
+        >>> mean_characters_per_sentence = MeanCharactersPerSentence()
+        >>> mean_characters_per_sentence(x).tolist()
+        [5.0, 4.0, 8.0]
+    """
+
+    name = "mean_characters_per_sentence"
+    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
+    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
+    default_value = 0
+
+    def get_function(self):
+        def _mean_characters_per_sentence(text):
+            # Missing values (None, NaN, pd.NA) are not strings.
+            if not isinstance(text, str):
+                return np.nan
+            sentences = sent_tokenize(text)
+            # Guard the division: no sentences means nothing to average.
+            if not sentences:
+                return 0
+            return sum(len(s) for s in sentences) / len(sentences)
+
+        def mean_characters_per_sentence(array):
+            return array.apply(_mean_characters_per_sentence)
+
+        return mean_characters_per_sentence
diff --git a/nlp_primitives/number_of_sentences.py b/nlp_primitives/number_of_sentences.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+from featuretools.primitives.base import TransformPrimitive
+from nltk.tokenize import sent_tokenize
+from woodwork.column_schema import ColumnSchema
+from woodwork.logical_types import IntegerNullable, NaturalLanguage
+
+
+class NumberOfSentences(TransformPrimitive):
+    """Counts the sentences contained in each string.
+
+    Description:
+        Given a list of strings, compute how many sentences each
+        string contains.
+
+        Missing values produce `NaN`; an empty string produces 0.
+
+    Examples:
+        >>> x = ['This is a test string.', 'This is second string! This is a second string', 'third string.']
+        >>> number_of_sentences = NumberOfSentences()
+        >>> number_of_sentences(x).tolist()
+        [1, 2, 1]
+    """
+
+    name = "number_of_sentences"
+    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
+    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
+    default_value = 0
+
+    def get_function(self):
+        def _count_sentences(value):
+            # Missing values (None, NaN, pd.NA) are not strings.
+            if not isinstance(value, str):
+                return np.nan
+            if not value:
+                return 0
+            return len(sent_tokenize(value))
+
+        def number_of_sentences(array):
+            return array.apply(_count_sentences)
+
+        return number_of_sentences
diff --git a/nlp_primitives/number_of_unique_words.py b/nlp_primitives/number_of_unique_words.py
@@ -0,0 +1,64 @@
+import re
+from string import punctuation
+from typing import Iterable
+
+import pandas as pd
+from featuretools.primitives.base import TransformPrimitive
+from woodwork.column_schema import ColumnSchema
+from woodwork.logical_types import IntegerNullable, NaturalLanguage
+
+
+class NumberOfUniqueWords(TransformPrimitive):
+    """Counts the distinct words found in each string.
+
+    Description:
+        Computes how many unique words a given string contains. Words can
+        optionally be compared without regard to case.
+
+    Args:
+        case_insensitive (bool, optional): Whether to ignore case when comparing
+            words. For example, with True the string "WORD word" is treated as
+            having one unique word. Defaults to False.
+
+    Examples:
+        >>> x = ['Word word Word', 'This is a SENTENCE.', 'green red green']
+        >>> number_of_unique_words = NumberOfUniqueWords()
+        >>> number_of_unique_words(x).tolist()
+        [2, 4, 2]
+
+        >>> x = ['word WoRD WORD worD', 'dog dog dog', 'catt CAT caT']
+        >>> number_of_unique_words = NumberOfUniqueWords(case_insensitive=True)
+        >>> number_of_unique_words(x).tolist()
+        [1, 1, 2]
+    """
+
+    name = "number_of_unique_words"
+    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
+    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
+
+    default_value = 0
+
+    def __init__(self, case_insensitive=False):
+        self.case_insensitive = case_insensitive
+
+    def get_function(self):
+        def _count_unique(tokens):
+            # Anything that is not iterable (a missing row) yields pd.NA.
+            if not isinstance(tokens, Iterable):
+                return pd.NA
+            stripped = (token.strip(punctuation) for token in tokens)
+            # Empty results (e.g. punctuation-only tokens) are not counted.
+            return len({word for word in stripped if word})
+
+        def num_unique_words(array):
+            if self.case_insensitive:
+                array = array.str.lower()
+            # Split on whitespace and on punctuation, except the characters
+            # '.', "'", '-' and '@', which are kept inside tokens.
+            separators = set(punctuation) - {".", "'", "-", "@"}
+            char_class = re.escape(f" {''.join(separators)}\n\t")
+            tokenized = array.str.split(f"[{char_class}]")
+            return tokenized.apply(_count_unique)
+
+        return num_unique_words
diff --git a/nlp_primitives/tests/test_mean_characters_per_sentence.py b/nlp_primitives/tests/test_mean_characters_per_sentence.py
@@ -0,0 +1,54 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from ..mean_characters_per_sentence import MeanCharactersPerSentence
+from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
+
+
+class TestMeanCharactersPerSentence(PrimitiveT):
+    primitive = MeanCharactersPerSentence
+
+    def test_sentences(self):
+        text = pd.Series(
+            [
+                "Ab. Bb. Db.",
+                "And? Why! Box. Car? Rat.",
+                "Yep.",
+            ]
+        )
+        expected = pd.Series([3.0, 4.0, 4.0])
+        actual = self.primitive().get_function()(text)
+        pd.testing.assert_series_equal(actual, expected, check_names=False)
+
+    def test_multiline(self):
+        text = pd.Series(["Ab\n."])
+        expected = pd.Series([4.0])
+        actual = self.primitive().get_function()(text)
+        pd.testing.assert_series_equal(actual, expected, check_names=False)
+
+    @pytest.mark.parametrize(
+        "na_value",
+        [None, np.nan, pd.NA],
+    )
+    def test_nans(self, na_value):
+        text = pd.Series([na_value, "", "third line"])
+        expected = pd.Series([np.nan, 0, 10.0])
+        actual = self.primitive().get_function()(text)
+        pd.testing.assert_series_equal(actual, expected, check_names=False)
+
+    @pytest.mark.parametrize(
+        "na_value",
+        [None, np.nan, pd.NA],
+    )
+    def test_all_nans(self, na_value):
+        text = pd.Series([na_value, na_value, na_value])
+        expected = pd.Series([np.nan, np.nan, np.nan])
+        actual = self.primitive().get_function()(text)
+        pd.testing.assert_series_equal(actual, expected, check_names=False)
+
+    def test_with_featuretools(self, es):
+        # Append an instance to the applicable transform primitives first.
+        transform, aggregation = find_applicable_primitives(self.primitive)
+        transform.append(self.primitive())
+        valid_dfs(es, aggregation, transform, self.primitive.name.upper())
diff --git a/nlp_primitives/tests/test_number_of_sentences.py b/nlp_primitives/tests/test_number_of_sentences.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+
+from ..number_of_sentences import NumberOfSentences
+from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
+
+
+class TestNumberOfSentences(PrimitiveT):
+    primitive = NumberOfSentences
+
+    def test_regular_input(self):
+        text = pd.Series(
+            [
+                "Hello. Hello! Hello? Hello.",
+                "and?",
+                "yes no",
+            ]
+        )
+        expected = [4.0, 1.0, 1.0]
+        count_sentences = self.primitive().get_function()
+        np.testing.assert_array_equal(count_sentences(text), expected)
+
+    def test_unicode_input(self):
+        text = pd.Series(["Ángel is here.", "Ángel is here áèí! I am not."])
+        expected = [1.0, 2.0]
+        count_sentences = self.primitive().get_function()
+        np.testing.assert_array_equal(count_sentences(text), expected)
+
+    def test_multiline(self):
+        text = pd.Series(
+            [
+                "Yes\n, this is true!",
+            ]
+        )
+
+        expected = [1.0]
+        count_sentences = self.primitive().get_function()
+        np.testing.assert_array_equal(count_sentences(text), expected)
+
+    def test_null(self):
+        text = pd.Series([np.nan, pd.NA, None, ""])
+
+        expected = [np.nan, np.nan, np.nan, 0.0]
+        count_sentences = self.primitive().get_function()
+        np.testing.assert_array_equal(count_sentences(text), expected)
+
+    def test_with_featuretools(self, es):
+        # Append an instance to the applicable transform primitives first.
+        transform, aggregation = find_applicable_primitives(self.primitive)
+        transform.append(self.primitive())
+        valid_dfs(es, aggregation, transform, self.primitive.name.upper())
diff --git a/nlp_primitives/tests/test_number_of_unique_words.py b/nlp_primitives/tests/test_number_of_unique_words.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pandas as pd
+
+from ..number_of_unique_words import NumberOfUniqueWords
+from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
+
+
+class TestNumberOfUniqueWords(PrimitiveT):
+    primitive = NumberOfUniqueWords
+
+    def test_general(self):
+        text = pd.Series(
+            [
+                "test test test test",
+                "test TEST test TEST",
+                "and;subsequent;lines...",
+                "$0.99 alteryx@alteryx.com",
+            ]
+        )
+
+        expected = pd.Series([1, 2, 3, 2])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_special_characters(self):
+        text = pd.Series(["50% 50 50%", "a test* test"])
+
+        expected = pd.Series([1, 2])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_unicode_input(self):
+        text = pd.Series(
+            [
+                "Ángel Angel Ángel ángel",
+            ]
+        )
+
+        expected = pd.Series([3])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_contractions(self):
+        text = pd.Series(
+            [
+                "can't won't don't can't aren't won't don't they'd there's",
+            ]
+        )
+
+        expected = pd.Series([6])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_multiline(self):
+        text = pd.Series(
+            [
+                "word word word word.",
+                "This is \nthird line \nthird line",
+            ]
+        )
+
+        expected = pd.Series([1, 4])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_null(self):
+        text = pd.Series([np.nan, pd.NA, None, "This is a test file."])
+
+        expected = pd.Series([pd.NA, pd.NA, pd.NA, 5])
+        count_unique = self.primitive().get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_case_insensitive(self):
+        text = pd.Series(["WORD word WORd WORd WOrD word"])
+
+        expected = pd.Series([1])
+        count_unique = self.primitive(case_insensitive=True).get_function()
+        pd.testing.assert_series_equal(count_unique(text), expected, check_names=False)
+
+    def test_with_featuretools(self, es):
+        # Append an instance to the applicable transform primitives first.
+        transform, aggregation = find_applicable_primitives(self.primitive)
+        transform.append(self.primitive())
+        valid_dfs(es, aggregation, transform, self.primitive.name.upper())
diff --git a/release_notes.rst b/release_notes.rst
@@ -5,7 +5,9 @@ Changelog
 Future Release
 ==============
     * Enhancements
-        * Add NumberOfHashtags and NumberOfMentions primitives (:pr:`180`)
+        * Add `NumberOfHashtags` and `NumberOfMentions` primitives (:pr:`180`)
+        * Add `NumberOfUniqueWords` primitive (:pr:`187`)
+        * Add `NumberOfSentences` and `MeanCharactersPerSentence` primitives (:pr:`188`)
     * Fixes
         * Update README.md with Alteryx info (:pr:`167`)
     * Changes