Skip to content

Commit 06c6d8b

Browse files
author
Gaurav Sheni
committed
Merge branch 'main' of github.com:alteryx/nlp_primitives into fix_docs
2 parents f736d0d + 23814b8 commit 06c6d8b

8 files changed

+346
-1
lines changed

nlp_primitives/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
from .diversity_score import DiversityScore
1111
from .lsa import LSA
1212
from .mean_characters_per_word import MeanCharactersPerWord
13+
from .mean_characters_per_sentence import MeanCharactersPerSentence
1314
from .median_word_length import MedianWordLength
1415
from .num_unique_separators import NumUniqueSeparators
1516
from .number_of_common_words import NumberOfCommonWords
17+
from .number_of_unique_words import NumberOfUniqueWords
1618
from .number_of_hashtags import NumberOfHashtags
1719
from .number_of_mentions import NumberOfMentions
20+
from .number_of_sentences import NumberOfSentences
1821
from .part_of_speech_count import PartOfSpeechCount
1922
from .polarity_score import PolarityScore
2023
from .punctuation_count import PunctuationCount
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
import numpy as np
3+
from featuretools.primitives.base import TransformPrimitive
4+
from nltk.tokenize import sent_tokenize
5+
from woodwork.column_schema import ColumnSchema
6+
from woodwork.logical_types import Double, NaturalLanguage
7+
8+
9+
class MeanCharactersPerSentence(TransformPrimitive):
    """Determines the mean count of characters per sentence in a given string.

    Description:
        Given a list of strings, determine the mean count of characters per
        sentence in each string. Sentences are identified with NLTK's
        `sent_tokenize`, and the length of a sentence is the length of the
        string the tokenizer returns for it.

        If a string is missing, return `NaN`. If a string is empty, or the
        tokenizer finds no sentences in it, return `0`.

    Examples:
        >>> x = ['This.', 'Yay! Yay!', 'Dog cat.']
        >>> mean_characters_per_sentence = MeanCharactersPerSentence()
        >>> mean_characters_per_sentence(x).tolist()
        [5.0, 4.0, 8.0]
    """

    name = "mean_characters_per_sentence"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
    default_value = 0

    def get_function(self):
        """Return a function that maps a Series of text to per-row means."""

        def _mean_characters_per_sentence(text):
            # Anything that is not a string (None, NaN, pd.NA, ...) is missing.
            if not isinstance(text, str):
                return np.nan
            if len(text) == 0:
                return 0
            sentences = sent_tokenize(text)
            # The tokenizer may yield no sentences for a non-empty string;
            # treat that like the empty string instead of dividing by zero.
            if not sentences:
                return 0
            return sum(len(s) for s in sentences) / len(sentences)

        def mean_characters_per_sentence(array):
            return array.apply(_mean_characters_per_sentence)

        return mean_characters_per_sentence

nlp_primitives/number_of_sentences.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# -*- coding: utf-8 -*-
2+
import numpy as np
3+
from featuretools.primitives.base import TransformPrimitive
4+
from nltk.tokenize import sent_tokenize
5+
from woodwork.column_schema import ColumnSchema
6+
from woodwork.logical_types import IntegerNullable, NaturalLanguage
7+
8+
9+
class NumberOfSentences(TransformPrimitive):
    """Determines number of sentences in a string.

    Description:
        Given a list of strings, count the sentences that NLTK's
        `sent_tokenize` finds in each string.

        If a string is missing, return `NaN`. An empty string
        contains `0` sentences.

    Examples:
        >>> x = ['This is a test string.', 'This is second string! This is a second string', 'third string.']
        >>> number_of_sentences = NumberOfSentences()
        >>> number_of_sentences(x).tolist()
        [1, 2, 1]
    """

    name = "number_of_sentences"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    default_value = 0

    def get_function(self):
        """Return a function that maps a Series of text to sentence counts."""

        def _count_sentences(value):
            # Strings are tokenized (the empty string short-circuits to 0);
            # every other value is treated as missing.
            if isinstance(value, str):
                return len(sent_tokenize(value)) if value else 0
            return np.nan

        def number_of_sentences(array):
            return array.apply(_count_sentences)

        return number_of_sentences
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import re
2+
from string import punctuation
3+
from typing import Iterable
4+
5+
import pandas as pd
6+
from featuretools.primitives.base import TransformPrimitive
7+
from woodwork.column_schema import ColumnSchema
8+
from woodwork.logical_types import IntegerNullable, NaturalLanguage
9+
10+
11+
class NumberOfUniqueWords(TransformPrimitive):
    """Determines the number of unique words in a string.

    Description:
        Counts the distinct words in each given string. Text is split on
        spaces, newlines, tabs and every punctuation character except
        `.`, `'`, `-` and `@`; leading and trailing punctuation is then
        stripped from each piece and empty pieces are ignored. Optionally
        compares words case-insensitively.

    Args:
        case_insensitive (bool, optional): Specify case_insensitivity when searching for unique words.
            For example, setting this to True would mean "WORD word" would be treated as having
            one unique word. Defaults to False.

    Examples:
        >>> x = ['Word word Word', 'This is a SENTENCE.', 'green red green']
        >>> number_of_unique_words = NumberOfUniqueWords()
        >>> number_of_unique_words(x).tolist()
        [2, 4, 2]

        >>> x = ['word WoRD WORD worD', 'dog dog dog', 'catt CAT caT']
        >>> number_of_unique_words = NumberOfUniqueWords(case_insensitive=True)
        >>> number_of_unique_words(x).tolist()
        [1, 1, 2]
    """

    name = "number_of_unique_words"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})

    default_value = 0

    def __init__(self, case_insensitive=False):
        self.case_insensitive = case_insensitive

    def get_function(self):
        """Return a function that maps a Series of text to unique-word counts."""

        def _count_distinct(tokens):
            # Missing rows stay non-iterable after the split and map to pd.NA.
            if not isinstance(tokens, Iterable):
                return pd.NA
            stripped = (token.strip(punctuation) for token in tokens)
            return len({word for word in stripped if len(word) > 0})

        def num_unique_words(array):
            if self.case_insensitive:
                array = array.str.lower()
            # Punctuation that may appear inside a word is not a delimiter.
            split_chars = "".join(set(punctuation) - {".", "'", "-", "@"})
            escaped = re.escape(f" {split_chars}\n\t")
            pieces = array.str.split(f"[{escaped}]")
            return pieces.apply(_count_distinct)

        return num_unique_words
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
5+
from ..mean_characters_per_sentence import MeanCharactersPerSentence
6+
from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
7+
8+
9+
class TestMeanCharactersPerSentence(PrimitiveT):
    """Tests for the MeanCharactersPerSentence primitive."""

    primitive = MeanCharactersPerSentence

    def test_sentences(self):
        """Multi-sentence strings average the length of each sentence."""
        texts = pd.Series(
            [
                "Ab. Bb. Db.",
                "And? Why! Box. Car? Rat.",
                "Yep.",
            ]
        )
        compute = self.primitive().get_function()
        expected = pd.Series([3.0, 4.0, 4.0])
        result = compute(texts)
        pd.testing.assert_series_equal(result, expected, check_names=False)

    def test_multiline(self):
        """A newline inside a sentence counts toward its length."""
        texts = pd.Series(["Ab\n."])
        compute = self.primitive().get_function()
        expected = pd.Series([4.0])
        result = compute(texts)
        pd.testing.assert_series_equal(result, expected, check_names=False)

    @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
    def test_nans(self, na_value):
        """Missing values give NaN; the empty string gives 0."""
        texts = pd.Series([na_value, "", "third line"])
        compute = self.primitive().get_function()
        expected = pd.Series([np.nan, 0, 10.0])
        result = compute(texts)
        pd.testing.assert_series_equal(result, expected, check_names=False)

    @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
    def test_all_nans(self, na_value):
        """A column made up entirely of missing values yields all NaN."""
        texts = pd.Series([na_value] * 3)
        compute = self.primitive().get_function()
        expected = pd.Series([np.nan] * 3)
        result = compute(texts)
        pd.testing.assert_series_equal(result, expected, check_names=False)

    def test_with_featuretools(self, es):
        """The primitive runs alongside the other applicable primitives."""
        transform, aggregation = find_applicable_primitives(self.primitive)
        transform.append(self.primitive())
        feature_name = self.primitive.name.upper()
        valid_dfs(es, aggregation, transform, feature_name)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from ..number_of_sentences import NumberOfSentences
5+
from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
6+
7+
8+
class TestNumberOfSentences(PrimitiveT):
    """Tests for the NumberOfSentences primitive."""

    primitive = NumberOfSentences

    def test_regular_input(self):
        """Plain strings are split on sentence-ending punctuation."""
        texts = pd.Series(
            [
                "Hello. Hello! Hello? Hello.",
                "and?",
                "yes no",
            ]
        )
        counts = self.primitive().get_function()(texts)
        np.testing.assert_array_equal(counts, [4.0, 1.0, 1.0])

    def test_unicode_input(self):
        """Accented characters do not affect sentence detection."""
        texts = pd.Series(["Ángel is here.", "Ángel is here áèí! I am not."])
        counts = self.primitive().get_function()(texts)
        np.testing.assert_array_equal(counts, [1.0, 2.0])

    def test_multiline(self):
        """A newline alone does not end a sentence."""
        texts = pd.Series(["Yes\n, this is true!"])
        counts = self.primitive().get_function()(texts)
        np.testing.assert_array_equal(counts, [1.0])

    def test_null(self):
        """Missing values give NaN; the empty string gives 0."""
        texts = pd.Series([np.nan, pd.NA, None, ""])
        counts = self.primitive().get_function()(texts)
        np.testing.assert_array_equal(counts, [np.nan, np.nan, np.nan, 0.0])

    def test_with_featuretools(self, es):
        """The primitive runs alongside the other applicable primitives."""
        transform, aggregation = find_applicable_primitives(self.primitive)
        transform.append(self.primitive())
        feature_name = self.primitive.name.upper()
        valid_dfs(es, aggregation, transform, feature_name)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
from ..number_of_unique_words import NumberOfUniqueWords
5+
from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
6+
7+
8+
class TestNumberOfUniqueWords(PrimitiveT):
    """Tests for the NumberOfUniqueWords primitive."""

    primitive = NumberOfUniqueWords

    def test_general(self):
        """Repeated words count once; case differences count separately."""
        texts = pd.Series(
            [
                "test test test test",
                "test TEST test TEST",
                "and;subsequent;lines...",
                "$0.99 alteryx@alteryx.com",
            ]
        )
        wanted = pd.Series([1, 2, 3, 2])
        got = self.primitive().get_function()(texts)
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_special_characters(self):
        """Punctuation attached to a word is not part of the word."""
        texts = pd.Series(["50% 50 50%", "a test* test"])
        wanted = pd.Series([1, 2])
        got = self.primitive().get_function()(texts)
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_unicode_input(self):
        """Accented and unaccented spellings are different words."""
        texts = pd.Series(["Ángel Angel Ángel ángel"])
        wanted = pd.Series([3])
        got = self.primitive().get_function()(texts)
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_contractions(self):
        """Apostrophes inside a word do not split it."""
        texts = pd.Series(
            ["can't won't don't can't aren't won't don't they'd there's"]
        )
        wanted = pd.Series([6])
        got = self.primitive().get_function()(texts)
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_multiline(self):
        """Newlines separate words just like spaces do."""
        texts = pd.Series(
            [
                "word word word word.",
                "This is \nthird line \nthird line",
            ]
        )
        wanted = pd.Series([1, 4])
        got = self.primitive().get_function()(texts)
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_null(self):
        """Missing values map to pd.NA."""
        texts = pd.Series([np.nan, pd.NA, None, "This is a test file."])
        got = self.primitive().get_function()(texts)
        wanted = pd.Series([pd.NA, pd.NA, pd.NA, 5])
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_case_insensitive(self):
        """With case_insensitive=True, differently cased copies count once."""
        texts = pd.Series(["WORD word WORd WORd WOrD word"])
        got = self.primitive(case_insensitive=True).get_function()(texts)
        wanted = pd.Series([1])
        pd.testing.assert_series_equal(got, wanted, check_names=False)

    def test_with_featuretools(self, es):
        """The primitive runs alongside the other applicable primitives."""
        transform, aggregation = find_applicable_primitives(self.primitive)
        transform.append(self.primitive())
        feature_name = self.primitive.name.upper()
        valid_dfs(es, aggregation, transform, feature_name)

release_notes.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ Changelog
55
Future Release
66
==============
77
* Enhancements
8-
* Add NumberOfHashtags and NumberOfMentions primitives (:pr:`180`)
8+
* Add `NumberOfHashtags` and `NumberOfMentions` primitives (:pr:`180`)
9+
* Add `NumberOfUniqueWords` primitive (:pr:`187`)
10+
* Add `NumberOfSentences` and `MeanCharactersPerSentence` primitives (:pr:`188`)
911
* Fixes
1012
* Update README.md with Alteryx info (:pr:`167`)
1113
* Changes

0 commit comments

Comments
 (0)