diff --git a/prod.requirements.txt b/prod.requirements.txt index e6318bb4..7c87f8ac 100644 --- a/prod.requirements.txt +++ b/prod.requirements.txt @@ -1,7 +1,7 @@ loguru>=0.5.1 click>=7.1.2 -smart-open>=2.1.0 +smart-open==2.0.0 uvicorn>=0.11.8 fastapi>=0.61.0 diff --git a/sadedegel/bblock/doc.py b/sadedegel/bblock/doc.py index 855eeb08..e546d4f1 100644 --- a/sadedegel/bblock/doc.py +++ b/sadedegel/bblock/doc.py @@ -173,9 +173,9 @@ def raw_tf(self, drop_stopwords=False, lowercase=False, drop_suffix=False, drop_ v = np.zeros(self.vocabulary.size_cs) if lowercase: - tokens = [tr_lower(t) for t in self.tokens] + tokens = [t.lower_ for t in self.tokens] else: - tokens = self.tokens + tokens = [t.word for t in self.tokens] counter = Counter(tokens) @@ -304,7 +304,6 @@ def __init__(self, id_: int, text: str, doc, config: dict = {}): self.id = id_ self.text = text - self._tokens = None self.document = doc self.config = doc.builder.config self._bert = None @@ -332,7 +331,7 @@ def __init__(self, id_: int, text: str, doc, config: dict = {}): f"Unknown term frequency method {self.tf_method}. Choose on of {','.join(TF_METHOD_VALUES)}") @property - def avgdl(self) -> int: + def avgdl(self) -> float: """Average number of tokens per sentence""" return self.config['default'].getfloat('avg_sentence_length') @@ -361,17 +360,17 @@ def input_ids(self): return self.tokenizer.convert_tokens_to_ids(self.tokens_with_special_symbols) @cached_property - def tokens(self): - return self.tokenizer(self.text) + def tokens(self) -> List[Token]: + return [t for t in self.tokenizer(self.text)] @property def tokens_with_special_symbols(self): - return ['[CLS]'] + self.tokens + ['[SEP]'] + return [Token('[CLS]')] + self.tokens + [Token('[SEP]')] - def rouge1(self, metric): + def rouge1(self, metric) -> float: return rouge1_score( - flatten([[tr_lower(token) for token in sent.tokens] for sent in self.document if sent.id != self.id]), - [tr_lower(t) for t in self.tokens], metric) + flatten([[t.lower_ for t in sent] for sent in self.document if sent.id != self.id]), + [t.lower_ for t in self], metric) @property def bm25(self) -> np.float32: @@ -418,8 +417,7 @@ def tf(self): def idf(self): v = np.zeros(len(self.vocabulary)) - for token in self.tokens: - t = self.vocabulary[token] + for t in self.tokens: if not t.is_oov: v[t.id] = t.idf @@ -438,11 +436,10 @@ def __eq__(self, s: str): return self.text == s # no need for type checking, will return false for non-strings def __getitem__(self, token_ix): - return Token(self.tokens[token_ix]) + return self.tokens[token_ix] def __iter__(self): - for t in self.tokens: - yield Token(t) + yield from self.tokens class Document(TFImpl, IDFImpl, BM25Impl): @@ -459,19 +456,18 @@ def __init__(self, raw, builder): self.config = self.builder.config @property - def avgdl(self) -> int: + def avgdl(self) -> float: """Average number of tokens per document""" return self.config['default'].getfloat('avg_document_length') - @property - def tokens(self): - if self._tokens is None: - self._tokens = [] - for s in self: - for t in s.tokens: - self._tokens.append(t) + @cached_property + def tokens(self) -> List[str]: + tokens = [] + for s in self: + for t in s.tokens: + tokens.append(t) - return self._tokens + return tokens @property def vocabulary(self): @@ -606,7 +602,9 @@ def __init__(self, **kwargs): tokenizer_str = normalize_tokenizer_name(self.config['default']['tokenizer']) - self.tokenizer = WordTokenizer.factory(tokenizer_str) + self.tokenizer = WordTokenizer.factory(tokenizer_str, 
emoji=self.config['tokenizer'].getboolean('emoji'), + hashtag=self.config['tokenizer'].getboolean('hashtag'), + mention=self.config['tokenizer'].getboolean('mention')) Token.set_vocabulary(self.tokenizer.vocabulary) diff --git a/sadedegel/tokenize/_sent.py b/sadedegel/bblock/sbd.py similarity index 100% rename from sadedegel/tokenize/_sent.py rename to sadedegel/bblock/sbd.py diff --git a/sadedegel/bblock/token.py b/sadedegel/bblock/token.py index 9e5f7c88..824818b6 100644 --- a/sadedegel/bblock/token.py +++ b/sadedegel/bblock/token.py @@ -2,6 +2,7 @@ from math import log import numpy as np +from cached_property import cached_property from .util import tr_lower, load_stopwords, deprecate, ConfigNotSet, VocabularyIsNotSet, WordVectorNotFound from .vocabulary import Vocabulary @@ -25,13 +26,7 @@ def get_idf(self, method=IDF_SMOOTH, drop_stopwords=False, lowercase=False, drop else: v = np.zeros(self.vocabulary.size_cs) - if lowercase: - tokens = [tr_lower(t) for t in self.tokens] - else: - tokens = self.tokens - - for token in tokens: - t = Token(token) + for t in self.tokens: if t.is_oov or (drop_stopwords and t.is_stopword) or (drop_suffix and t.is_suffix) or ( drop_punct and t.is_punct): continue @@ -106,7 +101,9 @@ def _create_token(cls, word: str): token.is_punct = all(unicodedata.category(c).startswith("P") for c in token.word) token.is_digit = token.word.isdigit() token.is_suffix = token.word.startswith('##') - token.shape = word_shape(token.word) + token.is_emoji = False + token.is_hashtag = False + token.is_mention = False return token @@ -117,6 +114,17 @@ def __new__(cls, word: str): return cls.cache[word] + def __len__(self): + return len(self.word) + + def __eq__(self, other): + if type(other) == str: + return self.word == other + elif type(other) == Token: + return self.word == other.word + else: + raise TypeError(f"Unknown comparison type with Token {type(other)}") + @classmethod def set_vocabulary(cls, vocab: Vocabulary): Token.vocabulary = vocab @@ -236,6 +244,10 @@ def vector(self) -> np.ndarray: else: raise WordVectorNotFound(self.word) + @cached_property + def shape(self) -> str: + return word_shape(self.word) + def __str__(self): return self.word diff --git a/sadedegel/bblock/util.py b/sadedegel/bblock/util.py index f06882e6..dbee4659 100644 --- a/sadedegel/bblock/util.py +++ b/sadedegel/bblock/util.py @@ -20,7 +20,10 @@ def tr_lower(s: str) -> str: - return s.replace("I", "ı").replace("İ", "i").lower() + if "I" in s or "İ" in s: + return s.replace("I", "ı").replace("İ", "i").lower() + else: + return s.lower() def tr_upper(s: str) -> str: @@ -161,14 +164,15 @@ def load_stopwords(base_path=None): return stopwords -def deprecate(message: str, eol_version: tuple): +def deprecate(message: str, eol_version: tuple, post_message: str = None): current = tuple([int(v) for v in __version__.split('.')]) if current >= eol_version: - console.print(f"[red]{message}[/red]") + console.print(f"[red]{message}[/red]. {post_message}") sys.exit(1) else: - console.print(f"[magenta]{message}[/magenta], will be dropped by {'.'.join(map(str, eol_version))}") + console.print( + f"{message}, will be [magenta]dropped[/magenta] by {'.'.join(map(str, eol_version))}. 
{post_message}") class ConfigNotSet(Exception): diff --git a/sadedegel/bblock/word_tokenizer.py b/sadedegel/bblock/word_tokenizer.py index 54aefad0..de61c169 100644 --- a/sadedegel/bblock/word_tokenizer.py +++ b/sadedegel/bblock/word_tokenizer.py @@ -1,24 +1,73 @@ import sys +import re +import sys import warnings from abc import ABC, abstractmethod +from dataclasses import dataclass +from enum import Enum from typing import List from cached_property import cached_property from rich.console import Console +from sadedegel.bblock.word_tokenizer_helper import ICUTokenizerHelper from .util import normalize_tokenizer_name from .vocabulary import Vocabulary -from .word_tokenizer_helper import word_tokenize, ICUTokenizerHelper +from .token import Token +from .word_tokenizer_helper import word_tokenize from ..about import __version__ + +class TokenType(Enum): + TEXT = "text" + MENTION = "mention" + EMOJI = "emoji" + HASHTAG = "hashtag" + + +@dataclass +class TokenSpan: + type: TokenType + start: int + end: int + + console = Console() class WordTokenizer(ABC): __instances = {} - def __init__(self): + def __init__(self, mention=False, hashtag=False, emoji=False): + """ + + @param mention: Handle mention in tweet texts. + @param hashtag: Handle hashtag in tweet texts. + @param emoji: Handle emoji unicode texts in texts. + """ self._vocabulary = None + self.mention = mention + self.hashtag = hashtag + self.emoji = emoji + + self.regexes = [] + + if self.hashtag: + console.print("Handling hashtags") + self.regexes.append(re.compile(r"(?P#\S+)")) + + if self.mention: + console.print("Handling mentions") + self.regexes.append(re.compile(r"(?P@\S+)")) + + if self.emoji: + self.regexes.append(re.compile(r"(?P[\U00010000-\U0010ffff])", + flags=re.UNICODE)) + + if len(self.regexes) > 0: + self.exception_rules = re.compile('|'.join(x.pattern for x in self.regexes), flags=re.UNICODE) + + console.log(f"{len(self.regexes)} tokenizer exception rules.") @abstractmethod def _tokenize(self, text: str) -> List[str]: @@ -28,39 +77,89 @@ def _tokenize(self, text: str) -> List[str]: def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: pass - def __call__(self, sentence: str) -> List[str]: - return self._tokenize(str(sentence)) + def __call__(self, sentence: str) -> List[Token]: + text = str(sentence) + + if len(self.regexes) == 0: + return [Token(t) for t in self._tokenize(text)] + else: + EOS = len(text) + + spans = [] + for m in self.exception_rules.finditer(text): + start, end = m.start(), m.end() + + if len(spans) == 0: + if start != 0: + spans.append(TokenSpan(TokenType.TEXT, 0, start)) + else: + if start > spans[-1].end: + spans.append(TokenSpan(TokenType.TEXT, spans[-1].end, start)) + + if m.lastgroup == "hashtag": + spans.append(TokenSpan(TokenType.HASHTAG, start, end)) + elif m.lastgroup == "mention": + spans.append(TokenSpan(TokenType.MENTION, start, end)) + else: + spans.append(TokenSpan(TokenType.EMOJI, start, end)) + + if len(spans) == 0: + if EOS != 0: + spans.append(TokenSpan(TokenType.TEXT, 0, EOS)) + else: + if EOS > spans[-1].end: + spans.append(TokenSpan(TokenType.TEXT, spans[-1].end, EOS)) + + tokens = [] + for s in spans: + if s.type == TokenType.TEXT: + tokens += [Token(t) for t in self._tokenize(text[s.start:s.end])] + elif s.type == TokenType.EMOJI: + t = Token(text[s.start:s.end]) + t.is_emoji = True + tokens.append(t) + elif s.type == TokenType.HASHTAG: + t = Token(text[s.start:s.end]) + t.is_hashtag = True + tokens.append(t) + else: + t = Token(text[s.start:s.end]) + t.is_mention = 
True + tokens.append(t) + + return tokens @staticmethod - def factory(tokenizer_name: str): + def factory(tokenizer_name: str, mention=False, hashtag=False, emoji=False): + console.log(f"mention={mention}, hashtag={hashtag}, emoji={emoji}") normalized_name = normalize_tokenizer_name(tokenizer_name) if normalized_name not in WordTokenizer.__instances: if normalized_name == "bert": - WordTokenizer.__instances[normalized_name] = BertTokenizer() + return BertTokenizer(mention, hashtag, emoji) elif normalized_name == "simple": warnings.warn( ("Note that SimpleTokenizer is pretty new in sadedeGel. " "If you experience any problems, open up a issue " "(https://github.com/GlobalMaksimum/sadedegel/issues/new)")) - WordTokenizer.__instances[normalized_name] = SimpleTokenizer() + return SimpleTokenizer(mention, hashtag, emoji) elif normalized_name == "icu": - WordTokenizer.__instances[normalized_name] = ICUTokenizer() + return ICUTokenizer(mention, hashtag, emoji) else: raise Exception( (f"No word tokenizer type match with name {tokenizer_name}." " Use one of 'bert-tokenizer', 'SimpleTokenizer', etc.")) - return WordTokenizer.__instances[normalized_name] + # return WordTokenizer.__instances[normalized_name] class BertTokenizer(WordTokenizer): __name__ = "BertTokenizer" - def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]: - return self.tokenizer.convert_tokens_to_ids(tokens) + def convert_tokens_to_ids(self, tokens: List[Token]) -> List[int]: + return self.tokenizer.convert_tokens_to_ids([t.word for t in tokens]) - def __init__(self): - super(BertTokenizer, self).__init__() + def __init__(self, mention=False, hashtag=False, emoji=False): + super(BertTokenizer, self).__init__(mention, hashtag, emoji) self.tokenizer = None @@ -91,8 +190,8 @@ def vocabulary(self): class SimpleTokenizer(WordTokenizer): __name__ = "SimpleTokenizer" - def __init__(self): - super(SimpleTokenizer, self).__init__() + def __init__(self, mention=False, hashtag=False, emoji=False): + super(SimpleTokenizer, self).__init__(mention, hashtag, emoji) self.tokenizer = word_tokenize def _tokenize(self, text: str) -> List[str]: @@ -114,8 +213,8 @@ def vocabulary(self): class ICUTokenizer(WordTokenizer): __name__ = "ICUTokenizer" - def __init__(self): - super(ICUTokenizer, self).__init__() + def __init__(self, mention=False, hashtag=False, emoji=False): + super(ICUTokenizer, self).__init__(mention, hashtag, emoji) self.tokenizer = ICUTokenizerHelper() def _tokenize(self, text: str) -> List[str]: @@ -132,17 +231,3 @@ def vocabulary(self): console.print("[red]icu[/red] vocabulary file not found.") return None - - -def get_default_word_tokenizer() -> WordTokenizer: - if tuple(map(int, __version__.split('.'))) < (0, 17): - warnings.warn( - ("get_default_word_tokenizer is deprecated and will be removed by 0.17. " - "Use `sadedegel config` to get default configuration. " - "Use ~/.sadedegel/user.ini to update default tokenizer."), - DeprecationWarning, - stacklevel=2) - else: - raise Exception("Remove get_default_word_tokenizer before release.") - - return WordTokenizer.factory(BertTokenizer.__name__) diff --git a/sadedegel/config.py b/sadedegel/config.py index 15978742..135b65b4 100644 --- a/sadedegel/config.py +++ b/sadedegel/config.py @@ -71,6 +71,9 @@ def show_config(config, section=None): "default__drop_stopwords": ("Whether to drop stopwords in various calculations. 
" "Such as, tfidf, bm25, etc."), "default__lowercase": "Whether to use lowercased form rather than form itself.", + "tokenizer__hashtag": "enable/disable hashtag (#sadedegel) handler in word tokenizer", + "tokenizer__mention": "enable/disable mention (@sadedegel) handler in word tokenizer", + "tokenizer__emoji": "enable/disable emoji (🍰) handler in word tokenizer", "default__drop_punct": ("Whether to drop punctuations in various calculations. " "Such as, tfidf, bm25, etc."), "tf__method": "Method used in term frequency calculation", @@ -78,6 +81,8 @@ def show_config(config, section=None): "idf__method": "Method used in Inverse Document Frequency calculation", "bert__avg_document_length": "Average number of tokens in a bert tokenized document.", "bert__avg_sentence_length": "Average number of tokens in a bert tokenized sentences.", + "icu__avg_document_length": "Average number of tokens in a icu tokenized document.", + "icu__avg_sentence_length": "Average number of tokens in a icu tokenized sentences.", "bert__drop_suffix": ("Whether to drop BERT generated suffixes in various calculations. " "Such as, tfidf, bm25, etc."), "simple__avg_document_length": "Average token count in a simple tokenizer tokenized document.", @@ -98,7 +103,7 @@ def show_config(config, section=None): table.add_column("parameter_name") table.add_column("current_value") table.add_column("default_value") - table.add_column("description", width=40) + table.add_column("description", width=50) for sec in config.sections(): if sec == section or section is None: diff --git a/sadedegel/dataset/README.md b/sadedegel/dataset/README.md index 627ccf4a..a60cc214 100644 --- a/sadedegel/dataset/README.md +++ b/sadedegel/dataset/README.md @@ -51,18 +51,17 @@ sents = load_annotated_corpus() * [extended](extended/) **raw** is simply a larger collection of news documents collected by [scraper] * [extended](extended/) **sents** is generated using [extended](extended/) **raw** and ML based sentence boundary detector trained over [sents](sents/) corpus - -### Download Dataset +### Download Dataset -You can download extended dataset using +You can download extended dataset using ```bash python -m sadedegel.dataset.extended download ``` -Sub command requires two flags to access GCS buckets +Sub command requires two flags to access GCS buckets * `access-key` * `secret-key` @@ -74,7 +73,7 @@ Those can be passed in 3 ways: ### Check Metadata -You can assert your extended dataset using +You can assert your extended dataset using ```bash python -m sadedegel.dataset.extended metadata @@ -148,7 +147,7 @@ Corpora consist of two corpus * sports * travel -[tscorpus] allows us to +[tscorpus] allows us to 1. Verify/Calibrate word tokenizers (bert, simple, etc.) available in sadedegel 2. Ship a prebuilt news classifier. @@ -175,16 +174,17 @@ data = load_classification_raw() Refer [news classification](../prebuilt/README.md) for details - ## `profanity` -Corpus used in [SemEval-2020 Task 12](https://arxiv.org/pdf/2006.07235.pdf) to implement profanity classifier over Turkish tweeter dataset. + +Corpus used in [SemEval-2020 Task 12](https://arxiv.org/pdf/2006.07235.pdf) to implement profanity classifier over +Turkish tweeter dataset. Training dataset contains 31277 documents, whereas test dataset consists of 3515 documents. 
### Using Corpus ```python -from sadedegel.dataset.profanity import load_offenseval_train ,load_offenseval_test_label,load_offenseval_test +from sadedegel.dataset.profanity import load_offenseval_train, load_offenseval_test_label, load_offenseval_test tr = load_offenseval_train() tst = load_offenseval_test() @@ -192,25 +192,174 @@ tst_label = load_offenseval_test_label() next(tr) -#{'id': 20948, +# {'id': 20948, # 'tweet': "@USER en güzel uyuyan insan ödülü jeon jungkook'a gidiyor...", # 'profanity_class': 0} next(tst) -#{'id': 41993, 'tweet': '@USER Sayın başkanım bu şekilde devam inşallah👏'} +# {'id': 41993, 'tweet': '@USER Sayın başkanım bu şekilde devam inşallah👏'} next(tst_label) -#{'id': 41993, 'profanity_class': 0} +# {'id': 41993, 'profanity_class': 0} ``` For more details please refer [tweet profanity](../prebuilt/README.md) ## `tweet_sentiment` -[Twitter Dataset](https://www.kaggle.com/mrtbeyz/trke-sosyal-medya-paylam-veri-seti) is another corpus used to build prebuilt + +[Twitter Dataset](https://www.kaggle.com/mrtbeyz/trke-sosyal-medya-paylam-veri-seti) is another corpus used to build prebuilt tweeter sentiment classifier. For more details please refer [tweet sentiment](../prebuilt/README.md) +## `customer_review` + +Customer review classification corpus consists of 323479 training and 107827 test instances which contains customer reviews in +the `text` field and shopping category that the review refers to in the `review_class` field. + +There are 32 unique class labels for this corpus which are mapped to their respective IDs on `CLASS_VALUES` dict. + +### Using Corpus +**Usage** + +```python +from sadedegel.dataset.customer_review import load_train +from sadedegel.dataset.customer_review import load_test +from sadedegel.dataset.customer_review import load_test_label + +next(load_train()) + +# Out[6]: +# {'id': 'cb60a760-cfeb-44e8-abb1-4cbcd6814c64', +# 'text': 'Hipp 1 Mama Bebeğime Zarar Verdi,"Hipp 1devam sütü bebeğimde inanılmaz derecede gaz ve kusmaya neden oldu! Kızımda yıllar önce yine Hipp in devam sütü, pirinç maması, ek gıdaları gofret ve bisküvileri, yine aynı şekilde erik meyve püreleri her şeyini kullanıyordum. Hiçbir şekilde böyle bir sorunla karşılaşmamıştım. Ancak bu sefer al...Devamını oku"', +# 'review_class': 1} + +next(load_test()) + +# {'id': '97fdc0de-98e1-4577-9d7f-86cb71a49bbe', +# 'text': 'Samsung Garanti Garanti Değil!,BDH ile anlaşılmış garanti şirketi olarak ama hiçbir şekilde ne onlar kabul ediyor hatalarını ne de Samsung üstleniyor. Ben bıktım servise kendimi iletemediğimi sanıyordum içerisinde kağıt ile şikayetlerimi gönderdim ama maalesef okumayıp bir kez daha beni salak yerine koyup bu sefer eğik olan kasam...Devamını oku'} + +next(load_test_label()) + +# {'id': '97fdc0de-98e1-4577-9d7f-86cb71a49bbe', 'review_class': 4} +``` + +```python +from sadedegel.dataset.customer_review import CLASS_VALUES + +CLASS_VALUES[1] + +# Out[2]: 'alisveris' +``` +## `telco_sentiment` + +Telecom Sentiment dataset is an open sourced tweet sentiment corpus that includes tweets referring to a certain telecom +company. It is a social media commentary dataset used to evaluate sentiments over a certain brand. +Dataset [source](http://www.kemik.yildiz.edu.tr/veri_kumelerimiz.html) +and [paper](https://ieeexplore.ieee.org/document/8554037) are provided. 
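Before fitting anything on this corpus, it can help to eyeball the label balance. A minimal sketch, assuming each training record carries the `sentiment_class` field that the prebuilt pipeline in `sadedegel/prebuilt/telco_sentiment.py` fits on:

```python
from collections import Counter

from sadedegel.dataset.telco_sentiment import load_telco_sentiment_train

# Tally tweets per sentiment class across the training generator.
dist = Counter(rec['sentiment_class'] for rec in load_telco_sentiment_train())
print(dist.most_common())
```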
+ +### Using Corpus + +```python +from sadedegel.dataset.telco_sentiment import load_telco_sentiment_train +from sadedegel.dataset.telco_sentiment import load_telco_sentiment_test +from sadedegel.dataset.telco_sentiment import load_telco_sentiment_test_label + +import pandas as pd + +train_raw = load_telco_sentiment_train() +test_raw = load_telco_sentiment_test() +target_raw = load_telco_sentiment_test_label() + +train_df = pd.DataFrame().from_records(train_raw) +test_df = pd.DataFrame().from_records(test_raw) +target_df = pd.DataFrame().from_records(target_raw) +``` + +## `movie_sentiment` + +[Movie sentiment dataset](https://www.kaggle.com/mustfkeskin/turkish-movie-sentiment-analysis-dataset) is a corpus of ** +entertainment** domain. + +It contains 42975 instances of movie reviews with `POSITIVE` and `NEGATIVE` sentiments as a class label. + +### Using Corpus + +```python +from sadedegel.dataset.movie_sentiment import load_movie_sentiment_train +from sadedegel.dataset.movie_sentiment import load_movie_sentiment_test +from sadedegel.dataset.movie_sentiment import load_movie_sentiment_test_label + +train = load_movie_sentiment_train() +test = load_movie_sentiment_test() +test_label = load_movie_sentiment_test_label() +``` + +## `hotel_sentiment` + +Hotel sentiment data is part of [HUMIR dataset](http://humirapps.cs.hacettepe.edu.tr/tsad.aspx), which is a combination +of hotel and movie reviews. This implementation contains reviews of type 'Hotel Review'. + +It contains 11,600 instances with `POSITIVE` and `NEGATIVE` sentiments as a class label. The train and +test split is based on the split present in HUMIR dataset. + +### Using Corpus + +```python +from sadedegel.dataset.hotel_sentiment import load_hotel_sentiment_train +from sadedegel.dataset.hotel_sentiment import load_hotel_sentiment_test +from sadedegel.dataset.hotel_sentiment import load_hotel_sentiment_test_label + +train = load_hotel_sentiment_train() +test = load_hotel_sentiment_test() +test_label = load_hotel_sentiment_test_label() +``` + +## `categorized_product_sentiment` + +This corpus contains 5600 instances of customer product reviews from E-commerce sites. Reviews contain two sets of class labels. First label is `sentiment_class` which contains `[POSITIVE, NEGATIVE]` sentiment of the review. Second label is `product_category` which contains `["Kitchen", "DVD", "Books", "Electronics"]` as the category of the product being reviewed. Each product category contains 1400 instances. The dataset is material to the research [paper](https://sentic.net/wisdom2013pechenizkiy.pdf) by Demirtaş and Pechenizkiy. + +Number of instances in each `product_category` grouped by `sentiment_class`: + +| | `Kitchen` | `Books` | `DVD` | `Electronics` | +| :--- | :----: | :---: | :---: | :---: | +| **`POSITIVE`** | 700 | 700 | 700 | 700 | +| **`NEGATIVE`** | 700 | 700 | 700 | 700 | + +```python +import pandas as pd + +from sadedegel.dataset.categorized_product_sentiment import load_categorized_product_sentiment_train +from sadedegel.dataset.categorized_product_sentiment import SENTIMENT_CLASS_VALUES, PRODUCT_CATEGORIES + +raw = load_categorized_product_sentiment_train() + +next(raw) + +# Out [0]: {'id': 'bac3a153-397e-4c90-aaec-c9dfa51a9784', 'text': 'ürünün tedarik edilme süreci biraz uzasa da beklediğime değdi, set cam ağırlıklı olmasına rağmen sağlam olarak elime ulaştı. 
almayı düşünenlere tavsiye ederim, set beklentilerinizi karşılıyor...', 'product_category': 0, 'sentiment_class': 0} + + +df = pd.DataFrame().from_records(raw) + +# Load Subsets + +dvd = load_categorized_product_sentiment_train('DVD') +kitchen = load_categorized_product_sentiment_train('Kitchen') +books = load_categorized_product_sentiment_train('Books') +electronics = load_categorized_product_sentiment_train('Electronics') + +# Mappings + +SENTIMENT_CLASS_VALUES[0] +# Out [0]: 'POSITIVE' + +PRODUCT_CATEGORIES[0] +# Out [0]: 'Kitchen' +``` + + [scraper]: https://github.com/GlobalMaksimum/sadedegel-scraper + [Taner Sezer]: https://github.com/tanerim -[tscorpus]: tscorpus/ \ No newline at end of file + +[tscorpus]: tscorpus/ diff --git a/sadedegel/dataset/categorized_product_sentiment/__init__.py b/sadedegel/dataset/categorized_product_sentiment/__init__.py new file mode 100644 index 00000000..982c7f12 --- /dev/null +++ b/sadedegel/dataset/categorized_product_sentiment/__init__.py @@ -0,0 +1,2 @@ +from ._core import load_categorized_product_sentiment_train +from ._core import SENTIMENT_CLASS_VALUES, PRODUCT_CATEGORIES, CORPUS_SIZE diff --git a/sadedegel/dataset/categorized_product_sentiment/__main__.py b/sadedegel/dataset/categorized_product_sentiment/__main__.py new file mode 100644 index 00000000..3344f382 --- /dev/null +++ b/sadedegel/dataset/categorized_product_sentiment/__main__.py @@ -0,0 +1,90 @@ +import gzip +import os.path +import sys +from itertools import tee +from pathlib import Path +from shutil import copyfileobj + +import boto3 +import click +from loguru import logger +from rich.console import Console +from smart_open import open + +from ._core import load_categorized_product_sentiment_train, CORPUS_SIZE + +console = Console() + +logger.disable("sadedegel") + + +@click.group(help="Categorized Product Sentiment Dataset Commandline") +def cli(): + pass + + +@cli.command() +@click.option("--access-key", help="Access Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_access_key', '')) +@click.option("--secret-key", help="Secret Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_secret_key', '')) +@click.option("--data_home", '-d', help="Data home directory", default="~/.sadedegel_data") +def download(access_key, secret_key, data_home): + """Download twitter sentiment corpus from cloud with your key.""" + + data_home = Path(os.path.expanduser(data_home)) / 'categorized_product_sentiment' + data_home.mkdir(parents=True, exist_ok=True) + console.print(f"Data directory for categorized product sentiment corpus {data_home}") + + transport_params = { + 'session': boto3.Session(aws_access_key_id=access_key, + aws_secret_access_key=secret_key), + 'resource_kwargs': { + 'endpoint_url': 'https://storage.googleapis.com', + } + } + + url = f"s3://sadedegel/dataset/categorized_product_sentiment.csv.gz" + + with open(url, 'rb', transport_params=transport_params) as fp, gzip.open(data_home / os.path.basename(url), + "wb") as wp: + copyfileobj(fp, wp) + + +@cli.command() +def validate(): + """Sanity check on corpus + """ + with console.status("[bold yellow]Validating train"): + train = load_categorized_product_sentiment_train() + + train_clone, train_s, train_c = tee(train, 3) + + n_train = sum(1 for _ in train_clone) + sentiment_categories = set([row['sentiment_class'] for row in train_s]) + product_categories = set([row['product_category'] for row in train_c]) + + if n_train == CORPUS_SIZE: + console.log("Cardinality check 
[yellow]DONE[/yellow]")
+    else:
+        console.log(f"Cardinality check [red]FAILED[/red]")
+        console.log(f"|Categorized sentiment (train)| : {n_train} ({CORPUS_SIZE} expected)")
+        sys.exit(1)
+
+    if sentiment_categories == {0, 1}:
+        console.log("Sentiment Label check [yellow]DONE[/yellow]")
+    else:
+        console.log("Sentiment Label check [red]FAILED[/red]")
+        console.log(f"\tCategorized product sentiment classes : {sentiment_categories} ({set(['POSITIVE', 'NEGATIVE'])} expected)")
+        sys.exit(1)
+
+    if product_categories == {0, 1, 2, 3}:
+        console.log("Product Category Label check [yellow]DONE[/yellow]")
+    else:
+        console.log("Category Label check [red]FAILED[/red]")
+        console.log(f"\tCategorized product category classes : {product_categories} ({set(['Kitchen', 'DVD', 'Books', 'Electronics'])} expected)")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/sadedegel/dataset/categorized_product_sentiment/_core.py b/sadedegel/dataset/categorized_product_sentiment/_core.py
new file mode 100644
index 00000000..d8f378a9
--- /dev/null
+++ b/sadedegel/dataset/categorized_product_sentiment/_core.py
@@ -0,0 +1,79 @@
+import csv
+import gzip
+from pathlib import Path
+from rich.console import Console
+
+from typing import Union, List, Iterator
+
+SENTIMENT_CLASS_VALUES = ['POSITIVE', 'NEGATIVE']
+PRODUCT_CATEGORIES = ['Kitchen', 'DVD', 'Books', 'Electronics']
+CORPUS_SIZE = 5600
+
+console = Console()
+
+__general_download_message__ = """Ensure that you have properly downloaded categorized product sentiment corpus using
+
+        python -m sadedegel.dataset.categorized_product_sentiment download --access-key xxx --secret-key xxxx
+
+        Unfortunately due to data licensing issues we could not share data publicly.
+        Get in touch with sadedegel team to obtain a download key.
+        """
+
+
+def check_directory_structure(path: str) -> bool:
+    base_dir = Path(path).expanduser()
+
+    cat_prod_sentiment_dir = base_dir / 'categorized_product_sentiment'
+
+    if not base_dir.exists():
+        console.log(f"Dataset base directory ([bold red]{base_dir}[/bold red]) does not exist")
+
+    elif not cat_prod_sentiment_dir.exists():
+        console.log(
+            f"Categorized product sentiment directory ([bold red]{cat_prod_sentiment_dir}[/bold red]) does not exist")
+
+    else:
+        return True
+
+    console.log(__general_download_message__)
+
+    return False
+
+
+def load_categorized_product_sentiment_train(data_home="~/.sadedegel_data",
+                                             categories: Union[None, List[str], str] = None) -> Iterator[dict]:
+    """
+
+    @param data_home: Sadedegel data directory base. Defaults to ~/.sadedegel_data
+    @param categories:
+        If None (default), load all the categories.
+        If not None, list of category names (or a single category) to load (other categories
+        ignored).
+ @return: Iterator of dictionary + """ + + if not check_directory_structure(data_home): + raise Exception("Categorized Product Corpus validation error") + + train_csv = Path(data_home).expanduser() / "categorized_product_sentiment" + train_csv = train_csv / "categorized_product_sentiment.csv.gz" + + if categories is None: + filtered_categories = PRODUCT_CATEGORIES + elif isinstance(categories, str): + filtered_categories = [categories] + elif isinstance(categories, list): + filtered_categories = categories + else: + raise ValueError(f"categories of type {type(categories)} is invalid.") + + with gzip.open(train_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + d = dict(id=rec['text_uuid'], text=rec['text'], + product_category=PRODUCT_CATEGORIES.index(rec['category']), + sentiment_class=SENTIMENT_CLASS_VALUES.index(rec['sentiment_class'])) + + if rec['category'] in filtered_categories: + yield d diff --git a/sadedegel/dataset/customer_review/__init__.py b/sadedegel/dataset/customer_review/__init__.py new file mode 100644 index 00000000..ba0d3d1b --- /dev/null +++ b/sadedegel/dataset/customer_review/__init__.py @@ -0,0 +1,2 @@ +from ._core import load_train, load_test, load_test_label +from ._core import CLASS_VALUES, CORPUS_SIZE diff --git a/sadedegel/dataset/customer_review/__main__.py b/sadedegel/dataset/customer_review/__main__.py new file mode 100644 index 00000000..37ee23f5 --- /dev/null +++ b/sadedegel/dataset/customer_review/__main__.py @@ -0,0 +1,91 @@ +import os.path +import sys +from itertools import tee +from pathlib import Path +from zipfile import ZipFile +import logging + +import boto3 +import click +from loguru import logger +from rich.console import Console +from smart_open import open + +from ._core import load_test_label, load_test, \ + load_train, CORPUS_SIZE + +console = Console() + +logger.disable("sadedegel") + + +@click.group(help="Customer Review Classification Dataset Commandline") +def cli(): + pass + + +@cli.command() +@click.option("--access-key", help="Access Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_access_key', '')) +@click.option("--secret-key", help="Secret Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_secret_key', '')) +@click.option("--data_home", '-d', help="Data home directory", default="~/.sadedegel_data") +@click.option("--verbose", "-v", is_flag=True, default=False, help="Cli verbosity") +def download(access_key, secret_key, data_home, verbose): + """Download tokenization corpus from cloud with your key.""" + + data_home = Path(os.path.expanduser(data_home)) + data_home.mkdir(parents=True, exist_ok=True) + console.print(f"Data directory for customer review classification data {data_home}") + + if verbose: + boto3.set_stream_logger("boto3", logging.DEBUG) + boto3.set_stream_logger("botocore", logging.DEBUG) + + transport_params = { + 'session': boto3.Session(aws_access_key_id=access_key, + aws_secret_access_key=secret_key), + 'resource_kwargs': { + 'endpoint_url': 'https://storage.googleapis.com', + } + } + + url = f"s3://sadedegel/dataset/customer_review_classification.zip" + + with open(url, 'rb', transport_params=transport_params) as fp: + with ZipFile(fp) as zp: + zp.extractall(data_home) + + +@cli.command() +def validate(): + """Sanity check on corpus + """ + with console.status("[bold yellow]Validating train"): + train = load_train() + + train_clone, train = tee(train, 2) + + n_train = sum(1 for _ in train_clone) + + if n_train == 
CORPUS_SIZE: + console.log("Cardinality check [yellow]DONE[/yellow]") + else: + console.log("Cardinality check [red]FAILED[/red]") + console.log(f"|Telco Sentiment (train)| : {n_train}") + sys.exit(1) + + with console.status("[bold yellow]Validate test"): + test = set((d['id'] for d in load_test())) + test_label = set((d['id'] for d in load_test_label())) + + a_b, ab, b_a = test - test_label, test & test_label, test_label - test + + if len(a_b) == 0 and len(b_a) == 0: + console.log("Corpus check [green]DONE[/green]") + else: + console.log(f"Test file [red]DIVERGE[/red] from label file. {len(a_b)}, {len(b_a)}") + + +if __name__ == "__main__": + cli() diff --git a/sadedegel/dataset/customer_review/_core.py b/sadedegel/dataset/customer_review/_core.py new file mode 100644 index 00000000..077bae25 --- /dev/null +++ b/sadedegel/dataset/customer_review/_core.py @@ -0,0 +1,86 @@ +import csv +from pathlib import Path +from rich.console import Console +import gzip + +CLASS_VALUES = ['Alışveriş', 'Anne-Bebek', 'Beyaz-Eşya', 'Bilgisayar', 'Cep Telefon Kategori', 'Eğitim', 'Elektronik', + 'Emlak ve İnşaat', 'Enerji', 'Etkinlik ve Organizasyon', 'Finans', + 'Gıda', 'Giyim', 'Hizmet Sektörü', 'İçecek', 'İnternet', 'Kamu Hizmetleri', 'Kargo Nakliyat', + 'Kişisel Bakım ve Kozmetik', 'Küçük Ev Aletleri', 'Medya', 'Mekan ve Eğlence', 'Mobilya Ev Tekstili', + 'Mücevher Saat Gözlük', 'Mutfak Araç Gereç', 'Otomotiv', 'Sağlık', + 'Sigortacılık', 'Spor', 'Temizlik', + 'Turizm', 'Ulaşım'] + +CORPUS_SIZE = 323479 + +console = Console() + +__general_download_message__ = """Ensure that you have properly downloaded customer review classification corpus using + + python -m sadedegel.dataset.customer_review_classification download --access-key xxx --secret-key xxxx + + Unfortunately due to data licensing issues we could not share data publicly. + Get in touch with sadedegel team to obtain a download key. 
+ """ + + +def check_directory_structure(path: str) -> bool: + base_dir = Path(path).expanduser() + + customer_review_dir = base_dir / 'customer_review_classification' + + if not base_dir.exists(): + console.log(f"Dataset base directory ([bold red]{base_dir}[/bold red]) does not exist") + + elif not customer_review_dir.exists(): + console.log( + f"Customer revie classification dataset directory ([bold red]{customer_review_dir}[/bold red]) does not exist") + + else: + return True + + console.log(__general_download_message__) + + return False + + +def load_train(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception("Customer Review Classification Corpus validation error") + + train_csv = Path(data_home).expanduser() / "customer_review_classification" + train_csv = train_csv / "customer_review_train.csv.gz" + + with gzip.open(train_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], text=rec['text'], review_class=int(rec['review_class'])) + + +def load_test(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception("Customer Review Classification Corpus validation error") + + test_csv = Path(data_home).expanduser() / "customer_review_classification" + test_csv = test_csv / "customer_review_test.csv.gz" + + with gzip.open(test_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], text=rec['text']) + + +def load_test_label(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception("Customer Review Classification Corpus validation error") + + target_csv = Path(data_home).expanduser() / "customer_review_classification" + target_csv = target_csv / "customer_review_target.csv.gz" + + with gzip.open(target_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], review_class=int(rec['review_class'])) diff --git a/sadedegel/dataset/hotel_sentiment/__init__.py b/sadedegel/dataset/hotel_sentiment/__init__.py new file mode 100644 index 00000000..b73d9560 --- /dev/null +++ b/sadedegel/dataset/hotel_sentiment/__init__.py @@ -0,0 +1,2 @@ +from ._core import load_hotel_sentiment_train, load_hotel_sentiment_test, load_hotel_sentiment_test_label +from ._core import CORPUS_SIZE, CLASS_VALUES diff --git a/sadedegel/dataset/hotel_sentiment/__main__.py b/sadedegel/dataset/hotel_sentiment/__main__.py new file mode 100644 index 00000000..140cd0ab --- /dev/null +++ b/sadedegel/dataset/hotel_sentiment/__main__.py @@ -0,0 +1,83 @@ +import os.path +from pathlib import Path +import sys +from itertools import tee +from smart_open import open +import click +import boto3 +from loguru import logger +from ._core import load_hotel_sentiment_train, load_hotel_sentiment_test, load_hotel_sentiment_test_label, \ + CLASS_VALUES, CORPUS_SIZE +from zipfile import ZipFile +from rich.console import Console + +console = Console() + +logger.disable("sadedegel") + + +@click.group(help="Hotel Sentiment Dataset Commandline") +def cli(): + pass + + +@cli.command() +@click.option("--access-key", help="Access Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_access_key', '')) +@click.option("--secret-key", help="Secret Key ID to download dataset.", prompt=True, + default=lambda: os.environ.get('sadedegel_secret_key', '')) +@click.option("--data_home", '-d', help="Data home directory", default="~/.sadedegel_data") +def 
download(access_key, secret_key, data_home): + """Download tokenization corpus from cloud with your key.""" + + data_home = Path(os.path.expanduser(data_home)) + data_home.mkdir(parents=True, exist_ok=True) + console.print(f"Data directory for data {data_home}") + + transport_params = { + 'session': boto3.Session(aws_access_key_id=access_key, + aws_secret_access_key=secret_key), + 'resource_kwargs': { + 'endpoint_url': 'https://storage.googleapis.com', + } + } + + url = f"s3://sadedegel/dataset/hotel_sentiment.zip" + + with open(url, 'rb', transport_params=transport_params) as fp: + with ZipFile(fp) as zp: + zp.extractall(data_home) + + +@cli.command() +def validate(): + """Sanity check on corpus + """ + with console.status("[bold yellow]Validating train"): + train = load_hotel_sentiment_train() + + train_clone, train = tee(train, 2) + + n_train = sum(1 for _ in train_clone) + + if n_train == CORPUS_SIZE: + console.log("Cardinality check [yellow]DONE[/yellow]") + else: + console.log("Cardinality check [red]FAILED[/red]") + console.log(f"|Hotel Sentiment (train)| : {n_train}") + sys.exit(1) + + with console.status("[bold yellow]Validate test"): + test = set((d['id'] for d in load_hotel_sentiment_test())) + test_label = set((d['id'] for d in load_hotel_sentiment_test_label())) + + a_b, ab, b_a = test - test_label, test & test_label, test_label - test + + if len(a_b) == 0 and len(b_a) == 0: + console.log("Corpus check [green]DONE[/green]") + else: + console.log(f"Test file [red]DIVERGE[/red] from label file. {len(a_b)}, {len(b_a)}") + + +if __name__ == "__main__": + cli() diff --git a/sadedegel/dataset/hotel_sentiment/_core.py b/sadedegel/dataset/hotel_sentiment/_core.py new file mode 100644 index 00000000..d738842f --- /dev/null +++ b/sadedegel/dataset/hotel_sentiment/_core.py @@ -0,0 +1,79 @@ +import csv +import gzip +from pathlib import Path +from rich.console import Console + +CLASS_VALUES = ["NEGATIVE", "POSITIVE"] +CORPUS_SIZE = 5800 + +console = Console() + +__general_download_message__ = """Ensure that you have properly downloaded hotel sentiment corpus using + + python -m sadedegel.dataset.hotel_sentiment download --access-key xxx --secret-key xxxx + + Unfortunately due to data licensing issues we could not share data publicly. + Get in touch with sadedegel team to obtain a download key. 
+ """ + + +def check_directory_structure(path: str) -> bool: + base_dir = Path(path).expanduser() + + hotel_sentiment_dir = base_dir / 'hotel_sentiment' + + if not base_dir.exists(): + console.log(f"Dataset base directory ([bold red]{base_dir}[/bold red]) does not exist") + + elif not hotel_sentiment_dir.exists(): + console.log( + f" directory ([bold red]{hotel_sentiment_dir}[/bold red]) does not exist") + + else: + return True + + console.log(__general_download_message__) + + return False + + +def load_hotel_sentiment_train(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception(" validation error") + + train_csv = Path(data_home).expanduser() / "hotel_sentiment" + train_csv = train_csv / "hotel_sentiment_train.csv.gz" + + with gzip.open(train_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], text=rec['text'], sentiment_class=int(rec['sentiment'])) + + +def load_hotel_sentiment_test(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception("Hotel Sentiment Corpus validation error") + + test_csv = Path(data_home).expanduser() / "hotel_sentiment" + test_csv = test_csv / "hotel_sentiment_test.csv.gz" + + with gzip.open(test_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], text=rec['text']) + + +def load_hotel_sentiment_test_label(data_home="~/.sadedegel_data"): + if not check_directory_structure(data_home): + raise Exception("Hotel Sentiment Corpus validation error") + + test_csv = Path(data_home).expanduser() / "hotel_sentiment" + test_csv = test_csv / "hotel_sentiment_target.csv.gz" + + with gzip.open(test_csv, "rt") as csvfile: + rd = csv.DictReader(csvfile) + + for rec in rd: + yield dict(id=rec['text_uuid'], sentiment_class=int(rec['sentiment'])) diff --git a/sadedegel/dataset/movie_sentiment/__init__.py b/sadedegel/dataset/movie_sentiment/__init__.py index 15bdd07e..86619b93 100644 --- a/sadedegel/dataset/movie_sentiment/__init__.py +++ b/sadedegel/dataset/movie_sentiment/__init__.py @@ -1,2 +1,2 @@ -from ._core import load_movie_sentiment_train, load_movie_sentiment_test, load_movie_sentiment_target +from ._core import load_movie_sentiment_train, load_movie_sentiment_test, load_movie_sentiment_test_label from ._core import CORPUS_SIZE, CLASS_VALUES diff --git a/sadedegel/dataset/movie_sentiment/_core.py b/sadedegel/dataset/movie_sentiment/_core.py index 570d3786..3b4816b3 100644 --- a/sadedegel/dataset/movie_sentiment/_core.py +++ b/sadedegel/dataset/movie_sentiment/_core.py @@ -65,7 +65,7 @@ def load_movie_sentiment_test(data_home="~/.sadedegel_data"): yield dict(id=rec['text_uuid'], text=rec['comment']) -def load_movie_sentiment_target(data_home="~/.sadedegel_data"): +def load_movie_sentiment_test_label(data_home="~/.sadedegel_data"): if not check_directory_structure(data_home): raise Exception("Movie Sentiment Corpus validation error") diff --git a/sadedegel/dataset/telco_sentiment/__init__.py b/sadedegel/dataset/telco_sentiment/__init__.py index 1eabcebc..e77ecfc1 100644 --- a/sadedegel/dataset/telco_sentiment/__init__.py +++ b/sadedegel/dataset/telco_sentiment/__init__.py @@ -1,2 +1,2 @@ -from ._core import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_target +from ._core import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_test_label from ._core import CLASS_VALUES, CORPUS_SIZE diff --git 
a/sadedegel/dataset/telco_sentiment/__main__.py b/sadedegel/dataset/telco_sentiment/__main__.py index 0a1b50a8..7041f8e1 100644 --- a/sadedegel/dataset/telco_sentiment/__main__.py +++ b/sadedegel/dataset/telco_sentiment/__main__.py @@ -11,7 +11,7 @@ from loguru import logger -from ._core import load_telco_sentiment_target, load_telco_sentiment_train, \ +from ._core import load_telco_sentiment_test_label, load_telco_sentiment_train, \ load_telco_sentiment_test, CORPUS_SIZE, CLASS_VALUES from zipfile import ZipFile @@ -75,8 +75,8 @@ def validate(): sys.exit(1) with console.status("[bold yellow]Validate test"): - test = set((d['text_uuid'] for d in load_telco_sentiment_test())) - test_label = set((d['text_uuid'] for d in load_telco_sentiment_target())) + test = set((d['id'] for d in load_telco_sentiment_test())) + test_label = set((d['id'] for d in load_telco_sentiment_test_label())) a_b, ab, b_a = test - test_label, test & test_label, test_label - test diff --git a/sadedegel/dataset/telco_sentiment/_core.py b/sadedegel/dataset/telco_sentiment/_core.py index 706a9c3c..b64da2c7 100644 --- a/sadedegel/dataset/telco_sentiment/_core.py +++ b/sadedegel/dataset/telco_sentiment/_core.py @@ -65,7 +65,7 @@ def load_telco_sentiment_test(data_home="~/.sadedegel_data"): yield dict(id=rec['text_uuid'], tweet=rec['tweet']) -def load_telco_sentiment_target(data_home="~/.sadedegel_data"): +def load_telco_sentiment_test_label(data_home="~/.sadedegel_data"): if not check_directory_structure(data_home): raise Exception("Telco Sentiment Corpus validation error") diff --git a/sadedegel/default.ini b/sadedegel/default.ini index e76b6653..d578f8ba 100644 --- a/sadedegel/default.ini +++ b/sadedegel/default.ini @@ -4,6 +4,11 @@ drop_stopwords = false lowercase = false drop_punct = false +[tokenizer] +hashtag = false +mention = false +emoji = false + [bert] avg_document_length = 42.37 avg_sentence_length = 17.93 diff --git a/sadedegel/extension/sklearn.py b/sadedegel/extension/sklearn.py index 84d573c2..5d74e806 100644 --- a/sadedegel/extension/sklearn.py +++ b/sadedegel/extension/sklearn.py @@ -9,6 +9,7 @@ from ..bblock.doc import DocBuilder, Document from ..bblock.token import Token +from tqdm import tqdm def check_type(X): @@ -34,15 +35,20 @@ def partial_fit(self, X, y=None, **kwargs): class Text2Doc(BaseEstimator, TransformerMixin): Doc = None - def __init__(self, tokenizer="icu"): + def __init__(self, tokenizer="icu", hashtag=False, mention=False, emoji=False, progress_tracking=True): self.tokenizer = tokenizer + self.hashtag = hashtag + self.mention = mention + self.emoji = emoji + self.progress_tracking = progress_tracking # TODO: Add sadedegel version self.init() def init(self): if Text2Doc.Doc is None: - Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer) + Text2Doc.Doc = DocBuilder(tokenizer=self.tokenizer, tokenizer__hashtag=self.hashtag, + tokenizer__mention=self.mention, tokenizer__emoji=self.emoji) def fit(self, X, y=None): return self @@ -65,7 +71,7 @@ def transform(self, X, y=None): docs = [] - for text in X: + for text in tqdm(X, disable=not hasattr(self, 'progress_tracking') or not self.progress_tracking, unit="doc"): docs.append(Text2Doc.Doc(text)) return docs @@ -95,8 +101,8 @@ def partial_fit(self, X, y=None, **kwargs): def transform(self, docs): def feature_iter(): for d in docs: - yield [('prefix5', Token(t).lower_[:5]) for t in d.tokens] + [('prefix3', Token(t).lower_[:3]) for t in - d.tokens] + yield [('prefix5', t.lower_[:5]) for t in d.tokens] + [('prefix3', t.lower_[:3]) for t in + 
d.tokens]

        return FeatureHasher(self.n_features, alternate_sign=self.alternate_sign, input_type="pair",
                             dtype=np.float32).transform(feature_iter())
diff --git a/sadedegel/metrics/_score.py b/sadedegel/metrics/_score.py
index db0423cc..4b7d379b 100644
--- a/sadedegel/metrics/_score.py
+++ b/sadedegel/metrics/_score.py
@@ -43,7 +43,7 @@ def _get_f1(y_ref: list, y_cand: list) -> float:
     return f1
-def rouge1_score(y_ref: List, y_cand: List, metric: str = "f1"):
+def rouge1_score(y_ref: List, y_cand: List, metric: str = "f1") -> float:
     if metric.lower() not in _AVAILABLE_METRICS:
         raise ValueError(f"metrics ({metric}) should be one of {_AVAILABLE_METRICS}")
diff --git a/sadedegel/prebuilt/README.md b/sadedegel/prebuilt/README.md
index d98ae43e..71f1ddd5 100644
--- a/sadedegel/prebuilt/README.md
+++ b/sadedegel/prebuilt/README.md
@@ -97,10 +97,33 @@ y_pred = model.predict(['süper aksiyon, tavsiye ederim'])
 movie_reviews.evaluate()
 ```
+### Telco Brand Tweet Sentiment Classification
+
+Classifier assigns each tweet mentioning the telecom brand into three classes ('olumlu', 'olumsuz', 'notr') by using the sadedegel built-in pipeline.
+
+#### Loading and Predicting with the Model:
+
+```python
+from sadedegel.prebuilt import telco_sentiment
+# We load our prebuilt model:
+model = telco_sentiment.load()
+
+# Here we feed our text to get predictions:
+y_pred = model.predict(['Magma tabakasından bile çekiyor helal olsun valla.'])
+
+# You can check original test results on holdout set:
+telco_sentiment.evaluate()
+```
+
 #### Accuracy
-* Current prebuilt movie review model has a **macro-F1** score of `0.825` on holdout test set model never seen before.
+Current prebuilt open-source telco sentiment model has an **accuracy** score of `0.6925` (**macro-F1** score of `0.6871`) on the test set.
+Comparable [benchmark](https://ieeexplore.ieee.org/document/8554037/) models have
+* `0.6925` **accuracy** score (convolutional neural networks fed with char ngrams)
+* `0.66` **accuracy** score (classical ML approach fed with bag-of-words)
+on the hold-out set.
+
 ### Turkish Product Sentiment Classification
 Classifier assigns each Turkish product review texts into one of 3 classes ('NEUTRAL','NEGATIVE','POSITIVE') by using sadedegel built-in pipeline.
 #### Loading and Predicting with the Model:
@@ -114,4 +137,4 @@ Classifier assigns each Turkish product review texts into one of 3 classes ('NEU
 #### Accuracy
 Current prebuilt model has
 * 3-fold cross validation F1 macro score of `mean 0.6494, std 0.0045)`. 
-* 5-fold cross validation F1 macro score of `mean 0.655, std 0.0083)` \ No newline at end of file +* 5-fold cross validation F1 macro score of `mean 0.655, std 0.0083)` diff --git a/sadedegel/prebuilt/model/telco_sentiment_classification.joblib b/sadedegel/prebuilt/model/telco_sentiment_classification.joblib new file mode 100644 index 00000000..247e27a5 --- /dev/null +++ b/sadedegel/prebuilt/model/telco_sentiment_classification.joblib @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eee18cb9f05c35f813c18c976ab5dbff4037820a98edc17aa8ac4fb95ad10ad +size 142543 diff --git a/sadedegel/prebuilt/movie_reviews.py b/sadedegel/prebuilt/movie_reviews.py index 7c7b1dd8..2fd7661d 100644 --- a/sadedegel/prebuilt/movie_reviews.py +++ b/sadedegel/prebuilt/movie_reviews.py @@ -11,7 +11,7 @@ from ..extension.sklearn import TfidfVectorizer, Text2Doc from .util import load_model from ..dataset.movie_sentiment import load_movie_sentiment_train, load_movie_sentiment_test, \ - load_movie_sentiment_target, CORPUS_SIZE + load_movie_sentiment_test_label, CORPUS_SIZE console = Console() @@ -75,7 +75,7 @@ def evaluate(): raw_test = load_movie_sentiment_test() test = pd.DataFrame.from_records(raw_test) - true_labels = pd.DataFrame.from_records(load_movie_sentiment_target()) + true_labels = pd.DataFrame.from_records(load_movie_sentiment_test_label()) y_pred = model.predict(test.text) diff --git a/sadedegel/prebuilt/telco_sentiment.py b/sadedegel/prebuilt/telco_sentiment.py new file mode 100644 index 00000000..9fcbcfd6 --- /dev/null +++ b/sadedegel/prebuilt/telco_sentiment.py @@ -0,0 +1,86 @@ +from os.path import dirname +from pathlib import Path + +from joblib import dump +from rich.console import Console +from sklearn.linear_model import SGDClassifier +from sklearn.metrics import accuracy_score, f1_score +from sklearn.pipeline import Pipeline +from sklearn.utils import shuffle + +from .util import load_model +from ..dataset.telco_sentiment import load_telco_sentiment_train, load_telco_sentiment_test, \ + load_telco_sentiment_test_label, CORPUS_SIZE +from ..extension.sklearn import HashVectorizer, Text2Doc + +console = Console() + + +def empty_model(): + return Pipeline( + [('text2doc', Text2Doc("icu", emoji=True, hashtag=True, mention=True)), + ('hash', HashVectorizer(n_features=1033297, alternate_sign=False)), + ('sgd', SGDClassifier(alpha=0.00036252996496306393, penalty="elasticnet", loss="log", random_state=42))] + ) + + +def build(save=True): + try: + import pandas as pd + except ImportError: + console.log(("pandas package is not a general sadedegel dependency." + " But we do have a dependency on building our prebuilt models")) + + raw = load_telco_sentiment_train() + df = pd.DataFrame.from_records(raw) + df = shuffle(df) + + console.log(f"Corpus Size: {CORPUS_SIZE}") + + pipeline = empty_model() + + pipeline.fit(df.tweet, df.sentiment_class) + + evaluate(pipeline) + + console.log("Model build [green]DONE[/green]") + + if save: + model_dir = Path(dirname(__file__)) / 'model' + + model_dir.mkdir(parents=True, exist_ok=True) + + pipeline.steps[0][1].Doc = None + + dump(pipeline, (model_dir / 'telco_sentiment_classification.joblib').absolute(), compress=('gzip', 9)) + + console.log("Model save [green]DONE[/green]") + + +def load(model_name="telco_sentiment_classification"): + return load_model(model_name) + + +def evaluate(model=None): + try: + import pandas as pd + except ImportError: + console.log(("pandas package is not a general sadedegel dependency." 
+ " But we do have a dependency on building our prebuilt models")) + + if model is None: + model = load() + + test = pd.DataFrame.from_records(load_telco_sentiment_test()) + test_label = pd.DataFrame.from_records(load_telco_sentiment_test_label()) + + test = test.merge(test_label, on='id') + + y_pred = model.predict(test.tweet) + + console.log(f"Model test accuracy (accuracy): {accuracy_score(test.sentiment_class, y_pred)}") + console.log(f"Model test accuracy (f1-macro): {f1_score(test.sentiment_class, y_pred, average='macro')}") + + +if __name__ == "__main__": + build(save=True) diff --git a/sadedegel/tokenize/__init__.py b/sadedegel/tokenize/__init__.py index 90bce75f..1f44d99f 100644 --- a/sadedegel/tokenize/__init__.py +++ b/sadedegel/tokenize/__init__.py @@ -1,7 +1,8 @@ -from ._sent import RegexpSentenceTokenizer, NLTKPunctTokenizer # noqa: F401 -from ..bblock.util import tr_lower, tr_upper, __tr_upper__, __tr_lower__ # noqa: F401 +from ..bblock.util import deprecate -# This is kepts for backward compatibility -# from .. import Doc, Sentences # noqa: F401 +deprecate("[yellow]sadedegel.tokenize[/yellow] module is deprecated", (0, 21, 0), + post_message="Use [yellow]sadedegel.sbd[/yellow] instead.") +from ..bblock.sbd import RegexpSentenceTokenizer, NLTKPunctTokenizer # noqa: F401 +from ..bblock.util import tr_lower, tr_upper, __tr_upper__, __tr_lower__ # noqa: F401 from .. import Doc, Sentences diff --git a/sadedegel/tokenize/__main__.py b/sadedegel/tokenize/__main__.py index 18a2d26e..c1ce9e17 100644 --- a/sadedegel/tokenize/__main__.py +++ b/sadedegel/tokenize/__main__.py @@ -5,26 +5,38 @@ from sadedegel.ml import create_model, save_model from sklearn.model_selection import cross_val_score +from difflib import Differ + +from rich.console import Console +from rich.table import Table +from collections import Counter +from ..bblock.util import deprecate + +console = Console() + import click from typing import List @click.group(help="sadedeGel SBD commandline") def tokenize(): - pass + deprecate("sadedegel-sbd is deprecated", (0, 21, 0), "Use [yellow]sadedegel sbd[/yellow] instead.") def iou_eval(name: str, y_true: List[List[str]], y_pred: List[List[str]], paths=None): - i, u, iou = 0, 0, 0 + micro_i, micro_u, iou = 0, 0, 0 for idx, (seq1, seq2) in enumerate(zip(y_true, y_pred)): - s1 = list(map(lambda x: x.strip(), seq1)) - s2 = list(map(lambda x: x.strip(), seq2)) + s1 = Counter(map(lambda x: x.strip(), seq1)) + s2 = Counter(map(lambda x: x.strip(), seq2)) + + i = sum((s1 & s2).values()) + u = sum((s1 | s2).values()) - i += len(set(s1) & set(s2)) - u += len(set(s1) | set(s2)) + micro_i += i + micro_u += u - delta = len(set(s1) & set(s2)) / len(set(s1) | set(s2)) + delta = i / u if paths: click.secho(f"Lost {1.0 - delta:.2f} because of {paths[idx]}", fg="yellow") @@ -33,7 +45,7 @@ def iou_eval(name: str, y_true: List[List[str]], y_pred: List[List[str]], paths= click.secho(f"IoU score for {name}") - click.secho(f"Micro IoU: {i / u:.4f}".rjust(25)) + click.secho(f"Micro IoU: {micro_i / micro_u:.4f}".rjust(25)) click.secho(f"Macro IoU: {iou / len(y_true):.4f}".rjust(25)) @@ -62,33 +74,57 @@ def evaluate(v): @tokenize.command() -def diff(): +@click.option('-v', '--verbose', count=True) +def diff(verbose): """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences.""" click.secho("Loading corpus...") raw, sents = load_raw_corpus(False), load_sentence_corpus() y_true = [doc['sentences'] for doc in sents] - y_pred = [Doc(doc) for doc in raw] + y_pred = 
@@ -62,33 +74,57 @@ def evaluate(v):

 @tokenize.command()
-def diff():
+@click.option('-v', '--verbose', count=True)
+def diff(verbose):
     """Git like diff tool to compare sentence generated by our tokenizer vs actual list of sentences."""
     click.secho("Loading corpus...")

     raw, sents = load_raw_corpus(False), load_sentence_corpus()

     y_true = [doc['sentences'] for doc in sents]
-    y_pred = [Doc(doc) for doc in raw]
+    y_pred = [[str(s) for s in Doc(doc)] for doc in raw]

     paths = file_paths()

-    for i in range(len(y_true)):
+    differ = Differ()
+
+    for t, p, f in zip(y_true, y_pred, paths):
+
+        table = Table(show_header=True, header_style="bold magenta", show_edge=False)
+        table.add_column("true", style="dim", width=100)
+        table.add_column("predict", style="dim", width=100)
+
+        table.columns[0].style = "green"
+        table.columns[1].style = "red"
+
+        ndiff = 0
+        match = 0
+        for sent in differ.compare(p, t):
+            if sent.startswith('+'):
+                if match > 0 and verbose > 0:
+                    table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")
+                match = 0

-        if y_true[i] != y_pred[i]:
-            click.secho(f"Document {paths[i]}")
-            for s_true in y_true[i]:
-                if s_true not in y_pred[i]:
-                    click.secho(f"+ {s_true}", fg="green")
+                table.add_row(sent[2:], "")
+                ndiff += 1
+            elif sent.startswith('-'):
+                if match > 0 and verbose > 0:
+                    table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")
+                match = 0

-            click.secho()
+                table.add_row("", sent[2:])
+                ndiff += 1
+            else:
+                match += 1

-            for s_pred in y_pred[i]:
-                if s_pred not in y_true[i]:
-                    click.secho(f"- {s_pred}", fg="red")
+        if match > 0 and verbose > 0:
+            table.add_row(f"[blue]{match} sentences...[/blue]", f"[blue]{match} sentences...[/blue]")

-            click.secho()
-            click.secho()
+        if ndiff > 0:
+            console.print(f)
+            console.print(table)
+            console.print(f"[blue]{len(t)} sentences...[/blue]")
+            console.print()


 @tokenize.command()
@@ -108,7 +144,7 @@ def build():

     sbd_model = create_model()

-    scores = cross_val_score(sbd_model, features, y, scoring="f1")
+    scores = cross_val_score(sbd_model.pipeline, features, y, scoring="f1")

     for i, score in enumerate(scores):
         click.secho(f"Fold {i + 1}: {score:.4f}", fg="yellow")
@@ -116,9 +152,10 @@ def build():
     sbd_model.fit(features, y)

     click.secho("\nTop 10 Features")
-    feature_importance = sbd_model.steps[1][1].feature_importances_
+    feature_importance = sbd_model.pipeline.steps[1][1].feature_importances_
     for idx in list(reversed(feature_importance.argsort()))[:20]:
-        click.secho(f"  {sbd_model.steps[0][1].feature_names_[idx]}: {feature_importance[idx]:.4f}", fg="yellow")
+        click.secho(f"  {sbd_model.pipeline.steps[0][1].feature_names_[idx]}: {feature_importance[idx]:.4f}",
+                    fg="yellow")

     save_model(sbd_model)
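The rewritten `diff` command above leans on `difflib.Differ`, which prefixes every emitted line with a two-character code; that is why the loop branches on `startswith('+')`/`startswith('-')` and strips the prefix with `sent[2:]`. A standalone sketch with made-up sentences:

```python
from difflib import Differ

pred = ["Merhaba dünya.", "Fiyatlar çok yükseldi"]
true = ["Merhaba dünya.", "Okula gittim.", "Ders çalıştım."]

for line in Differ().compare(pred, true):
    print(repr(line))
# '  Merhaba dünya.'        -> in both sequences (the "match" branch)
# '- Fiyatlar çok yükseldi' -> only in pred
# '+ Okula gittim.'         -> only in true
# '+ Ders çalıştım.'        -> only in true
# (near-identical pairs may additionally yield '? ' hint lines)
```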
diff --git a/tests/datasets/context.py b/tests/datasets/context.py
index 257e0a44..c1f7ee28 100644
--- a/tests/datasets/context.py
+++ b/tests/datasets/context.py
@@ -9,9 +9,11 @@
 from sadedegel.dataset.tweet_sentiment import load_tweet_sentiment_train, CLASS_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.product_sentiment import load_product_sentiment_train  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.product_sentiment import CLASS_VALUES as PS_CLASS_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
-from sadedegel.dataset.telco_sentiment import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_target  # noqa # pylint: disable=unused-import, wrong-import-position
+from sadedegel.dataset.telco_sentiment import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_test_label  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.telco_sentiment import CLASS_VALUES as TELCO_CLASS_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
+from sadedegel.dataset.categorized_product_sentiment import load_categorized_product_sentiment_train, SENTIMENT_CLASS_VALUES, PRODUCT_CATEGORIES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset import movie_sentiment  # noqa # pylint: disable=unused-import, wrong-import-position
+from sadedegel.dataset import hotel_sentiment  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.bblock.cli.__main__ import tok_eval  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset import util  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset import file_paths, CorpusTypeEnum  # noqa # pylint: disable=unused-import, wrong-import-position
diff --git a/tests/datasets/test_categorized_product_sentiment.py b/tests/datasets/test_categorized_product_sentiment.py
new file mode 100644
index 00000000..d0d0871f
--- /dev/null
+++ b/tests/datasets/test_categorized_product_sentiment.py
@@ -0,0 +1,51 @@
+import pytest
+from pathlib import Path  # pylint: disable=unused-import
+from os.path import expanduser  # pylint: disable=unused-import
+from .context import load_categorized_product_sentiment_train, SENTIMENT_CLASS_VALUES, PRODUCT_CATEGORIES
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/categorized_product_sentiment")).exists()')
+def test_data_load():
+    data = load_categorized_product_sentiment_train()
+
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'text', 'sentiment_class', 'product_category'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+
+        count += 1
+
+    assert count == 5600
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/categorized_product_sentiment")).exists()')
+@pytest.mark.parametrize('subset', ['Kitchen', 'DVD', 'Books', 'Electronics'])
+def test_data_subset_str(subset):
+    data = load_categorized_product_sentiment_train(categories=subset)
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'text', 'sentiment_class', 'product_category'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+        assert PRODUCT_CATEGORIES[row['product_category']] == subset
+
+        count += 1
+
+    assert count == 1400
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/categorized_product_sentiment")).exists()')
+def test_data_subset_list():
+    lov = ['Kitchen', 'DVD']
+    data = load_categorized_product_sentiment_train(categories=lov)
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'text', 'sentiment_class', 'product_category'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+        assert PRODUCT_CATEGORIES[row['product_category']] in lov
+
+        count += 1
+
+    assert count == 2800
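The counts asserted above pin down the loader's contract: 5600 rows overall, 1400 per category, and a `categories` argument that accepts either a single name or a list. A usage sketch (assumes the corpus has been downloaded under `~/.sadedegel_data`):

```python
from sadedegel.dataset.categorized_product_sentiment import (
    load_categorized_product_sentiment_train, PRODUCT_CATEGORIES)

rows = list(load_categorized_product_sentiment_train())                              # 5600 rows
kitchen = list(load_categorized_product_sentiment_train(categories='Kitchen'))      # 1400 rows
two = list(load_categorized_product_sentiment_train(categories=['Kitchen', 'DVD'])) # 2800 rows

print(PRODUCT_CATEGORIES[kitchen[0]['product_category']])  # 'Kitchen'
```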
diff --git a/tests/datasets/test_customer_review.py b/tests/datasets/test_customer_review.py
new file mode 100644
index 00000000..33e120bd
--- /dev/null
+++ b/tests/datasets/test_customer_review.py
@@ -0,0 +1,51 @@
+import pytest
+from pathlib import Path  # pylint: disable=unused-import
+from os.path import expanduser  # pylint: disable=unused-import
+
+from sadedegel.dataset.customer_review import load_train
+from sadedegel.dataset.customer_review import load_test
+from sadedegel.dataset.customer_review import load_test_label
+from sadedegel.dataset.customer_review import CLASS_VALUES
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/customer_review_classification")).exists()')
+def test_data_load_train():
+    data = load_train()
+
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'text', 'review_class'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+        assert 0 <= row['review_class'] < len(CLASS_VALUES)
+
+        count += 1
+
+    assert count == 323479
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/customer_review_classification")).exists()')
+def test_data_load_test():
+    data = load_test()
+
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'text'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+
+        count += 1
+    assert count == 107827
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/customer_review_classification")).exists()')
+def test_data_load_target():
+    data = load_test_label()
+    count = 0
+    for row in data:
+        assert all(key in row.keys() for key in ['id', 'review_class'])
+        assert isinstance(row['id'], str)
+        assert 0 <= row['review_class'] < len(CLASS_VALUES)
+
+        count += 1
+    assert count == 107827
diff --git a/tests/datasets/test_hotel_sentiment.py b/tests/datasets/test_hotel_sentiment.py
new file mode 100644
index 00000000..e8b81a09
--- /dev/null
+++ b/tests/datasets/test_hotel_sentiment.py
@@ -0,0 +1,35 @@
+from pathlib import Path  # pylint: disable=unused-import
+from os.path import expanduser  # pylint: disable=unused-import
+
+import pytest
+
+from .context import hotel_sentiment
+
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data")).exists()')
+def test_data_load():
+    data = hotel_sentiment.load_hotel_sentiment_train()
+    for i, row in enumerate(data):
+        assert all(key in row.keys() for key in ['id', 'text', 'sentiment_class'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+        assert hotel_sentiment.CLASS_VALUES[row['sentiment_class']] in ['POSITIVE', 'NEGATIVE']
+    assert i + 1 == 5800
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data")).exists()')
+def test_data_load_test():
+    data = hotel_sentiment.load_hotel_sentiment_test()
+    for i, row in enumerate(data):
+        assert all(key in row.keys() for key in ['id', 'text'])
+        assert isinstance(row['id'], str)
+        assert isinstance(row['text'], str)
+    assert i + 1 == 5800
+
+@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data")).exists()')
+def test_data_load_label():
+    data = hotel_sentiment.load_hotel_sentiment_test_label()
+    for i, row in enumerate(data):
+        assert all(key in row.keys() for key in ['id', 'sentiment_class'])
+        assert isinstance(row['id'], str)
+        assert hotel_sentiment.CLASS_VALUES[row['sentiment_class']] in ['POSITIVE', 'NEGATIVE']
+    assert i + 1 == 5800
diff --git a/tests/datasets/test_telco_sentiment_corpus.py b/tests/datasets/test_telco_sentiment_corpus.py
index f6621ce7..019e6f22 100644
--- a/tests/datasets/test_telco_sentiment_corpus.py
+++ b/tests/datasets/test_telco_sentiment_corpus.py
@@ -3,7 +3,7 @@

 import pytest

-from .context import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_target
+from .context import load_telco_sentiment_train, load_telco_sentiment_test, load_telco_sentiment_test_label
 from .context import TELCO_CLASS_VALUES

@@ -30,7 +30,7 @@ def test_data_load_test():

 @pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/telco_sentiment")).exists()')
 def test_data_load_target():
-    data = load_telco_sentiment_target()
+    data = load_telco_sentiment_test_label()
     for i, row in enumerate(data):
         assert any(key in row.keys() for key in ['id', 'sentiment_class'])
         assert isinstance(row['id'], str)
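A note on the guard used by all of the dataset tests above: `pytest.mark.skipif` accepts its condition as a string, which pytest evaluates lazily in the test module's namespace. That is why `Path` and `expanduser` are imported (and flagged `unused-import` for pylint) even though they never appear in the test bodies. Schematic sketch:

```python
import pytest
from pathlib import Path  # referenced only inside the skipif string below
from os.path import expanduser


@pytest.mark.skipif('not Path(expanduser("~/.sadedegel_data/telco_sentiment")).exists()')
def test_needs_local_corpus():
    # Runs only on machines where the corpus has been downloaded.
    ...
```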
diff --git a/tests/prebuilt/context.py b/tests/prebuilt/context.py
index 90c65feb..ea65d2d5 100644
--- a/tests/prebuilt/context.py
+++ b/tests/prebuilt/context.py
@@ -4,10 +4,11 @@
 sys.path.insert(0, (Path(__file__) / '..' / '..').absolute())

 from sadedegel.dataset import load_raw_corpus  # noqa # pylint: disable=unused-import, wrong-import-position
-from sadedegel.prebuilt import news_classification, tweet_profanity  # noqa # pylint: disable=unused-import, wrong-import-position
+from sadedegel.prebuilt import news_classification, tweet_profanity, telco_sentiment  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.tscorpus import CATEGORIES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.profanity import CLASS_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.prebuilt import tweet_sentiment , movie_reviews  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.tweet_sentiment import CLASS_VALUES as SENTIMENT_VALUES  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.dataset.movie_sentiment import CLASS_VALUES as SENTIMENT_VALUES_M  # noqa # pylint: disable=unused-import, wrong-import-position
 from sadedegel.prebuilt import product_sentiment
+from sadedegel.dataset.telco_sentiment import CLASS_VALUES as SENTIMENT_VALUES_T  # noqa # pylint: disable=unused-import, wrong-import-position
diff --git a/tests/prebuilt/test_telco_sentiment.py b/tests/prebuilt/test_telco_sentiment.py
new file mode 100644
index 00000000..2099cb27
--- /dev/null
+++ b/tests/prebuilt/test_telco_sentiment.py
@@ -0,0 +1,19 @@
+from .context import telco_sentiment, SENTIMENT_VALUES_T
+
+
+def test_model_load():
+    pipeline = telco_sentiment.load()
+    assert pipeline is not None
+
+
+def test_inference():
+    model = telco_sentiment.load()
+
+    pred = model.predict(['turkcell en iyi operatör.', 'burada hala çekmiyor.'])
+
+    assert SENTIMENT_VALUES_T[pred[0]] in SENTIMENT_VALUES_T
+    assert SENTIMENT_VALUES_T[pred[1]] in SENTIMENT_VALUES_T
+
+    probability = model.predict_proba(['turkcell en iyi operatör.', 'burada hala çekmiyor.'])
+
+    assert probability.shape == (2, 3)
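Context for the tokenizer-test rewrite below: now that `WordTokenizer.factory` takes per-call configuration (the `emoji`/`hashtag`/`mention` flags used by `Text2Doc` earlier in this diff), a per-name singleton identity no longer holds, so the tests check types instead of equality. A hedged sketch (the import path via the tests' `context` module and the factory kwargs are assumptions based on this diff):

```python
from .context import WordTokenizer, ICUTokenizer

t1 = WordTokenizer.factory('icu')
t2 = WordTokenizer.factory('icu', emoji=True, hashtag=True, mention=True)

# Same tokenizer family, but not necessarily the same (singleton) instance.
assert isinstance(t1, ICUTokenizer) and isinstance(t2, ICUTokenizer)
```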
diff --git a/tests/test_word_tokenizer.py b/tests/test_word_tokenizer.py
index eb78a397..ef81b64b 100644
--- a/tests/test_word_tokenizer.py
+++ b/tests/test_word_tokenizer.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pytest

-from .context import Doc, SimpleTokenizer, BertTokenizer, tokenizer_context, WordTokenizer
+from .context import Doc, SimpleTokenizer, BertTokenizer, tokenizer_context, WordTokenizer, ICUTokenizer
 from .context import load_raw_corpus

@@ -44,26 +44,25 @@ def test_bert_tokenization_sents():
     assert doc[1].tokens == ['Barış', 'için', 'geldik', '.']


-def test_singleton_tokenizer():
+def test_tokenizer_type():
     st1 = WordTokenizer.factory('simple')
     st2 = WordTokenizer.factory('simple-tokenizer')
     st3 = WordTokenizer.factory('SimpleTokenizer')

-    assert st1 == st2 == st3
+    assert isinstance(st1, SimpleTokenizer) and isinstance(st2, SimpleTokenizer) and isinstance(st3, SimpleTokenizer)

+    if pkgutil.find_loader("transformers") is not None:
+        bt1 = WordTokenizer.factory('bert')
+        bt2 = WordTokenizer.factory('bert-tokenizer')
+        bt3 = WordTokenizer.factory('BERTTokenizer')

-if pkgutil.find_loader("transformers") is not None:
-    bt1 = WordTokenizer.factory('bert')
-    bt2 = WordTokenizer.factory('bert-tokenizer')
-    bt3 = WordTokenizer.factory('BERTTokenizer')
+        assert isinstance(bt1, BertTokenizer) and isinstance(bt2, BertTokenizer) and isinstance(bt3, BertTokenizer)

-    assert bt1 == bt2 == bt3
+    icut1 = WordTokenizer.factory('icu')
+    icut2 = WordTokenizer.factory('icu-tokenizer')
+    icut3 = WordTokenizer.factory('ICUTokenizer')

-bt1 = WordTokenizer.factory('icu')
-bt2 = WordTokenizer.factory('icu-tokenizer')
-bt3 = WordTokenizer.factory('ICUTokenizer')
-
-assert bt1 == bt2 == bt3
+    assert isinstance(icut1, ICUTokenizer) and isinstance(icut2, ICUTokenizer) and isinstance(icut3, ICUTokenizer)


 @pytest.mark.parametrize("toker", ["bert", "simple", "icu"])