Reviewer notes (text before the first "diff --git" header is ignored by patch tools):
- stop_words="enlish" corrected to "english"; the misspelled value is not a
  built-in stop list and makes CountVectorizer raise ValueError when fitted.
- "bag_of_words..transform" corrected to "bag_of_words.transform" (syntax error).
- A docstring was added to save_matrix and two comments to main(); hunk line
  counts were updated accordingly.
- Line breaks, indentation and blank context lines were reconstructed from a
  whitespace-collapsed copy of this patch -- TODO confirm the context lines
  against the target file before applying. The "index" blob hashes are the
  ones from the original patch and do not reflect the corrected content.

diff --git a/src/stage_02_featurization.py b/src/stage_02_featurization.py
index b3c971b..c9fb762 100644
--- a/src/stage_02_featurization.py
+++ b/src/stage_02_featurization.py
@@ -4,7 +4,7 @@ from tqdm import tqdm
 
 import logging
 from src.utils.common import read_yaml, create_directories, get_df
-#from src.utils.featurize import save_matrix
+from src.utils.featurize import save_matrix
 import numpy as np
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 
@@ -39,7 +39,25 @@ def main(config_path, params_path):
 
     df_train = get_df(train_data_path)
     train_words = np.array(df_train.text.str.lower().values.astype("U")) ## << U1000
-    print(train_words[: 5])
+    # print(train_words[: 5])
+
+    # Count 1..ngrams word n-grams, capped at max_features terms, English stop words removed.
+    bag_of_words = CountVectorizer(
+        stop_words="english",
+        max_features=max_features,
+        ngram_range=(1, ngrams)
+    )
+
+    bag_of_words.fit(train_words)
+    train_words_binary_matrix = bag_of_words.transform(train_words)
+
+    # Re-weight the counts with TF-IDF, then store them together with ids and labels.
+    tfidf = TfidfTransformer(smooth_idf=False)
+    tfidf.fit(train_words_binary_matrix)
+    train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
+    save_matrix(df_train, train_words_tfidf_matrix, featurized_train_data_path)
+
+
 
 
 
diff --git a/src/utils/featurize.py b/src/utils/featurize.py
new file mode 100644
index 0000000..8bd46f3
--- /dev/null
+++ b/src/utils/featurize.py
@@ -0,0 +1,22 @@
+import os
+import logging
+import pandas as pd
+import joblib
+import scipy.sparse as sparse
+import numpy as np
+
+def save_matrix(df, matrix, out_path):
+    """Prepend id and label columns to a feature matrix and dump it with joblib.
+
+    ``df.id`` and ``df.label`` are cast to int64, turned into single-column
+    sparse matrices and stacked horizontally in front of ``matrix``. The
+    stacked CSR matrix is logged (shape, dtype) and written to ``out_path``.
+    """
+    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
+    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
+
+    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")
+
+    msg = f"The ouput matrix {out_path} of size {result.shape} and data type: {result.dtype}"
+    logging.info(msg)
+    joblib.dump(result, out_path)
\ No newline at end of file