Skip to content

Commit

Permalink
stage -02 train data tfidf matrix saved
Browse files Browse the repository at this point in the history
  • Loading branch information
shivpalSW committed Jul 16, 2023
1 parent 9f3e23d commit cee8e82
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/stage_02_featurization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from tqdm import tqdm
import logging
from src.utils.common import read_yaml, create_directories, get_df
#from src.utils.featurize import save_matrix
from src.utils.featurize import save_matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

Expand Down Expand Up @@ -39,7 +39,23 @@ def main(config_path, params_path):
df_train = get_df(train_data_path)

train_words = np.array(df_train.text.str.lower().values.astype("U")) ## << U1000
print(train_words[: 5])
# print(train_words[: 5])

# Build the bag-of-words vocabulary from the training text.
# The only built-in stop-word list scikit-learn accepts by name is
# "english"; any other string is rejected with a ValueError.
bag_of_words = CountVectorizer(
    stop_words="english",
    max_features=max_features,
    ngram_range=(1, ngrams),
)

bag_of_words.fit(train_words)
# Despite the variable name, CountVectorizer yields term counts here
# (binary=False is the default), not 0/1 indicators.
train_words_binary_matrix = bag_of_words.transform(train_words)

# Re-weight the counts into TF-IDF scores, fitting on the training matrix.
tfidf = TfidfTransformer(smooth_idf=False)
tfidf.fit(train_words_binary_matrix)
train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
save_matrix(df_train, train_words_tfidf_matrix, featurized_train_data_path)





Expand Down
16 changes: 16 additions & 0 deletions src/utils/featurize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import os
import logging
import pandas as pd
import joblib
import scipy.sparse as sparse
import numpy as np

def save_matrix(df, matrix, out_path):
    """Prepend the id and label columns to a feature matrix and dump it.

    Args:
        df: Object exposing ``id`` and ``label`` columns; both are cast to
            ``np.int64`` before being stacked.
        matrix: Feature matrix placed after the id and label columns.
            It must have the same number of rows as ``df`` for the
            horizontal stack to succeed.
        out_path: Destination handed to ``joblib.dump``.
    """
    # Each column is turned into a sparse matrix and transposed so that it
    # can sit side by side with ``matrix`` in the horizontal stack below.
    id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
    label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T

    result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")

    msg = f"The output matrix {out_path} of size {result.shape} and data type: {result.dtype}"
    logging.info(msg)
    joblib.dump(result, out_path)

0 comments on commit cee8e82

Please sign in to comment.