+{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.7.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## more common imports\nfrom collections import Counter\nimport re\nfrom tqdm import tqdm_notebook as tqdm \n\n# languange processing imports\nimport nltk\nfrom gensim.corpora import Dictionary\n# preprocessing imports\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.model_selection import train_test_split\n\n# model imports\nfrom gensim.models.word2vec import Word2Vec\nimport gensim.downloader as api\nfrom sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n\n# hyperparameter training imports\nfrom sklearn.model_selection import GridSearchCV\n\n# visualization imports\nimport umap\nfrom IPython.display import display\nimport seaborn as sns\nimport matplotlib.pyplot as plt\nimport base64\nimport io\n%matplotlib inline\nsns.set() # defines the style","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# find and remove non-ascii words\n# I stored our special word in a variable for later use\n\nour_special_word = 'qwerty'\n\ndef remove_ascii_words(df):\n \"\"\" removes non-ascii characters from the 'texts' column in df.\n It returns the words containig non-ascii characers.\n \"\"\"\n non_ascii_words = []\n for i in range(len(df)):\n for word in df.loc[i, 'excerpt'].split(' '):\n if any([ord(character) >= 128 for character in word]):\n non_ascii_words.append(word)\n df.loc[i, 'excerpt'] = df.loc[i, 'excerpt'].replace(word, our_special_word)\n return non_ascii_words","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_good_tokens(sentence):\n replaced_punctation = list(map(lambda token: re.sub('[^0-9A-Za-z!?]+', '', token), sentence))\n removed_punctation = list(filter(lambda token: token, replaced_punctation))\n return removed_punctation","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Here we get transform the documents into sentences for the word2vecmodel\n# we made a function such that later on when we make the submission, we don't need to write duplicate code\ndef w2v_preprocessing(df):\n \"\"\" All the preprocessing steps for word2vec are done in this function.\n All mutations are done on the dataframe itself. So this function returns\n nothing.\n \"\"\"\n df['excerpt'] = df.excerpt.str.lower()\n df['document_sentences'] = df.excerpt.str.split('.') # split texts into individual sentences\n df['tokenized_sentences'] = list(map(lambda sentences:\n list(map(nltk.word_tokenize, sentences)),\n df.document_sentences)) # tokenize sentences\n df['tokenized_sentences'] = list(map(lambda sentences:\n list(map(get_good_tokens, sentences)),\n df.tokenized_sentences)) # remove unwanted characters\n df['tokenized_sentences'] = list(map(lambda sentences:\n list(filter(lambda lst: lst, sentences)),\n df.tokenized_sentences)) # remove empty lists","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data = pd.read_csv(\"/kaggle/input/commonlitreadabilityprize/train.csv\")\ntrain_data.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data.excerpt = train_data['excerpt'].apply(str)\nnon_ascii_words = remove_ascii_words(train_data)\n\nprint(\"Replaced {} words with characters with an ordinal >= 128 in the test data.\".format(\n len(non_ascii_words)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"w2v_preprocessing(train_data)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data.drop(train_data[train_data.tokenized_sentences.str.len() == 0].index, inplace= True) ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#create dictionary with all sentences\nsentences = []\nfor sentence_group in train_data.tokenized_sentences:\n sentences.extend(sentence_group)\n\nprint(\"Number of sentences: {}.\".format(len(sentences)))\nprint(\"Number of texts: {}.\".format(len(train_data)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_w2v_features(w2v_model, sentence_group):\n \"\"\" Transform a sentence_group (containing multiple lists\n of words) into a feature vector. It averages out all the\n word vectors of the sentence_group.\n \"\"\"\n words = np.concatenate(sentence_group) # words in text\n index2word_set = set(w2v_model.index_to_key)# set(w2v_model.wv.vocab.keys()) # words known to model\n \n featureVec = np.zeros(w2v_model.vector_size, dtype=\"float32\")\n \n # Initialize a counter for number of words in a review\n nwords = 0\n # Loop over each word in the comment and, if it is in the model's vocabulary, add its feature vector to the total\n for word in words:\n if word in index2word_set: \n featureVec = np.add(featureVec, w2v_model[word])\n nwords += 1.\n\n # Divide the result by the number of words to get the average\n if nwords > 0:\n featureVec = np.divide(featureVec, nwords)\n return featureVec","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"W2Vmodel = api.load('word2vec-google-news-300')","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data['w2v_features'] = list(map(lambda sen_group:\n get_w2v_features(W2Vmodel, sen_group),\n train_data.tokenized_sentences))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_data[\"w2v_resh_features\"] = train_data[\"w2v_features\"].apply(lambda x : x.reshape(1,-1) )","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"arr_w2v = train_data.w2v_resh_features[0]\nfor i in range(1, len(train_data)):\n arr_w2v = np.vstack((arr_w2v, train_data.w2v_resh_features[i]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## UMAP","metadata":{}},{"cell_type":"code","source":"umap_emb = umap.UMAP(n_neighbors= 15, n_components = 2, target_metric = 'l1' , n_epochs = 500).fit_transform(arr_w2v, y=train_data.target)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig, ax = plt.subplots(1, figsize=(14, 10))\nplt.scatter(*umap_emb.T, s=0.3, c=train_data.target, cmap='Spectral', alpha=1.0)\nplt.setp(ax, xticks=[], yticks=[])\nax.patch.set_facecolor('black')\nfg_color = 'white'\ncbar = plt.colorbar()\nplt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Validation ","metadata":{}},{"cell_type":"code","source":"X_train, X_val, y_train, y_val = train_test_split(\n arr_w2v, train_data.target, test_size=0.05, random_state=42)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mapper = umap.UMAP(n_neighbors= 15, n_components = 2, target_metric = 'l1' , n_epochs = 1000).fit(X_train, y=y_train)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"val_embedding = mapper.transform(X_val)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig, ax = plt.subplots(1, figsize=(14, 10))\nplt.scatter(*mapper.embedding_.T, s=3, c=y_train, cmap='Spectral', alpha=1.0)\nplt.setp(ax, xticks=[], yticks=[])\nax.patch.set_facecolor('black')\nfg_color = 'black'\ncbar = plt.colorbar()\nplt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"fig, ax = plt.subplots(1, figsize=(14, 10))\nplt.scatter(*val_embedding.T, s=3, c=y_val, cmap='Spectral', alpha=1.0)\nplt.setp(ax, xticks=[], yticks=[])\nax.patch.set_facecolor('black')\nfg_color = 'black'\ncbar = plt.colorbar()\nplt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=fg_color)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Regression on W2V\nWithout augmentation","metadata":{}},{"cell_type":"code","source":"from sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.metrics import mean_squared_error as mse","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"reg = GradientBoostingRegressor(random_state=0)\nreg.fit(X_train, y_train)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pred = reg.predict(X_val)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"mse(y_val, y_pred) #0.44 not bad, let's see on test :) ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Augmenting W2V","metadata":{}},{"cell_type":"markdown","source":"# Test Data","metadata":{}},{"cell_type":"code","source":"test_data = pd.read_csv(\"/kaggle/input/commonlitreadabilityprize/test.csv\")\ntest_data.head()","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_data.excerpt = test_data['excerpt'].apply(str)\nnon_ascii_words = remove_ascii_words(test_data)\n\nprint(\"Replaced {} words with characters with an ordinal >= 128 in the test data.\".format(\n len(non_ascii_words)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"w2v_preprocessing(test_data)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_data.drop(test_data[test_data.tokenized_sentences.str.len() == 0].index, inplace= True) ","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#create dictionary with all sentences\nsentences_test = []\nfor sentence_group in test_data.tokenized_sentences:\n sentences_test.extend(sentence_group)\n\nprint(\"Number of sentences: {}.\".format(len(sentences)))\nprint(\"Number of texts: {}.\".format(len(test_data)))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_data['w2v_features'] = list(map(lambda sen_group:\n get_w2v_features(W2Vmodel, sen_group),\n test_data.tokenized_sentences))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_data[\"w2v_resh_features\"] = test_data[\"w2v_features\"].apply(lambda x : x.reshape(1,-1) )","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"arr_w2v_test = test_data.w2v_resh_features[0]\nfor i in range(1, len(test_data)):\n arr_w2v_test = np.vstack((arr_w2v_test, test_data.w2v_resh_features[i]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}
0 commit comments