+{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"#!pip install Pyphen --no-index --find-links=file:///kaggle/input/roberta/Pyphen-0.9.5-py2.py3-none-any.whl\n#!pip install repoze.lru --no-index --find-links=file:///kaggle/input/roberta/repoze.lru-0.7-py3-none-any.whl\n#!pip install textstat --no-index --find-links=file:///kaggle/input/roberta/textstat-0.7.0-py3-none-any.whl","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import torch\n#import textstat\nimport xgboost as xgb\nimport pandas as pd\nimport numpy as np\nfrom transformers import AutoModelForSequenceClassification, AutoTokenizer","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":31,"outputs":[]},{"cell_type":"code","source":"class Dataset:\n def __init__(self, excerpt, tokenizer, max_len):\n self.excerpt = excerpt\n self.tokenizer = tokenizer\n self.max_len = max_len\n\n def __len__(self):\n return len(self.excerpt)\n\n def __getitem__(self, item):\n text = str(self.excerpt[item])\n inputs = self.tokenizer(\n text, \n max_length=self.max_len, \n padding=\"max_length\", \n truncation=True\n )\n\n ids = inputs[\"input_ids\"]\n mask = inputs[\"attention_mask\"]\n\n return {\n \"input_ids\": torch.tensor(ids, dtype=torch.long),\n \"attention_mask\": torch.tensor(mask, dtype=torch.long),\n }","metadata":{"trusted":true},"execution_count":32,"outputs":[]},{"cell_type":"code","source":"def generate_predictions(model_path, max_len):\n model = AutoModelForSequenceClassification.from_pretrained(model_path)\n tokenizer = AutoTokenizer.from_pretrained(model_path)\n\n model.to(\"cuda\")\n model.eval()\n \n df = pd.read_csv(\"../input/commonlitreadabilityprize/test.csv\")\n \n dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)\n data_loader = torch.utils.data.DataLoader(\n dataset, batch_size=32, num_workers=4, pin_memory=True, shuffle=False\n )\n\n final_output = []\n\n for b_idx, data in enumerate(data_loader):\n with torch.no_grad():\n for key, value in data.items():\n data[key] = value.to(\"cuda\")\n output = model(**data)\n output = output.logits.detach().cpu().numpy().ravel().tolist()\n final_output.extend(output)\n \n torch.cuda.empty_cache()\n return np.array(final_output)","metadata":{"_kg_hide-output":true,"trusted":true},"execution_count":33,"outputs":[]},{"cell_type":"code","source":"def generate_predictions_train(model_path, max_len):\n model = AutoModelForSequenceClassification.from_pretrained(model_path)\n tokenizer = AutoTokenizer.from_pretrained(model_path)\n\n model.to(\"cuda\")\n model.eval()\n \n df = pd.read_csv(\"../input/commonlitreadabilityprize/train.csv\")\n \n dataset = Dataset(excerpt=df.excerpt.values, tokenizer=tokenizer, max_len=max_len)\n data_loader = torch.utils.data.DataLoader(\n dataset, batch_size=32, num_workers=4, pin_memory=True, shuffle=False\n )\n\n final_output = []\n\n for b_idx, data in enumerate(data_loader):\n with torch.no_grad():\n for key, value in data.items():\n data[key] = value.to(\"cuda\")\n output = model(**data)\n output = output.logits.detach().cpu().numpy().ravel().tolist()\n final_output.extend(output)\n \n torch.cuda.empty_cache()\n return np.array(final_output)","metadata":{"trusted":true},"execution_count":44,"outputs":[]},{"cell_type":"code","source":"preds1_train = generate_predictions_train(\"../input/a81653/\", max_len=256)\npreds2_train = generate_predictions_train(\"../input/a81656/\", max_len=256)\npreds3_train = generate_predictions_train(\"../input/a81657/\", max_len=256)\npreds4_train = generate_predictions_train(\"../input/a81660/\", max_len=256)\npreds5_train = generate_predictions_train(\"../input/a81675/\", max_len=192)\npreds6_train = generate_predictions_train(\"../input/a87832/\", max_len=256)","metadata":{"trusted":true},"execution_count":45,"outputs":[]},{"cell_type":"code","source":"preds1_test = generate_predictions(\"../input/a81653/\", max_len=256)\npreds2_test = generate_predictions(\"../input/a81656/\", max_len=256)\npreds3_test = generate_predictions(\"../input/a81657/\", max_len=256)\npreds4_test = generate_predictions(\"../input/a81660/\", max_len=256)\npreds5_test = generate_predictions(\"../input/a81675/\", max_len=192)\npreds6_test = generate_predictions(\"../input/a87832/\", max_len=256)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')\ntest_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#linguistic features\n\"\"\"\ntrain_df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')\ntest_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')\n\nind = np.where(train_df.standard_error == train_df.standard_error.min())[0]\ntrain_df.loc[ind]\ntrain_df.drop(ind, inplace = True)\ntrain_df.reset_index(inplace = True,drop = True)\n\ntrain_df['character_count'] = train_df['excerpt'].apply(lambda x: len(str(x)))\ntrain_df['digit_count'] = train_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))\ntrain_df['word_count'] = train_df['excerpt'].apply(textstat.lexicon_count)\ntrain_df['unique_word_count'] = train_df['excerpt'].apply(lambda x: len(set(str(x).split())))\ntrain_df['mean_word_length'] = train_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))\ntrain_df['syllable_count'] = train_df['excerpt'].apply(textstat.syllable_count)\ntrain_df['sentence_count'] = train_df['excerpt'].apply(textstat.sentence_count)\ntrain_df['flesch_reading_ease'] = train_df['excerpt'].apply(textstat.flesch_reading_ease)\ntrain_df['flesch_kincaid_grade'] = train_df['excerpt'].apply(textstat.flesch_kincaid_grade)\ntrain_df['smog_index'] = train_df['excerpt'].apply(textstat.smog_index)\ntrain_df['automated_readability_index'] = train_df['excerpt'].apply(textstat.automated_readability_index)\ntrain_df['coleman_liau_index'] = train_df['excerpt'].apply(textstat.coleman_liau_index)\ntrain_df['linsear_write_formula'] = train_df['excerpt'].apply(textstat.linsear_write_formula)\n\n\nX_train = train_df.iloc[:,6:]\ny_train = train_df[\"target\"].values\n\nxg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,\n max_depth = 5, alpha = 10, n_estimators = 1000, verbosity = 1)\n\nxg_reg.fit(X_train,y_train)\n\ntest_df['character_count'] = test_df['excerpt'].apply(lambda x: len(str(x)))\ntest_df['digit_count'] = test_df['excerpt'].apply(lambda x: np.sum(([int(word.isdigit()) for word in str(x).split()])))\ntest_df['word_count'] = test_df['excerpt'].apply(textstat.lexicon_count)\ntest_df['unique_word_count'] = test_df['excerpt'].apply(lambda x: len(set(str(x).split())))\ntest_df['mean_word_length'] = test_df['excerpt'].apply(lambda x: np.mean([len(word) for word in str(x).split()]))\ntest_df['syllable_count'] = test_df['excerpt'].apply(textstat.syllable_count)\ntest_df['sentence_count'] = test_df['excerpt'].apply(textstat.sentence_count)\ntest_df['flesch_reading_ease'] = test_df['excerpt'].apply(textstat.flesch_reading_ease)\ntest_df['flesch_kincaid_grade'] = test_df['excerpt'].apply(textstat.flesch_kincaid_grade)\ntest_df['smog_index'] = test_df['excerpt'].apply(textstat.smog_index)\ntest_df['automated_readability_index'] = test_df['excerpt'].apply(textstat.automated_readability_index)\ntest_df['coleman_liau_index'] = test_df['excerpt'].apply(textstat.coleman_liau_index)\ntest_df['linsear_write_formula'] = test_df['excerpt'].apply(textstat.linsear_write_formula)\n\n\nX_test = test_df.iloc[:,4:]\npreds7 = xg_reg.predict(X_test)\n\"\"\";","metadata":{"trusted":true},"execution_count":35,"outputs":[]},{"cell_type":"code","source":"preds_final_train = np.vstack((preds1_train, preds2_train, preds3_train, preds4_train, preds5_train, preds6_train))\ny_train = train_df[\"target\"].values","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.5, learning_rate = 0.1,\n max_depth = 5, alpha = 10, n_estimators = 1000, verbosity = 1)\n\nxg_reg.fit(preds_final_train,y_train)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"preds_final_test = np.vstack((preds1_test, preds2_test, preds3_test, preds4_test, preds5_test, preds6_test))","metadata":{"trusted":true},"execution_count":42,"outputs":[{"execution_count":42,"output_type":"execute_result","data":{"text/plain":"array([[-0.16711852, -0.54188776, -0.18852767, -2.72694254, -1.76201248,\n -1.16781425, 0.36099541],\n [-0.07859137, -0.62694353, -0.20510463, -2.31090212, -1.67234659,\n -1.19319785, 0.40452987],\n [ 0.03635367, -0.47886458, -0.33608192, -2.22210574, -1.74552488,\n -0.90777624, 0.37602738],\n [-0.36222428, -0.48514423, -0.55559903, -2.10066557, -1.9371829 ,\n -1.40488291, 0.08838883],\n [-0.19421875, -0.18657285, -0.49220958, -1.73313713, -1.7733413 ,\n -1.59317338, -0.1030577 ],\n [-0.31367698, -0.23227143, -0.2733309 , -2.76938272, -1.93409562,\n -0.99660009, 0.697528 ]])"},"metadata":{}}]},{"cell_type":"code","source":"preds = xg_reg.predict(preds_final_test)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#preds = (0.9*(preds1 + preds2 + preds3 + preds4 + preds5 + preds6) + 0.1*preds7) / 7\n","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"submission = pd.read_csv(\"../input/commonlitreadabilityprize/sample_submission.csv\")\nsubmission.target = preds\nsubmission.to_csv(\"submission.csv\", index=False)","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]}
0 commit comments