
Commit

Minor corrections to HW1 and HW3
cgpotts committed Jun 6, 2023
1 parent b3cc2b4 commit cdde576
Showing 2 changed files with 100 additions and 25 deletions.
16 changes: 9 additions & 7 deletions hw_recogs.ipynb
@@ -237,7 +237,7 @@
"\n",
"1. Identify proper names. All and only proper names begin with capital letters in these LFs, and proper names consist only of ascii letters. The format is, informally, `Name ( d+ )`, as in `Sandy ( 47 )`.\n",
"\n",
"2. Identify role expressions. The pattern is always `role ( d+ , d+ )`, as in `agent ( 1 , 47 )`. Here, the first variable is for the associated event, and the second is the role argument. The possible roles are `agent`, `theme`, and `recipient`.\n",
"2. Identify role expressions. The pattern is always `role ( d+ , d+ )`, as in `agent ( 1 , 47 )`. Here, the first variable is for the associated event, and the second is the role argument. The possible roles are `agent`, `theme`, and `recipient`. (The dataset includes other roles, but these involve events, not people.)\n",
"\n",
"3. Determine which of the above are linked in the sense that the variable names are the same. A given name can link to multiple role expressions (or none at all), and LFs can contain multiple names and multiple role expressions.\n",
"\n",
@@ -313,12 +313,12 @@
" examples = [\n",
" # Standard case:\n",
" (\n",
" \"Bella ( 7 ) ; smile ( 4 ) AND agent ( 4 , 7 )\", \n",
" \"Bella ( 7 ) ; smile ( 4 ) AND agent ( 4 , 7 )\",\n",
" {(\"Bella\", \"agent\")}\n",
" ),\n",
" # No binding:\n",
" (\n",
" \"Riley ( 37 ) ; theme ( 4 , 7 )\", \n",
" \"Riley ( 37 ) ; theme ( 4 , 7 )\",\n",
" set()\n",
" ),\n",
" # Two tokens of the same name referring to different entities:\n",
@@ -328,12 +328,12 @@
" ),\n",
" # Two names:\n",
" (\n",
" \"Riley ( 4 ) ; Emma ( 243 ) ; recipient ( 6 , 4 ) AND agent ( 6, 243 )\",\n",
" \"Riley ( 4 ) ; Emma ( 243 ) ; recipient ( 6 , 4 ) AND agent ( 6 , 243 )\",\n",
" {(\"Riley\", \"recipient\"), (\"Emma\", \"agent\")},\n",
" ),\n",
" # One name binding into multiple role expressions:\n",
" (\n",
" \"Riley ( 4 ) ; agent ( 6 , 4 ) AND theme ( 6, 4 )\",\n",
" \"Riley ( 4 ) ; agent ( 6 , 4 ) AND theme ( 6 , 4 )\",\n",
" {(\"Riley\", \"theme\"), (\"Riley\", \"agent\")},\n",
" ),\n",
" # Nothing to match:\n",
@@ -1037,8 +1037,10 @@
" return preds\n",
"\n",
" def score(self, X, y, device=None):\n",
" # An overall accuracy score:\n",
" preds = self.predict(X, device=device)\n",
" return recogs_exact_match(y, preds)"
" vals = [int(recogs_exact_match(gold, pred)) for gold, pred in zip(y, preds)]\n",
" return sum(vals) / len(vals)"
]
},
{
@@ -2042,7 +2044,7 @@
"metadata": {},
"outputs": [],
"source": [
"test_bakeoff_file()"
"test_bakeoff_file(\"cs224u-recogs-bakeoff-entry.tsv\")"
]
}
],
109 changes: 91 additions & 18 deletions hw_sentiment.ipynb
@@ -875,7 +875,7 @@
"\n",
"toy_mod.fit(X_train, [0, 1])\n",
"\n",
"# Here's the error! Don't use `fit_transform` again! \n",
"# Here's the error! Don't use `fit_transform` again!\n",
"# Use `transform`!\n",
"X_test = vec.fit_transform(test_feats)\n",
"\n",
@@ -922,8 +922,8 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_toy, y_toy = make_classification(\n",
" n_samples=200, n_classes=3, \n",
" n_informative=15, n_features=20, \n",
" n_samples=200, n_classes=3,\n",
" n_informative=15, n_features=20,\n",
" weights=[0.2, 0.2, 0.6],\n",
" random_state=1)\n",
"\n",
@@ -1172,9 +1172,9 @@
" if caps_result != caps_expected:\n",
" errcount += 1\n",
" print(f\"Error for `{func.__name__}`: For input {caps_ex}, \"\n",
" f\"expected {caps_expected} but got {caps_result}\") \n",
" f\"expected {caps_expected} but got {caps_result}\")\n",
" if errcount == 0:\n",
" print(f\"All tests passed for `{func.__name__}`\") "
" print(f\"All tests passed for `{func.__name__}`\")"
]
},
{
@@ -1232,7 +1232,7 @@
" featfunc : func\n",
" Maps strings to Counter instances\n",
" train_dataset: dict\n",
" Must have a key \"sentence\" containing strings that `featfunc` \n",
" Must have a key \"sentence\" containing strings that `featfunc`\n",
" will process, and a key \"gold_label\" giving labels\n",
"\n",
" Returns\n",
@@ -1362,7 +1362,7 @@
" Maps strings to count dicts\n",
" vectorizer : fitted DictVectorizer\n",
" assess_dataset: dict\n",
" Must have a key \"sentence\" containing strings that `featfunc` \n",
" Must have a key \"sentence\" containing strings that `featfunc`\n",
" will process, and a key \"gold_label\" giving labels\n",
"\n",
" Returns\n",
@@ -1795,10 +1795,10 @@
" 1. The max length should be 512.\n",
" 2. Examples longer than the max length should be truncated\n",
" 3. Examples should be padded to the max length for the batch.\n",
" 4. The special [CLS] should be added to the start and the special \n",
" 4. The special [CLS] should be added to the start and the special\n",
" token [SEP] should be added to the end.\n",
" 5. The attention mask should be returned\n",
" 6. The return value of each component should be a tensor. \n",
" 6. The return value of each component should be a tensor.\n",
"\n",
" Parameters\n",
" ----------\n",
@@ -2010,8 +2010,8 @@
" n_classes, \n",
" hidden_activation, \n",
" weights_name=\"prajjwal1/bert-mini\"):\n",
" \"\"\"This module loads a Transformer based on `weights_name`, \n",
" puts it in train mode, add a dense layer with activation \n",
" \"\"\"This module loads a Transformer based on `weights_name`,\n",
" puts it in train mode, add a dense layer with activation\n",
" function give by `hidden_activation`, and puts a classifier\n",
" layer on top of that as the final output. The output of\n",
" the dense layer should have the same dimensionality as the\n",
@@ -2034,17 +2034,17 @@
" self.bert.train()\n",
" self.hidden_activation = hidden_activation\n",
" self.hidden_dim = self.bert.embeddings.word_embeddings.embedding_dim\n",
" # Add the new parameters here using `nn.Sequential`. \n",
" # Add the new parameters here using `nn.Sequential`.\n",
" # We can define this layer as\n",
" # \n",
" # h = f(cW1 + b_h)\n",
" # y = hW2 + b_y\n",
" #\n",
" # where c is the final hidden state above the [CLS] token,\n",
" # W1 has dimensionality (self.hidden_dim, self.hidden_dim),\n",
" # W2 has dimensionality (self.hidden_dim, self.n_classes), \n",
" # and we rely on the PyTorch loss function to add apply a\n",
" # softmax to y. \n",
" # W2 has dimensionality (self.hidden_dim, self.n_classes),\n",
" # f is the hidden activation, and we rely on the PyTorch loss\n",
" # function to add apply a softmax to y.\n",
" self.classifier_layer = None\n",
" ##### YOUR CODE HERE\n",
"\n",
@@ -2053,7 +2053,7 @@
" def forward(self, indices, mask):\n",
" \"\"\"Process `indices` with `mask` by feeding these arguments\n",
" to `self.bert` and then feeding the initial hidden state\n",
" in `last_hidden_state` to `self.classifier_layer`.\n",
" in `last_hidden_state` to `self.classifier_layer`\n",
"\n",
" Parameters\n",
" ----------\n",
@@ -2435,12 +2435,85 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"To enter the bakeoff, you simply need to use your original system t:\n",
"To enter the bakeoff, you simply need to use your original system to:\n",
"\n",
"1. Add a column named 'prediction' to `cs224u-sentiment-test-unlabeled.csv` with your model predictions (which are strings in {`positive`, `negative`, `neutral`}). The existing columns should remain.\n",
"\n",
"2. Save the file as `cs224u-sentiment-bakeoff-entry.csv`.\n",
"2. Save the file as `cs224u-sentiment-bakeoff-entry.csv`. Here is a good snippet of code for writing this file:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# This is a placeholder for adding the \"prediction\" column:\n",
"# bakeoff_df['prediction'] = # Use your model to add predictions.\n",
"\n",
"# Write to disk\n",
"bakeoff_df.to_csv(\"cs224u-sentiment-bakeoff-entry.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In particular, you need to be sure that `example_id` is a column rather than an index when read in by Pandas. Here is a quick test:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def test_bakeoff_entry(filename=\"cs224u-sentiment-bakeoff-entry.csv\"):\n",
" gold_df = pd.read_csv(\n",
" os.path.join(\"data\", \"sentiment\", \"cs224u-sentiment-test-unlabeled.csv\"))\n",
" entry_df = pd.read_csv(filename)\n",
"\n",
" # Check that no required columns are missing:\n",
" expected_cols = {'example_id', 'sentence', 'prediction'}\n",
" missing_cols = expected_cols - set(entry_df.columns)\n",
" errcount = 0\n",
" if len(missing_cols) != 0:\n",
" errcount += 1\n",
" print(f\"Entry is missing required columns {missing_cols}\")\n",
" return\n",
"\n",
" # Check that the predictions are in our space:\n",
" labels = {'positive', 'negative', 'neutral'}\n",
" predtypes = set(entry_df.prediction.unique())\n",
" unexpected = predtypes - labels\n",
" if len(unexpected) != 0:\n",
" errcount += 1\n",
" print(f\"Prediction column has unexpected values: {unexpected}\")\n",
"\n",
" # Check that the dataset hasn't been rearranged:\n",
" for colname in ('example_id', 'sentence'):\n",
" if not entry_df[colname].equals(gold_df[colname]):\n",
" errcount += 1\n",
" print(f\"Entry is misaligned with test data on column {colname}\")\n",
"\n",
" # Clean bill of health:\n",
" if errcount == 0:\n",
" print(\"No errors detected with `test_bakeoff_entry`.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_bakeoff_entry(\"cs224u-sentiment-bakeoff-entry.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Submit the following files to Gradescope:\n",
"\n",
"* `hw_sentiment.ipynb` (this notebook)\n",

