Commit

Update generated files
fchollet committed Aug 29, 2020
1 parent c471a01 commit e5f9c43
Showing 6 changed files with 97 additions and 179 deletions.
66 changes: 21 additions & 45 deletions examples/nlp/ipynb/pretrained_word_embeddings.ipynb
@@ -33,8 +33,7 @@
"source": [
"import numpy as np\n",
"import tensorflow as tf\n",
-"from tensorflow import keras\n",
-""
+"from tensorflow import keras"
]
},
{
@@ -76,8 +75,7 @@
"    \"news20.tar.gz\",\n",
"    \"http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz\",\n",
"    untar=True,\n",
-")\n",
-""
+")"
]
},
{
@@ -100,17 +98,14 @@
"import os\n",
"import pathlib\n",
"\n",
-"os.listdir(pathlib.Path(data_path).parent)\n",
-"\n",
"data_dir = pathlib.Path(data_path).parent / \"20_newsgroup\"\n",
"dirnames = os.listdir(data_dir)\n",
"print(\"Number of directories:\", len(dirnames))\n",
"print(\"Directory names:\", dirnames)\n",
"\n",
"fnames = os.listdir(data_dir / \"comp.graphics\")\n",
"print(\"Number of files in comp.graphics:\", len(fnames))\n",
-"print(\"Some example filenames:\", fnames[:5])\n",
-""
+"print(\"Some example filenames:\", fnames[:5])"
]
},
{
@@ -130,8 +125,7 @@
},
"outputs": [],
"source": [
-"print(open(data_dir / \"comp.graphics\" / \"38987\").read())\n",
-""
+"print(open(data_dir / \"comp.graphics\" / \"38987\").read())"
]
},
{
@@ -174,8 +168,7 @@
"        class_index += 1\n",
"\n",
"print(\"Classes:\", class_names)\n",
-"print(\"Number of samples:\", len(samples))\n",
-""
+"print(\"Number of samples:\", len(samples))"
]
},
{
@@ -218,8 +211,7 @@
"train_samples = samples[:-num_validation_samples]\n",
"val_samples = samples[-num_validation_samples:]\n",
"train_labels = labels[:-num_validation_samples]\n",
-"val_labels = labels[-num_validation_samples:]\n",
-""
+"val_labels = labels[-num_validation_samples:]"
]
},
{
@@ -249,8 +241,7 @@
"\n",
"vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)\n",
"text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)\n",
-"vectorizer.adapt(text_ds)\n",
-""
+"vectorizer.adapt(text_ds)"
]
},
{
@@ -271,8 +262,7 @@
},
"outputs": [],
"source": [
-"vectorizer.get_vocabulary()[:5]\n",
-""
+"vectorizer.get_vocabulary()[:5]"
]
},
{
@@ -292,9 +282,8 @@
},
"outputs": [],
"source": [
-"output = vectorizer(np.array([[\"the cat sat on the mat\"]]))\n",
-"output.numpy()[0, :6]\n",
-""
+"output = vectorizer([[\"the cat sat on the mat\"]])\n",
+"output.numpy()[0, :6]"
]
},
{
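Aside, not part of the diff: the change above drops the `np.array(...)` wrapper because `TextVectorization` can be called directly on a nested list of strings. A minimal, self-contained sketch of the updated call, assuming a TF 2.3-era runtime and the experimental preprocessing import path of that time:

import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Build and adapt a tiny vectorizer so the call below runs on its own.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer.adapt(np.array(["the cat sat on the mat"]))

# A plain nested list of strings is accepted; no np.array() wrapper is needed.
output = vectorizer([["the cat sat on the mat"]])
print(output.numpy()[0, :6])  # first 6 token ids of the (1, 200) output
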
@@ -319,8 +308,7 @@
"outputs": [],
"source": [
"voc = vectorizer.get_vocabulary()\n",
-"word_index = dict(zip(voc, range(2, len(voc))))\n",
-""
+"word_index = dict(zip(voc, range(2, len(voc))))"
]
},
{
@@ -340,9 +328,8 @@
},
"outputs": [],
"source": [
-"test = [b\"the\", b\"cat\", b\"sat\", b\"on\", b\"the\", b\"mat\"]\n",
-"[word_index[w] for w in test]\n",
-""
+"test = [\"the\", \"cat\", \"sat\", \"on\", \"the\", \"mat\"]\n",
+"[word_index[w] for w in test]"
]
},
{
@@ -401,8 +388,7 @@
"            coefs = np.fromstring(coefs, \"f\", sep=\" \")\n",
"            embeddings_index[word] = coefs\n",
"\n",
-"print(\"Found %s word vectors.\" % len(embeddings_index))\n",
-""
+"print(\"Found %s word vectors.\" % len(embeddings_index))"
]
},
{
@@ -413,11 +399,7 @@
"source": [
"Now, let's prepare a corresponding embedding matrix that we can use in a Keras\n",
"`Embedding` layer. It's a simple NumPy matrix where entry at index `i` is the pre-trained\n",
-"vector for the word of index `i` in our `vectorizer`'s vocabulary.\n",
-"\n",
-"**Note:** the `TextVectorization` layer stores tokens as bytes, not `str` types.\n",
-"This means that you need to decode them to `utf-8` before doing string comparisons, like\n",
-"the below: `embeddings_index.get(word.decode('utf-8'))`"
+"vector for the word of index `i` in our `vectorizer`'s vocabulary."
]
},
{
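Aside, not part of the diff: the note removed above became obsolete once `TextVectorization.get_vocabulary()` started returning Python `str` tokens rather than bytes, which is also why the `.decode(\"utf-8\")` call disappears in the next hunk. A hedged sanity check, reusing the `vectorizer` and `embeddings_index` objects from the cells above:

# Assumes `vectorizer` and `embeddings_index` from earlier cells in this notebook.
voc = vectorizer.get_vocabulary()
print(type(voc[2]))                # <class 'str'> on TF 2.3+; bytes on older versions
print(voc[2] in embeddings_index)  # tokens now key the GloVe index directly
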
@@ -436,7 +418,7 @@
"# Prepare embedding matrix\n",
"embedding_matrix = np.zeros((num_tokens, embedding_dim))\n",
"for word, i in word_index.items():\n",
-"    embedding_vector = embeddings_index.get(word.decode(\"utf-8\"))\n",
+"    embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" # Words not found in embedding index will be all-zeros.\n",
" # This includes the representation for \"padding\" and \"OOV\"\n",
@@ -445,7 +427,6 @@
"    else:\n",
"        misses += 1\n",
"print(\"Converted %d words (%d misses)\" % (hits, misses))\n",
-"\n",
""
]
},
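Aside, not part of the diff: pieced together across the two hunks above, the updated cell reads as below. The `hits = misses = 0` initialization is assumed from unchanged lines outside the hunks; `num_tokens`, `embedding_dim`, `word_index`, and `embeddings_index` come from earlier cells.

# Prepare embedding matrix: row i holds the GloVe vector for vocabulary token i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)  # str key; no .decode() needed
    if embedding_vector is not None:
        # Words missing from the GloVe index stay all-zeros,
        # including the "padding" and "OOV" entries.
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))
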
@@ -476,8 +457,7 @@
"    embedding_dim,\n",
"    embeddings_initializer=keras.initializers.Constant(embedding_matrix),\n",
"    trainable=False,\n",
-")\n",
-""
+")"
]
},
{
@@ -513,8 +493,7 @@
"x = layers.Dropout(0.5)(x)\n",
"preds = layers.Dense(len(class_names), activation=\"softmax\")(x)\n",
"model = keras.Model(int_sequences_input, preds)\n",
-"model.summary()\n",
-""
+"model.summary()"
]
},
{
@@ -541,8 +520,7 @@
"x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()\n",
"\n",
"y_train = np.array(train_labels)\n",
-"y_val = np.array(val_labels)\n",
-""
+"y_val = np.array(val_labels)"
]
},
{
@@ -566,8 +544,7 @@
"model.compile(\n",
"    loss=\"sparse_categorical_crossentropy\", optimizer=\"rmsprop\", metrics=[\"acc\"]\n",
")\n",
-"model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))\n",
-""
+"model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val))"
]
},
{
@@ -602,8 +579,7 @@
"    [[\"this message is about computer graphics and 3D modeling\"]]\n",
")\n",
"\n",
-"class_names[np.argmax(probabilities[0])]\n",
-""
+"class_names[np.argmax(probabilities[0])]"
]
}
],