
Commit 20fb08c
First commit

0 parents, commit 20fb08c
20 files changed: +4147, -0 lines

.DS_Store (6 KB)
Binary file not shown.

Assignment 1.pdf (130 KB)
Binary file not shown.

code/.DS_Store (6 KB)
Binary file not shown.

code/README.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
Name: Dorin Keshales
ID: 313298424

I added a file called mlpn_main.py so that you can choose how many hidden layers to use and what their sizes should be.

In train_mlpn, the number of hidden layers and their sizes are hard-coded to the values I found to give the highest accuracy on the validation set.

So you can choose between the two.
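
mlpn_main.py itself is not shown in this commit, so the snippet below is a purely hypothetical illustration of the idea (the variable names are not the file's real interface): describing an MLP-N architecture by a flat list of layer sizes, from which the weight-matrix shapes follow.

# Hypothetical illustration only; not the actual mlpn_main.py interface.
hidden_sizes = [128, 64]                  # an example choice of two hidden layers
dims = [600] + hidden_sizes + [6]         # input dim, hidden dims, output dim
layer_shapes = list(zip(dims, dims[1:]))  # [(600, 128), (128, 64), (64, 6)]
print(layer_shapes)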

code/answers.txt

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
Name: Dorin Keshales
ID: 313298424

1. I got about the same accuracy with both models - around 85%-86.6%.
Sometimes the log-linear model even reached 87% accuracy.
In my opinion, when the linear model already reaches such high accuracy, there is not much left for an MLP with one hidden layer to improve; a linear model is enough in this case to solve the language identification task well.


2. The best accuracy I can get with the MLP1 model on the letter-unigram features is 69%-70%, and the best I can get with the log-linear model on these features is 72%.
In my opinion, the reason the letter-unigram features give lower accuracy than the letter-bigram features is that there are far fewer unigram features than bigram features. When looking at the probability assigned to each language after the softmax (which reflects the frequencies of these features in that language), the prediction is much less certain, because there are far fewer features to rely on. As a result, we get more wrong predictions with the letter-unigram features, and the accuracy is lower.
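
To make the feature-count argument concrete, here is a small illustration (not the assignment's actual feature-extraction code) of how many distinct letter-unigram versus letter-bigram features even a short, arbitrary text yields:

from collections import Counter

# Illustration only: counting letter unigrams vs. letter bigrams of one text.
text = "hello world"
unigrams = Counter(text)                                  # at most ~26 letter features
bigrams = Counter(a + b for a, b in zip(text, text[1:]))  # up to ~26 * 26 features
print(len(unigrams), len(bigrams))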


3. In each execution of train_mlp1, the number of iterations needed to solve the XOR problem correctly was different. In my opinion, this is caused by the random initialization of the weight matrices and bias vectors. Moreover, the perceptron gives no guarantee that, after seeing an example, it will classify that same example correctly the next time it sees it, which is another possible reason for the difference between runs.
To still be able to answer the question, I averaged over 5 runs, which gives an approximate number of iterations it takes mlp1 to solve the XOR problem correctly.
Averaged over 5 runs, the XOR problem was solved at around the 34th iteration.
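
The averaging procedure described above can be sketched as follows; this is an illustrative sketch rather than the actual train_mlp1 code, and the hyperparameters (learning rate, hidden size, iteration cap) are arbitrary assumptions (run from the code/ directory):

import numpy as np
import mlp1

# The four XOR examples as (label, input) pairs.
xor_data = [(0, np.array([0., 0.])), (1, np.array([0., 1.])),
            (1, np.array([1., 0.])), (0, np.array([1., 1.]))]


def iterations_to_solve_xor(max_iters=200, lr=0.5, hid_dim=4):
    # Fresh random initialization each run, so the result varies between runs.
    params = mlp1.create_classifier(2, hid_dim, 2)
    for it in range(1, max_iters + 1):
        for y, x in xor_data:
            loss, grads = mlp1.loss_and_gradients(x, y, params)
            for p, g in zip(params, grads):
                p -= lr * g  # plain SGD update
        if all(mlp1.predict(x, params) == y for y, x in xor_data):
            return it  # first iteration at which all four examples are correct
    return max_iters


runs = [iterations_to_solve_xor() for _ in range(5)]
print(sum(runs) / len(runs))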

code/grad_check.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
import numpy as np

STUDENT = {'name': 'Dorin Keshales',
           'ID': '313298424'}


def gradient_check(f, x):
    """
    Gradient check for a function f
    - f should be a function that takes a single argument and outputs the cost and its gradients
    - x is the point (numpy array) to check the gradient at
    """
    fx, grad = f(x)  # Evaluate function value at original point
    h = 1e-4

    # Iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        ### modify x[ix] with h defined above to compute the numerical gradient.
        ### if you change x, make sure to return it back to its original state for the next iteration.
        ### YOUR CODE HERE:
        x_plus = x.copy()
        x_plus[ix] = x_plus[ix] + h

        x_minus = x.copy()
        x_minus[ix] = x_minus[ix] - h

        fx_plus, grad_plus = f(x_plus)
        fx_minus, grad_minus = f(x_minus)

        # Central difference: (f(x + h) - f(x - h)) / (2h)
        numeric_gradient = (fx_plus - fx_minus) / (2.0 * h)
        ### END YOUR CODE

        # Compare gradients
        reldiff = abs(numeric_gradient - grad[ix]) / max(1, abs(numeric_gradient), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (grad[ix], numeric_gradient))
            return

        it.iternext()  # Step to next index

    print("Gradient check passed!")


def sanity_check():
    """
    Some basic sanity checks.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    gradient_check(quad, np.array(123.456))  # scalar test
    gradient_check(quad, np.random.randn(3, ))  # 1-D test
    gradient_check(quad, np.random.randn(4, 5))  # 2-D test
    print("")


if __name__ == '__main__':
    # If these fail, your code is definitely wrong.
    sanity_check()
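
Beyond the built-in sanity checks, gradient_check works with any function that returns a (value, gradient) pair. As an illustrative sketch only (not part of the submission), run from the code/ directory:

import numpy as np
from grad_check import gradient_check

# f(x) = sum(x^3), whose analytic gradient is 3 * x^2.
cube = lambda x: (np.sum(x ** 3), 3 * x ** 2)

gradient_check(cube, np.random.randn(4, 5))  # expected to print "Gradient check passed!"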

code/loglinear.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
import numpy as np

STUDENT = {'name': 'Dorin Keshales',
           'ID': '313298424'}


def softmax(x):
    """
    Compute the softmax vector.
    x: a n-dim vector (numpy array)
    returns: an n-dim vector (numpy array) of softmax values
    """
    # YOUR CODE HERE
    # Your code should be fast, so use a vectorized implementation using numpy,
    # don't use any loops.
    # With a vectorized implementation, the code should be no more than 2 lines.
    #
    # For numeric stability, use the identity you proved in Ex 2 Q1.

    # Shift by the max for numeric stability; avoid modifying the caller's array in place.
    x = x - x.max()
    x = np.exp(x) / np.sum(np.exp(x))

    return x


def classifier_output(x, params):
    """
    Return the output layer (class probabilities)
    of a log-linear classifier with given params on input x.
    """
    W, b = params
    # YOUR CODE HERE.

    # Compute the linear scores and pass them through the softmax.
    result = np.dot(x, W) + b
    probs = softmax(result)

    return probs


def predict(x, params):
    """
    Returns the prediction (highest scoring class id) of a
    log-linear classifier with given parameters on input x.

    params: a list of the form [(W, b)]
    W: matrix
    b: vector
    """
    return np.argmax(classifier_output(x, params))


def loss_and_gradients(x, y, params):
    """
    Compute the loss and the gradients at point x with given parameters.
    y is a scalar indicating the correct label.

    returns:
        loss,[gW,gb]

    loss: scalar
    gW: matrix, gradients of W
    gb: vector, gradients of b
    """
    W, b = params
    # YOUR CODE HERE

    # Cross-entropy loss of the predicted distribution against the gold label.
    model_output = classifier_output(x, params)
    loss = -np.log(model_output[y])

    # Derivative of the loss w.r.t. b: softmax output minus the one-hot gold vector.
    gb = model_output.copy()
    gb[y] -= 1

    # Derivative of the loss w.r.t. W: outer product of the input with (softmax - one-hot).
    copy_output = model_output.copy()
    gW = np.outer(x, copy_output)
    gW[:, y] -= x

    return loss, [gW, gb]


def create_classifier(in_dim, out_dim):
    """
    returns the parameters (W,b) for a log-linear classifier
    with input dimension in_dim and output dimension out_dim.
    """
    W = np.zeros((in_dim, out_dim))
    b = np.zeros(out_dim)
    return [W, b]


if __name__ == '__main__':
    # Sanity checks for softmax. If these fail, your softmax is definitely wrong.
    # If these pass, it may or may not be correct.
    test1 = softmax(np.array([1, 2]))
    print(test1)
    assert np.amax(np.fabs(test1 - np.array([0.26894142, 0.73105858]))) <= 1e-6

    test2 = softmax(np.array([1001, 1002]))
    print(test2)
    assert np.amax(np.fabs(test2 - np.array([0.26894142, 0.73105858]))) <= 1e-6

    test3 = softmax(np.array([-1001, -1002]))
    print(test3)
    assert np.amax(np.fabs(test3 - np.array([0.73105858, 0.26894142]))) <= 1e-6

    # Sanity checks. If these fail, your gradient calculation is definitely wrong.
    # If they pass, it is likely, but not certainly, correct.
    from grad_check import gradient_check

    W, b = create_classifier(3, 4)


    def _loss_and_W_grad(W):
        global b
        loss, grads = loss_and_gradients([1, 2, 3], 0, [W, b])
        return loss, grads[0]


    def _loss_and_b_grad(b):
        global W
        loss, grads = loss_and_gradients([1, 2, 3], 0, [W, b])
        return loss, grads[1]


    for _ in range(10):
        W = np.random.randn(W.shape[0], W.shape[1])
        b = np.random.randn(b.shape[0])
        gradient_check(_loss_and_b_grad, b)
        gradient_check(_loss_and_W_grad, W)
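
The training script is not included in this excerpt. As an illustrative sketch only, a plain SGD loop over the API above could look like the following, where the dimensions and the randomly generated train_data are placeholders rather than the assignment's real features (run from the code/ directory):

import numpy as np
import loglinear as ll

# Placeholder dimensions and data, for illustration only.
in_dim, out_dim = 600, 6
params = ll.create_classifier(in_dim, out_dim)
train_data = [(np.random.randint(out_dim), np.random.rand(in_dim)) for _ in range(100)]

learning_rate = 0.01
for epoch in range(10):
    total_loss = 0.0
    for y, x in train_data:
        loss, (gW, gb) = ll.loss_and_gradients(x, y, params)
        total_loss += loss
        # Plain SGD step: move each parameter against its gradient.
        params[0] -= learning_rate * gW
        params[1] -= learning_rate * gb
    print(epoch, total_loss / len(train_data))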

code/mlp1.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
import numpy as np

STUDENT = {'name': 'Dorin Keshales',
           'ID': '313298424'}


def classifier_output(x, params):
    # YOUR CODE HERE.

    W, b, U, b_tag = params

    # Pre-activation of the hidden layer.
    result = np.dot(x, W) + b

    # Save a copy of the hidden layer input (before the tanh) for use in backprop.
    global z1
    z1 = result.copy()

    # tanh activation.
    result = np.tanh(result)

    # Save a copy of the hidden layer output (after the tanh) for use in backprop.
    global h1
    h1 = result.copy()

    # Output layer scores followed by a numerically stable softmax.
    result = np.dot(result, U) + b_tag
    result -= result.max()
    probs = np.exp(result) / np.sum(np.exp(result))

    return probs


def predict(x, params):
    """
    params: a list of the form [W, b, U, b_tag]
    """
    return np.argmax(classifier_output(x, params))


def loss_and_gradients(x, y, params):
    """
    params: a list of the form [W, b, U, b_tag]

    returns:
        loss,[gW, gb, gU, gb_tag]

    loss: scalar
    gW: matrix, gradients of W
    gb: vector, gradients of b
    gU: matrix, gradients of U
    gb_tag: vector, gradients of b_tag
    """
    # YOUR CODE HERE

    W, b, U, b_tag = params

    # Cross-entropy loss of the predicted distribution against the gold label.
    model_output = classifier_output(x, params)
    loss = -np.log(model_output[y])

    # Derivative of the loss w.r.t. b_tag: softmax output minus the one-hot gold vector.
    gb_tag = model_output.copy()
    gb_tag[y] -= 1

    # Derivative of the loss w.r.t. U.
    copy_output = model_output.copy()
    copy_h1 = h1.copy()
    gU = np.outer(copy_h1, copy_output)
    gU[:, y] -= copy_h1

    # Derivative of the loss w.r.t. h1, the hidden vector after the tanh.
    ds_dh1 = np.dot(U, model_output) - U[:, y]

    # Derivative of the vector after the tanh (h1) w.r.t. the vector before the tanh (z1).
    copy_z1 = z1.copy()
    dh1_dz1 = 1 - np.square(np.tanh(copy_z1))

    # Derivative of the loss w.r.t. b.
    gb = ds_dh1 * dh1_dz1
    # Derivative of the loss w.r.t. W.
    gW = np.outer(x, gb.copy())

    return loss, [gW, gb, gU, gb_tag]


# Xavier/Glorot-style uniform initialization for the weight matrices and the bias vectors.
def my_random(size1, size2=None):
    t = 1 if size2 is None else size2
    eps = np.sqrt(6.0 / (size1 + t))
    return np.random.uniform(-eps, eps, (size1, size2)) if size2 is not None else np.random.uniform(-eps, eps, size1)


def create_classifier(in_dim, hid_dim, out_dim):
    """
    returns the parameters for a multi-layer perceptron,
    with input dimension in_dim, hidden dimension hid_dim,
    and output dimension out_dim.

    return:
    a flat list of 4 elements, W, b, U, b_tag.
    """

    W = my_random(in_dim, hid_dim)
    b = my_random(hid_dim)
    U = my_random(hid_dim, out_dim)
    b_tag = my_random(out_dim)

    params = [W, b, U, b_tag]
    return params
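
Unlike loglinear.py, this file has no gradient-check block of its own. A minimal sketch of verifying its W and U gradients with grad_check.gradient_check, following the pattern at the bottom of loglinear.py (the dimensions here are arbitrary):

import numpy as np
import mlp1
from grad_check import gradient_check

np.random.seed(0)
in_dim, hid_dim, out_dim = 3, 5, 4
W, b, U, b_tag = mlp1.create_classifier(in_dim, hid_dim, out_dim)
x, y = np.array([1.0, 2.0, 3.0]), 0


def _loss_and_W_grad(W):
    loss, grads = mlp1.loss_and_gradients(x, y, [W, b, U, b_tag])
    return loss, grads[0]


def _loss_and_U_grad(U):
    loss, grads = mlp1.loss_and_gradients(x, y, [W, b, U, b_tag])
    return loss, grads[2]


gradient_check(_loss_and_W_grad, W)
gradient_check(_loss_and_U_grad, U)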

0 commit comments
