Skip to content

Commit

Permalink
[SPARK-3701][MLLIB] update python linalg api and small fixes
Browse files Browse the repository at this point in the history
1. doc updates
2. simple checks on vector dimensions
3. use column major for matrices

davies jkbradley

Author: Xiangrui Meng <meng@databricks.com>

Closes apache#2548 from mengxr/mllib-py-clean and squashes the following commits:

6dce2df [Xiangrui Meng] address comments
116b5db [Xiangrui Meng] use np.dot instead of array.dot
75f2fcc [Xiangrui Meng] fix python style
fefce00 [Xiangrui Meng] better check of vector size with more tests
067ef71 [Xiangrui Meng] majored -> major
ef853f9 [Xiangrui Meng] update python linalg api and small fixes
  • Loading branch information
mengxr committed Oct 1, 2014
1 parent 6c696d7 commit d75496b
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ sealed trait Matrix extends Serializable {
}

/**
* Column-majored dense matrix.
* Column-major dense matrix.
* The entry values are stored in a single array of doubles with columns listed in sequence.
* For example, the following matrix
* {{{
Expand Down Expand Up @@ -128,7 +128,7 @@ class DenseMatrix(val numRows: Int, val numCols: Int, val values: Array[Double])
}

/**
* Column-majored sparse matrix.
* Column-major sparse matrix.
* The entry values are stored in Compressed Sparse Column (CSC) format.
* For example, the following matrix
* {{{
Expand Down Expand Up @@ -207,7 +207,7 @@ class SparseMatrix(
object Matrices {

/**
* Creates a column-majored dense matrix.
* Creates a column-major dense matrix.
*
* @param numRows number of rows
* @param numCols number of columns
Expand All @@ -218,7 +218,7 @@ object Matrices {
}

/**
* Creates a column-majored sparse matrix in Compressed Sparse Column (CSC) format.
* Creates a column-major sparse matrix in Compressed Sparse Column (CSC) format.
*
* @param numRows number of rows
* @param numCols number of columns
Expand Down
150 changes: 121 additions & 29 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,41 @@ def _convert_to_vector(l):
raise TypeError("Cannot convert type %s into Vector" % type(l))


def _vector_size(v):
    """
    Returns the size of the vector.

    Accepted inputs are a Vector, an array.array, a list, a tuple, a NumPy
    ndarray that is either one-dimensional or two-dimensional with a single
    column, and (when SciPy is available) a SciPy sparse matrix with a
    single column.

    Raises ValueError for an ndarray of any other shape and TypeError for
    any other type.

    >>> _vector_size([1., 2., 3.])
    3
    >>> _vector_size((1., 2., 3.))
    3
    >>> _vector_size(array.array('d', [1., 2., 3.]))
    3
    >>> _vector_size(np.zeros(3))
    3
    >>> _vector_size(np.zeros((3, 1)))
    3
    >>> _vector_size(np.zeros((1, 3)))
    Traceback (most recent call last):
        ...
    ValueError: Cannot treat an ndarray of shape (1, 3) as a vector
    """
    # Vectors and plain Python sequences report their own length.
    if isinstance(v, Vector) or type(v) in (array.array, list, tuple):
        return len(v)

    if type(v) == np.ndarray:
        # Only a flat array or an (n, 1) column counts as a vector.
        is_single_column = v.ndim == 2 and v.shape[1] == 1
        if v.ndim != 1 and not is_single_column:
            raise ValueError("Cannot treat an ndarray of shape %s as a vector" % str(v.shape))
        return len(v)

    if _have_scipy and scipy.sparse.issparse(v):
        # The size of a sparse column is its row count.
        assert v.shape[1] == 1, "Expected column vector"
        return v.shape[0]

    raise TypeError("Cannot treat type %s as a vector" % type(v))


class Vector(object):
"""
Abstract class for DenseVector and SparseVector
Expand All @@ -76,6 +111,9 @@ def toArray(self):


class DenseVector(Vector):
"""
A dense vector represented by a value array.
"""
def __init__(self, ar):
if not isinstance(ar, array.array):
ar = array.array('d', ar)
Expand All @@ -100,15 +138,31 @@ def dot(self, other):
5.0
>>> dense.dot(np.array(range(1, 3)))
5.0
>>> dense.dot([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense.dot(np.reshape([1., 2., 3., 4.], (2, 2), order='F'))
array([ 5., 11.])
>>> dense.dot(np.reshape([1., 2., 3.], (3, 1), order='F'))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if isinstance(other, SparseVector):
return other.dot(self)
if type(other) == np.ndarray and other.ndim > 1:
assert len(self) == other.shape[0], "dimension mismatch"
return np.dot(self.toArray(), other)
elif _have_scipy and scipy.sparse.issparse(other):
return other.transpose().dot(self.toArray())[0]
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
assert len(self) == other.shape[0], "dimension mismatch"
return other.transpose().dot(self.toArray())
else:
return np.dot(self.toArray(), other)
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.dot(self)
elif isinstance(other, Vector):
return np.dot(self.toArray(), other.toArray())
else:
return np.dot(self.toArray(), other)

def squared_distance(self, other):
"""
Expand All @@ -126,7 +180,16 @@ def squared_distance(self, other):
>>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
>>> dense1.squared_distance(sparse1)
2.0
>>> dense1.squared_distance([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
Expand Down Expand Up @@ -165,12 +228,10 @@ def __getattr__(self, item):


class SparseVector(Vector):

"""
A simple sparse vector class for passing data to MLlib. Users may
alternatively pass SciPy's {scipy.sparse} data types.
"""

def __init__(self, size, *args):
"""
Create a sparse vector, using either a dictionary, a list of
Expand Down Expand Up @@ -222,20 +283,33 @@ def dot(self, other):
0.0
>>> a.dot(np.array([[1, 1], [2, 2], [3, 3], [4, 4]]))
array([ 22., 22.])
>>> a.dot([1., 2., 3.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.array([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(DenseVector([1., 2.]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> a.dot(np.zeros((3, 2)))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
if type(other) == np.ndarray:
if other.ndim == 1:
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
return result
elif other.ndim == 2:
if other.ndim == 2:
results = [self.dot(other[:, i]) for i in xrange(other.shape[1])]
return np.array(results)
else:
raise Exception("Cannot call dot with %d-dimensional array" % other.ndim)
elif other.ndim > 2:
raise ValueError("Cannot call dot with %d-dimensional array" % other.ndim)

assert len(self) == _vector_size(other), "dimension mismatch"

elif type(other) in (array.array, DenseVector):
if type(other) in (np.ndarray, array.array, DenseVector):
result = 0.0
for i in xrange(len(self.indices)):
result += self.values[i] * other[self.indices[i]]
Expand All @@ -254,6 +328,7 @@ def dot(self, other):
else:
j += 1
return result

else:
return self.dot(_convert_to_vector(other))

Expand All @@ -273,7 +348,16 @@ def squared_distance(self, other):
30.0
>>> b.squared_distance(a)
30.0
>>> b.squared_distance([1., 2.])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> b.squared_distance(SparseVector(3, [1,], [1.0,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if type(other) in (list, array.array, DenseVector, np.array, np.ndarray):
if type(other) is np.array and other.ndim != 1:
raise Exception("Cannot call squared_distance with %d-dimensional array" %
Expand Down Expand Up @@ -348,7 +432,6 @@ def __eq__(self, other):
>>> v1 != v2
False
"""

return (isinstance(other, self.__class__)
and other.size == self.size
and other.indices == self.indices
Expand Down Expand Up @@ -414,23 +497,32 @@ def stringify(vector):


class Matrix(object):
""" the Matrix """
def __init__(self, nRow, nCol):
self.nRow = nRow
self.nCol = nCol
"""
Represents a local matrix.
"""

def __init__(self, numRows, numCols):
self.numRows = numRows
self.numCols = numCols

def toArray(self):
"""
Returns its elements in a NumPy ndarray.
"""
raise NotImplementedError


class DenseMatrix(Matrix):
def __init__(self, nRow, nCol, values):
Matrix.__init__(self, nRow, nCol)
assert len(values) == nRow * nCol
"""
Column-major dense matrix.
"""
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
assert len(values) == numRows * numCols
self.values = values

def __reduce__(self):
return DenseMatrix, (self.nRow, self.nCol, self.values)
return DenseMatrix, (self.numRows, self.numCols, self.values)

def toArray(self):
"""
Expand All @@ -439,10 +531,10 @@ def toArray(self):
>>> arr = array.array('d', [float(i) for i in range(4)])
>>> m = DenseMatrix(2, 2, arr)
>>> m.toArray()
array([[ 0., 1.],
[ 2., 3.]])
array([[ 0., 2.],
[ 1., 3.]])
"""
return np.ndarray((self.nRow, self.nCol), np.float64, buffer=self.values.tostring())
return np.reshape(self.values, (self.numRows, self.numCols), order='F')


def _test():
Expand Down

0 comments on commit d75496b

Please sign in to comment.