Skip to content

Commit f3906cb

Browse files
committed
rm breeze dependency from dataframe
align operations
1 parent 8795d5a commit f3906cb

File tree

11 files changed

+312
-228
lines changed

11 files changed

+312
-228
lines changed

src/main/scala/qnt/bz/Align.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
package qnt.bz
2+
3+
/**
 * Alignment modes accepted by the `align` operations of the data frame.
 *
 * Four values are declared, in this order: `left`, `right`, `inner`, `outer`.
 * Their exact effect is decided by the index `align` implementation that
 * receives them (not part of this object).
 */
object Align extends Enumeration {
  /** Alias that lets callers spell the parameter type as `Align.AlignType`. */
  type AlignType = Value

  // Declared one per line; declaration order fixes the ids (0 to 3).
  val left: Value = Value
  val right: Value = Value
  val inner: Value = Value
  val outer: Value = Value
}

src/main/scala/qnt/bz/DataFrame.scala

Lines changed: 66 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
package qnt.bz
22

3-
import breeze.linalg.operators.{OpDiv, OpMulMatrix}
4-
import breeze.linalg.support.CanSlice2
5-
import breeze.linalg.{DenseMatrix, Matrix, SliceMatrix, Tensor, TensorLike}
3+
import breeze.linalg.{DenseMatrix, Matrix, SliceMatrix}
64
import breeze.math.Semiring
5+
import qnt.bz.DataFrame.fill
76

87
import scala.reflect.ClassTag
98

@@ -15,8 +14,7 @@ class DataFrame[R, C, @specialized(Double, Int, Float, Long) V]
1514
val data: Matrix[V]
1615
)
1716
(implicit val rTag: ClassTag[R], val cTag: ClassTag[C], val vTag: ClassTag[V], val vSem: Semiring[V])
18-
extends Tensor[(R, C), V]
19-
with TensorLike[(R, C), V, DataFrame[R, C, V]]
17+
extends scala.collection.Map[(R, C),V]
2018
with Slice2dOps[R,C,DataFrame[R,C,V]] {
2119

2220
if (!rowIdx.unique) {
@@ -34,42 +32,18 @@ class DataFrame[R, C, @specialized(Double, Int, Float, Long) V]
3432

3533
/** Tuple-keyed lookup: splits the `(row, column)` key and delegates to the two-argument `apply`. */
override def apply(i: (R, C)): V = {
  val rowKey = i._1
  val colKey = i._2
  apply(rowKey, colKey)
}
3634

37-
def apply(r: R, c: C): V = data(rowIdx.indexOfExactUnsafe(r), colIdx.indexOfExactUnsafe(c))
35+
/**
 * Reads the cell addressed by row key `r` and column key `c`.
 *
 * Both keys are translated to positions with `hashIndexOfUnsafe` before the
 * matrix is read; what happens for an unknown key is decided by that method.
 */
def apply(r: R, c: C): V = {
  val rowPos = rowIdx.hashIndexOfUnsafe(r)
  val colPos = colIdx.hashIndexOfUnsafe(c)
  data(rowPos, colPos)
}
3836

39-
override def update(i: (R, C), v: V): Unit
37+
/** Tuple-keyed write: splits the `(row, column)` key and delegates to the three-argument `update`. */
def update(i: (R, C), v: V): Unit = {
  val rowKey = i._1
  val colKey = i._2
  update(rowKey, colKey, v)
}
4139

4240
def update(r: R, c: C, v: V): Unit
43-
= data((rowIdx.indexOfExactUnsafe(r), colIdx.indexOfExactUnsafe(c))) = v
41+
= data((rowIdx.hashIndexOfUnsafe(r), colIdx.hashIndexOfUnsafe(c))) = v
4442

45-
override def size: Int = data.size
46-
47-
override def activeSize: Int = data.size
43+
/** Number of cells, computed as row-index length times column-index length. */
override def knownSize: Int = {
  val rowCount = rowIdx.length
  val colCount = colIdx.length
  rowCount * colCount
}
4844

4945
/**
 * Iterates over every cell as `((rowKey, colKey), value)`.
 *
 * Walks the underlying matrix iterator and replaces each positional
 * `(row, column)` pair with the keys stored at those positions in the indexes.
 */
override def iterator: Iterator[((R, C), V)] =
  data.iterator.map { cell =>
    val position = cell._1
    ((rowIdx(position._1), colIdx(position._2)), cell._2)
  }
5046

51-
override def activeIterator: Iterator[((R, C), V)] = iterator
52-
53-
object keySet extends Set[(R, C)] {
54-
override def incl(elem: (R, C)): Set[(R, C)] = Set() ++ iterator + elem
55-
56-
override def excl(elem: (R, C)): Set[(R, C)] = Set() ++ iterator + elem
57-
58-
override def contains(elem: (R, C)): Boolean = rowIdx.contains(elem._1) && colIdx.contains(elem._2)
59-
60-
override def iterator: Iterator[(R, C)] = rowIdx.valuesIterator.flatMap(r => colIdx.valuesIterator.map(c => (r, c)))
61-
}
62-
63-
override def keysIterator: Iterator[(R, C)] = keySet.iterator
64-
65-
override def activeKeysIterator: Iterator[(R, C)] = keySet.iterator
66-
67-
override def valuesIterator: Iterator[V] = data.valuesIterator
68-
69-
override def activeValuesIterator: Iterator[V] = data.valuesIterator
70-
71-
override def repr: DataFrame[R, C, V] = this
72-
7347
/**
 * Default rendering: delegates to the four-argument `toString` overload,
 * passing 5 for each argument.
 */
// NOTE(review): the four 5s are presumably head/tail row and column limits — confirm against the overload.
override def toString: String = {
  toString(5, 5, 5, 5)
}
7448

7549
def toString
@@ -174,7 +148,7 @@ class DataFrame[R, C, @specialized(Double, Int, Float, Long) V]
174148
: DataFrame[R, C, V] = iloc(colIdx.loc(start, end, step, keepStart, keepEnd, round).slices)
175149
}
176150

177-
def reIndex[R, C](rows: IndexVector[R], cols: IndexVector[C])
151+
/**
 * Builds a frame over this frame's `data` labelled by the supplied indexes.
 *
 * `data` is handed to the `DataFrame` factory as-is; the class tags for the
 * new key types come from the indexes themselves, the value tag and semiring
 * from this frame.
 *
 * Note that the method's own type parameters `R` and `C` shadow the class's.
 *
 * @param rows index to use for the rows
 * @param cols index to use for the columns
 */
def withIdx[R, C](rows: IndexVector[R], cols: IndexVector[C])
: DataFrame[R, C, V] = {
  val relabelled = DataFrame(rows, cols, data)(rTag = rows.tag, cTag = cols.tag, vTag = vTag, vSem = vSem)
  relabelled
}
179153

180154
def intersect(another: DataFrame[R, C, V]): DataFrame[R, C, V] = {
@@ -187,7 +161,50 @@ class DataFrame[R, C, @specialized(Double, Int, Float, Long) V]
187161
}
188162
}
189163

190-
def canEqual(other: Any): Boolean = other.isInstanceOf[DataFrame[R, C, V]]
164+
/**
 * Aligns this frame with `right` according to the given mode.
 *
 * The row and column indexes of both frames are aligned first (rows, then
 * columns); this frame is then re-shaped onto the resulting indexes by the
 * index-based `align` overload.
 *
 * @param right        frame whose indexes take part in the alignment
 * @param align        alignment mode, forwarded to the index `align`
 * @param missingValue value handed to the index-based overload for cells this frame lacks
 */
def align(right: DataFrame[R,C,V], align: Align.AlignType, missingValue: V): DataFrame[R,C,V] =
  // `this.` is required: the `align` parameter shadows the method name
  this.align(
    this.rowIdx.align(right.rowIdx, align),
    this.colIdx.align(right.colIdx, align),
    missingValue
  )
170+
171+
/**
 * Re-shapes this frame onto the given row and column indexes.
 *
 * Three cases are handled:
 *  - both indexes equal this frame's indexes: this frame itself is returned;
 *  - every requested key already exists here: the cells are selected with
 *    `loc` and the result is relabelled with the requested indexes;
 *  - otherwise: a new frame filled with `missingValue` is created and the
 *    cells whose row and column keys exist in both are copied into it.
 *
 * @param rowIdx       target row index
 * @param colIdx       target column index
 * @param missingValue value for cells that have no counterpart in this frame
 */
def align(rowIdx:IndexVector[R], colIdx: IndexVector[C], missingValue: V): DataFrame[R, C, V] = {
  if (this.rowIdx == rowIdx && this.colIdx == colIdx) this
  else {
    val containsAllRows = rowIdx.forall(e => this.rowIdx.contains(e))
    val containsAllCols = colIdx.forall(e => this.colIdx.contains(e))
    if (containsAllCols && containsAllRows) {
      loc(rowIdx.toIndexedSeq, colIdx.toIndexedSeq).withIdx(rowIdx, colIdx)
    } else {
      // None of these is reassigned, so they are vals.
      val df = fill(rowIdx, colIdx, missingValue)
      // Materialise the shared keys once; the column sequence was previously rebuilt per row.
      val sharedRows = this.rowIdx.intersect(rowIdx).toIndexedSeq
      val sharedCols = this.colIdx.intersect(colIdx).toIndexedSeq
      for (r <- sharedRows; c <- sharedCols) {
        df(r, c) = this(r, c)
      }
      df
    }
  }
}
189+
190+
/**
 * Combines this frame with `frames` into one frame.
 *
 * The result is built over the combined row and column indexes, pre-filled
 * with `missingValue`, and then populated by copying every cell of this frame
 * followed by every cell of each frame in `frames`. When a key occurs in more
 * than one frame, the frame copied last wins.
 *
 * The receiver's cells are copied as well; previously only `frames` was
 * copied, so the receiver's cells stayed at `missingValue`.
 *
 * NOTE(review): assumes the index `combine` returns an index containing every
 * key of the receiver and of the arguments — confirm against `IndexVector`.
 *
 * @param frames       additional frames to merge into the result
 * @param missingValue value for cells not provided by any frame
 */
def combine(frames: Seq[DataFrame[R, C, V]], missingValue: V)
: DataFrame[R, C, V] = {
  val rowIdx = this.rowIdx.combine(frames.map(_.rowIdx))
  val colIdx = this.colIdx.combine(frames.map(_.colIdx))

  val result = fill(rowIdx, colIdx, missingValue)(this.rTag, this.cTag, this.vTag, this.vSem)

  // Receiver first, so that the supplied frames still override it on overlap.
  for (f <- this +: frames) {
    // Column keys do not depend on the row, so build the sequence once per frame.
    val cols = f.colIdx.toIndexedSeq
    for (r <- f.rowIdx.toIndexedSeq; c <- cols) {
      result(r, c) = f(r, c)
    }
  }
  result
}
206+
207+
/**
 * Creates a frame over this frame's row and column indexes in which the
 * cells are produced by `DataFrame.fill` from `value`.
 */
def fillLike(value:V): DataFrame[R, C, V] = {
  DataFrame.fill(rowIdx, colIdx, value)
}
191208

192209
override def equals(other: Any): Boolean = other match {
193210
case that: DataFrame[R, C, V] =>
@@ -202,6 +219,20 @@ class DataFrame[R, C, @specialized(Double, Int, Float, Long) V]
202219
val state = Seq(super.hashCode(), rowIdx, colIdx, data)
203220
state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
204221
}
222+
223+
/**
 * Transposed frame: columns become rows and rows become columns.
 *
 * Original author's note: this is just a logical transpose.
 *
 * A `DenseMatrix` is transposed directly. Any other `Matrix` is first
 * converted with `toDenseMatrix` and then transposed. The previous catch-all
 * branch cast to `SliceMatrix` unconditionally, which fails with
 * `ClassCastException` for any other matrix implementation.
 */
def transpose : DataFrame[C, R, V] = {
  val dt: Matrix[V] = data match {
    case d: DenseMatrix[V] => d.t
    // `toDenseMatrix` is declared on `Matrix`, so no downcast is needed
    case other => other.toDenseMatrix.t
  }
  DataFrame(colIdx, rowIdx, dt)
}
230+
231+
/**
 * Returns a map with the same entries as this frame except `key`.
 *
 * The frame itself is not modified: its cells are copied into an immutable
 * map (via `toMap`) and the key is removed from that copy. This replaces the
 * former `???` placeholder, which threw `NotImplementedError`.
 */
override def -(key: (R, C)): collection.Map[(R, C), V] =
  toMap[(R, C), V].removed(key)
232+
233+
/**
 * Returns a map with the same entries as this frame except the given keys.
 *
 * The frame itself is not modified: its cells are copied into an immutable
 * map (via `toMap`) and the keys are removed from that copy. This replaces
 * the former `???` placeholder, which threw `NotImplementedError`.
 */
override def -(key1: (R, C), key2: (R, C), keys: (R, C)*): collection.Map[(R, C), V] =
  toMap[(R, C), V].removed(key1).removed(key2).removedAll(keys)
234+
235+
/**
 * Optional lookup by `(row, column)` key.
 *
 * Both key components are resolved with `hashIndexOf`; the cell is read only
 * when both lookups return a position, otherwise the result is `None`.
 */
override def get(key: (R, C)): Option[V] = {
  // Both lookups are always performed, row first.
  val rowPos = rowIdx.hashIndexOf(key._1)
  val colPos = colIdx.hashIndexOf(key._2)
  (rowPos, colPos) match {
    case (Some(r), Some(c)) => Some(data((r, c)))
    case _ => None
  }
}
205236
}
206237

207238
object DataFrame {
@@ -218,55 +249,4 @@ object DataFrame {
218249
DenseMatrix.create(ridx.size, cidx.size, new Array[V](ridx.size * cidx.size))
219250
)
220251

221-
def combine[R, C, @specialized(Double, Int, Float, Long) V](frames: Seq[DataFrame[R, C, V]], missingValue: V): DataFrame[R, C, V] = {
222-
val first = frames(0)
223-
import first._
224-
225-
val rowIdx = IndexVector.combine(frames.map(_.rowIdx))
226-
val colIdx = IndexVector.combine(frames.map(_.colIdx))
227-
228-
val result = fill(rowIdx, colIdx, missingValue)
229-
230-
for (f <- frames) {
231-
for (r <- f.rowIdx.toIndexedSeq; c <- f.colIdx.toIndexedSeq) {
232-
result(r, c) = f(r, c)
233-
}
234-
}
235-
result
236-
}
237-
238-
implicit def canSlice2[R, C, V]: CanSlice2[DataFrame[R, C, V], R, C, V]
239-
= new CanSlice2[DataFrame[R, C, V], R, C, V] {
240-
override def apply(from: DataFrame[R, C, V], slice: R, slice2: C): V = from.apply(slice, slice2)
241-
}
242-
243-
244-
implicit def divOps2[R, C, @specialized(Double, Float, Int, Long) V]
245-
: OpDiv.Impl2[DataFrame[R, C, V], DataFrame[R, C, V], DataFrame[R, C, V]] =
246-
new OpDiv.Impl2[DataFrame[R, C, V], DataFrame[R, C, V], DataFrame[R, C, V]]() {
247-
override def apply(v: DataFrame[R, C, V], v2: DataFrame[R, C, V]): DataFrame[R, C, V] = {
248-
val result = v.intersect(v2).copy
249-
for (c <- result.colIdx.toIndexedSeq; r <- result.rowIdx.toIndexedSeq) {
250-
result(r, c) = (v(r, c).asInstanceOf[Double] / v(r, c).asInstanceOf[Double]).asInstanceOf[V]
251-
}
252-
result
253-
}
254-
}
255-
256-
257-
implicit def mulOps2[R, C, @specialized(Double, Int, Float, Long) V]: OpMulMatrix.Impl2[DataFrame[R, C, V], DataFrame[R, C, V], DataFrame[R, C, V]] =
258-
new OpMulMatrix.Impl2[DataFrame[R, C, V], DataFrame[R, C, V], DataFrame[R, C, V]] {
259-
override def apply(v: DataFrame[R, C, V], v2: DataFrame[R, C, V]): DataFrame[R, C, V] = {
260-
import v._
261-
if (v.rowIdx == v2.rowIdx && v.colIdx == v2.colIdx) {
262-
DataFrame(v.rowIdx, v2.colIdx, v.data.toDenseMatrix *:* v2.data.toDenseMatrix)
263-
} else {
264-
val result = v.intersect(v2).copy
265-
for (c <- result.colIdx.toIndexedSeq; r <- result.rowIdx.toIndexedSeq) {
266-
result(r, c) = (v(r, c).asInstanceOf[Double] * v2(r, c).asInstanceOf[Double]).asInstanceOf[V]
267-
}
268-
result
269-
}
270-
}
271-
}
272252
}

src/main/scala/qnt/bz/DataIndexVector.scala

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,17 @@ import scala.collection.{IterableOnce, mutable}
66
import scala.reflect.ClassTag
77

88
class DataIndexVector[V](
9-
private val data: Array[V],
10-
val unique: Boolean,
11-
val ordered: Boolean,
12-
val reversed: Boolean,
9+
private val data: Array[V],
10+
val unique: Boolean,
11+
val ordered: Boolean,
12+
val descending: Boolean,
1313
)(implicit ord: Ordering[V], tag: ClassTag[V]) extends IndexVector[V] {
1414

1515
private val valueToIdxMap = new mutable.HashMap[V, Int]()
1616

1717
if (ordered) {
1818
for (i <- 0 to (data.length - 2))
19-
if (ord.gt(data(i), data(i + 1)) ^ reversed) {
19+
if (ord.gt(data(i), data(i + 1)) ^ descending) {
2020
throw new IllegalArgumentException(s"ordering violation idx1=$i idx2=${i + 1}")
2121
}
2222
}
@@ -38,13 +38,13 @@ class DataIndexVector[V](
3838
if (ordered) {
3939
if (i > 0) {
4040
val prev = apply(i - 1)
41-
if (ord.lt(prev, v) ^ reversed) {
41+
if (ord.lt(prev, v) ^ descending) {
4242
throw new IllegalArgumentException(s"ordering violation (prev, cur) idx=$i")
4343
}
4444
}
4545
if (i < size - 1) {
4646
val nxt = apply(i + 1)
47-
if (ord.gt(v, nxt) ^ reversed) {
47+
if (ord.gt(v, nxt) ^ descending) {
4848
throw new IllegalArgumentException(s"ordering violation (cur, nxt) idx=$i")
4949
}
5050
}
@@ -54,19 +54,21 @@ class DataIndexVector[V](
5454
valueToIdxMap(v) = i
5555
}
5656

57-
override def size: Int = data.length
57+
/** Number of entries, i.e. the length of the backing array. */
override def length: Int = {
  data.length
}
5858

5959
/** Returns the entry stored at position `i` of the backing array. */
override def apply(i: Int): V = {
  data(i)
}
6060

61-
override def contains(v: V): Boolean
62-
= if (unique) valueToIdxMap.contains(v)
63-
else if (ordered) indexOfBinarySearch(v).foundValue
64-
else data.contains(v)
61+
/**
 * Tells whether `elem` occurs in this index.
 *
 * The element is first checked against `V` using the class tag in scope.
 * A value of another type (or `null`) is reported as absent, instead of being
 * force-cast and handed to the ordering-based search. For a `V`, the lookup
 * strategy depends on the index flags:
 *  - `unique`: hash-map lookup;
 *  - `ordered`: binary search;
 *  - otherwise: scan of the backing array.
 */
override def contains[A1 >: V](elem: A1): Boolean = elem match {
  // checked through the implicit ClassTag[V]; a plain `asInstanceOf[V]` is erased
  case v: V =>
    if (unique) valueToIdxMap.contains(v)
    else if (ordered) indexOfBinarySearch(v).foundValue
    else data.contains(v)
  case _ => false
}
6567

66-
override def indexOfExact(v: V): Option[Int] =
68+
/**
 * Position of `v` according to the value-to-position hash map, if present.
 *
 * @throws IllegalStateException when the index is not flagged `unique`
 */
override def hashIndexOf(v: V): Option[Int] = {
  if (!unique) throw new IllegalStateException("not unique")
  valueToIdxMap.get(v)
}
6870

69-
override def indexOfExactUnsafe(value: V): Int = valueToIdxMap(value)
71+
/**
 * Position of `value` read straight from the value-to-position hash map.
 *
 * Unlike `hashIndexOf` there is no `unique` check and no `Option`: the map's
 * `apply` is used, so its behaviour for an absent key applies.
 */
override def hashIndexOfUnsafe(value: V): Int = {
  valueToIdxMap(value)
}
7072

7173
}
7274

@@ -89,7 +91,7 @@ object DataIndexVector {
8991
(implicit ord: Ordering[V], tag: ClassTag[V]): DataIndexVector[V] = {
9092
val values = sliceVector.toArray
9193
val t = sliceVector.source
92-
apply(values, t.unique, t.ordered, t.reversed)
94+
apply(values, t.unique, t.ordered, t.descending)
9395
}
9496

9597
def apply[V](vector: Vector[V], unique: Boolean, ordered: Boolean, reversed: Boolean)

0 commit comments

Comments
 (0)