Do not re-use objects in the EdgePartition/EdgeTriplet iterators. This avoids a silent data corruption issue (SPARK-1188) and has no performance impact in my measurements. It also simplifies the code.

darabos · darabos · commit 0182f2b329b2 · 2014-03-31T12:58:37.000+02:00
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala
@@ -62,18 +62,8 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    *         applied to each edge
    */
   def map[ED2: ClassTag](f: Edge[ED] => ED2): EdgePartition[ED2] = {
-    val newData = new Array[ED2](data.size)
-    val edge = new Edge[ED]()
-    val size = data.size
-    var i = 0
-    while (i < size) {
-      edge.srcId  = srcIds(i)
-      edge.dstId  = dstIds(i)
-      edge.attr = data(i)
-      newData(i) = f(edge)
-      i += 1
-    }
-    new EdgePartition(srcIds, dstIds, newData, index)
+    val newData = Array.tabulate[ED2](data.size)(i => f(Edge(srcIds(i), dstIds(i), data(i))))
+    new EdgePartition(srcIds, dstIds, newData, index)
   }
 
   /**
@@ -84,19 +74,13 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    * order of the edges returned by `EdgePartition.iterator` and
    * should return attributes equal to the number of edges.
    *
-   * @param f a function from an edge to a new attribute
+   * @param iter an iterator for the new attribute values
    * @tparam ED2 the type of the new attribute
-   * @return a new edge partition with the result of the function `f`
-   *         applied to each edge
+   * @return a new edge partition with the attribute values replaced
    */
   def map[ED2: ClassTag](iter: Iterator[ED2]): EdgePartition[ED2] = {
-    val newData = new Array[ED2](data.size)
-    var i = 0
-    while (iter.hasNext) {
-      newData(i) = iter.next()
-      i += 1
-    }
-    assert(newData.size == i)
+    val newData = iter.toArray
+    assert(newData.size == data.size)
     new EdgePartition(srcIds, dstIds, newData, index)
   }
 
@@ -191,15 +175,12 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    * @return an iterator over edges in the partition
    */
   def iterator = new Iterator[Edge[ED]] {
-    private[this] val edge = new Edge[ED]
     private[this] var pos = 0
 
     override def hasNext: Boolean = pos < EdgePartition.this.size
 
     override def next(): Edge[ED] = {
-      edge.srcId = srcIds(pos)
-      edge.dstId = dstIds(pos)
-      edge.attr = data(pos)
+      val edge = Edge(srcIds(pos), dstIds(pos), data(pos))
       pos += 1
       edge
     }
@@ -218,7 +199,6 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
    * cluster must start at position `index`.
    */
   private def clusterIterator(srcId: VertexId, index: Int) = new Iterator[Edge[ED]] {
-    private[this] val edge = new Edge[ED]
     private[this] var pos = index
 
     override def hasNext: Boolean = {
@@ -227,9 +207,7 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double)
 
     override def next(): Edge[ED] = {
       assert(srcIds(pos) == srcId)
-      edge.srcId = srcIds(pos)
-      edge.dstId = dstIds(pos)
-      edge.attr = data(pos)
+      val edge = Edge(srcIds(pos), dstIds(pos), data(pos))
       pos += 1
       edge
     }
diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala
@@ -37,20 +37,15 @@ class EdgeTripletIterator[VD: ClassTag, ED: ClassTag](
   // Current position in the array.
   private var pos = 0
 
-  // A triplet object that this iterator.next() call returns. We reuse this object to avoid
-  // allocating too many temporary Java objects.
-  private val triplet = new EdgeTriplet[VD, ED]
-
   private val vmap = new PrimitiveKeyOpenHashMap[VertexId, VD](vidToIndex, vertexArray)
 
   override def hasNext: Boolean = pos < edgePartition.size
 
   override def next() = {
+    val triplet = new EdgeTriplet[VD, ED]
     triplet.srcId = edgePartition.srcIds(pos)
-    // assert(vmap.containsKey(e.src.id))
     triplet.srcAttr = vmap(triplet.srcId)
     triplet.dstId = edgePartition.dstIds(pos)
-    // assert(vmap.containsKey(e.dst.id))
     triplet.dstAttr = vmap(triplet.dstId)
     triplet.attr = edgePartition.data(pos)
     pos += 1