[Spark] Support predicate pushdown in scans with DVs #2982

Merged

Changes from 1 commit

flush

First sane version without isRowDeleted
andreaschat-db committed Apr 26, 2024
commit 3938944c2f20c487b22df8bf41970b43bc897449
@@ -61,8 +61,8 @@ case class DeltaParquetFileFormat(
extends ParquetFileFormat {
// Validate either we have all arguments for DV enabled read or none of them.
if (hasTablePath) {
require(!isSplittable && disablePushDowns,
"Wrong arguments for Delta table scan with deletion vectors")
// require(!isSplittable && disablePushDowns,
// "Wrong arguments for Delta table scan with deletion vectors")
}

TypeWidening.assertTableReadable(protocol, metadata)
@@ -283,8 +283,8 @@ case class DeltaParquetFileFormat(

def disableSplittingAndPushdown(tablePath: String): DeltaParquetFileFormat = {
this.copy(
isSplittable = false,
disablePushDowns = true,
// isSplittable = true,
// disablePushDowns = false,
tablePath = Some(tablePath))
}

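The net effect of the two `DeltaParquetFileFormat` changes above is that a DV read no longer forces `isSplittable = false` / `disablePushDowns = true`; only the table path is recorded. A minimal sketch of the behaviour this is meant to enable, where the path, table layout, and session config are illustrative assumptions rather than anything taken from the PR:

```scala
// Hypothetical end-to-end check, not part of this PR: with pushdown no longer
// disabled for DV reads, a data filter on a DV-enabled table should still be
// pushed to the Parquet scan while deleted rows are dropped via the DV.
import org.apache.spark.sql.SparkSession

object DvPushdownSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("dv-pushdown-sketch")
      .master("local[2]")
      // assumes the delta-spark artifact is on the classpath
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      .getOrCreate()
    import spark.implicits._

    val path = "/tmp/dv_pushdown_demo" // illustrative location
    spark.range(0, 1000).toDF("id").write.format("delta").save(path)
    spark.sql(s"ALTER TABLE delta.`$path` " +
      "SET TBLPROPERTIES ('delta.enableDeletionVectors' = 'true')")
    spark.sql(s"DELETE FROM delta.`$path` WHERE id = 600")

    val df = spark.read.format("delta").load(path).filter($"id" > 500)
    df.explain() // `id > 500` should appear under PushedFilters in the scan node
    assert(df.count() == 498) // 501..999, minus the deleted id 600
    spark.stop()
  }
}
```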
@@ -71,8 +71,8 @@ object ScanWithDeletionVectors {
// If the table has no DVs enabled, no change needed
if (!deletionVectorsReadable(index.protocol, index.metadata)) return None

// require(!index.isInstanceOf[TahoeLogFileIndex],
// "Cannot work with a non-pinned table snapshot of the TahoeFileIndex")
require(!index.isInstanceOf[TahoeLogFileIndex],
"Cannot work with a non-pinned table snapshot of the TahoeFileIndex")

// If the table has no DVs enabled, no change needed
if (!deletionVectorsReadable(index.protocol, index.metadata)) return None
@@ -141,7 +141,7 @@ object ScanWithDeletionVectors {
val skipRowColumnRef = skipRowColumnRefs.head

val keepRow = DeltaUDF.booleanFromByte( _ == RowIndexFilter.KEEP_ROW_VALUE)
.asNondeterministic() // To avoid constant folding the filter based on stats.
// .asNondeterministic() // To avoid constant folding the filter based on stats.

val filterExp = keepRow(new Column(skipRowColumnRef)).expr
Filter(filterExp, newScan)
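Dropping `.asNondeterministic()` matters for pushdown: Spark will not combine or reorder predicates across a non-deterministic filter, so keeping the keep-row UDF deterministic is what lets the query's own data filters continue down toward the Parquet scan. A small standalone illustration of that optimizer behaviour, using a plain UDF as a stand-in for `DeltaUDF.booleanFromByte`; the column names and the keep-row marker value are assumptions for this sketch:

```scala
// Conceptual illustration (not Delta code): with a deterministic keep-row
// filter the optimizer may merge/push the data predicate past it; with the
// non-deterministic variant, `id > 1` stays above the UDF filter.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object KeepRowFilterSketch {
  // 0 stands in for RowIndexFilter.KEEP_ROW_VALUE in this sketch.
  val keepRow = udf((marker: Byte) => marker == 0.toByte)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("keep-row-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1L, 0.toByte), (2L, 1.toByte), (3L, 0.toByte))
      .toDF("id", "skip_row_marker")

    // Deterministic UDF: compare how the two filters are combined in the plan.
    df.filter(keepRow(col("skip_row_marker"))).filter($"id" > 1).explain()

    // Non-deterministic variant: the `id > 1` predicate remains above it.
    val keepRowNonDet = keepRow.asNondeterministic()
    df.filter(keepRowNonDet(col("skip_row_marker"))).filter($"id" > 1).explain()

    spark.stop()
  }
}
```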
@@ -48,7 +48,7 @@ import org.apache.spark.util.{SerializableConfiguration, Utils => SparkUtils}
* Contains utility classes and method for performing DML operations with Deletion Vectors.
*/
object DMLWithDeletionVectorsHelper extends DeltaCommand {
val SUPPORTED_DML_COMMANDS: Seq[String] = Seq("DELETE", "UPDATE", "MERGE")
val SUPPORTED_DML_COMMANDS: Seq[String] = Seq("DELETE", "UPDATE")

/**
* Creates a DataFrame that can be used to scan for rows matching the condition in the given
@@ -106,8 +106,8 @@ object DMLWithDeletionVectorsHelper extends DeltaCommand {
val newProjectList = projectList ++ Seq(rowIndexCol, fileMetadataCol)
p.copy(projectList = newProjectList)
}
// newTarget
target
newTarget
// target
}

/**
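The restored `newTarget` plan is the variant whose project list carries the row-index and file-metadata columns, which is what allows DELETE/UPDATE to record deleted positions in a deletion vector instead of rewriting data files. A rough illustration of the idea using Spark's public `_metadata.file_path` column; Delta's actual plan uses its own internal row-index and metadata columns, and the names and paths below are placeholders:

```scala
// Sketch only: tag each scanned row with the file it came from, then group a
// DELETE condition's matches by file, mirroring the "file X, rows {i, j, ...}"
// bookkeeping a deletion-vector writer needs.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object RowTaggingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]").appName("row-tagging-sketch").getOrCreate()

    val dir = "/tmp/row_tagging_demo" // illustrative location
    spark.range(0, 100).write.mode("overwrite").parquet(dir)

    // _metadata.file_path is Spark's built-in hidden file metadata column;
    // Delta's real plan also carries an internal row-index column here.
    val tagged = spark.read.parquet(dir)
      .select(col("id"), col("_metadata.file_path").as("source_file"))

    // Rows matching a DELETE condition, grouped by the file they live in.
    tagged.filter(col("id") === 42)
      .groupBy("source_file")
      .count()
      .show(truncate = false)

    spark.stop()
  }
}
```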
@@ -16,6 +16,9 @@

package org.apache.spark.sql.delta

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.FILES_MAX_PARTITION_BYTES

import scala.collection.mutable.ArrayBuffer

// scalastyle:off import.ordering.noEmptyLine
@@ -287,13 +290,27 @@ class TightBoundsSuite

test("TEST") {
withTempDeltaTable(
dataDF = spark.range(0, 10, 1, 1).toDF("id"),
// .repartition(1)
dataDF = spark.range(0, 50000000, 1, 1).toDF("id"),
// dataDF = spark.range(0, 100000000, 1, 1).toDF("id"),
enableDVs = true
) { (targetTable, targetLog) =>
targetTable().delete("id == 2")

val a = targetTable().toDF.filter("id != 1").collect()
val b = 1
withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> true.toString,
SQLConf.FILES_MAX_PARTITION_BYTES.key -> "128MB") {
targetTable().delete("id == 40000000")

// val d = targetTable().toDF.filter("id != 1").queryExecution.executedPlan
// .filter("id != 1")
val a = targetTable().toDF.filter("id != 1").collect()
val c = targetLog.update().allFiles.collect()
val b = 1
assert(a.length === 49999999)

// a(40000000).getLong(0)
assert(a(40000000).getLong(0) === 40000000)
// assert(!a.map(_.getLong(0)).toSeq.contains(40000000))
// assert(a === Seq(0, 100000000).drop(2))
}
}
}

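For reference, a scaled-down, self-contained sketch of the scenario this work-in-progress test exercises: one large data file is split across several read partitions (small `spark.sql.files.maxPartitionBytes`), a single row is deleted via a deletion vector, and a scan with an additional data filter must return exactly the surviving rows. The sizes, path, and configs are illustrative assumptions, not the values used in the suite:

```scala
// Sketch of a split DV scan with a data filter; counts below are for this
// sketch only and do not mirror the asserts in the WIP test above.
import org.apache.spark.sql.SparkSession

object SplitDvScanSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("split-dv-scan-sketch")
      .master("local[2]")
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
      // Small split size so the single data file spans multiple partitions.
      .config("spark.sql.files.maxPartitionBytes", "1MB")
      .getOrCreate()

    val path = "/tmp/split_dv_scan_demo" // illustrative location
    spark.range(0, 1000000, 1, 1).toDF("id") // one input partition => one file
      .write.format("delta").save(path)
    spark.sql(s"ALTER TABLE delta.`$path` " +
      "SET TBLPROPERTIES ('delta.enableDeletionVectors' = 'true')")
    spark.sql(s"DELETE FROM delta.`$path` WHERE id = 400000")

    val rows = spark.read.format("delta").load(path)
      .filter("id != 1")
      .collect()

    // 1,000,000 rows minus the deleted id 400000 and the filtered-out id 1.
    assert(rows.length == 999998)
    assert(!rows.map(_.getLong(0)).contains(400000L))
    spark.stop()
  }
}
```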