Skip to content

Commit

Permalink
[Spark] Pass catalog table through TahoeLogFileIndex (#4150)
Browse files Browse the repository at this point in the history
<!--
Thanks for sending a pull request!  Here are some tips for you:
1. If this is your first time, please read our contributor guidelines:
https://github.com/delta-io/delta/blob/master/CONTRIBUTING.md
2. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP]
Your PR title ...'.
  3. Be sure to keep the PR description updated to reflect all changes.
  4. Please write your PR title to summarize what this PR proposes.
5. If possible, provide a concise example to reproduce the issue for a
faster review.
6. If applicable, include the corresponding issue number in the PR title
and link it in the body.
-->

#### Which Delta project/connector is this regarding?
<!--
Please add the component selected below to the beginning of the pull
request title
For example: [Spark] Title of my pull request
-->

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description
`TahoeLogFileIndex` calls `DeltaLog::update`, but it does not yet pass
in the catalog table. This makes the check for catalog table existence
in `DeltaLog::update` unreliable. This PR fixes that by adding a
catalog table argument to `TahoeLogFileIndex` and passing the catalog
table in from `DeltaLog::createRelation`.

## How was this patch tested?

Ran the existing unit tests.

## Does this PR introduce _any_ user-facing changes?

No
  • Loading branch information
ctring authored Feb 14, 2025
1 parent 14c2997 commit 318e2ec
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ case class DeltaSharingFileIndex(
partitionFilters: Seq[Expression],
dataFilters: Seq[Expression]): TahoeLogFileIndex = {
val deltaLog = fetchFilesAndConstructDeltaLog(partitionFilters, dataFilters, None)
TahoeLogFileIndex(params.spark, deltaLog)
TahoeLogFileIndex(params.spark, deltaLog, catalogTableOpt = None)
}

override def listFiles(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,7 @@ class DeltaLog private(
}

val fileIndex = TahoeLogFileIndex(
spark, this, dataPath, snapshotToUse, partitionFilters, isTimeTravelQuery)
spark, this, dataPath, snapshotToUse, catalogTableOpt, partitionFilters, isTimeTravelQuery)
var bucketSpec: Option[BucketSpec] = None

val r = buildHadoopFsRelationWithFileIndex(snapshotToUse, fileIndex, bucketSpec = bucketSpec)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import org.apache.hadoop.fs.Path

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericInternalRow, Literal}
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType
Expand Down Expand Up @@ -244,6 +245,7 @@ case class TahoeLogFileIndex(
override val deltaLog: DeltaLog,
override val path: Path,
snapshotAtAnalysis: SnapshotDescriptor,
catalogTableOpt: Option[CatalogTable],
partitionFilters: Seq[Expression],
isTimeTravelQuery: Boolean)
extends TahoeFileIndex(spark, deltaLog, path) {
Expand All @@ -253,6 +255,7 @@ case class TahoeLogFileIndex(
deltaLog: DeltaLog,
path: Path,
snapshotAtAnalysis: Snapshot,
catalogTableOpt: Option[CatalogTable],
partitionFilters: Seq[Expression] = Nil,
isTimeTravelQuery: Boolean = false
) = this (
Expand All @@ -261,6 +264,7 @@ case class TahoeLogFileIndex(
path,
if (isTimeTravelQuery) snapshotAtAnalysis
else new ShallowSnapshotDescriptor(snapshotAtAnalysis),
catalogTableOpt,
partitionFilters,
isTimeTravelQuery)

Expand Down Expand Up @@ -288,7 +292,7 @@ case class TahoeLogFileIndex(
if (isTimeTravelQuery) {
snapshotAtAnalysis.asInstanceOf[Snapshot]
} else {
deltaLog.update(stalenessAcceptable = true)
deltaLog.update(stalenessAcceptable = true, catalogTableOpt = catalogTableOpt)
}
}

Expand Down Expand Up @@ -366,19 +370,23 @@ case class TahoeLogFileIndex(
}

object TahoeLogFileIndex {
def apply(spark: SparkSession, deltaLog: DeltaLog): TahoeLogFileIndex =
new TahoeLogFileIndex(spark, deltaLog, deltaLog.dataPath, deltaLog.unsafeVolatileSnapshot)
def apply(
spark: SparkSession,
deltaLog: DeltaLog,
catalogTableOpt: Option[CatalogTable]): TahoeLogFileIndex =
new TahoeLogFileIndex(
spark, deltaLog, deltaLog.dataPath, deltaLog.unsafeVolatileSnapshot, catalogTableOpt)

def apply(
spark: SparkSession,
deltaLog: DeltaLog,
path: Path,
snapshotAtAnalysis: Snapshot,
catalogTableOpt: Option[CatalogTable],
partitionFilters: Seq[Expression] = Nil,
isTimeTravelQuery: Boolean = false): TahoeLogFileIndex
= new TahoeLogFileIndex(
spark, deltaLog, path, snapshotAtAnalysis, partitionFilters, isTimeTravelQuery
)
spark, deltaLog, path, snapshotAtAnalysis, catalogTableOpt, partitionFilters, isTimeTravelQuery)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import org.apache.spark.sql.delta.actions.{Action, AddFile, Metadata, Protocol}
import org.apache.spark.sql.delta.catalog.DeltaTableV2
import org.apache.spark.sql.delta.commands.optimize.OptimizeMetrics
import org.apache.spark.sql.delta.coordinatedcommits.TableCommitCoordinatorClient
import org.apache.spark.sql.delta.files.TahoeLogFileIndex
import org.apache.spark.sql.delta.hooks.AutoCompact
import org.apache.spark.sql.delta.stats.StatisticsCollection
import io.delta.storage.commit.{CommitResponse, GetCommitsResponse, UpdatedActions}
Expand Down Expand Up @@ -181,6 +182,12 @@ object DeltaTestImplicits {
def snapshot: Snapshot = deltaTable.initialSnapshot
}

/**
 * Extension on the `TahoeLogFileIndex` companion object, declared in `DeltaTestImplicits`,
 * that provides a two-argument `apply(spark, deltaLog)`.
 *
 * The call is forwarded to the companion's three-argument `apply` with
 * `catalogTableOpt = None`, i.e. the index is built without a catalog table.
 *
 * @param index the `TahoeLogFileIndex` companion object being extended
 */
implicit class TahoeLogFileIndexObjectTestHelper(index: TahoeLogFileIndex.type) {
/** Builds a [[TahoeLogFileIndex]] for `deltaLog` with no catalog table attached. */
def apply(spark: SparkSession, deltaLog: DeltaLog): TahoeLogFileIndex = {
// Call `.apply` explicitly with the named argument so the three-argument overload is selected.
index.apply(spark, deltaLog, catalogTableOpt = None)
}
}

implicit class AutoCompactObjectTestHelper(ac: AutoCompact.type) {
private[delta] def compact(
spark: SparkSession,
Expand Down

0 comments on commit 318e2ec

Please sign in to comment.