forked from datastax/spark-cassandra-connector
SPARKC-94: Remove describe_splits_ex call.
The Thrift method describe_splits_ex was called to estimate the amount of data per token range, in order to split token ranges into sub-splits of appropriate size. Cassandra 2.1.5 introduces the system.size_estimates table, which lets us obtain the same information with CQL queries, without relying on deprecated Thrift calls. The new size estimates allow split size to be controlled more precisely than describe_splits_ex could. Cassandra exposes the data size in bytes for each token range, so the configuration option spark.cassandra.input.split.size, which previously controlled split size as a number of C* partitions, has been renamed to spark.cassandra.input.split.size_in_mb and now controls split size in megabytes. Additionally, the spark.cassandra.input.page.row.size property has been renamed to spark.cassandra.input.fetch.size_in_rows to make its units explicit.
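For illustration, a minimal sketch of how the renamed options could be set on a SparkConf. The option keys are the ones named above; the values 64 and 1000 are placeholders, not recommended defaults, and the app name and connection host are assumptions:

import org.apache.spark.SparkConf

// Illustrative values only; the keys are the renamed options from this commit.
val conf = new SparkConf()
  .setAppName("size-estimates-demo")                        // hypothetical app name
  .set("spark.cassandra.connection.host", "127.0.0.1")      // assumed local C* node
  .set("spark.cassandra.input.split.size_in_mb", "64")      // was: spark.cassandra.input.split.size
  .set("spark.cassandra.input.fetch.size_in_rows", "1000")  // was: spark.cassandra.input.page.row.size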
Showing 15 changed files with 274 additions and 185 deletions.
60 changes: 60 additions & 0 deletions
...tor/src/it/scala/com/datastax/spark/connector/rdd/partitioner/DataSizeEstimatesSpec.scala
@@ -0,0 +1,60 @@
package com.datastax.spark.connector.rdd.partitioner

import org.scalatest.{Matchers, FlatSpec}

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.embedded.EmbeddedCassandra
import com.datastax.spark.connector.rdd.partitioner.dht.LongToken
import com.datastax.spark.connector.testkit.SharedEmbeddedCassandra

class DataSizeEstimatesSpec extends FlatSpec with Matchers with SharedEmbeddedCassandra {

  useCassandraConfig(Seq("cassandra-default.yaml.template"))
  val conn = CassandraConnector(hosts = Set(EmbeddedCassandra.getHost(0)))

  val keyspaceName = "data_size_estimates"

  conn.withSessionDo { session =>
    session.execute(
      s"CREATE KEYSPACE IF NOT EXISTS $keyspaceName " +
        s"WITH REPLICATION = { 'class': 'SimpleStrategy', 'replication_factor': 1 }")
  }

  // TODO: enable this test once we upgrade to 2.1.5, which populates the size estimates table
  "DataSizeEstimates" should "fetch data size estimates for a known table" ignore {
    val tableName = "table1"
    conn.withSessionDo { session =>
      session.execute(
        s"CREATE TABLE IF NOT EXISTS $keyspaceName.$tableName(key int PRIMARY KEY, value VARCHAR)")
      for (i <- 1 to 10000)
        session.execute(
          s"INSERT INTO $keyspaceName.$tableName(key, value) VALUES (?, ?)",
          i.asInstanceOf[AnyRef],
          "value" + i)
    }

    val estimates = new DataSizeEstimates[Long, LongToken](conn, keyspaceName, tableName)
    estimates.partitionCount should be > 5000L
    estimates.partitionCount should be < 20000L
    estimates.dataSizeInBytes should be > 0L
  }

it should "should return zeroes for an empty table" in { | ||
val tableName = "table2" | ||
conn.withSessionDo { session => | ||
session.execute( | ||
s"CREATE TABLE IF NOT EXISTS $keyspaceName.$tableName(key int PRIMARY KEY, value VARCHAR)") | ||
} | ||
|
||
val estimates = new DataSizeEstimates[Long, LongToken](conn, keyspaceName, tableName) | ||
estimates.partitionCount shouldBe 0L | ||
estimates.dataSizeInBytes shouldBe 0L | ||
} | ||
|
||
it should "return zeroes for a non-existing table" in { | ||
val tableName = "table3" | ||
val estimates = new DataSizeEstimates[Long, LongToken](conn, keyspaceName, tableName) | ||
estimates.partitionCount shouldBe 0L | ||
estimates.dataSizeInBytes shouldBe 0L | ||
} | ||
} |
86 changes: 86 additions & 0 deletions
...ector/src/main/scala/com/datastax/spark/connector/rdd/partitioner/DataSizeEstimates.scala
@@ -0,0 +1,86 @@
package com.datastax.spark.connector.rdd.partitioner

import scala.collection.JavaConversions._

import org.apache.spark.Logging

import com.datastax.driver.core.exceptions.InvalidQueryException
import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.rdd.partitioner.dht.{TokenFactory, Token}


/** Estimates the amount of data in the Cassandra table.
  * Takes token range size estimates from the `system.size_estimates` table,
  * available since Cassandra 2.1.5. */
class DataSizeEstimates[V, T <: Token[V]](
    conn: CassandraConnector,
    keyspaceName: String,
    tableName: String)(
    implicit tokenFactory: TokenFactory[V, T])
  extends Logging {

  /** Represents a single `system.size_estimates` table row */
  private case class TokenRangeSizeEstimate(
      rangeStart: T,
      rangeEnd: T,
      partitionsCount: Long,
      meanPartitionSize: Long) {

    def ringFraction: Double =
      tokenFactory.ringFraction(rangeStart, rangeEnd)

    def totalSizeInBytes: Long =
      partitionsCount * meanPartitionSize
  }

  private lazy val tokenRanges: Seq[TokenRangeSizeEstimate] =
    conn.withSessionDo { session =>
      try {
        val rs = session.execute(
          "SELECT range_start, range_end, partitions_count, mean_partition_size " +
            "FROM system.size_estimates " +
            "WHERE keyspace_name = ? AND table_name = ?", keyspaceName, tableName)

        for (row <- rs.all()) yield TokenRangeSizeEstimate(
          rangeStart = tokenFactory.tokenFromString(row.getString("range_start")),
          rangeEnd = tokenFactory.tokenFromString(row.getString("range_end")),
          partitionsCount = row.getLong("partitions_count"),
          meanPartitionSize = row.getLong("mean_partition_size")
        )

        // The table may not contain the estimates yet if the data was just inserted and the
        // amount of data in the table was small. This is a very common situation during tests,
        // when we insert a few rows and immediately query them. However, for tiny data sets the
        // lack of size estimates is not a problem at all, because we don't want to split tiny
        // data anyway. Therefore, we're not issuing a warning if the result set was empty.
      }
      catch {
        case e: InvalidQueryException =>
          logError(
            s"Failed to fetch size estimates for $keyspaceName.$tableName from system.size_estimates " +
              s"table. The number of created Spark partitions may be inaccurate. " +
              s"Please make sure you use Cassandra 2.1.5 or newer.", e)
          Seq.empty
      }
    }

  private lazy val ringFraction =
    tokenRanges.map(_.ringFraction).sum

  /** Estimates the total number of partitions in a ring */
  lazy val partitionCount: Long = {
    val partitionsCount = tokenRanges.map(_.partitionsCount).sum
    val normalizedCount = (partitionsCount / ringFraction).toLong
    logDebug(s"Estimated partition count of $keyspaceName.$tableName is $normalizedCount")
    normalizedCount
  }

  /** Estimates the total amount of data in a table, assuming no replication. */
  lazy val dataSizeInBytes: Long = {
    val byteCount = tokenRanges.map(_.totalSizeInBytes).sum
    val normalizedCount = (byteCount / ringFraction).toLong
    logDebug(s"Estimated size of $keyspaceName.$tableName is $normalizedCount bytes")
    normalizedCount
  }
}
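For context, a hypothetical usage sketch of the class above. The host, keyspace, and table names are placeholders, and it assumes an implicit TokenFactory[Long, LongToken] is already in scope, as in DataSizeEstimatesSpec:

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.rdd.partitioner.DataSizeEstimates
import com.datastax.spark.connector.rdd.partitioner.dht.LongToken

// Placeholder coordinates; an implicit TokenFactory[Long, LongToken] must be in scope.
val conn = CassandraConnector(hosts = Set(java.net.InetAddress.getByName("127.0.0.1")))
val estimates = new DataSizeEstimates[Long, LongToken](conn, "some_keyspace", "some_table")

// With spark.cassandra.input.split.size_in_mb = 64, the expected number of splits
// is roughly the estimated table size divided by the split size (illustrative only;
// the connector's actual splitting logic lives elsewhere).
val splitSizeInMB = 64L
val splitCount = math.max(1L, estimates.dataSizeInBytes / (splitSizeInMB * 1024L * 1024L))

Note the normalization by ringFraction in partitionCount and dataSizeInBytes: the queried node only reports estimates for the token ranges it owns, so the summed counts cover a fraction of the ring and must be scaled up. For example, if the reported ranges cover 25% of the ring and their sizes sum to 1,000,000 bytes, the whole table is estimated at 4,000,000 bytes.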