Skip to content

Commit 3165ca7

Browse files
sunchaocloud-fan
authored and committed
[SPARK-33376][SQL] Remove the option of "sharesHadoopClasses" in Hive IsolatedClientLoader
### What changes were proposed in this pull request? This removes the `sharesHadoopClasses` flag from `IsolatedClientLoader` in Hive module. ### Why are the changes needed? Currently, when initializing `IsolatedClientLoader`, users can set the `sharesHadoopClasses` flag to decide whether the `HiveClient` created should share Hadoop classes with Spark itself or not. In the latter case, the client will only load Hadoop classes from the Hive dependencies. There are two reasons to remove this: 1. this feature is currently used in two cases: 1) unit tests, 2) when the Hadoop version defined in Maven can not be found when `spark.sql.hive.metastore.jars` is equal to "maven", which could be very rare. 2. when `sharesHadoopClasses` is false, Spark doesn't really only use Hadoop classes from Hive jars: we also download `hadoop-client` jar and put all the sub-module jars (e.g., `hadoop-common`, `hadoop-hdfs`) together with the Hive jars, and the Hadoop version used by `hadoop-client` is the same version used by Spark itself. As a result, we're mixing two versions of Hadoop jars in the classpath, which could potentially cause issues, especially considering that the default Hadoop version is already 3.2.0 while most Hive versions supported by the `IsolatedClientLoader` are still using Hadoop 2.x or even lower. ### Does this PR introduce _any_ user-facing change? This affects Spark users in one scenario: when `spark.sql.hive.metastore.jars` is set to `maven` AND the Hadoop version specified in pom file cannot be downloaded, currently the behavior is to switch to _not_ share Hadoop classes, but with the PR it will share Hadoop classes with Spark. ### How was this patch tested? Existing UTs. Closes #30284 from sunchao/SPARK-33376. Authored-by: Chao Sun <sunchao@apple.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
1 parent 34f5e7c commit 3165ca7

File tree

5 files changed

+9
-27
lines changed

5 files changed

+9
-27
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,9 @@ private[hive] object IsolatedClientLoader extends Logging {
5252
config: Map[String, String] = Map.empty,
5353
ivyPath: Option[String] = None,
5454
sharedPrefixes: Seq[String] = Seq.empty,
55-
barrierPrefixes: Seq[String] = Seq.empty,
56-
sharesHadoopClasses: Boolean = true): IsolatedClientLoader = synchronized {
55+
barrierPrefixes: Seq[String] = Seq.empty): IsolatedClientLoader = synchronized {
5756
val resolvedVersion = hiveVersion(hiveMetastoreVersion)
58-
// We will first try to share Hadoop classes. If we cannot resolve the Hadoop artifact
59-
// with the given version, we will use Hadoop 2.7 and then will not share Hadoop classes.
60-
var _sharesHadoopClasses = sharesHadoopClasses
57+
// We will use Hadoop 2.7 if we cannot resolve the Hadoop artifact.
6158
val files = if (resolvedVersions.contains((resolvedVersion, hadoopVersion))) {
6259
resolvedVersions((resolvedVersion, hadoopVersion))
6360
} else {
@@ -72,10 +69,8 @@ private[hive] object IsolatedClientLoader extends Logging {
7269
val fallbackVersion = "2.7.4"
7370
logWarning(s"Failed to resolve Hadoop artifacts for the version $hadoopVersion. We " +
7471
s"will change the hadoop version from $hadoopVersion to $fallbackVersion and try " +
75-
"again. Hadoop classes will not be shared between Spark and Hive metastore client. " +
76-
"It is recommended to set jars used by Hive metastore client through " +
72+
"again. It is recommended to set jars used by Hive metastore client through " +
7773
"spark.sql.hive.metastore.jars in the production environment.")
78-
_sharesHadoopClasses = false
7974
(downloadVersion(
8075
resolvedVersion, fallbackVersion, ivyPath, remoteRepos), fallbackVersion)
8176
}
@@ -89,7 +84,6 @@ private[hive] object IsolatedClientLoader extends Logging {
8984
execJars = files,
9085
hadoopConf = hadoopConf,
9186
config = config,
92-
sharesHadoopClasses = _sharesHadoopClasses,
9387
sharedPrefixes = sharedPrefixes,
9488
barrierPrefixes = barrierPrefixes)
9589
}
@@ -177,7 +171,6 @@ private[hive] object IsolatedClientLoader extends Logging {
177171
* @param config A set of options that will be added to the HiveConf of the constructed client.
178172
* @param isolationOn When true, custom versions of barrier classes will be constructed. Must be
179173
* true unless loading the version of hive that is on Spark's classloader.
180-
* @param sharesHadoopClasses When true, we will share Hadoop classes between Spark and
181174
* @param baseClassLoader The spark classloader that is used to load shared classes.
182175
*/
183176
private[hive] class IsolatedClientLoader(
@@ -187,7 +180,6 @@ private[hive] class IsolatedClientLoader(
187180
val execJars: Seq[URL] = Seq.empty,
188181
val config: Map[String, String] = Map.empty,
189182
val isolationOn: Boolean = true,
190-
val sharesHadoopClasses: Boolean = true,
191183
val baseClassLoader: ClassLoader = Thread.currentThread().getContextClassLoader,
192184
val sharedPrefixes: Seq[String] = Seq.empty,
193185
val barrierPrefixes: Seq[String] = Seq.empty)
@@ -204,7 +196,7 @@ private[hive] class IsolatedClientLoader(
204196
name.startsWith("org.apache.log4j") || // log4j1.x
205197
name.startsWith("org.apache.logging.log4j") || // log4j2
206198
name.startsWith("org.apache.spark.") ||
207-
(sharesHadoopClasses && isHadoopClass) ||
199+
isHadoopClass ||
208200
name.startsWith("scala.") ||
209201
(name.startsWith("com.google") && !name.startsWith("com.google.cloud")) ||
210202
name.startsWith("java.") ||

sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,7 @@ class HadoopVersionInfoSuite extends SparkFunSuite {
4949
sparkConf = new SparkConf(),
5050
hadoopConf = hadoopConf,
5151
config = HiveClientBuilder.buildConf(Map.empty),
52-
ivyPath = Some(ivyPath.getCanonicalPath),
53-
sharesHadoopClasses = true)
52+
ivyPath = Some(ivyPath.getCanonicalPath))
5453
val jars = client.classLoader.getParent.asInstanceOf[URLClassLoader].getURLs
5554
.map(u => new File(u.toURI))
5655
// Drop all Hadoop jars to use the existing Hadoop jars on the classpath

sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientBuilder.scala

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,13 @@ private[client] object HiveClientBuilder {
4646
def buildClient(
4747
version: String,
4848
hadoopConf: Configuration,
49-
extraConf: Map[String, String] = Map.empty,
50-
sharesHadoopClasses: Boolean = true): HiveClient = {
49+
extraConf: Map[String, String] = Map.empty): HiveClient = {
5150
IsolatedClientLoader.forVersion(
5251
hiveMetastoreVersion = version,
5352
hadoopVersion = VersionInfo.getVersion,
5453
sparkConf = new SparkConf(),
5554
hadoopConf = hadoopConf,
5655
config = buildConf(extraConf),
57-
ivyPath = ivyPath,
58-
sharesHadoopClasses = sharesHadoopClasses).createClient()
56+
ivyPath = ivyPath).createClient()
5957
}
6058
}

sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,6 @@ class HivePartitionFilteringSuite(version: String)
272272
day1 :: day2 :: Nil)
273273
}
274274

275-
test("create client with sharesHadoopClasses = false") {
276-
buildClient(new Configuration(), sharesHadoopClasses = false)
277-
}
278-
279275
private def testMetastorePartitionFiltering(
280276
filterExpr: Expression,
281277
expectedDs: Seq[Int],

sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveVersionSuite.scala

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,7 @@ private[client] abstract class HiveVersionSuite(version: String) extends SparkFu
2828
override protected val enableAutoThreadAudit = false
2929
protected var client: HiveClient = null
3030

31-
protected def buildClient(
32-
hadoopConf: Configuration,
33-
sharesHadoopClasses: Boolean = true): HiveClient = {
31+
protected def buildClient(hadoopConf: Configuration): HiveClient = {
3432
// Hive changed the default of datanucleus.schema.autoCreateAll from true to false and
3533
// hive.metastore.schema.verification from false to true since 2.0
3634
// For details, see the JIRA HIVE-6113 and HIVE-12463
@@ -46,8 +44,7 @@ private[client] abstract class HiveVersionSuite(version: String) extends SparkFu
4644
HiveClientBuilder.buildClient(
4745
version,
4846
hadoopConf,
49-
HiveUtils.formatTimeVarsForHiveClient(hadoopConf),
50-
sharesHadoopClasses = sharesHadoopClasses)
47+
HiveUtils.formatTimeVarsForHiveClient(hadoopConf))
5148
}
5249

5350
override def suiteName: String = s"${super.suiteName}($version)"

0 commit comments

Comments
 (0)