package com.spark.sql

+ import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

/**
- * @fileName: SparkHive.java
- * @description: SparkHive.java class description
- * @author : by echo huang
- * @date: 2020-06-29 10:27
- */
+ * @fileName: SparkHive.java
+ * @description: SparkHive.java class description
+ * @author : by echo huang
+ * @date: 2020-06-29 10:27
+ */
object SparkHive extends App {
  override def main(args: Array[String]): Unit = {
-    testYarn()
+    // testYarn()
    // val sparkBuilder = SparkSession.builder()
    //   .master("local[*]")
    //   .appName("hive")
-    //   .config( "spark.driver.memory", "4g")
+    //   .config("spark.driver.memory", "4g")
    //   .config("spark.num.executors", "4")
    //   .config("spark.executor.memory", "2g")
    //   .config("spark.executor.cores", "4")
@@ -26,6 +27,41 @@ object SparkHive extends App {
    // val frame: DataFrame = spark.sql("select * from forchange_prod.user_orders")
    // frame.show(20)
    // spark.close()
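+    // Build a local SparkSession with Hive support; the settings below tune shuffle behaviour,
+    // Hive dynamic partitioning, and executor resources before querying the warehouse.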
+    val spark: SparkSession = SparkSession.builder().master("local[*]")
+      .appName("hive")
+      .config("spark.shuffle.manager", "sort")
+      .config("hive.exec.dynamic.partition", "true")
+      .config("hive.exec.dynamic.partition.mode", "nonstrict")
+      .config("hive.exec.max.dynamic.partitions", 2048)
+      .config("spark.sql.files.maxPartitionBytes", 134217728) // 128 MB max per file-read partition
+      .config("spark.sql.shuffle.partitions", 200)
+      .config("spark.sql.inMemoryColumnarStorage.compressed", value = true)
+      // Whether to enable the bypass mechanism: if the partition count is below this threshold,
+      // a hash-style shuffle is used directly, provided there is no map-side pre-aggregation.
+      .config("spark.shuffle.sort.bypassMergeThreshold", 300)
+      .config("spark.shuffle.compress", value = true)
+      .config("spark.shuffle.file.buffer", "512k")
+      .config("spark.shuffle.io.numConnectionsPerPeer", 5)
+      .config("spark.shuffle.spill.compress", value = true)
+      .config("spark.io.compression.codec", "snappy")
+      .config("spark.driver.memory", "1g")
+      .config("spark.num.executors", "3")
+      .config("spark.executor.memory", "2g")
+      .config("spark.executor.cores", "3")
+      .config("spark.default.parallelism", "10")
+      .config("spark.mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
+      .config("spark.sql.parquet.writeLegacyFormat", "true")
+      .enableHiveSupport()
+      .getOrCreate()
+
+    // spark.sql("show databases").show()
+
+    spark.sql("use wh_dwd")
+    spark.sql("show tables").show()
+
+    val startLog: DataFrame = spark.table("dwd_start_log")
+
+    startLog.show()
+    spark.stop()
  }

@@ -47,7 +83,7 @@ object SparkHive extends App {
      .set("spark.driver.host", "192.168.6.35")
      // Set the path to the jar; other dependency jars can be added here, comma-separated
      .setJars(List(""
-      ))
+      ))
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val input = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 0))
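    // Note: spark.serializer is set to KryoSerializer above, which is typically more compact
    // and faster than the default Java serialization when shuffling the RDD built here.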