Skip to content

Feature/scala code/ch07 biman #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 17, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
multi line json
  • Loading branch information
bimanmandal committed Jan 17, 2022
commit 8d65ee446c1b8e4d751f275193fcf73da9eab875
32 changes: 32 additions & 0 deletions code/chap07/scala/data/sample_multi_line.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}},
{"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}},
{
"name": "bob",
"id": 300,
"scores": [
3,
4,
6,
9
],
"dict": {
"key": "value33",
"key2": "value44"
}
},
{
"name": "bob",
"id": 400,
"scores": [
3,
5,
6,
9
],
"dict": {
"key": "value55",
"key2": "value66"
}
}
]
4 changes: 4 additions & 0 deletions code/chap07/scala/data/sample_single_line.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}}
{"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}}
{"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}}
{"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the multi-line JSON reader example against the bundled sample file.
# The JSON path is forwarded to the Scala main class as its first argument.
INPUT_PATH="data/sample_multi_line.json"
# Gradle -PmainClass selects which main() to run; --args passes program arguments.
./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderMultiLine "--args=$INPUT_PATH"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the single-line (one JSON object per line) reader example against the
# bundled sample file; the path is forwarded as the program's first argument.
INPUT_PATH="data/sample_single_line.json"
# Gradle -PmainClass selects which main() to run; --args passes program arguments.
./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderSingleLine "--args=$INPUT_PATH"
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package org.data.algorithms.spark.ch07

import org.apache.spark.sql.SparkSession

/**
*-----------------------------------------------------
* Create a DataFrame from a JSON file
* Input: JSON File
* In this example, JSON object occupies multiple lines.
* Then we must enable multi-line mode for Spark to load
* the JSON file. Files will be loaded as a whole entity
* and cannot be split.
*------------------------------------------------------
* Input Parameters:
* a JSON file
*-------------------------------------------------------
*
* @author Biman Mandal
*-------------------------------------------------------
*/
object DatasourceJSONReaderMultiLine {

  /**
   * Prints the raw contents of the given file to stdout for debugging.
   *
   * The underlying `Source` is closed in a `finally` block — the original
   * version leaked the file handle by never closing it.
   *
   * @param fileName path of the file to dump
   */
  def debugFile(fileName: String): Unit = {
    val source = scala.io.Source.fromFile(fileName)
    try println(source.mkString)
    finally source.close()
  }

  /**
   * Entry point: reads a multi-line JSON file into a DataFrame and prints
   * its count, contents, and schema.
   *
   * @param args args(0) must be the path to a JSON file
   */
  def main(args: Array[String]): Unit = {
    // Fail with a clear usage message instead of an
    // ArrayIndexOutOfBoundsException when no argument is supplied.
    val inputPath = args.headOption.getOrElse(
      sys.error("Usage: DatasourceJSONReaderMultiLine <json-file>"))

    // create an instance of SparkSession
    val spark = SparkSession.builder.master("local[*]").getOrCreate()

    println("input path : " + inputPath)
    debugFile(inputPath)

    /**
     *=====================================
     * Create a DataFrame from a given input JSON file
     *=====================================
     *
     * Spark enables us to read multi-line JSON files
     * and create a new DataFrame.
     *
     * Since each JSON object spans several lines, multi-line mode must be
     * enabled; the file is then loaded as a whole and cannot be split.
     */
    // Use the public "json" data source instead of the internal
    // implementation class (org.apache.spark.sql.execution.datasources.v2
    // .json.JsonDataSourceV2), which is not part of Spark's public API
    // and may change or move between releases.
    val df = spark.read
      .option("multiline", "true")
      .json(inputPath)

    println("df.count() = " + df.count())

    println("df = " + df.collect().mkString("Array(", ", ", ")"))

    df.show(10, truncate = false)

    df.printSchema()

    // done!
    spark.stop()
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package org.data.algorithms.spark.ch07

import org.apache.spark.sql.SparkSession

/**
*-----------------------------------------------------
* Create a DataFrame from a JSON file
* Input: JSON File
* In this example, there is one JSON object per line.
*------------------------------------------------------
* Input Parameters:
* a JSON file
*-------------------------------------------------------
*
* @author Biman Mandal
*-------------------------------------------------------
*/
object DatasourceJSONReaderSingleLine {

  /**
   * Prints the raw contents of the given file to stdout for debugging.
   *
   * The underlying `Source` is closed in a `finally` block — the original
   * version leaked the file handle by never closing it.
   *
   * @param fileName path of the file to dump
   */
  def debugFile(fileName: String): Unit = {
    val source = scala.io.Source.fromFile(fileName)
    try println(source.mkString)
    finally source.close()
  }

  /**
   * Entry point: reads a JSON file with one JSON object per line into a
   * DataFrame and prints its count, contents, and schema.
   *
   * @param args args(0) must be the path to a JSON file
   */
  def main(args: Array[String]): Unit = {
    // Fail with a clear usage message instead of an
    // ArrayIndexOutOfBoundsException when no argument is supplied.
    val inputPath = args.headOption.getOrElse(
      sys.error("Usage: DatasourceJSONReaderSingleLine <json-file>"))

    // create an instance of SparkSession
    val spark = SparkSession.builder.master("local[*]").getOrCreate()

    println("input path : " + inputPath)
    debugFile(inputPath)

    /**
     *=====================================
     * Create a DataFrame from a given input JSON file
     *=====================================
     *
     * Spark enables us to read JSON files (one object per line, the
     * default "JSON Lines" mode) and create a new DataFrame.
     */
    // Use the public "json" data source instead of the internal
    // implementation class (org.apache.spark.sql.execution.datasources.v2
    // .json.JsonDataSourceV2), which is not part of Spark's public API
    // and may change or move between releases.
    val df = spark.read
      .json(inputPath)

    println("df.count() = " + df.count())

    println("df = " + df.collect().mkString("Array(", ", ", ")"))

    df.show(10, truncate = false)

    df.printSchema()

    // done!
    spark.stop()
  }

}