Skip to content

Feature/scala code/ch07 biman #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jan 17, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
multi line json
  • Loading branch information
bimanmandal committed Jan 17, 2022
commit 8d65ee446c1b8e4d751f275193fcf73da9eab875
32 changes: 32 additions & 0 deletions code/chap07/scala/data/sample_multi_line.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
[
{"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}},
{"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}},
{
"name": "bob",
"id": 300,
"scores": [
3,
4,
6,
9
],
"dict": {
"key": "value33",
"key2": "value44"
}
},
{
"name": "bob",
"id": 400,
"scores": [
3,
5,
6,
9
],
"dict": {
"key": "value55",
"key2": "value66"
}
}
]
4 changes: 4 additions & 0 deletions code/chap07/scala/data/sample_single_line.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}}
{"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}}
{"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}}
{"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the multi-line JSON reader example against the bundled sample file.
# The JSON path is forwarded to the Scala main class as its first argument.
INPUT_PATH="data/sample_multi_line.json"
# Gradle -PmainClass selects which main() to run; --args passes program arguments.
./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderMultiLine "--args=$INPUT_PATH"
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Runs the single-line (one JSON object per line) reader example against the
# bundled sample file; the path is forwarded as the program's first argument.
INPUT_PATH="data/sample_single_line.json"
# Gradle -PmainClass selects which main() to run; --args passes program arguments.
./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderSingleLine "--args=$INPUT_PATH"
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package org.data.algorithms.spark.ch07

import org.apache.spark.sql.SparkSession

/**
*-----------------------------------------------------
* Create a DataFrame from a JSON file
* Input: JSON File
* In this example, JSON object occupies multiple lines.
* Then we must enable multi-line mode for Spark to load
* the JSON file. Files will be loaded as a whole entity
* and cannot be split.
*------------------------------------------------------
* Input Parameters:
* a JSON file
*-------------------------------------------------------
*
* @author Biman Mandal
*-------------------------------------------------------
*/
object DatasourceJSONReaderMultiLine {

  /**
   * Prints the raw contents of the given file to stdout for debugging.
   *
   * The underlying `Source` is closed in a `finally` block — the original
   * version leaked the file handle by never closing it.
   *
   * @param fileName path of the file to dump
   */
  def debugFile(fileName: String): Unit = {
    val source = scala.io.Source.fromFile(fileName)
    try println(source.mkString)
    finally source.close()
  }

  /**
   * Entry point: reads a multi-line JSON file into a DataFrame and prints
   * its count, contents, and schema.
   *
   * @param args args(0) must be the path to a JSON file
   */
  def main(args: Array[String]): Unit = {
    // Fail with a clear usage message instead of an
    // ArrayIndexOutOfBoundsException when no argument is supplied.
    val inputPath = args.headOption.getOrElse(
      sys.error("Usage: DatasourceJSONReaderMultiLine <json-file>"))

    // create an instance of SparkSession
    val spark = SparkSession.builder.master("local[*]").getOrCreate()

    println("input path : " + inputPath)
    debugFile(inputPath)

    /**
     *=====================================
     * Create a DataFrame from a given input JSON file
     *=====================================
     *
     * Spark enables us to read multi-line JSON files
     * and create a new DataFrame.
     *
     * Since each JSON object spans several lines, multi-line mode must be
     * enabled; the file is then loaded as a whole and cannot be split.
     */
    // Use the public "json" data source instead of the internal
    // implementation class (org.apache.spark.sql.execution.datasources.v2
    // .json.JsonDataSourceV2), which is not part of Spark's public API
    // and may change or move between releases.
    val df = spark.read
      .option("multiline", "true")
      .json(inputPath)

    println("df.count() = " + df.count())

    println("df = " + df.collect().mkString("Array(", ", ", ")"))

    df.show(10, truncate = false)

    df.printSchema()

    // done!
    spark.stop()
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package org.data.algorithms.spark.ch07

import org.apache.spark.sql.SparkSession

/**
*-----------------------------------------------------
* Create a DataFrame from a JSON file
* Input: JSON File
* In this example, there is one JSON object per line.
*------------------------------------------------------
* Input Parameters:
* a JSON file
*-------------------------------------------------------
*
* @author Biman Mandal
*-------------------------------------------------------
*/
object DatasourceJSONReaderSingleLine {

  /**
   * Prints the raw contents of the given file to stdout for debugging.
   *
   * The underlying `Source` is closed in a `finally` block — the original
   * version leaked the file handle by never closing it.
   *
   * @param fileName path of the file to dump
   */
  def debugFile(fileName: String): Unit = {
    val source = scala.io.Source.fromFile(fileName)
    try println(source.mkString)
    finally source.close()
  }

  /**
   * Entry point: reads a JSON file with one JSON object per line into a
   * DataFrame and prints its count, contents, and schema.
   *
   * @param args args(0) must be the path to a JSON file
   */
  def main(args: Array[String]): Unit = {
    // Fail with a clear usage message instead of an
    // ArrayIndexOutOfBoundsException when no argument is supplied.
    val inputPath = args.headOption.getOrElse(
      sys.error("Usage: DatasourceJSONReaderSingleLine <json-file>"))

    // create an instance of SparkSession
    val spark = SparkSession.builder.master("local[*]").getOrCreate()

    println("input path : " + inputPath)
    debugFile(inputPath)

    /**
     *=====================================
     * Create a DataFrame from a given input JSON file
     *=====================================
     *
     * Spark enables us to read JSON files (one object per line, the
     * default "JSON Lines" mode) and create a new DataFrame.
     */
    // Use the public "json" data source instead of the internal
    // implementation class (org.apache.spark.sql.execution.datasources.v2
    // .json.JsonDataSourceV2), which is not part of Spark's public API
    // and may change or move between releases.
    val df = spark.read
      .json(inputPath)

    println("df.count() = " + df.count())

    println("df = " + df.collect().mkString("Array(", ", ", ")"))

    df.show(10, truncate = false)

    df.printSchema()

    // done!
    spark.stop()
  }

}