diff --git a/data/books.csv b/data/books.csv new file mode 100644 index 0000000..85dad38 --- /dev/null +++ b/data/books.csv @@ -0,0 +1,12 @@ +id,authorId,title,releaseDate,link +1,1,Fantastic Beasts and Where to Find Them: The Original Screenplay,11/18/16,http://amzn.to/2kup94P +2,1,"Harry Potter and the Sorcerer's Stone: The Illustrated Edition (Harry Potter, Book 1)",10/6/15,http://amzn.to/2l2lSwP +3,1,"The Tales of Beedle the Bard, Standard Edition (Harry Potter)",12/4/08,http://amzn.to/2kYezqr +4,1,"Harry Potter and the Chamber of Secrets: The Illustrated Edition (Harry Potter, Book 2)",10/4/16,http://amzn.to/2kYhL5n +5,2,"Informix 12.10 on Mac 10.12 with a dash of Java 8: The Tale of the Apple, the Coffee, and a Great Database",4/23/17,http://amzn.to/2i3mthT +6,2,"Development Tools in 2006: any Room for a 4GL-style Language?: An independent study by Jean Georges Perrin, IIUG Board Member",12/28/16,http://amzn.to/2vBxOe1 +7,3,Adventures of Huckleberry Finn,5/26/94,http://amzn.to/2wOeOav +8,3,A Connecticut Yankee in King Arthur's Court,6/17/17,http://amzn.to/2x1NuoD +10,4,Jacques le Fataliste,3/1/00,http://amzn.to/2uZj2KA +11,4,Diderot Encyclopedia: The Complete Illustrations 1762-1777,,http://amzn.to/2i2zo3I +12,,A Woman in Berlin,7/11/06,http://amzn.to/2i472WZ diff --git a/pom.xml b/pom.xml index 594b8c4..8504286 100644 --- a/pom.xml +++ b/pom.xml @@ -1,5 +1,6 @@ - + 4.0.0 net.jgp.books @@ -7,12 +8,58 @@ 1.0.0-SNAPSHOT jar - + UTF-8 1.8 + 2.11 + 2.2.0 + + + + org.apache.spark + spark-core_${scala.version} + ${spark.version} + + + + org.apache.spark + spark-sql_${scala.version} + ${spark.version} + + + org.slf4j + slf4j-simple + + + + + + org.apache.spark + spark-mllib_${scala.version} + ${spark.version} + + + org.slf4j + slf4j-log4j12 + + + org.slf4j + slf4j-simple + + + + + + junit + junit + 4.11 + test + + + @@ -26,12 +73,6 @@ - - - junit - junit - 4.11 - test - - + + diff --git a/src/main/java/net/jgp/books/sparkWithJava/ch01/CsvToDataframeApp.java 
b/src/main/java/net/jgp/books/sparkWithJava/ch01/CsvToDataframeApp.java
new file mode 100644
index 0000000..a0b75e9
--- /dev/null
+++ b/src/main/java/net/jgp/books/sparkWithJava/ch01/CsvToDataframeApp.java
@@ -0,0 +1,52 @@
+package net.jgp.books.sparkWithJava.ch01;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+
+/**
+ * CSV ingestion in a dataframe: starts a local Spark session, loads
+ * data/books.csv (first line used as header) and prints the first rows.
+ *
+ * @author jperrin
+ */
+public class CsvToDataframeApp {
+
+  /**
+   * Application entry point.
+   *
+   * @param args
+   *          command-line arguments, ignored
+   */
+  public static void main(String[] args) {
+    CsvToDataframeApp app = new CsvToDataframeApp();
+    app.start();
+  }
+
+  /**
+   * The worker code: creates the session, ingests the CSV file, shows the
+   * dataframe, and stops the session whether or not ingestion succeeded.
+   */
+  private void start() {
+    // Creates a session on a local master
+    SparkSession spark = SparkSession.builder()
+        .appName("CSV to Dataset")
+        .master("local")
+        .getOrCreate();
+
+    try {
+      // Reads a CSV file with header, called books.csv, stores it in a
+      // dataframe (a Dataset of Row)
+      Dataset<Row> df = spark.read().format("csv")
+          .option("header", "true")
+          .load("data/books.csv");
+
+      // Shows at most 20 rows from the dataframe
+      df.show();
+    } finally {
+      // Stops the session so the local Spark context is released even when
+      // reading or displaying the file throws
+      spark.stop();
+    }
+  }
+}