diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index 5be91f9..0000000 --- a/.gitattributes +++ /dev/null @@ -1 +0,0 @@ -*.ipynb linguist-vendored diff --git a/.gitignore b/.gitignore index 72d1333..a201c80 100644 --- a/.gitignore +++ b/.gitignore @@ -1,58 +1,8 @@ -.ensime -.ensime_cache/ -.ipynb_checkpoints/ -_site/ +.sass-cache/ .jekyll-metadata -.settings/ -target/ -.metadata -bin/ -tmp/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.settings/ -.loadpath - -# Eclipse Core -.project - -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# PyDev specific (Python IDE for Eclipse) -*.pydevproject - -# CDT-specific (C/C++ Development Tooling) -.cproject - -# JDT-specific (Eclipse Java Development Tools) -.classpath - -# Java annotation processor (APT) -.factorypath - -# PDT-specific (PHP Development Tools) -.buildpath - -# sbteclipse plugin -.target - -# Tern plugin -.tern-project - -# TeXlipse plugin -.texlipse - -# STS (Spring Tool Suite) -.springBeans +_site/ +resources/target/ +GemFile.lock -# Code Recommenders -.recommenders -/target +.DS_Store diff --git a/BingSiteAuth.xml b/BingSiteAuth.xml new file mode 100644 index 0000000..4559943 --- /dev/null +++ b/BingSiteAuth.xml @@ -0,0 +1,4 @@ + + + AA6ADF247B8500740F83E546D994033C + \ No newline at end of file diff --git a/CNAME b/CNAME new file mode 100644 index 0000000..c02e0aa --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +vishnuviswanath.com diff --git a/Flume/.gitignore b/Flume/.gitignore deleted file mode 100644 index 5708a8d..0000000 --- a/Flume/.gitignore +++ /dev/null @@ -1,22 +0,0 @@ -.classpath -.project -.settings -target - -*.class - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - -#intellij project file -*.iml -.idea/ -.cache-main - -# Package Files # -*.jar -*.war -*.ear - -# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* diff --git a/Flume/README.md b/Flume/README.md deleted file mode 100644 index 194f30d..0000000 --- a/Flume/README.md +++ /dev/null @@ -1,14 +0,0 @@ -### Flume Custom TCP Source - -CustomFlumeTCPSource.java is custom flume source which listens to a port and sends the content to the configured channel. The custom source adds the client information to the header of message before sending to the channel. -It takes two configurations - -1. port - the port to listen to -2. buffer - how often should the events be send to the channel - -#### Sample configuration -agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource -agent.sources.CustomTcpSource.port = 4443 -agent.sources.CustomTcpSource.buffer = 1 - - diff --git a/Flume/pom.xml b/Flume/pom.xml deleted file mode 100644 index fc81f03..0000000 --- a/Flume/pom.xml +++ /dev/null @@ -1,26 +0,0 @@ - - 4.0.0 - com.vishnu - Flume - 0.0.1-SNAPSHOT - - src - - - maven-compiler-plugin - 3.3 - - 1.8 - 1.8 - - - - - - - org.apache.flume - flume-ng-core - 1.6.0 - - - \ No newline at end of file diff --git a/Flume/src/com/vishnu/flume/config/flume-conf.properties b/Flume/src/com/vishnu/flume/config/flume-conf.properties deleted file mode 100644 index 96ac65b..0000000 --- a/Flume/src/com/vishnu/flume/config/flume-conf.properties +++ /dev/null @@ -1,49 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# The configuration file needs to define the sources, -# the channels and the sinks. -# Sources, channels and sinks are defined per agent, -# in this case called 'agent' - -agent.sources = CustomTcpSource -agent.channels = memoryChannel -agent.sinks = loggerSink - -# For each one of the sources, the type is defined -agent.sources.CustomTcpSource.type = com.vishnu.flume.source.CustomFlumeTCPSource -agent.sources.CustomTcpSource.port = 4443 -agent.sources.CustomTcpSource.buffer = 1 - - -# The channel can be defined as follows. -agent.sources.CustomTcpSource.channels = memoryChannel - -# Each sink's type must be defined -agent.sinks.loggerSink.type = logger - -#Specify the channel the sink should use -agent.sinks.loggerSink.channel = memoryChannel - -# Each channel's type is defined. -agent.channels.memoryChannel.type = memory - -# Other config values specific to each type of channel(sink or source) -# can be defined as well -# In this case, it specifies the capacity of the memory channel -agent.channels.memoryChannel.capacity = 100 diff --git a/Flume/src/com/vishnu/flume/config/flume-conf_spark.properties b/Flume/src/com/vishnu/flume/config/flume-conf_spark.properties deleted file mode 100644 index 56d2bd5..0000000 --- a/Flume/src/com/vishnu/flume/config/flume-conf_spark.properties +++ /dev/null @@ -1,33 +0,0 @@ -# Flume configuration to listen to netcat host and port, -# sink is of the type avro -# Created for testing spark streaming from flume -# @author vishnu viswanath - -agent.sources = Netcat -agent.channels = memoryChannel -agent.sinks = avroSink -#agent.sinks = loggerSink - -# For each one of the sources, the type is defined -agent.sources.Netcat.type = netcat -agent.sources.Netcat.bind = localhost -agent.sources.Netcat.port = 6666 -agent.sources.Netcat.channels = memoryChannel - -# avro sink for spark -agent.sinks.avroSink.type = avro -agent.sinks.avroSink.channel = memoryChannel -agent.sinks.avroSink.hostname = localhost -agent.sinks.avroSink.port = 4444 - -#logger sink -#agent.sinks.loggerSink.type = logger -#agent.sinks.loggerSink.channel = memoryChannel - -# Each channel's type is defined. 
-agent.channels.memoryChannel.type = memory - -# Other config values specific to each type of channel(sink or source) -# can be defined as well -# In this case, it specifies the capacity of the memory channel -agent.channels.memoryChannel.capacity = 100 diff --git a/Flume/src/com/vishnu/flume/source/CustomFlumeTCPSource.java b/Flume/src/com/vishnu/flume/source/CustomFlumeTCPSource.java deleted file mode 100644 index 4a8e385..0000000 --- a/Flume/src/com/vishnu/flume/source/CustomFlumeTCPSource.java +++ /dev/null @@ -1,94 +0,0 @@ -package com.vishnu.flume.source; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.ServerSocket; -import java.net.Socket; -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.flume.Context; -import org.apache.flume.Event; -import org.apache.flume.EventDrivenSource; -import org.apache.flume.channel.ChannelProcessor; -import org.apache.flume.conf.Configurable; -import org.apache.flume.event.EventBuilder; -import org.apache.flume.source.AbstractSource; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - - -/** - * A custom flume source to connect listen to a TCP port - */ -public class CustomFlumeTCPSource extends AbstractSource - implements EventDrivenSource, Configurable { - - private static final Logger logger = - LoggerFactory.getLogger(CustomFlumeTCPSource.class); - private int port; - private int buffer; - private ServerSocket serverSocket; - private BufferedReader receiveBuffer; - private Socket clientSocket; - /** - * Configurations for the TCP source - */ - @Override - public void configure(Context context) { - port = context.getInteger("port"); - buffer = context.getInteger("buffer"); - - try{ - serverSocket = new ServerSocket(port); - logger.info("FlumeTCP source initialized"); - }catch(Exception e) { - logger.error("FlumeTCP source failed to initialize"); - } - } - - @Override - public void start() { - try { - clientSocket = serverSocket.accept(); - receiveBuffer = new BufferedReader(new InputStreamReader(clientSocket.getInputStream())); - logger.info("Connection established with client : " + clientSocket.getRemoteSocketAddress()); - final ChannelProcessor channel = getChannelProcessor(); - final Map headers = new HashMap(); - headers.put("hostname", clientSocket.getRemoteSocketAddress().toString()); - String line = ""; - List events = new ArrayList(); - - while ((line = receiveBuffer.readLine()) != null) { - Event event = EventBuilder.withBody( - line, Charset.defaultCharset(),headers); - - logger.info("Event created"); - events.add(event); - if (events.size() == buffer) { - channel.processEventBatch(events); - } - } - } catch (Exception e) { - - } - super.start(); - } - - - @Override - public void stop() { - logger.info("Closing the connection"); - try { - clientSocket.close(); - serverSocket.close(); - } catch (IOException e) { - e.printStackTrace(); - } - super.stop(); - } -} \ No newline at end of file diff --git a/Flume/src/com/vishnu/tcp/client/TcpClient.java b/Flume/src/com/vishnu/tcp/client/TcpClient.java deleted file mode 100644 index 3a676f1..0000000 --- a/Flume/src/com/vishnu/tcp/client/TcpClient.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.vishnu.tcp.client; - -import java.io.BufferedReader; -import java.io.DataOutputStream; -import java.io.InputStreamReader; -import java.net.Socket; - -public class TcpClient { - - public static void main(String[] args) throws 
Exception { - String sentence; - String modifiedSentence; - BufferedReader inFromUser = new BufferedReader(new InputStreamReader(System.in)); - Socket clientSocket = new Socket("localhost", 4443); - DataOutputStream outToServer = new DataOutputStream(clientSocket.getOutputStream()); - outToServer.writeBytes("test message" + '\n'); - clientSocket.close(); - } -} diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..44b505a --- /dev/null +++ b/Gemfile @@ -0,0 +1,3 @@ +source 'https://rubygems.org' +gem 'github-pages', group: :jekyll_plugins +gem 'jekyll-seo-tag' \ No newline at end of file diff --git a/KafkaStreams/.gitignore b/KafkaStreams/.gitignore deleted file mode 100644 index f32e31a..0000000 --- a/KafkaStreams/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -.idea/ -.DS_Store diff --git a/KafkaStreams/README.md b/KafkaStreams/README.md deleted file mode 100644 index c7e7060..0000000 --- a/KafkaStreams/README.md +++ /dev/null @@ -1,3 +0,0 @@ -**KafkaStreams** is a stream processing library on top of Apache Kafka. - -This project contains basic examples of how to create a Kafka Stream application in Scala. For more detailed explaination visit the [blog post](http://vishnuviswanath.com/hello-kafka-streams.html). diff --git a/KafkaStreams/build.sbt b/KafkaStreams/build.sbt deleted file mode 100644 index f68b500..0000000 --- a/KafkaStreams/build.sbt +++ /dev/null @@ -1,29 +0,0 @@ -name := "KafkaStreams" - -version := "1.0" - -scalaVersion := "2.11.8" - -organization := "com.vishnuviswanath" - -val kafkaStreamsVersion = "0.10.2.0" - -val kafkaDependencies = Seq( - "org.apache.kafka" % "kafka-streams" % kafkaStreamsVersion) - -val otherDependencies = Seq( - "com.esotericsoftware.kryo" % "kryo" % "2.24.0" -) - -val main = "com.vishnuviswanath.kafka.streams.KafkaStreamsExample" -mainClass in (Compile, run) := Some(main) -mainClass in (Compile, packageBin) := Some(main) - -lazy val root = (project in file(".")). - settings( - libraryDependencies ++= kafkaDependencies, - libraryDependencies ++= otherDependencies - ) - - - \ No newline at end of file diff --git a/KafkaStreams/project/assembly.sbt b/KafkaStreams/project/assembly.sbt deleted file mode 100644 index e69de29..0000000 diff --git a/KafkaStreams/project/build.properties b/KafkaStreams/project/build.properties deleted file mode 100644 index e0cbc71..0000000 --- a/KafkaStreams/project/build.properties +++ /dev/null @@ -1 +0,0 @@ -sbt.version = 0.13.13 \ No newline at end of file diff --git a/KafkaStreams/project/plugins.sbt b/KafkaStreams/project/plugins.sbt deleted file mode 100644 index efff725..0000000 --- a/KafkaStreams/project/plugins.sbt +++ /dev/null @@ -1,2 +0,0 @@ -logLevel := Level.Warn -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") \ No newline at end of file diff --git a/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/ClimateLogStream.scala b/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/ClimateLogStream.scala deleted file mode 100644 index f8f393d..0000000 --- a/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/ClimateLogStream.scala +++ /dev/null @@ -1,196 +0,0 @@ -package com.vishnuviswanath.kafka.streams - -/** - * Created by vviswanath on 4/22/17. 
- * - * A KafkaStreams example that demonstrates use of flatMapValues, branch, predicate, selectKey, through, join and to - * Also demonstrates the use of custom Serializer using Kryo - * - * flatMapValues → applies a flatMap function on the values - * branch → creates branches on the input stream based on the predicates given (applies each predicate to the element till the first sucesss and assigns it to that stream) - * predicate → predicate used for branching - * through → saves the stream to a kafka topic and reads back as KStream - * join → joins two streams - * to → saves to a kafka topic - */ -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import java.util.Properties - -import org.apache.kafka.clients.consumer.ConsumerConfig -import org.apache.kafka.streams._ -import org.apache.kafka.streams.kstream._ - -import collection.JavaConverters._ -import java.lang.Iterable -import java.util - -import com.esotericsoftware.kryo.io.{ByteBufferInput, Input, Output} -import com.esotericsoftware.kryo.Kryo -import org.apache.kafka.common.serialization.{Deserializer, Serde, Serdes, Serializer} - -import scala.util.DynamicVariable - -object ClimateLogStream { - - def main(args: Array[String]): Unit = { - val settings = new Properties - settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "kafka-streams-example") - settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") - settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") - settings.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) - settings.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, classOf[ClimateLogSerDe].getName) - - val kstreamBuilder = new KStreamBuilder - val rawStream: KStream[String, String] = kstreamBuilder.stream(Serdes.String, Serdes.String, "climate_events") - - val climateLogStream: KStream[String, ClimateLog] = rawStream.flatMapValues(new ValueMapper[String, Iterable[ClimateLog]]{ - override def apply(value: String): Iterable[ClimateLog] = ClimateLog(value).toIterable.asJava - }) - - climateLogStream.print() - - //define the predicates to split the stream into branches - val highHumidty = new Predicate[String, ClimateLog] { - override def test(t: String, c: ClimateLog): Boolean = c.humidity > 50 - } - - val lowTemp = new Predicate[String, ClimateLog] { - override def test(t: String, c: ClimateLog): Boolean = c.temperature < 0 - } - - //array of streams for each predicate - val branches = climateLogStream.branch(highHumidty, lowTemp) - - //persists the stream onto the topic and reads back using default serde. - val highHumidityStream = branches(0).through(new Serdes.StringSerde, new ClimateLogSerDe, "high_humidity") - val lowTempStream = branches(1).through(new Serdes.StringSerde, new ClimateLogSerDe, "low_temp") - - highHumidityStream.print() - lowTempStream.print() - - - val keyedHighHumStream: KStream[String, ClimateLog] = highHumidityStream.selectKey(new KeyValueMapper[String, ClimateLog, String] { - override def apply(key: String, value: ClimateLog): String = value.country - }) - - val keyedLowTempStream: KStream[String, ClimateLog] = lowTempStream.selectKey(new KeyValueMapper[String, ClimateLog, String] { - override def apply(key: String, value: ClimateLog): String = value.country - }) - - keyedHighHumStream.print() - keyedLowTempStream.print() - - //create a join window. 
This window joins all the elements of the same key if the difference between their timestamps is within 60 seconds - val joinWindow = JoinWindows.of(60 * 1000) - - //join the streams - - val warningsStream: KStream[String, String] = keyedHighHumStream.join[ClimateLog, String]( - keyedLowTempStream, - new ValueJoiner[ClimateLog, ClimateLog, String] { - override def apply(value1: ClimateLog, value2: ClimateLog): String = value2.copy(humidity = value1.humidity).toString - }, - joinWindow) - - warningsStream.print() - warningsStream.to(new Serdes.StringSerde, new Serdes.StringSerde, "warnings") - - val streams = new KafkaStreams(kstreamBuilder, settings) - - streams.start - - Thread.sleep(4000) - println(streams.toString()) - - } - - - case class ClimateLog(country: String, state: String, temperature: Float, humidity: Float) { - override def toString = { - s"$country, $state, $temperature, $humidity" - } - } - object ClimateLog { - def apply(line: String): Option[ClimateLog] = { - val parts = line.split(",") - try { - Some(ClimateLog(parts(0), parts(1), parts(2).toFloat, parts(3).toFloat)) - } catch { - case e: Exception => None - } - } - } - - class ClimateLogKryoSerDe extends com.esotericsoftware.kryo.Serializer[ClimateLog](false) with Serializable { - override def write(kryo: Kryo, output: Output, `object`: ClimateLog): Unit = { - output.writeString(`object`.country) - output.writeString(`object`.state) - output.writeFloat(`object`.humidity) - output.writeFloat(`object`.temperature) - } - - override def read(kryo: Kryo, input: Input, `type`: Class[ClimateLog]): ClimateLog = { - val country = input.readString - val state = input.readString - val humidity = input.readFloat - val temp = input.readFloat - ClimateLog(country, state, temp, humidity) - } - } - - class ClimateLogSerializer extends Serializer[ClimateLog]{ - - val kryos = new DynamicVariable[Kryo]({ - val kryo = new Kryo() - kryo.addDefaultSerializer(classOf[ClimateLog], new ClimateLogKryoSerDe) - kryo - }) - - override def serialize(topic: String, data: ClimateLog): Array[Byte] = { - val baos = new ByteArrayOutputStream - val output = new Output(baos) - kryos.value.writeObject(output, data) - output.flush - baos.toByteArray - } - - override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {/*nothing to be done here*/} - override def close(): Unit = {/*nothing to be done here*/} - } - - class ClimateLogDeserializer extends Deserializer[ClimateLog] { - - val kryos = new DynamicVariable[Kryo]({ - val kryo = new Kryo() - kryo.addDefaultSerializer(classOf[ClimateLog], new ClimateLogKryoSerDe) - kryo - }) - - override def deserialize(topic: String, data: Array[Byte]): ClimateLog = { - val input = new Input(new ByteArrayInputStream(data)) - kryos.value.readObject(new ByteBufferInput(data), classOf[ClimateLog]) - } - - override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = {/*nothing to be done here*/} - override def close(): Unit = {/*nothing to be done here*/} - } - - class ClimateLogSerDe extends ClimatelogWrappedSerde(new ClimateLogSerializer, new ClimateLogDeserializer) - - class ClimatelogWrappedSerde(ser: Serializer[ClimateLog], desr: Deserializer[ClimateLog]) extends Serde[ClimateLog] { - - override def deserializer(): Deserializer[ClimateLog] = desr - override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = { - ser.configure(configs, isKey) - desr.configure(configs, isKey) - } - - override def close(): Unit = { - ser.close() - desr.close() - } - - override def 
serializer(): Serializer[ClimateLog] = ser - } - -} diff --git a/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala b/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala deleted file mode 100644 index c6b2c0d..0000000 --- a/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala +++ /dev/null @@ -1,38 +0,0 @@ -package com.vishnuviswanath.kafka.streams - -import java.util.Properties - -import org.apache.kafka.clients.consumer.ConsumerConfig -import org.apache.kafka.common.serialization.Serdes -import org.apache.kafka.streams.{KafkaStreams, StreamsConfig} -import org.apache.kafka.streams.kstream.{KStream, KStreamBuilder, ValueMapper} - -/** - * Created by vviswanath on 4/22/17. - * - * HelloKafkaStream reads a list of names from a topic and - * outputs "hello " in output topic - */ -object HelloKafkaStreams { - - def main(args: Array[String]): Unit = { - val settings = new Properties - settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "hello-kafka-streams") - settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") - settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") - settings.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) - settings.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) - - val kstreamBuilder = new KStreamBuilder - val rawStream: KStream[String, String] = kstreamBuilder.stream("names") - - val helloStream: KStream[String, String] = rawStream.mapValues(new ValueMapper[String, String]{ - override def apply(value: String): String = s"hello $value" - }) - - helloStream.to(Serdes.String, Serdes.String, "hellostream") - - val streams = new KafkaStreams(kstreamBuilder, settings) - streams.start - } -} diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 89f5369..c941874 --- a/README.md +++ b/README.md @@ -1,15 +1 @@ -# Hadoop and ML repository - -A repository to hold all my Hadoop and Machine Learning related codes. - -Visit my blog at : www.vishnuviswanath.com - -### Contents - -1. Flink Streaming -2. Spark ML, Streaming, SQL and GraphX -3. Kafka Streams -4. StormKafka streaming application POC -5. Flume custom source and config files -6. Hadoop MapReduce old api joins,custom types etc -7. Solutions for kaggle problems using numpy or graphlab +www.vishnuviswanath.com \ No newline at end of file diff --git a/_config.yml b/_config.yml new file mode 100755 index 0000000..484d047 --- /dev/null +++ b/_config.yml @@ -0,0 +1,34 @@ +# Site settings +title: Bigdata & ML Notebook +email: vishnu.viswanath25@gmail.com +description: Welcome to my blog where I hope to share my knowledge in the field of Bigdata and Machine learning. I believe any topic if understood properly can be explained in simple terms, and I intend to do so through this blog. 
+baseurl: +url: http://www.vishnuviswanath.com +twitter_username: # +github_username: soniclavier +name: Bigdata & ML Notebook +# Build settings +markdown: kramdown + +highlighter: rouge + +gems : + - jekyll-paginate + - jekyll-seo-tag + - jekyll-sitemap + +# Pagination +paginate: 5 +paginate_path: "/blog/page:num/" + + + +collections: + portfolio: + output: true + permalink: /portfolio/:path/ + poetry: + output: true + permalink: /poetry/:path/ + + diff --git a/_includes/_include_slider.html b/_includes/_include_slider.html new file mode 100644 index 0000000..963db3e --- /dev/null +++ b/_includes/_include_slider.html @@ -0,0 +1,34 @@ +
+ +
+ + +
+ {% for image in site.static_files %} + {% if image.path contains include.folder %} + + {% endif %} + {% endfor %} + + +
+ + \ No newline at end of file diff --git a/_includes/_share.html b/_includes/_share.html new file mode 100644 index 0000000..2275c88 --- /dev/null +++ b/_includes/_share.html @@ -0,0 +1,17 @@ + \ No newline at end of file diff --git a/_includes/copyright.html b/_includes/copyright.html new file mode 100644 index 0000000..f4578d4 --- /dev/null +++ b/_includes/copyright.html @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/_includes/disqus.html b/_includes/disqus.html new file mode 100644 index 0000000..4c73cf6 --- /dev/null +++ b/_includes/disqus.html @@ -0,0 +1,13 @@ +
+ + \ No newline at end of file diff --git a/_includes/footer.html b/_includes/footer.html new file mode 100755 index 0000000..bb49097 --- /dev/null +++ b/_includes/footer.html @@ -0,0 +1,7 @@ +
+ +
+ {% include subscribe.html %} + {% include copyright.html %} +
+
\ No newline at end of file diff --git a/_includes/head.html b/_includes/head.html new file mode 100755 index 0000000..2f2be36 --- /dev/null +++ b/_includes/head.html @@ -0,0 +1,71 @@ + + + + + + {% if page.title %}{{ page.title }}{% else %}{{ site.title }}{% endif %} + {% if page.description %}{% else %}{% endif %} + {% seo %} + + + + + + {% include shareaholic.html %} + + + + {% if page.image %} + + {% else %} + + {% endif %} + {% if page.categories %} + {% for category in page.categories limit:1 %} + + {% endfor %} + {% endif %} + {% if page.tags %} + {% for tag in page.tags %} + + {% endfor %} + {% endif %} + + + {% if page.id or page.title == 'about' %} + + + {% endif %} + + + + + + + {% if page.has_math %} + + {% endif %} + + + + + + {% if page.title %} + + {% else %} + + {% endif %} + {% if page.url %} + + {% endif %} + {% if page.description %} + + {% else %} + + {% endif %} + {% if page.image %} + + {% else %} + + {% endif %} + diff --git a/_includes/header.html b/_includes/header.html new file mode 100755 index 0000000..c59f2d7 --- /dev/null +++ b/_includes/header.html @@ -0,0 +1,23 @@ + diff --git a/_includes/page_index.html b/_includes/page_index.html new file mode 100644 index 0000000..f2aabae --- /dev/null +++ b/_includes/page_index.html @@ -0,0 +1,62 @@ +
+ + + diff --git a/_includes/pingdom.html b/_includes/pingdom.html new file mode 100644 index 0000000..250ee25 --- /dev/null +++ b/_includes/pingdom.html @@ -0,0 +1,11 @@ + \ No newline at end of file diff --git a/_includes/pop-up-subscribe.html b/_includes/pop-up-subscribe.html new file mode 100644 index 0000000..c8940d5 --- /dev/null +++ b/_includes/pop-up-subscribe.html @@ -0,0 +1,74 @@ + +
+ + \ No newline at end of file diff --git a/_includes/read_time.html b/_includes/read_time.html new file mode 100644 index 0000000..c0fd1e2 --- /dev/null +++ b/_includes/read_time.html @@ -0,0 +1,8 @@ + + {% assign words = content | number_of_words %} + {% if words < 270 %} + 1 minute + {% else %} + {{ words | divided_by:220 }} minutes + {% endif %} + \ No newline at end of file diff --git a/_includes/shareaholic.html b/_includes/shareaholic.html new file mode 100644 index 0000000..40621c8 --- /dev/null +++ b/_includes/shareaholic.html @@ -0,0 +1 @@ + diff --git a/_includes/subscribe.html b/_includes/subscribe.html new file mode 100644 index 0000000..e8fe586 --- /dev/null +++ b/_includes/subscribe.html @@ -0,0 +1,18 @@ + + + +
+
+
+ + + + +
+
+
+ + \ No newline at end of file diff --git a/_includes/track.html b/_includes/track.html new file mode 100644 index 0000000..851b589 --- /dev/null +++ b/_includes/track.html @@ -0,0 +1,19 @@ + \ No newline at end of file diff --git a/_layouts/default.html b/_layouts/default.html new file mode 100755 index 0000000..ffbe4a1 --- /dev/null +++ b/_layouts/default.html @@ -0,0 +1,20 @@ + + + + {% include head.html %} + {% include track.html %} + + + + {% include header.html %} + +
+
+ {{ content }} +
+
+ + {% include footer.html %} + + + diff --git a/_layouts/default_with_img.html b/_layouts/default_with_img.html new file mode 100755 index 0000000..7b08485 --- /dev/null +++ b/_layouts/default_with_img.html @@ -0,0 +1,25 @@ + + + + {% include head.html %} + {% include track.html %} + + + + {% include header.html %} + +
+
+ +
+
+ {{ content }} +
+
+ + {% include footer.html %} + + + + + diff --git a/_layouts/page.html b/_layouts/page.html new file mode 100755 index 0000000..97e7531 --- /dev/null +++ b/_layouts/page.html @@ -0,0 +1,16 @@ +--- +layout: default +--- +
+ +
+

{{ page.title }}

+
{{ page.description }}
+
+ +
+ {{ content }} +
+ +
+ diff --git a/_layouts/post.html b/_layouts/post.html new file mode 100755 index 0000000..e1de879 --- /dev/null +++ b/_layouts/post.html @@ -0,0 +1,27 @@ +--- +layout: default +--- +
+ +
+

{{ page.title }}

+ +
+
+ + {{ content }} +
+
+ Tags: + {% for tags in page.tags %} + {{ tags }} + {% endfor %} +
+
+
+{% if page.show_index %} + {% include page_index.html %} +{% endif %} + +{% include disqus.html %} +{% include pop-up-subscribe.html %} \ No newline at end of file diff --git a/_posts/2015-09-15-spark_scala.markdown b/_posts/2015-09-15-spark_scala.markdown new file mode 100755 index 0000000..7b500a7 --- /dev/null +++ b/_posts/2015-09-15-spark_scala.markdown @@ -0,0 +1,127 @@ +--- +layout: post +comments: true +title: Spark MapReduce and Scala underscore +date: 2015-09-15 +permalink: /spark-scala.html +PAGE_IDENTIFIER: spark_scala_blog +tags: ApacheSpark Scala BigData Hadoop +description: A blog on how to write Spark mapreduce and an introduction on Scala underscore +--- +
+ +
+This is a basic guide on how to run map-reduce in Apache Spark using Scala. I will also try to explain the basics of Scala underscore, how it works and few examples of writing map-reduce programs with and without using underscore. + +The source code is available here + +### MapReduce +The first step is to create an RDD(Resilient Distributed Dataset) of input Strings. An RDD is a collection of data which is partitioned, it is similar to a distributed collection. The more the number of partitions in an RDD, the more the parallelism. When a job runs, each partition will be moved to the node where it is going to be processed. + +{% highlight scala %} +val lines = sc.parallelize(List("this is","an example")) +lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[0] at parallelize at :21 +{% endhighlight %} + +sc, is the spark context which is available by default in the spark-shell and sc.parallelilize will parallelize the input which is a list of two Strings in this case. + +### Map +Now that we have the RDD, we will run a map() operation on it. + +{% highlight scala %} +val lengthOfLines = lines.map(line => (line.length)) +lengthOfLines: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[2] at map at :23 +lengthOfLines.collect() +res0: Array[Int] = Array(7, 10) +{% endhighlight %} + +Here the map operation executes given function (to find the length) on each the element in the RDD and returns a new RDD. lengthOfLines.collect() operation shows that the result is an array with elements 7 and 10. + +### Reduce +Let us run the reduce operation on the result we obtained from map. +{% highlight scala %} +lines.map(line => (line.length)).reduce((a,b) => a + b) +res1: Int = 17 +{% endhighlight %} + +reduce() operation in Spark is a bit different from how the Hadoop MapReduce used to be, reduce() in spark produces only single output instead of producing key-value pairs. In the above reduce operation the length of each line, is summed up to obtain the result, 17. We will later go through reduceByKey() operation which is similar to the reduce in Hadoop MapReduce. + +Let's take another example of reduce() + +{% highlight scala %} +val lines = sc.parallelize(List("this is","an example")) +val firstWords = lines.map(line => (line.substring(0,line.indexOf(" ")))).collect() +firstWords: Array[String] = Array(this, an) +firstWords.reduce((a,b) => a +" "+b) +res2: String = this an +{% endhighlight %} + +In this example, we took two lines, did a substring operation to obtain only the first word of each line and then concatenated in reduce. The point to note here is that, reduce() operation always returns a single result - string "this an" in this example. +
Here we ran the reduce operation on an Array (firstWords), which is not an RDD. reduceByKey() can only be called on an RDD, whereas reduce() can be called even if the object is not an RDD.
+ +### Scala Underscore +Now, let us come back to the reduce operation and see how we can re-write it using underscore( _ ). Here reduce takes two arguments a and b and does a summation. + +{% highlight scala %} +lines.map(line => (line.length)).reduce((a,b) => a + b) +{% endhighlight %} + +Using Scala underscore, this can also be written as + +{% highlight scala %} +lines.map(line => (line.length)).reduce(_ + _) +{% endhighlight %} + +((a,b) => a+b) can be re-written as (_ + _) : here it is implicitly understood that the function takes two parameters and does a "+" on them. + +(line => (line.length)) can be re-written as _.length : map is taking a single parameter- line as input and doing a.length on it, so the _ here becomes the only parameter and _.length finds the length of the line. + +### Word count using flatMap and reduceByKey +One of the difference between flatMap() and map() is that, map should always return a result where as flatMap need not. Have a look at the below examples + +{% highlight scala %} +val lines = sc.parallelize(List("this is line number one","line number two","line number three")) +lines.flatMap(_.split(" ").filter(word => word.contains("this")).map(word => (word,1))).collect() +res83: Array[(String, Int)] = Array((this,1)) +lines.map(_.split(" ").filter(word => word.contains("this")).map(word => (word,1))).collect() +res85: Array[Array[(String, Int)]] = Array(Array((this,1)), Array(), Array()) +lines.map(line => (line.length)).reduce(_ + _) +{% endhighlight %} + +We have three strings and we are doing a filtering based on the content. The result we got from the flatMap after filtering is `Array((this,1))` where as the map operation returned `Array(Array((this,1)), Array(), Array())` -two empty arrays. + +
The return type of flatMap and map is an RDD, not an array; the above result with an array was obtained after calling collect() on the RDD returned by the map operations.
+ +Another difference between flatMap and map is that, flatMap flattens out the result, i.e., if you are getting an Array of Array of String in map, in flatMap you will get Array of String. See the below example + +{% highlight scala %} +val lines = sc.parallelize(List("line number one","line number two")) +lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[10] at parallelize at :21 +val words = lines.map(line => (line.split(" "))) +words: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[11] at map at :23 +words.collect() +res6: Array[Array[String]] = Array(Array(line, number, one), Array(line, number, two)) +val words = lines.flatMap(line => (line.split(" "))) +words: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[12] at flatMap at :23 +words.collect() +res7: Array[String] = Array(line, number, one, line, number, two) +{% endhighlight %} + +reduceByKey() takes a function that accepts two values and produces one. Here the result is an RDD of (key,value) pairs in contrast to reduce() where the result was just one value. + +Let's move on to the word count program. We will be using flatMap and reduceByKey explained earlier + +{% highlight scala %} +val lines = sc.parallelize(List("line number one","line number two")) +lines.collect() +res8: Array[String] = Array(line number one, line number two) +val words = lines.flatMap(line => (line.split(" "))) +words.collect() +res11: Array[String] = Array(line, number, one, line, number, two) +val wordCount = words.map(x => (x,1)).reduceByKey(_ + _) +wordCount.collect() +res12: Array[(String, Int)] = Array((number,2), (two,1), (line,2), (one,1)) +{% endhighlight %} + +In the first line, we are creating an RDD with two Strings. Next, we are splitting the line based on space. Then for each word, the mapper will emit key as the word and 1 as the value. The reducer will receive these key,value pairs and will do an aggregation of all the values to find the number of occurrences of each word. +
Continue reading \ No newline at end of file diff --git a/_posts/2015-11-15-storm_kafka_part1.markdown b/_posts/2015-11-15-storm_kafka_part1.markdown new file mode 100755 index 0000000..5eceb09 --- /dev/null +++ b/_posts/2015-11-15-storm_kafka_part1.markdown @@ -0,0 +1,203 @@ +--- +layout: post +comments: true +title: Realtime Processing using Storm-Kafka- Part1 +date: 2015-11-05 +PAGE_IDENTIFIER: storm_kafka1 +permalink: /realtime-storm-kafka1.html +tags: ApacheStorm MongoDB SOLR Hadoop BigData Java Kafka +description: This is the first of the three-part series of a POC on how to build a near Realtime Processing system using Apache Storm and Kafka in Java. In this first part, we will be dealing with setting up of the environment. +--- +
+ +
+ +This is a three-part series of a POC on how to build a near Realtime Processing system using Apache Storm and Kafka in Java. So to give a brief introduction on how the system works, messages come into a Kafka topic, Storm picks up these messages using Kafka Spout and gives it to a Bolt, which parses and identifies the message type based on the header. Once the message type is identified, the content of the message is extracted and is sent to different bolts for persistence - SOLR bolt, MongoDB bolt or HDFS bolt. + +In this first part, we will be dealing with setting up of the environment. If you already have the environment setup, you can jump to the Part 2 which talks about how to setup the project in Eclipse and how to write the Bolts. Execution of the project and creation of Spout is discussed in the Part 3 + +The source code for this project is available in my github + + +### Setup + + +For building this system, we would require + +1. Hadoop *2.6.0* +2. Zookeeper *3.4.6* +3. Apache Kafka *0.8.2.1* Scala*2.9.1* +4. Apache Solr *5.3.1* +5. MongoDB *3.0.7* +6. Apache Storm *0.10.0* + +*Note: All are single node setup* + +Versions I used are given in italics, It is not necessary to use the same versions but there might be some changes needed if versions are different.
Update (07/05/2016): The source code for Storm-1.0 is available in the branch [storm1.0-kafka-poc](https://github.com/soniclavier/hadoop_datascience/tree/storm1.0-kafka-poc)
+ + +#### **Hadoop** +I am assuming that hadoop is installed and I am not going through the installation steps here. If not you can do so easily by following the instructions here. +After installation, start hadoop daemons and verify that all daemons have started by running jps command. +{% highlight sh %} +$ jps +12768 QuorumPeerMain +14848 SecondaryNameNode +15024 NodeManager +14949 ResourceManager +14758 DataNode +15067 Jps +14687 NameNode +{% endhighlight %} + +#### **Zookeeper** +Zookeeper setup is also pretty straight forward, you can download zookeeper from here. +Once downloaded, extract the archive and look for a zoo_sample.cfg inside conf folder. Copy it and change the name to zoo.cfg +{% highlight sh %} +$cp zoo_sample.cfg zoo.cfg +{% endhighlight %} +For single node setup most of the configurations in zoo_sample.cfg can be used as is, the only configuration that I usually change is `dataDir=/tmp/zookeeper`. +Create a new directory of your convenience and point *dataDir* to that directory to. e.g., `dataDir=/Users/vishnu/zookeeper_data` + +Start the zookeeper server by running below command from zookeeper base directory. +{% highlight sh %} +$bin/zkServer.sh start +Starting zookeeper ... STARTED +{% endhighlight %} + +#### **Apache Kafka** +Download the kafka binary from here and unpack it. +Now start the kafka broker by running below command from kafka base directory + +{% highlight sh %} +$bin/kafka-server-start.sh config/server.properties +[2015-11-06 10:00:54,412] INFO Registered broker 0 at path /brokers/ids/0 with address 10.0.0.8:9092. (kafka.utils.ZkUtils$) +[2015-11-06 10:00:54,418] INFO [Kafka Server 0], started (kafka.server.KafkaServer) +{% endhighlight %} + +config/server.properties holds the information about the kafka broker. Take a note of +{% highlight properties %} +broker.id=0 +port=9092 +zookeeper.connect=localhost:2181 +{% endhighlight %} + +Now we need to create a kafka *topic* to which the message will be posted. Let's name this topic *'incoming'*. For creating a topic, we need to specify the topic name by `--topic`, zookeeper url by `--zookeeper`, number of partitions and replication factor. Run the below command for creating the topic. +{% highlight sh %} +$bin/kafka-topics.sh --create --topic incoming --zookeeper localhost:2181 --partitions 1 --replication-factor 1 +Created topic "incoming" +{% endhighlight %} +Let's now test if the topic has be created successfully by posting and retrieving some messages. +Open a new terminal and run below command +{% highlight sh %} +$bin/kafka-console-consumer.sh --topic incoming --zookeeper localhost:2181 +{% endhighlight %} +In another terminal start a kafka console producer and send a sample message. +{% highlight sh %} +$bin/kafka-console-producer.sh --topic incoming --broker localhost:9092 +hdfs testmessage +{% endhighlight %} +Check the terminal running the consumer if the message(*'hdfs testmessage'*) has been received. + +#### **Apache Solr** +Download the solr distribution from here and unpack it. +Start the Solr server by running +{% highlight sh %} +$bin/solr start +Waiting up to 30 seconds to see Solr running on port 8983 [/] +Started Solr server on port 8983 (pid=15373). Happy searching! +{% endhighlight %} +You can now access the Solr UI via http://localhost:8983/solr/#/ + +*Solr home page:* + +
+ +
+ +Now we need to create a collection in solr. This will be used by our Storm topology to store the message of the type *solr*. +For creating a collection there are a set of configuration files needed, solr provides us basic configuration files which can be used for this. These files are available in +{% highlight sh %} +SOLR_BASE/server/solr/configsets/basic_configs/conf +{% endhighlight %} +Let's first create a new folder and copy the basic configs to it. +{% highlight sh %} +$mkdir server/solr/collection1 +$cp -r server/solr/configsets/basic_configs/conf/ server/solr/collection1/conf +{% endhighlight %} +We need to change the default schema given by the basic configuration. To do that, open the file schema.xml in `server/solr/collection1/conf` + +add the below line after ``. This adds a field named *value* of the type *string*, it is a required attribute and is stored (*stored=true makes the field retrievable while doing search*). Indexed = false indicates that we are not going to do search on this field. +{% highlight xml %} + +{% endhighlight %} + +Now we have modified the schema as per our requirement, we will go ahead and create collection - named *collection1*. To create the collection, run + +{% highlight sh %} +$bin/solr create -c collection1 +Creating new core 'collection1' using command: +http://localhost:8983/solr/admin/cores?action=CREATE&name=collection1&instanceDir=collection1 + +{ + "responseHeader":{ + "status":0, + "QTime":539}, + "core":"collection1"} +{% endhighlight %} + +You can view the collection via http://localhost:8983/solr/#/collection1. Also, make sure that the fields *id* and *value* are created correctly from the dropdown in schema browser. + +#### **MongoDB** + +Download and unpack mongodb from here. We need to create a folder which will be used by mongodb as data directory. +{% highlight sh %} +mkdir mongodb_data +{% endhighlight %} +Start the `mongod` daemon by running below command from mongodb installation folder. We need to pass the path of the data directory to mongod script. +{% highlight sh %} +bin/mongod --dbpath /Users/vishnu/mongodb_data +... +2015-11-07T17:51:05.223-0600 I STORAGE [FileAllocator] done allocating datafile /Users/vishnu/mongodb_data/local.0, size: 64MB, took 0.039 secs +2015-11-07T17:51:05.238-0600 I NETWORK [initandlisten] waiting for connections on port 27017 +{% endhighlight %} +Now we will be creating a database and a collection to store our messages. +Open another terminal and run below commands. + +{% highlight sh %} +$ bin/mongo +MongoDB shell version: 3.0.7 +connecting to: test +> use storm +switched to db storm +> db.createCollection("collection1"); +{ "ok" : 1 } +{% endhighlight %} +`use storm` creates a new database called storm and switches to it. `createCollection("collection1")` creates a new collection named *'collection1'* + +#### **Apache Storm** +Download Storm distribution from here and unpack it. It is better to add the STORM_HOME/bin to you PATH. You can do so by changing your `bash_profile`. *Note: this step by might vary based on your OS. See how to set the path variable for Linux or Mac. + +{% highlight sh %} +export STORM_HOME=/Users/vishnu/apache-storm-0.10.0 +export PATH=$PATH/:$STORM_HOME/bin +{% endhighlight %} + +We need to start the storm master - *nimbus* and the slave *supervisor*. 
Along with these we will also start the Storm UI server and the logviewer - this enables us to view the logs from the Storm UI +{% highlight sh %} +$bin/storm nimbus +$bin/storm supervisor +$bin/storm ui +$bin/storm logviewer +{% endhighlight %} + +Check http://localhost:8080/index.html and make sure that the supervisor and nimbus servers have been started. + +
+ +
+ +We have completed the environment setup and in the next part we will see how to setup the Eclipse project and start writing Storm Topology. + +Next \ No newline at end of file diff --git a/_posts/2015-11-15-storm_kafka_part2.markdown b/_posts/2015-11-15-storm_kafka_part2.markdown new file mode 100755 index 0000000..6303794 --- /dev/null +++ b/_posts/2015-11-15-storm_kafka_part2.markdown @@ -0,0 +1,259 @@ +--- +layout: post +comments: true +title: Realtime Processing using Storm-Kafka- Part2 +date: 2015-11-05 +PAGE_IDENTIFIER: storm_kafka_2 +permalink: /realtime-storm-kafka2.html +tags: ApacheStorm MongoDB SOLR Hadoop BigData Kafka Java +description: This is part two of the series Realtime Processing using Storm and Kafka. In this section we are going to create an Eclipse project and develop the Solr, MongoDb and Hdfs Bolt used for persisting the messages. +--- +
+ +
+ +This is Part 2 of the series *Realtime Processing using Storm and Kafka*. If you have not read the first part, you can read it here. In this section we are going to create an Eclipse project and develop the Solr, MongoDb and Hdfs Bolt used for persisting the messages. + +The source code for this project is available in my github + +### **Building Storm Topology** + + +* Language: Java +* IDE : Eclipse +* Build tool : Maven + +Storm has mainly two components - *Spouts* and *Bolts*. + +#### Spouts +Spouts are the data sources for a topology. A spout reads data from an external source and emits them into the topology. There can me more than one spout in a topology reading data from different source (*e.g., twitter, tcp connection, kafka topic, flume). In this example, we will be creating a Kafka spout which will be reading the messages coming into the topic 'incoming' that we created during kafka setup. + +#### Bolts +Bolts are the processing units of a topology. It can enrich the message, filter, persist into different sinks etc. In this example we will be creating four Bolts. + +#### Topology +A topology is a network of Spouts and Bolts + +1. *Sink-Type-bolt* : will act as a decision making node, by identifying the message type and sending it to the appropriate bolt for persistence. +2. *Solr-bolt* : for indexing into SOLR collection +1. *Hdfs-bolt* : for storing in HDFS +1. *Mongodb-bolt* : for saving in MongoDB collection + +
+ +
+ +#### **Creating the project** +Create a new maven project in eclipse and add the following dependencies in the pom.xml. + +1. storm-core +2. kafka_2.9.1 +3. storm-kafka +4. storm-hdfs +5. solr-solrj +6. json-simple + +You can download the pom.xml from here +
+ Note: The artifact slf4j-log4j12 has to be excluded from the storm-core and kafka_2.9.1 dependencies. Otherwise you might get a 'multiple SLF4J bindings' exception during execution. +
+ +
+ Note: We have to package the jar with all the dependencies except storm-core. It is better to use the maven shade plugin rather than the maven assembly plugin because the packaging done by the assembly plugin may throw an exception while submitting the jar to Storm. +
+ +#### Structure of the project +
+ +
+ +#### Keys class +Keys.java holds all the keys for the customizable properties of the topology. +e.g., +{% highlight java %} +String KAFKA_SPOUT_ID = "kafka-spout"; +String KAFKA_ZOOKEEPER = "kafka.zookeeper"; +String KAFKA_TOPIC = "kafa.topic"; +String KAFKA_ZKROOT = "kafka.zkRoot"; +String KAFKA_CONSUMERGROUP = "kafka.consumer.group"; +{% endhighlight %} + +There is a default config file - `default_configs.properties` which will contain the default values for these properties. And this can be overriden by passing the path of some custom properties file. But the only condition is that, it should override all the properties defined in default_configs.properties. Below is a section of default_configs.properties +{% highlight properties %} +kafka-spout=kafka-spout +kafka.zookeeper=localhost:2181 +kafa.topic=incoming +kafka.zkRoot=/kafka +kafka.consumer.group=sample_group +{% endhighlight %} + +These properties will be loaded into a `Properties` object named `config` in the Toplogies class and can be accessed using the Keys class. e.g., to get the value of kafka spout id we can call +{% highlight java %} +configs.getProperty(Keys.KAFKA_SPOUT_ID) +{% endhighlight %} +
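The loading itself is not shown in the post; below is a minimal sketch of how the Topology class might read the defaults and overlay an optional custom file. Reading the defaults from the classpath and taking the custom file as a command-line argument are assumptions for illustration, not the project's exact code.

{% highlight java %}
//illustrative sketch: load the bundled defaults first, then overlay an optional custom file
Properties configs = new Properties();
try (InputStream defaults = Topology.class.getResourceAsStream("/default_configs.properties")) {
    configs.load(defaults);
}
if (args.length > 0) {
    //a custom properties file passed as the first argument overrides every key it re-defines
    try (InputStream custom = new FileInputStream(args[0])) {
        configs.load(custom);
    }
}
String spoutId = configs.getProperty(Keys.KAFKA_SPOUT_ID);
{% endhighlight %}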
+ + +#### **Building the Bolts** +All the bolts are built by BoltBuilder.java. It has methods for creating SinkTypeBolt, HdfsBolt, SolrBolt and MongoDB bolt. + +#### SinkTypeBolt +SinkTypeBolt.java extends BaseRichBolt. It has two important methods + +1.declareOutputFields: + +This method is used for declaring what are the output streams being emitted from this bolt and what will the fields be for each of the tuple it emits to those streams. +We are declaring 3 output streams here and each stream is going to have two fields 1) sinkType and 2) content. Topology.SOLR_STREAM, Topology.HDFS_STREAM etc are Strings used for naming these streams. +{% highlight java %} +public void declareOutputFields(OutputFieldsDeclarer declarer) { + declarer.declareStream(Topology.SOLR_STREAM, new Fields( "sinkType","content" )); + declarer.declareStream(Topology.HDFS_STREAM, new Fields( "sinkType","content" )); + declarer.declareStream(Topology.MONGODB_STREAM, new Fields( "sinkType","content" )); +} +{% endhighlight %} +2.execute + +Execute method receives a tuple at at time and does some processing. To make the example simple, it is assumed that our messages will have certain format i.e., it will be of the format `[type] [content]` where type will be either *solr*, *hdfs* or *mongo*. Also, SOLR and MongoDB messages will be of the format `fieldname:fieldvalue`; and there will be two fields - 1) id and 2) value. The execute method reads the tuple and extracts the type out of it. It then sends the content to any one of the streams by calling `collector.emit()` +{% highlight java %} +public void execute(Tuple tuple) { + String value = tuple.getString(0); + System.out.println("Received in SinkType bolt : "+value); + int index = value.indexOf(" "); + if (index == -1) + return; + String type = value.substring(0,index); + System.out.println("Type : "+type); + value = value.substring(index); + if(type.equals("solr")) { + collector.emit(Topology.SOLR_STREAM,new Values(type,value)); + System.out.println("Emitted : "+value); + } else if (type.equals("hdfs")) { + collector.emit(Topology.HDFS_STREAM,new Values(type,value)); + System.out.println("Emitted : "+value); + } else if (type.equals("mongo")) { + collector.emit(Topology.MONGODB_STREAM,new Values(type,value)); + System.out.println("Emitted : "+value); + } + collector.ack(tuple); +} +{% endhighlight %} +As you can see based on the type, the value is emitted to their respective streams. + +#### SolrBolt + +SolrBolt.java receives a tuple, converts the tuple into a SolrInputDocument and send that document to SOLR server. Therefore it needs to know SOLR server url, which can be set through its constructor. + +1.prepare + +During prepare, a new HttpSolrClient object is created using the solrAddress which was set through its constructor. +
Note: We are not creating the client object in the constructor because, when a topology is submitted, the bolt object is serialized and submitted, and the class HttpSolrClient is non-serializable. If we initialized HttpSolrClient in the constructor, we would get a java.io.NotSerializableException. The prepare method, on the other hand, is called only after the object has been deserialized.
+{% highlight java %} +public void prepare(Map conf, TopologyContext context, OutputCollector collector) { + this.collector = collector; + this.solrClient = new HttpSolrClient(solrAddress); +} +{% endhighlight %} +2.getSolrInputDocumentForInput + +This method is used for converting a tuple into SolrInputDocument, which is required for indexing the document onto SOLR. +{% highlight java %} +public SolrInputDocument getSolrInputDocumentForInput(Tuple input) { + String content = (String) input.getValueByField("content"); + String[] parts = content.trim().split(" "); + System.out.println("Received in SOLR bolt "+content); + SolrInputDocument document = new SolrInputDocument(); + try { + for(String part : parts) { + String[] subParts = part.split(":"); + String fieldName = subParts[0]; + String value = subParts[1]; + document.addField(fieldName, value); + } + } catch(Exception e) { + + } + return document; +} +{% endhighlight %} +3.execute + +Execute method converts the input Tuple into a SolrInputDocument and sends it to SOLR server by calling commit() +
Note: Ideally, we should not be committing each document; rather, we should first buffer the documents and commit only once the buffer reaches a certain threshold. +
+{% highlight java %} +public void execute(Tuple input) { + SolrInputDocument document = getSolrInputDocumentForInput(input); + try{ + solrClient.add(document); + solrClient.commit(); + collector.ack(input); + }catch(Exception e) { + } +} +{% endhighlight %} + + +#### MongoDB Bolt +MongodbBolt.java is similar to SolrBolt. It creates an instance of MongoClient using hostname and port, and then it creates an instance of MongoDatabase using ths MongoClient and the database name. Input tuple is converted into `org.bson.Document` by the method `getMongoDocForInput` and is inserted into the collection by + +{% highlight java %} +mongoDB.getCollection(collection).insertOne(mongoDoc) +{% endhighlight %} + +{% highlight java %} +public void execute(Tuple input) { + Document mongoDoc = getMongoDocForInput(input); + try{ + mongoDB.getCollection(collection).insertOne(mongoDoc); + collector.ack(input); + }catch(Exception e) { + e.printStackTrace(); + collector.fail(input); + } +} + +public Document getMongoDocForInput(Tuple input) { + Document doc = new Document(); + String content = (String) input.getValueByField("content"); + String[] parts = content.trim().split(" "); + System.out.println("Received in MongoDB bolt "+content); + try { + for(String part : parts) { + String[] subParts = part.split(":"); + String fieldName = subParts[0]; + String value = subParts[1]; + doc.append(fieldName, value); + } + } catch(Exception e) { + + } + return doc; +} +{% endhighlight %} +
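The prepare method that sets up these MongoDB objects is not shown above; here is a minimal sketch of what it could look like. The field names, and the idea that host, port and database name come from the properties file, are assumptions for illustration.

{% highlight java %}
public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
    this.collector = collector;
    //host, port and database name are read from the properties file via the Keys class (assumed fields)
    this.mongoClient = new MongoClient(host, port);
    this.mongoDB = mongoClient.getDatabase(db);
}
{% endhighlight %}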
+ + +#### HdfsBolt + +HdfsBolt in BoltBuilder.java receives a tuple and saves the content on to HDFS. This bolt should be aware of the hdfs hostname and port. This should match host:port set by the property `fs.defaultFS` in `core-site.xml`. *FileNameFormat* specifies the name of the file that will be created in HDFS. *SyncPolicy* specifies how often should the data be synced/flushed to HDFS. +{% highlight java %} +public HdfsBolt buildHdfsBolt() { + RecordFormat format = new DelimitedRecordFormat().withFieldDelimiter("|"); + SyncPolicy syncPolicy = new CountSyncPolicy(1000); + FileRotationPolicy rotationPolicy = new FileSizeRotationPolicy(5.0f, Units.MB); + FileNameFormat fileNameFormat = new DefaultFileNameFormat().withPath(configs.getProperty(Keys.HDFS_FOLDER)); + String port = configs.getProperty((Keys.HDFS_PORT)); + String host = configs.getProperty((Keys.HDFS_HOST)); + HdfsBolt bolt = new HdfsBolt() + .withFsUrl("hdfs://"+host+":"+port) + .withFileNameFormat(fileNameFormat) + .withRecordFormat(format) + .withRotationPolicy(rotationPolicy) + .withSyncPolicy(syncPolicy); + return bolt; +} +{% endhighlight %} + +In the next part of this series, we will develop the Kafka Spout, tie it all together using Storm Topology and execute the project. +
+Previous Next \ No newline at end of file diff --git a/_posts/2015-11-15-storm_kafka_part3.markdown b/_posts/2015-11-15-storm_kafka_part3.markdown new file mode 100755 index 0000000..692b28c --- /dev/null +++ b/_posts/2015-11-15-storm_kafka_part3.markdown @@ -0,0 +1,158 @@ +--- +layout: post +comments: true +title: Realtime Processing using Storm-Kafka - Part3 +date: 2015-11-05 +PAGE_IDENTIFIER: spark_scala +permalink: /realtime-storm-kafka3.html +tags: ApacheStorm MongoDB SOLR Hadoop BigData Kafka Java +description: This is the final part of the series. Here we are dealing with developing the Kafka Spout, Storm Topology and execution of the project. +--- +
+ +
+
+This is the last part of the blog *Realtime Processing using Storm and Kafka*. You can find the previous parts here - Part 1, Part 2. In this section we will develop the Kafka spout, build the Storm topology and execute the project.
+
+The source code for this project is available in my github.
+
+#### **Creating Kafka Spout**
+The Kafka spout reads from the Kafka topic we created. So it has to know how to connect to the Kafka broker, the name of the topic it has to read from, the ZooKeeper root and the consumer group id. The ZooKeeper root and the group id are used by the spout to store offset information, i.e., how far it has read from the topic. In case of failure, the spout can use this information to start reading from where it failed. If zkRoot is 'kafka' and the consumer group id is 'sample_group', then /kafka/sample_group will be created in ZooKeeper.
+
+{% highlight sh %}
+[zk: localhost:2181(CONNECTED) 0] ls /
+[controller_epoch, brokers, storm, zookeeper, kafka, admin, consumers, config]
+[zk: localhost:2181(CONNECTED) 1] ls /kafka
+[sample_group]
+[zk: localhost:2181(CONNECTED) 2] ls /kafka/sample_group
+[partition_0]
+{% endhighlight %}
+
+The Java method below creates a KafkaSpout. It first creates a SpoutConfig using the values from the default_config.properties file and then passes it on to the KafkaSpout class. This method is written inside the class SpoutBuilder.java.
+{% highlight java %}
+public KafkaSpout buildKafkaSpout() {
+	BrokerHosts hosts = new ZkHosts(configs.getProperty(Keys.KAFKA_ZOOKEEPER));
+	String topic = configs.getProperty(Keys.KAFKA_TOPIC);
+	String zkRoot = configs.getProperty(Keys.KAFKA_ZKROOT);
+	String groupId = configs.getProperty(Keys.KAFKA_CONSUMERGROUP);
+	SpoutConfig spoutConfig = new SpoutConfig(hosts, topic, zkRoot, groupId);
+	spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
+	KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
+	return kafkaSpout;
+}
+{% endhighlight %}
+
+
+
+#### **Building the Topology**
+Topology.java is the main class, which connects all the spouts and bolts together. The diagram below shows how the spout and bolts are connected. The Kafka spout picks up messages from the topic. SinkTypeBolt listens to the KafkaSpout and emits the tuples on three streams. The SOLR bolt listens to the solr stream of SinkTypeBolt; similarly, the HDFS bolt and the MongoDB bolt listen to the hdfs stream and the mongodb stream of the SinkTypeBolt respectively.
+
+The Topology class uses SpoutBuilder and BoltBuilder to build all the spouts and bolts.
+{% highlight java %}
+TopologyBuilder builder = new TopologyBuilder();
+KafkaSpout kafkaSpout = spoutBuilder.buildKafkaSpout();
+SinkTypeBolt sinkTypeBolt = boltBuilder.buildSinkTypeBolt();
+SolrBolt solrBolt = boltBuilder.buildSolrBolt();
+HdfsBolt hdfsBolt = boltBuilder.buildHdfsBolt();
+MongodbBolt mongoBolt = boltBuilder.buildMongodbBolt();
+{% endhighlight %}
+
+These spouts and bolts are linked together by the TopologyBuilder class. Each bolt should declare which component/stream it receives its input from. For example, if bolt 'B' wants to receive its input from bolt 'A', then we should call
+{% highlight java %}
+builder.setBolt("B", boltBobj, 1).shuffleGrouping("A");
+{% endhighlight %}
+If bolt 'A' emits multiple streams - x and y - then bolt 'B' should also specify the stream name of bolt 'A'. It would look something like
+{% highlight java %}
+builder.setBolt("B", bolt, 1).shuffleGrouping("A", "x");
+{% endhighlight %}
+
+{% highlight java %}
+//set the kafkaSpout to topology
+builder.setSpout(configs.getProperty(Keys.KAFKA_SPOUT_ID), kafkaSpout, kafkaSpoutCount);
+//set the sinktype bolt
+builder.setBolt(configs.getProperty(Keys.SINK_TYPE_BOLT_ID),sinkTypeBolt,sinkBoltCount).shuffleGrouping(configs.getProperty(Keys.KAFKA_SPOUT_ID));
+//set the solr bolt
+builder.setBolt(configs.getProperty(Keys.SOLR_BOLT_ID), solrBolt,solrBoltCount).shuffleGrouping(configs.getProperty(Keys.SINK_TYPE_BOLT_ID),SOLR_STREAM);
+//set the hdfs bolt
+builder.setBolt(configs.getProperty(Keys.HDFS_BOLT_ID),hdfsBolt,hdfsBoltCount).shuffleGrouping(configs.getProperty(Keys.SINK_TYPE_BOLT_ID),HDFS_STREAM);
+//set the mongodb bolt
+builder.setBolt(configs.getProperty(Keys.MONGO_BOLT_ID),mongoBolt,mongoBoltCount).shuffleGrouping(configs.getProperty(Keys.SINK_TYPE_BOLT_ID),MONGODB_STREAM);
+{% endhighlight %}
+
+kafkaSpoutCount : parallelism hint for the kafkaSpout - defines the number of executors/threads to be spawned for this spout
+
Note: shuffleGrouping is one of the eight stream grouping methods available in Storm (it distributes the tuples randomly across the bolt's tasks). Another type of grouping is fieldsGrouping - in fields grouping, the tuples are grouped based on a specified field, and tuples having the same value for that field are always sent to the same task. We can also implement a custom grouping by implementing the interface CustomStreamGrouping. +
+
+Finally, the topology can be submitted using
+{% highlight java %}
+Config conf = new Config();
+conf.put("solr.zookeeper.hosts",configs.getProperty(Keys.SOLR_ZOOKEEPER_HOSTS));
+String topologyName = configs.getProperty(Keys.TOPOLOGY_NAME);
+//Defines how many worker processes have to be created for the topology in the cluster.
+conf.setNumWorkers(1);
+StormSubmitter.submitTopology(topologyName, conf, builder.createTopology());
+{% endhighlight %}
+
+
+#### **Execution**
+For execution, we need to start the servers below:
+
+1. Hadoop servers
+2. Solr server
+3. Kafka broker
+4. Mongod server
+5. Storm nimbus
+6. Storm supervisor
+7. Storm UI (optional)
+
+Build the jar using the command `mvn clean install`. The command will create your topology jar with all the dependencies - `stormkafka-0.0.1-SNAPSHOT.jar`.
+Run the jar using the command
+{% highlight sh %}
+$storm jar stormkafka-0.0.1-SNAPSHOT.jar com.vishnu.storm.Topology
+...
+768 [main] INFO b.s.StormSubmitter - Successfully uploaded topology jar to assigned location: /Users/vishnu/apache-storm-0.10.0/storm-local/nimbus/inbox/stormjar-be5f5f13-c6d6-456d-b45e-2e7bbf6ba4c8.jar
+768 [main] INFO b.s.StormSubmitter - Submitting topology storm-kafka-topology in distributed mode with conf {"storm.zookeeper.topology.auth.scheme":"digest","storm.zookeeper.topology.auth.payload":"-8123923076974561721:-8924677632676109956","topology.workers":1,"solr.zookeeper.hosts":"localhost:2181"}
+861 [main] INFO b.s.StormSubmitter - Finished submitting topology: storm-kafka-topology
+{% endhighlight %}
+where `com.vishnu.storm` is the package name and `Topology` is the class containing the main method.
+Open your Storm UI at http://localhost:8080/ and verify that the job has been deployed correctly. Storm UI provides a very good visualization of the topology; you can view it by clicking `your-topology-name>Show Visualization`.
+
+ +
+
+Now let us insert some sample messages for each of the sinks - MongoDB, SOLR and HDFS - and check if those messages make their way to the destination.
+To do that, start your kafka-console-producer. If you have forgotten the name of the Kafka topic we created earlier (I know I did!) you can use the following command from the Kafka base folder.
+{% highlight sh %}
+bin/kafka-topics.sh --zookeeper localhost:2181 --list
+#start the console producer
+$bin/kafka-console-producer.sh --topic incoming --broker-list localhost:9092
+#insert messages
+hdfs this message goes to hdfs
+mongo id:1 value:mongodb_message
+solr id:1 value:solr_message
+{% endhighlight %}
+
+We now verify each of the sinks.
+
+1) MongoDB - from your MongoDB folder, you can run
+{% highlight sh %}
+$bin/mongo
+use storm
+db.collection1.find()
+{ "_id" : ObjectId("56442855a9ee7800956aaf50"), "id" : "1", "value" : "mongodb_message" }
+{% endhighlight %}
+2) SOLR - You can see the Solr message by accessing the SOLR UI URL.
+
+ +
+
+3) HDFS - You can either run `hadoop fs -ls /from_storm` or access the Namenode UI URL.
+
+ +
+
+I hope you got a fair idea about how to integrate Storm, Kafka, MongoDB, SOLR and HDFS for realtime analysis. Although this was implemented on a single-node cluster for learning purposes, it can be extended to multi-node scenarios as well. For further doubts and clarifications, please comment below and I will respond as soon as possible.
+
Continue reading \ No newline at end of file diff --git a/_posts/2015-11-20-kaggle_ipython.markdown b/_posts/2015-11-20-kaggle_ipython.markdown new file mode 100644 index 0000000..6da85a5 --- /dev/null +++ b/_posts/2015-11-20-kaggle_ipython.markdown @@ -0,0 +1,321 @@ +--- +layout: post +comments: true +title: Kaggle Titanic using python +date: 2015-11-20 +PAGE_IDENTIFIER: kaggle_ipython +permalink: /kaggle-titanic.html +tags: MachineLearning Classification Python GraphLab Kaggle +description: Kaggle Titanic challenge solution using python and graphlab create +--- +### Load graphlab +
+{% highlight python %} +import graphlab +{% endhighlight %} +
+ +### Load the data +
+ +{% highlight python %} +passengers = graphlab.SFrame('train.csv') + +PROGRESS: Finished parsing file /Users/vishnu/git/hadoop/ipython/train.csv +PROGRESS: Parsing completed. Parsed 100 lines in 0.020899 secs. +------------------------------------------------------ +Inferred types from first line of file as +column_type_hints=[int,int,int,str,str,float,int,int,str,float,str,str] +If parsing fails due to incorrect types, you can correct +the inferred type list above and pass it to read_csv in +the column_type_hints argument +------------------------------------------------------ +PROGRESS: Finished parsing file /Users/vishnu/git/hadoop/ipython/train.csv +PROGRESS: Parsing completed. Parsed 891 lines in 0.010159 secs. +{% endhighlight %} +
+ +### Analyze +
+{% highlight python %} +graphlab.canvas.set_target('ipynb') +passengers.show() +{% endhighlight %} +
+ +
+ +
+ +### Pre process +
+
The Age column has null values; fill them with the average age.

{% highlight python %}
+passengers = passengers.fillna("Age",passengers["Age"].mean())
+{% endhighlight %}
+
+ +### Feature engineering +
+Consider the family size = 1 if (#siblings + #parents) > 3 else 0 + +{% highlight python %} +passengers['family'] = passengers['SibSp']+passengers['Parch'] >3 +{% endhighlight %} +Create a new feature child, if the age is less than 15 + +{% highlight python %} +passengers["Child"] = passengers["Age"]<15 +{% endhighlight %} +Extract title from Name + +{% highlight python %} +import re +def findTitle(name): + match = re.search("(Dr|Mrs?|Ms|Miss|Master|Rev|Capt|Mlle|Col|Major|Sir|Jonkheer|Lady|the Countess|Mme|Don)\\.",name) + if match: + title = match.group(0) + if (title == 'Don.' or title == 'Major.' or title == 'Capt.'): + title = 'Sir.' + if (title == 'Mlle.' or title == 'Mme.'): + title = 'Miss.' + return title + else: + return "Other" +passengers["Title"] = passengers["Name"].apply(findTitle) +passengers["Title"].show() +{% endhighlight %} +
+ +
+ + +**Feature binning** +
+{% highlight python %} +from graphlab.toolkits.feature_engineering import * + +binner = graphlab.feature_engineering.create(passengers, FeatureBinner(features = ['Fare'],strategy='quantile',num_bins = 5)) +fit_binner = binner.fit(passengers) +passengers_binned = fit_binner.transform(passengers) +passengers_binned["Fare"].show() +{% endhighlight %} +
+ +
+ +
+ +### Feature selection +
+{% highlight python %} +features = ["Pclass","Sex","Age","family","Child","Fare","Title"] +{% endhighlight %} +
+ +### Model building +
+Split data into train and test set + +{% highlight python %} +train,test = passengers_binned.random_split(0.8,seed=0) + + +model = graphlab.logistic_classifier.create(passengers_binned, + target="Survived", + features = features, + validation_set = test) + +PROGRESS: Logistic regression: +PROGRESS: -------------------------------------------------------- +PROGRESS: Number of examples : 891 +PROGRESS: Number of classes : 2 +PROGRESS: Number of feature columns : 7 +PROGRESS: Number of unpacked features : 7 +PROGRESS: Number of coefficients : 21 +PROGRESS: Starting Newton Method +PROGRESS: -------------------------------------------------------- +PROGRESS: +-----------+----------+--------------+-------------------+---------------------+ +PROGRESS: | Iteration | Passes | Elapsed Time | Training-accuracy | Validation-accuracy | +PROGRESS: +-----------+----------+--------------+-------------------+---------------------+ +PROGRESS: | 1 | 2 | 0.002642 | 0.831650 | 0.781915 | +PROGRESS: | 2 | 3 | 0.004899 | 0.835017 | 0.781915 | +PROGRESS: | 3 | 4 | 0.007302 | 0.831650 | 0.776596 | +PROGRESS: | 4 | 5 | 0.009823 | 0.831650 | 0.776596 | +PROGRESS: | 5 | 6 | 0.012186 | 0.831650 | 0.776596 | +PROGRESS: | 6 | 7 | 0.014614 | 0.831650 | 0.776596 | +PROGRESS: +-----------+----------+--------------+-------------------+---------------------+ +{% endhighlight %} +
+ +### Evaluation +
+ROC curve + +{% highlight python %} +model.evaluate(test,metric='roc_curve') + +{'roc_curve': Columns: + threshold float + fpr float + tpr float + p int + n int + + Rows: 1001 + + Data: + +------------------+----------------+-----+----+-----+ + | threshold | fpr | tpr | p | n | + +------------------+----------------+-----+----+-----+ + | 0.0 | 0.0 | 0.0 | 75 | 113 | + | 0.0010000000475 | 1.0 | 1.0 | 75 | 113 | + | 0.00200000009499 | 1.0 | 1.0 | 75 | 113 | + | 0.00300000002608 | 1.0 | 1.0 | 75 | 113 | + | 0.00400000018999 | 1.0 | 1.0 | 75 | 113 | + | 0.00499999988824 | 1.0 | 1.0 | 75 | 113 | + | 0.00600000005215 | 0.982300884956 | 1.0 | 75 | 113 | + | 0.00700000021607 | 0.982300884956 | 1.0 | 75 | 113 | + | 0.00800000037998 | 0.982300884956 | 1.0 | 75 | 113 | + | 0.00899999961257 | 0.982300884956 | 1.0 | 75 | 113 | + +------------------+----------------+-----+----+-----+ + [1001 rows x 5 columns] + Note: Only the head of the SFrame is printed. + You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.} + +model.show(view='Evaluation') +{% endhighlight %} +
+ +
+ + +
+

### Build model again using the entire input
+{% highlight python %} +model = graphlab.logistic_classifier.create(passengers_binned, + target="Survived", + features = features, + validation_set = None) + +PROGRESS: Logistic regression: +PROGRESS: -------------------------------------------------------- +PROGRESS: Number of examples : 891 +PROGRESS: Number of classes : 2 +PROGRESS: Number of feature columns : 7 +PROGRESS: Number of unpacked features : 7 +PROGRESS: Number of coefficients : 21 +PROGRESS: Starting Newton Method +PROGRESS: -------------------------------------------------------- +PROGRESS: +-----------+----------+--------------+-------------------+ +PROGRESS: | Iteration | Passes | Elapsed Time | Training-accuracy | +PROGRESS: +-----------+----------+--------------+-------------------+ +PROGRESS: | 1 | 2 | 0.002700 | 0.831650 | +PROGRESS: | 2 | 3 | 0.004679 | 0.835017 | +PROGRESS: | 3 | 4 | 0.006863 | 0.831650 | +PROGRESS: | 4 | 5 | 0.008501 | 0.831650 | +PROGRESS: | 5 | 6 | 0.010505 | 0.831650 | +PROGRESS: | 6 | 7 | 0.012663 | 0.831650 | +PROGRESS: +-----------+----------+--------------+-------------------+ +{% endhighlight %} +
+ +### Predict +
+{% highlight python %} +passengers_submission = graphlab.SFrame('test.csv') + +PROGRESS: Finished parsing file /Users/vishnu/git/hadoop/ipython/test.csv +PROGRESS: Parsing completed. Parsed 100 lines in 0.021006 secs. +------------------------------------------------------ +Inferred types from first line of file as +column_type_hints=[int,int,str,str,float,int,int,str,float,str,str] +If parsing fails due to incorrect types, you can correct +the inferred type list above and pass it to read_csv in +the column_type_hints argument +------------------------------------------------------ +PROGRESS: Finished parsing file /Users/vishnu/git/hadoop/ipython/test.csv +PROGRESS: Parsing completed. Parsed 418 lines in 0.008928 secs. + +passengers_submission.show() + +passengers_submission['family'] = passengers_submission['SibSp']+passengers_submission['Parch'] >3 +passengers_submission["Child"] = passengers_submission["Age"]<15 +passengers_submission["Title"] = passengers_submission["Name"].apply(findTitle) +binner = graphlab.feature_engineering.create(passengers_submission, FeatureBinner(features = ['Fare'],strategy='quantile',num_bins = 5)) +fit_binner = binner.fit(passengers_submission) +passengers_submission_binned = fit_binner.transform(passengers_submission) + +passengers["Pclass","Sex","Age","family","Child","Fare","Title"].show() +{% endhighlight %} + +
+ +
+{% highlight python %} +prediction = model.predict(passengers_submission_binned,output_type='class') +passengers_submission["Survived"] = prediction +result = passengers_submission["PassengerId","Survived"] +result +{% endhighlight %} + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| PassengerId | Survived |
|-------------|----------|
| 892 | 0 |
| 893 | 1 |
| 894 | 0 |
| 895 | 0 |
| 896 | 1 |
| 897 | 0 |
| 898 | 1 |
| 899 | 0 |
| 900 | 1 |
| 901 | 0 |
+[418 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns. +
+ +
+{% highlight python %} +result.save('submission.csv') +{% endhighlight %} +Received score of ***0.78469*** + diff --git a/_posts/2015-11-25-spark_class.markdown b/_posts/2015-11-25-spark_class.markdown new file mode 100755 index 0000000..162912a --- /dev/null +++ b/_posts/2015-11-25-spark_class.markdown @@ -0,0 +1,427 @@ +--- +layout: post +comments: true +title: Building Classification model using Apache Spark +date: 2015-11-25 +show_index: true +PAGE_IDENTIFIER: spark_ml +permalink: /spark_lr.html +tags: ApacheSpark MachineLearning Classification Scala BigData Hadoop +description: Build a LogisticRegression classification model to predict survival of passengers in Titanic disaster. The blog tries to solve the Kaggle knowledge challenge - Titanic Machine Learning from Disaster using Apache Spark and Scala. +--- +
+ +
+
+The aim of this blog is to explain how to use SparkML to build a Classification model. To explain the usage better, I am going to try to solve the Kaggle knowledge challenge - [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic). The source code of this project is available in my [github](https://github.com/soniclavier/hadoop/tree/master/spark/src/main/scala/com/vishnu/spark/kaggle/titanic/KaggleTitanic.scala).
+
+**Updated** Code for Spark 2.0 can be found in this [branch](https://github.com/soniclavier/bigdata-notebook/blob/update-spark-version/spark/src/main/scala/com/vishnu/spark/kaggle/titanic/KaggleTitanic.scala).
+
+In this challenge, we are given a set of details of passengers such as name, gender, fare, cabin etc., and whether the person survived the Titanic disaster. Based on this we have to build a Model that can predict, given another passenger, if he/she is likely to survive. This is an example of binary classification, where there are only two possible classes (1 if the passenger survives and 0 if not).
+
+- The first step when trying to build a machine learning model is to analyze and understand the data you have, so that you can decide which features should be used for building the model, whether the features are numeric or categorical, what the mean, max or min of your numerical features is, and so on.
+- Once the data is analyzed, the next step is feature selection, where we decide which features are relevant for building the model.
+- Next is data preprocessing. The input data that you receive for modeling is not going to be good data most of the time. During this stage, for example, we can decide what to do with the missing values - whether to drop rows having nulls, fill them with the average value of the feature (if the feature is numerical), or fill them with the most occurring value of the feature (if the feature is categorical), etc.
+- Next comes the Feature engineering and Feature transformation step. In Feature engineering we derive new features from existing ones, and during feature transformation we transform existing features so that they can be used for building the model.
+- Finally we build the model using the selected features and do prediction on a new set of data.
+
+We will be implementing all of the above steps using Spark and Scala and will be building a machine learning pipeline - the overall flow is shown in the diagram below. The grey section of the diagram shows the model building flow and the blue section shows the flow for making predictions.
+
+ +
+

+ +### **Load and Analyze data** +
+As mentioned earlier, first step is to analyze the data. To do that, we have to first load data into `Spark`. Download the train.csv file from [here](https://www.kaggle.com/c/titanic/data), and open the file and check the content + +{% highlight sh %} +$ head train.csv +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S +2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C +3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S +5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S +6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q +7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S +8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S +{% endhighlight %} +As you can see, the file contains a header row which has PassengerId, Survived, Pclass, Name, Sex, Age, SibSp ,Parch ,Ticket ,Fare ,Cabin and Embarked. You can find more information about what each of these fields are from the Kaggle website. Move this file to some folder in HDFS(I have kept mine at `/kaggle/titanic/train.csv`). The data is in csv format, to load csv files we will use the library [spark-csv](https://github.com/databricks/spark-csv). + +We will define a simple load function that can be used to load csv file. First start your spark-shell using the below command. +{% highlight sh %} +spark-shell --master spark://yourspark-server-url --packages com.databricks:spark-csv_2.11:1.3.0 +{% endhighlight %} +*Note: You will have to import a few classes for this project, which can be found [here](https://github.com/soniclavier/hadoop/tree/master/spark/src/main/scala/com/vishnu/spark/kaggle/titanic/KaggleTitanic.scala)* +{% highlight scala %} +def load(path: String, sqlContext: SQLContext, featuresArr: String*): DataFrame = { + var data = sqlContext.read.format("com.databricks.spark.csv") + .option("header", "true") + .option("inferSchema", "true") + .load(path) + .toDF(featuresArr: _*) + return data + } +{% endhighlight %} +The method takes 3 inputs - the path where the csv file is, sqlContext and a featuresArr which is used to name the columns being loaded. We don't really have to give the featuresArr here since our csv file contains header information. If not, the column names would have been assigned default values such as C0, C1 etc. + +Use the load method defined, to load csv file and create a DataFrame +{% highlight scala %} +var train_data = load("/kaggle/titanic/train.csv", + sqlContext, + "PassengerId", "Survived", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked" + ).cache() +{% endhighlight %} +*Note: We are caching the dataFrame in-memory by calling `cache()`, this will help improve the performance during model building.* + +Now we will explore the loaded DataFrame for to understand the data better. 
We can check the schema of the loaded data by calling +{% highlight scala %} +scala> train_data.printSchema() +root + |-- PassengerId: integer (nullable = true) + |-- Survived: integer (nullable = true) + |-- Pclass: integer (nullable = true) + |-- Name: string (nullable = true) + |-- Sex: string (nullable = true) + |-- Age: double (nullable = true) + |-- SibSp: integer (nullable = true) + |-- Parch: integer (nullable = true) + |-- Ticket: string (nullable = true) + |-- Fare: double (nullable = true) + |-- Cabin: string (nullable = true) + |-- Embarked: string (nullable = true) +{% endhighlight %} +As you can see, the spark-csv library has inferred the data type of each column. If you go back and check the load method you can see that we have used, `.option("inferSchema", "true")` which tells the library to do so. If not set, all the fields will set to type `string`. +show() method in DataFrame can be used to display the dataframe in tabular form. You can also pass an int to this method to tell how many rows to be displayed. e.g.,`df.show(100)` +{% highlight scala %} +scala> train_data.show() ++-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+ +|PassengerId|Survived|Pclass| Name| Sex| Age|SibSp|Parch| Ticket| Fare|Cabin|Embarked| ++-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+ +| 1| 0| 3|Braund, Mr. Owen ...| male|22.0| 1| 0| A/5 21171| 7.25| | S| +| 2| 1| 1|Cumings, Mrs. Joh...|female|38.0| 1| 0| PC 17599|71.2833| C85| C| +| 3| 1| 3|Heikkinen, Miss. ...|female|26.0| 0| 0|STON/O2. 3101282| 7.925| | S| +| 4| 1| 1|Futrelle, Mrs. Ja...|female|35.0| 1| 0| 113803| 53.1| C123| S| +| 5| 0| 3|Allen, Mr. Willia...| male|35.0| 0| 0| 373450| 8.05| | S| +| 6| 0| 3| Moran, Mr. James| male|null| 0| 0| 330877| 8.4583| | Q| +| 7| 0| 1|McCarthy, Mr. Tim...| male|54.0| 0| 0| 17463|51.8625| E46| S| +| 8| 0| 3|Palsson, Master. ...| male| 2.0| 3| 1| 349909| 21.075| | S| +| 9| 1| 3|Johnson, Mrs. Osc...|female|27.0| 0| 2| 347742|11.1333| | S| +| 10| 1| 2|Nasser, Mrs. Nich...|female|14.0| 1| 0| 237736|30.0708| | C| +| 11| 1| 3|Sandstrom, Miss. ...|female| 4.0| 1| 1| PP 9549| 16.7| G6| S| +| 12| 1| 1|Bonnell, Miss. El...|female|58.0| 0| 0| 113783| 26.55| C103| S| +| 13| 0| 3|Saundercock, Mr. ...| male|20.0| 0| 0| A/5. 2151| 8.05| | S| +| 14| 0| 3|Andersson, Mr. An...| male|39.0| 1| 5| 347082| 31.275| | S| +| 15| 0| 3|Vestrom, Miss. Hu...|female|14.0| 0| 0| 350406| 7.8542| | S| +| 16| 1| 2|Hewlett, Mrs. (Ma...|female|55.0| 0| 0| 248706| 16.0| | S| +| 17| 0| 3|Rice, Master. Eugene| male| 2.0| 4| 1| 382652| 29.125| | Q| +| 18| 1| 2|Williams, Mr. Cha...| male|null| 0| 0| 244373| 13.0| | S| +| 19| 0| 3|Vander Planke, Mr...|female|31.0| 1| 0| 345763| 18.0| | S| +| 20| 1| 3|Masselmani, Mrs. ...|female|null| 0| 0| 2649| 7.225| | C| ++-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+ +only showing top 20 rows +{% endhighlight %} +You can also see stats about any numerical column by using `dataFrame.describe("column")`. e.g., +{% highlight scala %} +scala> train_data.describe("Fare").show() ++-------+------------------+ +|summary| Fare| ++-------+------------------+ +| count| 891| +| mean|32.204207968574615| +| stddev|49.665534444774124| +| min| 0| +| max| 93.5| ++-------+------------------+ +{% endhighlight %} + +Play around with other columns also till you get an idea on how the data is. +
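+For instance, you could group by a column and count (just an illustration of further exploration, not from the original post - output omitted):
+{% highlight scala %}
+train_data.groupBy("Sex", "Survived").count().show()
+{% endhighlight %}
+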
+ +### **Pre-process** +
+
+#### **Fill missing values**
+On analyzing the data, you can see a few irregularities in it. For example, there are a few missing values in the column Age. Similarly, there are null/missing values in Cabin, Fare and Embarked. There are several techniques for filling in the missing values. You can
+
+- Ignore/drop the rows having missing values. This can be done in Spark by calling
+{% highlight scala %} +var train_na_removed = train_data.na.drop() +{% endhighlight %} +- If the column is numerical, fill in the missing value with the mean/avg value of the column. We are going to replace the missing values in Age column by using this method. +{% highlight scala %} +var avgAge = train_data.select(mean("Age")).first()(0).asInstanceOf[Double] +train_data = train_data.na.fill(avgAge, Seq("Age")) + +{% endhighlight %} + +- If the column is categorical, fill in with the most occurring category
+{% highlight scala %} +//Note: we are not using this feature in our model. Below example is shown to explain how to do this in spark +var train_embarked_filled = train_data.na.fill("S", Seq("Embarked")) +{% endhighlight %} +- Build a machine learning model which can predict those missing values. + +#### **Discover new features** +In many cases, there will be features in your input data that can be used to derive new features which will help in building a better model. This is also called `Feature Engineering`. For example, if you take a closer look at the column 'Name', you can see that the format is `FirstName Title. LastName`. We could not possibly make any prediction based on the passenger's name but may be there is some relation between the Title and the passenger's survival. So, we will extract the title from each name and form a new column/feature. The udf `findTitle` is used for extracting title from a given string. +{% highlight scala %} +val findTitle = sqlContext.udf.register("findTitle", (name: String) => { + val pattern = "(Dr|Mrs?|Ms|Miss|Master|Rev|Capt|Mlle|Col|Major|Sir|Lady|Mme|Don)\\.".r + val matchedStr = pattern.findFirstIn(name) + var title = matchedStr match { + case Some(s) => matchedStr.getOrElse("Other.") + case None => "Other." + } + if (title.equals("Don.") || title.equals("Major.") || title.equals("Capt.")) + title = "Sir." + if (title.equals("Mlle.") || title.equals("Mme.")) + title = "Miss." + title + }) +{% endhighlight %} +DataFrame provides a method `withColumn` which can be used for adding/replacing an existing column. It takes two parameters - the name of the new column and a `Column` of the current DataFrame. i.e., if you call +{% highlight scala %} +var temp = train_data.withColumn("test",train_data("PassengerId")) +//^will create a new column named test with same values as in the column PassengerId. +//we can also modify the value of the new column. e.g., +temp = train_data.withColumn("test",train_data("PassengerId")-1) +temp.select("PassengerId","test").show(3) ++-----------+----+ +|PassengerId|test| ++-----------+----+ +| 1| 0| +| 2| 1| +| 3| 2| ++-----------+----+ +{% endhighlight %} +We will now apply the function `findTitle` on the `Name` column to extract title and create a new column - *Title*. +{% highlight scala %} +train_data = train_data.withColumn("Title", findTitle(train_data("Name"))) +train_data.select("Name","Title").show() ++--------------------+-------+ +| Name| Title| ++--------------------+-------+ +|Braund, Mr. Owen ...| Mr.| +|Cumings, Mrs. Joh...| Mrs.| +|Heikkinen, Miss. ...| Miss.| +|Futrelle, Mrs. Ja...| Mrs.| +|Allen, Mr. Willia...| Mr.| +| Moran, Mr. James| Mr.| +|McCarthy, Mr. Tim...| Mr.| +|Palsson, Master. ...|Master.| +|Johnson, Mrs. Osc...| Mrs.| +|Nasser, Mrs. Nich...| Mrs.| +|Sandstrom, Miss. ...| Miss.| +|Bonnell, Miss. El...| Miss.| +|Saundercock, Mr. ...| Mr.| +|Andersson, Mr. An...| Mr.| +|Vestrom, Miss. Hu...| Miss.| +|Hewlett, Mrs. (Ma...| Mrs.| +|Rice, Master. Eugene|Master.| +|Williams, Mr. Cha...| Mr.| +|Vander Planke, Mr...| Mrs.| +|Masselmani, Mrs. ...| Mrs.| ++--------------------+-------+ +only showing top 20 rows +{% endhighlight %} +Similarly we will define 3 other udfs, using which we will generate new features. 
+{% highlight scala %} +//Categorize a passenger as child if his/her age is less than 15 +//(more chances of survival) + val addChild = sqlContext.udf.register("addChild", (sex: String, age: Double) => { + if (age < 15) + "Child" + else + sex + }) + +//withFamily is true(1) if the family size excluding self is > 3 +//(large family may have more/less chance of survival) + val withFamily = sqlContext.udf.register("withFamily", (sib: Int, par: Int) => { + if (sib + par > 3) + 1.0 + else + 0.0 + }) +//for converting integer columns to double. Requires since few of the +//columns of our DataFrame are of Int type. +val toDouble = sqlContext.udf.register("toDouble", ((n: Int) => { n.toDouble })) + +//apply the udfs +train_data = train_data.withColumn("Sex", addChild(train_data("Sex"), train_data("Age"))) +train_data = train_data.withColumn("Pclass", toDouble(train_data("Pclass"))) +train_data = train_data.withColumn("Family", withFamily(train_data("SibSp"), train_data("Parch"))) +train_data = train_data.withColumn("Survived", toDouble(train_data("Survived"))) +{% endhighlight %} + +### **Pipeline Components** +
+
+An ML pipeline has a sequence of Pipeline components. There are two types of components - **Transformers** and **Estimators**. A Transformer transforms the input DataFrame into a new DataFrame using the method `transform()`. An Estimator first fits a model to the data using the method `fit()` and then does a transform. These will become clearer once you go through the below components.
+ +#### **StringIndexer** +To build a model in Spark, the features must be of the type Double but we have a few features which are of the type String. Spark provides a Feature Transformer - StringIndexer which can be used for this transformation. +{% highlight scala %} +scala> val titleInd = new StringIndexer().setInputCol("Title").setOutputCol("TitleIndex") +titleInd: org.apache.spark.ml.feature.StringIndexer = strIdx_20dfaf280ccc +{% endhighlight %} + Here StringIndexer is an Estimator that transforms the column Title, generates indices for the words and creates a new column named TitleIndex. Fit method of StringIndexer converts the column to StringType*(if it is not of StringType)* and then counts the occurrence of each word. It then sorts these words in descending order of their frequency and assigns an index to each word. StringIndexer.fit() method returns a StringIndexerModel which is a Transformer. +{% highlight scala %} +//execution of fit() and transform() will be done by the pipeline, this is shown to explain how fit and transform works +var strIndModel = titleInd.fit(train_data) +strIndModel: org.apache.spark.ml.feature.StringIndexerModel = strIdx_a3feab934783 +{% endhighlight %} +StringIndexerModel.transform() assigns the generated index to each value of the column in the given DataFrame. +{% highlight scala %} +strIndModel.transform(train_data).select("Title","TitleIndex").show(5) ++-----+----------+ +|Title|TitleIndex| ++-----+----------+ +| Mr.| 0.0| +| Mrs.| 2.0| +|Miss.| 1.0| +| Mrs.| 2.0| +| Mr.| 0.0| ++-----+----------+ +only showing top 5 rows +{% endhighlight %} +*Mr.* is the most frequent word in this data, so it is given index 0. Similarly, we will also create an indexer for the feature - *Sex* +{% highlight scala %} +val sexInd = new StringIndexer().setInputCol("Sex").setOutputCol("SexIndex") +{% endhighlight %} +
Note that we did not call the methods fit() or transform() here; that will be taken care of by the Pipeline. The Pipeline will execute each stage and pass the result of the current stage to the next. If a stage is a Transformer, the Pipeline will call transform() on it, or if it is an Estimator, the Pipeline will first call fit() and then transform(). But if the Estimator is the last stage in a pipeline, then the transform() won't be called. +
+
+#### **Binning / Bucketing**
+
+During Binning/Bucketing, a column with continuous values is converted into buckets. We define the start and end value of each bucket while creating the Bucketizer - *which is a Transformer*. We are going to bucketize the column 'Fare'.
+
+{% highlight scala %}
+//define the buckets/splits
+val fareSplits = Array(0.0,10.0,20.0,30.0,40.0,Double.PositiveInfinity)
+val fareBucketize = new Bucketizer().setInputCol("Fare").setOutputCol("FareBucketed").setSplits(fareSplits)
+fareBucketize.transform(train_data).select("Fare","FareBucketed").show(10)
++-------+------------+
+|   Fare|FareBucketed|
++-------+------------+
+|   7.25|         0.0|
+|71.2833|         4.0|
+|  7.925|         0.0|
+|   53.1|         4.0|
+|   8.05|         0.0|
+| 8.4583|         0.0|
+|51.8625|         4.0|
+| 21.075|         2.0|
+|11.1333|         1.0|
+|30.0708|         3.0|
++-------+------------+
+only showing top 10 rows
+{% endhighlight %}
+
+#### **Vector Assembler**
+VectorAssembler is used for assembling features into a vector. We will pass all the columns that we are going to use for the prediction to the VectorAssembler and it will create a new vector column.
+{% highlight scala %}
+val assembler = new VectorAssembler().setInputCols(Array("SexIndex", "Age", "TitleIndex", "Pclass", "Family","FareBucketed")).setOutputCol("features_temp")
+{% endhighlight %}
+
+#### **Normalizer**
+
+Next we will normalize or standardize the data using the transformer - `Normalizer`. The normalizer will take the column created by the VectorAssembler, normalize it and produce a new column.
+{% highlight scala %}
+val normalizer = new Normalizer().setInputCol("features_temp").setOutputCol("features")
+{% endhighlight %}
+
+### **Building and Evaluating Model**
+
+
+We will be building our model using the **LogisticRegression** algorithm, which is used for classification. The variable that is being classified is called the dependent variable, and the other variables which decide the value of the dependent variable are called independent variables.
+
+In Logistic regression, based on the values of the independent variables, it predicts the probability that the dependent variable takes one of its categorical values (classes). In our example there are two possible classes, 0 or 1. To create a LogisticRegression component,
+
+{% highlight scala %}
+val lr = new LogisticRegression().setMaxIter(10)
+lr.setLabelCol("Survived")
+{% endhighlight %}
+
+#### **Create Pipeline**
+Using all the components we defined till now, create a pipeline object. As already mentioned, a pipeline has a set of stages and each component we add is a stage in the pipeline. The Pipeline will execute each stage one after another, first executing **fit** (if the stage is an Estimator) and then passing the result of **transform** on to the next stage.
+{% highlight scala %}
+val pipeline = new Pipeline().setStages(Array(sexInd, titleInd, fareBucketize, assembler, normalizer,lr))
+{% endhighlight %}
+
+#### **Training set & Test set**
+To evaluate the model, we will split our data into two - a training set (80%) and a test set (20%). We will build our model using the training set and evaluate it using the test set. We will use the area under the ROC curve to determine how good the model is. To split the input data,
+{% highlight scala %}
+val splits = train_data.randomSplit(Array(0.8, 0.2), seed = 11L)
+val train = splits(0).cache()
+val test = splits(1).cache()
+{% endhighlight %}
+We will now use the pipeline to fit our training data. The result of fitting the pipeline on our training data is a PipelineModel object, which can be used to do prediction on the test data.
+{% highlight scala %}
+var model = pipeline.fit(train)
+model: org.apache.spark.ml.PipelineModel = pipeline_8a2ae1c4a077
+var result = model.transform(test)
+{% endhighlight %}
+
+
 Note that the model object here is an instance of PipelineModel, not LogisticRegression. This is because LogisticRegression is only a component in our PipelineModel. Whenever a prediction is done for a data set, the data set has to go through all the transformations done by the other components in the Pipeline before it can be used by the LogisticRegression component for prediction.
+
+
+To evaluate how well the model did, select the columns 'prediction' and 'Survived' from `result`, create an RDD of [(Double, Double)] and pass it on to BinaryClassificationMetrics.
+{% highlight scala %}
+result = result.select("prediction","Survived")
+val predictionAndLabels = result.map { row =>
+    (row.get(0).asInstanceOf[Double],row.get(1).asInstanceOf[Double])
+  }
+val metrics = new BinaryClassificationMetrics(predictionAndLabels)
+println("Area under ROC = " + metrics.areaUnderROC())
+Area under ROC = 0.7757266300078556
+{% endhighlight %}
+
+Which is not bad; check this [link](http://gim.unmc.edu/dxtests/roc3.htm) to read more about how to evaluate the model based on the value of the area under the ROC curve.
+
+The prediction that we did now was on our input data, where we knew the actual classification. The reason why we split the data into train and test sets is that we needed to compare the actual result with the predicted result for evaluating the model. Now we will use the entire input data to train the model again.
+{% highlight scala %}
+model = pipeline.fit(train_data)
+{% endhighlight %}
+
+### **Doing the Prediction**
+
+
+Download [test.csv](https://www.kaggle.com/c/titanic/download/test.csv) from Kaggle and put it in your HDFS. The test data (submission data) has to go through all the loading and pre-processing steps done on the training data, with the additional requirement of adding the column 'Survived', because test.csv does not contain the column 'Survived'. Loading and pre-processing of the test data is done using the below code:
+{% highlight scala %}
+var submission_data = load("/kaggle/titanic/test.csv",
+    sqlContext,
+    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch", "Ticket", "Fare", "Cabin", "Embarked").cache()
+avgAge = submission_data.select(mean("Age")).first()(0).asInstanceOf[Double]
+submission_data = submission_data.na.fill(avgAge, Seq("Age"))
+
+submission_data = submission_data.withColumn("Sex", addChild(submission_data("Sex"), submission_data("Age")))
+submission_data = submission_data.withColumn("Title", findTitle(submission_data("Name")))
+submission_data = submission_data.withColumn("Pclass", toDouble(submission_data("Pclass")))
+submission_data = submission_data.withColumn("Family", withFamily(submission_data("SibSp"), submission_data("Parch")))
+
+//add a dummy column `Survived` (filled with 0.0) since test.csv does not have it
+val getZero = sqlContext.udf.register("getZero", ((n: Int) => { 0.0 }))
+submission_data = submission_data.withColumn("Survived", getZero(submission_data("PassengerId")))
+{% endhighlight %}
+Use the PipelineModel object created during model building to do the prediction.
+{% highlight scala %}
+result = model.transform(submission_data)
+{% endhighlight %}
+
+Let us now take a look at what our model predicted for the first three passengers in the test data
+{% highlight scala %}
+result.select("PassengerId","prediction").show(3)
++-----------+----------+
+|PassengerId|prediction|
++-----------+----------+
+|        892|       0.0|
+|        893|       1.0|
+|        894|       0.0|
++-----------+----------+
+only showing top 3 rows
+
+{% endhighlight %}
+The model predicted that passengers with ID 892 and 894 will not survive and Passenger 893 will survive.
+
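+The original post stops at inspecting the predictions, but if you want to write them out for a Kaggle submission you could use the same spark-csv package (a sketch, not from the original code - the output path is illustrative and Kaggle expects integer labels):
+{% highlight scala %}
+//select the id and the predicted label, casting the prediction to an integer
+result.selectExpr("PassengerId", "cast(prediction as int) as Survived")
+  .write
+  .format("com.databricks.spark.csv")
+  .option("header", "true")
+  .save("/kaggle/titanic/submission")
+{% endhighlight %}
+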
Note : Received a score of 0.77512 on submitting this to Kaggle.
+ +This concludes the post and I hope it was helpful. Thanks for reading. +
Continue reading \ No newline at end of file diff --git a/_posts/2016-01-10-spark_getting_started.markdown b/_posts/2016-01-10-spark_getting_started.markdown new file mode 100755 index 0000000..c0749d9 --- /dev/null +++ b/_posts/2016-01-10-spark_getting_started.markdown @@ -0,0 +1,80 @@ +--- +layout: post +comments: true +title: Getting Started With Apache Spark 1.6 +date: 2016-01-10 +PAGE_IDENTIFIER: spark_getting_started +permalink: /spark_start.html +tags: ApacheSpark Hadoop BigData +description: This is quick guide on how to Setup Apache Spark 1.6 with YARN. +--- +
+ +
+
+In this short blog, I will explain how to set up Apache Spark 1.6 with YARN. I assume Hadoop is already installed.
+
+# 1. Download Apache Spark
+Go to [spark.apache.org/downloads.html](http://spark.apache.org/downloads.html) and choose the Spark release you want to download (1.6.0 is the default currently). Then, under the package type, choose the package corresponding to your Hadoop version. Mine is 2.6, hence I chose *Pre-built for Hadoop 2.6 and later*.
+
+ +
+If you want, you can download the Source Code, navigate to the base folder and build it based on your Hadoop version using the below command.
+{% highlight sh %}
+mvn -Pyarn -Phadoop-2.6 -Dhadoop.version=2.6.0 -DskipTests clean package
+{% endhighlight %}
+
+# 2. Set HADOOP_CONF_DIR
+To run Spark in YARN mode, we need to set the HADOOP_CONF_DIR environment variable.
+{% highlight sh %}
+$ vi ~/.bash_profile
+export HADOOP_HOME=/Users/vishnu/hadoop-2.6.0
+export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
+$ source ~/.bash_profile
+{% endhighlight %}
+
+
+# 3. Start the Master
+Run start-master.sh, which is located in the sbin folder of Spark. This will start the Spark master, and its web UI will be available on port 8080 -
+check [http://localhost:8080](http://localhost:8080) or http://yourip:8080
+
+{% highlight sh %}
+sbin/start-master.sh
+{% endhighlight %}
+
+From the Spark UI, copy the Master URL, which in my case is *spark://Vishnus-MacBook-Pro.local:7077*
+
+ +
+
+# 4. Start the Slave
+Run start-slave.sh, which is located in the sbin folder. Pass the Master URL copied in the previous step as an argument to the start-slave.sh script. This will start the Slave/Worker.
+{% highlight sh %}
+sbin/start-slave.sh spark://Vishnus-MacBook-Pro.local:7077
+{% endhighlight %}
+Go back to your Spark UI and you can see that *Alive Workers* is now 1 and the worker details are displayed under *Workers*.
+
+ +
+
+# 5. Start Spark Shell
+Run the spark-shell script located in the bin folder of Spark with *'--master'* set to *'yarn'*.
+{% highlight sh %}
+bin/spark-shell --master yarn
+Welcome to
+      ____              __
+     / __/__  ___ _____/ /__
+    _\ \/ _ \/ _ `/ __/ '_/
+   /___/ .__/\_,_/_/ /_/\_\   version 1.6.0
+      /_/
+
+Using Scala version 2.10.5 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_60)
+Type in expressions to have them evaluated.
+Type :help for more information.
+Spark context available as sc.
+SQL context available as sqlContext.
+scala>
+{% endhighlight %}
+
+Spark setup is complete. Thanks for reading.
+
Continue reading diff --git a/_posts/2016-02-04-spark_rdd.markdown b/_posts/2016-02-04-spark_rdd.markdown new file mode 100755 index 0000000..32b2069 --- /dev/null +++ b/_posts/2016-02-04-spark_rdd.markdown @@ -0,0 +1,60 @@ +--- +layout: post +comments: true +title: Spark RDDs Simplified +date: 2016-02-04 +PAGE_IDENTIFIER: spark_rdd +permalink: /spark_rdd.html +tags: ApacheSpark Scala BigData Hadoop +description: This blog tries to explain what a Spark RDD is and how it is used for distributing data across the cluster. I have tried to use simple illustrations for better understanding. +--- +
+ +
+
+Spark RDDs are a very simple and at the same time very important concept in Apache Spark. Most of you might already know the full form of RDD - it is **Resilient Distributed Datasets**. ***Resilient*** because RDDs are immutable *(can't be modified once created)* and fault tolerant, ***Distributed*** because they are distributed across the cluster, and ***Dataset*** because they hold data.
+
+So why RDD? Apache Spark lets you treat your input files almost like any other variable, which you cannot do in Hadoop MapReduce. RDDs are automatically distributed across the network by means of Partitions.
+
+### **Partitions**
+
+ +
+
+RDDs are divided into smaller chunks called Partitions, and when you execute some action, a task is launched per partition. So it means that the more partitions there are, the more the parallelism. Spark automatically decides the number of partitions that an RDD has to be divided into, but you can also specify the number of partitions when creating an RDD. These partitions of an RDD are distributed across all the nodes in the network.
+
+### **Creating an RDD**
+
+Creating an RDD is easy, it can be created either from an external file or by parallelizing collections in your driver. For example,
+{% highlight scala %}
+val rdd = sc.textFile("/some_file",3)
+val lines = sc.parallelize(List("this is","an example"))
+{% endhighlight %}
+The first line creates an RDD from an external file, and the second line creates an RDD from a list of Strings. *Note that the argument '3' in the method call sc.textFile() specifies the number of partitions that have to be created. If you don't want to specify the number of partitions, then you can simply call sc.textFile("some_file").*
+
+### **Actions/Transformations**
+There are two types of operations that you can perform on an RDD - *Transformations and Actions*. A **Transformation** applies some function on an RDD and creates a new RDD; it does not modify the RDD that you apply the function on. *(Remember that RDDs are resilient/immutable).* Also, the new RDD keeps a pointer to its parent RDD.
+
+ +
+
+When you call a transformation, Spark does not execute it immediately; instead it creates a **lineage**. A lineage keeps track of all the transformations that have to be applied on that RDD, including where it has to read the data from. For example, consider the example below
+
+ +
+{% highlight scala %}
+val rdd = sc.textFile("spam.txt")
+val filtered = rdd.filter(line => line.contains("money"))
+filtered.count()
+{% endhighlight %}
+*sc.textFile() and rdd.filter()* do not get executed immediately; they will only get executed once you call an *Action* on the RDD - here filtered.count(). An **Action** is used to either save the result to some location or to display it. You can also print the RDD lineage information by using the command `filtered.toDebugString` *(filtered is the RDD here)*.
+
RDDs can also be thought of as a set of instructions that has to be executed, the first instruction being the load instruction.
+
+### **Caching**
+You can cache an RDD in memory by calling `rdd.cache()`. When you cache an RDD, its partitions are loaded into the memory of the nodes that hold it.
+
+ +
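+A minimal sketch of caching in action (the file path and filter condition are just for illustration):
+{% highlight scala %}
+val errors = sc.textFile("/some_file").filter(line => line.contains("error")).cache()
+errors.count()  // first action: the lineage is computed and the partitions are cached
+errors.count()  // later actions reuse the cached partitions instead of re-reading the file
+{% endhighlight %}
+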
+
+Caching can improve the performance of your application to a great extent. In the previous section you saw that when an action is performed on an RDD, it executes its entire lineage. Now imagine you are going to perform an action multiple times on the same RDD which has a long lineage; this will cause an increase in execution time. Caching stores the computed result of the RDD in memory, thereby eliminating the need to recompute it every time. You can think of caching as if it were breaking the lineage, but the RDD does remember the lineage so that it can be recomputed in case of a node failure.
+
+This concludes the basics of RDD. If you would like to read more, [Part2](spark_rdd_part2.html) talks about Persistence, Broadcast variables and Accumulators. Thanks for reading!
+
Continue reading diff --git a/_posts/2016-02-28-spark_rdd_2.markdown b/_posts/2016-02-28-spark_rdd_2.markdown new file mode 100755 index 0000000..966599c --- /dev/null +++ b/_posts/2016-02-28-spark_rdd_2.markdown @@ -0,0 +1,88 @@ +--- +layout: post +comments: true +title: Spark RDDs Simplified - Part 2 +date: 2016-02-28 +PAGE_IDENTIFIER: spark_rdd_part2 +permalink: /spark_rdd_part2.html +tags: ApacheSpark Scala BigData Hadoop +description: This is part 2 of the blog Spark RDDs Simplified. In this part, I am trying to cover the topics Persistence, Broadcast variables and Accumulators. +--- +
+ +
+
+This is Part 2 of the blog **Spark RDDs Simplified.** In this part, I am trying to cover the topics **Persistence**, **Broadcast** variables and **Accumulators**. You can read the first part from [here](spark_rdd) where I talked about Partitions, Actions/Transformations and Caching.
+
+### **Persistence**
+In my previous [blog](spark_rdd), I talked about caching, which can be used to avoid recomputation of the RDD lineage by saving its contents in memory. If there is not enough memory in the cluster, you can tell Spark to also use the disk for saving the RDD by using the method *persist()*.
+{% highlight scala %}
+rdd.persist(StorageLevel.MEMORY_AND_DISK)
+{% endhighlight %}
+ In fact, Caching is a type of persistence with the StorageLevel *MEMORY_ONLY*. If you use MEMORY_ONLY as the *Storage Level* and there is not enough memory in your cluster to hold the entire RDD, then some partitions of the RDD cannot be stored in memory and will have to be recomputed every time they are needed. If you don't want this to happen, you can use the StorageLevel
+*MEMORY_AND_DISK*, in which, if an RDD does not fit in memory, the partitions that do not fit are saved to disk.
+ +
+
+In the above example, the RDD has 3 partitions and there are 2 nodes in the cluster. Also, the memory available in the cluster can hold only 2 out of the 3 partitions of the RDD. Here, partitions 1 and 2 can be saved in memory, whereas partition 3 will be saved to disk. Another StorageLevel, *DISK_ONLY*, stores all the partitions on the disk.
+
In the above methods, the RDDs are not serialized before being saved to memory. There are two other StorageLevels - MEMORY_ONLY_SER and MEMORY_AND_DISK_SER - which will store the RDDs as serialized Java objects.
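+For example, using the serialized in-memory level looks like this (a small sketch; `rdd` is the same RDD from the earlier persist example). Serialization saves space at the cost of extra CPU to deserialize the partitions when they are used:
+{% highlight scala %}
+import org.apache.spark.storage.StorageLevel
+
+//keep the partitions in memory, but as serialized Java objects (more compact, slower to access)
+rdd.persist(StorageLevel.MEMORY_ONLY_SER)
+{% endhighlight %}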
+
+There are a few more StorageLevels which I did not mention here; you can find more details about them [here](http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence)
+
+### **Broadcast variables**
+A broadcast variable is a type of shared variable used for broadcasting data across the cluster. Hadoop MapReduce users can relate this to the distributed cache. Let us first understand why we need a broadcast variable. Take a look at the below example, where *names* is joined with *addresses*.
+{% highlight scala %}
+val names = sc.textFile("/names").map(line => (line.split(",")(3),line))
+val addresses = sc.textFile("/address").map(line=>(line.split(",")(0),line))
+names.join(addresses)
+{% endhighlight %}
+Here, both names and addresses will be shuffled over the network for performing the join, which is not efficient since any data transfer over the network will reduce the execution speed.
+ +
+Another approach is, if one of the RDDs is small in size, we can choose to send it along with each task. Consider the below example +{% highlight scala %} +val names = sc.textFile("/names").map(line => (line.split(",")(3),line)) +val addresses = sc.textFile("/address").map(line=>(line.split(",")(0),line)) +val addressesMap = addresses.collect().toMap +val joined = names.map(v=>(v._2,(addressesMap(v._1)))) +{% endhighlight %} +
+ +
+This is also inefficient since we are sending sizable amount of data over the network for each task. So how do we overcome this problem? By means of **broadcast** variables. +{% highlight scala %} +val names = sc.textFile("/names").map(line => (line.split(",")(3),line)) +val addresses = sc.textFile("/address").map(line=>(line.split(",")(0),line)) +val addressesMap = addresses.collect().toMap +val broadcast = sc.broadcast(addressesMap) +val joined = names.map(v=>(v._2,(broadcast.value(v._1)))) +{% endhighlight %} +If a variable is broadcasted, it will be sent to each node only once, thereby reducing network traffic. +
Broadcast variables are read-only, broadcast.value is an immutable object
+
+Spark uses a BitTorrent-like protocol for sending the broadcast variable across the cluster, i.e., for each variable that has to be broadcasted, initially the driver will act as the only source. The data will be split into blocks at the driver and each leecher *(receiver)* will start fetching the blocks to its local directory. Once a block is completely received, that leecher will also act as a source of this block for the rest of the leechers *(this reduces the load on the machine running the driver)*. This is continued for the rest of the blocks. So initially only the driver is the source, and later on the number of sources increases - because of this, the rate at which the blocks are fetched by a node increases over time.
+
+ +
+
+### **Accumulators**
+Accumulators, as the name suggests, accumulate data during execution. This is similar to *Counters* in Hadoop MapReduce. An accumulator is initialized at the driver and is then modified *(added to)* by each executor. Finally, all these values are aggregated back at the driver.
+{% highlight scala %}
+val names = sc.textFile("/names").map(line => (line.split(",")(3),line))
+val addresses = sc.textFile("/address").map(line=>(line.split(",")(0),line))
+val addressesMap = addresses.collect().toMap
+val broadcast = sc.broadcast(addressesMap)
+val joined = names.map(v=>(v._2,(broadcast.value(v._1))))
+
+val accum = sc.accumulator(0,"india_counter")
+joined.foreach(v=> if (v._2.contains("india")) accum += 1)
+
+//we cannot do the below operations on accumulators of type Int
+//joined.foreach(v=> if (v._2.contains("india")) accum -= 1)
+//joined.foreach(v=> if (v._2.contains("india")) accum *= 1)
+//error: value *= is not a member of org.apache.spark.Accumulator[Int]
+{% endhighlight %}
+
+That concludes part 2 of the blog **Spark RDDs Simplified**, thanks for reading. Please leave a comment for any clarifications or queries.
+
Continue reading + diff --git a/_posts/2016-03-09-flink_start.markdown b/_posts/2016-03-09-flink_start.markdown new file mode 100755 index 0000000..8ea47f9 --- /dev/null +++ b/_posts/2016-03-09-flink_start.markdown @@ -0,0 +1,55 @@ +--- +layout: post +comments: true +title: Getting Started With Apache Flink 1.0 +date: 2016-03-09 +PAGE_IDENTIFIER: flink_start +permalink: /flink_start.html +tags: ApacheFlink BigData Hadoop Scala +description: This is a quick guide on how to Setup Apache Flink 1.0 +--- +
+ +
+
+**Apache Flink** is the new star in town. It is stealing the thunder from Apache Spark (at least on the streaming side), which has been creating buzz for some time now. This is because Spark streaming is built on top of RDDs, which are essentially collections, not streams. So now would be the right time to try your hands at Flink, even more so since Flink 1.0 was released last week.
+
+In this short blog, I will explain how to set up Flink on your system.
+
+# 1. Download Apache Flink
+Go to [https://flink.apache.org/downloads.html](https://flink.apache.org/downloads.html). This page will show the latest stable release of Flink that is available for download (1.0 is the latest currently). Under the binaries, click on the download corresponding to your Hadoop version and Scala version. I have Hadoop 2.6.0 and Scala 2.11.
+
+ +
+If you don't have Scala installed, you can install it by following the instructions from [here](http://www.scala-lang.org/download/install.html).
If you want to know your current Scala version, you can find it by running the command below
+{% highlight sh %}
+$scala -version
+Scala code runner version 2.11.7 -- Copyright 2002-2013, LAMP/EPFL
+{% endhighlight %}
+
+# 2. Start Flink
+Start the Flink jobmanager by running the command below from the root folder of your Flink installation
+{% highlight sh %}
+bin/start-local.sh
+Starting jobmanager daemon on host Vishnus-MacBook-Pro.local.
+{% endhighlight %}
+
+# 3. Flink dashboard
+Flink has a pretty good UI where you can see details of your job, how many slots are present, etc. You can access the Flink UI at [localhost:8081](localhost:8081)
*Note: The Spark UI also uses the same port, so make sure you don't have Spark running.*
+
+ +
+
+
# 4. Flink Shell
+Flink comes with a Scala shell, which can be started by running the command below from the Flink base folder
+{% highlight sh %}
+$bin/start-scala-shell.sh local
+{% endhighlight %}
+
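+Once the shell is up, you can run a small job to verify the setup. A minimal sketch (assuming the shell pre-binds the batch environment as `benv` - check the shell's startup banner, since some versions expose it as `env` instead):
+{% highlight scala %}
+//word count over a tiny in-memory dataset, run directly from the Flink Scala shell
+benv.fromElements("to be or not to be")
+  .flatMap { _.toLowerCase.split("\\s+") }
+  .map { (_, 1) }
+  .groupBy(0)
+  .sum(1)
+  .print()
+{% endhighlight %}
+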
+ +
+
+
*Note: start-scala-shell.sh creates a mini cluster, so you don't have to start a separate jobmanager for the Scala shell to work in local mode.*
+
+Flink setup is complete. In the coming blogs, I will write more about how to build various streaming applications using Apache Flink. Thanks for reading!
+
Continue reading diff --git a/_posts/2016-03-12-flink_streaming.markdown b/_posts/2016-03-12-flink_streaming.markdown new file mode 100755 index 0000000..93adb1a --- /dev/null +++ b/_posts/2016-03-12-flink_streaming.markdown @@ -0,0 +1,85 @@ +--- +layout: post +comments: true +title: Flink Streaming - Tumbling and Sliding Windows +date: 2016-03-12 +PAGE_IDENTIFIER: flink_streaming +permalink: /flink_streaming.html +tags: ApacheFlink BigData Hadoop Scala +description: This article explains about the two types of Windows in Flink - Sliding windows and Tumbling windows +--- +
+ +
+Flink has two types of Windows - **Tumbling Window** and **Sliding Window**. The main difference between these windows is that Tumbling windows are non-overlapping, whereas Sliding windows **can be** overlapping.
+In this article, I will try to explain these two windows and also show how to write a Scala program for each of them. The code used in this blog is also available in my [Github](https://github.com/soniclavier/hadoop_datascience/tree/master/flink/src/main/scala/com/vishnu/flink/streaming)
+
+# **Need for a Window**
+In the case of streaming applications, the data is continuous and therefore we can't wait for all the data to arrive before starting the processing. Of course, we can process each incoming event as it comes and move on to the next one, but in some cases we will need to do some kind of aggregation on the incoming data - e.g., how many users clicked a link on your web page over the last 10 minutes. In such cases, we have to define a window and do the processing for the data within the window.
+
+# **Tumbling Window**
+A Tumbling window tumbles over the stream of data. This type of window is non-overlapping - i.e., the events/data in one window will not overlap with or be present in other windows.
+
+ +
+You can configure the window to tumble based on count - e.g., for every 5 elements, or based on time - e.g., every 10 seconds.
+
+# **Sliding Window**
+A sliding window, as opposed to a tumbling window, slides over the stream of data. Because of this, a sliding window can be overlapping, and it gives a smoother aggregation over the incoming stream of data - since you are not jumping from one set of input to the next; rather, you are sliding over the incoming stream of data.
+
+ +
+Similar to a Tumbling window, you can configure a Sliding window to slide based on time or on the count of events.
+
+# **Scala Code - Tumbling Window**
+The example below shows a word count program that listens to a socket and counts the number of times each word is received within a window. The window here is based on count and it tumbles for every 5 items.
+{% highlight scala %}
+object CountTumblingWindow {
+  def main(args: Array[String]) {
+    val sev = StreamExecutionEnvironment.getExecutionEnvironment
+    val socTextStream = sev.socketTextStream("localhost",4444)  //read from socket
+    val counts = socTextStream.flatMap{_.split("\\s")}  //split sentence into words
+      .map { (_, 1) } //emit 1 for each word
+      .keyBy(0) //group based on word
+      .countWindow(5)  //window for every 5 items in the group
+      .sum(1)
+      .setParallelism(4); //setting parallelism (optional)
+    counts.print()
+    sev.execute()
+  }
+}
+{% endhighlight %}
+In the above example, the window is triggered for every 5 items. Since we are doing a keyBy, each window will contain only words of the same group.
+
+{% highlight sh %}
+e.g.,
+ if stream is : one two one two one two one two one
+ window1 = {one,one,one,one,one}
+ window2 = {two,two,two,two}
+ window1 will be triggered but not window2; it needs one more 'two' to reach the count 5.
+{% endhighlight %}
+
+# **Scala Code - Sliding Window**
+In the example below, the window slides every 10 seconds and the width of the window is 15 seconds of data.
+Therefore, there is an overlap between the windows.
+{% highlight scala %}
+object TimeSlidingWindow {
+  def main(args: Array[String]) {
+    val sev = StreamExecutionEnvironment.getExecutionEnvironment
+    val socTextStream = sev.socketTextStream("localhost",4444)
+    val counts = socTextStream.flatMap{_.split("\\s")}
+      .map { (_, 1) }
+      .keyBy(0)
+      .timeWindow(Time.seconds(15),Time.seconds(10))
+      .sum(1).setParallelism(4);
+
+    counts.print()
+    sev.execute()
+  }
+}
+{% endhighlight %}
+
+That covers the basics of the types of windows in Flink. There are various more complex windowing operations that we can do in Flink - e.g., we can choose to slide the window based on time but trigger the execution based on the count of items, and also choose to keep a few of the items in the current window for the next window's processing. I will try to cover these advanced topics in the upcoming blogs.
+
+Thanks for reading!
+
Continue reading
\ No newline at end of file
diff --git a/_posts/2016-03-12-flink_trigger_evictor.markdown b/_posts/2016-03-12-flink_trigger_evictor.markdown
new file mode 100755
index 0000000..56a07f7
--- /dev/null
+++ b/_posts/2016-03-12-flink_trigger_evictor.markdown
@@ -0,0 +1,77 @@
+---
+layout: post
+comments: true
+title: Flink Streaming - Triggers and Evictors
+date: 2016-03-14
+PAGE_IDENTIFIER: flink_trigger_evictor
+permalink: /flink_trigger_evictor.html
+tags: ApacheFlink BigData Hadoop Scala
+description: This article explains the concepts of Triggers and Evictors in Flink Streaming and how to implement them using Scala.
+---
+
+ +
+
+
In the last [blog](flink_streaming), we looked at the two basic types of windows in Flink - Sliding and Tumbling windows. In this blog, I will explain two important concepts that can be used in Flink - **Triggers** and **Evictors**.
+
+# **Triggers**
+Assume we have a sliding window *(of width 15 seconds, which slides every 10 seconds)* and we are collecting items in the window during streaming. A **trigger** can be used to tell Flink when to evaluate the function on the items in the window. For example, if you want the function to be evaluated on every 5 items that you receive within the window we defined above, you can use `trigger(CountTrigger.of(5))`.
+
+{% highlight scala %}
+val counts = socTextStream.flatMap{_.split("\\s")}
+  .map { (_, 1) }
+  .keyBy(0)
+  .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10)))
+  .trigger(CountTrigger.of(5))
+  .sum(1)
+{% endhighlight %}
+
+Let us consider a few scenarios to understand triggers better.
+ +
Note 1: I am assuming that we are receiving the same word in the stream. This is done to keep the explanation simple. Since there is a keyBy(0) after the map, each word will belong to a separate logical window, grouped by the word.
+ +
Note 2: The sliding window used in this example is based on Processing time. Processing time is the time at which an event is processed in the system, as opposed to EventTime, which is the time at which the event was created. I will explain these concepts in the upcoming blogs.
+
**Update**: Read about the concepts of ProcessingTime and EventTime in this [blog](flink_eventtime.html).
+**scenario 1:** +
+ +
+
This is the basic case, where window 1 received 5 items within its window-width of 15 seconds. The last two items overlap with window 2, hence they will be present in both windows 1 and 2. But window 2 has only 2 items, which is less than the trigger count of **5**, whereas window 1 received 5 items within its window-width, and hence the function `sum()` will be triggered.
+
+**scenario 2:**
+ +
+
In this case, the items arrived in such a way that both windows 1 and 2 received 5 items in the region where they overlap. Hence, both windows will be triggered at the same time.
+
+**scenario 3:**
+ +
+
This is similar to scenario 2, except that window 1 received 10 items, 5 of which overlap with window 2. What do you think will happen in such a scenario?
+
+
+# **Evictors**
+An evictor is used to remove some items from the window before the window function is called. Let us add an evictor to our trigger example.
+***Update:*** After [FLIP-4](https://cwiki.apache.org/confluence/display/FLINK/FLIP-4+%3A+Enhance+Window+Evictor), an evictor can evict elements before or after the window function is called. + +{% highlight scala %} +val counts = socTextStream.flatMap{_.split("\\s")} + .map { (_, 1) } + .keyBy(0) + .window(SlidingProcessingTimeWindows.of(Time.seconds(15),Time.seconds(10))) + .trigger(CountTrigger.of(5)) + .evictor(CountEvictor.of(3)) //evictor + .sum(1) +{% endhighlight %} + +Here, the evictor is CountEvictor of 3, i.e., it will evict all the items except 3 from the window once the trigger is fired. e.g., +Consider the **scenario 1** of trigger example and assume we added a CountEvictor of 3 to it. +
+ +
+
+
The function sum will be applied only to the 3 items that are left in the window after eviction.
+
+That concludes this post; you can find the code used in this article in my [GitHub](https://github.com/soniclavier/hadoop_datascience/tree/master/flink/src/main/scala/com/vishnu/flink/streaming). Thanks for reading!
+
Continue reading \ No newline at end of file diff --git a/_posts/2016-05-18-spark_session.markdown b/_posts/2016-05-18-spark_session.markdown new file mode 100755 index 0000000..d2adc8b --- /dev/null +++ b/_posts/2016-05-18-spark_session.markdown @@ -0,0 +1,181 @@ +--- +layout: post +comments: true +title: Experiment with Spark 2.0 - Session +date: 2016-03-14 +PAGE_IDENTIFIER: spark_session +permalink: /spark_session.html +tags: ApacheSpark BigData Hadoop Scala +description: This blog post talks about how to create a SparkSession object in Spark 2.0 and how to use it for registering Tables, creating DataSets, DataFrames, UDFs and Catalogs +--- +
+ +
+**SparkSession** is the new entry point from Spark 2.0. Prior to 2.0, we had only SparkContext and SQLContext, and we would also create a StreamingContext (if using streaming).
+It looks like SparkSession is part of Spark's plan of unifying the APIs from Spark 2.0 onwards.
+
+### **start spark shell**
+Run the following commands from your spark base folder.
+{% highlight sh %}
+sbin/start-master.sh
+sbin/start-slave.sh spark://<master-hostname>:7077
+bin/spark-shell --master spark://<master-hostname>:7077
+{% endhighlight %}
+
+### **create spark session**
+A SparkSession object will be available by default in the spark shell as "spark". But when you build your spark project outside the shell, you can create a session as follows
+{% highlight scala %}
+import org.apache.spark.sql.SparkSession
+val spark = SparkSession.
+  builder().
+  master("spark://Vishnus-MacBook-Pro.local:7077").
+  appName("ExperimentWithSession").
+  getOrCreate()
+{% endhighlight %}
+If you run the above command in the spark shell, you will see this warning
+{% highlight scala %}
+WARN SparkSession$Builder: Using an existing SparkSession; some configuration may not take effect.
+spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1c571162
+{% endhighlight %}
+This is because there is already an instance of SparkSession in scope, which is also evident from the builder's getOrCreate() method.
+The getOrCreate method of the SparkSession builder does the following:
+
+1. ***Create a SparkConf***
+2. ***Get a SparkContext*** (using SparkContext.getOrCreate(sparkConf))
+3. ***Get a SparkSession*** (using SQLContext.getOrCreate(sparkContext).sparkSession)
+
+Once the spark session is created, it can be used to read data from various sources.
+
Note: All the commands used in this blog post can be found here
+{% highlight scala %} +spark.read. //pressed tab here +csv format jdbc json load option options orc parquet schema stream table text +//Load some json file +val df = spark.read.json("/spark_learning/pandainfo.json") +df.show ++--------------------+-----------+---------------+ +| knows|lovesPandas| name| ++--------------------+-----------+---------------+ +| null| true|Sparky The Bear| +| null| null| Holden| +|[WrappedArray(hol...| true|Sparky The Bear| ++--------------------+-----------+---------------+ +{% endhighlight %} +
Note: I am using the dataset from learning-spark github repository.
+Let us now register this DataFrame as a temp table.
+{% highlight scala %}
+df.registerTempTable("pandas")
+warning: there was one deprecation warning; re-run with -deprecation for details
+{% endhighlight %}
+It looks like the `registerTempTable` method is deprecated. Let's check [Dataset.scala](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala#L2692) to figure out which alternative method to use.
+
+ +
+{% highlight scala %} +df.createOrReplaceTempView("pandas") +{% endhighlight %} + +You can also save the dataframe as table in hive metastore using. +{% highlight scala %} +df.write.saveAsTable("pandas") +{% endhighlight %} + +### **spark.table** +You can access the registered table via +{% highlight scala %} +spark.table("pandas") +//also we can run sql queries +//this we used to do using SQLContext in earlier versions +//using sqlContext.sql("query here") +spark.sql("select name from pandas").show ++---------------+ +| name| ++---------------+ +|Sparky The Bear| +| Holden| +|Sparky The Bear| ++---------------+ +{% endhighlight %} + +### **spark.udf** +We can register udf(User Defined Function) using the SparkSession. +{% highlight scala %} +spark.udf.register("addone",(x:Int)=>x+1) +{% endhighlight %} + +### **createDataSet** +This API is similar to how we create an RDD using SparkContext +{% highlight scala %} +scala> val ds = spark.createDataset(List(1,2,3)) //from a List +ds: org.apache.spark.sql.Dataset[Int] = [value: int] + +scala> val rdd = sc.parallelize(List(1,2,3)) +scala> val ds = spark.createDataset(rdd) //from RDD +ds: org.apache.spark.sql.Dataset[Int] = [value: int] +{% endhighlight %} + +### **createDataFrames** +Used for creating DataFrames. We cannot create a Dataframe from our earlier RDD[Int] because createDataFrame requires an `RDD[A <: Product]` - i.e., a class that is subclass of Product. So we will create a DataFrame from an RDD of case class. +{% highlight scala %} +case class Num(x:Int) +val rdd = sc.parallelize(List(Num(1),Num(2),Num(3))) +spark.createDataFrame(rdd).show ++---+ +| x| ++---+ +| 1| +| 2| +| 3| ++---+ +{% endhighlight %} + +Let us look at one more way of creating DataFrame, using Row RDD and Schema +{% highlight scala %} +import org.apache.spark.sql.types.{StructType,StructField,IntegerType}; +import org.apache.spark.sql.Row +val rowRDD = rdd.map(x=>Row(x)) +val schema = StructType(Array(StructField("num", IntegerType, true))) +spark.createDataFrame(rowRDD,schema).show ++---+ +|num| ++---+ +| 1| +| 2| +| 3| ++---+ +{% endhighlight %} + +### **DataFrame to RDD / DataSet to RDD** +A DataFrame or a DataSet can be converted to rdd by calling .rdd +{% highlight scala %} +val ds = spark.createDataset(List(1,2,3)) +val rdd = ds.rdd +{% endhighlight %} + +### **Catalog** +Catalog provides a catalog of information about the databases and tables in the session, also some actions like drop view, cacheTable, clearCache etc + +{% highlight scala %} +spark.catalog.cacheTable("pandas") // caches the table into memory, throws Table or view not found in database exeception if not found. +spark.catalog.uncacheTable("pandas") // to remove table from memory + +spark.catalog.currentDatabase +res4: String = default + +spark.catalog.isCached("pandas") +res24: Boolean = true + +spark.catalog.clearCache + +spark.catalog.listDatabases.take(1) +res29: Array[org.apache.spark.sql.catalog.Database] = Array(Database[name='default', description='Default Hive database', path='hdfs://localhost:9000/Users/vishnu/spark-2.0.0-S +NAPSHOT-bin-hadoop2.6/spark-warehouse']) + +spark.catalog.listTables("default").take(1) +res30: Array[org.apache.spark.sql.catalog.Table] = Array(Table[name='pandas', tableType='TEMPORARY', isTemporary='true']) + +spark.catalog.dropTempView("pandas") //drops the table +{% endhighlight %} + +This concludes my experiments with SparkSession for now. I will try to explore more about the new features in Spark 2.0 and share with you in later posts! +
Continue reading \ No newline at end of file diff --git a/_posts/2016-11-07-flink-eventime.markdown b/_posts/2016-11-07-flink-eventime.markdown new file mode 100755 index 0000000..e59274c --- /dev/null +++ b/_posts/2016-11-07-flink-eventime.markdown @@ -0,0 +1,110 @@ +--- +layout: post +comments: true +title: Flink Event Time Processing and Watermarks +date: 2016-11-07 +PAGE_IDENTIFIER: flink_eventtime_watermark +permalink: /flink_eventtime.html +image: /img/flink_eventtime/blog_header.png +tags: ApacheFlink BigData Hadoop Scala Streaming +description: If you are creating a Realtime streaming application, Event Time processing is one of the features that you will have to use sooner or later. In the blog post, we will see why and how we can enable EventTime processing in ApacheFlink. +--- +
+ +
+If you are building a real-time streaming application, Event Time processing is one of the features that you will have to use sooner or later. Since in most real-world use cases messages arrive out of order, there should be some way through which the system you build understands that messages can arrive late and handles them accordingly. In this blog post, we will see why we need Event Time processing and how we can enable it in ApacheFlink.
+
+**EventTime** is the time at which an event occurred in the real world and **ProcessingTime** is the time at which that event is processed by the Flink system. To understand the importance of Event Time processing, we will first start by building a Processing Time based system and see its drawbacks.
+
+We will create a [SlidingWindow](flink_streaming.html) of size 10 seconds which slides every 5 seconds, and at the end of the window the system will emit the number of messages that were received during that time. Once you understand how EventTime processing works with respect to a SlidingWindow, it will not be difficult to understand how it works for a [TumblingWindow](flink_streaming.html) as well. So let's get started.
+
+
+### **ProcessingTime based system**
+For this example we expect messages to have the format ***value,timestamp*** where value is the message and timestamp is the time at which this message was generated at the source. Since we are now building a Processing Time based system, the code below ignores the timestamp part.
+
+
 It is important to understand that the messages should contain the information on when they were generated; Flink or any other system is not a magic box that can somehow figure this out by itself. Later, we will see that Event Time processing extracts this timestamp information to handle late messages.
+
+{% highlight scala %} +val text = senv.socketTextStream("localhost", 9999) +val counts = text.map {(m: String) => (m.split(",")(0), 1) } + .keyBy(0) + .timeWindow(Time.seconds(10), Time.seconds(5)) + .sum(1) +counts.print +senv.execute("ProcessingTime processing example") +{% endhighlight %} + +#### **Case 1: Messages arrive without delay** +Suppose the source generated three messages of the type **a** at times 13th second, 13th second and 16th second respectively. (Hours and minutes are not important here since the window size is only 10 seconds). +
+ +
+These messages will fall into the windows as follows. The first two messages that were generated at 13th sec will fall into both window1*[5s-15s]* and window2*[10s-20s]* and the third message generated at 16th second will fall into window2*[10s-20s]* and window3*[15s-25s]*. The final counts emitted by each window will be (a,2), (a,3) and (a,1) respectively. +
+ +
+This output can be considered the expected behavior. Now we will look at what happens when one of the messages arrives late into the system.
+
+#### **Case 2: Messages arrive with a delay**
+Now suppose one of the messages (generated at the 13th second) arrived with a delay of 6 seconds (at the 19th second), maybe due to some network congestion. Can you guess which windows this message would fall into?
+
+ +
+The delayed message fell into windows 2 and 3, since 19 is within the ranges *10-20* and *15-25*. It did not cause any problem to the calculation in window2 (because the message was anyway supposed to fall into that window), but it affected the results of window1 and window3. We will now try to fix this problem by using EventTime processing.
+
+### **EventTime based system**
+To enable EventTime processing, we need a timestamp extractor that extracts the event time information from the message. Remember that the messages were of the format *value,timestamp*. The *extractTimestamp* method gets the timestamp part and returns it as a Long. Ignore the *getCurrentWatermark* method for now; we will come back to it later.
+
+{% highlight scala %}
+class TimestampExtractor extends AssignerWithPeriodicWatermarks[String] with Serializable {
+  override def extractTimestamp(e: String, prevElementTimestamp: Long) = {
+    e.split(",")(1).toLong
+  }
+  override def getCurrentWatermark(): Watermark = {
+      new Watermark(System.currentTimeMillis)
+  }
+}
+{% endhighlight %}
+We now need to set this timestamp extractor and also set the **TimeCharacteristic** as EventTime. The rest of the code remains the same as in the ProcessingTime case.
+
+{% highlight scala %}
+senv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
+val text = senv.socketTextStream("localhost", 9999)
+                .assignTimestampsAndWatermarks(new TimestampExtractor)
+val counts = text.map {(m: String) => (m.split(",")(0), 1) }
+      .keyBy(0)
+      .timeWindow(Time.seconds(10), Time.seconds(5))
+      .sum(1)
+counts.print
+senv.execute("EventTime processing example")
+{% endhighlight %}
+The result of running the above code is shown in the diagram below.
+
+ +
+The results look better; windows 2 and 3 now emit the correct result, but window1 is still wrong. Flink did not assign the delayed message to window 3 because it now checked the message's event time and understood that it did not fall in that window. But why didn't it assign the message to window 1? The reason is that by the time the delayed message reached the system (at the 19th second), the evaluation of window 1 had already finished (at the 15th second). Let us now try to fix this issue by using the Watermark.
+
Note that in window 2, the delayed message was still placed at the 19th second, not at the 13th second (its event time). This depiction in the figure was intentional, to indicate that the messages within a window are not sorted according to their event time. (This might change in the future.)
+
## **Watermarks**
+The Watermark is a very important and interesting idea, and I will try to give you a brief overview of it. If you are interested in learning more, you can watch this awesome [talk](https://www.youtube.com/watch?v=3UfZN59Nsk8) from Google and also read this [blog](http://data-artisans.com/how-apache-flink-enables-new-streaming-applications-part-1/) from dataArtisans. A Watermark is essentially a timestamp. When an Operator in Flink receives a watermark, it understands (assumes) that it is not going to see any message older than that timestamp. Hence, a watermark can also be thought of as a way of telling Flink how far along it is in "EventTime".
+
+For the purpose of this example, think of it as a way of telling Flink how delayed a message can be. In the last attempt, we set the watermark to the current system time. It was, therefore, not expecting any delayed messages. We will now set the watermark to **current time - 5 seconds**, which tells Flink to expect messages to be delayed by a maximum of 5 seconds - this is because each window will be evaluated only when the watermark passes through it. Since our watermark is current time - 5 seconds, the first window [5s-15s] will be evaluated only at the 20th second. Similarly, the window [10s-20s] will be evaluated at the 25th second, and so on.
+
+{% highlight scala %}
+override def getCurrentWatermark(): Watermark = {
+      new Watermark(System.currentTimeMillis - 5000)
+  }
+{% endhighlight %}
+
Here we are assuming that the event time is 5 seconds behind the current system time, but that is not always the case. In many cases it is better to hold the max timestamp received so far (which is extracted from the message) and subtract the expected delay from it.
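+As a rough sketch of that alternative (this is not part of the original example - the class name `MaxLagTimestampExtractor` and the 5-second delay are just for illustration), the assigner would track the largest timestamp extracted so far and emit that maximum minus the expected delay as the watermark:
+{% highlight scala %}
+class MaxLagTimestampExtractor extends AssignerWithPeriodicWatermarks[String] with Serializable {
+  val maxDelay = 5000L //maximum delay we expect messages to arrive with, in milliseconds
+  var maxTs = 0L       //largest event time extracted from the messages so far
+  override def extractTimestamp(e: String, prevElementTimestamp: Long) = {
+    val ts = e.split(",")(1).toLong
+    maxTs = Math.max(maxTs, ts)
+    ts
+  }
+  override def getCurrentWatermark(): Watermark = new Watermark(maxTs - maxDelay)
+}
+{% endhighlight %}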
+The result of running the code after changing the watermark to current time - 5 seconds is:
+
+ +
+Finally, we have the correct result; all three windows now emit counts as expected - (a,2), (a,3) and (a,1).
+
+## **Allowed Lateness**
+In our earlier approach where we used "watermark - delay", the window would not fire until the watermark is past window_length + delay. If you want to accommodate late events and want the window to fire on time, you can use **Allowed Lateness**. If allowed lateness is set, Flink will not discard a message unless it is past *window_end_time + allowed lateness*. Once a late message is received, Flink will extract its timestamp and check whether it is within the allowed lateness, and then check whether to FIRE the window or not (as per the Trigger set). Hence, note that a window might fire multiple times in this approach, and you might want to make your sink idempotent - if you need exactly-once processing.
+
+### **Conclusion**
+The importance of real-time stream processing systems has grown lately, and having to deal with delayed messages is part of any such system you build. In this blog post, we saw how late-arriving messages can affect the results of your system and how ApacheFlink's Event Time processing capabilities can be used to solve this. That concludes the post. Thanks for reading!
+
Continue reading \ No newline at end of file diff --git a/_posts/2017-04-23-hello-kafka-streams.markdown b/_posts/2017-04-23-hello-kafka-streams.markdown new file mode 100755 index 0000000..278e9a1 --- /dev/null +++ b/_posts/2017-04-23-hello-kafka-streams.markdown @@ -0,0 +1,87 @@ +--- +layout: post +comments: true +title: Hello Kafka Streams +date: 2017-04-23 +PAGE_IDENTIFIER: hello-kafka-streams +permalink: /hello-kafka-streams.html +image: /img/kafka_streams/header_share.png +tags: Kafka BigData Hadoop Scala Streaming +description: Kafka Streams is a stream processing library on top of Apache Kafka. In this blog we will have a quick look at the basic concepts Kafka Streams and then build a simple Hello Streams application that reads messages (names of people) from a topic and writes “hello name” to another topic +--- +
+ +
+Kafka Streams is a stream processing library on top of Apache Kafka. Even though Kafka Streams might look very similar to [Apache Flink](search.html?query=flink), they are meant for different use cases. The main difference is that Flink is a cluster-based analytics framework, whereas Kafka Streams is a library that can be used to build applications that process messages from Kafka topics. Kafka Streams is tightly integrated with Kafka as its source, which is a design choice, whereas Flink is more general purpose. The advantage of Kafka Streams is that it is lightweight and comes out of the box with Kafka (which is almost always the choice of messaging system in Big Data applications), therefore making it easy to build stream processing applications.
+
+In this blog, we will have a quick look at the basic concepts of Kafka Streams and then build a simple Hello Streams application that reads messages *(names of people)* from a topic and writes "hello *name*" to another topic. All the code used in this blog can be found in my [Github](https://github.com/soniclavier/hadoop_datascience/tree/master/KafkaStreams).
+
+#### **Topology**
+Similar to other stream processing systems, the topology in KafkaStreams defines where to read the data from, how to process it and where to save the results. It has mainly three types of nodes - **Source, Processor and Sink** - connected by edges called **Streams**.
+
+ +
+#### **KStreams and KTables**
+KStreams and KTables are the two main abstractions that represent a stream of messages, which are (key, value) pairs. A KTable can be thought of as a KStream with only the latest value for each key, and a KStream can be thought of as a stream of changes (a changelog) that happen to a KTable.
+
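+To make the distinction concrete, here is a small sketch (the topic name *user-regions*, its store name and the records are made up for illustration; `kstreamBuilder` is the KStreamBuilder that we create later in this post):
+{% highlight scala %}
+//suppose the topic "user-regions" received the records (alice,asia), (bob,europe), (alice,europe)
+val regionStream: KStream[String, String] = kstreamBuilder.stream("user-regions")
+//as a KStream, all three records are kept: (alice,asia), (bob,europe), (alice,europe)
+val regionTable: KTable[String, String] = kstreamBuilder.table("user-regions", "user-regions-store")
+//as a KTable, only the latest value per key remains: (alice,europe), (bob,europe)
+{% endhighlight %}
+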
+ +
+ +### **Hello Kafka Streams** +Before we start writing the code, there are a few very easy environment setup steps to be done, which are - start Zookeeper, Kafka Broker and create the Topics. Run the below commands from the base folder of Kafka, which in my case is **~/kafka_2.11-0.10.2.0**. + +{% highlight sh %} +kafka_2.11-0.10.2.0$ bin/zookeeper-server-start.sh config/zookeeper.properties +kafka_2.11-0.10.2.0$ bin/kafka-server-start.sh config/server.properties +{% endhighlight %} +This starts the Zookeeper at port 2181 and Kafka Broker at port 9092 (which are the defaults and can be changed by editing the config files). Next, we will create the topics needed for the application. +{% highlight sh %} +kafka_2.11-0.10.2.0$ bin/kafka-topics.sh --create --topic names --replication-factor 1 --partitions 1 --zookeeper localhost:2181 +kafka_2.11-0.10.2.0$ bin/kafka-topics.sh --create --topic hellostream --replication-factor 1 --partitions 1 --zookeeper localhost:2181 +{% endhighlight %} + +#### **Building the Application** +Create a new SBT project in your IDE and edit the build.sbt file as per [this](https://github.com/soniclavier/hadoop_datascience/blob/master/KafkaStreams/build.sbt) (*You can ignore the kryo dependencies for now*). Next, create an object called [HelloKafkaStreams.scala](https://github.com/soniclavier/hadoop_datascience/blob/master/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/HelloKafkaStreams.scala), and create a Properties object with following properties - Kafka Broker Url , Key SerDe(Serializer and Deserializer) and value SerDe. +{% highlight scala %} +val settings = new Properties +settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "hello-kafka-streams") +settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092") +settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest") +settings.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) +settings.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.serdeFrom(classOf[String]).getClass.getName) +{% endhighlight %} + +We will now create a stream builder and use it to create a KStream that reads from topic - names. +{% highlight scala %} +val kstreamBuilder = new KStreamBuilder +val rawStream: KStream[String, String] = kstreamBuilder.stream("names") +{% endhighlight %} + +Now we map each value in the raw stream by using the method mapValues. mapValues takes an instance of a ValueMapper class, which will append the word "hello" to each name read from the names topic. +{% highlight scala %} +val helloStream: KStream[String, String] = rawStream.mapValues(new ValueMapper[String, String]{ + override def apply(value: String): String = s"hello $value" +}) +{% endhighlight %} + +Finally, we will write the result back to another topic and start the processing. The first two parameters in the "to" method are optional, if not provided Kafka will take the default serializers from the Properties object set initially. +{% highlight scala %} +helloStream.to(Serdes.String, Serdes.String, "hellostream") +val streams = new KafkaStreams(kstreamBuilder, settings) +streams.start +{% endhighlight %} +We can now build the application using "sbt assembly", and run the jar using the following command. 
+{% highlight sh %} +java -cp target/scala-2.11/KafkaStreams-assembly-1.0.jar com.vishnuviswanath.kafka.streams.HelloKafkaStreams +{% endhighlight %} +Now, open a terminal and start a kafka-console-producer to send some names to the "names" topic and open another terminal and start a kafka-console-consumer to listen to "hellostream" topic. +{% highlight sh %} +bin/kafka-console-producer.sh --broker-list localhost:9092 --topic names +vishnu +bin/kafka-console-consumer.sh --topic hellostream --bootstrap-server localhost:9092 --from-beginning +hello vishnu +{% endhighlight %} +As you can see, it is very easy to build a simple stream processing application using Kafka Streams. In the next blog, we will build a bit more complicated application that demonstrates the use of flatMapValues, branch, predicate, selectKey, through, join and also see how to create a custom SerDe using Kryo. + +Thanks for reading! +
Continue reading \ No newline at end of file diff --git a/_posts/2017-05-07-kafka-streams.markdown b/_posts/2017-05-07-kafka-streams.markdown new file mode 100755 index 0000000..62f63dd --- /dev/null +++ b/_posts/2017-05-07-kafka-streams.markdown @@ -0,0 +1,104 @@ +--- +layout: post +comments: true +title: Kafka Streams - Part 2 +date: 2017-05-07 +PAGE_IDENTIFIER: kafka-streams-part2 +permalink: /kafka-streams-part2.html +image: /img/kafka_streams_2/logo_share.png +tags: Kafka BigData Hadoop Scala Streaming +description: This is continuation of the blog post - "Hello Kafka Streams". In this blog we build a bit more complicated application that demonstrates the use of flatMapValues, branch, predicate, selectKey, through, join and also see how to create a custom SerDe using Kryo. +--- +
+ +
+This is Part 2 of the blog on Kafka Streams. In the previous blog, [Hello Kafka Streams](hello-kafka-streams), we built a simple stream processing application using the Kafka Streams library. In this blog, we will continue exploring more features in Kafka Streams by building a bit more involved application, which explains the use of flatMapValues, branch, predicate, selectKey, through, join, and also shows how to create a custom SerDe using Kryo. All the code used in this blog can be found in my [Github](https://github.com/soniclavier/hadoop_datascience/blob/master/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/ClimateLogStream.scala).
+
+In this post, our input will be a stream of ClimateLog messages of the format **country, state, temperature, humidity**. We will be creating a streaming application that has the topology below.
+
+ +
+*climate_events* is a topic where we receive the ClimateLog messages in String format. These raw messages are parsed into *case class ClimateLog*. + +{% highlight scala %} +case class ClimateLog(country: String, state: String, temperature: Float, humidity: Float) +val rawStream: KStream[String, String] = kstreamBuilder.stream(Serdes.String, Serdes.String, "climate_events") + +val climateLogStream: KStream[String, ClimateLog] = rawStream.flatMapValues(new ValueMapper[String, Iterable[ClimateLog]]{ + override def apply(value: String): Iterable[ClimateLog] = ClimateLog(value).toIterable.asJava +}) +{% endhighlight %} + +
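+The `ClimateLog(value)` call above goes through a companion object that parses the raw string (the full version is in the Github code linked earlier); a minimal sketch, assuming comma-separated fields in the order country,state,temperature,humidity, could look like this:
+{% highlight scala %}
+import scala.util.Try
+
+object ClimateLog {
+  //returns None if the line cannot be parsed
+  def apply(raw: String): Option[ClimateLog] = {
+    val p = raw.split(",")
+    Try(ClimateLog(p(0), p(1), p(2).toFloat, p(3).toFloat)).toOption
+  }
+}
+{% endhighlight %}
+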
Note that all messages in Kafka have a Key and a Value. If we do not pass a key during ingestion through KafkaConsoleProducer, the key will be null.
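+If you do want the messages to carry a key, one option is to send them with the regular Kafka producer API instead of the console producer; a small sketch (the key and the record below are made up for illustration):
+{% highlight scala %}
+import java.util.Properties
+import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
+
+val props = new Properties
+props.put("bootstrap.servers", "localhost:9092")
+props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
+props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
+
+val producer = new KafkaProducer[String, String](props)
+//key "USA" and a raw climate log line as the value
+producer.send(new ProducerRecord("climate_events", "USA", "USA,CA,21.5,60.2"))
+producer.close()
+{% endhighlight %}
+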
+ +### **branch** +Branch creates multiple branches from a single stream. It takes in Varargs of Predicates and produces a KStream of each Predicate. Each element in the source KStream is applied against each Predicate and the element is assigned to the KStream corresponding to the first Predicate that it matches. In our example, we will create two predicates one for highHumidity and the other for lowTemp. + +{% highlight scala %} +//define the predicates to split the stream into branches +val highHumidty = new Predicate[String, ClimateLog] { + override def test(t: String, c: ClimateLog): Boolean = c.humidity > 50 +} +val lowTemp = new Predicate[String, ClimateLog] { + override def test(t: String, c: ClimateLog): Boolean = c.temperature < 0 +} +//array of streams for each predicate +val branches = climateLogStream.branch(highHumidty, lowTemp) +{% endhighlight %} + + +### **through** +Through persists the messages from a KStream to the given topic and creates a new KStream from that topic. This can be used if you want the intermediate result from the application to be made available to other application, but at the same time use the stream further downstream in the current application. We will persist lowTemp stream and highHumidity stream to 2 new topics - low_temp and high_humidity. + +{% highlight scala %} +val highHumidityStream = branches(0).through(new Serdes.StringSerde, new ClimateLogSerDe, "high_humidity") +val lowTempStream = branches(1).through(new Serdes.StringSerde, new ClimateLogSerDe, "low_temp") +{% endhighlight %} + +Note that the Value serializer is a custom Kryo based serializer for ClimateLog, which we will be creating next. + +### **kryo serializer** +The serializer needs to implement `org.apache.kafka.common.serialization.Serde`. *Serde* has mainly two methods - serializer() and deserializer() which return instance of Serializer and Deserializer. Kafka expects this class to have an empty constructor. So, we will create a class ClimateLogSerDe which extends ClimatelogWrappedSerde class, which takes the Serializer and Deserializer as arguments in it's constructor. We also create ClimateLogSerializer and ClimateLogDeserializer which uses ClimateLogKryoSerDe as default serializer. The implementation is bit lengthy, please check the [github page](https://github.com/soniclavier/hadoop_datascience/blob/master/KafkaStreams/src/main/scala-2.11/com/vishnuviswanath/kafka/streams/ClimateLogStream.scala#L124-L194) for complete code. + +### **selectKey** +The streams we have till now does not have a key (assuming you are using KafkaConsoleProducer and is not passing a key). *selectKey* selects a key using the KeyValueMapper provided and creates a new stream from the existing stream. We create two streams from highHumdityStream and lowTempStream by choosing *value.country* as the key. + +{% highlight scala %} +val keyedHighHumStream: KStream[String, ClimateLog] = highHumidityStream.selectKey(new KeyValueMapper[String, ClimateLog, String] { + override def apply(key: String, value: ClimateLog): String = value.country +}) + +val keyedLowTempStream: KStream[String, ClimateLog] = lowTempStream.selectKey(new KeyValueMapper[String, ClimateLog, String] { + override def apply(key: String, value: ClimateLog): String = value.country +}) +{% endhighlight %} + +### **join** +Next, we join the highHumidity stream and lowTemperature stream to create a new stream called warnings. The two streams will be joined based on the key - which in this case is the country. 
We should also define a join window, +{% highlight scala %} +//create a join window. This window joins all the elements of the same key if the difference between their timestamps is within 60 seconds +val joinWindow = JoinWindows.of(60 * 1000) +{% endhighlight %} +Now join the streams using a ValueJoiner. A ValueJoiner defines what should be done when we find two values for the same key. In this example, we simply merge these two values by getting the temperature from low temp stream and humidity from high humidity stream. +{% highlight scala %} +val warningsStream: KStream[String, String] = keyedHighHumStream.join[ClimateLog, String]( + keyedLowTempStream, + new ValueJoiner[ClimateLog, ClimateLog, String] { + override def apply(value1: ClimateLog, value2: ClimateLog): String = value2.copy(humidity = value1.humidity).toString + }, + joinWindow) +{% endhighlight %} + +Finally, we store the warningsStream to another topic called "warnings", and then start the stream. + +{% highlight scala %} +warningsStream.to(new Serdes.StringSerde, new Serdes.StringSerde, "warnings") + +val streams = new KafkaStreams(kstreamBuilder, settings) +streams.start +{% endhighlight %} + +We have already seen how to submit the job, how to create the topics(climate_events, high_humidity, low_temp, warnings) and how to send the message to these topics in the previous [blog post](hello-kafka-streams#hello-kafka-streams), so I am not going to bore you with the same details :) + +To summarize, we saw how to use various KafkaStreams APIs such as - branch, through, selectKey, join. We also created a custom serializer using Kryo. Hope this was useful and Thanks for reading! +
Continue reading \ No newline at end of file diff --git a/_posts/2017-05-20-flink-session-windows.markdown b/_posts/2017-05-20-flink-session-windows.markdown new file mode 100755 index 0000000..1a69053 --- /dev/null +++ b/_posts/2017-05-20-flink-session-windows.markdown @@ -0,0 +1,87 @@ +--- +layout: post +comments: true +title: Session Windows in Apache Flink +date: 2017-06-10 +PAGE_IDENTIFIER: flink_session_windows +permalink: /flink-session-windows.html +image: /img/flink_session/logo.png +tags: ApacheFlink BigData Hadoop Scala Streaming +description: Apache Flink's Session Windows allows messages to be windowed into sessions. In this blog, we will create a streaming application that counts number of Clicks made by each user within a session using EventTimeSession windows. +--- +
+ +
+Session Windows in Apache Flink allow messages to be [Windowed](flink_streaming) into sessions based on a user's activity. Flink allows us to define a time gap, and all messages that arrive within a "period of inactivity" shorter than the defined time gap are considered to belong to the same session. This has many practical use cases, mainly because it maps naturally to Sessions in Web applications.
+
+In this blog, we will build a streaming application that uses [EventTime](flink_eventtime) based Session Windows to identify how many times a user made a Click during a session. The Flink version at the time of writing this blog is 1.3.0. All the code used in this blog can be found in my [Github](https://github.com/soniclavier/bigdata-notebook/tree/master/flink/src/main/scala/com/vishnu/flink/streaming/sessionwindows).
+
+ +
+### Message format +For this example, our Click events are of the format , where event_source can be recommendation, ad etc. This following case class can be used to capture these messages. +{% highlight scala %} +case class Click(timestamp: Long, userId: String, source: String) +{% endhighlight %} +We could also create a companion object to make it easy to parse the raw logs into Clicks. +{% highlight scala %} +object Click { + def apply(raw: String): Option[Click] = { + val p = raw.split(",") + Try(Click(p(0).toLong, p(1), p(2))) match { + case Success(e) ⇒ { + Some(e) + } + case Failure(e) ⇒ { + None + } + } + } + } +{% endhighlight %} + +### Streaming Pipeline +Next, we create the pipeline. We will be creating an EventTime based application since messages can come delayed and we should be able to handle such scenarios. If you are not aware of the terms EventTime, ProcessingTime and Watermarks please read this [blog](flink_eventtime) post. +{% highlight scala %} +val senv = StreamExecutionEnvironment.getExecutionEnvironment +senv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) + +//read the raw_logs from socket and parse into DataStream[Click] events. +val rawStream: DataStream[String] = senv.socketTextStream("localhost", 4444) +val events: DataStream[Click] = rawStream.flatMap(Click(_)) + +//assign the timestamp and watermark generator. +val eventsWithTs = events.assignTimestampsAndWatermarks(new WatermarkGenerator) +{% endhighlight %} +*Note: WatermarkGenerator is an implementation of AssignerWithPeriodicWatermarks, you can find the implementation [here](https://github.com/soniclavier/bigdata-notebook/blob/master/flink/src/main/scala/com/vishnu/flink/streaming/sessionwindows/SessionWindowExample.scala#L77).* + +To calculate the number of clicks per user during a session, we need to key the stream based on userId. Once a stream is keyed, all the message with the same key will be part of the same Stream. Next, we have to define the Session gap (timeout). In this example, we will create an EventTime based SessionWindow with a gap of 2 seconds and also set an AllowedLateness of maximum 4 seconds. AllowedLateness allows messages that come delayed to be processed. Flink keeps the Window alive till it's MaxTimestamp + AllowedLateness. +{% highlight scala %} +val output: DataStream[(String, Int)] = eventsWithTs + .keyBy("userId") + .window(EventTimeSessionWindows.withGap(Time.seconds(2))) + .allowedLateness(Time.seconds(4)) + .apply[(String, Int)](new WindowFunction[Click, (String, Int), Tuple, TimeWindow](){ + override def apply(key: Tuple, + window: TimeWindow, + input: Iterable[Click], + out: Collector[(String, Int)]): Unit = { + out.collect((key.getField(0).toString, input.size)) + } +}) +{% endhighlight %} +### Execution and Under the hood +Let us now test the application by sending few messages. We will send Click messages from 2 users - User1 and User2. User1 will send a click event at 0th second, 1st second, 2nd second and 8th second. Where the click at 2nd second is delayed and arrives only at 5.2nd second. User2 will send just one click event at 4.5th second. + +
+ +
+***Note***: The reason I included User2 is to show how the Watermark is advanced by Flink and how that affects the results. The Watermark generator that we used in this application keeps track of the latest timestamp seen so far and uses it as the CurrentWatermark. So when the Click event from User2 arrives at the 4.5th second, Flink will understand that the EventTime is now 4.5 seconds (of some day, hour and minute). At this point, User1's Window with 2 Click events will be evaluated - producing the output *(user1,2)* - since the Window's end point (max timestamp) is at the 3rd second, which is less than the current watermark (the behavior of the default trigger for EventTimeSessionWindow - EventTimeTrigger). But this Window will be kept alive since Max Timestamp + Allowed Lateness is not less than the CurrentWatermark. So, when the late message arrives, it will be put into this Window and the Window will be evaluated again to produce the output *(user1, 3)*. Note that in this case the Window was evaluated 2 times; this is something you will have to take care of when using AllowedLateness. If you don't want this behavior, the only way is to make the CurrentWatermark lag behind the max timestamp by the maximum delay that you expect messages to arrive with (check [this](flink_eventtime#watermarks) blog to see how it can be done). The drawback of the Watermark approach is that the Window will be evaluated only after MaxTimestamp + MaxDelay, even if there are no late-arriving messages.
+
+When using Session windows, Flink initially puts each message in its own window, with that Window's end time being timestamp + session gap. Then, it gets all the Windows for that key (userId) and merges them if there are any overlaps.
+
+This execution will produce the output (user1,2), (user1,3), (user2,1), (user1,1). I hope you understood how this output is produced: the first two entries were from the first 3 Clicks of User1 (two entries due to the re-evaluation of the window caused by the late message). The third entry (user2,1) is from the single Click from User2. The last entry (user1,1) is from the last Click we received from User1. This Click belongs to a new Session since it was received only at the 8th second, which is > the MaxTime (4th second) of the previous Window.
+
+Session windows are very useful since they align very well with the events that we receive from a Web application. The reason I wrote this blog is that someone asked me about Session Windows and I could not find much material online on how to use them along with EventTime. This was more of a self-learn + share. Hope you liked it, and thanks for reading.
Continue reading
\ No newline at end of file
diff --git a/_posts/2017-12-10-nn-rnn-lstm.markdown b/_posts/2017-12-10-nn-rnn-lstm.markdown
new file mode 100755
index 0000000..e60856a
--- /dev/null
+++ b/_posts/2017-12-10-nn-rnn-lstm.markdown
@@ -0,0 +1,134 @@
+---
+layout: post
+comments: true
+title: Deep Learning - ANN, RNN, LSTM networks
+date: 2017-12-17
+PAGE_IDENTIFIER: ann_rnn_lstm
+permalink: /ann_rnn_lstm.html
+image: /img/lstm/header.png
+tags: DeepLearning ANN RNN LSTM SequenceModeling Timeseries MachineLearning
+has_math: true
+description: The Long Short Term Memory (LSTM) model is a type of supervised Deep Neural Network that is very good at doing time-series prediction. In this blog, we do a step-by-step exploration of its architecture starting from the basic NN, then RNN, leading to LSTM.
+---
+
+ +
+LSTM - Long Short Term Memory - is a type of supervised Deep Neural Network that is very good at doing time-series prediction. It is a type of RNN (Recurrent Neural Network). An LSTM model looks at the last "n" days (timesteps) of data, also called the lag, and predicts how the series can progress in the future.
+
+In this blog, we will try to understand how the layers in an LSTM model are connected with each other and understand the shapes of the weight, output and input matrices. We will not be looking at any particular implementation; that will be done in one of the future blog posts. Let us begin by looking at the basic ANN model, then RNN, and later on LSTM.
+
+### **Artificial Neural Network (ANN)**
+As you might already know, an ANN has an input layer, one or more hidden layers, and an output layer. In our example, we will consider a network with just one hidden layer with 2 neurons.
+
+ +
+ +Each node in the input layer is connected to each node in the hidden layer and each node in the hidden layer is connected to each node in the output layer. All these connections have a weight associated with it, which is what a Neural Network learns during training - a set of weights that minimizes the overall cost of the model. Cost will be lowest if your prediction is close to the actual and will be high otherwise. + +$$A_1 = \sigma(W_1X + b_1)$$ + +$$A_2 = \sigma(W_2A_1 + b_2)$$ + +\\(\sigma =\\) activation function. +\\(W_1, W_2, b_1, b_2 =\\) weights. +\\(A_1, A_2 =\\) activations. + + +Here we have one example with two features \\(x_1\\) and \\(x_2\\), hence \\(X\\) has the shape \\((2, 1)\\). Each of this input feature is connected to each of the 2 hidden layer nodes, hence the weight \\(W_1\\) has the shape \\((2, 2)\\). The bias unit \\(b1\\) is also connected with both the nodes in the hidden layer, so the shape of b1 is \\((2, 1)\\). The activation \\(A_1 = \sigma(W_1X + b_1)\\) will have the shape \\((2,1)\\). + + +$$A_1(2,1) = W_1(2,2) \times X(2,1) + b_1(2,1)$$ + +Similarly, you can figure out the shapes of \\(b2, W2, A2\\). Understanding the relation between the shape of these matrices and the network architecture will help us later in figuring out the RNN and LSTM networks. Also notice that here we are dealing with only 1 example, suppose we have \\(m\\) such examples, then the shape equation holds true if we just replace all 1s with m. This helps in avoiding loops by vectorizing the computations. + +$$A_1(2,m) = W_1(2,2) \times X(2,m) + b_1(2,m)$$ + +### **Recurrent Neural Network (RNN)** +RNN is a type of ANN, that has a recurring connection to itself. This recurring connection helps RNN learn the effect of previous input x(t-1) along with the current input x(t) while predicting the output at time "t" y(t). This gives RNN a sense of time context. The hidden layer activations calculated at time "t-1" are fed in as an input at time "t". +
+ +
+The figure above shows the high-level view of an RNN. We could also unroll the RNN's recurrent connection as shown in the figure. *Note: some Deep Learning libraries, such as Keras, do not unroll the network by default since that requires more memory.*
+Here, \\(h(t), y(t)\\) stand for the hidden state and the output at time t. \\(h_t\\) and \\(y_t\\) are defined as:
+
+$$h_t = \sigma(W_h x_t + U_h h_{t-1} + b_h )$$
+
+$$y_t = \sigma(W_y h_t + b_y )$$
+
+\\(\sigma\\) = the activation function.
+\\(W_h, U_h, b_h, W_y, b_y\\) = weights.
+
+The above equations can be put into perspective using the following figure (an RNN with 2 units). As you can see, \\(h_{t-1}\\) is fed into the network at time \\(t\\) and is combined with \\(x_t\\) to produce \\(h_t\\) and \\(y_t\\). During back-propagation the model learns to adjust the weights \\(W_h, b_h, U_h, W_y, b_y \\), which control how much influence the current input and the past input (indirectly through \\(h_{t-1}\\)) have on the current output.
+
+ +
+*Note:* The inputs x1 and x2 are not two features, instead they are two timesteps(lag) of the same feature. This is a minor detail on how the input is structured for an RNN/LSTM model training that you should be aware of. Since we will be looking at last n(e.g., 2) timesteps(e.g., days) of data to predict next m(e.g., 2) days of data for some feature \\(x\\), \\(X\\) and \\(Y\\) should be structured as + +$$ X_1 = [x_{1},x_{2}], Y_1 = [x_{3}, x_{4}] $$ + +$$ X_2 = [x_{2},x_{3}], Y_2 = [x_{4}, x_{5}] $$ + +$$ X_3 = [x_{3},x_{4}], Y_3 = [x_{5}, x_{6}] $$ + +Will discuss more on this in future blogs, when we look at an implementation. In this example, we choose a lag of 2. + +Let us now figure out the shapes of \\(W_h, U_h, b_h, W_y, b_y, h_t, y_t\\) + +- shape of \\(W_h\\) = (2, 2) = (number of units, lag) +- shape of \\(U_h\\) = (2, 2) = (number of units, number of units) +- shape of \\(b_h\\) = (2, 1) = (number of units, 1) +- shape of \\(W_y\\) = (2, 2) = (number of units, number of units) +- shape of \\(b_y\\) = (2, 1) = (number of units, 1) +- shape of \\(h_t\\)= (2, 1) = \\(W_h(2,2) \times x_t(2, 1) + U_h(2, 2) \times h_{t-1}(2, 1) + b_h(2, 1)\\) +- shape of \\(y_t\\) = (2, 1) = \\(W_y(2, 2) \times h_t(2, 1) + b_y(2, 1)\\) + +#### **Vanishing gradient problem** +One of the problems with RNN networks is vanishing gradients, the gradients vanish to 0 during backpropagation. It arises because the derivative of the activation functions sigmoid(\\(\sigma\\)) or \\(tanh\\) are less than 0.25 and 1 respectively. And when many of these derivatives are multiplied together while applying chain rule, the gradients vanish to 0. This causes earlier layers to learn very slowly compared to later layers. +### **Long Short Term Memory (LSTM)** +LSTM model solves the problem of vanishing gradients by introducing a new state called cell state and having a CEC(Constant Error Carousel) which allows the error to propagate back without vanishing. For more details on what vanishing gradient problem is and how LSTM's CEC avoids this, watch out for the upcoming blog post [CEC in LSTM](#) here. + +Forward pass in an LSTM cell: **(Use the slider to navigate)** +{% include _include_slider.html folder="lstm_slider" %} + +#### **Need for gates** +**Forget gate** allows the model to learn when to clear(or partially clear) the contents of cell state. Intuitively this tells the model that the time context that it has been remembering is starting to get irrelevant. It might be now obvious the need of input and output gates, **input gate(i)** controls how much new information should be added to cell state, and **output gate(o)** controls when to use the contents in the cell state for producing \\(h_t\\), but the question "why it should be done?" remains. The reason is to avoid conflicts, e.g., for a weight *w* some of the inputs might try to pull the weights in one direction, where as some other input might try to pull it in another direction, in such cases these gates allows the model to control the weights update in such a way that it is updated in the direction where overall error is low. 
+ + +LSTM's equations corresponding to the figure shown in the slides: + +$$ f_t = \sigma(W_f x_t + U_f h_{t-1} + b_f) $$ + +$$ i_t = \sigma(W_i x_t + U_i h_{t-1} + b_i) $$ + +$$ o_t = \sigma(W_o x_t + U_o h_{t-1} + b_o) $$ + +$$ g_t = tanh(W_c x_t + U_c h_{t-1} + b_c) $$ + +$$ c_t = f_t \circ c_{t-1} + i_t \circ g_t $$ + +$$ h_t = o_t \circ \sigma(c_t) $$ + +'\\(\circ\\)' represents 'hadamard product' and \\(W_f, U_f, b_f, W_i\\) etc are the weights that LSTM learns during back propagation. We will now visualize the LSTM cell(with 2 units) as a network to see how are the inputs(with a lag of 2), weights and biases wired with each other. + +- shape of \\(W\\) is \\((2 \times 2) = (units \times lag)\\) +- shape of \\(U\\) is \\((2 \times 2) = (units \times units)\\) +- shape of \\(b\\) is \\((2 \times 1) = (units \times 1)\\) +- shape of \\(f_t, i_t, g_t\\) and \\(o_t\\) is \\((2 \times 1)\\) = \\(W (2 \times 2) \times x_t (2 \times 1) + U (2 \times 2) \times h_{t-1} (2 \times 1) + b (2 \times 1) \\) +- shape of \\(c_t\\) is \\((2 \times 1) = f_t(2 \times 1) \circ c_{t-1}(2 \times 1) + i_t (2 \times 1) \circ g_t (2 \times 1) \\) +- shape of \\(h_t\\) is \\((2 \times 1) = o_t(2 \times 1) \circ c_t(2 \times 1) \\) + +
+ +
+[[View in high resolution]({{ site.baseurl }}/img/lstm/lstm_2.png)] + +### **Conclusion** +LSTMs are generally preferred over plain RNNs since they are able to hold on to their memory for a longer period of time. In this blog post we focused mainly on forward propagation; in the next post I will try to describe how backpropagation works and how the CEC (Constant Error Carousel) avoids the vanishing gradient problem. I hope this post helped you paint an overall picture of RNNs and LSTMs, or get a better understanding of what you already knew. And as always, thank you for making it till the end :) + +#### References +[1] Sepp Hochreiter, Jürgen Schmidhuber. *Long Short-Term Memory*. Neural Computation, 1997. +[2] Christopher Olah. *Understanding LSTM Networks*. + +[Home]({{ site.url }}) + diff --git a/_posts/2017-13-19-flink-queryable-state-part-1.markdown b/_posts/2017-13-19-flink-queryable-state-part-1.markdown new file mode 100755 index 0000000..d47f228 --- /dev/null +++ b/_posts/2017-13-19-flink-queryable-state-part-1.markdown @@ -0,0 +1,33 @@ +--- +layout: post +comments: true +title: Queryable States in Apache Flink - How it works +date: 2017-03-19 +PAGE_IDENTIFIER: flink_queryable_state +permalink: /flink_queryable_state1.html +image: /img/flink_queryable_state/queryable_flow.png +tags: ApacheFlink BigData Hadoop Scala Streaming +description: Queryable State allows users to run real-time queries on the internal state of a stream without having to store the result in any external storage. In this blog post we will see how this is done in Apache Flink. +--- +
+ +
+QueryableStates allows users to do real-time queries on the internal state of the stream without having to store the result on to any external storage. This opens up many interesting possibilities since we no longer need to wait for the system to write to the external storage (which has always been one of the main bottlenecks in these kinds of systems). It might be even possible to not have any kind of database and make the user facing applications directly query the stream, which will make the application faster and cheaper. This might not be applicable to all the use cases, but if your pipeline has to maintain an internal state (may be to do some aggregations), it would be a good idea to make the state available to query. + +We will first look at the overall steps that take places inside Flink when we make a state queryable, and when we do the query. In the next [blog](flink_queryable_state2.html), we will see how to create a Pipeline with queryable state and how to create a client to query its state. + +### **Making the State Queryable** +Let us assume that we have created a pipeline with a queryable state and submitted the Job via JobClient. The following diagram shows what happens inside Flink. +
+ +
+I hope the figure is pretty much self-explanatory but to sum up, once a Job is submitted, JobManager builds ExecutionGraph from the JobGraph and then deploys the tasks to TaskManager. While creating instances of the Tasks, Operators are created, if an Operator is found to be queryable then reference to the "state" of the operator is saved in KvStateRegistry with a state name. The state name is a unique name that is set during the creation of the Job. Then the JobManager actor is notified about the state registration and JobManager stores the location info in a KvStateLocationRegistry, which is later used during the time of querying. + +### **Querying the state** +
+ +
+The above figure shows the steps during execution of a query by a client (who is not part of the submitted Job). The client sends a KvStateLookup message to JobManager actor, this request should contain the JobId and "state name" which was used while building the Job. JobManager checks if the JobId is valid, gets the JobGraph for the JobId and the KvStateLocationRegistry is retrieved from the JobGraph. JobManager then returns the state location information corresponding to the "state name" back to KvStateClient. This response contains a KvStateServer address of where the state is stored. The client then opens a connection with the KvStateServer and fetches the state from the registry using the KvStateID. Once the state is retrieved an Asynchronous query is submitted to fetch the value from the state for a given key. The result obtained is serialized and sent back to the client. Meanwhile, the state is continuously updated by the Job during processing and therefore the client always gets to see the latest state value while querying. + +So this is what Apache Flink does under the hood to make its state queryable. In the next part of the blog, we will implement a Streaming Job which exposes its state via QueryableState API and will also create a QueryClient to query this state. Thanks for reading! +
Continue reading \ No newline at end of file diff --git a/_posts/2017-13-19-flink-queryable-state-part-2.markdown b/_posts/2017-13-19-flink-queryable-state-part-2.markdown new file mode 100644 index 0000000..53ccf2a --- /dev/null +++ b/_posts/2017-13-19-flink-queryable-state-part-2.markdown @@ -0,0 +1,170 @@ +--- +layout: post +comments: true +title: Queryable States in ApacheFlink - Implementation +date: 2017-03-25 +PAGE_IDENTIFIER: flink_queryable_state_impl +permalink: /flink_queryable_state2.html +image: /img/flink_queryable_state/queryable_flow2.png +tags: ApacheFlink BigData Hadoop Scala Streaming +description: This is part 2 of the blog Queryable States in Apache Flink. In the previous blog we saw how Apache Flink enabled Queryable States. In this part we will create a Streaming Job with Queryable States and create a QueryClient to query the state. +--- +
+ +
+ + +This is part 2 of the blog Queryable States in Apache Flink. In the previous [blog](flink_queryable_state1.html), we saw how Apache Flink enables Queryable States. In this part, we will create a Streaming Job with Queryable State and create a QueryClient to query the state. I assume that Flink is already installed and set up. If not, you can check out my earlier blog post on installation [here](flink_start.html). I will be using a Tumbling window in this example; to read about Windows in Flink, please read [this](flink_streaming.html) blog post. + +All the code used in this blog post is available on my [GitHub](https://github.com/soniclavier/hadoop_datascience/tree/master/flink/src/main/scala/com/vishnu/flink/streaming/queryablestate). +*Note: The implementation of the QueryClient is specific to Flink 1.2.0.* + +### **Creating the Pipeline** +Let us now create a streaming job with QueryableState. In this example, our input is a climate log of the format `country, state, temperature, humidity`, where country and state are Strings, and temperature and humidity are Floats. We will first create a case class to hold these logs. + +{% highlight scala %} +case class ClimateLog(country: String, state: String, temperature: Float, humidity: Float) + object ClimateLog { + def apply(line: String): Option[ClimateLog] = { + val parts = line.split(",") + try { Some(ClimateLog(parts(0), parts(1), parts(2).toFloat, parts(3).toFloat)) } + catch { + case e: Exception => None } } } +{% endhighlight %} + +We can then read the logs from a socket using + +{% highlight scala %} +val climateLogStream = senv.socketTextStream("localhost", 2222) + .flatMap(ClimateLog(_)) +{% endhighlight %} + +We will create a KeyedStream and apply a [Tumbling](flink_streaming.html) TimeWindow of 10 seconds. This will cause the window to be evaluated each time it tumbles. In the apply function, we do a simple aggregation to sum up all the values of temperature and humidity seen in that window. + +{% highlight scala %} +val climateLogAgg = climateLogStream + .keyBy("country", "state") + .timeWindow(Time.seconds(10)) + .apply((key: Tuple, w: TimeWindow, clogs: Iterable[ClimateLog], out: Collector[ClimateLog]) => { + val agg = clogs.reduce((c1: ClimateLog, c2: ClimateLog) => c1.copy( + temperature = c1.temperature + c2.temperature, + humidity = c1.humidity + c2.humidity)) + out.collect(agg) + }) +{% endhighlight %} +#### **QueryableStateStream** +Now we will create a Stream that is queryable. To do that, we need a StateDescriptor that describes the type of elements that are going to be stored in the stream. We will create a ReducingStateDescriptor that aggregates the values seen so far. The ReducingStateDescriptor takes three parameters: the first is the name, the second is the reduce function that is applied when new elements are added to the state, and the third describes the type of the values that are going to be stored in the state. Note that the reduce function is defined before the descriptor that refers to it. +{% highlight scala %} +// the reduce function is defined first since the descriptor below refers to it +val reduceFunction = new ReduceFunction[ClimateLog] { + override def reduce(c1: ClimateLog, c2: ClimateLog): ClimateLog = { + c1.copy( + temperature = c1.temperature + c2.temperature, + humidity = c1.humidity + c2.humidity) } } + +val climateLogStateDesc = new ReducingStateDescriptor[ClimateLog]( + "climate-record-state", + reduceFunction, + TypeInformation.of(new TypeHint[ClimateLog]() {})) +{% endhighlight %} + +Once that is done, we call the `asQueryableState` function to make the stream queryable and pass the state descriptor we created. This is shown below. 
+{% highlight scala %} +val queryableStream = climateLogAgg + .keyBy("country") + .asQueryableState("climatelog-stream", climateLogStateDesc) +senv.execute("Queryablestate example streaming job") +{% endhighlight %} +Note the first parameter passed to the `asQueryableState` function; this is the `queryableStateName`, which is used to identify the stream. It will later be used by the QueryClient while querying. + +#### **QueryClient** +Now we will move on to creating the QueryClient. The client is going to be a separate application that queries the state of an already running Streaming job. The first thing the client needs to know is how to connect to the JobManager (remember the diagram from the previous blog?), which can be configured as follows + +{% highlight scala %} +val config = new Configuration +config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost") +config.setString(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, "6123") +{% endhighlight %} + +Next, we create an instance of QueryableStateClient along with serializers for the key and the value. The key serializer is used to create a serializedKey. The value serializer will be used later to deserialize the result returned by the query. In the example below, we are asking the state to return the current running state value for the country "USA". + +{% highlight scala %} +val client = new QueryableStateClient(config) +val execConfig = new ExecutionConfig +val keySerializer = createTypeInformation[String].createSerializer(execConfig) +val valueSerializer = TypeInformation.of(new TypeHint[ClimateLog]() {}).createSerializer(execConfig) +val key = "USA" +val serializedKey = KvStateRequestSerializer.serializeKeyAndNamespace( + key, + keySerializer, + VoidNamespace.INSTANCE, + VoidNamespaceSerializer.INSTANCE) +{% endhighlight %} + +Now we can query the state using the client. Pass the serializedKey, JobID and queryableStateName as parameters. The JobID can be obtained either from the Flink UI or from the job submission log. Note the `climatelog-stream` parameter, which should be the same as the queryableStateName used during job submission. +{% highlight scala %} +val serializedResult = client.getKvState(jobId, "climatelog-stream", key.hashCode(), serializedKey) +{% endhighlight %} +The query returns a Future object, which can be accessed as follows. If the query was successful, we can use the valueSerializer to deserialize and read the result. In this case, the deserialized result is an instance of the ClimateLog case class. +{% highlight scala %} +serializedResult onSuccess { + case result ⇒ { + try { + val clog: ClimateLog = KvStateRequestSerializer.deserializeValue(result, valueSerializer) + println(s"State value: $clog") + } catch { + case e: Exception ⇒ e.printStackTrace() } } } +serializedResult onFailure { + case uk: UnknownKeyOrNamespace ⇒ println(uk.getMessage) + case e: Exception ⇒ println(e.getMessage) } +{% endhighlight %} + +To test the job, open a terminal and run netcat. +{% highlight console %} +nc -lk 2222 +{% endhighlight %} +Now submit the job using the Flink command-line interface. +{% highlight console %} +flink run target/scala-2.11/flink-vishnu-assembly-1.0.jar +Submitting job with JobID: ec685d96da49644ab025c8f9a27ca07a. Waiting for job completion +{% endhighlight %} +Now all that is left to do is to send some sample messages through netcat and run the QueryClient with the JobId and other parameters. + +There are a few possible Exceptions that can occur at this point. 
+ +1) Actor not found +{% highlight java %} +Actor not found for: ActorSelection[Anchor(akka.tcp://flink@localhost:6123/), Path(/user/jobmanager)] +{% endhighlight %} +Make sure that your Flink cluster is up and running. Also, you have to submit the Job through the command line, not from the IDE. + +2) Job not found +{% highlight java %} +java.lang.IllegalStateException: Job d8a3b9f9b8e6da33aa714633cee61c3b not found +{% endhighlight %} +This is an easy one: just make sure that the JobId passed matches that of the running job. + +3) No KvStateLocation found +{% highlight java %} +org.apache.flink.runtime.query.UnknownKvStateLocation: No KvStateLocation found for KvState instance with name 'climatelog-stream-temp' +{% endhighlight %} +Make sure that the state name (climatelog-stream) in the client matches the one that was used during job submission. + +4) KvState does not hold any state for key/namespace +{% highlight java %} +org.apache.flink.runtime.query.netty.UnknownKeyOrNamespace: KvState does not hold any state for key/namespace +{% endhighlight %} +This means that the stream you are trying to query does not have the key (in this example, "USA") that you are looking for. Did the messages that were sent through netcat have the key that is being used in the query? + +5) Could not deserialize value +{% highlight java %} +java.io.EOFException + at org.apache.flink.runtime.util.DataInputDeserializer.readUnsignedByte(DataInputDeserializer.java:310) + at org.apache.flink.types.StringValue.readString(StringValue.java:770) + at org.apache.flink.api.common.typeutils.base.StringSerializer.deserialize +{% endhighlight %} +This indicates that something is wrong with the ValueSerializer. The easiest way to fix it is to go back to your Streaming Job code and make sure that you use the exact same TypeInformation in the client as in the Job. e.g., using *createTypeInformation[ClimateLog]* instead of `TypeInformation.of(new TypeHint[ClimateLog]() {})` can cause this exception. + + +To summarize, we saw how Apache Flink enables querying its internal state and how we can develop a pipeline and a query client to do so. Apart from Flink, Kafka also provides this feature. + +That concludes the post; I hope it was useful. Thanks for reading! +
Continue reading \ No newline at end of file diff --git a/_posts/2018-01-29-spark_structured_streaming_part1.markdown b/_posts/2018-01-29-spark_structured_streaming_part1.markdown new file mode 100755 index 0000000..3126e70 --- /dev/null +++ b/_posts/2018-01-29-spark_structured_streaming_part1.markdown @@ -0,0 +1,179 @@ +--- +layout: post +comments: true +title: A Tour of Spark Structured Streaming +date: 2018-01-29 +PAGE_IDENTIFIER: spark_structured_streaming +permalink: /spark_structured_streaming.html +image: /img/spark_structured_streaming/header_share.png +show_index: true +tags: ApacheSpark Kafka Streaming Scala BigData Hadoop +description: Structured Streaming is Apache Spark's streaming engine which can be used for doing near real-time analytics. In this blog we explore Structured Streaming by going through a very simple use case. +--- +
+ +
+Structured Streaming is Apache Spark's streaming engine which can be used for doing near real-time analytics. In this blog, we explore Structured Streaming by going through a very simple use case. Imagine you started a ride hauling company and need to check if the vehicles are over-speeding. We will create a simple near real-time streaming application to calculate the average speed of vehicles every few seconds, while talking about **SlidingWindow**, **TumblingWindow**, **EventTime**, **ProcessingTime**, **Watermarks** and **Kafka Source & Sink**. All the code used in this blog is available in my [Github repository](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/KafkaSourceStreaming.scala). + +### **Micro Batch based Streaming** +Before we jump into the use case, let us take a look at how streaming works under the hood in Apache Spark. Structured Streaming in Spark, similar to its predecessor (DStream) uses **micro-batching** to do streaming. That is, spark waits for a very small interval say 1 second (or even 0 seconds - i.e., as soon as possible) and batches together all the events that were received during that interval into a micro batch. This micro batch is then scheduled by the Driver to be executed as Tasks at the Executors. After a micro-batch execution is complete, the next batch is collected and scheduled again. This scheduling is done frequently to give an impression of streaming execution. + +
+ +
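+
+In code, a micro-batch query always has the same overall shape: read from a source with `readStream`, transform, and write to a sink with `writeStream`. The sketch below is purely illustrative and uses the built-in *rate* source and *console* sink (both meant for testing); the rest of this post builds the real Kafka-based pipeline, and the `Trigger` setting is discussed further below.
+
+{% highlight scala %}
+// Bare-bones shape of a Structured Streaming query, for illustration only.
+// The "rate" source generates (timestamp, value) rows; the console sink prints them.
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.streaming.Trigger
+
+object MicroBatchSkeleton {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder()
+      .appName("micro-batch-skeleton")
+      .master("local[*]")
+      .getOrCreate()
+
+    val stream = spark.readStream
+      .format("rate")      // test source, emits rows continuously
+      .load()
+
+    val query = stream.writeStream
+      .format("console")
+      .outputMode("append")
+      .trigger(Trigger.ProcessingTime("1 second")) // collect a micro-batch every second
+      .start()
+
+    query.awaitTermination()
+  }
+}
+{% endhighlight %}
+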
+ +In version 2.3, Spark released a new execution engine called Continuous Processing, which does not do micro-batching. Instead, it launches long-running tasks that read and process incoming data continuously. To read more about it, do check my blog post [here](spark_streaming_continuous_processing.html). +### **Kafka Source** +We will be reading the events from a Kafka topic - *cars*. To do that, we need to set the **format** as "kafka", set **kafka.bootstrap.servers** to the broker address and provide the topic name using the option "subscribe". +{% highlight scala %} +val df: DataFrame = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", "localhost:9092") + .option("subscribe", "cars") + //.schema(schema) : we cannot set a schema for kafka source. Kafka source has a fixed schema of (key, value) + .load() +{% endhighlight %} +To simulate a vehicle sending us sensor data, we will create a Kafka producer that writes the id, speed, acceleration and the timestamp to the "cars" topic. The code for this can be found in [RandomCarsKafkaProducer.scala](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/util/RandomCarsKafkaProducer.scala). Note that the timestamp here is called the **EventTime**, because it is the time at which the event (message) was generated at its source. +
+ +
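+
+For reference, a minimal producer along those lines could look like the sketch below. This is a simplified stand-in for the linked RandomCarsKafkaProducer (not its actual code); it assumes a Kafka broker running on localhost:9092 and simply writes comma-separated `carId,speed,acceleration,timestamp` strings to the *cars* topic.
+
+{% highlight scala %}
+// Simplified sketch of a producer that sends random car events to the "cars" topic.
+import java.util.Properties
+import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
+
+object SimpleCarsProducer {
+  def main(args: Array[String]): Unit = {
+    val props = new Properties()
+    props.put("bootstrap.servers", "localhost:9092")
+    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
+    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
+    val producer = new KafkaProducer[String, String](props)
+
+    val rand = scala.util.Random
+    for (_ <- 1 to 10) {
+      val carId = s"car${rand.nextInt(3) + 1}"
+      // format expected by the CarEvent parser below: carId,speed,acceleration,timestamp
+      val value = s"$carId,${rand.nextInt(120)},${rand.nextDouble()},${System.currentTimeMillis()}"
+      producer.send(new ProducerRecord[String, String]("cars", carId, value))
+      Thread.sleep(1000)  // one event per second
+    }
+    producer.close()
+  }
+}
+{% endhighlight %}
+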
+*Note: if you need to setup local Kafka broker, instructions are available [here](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/KafkaSourceStreaming.scala#L14-L32).* + +Next, we parse the raw data into a case class so that we have a structure to work with. + +{% highlight scala %} +case class CarEvent(carId: String, speed: Option[Int], acceleration: Option[Double], timestamp: Timestamp) + +object CarEvent { + def apply(rawStr: String): CarEvent = { + val parts = rawStr.split(",") + CarEvent(parts(0), Some(Integer.parseInt(parts(1))), Some(java.lang.Double.parseDouble(parts(2))), new Timestamp(parts(3).toLong)) + } +} +val cars: Dataset[CarEvent] = df + .selectExpr("CAST(value AS STRING)") + .map(r ⇒ CarEvent(r.getString(0))) +{% endhighlight %} +This produces a DataSet of type CarEvent. +### **Performing Aggregation** +We start off simple by finding the average speed of each vehicle. This can be done by doing a **groupby** on carId and by applying the **avg** aggregate function. +{% highlight scala %} +val aggregates = cars + .groupBy("carId") + .agg( + "speed" → "avg" + ) + +{% endhighlight %} +This calculates the average speed of events received during every micro-batch. In Structured Streaming, the micro-batch interval can be controlled using **Triggers**. *Spark’s idea of Trigger is slightly different from event-at-a-time streaming processing systems such as [**Flink**](search.html?query=flink) or **Apex**. In Spark, a trigger is set to specify how long to wait before checking if new data is available. If no trigger is set, Spark will check for availability of new data as soon as the previous micro-batch execution is complete. Whereas in event-at-a-time systems, as the new data comes in, it is collected in the window’s internal state until the trigger fires.* + +That was easy! But what if we want to calculate the average speed of a vehicle over last 5 seconds. Also, we would like to calculate it based on the **EventTime** of the events (i.e., based on the time at which the event occurred at the source, not based on when it was processed in the system.) If you don't know what EventTime is, read on. + +### **EventTime & ProcessingTime** +**EventTime** is the time at which an event is generated at its source, whereas a **ProcessingTime** is the time at which that event is processed by the system. There is also one more time which some stream processing systems account for, that is **IngestionTime** - the time at which event/message was ingested into the System. It is important to understand the difference between EventTime and ProcessingTime. +
+ +
+The red dot in the above image is the message, which originates from the vehicle, then flows through the Kafka topic to Spark's Kafka source and then reaches executor during task execution. There could be a slight delay (or maybe a long delay if there is any network connectivity issue) between these points. The time at the source is what is called an **EventTime**, the time at the executor is what is called the **ProcessingTime**. You can think of the ingestion time as the time at when it was first read into the system at the Kafka source (IngestionTime is not relevant for spark). + +Now that you have a fair idea of different time characteristics, let us get back to the use-case of calculating the average speed of cars over last 5 seconds. To do that we need to group the events into 5-second interval time groups, based on its EventTime. This grouping is called Windowing. + +### **Windows** +In Spark, Windowing is done by adding an additional key (window) in the groupBy clause. For each message, its EventTime(timestamp generated by the sensor) is used to identify which window the message belongs to. Based on the type of window (Tumbling/Sliding) an event might belong to one or more windows. To understand how, we need to first learn what a TumblingWindow and a SlidingWindow are. + +#### **Tumbling Window & Sliding Window** +A tumbling window is a non-overlapping window, that tumbles over every "window-size". e.g., for a Tumbling window of size 4 seconds, there could be window for [00:00 to 00:04), [00:04: 00:08), [00:08: 00:12) etc (ignoring day, hour etc here). If an incoming event has EventTime 00:05, that event will be assigned the window - [00:04 to 00:08) + +A SlidingWindow is a window of a given size(say 4 seconds) that slides every given interval (say 2 seconds). That means a sliding window could overlap with another window. For a window of size 4 seconds, that slides every 2 seconds there could windows [00:00 to 00:04), [00:02 to 00:06), [00:04 to 00:08) etc. Notice that the windows 1 and 2 are overlapping here. If an event with EventTime 00:05 comes in, that event will belong to the windows [00:02 to 00:06) and [00:04 to 00:08). + +
+ +
+ +To do windowing, Spark adds a new column called "window", explodes the provided 'timestamp' column into one or more rows (based on its value and the window's size and slide) and does a groupBy on that column. This implicitly pulls all the events that belong to a time interval into the same "window". + +*Side note: A tumbling window can also be thought of as a sliding window whose slide interval is the same as the window size, i.e., a sliding window of size 4 seconds that slides every 4 seconds is the same as a tumbling window of size 4 seconds. In fact, that is exactly what Spark does internally.* + +Here we group the cars Dataset based on 'window' and carId. *Note that `window()` is a function in Spark that returns a Column.* +{% highlight scala %} +//a tumbling window of size 4 seconds +val aggregates = cars + .groupBy(window($"timestamp","4 seconds"), $"carId") + .agg(avg("speed").alias("speed")) + .where("speed > 70") + +//a sliding window of size 4 seconds that slides every 2 seconds can be created using cars.groupBy(window($"timestamp","4 seconds","2 seconds"), $"carId") +{% endhighlight %} + +This produces a DataFrame of carId, average speed, and the corresponding time window. e.g. output: + +- Batch 1 + - [2018-01-21 00:50:00, 2018-01-21 00:50:04] car1 75.0 + +### **Output Modes** +The final (almost) piece of the puzzle is to output the results that we produced to a sink - a **Kafka topic**. Spark provides three output modes - **Complete, Update and Append**. Each mode differs in how Spark updates the state and outputs the results after processing a micro-batch. +
+ +
+During each micro-batch, Spark updates values for some of the keys from the previous batch, some are new and some remains the same. In the Complete mode, all the rows are output, in Update mode only the new and updated rows are output. Append mode is slightly different in that, in Append mode, there won't be any updated rows and it outputs only the new rows. + +### **Kafka Sink** +Writing to Kafka is pretty straightforward - set format as "kafka", point the sink to the Kafka broker using option **kafka.bootstrap.server**, and set the option **topic** to tell which Kafka topic to write to. Kafka sink expects a field - **value** to be present in the data. We can make use of Spark SQL's **selectExpr** to convert the field *'speed'* to *'value'* and also cast it to String. The **key** is optional but if you have multiple partitions and wants to distribute the data across partitions, it is needed. A **checkpointLocation** is a must when using Kafka sink and it enables failure recovery and exactly once processing. + +{% highlight scala %} +val writeToKafka = aggregates + .selectExpr("CAST(carId AS STRING) AS key", "CAST(speed AS STRING) AS value") + .writeStream + .format("kafka") + .option("kafka.bootstrap.servers","localhost:9092") + .option("topic", "fastcars") + .option("checkpointLocation", "/tmp/sparkcheckpoint/") + .queryName("kafka spark streaming kafka") + .outputMode("update") + .start() +{% endhighlight %} + +Output of running the application will look something like this: +- Batch: 1 + - [2018-01-21 00:50:00, 2018-01-21 00:50:04] car1 75.0 +- Batch: 2 + - [2018-01-21 00:50:04, 2018-01-21 00:50:08] car2 20.0 + - [2018-01-21 00:50:12, 2018-01-21 00:50:16] car2 20.0 + - [2018-01-21 00:50:00, 2018-01-21 00:50:04] car1 62.5 + +Note that Structured Streaming API implicitly maintains the **state** across batches for aggregate functions, i.e., in the above example, the average speed calculated in the second micro-batch will be average of events received during the 1st and 2nd batch. So as a user you don't have to do custom state management. But that comes with the cost of maintaining a large state over time, and no one want to keep the state forever. This can be achieved using watermarks. + +### **Watermark** +In Spark, Watermark is used to decide when to clear a state based on current maximum event time. Based on the **delay** you specify, Watermark lags behind the maximum event time seen so far. e.g., if dealy is 3 seconds and current max event time is 10:00:45 then the watermark is at 10:00:42. This means that Spark will keep the state of windows who's end time is less than 10:00:42. + +{% highlight scala %} +val aggregates = cars + .withWatermark("timestamp", "3 seconds") //set watermark using timestamp filed with a max delay of 3s. + .groupBy(window($"timestamp","4 seconds"), $"carId") + .agg(avg("speed").alias("speed")) + .where("speed > 70") +{% endhighlight %} + +A subtle but important detail to understand is that when using EventTime based processing, time progresses only if you receive a message/event with a higher timestamp value. Think of it as clock inside Spark, but unlike normal clocks that ticks every second(ProcessingTime based) this clock only moves when you receive an event with a higher timestamp. + +Let us look at an example to see how this works when there is a late arriving message. We will focus on a single window between [10:00 to 10:10) and a maximum delay of 5 seconds. i.e., `.withWatermark("timestamp", "5 seconds")` + +
+ +
+ +- An event with timestamp 10:00 arrives; it falls in the window [10:00, 10:10) and the watermark is updated to timestamp - 5 = 09:55. +- An event with timestamp 10:02 is generated at the source, but is delayed. This event is supposed to fall in the window [10:00, 10:10). +- An event with timestamp 10:04 arrives late at 10:05, but it still falls in the window [10:00, 10:10) since the current watermark is 09:55, which is < the window end time. The watermark is updated to 10:04 - 00:05 = 09:59. +- An event with timestamp 10:16 arrives; this updates the watermark to 10:11. (This event falls into the window [10:10, 10:20), but that is not relevant here.) +- The late event with timestamp 10:02 finally arrives, but the window [10:00, 10:10) has already been cleared (its end time 10:10 is less than the watermark 10:11), so this event is dropped. + +Setting a watermark ensures that the state does not grow forever. Also, notice how one of the late events was processed while the other was ignored (since it was too late). + +### **Conclusion** +We have built a simple streaming application while explaining EventTime processing, Windowing, Watermarks, Output modes and how to read from and write to Kafka. The code for this and some more examples are available in my [Github repository](https://github.com/soniclavier/bigdata-notebook/tree/master/spark_23). I hope this gave a better insight into some of the new features in Spark Structured Streaming. Let me know in the comments if there are any questions. Thanks for reading! + +Continue reading diff --git a/_posts/2018-02-15-spark_continuous_processing.markdown b/_posts/2018-02-15-spark_continuous_processing.markdown new file mode 100644 index 0000000..549e228 --- /dev/null +++ b/_posts/2018-02-15-spark_continuous_processing.markdown @@ -0,0 +1,77 @@ +--- +layout: post +comments: true +title: Spark Continuous Processing +date: 2018-02-25 +PAGE_IDENTIFIER: spark_streaming_continuous +permalink: /spark_streaming_continuous_processing.html +image: /img/spark_continuous/header_share.png +show_index: false +tags: ApacheSpark Kafka Streaming Scala BigData Hadoop +description: Continuous Processing is Apache Spark's new execution engine that allows very low latency (in milliseconds), event-at-a-time processing. In earlier versions, streaming was done via micro-batching. In continuous processing, Spark launches long-running tasks that continuously read, process and write data. In this blog, we are going to do an early peek at this still experimental feature in Apache Spark that is going to be available in version 2.3. +--- +
+ +
+Continuous Processing is Apache Spark's new Execution engine that allows very low latency(in milliseconds) event at a time processing. In this blog, we are going to do an early peek at this still experimental feature in Apache Spark that is going to be available in version 2.3. I am going to assume that you are already familiar with Spark's micro-batch based execution engine. If you are not, do read my previous blog post [here](spark_structured_streaming.html). The code used in this blog post is available in my [Github repo](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/ContinuousKafkaStreaming.scala) + +### **From MicroBatch to ContinuousProcessing** +Apache Spark has been providing stream processing capabilities via micro-batching all this while, the main disadvantage of this approach is that each task/micro-batch has to be collected and scheduled at regular intervals, through which the best(minimum) latency that Spark could provide is around 1 second. There was no concept of a single event/message processing. Continuous processing is Spark's attempt to overcome this limitations to provide stream processing with very low latencies. + +To enable this features, Spark had to make two major changes in its underlying code. + + 1. Create new sources and sinks that could read message continuously(instead of micro-batch) - called DataSourceV2. + 2. Create a new execution engine called - ContinuousProcessing which uses ContinuousTrigger and launch long runnings tasks using DataSourceV2. + +
+ +
+ +### **DataSourceV2** +DataSourceV2 has the ability to read/write one record at a time. For example, the KafkaSource has *get()* and *next()* methods to read each record, instead of the *getBatch()* method in V1. *(Note: even though the records are read one at a time, there is still some buffering done at the KafkaConsumer.)* + +The KafkaSink runs continuously, waiting for new records to be committed to the topic, and writes/commits one record at a time. + +#### **Available Sources** +Readers supported right now are + - KafkaSource (short name *kafka*) + - RateSource (short name *rate*) - for testing purposes only + +Writers supported right now are + - KafkaSink + - ConsoleSink - for testing purposes only + - MemorySink - for testing purposes only + +#### **Custom Source/Sink** +It is not very difficult to create your own reader/writer. I have an example of a source - [NetcatContinuousReader](https://github.com/soniclavier/bigdata-notebook/tree/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/sources/netcat) and an [application](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/CustomV2SourceExample.scala) that uses this source in my Github. + +### **ContinuousExecution Engine** +This is the second major change that allows low latency processing in Spark. A ContinuousExecution engine is chosen as the StreamExecution when the trigger is set to ContinuousTrigger (also, the source and sink should be of type DataSourceV2). The operations supported by this engine are limited for now; it mainly supports Map, Filter, and Project. Aggregation operations, joins, [windowing](spark_structured_streaming.html#windows) etc. are not supported. The idea behind this is that such operations need to wait for some time to collect data, and in those use cases the micro-batch based engine should suffice. The use cases that require very low latency (in milliseconds) are the ones that fit this model. + +### **Example** +If you are already familiar with Spark's Structured Streaming API, the only change that needs to be made is in the Trigger - set the trigger to **ContinuousTrigger**. I will be trying to convert the [code](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/KafkaSourceStreaming.scala) written as part of my previous [blog](spark_structured_streaming.html) to use ContinuousProcessing. As a first step I will set the trigger to ContinuousTrigger; the rest of the code will remain the same. + +{% highlight scala %} + val writeToKafka = aggregates + .selectExpr("CAST(carId AS STRING) AS key", "CAST(speed AS STRING) AS value") + .writeStream + .format("kafka") + .option("kafka.bootstrap.servers","localhost:9092") + .option("topic", "fastcars") + .option("checkpointLocation", "/tmp/sparkcheckpoint/") + .queryName("kafka spark streaming kafka") + .outputMode("update") + .trigger(Trigger.Continuous("10 seconds")) //10 seconds is the checkpoint interval. + .start() +{% endhighlight %} + +This caused an exception, *org.apache.spark.sql.AnalysisException* - **Continuous processing does not support EventTimeWatermark operations**. Watermarks are not supported in ContinuousProcessing since that involves collecting data. So we will remove **withWatermark("timestamp", "3 seconds")** from the code.
+ +Now the application threw another exception **Continuous processing does not support Aggregate operations**. As I mentioned earlier, Spark expects you to use micro-batch based processing if you need to do aggregations, since this involves waiting for data to arrive. Removing the code related to avg, groupBy and window fixes the problem and the application runs. The modified application code is [here](https://github.com/soniclavier/bigdata-notebook/blob/master/spark_23/src/main/scala/com/vishnuviswanath/spark/streaming/ContinuousKafkaStreaming.scala). + +Note: ContinuousTrigger internally uses a ProcessingTimeExecutor(same as ProcessingTime trigger). But this does not have any effect on how often the data is processed since the tasks are already launched and is continuously processing the data. + +### **Conclusion** +ContinuousExecution provides us the ability to do very low latency processing but is limited in what we can we can do. This can change in near future since this is a new feature and is in the experimental stage. Hope you liked the post and as always thanks for reading. + +Continue reading diff --git a/_sass/_base.scss b/_sass/_base.scss new file mode 100755 index 0000000..fcd4e01 --- /dev/null +++ b/_sass/_base.scss @@ -0,0 +1,354 @@ +/** + * Reset some basic elements + */ +body, h1, h2, h3, h4, h5, h6, +p, blockquote, pre, hr, +dl, dd, ol, ul, figure { + margin: 0; + padding: 0; +} + + + +/** Variables **********************************************/ +$base-font-family: Helvetica, sans-serif; +$base-line-height: 2.0em; +$horizontal-spacing-unit: 50px; +$vertical-spacing-unit: 40px; +$nav-height: 56px; +$text-field-height: 20px; +$button-height: 26px; + +/* portfolio tinkering */ +$img_spacing: 5px; +$img_height: 275px; +$caption_font_size: 12px; +$caption_color: #aaa; + +/* COLORS */ +$red: #FF3636; +$red_thunderbird: #D91E18; +$orange: #F29105; +$blue: #4183D7; +$blue_dark:#446CB3; +$confetti: #E9D460; +$green: #11D68B; +$lime_green: #B7D12A; +$purple: #B509AC; +$white: #FFFFFF; + + +$grey-color-dark: #333; /* footer */ +$grey-color-mid: #424141; +$grey-color-light: #ddd; /* navigation bar border */ +$text-color: #666; + +/* Set theme color *************************/ +$theme-color: $blue; + + +/************************************************************/ + +/** + * Basic styling + */ +body { + font-family: $base-font-family; + font-size: $base-font-size; + line-height: $base-line-height; + font-weight: 100; + color: $text-color; + background-color: $background-color; + -webkit-text-size-adjust: 100%; +} + + + +/** + * Set `margin-bottom` to maintain vertical rhythm + */ +p, blockquote, pre, +ul, ol, dl, figure, +%vertical-rhythm { + margin-bottom: $vertical-spacing-unit / 2; +} + +.alignright { + float: right; +} + + +hr{ + /* Inset, by Dan Eden */ + border: 0; + height: 0; + border-top: 1px solid rgba(0, 0, 0, 0.1); + border-bottom: 1px solid rgba(255, 255, 255, 0.3); + +} + + +/** + * Figures + */ +figure > img { + display: block; +} + +figcaption { + font-size: $small-font-size; +} + + + +/** + * Lists + */ +ul, ol { + margin-left: $horizontal-spacing-unit; +} + +li { + > ul, + > ol { + margin-bottom: 0; + } +} + + + +/** + * Headings + */ +h1, h2, h3, h4, h5, h6 { + font-weight: 100; +} + + +/** + * Links + */ +a { + color: $text-color; + text-decoration: none; + /* + &:visited { + color: darken($brand-color, 15%); + } + */ + &:hover { + color: $theme-color; + text-decoration: none; + } +} + +article a { + color: $theme-color; + font-weight: 100; + + &:hover { + 
text-decoration: none; + } +} + + + +/** + * Blockquotes + */ +blockquote { + color: $text-color; + border-left: 10px solid $white; + padding-left: $horizontal-spacing-unit / 2; + font-size: 18px; + font-style: italic; + + > :last-child { + margin-bottom: 0; + } +} + + + +/** + * Code formatting + */ +pre, +code { + font-size: 15px; + border-radius: 3px; + background-color: $grey-color-light; +} + +code { + padding: 1px 5px; +} + +pre { + padding: 8px 12px; + overflow-x: scroll; + + > code { + border: 0; + padding-right: 0; + padding-left: 0; + } +} + + + +/** + * Wrapper + */ + +.wrapper { + max-width: -webkit-calc(800px - (#{$horizontal-spacing-unit} * 2)); + max-width: calc(800px - (#{$horizontal-spacing-unit} * 2)); + margin-right: auto; + margin-left: auto; + padding-right: $horizontal-spacing-unit; + padding-left: $horizontal-spacing-unit; + @extend %clearfix; + + @include media-query($on-laptop) { + max-width: -webkit-calc(800px - (#{$horizontal-spacing-unit})); + max-width: calc(800px - (#{$horizontal-spacing-unit})); + padding-right: $spacing-unit / 2; + padding-left: $spacing-unit / 2; + } +} + + +/** + * Clearfix + */ +%clearfix { + &:before, + &:after { + content: ""; + display: table; + clear: both; + } +} + +/** + * Pagination style + */ +.previous { + color:#{$theme-color}; +} + +.next { + color:#{$theme-color}; +} + +/* Style the Image Used to Trigger the Modal */ +.expandable { + border-radius: 5px; + cursor: pointer; + transition: 0.3s; +} + +.expandable:hover { + opacity: 0.98; + cursor: -moz-zoom-in; + cursor: -webkit-zoom-in; + cursor: zoom-in; +} +.expandable:active { + transform: scale(1.5); + transition: all 0.2s ease-in-out; + background-color:white; + opacity: 1.0; + user-drag: none; + user-select: none; + -moz-user-select: none; + -webkit-user-drag: none; + -webkit-user-select: none; + -ms-user-select: none; +} + +/* +* Popup +*/ +.popup-subscribe { + position: fixed; + padding: 10px; + align: centre; + bottom:70px; + height:30px; + border-radius: 15px; + border: 0.5px solid $grey-color-light; + color: $white; + background-color: rgb(65,131,215); + /* RGBa with 0.6 opacity */ + background-color: rgba(65,131,215, 0.9); + clear:both; + display:none; +} + +.popup-subscribe #sub_close:hover { + cursor: pointer; +} + +.popup-subscribe #sub_close { + color: black; + text-shadow: 0px 2px 3px #555; +} + + +/** +* Floating per page index +* Some of the behavior is implemented in page_index.html include file +*/ + +.floating_index { + position: fixed; + right: 0px; + top: 100px; + text-align: center; + background-color: $grey-color-dark; +} + +.floating_index .index_head { + font-weight: bold; + text-align: center; + background-color: $theme-color; + color: white; +} + +.floating_index .true { + background-color: $grey-color-dark; +} +.floating_index .false { + background-color: $grey-color-mid; +} +.floating_index a .index_item { + color: white; + padding-left: 15px; + padding-right: 15px; + padding-top: 5px; + padding-bottom: 5px; + font-size: 14px; + +} + +.floating_index .index_item:hover { + background-color: rgba(65, 131, 215, .5); + color: white; +} + +.floating_hidden_index { + position: absolute; + right: 20px; + top: 100px; + width:10px; + height:20px; + background-color:$theme-color; +} + +@media only screen and (max-width: 768px) { + .floating_index { + display: none; + } +} \ No newline at end of file diff --git a/_sass/_layout.scss b/_sass/_layout.scss new file mode 100755 index 0000000..1637eef --- /dev/null +++ b/_sass/_layout.scss @@ -0,0 +1,438 @@ + +/** 
+ * Site header ********************************************************** + */ +.site-header { +border-bottom: 1px solid $grey-color-light; +background-color: #fff; +opacity: 0.95; +position:fixed; +left:0px; +top:0px; +height:56px; +width:100%; +z-index: 50; +} + + +.site-title { +font-size: 20px; +line-height: $nav-height; +letter-spacing: -1px; +margin-bottom: 0; +&:hover { +text-decoration: none; +color: $theme-color; + } +} +.site-nav { +float: right; +line-height: $nav-height; +.page-link{ +line-height: $base-line-height; + // Gaps between nav items, but not on the first one +&:not(:first-child) { +margin-left: 10px; + } + } +@include media-query($on-palm) { +position: fixed; +top: 0px; +right: 10px; +text-align: right; +&:hover .trigger { +display: block; +padding-bottom: 5px; + } +.page-link { +display: line; + } + } +} + +.header-bar{ + left: 0px; + top: 0px; + position: relative; + font-size: 20px; + display: block; + opacity: 0.75; + width: 100%; + text-align: center; + padding-top: 25px; + line-height: 3em; + z-index: 25; + h1{ + color: $theme-color; + font-size:75px; + } + h2{ + font-size:25px; + } +} + +/** +* Share this page. +*/ +ul.share-buttons{ + list-style: none; + padding: 0; + margin-left:0px; +} + +ul.share-buttons li{ + display: inline; +} + +ul.share-buttons li i{ + color:$theme-color; +} +/** + * Site footer ********************************************************** + */ +.site-footer { +border-top: 1px solid $grey-color-dark; +font-size: 10px; +background-color: $grey-color-dark; +padding: 2px; +color: #aaa; +position:fixed; +left:0px; +bottom:0px; +height:50px; +width:100%; +} + + +/** +* Copy right ****************************************************** +*/ + +.copyright { + background-color: $grey-color-dark; + color: $grey-color-light; + position:fixed; + right: 5px; + bottom: 0px; + font-style: italic; +} + +@media only screen and (max-width: 768px) { + .copyright { + display: none; + } +} + + +/** + * Pagination ********************************************************** + */ +.pagination{ +max-width: -webkit-calc(800px - (#{$horizontal-spacing-unit} * 2)); +text-align: center; +width: 100%; +bottom: 50px; +} +.paginationicon { +font-size: 50px; +a { +color: $theme-color; + } +} +/** + * Page content ********************************************************** + */ +.page-content { +padding: 100px 0; /* VERTICAL PADDING FOR TITLE ON EVERY PAGE */ +} +.page-heading { +font-size: 20px; +} +.post-list { +margin: 0px 0; +list-style: none; + > li { +margin-bottom: $vertical-spacing-unit; + } +} +.contacticon { +font-size: 60px; +display:block; +margin: 10px; +} +.center{ +text-align: center; +} +/** + * Posts ********************************************************** + */ +.post-header { +margin-bottom: $vertical-spacing-unit; +} +.post-title { +font-size: 42px; +letter-spacing: -1px; +line-height: 1; +@include media-query($on-laptop) { +font-size: 36px; + } +} +.post-content { +h2 { +font-size: 42px; +@include media-query($on-laptop) { +font-size: 28px; + } + } +h3 { +font-size: 30px; +@include media-query($on-laptop) { +font-size: 22px; + } + } +h4 { +font-size: 20px; +@include media-query($on-laptop) { +font-size: 18px; + } + } +} +.post-meta { +font-size: $small-font-size; +color: $grey-color; +margin-bottom: 0px; +} +.post-link { +display: block; +font-size: 42px; +} + +/** Poem formatting ********************************************/ +.poem-title { +font-size: 24px; +letter-spacing: -1px; +line-height: 1; +@include media-query($on-laptop) { +font-size: 16px; 
+ } +} + + +/** + * Portfolio grid ********************************************************** +*/ +// Nicolas Gallagher's micro clearfix hack +// http://nicolasgallagher.com/micro-clearfix-hack/ +.clearfix:before, +.clearfix:after { +content: " "; +display: table; +} +.clearfix:after { +clear: both; +} +.project { +width: 33.33%; +height: 250px; +float: left; +vertical-align: middle; +box-sizing: border-box; +padding: 10px; +} +.thumbnail{ +width: 100%; +height: 230px; +overflow: hidden; +} +.thumbnail img{ +width: 500px; +height: auto; +position: relative; +left: -25%; +top: -5%; +} +.thumbnail a{ +float: left; +position: relative; +width: 100%; +height: 230px; +} +.thumbnail a span { +display: none; +position: absolute; +top: 0; +left: 0; +bottom: 0; +right: 0; +background: rgba(0,0,0,0.4); +color: $grey-color-light; +padding: 40px; +text-align: center; +} +.thumbnail a:hover span { +display: block; +} +/** + * Portfolio pages ********************************************************** +*/ +.blankbox{ +background: $theme-color; +} +.img_row{ +height: $img_height; +width: 100%; +overflow: hidden; +box-sizing:border-box; +padding: $img_spacing; +} +.col{ +width: 100%; +height: 100%; +float: left; +object-fit: cover; +box-sizing:border-box; +padding: $img_spacing; +} +.right{ +float: right; +} +.one { +width:33.33%; +} +.two { +width: 66.66%; +} +.three{ +width: 100%; +} +.caption{ +height: 100%; +color: $caption_color; +text-align: center; +vertical-align: middle; +font-size: $caption_font_size; +} + +/** +* tags +*/ + + + +.tag-box { + list-style: none; + margin: 0; + padding: 4px 0; + overflow: hidden; + *zoom: 1; +} + +.tag-box:before, .tag-box:after { + display: table; + content: ""; + line-height: 0; +} + +.tag-box:after { + clear: both; +} + +.tag-box.inline li { + float: left; + font-size: 14px; + font-size: 0.875rem; + line-height: 2.5; +} + +.tag-box a { + padding: 4px 6px; + margin: 2px; + background-color: #e6e6e6; + -webkit-border-radius: 4px; + -moz-border-radius: 4px; + border-radius: 4px; + text-decoration: none; +} + +.tag-box a span { + vertical-align: super; + font-size: 10px; + font-size: 0.625rem; +} + +/** +* Search box +*/ + +.search-box { + height: $text-field-height; +} + +.search-button { + -webkit-appearance: none; + height: $button-height; + background-color:$theme-color; + color: $white; + font-family: $base-font-family; + font-weight: 100; + border: solid $white 1px; + font-size: 14px; +} + +.search-form { + margin-top:5px; +} + +@media only screen and (max-width: 768px) { + .search-button { + display: none; + } +} + +/** +* header image +*/ +.head_img { + margin-top:-44px; + padding:0; + margin-bottom:50px; + background-color:#3a7cc0; +} +.head_img img { + padding:0; +} + +/** +* +*/ +@media only screen and (max-width: 760px) { + #is-mobile { display: none; } +} + + +/** +* instagram icon +*/ +.insta { + background: #8a3ab9; /* For browsers that do not support gradients */ + background: -webkit-linear-gradient(#8a3ab9 , #4c68d7 5%, #cd486b, #fbad50, #fccc63); /* For Safari 5.1 to 6.0 */ + background: -o-linear-gradient(#8a3ab9, #4c68d7 5%, #cd486b, #fbad50, #fccc63); /* For Opera 11.1 to 12.0 */ + background: -moz-linear-gradient(#8a3ab9, #4c68d7 5%, #cd486b, #fbad50, #fccc63); /* For Firefox 3.6 to 15 */ + background: linear-gradient(#8a3ab9, #4c68d7 5%, #cd486b, #fbad50, #fccc63); /* Standard syntax */ + color:transparent; + -webkit-background-clip: text; + background-clip: text; + padding: 7px; + float: left; +} + + +/** +* Slider navigation +*/ +.nav_button 
{ + background-color:#3a7cc0; + width: 30px; + height: 30px; + color: white; +} \ No newline at end of file diff --git a/_sass/_syntax-highlighting.scss b/_sass/_syntax-highlighting.scss new file mode 100755 index 0000000..8a26a1a --- /dev/null +++ b/_sass/_syntax-highlighting.scss @@ -0,0 +1,136 @@ +.highlight code, .highlight pre { +color:#19B5FE; +background-color:#333; +} + +.highlight .hll { +background-color:#222; +} + +.highlight .err { +color:#e37170; +background-color:#3d3535; +} + +.highlight .k { +color:#ffff86; +} + +.highlight .p { +color:#41706f; +} + +.highlight .cs { +color:#cd0000; +font-weight:700; +} + +.highlight .gd { +color:#cd0000; +} + +.highlight .ge { +color:#ccc; +font-style:italic; +} + +.highlight .gr { +color:red; +} + +.highlight .go { +color:gray; +} + +.highlight .gs { +color:#ccc; +font-weight:700; +} + +.highlight .gu { +color:purple; +font-weight:700; +} + +.highlight .gt { +color:#0040D0; +} + +.highlight .kc { +color:#dca3a3; +} + +.highlight .kd { +color:#ffff86; +} + +.highlight .kn { +color:#dfaf8f; +font-weight:700; +} + +.highlight .kp { +color:#cdcf99; +} + +.highlight .kr { +color:#cdcd00; +} + +.highlight .ni { +color:#c28182; +} + +.highlight .ne { +color:#c3bf9f; +font-weight:700; +} + +.highlight .nn { +color:#8fbede; +} + +.highlight .vi { +color:#ffffc7; +} + +.highlight .c,.preview-zenburn .highlight .g,.preview-zenburn .highlight .cm,.preview-zenburn .highlight .cp,.preview-zenburn .highlight .c1 { +color:#7f9f7f; +} + +.highlight .l,.preview-zenburn .highlight .x,.preview-zenburn .highlight .no,.preview-zenburn .highlight .nd,.preview-zenburn .highlight .nl,.preview-zenburn .highlight .nx,.preview-zenburn .highlight .py,.preview-zenburn .highlight .w { +color:#ccc; +} + +.highlight .n,.preview-zenburn .highlight .nv,.preview-zenburn .highlight .vg { +color:#ffffff; +} + +.highlight .o,.preview-zenburn .highlight .ow { +color:#f0efd0; +} + +.highlight .gh,.preview-zenburn .highlight .gp { +color:#dcdccc; +font-weight:700; +} + +.highlight .gi,.preview-zenburn .highlight .kt { +color:#00cd00; +} + +.highlight .ld,.preview-zenburn .highlight .s,.preview-zenburn .highlight .sb,.preview-zenburn .highlight .sc,.preview-zenburn .highlight .sd,.preview-zenburn .highlight .s2,.preview-zenburn .highlight .se,.preview-zenburn .highlight .sh,.preview-zenburn .highlight .si,.preview-zenburn .highlight .sx,.preview-zenburn .highlight .sr,.preview-zenburn .highlight .s1,.preview-zenburn .highlight .ss { +color:#cc9393; +} + +.highlight .m,.preview-zenburn .highlight .mf,.preview-zenburn .highlight .mh,.preview-zenburn .highlight .mi,.preview-zenburn .highlight .mo,.preview-zenburn .highlight .il { +color:#8cd0d3; +} + +.highlight .na,.preview-zenburn .highlight .nt { +color:#9ac39f; +} + +.highlight .nb,.preview-zenburn .highlight .nc,.preview-zenburn .highlight .nf,.preview-zenburn .highlight .bp,.preview-zenburn .highlight .vc { +color:#efef8f; +} \ No newline at end of file diff --git a/about.md b/about.md new file mode 100755 index 0000000..3815b70 --- /dev/null +++ b/about.md @@ -0,0 +1,29 @@ +--- +layout: page +title: about +permalink: /about/ +--- + + + + +

Vishnu Viswanath

+I am a Data Engineer with 6 years of experience in designing and building scalable and efficient systems, and I am proficient in most of the BigData stack. My current interests include Functional Programming, Reactive Programming, Distributed Systems, Real-time Processing and Deep Learning.

+Welcome to my blog, where I hope to share my knowledge in the fields of BigData and Machine Learning. I believe any topic, if understood properly, can be explained in simple terms, and I intend to do so through this blog.

+
+
+ + + + + + + + +
+ +I also have a serious case of wanderlust and in between fiddling with all these technologies I travel as much as possible with my wife. You can follow our travel instagram here! + +{% include pop-up-subscribe.html %} + + diff --git a/css/main.scss b/css/main.scss new file mode 100755 index 0000000..f03d1c3 --- /dev/null +++ b/css/main.scss @@ -0,0 +1,49 @@ +--- +# Only the main Sass file needs front matter (the dashes are enough) +--- +@charset "utf-8"; + + + +// Our variables +$base-font-family: Helvetica, Arial, sans-serif; +$base-font-size: 16px; +$small-font-size: $base-font-size * 0.875; +$base-line-height: 1.5; + +$spacing-unit: 30px; + +$text-color: #111; +$background-color: #fdfdfd; +$brand-color: #2a7ae2; + +$grey-color: #828282; +$grey-color-light: lighten($grey-color, 40%); +$grey-color-dark: darken($grey-color, 25%); + +$on-palm: 600px; +$on-laptop: 800px; + + + +// Using media queries with like this: +// @include media-query($palm) { +// .wrapper { +// padding-right: $spacing-unit / 2; +// padding-left: $spacing-unit / 2; +// } +// } +@mixin media-query($device) { + @media screen and (max-width: $device) { + @content; + } +} + + + +// Import partials from `sass_dir` (defaults to `_sass`) +@import + "base", + "layout", + "syntax-highlighting" +; diff --git a/datascience/kaggle/AnimalShelter/AnimalShelterPreprocess.java b/datascience/kaggle/AnimalShelter/AnimalShelterPreprocess.java deleted file mode 100644 index 866accb..0000000 --- a/datascience/kaggle/AnimalShelter/AnimalShelterPreprocess.java +++ /dev/null @@ -1,434 +0,0 @@ -package weka.filters; - -import weka.core.*; -import weka.core.Capabilities.*; - -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.*; - -/** - * Created by vishnu on 6/20/16. - */ -public class AnimalShelterPreprocess extends SimpleStreamFilter implements OptionHandler{ - - - protected int nameIndex = 1; - protected int ageUponoutcomeIndex = 5; - protected int breedIndex = 6; - protected int dateTimeIndex = 2; - protected int colorIndex = 7; - protected int sexUponOutcomeIndex = 4; - - int noNameIndex = 0; - int ageCatIndex = 0; - int sexIndex = 0; - int seasonIndex = 0; - - int numNewAttrib = 0; - - static final String DATE_FORMAT = "MM/dd/yy HH:mm"; - static final String YOUNG = "young"; - static final String TEEN = "teen"; - static final String ADULT = "adult"; - static final String OLD = "old"; - static final String BABY = "baby"; - - - static final String WINTER = "winter"; - static final String SUMMER = "summer"; - static final String SPRING = "spring"; - static final String FALL = "fall"; - - static final String YES = "yes"; - static final String NO = "no"; - - static final String MALE = "male"; - static final String FEMALE = "female"; - static final String UNKNOWN = "unknown"; - - static Map breedValuesMap = new HashMap(); - static Set newBreedValues = new HashSet(); - static Map colorValuesMap = new HashMap(); - static Set newColorValues = new HashSet(); - static boolean initSetup = false; - static Instances result; - - /** - * 1. normalize ageUponOutcome to days - * 2. create a new filed called, no_name and set it to true or false - * @return - */ - public Enumeration