This repository was archived by the owner on Feb 16, 2024. It is now read-only.
26 commits
eee70dc
[BAHIR-75] Initital code delivery for WebHDFS data source
sourav-mazumder Nov 15, 2016
c2d53fd
[BAHIR-75] - fix RAT excludes for DataSourceRegister
ckadner Nov 16, 2016
af805e3
[BAHIR-75] - minor README fixes
ckadner Nov 16, 2016
78ff29c
[BAHIR-75] - minor README fixes (2)
ckadner Nov 16, 2016
24e79c9
[BAHIR-75] - include DataSourceRegister in Maven build
ckadner Nov 16, 2016
365ee1f
[BAHIR-75] - fix package declaration in webhdfs package object
ckadner Nov 16, 2016
d4c6e56
[BAHIR-75] - fix 798 Scalastyle violations
ckadner Nov 16, 2016
d7b3bf7
[BAHIR-75] - use "${scala.binary.version}"" instead of "2.11"
ckadner Nov 16, 2016
a77e372
[BAHIR-75] - add "spark-" prefix to artifactId consistent with other …
ckadner Nov 17, 2016
6936bd8
[BAHIR-75][WIP] - rudimentary extension of WebHdfsFileSystem
ckadner Dec 1, 2016
a9ef907
[BAHIR-75][WIP] - rudimentary extension of WebHdfsFileSystem (use ori…
ckadner Dec 1, 2016
f791f1c
WebHdfsConnector prototype
sourav-mazumder Dec 7, 2016
2932f99
[BAHIR-75][WIP] - override WebHdfsFileSystem - code style fixes, remo…
ckadner Dec 7, 2016
2497971
[BAHIR-75][WIP] - override WebHdfsFileSystem - more code style fixes,…
ckadner Dec 7, 2016
2971880
[BAHIR-75][WIP] - override WebHdfsFileSystem - more and more code sty…
ckadner Dec 7, 2016
a9bbe31
[BAHIR-75][WIP] - override WebHdfsFileSystem - add printouts for debu…
Dec 8, 2016
f6429c9
[BAHIR-75][WIP] - override WebHdfsFileSystem - add printouts for debu…
ckadner Dec 8, 2016
39f5985
[BAHIR-75][WIP] - write to remote via webhdfs
sourav-mazumder Dec 9, 2016
183b1ec
[BAHIR-75][WIP] - override WebHdfsFileSystem - fix code style errors,…
ckadner Dec 9, 2016
59cad8e
[BAHIR-75][WIP] - write files via webhdfs
sourav-mazumder Dec 12, 2016
d047318
[BAHIR-75][WIP] - write files via webhdfs continued
sourav-mazumder Dec 22, 2016
8beedf9
[BAHIR-75][WIP] - custom WebHdfsFileSystem - minor scalastyle fixes
ckadner Dec 22, 2016
b63a202
Merge branch 'master' into BAHIR-75-WebHdfsFileSystem
ckadner Dec 22, 2016
b467c52
[BAHIR-75][WIP] - remove unnecessary dependencies from pom.xml
ckadner Dec 22, 2016
d3de3a7
[BAHIR-75][WIP] - minor fixes
sourav-mazumder Dec 23, 2016
b103aa9
Merge branch 'BAHIR-75-WebHdfsFileSystem' of https://github.com/soura…
sourav-mazumder Dec 23, 2016
55 changes: 55 additions & 0 deletions datasource-webhdfs/README.md
@@ -0,0 +1,55 @@
A custom data source to read and write data from and to remote HDFS clusters using the [WebHDFS](https://hadoop.apache.org/docs/r2.7.3/hadoop-project-dist/hadoop-hdfs/WebHDFS.html) protocol.

## Linking

Using SBT:

```scala
libraryDependencies += "org.apache.bahir" %% "spark-datasource-webhdfs" % "2.1.0-SNAPSHOT"
```

Using Maven (Scala version 2.11):

```xml
<dependency>
<groupId>org.apache.bahir</groupId>
<artifactId>spark-datasource-webhdfs_2.11</artifactId>
<version>2.1.0-SNAPSHOT</version>
</dependency>
```

This library can also be added to Spark jobs launched through `spark-shell` or `spark-submit` by using the `--packages` command line option.
For example, to include it when starting the spark shell:

```Shell
$ bin/spark-shell --packages org.apache.bahir:spark-datasource-webhdfs_2.11:2.1.0-SNAPSHOT
```

Unlike using `--jars`, using `--packages` ensures that this library and its dependencies will be added to the classpath.
The `--packages` argument can also be used with `bin/spark-submit`.

This library is compiled for Scala 2.10 and 2.11, and intended to support Spark 2.0 onwards.

## Examples

A data frame can be created using this custom data source as shown below:

```scala
val filePath = "webhdfs://<server name or ip>/gateway/default/webhdfs/v1/<file or folder name>"

val df = spark.read
.format("webhdfs")
.option("certValidation", "Y")
.option("userCred", "user1:pass1")
.option("header", "true")
.option("partitions", "8")
.load(filePath)
```

## Configuration options

* `certValidation` Set this to `'Y'` or `'N'`. If set to `'N'`, this component skips validation of the SSL certificate. Otherwise it downloads the certificate and validates it.
* `userCred` Set this to `'userid:password'`, the credentials required by the remote HDFS cluster to access the file.
* `partitions` The number of parallel connections the data source opens per file to read data from the remote HDFS cluster. If this option is not specified, a default value of 4 is used. The recommended value is the file size divided by the HDFS block size, rounded up to the next integer, or a multiple of that. For example, if the file size in HDFS is 0.95 GB and the block size is 128 MB, use 8 (or a multiple of 8) partitions. However, the number of partitions should not exceed (or only slightly exceed) the maximum number of parallel tasks your Spark cluster can spawn.
* `format` The format of the file. Currently only `'csv'` is supported; `'csv'` is also the default if this option is not specified.
* `output` Specify either `'LIST'` or `'Data'`. The default is `'Data'`, which returns the actual data in the file; if a folder is specified, data from all files in that folder is fetched at once. If `'LIST'` is specified, the files within the folder are listed instead.
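The recommended partition count described above is simply the file size divided by the block size, rounded up. As a sanity check on that arithmetic, it can be sketched as a small helper (`recommendedPartitions` is a hypothetical name used here for illustration; it is not part of this data source):

```scala
// Recommended partition count: the next integer at or above fileSize / blockSize.
// `recommendedPartitions` is a hypothetical helper, not part of this library.
def recommendedPartitions(fileSizeBytes: Long, blockSizeBytes: Long): Int =
  math.ceil(fileSizeBytes.toDouble / blockSizeBytes.toDouble).toInt

// A 0.95 GB file with a 128 MB block size: ceil(7.6) = 8 partitions.
val partitions = recommendedPartitions((0.95 * 1024 * 1024 * 1024).toLong, 128L * 1024 * 1024)
```

The resulting value (or a multiple of it) would then be passed as `.option("partitions", partitions.toString)`.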
63 changes: 63 additions & 0 deletions datasource-webhdfs/pom.xml
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.bahir</groupId>
<artifactId>bahir-parent_2.11</artifactId>
<version>2.1.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

<groupId>org.apache.bahir</groupId>
<artifactId>spark-datasource-webhdfs_2.11</artifactId>
<properties>
<sbt.project.name>datasource-webhdfs</sbt.project.name>
</properties>
<packaging>jar</packaging>
<name>Apache Bahir - Spark DataSource WebHDFS</name>
<url>http://bahir.apache.org/</url>

<dependencies>
<dependency>
<groupId>org.scalaj</groupId>
<artifactId>scalaj-http_${scala.binary.version}</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>