
Commit 6433570

Merge pull request apache#77 from mesosphere/add-pyspark-tests
Add pyspark test
2 parents e67eaf8 + 75086bc

3 files changed: +100 −12 lines

tests/jobs/PySparkTestInclude.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
def func():
    print("Import is working")

tests/jobs/pi_with_include.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
from __future__ import print_function
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys
from random import random
from operator import add

from pyspark.sql import SparkSession

import PySparkTestInclude

if __name__ == "__main__":
    """
    Usage: pi [partitions]
    """

    # Make sure we can include this user-provided module
    PySparkTestInclude.func()

    spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()

    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    def f(_):
        x = random() * 2 - 1
        y = random() * 2 - 1
        return 1 if x ** 2 + y ** 2 < 1 else 0

    count = spark.sparkContext.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
    print("Pi is roughly %f" % (4.0 * count / n))

    spark.stop()
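
The job is the stock PythonPi example plus the PySparkTestInclude.func() call. The estimator itself can be checked without a cluster: points drawn uniformly in a square land inside the inscribed circle with probability pi/4 (the same fraction whether the square is [-1, 1] x [-1, 1] as above or [0, 1) x [0, 1)). A pure-Python sketch of the same Monte Carlo step, illustrative only and not part of the commit:

from random import random

def estimate_pi(n):
    # Fraction of uniform points in [0, 1) x [0, 1) that fall inside the
    # unit circle approximates pi / 4.
    count = sum(1 for _ in range(n) if random() ** 2 + random() ** 2 < 1)
    return 4.0 * count / n

print("Pi is roughly %f" % estimate_pi(100000))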

tests/test.py

Lines changed: 47 additions & 12 deletions
@@ -13,14 +13,21 @@
 import shakedown
 
 
-def upload_jar(jar):
+def upload_file(file_path):
     conn = S3Connection(os.environ['AWS_ACCESS_KEY_ID'], os.environ['AWS_SECRET_ACCESS_KEY'])
     bucket = conn.get_bucket(os.environ['S3_BUCKET'])
-    basename = os.path.basename(jar)
+    basename = os.path.basename(file_path)
+
+    if basename.endswith('.jar'):
+        content_type = 'application/java-archive'
+    elif basename.endswith('.py'):
+        content_type = 'application/x-python'
+    else:
+        raise ValueError("Unexpected file type: {}. Expected .jar or .py file.".format(basename))
 
     key = Key(bucket, '{}/{}'.format(os.environ['S3_PREFIX'], basename))
-    key.metadata = {'Content-Type': 'application/java-archive'}
-    key.set_contents_from_filename(jar)
+    key.metadata = {'Content-Type': content_type}
+    key.set_contents_from_filename(file_path)
     key.make_public()
 
     jar_url = "http://{0}.s3.amazonaws.com/{1}/{2}".format(
@@ -31,10 +38,18 @@ def upload_jar(jar):
     return jar_url
 
 
-def submit_job(jar_url):
-    spark_job_runner_args = 'http://leader.mesos:5050 dcos \\"*\\" spark:only 2'
-    submit_args = "-Dspark.driver.memory=2g --class com.typesafe.spark.test.mesos.framework.runners.SparkJobRunner {0} {1}".format(
-        jar_url, spark_job_runner_args)
+def submit_job(app_resource_url, app_args, app_class, py_files):
+    if app_class is not None:
+        app_class_option = '--class {} '.format(app_class)
+    else:
+        app_class_option = ''
+    if py_files is not None:
+        py_files_option = '--py-files {} '.format(py_files)
+    else:
+        py_files_option = ''
+
+    submit_args = "-Dspark.driver.memory=2g {0}{1}{2} {3}".format(
+        app_class_option, py_files_option, app_resource_url, app_args)
     cmd = 'dcos --log-level=DEBUG spark --verbose run --submit-args="{0}"'.format(submit_args)
     print('Running {}'.format(cmd))
     stdout = subprocess.check_output(cmd, shell=True).decode('utf-8')
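
With the pyspark test's arguments, submit_job assembles a command along these lines; the URLs below are placeholders standing in for the real uploaded S3 URLs, not captured output:

# Placeholders for the uploaded S3 URLs:
app_resource_url = 'http://BUCKET.s3.amazonaws.com/PREFIX/pi_with_include.py'
py_files = 'http://BUCKET.s3.amazonaws.com/PREFIX/PySparkTestInclude.py'

submit_args = "-Dspark.driver.memory=2g {0}{1}{2} {3}".format(
    '',                                  # app_class_option: empty for a Python app
    '--py-files {} '.format(py_files),   # py_files_option
    app_resource_url,
    '30')                                # app_args: number of partitions
print('dcos --log-level=DEBUG spark --verbose run --submit-args="{0}"'.format(submit_args))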
@@ -52,14 +67,34 @@ def task_log(task_id):
     return stdout
 
 
-def main():
-    jar_url = upload_jar(os.getenv('TEST_JAR_PATH'))
-    task_id = submit_job(jar_url)
+def run_tests(app_path, app_args, expected_output, app_class=None, py_file_path=None):
+    app_resource_url = upload_file(app_path)
+    if py_file_path is not None:
+        py_file_url = upload_file(py_file_path)
+    else:
+        py_file_url = None
+    task_id = submit_job(app_resource_url, app_args, app_class, py_file_url)
     print('Waiting for task id={} to complete'.format(task_id))
     shakedown.wait_for_task_completion(task_id)
     log = task_log(task_id)
     print(log)
-    assert "All tests passed" in log
+    assert expected_output in log
+
+
+def main():
+    spark_job_runner_args = 'http://leader.mesos:5050 dcos \\"*\\" spark:only 2'
+    run_tests(os.getenv('TEST_JAR_PATH'),
+              spark_job_runner_args,
+              "All tests passed",
+              app_class='com.typesafe.spark.test.mesos.framework.runners.SparkJobRunner')
+
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    python_script_path = os.path.join(script_dir, 'jobs', 'pi_with_include.py')
+    py_file_path = os.path.join(script_dir, 'jobs', 'PySparkTestInclude.py')
+    run_tests(python_script_path,
+              '30',
+              "Pi is roughly 3",
+              py_file_path=py_file_path)
 
 
 if __name__ == '__main__':
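
The script depends on several environment variables, all referenced above: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_BUCKET, S3_PREFIX, and TEST_JAR_PATH. A guard like the following could fail fast before main() runs; a sketch only, not part of the change:

import os

REQUIRED_ENV = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY',
                'S3_BUCKET', 'S3_PREFIX', 'TEST_JAR_PATH']

missing = [name for name in REQUIRED_ENV if name not in os.environ]
if missing:
    raise EnvironmentError('missing environment variables: ' + ', '.join(missing))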
