
Commit aac9f15

[TESTS/DOCS][SPARK-659] Updated HDFS.auth_to_local, changed the principal to "alice" (apache#276)
* [TESTS][SPARK-659] In HDFS tests, updated the HDFS auth_to_local setting and changed the principal to "alice".
* Updated the docs.
* Incorporated suggestions from Evan and Stavros: clarified the Spark User section of the docs, added a run_terasort_job() helper function, and more.
1 parent 141e49c commit aac9f15

File tree

3 files changed (+104, -28 lines)


docs/kerberos.md

Lines changed: 20 additions & 0 deletions
@@ -121,6 +121,11 @@ single `krb5.conf` file for all of the its drivers.
 }
 ```

+1. Make sure all users have write permission to the history HDFS directory. In an HDFS client:
+```bash
+hdfs dfs -chmod 1777 <history directory>
+```
+
 ## Job Submission

 To authenticate to a Kerberos KDC, Spark on Mesos supports keytab files as well as ticket-granting tickets (TGTs).
@@ -146,13 +151,27 @@ You can also set the base64 encoded krb5.conf after install time:
 **Note** This setting `SPARK_MESOS_KRB5_CONF_BASE64` will overwrite/override any settings set with
 `SPARK_SECURITY_KERBEROS_KDC_HOSTNAME`, `SPARK_SECURITY_KERBEROS_KDC_PORT`, and `SPARK_SECURITY_KERBEROS_REALM`

+### Setting the Spark User
+
+By default, when Kerberos is enabled, Spark runs as the OS user
+corresponding to the primary of the specified Kerberos principal.
+For example, the principal "alice@LOCAL" would map to the username "alice".
+If it is known that "alice" is not available as an OS user, either in
+the docker image or on the host,
+the Spark user should be specified as "root" or "nobody" instead:
+
+```
+--conf spark.mesos.driverEnv.SPARK_USER=<Spark user>
+```
+
 ### Keytab Authentication

 Submit the job with the keytab:

 dcos spark run --submit-args="\
 --kerberos-principal user@REALM \
 --keytab-secret-path /__dcos_base64__hdfs-keytab \
+--conf spark.mesos.driverEnv.SPARK_USER=<spark user> \
 --conf ... --class MySparkJob <url> <args>"

 ### TGT Authentication
@@ -162,6 +181,7 @@ Submit the job with the ticket:
 dcos spark run --submit-args="\
 --kerberos-principal user@REALM \
 --tgt-secret-path /__dcos_base64__tgt \
+--conf spark.mesos.driverEnv.SPARK_USER=<spark user> \
 --conf ... --class MySparkJob <url> <args>"

 **Note:** You can access external (i.e. non-DC/OS) Kerberos-secured HDFS clusters from Spark on Mesos.

tests/hdfs_auth.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+"""
+Authnz specifics for HDFS tests
+"""
+import logging
+import base64
+
+
+log = logging.getLogger(__name__)
+
+
+USERS = [
+    "hdfs",
+    "alice",
+    "bob",
+]
+
+
+def get_principal_to_user_mapping() -> str:
+    """
+    Kerberized HDFS maps the primary component of a principal to local users, so
+    we need to create an appropriate mapping to test authorization functionality.
+    :return: A base64-encoded string of principal->user mappings
+    """
+    rules = [
+        "RULE:[2:$1@$0](^hdfs@.*$)s/.*/hdfs/",
+        "RULE:[1:$1@$0](^nobody@.*$)s/.*/nobody/"
+    ]
+
+    for user in USERS:
+        rules.append("RULE:[1:$1@$0](^{user}@.*$)s/.*/{user}/".format(user=user))
+
+    return base64.b64encode('\n'.join(rules).encode("utf-8")).decode("utf-8")
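For reference, a quick way to see what this helper emits is to decode its return value. This is a minimal sketch, assuming the repository layout above so that `tests.hdfs_auth` is importable from the project root:

```python
import base64

from tests.hdfs_auth import get_principal_to_user_mapping

# Decoding the base64 value yields one auth_to_local RULE per line; with the
# USERS list above, the "alice" entry comes out as:
#   RULE:[1:$1@$0](^alice@.*$)s/.*/alice/
print(base64.b64decode(get_principal_to_user_mapping()).decode("utf-8"))
```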

tests/test_hdfs.py

Lines changed: 52 additions & 28 deletions
@@ -14,18 +14,21 @@
 import sdk_security
 import sdk_tasks

+from tests import hdfs_auth
 from tests import utils


 log = logging.getLogger(__name__)
 DEFAULT_HDFS_TASK_COUNT=10
 GENERIC_HDFS_USER_PRINCIPAL = "hdfs@{realm}".format(realm=sdk_auth.REALM)
+ALICE_PRINCIPAL = "alice@{realm}".format(realm=sdk_auth.REALM)
 KEYTAB_SECRET_PATH = os.getenv("KEYTAB_SECRET_PATH", "__dcos_base64___keytab")
 # To do: change when no longer using HDFS stub universe
 HDFS_PACKAGE_NAME='beta-hdfs'
 HDFS_SERVICE_NAME='hdfs'
-KERBEROS_ARGS = ["--kerberos-principal", GENERIC_HDFS_USER_PRINCIPAL,
-                 "--keytab-secret-path", "/{}".format(KEYTAB_SECRET_PATH)]
+KERBEROS_ARGS = ["--kerberos-principal", ALICE_PRINCIPAL,
+                 "--keytab-secret-path", "/{}".format(KEYTAB_SECRET_PATH),
+                 "--conf", "spark.mesos.driverEnv.SPARK_USER={}".format(utils.SPARK_USER)]
 HDFS_CLIENT_ID = "hdfsclient"
 SPARK_HISTORY_USER = "nobody"

@@ -64,21 +67,27 @@ def hdfs_with_kerberos(configure_security_hdfs):
                 )
             )
         principals.append(GENERIC_HDFS_USER_PRINCIPAL)
+        principals.append(ALICE_PRINCIPAL)

         kerberos_env = sdk_auth.KerberosEnvironment()
         kerberos_env.add_principals(principals)
         kerberos_env.finalize()
         service_kerberos_options = {
             "service": {
-                "kerberos": {
-                    "enabled": True,
-                    "kdc_host_name": kerberos_env.get_host(),
-                    "kdc_host_port": kerberos_env.get_port(),
-                    "keytab_secret": kerberos_env.get_keytab_path(),
-                    "primary": primaries[0],
-                    "primary_http": primaries[1],
-                    "realm": sdk_auth.REALM
+                "security": {
+                    "kerberos": {
+                        "enabled": True,
+                        "kdc": {
+                            "hostname": kerberos_env.get_host(),
+                            "port": int(kerberos_env.get_port())
+                        },
+                        "keytab_secret": kerberos_env.get_keytab_path(),
+                        "realm": kerberos_env.get_realm()
+                    }
                 }
+            },
+            "hdfs": {
+                "security_auth_to_local": hdfs_auth.get_principal_to_user_mapping()
             }
         }

@@ -116,16 +125,24 @@ def setup_hdfs_client(hdfs_with_kerberos):
         sdk_marathon.install_app(hdfsclient_app_def)

         sdk_auth.kinit(HDFS_CLIENT_ID, keytab="hdfs.keytab", principal=GENERIC_HDFS_USER_PRINCIPAL)
+        hdfs_cmd("mkdir -p /users/alice")
+        hdfs_cmd("chown alice:users /users/alice")
         yield

     finally:
         sdk_marathon.destroy_app(HDFS_CLIENT_ID)


+def hdfs_cmd(cmd):
+    sdk_tasks.task_exec(HDFS_CLIENT_ID, "bin/hdfs dfs -{}".format(cmd))
+
+
 @pytest.fixture(scope='module')
 def setup_history_server(hdfs_with_kerberos, setup_hdfs_client, configure_universe):
     try:
-        sdk_tasks.task_exec(HDFS_CLIENT_ID, "bin/hdfs dfs -mkdir /history")
+        sdk_auth.kinit(HDFS_CLIENT_ID, keytab="hdfs.keytab", principal=GENERIC_HDFS_USER_PRINCIPAL)
+        hdfs_cmd("mkdir /history")
+        hdfs_cmd("chmod 1777 /history")

         shakedown.install_package(
             package_name=utils.HISTORY_PACKAGE_NAME,
@@ -161,28 +178,34 @@ def setup_spark(hdfs_with_kerberos, setup_history_server, configure_security_spa
     utils.teardown_spark()


+def _run_terasort_job(terasort_class, app_args, expected_output):
+    jar_url = 'https://downloads.mesosphere.io/spark/examples/spark-terasort-1.1-jar-with-dependencies_2.11.jar'
+    submit_args = ["--class", terasort_class] + KERBEROS_ARGS
+    utils.run_tests(app_url=jar_url,
+                    app_args=" ".join(app_args),
+                    expected_output=expected_output,
+                    args=submit_args)
+
+
 @pytest.mark.skipif(not utils.hdfs_enabled(), reason='HDFS_ENABLED is false')
 @pytest.mark.sanity
 def test_terasort_suite():
-    jar_url = 'https://downloads.mesosphere.io/spark/examples/spark-terasort-1.1-jar-with-dependencies_2.11.jar'
+    data_dir = "hdfs:///users/alice"
+    terasort_in = "{}/{}".format(data_dir, "terasort_in")
+    terasort_out = "{}/{}".format(data_dir, "terasort_out")
+    terasort_validate = "{}/{}".format(data_dir, "terasort_validate")

-    teragen_args=["--class", "com.github.ehiggs.spark.terasort.TeraGen"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="1g hdfs:///terasort_in",
-                    expected_output="Number of records written",
-                    args=teragen_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraGen",
+                      app_args=["1g", terasort_in],
+                      expected_output="Number of records written")

-    terasort_args = ["--class", "com.github.ehiggs.spark.terasort.TeraSort"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="hdfs:///terasort_in hdfs:///terasort_out",
-                    expected_output="",
-                    args=terasort_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraSort",
+                      app_args=[terasort_in, terasort_out],
+                      expected_output="")

-    teravalidate_args = ["--class", "com.github.ehiggs.spark.terasort.TeraValidate"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="hdfs:///terasort_out hdfs:///terasort_validate",
-                    expected_output="partitions are properly sorted",
-                    args=teravalidate_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraValidate",
+                      app_args=[terasort_out, terasort_validate],
+                      expected_output="partitions are properly sorted")


 @pytest.mark.skipif(not utils.hdfs_enabled(), reason='HDFS_ENABLED is false')
@@ -208,8 +231,9 @@ def has_running_executors():
                 "--conf", "spark.cores.max=8",
                 "--conf", "spark.executors.cores=4"]

+    data_dir = "hdfs:///users/alice"
     driver_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
-                                 app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
+                                 app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
                                  app_name=utils.SPARK_APP_NAME,
                                  args=(KERBEROS_ARGS + job_args))
     log.info("Started supervised driver {}".format(driver_id))
