
Commit aac9f15

[TESTS/DOCS][SPARK-659] Updated HDFS.auth_to_local, changed the principal to "alice" (apache#276)
* [TESTS][SPARK-659] In HDFS tests, updated the HDFS auth_to_local setting and changed the principal to "alice".
* Updated the docs.
* Incorporated suggestions from Evan and Stavros: clarified the Spark User section of the docs, added a run_terasort_job() helper function, and more.
1 parent 141e49c commit aac9f15

File tree

3 files changed (+104, -28 lines)


docs/kerberos.md

Lines changed: 20 additions & 0 deletions
@@ -121,6 +121,11 @@ single `krb5.conf` file for all of the its drivers.
 }
 ```

+1. Make sure all users have write permission to the history HDFS directory. In an HDFS client:
+```bash
+hdfs dfs -chmod 1777 <history directory>
+```
+
 ## Job Submission

 To authenticate to a Kerberos KDC, Spark on Mesos supports keytab files as well as ticket-granting tickets (TGTs).
@@ -146,13 +151,27 @@ You can also set the base64 encoded krb5.conf after install time:
 **Note** This setting `SPARK_MESOS_KRB5_CONF_BASE64` will overwrite/override any settings set with
 `SPARK_SECURITY_KERBEROS_KDC_HOSTNAME`, `SPARK_SECURITY_KERBEROS_KDC_PORT`, and `SPARK_SECURITY_KERBEROS_REALM`

+### Setting the Spark User
+
+By default, when Kerberos is enabled, Spark runs as the OS user
+corresponding to the primary of the specified Kerberos principal.
+For example, the principal "alice@LOCAL" would map to the username "alice".
+If it is known that "alice" is not available as an OS user, either in
+the docker image or on the host,
+the Spark user should be specified as "root" or "nobody" instead:
+
+```
+--conf spark.mesos.driverEnv.SPARK_USER=<Spark user>
+```
+
 ### Keytab Authentication

 Submit the job with the keytab:

 dcos spark run --submit-args="\
 --kerberos-principal user@REALM \
 --keytab-secret-path /__dcos_base64__hdfs-keytab \
+--conf spark.mesos.driverEnv.SPARK_USER=<spark user> \
 --conf ... --class MySparkJob <url> <args>"

 ### TGT Authentication
@@ -162,6 +181,7 @@ Submit the job with the ticket:
 dcos spark run --submit-args="\
 --kerberos-principal user@REALM \
 --tgt-secret-path /__dcos_base64__tgt \
+--conf spark.mesos.driverEnv.SPARK_USER=<spark user> \
 --conf ... --class MySparkJob <url> <args>"

 **Note:** You can access external (i.e. non-DC/OS) Kerberos-secured HDFS clusters from Spark on Mesos.

tests/hdfs_auth.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+"""
+Authnz specifics for HDFS tests
+"""
+import logging
+import base64
+
+
+log = logging.getLogger(__name__)
+
+
+USERS = [
+    "hdfs",
+    "alice",
+    "bob",
+]
+
+
+def get_principal_to_user_mapping() -> str:
+    """
+    Kerberized HDFS maps the primary component of a principal to local users, so
+    we need to create an appropriate mapping to test authorization functionality.
+    :return: A base64-encoded string of principal->user mappings
+    """
+    rules = [
+        "RULE:[2:$1@$0](^hdfs@.*$)s/.*/hdfs/",
+        "RULE:[1:$1@$0](^nobody@.*$)s/.*/nobody/"
+    ]
+
+    for user in USERS:
+        rules.append("RULE:[1:$1@$0](^{user}@.*$)s/.*/{user}/".format(user=user))
+
+    return base64.b64encode('\n'.join(rules).encode("utf-8")).decode("utf-8")
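For reference, a quick way to see what this helper emits is to decode its return value. This is a minimal sketch, assuming the repository layout above so that `tests.hdfs_auth` is importable from the project root:

```python
import base64

from tests.hdfs_auth import get_principal_to_user_mapping

# Decoding the base64 value yields one auth_to_local RULE per line; with the
# USERS list above, the "alice" entry comes out as:
#   RULE:[1:$1@$0](^alice@.*$)s/.*/alice/
print(base64.b64decode(get_principal_to_user_mapping()).decode("utf-8"))
```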

tests/test_hdfs.py

Lines changed: 52 additions & 28 deletions
@@ -14,18 +14,21 @@
 import sdk_security
 import sdk_tasks

+from tests import hdfs_auth
 from tests import utils


 log = logging.getLogger(__name__)
 DEFAULT_HDFS_TASK_COUNT=10
 GENERIC_HDFS_USER_PRINCIPAL = "hdfs@{realm}".format(realm=sdk_auth.REALM)
+ALICE_PRINCIPAL = "alice@{realm}".format(realm=sdk_auth.REALM)
 KEYTAB_SECRET_PATH = os.getenv("KEYTAB_SECRET_PATH", "__dcos_base64___keytab")
 # To do: change when no longer using HDFS stub universe
 HDFS_PACKAGE_NAME='beta-hdfs'
 HDFS_SERVICE_NAME='hdfs'
-KERBEROS_ARGS = ["--kerberos-principal", GENERIC_HDFS_USER_PRINCIPAL,
-                 "--keytab-secret-path", "/{}".format(KEYTAB_SECRET_PATH)]
+KERBEROS_ARGS = ["--kerberos-principal", ALICE_PRINCIPAL,
+                 "--keytab-secret-path", "/{}".format(KEYTAB_SECRET_PATH),
+                 "--conf", "spark.mesos.driverEnv.SPARK_USER={}".format(utils.SPARK_USER)]
 HDFS_CLIENT_ID = "hdfsclient"
 SPARK_HISTORY_USER = "nobody"

@@ -64,21 +67,27 @@ def hdfs_with_kerberos(configure_security_hdfs):
                 )
             )
         principals.append(GENERIC_HDFS_USER_PRINCIPAL)
+        principals.append(ALICE_PRINCIPAL)

         kerberos_env = sdk_auth.KerberosEnvironment()
         kerberos_env.add_principals(principals)
         kerberos_env.finalize()
         service_kerberos_options = {
             "service": {
-                "kerberos": {
-                    "enabled": True,
-                    "kdc_host_name": kerberos_env.get_host(),
-                    "kdc_host_port": kerberos_env.get_port(),
-                    "keytab_secret": kerberos_env.get_keytab_path(),
-                    "primary": primaries[0],
-                    "primary_http": primaries[1],
-                    "realm": sdk_auth.REALM
+                "security": {
+                    "kerberos": {
+                        "enabled": True,
+                        "kdc": {
+                            "hostname": kerberos_env.get_host(),
+                            "port": int(kerberos_env.get_port())
+                        },
+                        "keytab_secret": kerberos_env.get_keytab_path(),
+                        "realm": kerberos_env.get_realm()
+                    }
                 }
+            },
+            "hdfs": {
+                "security_auth_to_local": hdfs_auth.get_principal_to_user_mapping()
             }
         }

@@ -116,16 +125,24 @@ def setup_hdfs_client(hdfs_with_kerberos):
         sdk_marathon.install_app(hdfsclient_app_def)

         sdk_auth.kinit(HDFS_CLIENT_ID, keytab="hdfs.keytab", principal=GENERIC_HDFS_USER_PRINCIPAL)
+        hdfs_cmd("mkdir -p /users/alice")
+        hdfs_cmd("chown alice:users /users/alice")
         yield

     finally:
         sdk_marathon.destroy_app(HDFS_CLIENT_ID)


+def hdfs_cmd(cmd):
+    sdk_tasks.task_exec(HDFS_CLIENT_ID, "bin/hdfs dfs -{}".format(cmd))
+
+
 @pytest.fixture(scope='module')
 def setup_history_server(hdfs_with_kerberos, setup_hdfs_client, configure_universe):
     try:
-        sdk_tasks.task_exec(HDFS_CLIENT_ID, "bin/hdfs dfs -mkdir /history")
+        sdk_auth.kinit(HDFS_CLIENT_ID, keytab="hdfs.keytab", principal=GENERIC_HDFS_USER_PRINCIPAL)
+        hdfs_cmd("mkdir /history")
+        hdfs_cmd("chmod 1777 /history")

         shakedown.install_package(
             package_name=utils.HISTORY_PACKAGE_NAME,
@@ -161,28 +178,34 @@ def setup_spark(hdfs_with_kerberos, setup_history_server, configure_security_spa
     utils.teardown_spark()


+def _run_terasort_job(terasort_class, app_args, expected_output):
+    jar_url = 'https://downloads.mesosphere.io/spark/examples/spark-terasort-1.1-jar-with-dependencies_2.11.jar'
+    submit_args = ["--class", terasort_class] + KERBEROS_ARGS
+    utils.run_tests(app_url=jar_url,
+                    app_args=" ".join(app_args),
+                    expected_output=expected_output,
+                    args=submit_args)
+
+
 @pytest.mark.skipif(not utils.hdfs_enabled(), reason='HDFS_ENABLED is false')
 @pytest.mark.sanity
 def test_terasort_suite():
-    jar_url = 'https://downloads.mesosphere.io/spark/examples/spark-terasort-1.1-jar-with-dependencies_2.11.jar'
+    data_dir = "hdfs:///users/alice"
+    terasort_in = "{}/{}".format(data_dir, "terasort_in")
+    terasort_out = "{}/{}".format(data_dir, "terasort_out")
+    terasort_validate = "{}/{}".format(data_dir, "terasort_validate")

-    teragen_args=["--class", "com.github.ehiggs.spark.terasort.TeraGen"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="1g hdfs:///terasort_in",
-                    expected_output="Number of records written",
-                    args=teragen_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraGen",
+                      app_args=["1g", terasort_in],
+                      expected_output="Number of records written")

-    terasort_args = ["--class", "com.github.ehiggs.spark.terasort.TeraSort"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="hdfs:///terasort_in hdfs:///terasort_out",
-                    expected_output="",
-                    args=terasort_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraSort",
+                      app_args=[terasort_in, terasort_out],
+                      expected_output="")

-    teravalidate_args = ["--class", "com.github.ehiggs.spark.terasort.TeraValidate"] + KERBEROS_ARGS
-    utils.run_tests(app_url=jar_url,
-                    app_args="hdfs:///terasort_out hdfs:///terasort_validate",
-                    expected_output="partitions are properly sorted",
-                    args=teravalidate_args)
+    _run_terasort_job(terasort_class="com.github.ehiggs.spark.terasort.TeraValidate",
+                      app_args=[terasort_out, terasort_validate],
+                      expected_output="partitions are properly sorted")


 @pytest.mark.skipif(not utils.hdfs_enabled(), reason='HDFS_ENABLED is false')
@@ -208,8 +231,9 @@ def has_running_executors():
                 "--conf", "spark.cores.max=8",
                 "--conf", "spark.executors.cores=4"]

+    data_dir = "hdfs:///users/alice"
     driver_id = utils.submit_job(app_url=utils.SPARK_EXAMPLES,
-                                 app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
+                                 app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
                                  app_name=utils.SPARK_APP_NAME,
                                  args=(KERBEROS_ARGS + job_args))
     log.info("Started supervised driver {}".format(driver_id))
