
Commit f0249bd
WIP
1 parent e86fbdb
File tree: 1 file changed (+205, -25 lines)


dev/run-tests.py

Lines changed: 205 additions & 25 deletions
@@ -28,6 +28,199 @@
 USER_HOME = os.environ.get("HOME")


+all_modules = []
+
+
+class Module(object):
+
+    def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(),
+                 should_run_python_tests=False, should_run_r_tests=False):
+        self.name = name
+        self.dependencies = dependencies
+        self.source_file_prefixes = source_file_regexes
+        self.sbt_test_goals = sbt_test_goals
+        self.should_run_python_tests = should_run_python_tests
+        self.should_run_r_tests = should_run_r_tests
+
+        self.dependent_modules = set()
+        for dep in dependencies:
+            dep.dependent_modules.add(self)
+        all_modules.append(self)
+
+    def contains_file(self, filename):
+        return any(re.match(p, filename) for p in self.source_file_prefixes)
+
+
+root = Module(
+    name="root",
+    dependencies=[],
+    source_file_regexes=[],
+    sbt_test_goals=[
+        "test",
+    ]
+)
+
+
+sql = Module(
+    name="sql",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/(?!hive-thriftserver)",
+        "bin/spark-sql",
+        "examples/src/main/java/org/apache/spark/examples/sql/",
+        "examples/src/main/scala/org/apache/spark/examples/sql/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+        "sql/test",
+        "hive/test",
+    ])
+
+
+hive_thriftserver = Module(
+    name="hive-thriftserver",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive-thriftserver",
+        "sbin/start-thriftserver.sh",
+    ],
+    sbt_test_goals=[
+        "hive-thriftserver/test",
+    ]
+)
+
+
+mllib = Module(
+    name="mllib",
+    dependencies=[sql],
+    source_file_regexes=[
+        "examples/src/main/java/org/apache/spark/examples/mllib/",
+        "examples/src/main/scala/org/apache/spark/examples/mllib",
+        "data/mllib/",
+        "mllib/",
+    ],
+    sbt_test_goals=[
+        "mllib/test",
+        "examples/test",
+    ]
+)
+
+
+graphx = Module(
+    name="graphx",
+    dependencies=[],
+    source_file_regexes=[
+        "graphx/",
+    ],
+    sbt_test_goals=[
+        "graphx/test"
+    ]
+)
+
+
+streaming = Module(
+    name="streaming",
+    dependencies=[],
+    source_file_regexes=[
+        "external/",
+        "extras/java8-tests/",
+        "extras/kinesis-asl/",
+        "streaming",
+    ],
+    sbt_test_goals=[
+        "streaming/test",
+        "streaming-flume/test",
+        "streaming-flume-sink/test",
+        "streaming-kafka/test",
+        "streaming-mqtt/test",
+        "streaming-twitter/test",
+        "streaming-zeromq/test",
+    ]
+)
+
+
+examples = Module(
+    name="examples",
+    dependencies=[graphx, mllib, streaming, sql],
+    source_file_regexes=[
+        "examples/",
+    ],
+    sbt_test_goals=[
+        "examples/test",
+    ]
+)
+
+
+pyspark = Module(
+    name="pyspark",
+    dependencies=[mllib, streaming, sql],
+    source_file_regexes=[
+        "python/"
+    ],
+    should_run_python_tests=True
+)
+
+
+sparkr = Module(
+    name="sparkr",
+    dependencies=[sql, mllib],
+    source_file_regexes=[
+        "R/",
+    ],
+    should_run_r_tests=True
+)
+
+
+docs = Module(
+    name="docs",
+    dependencies=[],
+    source_file_regexes=[
+        "docs/",
+    ]
+)
+
+
+def determine_modules(filenames):
+    """
+    Given a list of filenames, return the set of modules that contain those files.
+    If a file is not associated with a more specific submodule, then this method will consider
+    that file to belong to the 'root' module.
+
+    >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"]))
+    ['pyspark', 'sql']
+    >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])]
+    ['root']
+    """
+    changed_modules = set()
+    for filename in filenames:
+        matched_at_least_one_module = False
+        for module in all_modules:
+            if module.contains_file(filename):
+                changed_modules.add(module)
+                matched_at_least_one_module = True
+        if not matched_at_least_one_module:
+            changed_modules.add(root)
+    return changed_modules
+
+
+def determine_modules_to_test(changed_modules):
+    """
+    Given a set of modules that have changed, compute the transitive closure of those modules'
+    dependent modules in order to determine the set of modules that should be tested.
+
+    >>> sorted(x.name for x in determine_modules_to_test([root]))
+    ['root']
+    >>> sorted(x.name for x in determine_modules_to_test([graphx]))
+    ['examples', 'graphx']
+    >>> sorted(x.name for x in determine_modules_to_test([sql]))
+    ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql']
+    """
+    modules_to_test = set()
+    for module in changed_modules:
+        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
+    return modules_to_test.union(set(changed_modules))
+
+
 def get_error_codes(err_code_file):
     """Function to retrieve all block numbers from the `run-tests-codes.sh`
     file to maintain backwards compatibility with the `run-tests-jenkins`
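
Note: this hunk is the heart of the change. Each Module registers itself in all_modules and back-links into its dependencies' dependent_modules sets, and determine_modules_to_test then walks those back-links recursively. A minimal standalone sketch of that fan-out, using toy node names rather than the Spark modules above:

    # Toy stand-in for the diff's Module class, reduced to the graph bookkeeping.
    class Node(object):
        def __init__(self, name, deps=()):
            self.name = name
            self.dependent_modules = set()
            for dep in deps:
                dep.dependent_modules.add(self)

    # Same shape as determine_modules_to_test: union each changed node
    # with the transitive closure of its dependents.
    def transitive_dependents(changed):
        result = set(changed)
        for node in changed:
            result |= transitive_dependents(node.dependent_modules)
        return result

    a = Node("a")
    b = Node("b", deps=[a])
    c = Node("c", deps=[b])
    print(sorted(n.name for n in transitive_dependents([a])))  # ['a', 'b', 'c']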
@@ -43,7 +236,7 @@ def get_error_codes(err_code_file):


 def exit_from_command_with_retcode(cmd, retcode):
-    print "[error] running", cmd, "; received return code", retcode
+    print "[error] running", ' '.join(cmd), "; received return code", retcode
     sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


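Note: cmd is the argument list handed to run_cmd, so it is flattened for display with ' '.join(cmd). In Python the separator string owns join; a list has no join method, so cmd.join(' ') would raise AttributeError. Quick illustration:

    cmd = ["build/sbt", "clean", "test"]
    print(' '.join(cmd))   # build/sbt clean test
    # cmd.join(' ')        # AttributeError: 'list' object has no attribute 'join'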
@@ -177,14 +370,14 @@ def build_spark_documentation():
     os.chdir(SPARK_HOME)


-def exec_maven(mvn_args=[]):
+def exec_maven(mvn_args=()):
     """Will call Maven in the current directory with the list of mvn_args passed
     in and returns the subprocess for any further processing"""

     run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)


-def exec_sbt(sbt_args=[]):
+def exec_sbt(sbt_args=()):
     """Will call SBT in the current directory with the list of mvn_args passed
     in and returns the subprocess for any further processing"""

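Note: swapping the [] defaults for () avoids Python's shared mutable default pitfall: a default list is created once, at function definition time, and any mutation inside the function leaks into every later call. One caveat: run_cmd concatenates with a list literal, and list + tuple raises TypeError, so the tuple default is only exercised safely while callers keep passing lists (as they all appear to here). A minimal illustration of the pitfall itself, using hypothetical functions rather than anything from this file:

    def risky(args=[]):
        args.append("extra")            # mutates the one shared default list
        return args

    def safe(args=()):
        return list(args) + ["extra"]   # builds a fresh list on every call

    print(risky())   # ['extra']
    print(risky())   # ['extra', 'extra']  <- state leaked across calls
    print(safe())    # ['extra']
    print(safe())    # ['extra']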
@@ -231,7 +424,7 @@ def get_hadoop_profiles(hadoop_version):
     sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


-def get_build_profiles(hadoop_version="hadoop2.3",
+def get_build_profiles(hadoop_version,
                        enable_base_profiles=True,
                        enable_hive_profiles=False,
                        enable_doc_profiles=False):
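
Note: with the "hadoop2.3" default gone, callers of get_build_profiles must now pass hadoop_version explicitly, so the Hadoop profile choice is made visibly at each call site instead of falling back silently.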
@@ -318,19 +511,6 @@ def identify_changed_modules(test_env):
     # remove any empty strings
     changed_files = [f for f in raw_output.split('\n') if f]

-    sql_files = [f for f in changed_files
-                 if any(f.startswith(p) for p in
-                        ["sql/",
-                         "bin/spark-sql",
-                         "sbin/start-thriftserver.sh",
-                         "examples/src/main/java/org/apache/spark/examples/sql/",
-                         "examples/src/main/scala/org/apache/spark/examples/sql/"])]
-    mllib_files = [f for f in changed_files
-                   if any(f.startswith(p) for p in
-                          ["examples/src/main/java/org/apache/spark/examples/mllib/",
-                           "examples/src/main/scala/org/apache/spark/examples/mllib",
-                           "data/mllib/",
-                           "mllib/"])]
     streaming_files = [f for f in changed_files
                        if any(f.startswith(p) for p in
                               ["examples/scala-2.10/",
@@ -356,12 +536,6 @@ def identify_changed_modules(test_env):

     if changed_core_files:
         changed_modules.add("CORE")
-    if sql_files:
-        print "[info] Detected changes in SQL. Will run Hive test suite."
-        changed_modules.add("SQL")
-    if mllib_files:
-        print "[info] Detected changes in MLlib. Will run MLlib test suite."
-        changed_modules.add("MLLIB")
     if streaming_files:
         print "[info] Detected changes in Streaming. Will run Streaming test suite."
         changed_modules.add("STREAMING")
@@ -416,8 +590,6 @@ def run_scala_tests_sbt(test_modules, test_profiles):
                       "streaming-twitter/test",
                       "streaming-zeromq/test",
                       "examples/test"]
-    if "GRAPHX" in test_modules and "CORE" not in test_modules:
-        sbt_test_goals += ["graphx/test", "examples/test"]
    if not sbt_test_goals:
        sbt_test_goals = ["test"]

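Note: the special-cased GRAPHX goals go away for the same reason; the graphx Module's sbt_test_goals entry ("graphx/test") plus the examples module's declared dependency on graphx express the same relationship through the module graph.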
@@ -532,5 +704,13 @@ def main():
     run_python_tests()
     run_sparkr_tests()

+
+def _test():
+    import doctest
+    (failure_count, test_count) = doctest.testmod()
+    if failure_count:
+        exit(-1)
+
 if __name__ == "__main__":
+    _test()
     main()
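
Note: with _test() wired in ahead of main(), every invocation of the script now runs the doctests first and exits non-zero on any failure. The doctests can also be exercised standalone with the stock doctest runner, e.g.:

    python -m doctest -v dev/run-tests.py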
