USER_HOME = os.environ.get("HOME")


+all_modules = []
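+# NOTE: Module.__init__ registers every new instance here, so this list
+# fills in as the module definitions below execute.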
+
+
+class Module(object):
+
+    def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(),
+                 should_run_python_tests=False, should_run_r_tests=False):
+        self.name = name
+        self.dependencies = dependencies
+        self.source_file_prefixes = source_file_regexes
+        self.sbt_test_goals = sbt_test_goals
+        self.should_run_python_tests = should_run_python_tests
+        self.should_run_r_tests = should_run_r_tests
+
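+        # Wire up the reverse-dependency graph: register this module as a
+        # dependent of each of its dependencies, then add it to the global
+        # module registry.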
+        self.dependent_modules = set()
+        for dep in dependencies:
+            dep.dependent_modules.add(self)
+        all_modules.append(self)
+
+    def contains_file(self, filename):
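+        # re.match anchors at the start of the string, so each pattern acts
+        # as a (regex-aware) path prefix.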
+        return any(re.match(p, filename) for p in self.source_file_prefixes)
+
+
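+# Module definitions.  Order matters: each module must be defined after the
+# modules it depends on, since `dependencies` takes object references.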
+root = Module(
+    name="root",
+    dependencies=[],
+    source_file_regexes=[],
+    sbt_test_goals=[
+        "test",
+    ]
+)
+
+
+sql = Module(
+    name="sql",
+    dependencies=[],
+    source_file_regexes=[
+        "sql/(?!hive-thriftserver)",
+        "bin/spark-sql",
+        "examples/src/main/java/org/apache/spark/examples/sql/",
+        "examples/src/main/scala/org/apache/spark/examples/sql/",
+    ],
+    sbt_test_goals=[
+        "catalyst/test",
+        "sql/test",
+        "hive/test",
+    ])
+
+
+hive_thriftserver = Module(
+    name="hive-thriftserver",
+    dependencies=[sql],
+    source_file_regexes=[
+        "sql/hive-thriftserver",
+        "sbin/start-thriftserver.sh",
+    ],
+    sbt_test_goals=[
+        "hive-thriftserver/test",
+    ]
+)
+
+
+mllib = Module(
+    name="mllib",
+    dependencies=[sql],
+    source_file_regexes=[
+        "examples/src/main/java/org/apache/spark/examples/mllib/",
+        "examples/src/main/scala/org/apache/spark/examples/mllib",
+        "data/mllib/",
+        "mllib/",
+    ],
+    sbt_test_goals=[
+        "mllib/test",
+        "examples/test",
+    ]
+)
+
+
+graphx = Module(
+    name="graphx",
+    dependencies=[],
+    source_file_regexes=[
+        "graphx/",
+    ],
+    sbt_test_goals=[
+        "graphx/test"
+    ]
+)
+
+
+streaming = Module(
+    name="streaming",
+    dependencies=[],
+    source_file_regexes=[
+        "external/",
+        "extras/java8-tests/",
+        "extras/kinesis-asl/",
+        "streaming",
+    ],
+    sbt_test_goals=[
+        "streaming/test",
+        "streaming-flume/test",
+        "streaming-flume-sink/test",
+        "streaming-kafka/test",
+        "streaming-mqtt/test",
+        "streaming-twitter/test",
+        "streaming-zeromq/test",
+    ]
+)
+
+
+examples = Module(
+    name="examples",
+    dependencies=[graphx, mllib, streaming, sql],
+    source_file_regexes=[
+        "examples/",
+    ],
+    sbt_test_goals=[
+        "examples/test",
+    ]
+)
+
+
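+# pyspark and sparkr define no sbt_test_goals; the flags below route their
+# tests to the dedicated Python and R test runners instead.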
+pyspark = Module(
+    name="pyspark",
+    dependencies=[mllib, streaming, sql],
+    source_file_regexes=[
+        "python/"
+    ],
+    should_run_python_tests=True
+)
+
+
+sparkr = Module(
+    name="sparkr",
+    dependencies=[sql, mllib],
+    source_file_regexes=[
+        "R/",
+    ],
+    should_run_r_tests=True
+)
+
+
+docs = Module(
+    name="docs",
+    dependencies=[],
+    source_file_regexes=[
+        "docs/",
+    ]
+)
+
+
+def determine_modules(filenames):
+    """
+    Given a list of filenames, return the set of modules that contain those files.
+    If a file is not associated with a more specific submodule, then this method will consider that
+    file to belong to the 'root' module.
+
+    >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"]))
+    ['pyspark', 'sql']
+    >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])]
+    ['root']
+    """
+    changed_modules = set()
+    for filename in filenames:
+        matched_at_least_one_module = False
+        for module in all_modules:
+            if module.contains_file(filename):
+                changed_modules.add(module)
+                matched_at_least_one_module = True
+        if not matched_at_least_one_module:
+            changed_modules.add(root)
+    return changed_modules
+
+
+def determine_modules_to_test(changed_modules):
+    """
+    Given a set of modules that have changed, compute the transitive closure of those modules'
+    dependent modules in order to determine the set of modules that should be tested.
+
+    >>> sorted(x.name for x in determine_modules_to_test([root]))
+    ['root']
+    >>> sorted(x.name for x in determine_modules_to_test([graphx]))
+    ['examples', 'graphx']
+    >>> sorted(x.name for x in determine_modules_to_test([sql]))
+    ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql']
+    """
+    modules_to_test = set()
+    for module in changed_modules:
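+        # Recurse into each module's direct dependents so that dependents of
+        # dependents are included, making the closure transitive.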
+        modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules))
+    return modules_to_test.union(set(changed_modules))
+
+
def get_error_codes(err_code_file):
    """Function to retrieve all block numbers from the `run-tests-codes.sh`
    file to maintain backwards compatibility with the `run-tests-jenkins`
@@ -43,7 +236,7 @@ def get_error_codes(err_code_file):


def exit_from_command_with_retcode(cmd, retcode):
-    print "[error] running", cmd, "; received return code", retcode
+    print "[error] running", ' '.join(cmd), "; received return code", retcode
    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


@@ -177,14 +370,14 @@ def build_spark_documentation():
    os.chdir(SPARK_HOME)


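+# NOTE: the tuple defaults below replace mutable [] defaults, avoiding
+# Python's shared mutable default argument pitfall.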
-def exec_maven(mvn_args=[]):
+def exec_maven(mvn_args=()):
    """Will call Maven in the current directory with the list of mvn_args passed
    in and returns the subprocess for any further processing"""

    run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args)


-def exec_sbt(sbt_args=[]):
+def exec_sbt(sbt_args=()):
    """Will call SBT in the current directory with the list of sbt_args passed
    in and returns the subprocess for any further processing"""

@@ -231,7 +424,7 @@ def get_hadoop_profiles(hadoop_version):
    sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


-def get_build_profiles(hadoop_version="hadoop2.3",
+def get_build_profiles(hadoop_version,
                       enable_base_profiles=True,
                       enable_hive_profiles=False,
                       enable_doc_profiles=False):
@@ -318,19 +511,6 @@ def identify_changed_modules(test_env):
    # remove any empty strings
    changed_files = [f for f in raw_output.split('\n') if f]

-    sql_files = [f for f in changed_files
-                 if any(f.startswith(p) for p in
-                        ["sql/",
-                         "bin/spark-sql",
-                         "sbin/start-thriftserver.sh",
-                         "examples/src/main/java/org/apache/spark/examples/sql/",
-                         "examples/src/main/scala/org/apache/spark/examples/sql/"])]
-    mllib_files = [f for f in changed_files
-                   if any(f.startswith(p) for p in
-                          ["examples/src/main/java/org/apache/spark/examples/mllib/",
-                           "examples/src/main/scala/org/apache/spark/examples/mllib",
-                           "data/mllib/",
-                           "mllib/"])]
    streaming_files = [f for f in changed_files
                       if any(f.startswith(p) for p in
                              ["examples/scala-2.10/",
@@ -356,12 +536,6 @@ def identify_changed_modules(test_env):

    if changed_core_files:
        changed_modules.add("CORE")
-    if sql_files:
-        print "[info] Detected changes in SQL. Will run Hive test suite."
-        changed_modules.add("SQL")
-    if mllib_files:
-        print "[info] Detected changes in MLlib. Will run MLlib test suite."
-        changed_modules.add("MLLIB")
    if streaming_files:
        print "[info] Detected changes in Streaming. Will run Streaming test suite."
        changed_modules.add("STREAMING")
@@ -416,8 +590,6 @@ def run_scala_tests_sbt(test_modules, test_profiles):
                           "streaming-twitter/test",
                           "streaming-zeromq/test",
                           "examples/test"]
-    if "GRAPHX" in test_modules and "CORE" not in test_modules:
-        sbt_test_goals += ["graphx/test", "examples/test"]
    if not sbt_test_goals:
        sbt_test_goals = ["test"]

@@ -532,5 +704,13 @@ def main():
    run_python_tests()
    run_sparkr_tests()

+
+def _test():
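+    # Run this file's doctests (the examples embedded in the docstrings
+    # above) and exit non-zero if any of them fail.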
+    import doctest
+    (failure_count, test_count) = doctest.testmod()
+    if failure_count:
+        exit(-1)
+
if __name__ == "__main__":
+    _test()
    main()