@@ -21,13 +21,15 @@ import java.io.IOException
2121import java .net .URI
2222import java .util .ServiceLoader
2323
24+ import scala .annotation .tailrec
2425import scala .collection .JavaConverters ._
2526
2627import org .apache .hadoop .conf .Configuration
2728import org .apache .hadoop .fs .{FileStatus , FileSystem , Path }
2829
2930import org .apache .spark .SparkConf
3031import org .apache .spark .deploy .history .EventFilter .FilterStatistic
32+ import org .apache .spark .deploy .history .EventFilterBuildersLoader .LowerIndexLoadRequested
3133import org .apache .spark .internal .Logging
3234import org .apache .spark .internal .config .{EVENT_LOG_COMPACTION_SCORE_THRESHOLD , EVENT_LOG_ROLLING_MAX_FILES_TO_RETAIN }
3335import org .apache .spark .scheduler .ReplayListenerBus
@@ -46,10 +48,6 @@ import org.apache.spark.util.Utils
4648 * represents approximate rate of filtered-out events. Score is being calculated via applying
4749 * heuristic; task events tend to take most size in event log.
4850 *
49- * This class assumes caller will provide the sorted list of files which are sorted by the index of
50- * event log file, with "at most" one compact file placed first if it exists. Caller should keep in
51- * mind that this class doesn't care about the semantic of ordering.
52- *
5351 * When compacting the files, the range of compaction for given file list is determined as:
5452 * (first ~ the file where there're `maxFilesToRetain` files on the right side)
5553 *
@@ -59,22 +57,43 @@ class EventLogFileCompactor(
5957 sparkConf : SparkConf ,
6058 hadoopConf : Configuration ,
6159 fs : FileSystem ) extends Logging {
60+ import EventFilterBuildersLoader ._
61+
6262 private val maxFilesToRetain : Int = sparkConf.get(EVENT_LOG_ROLLING_MAX_FILES_TO_RETAIN )
6363 private val compactionThresholdScore : Double = sparkConf.get(EVENT_LOG_COMPACTION_SCORE_THRESHOLD )
6464
65- def compact (eventLogFiles : Seq [FileStatus ]): (CompactionResult .Value , Option [Long ]) = {
66- assertPrecondition(eventLogFiles)
65+ private var filterBuildersLoader = new EventFilterBuildersLoader (fs)
66+ private var loadedLogPath : Path = _
67+
/**
 * Public entry point of the compactor: hands the given reader over to `doCompact`, which
 * performs the actual work, and returns whatever `doCompact` returns.
 *
 * @param reader the reader for the event log to compact
 * @return the compaction result, paired with an optional event log file index
 */
def compact(reader: EventLogFileReader): (CompactionResult.Value, Option[Long]) =
  doCompact(reader)
71+
72+ @ tailrec
73+ private def doCompact (reader : EventLogFileReader ): (CompactionResult .Value , Option [Long ]) = {
74+ if (loadedLogPath == null ) {
75+ loadedLogPath = reader.rootPath
76+ } else {
77+ require(loadedLogPath == null || reader.rootPath == loadedLogPath,
78+ " An instance of compactor should deal with same path of event log." )
79+ }
80+
81+ if (reader.lastIndex.isEmpty) {
82+ return (CompactionResult .NOT_ENOUGH_FILES , None )
83+ }
6784
85+ val eventLogFiles = reader.listEventLogFiles
6886 if (eventLogFiles.length < maxFilesToRetain) {
6987 return (CompactionResult .NOT_ENOUGH_FILES , None )
7088 }
7189
7290 val filesToCompact = findFilesToCompact(eventLogFiles)
7391 if (filesToCompact.isEmpty) {
74- (CompactionResult .NOT_ENOUGH_FILES , None )
75- } else {
76- val builders = initializeBuilders(fs, filesToCompact.map(_.getPath))
92+ return (CompactionResult .NOT_ENOUGH_FILES , None )
93+ }
7794
95+ try {
96+ val builders = filterBuildersLoader.loadNewFiles(filesToCompact)
7897 val filters = builders.map(_.createFilter())
7998 val minScore = filters.flatMap(_.statistic()).map(calculateScore).min
8099
@@ -87,37 +106,14 @@ class EventLogFileCompactor(
87106 (CompactionResult .SUCCESS , Some (RollingEventLogFilesWriter .getEventLogFileIndex(
88107 filesToCompact.last.getPath.getName)))
89108 }
109+ } catch {
110+ case _ : LowerIndexLoadRequested =>
111+ // reset loader and load again
112+ filterBuildersLoader = new EventFilterBuildersLoader (fs)
113+ doCompact(reader)
90114 }
91115 }
92116
93- private def assertPrecondition (eventLogFiles : Seq [FileStatus ]): Unit = {
94- val idxCompactedFiles = eventLogFiles.zipWithIndex.filter { case (file, _) =>
95- EventLogFileWriter .isCompacted(file.getPath)
96- }
97- require(idxCompactedFiles.size < 2 && idxCompactedFiles.headOption.forall(_._2 == 0 ),
98- " The number of compact files should be at most 1, and should be placed first if exists." )
99- }
100-
101- /**
102- * Loads all available EventFilterBuilders in classloader via ServiceLoader, and initializes
103- * them via replaying events in given files.
104- */
105- private def initializeBuilders (fs : FileSystem , files : Seq [Path ]): Seq [EventFilterBuilder ] = {
106- val bus = new ReplayListenerBus ()
107-
108- val builders = ServiceLoader .load(classOf [EventFilterBuilder ],
109- Utils .getContextOrSparkClassLoader).asScala.toSeq
110- builders.foreach(bus.addListener)
111-
112- files.foreach { log =>
113- Utils .tryWithResource(EventLogFileReader .openEventLog(log, fs)) { in =>
114- bus.replay(in, log.getName)
115- }
116- }
117-
118- builders
119- }
120-
121117 private def calculateScore (stats : FilterStatistic ): Double = {
122118 // For now it's simply measuring how many task events will be filtered out (rejected)
123119 // but it can be sophisticated later once we get more heuristic information and found
@@ -162,6 +158,68 @@ object CompactionResult extends Enumeration {
162158 val SUCCESS, NOT_ENOUGH_FILES, LOW_SCORE_FOR_COMPACTION = Value
163159}
164160
/**
 * Incrementally initializes every [[EventFilterBuilder]] found via ServiceLoader by replaying
 * event log files through a single [[ReplayListenerBus]] the builders are registered on.
 *
 * The loader remembers the highest event log index it has replayed so far, so that repeated
 * calls only replay the files which were not seen yet. An instance is stateful and carries
 * the accumulated state of its builders; create a new instance to start over.
 *
 * @param fs the file system used to open the event log files
 */
class EventFilterBuildersLoader(fs: FileSystem) {
  // the implementation of this bus is expected to be stateless
  private val bus = new ReplayListenerBus()

  /** Loads all available EventFilterBuilders in classloader via ServiceLoader */
  private val filterBuilders: Seq[EventFilterBuilder] = ServiceLoader.load(
    classOf[EventFilterBuilder], Utils.getContextOrSparkClassLoader).asScala.toSeq

  filterBuilders.foreach(bus.addListener)

  /** Highest event log index which has been completely replayed; -1 when nothing is loaded. */
  private var latestIndexLoaded: Long = -1L

  /** only exposed for testing; simple metric to help testing */
  private[history] var numFilesToLoad: Long = 0L

  /**
   * Initializes EventFilterBuilders via replaying events in given files. Loading is done
   * incrementally: files whose index is already loaded are dropped and only the remaining
   * files are replayed, in ascending order of index. For example, if the highest index of the
   * requested files is the same as the highest index already loaded, no file is replayed.
   *
   * Progress is recorded after each file which is replayed successfully, so when replaying
   * fails in the middle of the batch, the files replayed before the failure are not replayed
   * again by a later call. (Events of the failing file itself may have been partially
   * delivered to the builders.)
   *
   * @param eventLogFiles the event log files to load; must not be empty
   * @return the builders, reflecting all files loaded so far
   * @throws LowerIndexLoadRequested if the highest index of requested files is smaller than
   *                                 the highest index already loaded; caller can decide
   *                                 whether to ignore it, or to invalidate the loader and retry
   * @throws IllegalArgumentException if `eventLogFiles` is empty
   */
  def loadNewFiles(eventLogFiles: Seq[FileStatus]): Seq[EventFilterBuilder] = {
    require(eventLogFiles.nonEmpty, "At least one event log file should be provided.")

    // Sort by index so that neither the "newest index" nor the replay order silently depends
    // on the order the caller listed the files in. The sort is stable, hence a list which is
    // already ordered by index is left untouched.
    val idxToStatuses = eventLogFiles.map { status =>
      val idx = RollingEventLogFilesWriter.getEventLogFileIndex(status.getPath.getName)
      idx -> status
    }.sortBy(_._1)

    val newLatestIdx = idxToStatuses.last._1
    if (newLatestIdx < latestIndexLoaded) {
      throw new LowerIndexLoadRequested(" Loader already loads higher index of event log than" +
        " requested.")
    }

    // Snapshot the threshold: `latestIndexLoaded` is advanced while replaying below.
    val alreadyLoadedIdx = latestIndexLoaded
    val filesToLoad = idxToStatuses.collect {
      case (idx, status) if idx > alreadyLoadedIdx => idx -> status.getPath
    }

    filesToLoad.foreach { case (idx, log) =>
      Utils.tryWithResource(EventLogFileReader.openEventLog(log, fs)) { in =>
        bus.replay(in, log.getName)
      }
      numFilesToLoad += 1
      // Only reached when the file was replayed without an exception.
      latestIndexLoaded = idx
    }

    filterBuilders
  }
}
218+
/** Companion of the loader class; holds the exception type the loader throws. */
object EventFilterBuildersLoader {

  /**
   * Thrown when the loader is asked to load event log files whose latest index is lower than
   * the index the loader has already loaded.
   *
   * @param _msg the detail message of the exception
   */
  class LowerIndexLoadRequested(_msg: String)
    extends Exception(_msg)
}
222+
165223/**
166224 * This class rewrites the event log files into one compact file: the compact file will only
167225 * contain the events which pass the filters. Events will be dropped only when all filters
0 commit comments