Commit 25590c9
Author: Davies Liu

    update with Java API

Parent: 360de2d
4 files changed, 118 insertions(+), 37 deletions(-)


examples/src/main/python/status_api_demo.py (10 additions, 10 deletions)
@@ -19,7 +19,7 @@
 import threading
 import Queue
 
-from pyspark import SparkContext
+from pyspark import SparkConf, SparkContext
 
 
 def delayed(seconds):
@@ -38,26 +38,26 @@ def call_in_background(f, *args):
 
 
 def main():
-    sc = SparkContext(appName="PythonStatusAPIDemo")
+    conf = SparkConf().set("spark.ui.showConsoleProgress", "false")
+    sc = SparkContext(appName="PythonStatusAPIDemo", conf=conf)
 
     def run():
-        sc.setJobGroup("demo", "demo status api")
         rdd = sc.parallelize(range(10), 10).map(delayed(2))
         reduced = rdd.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
         return reduced.map(delayed(2)).collect()
 
     result = call_in_background(run)
-
+    status = sc.statusTracker()
     while result.empty():
-        ids = sc.getJobIdsForGroup("demo")
+        ids = status.getJobIdsForGroup()
         for id in ids:
-            job = sc.getJobInfo(id)
-            print "Job", id, "status: ", job.status()
-            for sid in job.stageIds():
-                info = sc.getStageInfo(sid)
+            job = status.getJobInfo(id)
+            print "Job", id, "status: ", job.status
+            for sid in job.stageIds:
+                info = status.getStageInfo(sid)
                 if info:
                     print "Stage %d: %d tasks total (%d active, %d complete)" % \
-                        (sid, info.numTasks(), info.numActiveTasks(), info.numCompletedTasks())
+                        (sid, info.numTasks, info.numActiveTasks, info.numCompletedTasks)
         time.sleep(1)
 
     print "Job results are:", result.get()

python/pyspark/__init__.py (8 additions, 6 deletions)
@@ -22,17 +22,17 @@
 
   - :class:`SparkContext`:
       Main entry point for Spark functionality.
-  - L{RDD}
+  - :class:`RDD`:
       A Resilient Distributed Dataset (RDD), the basic abstraction in Spark.
-  - L{Broadcast}
+  - :class:`Broadcast`:
       A broadcast variable that gets reused across tasks.
-  - L{Accumulator}
+  - :class:`Accumulator`:
       An "add-only" shared variable that tasks can only add values to.
-  - L{SparkConf}
+  - :class:`SparkConf`:
       For configuring Spark.
-  - L{SparkFiles}
+  - :class:`SparkFiles`:
       Access files shipped with jobs.
-  - L{StorageLevel}
+  - :class:`StorageLevel`:
       Finer-grained cache persistence levels.
 
 """
@@ -45,11 +45,13 @@
 from pyspark.accumulators import Accumulator, AccumulatorParam
 from pyspark.broadcast import Broadcast
 from pyspark.serializers import MarshalSerializer, PickleSerializer
+from pyspark.status import *
 
 # for back compatibility
 from pyspark.sql import SQLContext, HiveContext, SchemaRDD, Row
 
 __all__ = [
     "SparkConf", "SparkContext", "SparkFiles", "RDD", "StorageLevel", "Broadcast",
     "Accumulator", "AccumulatorParam", "MarshalSerializer", "PickleSerializer",
+    "StatusTracker", "SparkJobInfo", "SparkStageInfo"
 ]
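Note: with the wildcard import and the extended `__all__`, the new status classes become importable directly from the top-level package:

    from pyspark import StatusTracker, SparkJobInfo, SparkStageInfo

    print(SparkStageInfo._fields)  # namedtuple fields defined in python/pyspark/status.py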

python/pyspark/context.py (4 additions, 21 deletions)
@@ -33,6 +33,7 @@
 from pyspark.storagelevel import StorageLevel
 from pyspark.rdd import RDD
 from pyspark.traceback_utils import CallSite, first_spark_call
+from pyspark.status import StatusTracker
 
 from py4j.java_collections import ListConverter
 
@@ -800,29 +801,11 @@ def cancelAllJobs(self):
         """
         self._jsc.sc().cancelAllJobs()
 
-    def getJobIdsForGroup(self, jobGroup):
+    def statusTracker(self):
         """
-        Return a list of all known jobs in a particular job group.
-
-        The returned list may contain running, failed, and completed jobs, and may
-        vary across invocations of this method. This method does not guarantee the
-        order of the elements in its result.
-        """
-        return list(self._jsc.getJobIdsForGroup(jobGroup))
-
-    def getJobInfo(self, jobId):
-        """
-        Returns job information, or `None` if the job info could not be found
-        or was garbage collected.
-        """
-        return self._jsc.getJobInfo(jobId)
-
-    def getStageInfo(self, stageId):
-        """
-        Returns stage information, or `None` if the stage info could not be found or was
-        garbage collected.
+        Return :class:`StatusTracker` object
         """
-        return self._jsc.getStageInfo(stageId)
+        return StatusTracker(self._jsc.statusTracker())
 
     def runJob(self, rdd, partitionFunc, partitions=None, allowLocal=False):
         """

python/pyspark/status.py (new file, 96 additions)
@@ -0,0 +1,96 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from collections import namedtuple
+
+__all__ = ["SparkJobInfo", "SparkStageInfo", "StatusTracker"]
+
+
+class SparkJobInfo(namedtuple("SparkJobInfo", "jobId stageIds status")):
+    """
+    Exposes information about Spark Jobs.
+    """
+
+
+class SparkStageInfo(namedtuple("SparkStageInfo",
+                                "stageId currentAttemptId name numTasks numActiveTasks "
+                                "numCompletedTasks numFailedTasks")):
+    """
+    Exposes information about Spark Stages.
+    """
+
+
+class StatusTracker(object):
+    """
+    Low-level status reporting APIs for monitoring job and stage progress.
+
+    These APIs intentionally provide very weak consistency semantics;
+    consumers of these APIs should be prepared to handle empty / missing
+    information. For example, a job's stage ids may be known but the status
+    API may not have any information about the details of those stages, so
+    `getStageInfo` could potentially return `None` for a valid stage id.
+
+    To limit memory usage, these APIs only provide information on recent
+    jobs / stages. These APIs will provide information for the last
+    `spark.ui.retainedStages` stages and `spark.ui.retainedJobs` jobs.
+    """
+    def __init__(self, jtracker):
+        self._jtracker = jtracker
+
+    def getJobIdsForGroup(self, jobGroup=None):
+        """
+        Return a list of all known jobs in a particular job group. If
+        `jobGroup` is None, then returns all known jobs that are not
+        associated with a job group.
+
+        The returned list may contain running, failed, and completed jobs,
+        and may vary across invocations of this method. This method does
+        not guarantee the order of the elements in its result.
+        """
+        return list(self._jtracker.getJobIdsForGroup(jobGroup))
+
+    def getActiveStageIds(self):
+        """
+        Returns an array containing the ids of all active stages.
+        """
+        return sorted(list(self._jtracker.getActiveStageIds()))
+
+    def getActiveJobsIds(self):
+        """
+        Returns an array containing the ids of all active jobs.
+        """
+        return sorted((list(self._jtracker.getActiveJobIds())))
+
+    def getJobInfo(self, jobId):
+        """
+        Returns a :class:`SparkJobInfo` object, or None if the job info
+        could not be found or was garbage collected.
+        """
+        job = self._jtracker.getJobInfo(jobId)
+        if job is not None:
+            return SparkJobInfo(jobId, job.stageIds(), str(job.status()))
+
+    def getStageInfo(self, stageId):
+        """
+        Returns a :class:`SparkStageInfo` object, or None if the stage
+        info could not be found or was garbage collected.
+        """
+        stage = self._jtracker.getStageInfo(stageId)
+        if stage is not None:
+            # TODO: fetch them in batch for better performance
+            attrs = [getattr(stage, f)() for f in SparkStageInfo._fields[1:]]
+            return SparkStageInfo(stageId, *attrs)
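Note: `getJobInfo` and `getStageInfo` eagerly convert the Py4J getters into plain namedtuple fields (via `getattr(stage, f)()`), so callers read attributes without further JVM round-trips. A polling sketch, assuming an active SparkContext `sc` with work in flight:

    status = sc.statusTracker()
    for sid in status.getActiveStageIds():
        info = status.getStageInfo(sid)
        if info is None:
            continue  # weak consistency: the stage may already have been evicted
        remaining = info.numTasks - info.numCompletedTasks - info.numFailedTasks
        print("Stage %d (%s), attempt %d: %d tasks remaining"
              % (info.stageId, info.name, info.currentAttemptId, remaining))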
