Skip to content

Commit c72be03

Browse files
author
Reza Safi
committed
Addressing most of Imran's comments
1 parent 245221d commit c72be03

File tree

3 files changed

+51
-46
lines changed

3 files changed

+51
-46
lines changed

core/src/main/scala/org/apache/spark/executor/ProcfsBasedSystems.scala

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,20 @@ import scala.collection.mutable.Queue
2828
import org.apache.spark.SparkEnv
2929
import org.apache.spark.internal.{config, Logging}
3030

31-
private[spark] case class ProcfsBasedSystemsMetrics(jvmVmemTotal: Long,
32-
jvmRSSTotal: Long,
33-
pythonVmemTotal: Long,
34-
pythonRSSTotal: Long,
35-
otherVmemTotal: Long,
36-
otherRSSTotal: Long)
31+
private[spark] case class ProcfsBasedSystemsMetrics(
32+
jvmVmemTotal: Long,
33+
jvmRSSTotal: Long,
34+
pythonVmemTotal: Long,
35+
pythonRSSTotal: Long,
36+
otherVmemTotal: Long,
37+
otherRSSTotal: Long)
3738

3839
// Some of the ideas here are taken from the ProcfsBasedProcessTree class in hadoop
3940
// project.
40-
private[spark] class ProcfsBasedSystems extends Logging {
41-
var procfsDir = "/proc/"
41+
private[spark] class ProcfsBasedSystems(procfsDir: String = "/proc/") extends Logging {
4242
val procfsStatFile = "stat"
43-
var pageSize = 0
44-
var isAvailable: Boolean = isItProcfsBased
43+
var pageSize: Long = computePageSize()
44+
var isAvailable: Boolean = isProcfsAvailable
4545
private val pid: Int = computePid()
4646
private val ptree: scala.collection.mutable.Map[ Int, Set[Int]] =
4747
scala.collection.mutable.Map[ Int, Set[Int]]()
@@ -56,7 +56,7 @@ private[spark] class ProcfsBasedSystems extends Logging {
5656

5757
computeProcessTree()
5858

59-
private def isItProcfsBased: Boolean = {
59+
private def isProcfsAvailable: Boolean = {
6060
val testing = sys.env.contains("SPARK_TESTING") || sys.props.contains("spark.testing")
6161
if (testing) {
6262
return true
@@ -92,33 +92,36 @@ private[spark] class ProcfsBasedSystems extends Logging {
9292
}
9393
catch {
9494
case e: IOException => logDebug("IO Exception when trying to compute process tree." +
95-
" As a result reporting of ProcessTree metrics is stopped")
95+
" As a result reporting of ProcessTree metrics is stopped", e)
9696
isAvailable = false
9797
return -1
98-
case _ => logDebug("Some exception occurred when trying to compute process tree. " +
99-
"As a result reporting of ProcessTree metrics is stopped")
98+
case t: Throwable => logDebug("Some exception occurred when trying to" +
99+
" compute process tree. As a result reporting of ProcessTree metrics is stopped", t)
100100
isAvailable = false
101101
return -1
102102
}
103103
}
104104

105-
private def computePageSize(): Unit = {
105+
private def computePageSize(): Long = {
106+
val testing = sys.env.contains("SPARK_TESTING") || sys.props.contains("spark.testing")
107+
if (testing) {
108+
return 0;
109+
}
106110
val cmd = Array("getconf", "PAGESIZE")
107111
val out: Array[Byte] = Array.fill[Byte](10)(0)
108112
Runtime.getRuntime.exec(cmd).getInputStream.read(out)
109-
pageSize = Integer.parseInt(new String(out, "UTF-8").trim)
113+
return Integer.parseInt(new String(out, "UTF-8").trim)
110114
}
111115

112116
private def computeProcessTree(): Unit = {
113117
if (!isAvailable) {
114118
return
115119
}
116-
computePageSize
117120
val queue: Queue[Int] = new Queue[Int]()
118121
queue += pid
119122
while( !queue.isEmpty ) {
120123
val p = queue.dequeue()
121-
val c = getChildPIds(p)
124+
val c = getChildPids(p)
122125
if(!c.isEmpty) {
123126
queue ++= c
124127
ptree += (p -> c.toSet)
@@ -129,7 +132,7 @@ private[spark] class ProcfsBasedSystems extends Logging {
129132
}
130133
}
131134

132-
private def getChildPIds(pid: Int): ArrayBuffer[Int] = {
135+
private def getChildPids(pid: Int): ArrayBuffer[Int] = {
133136
try {
134137
val cmd = Array("pgrep", "-P", pid.toString)
135138
val input = Runtime.getRuntime.exec(cmd).getInputStream
@@ -150,23 +153,23 @@ private[spark] class ProcfsBasedSystems extends Logging {
150153
childPidsInInt
151154
} catch {
152155
case e: IOException => logDebug("IO Exception when trying to compute process tree." +
153-
" As a result reporting of ProcessTree metrics is stopped")
156+
" As a result reporting of ProcessTree metrics is stopped", e)
154157
isAvailable = false
155158
return new mutable.ArrayBuffer()
156-
case _ => logDebug("Some exception occurred when trying to compute process tree." +
157-
" As a result reporting of ProcessTree metrics is stopped")
159+
case t: Throwable => logDebug("Some exception occurred when trying to compute process tree." +
160+
" As a result reporting of ProcessTree metrics is stopped", t)
158161
isAvailable = false
159162
return new mutable.ArrayBuffer()
160163
}
161164
}
162165

163-
/**
166+
def getProcessInfo(pid: Int): Unit = {
167+
/*
164168
* Hadoop ProcfsBasedProcessTree class used regex and pattern matching to retrieve the memory
165169
* info. I tried that but found it not correct during tests, so I used normal string analysis
166170
* instead. The computation of RSS and Vmem are based on proc(5):
167171
* http://man7.org/linux/man-pages/man5/proc.5.html
168172
*/
169-
def getProcessInfo(pid: Int): Unit = {
170173
try {
171174
val pidDir: File = new File(procfsDir, pid.toString)
172175
val fReader = new InputStreamReader(
@@ -178,20 +181,23 @@ private[spark] class ProcfsBasedSystems extends Logging {
178181
fReader.close
179182
val procInfoSplit = procInfo.split(" ")
180183
if ( procInfoSplit != null ) {
184+
val vmem = procInfoSplit(22).toLong
185+
val rssPages = procInfoSplit(23).toLong
181186
if (procInfoSplit(1).toLowerCase.contains("java")) {
182-
latestJVMVmemTotal += procInfoSplit(22).toLong
183-
latestJVMRSSTotal += procInfoSplit(23).toLong
187+
latestJVMVmemTotal += vmem
188+
latestJVMRSSTotal += rssPages
184189
}
185190
else if (procInfoSplit(1).toLowerCase.contains("python")) {
186-
latestPythonVmemTotal += procInfoSplit(22).toLong
187-
latestPythonRSSTotal += procInfoSplit(23).toLong
191+
latestPythonVmemTotal += vmem
192+
latestPythonRSSTotal += rssPages
188193
}
189194
else {
190-
latestOtherVmemTotal += procInfoSplit(22).toLong
191-
latestOtherRSSTotal += procInfoSplit(23).toLong }
195+
latestOtherVmemTotal += vmem
196+
latestOtherRSSTotal += rssPages }
192197
}
193198
} catch {
194-
case f: FileNotFoundException =>
199+
case f: FileNotFoundException => log.debug("There was a problem with reading" +
200+
" the stat file of the process", f)
195201
}
196202
}
197203

core/src/test/scala/org/apache/spark/executor/ProcfsBasedSystemsSuite.scala

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,22 +22,20 @@ import org.apache.spark.SparkFunSuite
2222

2323
class ProcfsBasedSystemsSuite extends SparkFunSuite {
2424

25-
val p = new ProcfsBasedSystems
26-
p.pageSize = 4096
27-
p.procfsDir = getTestResourcePath("ProcessTree")
25+
val p = new ProcfsBasedSystems(getTestResourcePath("ProcessTree"))
26+
p.pageSize = 4096L
2827

2928
test("testGetProcessInfo") {
30-
p.getProcessInfo(26109)
31-
assert(p.getJVMVirtualMemInfo == 4769947648L)
32-
assert(p.getJVMRSSInfo == 262610944)
33-
assert(p.getPythonVirtualMemInfo == 0)
34-
assert(p.getPythonRSSInfo == 0)
35-
36-
p.getProcessInfo(22763)
37-
assert(p.getPythonVirtualMemInfo == 360595456)
38-
assert(p.getPythonRSSInfo == 7831552)
39-
assert(p.getJVMVirtualMemInfo == 4769947648L)
40-
assert(p.getJVMRSSInfo == 262610944)
41-
29+
p.getProcessInfo(26109)
30+
assert(p.getJVMVirtualMemInfo == 4769947648L)
31+
assert(p.getJVMRSSInfo == 262610944)
32+
assert(p.getPythonVirtualMemInfo == 0)
33+
assert(p.getPythonRSSInfo == 0)
34+
35+
p.getProcessInfo(22763)
36+
assert(p.getPythonVirtualMemInfo == 360595456)
37+
assert(p.getPythonRSSInfo == 7831552)
38+
assert(p.getJVMVirtualMemInfo == 4769947648L)
39+
assert(p.getJVMRSSInfo == 262610944)
4240
}
4341
}

dev/.rat-excludes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ app-20161116163331-0000
8383
application_1516285256255_0012
8484
application_1506645932520_24630151
8585
application_1538416563558_0014
86+
stat
8687
local-1422981759269
8788
local-1422981780767
8889
local-1425081759269

0 commit comments

Comments
 (0)