Skip to content

Commit

Permalink
Merge pull request #1169 from beehive-lab/florin/memory-leak-tests
Browse files Browse the repository at this point in the history
Add memory leak tests
  • Loading branch information
jjfumero authored Sep 30, 2024
2 parents 0da1084 + 7228473 commit 0644225
Show file tree
Hide file tree
Showing 28 changed files with 407 additions and 45 deletions.
4 changes: 0 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,6 @@ tests-spirv-opencl:
tornado-test --jvm="-Dtornado.spirv.dispatcher=opencl"--ea -V -J"-Dtornado.device.memory=1MB" uk.ac.manchester.tornado.unittests.fails.HeapFail#test03
test-native.sh

tests-opt:
tornado --devices
tornado-test -V --fast --ea --verbose -J"-Dtornado.spirv.loadstore=True" --printKernel

test-slam:
tornado-test -V --fast uk.ac.manchester.tornado.unittests.slam.GraphicsTests

Expand Down
4 changes: 0 additions & 4 deletions Makefile.mak
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,5 @@ tests:
python %TORNADO_SDK%\bin\tornado-test --ea -V -J"-Dtornado.device.memory=1MB" uk.ac.manchester.tornado.unittests.fails.HeapFail#test03
%TORNADO_SDK%\bin\test-native.cmd

tests-opt:
python %TORNADO_SDK%\bin\tornado --devices
python %TORNADO_SDK%\bin\tornado-test -V --fast --ea --verbose -J"-Dtornado.spirv.loadstore=True" --printKernel

test-slam:
python %TORNADO_SDK%\bin\tornado-test -V --fast uk.ac.manchester.tornado.unittests.slam.GraphicsTests
1 change: 1 addition & 0 deletions bin/softwareDevDependencies.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ black
rstcheck
pre-commit
pyinstaller
psutil
1 change: 1 addition & 0 deletions bin/tornadoDepModules.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ wget
black
sphinx_rtd_theme
pyinstaller
psutil
8 changes: 4 additions & 4 deletions docs/source/spirv-backend.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ All drivers are available here: `https://github.com/intel/compute-runtime/releas
Install TornadoVM for SPIR-V
-----------------------------

Install TornadoVM following the instructions in :ref:`installation`.
Install TornadoVM following the instructions in :ref:`installation`.

To build the SPIR-V Backend, enable the backend as follows:

Expand Down Expand Up @@ -48,9 +48,9 @@ Running DFT from the unit-test suite
Number of workgroups : [16]
Test: class uk.ac.manchester.tornado.unittests.compute.ComputeTests#testDFT
Running test: testDFT ................ [PASS]
Running test: testDFT ................ [PASS]
In this execution, the SPIR-V Binary is stored in ``/tmp/tornadoVM-spirv/8442884346950-s0.t0computeDFT.spv``.
In this execution, the SPIR-V Binary is stored in ``/tmp/tornadoVM-spirv/8442884346950-s0.t0computeDFT.spv``.
We can disassemble the binary with ``spirv-dis`` `from Khronos <https://github.com/KhronosGroup/SPIRV-Tools>`__

Note: Usually, ``spirv-dis`` can be installed from the common OS repositories (e.g., Fedora, Ubuntu repositories):
Expand All @@ -71,7 +71,7 @@ TornadoVM/Java Options for SPIR-V:

- ``-Dtornado.spirv.dispatcher=opencl``: It sets the runtime to dispatch SPIR-V kernels. Allowed values are: ``opencl`` and ``levelzero``.

- ``-Dtornado.spirv.levelzero.extended.memory=True``: It uses Level Zero extended memory mode. It is set to ``true`` by default.
- ``-Dtornado.spirv.levelzero.extended.memory=True``: It uses Level Zero extended memory mode. It is set to ``true`` by default.



Expand Down
3 changes: 2 additions & 1 deletion tornado-assembly/src/bin/tornado
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,8 @@ class TornadoVMRunnerTool():
else:
command = javaFlags + " " + str(args.application) + " " + params
## Execute the command
os.system(command)
status = os.system(command)
sys.exit(status)


def parseArguments():
Expand Down
166 changes: 154 additions & 12 deletions tornado-assembly/src/bin/tornado-test
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,95 @@ import shlex
import subprocess
import sys
import time
import psutil
from abc import abstractmethod
from builtins import staticmethod, isinstance
from typing import Union, Optional

# ################################################################################################################
## Monitor classes
# ################################################################################################################

class MonitorClass:
    """Base class for monitors that watch a spawned test process."""

    def __init__(self):
        pass

    @abstractmethod
    def monitor(self, pid, cmd):
        """Monitors the process with the given pid and command.
        Returns False if the process failed the monitor, True otherwise.
        """
        pass


class OutOfMemoryMonitorClass(MonitorClass):
    """Monitor that fails a test when the spawned process tree uses too much resident memory."""

    def __init__(self):
        super().__init__()

    @staticmethod
    def _getMaxMemoryUsageMB(cmd):
        """
        Get the maximum memory usage in MB based on the set Xmx flag.

        The last ``-Xmx`` occurrence in ``cmd`` is used. Sizes may carry a
        ``g``/``m``/``k`` suffix (either case) or no suffix at all, in which
        case the value is taken as bytes. When no usable flag is found a
        warning is printed and the default is returned.
        """
        # The native memory usage of the JVM is always higher than the heap size.
        # Use a hardcoded multiplier to account for this.
        _HEAP_TO_RSS_MULTIPLIER = 2.0
        # NOTE(review): the default is returned without the multiplier applied,
        # exactly as before -- confirm that this is intended.
        _DEFAULT_HEAP_SIZE_MB = 6144
        # Factor that converts a value expressed in the given unit into MB.
        _UNIT_TO_MB = {'g': 1024, 'm': 1, 'k': 1 / 1024, '': 1 / (1024 ** 2)}

        # The look-ahead rejects unknown suffixes (e.g. "-Xmx2t") so that they
        # are not misread as a plain byte count.
        xmx_pattern = r'-Xmx(\d+)([gGmMkK]?)(?![0-9A-Za-z])'
        matches = re.findall(xmx_pattern, cmd)
        if matches:
            digits, unit = matches[-1]
            return int(digits) * _HEAP_TO_RSS_MULTIPLIER * _UNIT_TO_MB[unit.lower()]

        print(f"Warning: Could not find -Xmx flag in command {cmd}. Defaulting to {_DEFAULT_HEAP_SIZE_MB} MB.")
        return _DEFAULT_HEAP_SIZE_MB

    def monitor(self, pid, cmd):
        """
        Monitor the JVM process and check if it contains a memory leak. We do this in Python because the JVM
        process could be leaking native memory.

        The resident memory of the process and all of its descendants is
        sampled once per second until the process stops running or turns
        into a zombie.

        Returns: False if the memory limit was exceeded (the caller is expected
        to terminate the process), True otherwise.
        """
        max_memory_usage_mb = OutOfMemoryMonitorClass._getMaxMemoryUsageMB(cmd)
        print(f"Monitoring JVM process {pid} for memory usage. Max memory usage is {max_memory_usage_mb} MB.")
        try:
            process = psutil.Process(pid)
        except psutil.NoSuchProcess:
            # The process is already gone, so there is nothing left to monitor.
            return True

        while True:
            try:
                # The process becomes zombie when it is terminated, but the parent process
                # has not yet read the exit status.
                if not process.is_running() or process.status() == psutil.STATUS_ZOMBIE:
                    break
                # Compute the memory usage of the pid and all the child processes. This is
                # because the JVM is spawned as a child process of the tornado script.
                memory_usage = process.memory_info().rss
                children = process.children(recursive=True)
            except psutil.NoSuchProcess:
                # The process exited between two of the calls above.
                break

            for child in children:
                try:
                    memory_usage += child.memory_info().rss
                except psutil.NoSuchProcess:
                    # The child exited after it was enumerated; it no longer uses memory.
                    continue

            memory_usage /= 1024 ** 2  # Convert to MB
            if memory_usage > max_memory_usage_mb:
                print(f"JVM process exceeded {max_memory_usage_mb} MB of memory. Got {memory_usage} MB.")
                return False
            time.sleep(1)
        return True


# Maps the names accepted as a monitor-class string to the monitor implementation.
MONITOR_REGISTRY = {
    "outOfMemoryMonitor": OutOfMemoryMonitorClass,
}

# ################################################################################################################

class TestEntry:
    """Description of one unit-test class to run.

    :param testName: Name of the test class.
    :param testMethods: Optional list of method names to run individually.
    :param testParameters: Optional list of extra parameters for this test.
    :param monitorClass: Optional monitor to run alongside the test process.
    """

    def __init__(self, testName, testMethods=None, testParameters=None, monitorClass=None):
        self.testName = testName
        self.testMethods = testMethods
        self.testParameters = testParameters
        self.monitorClass = monitorClass


## List of classes to be tested. Include new unittest classes here
Expand Down Expand Up @@ -159,7 +241,14 @@ __TEST_THE_WORLD__ = [
TestEntry(testName="uk.ac.manchester.tornado.unittests.memory.TestStressDeviceMemory",
testParameters=[
"-Dtornado.device.memory=4GB",
"-Xmx14g"])
"-Xmx14g"]),
TestEntry(testName="uk.ac.manchester.tornado.unittests.memory.leak.TestMemoryLeak",
testMethods=["test_no_cached_hot_loop", "test_no_cached_hot_loop_primitive",
"test_cached_task_graph_and_input_output_primitive",
"test_cached_task_graph_and_input_output", "test_cached_everything_primitive",
"test_cached_everything"],
testParameters=["-Xmx2g"],
monitorClass=OutOfMemoryMonitorClass),
]

## List of tests that can be ignored. The following either fail (we know it is a precision error), or they are not supported
Expand Down Expand Up @@ -221,9 +310,14 @@ __TORNADO_TESTS_WHITE_LIST__ = [

## List of tests to be excluded when running with the quick pass argument as they cause delays
__TORNADO_HEAVY_TESTS__ = [
"uk.ac.manchester.tornado.unittests.memory.TestStressDeviceMemory",
"uk.ac.manchester.tornado.unittests.profiler.TestProfiler",
"uk.ac.manchester.tornado.unittests.tensors.TestTensorAPIWithOnnx",
"uk.ac.manchester.tornado.unittests.tensors.TestTensorAPIWithOnnx"
]

## List of tests that are memory intensive and potentially take a long time to run.
## runTestTheWorld skips these, together with __TORNADO_HEAVY_TESTS__, when args.quickPass is set.
__TORNADO_MEMORY_TESTS__ = [
"uk.ac.manchester.tornado.unittests.memory.TestStressDeviceMemory",
"uk.ac.manchester.tornado.unittests.memory.leak.TestMemoryLeak",
]

# ################################################################################################################
Expand Down Expand Up @@ -329,6 +423,32 @@ def composeAllOptions(args):
return options


def killAllProcesses(pid: int):
    """Kill the process with the given pid and all of its descendants.

    Descendants are killed before the parent. Processes that have already
    exited are skipped, so calling this for a process tree that is
    (partially) gone is not an error.

    :param pid: The process ID of the root of the process tree to kill.
    """
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        # Nothing to kill: the process no longer exists.
        return

    try:
        descendants = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        descendants = []

    for proc in descendants + [parent]:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # The process exited on its own before we got to it.
            pass


def runMonitorClass(monitorClass: Optional[Union[str, "MonitorClass", type]], cmd: str, pid: int):
    """ Run the monitor class if specified.
    :param monitorClass: Can be a string used to look up the monitor class in the MONITOR_REGISTRY,
        a MonitorClass subclass (instantiated here without arguments), or an instance of a MonitorClass.
    :param cmd: The command that was used to spawn the process.
    :param pid: The process ID of the spawned process.
    :return: True if the monitor passed or the monitor class is None. False if the monitor failed.
    :raises ValueError: If monitorClass is a string that is not a key of the MONITOR_REGISTRY.
    """
    if not monitorClass:
        # No monitor class specified, return True
        return True

    if isinstance(monitorClass, str):
        # Raise explicitly instead of asserting: asserts are removed when Python runs with -O.
        if monitorClass not in MONITOR_REGISTRY:
            raise ValueError(f"Monitor class {monitorClass} not found in registry. Options are {list(MONITOR_REGISTRY.keys())}")
        monitor = MONITOR_REGISTRY[monitorClass]()
    elif isinstance(monitorClass, type):
        monitor = monitorClass()
    else:
        # Already an instance: use it as it is.
        monitor = monitorClass
    return monitor.monitor(pid, cmd)


def runSingleCommand(cmd, args):
""" Run a command without processing the result of which tests
are passed and failed. This method is used to pass a single
Expand All @@ -339,6 +459,10 @@ def runSingleCommand(cmd, args):

start = time.time()
p = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
monitor_status = runMonitorClass(args.monitorClass, cmd, p.pid)
if not monitor_status:
print(f"Monitor {args.monitorClass} failed. Killing process {p.pid}")
killAllProcesses(p.pid)
out, err = p.communicate()
end = time.time()
out = out.decode('utf-8')
Expand Down Expand Up @@ -392,14 +516,24 @@ def processStats(out, stats):
return stats


def runCommandWithStats(command, stats):
def runCommandWithStats(command, stats, monitorClass):
""" Run a command and update the stats dictionary """
p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
monitor_status = runMonitorClass(monitorClass, command, p.pid)
monitor_failed_msg = ""
if not monitor_status:
print(f"Monitor {args.monitorClass} failed. Killing process {p.pid}")
monitor_failed_msg = f"[FAILED] Test: class {command.split(' ')[-1]}"
killAllProcesses(p.pid)
out, err = p.communicate()
out = out.decode('utf-8')
err = err.decode('utf-8')
if not monitor_status:
out += monitor_failed_msg

if (err.rfind("Segmentation fault") > 0):
# Run the test again if the monitor passed but the test seg faulted.
# Otherwise, consider this test as not worth to try again.
if (err.rfind("Segmentation fault") > 0 and monitor_status):
print(Colors.REVERSE)
print("[!] RUNNING AGAIN BECAUSE OF A SEG FAULT")
print(Colors.RESET)
Expand Down Expand Up @@ -436,8 +570,8 @@ def runTests(args):
cmd = TORNADO_CMD + options
command = appendTestRunnerClassToCmd(cmd, args)
command = command + " --params \"" + args.testClass + "\""
print(command)
if (args.fast):
print(command)
os.system(command)
else:
runSingleCommand(command, args)
Expand All @@ -447,7 +581,7 @@ def runTests(args):
end = time.time()
print(Colors.CYAN)

if args.fast == False and args.verbose == True:
if not args.fast and args.verbose:
print(Colors.GREEN)
print("==================================================")
print(Colors.BLUE + " Unit tests report " + Colors.GREEN)
Expand All @@ -470,8 +604,10 @@ def runTests(args):
def runTestTheWorld(options, args):
stats = {"[PASS]": 0, "[FAILED]": 0, "[UNSUPPORTED]": 0}

__TORNADO_QUICK_PASS_SKIP_TESTS__ = __TORNADO_HEAVY_TESTS__ + __TORNADO_MEMORY_TESTS__

for t in __TEST_THE_WORLD__:
if args.quickPass and t.testName in __TORNADO_HEAVY_TESTS__:
if args.quickPass and t.testName in __TORNADO_QUICK_PASS_SKIP_TESTS__:
continue
command = options
if t.testParameters:
Expand All @@ -485,24 +621,26 @@ def runTestTheWorld(options, args):
if t.testMethods:
for testMethod in t.testMethods:
testMethodCmd = command + "#" + testMethod + "\""
print(testMethodCmd)
if (args.fast):
os.system(testMethodCmd)
else:
print(testMethodCmd)
stats = runCommandWithStats(testMethodCmd, stats)
stats = runCommandWithStats(testMethodCmd, stats, t.monitorClass)
elif (args.fast):
command += "\""
os.system(command)
else:
command += "\""
print(command)
stats = runCommandWithStats(command, stats)
stats = runCommandWithStats(command, stats, t.monitorClass)

return stats


def runWithJUnit(args):
""" Run the tests using JUNIT """
if args.monitorClass:
print("[WARNING] Monitor class is not supported when running with JUnit.")

if (args.testClass != None):
command = appendTestRunnerClassToCmd(TORNADO_CMD, args)
Expand All @@ -523,6 +661,8 @@ def runTestTheWorldWithJunit(args):

command = appendTestRunnerClassToCmd(command, args)
command += " --params \"" + t.testName + "\""
if t.monitorClass:
print("[WARNING] Monitor class is not supported when running with JUnit.")
if t.testMethods:
for testMethod in t.testMethods:
print(
Expand Down Expand Up @@ -565,6 +705,8 @@ def parseArguments():
help="Pass options to the JVM e.g. -J=\"-Ds0.t0.device=0:1\"")
parser.add_argument('--enableProfiler', action="store", dest="enable_profiler", default=None,
help="Enable the profiler {silent|console}")
parser.add_argument("--monitorClass", action="store", dest="monitorClass", default=None,
help="Monitor class to monitor the JVM process. Options: outOfMemoryMonitor")
args = parser.parse_args()
return args

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public int registerEvent(long oclEventId, EventDescriptor descriptorId, OCLComma
* exit.
*/
if (oclEventId == -1) {
logger.fatal("invalid event: event=0x%x, description=%s, tag=0x%x\n", oclEventId, descriptorId.getNameDescription());
logger.fatal("invalid event: event=0x%x, description=%s\n", oclEventId, descriptorId.getNameDescription());
logger.fatal("terminating application as system integrity has been compromised.");
System.exit(-1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
*
* <p>
* <code>
* tornado-test -V --fast uk.ac.manchester.tornado.unittests.batches.TestBatches
* tornado-test -V uk.ac.manchester.tornado.unittests.batches.TestBatches
* </code>
* </p>
*/
Expand Down
1 change: 1 addition & 0 deletions tornado-unittests/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@
exports uk.ac.manchester.tornado.unittests.tools;
exports uk.ac.manchester.tornado.unittests.vectortypes;
exports uk.ac.manchester.tornado.unittests.virtualization;
exports uk.ac.manchester.tornado.unittests.memory.leak;
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* How to run?
*
* <code>
* $ tornado-test -V --fast uk.ac.manchester.tornado.unittests.api.TestConcat
* $ tornado-test -V uk.ac.manchester.tornado.unittests.api.TestConcat
* </code>
*/
public class TestConcat extends TornadoTestBase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
* How to run?
*
* <code>
* $ tornado-test -V --fast uk.ac.manchester.tornado.unittests.api.TestInitDataTypes
* $ tornado-test -V uk.ac.manchester.tornado.unittests.api.TestInitDataTypes
* </code>
*/
public class TestInitDataTypes extends TornadoTestBase {
Expand Down
Loading

0 comments on commit 0644225

Please sign in to comment.