Skip to content

Commit

Permalink
Use CMake for breakpad (pytorch#63186)
Browse files Browse the repository at this point in the history
Summary:
We currently build breakpad from [this fork](https://github.com/driazati/breakpad) to include extra logic to restore signal handlers that were previously present. With some [new additions](google/breakpad@main...driazati:main) this fork now includes a CMake based build, so we can add breakpad as a proper dependency rather than rely on including it in Docker images as a system library which is error prone (we have a bunch of images) and hard to extend to MacOS / Windows. This also includes some changes to the crash handling code to support MacOS / Windows in a similar way to Linux.

```python
import torch

# On Windows this writes crashes to C:\Users\<user>\AppData\pytorch_crashes
# On MacOS/Linux this writes crashes to /tmp/pytorch_crashes
torch.utils._crash_handler.enable_minidumps()

# Easy way to cause a segfault and trigger the handler
torch.bincount(input=torch.tensor([9223372036854775807]))
```

Pull Request resolved: pytorch#63186

Reviewed By: malfet, seemethere

Differential Revision: D30318404

Pulled By: driazati

fbshipit-source-id: 0d7daf3701cfaba5451cc529a0730272ab1eb1dc
  • Loading branch information
driazati authored and facebook-github-bot committed Aug 19, 2021
1 parent e030b81 commit bd8608c
Show file tree
Hide file tree
Showing 12 changed files with 162 additions and 72 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,6 @@
[submodule "third_party/pocketfft"]
path = third_party/pocketfft
url = https://github.com/mreineck/pocketfft
[submodule "third_party/breakpad"]
path = third_party/breakpad
url = https://github.com/driazati/breakpad.git
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ cmake_dependent_option(
"USE_CUDNN" OFF)
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
option(USE_KINETO "Use Kineto profiling library" ON)
option(USE_BREAKPAD "Use breakpad crash dump library" ON)
option(USE_CUPTI_SO "Use CUPTI as a shared library" OFF)
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
option(USE_FFMPEG "Use ffmpeg" OFF)
Expand Down Expand Up @@ -264,6 +265,10 @@ if(NOT DEFINED USE_VULKAN)
"ANDROID" OFF)
endif()

if(IOS)
set(USE_BREAKPAD OFF)
endif()

option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
Expand Down
25 changes: 4 additions & 21 deletions caffe2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1042,27 +1042,10 @@ if(USE_TBB)
target_link_libraries(torch_cpu PUBLIC TBB::tbb)
endif()


if(LINUX)
find_library(BREAKPAD_LIB breakpad_client
PATHS /usr/local/lib/)
find_path(BREAKPAD_INCLUDE_DIR breakpad
PATHS /usr/local/include/)

if(BREAKPAD_LIB AND BREAKPAD_INCLUDE_DIR)
message(STATUS "found breakpad library")
target_link_libraries(torch_cpu PRIVATE ${BREAKPAD_LIB})
target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER)
target_include_directories(torch_cpu PRIVATE ${BREAKPAD_INCLUDE_DIR}/breakpad)
else()
if(BREAKPAD_INCLUDE_DIR)
message(STATUS "breakpad_client library not found")
elseif(BREAKPAD_LIB)
message(STATUS "breakpad include path not found")
else()
message(STATUS "breakpad_client library and include path not found")
endif()
endif()
if(USE_BREAKPAD)
target_compile_definitions(torch_cpu PRIVATE ADD_BREAKPAD_SIGNAL_HANDLER)
target_include_directories(torch_cpu PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../third_party ${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad/src)
target_link_libraries(torch_cpu PRIVATE breakpad)
endif()

target_include_directories(torch_cpu PRIVATE ${ATen_CPU_INCLUDE})
Expand Down
4 changes: 4 additions & 0 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,10 @@ set_target_properties(fmt-header-only PROPERTIES INTERFACE_COMPILE_FEATURES "")
list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)

if(USE_BREAKPAD)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/breakpad)
endif()

# ---[ Kineto
# edge profiler depends on KinetoProfiler but it only does cpu
# profiling. Thus we dont need USE_CUDA/USE_ROCM
Expand Down
1 change: 1 addition & 0 deletions cmake/Summary.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ function(caffe2_print_configuration_summary)
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()
message(STATUS " USE_DEPLOY : ${USE_DEPLOY}")
message(STATUS " USE_BREAKPAD : ${USE_BREAKPAD}")
message(STATUS " Public Dependencies : ${Caffe2_PUBLIC_DEPENDENCY_LIBS}")
message(STATUS " Private Dependencies : ${Caffe2_DEPENDENCY_LIBS}")
endfunction()
65 changes: 43 additions & 22 deletions test/test_cpp_extensions_jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -869,11 +869,29 @@ def test_custom_compound_op_autograd(self):

gradcheck(torch.ops.my.add, [a, b], eps=1e-2)

@unittest.skipIf(not has_breakpad(), "Breakpad library must be present on system for crash handler")
@unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
def test_crash_handler(self):
def run_test(stderr_file, destination):
# Code to enable dumps and trigger a segfault
@staticmethod
def _crash_handler_test_process(stderr_file, destination):
# Code to enable dumps and trigger a segfault
if sys.platform == "win32":
destination = destination.replace("\\", "\\\\")
csrc = textwrap.dedent(f"""
#include <torch/torch.h>
#include <locale>
#include <iostream>
#include <codecvt>
#include <string>
int fail() {{
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
std::string narrow("{destination}");
std::wstring wide = converter.from_bytes(narrow);
torch::crash_handler::enable_minidumps(wide.c_str());
volatile int* bad = nullptr;
return *bad;
}}
""")
else:
csrc = textwrap.dedent(f"""
#include <torch/torch.h>
Expand All @@ -885,29 +903,32 @@ def run_test(stderr_file, destination):
}}
""")

# Some special stuff to overwrite stderr for a C++ extension
# Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls
sys.stdout.flush()
newstdout = os.dup(2)
devnull = os.open(stderr_file, os.O_WRONLY)
os.dup2(devnull, 2)
os.close(devnull)
sys.stdout = os.fdopen(newstdout, 'w')

module = torch.utils.cpp_extension.load_inline(
name="segfault",
cpp_sources=csrc,
functions=["fail"],
)
module.fail()
# Some special stuff to overwrite stderr for a C++ extension
# Copied from: https://stackoverflow.com/questions/8804893/redirect-stdout-from-python-for-c-calls
sys.stdout.flush()
newstdout = os.dup(2)
devnull = os.open(stderr_file, os.O_WRONLY)
os.dup2(devnull, 2)
os.close(devnull)
sys.stdout = os.fdopen(newstdout, 'w')

module = torch.utils.cpp_extension.load_inline(
name="segfault",
cpp_sources=csrc,
functions=["fail"],
)
module.fail()

with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile() as stderr:
@unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
@unittest.skipIf(not has_breakpad(), "Built without breakpad")
def test_crash_handler(self):
with tempfile.TemporaryDirectory() as temp_dir, tempfile.NamedTemporaryFile(delete=not sys.platform == "win32") as stderr:
# Use multiprocessing to spin up a separate process to make catching
# the segfault easier
p = Process(target=run_test, args=(stderr.name, temp_dir))
p = Process(target=self._crash_handler_test_process, args=(stderr.name, temp_dir))
p.start()
p.join()

with open(stderr.name) as f:
result = f.read().strip()

Expand Down
5 changes: 3 additions & 2 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import torch.hub as hub
from torch.autograd._functions.utils import check_onnx_broadcast
from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings
from torch.testing._internal.common_utils import load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, has_breakpad
from torch.testing._internal.common_utils import has_breakpad, load_tests, retry, IS_SANDCASTLE, IS_WINDOWS, TEST_WITH_ASAN
from urllib.error import URLError

# load_tests from torch.testing._internal.common_utils is used to automatically filter tests for
Expand Down Expand Up @@ -739,7 +739,8 @@ def forward(self, x):


class TestCrashHandler(TestCase):
@unittest.skipIf(not has_breakpad(), "Crash handler lib was not linked in")
@unittest.skipIf(TEST_WITH_ASAN, "ASAN disables the crash handler's signal handler")
@unittest.skipIf(not has_breakpad(), "Built without breakpad")
def test_python_exception_writing(self):
with tempfile.TemporaryDirectory() as temp_dir:
torch.utils._crash_handler.enable_minidumps(temp_dir)
Expand Down
1 change: 1 addition & 0 deletions third_party/breakpad
Submodule breakpad added at 469a80
87 changes: 72 additions & 15 deletions torch/csrc/utils/crash_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,16 @@
#include <iostream>

#ifdef ADD_BREAKPAD_SIGNAL_HANDLER
#include <breakpad/client/linux/handler/exception_handler.h>
#ifdef __linux__
#include <breakpad/src/client/linux/handler/exception_handler.h>
#include <csignal>
#elif __APPLE__
#include <breakpad/src/client/mac/handler/exception_handler.h>
#elif _WIN32
#include <breakpad/src/client/windows/handler/exception_handler.h>
#else
#error unsupported platform
#endif
#endif

#include <c10/util/Exception.h>
Expand All @@ -16,9 +24,10 @@ namespace crash_handler {
#ifdef ADD_BREAKPAD_SIGNAL_HANDLER

static std::unique_ptr<google_breakpad::ExceptionHandler> handler; // NOLINT
static std::string minidump_directory; // NOLINT
static STRING_TYPE minidump_directory; // NOLINT
static bool enabled_for_exceptions = false; // NOLINT

#if __linux__
bool dump_callback(
const google_breakpad::MinidumpDescriptor& descriptor,
void* context,
Expand All @@ -28,24 +37,76 @@ bool dump_callback(
}
return succeeded;
}
#elif __APPLE__

void enable_minidumps(const std::string& dir) {
bool dump_callback(
const char* dump_dir,
const char* minidump_id,
void* context,
bool succeeded) {
if (succeeded) {
std::cerr << "Wrote minidump to " << dump_dir << "/" << minidump_id
<< ".dmp" << std::endl;
}
return succeeded;
}
#elif _WIN32
bool dump_callback(
const wchar_t* dump_path,
const wchar_t* minidump_id,
void* context,
EXCEPTION_POINTERS* exinfo,
MDRawAssertionInfo* assertion,
bool succeeded) {
if (succeeded) {
// Printing with wcerr inserts spaces between all the characters for some
// reason. If someone figures that out then we can get rid of the std::string
// conversions here.
std::wstring dump_path_ws(dump_path);
std::string dump_path_string(dump_path_ws.begin(), dump_path_ws.end());
std::wstring minidump_id_ws(minidump_id);
std::string minidump_id_string(minidump_id_ws.begin(), minidump_id_ws.end());
std::cerr << "Wrote minidump to " << dump_path_string << "\\" << minidump_id_string << ".dmp" << std::endl;
}
return succeeded;
}
#endif

void enable_minidumps(const STRING_TYPE& dir) {
minidump_directory = dir;
// The constructor here registers the actual signal handler
// The constructor here registers the actual signal handler
#ifdef __linux__
handler = std::make_unique<google_breakpad::ExceptionHandler>(
google_breakpad::MinidumpDescriptor(minidump_directory),
nullptr,
dump_callback,
nullptr,
true,
-1);
#elif __APPLE__
handler = std::make_unique<google_breakpad::ExceptionHandler>(
/*dump_path=*/minidump_directory.c_str(),
/*filter=*/nullptr,
/*callback=*/dump_callback,
/*callback_context=*/nullptr,
/*install_handler=*/true,
/*port_name=*/nullptr);
#elif _WIN32
handler = std::make_unique<google_breakpad::ExceptionHandler>(
/*dump_path=*/minidump_directory.c_str(),
/*filter=*/nullptr,
/*callback=*/dump_callback,
/*callback_context=*/nullptr,
/*handler_types=*/
google_breakpad::ExceptionHandler::HandlerType::HANDLER_ALL);
#endif
}

void disable_minidumps() {
handler.reset();
}

const std::string& get_minidump_directory() {
const STRING_TYPE& get_minidump_directory() {
if (handler == nullptr) {
AT_ERROR(
"Minidump handler is uninintialized, make sure to call enable_minidumps first");
Expand Down Expand Up @@ -78,32 +139,28 @@ void enable_minidumps_on_exceptions() {

#else
// On unspported systems we can't do anything, so stub out everything.
void enable_minidumps(const std::string& dir) {
AT_ERROR(
"Minidump collection is currently only implemented for Linux platforms");
void enable_minidumps(const STRING_TYPE& dir) {
AT_ERROR("Compiled without minidump support");
}

void disable_minidumps() {
// Purposefully do nothing
}

const std::string& get_minidump_directory() {
AT_ERROR(
"Minidump collection is currently only implemented for Linux platforms");
const STRING_TYPE& get_minidump_directory() {
AT_ERROR("Compiled without minidump support");
}

bool is_enabled_on_exceptions() {
return false;
}

void write_minidump() {
AT_ERROR(
"Minidump collection is currently only implemented for Linux platforms");
AT_ERROR("Compiled without minidump support");
}

void enable_minidumps_on_exceptions() {
AT_ERROR(
"Minidump collection is currently only implemented for Linux platforms");
AT_ERROR("Compiled without minidump support");
}

#endif
Expand Down
10 changes: 8 additions & 2 deletions torch/csrc/utils/crash_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,16 @@
namespace torch {
namespace crash_handler {

#ifdef _WIN32
typedef std::wstring STRING_TYPE;
#else
typedef std::string STRING_TYPE;
#endif

// Set up a handler that writes minidumps to 'dir' on signals. This is not
// necessary to call unless you want to change 'dir' to something other than
// the default '/tmp/pytorch_crashes'.
TORCH_API void enable_minidumps(const std::string& dir);
TORCH_API void enable_minidumps(const STRING_TYPE& dir);

// Enable minidumps when passing exceptions up to Python. By default these don't
// do anything special, but it can be useful to write out a minidump on
Expand All @@ -19,7 +25,7 @@ TORCH_API void enable_minidumps_on_exceptions();
TORCH_API void disable_minidumps();

// Get the directory that minidumps will be written to
TORCH_API const std::string& get_minidump_directory();
TORCH_API const STRING_TYPE& get_minidump_directory();

// These are TORCH_API'ed since they are used from libtorch_python.so
TORCH_API bool is_enabled_on_exceptions();
Expand Down
23 changes: 16 additions & 7 deletions torch/testing/_internal/common_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2533,13 +2533,6 @@ def disable_gc():
else:
yield

def has_breakpad() -> bool:
# If not on a special build, check that the library was actually linked in
try:
torch._C._get_minidump_directory() # type: ignore[attr-defined]
return True
except RuntimeError as e:
return False

def find_library_location(lib_name: str) -> Path:
# return the shared library file in the installed folder if exist,
Expand Down Expand Up @@ -2590,6 +2583,22 @@ def get_tensors_from(args, kwargs):
return set([arg for arg in args if isinstance(arg, Tensor)] +
[v for v in kwargs.values() if isinstance(v, Tensor)])


def has_breakpad():
# We always build with breakpad in CI
if IS_IN_CI:
return True

# If not on a special build, check that the library was actually linked in
try:
torch._C._get_minidump_directory() # type: ignore[attr-defined]
return True
except RuntimeError as e:
if "Minidump handler is uninintialized" in str(e):
return True
return False


def sandcastle_skip_if(condition, reason):
"""
Similar to unittest.skipIf, however in the sandcastle environment it just
Expand Down
Loading

0 comments on commit bd8608c

Please sign in to comment.