diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..3d5c903f4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +* text=auto eol=lf +*.jpg binary +*.png binary +*.gif binary diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..d606775fe --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +*~ +*.swp +*.o +make.out +core +a.out +.cproject +.project +.settings/ +nbproject/ +rpmbuild/ +dpkgbuild/ +rpm/ +dpkg/ +.deps +.vscode +*.user +~* +*~ +*.db +*.htmp +*.hpptmp diff --git a/CODING_STYLE.md b/CODING_STYLE.md new file mode 100644 index 000000000..17bb57134 --- /dev/null +++ b/CODING_STYLE.md @@ -0,0 +1 @@ +https://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..11fbddd30 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,55 @@ +# Contributing to the strace.ebpf + +Here you'll find instructions on how to contribute to the strace.ebpf. + +Your contributions are most welcome! You'll find it is best to begin +with a conversation about your changes, rather than just writing a bunch +of code and contributing it out of the blue. 
+There are several good ways to suggest new features, offer to add a feature, +or just begin a dialog about the strace.ebpf: + +* Open an issue in GitHub + +**NOTE: If you do decide to implement code changes and contribute them, +please make sure you agree your contribution can be made available +under the [BSD-style License used for the strace.ebpf](LICENSE).** + +**NOTE: Submitting your changes also means that you certify the following:** + +``` +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +In case of any doubt, the gatekeeper may ask you to certify the above in writing, +i.e. via email or by including a `Signed-off-by:` line at the bottom +of your commit comments. 
+ +To improve tracking of who is the author of a contribution, we kindly ask you +to use your real name (not an alias) when committing your changes to the strace.ebpf: +``` +Author: Random J Developer +``` diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 000000000..23f1d4ea0 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,4 @@ +Mon 5 Dec CET 2016 Vitalii Chernookyi + + * Version 0.1 + Extracting from NVML project diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..e1e250bc2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,36 @@ +Copyright 2014-2016, Intel Corporation + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +Everything in this source tree is covered by the previous license +with the following exceptions: + + +* utils/cstyle (used only during development) licensed under CDDL. diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..303c4c754 --- /dev/null +++ b/Makefile @@ -0,0 +1,140 @@ +# +# Copyright 2014-2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# Makefile -- top-level Makefile for NVM Library +# +# Use "make" to build the library. +# +# Use "make doc" to build documentation. +# +# Use "make test" to build unit tests. Add "SKIP_SYNC_REMOTES=y" to skip +# or "FORCE_SYNC_REMOTES=y" to force syncing remote nodes if any is defined. +# +# Use "make check" to run unit tests. +# +# Use "make check-remote" to run only remote unit tests. +# +# Use "make clean" to delete all intermediate files (*.o, etc). +# +# Use "make clobber" to delete everything re-buildable (binaries, etc.). +# +# Use "make cstyle" to run cstyle on all C source files +# +# Use "make check-license" to check copyright and license in all source files +# +# Use "make rpm" to build rpm packages +# +# Use "make dpkg" to build dpkg packages +# +# Use "make source DESTDIR=path_to_dir" to copy source files +# from HEAD to 'path_to_dir/nvml' directory. +# +# As root, use "make install" to install the library in the usual +# locations (/usr/local/lib, /usr/local/include, and /usr/local/share/man). 
+# You can provide custom directory prefix for installation using +# DESTDIR variable e.g.: "make install DESTDIR=/opt" +# You can override the prefix within DESTDIR using prefix variable +# e.g.: "make install prefix=/usr" + +include src/common.inc + +export SRCVERSION = $(shell git describe 2>/dev/null ||\ + cat .version 2>/dev/null ||\ + git log -1 --format=%h 2>/dev/null) + +RPM_BUILDDIR=rpmbuild +DPKG_BUILDDIR=dpkgbuild +EXPERIMENTAL ?= n +BUILD_PACKAGE_CHECK ?= y +rpm : override DESTDIR=$(CURDIR)/$(RPM_BUILDDIR) +dpkg: override DESTDIR=$(CURDIR)/$(DPKG_BUILDDIR) +rpm dpkg: override prefix=/usr + +all: + $(MAKE) -C src $@ + +doc: + $(MAKE) -C doc all + +clean: + $(MAKE) -C src $@ + $(MAKE) -C doc $@ + $(MAKE) -C utils $@ + $(RM) -r $(RPM_BUILDDIR) $(DPKG_BUILDDIR) + +clobber: + $(MAKE) -C src $@ + $(MAKE) -C doc $@ + $(MAKE) -C utils $@ + $(RM) -r $(RPM_BUILDDIR) $(DPKG_BUILDDIR) rpm dpkg + +test check pcheck check-remote: all + $(MAKE) -C src $@ + +cstyle: + $(MAKE) -C src $@ + $(MAKE) -C utils $@ + @echo Checking files for whitespace issues... + @utils/check_whitespace -g + @echo Done. + +format: + $(MAKE) -C src $@ + $(MAKE) -C utils $@ + @echo Done. + +check-license: + $(MAKE) -C utils $@ + @utils/check_license/check-headers.sh + @echo Done. 
+ +source: + $(if $(shell git rev-parse 2>&1), $(error Not a git repository)) + $(if $(shell git status --porcelain), $(error Working directory is dirty: $(shell git status --porcelain))) + $(if $(DESTDIR), , $(error Please provide DESTDIR variable)) + mkdir -p $(DESTDIR)/nvml + echo -n $(SRCVERSION) > $(DESTDIR)/nvml/.version + git archive HEAD | tar -x -C $(DESTDIR)/nvml + +pkg-clean: + $(RM) -r $(DESTDIR) + +rpm dpkg: pkg-clean source + +utils/build-$@.sh $(SRCVERSION) $(DESTDIR)/nvml $(DESTDIR) $(CURDIR)/$@\ + ${EXPERIMENTAL} ${BUILD_PACKAGE_CHECK} $(CURDIR)/src/test/testconfig.sh + +install uninstall: + $(MAKE) -C src $@ + $(MAKE) -C doc $@ + +.PHONY: all clean clobber test check cstyle check-license install uninstall\ + source rpm dpkg pkg-clean pcheck check-remote format doc $(SUBDIRS) diff --git a/Makefile.inc b/Makefile.inc new file mode 100644 index 000000000..17512213c --- /dev/null +++ b/Makefile.inc @@ -0,0 +1,291 @@ +# Copyright 2014-2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# src/Makefile.inc -- Makefile include for all tools +# + +TOP := $(dir $(lastword $(MAKEFILE_LIST))) + +include $(TOP)/src/common.inc + +INSTALL_TARGET ?= y + +INCS += -I. +INCS += -I$(TOP)/src/include +CFLAGS += -std=gnu99 +CFLAGS += -Wall +CFLAGS += -Werror +CFLAGS += -Wmissing-prototypes +CFLAGS += -Wpointer-arith +CFLAGS += -Wunused-macros +CFLAGS += -Wmissing-field-initializers +CFLAGS += -Wsign-conversion +CFLAGS += -Wsign-compare +ifeq ($(call check_Wconversion), y) +CFLAGS += -Wconversion +endif +CFLAGS += -fno-common + +CFLAGS += -DSRCVERSION='"$(SRCVERSION)"' +ifeq ($(call check_flag, -Wunreachable-code-return), y) +CFLAGS += -Wunreachable-code-return +endif +ifeq ($(call check_flag, -Wmissing-variable-declarations), y) +CFLAGS += -Wmissing-variable-declarations +endif + +ifeq ($(DEBUG),1) +CFLAGS += -ggdb $(EXTRA_CFLAGS_DEBUG) +else +CFLAGS += -O2 -D_FORTIFY_SOURCE=2 $(EXTRA_CFLAGS_RELEASE) +endif + +CFLAGS += $(EXTRA_CFLAGS) + +LDFLAGS += -Wl,-z,relro -Wl,--warn-common -Wl,--fatal-warnings $(EXTRA_LDFLAGS) -L$(TOP)/src/nondebug +TARGET_DIR=$(DESTDIR)$(bindir) +BASH_COMP_FILES ?= +BASH_COMP_DESTDIR = $(DESTDIR)$(bashcompdir) + +TARGET_STATIC_NONDEBUG=$(TARGET).static-nondebug +TARGET_STATIC_DEBUG=$(TARGET).static-debug + +LIBSDIR=$(TOP)/src +LIBSDIR_DEBUG=$(LIBSDIR)/debug +LIBSDIR_NONDEBUG=$(LIBSDIR)/nondebug + +ifneq ($(DEBUG),) +LIBSDIR_PRIV=$(LIBSDIR_DEBUG) +else +LIBSDIR_PRIV=$(LIBSDIR_NONDEBUG) +endif + 
+PMEMLOG_PRIV_OBJ=$(LIBSDIR_PRIV)/libpmemlog/libpmemlog_unscoped.o +PMEMOBJ_PRIV_OBJ=$(LIBSDIR_PRIV)/libpmemobj/libpmemobj_unscoped.o +PMEMBLK_PRIV_OBJ=$(LIBSDIR_PRIV)/libpmemblk/libpmemblk_unscoped.o + +LIBS += -pthread + +ifeq ($(TOOLS_COMMON), y) +LIBPMEMCOMMON=y +endif + +ifeq ($(LIBPMEMCOMMON), y) +DYNAMIC_LIBS += $(LIBSDIR_DEBUG)/libpmemcommon.a +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmemcommon.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmemcommon.a +CFLAGS += -I$(TOP)/src/common +LIBS += -ldl +endif + +ifeq ($(LIBPMEMPOOL), y) +DYNAMIC_LIBS += -lpmempool +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmempool.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmempool.a +endif + +ifeq ($(LIBPMEMBLK), y) +DYNAMIC_LIBS += -lpmemblk +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmemblk.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmemblk.a +endif + +ifeq ($(LIBPMEMLOG), y) +DYNAMIC_LIBS += -lpmemlog +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmemlog.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmemlog.a +endif + +ifeq ($(LIBPMEMOBJ), y) +LIBS += -ldl +DYNAMIC_LIBS += -lpmemobj +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmemobj.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmemobj.a +endif + +ifeq ($(LIBPMEM),y) +DYNAMIC_LIBS += -lpmem +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libpmem.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libpmem.a +endif + +ifeq ($(LIBVMEM),y) +DYNAMIC_LIBS += -lvmem +STATIC_DEBUG_LIBS += $(LIBSDIR_DEBUG)/libvmem.a +STATIC_NONDEBUG_LIBS += $(LIBSDIR_NONDEBUG)/libvmem.a +endif + +ifeq ($(TOOLS_COMMON), y) +vpath %.c $(TOP)/src/tools/pmempool + +OBJS += common.o output.o + +CFLAGS += -I$(TOP)/src/common +CFLAGS += -I$(TOP)/src/libpmemlog +CFLAGS += -I$(TOP)/src/libpmemblk +CFLAGS += -I$(TOP)/src/libpmemobj +CFLAGS += -I$(TOP)/src/tools/pmempool +common.o: CFLAGS += -D__USE_UNIX98 + +endif + +ifneq ($(LIBPMEMLOG_PRIV),) +OBJS += pmemlog_priv.o +endif + +ifneq ($(LIBPMEMOBJ_PRIV),) +OBJS += pmemobj_priv.o +endif + +ifneq 
($(LIBPMEMBLK_PRIV),) +OBJS += pmemblk_priv.o +endif + +ifneq ($(HEADERS),) +ifneq ($(filter 1 2, $(CSTYLEON)),) +TMP_HEADERS := $(addsuffix tmp, $(HEADERS)) +endif +endif + +MAKEFILE_DEPS=$(TOP)/src/Makefile.inc $(TOP)/src/common.inc + +ifneq ($(TARGET),) +all: $(TARGET) $(TARGET_STATIC_NONDEBUG) $(TARGET_STATIC_DEBUG) +else +all: +endif + +SYNC_FILE=.synced + +ifneq ($(EXTRA_TARGETS),) +EXTRA_TARGETS_CLEAN = $(EXTRA_TARGETS:=-clean) +EXTRA_TARGETS_CLOBBER = $(EXTRA_TARGETS:=-clobber) +endif + +clean: $(EXTRA_TARGETS_CLEAN) + $(RM) $(OBJS) $(CLEAN_FILES) $(SYNC_FILE) $(TMP_HEADERS) + +clobber: clean $(EXTRA_TARGETS_CLOBBER) +ifneq ($(TARGET),) + $(RM) $(TARGET) + $(RM) $(TARGET_STATIC_NONDEBUG) + $(RM) $(TARGET_STATIC_DEBUG) + $(RM) -r .deps +endif + +install: all +ifeq ($(INSTALL_TARGET),y) +ifneq ($(TARGET),) + install -d $(TARGET_DIR) + install -p -m 0755 $(TARGET) $(TARGET_DIR) +endif +ifneq ($(BASH_COMP_FILES),) + install -d $(BASH_COMP_DESTDIR) + install -p -m 0644 $(BASH_COMP_FILES) $(BASH_COMP_DESTDIR) +endif +endif + +uninstall: +ifeq ($(INSTALL_TARGET),y) +ifneq ($(TARGET),) + $(RM) $(TARGET_DIR)/$(TARGET) +endif +ifneq ($(BASH_COMP_FILES),) + $(RM) $(BASH_COMP_DESTDIR)/$(BASH_COMP_FILES) +endif +endif + +%.gz: % + gzip -c ./$< > $@ + +%.txt: % + man ./$< > $@ + +%.html: % + groff -mandoc -Thtml ./$< > $@ + +$(TARGET) $(TARGET_STATIC_DEBUG) $(TARGET_STATIC_NONDEBUG): $(TMP_HEADERS) $(OBJS) $(MAKEFILE_DEPS) + +$(TARGET_STATIC_DEBUG): $(STATIC_DEBUG_LIBS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(STATIC_DEBUG_LIBS) $(LIBS) + +$(TARGET_STATIC_NONDEBUG): $(STATIC_NONDEBUG_LIBS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(STATIC_NONDEBUG_LIBS) $(LIBS) + +$(TARGET): + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(DYNAMIC_LIBS) $(LIBS) + +$(PMEMLOG_PRIV_OBJ): + $(MAKE) -C $(LIBSDIR) libpmemlog + +pmemlog_priv.o: $(PMEMLOG_PRIV_OBJ) + $(OBJCOPY) --localize-hidden $(addprefix -G, $(LIBPMEMLOG_PRIV)) $< $@ + +$(PMEMOBJ_PRIV_OBJ): + $(MAKE) -C $(LIBSDIR) libpmemobj + +pmemobj_priv.o: 
$(PMEMOBJ_PRIV_OBJ) + $(OBJCOPY) --localize-hidden $(addprefix -G, $(LIBPMEMOBJ_PRIV)) $< $@ + +$(PMEMBLK_PRIV_OBJ): + $(MAKE) -C $(LIBSDIR) libpmemblk + +pmemblk_priv.o: $(PMEMBLK_PRIV_OBJ) + $(OBJCOPY) --localize-hidden $(addprefix -G, $(LIBPMEMBLK_PRIV)) $< $@ + +objdir=. + +%.o: %.c $(MAKEFILE_DEPS) + $(call check-cstyle, $<) + @mkdir -p .deps + $(CC) -MD $(CFLAGS) $(INCS) -c -o $@ $< + $(create-deps) + +%.htmp: %.h + $(call check-cstyle, $<, $@) + +test check pcheck: all + +TESTCONFIG=$(TOP)/src/test/testconfig.sh +DIR_SYNC=$(TOP)/src/test/.sync-dir + +$(TESTCONFIG): + +sync-remotes: all $(SYNC_FILE) + +$(SYNC_FILE): $(TARGET) $(TESTCONFIG) +ifeq ($(SCP_TO_REMOTE_NODES), y) + cp $(TARGET) $(DIR_SYNC) + @touch $(SYNC_FILE) +endif + +.PHONY: all clean clobber install uninstall test check pcheck + +-include .deps/*.P diff --git a/README b/README new file mode 100644 index 000000000..6fa569f85 --- /dev/null +++ b/README @@ -0,0 +1,17 @@ +This directory contains a tool which traces syscalls in a fast +way using the eBPF Linux kernel feature. + +** DEPENDENCIES: ** +The strace.ebpf depends on the libbcc library: + +$ sudo apt-get install libbcc + +Bcc sources: + +https://github.com/iovisor/bcc + +** WARNING ** + +Some old libbcc packages require manual copying of libbcc.pc from sources to +the appropriate place in the system. In the case of Ubuntu 16.04 LTS the appropriate place +is /usr/lib/x86_64-linux-gnu/pkgconfig/libbcc.pc. diff --git a/TODO.rst b/TODO.rst new file mode 100644 index 000000000..a87478838 --- /dev/null +++ b/TODO.rst @@ -0,0 +1,95 @@ +TODO +##### + +1. Performance improvement +============================ + +Currently we require a bit more than 1000 nsec for tracing a single syscall. +It is not bad but there are at least a few places which could allow us to +reduce that value, maybe to 600 nsec. Every syscall itself currently requires +a bit more than 100 nsec for entering, and a similar value for returning. So a bit +more than 200 nsec together. 
+ +1.1 Extra poll() +----------------- + +Currently libbcc does two identical poll() syscalls per iteration. There is no reason for +it and we should drop it. It will improve our time by about 200 nsec, but it +is a libbcc bug. Back trace for one of those poll() syscalls: + +(gdb) bt +#0 poll () at ../sysdeps/unix/syscall-template.S:84 +#1 0x00007f9c40a07566 in perf_reader_poll () from /usr/lib/x86_64-linux-gnu/libbcc.so.0 +#2 0x0000000000401a7b in kprobe_poll (b=, timeout=) at BPF.c:82 +#3 0x000000000040175d in main (argc=, argv=0x7fffe635c888) at snoop.c:228 + +1.2 Tracepoints support +------------------------ + +Currently the kernel provides a way for fast intercepting of all syscalls together. +But we temporarily can't use it because of this bug: + + - https://github.com/iovisor/bcc/issues/748 + +As soon as the bug is fixed we should try it once more. + +1.3 out buffering +------------------ + +Optimization of this place is critical to achieve maximum possible log +bandwidth. Most likely we should use fd directly. + + +2. Debugging +============ + +2.1 Enable Valgrind +-------------------- + +Currently Valgrind fails with a message like: + +--12470-- WARNING: unhandled amd64-linux syscall: 321 +==12470== at 0x77F7C19: syscall (syscall.S:38) +==12470== by 0x5129133: bpf_create_map (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x5181809: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51AE4A7: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51835E6: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x522FE1C: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x53DCE85: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x520B9BD: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51E0065: ??? (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51751A4: ??? 
(in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51209B3: ebpf::BPFModule::load_cfile(std::__cxx11::basic_string, std::allocator > const&, bool, char const**, int) (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +==12470== by 0x51268FD: ebpf::BPFModule::load_string(std::__cxx11::basic_string, std::allocator > const&, char const**, int) (in /usr/lib/x86_64-linux-gnu/libbcc.so.0.1.8) +--12470-- You may be able to write your own handler. +--12470-- Read the file README_MISSING_SYSCALL_OR_IOCTL. +--12470-- Nevertheless we consider this a bug. Please report +--12470-- it at http://valgrind.org/support/bug_reports.html. + +3. Extra features +================== + +3.1 Multi-process tracing +-------------------------- + +It is not difficult to attach to a few PIDs simultaneously. Maybe we should do +it for parallel applications like apache, nginx and the like. + +3.2 Attaching by name +---------------------- + +It is good to have ability to attach to processes not only by PIDs but also by +names. + +3.3 eBPF sources +----------------- + +It is good to have embedded copies of these files as fallback. + +4. Documentation +================= + +4.1 Doc format +--------------- + +It is good to convert rst to md for consistency diff --git a/doc/.gitattributes b/doc/.gitattributes new file mode 100644 index 000000000..d72fd520b --- /dev/null +++ b/doc/.gitattributes @@ -0,0 +1 @@ +*.pdf binary diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 000000000..016931d98 --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1,2 @@ +*.rst.build_temp +*.png diff --git a/doc/DESIGN.pdf b/doc/DESIGN.pdf new file mode 100644 index 000000000..b92ea873f Binary files /dev/null and b/doc/DESIGN.pdf differ diff --git a/doc/DESIGN.rst b/doc/DESIGN.rst new file mode 100644 index 000000000..a4e258859 --- /dev/null +++ b/doc/DESIGN.rst @@ -0,0 +1,91 @@ +Strace.eBPF +############################### + +.. role:: large1 +.. role:: large2 +.. role:: large3 + +.. space:: 50 + +.. 
class:: center + +:large3:`Fast syscall tracing` + +.. space:: 150 + +.. class:: center + +:large1:`Vitalii Chernookyi` + + +Why we need new tool +--------------------- + + - regular system tracing tools are slow + - regular tools slow down the traced application by a few orders of magnitude + - output of regular tools is human-oriented and is not designed for automated + processing + - overcoming the above problems in the regular way requires: + + - kernel hacking (sysdig) + - special HW (Lauterbach). + + +Used technologies +------------------ + + - eBPF + - KProbe + - Perf Event Circular Buffer + - event-loop + + +System requirements +-------------------- + + - libbcc + - Linux Kernel 4.4 (for Perf Event Circular Buffer) + - CAP_SYS_ADMIN capability for bpf() syscall + - mounted tracefs + + +Pros +----- + + - The combination of technologies used allows the tool to be about one order of magnitude faster + than regular system strace. + - This tool consumes much less CPU resources + - Output of this tool is designed to be suitable for processing with + classical tools and technologies, like awk. + - Could trace syscalls system-wide. + + +Cons +----- + + - Limited functionality + - Slow attaching and detaching + - Asynchronicity. If the user does not provide enough system resources for + performance, the tool will skip some calls. The tool does not attempt + any work-around behind the scenes. + + +Structural Component Diagram +----------------------------- + +.. uml:: + !include DSGN_struct_comp_dia.uml + + +Behavioral Activity Diagram +---------------------------- + +.. uml:: + !include DSGN_beh_act_dia.uml + + +Conclusion +----------- + + - we reached a performance of about 1000000 syscalls per second. + - there is room for future optimization. 
diff --git a/doc/DSGN_beh_act_dia.uml b/doc/DSGN_beh_act_dia.uml new file mode 100644 index 000000000..bf7a80c14 --- /dev/null +++ b/doc/DSGN_beh_act_dia.uml @@ -0,0 +1,19 @@ +@startuml +scale 0.50 +start +:Command Line Parsing; +:Loading "command"; +note right +Optional +end note +:Generating eBPF source code; +:Compiling eBPF source code; +:Attaching eBPF handlers to syscalls using KProbe and eBPF VM; +:Starting "command"; +note right +Optional +end note +while (cont?) +partition libbcc { + :poll() - wait for events; +} diff --git a/doc/DSGN_rst_style.yaml b/doc/DSGN_rst_style.yaml new file mode 100644 index 000000000..d65f07715 --- /dev/null +++ b/doc/DSGN_rst_style.yaml @@ -0,0 +1,631 @@ + # This file has RSON syntax which is a superset of JSON and, probably(?), + # a subset of YAML. The 'yaml' extension is used primarily for syntax + # highlighting. The 'json' extension is not used because of its incompatibility + # with comments. + # + # Project's homepage: https://code.google.com/archive/p/rson/ + + # List any fonts you would like to embed in the PDF here + embeddedFonts: [] + + # Default page setup. Can be overridden by including other + # stylesheets with -s + + pageSetup: + size: a5-landscape + width: null + height: null + margin-top: 0cm + margin-bottom: 0cm + margin-left: 0cm + margin-right: 0cm + margin-gutter: 0cm + spacing-header: 0mm + spacing-footer: 0mm + + # The first template is one of the 'pageTemplates' + # (See next section) + + firstTemplate: coverPage + + # pageTemplates can be accessed with the .. 
raw:: pdf PageBreak command + + pageTemplates: + coverPage: + frames: [] + [0cm, 0cm, 100%, 100%] + showHeader : false + showFooter : false + + emptyPage: + frames: [] + [0cm, 0cm, 100%, 100%] + showHeader : false + showFooter : false + + oneColumn: + frames: [] + [0cm, 0cm, 100%, 100%] + showHeader : true + showFooter : true + + twoColumn: + frames: [] + [0cm, 0cm, 49%, 100%] + [51%, 0cm, 49%, 100%] + showHeader : true + showFooter : true + + threeColumn: + frames: [] + [2%, 0cm, 29.333%, 100%] + [35.333%, 0cm, 29.333%, 100%] + [68.666%, 0cm, 29.333%, 100%] + showHeader : true + showFooter : true + + cutePage: + frames: [] + [0%, 0%, 100%, 100%] + showHeader : true + showFooter : true + defaultFooter : ###Page### + defaultHeader : ###Section### + + fontsAlias: + stdFont: Helvetica + stdBold: Helvetica-Bold + stdItalic: Helvetica-Oblique + stdBoldItalic: Helvetica-BoldOblique + stdSans: Helvetica + stdSansBold: Helvetica-Bold + stdSansItalic: Helvetica-Oblique + stdSansBoldItalic: Helvetica-BoldOblique + stdMono: Courier + stdMonoItalic: Courier-Oblique + stdMonoBold: Courier-Bold + stdMonoBoldItalic: Courier-BoldOblique + stdSerif: Times-Roman + + linkColor: navy + + styles: + base: + parent: null + fontName: stdFont + fontSize: 10 + leading: 12 + leftIndent: 0 + rightIndent: 0 + firstLineIndent: 0 + alignment: TA_LEFT + spaceBefore: 0 + spaceAfter: 0 + bulletFontName: stdFont + bulletFontSize: 10 + bulletIndent: 0 + textColor: black + backColor: null + wordWrap: null + borderWidth: 0 + borderPadding: 0 + borderColor: null + borderRadius: null + allowWidows: false + allowOrphans: false + hyphenation: false + kerning: false + underline: false + strike: false + commands: [] + + normal: + parent: base + + large1: + parent: normal + fontName: stdBold + fontSize: 175% + hyphenation: true + + large2: + parent: large1 + fontName: stdBold + fontSize: 175% + + large3: + parent: large2 + fontName: stdBold + fontSize: 175% + + large4: + parent: large3 + fontName: stdBold + 
fontSize: 175% + + large5: + parent: large4 + fontName: stdBold + fontSize: 175% + + large6: + parent: large5 + fontName: stdBold + fontSize: 175% + + large7: + parent: large6 + fontName: stdBold + fontSize: 175% + + + title-reference: + parent: normal + fontName: stdItalic + + bodytext: + parent: normal + spaceBefore: 6 + alignment: TA_JUSTIFY + hyphenation: true + fontSize: 175% + + toc: + parent: normal + + blockquote: + parent: bodytext + leftIndent: 20 + + lineblock: + parent: bodytext + + line: + parent: lineblock + spaceBefore: 0 + + toc1: + parent: toc + fontName: stdBold + + toc2: + parent: toc + leftIndent: 20 + + toc3: + parent: toc + leftIndent: 40 + + toc4: + parent: toc + leftIndent: 60 + + toc5: + parent: toc + leftIndent: 80 + + toc6: + parent: toc + leftIndent: 100 + + toc7: + parent: toc + leftIndent: 100 + + toc8: + parent: toc + leftIndent: 100 + + toc9: + parent: toc + leftIndent: 100 + + toc10: + parent: toc + leftIndent: 100 + + toc11: + parent: toc + leftIndent: 100 + + toc12: + parent: toc + leftIndent: 100 + + toc13: + parent: toc + leftIndent: 100 + + toc14: + parent: toc + leftIndent: 100 + + toc15: + parent: toc + leftIndent: 100 + + footer: + parent: normal + alignment: TA_CENTER + + header: + parent: normal + alignment: TA_CENTER + + attribution: + parent: bodytext + alignment: TA_RIGHT + + image: + parent: bodytext + alignment: TA_CENTER + + figure: + parent: bodytext + alignment: TA_CENTER + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [ALIGN, [ 0, 0 ], [ -1, -1 ], CENTER ] + colWidths: [100%] + + figure-caption: + parent: bodytext + fontName: stdItalic + alignment: TA_CENTER + + figure-legend: + parent: bodytext + + bullet-list: + parent: bodytext + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [RIGHTPADDING, [ 0, 0 ], [ 1, -1 ], 0 ] + colWidths: ["20", null] + + bullet-list-item: + parent: bodytext + + item-list: + parent: bodytext + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [RIGHTPADDING, [ 0, 0 ], [ 
1, -1 ], 0 ] + colWidths: [20pt,null] + + item-list-item: + parent: bodytext + + definition-list-term: + parent: normal + fontName: stdBold + spaceBefore: 4 + spaceAfter: 0 + keepWithNext: false + + definition-list-classifier: + parent: normal + fontName: stdItalic + + definition: + parent: bodytext + firstLineIndent: 0 + bulletIndent: 0 + spaceBefore: 0 + colWidths: [20pt,null] + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [LEFTPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + [BOTTOMPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + [RIGHTPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + + fieldname: + parent: bodytext + alignment: TA_RIGHT + fontName: stdBold + + fieldvalue: + parent: bodytext + + rubric: + parent: bodytext + textColor: darkred + alignment: TA_CENTER + + italic: + parent: bodytext + fontName: stdItalic + + heading: + parent: normal + keepWithNext: true + spaceBefore: 12 + spaceAfter: 6 + fontSize: 175% + alignment: TA_CENTER + + title: + parent: heading + fontName: stdBold + fontSize: 200% + alignment: TA_CENTER + keepWithNext: false + spaceAfter: 10 + + subtitle: + parent: title + spaceBefore: 12 + fontSize: 75% + + heading1: + parent: heading + fontName: stdBold + fontSize: 175% + + heading2: + parent: heading + fontName: stdBold + fontSize: 150% + + heading3: + parent: heading + fontName: stdBoldItalic + fontSize: 125% + + heading4: + parent: heading + fontName: stdBoldItalic + + heading5: + parent: heading + fontName: stdBoldItalic + + heading6: + parent: heading + fontName: stdBoldItalic + + topic-title: + parent: heading3 + + sidebar-title: + parent: heading3 + + sidebar-subtitle: + parent: heading4 + + sidebar: + float: none + width: 100% + parent: normal + backColor: beige + borderColor: darkgray + borderPadding: 8 + borderWidth: 0.5 + + admonition: + parent: normal + spaceBefore: 12 + spaceAfter: 6 + borderPadding: [16,16,16,16] + backColor: beige + borderColor: darkgray + borderWidth: 0.5 + commands:[] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + + attention: + parent: 
admonition + + caution: + parent: admonition + + danger: + parent: admonition + + error: + parent: admonition + + hint: + parent: admonition + + important: + parent: admonition + + note: + parent: admonition + + tip: + parent: admonition + + warning: + parent: admonition + + admonition-title: + parent: heading3 + + admonition-heading: + parent: heading3 + + attention-heading: + parent: admonition-heading + + caution-heading: + parent: admonition-heading + + danger-heading: + parent: admonition-heading + + error-heading: + parent: admonition-heading + + hint-heading: + parent: admonition-heading + + important-heading: + parent: admonition-heading + + note-heading: + parent: admonition-heading + + tip-heading: + parent: admonition-heading + + warning-heading: + parent: admonition-heading + + literal: + parent: normal + fontName: stdMono + firstLineIndent: 0 + hyphenation: false + wordWrap: null + + aafigure: + parent: literal + + table: + spaceBefore:6 + spaceAfter:0 + alignment: TA_CENTER + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [INNERGRID, [ 0, 0 ], [ -1, -1 ], 0.25, black ] + [ROWBACKGROUNDS, [0, 0], [-1, -1], [white,#E0E0E0]] + [BOX, [ 0, 0 ], [ -1, -1 ], 0.25, black ] + + table-title: + parent : heading4 + keepWithNext: false + alignment : TA_CENTER + + table-heading: + parent : heading + backColor : beige + alignment : TA_CENTER + valign : BOTTOM + borderPadding : 0 + + table-body: + parent : normal + + dedication: + parent : normal + + abstract: + parent : normal + + contents: + parent : normal + + tableofcontents: + parent : normal + + code: + parent: literal + leftIndent: 0 + spaceBefore: 8 + spaceAfter: 8 + backColor: beige + borderColor: darkgray + borderWidth: 0.5 + borderPadding: 6 + + linenumber: + parent: code + + right: + parent: bodytext + alignment: right + + center: + parent: bodytext + alignment: center + + pygments-n: parent: code + pygments-nx: parent: code + pygments-p: parent: code + pygments-hll: {parent: code, backColor: 
#ffffcc} + pygments-c: {textColor: #008800, parent: code} + pygments-err: {parent: code} + pygments-k: {textColor: #AA22FF, parent: code} + pygments-o: {textColor: #666666, parent: code} + pygments-cm: {textColor: #008800, parent: code} + pygments-cp: {textColor: #008800, parent: code} + pygments-c1: {textColor: #008800, parent: code} + pygments-cs: {textColor: #008800, parent: code} + pygments-gd: {textColor: #A00000, parent: code} + pygments-ge: {parent: code} + pygments-gr: {textColor: #FF0000, parent: code} + pygments-gh: {textColor: #000080, parent: code} + pygments-gi: {textColor: #00A000, parent: code} + pygments-go: {textColor: #808080, parent: code} + pygments-gp: {textColor: #000080, parent: code} + pygments-gs: {parent: code} + pygments-gu: {textColor: #800080, parent: code} + pygments-gt: {textColor: #0040D0, parent: code} + pygments-kc: {textColor: #AA22FF, parent: code} + pygments-kd: {textColor: #AA22FF, parent: code} + pygments-kn: {textColor: #AA22FF, parent: code} + pygments-kp: {textColor: #AA22FF, parent: code} + pygments-kr: {textColor: #AA22FF, parent: code} + pygments-kt: {textColor: #00BB00, parent: code} + pygments-m: {textColor: #666666, parent: code} + pygments-s: {textColor: #BB4444, parent: code} + pygments-na: {textColor: #BB4444, parent: code} + pygments-nb: {textColor: #AA22FF, parent: code} + pygments-nc: {textColor: #0000FF, parent: code} + pygments-no: {textColor: #880000, parent: code} + pygments-nd: {textColor: #AA22FF, parent: code} + pygments-ni: {textColor: #999999, parent: code} + pygments-ne: {textColor: #D2413A, parent: code} + pygments-nf: {textColor: #00A000, parent: code} + pygments-nl: {textColor: #A0A000, parent: code} + pygments-nn: {textColor: #0000FF, parent: code} + pygments-nt: {textColor: #008000, parent: code} + pygments-nv: {textColor: #B8860B, parent: code} + pygments-ow: {textColor: #AA22FF, parent: code} + pygments-w: {textColor: #bbbbbb, parent: code} + pygments-mf: {textColor: #666666, parent: code} + 
pygments-mh: {textColor: #666666, parent: code} + pygments-mi: {textColor: #666666, parent: code} + pygments-mo: {textColor: #666666, parent: code} + pygments-sb: {textColor: #BB4444, parent: code} + pygments-sc: {textColor: #BB4444, parent: code} + pygments-sd: {textColor: #BB4444, parent: code} + pygments-s2: {textColor: #BB4444, parent: code} + pygments-se: {textColor: #BB6622, parent: code} + pygments-sh: {textColor: #BB4444, parent: code} + pygments-si: {textColor: #BB6688, parent: code} + pygments-sx: {textColor: #008000, parent: code} + pygments-sr: {textColor: #BB6688, parent: code} + pygments-s1: {textColor: #BB4444, parent: code} + pygments-ss: {textColor: #B8860B, parent: code} + pygments-bp: {textColor: #AA22FF, parent: code} + pygments-vc: {textColor: #B8860B, parent: code} + pygments-vg: {textColor: #B8860B, parent: code} + pygments-vi: {textColor: #B8860B, parent: code} + pygments-il: {textColor: #666666, parent: code} + + endnote: + parent: bodytext + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [TOPPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + [BOTTOMPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + [RIGHTPADDING, [ 0, 0 ], [ 1, -1 ], 0 ] + colWidths: [3cm, null] + + field-list: + parent: bodytext + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [TOPPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + colWidths: [3cm, null] + spaceBefore: 6 + + option-list: + commands: [] + [VALIGN, [ 0, 0 ], [ -1, -1 ], TOP ] + [TOPPADDING, [ 0, 0 ], [ -1, -1 ], 0 ] + colWidths: [null,null] + diff --git a/doc/DSGN_struct_comp_dia.uml b/doc/DSGN_struct_comp_dia.uml new file mode 100644 index 000000000..1bf4b11fd --- /dev/null +++ b/doc/DSGN_struct_comp_dia.uml @@ -0,0 +1,43 @@ +@startuml +skinparam componentStyle uml2 +scale 0.6 +package "User Space" { +[Traced Application] as TA +[Tracing Tool] as TT +} +cloud { +() "Circular Buffer" as CB +CB -left-> TT : events +} +package "Kernel Space" { +folder "SysCalls table" { +() "SC #1" as SC1 +() "..." as SC2 +() "SC #x" as SCx +() "..." 
as SC4 +() "SC #n" as SCn +TA -down-> SCx : SysCall +} +'1 +[EBPF VM #1] as VM1 +SC1 -down-> VM1 : KProbe +VM1 -up-> CB : event + +[SC #1 Handler] as SCH1 +VM1 -down-> SCH1 : KProbe +'x +[EBPF VM #x] as VMx +SCx -down-> VMx : KProbe +VMx -up-> CB : event + +[SC #x Handler] as SCHx +VMx -down-> SCHx : KProbe +'n +[EBPF VM #n] as VMn +SCn -down-> VMn : KProbe +VMn -up-> CB : event + +[SC #n Handler] as SCHn +VMn -down-> SCHn : KProbe +} +@enduml diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 000000000..c7424edd8 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,43 @@ +# Copyright 2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# Makefile -- Makefile for documentation +# + + +DESIGN.pdf: DESIGN.rst Makefile DSGN_rst_style.yaml *.uml + rst2pdf -c -b 1 \ + --verbose \ + --default-dpi=300 \ + -l en \ + -e preprocess -e plantuml \ + -s DSGN_rst_style.yaml \ + $< diff --git a/man/.gitignore b/man/.gitignore new file mode 100644 index 000000000..976a94634 --- /dev/null +++ b/man/.gitignore @@ -0,0 +1,5 @@ +*.txt +*.html +*.gz +cpp_html +LICENSE diff --git a/man/Makefile b/man/Makefile new file mode 100644 index 000000000..065c3c748 --- /dev/null +++ b/man/Makefile @@ -0,0 +1,112 @@ +# +# Copyright 2014-2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# +# man/Makefile -- Makefile for man page +# NOTE: common.inc is not included, so do not rely on variables it defines + +#include ../src/common.inc + +MANPAGES_1_MD = strace.ebpf.1.md + +MANPAGES_BUILDDIR = generated + +MANPAGES_1_GROFF = $(MANPAGES_1_MD:.1.md=.1) + +MANPAGES_1 = $(MANPAGES_1_GROFF) + + +MANPAGES_GROFF_1 = $(MANPAGES_1:.1.md=.1) + +MANPAGES = $(MANPAGES_GROFF_1) + +MANPAGES_BUILD = $(addprefix $(MANPAGES_BUILDDIR)/, $(MANPAGES)) + +HTMLFILES = $(MANPAGES_BUILD:=.html) +TXTFILES = $(MANPAGES_BUILD:=.txt) + +GZFILES_1 = $(MANPAGES_1:=.gz) +GZFILES = $(GZFILES_1) + +GZFILES_BUILD = $(addprefix $(MANPAGES_BUILDDIR)/, $(GZFILES)) +GZFILES_1_BUILD = $(addprefix $(MANPAGES_BUILDDIR)/, $(GZFILES_1)) + +MANPAGES_DESTDIR_1 = $(DESTDIR)$(man1dir) + +DOCS_DESTDIR = $(DESTDIR)$(docdir) + +all: $(MANPAGES_BUILD) $(TXTFILES) | $(MANPAGES_BUILDDIR) + +$(MANPAGES_BUILDDIR): + mkdir -p $@ + +%.txt: % + man ./$< > $@ + +groff: $(MANPAGES_BUILD) + +doxygen_docs: + doxygen cppobj.Doxyfile + +html: $(HTMLFILES) doxygen_docs + +%.html: % + groff -mandoc -Thtml ./$< > $@ + +$(MANPAGES_BUILDDIR)/%.1: %.1.md default.man ../utils/md2man.sh FORCE | $(MANPAGES_BUILDDIR) + ../utils/md2man.sh ./$< default.man $@ + +compress: $(GZFILES_BUILD) + +%.gz: + gzip -c ./$* > $@ + +clean: + +clobber:
clean + $(RM) -rf $(DOXYGEN_HTMLDIR) \ + $(MANPAGES_BUILDDIR)/*.txt \ + $(MANPAGES_BUILDDIR)/*.html \ + $(MANPAGES_BUILDDIR)/*.gz + +# install/uninstall the compressed man page under $(DESTDIR)$(man1dir) +install: compress + install -d $(MANPAGES_DESTDIR_1) + install -p -m 0644 $(GZFILES_1_BUILD) $(MANPAGES_DESTDIR_1) + +uninstall: + $(RM) $(addprefix $(MANPAGES_DESTDIR_1)/, $(GZFILES_1)) + +FORCE: + +.PHONY: all html clean compress clobber cstyle install uninstall install-cpp\ + uninstall-cpp doxygen_docs diff --git a/man/README b/man/README new file mode 100644 index 000000000..732c2239e --- /dev/null +++ b/man/README @@ -0,0 +1,11 @@ +strace.ebpf + +This is man/README. + +This directory contains source for the man page. + +To create more readable text files from the source, use: + $ make +An even more convenient way to read these is to use the "man" command to +format them (includes bold, underline, etc. when run in a terminal window): + $ man -l generated/strace.ebpf.1 diff --git a/man/default.man b/man/default.man new file mode 100644 index 000000000..faf6e572f --- /dev/null +++ b/man/default.man @@ -0,0 +1,59 @@ +$if(has-tables)$ +.\"t +$endif$ +$if(pandoc-version)$ +.\" Automatically generated by Pandoc $pandoc-version$ +.\" +$endif$ +$if(adjusting)$ +.ad $adjusting$ +$endif$ +.TH "$title$" "$section$" "$version$" "$date$" "$footer$" "$header$" +$if(hyphenate)$ +.hy +$else$ +.nh \" Turn off hyphenation by default. +$endif$ +$for(header-includes)$ +$header-includes$ +$endfor$ +.\" Copyright 2014-$year$, Intel Corporation +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" +.\" * Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer.
+.\" +.\" * Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in +.\" the documentation and/or other materials provided with the +.\" distribution. +.\" +.\" * Neither the name of the copyright holder nor the names of its +.\" contributors may be used to endorse or promote products derived +.\" from this software without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +.\" OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +.\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +.\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.\" OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +$for(include-before)$ +$include-before$ +$endfor$ +$body$ +$for(include-after)$ +$include-after$ +$endfor$ +$if(author)$ +.SH AUTHORS +$for(author)$$author$$sep$; $endfor$. 
+$endif$ diff --git a/man/generated/strace.ebpf.1 b/man/generated/strace.ebpf.1 new file mode 100644 index 000000000..dfd1042af --- /dev/null +++ b/man/generated/strace.ebpf.1 @@ -0,0 +1,291 @@ +.\" Automatically generated by Pandoc 1.16.0.2 +.\" +.TH "strace" "1" "pmem Tools version 1.0.2" "" "" "" +.hy +.\" Copyright 2014-2016, Intel Corporation +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" +.\" * Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" +.\" * Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in +.\" the documentation and/or other materials provided with the +.\" distribution. +.\" +.\" * Neither the name of the copyright holder nor the names of its +.\" contributors may be used to endorse or promote products derived +.\" from this software without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +.\" OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +.\" SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +.\" LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +.\" OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+.SH NAME +.PP +\f[B]strace.ebpf\f[] \-\- extremely fast strace\-like tool built on +top of eBPF and KProbe technologies. +.SH SYNOPSIS +.IP +.nf +\f[C] +$\ strace.ebpf\ [options]\ [command\ [arg\ ...]] +\f[] +.fi +.SH DESCRIPTION +.PP +strace.ebpf is a limited\-functionality strace equivalent for Linux, +based on eBPF and KProbe technologies and the libbcc library. +.IP \[bu] 2 +Pros: +.RS 2 +.IP \[bu] 2 +This combination of technologies makes the tool about an order of +magnitude faster than the regular system strace. +.IP \[bu] 2 +This tool consumes far fewer CPU resources. +.IP \[bu] 2 +Output of this tool is designed to be suitable for processing with +classical tools and technologies, like awk. +.IP \[bu] 2 +Can trace syscalls system\-wide. +.RE +.IP \[bu] 2 +Cons: +.RS 2 +.IP \[bu] 2 +Limited functionality +.IP \[bu] 2 +Slow attaching and detaching +.IP \[bu] 2 +Asynchronicity. +If the user does not provide enough system resources for performance, the +tool will skip some calls. +The tool does not attempt any work\-around behind the scenes. +.RE +.PP +WARNING: System\-wide tracing can fill up your disk really fast. +.SH OPTIONS +.PP +\f[C]\-t,\ \-\-timestamp\f[] +.PP +include timestamp in output +.PP +\f[C]\-X,\ \-\-failed\f[] +.PP +only show failed syscalls +.PP +\f[C]\-d,\ \-\-debug\f[] +.PP +enable debug output +.PP +\f[C]\-p,\ \-\-pid\f[] +.PP +trace this PID only. +The command argument must be omitted +.PP +\f[C]\-o,\ \-\-output\f[] +.PP +filename +.PP +\f[C]\-l,\ \-\-format\f[] +.PP +output log format. +Possible values: +.IP +.nf +\f[C] +\[aq]bin\[aq],\ \[aq]binary\[aq],\ \[aq]hex\[aq],\ \[aq]strace\[aq],\ \[aq]list\[aq]\ &\ \[aq]help\[aq]. +\f[] +.fi +.PP +\[aq]bin\[aq]/\[aq]binary\[aq] file format is described in generated +trace.h. +If the current directory is not writable, generation is skipped. +.PP +Default: \[aq]hex\[aq] +.PP +\f[C]\-K,\ \-\-hex\-separator\f[] +.PP +set field separator for hex logs. +Default is \[aq]\[aq].
+.PP +\f[C]\-e,\ \-\-expr\f[] +.PP +expression, \[aq]help\[aq] or \[aq]list\[aq] for supported list. +.PP +Default: trace=kp\-kern\-all. +.PP +\f[C]\-L,\ \-\-list\f[] +.PP +Print a list of all traceable syscalls of the running kernel. +.PP +\f[C]\-R,\ \-\-ll\-list\f[] +.PP +Print a list of all traceable low\-level funcs of the running kernel. +.PP +WARNING: really long. +~45000 functions for 4.4 kernel. +.PP +\f[C]\-b,\ \-\-builtin\-list\f[] +.PP +Print a list of all syscalls known by glibc. +.PP +\f[C]\-h,\ \-\-help\f[] +.PP +print help +.SH CONFIGURATION +.PP +** System Configuring ** +.IP "1." 3 +You should provide permissions to access tracefs for final user. +.IP "2." 3 +It\[aq]s good to put this command in init scripts such as local.rc: +.RS 4 +.PP +echo 1 > /proc/sys/net/core/bpf_jit_enable +.PP +It will significantly improve performance and avoid \[aq]Lost +events\[aq] +.RE +.IP "3." 3 +You should increase "Open File Limit", for example according to this +instruction: +.RS 4 +.PP +https://easyengine.io/tutorials/linux/increase\-open\-files\-limit/ +.RE +.SH FILES +.PP +Putting into current directory following files allow to customize eBPF +code for supporting more newer eBPF VM features in newer kernels. +Also if current directory does not contain trace.h strace.ebpf on first +start saves built\-in trace.h into current directory. +Saved built\-in describe binary log\[aq]s format. 
+.IP \[bu] 2 +trace.h +.IP \[bu] 2 +trace_head.c +.IP \[bu] 2 +trace_tp_all.c +.IP \[bu] 2 +trace_kern_tmpl.c +.IP \[bu] 2 +trace_libc_tmpl.c +.IP \[bu] 2 +trace_file_tmpl.c +.IP \[bu] 2 +trace_fileat_tmpl.c +.SH EXAMPLES +.SH Example output: +.PP +# ./strace.ebpf \-l hex +.PP +\&./strace.ebpf \-l hex PID ERR RES SYSCALL ARG1 ARG2 ARG3 AUX_DATA +0000000000000AFD 000000000000000B FFFFFFFFFFFFFFFF read 0000000000000005 +0000000000000427 0000000000000000 0000000000000020 read 000000000000000A +0000000000000B3D 0000000000000000 0000000000000001 write +000000000000001C 0000000000000B11 0000000000000000 0000000000000001 read +000000000000001B 0000000000000427 0000000000000000 0000000000000020 read +000000000000000A 0000000000000B3D 0000000000000000 0000000000000001 +write 000000000000001C 0000000000000B11 0000000000000000 +0000000000000001 read 000000000000001B 0000000000000B3D 0000000000000000 +0000000000000001 write 000000000000001C 0000000000000B11 +0000000000000000 0000000000000001 read 000000000000001B 0000000000000B3D +0000000000000000 0000000000000001 write 000000000000001C +0000000000000B11 0000000000000000 0000000000000001 read 000000000000001B +... +.PP +^C +.PP +# +.SH The \-p option can be used to filter on a PID, which is filtered +in\-kernel. 
+.PP +Here \-t option is used to print timestamps: +.PP +# ./strace.ebpf \-l hex \-tp 2833 +.PP +\&./strace.ebpf \-l hex \-tp 2833 PID TIME(usec) ERR RES SYSCALL ARG1 +ARG2 ARG3 AUX_DATA +.PP +0000000000000B11 0000000000000000 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 0000000000004047 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 0000000000008347 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 000000000000C120 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 000000000000C287 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 000000000000C508 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 0000000000010548 0000000000000000 0000000000000001 read +000000000000001B +.PP +0000000000000B11 00000000000144A4 0000000000000000 0000000000000001 read +000000000000001B +.PP +\&... +.PP +^C +.PP +# +.SH The \-X option only prints failed syscalls: +.PP +# ./strace.ebpf \-l hex \-X mkdir . +.PP +\&./strace.ebpf \-l hex \-X mkdir . 
+.PP +PID ERR RES SYSCALL ARG1 ARG2 ARG3 AUX_DATA +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale/en/LC_MESSAGES/coreutils.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale\-langpack/en_US/LC_MESSAGES/coreutils.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/lib/x86_64\-linux\-gnu/charset.alias mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale/en_US/LC_MESSAGES/libc.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale/en/LC_MESSAGES/libc.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale\-langpack/en_US/LC_MESSAGES/libc.mo mkdir +.PP +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open +/usr/share/locale\-langpack/en/LC_MESSAGES/libc.mo mkdir +.PP +# +.PP +The ERR column is the system error number. +Error number 2 is ENOENT: no such file or directory. +.SH SEE ALSO +.PP +\f[B]strace\f[](1), \f[B]bpf\f[](2), \f[B]\f[]. +.PP +Also Documentation/networking/filter.txt in kernel sources. diff --git a/man/strace.ebpf.1.md b/man/strace.ebpf.1.md new file mode 100644 index 000000000..c207a4994 --- /dev/null +++ b/man/strace.ebpf.1.md @@ -0,0 +1,273 @@ +--- +layout: manual +Content-Style: 'text/css' +title: strace.ebpf(1) +header: NVM Library +date: pmem Tools version 1.0.2 +... + +[comment]: <> (Copyright 2016, Intel Corporation) + +[comment]: <> (Redistribution and use in source and binary forms, with or without) +[comment]: <> (modification, are permitted provided that the following conditions) +[comment]: <> (are met:) +[comment]: <> ( * Redistributions of source code must retain the above copyright) +[comment]: <> ( notice, this list of conditions and the following disclaimer.) 
+[comment]: <> ( * Redistributions in binary form must reproduce the above copyright) +[comment]: <> ( notice, this list of conditions and the following disclaimer in) +[comment]: <> ( the documentation and/or other materials provided with the) +[comment]: <> ( distribution.) +[comment]: <> ( * Neither the name of the copyright holder nor the names of its) +[comment]: <> ( contributors may be used to endorse or promote products derived) +[comment]: <> ( from this software without specific prior written permission.) + +[comment]: <> (THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS) +[comment]: <> ("AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT) +[comment]: <> (LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR) +[comment]: <> (A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT) +[comment]: <> (OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,) +[comment]: <> (SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT) +[comment]: <> (LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,) +[comment]: <> (DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY) +[comment]: <> (THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT) +[comment]: <> ((INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE) +[comment]: <> (OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.) + +[comment]: <> (strace.ebpf.1 -- man page for strace.ebpf) + +[NAME](#name)
+[SYNOPSIS](#synopsis)
+[DESCRIPTION](#description)
+[OPTIONS](#options)
+[CONFIGURATION](#configuration)
+[FILES](#files)
+[EXAMPLES](#examples)
+[SEE ALSO](#see-also)
+ + +# NAME # + +**strace.ebpf** -- extremely fast strace-like tool built on top of eBPF +and KProbe technologies. + + +# SYNOPSIS # + +``` +$ strace.ebpf [options] [command [arg ...]] +``` + + +# DESCRIPTION # + +strace.ebpf is a limited-functionality strace equivalent for Linux, based on +eBPF and KProbe technologies and the libbcc library. + ++ Pros: + + - This combination of technologies makes the tool about an order of + magnitude faster than the regular system strace. + - This tool consumes far fewer CPU resources. + - Output of this tool is designed to be suitable for processing with + classical tools and technologies, like awk. + - Can trace syscalls system-wide. + ++ Cons: + + - Limited functionality + - Slow attaching and detaching + - Asynchronicity. If the user does not provide enough system resources for + performance, the tool will skip some calls. The tool does not attempt + any work-around behind the scenes. + + +WARNING: System-wide tracing can fill up your disk really fast. + + +# OPTIONS # + +`-t, --timestamp` + +include timestamp in output + +`-X, --failed` + +only show failed syscalls + +`-d, --debug` + +enable debug output + +`-p, --pid` + +trace this PID only. The command argument must be omitted + +`-o, --output` + +filename + +`-l, --format` + +output log format. Possible values: + + 'bin', 'binary', 'hex', 'strace', 'list' & 'help'. + +'bin'/'binary' file format is described in generated trace.h. If the current +directory is not writable, generation is skipped. + +Default: 'hex' + +`-K, --hex-separator` + +set field separator for hex logs. Default is '\t'. + +`-e, --expr` + +expression, 'help' or 'list' for supported list. + +Default: trace=kp-kern-all. + +`-L, --list` + +Print a list of all traceable syscalls of the running kernel. + +`-R, --ll-list` + +Print a list of all traceable low-level funcs of the running kernel. + +WARNING: really long. ~45000 functions for 4.4 kernel. + +`-b, --builtin-list` + +Print a list of all syscalls known by glibc.
+ +`-h, --help` + +print help + + +# CONFIGURATION # + +** System Configuring ** + +1. You should provide permissions to access tracefs for final user. + +2. It's good to put this command in init scripts such as local.rc: + + echo 1 > /proc/sys/net/core/bpf_jit_enable + + It will significantly improve performance and avoid 'Lost events' + +3. You should increase "Open File Limit", for example according to this + instruction: + + https://easyengine.io/tutorials/linux/increase-open-files-limit/ + + +# FILES # + +Putting into current directory following files allow to customize eBPF code for +supporting more newer eBPF VM features in newer kernels. Also if current +directory does not contain trace.h strace.ebpf on first start saves built-in +trace.h into current directory. Saved built-in describe binary log's format. + +- trace.h +- trace_head.c +- trace_tp_all.c +- trace_kern_tmpl.c +- trace_libc_tmpl.c +- trace_file_tmpl.c +- trace_fileat_tmpl.c + + +# EXAMPLES # + +#Example output: + + # ./strace.ebpf -l hex + +./strace.ebpf -l hex +PID ERR RES SYSCALL ARG1 ARG2 ARG3 AUX_DATA +0000000000000AFD 000000000000000B FFFFFFFFFFFFFFFF read 0000000000000005 +0000000000000427 0000000000000000 0000000000000020 read 000000000000000A +0000000000000B3D 0000000000000000 0000000000000001 write 000000000000001C +0000000000000B11 0000000000000000 0000000000000001 read 000000000000001B +0000000000000427 0000000000000000 0000000000000020 read 000000000000000A +0000000000000B3D 0000000000000000 0000000000000001 write 000000000000001C +0000000000000B11 0000000000000000 0000000000000001 read 000000000000001B +0000000000000B3D 0000000000000000 0000000000000001 write 000000000000001C +0000000000000B11 0000000000000000 0000000000000001 read 000000000000001B +0000000000000B3D 0000000000000000 0000000000000001 write 000000000000001C +0000000000000B11 0000000000000000 0000000000000001 read 000000000000001B +... 
+ +^C + + # + + +#The -p option can be used to filter on a PID, which is filtered in-kernel. +Here -t option is used to print timestamps: + + # ./strace.ebpf -l hex -tp 2833 + +./strace.ebpf -l hex -tp 2833 +PID TIME(usec) ERR RES SYSCALL ARG1 ARG2 ARG3 AUX_DATA + +0000000000000B11 0000000000000000 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 0000000000004047 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 0000000000008347 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 000000000000C120 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 000000000000C287 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 000000000000C508 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 0000000000010548 0000000000000000 0000000000000001 read 000000000000001B + +0000000000000B11 00000000000144A4 0000000000000000 0000000000000001 read 000000000000001B + +... + +^C + + # + + +#The -X option only prints failed syscalls: + + # ./strace.ebpf -l hex -X mkdir . + +./strace.ebpf -l hex -X mkdir . 
+ +PID ERR RES SYSCALL ARG1 ARG2 ARG3 AUX_DATA + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale/en_US/LC_MESSAGES/coreutils.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale/en/LC_MESSAGES/coreutils.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale-langpack/en_US/LC_MESSAGES/coreutils.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/lib/x86_64-linux-gnu/charset.alias mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale/en_US/LC_MESSAGES/libc.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale/en/LC_MESSAGES/libc.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale-langpack/en_US/LC_MESSAGES/libc.mo mkdir + +000000000000441A 0000000000000002 FFFFFFFFFFFFFFFF open /usr/share/locale-langpack/en/LC_MESSAGES/libc.mo mkdir + + # + +The ERR column is the system error number. Error number 2 is ENOENT: no such +file or directory. + +# SEE ALSO # + +**strace**(1), **bpf**(2), ****. + +Also Documentation/networking/filter.txt in kernel sources. diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 000000000..c531b287c --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,8 @@ +strace.ebpf +file_sc_bench +*.trc +*.1.txt +*.rst.build_temp +*.png +/trace.h +/nondebug diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 000000000..6f7297e34 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,166 @@ +# Copyright 2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# Makefile -- top Makefile for strace.ebpf +# + + +TARGET = strace.ebpf + +TOP = ../ + +EXTRA_TARGETS += nondebug/libebpf.a nondebug/libstrace.a + +$(TARGET): nondebug/libebpf.a nondebug/libstrace.a +$(TARGET).static-debug: nondebug/libebpf.a nondebug/libstrace.a +$(TARGET).static-nondebug: nondebug/libebpf.a nondebug/libstrace.a + +nondebug/libebpf.a: + $(MAKE) -C ebpf + +nondebug/libstrace.a: + $(MAKE) -C libstrace + +nondebug/libebpf.a-clean: + $(MAKE) -C ebpf clean + $(RM) nondebug/libebpf.a + +nondebug/libstrace.a-clean: + $(MAKE) -C libstrace clean + $(RM) nondebug/libstrace.a + +nondebug/libebpf.a-clobber: nondebug/libebpf.a-clean +nondebug/libstrace.a-clobber: nondebug/libstrace.a-clean + +.PHONY: nondebug/libebpf.a-clean nondebug/libstrace.a-clean +.PHONY: nondebug/libebpf.a-clobber nondebug/libstrace.a-clobber + +OBJS = main.o \ + + +CFLAGS += $(shell $(PKG_CONFIG) --cflags libbcc) + +# XXX libbcc expects multi-threading safety. Currently it's required for +# print_event_cb.o only, but we apply it to the whole application. +CFLAGS += -pthread + +CFLAGS += -g -Wextra + +ifeq ($(CC),clang) +CFLAGS += -Wno-initializer-overrides +else +CFLAGS += -Wno-override-init +endif + +CFLAGS += -I compat +CFLAGS += -I libstrace +CFLAGS += -I ebpf + +LDFLAGS += -g -Wextra + +# XXX libbcc expects multi-threading safety.
+LDFLAGS += -pthread + +LIBS += $(shell $(PKG_CONFIG) --libs libbcc) +LIBS += nondebug/libstrace.a +LIBS += nondebug/libebpf.a + + +INSTALL_TARGET=$(EXPERIMENTAL) + + +include ../Makefile.inc + + +# Local BenchMark +PROGS = file_sc_bench +PROGS_CFLAGS = -g -O2 -Wall -Wextra -D__USE_GNU + + +$(PROGS).o: $(PROGS).c Makefile + $(CC) $(PROGS_CFLAGS) -c -o $@ $< + +$(PROGS): $(PROGS).o Makefile + $(CC) $(PROGS_CFLAGS) -o $@ $< + +.PHONY: run +run: $(PROGS) + time -p -v sudo ./$(PROGS) 500000 + @echo "Tracepoint's version is skipped because of this bug:" + @echo "\t - https://github.com/iovisor/bcc/issues/748" + #time -p -v sudo ./strace.ebpf -l hex -e trace=tp-all \ + # -o $(TMP)/$(PROGS).tp-all.trc ./$(PROGS) 50000 + #ln -f -s $(TMP)/$(PROGS).tp-all.trc $(PROGS).tp-all.trc + @echo + time -p -v sudo ./strace.ebpf -l hex -e trace=kp-libc-all \ + -o $(TMP)/$(PROGS).libc-all.trc ./$(PROGS) 50000 + ln -f -s $(TMP)/$(PROGS).libc-all.trc $(PROGS).libc-all.trc + @echo + time -p -v sudo ./strace.ebpf -l hex -e trace=kp-kern-all \ + -o $(TMP)/$(PROGS).kern-all.trc ./$(PROGS) 50000 + ln -f -s $(TMP)/$(PROGS).kern-all.trc $(PROGS).kern-all.trc + @echo + time -p -v sudo strace \ + -o $(TMP)/$(PROGS).trc ./$(PROGS) 10000 + ln -f -s $(TMP)/$(PROGS).trc $(PROGS).trc + +.PHONY: redis +redis: + @echo ">>>>> WARNING: Please disable system redis service in advance" + -redis-cli shutdown + # Should be same as in make-redis.sh + $(ECHO) > redis-server.log + sudo ./make-redis.sh + @echo "Tracepoint's version is skipped because of this bug:" + @echo "\t - https://github.com/iovisor/bcc/issues/748" + #sudo ./make-redis.sh ./strace.ebpf -l hex -e trace=tp-all \ + # -o $(TMP)/redis-server.tp-all.trc + #ln -f -s $(TMP)/redis-server.tp-all.trc redis-server.tp-all.trc + @echo + sudo ./make-redis.sh ./strace.ebpf -l hex -e trace=kp-libc-all \ + -o $(TMP)/redis-server.libc-all.trc + ln -f -s $(TMP)/redis-server.libc-all.trc redis-server.libc-all.trc + @echo + sudo ./make-redis.sh ./strace.ebpf -l hex 
-e trace=kp-kern-all \ + -o $(TMP)/redis-server.kern-all.trc + ln -f -s $(TMP)/redis-server.kern-all.trc redis-server.kern-all.trc + @echo + sudo ./make-redis.sh strace -f \ + -o $(TMP)/redis-server.trc + ln -f -s $(TMP)/redis-server.trc redis-server.trc + +# XXX Valgrind is confused with bpf() syscall and behaves unexpectedly on it +.PHONY: valgrind +valgrind: $(PROGS) + sudo valgrind -v\ + --leak-check=full \ + --track-origins=yes \ + --log-file=strace.ebpf.valgrind \ + ./strace.ebpf -d -o $(PROGS).ebpf.trc ./$(PROGS) 40 diff --git a/src/Makefile.inc b/src/Makefile.inc new file mode 100644 index 000000000..11b703a06 --- /dev/null +++ b/src/Makefile.inc @@ -0,0 +1,247 @@ +# Copyright 2014-2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# src/Makefile.inc -- common Makefile rules for NVM library +# + +TOP := $(dir $(lastword $(MAKEFILE_LIST))).. + +include $(TOP)/src/common.inc + +INCLUDE = $(TOP)/src/include + +RPMEM_COMMON = $(TOP)/src/rpmem_common +vpath %.c $(RPMEM_COMMON) + +COMMON = $(TOP)/src/common +vpath %.c $(COMMON) + +INCS += -I../include -I../common/ + +CFLAGS += -std=gnu99 +CFLAGS += -Wall +CFLAGS += -Werror +CFLAGS += -Wmissing-prototypes +CFLAGS += -Wpointer-arith +CFLAGS += -Wunused-macros +CFLAGS += -Wmissing-field-initializers +CFLAGS += -Wsign-conversion +CFLAGS += -Wsign-compare +ifeq ($(call check_Wconversion), y) +CFLAGS += -Wconversion +endif +CFLAGS += -pthread +CFLAGS += -fno-common +CFLAGS += -DSRCVERSION=\"$(SRCVERSION)\" +ifeq ($(call check_flag, -Wunreachable-code-return), y) +CFLAGS += -Wunreachable-code-return +endif +ifeq ($(call check_flag, -Wmissing-variable-declarations), y) +CFLAGS += -Wmissing-variable-declarations +endif + +ifeq ($(DEBUG),1) +CFLAGS += -O0 -ggdb -DDEBUG $(EXTRA_CFLAGS_DEBUG) +LIB_SUBDIR = /nvml_debug +OBJDIR = debug +else +CFLAGS += -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $(EXTRA_CFLAGS_RELEASE) +LIB_SUBDIR = +OBJDIR = nondebug +endif + +CFLAGS += $(EXTRA_CFLAGS) + +LDFLAGS += -Wl,-z,relro -Wl,--fatal-warnings -Wl,--warn-common $(EXTRA_LDFLAGS) + +define arch32_error_msg + +################################################## +### 32-bit builds of NVML are not supported! 
### +### Please, use 64-bit platform/compiler. ### +################################################## + +endef + +TESTCMD := $(CC) $(CFLAGS) -dM -E -x c /dev/null -o /dev/null +TESTBUILD := $(shell $(TESTCMD) && echo 1 || echo 0) +ifneq ($(TESTBUILD), 1) +$(error "$(TESTCMD)" failed) +endif + +LP64 := $(shell $(CC) $(CFLAGS) -dM -E -x c /dev/null | grep -Ec "__SIZEOF_LONG__.+8|__SIZEOF_POINTER__.+8" ) +ifneq ($(LP64), 2) +$(error $(arch32_error_msg)) +endif + +LIBS_DESTDIR = $(DESTDIR)$(libdir)$(LIB_SUBDIR) + +DIRNAME = $(shell basename $(CURDIR)) + +ifeq ($(OBJDIR),$(abspath $(OBJDIR))) +objdir = $(OBJDIR)/$(DIRNAME) +else +objdir = ../$(OBJDIR)/$(DIRNAME) +endif + +LIB_OUTDIR = $(objdir)/.. + +LDFLAGS += -L$(LIB_OUTDIR) + +ifneq ($(SOURCE),) +_OBJS = $(SOURCE:.c=.o) +_OBJS_COMMON = $(patsubst $(COMMON)/%, %, $(_OBJS)) +_OBJS_RPMEM_COMMON = $(patsubst $(RPMEM_COMMON)/%, %, $(_OBJS_COMMON)) +OBJS += $(addprefix $(objdir)/, $(_OBJS_RPMEM_COMMON)) +endif + +ifneq ($(HEADERS),) +ifneq ($(filter 1 2, $(CSTYLEON)),) +TMP_HEADERS := $(addsuffix tmp, $(HEADERS)) +TMP_HEADERS := $(addprefix $(objdir)/, $(TMP_HEADERS)) +endif +endif + +ifneq ($(LIBRARY_NAME),) +LIB_NAME = lib$(LIBRARY_NAME) +endif + +ifneq ($(LIBRARY_SO_VERSION),) +LIB_MAP = $(LIB_NAME).map +LIB_SONAME = $(LIB_NAME).so.$(LIBRARY_SO_VERSION) +LIB_SO = $(LIB_OUTDIR)/$(LIB_NAME).so + +LIB_SO_SONAME = $(LIB_SO).$(LIBRARY_SO_VERSION) + +ifneq ($(LIBRARY_VERSION),) +LIB_SO_REAL = $(LIB_SO_SONAME).$(LIBRARY_VERSION) +else +$(error LIBRARY_VERSION not set) +endif + +TARGET_LIBS = $(LIB_SO_REAL) +TARGET_LINKS = $(LIB_SO_SONAME) $(LIB_SO) +endif + +ifneq ($(LIB_NAME),) +LIB_AR = $(LIB_OUTDIR)/$(LIB_NAME).a +LIB_AR_UNSCOPED = $(objdir)/$(LIB_NAME)_unscoped.o +LIB_AR_ALL = $(objdir)/$(LIB_NAME)_all.o +TARGET_LIBS += $(LIB_AR) +endif + +ifneq ($(EXTRA_TARGETS),) +EXTRA_TARGETS_CLEAN = $(EXTRA_TARGETS:=-clean) +EXTRA_TARGETS_CLOBBER = $(EXTRA_TARGETS:=-clobber) +endif + 
+PMEMLOG_PRIV_OBJ=$(LIB_OUTDIR)/libpmemlog/libpmemlog_unscoped.o +PMEMBLK_PRIV_OBJ=$(LIB_OUTDIR)/libpmemblk/libpmemblk_unscoped.o + +ifneq ($(LIBPMEMLOG_PRIV_FUNCS),) +OBJS += pmemlog_priv_funcs.o +endif + +ifneq ($(LIBPMEMBLK_PRIV_FUNCS),) +OBJS += pmemblk_priv_funcs.o +endif + +MAKEFILE_DEPS=$(TOP)/src/Makefile.inc Makefile $(TOP)/src/common.inc + +all: $(objdir) $(LIB_OUTDIR) $(EXTRA_TARGETS) $(LIB_AR) $(LIB_SO_SONAME) $(LIB_SO_REAL) $(LIB_SO) $(TMP_HEADERS) + +$(objdir) $(LIB_OUTDIR): + $(MKDIR) -p $@ + +$(LIB_SO_REAL): $(OBJS) $(EXTRA_OBJS) $(LIB_MAP) $(MAKEFILE_DEPS) + $(CC) $(LDFLAGS) -shared -Wl,--version-script=$(LIB_MAP),-soname,$(LIB_SONAME) -o $@ $(OBJS) $(EXTRA_OBJS) $(LIBS) + +$(LIB_SO_SONAME): $(LIB_SO_REAL) $(MAKEFILE_DEPS) + $(LN) -sf $(shell basename $<) $@ + +$(LIB_SO): $(LIB_SO_SONAME) $(MAKEFILE_DEPS) + $(LN) -sf $(shell basename $<) $@ + +$(LIB_AR_UNSCOPED): $(OBJS) $(EXTRA_OBJS) $(MAKEFILE_DEPS) + $(LD) -o $@ -r $(OBJS) $(EXTRA_OBJS) + +ifeq ($(LIB_MAP),) +$(LIB_AR_ALL): $(LIB_AR_UNSCOPED) $(MAKEFILE_DEPS) + $(OBJCOPY) $< $@ +else +$(LIB_AR_ALL): $(LIB_AR_UNSCOPED) $(LIB_MAP) $(MAKEFILE_DEPS) + $(OBJCOPY) --localize-hidden `sed -n 's/^ *\([a-zA-Z0-9_]*\);$$/-G \1/p' $(LIB_MAP)` $< $@ +endif + +$(LIB_AR): $(LIB_AR_ALL) $(MAKEFILE_DEPS) + $(AR) rv $@ $(LIB_AR_ALL) + +$(PMEMBLK_PRIV_OBJ): + $(MAKE) -C $(LIBSDIR) libpmemblk + +install: all +ifneq ($(LIBRARY_NAME),) + $(INSTALL) -d $(LIBS_DESTDIR) + $(INSTALL) -p -m 0755 $(TARGET_LIBS) $(LIBS_DESTDIR) + $(CP) -d $(TARGET_LINKS) $(LIBS_DESTDIR) +endif + +uninstall: +ifneq ($(LIBRARY_NAME),) + $(foreach f, $(TARGET_LIBS), $(RM) $(LIBS_DESTDIR)/$(notdir $(f))) + $(foreach f, $(TARGET_LINKS), $(RM) $(LIBS_DESTDIR)/$(notdir $(f))) +endif + +clean: $(EXTRA_TARGETS_CLEAN) +ifneq ($(LIBRARY_NAME),) + $(RM) $(OBJS) $(TMP_HEADERS) + $(RM) $(LIB_AR_ALL) $(LIB_AR_UNSCOPED) +endif + +clobber: clean $(EXTRA_TARGETS_CLOBBER) +ifneq ($(LIBRARY_NAME),) + $(RM) $(LIB_AR) $(LIB_SO_SONAME) $(LIB_SO_REAL) $(LIB_SO) + 
$(RM) -r $(objdir)/.deps +endif + +$(eval $(cstyle-rule)) + +$(objdir)/%.o: %.c $(MAKEFILE_DEPS) + $(call check-cstyle, $<) + @mkdir -p $(objdir)/.deps + $(CC) -MD -c -o $@ $(CFLAGS) $(INCS) -fPIC $< + $(create-deps) + +$(objdir)/%.htmp: %.h + $(call check-cstyle, $<, $@) + +.PHONY: all clean clobber install uninstall cstyle + +-include $(objdir)/.deps/*.P diff --git a/src/common.inc b/src/common.inc new file mode 100644 index 000000000..db2ad9539 --- /dev/null +++ b/src/common.inc @@ -0,0 +1,183 @@ +# Copyright 2014-2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# src/Makefile.inc -- common Makefile rules for NVM library +# + +TOP := $(dir $(lastword $(MAKEFILE_LIST))).. + +LN = ln +OBJCOPY = objcopy +MKDIR = mkdir +INSTALL = install +CP = cp +CSTYLE = $(TOP)/utils/cstyle +CSTYLEON = 0 +STYLE_CHECK = $(TOP)/utils/style_check.sh +PKG_CONFIG = pkg-config +CLANG_FORMAT ?= clang-format +HEADERS = $(wildcard *.h) $(wildcard *.hpp) + +ifeq ($(shell command -v $(PKG_CONFIG) && echo y || echo n), n) +$(error $(PKG_CONFIG) not found) +endif + +check_package = $(shell $(PKG_CONFIG) $(1) && echo y || echo n) + +check_flag = $(shell echo "int main(){return 0;}" |\ + $(CC) $(CFLAGS) $(1) -x c -o /dev/null - 2>/dev/null && echo y || echo n) + +# Check for issues with older clang compilers which assert on delete persistent<[][]>. +check_clang_template_bug = $(shell echo "using namespace nvml::obj; int main() { delete_persistent(make_persistent(2), 2); return 0; }" |\ + $(CXX) --std=c++11 -x c++ -I$(TOP)/src/include/ -include libpmemobj++/make_persistent_array.hpp -L$(TOP)/src/debug/ -c -o /dev/null - 2>/dev/null && echo y || echo n) + +# Check for issues with older gcc compilers which do not expand variadic template +# variables in lambda expressions. 
+check_gcc_variadic_template_bug = $(shell echo "void print() {} template void print(const T&, const Args &...arg) {auto f = [&]{ print(arg...);};} int main() {print(1, 2, 3); return 0;}" |\ + $(CXX) --std=c++11 -x c++ -o /dev/null - 2>/dev/null && echo y || echo n) + +check_cxx_flags = $(shell echo "int main(){return 0;}" |\ + $(CXX) $(1) -x c++ -o /dev/null - 2>/dev/null && echo y || echo n) + +CXX_TESTS=$(check_clang_template_bug)$(check_gcc_variadic_template_bug) + +cxx_ok=$(if $(findstring n,$(CXX_TESTS)),n,y) + +# This is a workaround for older incompatible versions of libstdc++ and clang. +# Please see https://llvm.org/bugs/show_bug.cgi?id=15517 for more info. +check_cxx_chrono = $(shell echo "int main(){return 0;}" |\ + $(CXX) -std=c++11 -x c++ -include future -o /dev/null - 2>/dev/null && echo y || echo n) + +check_Wconversion = $(shell echo "long random(void); char test(void); char test(void){char a = 0; char b = 'a'; char ret = random() == 1 ? a : b; return ret;}" |\ + $(CC) -c $(CFLAGS) -Wconversion -x c -o /dev/null - 2>/dev/null && echo y || echo n) + +check_librt = $(shell echo "int main() { struct timespec t; return clock_gettime(CLOCK_MONOTONIC, &t); }" |\ + $(CC) $(CFLAGS) -x c -include time.h -o /dev/null - 2>/dev/null && echo y || echo n) + +install_recursive = $(shell cd $(1) && find . -type f -exec install -m $(2) -D {} $(3)/{} \;) + +install_recursive_filter = $(shell cd $(1) && find . 
-type f -name "$(2)" -exec install -m $(3) -D {} $(4)/{} \;) + +define create-deps + @cp $(objdir)/$*.d $(objdir)/.deps/$*.P; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ + -e '/^$$/ d' -e 's/$$/ :/' < $(objdir)/$*.d >> $(objdir)/.deps/$*.P; \ + $(RM) -f $(objdir)/$*.d +endef + +export prefix = /usr/local +export exec_prefix := $(prefix) +export sysconfdir := $(prefix)/etc +export datarootdir := $(prefix)/share +export mandir := $(datarootdir)/man +export docdir := $(datarootdir)/doc +export man1dir := $(mandir)/man1 +export man3dir := $(mandir)/man3 +export cstyle_bin := $(CSTYLE) +export clang_format_bin := $(CLANG_FORMAT) + +ifneq ($(wildcard $(exec_prefix)/x86_64-linux-gnu),) +LIB_PREFIX ?= x86_64-linux-gnu/lib +endif + +ifneq ($(wildcard $(exec_prefix)/lib64),) +LIB_PREFIX ?= lib64 +endif + +LIB_PREFIX ?= lib + +all: + +cstyle-%: + $(STYLE_CHECK) $* $(wildcard *.[ch]) $(wildcard *.[ch]pp) + +cstyle: cstyle-check + +format: cstyle-format + +ifeq ($(CSTYLEON),1) +define check-cstyle + @$(STYLE_CHECK) check $1 && if [ "$2" != "" ]; then mkdir -p `dirname $2` && touch $2; fi +endef +else ifeq ($(CSTYLEON),2) +define check-cstyle + @$(STYLE_CHECK) check $1 && if [ "$2" != "" ]; then mkdir -p `dirname $2` && touch $2; fi || true +endef +else +define check-cstyle +endef +endif + +define sub-target-foreach +$(1)-$(2): + $$(MAKE) -C $1 $2 +ifeq ($(3),y) +ifeq ($(custom_build),) + $$(MAKE) -C $1 $2 DEBUG=1 +endif +endif +endef + +define sub-target +$(foreach f, $(1), $(eval $(call sub-target-foreach, $f,$(2),$(3)))) +endef + +ifneq ($(wildcard $(prefix)/x86_64-linux-gnu),) +INC_PREFIX ?= x86_64-linux-gnu/include +endif + +INC_PREFIX ?= include + +test_build=$(addprefix -b, $(TEST_BUILD)) + +export libdir := $(exec_prefix)/$(LIB_PREFIX) +export includedir := $(prefix)/$(INC_PREFIX) +export pkgconfigdir := $(libdir)/pkgconfig +export bindir := $(exec_prefix)/bin +export bashcompdir := $(sysconfdir)/bash_completion.d + +check_ibv_fork_init = $(shell echo 
"\#include int main(void) { return ibv_fork_init(); }" |\ + $(CC) -c $(CFLAGS) -x c -o /dev/null -libverbs - 2>/dev/null && echo y || echo n) + +export HAS_LIBFABRIC := $(call check_package, libfabric) + +ifeq ($(HAS_LIBFABRIC),y) +ifeq ($(RPMEM_DISABLE_LIBIBVERBS),y) +export HAS_LIBIBVERBS := n +export BUILD_RPMEM := y +else +export HAS_LIBIBVERBS := $(call check_ibv_fork_init) +export BUILD_RPMEM := $(HAS_LIBIBVERBS) +endif +else +export BUILD_RPMEM := n +endif + +export BUILD_STRACE_EBPF := $(call check_package, libbcc) diff --git a/src/compat/bcc/perf_reader.h b/src/compat/bcc/perf_reader.h new file mode 100644 index 000000000..cc0dfef2e --- /dev/null +++ b/src/compat/bcc/perf_reader.h @@ -0,0 +1,48 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * perf_reader.h -- utility functions + */ + +#ifndef PERF_READER_H +#define PERF_READER_H + +struct perf_reader; + +void perf_reader_free(void *ptr); +int perf_reader_poll(int num_readers, + struct perf_reader **readers, + int timeout); +int perf_reader_fd(struct perf_reader *reader); + +#endif /* PERF_READER_H */ diff --git a/src/ebpf/Makefile b/src/ebpf/Makefile new file mode 100644 index 000000000..57c9f80b7 --- /dev/null +++ b/src/ebpf/Makefile @@ -0,0 +1,60 @@ +# Copyright 2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# +# src/ebpf/Makefile -- Makefile for libebpf +# + + +LIBRARY_NAME = ebpf + +# eBPF sources +BIN_SRCS = \ + trace.h \ + trace_head.c \ + trace_tp_all.c \ + trace_kern_tmpl.c \ + trace_libc_tmpl.c \ + trace_file_tmpl.c \ + trace_fileat_tmpl.c \ + + +BIN_OBJS = $(addsuffix .o,$(BIN_SRCS)) + +%.c.o: %.c Makefile + $(LD) -r -b binary -o $@ $< + +%.h.o: %.h Makefile + $(LD) -r -b binary -o $@ $< + +OBJS = $(BIN_OBJS) + + +include ../Makefile.inc diff --git a/src/ebpf/README b/src/ebpf/README new file mode 100644 index 000000000..80060e015 --- /dev/null +++ b/src/ebpf/README @@ -0,0 +1 @@ +This directory contains code which runs inside eBPF VM. diff --git a/src/ebpf/trace.h b/src/ebpf/trace.h new file mode 100644 index 000000000..b611587a2 --- /dev/null +++ b/src/ebpf/trace.h @@ -0,0 +1,103 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * trace.h -- Data exchange packet between packet filter and reader callback + */ + + +#ifndef TRACE_H +#define TRACE_H + + +struct ev_dt_t { + /* + * a value of -1 means "header" + * a value of -2 means that the syscall's number is unknown to glibc + * and the sc_name field should be used to identify the syscall.
+ */ + s64 sc_id; + + u64 pid_tid; + + /* Timestamps */ + u64 start_ts_nsec; + u64 finish_ts_nsec; + s64 ret; + + union { + struct { + s64 arg_1; + s64 arg_2; + s64 arg_3; + s64 arg_4; + s64 arg_5; + s64 arg_6; + }; + struct { + } open; + + struct { + s64 fd; + } close; + + struct { + s64 fd; + } read; + + struct { + s64 fd; + } write; + }; + + union { + /* + * The longest syscall name is 26 characters long: + * 'SyS_sched_get_priority_max'. + * Leave room for the terminating '\0' and a few extra bytes. + */ + char sc_name[32]; + + struct { + char fl_nm[NAME_MAX]; + /* Current process name. XXX Reserved for future use. */ + char comm[TASK_COMM_LEN]; + }; + + struct { + s32 argc; + char argv[]; + } header; + }; +}; + +#endif /* TRACE_H */ diff --git a/src/ebpf/trace_file_tmpl.c b/src/ebpf/trace_file_tmpl.c new file mode 100644 index 000000000..336df6a03 --- /dev/null +++ b/src/ebpf/trace_file_tmpl.c @@ -0,0 +1,98 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * trace_file_tmpl.c -- Trace syscalls with numbers known from libc and + * filename as first argument. Uses BCC, eBPF. + */ + +/* + * SYSCALL_NAME() entry handler: save start time and args 1-6 in tmp_i + */ +int +kprobe__SYSCALL_NAME(struct pt_regs *ctx) +{ + struct first_step_t fs = {}; + u64 pid_tid = bpf_get_current_pid_tgid(); + + PID_CHECK_HOOK + + fs.start_ts_nsec = bpf_ktime_get_ns(); + fs.arg_1 = PT_REGS_PARM1(ctx); + fs.arg_2 = PT_REGS_PARM2(ctx); + fs.arg_3 = PT_REGS_PARM3(ctx); + fs.arg_4 = PT_REGS_PARM4(ctx); + fs.arg_5 = PT_REGS_PARM5(ctx); + fs.arg_6 = PT_REGS_PARM6(ctx); + + tmp_i.update(&pid_tid, &fs); + + return 0; +}; + +/* + * SYSCALL_NAME() exit handler: submit the event and drop the tmp_i entry + */ +int +kretprobe__SYSCALL_NAME(struct pt_regs *ctx) +{ + struct first_step_t *fsp; + struct ev_dt_t ev = {}; + + u64 cur_nsec = bpf_ktime_get_ns(); + + u64 pid_tid = bpf_get_current_pid_tgid(); + fsp = tmp_i.lookup(&pid_tid); + if (fsp == 0) + return 0; + + ev.sc_id = SYSCALL_NR; /* SysCall ID */ + ev.arg_1 = fsp->arg_1; + ev.arg_2 = fsp->arg_2; + ev.arg_3 = fsp->arg_3; + ev.arg_4 = fsp->arg_4; + ev.arg_5 = fsp->arg_5; + ev.arg_6 = fsp->arg_6; + ev.pid_tid = pid_tid; + ev.start_ts_nsec = fsp->start_ts_nsec; + ev.finish_ts_nsec = cur_nsec; + ev.ret = PT_REGS_RC(ctx); + bpf_probe_read(&ev.fl_nm, sizeof(ev.fl_nm), (void *)fsp->arg_1); + + const size_t ev_size = offsetof(struct ev_dt_t, fl_nm) + + sizeof(ev.fl_nm); + events.perf_submit(ctx, &ev, ev_size); + + tmp_i.delete(&pid_tid); + +
return 0; +} diff --git a/src/ebpf/trace_fileat_tmpl.c b/src/ebpf/trace_fileat_tmpl.c new file mode 100644 index 000000000..3621243ca --- /dev/null +++ b/src/ebpf/trace_fileat_tmpl.c @@ -0,0 +1,98 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * trace_fileat_tmpl.c -- Trace syscalls with numbers known from libc and + * a fd as first arg and a filename as second argument. Uses BCC, eBPF. 
+ */ + +/* + * SYSCALL_NAME() entry handler: save start time and args 1-6 in tmp_i + */ +int +kprobe__SYSCALL_NAME(struct pt_regs *ctx) +{ + struct first_step_t fs = {}; + u64 pid_tid = bpf_get_current_pid_tgid(); + + PID_CHECK_HOOK + + fs.start_ts_nsec = bpf_ktime_get_ns(); + fs.arg_1 = PT_REGS_PARM1(ctx); + fs.arg_2 = PT_REGS_PARM2(ctx); + fs.arg_3 = PT_REGS_PARM3(ctx); + fs.arg_4 = PT_REGS_PARM4(ctx); + fs.arg_5 = PT_REGS_PARM5(ctx); + fs.arg_6 = PT_REGS_PARM6(ctx); + + tmp_i.update(&pid_tid, &fs); + + return 0; +}; + +/* + * SYSCALL_NAME() exit handler: submit the event and drop the tmp_i entry + */ +int +kretprobe__SYSCALL_NAME(struct pt_regs *ctx) +{ + struct first_step_t *fsp; + struct ev_dt_t ev = {}; + + u64 cur_nsec = bpf_ktime_get_ns(); + + u64 pid_tid = bpf_get_current_pid_tgid(); + fsp = tmp_i.lookup(&pid_tid); + if (fsp == 0) + return 0; + + ev.sc_id = SYSCALL_NR; /* SysCall ID */ + ev.arg_1 = fsp->arg_1; + ev.arg_2 = fsp->arg_2; + ev.arg_3 = fsp->arg_3; + ev.arg_4 = fsp->arg_4; + ev.arg_5 = fsp->arg_5; + ev.arg_6 = fsp->arg_6; + ev.pid_tid = pid_tid; + ev.start_ts_nsec = fsp->start_ts_nsec; + ev.finish_ts_nsec = cur_nsec; + ev.ret = PT_REGS_RC(ctx); + bpf_probe_read(&ev.fl_nm, sizeof(ev.fl_nm), (void *)fsp->arg_2); + + const size_t ev_size = offsetof(struct ev_dt_t, fl_nm) + + sizeof(ev.fl_nm); + events.perf_submit(ctx, &ev, ev_size); + + tmp_i.delete(&pid_tid); + + return 0; +} diff --git a/src/ebpf/trace_head.c b/src/ebpf/trace_head.c new file mode 100644 index 000000000..ef4aec4ee --- /dev/null +++ b/src/ebpf/trace_head.c @@ -0,0 +1,54 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * trace_head.c -- Header for generated eBPF code. Uses BCC, eBPF. 
+ */
+
+#include
+#include
+#include
+
+#include "trace.h"
+
+/*
+ * Per-call scratch record: written by the entry handlers and read back by
+ * the matching exit handlers.
+ */
+struct first_step_t {
+	/* syscall arguments, captured with PT_REGS_PARM1..6 on entry */
+	s64 arg_1;
+	s64 arg_2;
+	s64 arg_3;
+	s64 arg_4;
+	s64 arg_5;
+	s64 arg_6;
+	/* bpf_ktime_get_ns() value taken on entry */
+	u64 start_ts_nsec;
+};
+
+/* in-flight calls, keyed by the bpf_get_current_pid_tgid() value */
+BPF_HASH(tmp_i, u64, struct first_step_t);
+/* perf output through which the exit handlers submit finished events */
+BPF_PERF_OUTPUT(events);
diff --git a/src/ebpf/trace_kern_tmpl.c b/src/ebpf/trace_kern_tmpl.c
new file mode 100644
index 000000000..85d13d5ef
--- /dev/null
+++ b/src/ebpf/trace_kern_tmpl.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * trace_kern_tmpl.c -- Trace syscalls with unknown numbers.
+ *		Uses BCC, eBPF.
+ */
+
+/*
+ * SYSCALL_NAME() entry handler.
+ *
+ * Records the entry timestamp and all six syscall arguments in 'tmp_i',
+ * keyed by the value returned by bpf_get_current_pid_tgid(), so that the
+ * exit handler can pair them with the return value.
+ *
+ * SYSCALL_NAME and PID_CHECK_HOOK are template placeholders; presumably
+ * they are substituted by the code generator before compilation
+ * -- TODO confirm.
+ */
+int
+kprobe__SYSCALL_NAME(struct pt_regs *ctx)
+{
+	struct first_step_t fs = {};
+	u64 pid_tid = bpf_get_current_pid_tgid();
+
+	PID_CHECK_HOOK
+
+	fs.start_ts_nsec = bpf_ktime_get_ns();
+	fs.arg_1 = PT_REGS_PARM1(ctx);
+	fs.arg_2 = PT_REGS_PARM2(ctx);
+	fs.arg_3 = PT_REGS_PARM3(ctx);
+	fs.arg_4 = PT_REGS_PARM4(ctx);
+	fs.arg_5 = PT_REGS_PARM5(ctx);
+	/* every argument gets its own slot; the exit handler reads arg_6 */
+	fs.arg_6 = PT_REGS_PARM6(ctx);
+
+	tmp_i.update(&pid_tid, &fs);
+
+	return 0;
+}
+
+/*
+ * SYSCALL_NAME() exit handler.
+ *
+ * Looks up the record saved by the entry handler, completes it with the
+ * finish timestamp and the return value, and submits the event through
+ * 'events'. The syscall has no known number here, so sc_id is set to -2
+ * and the syscall name is carried in the event itself.
+ * Returns 0 in every case; nothing is submitted when no entry record
+ * exists for the current pid_tid.
+ */
+int
+kretprobe__SYSCALL_NAME(struct pt_regs *ctx)
+{
+	struct first_step_t *fsp;
+	struct ev_dt_t ev = {};
+
+	u64 cur_nsec = bpf_ktime_get_ns();
+
+	u64 pid_tid = bpf_get_current_pid_tgid();
+	fsp = tmp_i.lookup(&pid_tid);
+	if (fsp == 0)
+		return 0;
+
+	ev.sc_id = -2; /* SysCall ID */
+	ev.arg_1 = fsp->arg_1;
+	ev.arg_2 = fsp->arg_2;
+	ev.arg_3 = fsp->arg_3;
+	ev.arg_4 = fsp->arg_4;
+	ev.arg_5 = fsp->arg_5;
+	ev.arg_6 = fsp->arg_6;
+	ev.pid_tid = pid_tid;
+	ev.start_ts_nsec = fsp->start_ts_nsec;
+	ev.finish_ts_nsec = cur_nsec;
+	ev.ret = PT_REGS_RC(ctx);
+	strcpy(ev.sc_name, "SYSCALL_NAME");
+
+	/* submit the event only up to and including the syscall name */
+	const size_t ev_size = offsetof(struct ev_dt_t, sc_name) +
+		sizeof(ev.sc_name);
+	events.perf_submit(ctx, &ev, ev_size);
+
+	tmp_i.delete(&pid_tid);
+
+	return 0;
+}
diff --git a/src/ebpf/trace_libc_tmpl.c
b/src/ebpf/trace_libc_tmpl.c new file mode 100644 index 000000000..9c7634db2 --- /dev/null +++ b/src/ebpf/trace_libc_tmpl.c @@ -0,0 +1,96 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * trace_libc_tmpl.c -- Trace syscalls with numbers known from libc. + * Uses BCC, eBPF. 
+ */
+
+/*
+ * SYSCALL_NAME() entry handler.
+ *
+ * Records the entry timestamp and all six syscall arguments in 'tmp_i',
+ * keyed by the value returned by bpf_get_current_pid_tgid(), so that the
+ * exit handler can pair them with the return value.
+ *
+ * SYSCALL_NAME and PID_CHECK_HOOK are template placeholders; presumably
+ * they are substituted by the code generator before compilation
+ * -- TODO confirm.
+ */
+int
+kprobe__SYSCALL_NAME(struct pt_regs *ctx)
+{
+	struct first_step_t fs = {};
+	u64 pid_tid = bpf_get_current_pid_tgid();
+
+	PID_CHECK_HOOK
+
+	fs.start_ts_nsec = bpf_ktime_get_ns();
+	fs.arg_1 = PT_REGS_PARM1(ctx);
+	fs.arg_2 = PT_REGS_PARM2(ctx);
+	fs.arg_3 = PT_REGS_PARM3(ctx);
+	fs.arg_4 = PT_REGS_PARM4(ctx);
+	fs.arg_5 = PT_REGS_PARM5(ctx);
+	/* every argument gets its own slot; the exit handler reads arg_6 */
+	fs.arg_6 = PT_REGS_PARM6(ctx);
+
+	tmp_i.update(&pid_tid, &fs);
+
+	return 0;
+}
+
+/*
+ * SYSCALL_NAME() exit handler.
+ *
+ * Looks up the record saved by the entry handler, completes it with the
+ * finish timestamp and the return value, and submits the event through
+ * 'events'. The syscall is identified by SYSCALL_NR, so the event is
+ * submitted only up to (not including) the sc_name member.
+ * Returns 0 in every case; nothing is submitted when no entry record
+ * exists for the current pid_tid.
+ */
+int
+kretprobe__SYSCALL_NAME(struct pt_regs *ctx)
+{
+	struct first_step_t *fsp;
+	struct ev_dt_t ev = {};
+
+	u64 cur_nsec = bpf_ktime_get_ns();
+
+	u64 pid_tid = bpf_get_current_pid_tgid();
+	fsp = tmp_i.lookup(&pid_tid);
+	if (fsp == 0)
+		return 0;
+
+	ev.sc_id = SYSCALL_NR; /* SysCall ID */
+	ev.arg_1 = fsp->arg_1;
+	ev.arg_2 = fsp->arg_2;
+	ev.arg_3 = fsp->arg_3;
+	ev.arg_4 = fsp->arg_4;
+	ev.arg_5 = fsp->arg_5;
+	ev.arg_6 = fsp->arg_6;
+	ev.pid_tid = pid_tid;
+	ev.start_ts_nsec = fsp->start_ts_nsec;
+	ev.finish_ts_nsec = cur_nsec;
+	ev.ret = PT_REGS_RC(ctx);
+
+	const size_t ev_size = offsetof(struct ev_dt_t, sc_name);
+	events.perf_submit(ctx, &ev, ev_size);
+
+	tmp_i.delete(&pid_tid);
+
+	return 0;
+}
diff --git a/src/ebpf/trace_tp_all.c b/src/ebpf/trace_tp_all.c
new file mode 100644
index 000000000..5ecf58005
--- /dev/null
+++ b/src/ebpf/trace_tp_all.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * trace_tp_all.c -- Trace syscalls. Uses BCC, eBPF.
+ */
+
+/*
+ * Syscall's entry handler.
+ *
+ * Saves the command name and the entry timestamp of the current task in
+ * 'tmp_i', keyed by the value returned by bpf_get_current_pid_tgid().
+ * Always returns 0; nothing is saved when the command name cannot be
+ * obtained.
+ *
+ * NOTE(review): 'comm' is not a member of the struct first_step_t declared
+ * in trace_head.c; this file looks like it needs its own record layout
+ * -- confirm before enabling 'trace=tp-all'.
+ */
+int
+tracepoint__sys_enter(struct pt_regs *ctx)
+{
+	struct first_step_t fs = {};
+	u64 pid_tid = bpf_get_current_pid_tgid();
+
+	PID_CHECK_HOOK
+
+	/* bpf_get_current_comm() returns 0 on success, negative on failure */
+	if (bpf_get_current_comm(&fs.comm, sizeof(fs.comm)) != 0)
+		return 0;
+
+	fs.start_ts_nsec = bpf_ktime_get_ns();
+	tmp_i.update(&pid_tid, &fs);
+
+	return 0;
+}
+
+/*
+ * Syscall's exit handler.
+ *
+ * Looks up the record saved by the entry handler for the current pid_tid,
+ * copies the command name and file name out of it, adds the timestamps and
+ * the return value, and submits the whole event through 'events'.
+ * Returns 0 in every case; nothing is submitted when no entry record
+ * exists.
+ *
+ * NOTE(review): 'comm' and 'fl_nm' are not members of the struct
+ * first_step_t declared in trace_head.c, and the entry handler in this
+ * file never stores a file name -- confirm the intended record layout.
+ */
+int
+tracepoint__sys_exit(struct pt_regs *ctx)
+{
+	struct first_step_t *fsp;
+	struct ev_dt_t ev = {};
+
+	u64 cur_nsec = bpf_ktime_get_ns();
+
+	u64 pid_tid = bpf_get_current_pid_tgid();
+	fsp = tmp_i.lookup(&pid_tid);
+	/* no matching entry record: the call was not seen on entry */
+	if (fsp == 0)
+		return 0;
+
+	bpf_probe_read(&ev.comm, sizeof(ev.comm), fsp->comm);
+	bpf_probe_read(&ev.open.fl_nm,
+			sizeof(ev.open.fl_nm),
+			(void *)fsp->fl_nm);
+	/* SysCall ID */
+	/* ev.sc_id = __NR_open; */
+	ev.pid_tid = pid_tid;
+	ev.start_ts_nsec = fsp->start_ts_nsec;
+	ev.finish_ts_nsec = cur_nsec;
+	ev.ret = PT_REGS_RC(ctx);
+
+	events.perf_submit(ctx, &ev, sizeof(ev));
+	/* the record is consumed; drop it so the map does not grow */
+	tmp_i.delete(&pid_tid);
+
+	return 0;
+}
diff --git a/src/file_sc_bench.c b/src/file_sc_bench.c
new file mode 100644
index 000000000..6fae43dd6
--- /dev/null
+++ b/src/file_sc_bench.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * file_sc_bench.c -- benchmark for strace.ebpf. This simple benchmark
+ *		allows us to measure and compare different tracing tools. It
+ *		has no dependencies and is compiled during 'make run'.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+/* signature of a benchmarked use case */
+typedef void (*tx_t)();
+
+/*
+ * The tested use case itself: one open/read/write/close sequence on
+ * /dev/null. Results of the calls are deliberately not checked; the
+ * sequence exists only to issue the syscalls. The descriptor is opened
+ * read-only, so the write() is presumably rejected by the kernel, but the
+ * syscall is still made.
+ */
+static void
+open_close()
+{
+	int fd;
+	int x;
+
+	fd = open("/dev/null", O_RDONLY);
+	/* 'x' doubles as the I/O buffer and as a sink for the results */
+	x = read(fd, &x, sizeof(x));
+	x = write(fd, &x, sizeof(x));
+	(void) close(fd);
+}
+
+/*
+ * This function runs and measures tested usecase.
+ *
+ * Calls 'tx_f' 'qty' times and, when 'f' is not NULL, writes the average
+ * time of one iteration (in nanoseconds) to 'f'. Passing NULL as 'f' runs
+ * the loop without reporting, which is used for warm-up.
+ */
+static void
+loop_tx(char *name, tx_t tx_f, uint64_t qty, FILE *f)
+{
+	uint64_t i;
+
+	uint64_t tu_start, tu_end, delta;
+	struct timeval tv_start, tv_end;
+
+	gettimeofday(&tv_start, NULL);
+
+	for (i = 0; i < qty; i++)
+		tx_f();
+
+	gettimeofday(&tv_end, NULL);
+
+	/* nothing to report to, or no iterations to average over */
+	if (NULL == f || 0 == qty)
+		return;
+
+	/* widen before multiplying so that 32-bit time_t cannot overflow */
+	tu_start = (uint64_t)tv_start.tv_sec * 1000000 +
+			(uint64_t)tv_start.tv_usec;
+	tu_end = (uint64_t)tv_end.tv_sec * 1000000 +
+			(uint64_t)tv_end.tv_usec;
+
+	/* microseconds -> nanoseconds */
+	delta = (tu_end - tu_start);
+	delta *= 1000;
+
+	fprintf(f, "%s: Iter time: %llu nsec\n", name,
+			(unsigned long long)(delta / qty));
+}
+
+/*
+ * BenchMark entry point.
+ *
+ * Expects exactly one argument: a positive number of iterations.
+ * Returns 0 on success and 1 on wrong usage.
+ */
+int
+main(int argc, char *argv[])
+{
+	long iters_arg;
+	uint64_t iters_qty;
+
+	if (argc != 2) {
+		printf("usage: %s iters qty\n", argv[0]);
+		return 1;
+	}
+
+	/* a negative value would turn into a huge unsigned loop count */
+	iters_arg = atol(argv[1]);
+	if (iters_arg <= 0) {
+		printf("usage: %s iters qty\n", argv[0]);
+		return 1;
+	}
+
+	iters_qty = (uint64_t)iters_arg;
+
+	/* WARM-UP */
+	loop_tx("open_read_write_close",
+			open_close, iters_qty / 10, NULL);
+	loop_tx(">>> open_read_write_close ",
+			open_close, iters_qty, stderr);
+
+	return 0;
+}
diff --git a/src/libstrace/Makefile b/src/libstrace/Makefile
new file mode 100644
index 000000000..dbfd38336
--- /dev/null
+++ b/src/libstrace/Makefile
@@ -0,0 +1,64 @@
+# Copyright 2016, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#
+#     * Neither the name of the copyright holder nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# src/libstrace/Makefile -- Makefile for libstrace
+#
+
+#TOP = ../../..
+
+LIBRARY_NAME = strace
+
+OBJS = bpf.o\
+	utils.o\
+	attach_probes.o \
+	ebpf_syscalls.o \
+	generate_ebpf.o \
+	print_event_cb.o \
+
+
+CFLAGS += $(shell $(PKG_CONFIG) --cflags libbcc)
+
+# XXX libbcc expects multi-threading safety. Currently it is required for
+# print_event_cb.o only, although we apply it to the whole application.
+CFLAGS += -pthread
+
+CFLAGS += -g -Wextra
+
+# the flag that silences overridden-initializer warnings is spelled
+# differently by clang and by other compilers
+ifeq ($(CC),clang)
+CFLAGS += -Wno-initializer-overrides
+else
+CFLAGS += -Wno-override-init
+endif
+
+CFLAGS += -I ../compat
+CFLAGS += -I ../ebpf
+
+include ../Makefile.inc
diff --git a/src/libstrace/README b/src/libstrace/README
new file mode 100644
index 000000000..fccf13d14
--- /dev/null
+++ b/src/libstrace/README
@@ -0,0 +1 @@
+This directory contains libstrace code.
diff --git a/src/libstrace/attach_probes.c b/src/libstrace/attach_probes.c new file mode 100644 index 000000000..6b40571c5 --- /dev/null +++ b/src/libstrace/attach_probes.c @@ -0,0 +1,451 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/*
+ * attach_probes.c -- attach_probes() function
+ */
+
+#include
+
+#include
+
+#include "bpf.h"
+#include "main.h"
+#include "utils.h"
+#include "attach_probes.h"
+#include "ebpf_syscalls.h"
+
+enum { HANDLER_NAME_MAX_SIZE = 128 };
+
+/*
+ * Attach the kretprobe/kprobe handler pair for a single syscall.
+ *
+ * 'sc_name' is the kernel symbol to probe; the eBPF handlers are expected
+ * to be named "kretprobe__<sc_name>" and "kprobe__<sc_name>". 'caller' is
+ * the name of the calling function and is used only in error messages.
+ *
+ * Returns true when both handlers were attached. A failure is reported on
+ * stderr and is not fatal.
+ */
+static bool
+attach_kp_pair(struct bpf_ctx *b, const char *caller, const char *sc_name)
+{
+	int res;
+	char kprobe[HANDLER_NAME_MAX_SIZE];
+	char kretprobe[HANDLER_NAME_MAX_SIZE];
+
+	snprintf(kprobe, sizeof(kprobe), "kprobe__%s", sc_name);
+	snprintf(kretprobe, sizeof(kretprobe), "kretprobe__%s", sc_name);
+
+	/* KRetProbe should be first to prevent race condition */
+	res = load_fn_and_attach_to_kretp(b, sc_name, kretprobe,
+			args.pid, 0, -1);
+
+	if (res == -1) {
+		fprintf(stderr,
+			"ERROR:%s:Can't attach %s to '%s'. Ignoring.\n",
+			caller, kretprobe, sc_name);
+
+		/* Kretprobe fails. There is no reason to try probe */
+		return false;
+	}
+
+	res = load_fn_and_attach_to_kp(b, sc_name, kprobe,
+			args.pid, 0, -1);
+
+	if (res == -1) {
+		fprintf(stderr,
+			"ERROR:%s:Can't attach %s to '%s'. Ignoring.\n",
+			caller, kprobe, sc_name);
+
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Attach handlers to every syscall in sc_tbl which has a handler name and
+ * whose 'masks' contain all bits of 'mask'. A 'mask' of 0 selects every
+ * named entry. 'caller' is used only in error messages.
+ *
+ * Returns true when at least one syscall was attached.
+ */
+static bool
+attach_kp_by_mask(struct bpf_ctx *b, const char *caller,
+		unsigned long long mask)
+{
+	unsigned succ_counter = 0;
+
+	for (unsigned i = 0; i < SC_TBL_SIZE; i++) {
+		if (NULL == sc_tbl[i].hlr_name)
+			continue;
+
+		if (mask != (mask & sc_tbl[i].masks))
+			continue;
+
+		if (attach_kp_pair(b, caller, sc_tbl[i].hlr_name))
+			succ_counter ++;
+	}
+
+	return succ_counter > 0;
+}
+
+/*
+ * This function attaches eBPF handler to each syscall known to libc.
+ *
+ * It can be useful because kernel has a lot of "unused" syscalls.
+ */
+static bool
+attach_kp_libc_all(struct bpf_ctx *b)
+{
+	return attach_kp_by_mask(b, __func__, 0);
+}
+
+/* XXX HACK: this syscall is exported by kernel twice. */
+static unsigned SyS_sigsuspend = 0;
+
+/*
+ * This function attaches eBPF handler to all existing syscalls in running
+ * kernel. It consumes more time than attach_kp_libc_all().
+ *
+ * Candidate names are read line by line from the file named by
+ * debug_tracing_aff and filtered with is_a_sc(). Returns true when at
+ * least one syscall was attached, false when the file cannot be opened or
+ * nothing was attached.
+ */
+static bool
+attach_kp_kern_all(struct bpf_ctx *b)
+{
+	unsigned succ_counter = 0;
+
+	char *line = NULL;
+	size_t len = 0;
+	ssize_t read;
+
+	FILE *in = fopen(debug_tracing_aff, "r");
+
+	if (NULL == in) {
+		fprintf(stderr, "%s: ERROR: '%m'\n", __func__);
+		return false;
+	}
+
+	while ((read = getline(&line, &len, in)) != -1) {
+		if (!is_a_sc(line, read - 1))
+			continue;
+
+		/* cut the last character of the line (the newline) */
+		line [read - 1] = '\0';
+
+		/* XXX HACK: this syscall is exported by kernel twice. */
+		if (!strcasecmp("SyS_sigsuspend", line)) {
+			if (SyS_sigsuspend)
+				continue;
+
+			SyS_sigsuspend ++;
+		}
+
+		if (attach_kp_pair(b, __func__, line))
+			succ_counter ++;
+	}
+
+	free(line);
+	fclose(in);
+
+	return succ_counter > 0;
+}
+
+/*
+ * This function attaches eBPF handler to each syscall which operates on file
+ * descriptor. Inspired by: 'strace -e trace=desc'
+ */
+static bool
+attach_kp_desc(struct bpf_ctx *b)
+{
+	return attach_kp_by_mask(b, __func__, EM_desc);
+}
+
+/*
+ * This function attaches eBPF handler to each syscall which operates on
+ * filenames. Inspired by 'strace -e trace=file'.
+ */
+static bool
+attach_kp_file(struct bpf_ctx *b)
+{
+	return attach_kp_by_mask(b, __func__, EM_file);
+}
+
+/*
+ * This function attaches eBPF handler to each syscall which operates on
+ * relative file path. There are no equivalents in strace.
+ */
+static bool
+attach_kp_fileat(struct bpf_ctx *b)
+{
+	return attach_kp_by_mask(b, __func__, EM_fileat);
+}
+
+/*
+ * Attach eBPF handlers to all file-related syscalls. Inspired by:
+ * 'strace -e trace=desc,file'
+ *
+ * All three groups are always tried; the result is true when at least one
+ * of them attached something.
+ */
+static bool
+attach_kp_pmemfile(struct bpf_ctx *b)
+{
+	bool res = false;
+
+	res |= attach_kp_desc(b);
+	res |= attach_kp_file(b);
+	res |= attach_kp_fileat(b);
+
+	return res;
+}
+
+static const char tp_all_category[] = "raw_syscalls";
+static const char tp_all_enter_name[] = "sys_enter";
+static const char tp_all_exit_name[] = "sys_exit";
+static const char tp_all_enter_fn[] = "tracepoint__sys_enter";
+static const char tp_all_exit_fn[] = "tracepoint__sys_exit";
+
+/*
+ * Intercept all syscalls of running kernel using TracePoint way.
+ * Should be faster and better but require at kernel at least 4.6.
+ *
+ * XXX Not tested.
+ */
+static bool
+attach_tp_all(struct bpf_ctx *b)
+{
+	int res;
+
+	/*
+	 * 'sys_exit' should be first to prevent race condition: the entry
+	 * handler stores records which only the exit handler removes.
+	 */
+	res = load_fn_and_attach_to_tp(b, tp_all_category, tp_all_exit_name,
+			tp_all_exit_fn, args.pid, 0, -1);
+
+	if (res == -1) {
+		fprintf(stderr,
+			"ERROR:%s:Can't attach %s to '%s:%s'. Exiting.\n",
+			__func__, tp_all_exit_fn,
+			tp_all_category, tp_all_exit_name);
+
+		/* events are submitted on exit only; nothing would be seen */
+		return false;
+	}
+
+	res = load_fn_and_attach_to_tp(b, tp_all_category, tp_all_enter_name,
+			tp_all_enter_fn, args.pid, 0, -1);
+
+	if (res == -1) {
+		fprintf(stderr,
+			"ERROR:%s:Can't attach %s to '%s:%s'. Exiting.\n",
+			__func__, tp_all_enter_fn,
+			tp_all_category, tp_all_enter_name);
+
+		/* without entry records the exit handler reports nothing */
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * This function parses and processes expression.
+ *
+ * The expression is taken from args.expr and compared case-insensitively.
+ * A missing or unrecognized expression selects all syscalls of the running
+ * kernel. Returns the result of the selected attach function.
+ *
+ * XXX Think about applying 'fn_name' via str_replace_all()
+ *    to be more consistent
+ */
+bool
+attach_probes(struct bpf_ctx *b)
+{
+	/* no expression given: use the default set */
+	if (NULL == args.expr)
+		return attach_kp_kern_all(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-libc-all"))
+		return attach_kp_libc_all(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-kern-all"))
+		return attach_kp_kern_all(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-file"))
+		return attach_kp_file(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-desc"))
+		return attach_kp_desc(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-fileat"))
+		return attach_kp_fileat(b);
+
+	if (!strcasecmp(args.expr, "trace=kp-pmemfile"))
+		return attach_kp_pmemfile(b);
+
+	if (!strcasecmp(args.expr, "trace=tp-all"))
+		return attach_tp_all(b);
+
+	/* unrecognized expression: same default as no expression */
+	return attach_kp_kern_all(b);
+}
diff --git a/src/libstrace/attach_probes.h b/src/libstrace/attach_probes.h
new file mode 100644
index 000000000..ad8f55381
--- /dev/null
+++ b/src/libstrace/attach_probes.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions
of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/*
+ * attach_probes.h -- attach_probes() function
+ */
+
+#ifndef ATTACH_PROBES_H
+#define ATTACH_PROBES_H
+
+#include
+
+#include "bpf.h"
+
+/*
+ * Attach eBPF handlers to the set of syscalls selected by the expression
+ * in args.expr. Returns the result of the attach function chosen for that
+ * expression.
+ */
+bool attach_probes(struct bpf_ctx *b);
+
+#endif /* ATTACH_PROBES_H */
diff --git a/src/libstrace/bpf.c b/src/libstrace/bpf.c
new file mode 100644
index 000000000..5ccffdeca
--- /dev/null
+++ b/src/libstrace/bpf.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * bpf.c -- functions related to struct bpf_ctx
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "bpf.h"
+#include "main.h"
+
+
+/*
+ * pr_arr_check_quota -- check whether 'new_pr_qty' more perf readers can be
+ * registered in sbcp->pr_arr without exceeding args.pr_arr_max.
+ *
+ * Becomes relevant if we ever intercept something more low-level than regular
+ * syscalls.
+ */
+static bool pr_arr_check_quota(struct bpf_ctx *sbcp, unsigned new_pr_qty)
+{
+	return sbcp->pr_arr_qty + new_pr_qty <= args.pr_arr_max;
+}
+
+/*
+ * append_item_to_pr_arr -- save a reference to the handler of an intercepted
+ * syscall in pr_arr.
+ *
+ * 'name' is copied into the new item. 'attached' tells detach_all() whether
+ * the item has to be detached by name as a kprobe.
+ * pr_arr itself is allocated lazily, on the first call, with room for
+ * args.pr_arr_max items; callers are expected to check the quota with
+ * pr_arr_check_quota() beforehand.
+ *
+ * On allocation failure an error is reported and the item is not registered.
+ */
+static void append_item_to_pr_arr(struct bpf_ctx *sbcp, const char *name,
+		struct perf_reader *probe, bool attached)
+{
+	/* the key is stored in the flexible array member at the end */
+	struct bpf_pr *item =
+		calloc(1, sizeof(*item) + strlen(name) + 1);
+
+	if (NULL == item) {
+		fprintf(stderr, "ERROR:%s:Out of memory: %m\n", __func__);
+		return;
+	}
+
+	if (NULL == sbcp->pr_arr) {
+		sbcp->pr_arr =
+			calloc(args.pr_arr_max, sizeof(*sbcp->pr_arr));
+
+		if (NULL == sbcp->pr_arr) {
+			fprintf(stderr, "ERROR:%s:Out of memory: %m\n",
+				__func__);
+			free(item);
+			return;
+		}
+	}
+
+	item->pr = probe;
+	item->attached = attached;
+	strcpy(item->key, name);
+
+	sbcp->pr_arr[sbcp->pr_arr_qty] = item;
+	sbcp->pr_arr_qty += 1;
+}
+
+/*
+ * attach_callback_to_perf_output -- register callback to capture stream of
+ * events.
+ *
+ * Opens one perf buffer per online CPU, stores its fd in the BPF table 'name'
+ * under the CPU number and registers the reader in pr_arr.
+ * CPUs on which the buffer can not be opened are skipped with a warning.
+ *
+ * Returns 0 on success and -1 if the table is missing, is not
+ * a BPF_MAP_TYPE_PERF_EVENT_ARRAY, the number of CPUs can not be obtained
+ * or the perf reader quota would be exceeded.
+ */
+int
+attach_callback_to_perf_output(struct bpf_ctx *sbcp,
+		const char *name, perf_reader_raw_cb callback)
+{
+	int map_fd = bpf_table_fd(sbcp->module, name);
+
+	if (map_fd < 0) {
+		fprintf(stderr,
+			"ERROR:%s:Can't attach to perf output '%s':%m.\n",
+			__func__, name);
+		return -1;
+	}
+
+	size_t map_id = bpf_table_id(sbcp->module, name);
+	int ttype = bpf_table_type_id(sbcp->module, map_id);
+
+	if (ttype != BPF_MAP_TYPE_PERF_EVENT_ARRAY) {
+		fprintf(stderr, "ERROR:%s:Unknown table type %d.\n",
+			__func__, ttype);
+		return -1;
+	}
+
+	/*
+	 * XXX It can be reasonable to replace sysconf with sched_getaffinity().
+	 * It will allow us to ignore non-actual CPUs.
+	 */
+	long cpu_qty = sysconf(_SC_NPROCESSORS_ONLN);
+
+	/*
+	 * sysconf() returns -1 on failure; casting that to unsigned below
+	 * would make the quota check meaningless.
+	 */
+	if (cpu_qty < 1) {
+		fprintf(stderr,
+			"ERROR:%s:Can't get number of online CPUs: %m.\n",
+			__func__);
+		return -1;
+	}
+
+	if (!pr_arr_check_quota(sbcp, (unsigned)cpu_qty)) {
+		fprintf(stderr,
+			"ERROR:%s:Number of perf readers would exceed"
+			" global quota: %d\n",
+			__func__, args.pr_arr_max);
+
+		return -1;
+	}
+
+	for (int cpu = 0; cpu < cpu_qty; cpu++) {
+		char reader_name[128];
+
+		struct perf_reader *reader =
+			bpf_open_perf_buffer(callback, NULL, -1, cpu);
+
+		if (NULL == reader) {
+			fprintf(stderr,
+				"WARNING:%s:"
+				"Could not open perf buffer on cpu %d."
+				" Ignored.\n",
+				__func__, cpu);
+			continue;
+		}
+
+		int fd = perf_reader_fd(reader);
+
+		int res = bpf_update_elem(map_fd, &cpu, &fd, 0);
+
+		if (res < 0) {
+			fprintf(stderr,
+				"WARNING:%s:"
+				"Could not update table on cpu %d: %m."
+				" Ignored.\n",
+				__func__, cpu);
+		}
+
+		/* the reader is registered anyway, so detach_all() frees it */
+		snprintf(reader_name, sizeof(reader_name), "%p:%d", sbcp, cpu);
+		append_item_to_pr_arr(sbcp, reader_name, reader, false);
+	}
+
+	return 0;
+}
+
+/*
+ * detach_all -- overall resource cleanup.
+ *
+ * Frees every registered perf reader, detaches the kprobes, destroys the
+ * BPF module and finally frees 'b' itself, so 'b' must not be used afterwards.
+ *
+ * WARNING We really need explicit cleanup to prevent in-kernel memory leaks.
+ * Yes, there still are kernel bugs related to eBPF.
+ */
+void
+detach_all(struct bpf_ctx *b)
+{
+	fprintf(stderr,
+		"INFO: Detaching. PLEASE wait."
+		" It can hold few tens of seconds.\n");
+
+	for (unsigned i = 0; i < b->pr_arr_qty; i++) {
+		perf_reader_free(b->pr_arr[i]->pr);
+
+		/* non-attached keys here include the perf_events reader */
+		if (b->pr_arr[i]->attached) {
+			char desc[256];
+
+			snprintf(desc, sizeof(desc),
+				"-:kprobes/%s", b->pr_arr[i]->key);
+			bpf_detach_kprobe(desc);
+		}
+
+		free(b->pr_arr[i]);
+	}
+
+	bpf_module_destroy(b->module);
+
+	free(b->pr_arr);
+	free(b);
+}
+
+/*
+ * load_obj_code_into_ebpf_vm -- load eBPF object code of function 'func_name'
+ * into the kernel VM.
+ *
+ * If sbcp->debug is set, a log buffer is passed to bpf_prog_load() and its
+ * content is printed to stderr.
+ *
+ * Returns the fd of the loaded program or -1 on failure.
+ */
+static int
+load_obj_code_into_ebpf_vm(struct bpf_ctx *sbcp, const char *func_name,
+		enum bpf_prog_type prog_type)
+{
+	int fd = -1;
+	void *bfs_res = bpf_function_start(sbcp->module, func_name);
+
+	if (NULL == bfs_res) {
+		fprintf(stderr, "%s: Unknown program %s\n",
+			__func__, func_name);
+		return -1;
+	}
+
+	unsigned log_buf_size = 0;
+	char *log_buf = NULL;
+
+	if (sbcp->debug) {
+		log_buf = calloc(1, 65536);
+
+		/* if there is no memory for the log, load without a log */
+		if (NULL != log_buf)
+			log_buf_size = 65536;
+	}
+
+	fd = bpf_prog_load(prog_type,
+		bfs_res,
+		(int)bpf_function_size(sbcp->module, func_name),
+		bpf_module_license(sbcp->module),
+		bpf_module_kern_version(sbcp->module),
+		log_buf, log_buf_size);
+
+	if (NULL != log_buf) {
+		/* XXX Command line options to save it to separate file */
+		fprintf(stderr, "DEBUG:%s('%s'):\n%s\n",
+			__func__, func_name, log_buf);
+	}
+
+	if (fd < 0) {
+		fprintf(stderr,
+			"ERROR:%s:Failed to load BPF program %s: %m\n",
+			__func__, func_name);
+
+		/* freed only after the message, which relies on errno (%m) */
+		free(log_buf);
+
+		return -1;
+	}
+
+	free(log_buf);
+
+	return fd;
+}
+
+/*
+ * chr_replace -- replace every occurrence of character 'tmpl' in string 'str'
+ * with 'ch', in place. A NULL 'str' is ignored.
+ */
+static void
+chr_replace(char *str, const char tmpl, const char ch)
+{
+	if (NULL == str)
+		return;
+
+	for (; *str; str++)
+		if (tmpl == *str)
+			*str = ch;
+}
+
+/*
+ * Load ebpf function code into VM and attach it to syscall exit point using
+ * KProbe.
+ */
+/*
+ * NOTE: the "p:" prefix of the probe definition built below defines a kprobe,
+ * i.e. the handler is attached to the entry of 'event', not to its exit.
+ *
+ * The probe is registered in pr_arr under the name "p_<event>" in which
+ * every '+' and '.' is replaced with '_'.
+ *
+ * Returns 0 on success and -1 if the perf reader quota would be exceeded,
+ * the program can not be loaded, memory is exhausted or attaching fails.
+ */
+int
+load_fn_and_attach_to_kp(struct bpf_ctx *sbcp,
+		const char *event, const char *fn_name,
+		pid_t pid, unsigned cpu, int group_fd)
+{
+	char desc[256];
+	struct perf_reader *pr;
+	int fn_fd;
+
+	if (!pr_arr_check_quota(sbcp, 1)) {
+		fprintf(stderr,
+			"ERROR:%s:Number of perf readers would exceed"
+			" global quota: %d\n",
+			__func__, args.pr_arr_max);
+
+		return -1;
+	}
+
+	fn_fd = load_obj_code_into_ebpf_vm(sbcp, fn_name, BPF_PROG_TYPE_KPROBE);
+	if (fn_fd == -1) {
+		return -1;
+	}
+
+	/* "p_" + event + terminating NUL */
+	char *ev_name = calloc(1, 2 + strlen(event) + 1);
+
+	if (NULL == ev_name) {
+		fprintf(stderr, "ERROR:%s:Out of memory: %m\n", __func__);
+
+		close(fn_fd);
+
+		return -1;
+	}
+
+	strcpy(ev_name, "p_");
+	strcat(ev_name, event);
+	chr_replace(ev_name, '+', '_');
+	chr_replace(ev_name, '.', '_');
+
+	snprintf(desc, sizeof(desc), "p:kprobes/%s %s", ev_name, event);
+
+	pr = bpf_attach_kprobe(fn_fd, ev_name, desc, pid, (int)cpu, group_fd,
+			NULL, NULL);
+
+	if (NULL == pr) {
+		fprintf(stderr,
+			"ERROR:%s:Failed to attach eBPF function '%s'"
+			" to kprobe '%s': %m\n",
+			__func__, fn_name, event);
+
+		/* cleanup goes after the message, which relies on errno (%m) */
+		free(ev_name);
+		close(fn_fd);
+
+		return -1;
+	}
+
+	append_item_to_pr_arr(sbcp, ev_name, pr, true);
+
+	free(ev_name);
+
+	return 0;
+}
+
+/*
+ * Load ebpf function code into VM and attach it to syscall exit point using
+ * KProbe.
+ */
+/*
+ * NOTE: the "r:" prefix of the probe definition built below defines a return
+ * probe (kretprobe), i.e. the handler runs when 'event' returns.
+ *
+ * The probe is registered in pr_arr under the name "r_<event>" in which
+ * every '+' and '.' is replaced with '_'.
+ *
+ * Returns 0 on success and -1 if the perf reader quota would be exceeded,
+ * the program can not be loaded, memory is exhausted or attaching fails.
+ */
+int
+load_fn_and_attach_to_kretp(struct bpf_ctx *sbcp,
+		const char *event, const char *fn_name,
+		pid_t pid, unsigned cpu, int group_fd)
+{
+	char desc[256];
+	struct perf_reader *pr;
+	int fn_fd;
+
+	if (!pr_arr_check_quota(sbcp, 1)) {
+		fprintf(stderr,
+			"ERROR:%s:Number of perf readers would exceed"
+			" global quota: %d\n",
+			__func__, args.pr_arr_max);
+
+		return -1;
+	}
+
+	fn_fd = load_obj_code_into_ebpf_vm(sbcp, fn_name, BPF_PROG_TYPE_KPROBE);
+	if (fn_fd == -1) {
+		return -1;
+	}
+
+	/* "r_" + event + terminating NUL */
+	char *ev_name = calloc(1, 2 + strlen(event) + 1);
+
+	if (NULL == ev_name) {
+		fprintf(stderr, "ERROR:%s:Out of memory: %m\n", __func__);
+
+		close(fn_fd);
+
+		return -1;
+	}
+
+	strcpy(ev_name, "r_");
+	strcat(ev_name, event);
+	chr_replace(ev_name, '+', '_');
+	chr_replace(ev_name, '.', '_');
+
+	snprintf(desc, sizeof(desc), "r:kprobes/%s %s", ev_name, event);
+
+	pr = bpf_attach_kprobe(fn_fd, ev_name, desc, pid, (int)cpu, group_fd,
+			NULL, NULL);
+
+	if (NULL == pr) {
+		fprintf(stderr,
+			"ERROR:%s:Failed to attach eBPF function '%s'"
+			" to kprobe '%s': %m\n",
+			__func__, fn_name, event);
+
+		/*
+		 * ev_name used to be leaked on this path.
+		 * Cleanup goes after the message, which relies on errno (%m).
+		 */
+		free(ev_name);
+		close(fn_fd);
+
+		return -1;
+	}
+
+	append_item_to_pr_arr(sbcp, ev_name, pr, true);
+
+	free(ev_name);
+
+	return 0;
+}
+
+/*
+ * Load ebpf function code into VM and attach it to syscall exit point using
+ * TracePoint.
+ */
+/*
+ * The handler 'fn_name' is loaded as BPF_PROG_TYPE_TRACEPOINT and attached to
+ * tracepoint 'tp_category':'tp_name'. The reader is registered in pr_arr
+ * under the name "<tp_category>:<tp_name>" as a non-kprobe item.
+ *
+ * Returns 0 on success and -1 if the perf reader quota would be exceeded,
+ * the program can not be loaded, memory is exhausted or attaching fails.
+ */
+int
+load_fn_and_attach_to_tp(struct bpf_ctx *sbcp,
+		const char *tp_category, const char *tp_name,
+		const char *fn_name,
+		int pid, unsigned cpu, int group_fd)
+{
+	if (!pr_arr_check_quota(sbcp, 1)) {
+		fprintf(stderr,
+			"ERROR:%s:Number of perf readers would exceed"
+			" global quota: %d\n",
+			__func__, args.pr_arr_max);
+
+		return -1;
+	}
+
+	int fn_fd = load_obj_code_into_ebpf_vm(sbcp,
+				fn_name, BPF_PROG_TYPE_TRACEPOINT);
+
+	/* do not try to attach a program which failed to load */
+	if (fn_fd == -1) {
+		return -1;
+	}
+
+	/*
+	 * The name is prepared before attaching, so that an allocation
+	 * failure can not leave an attached but unregistered reader behind.
+	 */
+	char *ev_name = calloc(1,
+			strlen(tp_category) + 1 + strlen(tp_name) + 1);
+
+	if (NULL == ev_name) {
+		fprintf(stderr, "ERROR:%s:Out of memory: %m\n", __func__);
+
+		close(fn_fd);
+
+		return -1;
+	}
+
+	strcpy(ev_name, tp_category);
+	strcat(ev_name, ":");
+	strcat(ev_name, tp_name);
+
+	struct perf_reader *pr = bpf_attach_tracepoint(fn_fd,
+					tp_category, tp_name,
+					pid, (int)cpu, group_fd, NULL, NULL);
+
+	if (NULL == pr) {
+		fprintf(stderr,
+			"ERROR:%s:Failed to attach eBPF function '%s'"
+			" to tracepoint '%s:%s': %m\n",
+			__func__, fn_name, tp_category, tp_name);
+
+		/* cleanup goes after the message, which relies on errno (%m) */
+		free(ev_name);
+		close(fn_fd);
+
+		return -1;
+	}
+
+	/* XXX May be we should mark this pr with some specific numeric code */
+	append_item_to_pr_arr(sbcp, ev_name, pr, false);
+
+	free(ev_name);
+
+	return 0;
+}
diff --git a/src/libstrace/bpf.h b/src/libstrace/bpf.h
new file mode 100644
index 000000000..53c544388
--- /dev/null
+++ b/src/libstrace/bpf.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * bpf.h -- Key bpf_ctx structure and related functions
+ */
+
+/* PLEASE do not rename this macro to BPF_H. There is a conflict. */
+#ifndef __BPF_H
+#define __BPF_H
+
+#include
+#include
+#include
+
+#include
+
+/*
+ * One registered perf reader: either a probe handler or a perf output reader.
+ * Items are allocated with room for the key right behind the structure.
+ */
+struct bpf_pr {
+	/* the reader itself; released with perf_reader_free() in detach_all() */
+	struct perf_reader *pr;
+
+	/*
+	 * True for kprobes and kretprobes: detach_all() detaches such items
+	 * with bpf_detach_kprobe() using 'key' as the event name.
+	 *
+	 * XXX May be we should replace this field with some
+	 * enum perf_reader_type_t as soon as tracepoints
+	 * will be fixed.
+	 */
+	bool attached;
+	/* NUL-terminated name of the item (flexible array member) */
+	char key[];
+};
+
+/*
+ * Context of one loaded eBPF module together with all readers attached to it.
+ */
+struct bpf_ctx {
+	/* module handle passed to the bpf_*() functions of the BPF library */
+	void *module;
+	/* non-zero: print the log of the program loader to stderr */
+	unsigned debug;
+	/* registered readers; allocated on first use for args.pr_arr_max items */
+	struct bpf_pr **pr_arr;
+	/* number of items used in pr_arr */
+	unsigned pr_arr_qty;
+};
+
+/*
+ * Opens a perf buffer on every online CPU, binds it to the BPF table
+ * 'perf_event' and delivers raw events to 'callback'.
+ * Returns 0 on success, -1 on failure.
+ */
+int attach_callback_to_perf_output(struct bpf_ctx *sbcp,
+		const char *perf_event, perf_reader_raw_cb callback);
+
+/*
+ * Loads eBPF function 'fn_name' and attaches it to 'event' with a
+ * "p:kprobes/..." probe definition. Returns 0 on success, -1 on failure.
+ */
+int load_fn_and_attach_to_kp(struct bpf_ctx *sbcp,
+		const char *event, const char *fn_name,
+		pid_t pid, unsigned cpu, int group_fd);
+
+/*
+ * Loads eBPF function 'fn_name' and attaches it to 'event' with a
+ * "r:kprobes/..." probe definition. Returns 0 on success, -1 on failure.
+ */
+int load_fn_and_attach_to_kretp(struct bpf_ctx *sbcp,
+		const char *event, const char *fn_name,
+		pid_t pid, unsigned cpu, int group_fd);
+
+/*
+ * Loads eBPF function 'fn_name' and attaches it to tracepoint
+ * 'tp_category':'tp_name'. Returns 0 on success, -1 on failure.
+ */
+int load_fn_and_attach_to_tp(struct bpf_ctx *sbcp,
+		const char *tp_category, const char *tp_name,
+		const char *fn_name,
+		int pid, unsigned cpu, int group_fd);
+
+/*
+ * Releases all readers, destroys the module and frees 'b' itself.
+ */
+void detach_all(struct bpf_ctx *b);
+
+#endif /* __BPF_H */
diff --git a/src/libstrace/ebpf_syscalls.c b/src/libstrace/ebpf_syscalls.c
new file mode 100644
index 000000000..8d286f5c7
--- /dev/null
+++ b/src/libstrace/ebpf_syscalls.c
@@ -0,0 +1,455 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * ebpf_syscalls.c -- a table of glibc-supported syscalls
+ */
+
+#include
+
+#include "ebpf_syscalls.h"
+
+
+/*
+ * Each EBPF_SYSCALL*() macro expands to one designated initializer of sc_tbl,
+ * trailing comma included. The entry is indexed by the syscall number 'nr'
+ * and stores the number, its spelling (#nr) and the name of the kernel handler
+ * (#sym). The variants differ only in the value stored in .masks:
+ *
+ *	EBPF_SYSCALL		0
+ *	EBPF_SYSCALL_FILE	EM_file
+ *	EBPF_SYSCALL_FILEAT	EM_fileat
+ *	EBPF_SYSCALL_DESC	EM_desc
+ *
+ * Example: EBPF_SYSCALL(__NR_setxattr, sys_setxattr)
+ */
+#define EBPF_SYSCALL(nr, sym) [nr] = {\
+	.num = nr, \
+	.num_name = #nr, \
+	.hlr_name = #sym, \
+	.masks = 0 },
+
+#define EBPF_SYSCALL_FILE(nr, sym) [nr] = {\
+	.num = nr, \
+	.num_name = #nr, \
+	.hlr_name = #sym, \
+	.masks = EM_file },
+
+#define EBPF_SYSCALL_FILEAT(nr, sym) [nr] = {\
+	.num = nr, \
+	.num_name = #nr, \
+	.hlr_name = #sym, \
+	.masks = EM_fileat },
+
+#define EBPF_SYSCALL_DESC(nr, sym) [nr] = {\
+	.num = nr, \
+	.num_name = #nr, \
+	.hlr_name = #sym, \
+	.masks = EM_desc },
+
+/*
+ * Placeholder for slots without a known syscall: the NULL .hlr_name is what
+ * the users of the table check to skip such a slot.
+ */
+#define SC_NI { .num = SC_TBL_SIZE, \
+	.num_name = "NI", \
+	.hlr_name = NULL }
+
+/*
+ * The table is indexed by syscall number. Every slot is first set to SC_NI
+ * and then overridden by the entries below.
+ *
+ * Commented-out entries are syscalls which exist in the kernel but for which
+ * glibc does not provide __NR_* and SYS_* macros.
+ */
+struct sc_t sc_tbl[SC_TBL_SIZE] = {
+	[0 ... SC_TBL_SIZE - 1] = SC_NI,
+	/*
+	 * [__NR_open] =
+	 * { .num = __NR_open, .hlr_name = "SyS_open", .masks = EM_file },
+	 * [__NR_read] =
+	 * { .num = __NR_read, .hlr_name = "SyS_read", .masks = EM_desc },
+	 * [__NR_write] =
+	 * { .num = __NR_write, .hlr_name = "SyS_write", .masks = EM_desc },
+	 * [__NR_close] =
+	 * { .num = __NR_close, .hlr_name = "SyS_close", .masks = EM_desc },
+	 */
+
+EBPF_SYSCALL(__NR_arch_prctl, sys_arch_prctl)
+EBPF_SYSCALL(__NR_rt_sigreturn, sys_rt_sigreturn)
+EBPF_SYSCALL(__NR_ioperm, sys_ioperm)
+EBPF_SYSCALL(__NR_iopl, SyS_iopl)
+EBPF_SYSCALL(__NR_modify_ldt, sys_modify_ldt)
+EBPF_SYSCALL_DESC(__NR_mmap, SyS_mmap)
+EBPF_SYSCALL(__NR_set_thread_area, SyS_set_thread_area)
+EBPF_SYSCALL(__NR_get_thread_area, SyS_get_thread_area)
+EBPF_SYSCALL(__NR_set_tid_address, SyS_set_tid_address)
+EBPF_SYSCALL(__NR_fork, sys_fork)
+EBPF_SYSCALL(__NR_vfork, sys_vfork)
+EBPF_SYSCALL(__NR_clone, SyS_clone)
+EBPF_SYSCALL(__NR_unshare, SyS_unshare)
+EBPF_SYSCALL(__NR_personality, SyS_personality)
+EBPF_SYSCALL(__NR_exit, SyS_exit)
+EBPF_SYSCALL(__NR_exit_group, SyS_exit_group)
+EBPF_SYSCALL(__NR_waitid, SyS_waitid)
+EBPF_SYSCALL(__NR_wait4, SyS_wait4)
+/* EBPF_SYSCALL(__NR_waitpid, SyS_waitpid) */
+EBPF_SYSCALL(__NR__sysctl, SyS_sysctl)
+EBPF_SYSCALL(__NR_capget, SyS_capget)
+EBPF_SYSCALL(__NR_capset, SyS_capset)
+EBPF_SYSCALL(__NR_ptrace, SyS_ptrace)
+EBPF_SYSCALL(__NR_restart_syscall, sys_restart_syscall)
+EBPF_SYSCALL(__NR_rt_sigprocmask, SyS_rt_sigprocmask)
+EBPF_SYSCALL(__NR_rt_sigpending, SyS_rt_sigpending)
+EBPF_SYSCALL(__NR_rt_sigtimedwait, SyS_rt_sigtimedwait)
+EBPF_SYSCALL(__NR_kill, SyS_kill)
+EBPF_SYSCALL(__NR_tgkill, SyS_tgkill)
+EBPF_SYSCALL(__NR_tkill, SyS_tkill)
+EBPF_SYSCALL(__NR_rt_sigqueueinfo, SyS_rt_sigqueueinfo)
+EBPF_SYSCALL(__NR_rt_tgsigqueueinfo, SyS_rt_tgsigqueueinfo)
+EBPF_SYSCALL(__NR_sigaltstack, SyS_sigaltstack)
+/* EBPF_SYSCALL(__NR_sigpending, SyS_sigpending) */
+/* EBPF_SYSCALL(__NR_sigprocmask, SyS_sigprocmask) */
+EBPF_SYSCALL(__NR_rt_sigaction, SyS_rt_sigaction)
+/* EBPF_SYSCALL(__NR_sgetmask, sys_sgetmask) */
+/* EBPF_SYSCALL(__NR_ssetmask, SyS_ssetmask) */
+/* EBPF_SYSCALL(__NR_signal, SyS_signal) */
+EBPF_SYSCALL(__NR_pause, sys_pause)
+EBPF_SYSCALL(__NR_rt_sigsuspend, SyS_rt_sigsuspend)
+/* EBPF_SYSCALL(__NR_sigsuspend, SyS_sigsuspend) */
+/* EBPF_SYSCALL(__NR_sigsuspend, SyS_sigsuspend) */
+EBPF_SYSCALL(__NR_setpriority, SyS_setpriority)
+EBPF_SYSCALL(__NR_getpriority, SyS_getpriority)
+EBPF_SYSCALL(__NR_setregid, SyS_setregid)
+EBPF_SYSCALL(__NR_setgid, SyS_setgid)
+EBPF_SYSCALL(__NR_setreuid, SyS_setreuid)
+EBPF_SYSCALL(__NR_setuid, SyS_setuid)
+EBPF_SYSCALL(__NR_setresuid, SyS_setresuid)
+EBPF_SYSCALL(__NR_getresuid, SyS_getresuid)
+EBPF_SYSCALL(__NR_setresgid, SyS_setresgid)
+EBPF_SYSCALL(__NR_getresgid, SyS_getresgid)
+EBPF_SYSCALL(__NR_setfsuid, SyS_setfsuid)
+EBPF_SYSCALL(__NR_setfsgid, SyS_setfsgid)
+EBPF_SYSCALL(__NR_getpid, sys_getpid)
+EBPF_SYSCALL(__NR_gettid, sys_gettid)
+EBPF_SYSCALL(__NR_getppid, sys_getppid)
+EBPF_SYSCALL(__NR_getuid, sys_getuid)
+EBPF_SYSCALL(__NR_geteuid, sys_geteuid)
+EBPF_SYSCALL(__NR_getgid, sys_getgid)
+EBPF_SYSCALL(__NR_getegid, sys_getegid)
+EBPF_SYSCALL(__NR_times, SyS_times)
+EBPF_SYSCALL(__NR_setpgid, SyS_setpgid)
+EBPF_SYSCALL(__NR_getpgid, SyS_getpgid)
+EBPF_SYSCALL(__NR_getpgrp, sys_getpgrp)
+EBPF_SYSCALL(__NR_getsid, SyS_getsid)
+EBPF_SYSCALL(__NR_setsid, sys_setsid)
+/* EBPF_SYSCALL(__NR_newuname, SyS_newuname) */
+EBPF_SYSCALL(__NR_uname, SyS_uname)
+/* EBPF_SYSCALL(__NR_olduname, SyS_olduname) */
+EBPF_SYSCALL(__NR_sethostname, SyS_sethostname)
+/* EBPF_SYSCALL(__NR_gethostname, SyS_gethostname) */
+EBPF_SYSCALL(__NR_setdomainname, SyS_setdomainname)
+/* EBPF_SYSCALL(__NR_old_getrlimit, SyS_old_getrlimit) */
+EBPF_SYSCALL(__NR_getrlimit, SyS_getrlimit)
+EBPF_SYSCALL(__NR_prlimit64, SyS_prlimit64)
+EBPF_SYSCALL(__NR_setrlimit, SyS_setrlimit)
+EBPF_SYSCALL(__NR_getrusage, SyS_getrusage)
+EBPF_SYSCALL(__NR_umask, SyS_umask)
+EBPF_SYSCALL(__NR_prctl, SyS_prctl)
+EBPF_SYSCALL(__NR_getcpu, SyS_getcpu)
+EBPF_SYSCALL(__NR_sysinfo, SyS_sysinfo)
+/* EBPF_SYSCALL(__NR_ni_syscall, sys_ni_syscall) */
+EBPF_SYSCALL(__NR_setns, SyS_setns)
+EBPF_SYSCALL(__NR_reboot, SyS_reboot)
+EBPF_SYSCALL(__NR_getgroups, SyS_getgroups)
+EBPF_SYSCALL(__NR_setgroups, SyS_setgroups)
+/* EBPF_SYSCALL(__NR_nice, SyS_nice) */
+EBPF_SYSCALL(__NR_sched_setscheduler, SyS_sched_setscheduler)
+EBPF_SYSCALL(__NR_sched_setparam, SyS_sched_setparam)
+EBPF_SYSCALL(__NR_sched_setattr, SyS_sched_setattr)
+EBPF_SYSCALL(__NR_sched_getscheduler, SyS_sched_getscheduler)
+EBPF_SYSCALL(__NR_sched_getparam, SyS_sched_getparam)
+EBPF_SYSCALL(__NR_sched_getattr, SyS_sched_getattr)
+EBPF_SYSCALL(__NR_sched_setaffinity, SyS_sched_setaffinity)
+EBPF_SYSCALL(__NR_sched_getaffinity, SyS_sched_getaffinity)
+EBPF_SYSCALL(__NR_sched_yield, sys_sched_yield)
+EBPF_SYSCALL(__NR_sched_get_priority_max, SyS_sched_get_priority_max)
+EBPF_SYSCALL(__NR_sched_get_priority_min, SyS_sched_get_priority_min)
+EBPF_SYSCALL(__NR_sched_rr_get_interval, SyS_sched_rr_get_interval)
+EBPF_SYSCALL(__NR_syslog, SyS_syslog)
+EBPF_SYSCALL(__NR_kcmp, SyS_kcmp)
+EBPF_SYSCALL(__NR_time, SyS_time)
+/* EBPF_SYSCALL(__NR_stime, SyS_stime) */
+EBPF_SYSCALL(__NR_gettimeofday, SyS_gettimeofday)
+EBPF_SYSCALL(__NR_settimeofday, SyS_settimeofday)
+EBPF_SYSCALL(__NR_adjtimex, SyS_adjtimex)
+EBPF_SYSCALL(__NR_alarm, SyS_alarm)
+EBPF_SYSCALL(__NR_nanosleep, SyS_nanosleep)
+EBPF_SYSCALL(__NR_getitimer, SyS_getitimer)
+EBPF_SYSCALL(__NR_setitimer, SyS_setitimer)
+EBPF_SYSCALL(__NR_timer_create, SyS_timer_create)
+EBPF_SYSCALL(__NR_timer_gettime, SyS_timer_gettime)
+EBPF_SYSCALL(__NR_timer_getoverrun, SyS_timer_getoverrun)
+EBPF_SYSCALL(__NR_timer_settime, SyS_timer_settime)
+EBPF_SYSCALL(__NR_timer_delete, SyS_timer_delete)
+EBPF_SYSCALL(__NR_clock_settime, SyS_clock_settime)
+EBPF_SYSCALL(__NR_clock_gettime, SyS_clock_gettime)
+EBPF_SYSCALL(__NR_clock_adjtime, SyS_clock_adjtime)
+EBPF_SYSCALL(__NR_clock_getres, SyS_clock_getres)
+EBPF_SYSCALL(__NR_clock_nanosleep, SyS_clock_nanosleep)
+EBPF_SYSCALL(__NR_set_robust_list, SyS_set_robust_list)
+EBPF_SYSCALL(__NR_get_robust_list, SyS_get_robust_list)
+EBPF_SYSCALL(__NR_futex, SyS_futex)
+/* EBPF_SYSCALL(__NR_chown16, SyS_chown16) */
+/* EBPF_SYSCALL(__NR_lchown16, SyS_lchown16) */
+/* EBPF_SYSCALL(__NR_fchown16, SyS_fchown16) */
+/* EBPF_SYSCALL(__NR_setregid16, SyS_setregid16) */
+/* EBPF_SYSCALL(__NR_setgid16, SyS_setgid16) */
+/* EBPF_SYSCALL(__NR_setreuid16, SyS_setreuid16) */
+/* EBPF_SYSCALL(__NR_setuid16, SyS_setuid16) */
+/* EBPF_SYSCALL(__NR_setresuid16, SyS_setresuid16) */
+/* EBPF_SYSCALL(__NR_getresuid16, SyS_getresuid16) */
+/* EBPF_SYSCALL(__NR_setresgid16, SyS_setresgid16) */
+/* EBPF_SYSCALL(__NR_getresgid16, SyS_getresgid16) */
+/* EBPF_SYSCALL(__NR_setfsuid16, SyS_setfsuid16) */
+/* EBPF_SYSCALL(__NR_setfsgid16, SyS_setfsgid16) */
+/* EBPF_SYSCALL(__NR_getgroups16, SyS_getgroups16) */
+/* EBPF_SYSCALL(__NR_setgroups16, SyS_setgroups16) */
+/* EBPF_SYSCALL(__NR_getuid16, sys_getuid16) */
+/* EBPF_SYSCALL(__NR_geteuid16, sys_geteuid16) */
+/* EBPF_SYSCALL(__NR_getgid16, sys_getgid16) */
+/* EBPF_SYSCALL(__NR_getegid16, sys_getegid16) */
+EBPF_SYSCALL_FILE(__NR_delete_module, SyS_delete_module)
+EBPF_SYSCALL(__NR_init_module, SyS_init_module)
+EBPF_SYSCALL_DESC(__NR_finit_module, SyS_finit_module)
+EBPF_SYSCALL_FILE(__NR_acct, SyS_acct)
+EBPF_SYSCALL(__NR_kexec_load, SyS_kexec_load)
+EBPF_SYSCALL_DESC(__NR_kexec_file_load, SyS_kexec_file_load)
+EBPF_SYSCALL(__NR_seccomp, SyS_seccomp)
+EBPF_SYSCALL(__NR_bpf, SyS_bpf)
+EBPF_SYSCALL(__NR_membarrier, SyS_membarrier)
+EBPF_SYSCALL_DESC(__NR_readahead, SyS_readahead)
+EBPF_SYSCALL_FILE(__NR_memfd_create, SyS_memfd_create)
+EBPF_SYSCALL(__NR_mincore, SyS_mincore)
+EBPF_SYSCALL(__NR_mlock, SyS_mlock)
+EBPF_SYSCALL(__NR_mlock2, SyS_mlock2)
+EBPF_SYSCALL(__NR_munlock, SyS_munlock)
+EBPF_SYSCALL(__NR_mlockall, SyS_mlockall)
+EBPF_SYSCALL(__NR_munlockall, sys_munlockall)
+/* EBPF_SYSCALL(__NR_mmap_pgoff, SyS_mmap_pgoff) */
+EBPF_SYSCALL(__NR_brk, SyS_brk)
+EBPF_SYSCALL(__NR_munmap, SyS_munmap)
+EBPF_SYSCALL(__NR_remap_file_pages, SyS_remap_file_pages)
+EBPF_SYSCALL(__NR_mprotect, SyS_mprotect)
+EBPF_SYSCALL(__NR_mremap, SyS_mremap)
+EBPF_SYSCALL(__NR_msync, SyS_msync)
+EBPF_SYSCALL(__NR_process_vm_readv, SyS_process_vm_readv)
+EBPF_SYSCALL(__NR_process_vm_writev, SyS_process_vm_writev)
+/* EBPF_SYSCALL_DESC(__NR_fadvise64_64, SyS_fadvise64_64) */
+EBPF_SYSCALL_DESC(__NR_fadvise64, SyS_fadvise64)
+EBPF_SYSCALL(__NR_madvise, SyS_madvise)
+EBPF_SYSCALL_FILE(__NR_swapoff, SyS_swapoff)
+EBPF_SYSCALL_FILE(__NR_swapon, SyS_swapon)
+EBPF_SYSCALL(__NR_set_mempolicy, SyS_set_mempolicy)
+EBPF_SYSCALL(__NR_migrate_pages, SyS_migrate_pages)
+EBPF_SYSCALL(__NR_get_mempolicy, SyS_get_mempolicy)
+EBPF_SYSCALL(__NR_mbind, SyS_mbind)
+EBPF_SYSCALL(__NR_move_pages, SyS_move_pages)
+EBPF_SYSCALL_DESC(__NR_close, SyS_close)
+EBPF_SYSCALL_FILE(__NR_truncate, SyS_truncate)
+EBPF_SYSCALL_DESC(__NR_ftruncate, SyS_ftruncate)
+EBPF_SYSCALL_DESC(__NR_fallocate, SyS_fallocate)
+EBPF_SYSCALL_FILEAT(__NR_faccessat, SyS_faccessat)
+EBPF_SYSCALL_FILE(__NR_access, SyS_access)
+EBPF_SYSCALL_FILE(__NR_chdir, SyS_chdir)
+EBPF_SYSCALL_DESC(__NR_fchdir, SyS_fchdir)
+EBPF_SYSCALL_FILE(__NR_chroot, SyS_chroot)
+EBPF_SYSCALL_DESC(__NR_fchmod, SyS_fchmod)
+EBPF_SYSCALL_FILEAT(__NR_fchmodat, SyS_fchmodat)
+EBPF_SYSCALL_FILE(__NR_chmod, SyS_chmod)
+EBPF_SYSCALL_FILEAT(__NR_fchownat, SyS_fchownat)
+EBPF_SYSCALL_FILE(__NR_chown, SyS_chown)
+EBPF_SYSCALL_FILE(__NR_lchown, SyS_lchown)
+EBPF_SYSCALL_DESC(__NR_fchown, SyS_fchown)
+EBPF_SYSCALL_FILE(__NR_open, SyS_open)
+EBPF_SYSCALL_FILEAT(__NR_openat, SyS_openat)
+EBPF_SYSCALL_FILE(__NR_creat, SyS_creat)
+EBPF_SYSCALL(__NR_vhangup, sys_vhangup)
+EBPF_SYSCALL_DESC(__NR_lseek, SyS_lseek)
+/* EBPF_SYSCALL_DESC(__NR_llseek, SyS_llseek) */
+EBPF_SYSCALL_DESC(__NR_read, SyS_read)
+EBPF_SYSCALL_DESC(__NR_write, SyS_write)
+EBPF_SYSCALL_DESC(__NR_pread64, SyS_pread64)
+EBPF_SYSCALL_DESC(__NR_pwrite64, SyS_pwrite64)
+EBPF_SYSCALL_DESC(__NR_readv, SyS_readv)
+EBPF_SYSCALL_DESC(__NR_writev, SyS_writev)
+EBPF_SYSCALL_DESC(__NR_preadv, SyS_preadv)
+EBPF_SYSCALL_DESC(__NR_pwritev, SyS_pwritev)
+EBPF_SYSCALL_DESC(__NR_sendfile, SyS_sendfile)
+/* EBPF_SYSCALL_DESC(__NR_sendfile64, SyS_sendfile64) */
+EBPF_SYSCALL_FILE(__NR_stat, SyS_stat)
+EBPF_SYSCALL_FILE(__NR_lstat, SyS_lstat)
+EBPF_SYSCALL_DESC(__NR_fstat, SyS_fstat)
+/*
+ * NOTE(review): __NR_stat, __NR_lstat and __NR_fstat are initialized twice in
+ * this table. With designated initializers the later initializer of an element
+ * overrides the earlier one, so the SyS_new* names below are the ones which
+ * end up in sc_tbl.
+ */
+EBPF_SYSCALL_FILE(__NR_stat, SyS_newstat)
+EBPF_SYSCALL_FILE(__NR_lstat, SyS_newlstat)
+EBPF_SYSCALL_DESC(__NR_newfstatat, SyS_newfstatat)
+EBPF_SYSCALL_DESC(__NR_fstat, SyS_newfstat)
+EBPF_SYSCALL_FILEAT(__NR_readlinkat, SyS_readlinkat)
+EBPF_SYSCALL_FILE(__NR_readlink, SyS_readlink)
+EBPF_SYSCALL_FILE(__NR_uselib, SyS_uselib)
+EBPF_SYSCALL_FILE(__NR_execve, SyS_execve)
+EBPF_SYSCALL_FILEAT(__NR_execveat, SyS_execveat)
+EBPF_SYSCALL(__NR_pipe2, SyS_pipe2)
+EBPF_SYSCALL(__NR_pipe, SyS_pipe)
+EBPF_SYSCALL_FILEAT(__NR_mknodat, SyS_mknodat)
+EBPF_SYSCALL_FILE(__NR_mknod, SyS_mknod)
+EBPF_SYSCALL_FILEAT(__NR_mkdirat, SyS_mkdirat)
+EBPF_SYSCALL_FILE(__NR_mkdir, SyS_mkdir)
+EBPF_SYSCALL_FILE(__NR_rmdir, SyS_rmdir)
+EBPF_SYSCALL_FILEAT(__NR_unlinkat, SyS_unlinkat)
+EBPF_SYSCALL_FILE(__NR_unlink, SyS_unlink)
+/*
+ * WARNING non-standard API
+ * (presumably because symlinkat() takes a path as its first argument, unlike
+ * the other *at() syscalls, hence EM_file -- TODO confirm)
+ */
+EBPF_SYSCALL_FILE(__NR_symlinkat, SyS_symlinkat)
+EBPF_SYSCALL_FILE(__NR_symlink, SyS_symlink)
+EBPF_SYSCALL_FILEAT(__NR_linkat, SyS_linkat)
+EBPF_SYSCALL_FILE(__NR_link, SyS_link)
+EBPF_SYSCALL_FILEAT(__NR_renameat2, SyS_renameat2)
+EBPF_SYSCALL_FILEAT(__NR_renameat, SyS_renameat)
+EBPF_SYSCALL_FILE(__NR_rename, SyS_rename)
+EBPF_SYSCALL_DESC(__NR_fcntl, SyS_fcntl)
+EBPF_SYSCALL_DESC(__NR_ioctl, SyS_ioctl)
+/* EBPF_SYSCALL_DESC(__NR_old_readdir, SyS_old_readdir) */
+EBPF_SYSCALL_DESC(__NR_getdents, SyS_getdents)
+EBPF_SYSCALL_DESC(__NR_getdents64, SyS_getdents64)
+EBPF_SYSCALL(__NR_select, SyS_select)
+EBPF_SYSCALL(__NR_pselect6, SyS_pselect6)
+EBPF_SYSCALL(__NR_poll, SyS_poll)
+EBPF_SYSCALL(__NR_ppoll, SyS_ppoll)
+EBPF_SYSCALL(__NR_getcwd, SyS_getcwd)
+EBPF_SYSCALL_DESC(__NR_dup3, SyS_dup3)
+EBPF_SYSCALL_DESC(__NR_dup2, SyS_dup2)
+EBPF_SYSCALL_DESC(__NR_dup, SyS_dup)
+EBPF_SYSCALL(__NR_sysfs, SyS_sysfs)
+/* EBPF_SYSCALL_FILE(__NR_umount, SyS_umount) */
+/* EBPF_SYSCALL_FILE(__NR_oldumount, SyS_oldumount) */
+EBPF_SYSCALL_FILE(__NR_mount, SyS_mount)
+EBPF_SYSCALL_FILE(__NR_pivot_root, SyS_pivot_root)
+EBPF_SYSCALL_FILE(__NR_setxattr, SyS_setxattr)
+EBPF_SYSCALL_FILE(__NR_lsetxattr, SyS_lsetxattr)
+EBPF_SYSCALL_DESC(__NR_fsetxattr, SyS_fsetxattr)
+EBPF_SYSCALL_FILE(__NR_getxattr, SyS_getxattr)
+EBPF_SYSCALL_FILE(__NR_lgetxattr, SyS_lgetxattr)
+EBPF_SYSCALL_DESC(__NR_fgetxattr, SyS_fgetxattr)
+EBPF_SYSCALL_FILE(__NR_listxattr, SyS_listxattr)
+EBPF_SYSCALL_FILE(__NR_llistxattr, SyS_llistxattr)
+EBPF_SYSCALL_DESC(__NR_flistxattr, SyS_flistxattr)
+EBPF_SYSCALL_FILE(__NR_removexattr, SyS_removexattr)
+EBPF_SYSCALL_FILE(__NR_lremovexattr, SyS_lremovexattr)
+EBPF_SYSCALL_DESC(__NR_fremovexattr, SyS_fremovexattr)
+EBPF_SYSCALL_DESC(__NR_vmsplice, SyS_vmsplice)
+EBPF_SYSCALL_DESC(__NR_splice, SyS_splice)
+EBPF_SYSCALL_DESC(__NR_tee, SyS_tee)
+EBPF_SYSCALL(__NR_sync, sys_sync)
+EBPF_SYSCALL_DESC(__NR_syncfs, SyS_syncfs)
+EBPF_SYSCALL_DESC(__NR_fsync, SyS_fsync)
+EBPF_SYSCALL_DESC(__NR_fdatasync, SyS_fdatasync)
+EBPF_SYSCALL_DESC(__NR_sync_file_range, SyS_sync_file_range)
+/* EBPF_SYSCALL_DESC(__NR_sync_file_range2, SyS_sync_file_range2) */
+EBPF_SYSCALL_FILE(__NR_utime, SyS_utime)
+EBPF_SYSCALL_FILEAT(__NR_utimensat, SyS_utimensat)
+EBPF_SYSCALL_FILEAT(__NR_futimesat, SyS_futimesat)
+EBPF_SYSCALL_FILE(__NR_utimes, SyS_utimes)
+EBPF_SYSCALL_FILE(__NR_statfs, SyS_statfs)
+/* EBPF_SYSCALL_FILE(__NR_statfs64, SyS_statfs64) */
+EBPF_SYSCALL_DESC(__NR_fstatfs, SyS_fstatfs)
+/* EBPF_SYSCALL_DESC(__NR_fstatfs64, SyS_fstatfs64) */
+EBPF_SYSCALL(__NR_ustat, SyS_ustat)
+/* EBPF_SYSCALL(__NR_bdflush, SyS_bdflush) */
+EBPF_SYSCALL(__NR_inotify_init1, SyS_inotify_init1)
+EBPF_SYSCALL(__NR_inotify_init, sys_inotify_init)
+EBPF_SYSCALL_DESC(__NR_inotify_add_watch, SyS_inotify_add_watch)
+EBPF_SYSCALL_DESC(__NR_inotify_rm_watch, SyS_inotify_rm_watch)
+EBPF_SYSCALL(__NR_fanotify_init, SyS_fanotify_init)
+EBPF_SYSCALL_DESC(__NR_fanotify_mark, SyS_fanotify_mark)
+EBPF_SYSCALL(__NR_epoll_create1, SyS_epoll_create1)
+EBPF_SYSCALL(__NR_epoll_create, SyS_epoll_create)
+EBPF_SYSCALL_DESC(__NR_epoll_ctl, SyS_epoll_ctl)
+EBPF_SYSCALL_DESC(__NR_epoll_wait, SyS_epoll_wait)
+EBPF_SYSCALL_DESC(__NR_epoll_pwait, SyS_epoll_pwait)
+EBPF_SYSCALL_DESC(__NR_signalfd4, SyS_signalfd4)
+EBPF_SYSCALL_DESC(__NR_signalfd, SyS_signalfd)
+EBPF_SYSCALL(__NR_timerfd_create, SyS_timerfd_create)
+EBPF_SYSCALL_DESC(__NR_timerfd_settime, SyS_timerfd_settime)
+EBPF_SYSCALL_DESC(__NR_timerfd_gettime, SyS_timerfd_gettime)
+EBPF_SYSCALL(__NR_eventfd2, SyS_eventfd2)
+EBPF_SYSCALL(__NR_eventfd, SyS_eventfd)
+EBPF_SYSCALL(__NR_userfaultfd, SyS_userfaultfd)
+EBPF_SYSCALL(__NR_io_setup, SyS_io_setup)
+EBPF_SYSCALL(__NR_io_destroy, SyS_io_destroy)
+EBPF_SYSCALL(__NR_io_submit, SyS_io_submit)
+EBPF_SYSCALL(__NR_io_cancel, SyS_io_cancel)
+EBPF_SYSCALL(__NR_io_getevents, SyS_io_getevents)
+EBPF_SYSCALL_DESC(__NR_flock, SyS_flock)
+EBPF_SYSCALL_FILEAT(__NR_name_to_handle_at, SyS_name_to_handle_at)
+EBPF_SYSCALL_DESC(__NR_open_by_handle_at, SyS_open_by_handle_at)
+EBPF_SYSCALL(__NR_quotactl, SyS_quotactl)
+EBPF_SYSCALL(__NR_lookup_dcookie, SyS_lookup_dcookie)
+EBPF_SYSCALL(__NR_msgget, SyS_msgget)
+EBPF_SYSCALL(__NR_msgctl, SyS_msgctl)
+EBPF_SYSCALL(__NR_msgsnd, SyS_msgsnd)
+EBPF_SYSCALL(__NR_msgrcv, SyS_msgrcv)
+EBPF_SYSCALL(__NR_semget, SyS_semget)
+EBPF_SYSCALL(__NR_semctl, SyS_semctl)
+EBPF_SYSCALL(__NR_semtimedop, SyS_semtimedop)
+EBPF_SYSCALL(__NR_semop, SyS_semop)
+EBPF_SYSCALL(__NR_shmget, SyS_shmget)
+EBPF_SYSCALL(__NR_shmctl, SyS_shmctl)
+EBPF_SYSCALL(__NR_shmat, SyS_shmat)
+EBPF_SYSCALL(__NR_shmdt, SyS_shmdt)
+EBPF_SYSCALL_FILE(__NR_mq_open, SyS_mq_open)
+EBPF_SYSCALL_FILE(__NR_mq_unlink, SyS_mq_unlink)
+EBPF_SYSCALL(__NR_mq_timedsend, SyS_mq_timedsend)
+EBPF_SYSCALL(__NR_mq_timedreceive, SyS_mq_timedreceive)
+EBPF_SYSCALL(__NR_mq_notify, SyS_mq_notify)
+EBPF_SYSCALL(__NR_mq_getsetattr, SyS_mq_getsetattr)
+EBPF_SYSCALL(__NR_add_key, SyS_add_key)
+EBPF_SYSCALL(__NR_request_key, SyS_request_key)
+EBPF_SYSCALL(__NR_keyctl, SyS_keyctl)
+EBPF_SYSCALL(__NR_ioprio_set, SyS_ioprio_set)
+EBPF_SYSCALL(__NR_ioprio_get, SyS_ioprio_get)
+/* EBPF_SYSCALL(__NR_size_show, sys_size_show) */
+EBPF_SYSCALL(__NR_getrandom, SyS_getrandom)
+/* EBPF_SYSCALL(__NR_dmi_field_show, sys_dmi_field_show) */
+/* EBPF_SYSCALL(__NR_dmi_modalias_show, sys_dmi_modalias_show) */
+EBPF_SYSCALL(__NR_socket, SyS_socket)
+EBPF_SYSCALL(__NR_socketpair, SyS_socketpair)
+EBPF_SYSCALL_DESC(__NR_bind, SyS_bind)
+EBPF_SYSCALL_DESC(__NR_listen, SyS_listen)
+EBPF_SYSCALL_DESC(__NR_accept4, SyS_accept4)
+EBPF_SYSCALL_DESC(__NR_accept, SyS_accept)
+EBPF_SYSCALL_DESC(__NR_connect, SyS_connect)
+EBPF_SYSCALL_DESC(__NR_getsockname, SyS_getsockname)
+EBPF_SYSCALL_DESC(__NR_getpeername, SyS_getpeername)
+EBPF_SYSCALL_DESC(__NR_sendto, SyS_sendto)
+/* EBPF_SYSCALL_DESC(__NR_send, SyS_send) */
+EBPF_SYSCALL_DESC(__NR_recvfrom, SyS_recvfrom)
+/* EBPF_SYSCALL_DESC(__NR_recv, SyS_recv) */
+EBPF_SYSCALL_DESC(__NR_setsockopt, SyS_setsockopt)
+EBPF_SYSCALL_DESC(__NR_getsockopt, SyS_getsockopt)
+EBPF_SYSCALL_DESC(__NR_shutdown, SyS_shutdown)
+EBPF_SYSCALL_DESC(__NR_sendmsg, SyS_sendmsg)
+EBPF_SYSCALL_DESC(__NR_sendmmsg, SyS_sendmmsg)
+EBPF_SYSCALL_DESC(__NR_recvmsg, SyS_recvmsg)
+EBPF_SYSCALL_DESC(__NR_recvmmsg, SyS_recvmmsg)
+/* EBPF_SYSCALL(__NR_socketcall, SyS_socketcall) */
+};
diff --git a/src/libstrace/ebpf_syscalls.h b/src/libstrace/ebpf_syscalls.h
new file mode 100644
index 000000000..94da673f1
--- /dev/null
+++ b/src/libstrace/ebpf_syscalls.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2016,
Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * ebpf_syscalls.h -- a list of glibc-supported syscalls. 
+ */
+
+#ifndef EBPF_SYSCALLS_H
+#define EBPF_SYSCALLS_H
+
+#include
+
+/*
+ * Bit flags classifying syscalls; they are combined in sc_t.masks.
+ */
+enum masks_t {
+	/* syscall returns an fd */
+	EM_rdesc = 1 << 0,
+	/* syscall accepts fd as a first arg */
+	EM_desc = 1 << 1,
+	/* syscall accepts fs path as a first arg */
+	EM_file = 1 << 2,
+	/* syscall accepts dir fd as a first arg and path as a second */
+	EM_fileat = 1 << 3,
+	/* syscall is actual for PMemFile */
+	EM_pmemfile = 1 << 4,
+	EM_kern_all = 1 << 5,
+	EM_libc_all = 1 << 6,
+
+	/* all bits set */
+	EM_ALL = -1,
+};
+
+/*
+ * Description of one syscall; sc_tbl holds one such item per syscall number.
+ */
+struct sc_t {
+	/* syscall number (SC_TBL_SIZE in unused slots of sc_tbl) */
+	unsigned num;
+	/* spelling of the number macro, e.g. "__NR_open" ("NI" if unused) */
+	const char *num_name;
+	/* name of the kernel handler, e.g. "SyS_open" (NULL if unused) */
+	const char *hlr_name;
+	/* combination of enum masks_t flags */
+	unsigned masks;
+};
+
+/* Currently glibc does not have appropriate macro for it */
+enum { SC_TBL_SIZE = 1024 };
+extern struct sc_t sc_tbl[SC_TBL_SIZE];
+
+#endif /* EBPF_SYSCALLS_H */
diff --git a/src/libstrace/generate_ebpf.c b/src/libstrace/generate_ebpf.c
new file mode 100644
index 000000000..b188ca603
--- /dev/null
+++ b/src/libstrace/generate_ebpf.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * generate_ebpf.c -- generate_ebpf() function
+ */
+
+#include
+#include
+
+#include "main.h"
+#include "utils.h"
+#include "ebpf_syscalls.h"
+#include "generate_ebpf.h"
+
+/* File names of the eBPF source pieces; they are passed to load_file(). */
+const char *ebpf_trace_h_file = "trace.h";
+
+const char *ebpf_head_file = "trace_head.c";
+const char *ebpf_libc_tmpl_file = "trace_libc_tmpl.c";
+const char *ebpf_file_tmpl_file = "trace_file_tmpl.c";
+const char *ebpf_fileat_tmpl_file = "trace_fileat_tmpl.c";
+const char *ebpf_kern_tmpl_file = "trace_kern_tmpl.c";
+
+const char *ebpf_tp_all_file = "trace_tp_all.c";
+
+/*
+ * Find the sc_tbl row whose handler name equals 'sc_name' (the comparison
+ * ignores case) and return its index, which is the syscall number as libc
+ * knows it. Rows without a handler name are skipped.
+ *
+ * Returns -1 when no row matches.
+ */
+static int
+get_sc_num(const char *sc_name)
+{
+	int idx;
+
+	for (idx = 0; idx < SC_TBL_SIZE; idx++) {
+		const char *known = sc_tbl[idx].hlr_name;
+
+		if (NULL != known && 0 == strcasecmp(sc_name, known))
+			return idx;
+	}
+
+	return -1;
+}
+
+/*
+ * This function generates eBPF handler for syscalls which are known to glibc.
+ */
+static void generate_ebpf_kp_libc_all(FILE *ts);
+
+/*
+ * Choose the template for a glibc-known syscall from its mask bits:
+ * EM_file wins over EM_fileat, anything else gets the generic libc template.
+ */
+static const char *
+libc_tmpl_for_masks(unsigned masks)
+{
+	if (EM_file == (EM_file & masks))
+		return ebpf_file_tmpl_file;
+
+	if (EM_fileat == (EM_fileat & masks))
+		return ebpf_fileat_tmpl_file;
+
+	return ebpf_libc_tmpl_file;
+}
+
+/*
+ * Load template 'tmpl_file', replace SYSCALL_NR with 'nr_name' (skipped when
+ * 'nr_name' is NULL) and SYSCALL_NAME with 'sc_name', then append the text
+ * to 'ts'.
+ *
+ * A template which cannot be loaded is reported on stderr and nothing is
+ * written; the previous code passed the NULL text on and crashed.
+ */
+static void
+emit_sc_handler(FILE *ts, const char *tmpl_file, const char *nr_name,
+		const char *sc_name)
+{
+	char *text = load_file(tmpl_file);
+
+	if (NULL == text) {
+		fprintf(stderr, "%s: ERROR: cannot load template '%s'\n",
+			__func__, tmpl_file);
+		return;
+	}
+
+	if (NULL != nr_name)
+		str_replace_all(&text, "SYSCALL_NR", nr_name);
+	str_replace_all(&text, "SYSCALL_NAME", sc_name);
+
+	fwrite(text, strlen(text), 1, ts);
+
+	free(text);
+}
+
+/*
+ * Write one handler into 'ts' for every sc_tbl row which has a handler name.
+ */
+static void
+generate_ebpf_kp_libc_all(FILE *ts)
+{
+	for (unsigned i = 0; i < SC_TBL_SIZE; i++) {
+		if (NULL == sc_tbl[i].hlr_name)
+			continue;
+
+		emit_sc_handler(ts, libc_tmpl_for_masks(sc_tbl[i].masks),
+				sc_tbl[i].num_name, sc_tbl[i].hlr_name);
+	}
+}
+
+/* XXX HACK: this syscall is exported by kernel twice. */
+static unsigned SyS_sigsuspend = 0;
+
+/*
+ * This function generates universal default eBPF syscall handler.
+ *
+ * Primary purpose of generated handler - new and unknown syscalls.
+ *
+ * The function list is read line by line from 'debug_tracing_aff'. Every
+ * line accepted by is_a_sc() gets a handler: names found in sc_tbl use the
+ * glibc-aware templates, all other names use the generic kernel template.
+ * If the list cannot be opened an error is printed and nothing is written.
+ */
+static void
+generate_ebpf_kp_kern_all(FILE *ts)
+{
+	char *line = NULL;
+	size_t cap = 0;
+	ssize_t nread;
+
+	FILE *in = fopen(debug_tracing_aff, "r");
+
+	if (NULL == in) {
+		fprintf(stderr, "%s: ERROR: '%m'\n", __func__);
+		return;
+	}
+
+	while ((nread = getline(&line, &cap, in)) != -1) {
+		int sc_num;
+
+		/*
+		 * Cut the newline off only if it is there: the last line of
+		 * the list may lack one, and then every character counts.
+		 */
+		if (nread > 0 && '\n' == line[nread - 1])
+			line[--nread] = '\0';
+
+		if (!is_a_sc(line, nread))
+			continue;
+
+		/* XXX HACK: this syscall is exported by kernel twice. */
+		if (!strcasecmp("SyS_sigsuspend", line)) {
+			if (SyS_sigsuspend)
+				continue;
+
+			SyS_sigsuspend ++;
+		}
+
+		sc_num = get_sc_num(line);
+
+		/* Some optimization for glibc-supported syscalls */
+		if (0 <= sc_num)
+			emit_sc_handler(ts,
+				libc_tmpl_for_masks(sc_tbl[sc_num].masks),
+				sc_tbl[sc_num].num_name, line);
+		else
+			emit_sc_handler(ts, ebpf_kern_tmpl_file, NULL, line);
+	}
+
+	free(line);
+	fclose(in);
+}
+
+/*
+ * This function generates eBPF syscall handlers specific for syscalls with
+ * filename in arguments (sc_tbl rows with EM_file set).
+ */
+static void
+generate_ebpf_kp_file(FILE *ts)
+{
+	for (unsigned i = 0; i < SC_TBL_SIZE; i++) {
+		if (NULL == sc_tbl[i].hlr_name)
+			continue;
+
+		if (EM_file != (EM_file & sc_tbl[i].masks))
+			continue;
+
+		emit_sc_handler(ts, ebpf_file_tmpl_file,
+				sc_tbl[i].num_name, sc_tbl[i].hlr_name);
+	}
+}
+
+/*
+ * This function generates eBPF syscall handlers specific for syscalls with
+ * relative filename in arguments.
+ */
+static void generate_ebpf_kp_fileat(FILE *ts);
+
+/*
+ * Write into 'ts' one handler, built from template 'tmpl_file', for every
+ * sc_tbl row which has a handler name and has all bits of 'mask' set.
+ *
+ * The template is loaded once and copied per row. If it cannot be loaded an
+ * error is printed and nothing is written; the previous code handed the NULL
+ * text to str_replace_all() and crashed.
+ */
+static void
+emit_masked_handlers(FILE *ts, unsigned mask, const char *tmpl_file)
+{
+	char *tmpl = load_file(tmpl_file);
+
+	if (NULL == tmpl) {
+		fprintf(stderr, "%s: ERROR: cannot load template '%s'\n",
+			__func__, tmpl_file);
+		return;
+	}
+
+	for (unsigned i = 0; i < SC_TBL_SIZE; i++) {
+		char *text;
+
+		if (NULL == sc_tbl[i].hlr_name)
+			continue;
+
+		if (mask != (mask & sc_tbl[i].masks))
+			continue;
+
+		/* str_replace_all() frees its input, so work on a copy */
+		text = strdup(tmpl);
+		if (NULL == text)
+			break;
+
+		str_replace_all(&text, "SYSCALL_NR",
+				sc_tbl[i].num_name);
+		str_replace_all(&text, "SYSCALL_NAME",
+				sc_tbl[i].hlr_name);
+
+		fwrite(text, strlen(text), 1, ts);
+
+		free(text);
+	}
+
+	free(tmpl);
+}
+
+/*
+ * Handlers for syscalls taking a dir fd and a relative path (EM_fileat).
+ */
+static void
+generate_ebpf_kp_fileat(FILE *ts)
+{
+	emit_masked_handlers(ts, EM_fileat, ebpf_fileat_tmpl_file);
+}
+
+/*
+ * This function generates eBPF syscall handlers specific for syscalls with
+ * file-descriptor in arguments (EM_desc). They use the generic libc template.
+ */
+static void
+generate_ebpf_kp_desc(FILE *ts)
+{
+	emit_masked_handlers(ts, EM_desc, ebpf_libc_tmpl_file);
+}
+
+/*
+ * This function generates eBPF syscall handlers specific for syscalls which
+ * operate on files: path-based, fd-based and dir-fd-relative ones, in that
+ * order.
+ */
+static void
+generate_ebpf_kp_pmemfile(FILE *ts)
+{
+	emit_masked_handlers(ts, EM_file, ebpf_file_tmpl_file);
+	generate_ebpf_kp_desc(ts);
+	generate_ebpf_kp_fileat(ts);
+}
+
+/*
+ * This function generates eBPF syscall handler specific for tracepoint
+ * feature. The tracepoint source is copied into 'ts' unchanged.
+ */
+static void
+generate_ebpf_tp_all(FILE *ts)
+{
+	char *text = load_file(ebpf_tp_all_file);
+
+	/* strlen(NULL) used to crash here when the file was missing */
+	if (NULL == text) {
+		fprintf(stderr, "%s: ERROR: cannot load '%s'\n",
+			__func__, ebpf_tp_all_file);
+		return;
+	}
+
+	fwrite(text, strlen(text), 1, ts);
+
+	free(text);
+}
+
+/*
+ * This function parses and process expression.
+ */
+/*
+ * Build the eBPF program text selected by args.expr.
+ *
+ * The common header (ebpf_head_file) always comes first. The expression is
+ * then matched, ignoring case, against the table below; a missing or
+ * unknown expression falls back to 'trace=kp-kern-all' with a notice on
+ * stderr.
+ *
+ * Returns a malloc-ed, NUL-terminated string owned by the caller, or NULL if
+ * the memory stream cannot be created or the header cannot be loaded (both
+ * cases used to dereference a NULL pointer).
+ */
+char *
+generate_ebpf(void)
+{
+	static const struct {
+		const char *expr;
+		void (*generate)(FILE *ts);
+	} generators[] = {
+		{ "trace=kp-libc-all",	generate_ebpf_kp_libc_all },
+		{ "trace=kp-kern-all",	generate_ebpf_kp_kern_all },
+		{ "trace=kp-file",	generate_ebpf_kp_file },
+		{ "trace=kp-desc",	generate_ebpf_kp_desc },
+		{ "trace=kp-pmemfile",	generate_ebpf_kp_pmemfile },
+		{ "trace=tp-all",	generate_ebpf_tp_all },
+	};
+
+	char *text = NULL;
+	size_t text_size = 0;
+	char *head;
+	FILE *ts;
+
+	ts = open_memstream(&text, &text_size);
+	if (NULL == ts) {
+		fprintf(stderr, "%s: ERROR: '%m'\n", __func__);
+		return NULL;
+	}
+
+	/* Let's start from header */
+	head = load_file(ebpf_head_file);
+	if (NULL == head) {
+		fprintf(stderr, "%s: ERROR: cannot load '%s'\n",
+			__func__, ebpf_head_file);
+		/* the stream buffer stays ours after fclose() */
+		fclose(ts);
+		free(text);
+		return NULL;
+	}
+
+	fwrite(head, strlen(head), 1, ts);
+	free(head);
+
+	if (NULL != args.expr) {
+		const size_t qty = sizeof(generators) / sizeof(generators[0]);
+
+		for (size_t i = 0; i < qty; i++) {
+			if (!strcasecmp(args.expr, generators[i].expr)) {
+				generators[i].generate(ts);
+				goto out;
+			}
+		}
+	}
+
+	fprintf(stderr,
+		"%s: Default expression 'trace=kp-kern-all' was chosen."
+		" If you would like some speed improvment think about"
+		" 'trace=kp-libc-all'.\n", __func__);
+	generate_ebpf_kp_kern_all(ts);
+
+out:
+	/* fclose() flushes the stream and NUL-terminates 'text' */
+	fclose(ts);
+	return text;
+}
diff --git a/src/libstrace/generate_ebpf.h b/src/libstrace/generate_ebpf.h
new file mode 100644
index 000000000..659ed8a5a
--- /dev/null
+++ b/src/libstrace/generate_ebpf.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * generate_ebpf.h -- generate_ebpf() function + */ + +#ifndef GENERATE_EBPF_H +#define GENERATE_EBPF_H + +extern const char *ebpf_trace_h_file; + +extern const char *ebpf_head_file; +extern const char *ebpf_libc_tmpl_file; +extern const char *ebpf_file_tmpl_file; +extern const char *ebpf_fileat_tmpl_file; +extern const char *ebpf_kern_tmpl_file; +extern const char *ebpf_tp_all_file; + +char *generate_ebpf(void); + +extern const char _binary_trace_fileat_tmpl_c_size[]; +extern const char _binary_trace_fileat_tmpl_c_start[]; + +extern const char _binary_trace_file_tmpl_c_size[]; +extern const char _binary_trace_file_tmpl_c_start[]; + +extern const char _binary_trace_head_c_size[]; +extern const char _binary_trace_head_c_start[]; + +extern const char _binary_trace_h_size[]; +extern const char _binary_trace_h_start[]; + +extern const char _binary_trace_kern_tmpl_c_size[]; +extern const char _binary_trace_kern_tmpl_c_start[]; + +extern const char _binary_trace_libc_tmpl_c_size[]; +extern const char _binary_trace_libc_tmpl_c_start[]; + +extern const char _binary_trace_tp_all_c_size[]; +extern const char _binary_trace_tp_all_c_start[]; + +#endif diff --git a/src/libstrace/main.h b/src/libstrace/main.h new file mode 100644 index 000000000..757a4a3f3 --- /dev/null +++ b/src/libstrace/main.h @@ -0,0 +1,79 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/*
+ * main.h -- application-wide stuff
+ */
+
+#ifndef MAIN_H
+#define MAIN_H
+
+#include
+#include
+#include
+
+enum out_fmt {	/* log format; also the index into the print_* tables */
+	EOF_HEX = 0,
+	EOF_BIN,
+	EOF_STRACE,
+
+
+	EOF_QTY, /* Should be last */
+};
+
+struct args_t {	/* settings shared through the global 'args' */
+	bool timestamp;	/* add a time column to text logs */
+	bool failed;	/* log only events whose return value is negative */
+	bool command;	/* NOTE(review): presumably "a command was given" -- confirm */
+
+	unsigned debug;
+
+	pid_t pid;	/* process the signal handlers watch and signal */
+	const char *out_fn;
+	const char *out_fmt_str;
+	char out_sep_ch;	/* field separator of the hex log */
+	const char *expr;	/* "trace=..." selector read by generate_ebpf() */
+/*
+ * XXX Set this variable using args and
+ * command line options
+ */
+	unsigned pr_arr_max;
+};
+
+extern struct args_t args;
+extern bool cont;	/* cleared on write errors and by the signal handlers */
+
+/* Output log */
+extern FILE *out;
+extern enum out_fmt out_fmt;
+
+#endif /* MAIN_H */
diff --git a/src/libstrace/print_event_cb.c b/src/libstrace/print_event_cb.c
new file mode 100644
index 000000000..fba31ecb5
--- /dev/null
+++ b/src/libstrace/print_event_cb.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *
+ *     * Neither the name of the copyright holder nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * print_event_cb.c -- print_event_cb() function
+ */
+
+#include
+#include
+#include /* For SYS_xxx definitions */
+
+#include
+#include
+
+#include "main.h"
+#include "ebpf_syscalls.h"
+#include "print_event_cb.h"
+
+/*
+ * XXX A bit of black magic to have some US <-> KS portability.
+ * PLEASE do not add any other includes after this comment.
+ */
+typedef __s32 s32;
+typedef __u32 u32;
+typedef __s64 s64;
+typedef __u64 u64;
+
+enum { TASK_COMM_LEN = 16 };
+
+#include "trace.h"
+
+/* Start time of the first event seen; text logs print times relative to it. */
+static unsigned long long start_ts_nsec = 0;
+
+const char *sc_num2str(const int64_t sc_num);
+void fprint_i64(FILE *f, uint64_t x);
+char b2hex(char b);
+
+/*
+ * Process event.
+ *
+ * Also it can be a good idea to use cb_cookie for args, for out or for static
+ * variable above.
+ */
+
+/*
+ * Print the column titles of the strace-like log into 'out'. The TIME(s)
+ * column is printed only when args.timestamp is set. Both parameters are
+ * unused; they are there to fit print_header_t.
+ *
+ * XXX A blank for human-readable strace-like logs
+ */
+static void
+print_header_strace(int argc, char *argv[])
+{
+	(void) argc;
+	(void) argv;
+
+	if (args.timestamp) {
+		fprintf(out, "%-14s", "TIME(s)");
+	}
+
+	fprintf(out, "%-7s %-6s %4s %3s %s\n",
+		"SYSCALL", "PID_TID", "ARG1", "ERR", "PATH");
+}
+
+/*
+ * Print syscall's log entry.
+ *
+ * XXX A blank for human-readable strace-like logs
+ */
+static void
+print_event_strace(void *cb_cookie, void *data, int size)
+{
+	struct ev_dt_t *const event = data;
+	const bool succeeded = (event->ret >= 0);
+	const char *name;
+	s64 res, err;
+
+	(void) cb_cookie;
+	/* XXX Check size arg */
+	(void) size;
+
+	/* split return value into result and errno */
+	if (succeeded) {
+		res = event->ret;
+		err = 0;
+	} else {
+		res = -1;
+		err = -event->ret;
+	}
+
+	/* the first event seen becomes time zero, even if it is filtered */
+	if (start_ts_nsec == 0)
+		start_ts_nsec = event->start_ts_nsec;
+
+	if (args.failed && succeeded)
+		return;
+
+	if (args.timestamp) {
+		const unsigned long long delta_nsec =
+			event->finish_ts_nsec - start_ts_nsec;
+
+		fprintf(out, "%-14.9f",
+			(double)((double)delta_nsec / 1000000000.0));
+	}
+
+	/* known number -> table name; otherwise the name sent with the event */
+	if (0 <= event->sc_id)
+		name = sc_num2str(event->sc_id);
+	else
+		name = event->sc_name + 4;
+
+	fprintf(out, "%-7s ", name);
+
+	fprintf(out, "%-6llu %4lld %3lld %s\n",
+		event->pid_tid, res, err, event->fl_nm);
+}
+
+/* ** Hex logs ** */
+
+/*
+ * This function prints header for hexadecimal logs: the traced command line
+ * on the first line, the column titles on the second one. Fields are joined
+ * with args.out_sep_ch.
+ */
+static void
+print_header_hex(int argc, char *argv[])
+{
+	static const char *const columns[] = {
+		"ERR", "RES", "SYSCALL",
+		"ARG1", "ARG2", "ARG3", "ARG4", "ARG5", "ARG6",
+	};
+	const char sep = args.out_sep_ch;
+
+	/* the last word of the command line ends the line */
+	for (int i = 0; i < argc; i++)
+		fprintf(out, "%s%c", argv[i], (i + 1 != argc) ? sep : '\n');
+
+	fprintf(out, "%s%c", "PID_TID", sep);
+
+	if (args.timestamp)
+		fprintf(out, "%s%c", "TIME(nsec)", sep);
+
+	for (size_t c = 0; c < sizeof(columns) / sizeof(columns[0]); c++)
+		fprintf(out, "%s%c", columns[c], sep);
+
+	/* For COMM and like */
+	fprintf(out, "%s", "AUX_DATA");
+
+	fprintf(out, "\n");
+}
+
+/*
+ * This function returns character corresponding to hexadecimal digit.
+ * Only the low four bits of 'b' are used; letters are upper case.
+ */
+char
+b2hex(char b)
+{
+	static const char digits[] = "0123456789ABCDEF";
+
+	return digits[b & 0xF];
+}
+
+/*
+ * This function prints 64-bit integer in hexadecimal form in stream: always
+ * 16 upper-case digits, most significant first, no prefix and no terminator.
+ *
+ * The digits are taken by shifting, so the result no longer depends on the
+ * byte order of the host (the byte-wise version was right only on
+ * little-endian machines).
+ */
+void
+fprint_i64(FILE *f, uint64_t x)
+{
+	char str[2 * sizeof(x)];
+
+	/* fill from the least significant digit backwards */
+	for (size_t i = sizeof(str); i-- > 0; x >>= 4)
+		str[i] = b2hex((char)(x & 0xF));
+
+	fwrite(str, sizeof(str), 1, f);
+}
+
+/*
+ * This function returns syscall's name by number.
+ *
+ * For a number with a named sc_tbl row the result is that name without its
+ * "sys_" prefix. Any other number is formatted as "sys_<number>" into a
+ * static buffer, so that result is overwritten by the next such call and
+ * the function is not reentrant.
+ */
+const char *
+sc_num2str(const int64_t sc_num)
+{
+	static char buf[32];
+
+	if ((0 <= sc_num) && (SC_TBL_SIZE > sc_num)
+			&& (NULL != sc_tbl[sc_num].hlr_name))
+		return sc_tbl[sc_num].hlr_name + 4 /* strlen("sys_") */;
+
+	/* int64_t is not 'long' everywhere, so print it as long long */
+	snprintf(buf, sizeof(buf), "sys_%lld", (long long)sc_num);
+
+	return buf;
+}
+
+/*
+ * Write the field separator of the hex log.
+ */
+static void
+fput_sep(void)
+{
+	fwrite(&args.out_sep_ch, sizeof(args.out_sep_ch), 1, out);
+}
+
+/*
+ * This function prints syscall's logs entry in stream.
+ *
+ * Fields, each followed by args.out_sep_ch: PID_TID, time since the first
+ * event (only with args.timestamp), ERR, RES, syscall name, ARG1..ARG6;
+ * then a newline.
+ *
+ * What goes into the ARG fields depends on event->sc_id:
+ *   -2             all six raw arguments in hex;
+ *   a table index  ARG1 is the file name (EM_file) or the first argument in
+ *                  hex (EM_desc, EM_fileat), ARG2 is the file name
+ *                  (EM_fileat), the rest stays empty;
+ *   anything else  all ARG fields stay empty. This covers -1 and also ids
+ *                  outside sc_tbl, which used to index the table out of
+ *                  bounds.
+ *
+ * WARNING
+ *
+ *    PLEASE don't use *printf() calls because it will slow down this
+ *    function too much.
+ */
+static void
+print_event_hex(void *cb_cookie, void *data, int size)
+{
+	s64 res, err;
+	struct ev_dt_t *const event = data;
+	const bool raw_args = (-2 == event->sc_id);
+	unsigned masks = 0;
+	const char *name;
+
+	/* XXX Check size arg */
+	(void) size;
+	(void) cb_cookie;
+
+	/* split return value into result and errno */
+	res = (event->ret >= 0) ? event->ret : -1;
+	err = (event->ret >= 0) ? 0 : -event->ret;
+
+	if (start_ts_nsec == 0)
+		start_ts_nsec = event->start_ts_nsec;
+
+	if (args.failed && (event->ret >= 0))
+		return;
+
+	/* the table is consulted only for ids which really are inside it */
+	if ((0 <= event->sc_id) && (SC_TBL_SIZE > event->sc_id))
+		masks = sc_tbl[event->sc_id].masks;
+
+	fprint_i64(out, event->pid_tid);
+	fput_sep();
+
+	if (args.timestamp) {
+		unsigned long long delta_nsec =
+			event->finish_ts_nsec - start_ts_nsec;
+
+		fprint_i64(out, delta_nsec);
+		fput_sep();
+	}
+
+	fprint_i64(out, (uint64_t)err);
+	fput_sep();
+
+	fprint_i64(out, (uint64_t)res);
+	fput_sep();
+
+	/* look the name up once instead of twice */
+	name = (event->sc_id >= 0) ? sc_num2str(event->sc_id)
+				   : event->sc_name + 4;
+	fwrite(name, strlen(name), 1, out);
+	fput_sep();
+
+	/* "ARG1" */
+	if (raw_args) {
+		fprint_i64(out, (uint64_t)event->arg_1);
+	} else if (EM_file == (EM_file & masks)) {
+		fwrite(event->fl_nm, strlen(event->fl_nm), 1, out);
+	} else if ((EM_desc == (EM_desc & masks))
+			|| (EM_fileat == (EM_fileat & masks))) {
+		fprint_i64(out, (uint64_t)event->arg_1);
+	} else {
+		/*
+		 * XXX We don't have any idea about this syscall args.
+		 * May be we should expand our table with additional
+		 * syscall descriptions.
+		 */
+	}
+	fput_sep();
+
+	/* "ARG2" */
+	if (raw_args)
+		fprint_i64(out, (uint64_t)event->arg_2);
+	else if (EM_fileat == (EM_fileat & masks))
+		fwrite(event->fl_nm, strlen(event->fl_nm), 1, out);
+	fput_sep();
+
+	/* "ARG3" .. "ARG6" are known only for raw events */
+	if (raw_args)
+		fprint_i64(out, (uint64_t)event->arg_3);
+	fput_sep();
+
+	if (raw_args)
+		fprint_i64(out, (uint64_t)event->arg_4);
+	fput_sep();
+
+	if (raw_args)
+		fprint_i64(out, (uint64_t)event->arg_5);
+	fput_sep();
+
+	if (raw_args)
+		fprint_i64(out, (uint64_t)event->arg_6);
+	fput_sep();
+
+	/* "AUX_DATA". For COMM and like. XXX */
+	/* fwrite(event->comm, strlen(event->comm), 1, out); */
+	fwrite("\n", 1, 1, out);
+}
+
+/* ** Binary logs ** */
+
+/*
+ * This function writes header in stream.
+ *
+ * The header is the size of struct ev_dt_t followed by one such record with
+ * sc_id == -1 carrying argc and the argv strings, each NUL-terminated,
+ * packed one after another. On a failed write 'cont' is cleared, and the
+ * record is not written if its size prefix could not be.
+ */
+static void
+print_header_bin(int argc, char *argv[])
+{
+	size_t argv_size = 0;
+
+	struct ev_dt_t d = { .sc_id = -1 };
+
+	const size_t d_size = sizeof(d);
+	d.header.argc = argc;
+
+	/*
+	 * here we assume that our command line will not be longer
+	 * than 255 bytes
+	 *
+	 * NOTE(review): nothing enforces this, so a longer command line
+	 * overruns header.argv. The field is declared in trace.h; bound the
+	 * copy there once its size is confirmed.
+	 */
+	for (int i = 0; i < argc; i++) {
+		strcpy(d.header.argv + argv_size, argv[i]);
+		argv_size += strlen(argv[i]) + 1;
+	}
+
+	if ((1 != fwrite(&d_size, sizeof(d_size), 1, out))
+			|| (1 != fwrite(&d, sizeof(d), 1, out))) {
+		/* ERROR */
+		cont = false;
+	}
+}
+
+/*
+ * This function writes syscall's log entry in stream: the 'size' bytes of
+ * 'data' as they are. With args.failed set, events with a non-negative
+ * return value are dropped. A negative 'size' or a failed write clears
+ * 'cont'.
+ */
+static void
+print_event_bin(void *cb_cookie, void *data, int size)
+{
+	struct ev_dt_t *const event = data;
+
+	(void) cb_cookie;
+
+	/* a negative size would turn into a huge size_t below */
+	if (size < 0) {
+		cont = false;
+		return;
+	}
+
+	if (args.failed && (event->ret >= 0))
+		return;
+
+	if (1 != fwrite(data, (size_t)size, 1, out)) {
+		/* ERROR */
+		cont = false;
+	}
+}
+
+/*
+ * This function parses log's type: "bin" or "binary", "strace" and "hex",
+ * in any letter case. NULL and unknown names give EOF_HEX.
+ */
+enum out_fmt
+out_fmt_str2enum(const char *str)
+{
+	if (NULL == str)
+		return EOF_HEX;
+
+	if (!strcasecmp("bin", str) || !strcasecmp("binary", str))
+		return EOF_BIN;
+
+	if (!strcasecmp("strace", str))
+		return EOF_STRACE;
+
+	/* "hex" and everything unknown */
+	return EOF_HEX;
+}
+
+/* Event printers, indexed by enum out_fmt. */
+perf_reader_raw_cb print_event_cb[EOF_QTY + 1] = {
+	[EOF_HEX]	= print_event_hex,
+	[EOF_BIN]	= print_event_bin,
+	[EOF_STRACE]	= print_event_strace,
+};
+
+/* Header printers, indexed by enum out_fmt. */
+print_header_t print_header[EOF_QTY + 1] = {
+	[EOF_HEX]	= print_header_hex,
+	[EOF_BIN]	= print_header_bin,
+	[EOF_STRACE]	= print_header_strace,
+};
diff --git a/src/libstrace/print_event_cb.h b/src/libstrace/print_event_cb.h
new file mode 100644
index 000000000..a44ea145d
--- /dev/null
+++ b/src/libstrace/print_event_cb.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code
must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * print_event_cb.h -- print_event_cb() function + */ + +#ifndef PRINT_EVENT_CB_H +#define PRINT_EVENT_CB_H + +#include + +#include + +#include "main.h" + +/* process event */ +extern perf_reader_raw_cb print_event_cb[EOF_QTY + 1]; + +typedef void (*print_header_t)(int argc, char *argv[]); +extern print_header_t print_header[EOF_QTY + 1]; + +enum out_fmt out_fmt_str2enum(const char *str); + +#endif /* PRINT_EVENT_CB_H */ diff --git a/src/libstrace/utils.c b/src/libstrace/utils.c new file mode 100644 index 000000000..616413d53 --- /dev/null +++ b/src/libstrace/utils.c @@ -0,0 +1,365 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * utils.c -- utility functions
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "main.h"
+#include "utils.h"
+#include "generate_ebpf.h"
+
+/*
+ * This function loads text file from disk and returns malloc-ed,
+ * null-terminated string which the caller has to free.
+ *
+ * NULL is returned if the file cannot be opened or examined, if memory runs
+ * out, or if fewer bytes can be read than fstat() reported. Short and
+ * interrupted reads are continued instead of being taken for a failure.
+ */
+char *
+load_file_from_disk(const char *const fn)
+{
+	int fd;
+	char *buf = NULL;
+	struct stat st;
+	size_t size;
+	size_t done = 0;
+
+	fd = open(fn, O_RDONLY);
+
+	if (fd == -1)
+		return NULL;
+
+	if (fstat(fd, &st) == -1 || st.st_size < 0)
+		goto out;
+
+	size = (size_t)st.st_size;
+
+	/* one extra zeroed byte terminates the string */
+	buf = calloc(1, size + 1);
+
+	if (NULL == buf)
+		goto out;
+
+	while (done < size) {
+		ssize_t res = read(fd, buf + done, size - done);
+
+		if (res > 0) {
+			done += (size_t)res;
+			continue;
+		}
+
+		if (res == -1 && errno == EINTR)
+			continue;
+
+		/* read error, or the file is shorter than it claimed */
+		free(buf);
+		buf = NULL;
+		break;
+	}
+
+out:
+	close(fd);
+
+	return buf;
+}
+
+/*
+ * Export embedded trace.h to file, unless a readable file of that name is
+ * already there. This is best effort: failures are not reported.
+ */
+void save_trace_h(void) {
+	int fd;
+	const char *src = _binary_trace_h_start;
+	/* the address of the *_size symbol is the length of the blob */
+	size_t left = (size_t)_binary_trace_h_size;
+
+	if (access(ebpf_trace_h_file, R_OK) == 0)
+		return;
+
+	/* O_TRUNC: do not leave the tail of an older, longer file behind */
+	fd = open(ebpf_trace_h_file, O_WRONLY | O_CREAT | O_TRUNC, 0666);
+
+	if (fd == -1)
+		return;
+
+	while (left > 0) {
+		ssize_t res = write(fd, src, left);
+
+		if (res > 0) {
+			src += res;
+			left -= (size_t)res;
+			continue;
+		}
+
+		if (res == -1 && errno == EINTR)
+			continue;
+
+		break;
+	}
+
+	close(fd);
+}
+
+/*
+ * This function loads 'virtual' file.
+ *
+ * A file of that name on disk takes precedence. Otherwise the copy embedded
+ * in the binary is duplicated, if 'fn' is one of the known template names.
+ *
+ * Returns a malloc-ed, NUL-terminated string owned by the caller, or NULL
+ * for an unknown name or when memory runs out.
+ */
+char *
+load_file(const char *const fn)
+{
+	/*
+	 * The length of an embedded blob is carried by the address of its
+	 * *_size symbol, hence the cast to size_t below.
+	 */
+	static const struct {
+		const char *const *name;
+		const char *start;
+		const char *size;
+	} embedded[] = {
+		{ &ebpf_head_file, _binary_trace_head_c_start,
+			_binary_trace_head_c_size },
+		{ &ebpf_libc_tmpl_file, _binary_trace_libc_tmpl_c_start,
+			_binary_trace_libc_tmpl_c_size },
+		{ &ebpf_file_tmpl_file, _binary_trace_file_tmpl_c_start,
+			_binary_trace_file_tmpl_c_size },
+		{ &ebpf_fileat_tmpl_file, _binary_trace_fileat_tmpl_c_start,
+			_binary_trace_fileat_tmpl_c_size },
+		{ &ebpf_kern_tmpl_file, _binary_trace_kern_tmpl_c_start,
+			_binary_trace_kern_tmpl_c_size },
+		{ &ebpf_tp_all_file, _binary_trace_tp_all_c_start,
+			_binary_trace_tp_all_c_size },
+		{ &ebpf_trace_h_file, _binary_trace_h_start,
+			_binary_trace_h_size },
+	};
+
+	char *f = load_file_from_disk(fn);
+
+	if (NULL != f)
+		return f;
+
+	/* fallback to embedded ones */
+	for (size_t i = 0; i < sizeof(embedded) / sizeof(embedded[0]); i++) {
+		if (0 == strcmp(*embedded[i].name, fn))
+			return strndup(embedded[i].start,
+					(size_t)embedded[i].size);
+	}
+
+	return NULL;
+}
+
+/*
+ * This function reads status of eBPF JIT compiler.
+ *
+ * Returns the number found in /proc/sys/net/core/bpf_jit_enable, or -1 if
+ * the file cannot be opened or read. On a failed read errno is left as
+ * read() set it; close() is not allowed to change it.
+ */
+static int
+load_bpf_jit_status(void)
+{
+	int fd, err_no;
+	ssize_t res;
+	char buf[16];
+
+	fd = open("/proc/sys/net/core/bpf_jit_enable", O_RDONLY);
+
+	if (fd == -1)
+		return -1;
+
+	errno = 0;
+	/* keep one byte for the terminator which atoi() needs */
+	res = read(fd, buf, sizeof(buf) - 1);
+
+	err_no = errno;
+	close(fd);
+	errno = err_no;
+
+	if (res <= 0)
+		return -1;
+
+	buf[res] = '\0';
+
+	return atoi(buf);
+}
+
+/*
+ * This function checks status of eBPF JIT compiler and prints appropriate
+ * message.
+ */
+void
+check_bpf_jit_status(FILE *file)
+{
+	int status = load_bpf_jit_status();
+
+	switch (status) {
+	case -1:
+		fprintf(file,
+			"ERROR:%s: could not read bpf_jit status: '%m'\n",
+			__func__);
+		return;
+
+	case 0:
+		fprintf(file,
+			"WARNING:%s: DISABLED.\n"
+			"\tPlease reffer to `man strace.ebpf`,"
+			" section 'Configuration'.\n"
+			"\tIt will allow to improve performance significantly\n"
+			"\tand drop appropriate problems.\n",
+			__func__);
+		return;
+
+	case 1:
+		fprintf(file, "INFO:%s: ENABLED.\n", __func__);
+		return;
+
+	case 2:
+		fprintf(file, "INFO:%s: DEBUG.\n", __func__);
+		return;
+
+	default:
+		fprintf(file,
+			"WARNING:%s: UNKNOWN. Please notify the author.\n",
+			__func__);
+		return;
+	}
+}
+
+
+/*
+ * This function recognises syscalls among in-kernel functions.
+ *
+ * 'line' holds 'size' characters, the newline not counted. A name is taken
+ * for a syscall if it is longer than "sys_", starts with "sys_" in any
+ * letter case and does not end with ']'.
+ */
+bool
+is_a_sc(const char *const line, const ssize_t size)
+{
+	static const char template[] = "sys_";
+
+	const size_t template_len = sizeof(template) - 1;
+
+	if (size <= (ssize_t)template_len)
+		return false;
+
+	if (strncasecmp(line, template, template_len))
+		return false;
+
+	if (line[size - 1] == ']')
+		return false;
+
+	return true;
+}
+
+const char debug_tracing[] = DEBUG_TRACING;
+const char debug_tracing_aff[] = DEBUG_TRACING DT_AFF;
+
+/*
+ * This function fetches syscall's list from running kernel.
+ *
+ * Every line of 'debug_tracing_aff' accepted by 'template' is copied to 'f',
+ * newline included; with a NULL 'template' all lines are copied. If the
+ * list cannot be opened an error is printed and nothing is written.
+ */
+void
+get_sc_list(FILE *f, template_t template)
+{
+	char *line = NULL;
+	size_t cap = 0;
+	ssize_t nread;
+
+	FILE *in = fopen(debug_tracing_aff, "r");
+
+	if (NULL == in) {
+		fprintf(stderr, "%s: ERROR: '%m'\n", __func__);
+		return;
+	}
+
+	while ((nread = getline(&line, &cap, in)) != -1) {
+		if (NULL != template) {
+			ssize_t name_len = nread;
+
+			/* the last line may come without a newline */
+			if (name_len > 0 && '\n' == line[name_len - 1])
+				name_len--;
+
+			if (!template(line, name_len))
+				continue;
+		}
+
+		fwrite(line, (size_t)nread, 1, f);
+	}
+
+	free(line);
+	fclose(in);
+	fflush(f);
+}
+
+/*
+ * Replace all occurrences of 'templt' in 'text' with 'str'.
+ *
+ * '*text' has to be a malloc-ed string; it is freed and replaced by a new
+ * malloc-ed string whenever something is substituted.
+ *
+ * The text is walked once from left to right and substituted text is not
+ * scanned again, so a 'str' which contains 'templt' no longer loops forever.
+ * Nothing is done for a NULL argument, a NULL '*text' or an empty 'templt'.
+ * If memory runs out '*text' keeps the substitutions made so far.
+ */
+void
+str_replace_all(char **const text, const char *templt, const char *str)
+{
+	size_t templt_len;
+	size_t str_len;
+	size_t pos = 0;
+
+	if (NULL == text || NULL == *text || NULL == templt || NULL == str)
+		return;
+
+	templt_len = strlen(templt);
+
+	/* an empty pattern matches everywhere and would never finish */
+	if (0 == templt_len)
+		return;
+
+	str_len = strlen(str);
+
+	for (;;) {
+		char *old = *text;
+		char *occ = strstr(old + pos, templt);
+		size_t head_len;
+		size_t tail_len;
+		char *fresh;
+
+		if (NULL == occ)
+			break;
+
+		head_len = (size_t)(occ - old);
+		/* what follows the match, terminator included */
+		tail_len = strlen(occ + templt_len) + 1;
+
+		fresh = malloc(head_len + str_len + tail_len);
+
+		if (NULL == fresh)
+			break;
+
+		memcpy(fresh, old, head_len);
+		memcpy(fresh + head_len, str, str_len);
+		memcpy(fresh + head_len + str_len, occ + templt_len, tail_len);
+
+		free(old);
+		*text = fresh;
+
+		/* go on behind the text just put in */
+		pos = head_len + str_len;
+	}
+}
+
+/*
+ * This function runs traced command passed through command line.
+ *
+ * The child stops itself with SIGSTOP before exec, so the parent has to
+ * send SIGCONT when it is ready.
+ *
+ * Returns the pid of the child, or -1 if fork() fails or there is no
+ * command to run.
+ */
+pid_t
+start_command(int argc, char *argv[])
+{
+	pid_t pid = -1;
+
+	if (argc < 1 || NULL == argv || NULL == argv[0])
+		return -1;
+
+	pid = fork();
+
+	switch (pid) {
+	case -1:
+		break;
+
+	case 0:
+		/* Wait until parent will be ready */
+		/*
+		 * for unknown reason sigwait(SIGCONT) and pause()
+		 * do not success with any signal.
+		 */
+		raise(SIGSTOP);
+
+		execvp(argv[0], argv);
+		/*
+		 * exec failed. Leave with _exit(): exit() would flush the
+		 * stdio buffers copied from the parent a second time.
+		 */
+		_exit(errno);
+
+	default:
+		break;
+	}
+
+	return pid;
+}
+
+/*
+ * SIGCHLD handler. Is used if "command" was provided on command line.
+ *
+ * Clears 'cont' when the traced process (args.pid) is gone, whether it
+ * exited or was killed by a signal. Stop and continue notifications are
+ * ignored.
+ */
+void
+sig_chld_handler(int sig, siginfo_t *si, void *unused)
+{
+	const bool is_gone = (si->si_code == CLD_EXITED)
+			|| (si->si_code == CLD_KILLED)
+			|| (si->si_code == CLD_DUMPED);
+
+	if (is_gone && args.pid == si->si_pid) {
+		cont = false;
+	}
+
+	(void) sig;
+	(void) unused;
+}
+
+/*
+ * Generic signal handler. Is used for notification of traced process about
+ * parent's death: the signal is passed on to args.pid, SIGSEGV being sent
+ * as SIGHUP, and 'cont' is cleared.
+ *
+ * Nothing is sent unless args.pid is a real pid: kill() with 0 or -1 would
+ * signal a whole process group or every process we are allowed to signal.
+ */
+void
+sig_transmit_handler(int sig, siginfo_t *si, void *unused)
+{
+	if (args.pid > 0)
+		kill(args.pid, SIGSEGV == sig ? SIGHUP : sig);
+
+	cont = false;
+
+	(void) si;
+	(void) unused;
+}
diff --git a/src/libstrace/utils.h b/src/libstrace/utils.h
new file mode 100644
index 000000000..a40433546
--- /dev/null
+++ b/src/libstrace/utils.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * utils.h -- utility functions + */ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +char *load_file(const char *fn); +char *load_file_from_disk(const char *const fn); +void check_bpf_jit_status(FILE *file); + +void save_trace_h(void); + +typedef bool (*template_t)(const char *line, ssize_t size); +bool is_a_sc(const char *const line, const ssize_t size); +void get_sc_list(FILE *f, template_t template); + +void str_replace_all(char **text, const char *templt, const char *str); + +pid_t start_command(int argc, char *argv[]); + +void sig_chld_handler(int sig, siginfo_t *si, void *unused); +void sig_transmit_handler(int sig, siginfo_t *si, void *unused); + +#define DEBUG_TRACING "/sys/kernel/debug/tracing" +#define DT_AFF "/available_filter_functions" + +extern const char debug_tracing[]; +extern const char debug_tracing_aff[]; + +#endif /* UTILS_H */ diff --git a/src/main.c b/src/main.c new file mode 100644 index 000000000..459f8d316 --- /dev/null +++ b/src/main.c @@ -0,0 +1,497 @@ +/* + * Copyright 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * main.c -- Trace syscalls. For Linux, uses BCC, ebpf.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+/* from bcc import BPF */
+#include
+#include
+#include
+
+#include "bpf.h"
+
+#include "main.h"
+#include "utils.h"
+#include "attach_probes.h"
+#include "ebpf_syscalls.h"
+#include "generate_ebpf.h"
+#include "print_event_cb.h"
+
+/* Usage text; written out verbatim by fprint_help() */
+static const char help_text[] = "\
+\n\
+Run the specified command and trace syscalls.\n\
+\n\
+USAGE:\n\
+\tstrace.ebpf [-h] [-t] [-X] [-p PID] [command [arg ...]]\n\
+\n\
+\t-t, --timestamp include timestamp in output\n\
+\t-X, --failed only show failed syscalls\n\
+\t-d, --debug enable debug output\n\
+\t-p, --pid trace this PID only. Command arg should be missing\n\
+\t-o, --output filename\n\
+\t-l, --format output logs format. Possible values:\n\
+\t 'bin', 'binary', 'hex', 'strace', 'list' & 'help'.\n\
+\t 'bin'/'binary' file format is described in trace.h.\n\
+\t Default: 'hex'\n\
+\t-K, --hex-separator\n\
+\t set field separator for hex logs. Default is '\\t'.\n\
+\t-e, --expr expression, 'help' or 'list' for supported list.\n\
+\t Default: trace=kp-kern-all.\n\
+\t-L, --list Print a list of all traceable syscalls\n\
+\t of the running kernel.\n\
+\t-R, --ll-list Print a list of all traceable low-level funcs\n\
+\t of the running kernel.\n\
+\t WARNING: really long. ~45000 functions.\n\
+\t-b, --builtin-list\n\
+\t Print a list of all syscalls known by glibc.\n\
+\t-h, --help print help\n\
+\n\
+examples:\n\
+ ./strace.ebpf -l hex # trace all syscalls in the system\n\
+ ./strace.ebpf -l hex ls # trace syscalls of ls command\n\
+ ./strace.ebpf -l hex -t ls # include timestamps\n\
+ ./strace.ebpf -l hex -X ls # only show failed syscalls\n\
+ ./strace.ebpf -l hex -p 342 # only trace PID 342\n\
+\n\
+WARNING: System-wide tracing can fillout your disk really fast.\n\
+";
+
+/*
+ * Description of '-e trace=' sets; written out verbatim by
+ * fprint_trace_list(). It is NOT a printf format, so '%' is not doubled.
+ */
+static const char trace_list_text[] = "\
+List of supported sets:\n"
+	" * Help:\n"
+	"\t - 'help', 'list' This list.\n"
+	"\n"
+	" * Intercepting using KProbe:\n"
+	"\t - 'kp-pmemfile' PMemFile - actual SCs\n"
+	"\t - 'kp-file' SCs with path in args\n"
+	"\t - 'kp-desc' SCs with fdesd in args\n"
+	"\t - 'kp-kern-all' All syscalls provided by kernel.\n"
+	"\t - A bit slower.\n"
+	"\t - 'kp-libc-all' All syscalls provided by glibc.\n"
+	"\t This list is 36% shorter\n"
+	"\t than previous and loads faster.\n"
+	"\t - 'kp-sc_glob:*' Choose SCs by glob pattern, such as 'set*'\n"
+	"\t - 'kp-sc_re:.*' Choose SCs by re pattern, such as 'set.*'\n"
+	"\t - 'kp-raw_glob:*' Choose low-level funcs by glob pattern,\n"
+	"\t such as 'raw_glob:ext4_*'\n"
+	"\t - 'kp-raw_re:.*' Choose low-level funcs by re pattern,\n"
+	"\t such as 'raw_glob:ext4_*'\n"
+	"\t - 'kp-XXXX' Choose exact single SC by name,\n"
+	"\t such as 'open'\n"
+	"\t - 'kp-raw:XXXX' Choose exact single low-level func by\n"
+	"\t name, such as 'raw:ext4_mkdir'\n"
+	"\n"
+	" * Intercepting using TracePoints:\n"
+	" Currently malfunctions because of this bug:\n"
+	" https://github.com/iovisor/bcc/issues/748\n"
+	"\t - 'tp-all' All syscalls provided by kernel.\n"
+	"\t This option starts many times faster than\n"
+	"\t corresponding kprobe ones, but can eat\n"
+	"\t more of CPU resource.\n"
+	"\n";
+
+/*
+ * This function prints help message in stream.
+ */
+static void
+fprint_help(FILE *f)
+{
+	fwrite(help_text, sizeof(help_text)-1, 1, f);
+}
+
+/*
+ * This function prints description of expressions in stream.
+ */
+static void
+fprint_trace_list(FILE *f)
+{
+	fwrite(trace_list_text, sizeof(trace_list_text)-1, 1, f);
+}
+
+struct args_t args;
+bool cont = true;
+FILE *out;
+enum out_fmt out_fmt;
+
+/* HACK Should be fixed in libbcc */
+extern int perf_reader_page_cnt;
+
+/* 8 Megabytes should be something close to reasonable */
+static unsigned out_buf_size = 8 * 1024 * 1024;
+
+/*
+ * Reports a fatal start-up error, kills the traced child (if a command was
+ * started) and returns the exit code main() should leave with.
+ */
+static int
+fail_startup(const char *what)
+{
+	fprintf(stderr, "ERROR: %s. Exiting.\n", what);
+
+	if (args.command) {
+		/* let's KILL child */
+		kill(args.pid, SIGKILL);
+	}
+
+	return EXIT_FAILURE;
+}
+
+/*
+ * Tool's entry point
+ *
+ * Parses the command line, optionally starts the traced command, generates
+ * and compiles the eBPF program, attaches probes and then polls the perf
+ * readers until 'cont' is cleared.
+ */
+int
+main(int argc, char *argv[])
+{
+	args.pid = -1;
+	args.out_sep_ch = '\t';
+
+	/*
+	 * XXX Should be set by cl options
+	 * if we need something over syscalls
+	 */
+	args.pr_arr_max = 1000;
+
+	/*
+	 * XXX Let's enlarge ring buffers. It's really improve situation
+	 * with lost events. In the future we should do it via cl options.
+	 */
+	perf_reader_page_cnt *= perf_reader_page_cnt;
+	perf_reader_page_cnt *= perf_reader_page_cnt;
+
+	while (1) {
+		int c;
+		int option_index = 0;
+
+		static struct option long_options[] = {
+			{"timestamp", no_argument, 0, 't'},
+			{"failed", no_argument, 0, 'X'},
+			{"help", no_argument, 0, 'h'},
+			{"debug", no_argument, 0, 'd'},
+			{"list", no_argument, 0, 'L'},
+			{"ll-list", no_argument, 0, 'R'},
+			{"builtin-list", no_argument, 0, 'b'},
+
+			{"pid", required_argument, 0, 'p'},
+			{"format", required_argument, 0, 'l'},
+			{"expr", required_argument, 0, 'e'},
+			{"output", required_argument, 0, 'o'},
+			{"hex-separator", required_argument, 0, 'K'},
+			{0, 0, 0, 0 }
+		};
+
+		/* leading '+': stop at the first non-option (the command) */
+		c = getopt_long(argc, argv, "+tXhdp:o:l:K:e:LRb",
+				long_options, &option_index);
+
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 't':
+			args.timestamp = true;
+			break;
+
+		case 'X':
+			args.failed = true;
+			break;
+
+		case 'h':
+			fprint_help(stdout);
+			exit(EXIT_SUCCESS);
+
+		case 'd':
+			args.debug = true;
+			break;
+
+		case 'p':
+			args.pid = atoi(optarg);
+			break;
+
+		case 'o':
+			args.out_fn = optarg;
+			break;
+
+		case 'K':
+			args.out_sep_ch = *optarg;
+			break;
+
+		case 'e':
+			if (!strcasecmp(optarg, "list") ||
+					!strcasecmp(optarg, "help")) {
+				fprintf(stderr,
+					"List of supported expressions:"
+					" 'help', 'list', 'trace=set'"
+					"\n");
+				exit(EXIT_SUCCESS);
+			} else if (!strcasecmp(optarg, "trace=help") ||
+					!strcasecmp(optarg,
+						"trace=list")) {
+				fprint_trace_list(stderr);
+				fprintf(stderr,
+					"You can combine sets"
+					" by using comma.\n");
+				exit(EXIT_SUCCESS);
+			}
+			args.expr = optarg;
+			break;
+
+		case 'l':
+			if (!strcasecmp(optarg, "list") ||
+					!strcasecmp(optarg, "help")) {
+				fprintf(stderr,
+					"List of supported expressions:"
+					"'bin', 'binary', 'hex', "
+					"'strace', 'list' & 'help'\n");
+				exit(EXIT_SUCCESS);
+			}
+			args.out_fmt_str = optarg;
+			out_fmt = out_fmt_str2enum(args.out_fmt_str);
+			break;
+
+		case 'L':
+			get_sc_list(stdout, is_a_sc);
+			exit(EXIT_SUCCESS);
+
+		case 'R':
+			get_sc_list(stdout, NULL);
+			exit(EXIT_SUCCESS);
+
+		case 'b':
+			for (unsigned i = 0; i < SC_TBL_SIZE; i++)
+				if (NULL != sc_tbl[i].hlr_name)
+					fprintf(stdout,
+						"%03d: %-20s\t %s\n",
+						sc_tbl[i].num,
+						sc_tbl[i].num_name,
+						sc_tbl[i].hlr_name);
+			exit(EXIT_SUCCESS);
+
+		case ':':
+			fprintf(stderr, "ERROR: "
+				"Missing mandatory option's "
+				"argument\n");
+			fprint_help(stderr);
+			exit(EXIT_FAILURE);
+
+		default:
+			fprintf(stderr, "ERROR: "
+				"Unknown option: '-%c'\n", c);
+			/* deliberate fall through to the help output */
+		case '?':
+			fprint_help(stderr);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	/* anything left after the options is the command to run */
+	if (optind < argc)
+		args.command = true;
+
+	/* Check for JIT acceleration of BPF */
+	check_bpf_jit_status(stderr);
+
+	if (NULL != args.out_fn) {
+		/*
+		 * NOTE(review): mode "w" truncates the file although the
+		 * message below speaks about appending - confirm the intent.
+		 */
+		out = fopen(args.out_fn, "w");
+
+		if (NULL == out) {
+			fprintf(stderr, "ERROR: "
+				"Failed to open '%s' for appending: '%m'\n",
+				args.out_fn);
+
+			exit(errno);
+		}
+	} else {
+		out = stdout;
+	}
+
+	/* XXX We should improve it. May be we should use fd directly */
+	/* setbuffer(out, NULL, out_buf_size); */
+	(void) out_buf_size;
+
+	if (args.pid != -1 && args.command) {
+		fprintf(stderr, "ERROR: "
+			"It is currently unsupported to watch for PID"
+			" and command simultaneously.\n");
+		fprint_help(stderr);
+		exit(EXIT_FAILURE);
+	}
+
+	if (args.command) {
+		struct sigaction sa;
+
+		/* the child stays stopped until SIGCONT is sent below */
+		args.pid = start_command(argc - optind, argv + optind);
+
+		if (args.pid == -1) {
+			fprintf(stderr, "ERROR: "
+				"Failed to run: '%s': %m. Exiting.\n",
+				argv[optind]);
+			exit(errno);
+		}
+
+		sa.sa_sigaction = sig_chld_handler;
+		sigemptyset(&sa.sa_mask);
+		sa.sa_flags = SA_RESTART | SA_SIGINFO |
+			SA_NOCLDSTOP | SA_NOCLDWAIT;
+
+		(void) sigaction(SIGCHLD, &sa, NULL);
+
+		sa.sa_sigaction = sig_transmit_handler;
+		sa.sa_flags = SA_RESTART;
+
+		(void) sigaction(SIGINT, &sa, NULL);
+		(void) sigaction(SIGHUP, &sa, NULL);
+		(void) sigaction(SIGQUIT, &sa, NULL);
+		(void) sigaction(SIGTERM, &sa, NULL);
+
+		sa.sa_flags = (int)(SA_RESTART | SA_RESETHAND);
+		(void) sigaction(SIGSEGV, &sa, NULL);
+	}
+
+	/* define BPF program */
+	char *bpf_str = generate_ebpf();
+
+	/* everything below dereferences or scans the generated text */
+	if (NULL == bpf_str)
+		return fail_startup("Failed to generate eBPF code");
+
+	if (0 < args.pid) {
+		char str[128];
+
+		snprintf(str, sizeof(str),
+			"if ((pid_tid >> 32) != %d) { return 0; }",
+			args.pid);
+
+		str_replace_all(&bpf_str, "PID_CHECK_HOOK", str);
+
+		if (!args.command) {
+			/* signal 0: existence check only, nothing is sent */
+			if (kill(args.pid, 0) == -1) {
+				fprintf(stderr,
+					"ERROR: Process with pid '%d'"
+					" does not exist: '%m'.\n", args.pid);
+
+				exit(errno);
+			}
+		}
+	} else {
+		str_replace_all(&bpf_str, "PID_CHECK_HOOK", "");
+	}
+
+	char *trace_h = load_file(ebpf_trace_h_file);
+
+	if (NULL == trace_h) {
+		free(bpf_str);
+		return fail_startup("Failed to load trace.h");
+	}
+
+	str_replace_all(&bpf_str, "#include \"trace.h\"\n", trace_h);
+
+	free(trace_h);
+
+	if (args.debug) {
+		fprintf(stderr, "\t>>>>> Generated eBPF code <<<<<\n");
+
+		if (bpf_str)
+			fwrite(bpf_str, strlen(bpf_str), 1, stderr);
+
+		fprintf(stderr, "\t>>>>> EndOf generated eBPF code <<<<<<\n");
+	}
+
+	save_trace_h();
+
+	/* initialize BPF */
+	struct bpf_ctx *b = calloc(1, sizeof(*b));
+
+	if (NULL == b) {
+		free(bpf_str);
+		return fail_startup("Out of memory");
+	}
+
+	/* Compiling of generated eBPF code */
+	b->module = bpf_module_create_c_from_string(bpf_str, 0, NULL, 0);
+	b->debug = args.debug;
+
+	free(bpf_str);
+
+	if (!attach_probes(b)) {
+		/* No probes were attached */
+		fprintf(stderr,
+			"ERROR: No probes were attached. Exiting.\n");
+
+		if (args.command) {
+			/* let's KILL child */
+			kill(args.pid, SIGKILL);
+		}
+
+		return EXIT_FAILURE;
+	}
+
+	/* header */
+	print_header[out_fmt](argc, argv);
+
+	/*
+	 * Attach callback to perf output. "events" is a name of class declared
+	 * with BPF_PERF_OUTPUT() in trace.c.
+	 *
+	 * XXX Most likely we should utilise here str_replace for consistence
+	 * increasing.
+	 */
+#define PERF_OUTPUT_NAME "events"
+	int res = attach_callback_to_perf_output(b,
+			PERF_OUTPUT_NAME, print_event_cb[out_fmt]);
+
+	/* zero result is treated as success here */
+	if (!res) {
+		if (args.command) {
+			/* let's child go */
+			kill(args.pid, SIGCONT);
+		}
+	} else {
+		fprintf(stderr,
+			"ERROR: Can't attach to perf output '%s'. Exiting.\n",
+			PERF_OUTPUT_NAME);
+
+		if (args.command) {
+			/* let's KILL child */
+			kill(args.pid, SIGKILL);
+		}
+
+		detach_all(b);
+		return EXIT_FAILURE;
+	}
+
+	struct perf_reader *readers[b->pr_arr_qty];
+
+	for (unsigned i = 0; i < b->pr_arr_qty; i++)
+		readers[i] = b->pr_arr[i]->pr;
+
+	/* 'cont' is cleared by the signal handlers or by the check below */
+	while (cont) {
+		(void) perf_reader_poll((int)b->pr_arr_qty, readers, -1);
+
+		if (!args.command && 0 < args.pid) {
+			if (kill(args.pid, 0) == -1) {
+				cont = false;
+
+				fprintf(stderr,
+					"ERROR: Process with pid '%d'"
+					" has disappeared : '%m'.\n",
+					args.pid);
+
+				fprintf(stderr, "Exit.\n");
+			}
+		}
+	}
+
+
+	detach_all(b);
+	return EXIT_SUCCESS;
+}
diff --git a/src/make-redis.sh b/src/make-redis.sh
new file mode 100755
index 000000000..d7bf2ece0
--- /dev/null
+++ b/src/make-redis.sh
@@ -0,0 +1,55 @@
+#!/bin/bash -x
+#
+# Copyright 2014-2016, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# make-redis.sh - Script for running redis-benchmark while redis-server
+# is traced.
+#
+# Usage: make-redis.sh [command-prefix ...]
+# The arguments are put in front of "redis-server redis.conf"
+# (presumably the tracer command line).
+#
+
+# job control is needed for 'fg' below
+set -m
+
+# Should be same as in redis.conf
+: > /tmp/appendonly.aof
+
+# The external time(1) binary is needed: the 'time' keyword of bash does
+# not understand '-v'. 'type -P' searches PATH only and fails if the
+# binary is missing, instead of silently running '-p' as a command.
+time_bin=$(type -P time) || {
+	echo "make-redis.sh: time(1) binary not found in PATH" >&2
+	exit 1
+}
+
+"$time_bin" -p -v "$@" redis-server redis.conf >> redis-server.log &
+
+# give the (traced) server time to start up
+sleep 13
+
+redis-benchmark -q -n 100000
+
+sleep 3
+
+redis-cli shutdown
+
+# wait for the server job to finish
+fg
+
+exit 0
diff --git a/src/redis.conf b/src/redis.conf
new file mode 100644
index 000000000..750799001
--- /dev/null
+++ b/src/redis.conf
@@ -0,0 +1,943 @@
+# Redis configuration file example.
+# +# Note that in order to read the configuration file, Redis must be +# started with the file path as first argument: +# +# ./redis-server /path/to/redis.conf + +# Note on units: when memory size is needed, it is possible to specify +# it in the usual form of 1k 5GB 4M and so forth: +# +# 1k => 1000 bytes +# 1kb => 1024 bytes +# 1m => 1000000 bytes +# 1mb => 1024*1024 bytes +# 1g => 1000000000 bytes +# 1gb => 1024*1024*1024 bytes +# +# units are case insensitive so 1GB 1Gb 1gB are all the same. + +################################## INCLUDES ################################### + +# Include one or more other config files here. This is useful if you +# have a standard template that goes to all Redis servers but also need +# to customize a few per-server settings. Include files can include +# other files, so use this wisely. +# +# Notice option "include" won't be rewritten by command "CONFIG REWRITE" +# from admin or Redis Sentinel. Since Redis always uses the last processed +# line as value of a configuration directive, you'd better put includes +# at the beginning of this file to avoid overwriting config change at runtime. +# +# If instead you are interested in using includes to override configuration +# options, it is better to use include as the last line. +# +# include /path/to/local.conf +# include /path/to/other.conf + +################################ GENERAL ##################################### + +# By default Redis does not run as a daemon. Use 'yes' if you need it. +# Note that Redis will write a pid file in /var/run/redis.pid when daemonized. +daemonize no + +# When running daemonized, Redis writes a pid file in /var/run/redis.pid by +# default. You can specify a custom pid file location here. +pidfile /var/run/redis/redis-server.pid + +# Accept connections on the specified port, default is 6379. +# If port 0 is specified Redis will not listen on a TCP socket. +port 6379 + +# TCP listen() backlog. 
+# +# In high requests-per-second environments you need an high backlog in order +# to avoid slow clients connections issues. Note that the Linux kernel +# will silently truncate it to the value of /proc/sys/net/core/somaxconn so +# make sure to raise both the value of somaxconn and tcp_max_syn_backlog +# in order to get the desired effect. +tcp-backlog 511 + +# By default Redis listens for connections from all the network interfaces +# available on the server. It is possible to listen to just one or multiple +# interfaces using the "bind" configuration directive, followed by one or +# more IP addresses. +# +# Examples: +# +# bind 192.168.1.100 10.0.0.1 +bind 127.0.0.1 + +# Specify the path for the Unix socket that will be used to listen for +# incoming connections. There is no default, so Redis will not listen +# on a unix socket when not specified. +# +# unixsocket /var/run/redis/redis.sock +# unixsocketperm 700 + +# Close the connection after a client is idle for N seconds (0 to disable) +timeout 0 + +# TCP keepalive. +# +# If non-zero, use SO_KEEPALIVE to send TCP ACKs to clients in absence +# of communication. This is useful for two reasons: +# +# 1) Detect dead peers. +# 2) Take the connection alive from the point of view of network +# equipment in the middle. +# +# On Linux, the specified value (in seconds) is the period used to send ACKs. +# Note that to close the connection the double of the time is needed. +# On other kernels the period depends on the kernel configuration. +# +# A reasonable value for this option is 60 seconds. +tcp-keepalive 0 + +# Specify the server verbosity level. +# This can be one of: +# debug (a lot of information, useful for development/testing) +# verbose (many rarely useful info, but not a mess like the debug level) +# notice (moderately verbose, what you want in production probably) +# warning (only very important / critical messages are logged) +loglevel notice + +# Specify the log file name. 
Also the empty string can be used to force +# Redis to log on the standard output. Note that if you use standard +# output for logging but daemonize, logs will be sent to /dev/null +#logfile /var/log/redis/redis-server.log + +# To enable logging to the system logger, just set 'syslog-enabled' to yes, +# and optionally update the other syslog parameters to suit your needs. +# syslog-enabled no + +# Specify the syslog identity. +# syslog-ident redis + +# Specify the syslog facility. Must be USER or between LOCAL0-LOCAL7. +# syslog-facility local0 + +# Set the number of databases. The default database is DB 0, you can select +# a different one on a per-connection basis using SELECT <dbid> where +# dbid is a number between 0 and 'databases'-1 +databases 16 + +################################ SNAPSHOTTING ################################ +# +# Save the DB on disk: +# +# save <seconds> <changes> +# +# Will save the DB if both the given number of seconds and the given +# number of write operations against the DB occurred. +# +# In the example below the behaviour will be to save: +# after 900 sec (15 min) if at least 1 key changed +# after 300 sec (5 min) if at least 10 keys changed +# after 60 sec if at least 10000 keys changed +# +# Note: you can disable saving completely by commenting out all "save" lines. +# +# It is also possible to remove all the previously configured save +# points by adding a save directive with a single empty string argument +# like in the following example: +# +save "" + +#save 900 1 +#save 300 10 +#save 60 10000 + +# By default Redis will stop accepting writes if RDB snapshots are enabled +# (at least one save point) and the latest background save failed. +# This will make the user aware (in a hard way) that data is not persisting +# on disk properly, otherwise chances are that no one will notice and some +# disaster will happen. +# +# If the background saving process will start working again Redis will +# automatically allow writes again.
+# +# However if you have setup your proper monitoring of the Redis server +# and persistence, you may want to disable this feature so that Redis will +# continue to work as usual even if there are problems with disk, +# permissions, and so forth. +stop-writes-on-bgsave-error yes + +# Compress string objects using LZF when dump .rdb databases? +# For default that's set to 'yes' as it's almost always a win. +# If you want to save some CPU in the saving child set it to 'no' but +# the dataset will likely be bigger if you have compressible values or keys. +rdbcompression yes + +# Since version 5 of RDB a CRC64 checksum is placed at the end of the file. +# This makes the format more resistant to corruption but there is a performance +# hit to pay (around 10%) when saving and loading RDB files, so you can disable it +# for maximum performances. +# +# RDB files created with checksum disabled have a checksum of zero that will +# tell the loading code to skip the check. +rdbchecksum yes + +# The filename where to dump the DB +dbfilename dump.rdb + +# The working directory. +# +# The DB will be written inside this directory, with the filename specified +# above using the 'dbfilename' configuration directive. +# +# The Append Only File will also be created inside this directory. +# +# Note that you must specify a directory here, not a file name. +dir /tmp + +################################# REPLICATION ################################# + +# Master-Slave replication. Use slaveof to make a Redis instance a copy of +# another Redis server. A few things to understand ASAP about Redis replication. +# +# 1) Redis replication is asynchronous, but you can configure a master to +# stop accepting writes if it appears to be not connected with at least +# a given number of slaves. +# 2) Redis slaves are able to perform a partial resynchronization with the +# master if the replication link is lost for a relatively small amount of +# time. 
You may want to configure the replication backlog size (see the next +# sections of this file) with a sensible value depending on your needs. +# 3) Replication is automatic and does not need user intervention. After a +# network partition slaves automatically try to reconnect to masters +# and resynchronize with them. +# +# slaveof <masterip> <masterport> + +# If the master is password protected (using the "requirepass" configuration +# directive below) it is possible to tell the slave to authenticate before +# starting the replication synchronization process, otherwise the master will +# refuse the slave request. +# +# masterauth <master-password> + +# When a slave loses its connection with the master, or when the replication +# is still in progress, the slave can act in two different ways: +# +# 1) if slave-serve-stale-data is set to 'yes' (the default) the slave will +# still reply to client requests, possibly with out of date data, or the +# data set may just be empty if this is the first synchronization. +# +# 2) if slave-serve-stale-data is set to 'no' the slave will reply with +# an error "SYNC with master in progress" to all the kind of commands +# but to INFO and SLAVEOF. +# +slave-serve-stale-data yes + +# You can configure a slave instance to accept writes or not. Writing against +# a slave instance may be useful to store some ephemeral data (because data +# written on a slave will be easily deleted after resync with the master) but +# may also cause problems if clients are writing to it because of a +# misconfiguration. +# +# Since Redis 2.6 by default slaves are read-only. +# +# Note: read only slaves are not designed to be exposed to untrusted clients +# on the internet. It's just a protection layer against misuse of the instance. +# Still a read only slave exports by default all the administrative commands +# such as CONFIG, DEBUG, and so forth. To a limited extent you can improve +# security of read only slaves using 'rename-command' to shadow all the +# administrative / dangerous commands.
+slave-read-only yes + +# Replication SYNC strategy: disk or socket. +# +# ------------------------------------------------------- +# WARNING: DISKLESS REPLICATION IS EXPERIMENTAL CURRENTLY +# ------------------------------------------------------- +# +# New slaves and reconnecting slaves that are not able to continue the replication +# process just receiving differences, need to do what is called a "full +# synchronization". An RDB file is transmitted from the master to the slaves. +# The transmission can happen in two different ways: +# +# 1) Disk-backed: The Redis master creates a new process that writes the RDB +# file on disk. Later the file is transferred by the parent +# process to the slaves incrementally. +# 2) Diskless: The Redis master creates a new process that directly writes the +# RDB file to slave sockets, without touching the disk at all. +# +# With disk-backed replication, while the RDB file is generated, more slaves +# can be queued and served with the RDB file as soon as the current child producing +# the RDB file finishes its work. With diskless replication instead once +# the transfer starts, new slaves arriving will be queued and a new transfer +# will start when the current one terminates. +# +# When diskless replication is used, the master waits a configurable amount of +# time (in seconds) before starting the transfer in the hope that multiple slaves +# will arrive and the transfer can be parallelized. +# +# With slow disks and fast (large bandwidth) networks, diskless replication +# works better. +repl-diskless-sync no + +# When diskless replication is enabled, it is possible to configure the delay +# the server waits in order to spawn the child that transfers the RDB via socket +# to the slaves. +# +# This is important since once the transfer starts, it is not possible to serve +# new slaves arriving, that will be queued for the next RDB transfer, so the server +# waits a delay in order to let more slaves arrive. 
+# +# The delay is specified in seconds, and by default is 5 seconds. To disable +# it entirely just set it to 0 seconds and the transfer will start ASAP. +repl-diskless-sync-delay 5 + +# Slaves send PINGs to server in a predefined interval. It's possible to change +# this interval with the repl_ping_slave_period option. The default value is 10 +# seconds. +# +# repl-ping-slave-period 10 + +# The following option sets the replication timeout for: +# +# 1) Bulk transfer I/O during SYNC, from the point of view of slave. +# 2) Master timeout from the point of view of slaves (data, pings). +# 3) Slave timeout from the point of view of masters (REPLCONF ACK pings). +# +# It is important to make sure that this value is greater than the value +# specified for repl-ping-slave-period otherwise a timeout will be detected +# every time there is low traffic between the master and the slave. +# +# repl-timeout 60 + +# Disable TCP_NODELAY on the slave socket after SYNC? +# +# If you select "yes" Redis will use a smaller number of TCP packets and +# less bandwidth to send data to slaves. But this can add a delay for +# the data to appear on the slave side, up to 40 milliseconds with +# Linux kernels using a default configuration. +# +# If you select "no" the delay for data to appear on the slave side will +# be reduced but more bandwidth will be used for replication. +# +# By default we optimize for low latency, but in very high traffic conditions +# or when the master and slaves are many hops away, turning this to "yes" may +# be a good idea. +repl-disable-tcp-nodelay no + +# Set the replication backlog size. The backlog is a buffer that accumulates +# slave data when slaves are disconnected for some time, so that when a slave +# wants to reconnect again, often a full resync is not needed, but a partial +# resync is enough, just passing the portion of data the slave missed while +# disconnected. 
+# +# The bigger the replication backlog, the longer the time the slave can be +# disconnected and later be able to perform a partial resynchronization. +# +# The backlog is only allocated once there is at least a slave connected. +# +# repl-backlog-size 1mb + +# After a master has no longer connected slaves for some time, the backlog +# will be freed. The following option configures the amount of seconds that +# need to elapse, starting from the time the last slave disconnected, for +# the backlog buffer to be freed. +# +# A value of 0 means to never release the backlog. +# +# repl-backlog-ttl 3600 + +# The slave priority is an integer number published by Redis in the INFO output. +# It is used by Redis Sentinel in order to select a slave to promote into a +# master if the master is no longer working correctly. +# +# A slave with a low priority number is considered better for promotion, so +# for instance if there are three slaves with priority 10, 100, 25 Sentinel will +# pick the one with priority 10, that is the lowest. +# +# However a special priority of 0 marks the slave as not able to perform the +# role of master, so a slave with priority of 0 will never be selected by +# Redis Sentinel for promotion. +# +# By default the priority is 100. +slave-priority 100 + +# It is possible for a master to stop accepting writes if there are less than +# N slaves connected, having a lag less or equal than M seconds. +# +# The N slaves need to be in "online" state. +# +# The lag in seconds, that must be <= the specified value, is calculated from +# the last ping received from the slave, that is usually sent every second. +# +# This option does not GUARANTEE that N replicas will accept the write, but +# will limit the window of exposure for lost writes in case not enough slaves +# are available, to the specified number of seconds. 
+# +# For example to require at least 3 slaves with a lag <= 10 seconds use: +# +# min-slaves-to-write 3 +# min-slaves-max-lag 10 +# +# Setting one or the other to 0 disables the feature. +# +# By default min-slaves-to-write is set to 0 (feature disabled) and +# min-slaves-max-lag is set to 10. + +################################## SECURITY ################################### + +# Require clients to issue AUTH before processing any other +# commands. This might be useful in environments in which you do not trust +# others with access to the host running redis-server. +# +# This should stay commented out for backward compatibility and because most +# people do not need auth (e.g. they run their own servers). +# +# Warning: since Redis is pretty fast an outside user can try up to +# 150k passwords per second against a good box. This means that you should +# use a very strong password otherwise it will be very easy to break. +# +# requirepass foobared + +# Command renaming. +# +# It is possible to change the name of dangerous commands in a shared +# environment. For instance the CONFIG command may be renamed into something +# hard to guess so that it will still be available for internal-use tools +# but not available for general clients. +# +# Example: +# +# rename-command CONFIG b840fc02d524045429941cc15f59e41cb7be6c52 +# +# It is also possible to completely kill a command by renaming it into +# an empty string: +# +# rename-command CONFIG "" +# +# Please note that changing the name of commands that are logged into the +# AOF file or transmitted to slaves may cause problems. + +################################### LIMITS #################################### + +# Set the max number of connected clients at the same time. 
By default +# this limit is set to 10000 clients, however if the Redis server is not +# able to configure the process file limit to allow for the specified limit +# the max number of allowed clients is set to the current file limit +# minus 32 (as Redis reserves a few file descriptors for internal uses). +# +# Once the limit is reached Redis will close all the new connections sending +# an error 'max number of clients reached'. +# +# maxclients 10000 + +# Don't use more memory than the specified amount of bytes. +# When the memory limit is reached Redis will try to remove keys +# according to the eviction policy selected (see maxmemory-policy). +# +# If Redis can't remove keys according to the policy, or if the policy is +# set to 'noeviction', Redis will start to reply with errors to commands +# that would use more memory, like SET, LPUSH, and so on, and will continue +# to reply to read-only commands like GET. +# +# This option is usually useful when using Redis as an LRU cache, or to set +# a hard memory limit for an instance (using the 'noeviction' policy). +# +# WARNING: If you have slaves attached to an instance with maxmemory on, +# the size of the output buffers needed to feed the slaves are subtracted +# from the used memory count, so that network problems / resyncs will +# not trigger a loop where keys are evicted, and in turn the output +# buffer of slaves is full with DELs of keys evicted triggering the deletion +# of more keys, and so forth until the database is completely emptied. +# +# In short... if you have slaves attached it is suggested that you set a lower +# limit for maxmemory so that there is some free RAM on the system for slave +# output buffers (but this is not needed if the policy is 'noeviction'). +# +# maxmemory <bytes> + +# MAXMEMORY POLICY: how Redis will select what to remove when maxmemory +# is reached.
You can select among six behaviors: +# +# volatile-lru -> remove the key with an expire set using an LRU algorithm +# allkeys-lru -> remove any key according to the LRU algorithm +# volatile-random -> remove a random key with an expire set +# allkeys-random -> remove a random key, any key +# volatile-ttl -> remove the key with the nearest expire time (minor TTL) +# noeviction -> don't evict at all, just return an error on write operations +# +# Note: with any of the above policies, Redis will return an error on write +# operations, when there are no suitable keys for eviction. +# +# At the date of writing these commands are: set setnx setex append +# incr decr rpush lpush rpushx lpushx linsert lset rpoplpush sadd +# sinter sinterstore sunion sunionstore sdiff sdiffstore zadd zincrby +# zunionstore zinterstore hset hsetnx hmset hincrby incrby decrby +# getset mset msetnx exec sort +# +# The default is: +# +# maxmemory-policy noeviction + +# LRU and minimal TTL algorithms are not precise algorithms but approximated +# algorithms (in order to save memory), so you can tune it for speed or +# accuracy. By default Redis will check five keys and pick the one that was +# used less recently, you can change the sample size using the following +# configuration directive. +# +# The default of 5 produces good enough results. 10 approximates very closely +# true LRU but costs a bit more CPU. 3 is very fast but not very accurate. +# +# maxmemory-samples 5 + +############################## APPEND ONLY MODE ############################### + +# By default Redis asynchronously dumps the dataset on disk. This mode is +# good enough in many applications, but an issue with the Redis process or +# a power outage may result in a few minutes of writes lost (depending on +# the configured save points). +# +# The Append Only File is an alternative persistence mode that provides +# much better durability.
For instance using the default data fsync policy +# (see later in the config file) Redis can lose just one second of writes in a +# dramatic event like a server power outage, or a single write if something +# wrong with the Redis process itself happens, but the operating system is +# still running correctly. +# +# AOF and RDB persistence can be enabled at the same time without problems. +# If the AOF is enabled on startup Redis will load the AOF, that is the file +# with the better durability guarantees. +# +# Please check http://redis.io/topics/persistence for more information. + +appendonly yes + +# The name of the append only file (default: "appendonly.aof") + +appendfilename "appendonly.aof" + +# The fsync() call tells the Operating System to actually write data on disk +# instead of waiting for more data in the output buffer. Some OS will really flush +# data on disk, some other OS will just try to do it ASAP. +# +# Redis supports three different modes: +# +# no: don't fsync, just let the OS flush the data when it wants. Faster. +# always: fsync after every write to the append only log. Slow, Safest. +# everysec: fsync only one time every second. Compromise. +# +# The default is "everysec", as that's usually the right compromise between +# speed and data safety. It's up to you to understand if you can relax this to +# "no" that will let the operating system flush the output buffer when +# it wants, for better performances (but if you can live with the idea of +# some data loss consider the default persistence mode that's snapshotting), +# or on the contrary, use "always" that's very slow but a bit safer than +# everysec. +# +# More details please check the following article: +# http://antirez.com/post/redis-persistence-demystified.html +# +# If unsure, use "everysec". 
+ +# appendfsync always +appendfsync everysec +# appendfsync no + +# When the AOF fsync policy is set to always or everysec, and a background +# saving process (a background save or AOF log background rewriting) is +# performing a lot of I/O against the disk, in some Linux configurations +# Redis may block too long on the fsync() call. Note that there is no fix for +# this currently, as even performing fsync in a different thread will block +# our synchronous write(2) call. +# +# In order to mitigate this problem it's possible to use the following option +# that will prevent fsync() from being called in the main process while a +# BGSAVE or BGREWRITEAOF is in progress. +# +# This means that while another child is saving, the durability of Redis is +# the same as "appendfsync no". In practical terms, this means that it is +# possible to lose up to 30 seconds of log in the worst scenario (with the +# default Linux settings). +# +# If you have latency problems turn this to "yes". Otherwise leave it as +# "no" that is the safest pick from the point of view of durability. + +no-appendfsync-on-rewrite no + +# Automatic rewrite of the append only file. +# Redis is able to automatically rewrite the log file implicitly calling +# BGREWRITEAOF when the AOF log size grows by the specified percentage. +# +# This is how it works: Redis remembers the size of the AOF file after the +# latest rewrite (if no rewrite has happened since the restart, the size of +# the AOF at startup is used). +# +# This base size is compared to the current size. If the current size is +# bigger than the specified percentage, the rewrite is triggered. Also +# you need to specify a minimal size for the AOF file to be rewritten, this +# is useful to avoid rewriting the AOF file even if the percentage increase +# is reached but it is still pretty small. +# +# Specify a percentage of zero in order to disable the automatic AOF +# rewrite feature.
+ +auto-aof-rewrite-percentage 100 +auto-aof-rewrite-min-size 4mb + +# An AOF file may be found to be truncated at the end during the Redis +# startup process, when the AOF data gets loaded back into memory. +# This may happen when the system where Redis is running +# crashes, especially when an ext4 filesystem is mounted without the +# data=ordered option (however this can't happen when Redis itself +# crashes or aborts but the operating system still works correctly). +# +# Redis can either exit with an error when this happens, or load as much +# data as possible (the default now) and start if the AOF file is found +# to be truncated at the end. The following option controls this behavior. +# +# If aof-load-truncated is set to yes, a truncated AOF file is loaded and +# the Redis server starts emitting a log to inform the user of the event. +# Otherwise if the option is set to no, the server aborts with an error +# and refuses to start. When the option is set to no, the user requires +# to fix the AOF file using the "redis-check-aof" utility before to restart +# the server. +# +# Note that if the AOF file will be found to be corrupted in the middle +# the server will still exit with an error. This option only applies when +# Redis will try to read more data from the AOF file but not enough bytes +# will be found. +aof-load-truncated yes + +################################ LUA SCRIPTING ############################### + +# Max execution time of a Lua script in milliseconds. +# +# If the maximum execution time is reached Redis will log that a script is +# still in execution after the maximum allowed time and will start to +# reply to queries with an error. +# +# When a long running script exceeds the maximum execution time only the +# SCRIPT KILL and SHUTDOWN NOSAVE commands are available. The first can be +# used to stop a script that did not yet called write commands. 
The second +# is the only way to shut down the server in the case a write command was +# already issued by the script but the user doesn't want to wait for the natural +# termination of the script. +# +# Set it to 0 or a negative value for unlimited execution without warnings. +lua-time-limit 5000 + +################################ REDIS CLUSTER ############################### +# +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# WARNING EXPERIMENTAL: Redis Cluster is considered to be stable code, however +# in order to mark it as "mature" we need to wait for a non trivial percentage +# of users to deploy it in production. +# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +# Normal Redis instances can't be part of a Redis Cluster; only nodes that are +# started as cluster nodes can. In order to start a Redis instance as a +# cluster node enable the cluster support uncommenting the following: +# +# cluster-enabled yes + +# Every cluster node has a cluster configuration file. This file is not +# intended to be edited by hand. It is created and updated by Redis nodes. +# Every Redis Cluster node requires a different cluster configuration file. +# Make sure that instances running in the same system do not have +# overlapping cluster configuration file names. +# +# cluster-config-file nodes-6379.conf + +# Cluster node timeout is the amount of milliseconds a node must be unreachable +# for it to be considered in failure state. +# Most other internal time limits are multiple of the node timeout. +# +# cluster-node-timeout 15000 + +# A slave of a failing master will avoid to start a failover if its data +# looks too old. 
+# +# There is no simple way for a slave to actually have a exact measure of +# its "data age", so the following two checks are performed: +# +# 1) If there are multiple slaves able to failover, they exchange messages +# in order to try to give an advantage to the slave with the best +# replication offset (more data from the master processed). +# Slaves will try to get their rank by offset, and apply to the start +# of the failover a delay proportional to their rank. +# +# 2) Every single slave computes the time of the last interaction with +# its master. This can be the last ping or command received (if the master +# is still in the "connected" state), or the time that elapsed since the +# disconnection with the master (if the replication link is currently down). +# If the last interaction is too old, the slave will not try to failover +# at all. +# +# The point "2" can be tuned by user. Specifically a slave will not perform +# the failover if, since the last interaction with the master, the time +# elapsed is greater than: +# +# (node-timeout * slave-validity-factor) + repl-ping-slave-period +# +# So for example if node-timeout is 30 seconds, and the slave-validity-factor +# is 10, and assuming a default repl-ping-slave-period of 10 seconds, the +# slave will not try to failover if it was not able to talk with the master +# for longer than 310 seconds. +# +# A large slave-validity-factor may allow slaves with too old data to failover +# a master, while a too small value may prevent the cluster from being able to +# elect a slave at all. +# +# For maximum availability, it is possible to set the slave-validity-factor +# to a value of 0, which means, that slaves will always try to failover the +# master regardless of the last time they interacted with the master. +# (However they'll always try to apply a delay proportional to their +# offset rank). 
+# +# Zero is the only value able to guarantee that when all the partitions heal +# the cluster will always be able to continue. +# +# cluster-slave-validity-factor 10 + +# Cluster slaves are able to migrate to orphaned masters, that are masters +# that are left without working slaves. This improves the cluster ability +# to resist to failures as otherwise an orphaned master can't be failed over +# in case of failure if it has no working slaves. +# +# Slaves migrate to orphaned masters only if there are still at least a +# given number of other working slaves for their old master. This number +# is the "migration barrier". A migration barrier of 1 means that a slave +# will migrate only if there is at least 1 other working slave for its master +# and so forth. It usually reflects the number of slaves you want for every +# master in your cluster. +# +# Default is 1 (slaves migrate only if their masters remain with at least +# one slave). To disable migration just set it to a very large value. +# A value of 0 can be set but is useful only for debugging and dangerous +# in production. +# +# cluster-migration-barrier 1 + +# By default Redis Cluster nodes stop accepting queries if they detect there +# is at least an hash slot uncovered (no available node is serving it). +# This way if the cluster is partially down (for example a range of hash slots +# are no longer covered) all the cluster becomes, eventually, unavailable. +# It automatically returns available as soon as all the slots are covered again. +# +# However sometimes you want the subset of the cluster which is working, +# to continue to accept queries for the part of the key space that is still +# covered. In order to do so, just set the cluster-require-full-coverage +# option to no. +# +# cluster-require-full-coverage yes + +# In order to setup your cluster make sure to read the documentation +# available at http://redis.io web site. 
+ +################################## SLOW LOG ################################### + +# The Redis Slow Log is a system to log queries that exceeded a specified +# execution time. The execution time does not include the I/O operations +# like talking with the client, sending the reply and so forth, +# but just the time needed to actually execute the command (this is the only +# stage of command execution where the thread is blocked and can not serve +# other requests in the meantime). +# +# You can configure the slow log with two parameters: one tells Redis +# what is the execution time, in microseconds, to exceed in order for the +# command to get logged, and the other parameter is the length of the +# slow log. When a new command is logged the oldest one is removed from the +# queue of logged commands. + +# The following time is expressed in microseconds, so 1000000 is equivalent +# to one second. Note that a negative number disables the slow log, while +# a value of zero forces the logging of every command. +slowlog-log-slower-than 10000 + +# There is no limit to this length. Just be aware that it will consume memory. +# You can reclaim memory used by the slow log with SLOWLOG RESET. +slowlog-max-len 128 + +################################ LATENCY MONITOR ############################## + +# The Redis latency monitoring subsystem samples different operations +# at runtime in order to collect data related to possible sources of +# latency of a Redis instance. +# +# Via the LATENCY command this information is available to the user that can +# print graphs and obtain reports. +# +# The system only logs operations that were performed in a time equal or +# greater than the amount of milliseconds specified via the +# latency-monitor-threshold configuration directive. When its value is set +# to zero, the latency monitor is turned off. 
+# +# By default latency monitoring is disabled since it is mostly not needed +# if you don't have latency issues, and collecting data has a performance +# impact, that while very small, can be measured under big load. Latency +# monitoring can easily be enabled at runtime using the command +# "CONFIG SET latency-monitor-threshold <milliseconds>" if needed. +latency-monitor-threshold 0 + +############################# EVENT NOTIFICATION ############################## + +# Redis can notify Pub/Sub clients about events happening in the key space. +# This feature is documented at http://redis.io/topics/notifications +# +# For instance if keyspace events notification is enabled, and a client +# performs a DEL operation on key "foo" stored in the Database 0, two +# messages will be published via Pub/Sub: +# +# PUBLISH __keyspace@0__:foo del +# PUBLISH __keyevent@0__:del foo +# +# It is possible to select the events that Redis will notify among a set +# of classes. Every class is identified by a single character: +# +# K Keyspace events, published with __keyspace@<db>__ prefix. +# E Keyevent events, published with __keyevent@<db>__ prefix. +# g Generic commands (non-type specific) like DEL, EXPIRE, RENAME, ... +# $ String commands +# l List commands +# s Set commands +# h Hash commands +# z Sorted set commands +# x Expired events (events generated every time a key expires) +# e Evicted events (events generated when a key is evicted for maxmemory) +# A Alias for g$lshzxe, so that the "AKE" string means all the events. +# +# The "notify-keyspace-events" takes as argument a string that is composed +# of zero or multiple characters. The empty string means that notifications +# are disabled.
+# +# Example: to enable list and generic events, from the point of view of the +# event name, use: +# +# notify-keyspace-events Elg +# +# Example 2: to get the stream of the expired keys subscribing to channel +# name __keyevent@0__:expired use: +# +# notify-keyspace-events Ex +# +# By default all notifications are disabled because most users don't need +# this feature and the feature has some overhead. Note that if you don't +# specify at least one of K or E, no events will be delivered. +notify-keyspace-events "" + +############################### ADVANCED CONFIG ############################### + +# Hashes are encoded using a memory efficient data structure when they have a +# small number of entries, and the biggest entry does not exceed a given +# threshold. These thresholds can be configured using the following directives. +hash-max-ziplist-entries 512 +hash-max-ziplist-value 64 + +# Similarly to hashes, small lists are also encoded in a special way in order +# to save a lot of space. The special representation is only used when +# you are under the following limits: +list-max-ziplist-entries 512 +list-max-ziplist-value 64 + +# Sets have a special encoding in just one case: when a set is composed +# of just strings that happen to be integers in radix 10 in the range +# of 64 bit signed integers. +# The following configuration setting sets the limit in the size of the +# set in order to use this special memory saving encoding. +set-max-intset-entries 512 + +# Similarly to hashes and lists, sorted sets are also specially encoded in +# order to save a lot of space. This encoding is only used when the length and +# elements of a sorted set are below the following limits: +zset-max-ziplist-entries 128 +zset-max-ziplist-value 64 + +# HyperLogLog sparse representation bytes limit. The limit includes the +# 16 bytes header. When an HyperLogLog using the sparse representation crosses +# this limit, it is converted into the dense representation. 
+# +# A value greater than 16000 is totally useless, since at that point the +# dense representation is more memory efficient. +# +# The suggested value is ~ 3000 in order to have the benefits of +# the space efficient encoding without slowing down too much PFADD, +# which is O(N) with the sparse encoding. The value can be raised to +# ~ 10000 when CPU is not a concern, but space is, and the data set is +# composed of many HyperLogLogs with cardinality in the 0 - 15000 range. +hll-sparse-max-bytes 3000 + +# Active rehashing uses 1 millisecond every 100 milliseconds of CPU time in +# order to help rehashing the main Redis hash table (the one mapping top-level +# keys to values). The hash table implementation Redis uses (see dict.c) +# performs a lazy rehashing: the more operation you run into a hash table +# that is rehashing, the more rehashing "steps" are performed, so if the +# server is idle the rehashing is never complete and some more memory is used +# by the hash table. +# +# The default is to use this millisecond 10 times every second in order to +# actively rehash the main dictionaries, freeing memory when possible. +# +# If unsure: +# use "activerehashing no" if you have hard latency requirements and it is +# not a good thing in your environment that Redis can reply from time to time +# to queries with 2 milliseconds delay. +# +# use "activerehashing yes" if you don't have such hard requirements but +# want to free memory asap when possible. +activerehashing yes + +# The client output buffer limits can be used to force disconnection of clients +# that are not reading data from the server fast enough for some reason (a +# common reason is that a Pub/Sub client can't consume messages as fast as the +# publisher can produce them). 
+# +# The limit can be set differently for the three different classes of clients: +# +# normal -> normal clients including MONITOR clients +# slave -> slave clients +# pubsub -> clients subscribed to at least one pubsub channel or pattern +# +# The syntax of every client-output-buffer-limit directive is the following: +# +# client-output-buffer-limit <class> <hard limit> <soft limit> <soft seconds> +# +# A client is immediately disconnected once the hard limit is reached, or if +# the soft limit is reached and remains reached for the specified number of +# seconds (continuously). +# So for instance if the hard limit is 32 megabytes and the soft limit is +# 16 megabytes / 10 seconds, the client will get disconnected immediately +# if the size of the output buffers reach 32 megabytes, but will also get +# disconnected if the client reaches 16 megabytes and continuously overcomes +# the limit for 10 seconds. +# +# By default normal clients are not limited because they don't receive data +# without asking (in a push way), but just after a request, so only +# asynchronous clients may create a scenario where data is requested faster +# than it can read. +# +# Instead there is a default limit for pubsub and slave clients, since +# subscribers and slaves receive data in a push fashion. +# +# Both the hard or the soft limit can be disabled by setting them to zero. +client-output-buffer-limit normal 0 0 0 +client-output-buffer-limit slave 256mb 64mb 60 +client-output-buffer-limit pubsub 32mb 8mb 60 + +# Redis calls an internal function to perform many background tasks, like +# closing connections of clients in timeout, purging expired keys that are +# never requested, and so forth. +# +# Not all tasks are performed with the same frequency, but Redis checks for +# tasks to perform according to the specified "hz" value. +# +# By default "hz" is set to 10.
Raising the value will use more CPU when +# Redis is idle, but at the same time will make Redis more responsive when +# there are many keys expiring at the same time, and timeouts may be +# handled with more precision. +# +# The range is between 1 and 500, however a value over 100 is usually not +# a good idea. Most users should use the default of 10 and raise this up to +# 100 only in environments where very low latency is required. +hz 10 + +# When a child rewrites the AOF file, if the following option is enabled +# the file will be fsync-ed every 32 MB of data generated. This is useful +# in order to commit the file to the disk more incrementally and avoid +# big latency spikes. +aof-rewrite-incremental-fsync yes diff --git a/utils/md2man.sh b/utils/md2man.sh new file mode 100755 index 000000000..8100fc4ed --- /dev/null +++ b/utils/md2man.sh @@ -0,0 +1,66 @@ +#!/bin/bash -e +# +# Copyright 2016, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

#
# md2man.sh -- convert markdown to groff man pages
#
# usage: md2man.sh file template outfile
#
#   file      markdown source; its "title:" and "date:" lines supply the
#             man page title, section and version
#   template  pandoc template, passed through as --template
#   outfile   path the generated man page is written to
#
# This script converts markdown file into groff man page using pandoc.
# It performs some pre- and post-processing for better results:
# - parse input file for YAML metadata block and read man page title,
#   section and version
# - cut-off metadata block and license (everything before "# NAME #")
# - unindent code blocks (an ".IP" line directly followed by ".nf"
#   becomes ".PP")
#
# Exit status: 0 on success, 2 on a wrong number of arguments, non-zero
# when a required tool or the input is missing or any pipeline stage fails.
# On a pipeline failure the partially written outfile is removed.
#

# Set the options here as well as in the shebang: options given only in the
# shebang are lost when the script is started as "bash md2man.sh ...".
set -euo pipefail

if [[ $# -ne 3 ]]; then
  printf 'usage: %s file template outfile\n' "${0##*/}" >&2
  exit 2
fi

filename=$1
template=$2
outfile=$3

if [[ ! -r "$filename" ]]; then
  printf '%s: cannot read input file: %s\n' "${0##*/}" "$filename" >&2
  exit 1
fi

if ! command -v pandoc >/dev/null; then
  printf '%s: pandoc not found in PATH\n' "${0##*/}" >&2
  exit 1
fi

# title:   run of lowercase letters following "title:"
# section: single digit in parentheses on the "title:" line
# version: everything following "date:"
title=$(sed -n 's/^title:\ *\([a-z]*\).*$/\1/p' <"$filename")
section=$(sed -n 's/^title:.*(\([0-9]\)).*$/\1/p' <"$filename")
version=$(sed -n 's/^date:\ *\(.*\)$/\1/p' <"$filename")
year=$(date +"%Y")

# sed program for the post-processing step: join each ".IP" line with the
# line after it and, when that next line is ".nf", rewrite ".IP" to ".PP".
readonly unindent_code_blocks='/^\.IP/{
N
/\n\.nf/{
s/IP/PP/
}
}'

# pandoc must write to stdout (no -o) so that the post-processing sed really
# sees the generated page; only the final stage writes the output file.
if ! sed -n -e '/# NAME #/,$p' <"$filename" |
  pandoc -s -t man --template="$template" \
    -V title="$title" -V section="$section" \
    -V description='"NVM Library"' -V version="$version" \
    -V year="$year" |
  sed "$unindent_code_blocks" >"$outfile"; then
  # Do not leave a truncated man page behind for later build steps.
  rm -f -- "$outfile"
  exit 1
fi