|
| 1 | +/* |
| 2 | + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one |
| 3 | + * or more contributor license agreements. Licensed under the Elastic License; |
| 4 | + * you may not use this file except in compliance with the Elastic License. |
| 5 | + */ |
| 6 | +#include "seccomp/CSystemCallFilter.h" |
| 7 | + |
| 8 | +#include <core/CLogger.h> |
| 9 | + |
| 10 | +#include <linux/audit.h> |
| 11 | +#include <linux/filter.h> |
| 12 | +#include <sys/prctl.h> |
| 13 | +#include <sys/syscall.h> |
| 14 | + |
| 15 | +#include <cerrno> |
| 16 | +#include <cstdint> |
| 17 | +#include <cstring> |
| 18 | + |
| 19 | +namespace ml { |
| 20 | +namespace seccomp { |
| 21 | + |
| 22 | +namespace { |
// The old x32 ABI always has bit 30 set in the sys call numbers.
// The x64 architecture should fail these calls
const std::uint32_t UPPER_NR_LIMIT = 0x3FFFFFFF;

// Offset to the nr field in struct seccomp_data
const std::uint32_t SECCOMP_DATA_NR_OFFSET = 0x00;
// Offset to the arch field in struct seccomp_data
const std::uint32_t SECCOMP_DATA_ARCH_OFFSET = 0x04;

// Copied from seccomp.h
// seccomp.h cannot be included as it was added in Linux kernel 3.17
// and this must build on older versions.
// TODO: remove once the minimum build kernel version supports seccomp
// Each definition is guarded so that a definition already in scope is
// reused rather than redefined.
#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2
#endif
#ifndef SECCOMP_RET_ERRNO
#define SECCOMP_RET_ERRNO 0x00050000U
#endif
#ifndef SECCOMP_RET_ALLOW
#define SECCOMP_RET_ALLOW 0x7fff0000U
#endif
#ifndef SECCOMP_RET_DATA
#define SECCOMP_RET_DATA 0x0000ffffU
#endif

// Added in Linux 3.5
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif

// The seccomp BPF program: a whitelist of system calls for x86_64.
//
// Layout (zero based instruction indices):
//    0      load the architecture
//    1      architecture check
//    2      load the system call number
//    3      x32 ABI check
//    4-37   one equality test per allowed system call
//    38     return "disallow" (fail the call with EACCES)
//    39     return "allow"
//
// Jump offsets are relative to the instruction FOLLOWING the jump, so a
// jump at index i with offset n continues at index i + 1 + n. Every
// non-zero offset below must therefore land on index 38 (disallow) or
// index 39 (allow). When a system call is added or removed, ALL offsets
// above the changed line have to be adjusted, including the two checks
// at indices 1 and 3.
const struct sock_filter FILTER[] = {
    // Load architecture from 'seccomp_data' buffer into accumulator
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SECCOMP_DATA_ARCH_OFFSET),
    // Jump to disallow if architecture is not X86_64 (1 + 1 + 36 == 38).
    // The accumulator still holds the architecture at this point, so the
    // jump must not land in the middle of the system call comparisons.
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 36),
    // Load the system call number into accumulator
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SECCOMP_DATA_NR_OFFSET),
    // Only applies to X86_64 arch. Jump to disallow for calls using the x32 ABI
    BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, UPPER_NR_LIMIT, 34, 0),
    // Allowed sys calls, jump to return allow on match
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 34, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 33, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_writev, 32, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_lseek, 31, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_lstat, 30, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_readlink, 29, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_stat, 28, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_fstat, 27, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_open, 26, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_close, 25, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_connect, 24, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clone, 23, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_statfs, 22, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_dup2, 21, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rmdir, 20, 0),    // for forecast temp storage
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getdents, 19, 0), // for forecast temp storage
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_openat, 18, 0),   // for forecast temp storage
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_tgkill, 17, 0),   // for the crash handler
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigaction, 16, 0), // for the crash handler
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, 15, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, 14, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_madvise, 13, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_unlink, 12, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mknod, 11, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_nanosleep, 10, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_robust_list, 9, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mprotect, 8, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, 7, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mmap, 6, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getuid, 5, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 4, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_access, 3, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_brk, 2, 0),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit, 1, 0),
    // Disallow call with error code EACCES
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | (EACCES & SECCOMP_RET_DATA)),
    // Allow call
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW)};
| 94 | + |
| 95 | +bool canUseSeccompBpf() { |
| 96 | + // This call is expected to fail due to the nullptr argument |
| 97 | + // but the failure mode informs us if the kernel was configured |
| 98 | + // with CONFIG_SECCOMP_FILTER |
| 99 | + // http://man7.org/linux/man-pages/man2/prctl.2.html |
| 100 | + int result = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, nullptr); |
| 101 | + int configError = errno; |
| 102 | + if (result != -1) { |
| 103 | + LOG_ERROR(<< "prctl set seccomp with null argument should have failed"); |
| 104 | + return false; |
| 105 | + } |
| 106 | + |
| 107 | + // If the kernel is not configured with CONFIG_SECCOMP_FILTER |
| 108 | + // or CONFIG_SECCOMP the error is EINVAL. EFAULT indicates the |
| 109 | + // seccomp filters are enabled but the 3rd argument (nullptr) |
| 110 | + // was invalid. |
| 111 | + return configError == EFAULT; |
| 112 | +} |
| 113 | +} |
| 114 | + |
| 115 | +void CSystemCallFilter::installSystemCallFilter() { |
| 116 | + if (canUseSeccompBpf()) { |
| 117 | + LOG_DEBUG(<< "Seccomp BPF filters available"); |
| 118 | + |
| 119 | + // Ensure more permissive privileges cannot be set in future. |
| 120 | + // This must be set before installing the filter. |
| 121 | + // PR_SET_NO_NEW_PRIVS was aded in kernel 3.5 |
| 122 | + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
| 123 | + LOG_ERROR(<< "prctl PR_SET_NO_NEW_PRIVS failed: " << std::strerror(errno)); |
| 124 | + return; |
| 125 | + } |
| 126 | + |
| 127 | + struct sock_fprog prog = { |
| 128 | + .len = static_cast<unsigned short>(sizeof(FILTER) / sizeof(FILTER[0])), |
| 129 | + .filter = const_cast<sock_filter*>(FILTER)}; |
| 130 | + |
| 131 | + // Install the filter. |
| 132 | + // prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, filter) was introduced |
| 133 | + // in kernel 3.5. This is functionally equivalent to |
| 134 | + // seccomp(SECCOMP_SET_MODE_FILTER, 0, filter) which was added in |
| 135 | + // kernel 3.17. We choose the older more compatible function. |
| 136 | + // Note this precludes the use of calling seccomp() with the |
| 137 | + // SECCOMP_FILTER_FLAG_TSYNC which is acceptable if the filter |
| 138 | + // is installed by the main thread before any other threads are |
| 139 | + // spawned. |
| 140 | + if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { |
| 141 | + LOG_ERROR(<< "Unable to install Seccomp BPF: " << std::strerror(errno)); |
| 142 | + } else { |
| 143 | + LOG_DEBUG(<< "Seccomp BPF installed"); |
| 144 | + } |
| 145 | + |
| 146 | + } else { |
| 147 | + LOG_DEBUG(<< "Seccomp BPF not available"); |
| 148 | + } |
| 149 | +} |
| 150 | +} |
| 151 | +} |
0 commit comments