Currently, it's not documented anywhere, but the only way to create a Program that has the IS_LINUX_KERNEL flag set is to go through drgn_program_set_core_dump_fd_internal():

Lines 327 to 691 in 64d82dd
```c
static struct drgn_error *
drgn_program_set_core_dump_fd_internal(struct drgn_program *prog, int fd,
					const char *path)
{
	struct drgn_error *err;
	GElf_Ehdr ehdr_mem, *ehdr;
	bool had_platform;
	bool is_64_bit, little_endian, is_kdump;
	size_t phnum, i;
	size_t num_file_segments, j;
	bool have_phys_addrs = false;
	bool have_qemu_note = false;
	const char *vmcoreinfo_note = NULL;
	size_t vmcoreinfo_size = 0;
	bool have_nt_taskstruct = false, is_proc_kcore;
	bool have_vmcoreinfo = prog->vmcoreinfo.raw;
	bool had_vmcoreinfo = have_vmcoreinfo;

	prog->core_fd = fd;

	err = has_kdump_signature(prog, path, &is_kdump);
	if (err)
		goto out_fd;
	if (is_kdump) {
		err = drgn_program_set_kdump(prog);
		if (err)
			goto out_fd;
		return NULL;
	}

	elf_version(EV_CURRENT);

	prog->core = elf_begin(prog->core_fd, ELF_C_READ, NULL);
	if (!prog->core) {
		err = drgn_error_libelf();
		goto out_fd;
	}

	ehdr = gelf_getehdr(prog->core, &ehdr_mem);
	if (!ehdr || ehdr->e_type != ET_CORE) {
		err = drgn_error_format(DRGN_ERROR_INVALID_ARGUMENT,
					"not an ELF core file");
		goto out_elf;
	}

	had_platform = prog->has_platform;
	if (!had_platform) {
		struct drgn_platform platform;
		drgn_platform_from_elf(ehdr, &platform);
		drgn_program_set_platform(prog, &platform);
	}

	is_64_bit = ehdr->e_ident[EI_CLASS] == ELFCLASS64;
	little_endian = ehdr->e_ident[EI_DATA] == ELFDATA2LSB;

	if (elf_getphdrnum(prog->core, &phnum) != 0) {
		err = drgn_error_libelf();
		goto out_platform;
	}

	/*
	 * First pass: count the number of loadable segments, check if p_paddr
	 * is valid, and check for notes.
	 */
	num_file_segments = 0;
	for (i = 0; i < phnum; i++) {
		GElf_Phdr phdr_mem, *phdr;

		phdr = gelf_getphdr(prog->core, i, &phdr_mem);
		if (!phdr) {
			err = drgn_error_libelf();
			goto out_notes;
		}

		if (phdr->p_type == PT_LOAD) {
			if (phdr->p_paddr)
				have_phys_addrs = true;
			num_file_segments++;
		} else if (phdr->p_type == PT_NOTE) {
			Elf_Data *data;
			size_t offset;
			GElf_Nhdr nhdr;
			size_t name_offset, desc_offset;

			data = elf_getdata_rawchunk(prog->core, phdr->p_offset,
						    phdr->p_filesz,
						    note_header_type(phdr->p_align));
			if (!data) {
				err = drgn_error_libelf();
				goto out_notes;
			}

			offset = 0;
			while (offset < data->d_size &&
			       (offset = gelf_getnote(data, offset, &nhdr,
						      &name_offset,
						      &desc_offset))) {
				const char *name, *desc;

				name = (char *)data->d_buf + name_offset;
				desc = (char *)data->d_buf + desc_offset;
				if (nhdr.n_namesz == sizeof("CORE") &&
				    memcmp(name, "CORE", sizeof("CORE")) == 0) {
					if (nhdr.n_type == NT_TASKSTRUCT)
						have_nt_taskstruct = true;
				} else if (nhdr.n_namesz == sizeof("LINUX") &&
					   memcmp(name, "LINUX",
						  sizeof("LINUX")) == 0) {
					if (nhdr.n_type == NT_ARM_PAC_MASK &&
					    nhdr.n_descsz >=
					    2 * sizeof(uint64_t)) {
						memcpy(&prog->aarch64_insn_pac_mask,
						       (uint64_t *)desc + 1,
						       sizeof(uint64_t));
						if (little_endian !=
						    HOST_LITTLE_ENDIAN)
							bswap_64(prog->aarch64_insn_pac_mask);
					}
				} else if (nhdr.n_namesz == sizeof("VMCOREINFO") &&
					   memcmp(name, "VMCOREINFO",
						  sizeof("VMCOREINFO")) == 0) {
					vmcoreinfo_note = desc;
					vmcoreinfo_size = nhdr.n_descsz;
					/*
					 * This is either a vmcore or
					 * /proc/kcore, so even a p_paddr of 0
					 * may be valid.
					 */
					have_phys_addrs = true;
					have_vmcoreinfo = true;
				} else if (nhdr.n_namesz == sizeof("QEMU") &&
					   memcmp(name, "QEMU",
						  sizeof("QEMU")) == 0) {
					have_qemu_note = true;
				}
			}
		}
	}

	if (have_nt_taskstruct) {
		/*
		 * If the core file has an NT_TASKSTRUCT note and is in /proc,
		 * then it's probably /proc/kcore.
		 */
		struct statfs fs;

		if (fstatfs(prog->core_fd, &fs) == -1) {
			err = drgn_error_create_os("fstatfs", errno, path);
			if (err)
				goto out_notes;
		}
		is_proc_kcore = fs.f_type == 0x9fa0; /* PROC_SUPER_MAGIC */
	} else {
		is_proc_kcore = false;
	}

	if (have_vmcoreinfo && !is_proc_kcore) {
		char *env;

		/* Use libkdumpfile for ELF vmcores if it was requested. */
		env = getenv("DRGN_USE_LIBKDUMPFILE_FOR_ELF");
		if (env && atoi(env)) {
			err = drgn_program_set_kdump(prog);
			if (err)
				goto out_notes;
			return NULL;
		}
	}

	prog->file_segments = malloc_array(num_file_segments,
					   sizeof(*prog->file_segments));
	if (!prog->file_segments) {
		err = &drgn_enomem;
		goto out_notes;
	}

	bool pgtable_reader =
		(is_proc_kcore || have_vmcoreinfo) &&
		prog->platform.arch->linux_kernel_pgtable_iterator_next;
	if (pgtable_reader) {
		/*
		 * Try to read any memory that isn't in the core dump via the
		 * page table.
		 */
		err = drgn_program_add_memory_segment(prog, 0, UINT64_MAX,
						       read_memory_via_pgtable,
						       prog, false);
		if (err)
			goto out_segments;
	}

	/* Second pass: add the segments. */
	for (i = 0, j = 0; i < phnum && j < num_file_segments; i++) {
		GElf_Phdr phdr_mem, *phdr;

		phdr = gelf_getphdr(prog->core, i, &phdr_mem);
		if (!phdr) {
			err = drgn_error_libelf();
			goto out_segments;
		}

		if (phdr->p_type != PT_LOAD)
			continue;

		prog->file_segments[j].file_offset = phdr->p_offset;
		prog->file_segments[j].file_size = phdr->p_filesz;
		prog->file_segments[j].fd = prog->core_fd;
		prog->file_segments[j].eio_is_fault = false;
		/*
		 * p_filesz < p_memsz is ambiguous for core dumps. The ELF
		 * specification says that "if the segment's memory size p_memsz
		 * is larger than the file size p_filesz, the 'extra' bytes are
		 * defined to hold the value 0 and to follow the segment's
		 * initialized area."
		 *
		 * However, the Linux kernel generates userspace core dumps with
		 * segments with p_filesz < p_memsz to indicate that the range
		 * between p_filesz and p_memsz was filtered out (see
		 * coredump_filter in core(5)). These bytes were not necessarily
		 * zeroes in the process's memory, which contradicts the ELF
		 * specification in a way.
		 *
		 * As of Linux 5.19, /proc/kcore and /proc/vmcore never have
		 * segments with p_filesz < p_memsz. However, makedumpfile
		 * creates segments with p_filesz < p_memsz to indicate ranges
		 * that were excluded. This is similar to Linux userspace core
		 * dumps, except that makedumpfile can also exclude ranges that
		 * were all zeroes.
		 *
		 * So, for userspace core dumps, we want to fault for ranges
		 * between p_filesz and p_memsz to indicate that the memory was
		 * not saved rather than lying and returning zeroes. For
		 * /proc/kcore, we don't expect to see p_filesz < p_memsz but we
		 * fault to be safe. For Linux kernel core dumps, we can't
		 * distinguish between memory that was excluded because it was
		 * all zeroes and memory that was excluded by makedumpfile for
		 * another reason, so we're forced to always return zeroes.
		 */
		prog->file_segments[j].zerofill = have_vmcoreinfo && !is_proc_kcore;
		err = drgn_program_add_memory_segment(prog, phdr->p_vaddr,
						       phdr->p_memsz,
						       drgn_read_memory_file,
						       &prog->file_segments[j],
						       false);
		if (err)
			goto out_segments;
		if (have_phys_addrs &&
		    phdr->p_paddr != (is_64_bit ? UINT64_MAX : UINT32_MAX)) {
			err = drgn_program_add_memory_segment(prog,
							      phdr->p_paddr,
							      phdr->p_memsz,
							      drgn_read_memory_file,
							      &prog->file_segments[j],
							      true);
			if (err)
				goto out_segments;
		}
		j++;
	}

	/*
	 * Before Linux kernel commit 464920104bf7 ("/proc/kcore: update
	 * physical address for kcore ram and text") (in v4.11), p_paddr in
	 * /proc/kcore is always zero. If we know the address of the direct
	 * mapping, we can still add physical segments. This needs to be a third
	 * pass, as we may need to read virtual memory to determine the mapping.
	 */
	if (is_proc_kcore && !have_phys_addrs &&
	    prog->platform.arch->linux_kernel_live_direct_mapping_fallback) {
		uint64_t direct_mapping, direct_mapping_size;

		err = prog->platform.arch->linux_kernel_live_direct_mapping_fallback(prog,
										     &direct_mapping,
										     &direct_mapping_size);
		if (err)
			goto out_segments;

		for (i = 0, j = 0; i < phnum && j < num_file_segments; i++) {
			GElf_Phdr phdr_mem, *phdr;

			phdr = gelf_getphdr(prog->core, i, &phdr_mem);
			if (!phdr) {
				err = drgn_error_libelf();
				goto out_segments;
			}

			if (phdr->p_type != PT_LOAD)
				continue;

			if (phdr->p_vaddr >= direct_mapping &&
			    phdr->p_vaddr - direct_mapping + phdr->p_memsz <=
			    direct_mapping_size) {
				uint64_t phys_addr;

				phys_addr = phdr->p_vaddr - direct_mapping;
				err = drgn_program_add_memory_segment(prog,
								      phys_addr,
								      pgtable_reader ?
								      phdr->p_filesz :
								      phdr->p_memsz,
								      drgn_read_memory_file,
								      &prog->file_segments[j],
								      true);
				if (err)
					goto out_segments;
			}
			j++;
		}
	}

	if (vmcoreinfo_note && !prog->vmcoreinfo.raw) {
		err = drgn_program_parse_vmcoreinfo(prog, vmcoreinfo_note,
						    vmcoreinfo_size);
		if (err)
			goto out_segments;
	}

	if (is_proc_kcore) {
		if (!have_vmcoreinfo) {
			err = read_vmcoreinfo_fallback(prog);
			if (err)
				goto out_segments;
		}
		prog->flags |= (DRGN_PROGRAM_IS_LINUX_KERNEL |
				DRGN_PROGRAM_IS_LIVE |
				DRGN_PROGRAM_IS_LOCAL);
		elf_end(prog->core);
		prog->core = NULL;
	} else if (have_vmcoreinfo) {
		prog->flags |= DRGN_PROGRAM_IS_LINUX_KERNEL;
	} else if (have_qemu_note) {
		err = drgn_error_create(DRGN_ERROR_INVALID_ARGUMENT,
					"unrecognized QEMU memory dump; "
					"for Linux guests, run QEMU with '-device vmcoreinfo', "
					"compile the kernel with CONFIG_FW_CFG_SYSFS and CONFIG_KEXEC, "
					"and load the qemu_fw_cfg kernel module "
					"before dumping the guest memory "
					"(requires Linux >= 4.17 and QEMU >= 2.11)");
		goto out_segments;
	}

	if (prog->flags & DRGN_PROGRAM_IS_LINUX_KERNEL) {
		err = drgn_program_finish_set_kernel(prog);
		if (err)
			goto out_segments;
	}

	return NULL;

out_segments:
	drgn_memory_reader_deinit(&prog->reader);
	drgn_memory_reader_init(&prog->reader);
	free(prog->file_segments);
	prog->file_segments = NULL;
out_notes:
	// Reset anything we parsed from ELF notes.
	prog->aarch64_insn_pac_mask = 0;
	// Free vmcoreinfo buffer if it was not provided by the caller
	if (!had_vmcoreinfo) {
		free(prog->vmcoreinfo.raw);
		memset(&prog->vmcoreinfo, 0, sizeof(prog->vmcoreinfo));
	}
out_platform:
	prog->has_platform = had_platform;
out_elf:
	elf_end(prog->core);
	prog->core = NULL;
out_fd:
	close(prog->core_fd);
	prog->core_fd = -1;
	return err;
}
```
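For reference, the only documented ways to reach this code path from the Python API today (and therefore, as far as I can tell, the only ways the flag gets set) are the core dump entry points, roughly:

```python
import drgn

# Both of these funnel into drgn_program_set_core_dump_fd_internal(), which is
# where IS_LINUX_KERNEL gets set for vmcores and /proc/kcore.
prog = drgn.program_from_core_dump("/path/to/vmcore")

# or, equivalently:
prog = drgn.Program()
prog.set_core_dump("/proc/kcore")  # program_from_kernel() opens /proc/kcore too
```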
This means that it's not possible to create a Program that represents the kernel with a custom memory reader. This was the goal of #246, which enabled a really interesting use case, even if setting program flags may not be the preferred way to do so.
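To make the limitation concrete, this is roughly the kind of program you can build today with a custom memory reader (read_guest_memory and my_transport are made-up placeholders for whatever transport you have, e.g. a hypervisor or remote-target interface); there's simply no supported step that marks it as a kernel program:

```python
import drgn

def read_guest_memory(address, count, offset, physical):
    # drgn's read callback signature; my_transport is a placeholder, not a real API
    return my_transport.read(address, count)

prog = drgn.Program(drgn.Platform(drgn.Architecture.X86_64))
prog.add_memory_segment(0, 2**64 - 1, read_guest_memory)

# Nothing in the public API sets this flag for us:
assert not (prog.flags & drgn.ProgramFlags.IS_LINUX_KERNEL)
```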
It seems to me that as of now, it wouldn't be terribly hard to support this. As far as I can tell, the following things would be necessary:
- The VMCOREINFO would need to have been provided at the creation of the program, along with the platform.
- A Linux kernel object finder should be created and added to the Program.
- A page table memory reader might need to be created and added to the Program (though it could be unnecessary).
- The IS_LINUX_KERNEL flag should be set.
With that, I believe that the next time Program.loaded_modules() is called, the kernel module iterator would be activated, and assuming the memory readers work, drgn should be able to proceed as normal.
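In other words, something shaped roughly like this (set_vmcoreinfo() and setup_linux_kernel() are invented names for the missing pieces, and read_guest_memory / vmcoreinfo_bytes are assumed to come from the caller, as in the sketch above):

```python
prog = drgn.Program(drgn.Platform(drgn.Architecture.X86_64))
prog.add_memory_segment(0, 2**64 - 1, read_guest_memory)

# Invented APIs, only to illustrate the steps in the list above:
prog.set_vmcoreinfo(vmcoreinfo_bytes)  # parse caller-provided VMCOREINFO
prog.setup_linux_kernel()              # add the kernel object finder, optionally
                                       # a page table reader, set IS_LINUX_KERNEL

# After that, the existing kernel module iterator should take over:
for entry in prog.loaded_modules():
    print(entry)
```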
I was thinking one interesting way to achieve this could be a Program.linux_kernel_main_module() method which does the above and then returns the kernel's main module. However, I'm not sure that it's the right API, so I feel like that's an area to discuss.
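With that proposed method, the whole thing might collapse to something like the following (again, linux_kernel_main_module() doesn't exist today, and how it would receive the VMCOREINFO is part of what needs discussing):

```python
prog = drgn.Program(drgn.Platform(drgn.Architecture.X86_64))
prog.add_memory_segment(0, 2**64 - 1, read_guest_memory)

# Proposed API: does the setup described above and hands back the main
# (vmlinux) module so the caller can attach debug info to it.
main_module = prog.linux_kernel_main_module(vmcoreinfo_bytes)
print(main_module.name)
```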