Skip to content

Commit 7eb95e0

Browse files
Gaurav Batra authored and gregkh committed
powerpc/pseries/iommu: IOMMU table is not initialized for kdump over SR-IOV
[ Upstream commit 09a3c1e ] When kdump kernel tries to copy dump data over SR-IOV, LPAR panics due to NULL pointer exception: Kernel attempted to read user page (0) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000000 Faulting instruction address: 0xc000000020847ad4 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: mlx5_core(+) vmx_crypto pseries_wdt papr_scm libnvdimm mlxfw tls psample sunrpc fuse overlay squashfs loop CPU: 12 PID: 315 Comm: systemd-udevd Not tainted 6.4.0-Test102+ #12 Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_008) hv:phyp pSeries NIP: c000000020847ad4 LR: c00000002083b2dc CTR: 00000000006cd18c REGS: c000000029162ca0 TRAP: 0300 Not tainted (6.4.0-Test102+) MSR: 800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE> CR: 48288244 XER: 00000008 CFAR: c00000002083b2d8 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1 ...
NIP _find_next_zero_bit+0x24/0x110 LR bitmap_find_next_zero_area_off+0x5c/0xe0 Call Trace: dev_printk_emit+0x38/0x48 (unreliable) iommu_area_alloc+0xc4/0x180 iommu_range_alloc+0x1e8/0x580 iommu_alloc+0x60/0x130 iommu_alloc_coherent+0x158/0x2b0 dma_iommu_alloc_coherent+0x3c/0x50 dma_alloc_attrs+0x170/0x1f0 mlx5_cmd_init+0xc0/0x760 [mlx5_core] mlx5_function_setup+0xf0/0x510 [mlx5_core] mlx5_init_one+0x84/0x210 [mlx5_core] probe_one+0x118/0x2c0 [mlx5_core] local_pci_probe+0x68/0x110 pci_call_probe+0x68/0x200 pci_device_probe+0xbc/0x1a0 really_probe+0x104/0x540 __driver_probe_device+0xb4/0x230 driver_probe_device+0x54/0x130 __driver_attach+0x158/0x2b0 bus_for_each_dev+0xa8/0x130 driver_attach+0x34/0x50 bus_add_driver+0x16c/0x300 driver_register+0xa4/0x1b0 __pci_register_driver+0x68/0x80 mlx5_init+0xb8/0x100 [mlx5_core] do_one_initcall+0x60/0x300 do_init_module+0x7c/0x2b0 At the time of LPAR dump, before kexec hands over control to kdump kernel, DDWs (Dynamic DMA Windows) are scanned and added to the FDT. For the SR-IOV case, default DMA window "ibm,dma-window" is removed from the FDT and DDW added, for the device. Now, kexec hands over control to the kdump kernel. When the kdump kernel initializes, PCI busses are scanned and IOMMU group/tables created, in pci_dma_bus_setup_pSeriesLP(). For the SR-IOV case, there is no "ibm,dma-window". The original commit: b1fc44e, fixes the path where memory is pre-mapped (direct mapped) to the DDW. When TCEs are direct mapped, there is no need to initialize IOMMU tables. iommu_table_setparms_lpar() only considers "ibm,dma-window" property when initializing IOMMU table. In the scenario where TCEs are dynamically allocated for SR-IOV, newly created IOMMU table is not initialized. Later, when the device driver tries to enter TCEs for the SR-IOV device, NULL pointer exception is thrown from iommu_area_alloc(). The fix is to initialize the IOMMU table with DDW property stored in the FDT. There are 2 points to remember: 1.
For the dedicated adapter, kdump kernel would encounter both default and DDW in FDT. In this case, DDW property is used to initialize the IOMMU table. 2. A DDW could be direct or dynamic mapped. kdump kernel would initialize IOMMU table and mark the existing DDW as "dynamic". This works fine since, at the time of table initialization, iommu_table_clear() makes some space in the DDW, for some predefined number of TCEs which are needed for kdump to succeed. Fixes: b1fc44e ("pseries/iommu/ddw: Fix kdump to work in absence of ibm,dma-window") Signed-off-by: Gaurav Batra <gbatra@linux.vnet.ibm.com> Reviewed-by: Brian King <brking@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://msgid.link/20240125203017.61014-1-gbatra@linux.ibm.com Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 2c96f66 commit 7eb95e0

File tree

1 file changed

+105
-51
lines changed
  • arch/powerpc/platforms/pseries

1 file changed

+105
-51
lines changed

arch/powerpc/platforms/pseries/iommu.c

Lines changed: 105 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -569,29 +569,6 @@ static void iommu_table_setparms(struct pci_controller *phb,
569569

570570
struct iommu_table_ops iommu_table_lpar_multi_ops;
571571

572-
/*
573-
* iommu_table_setparms_lpar
574-
*
575-
* Function: On pSeries LPAR systems, return TCE table info, given a pci bus.
576-
*/
577-
static void iommu_table_setparms_lpar(struct pci_controller *phb,
578-
struct device_node *dn,
579-
struct iommu_table *tbl,
580-
struct iommu_table_group *table_group,
581-
const __be32 *dma_window)
582-
{
583-
unsigned long offset, size, liobn;
584-
585-
of_parse_dma_window(dn, dma_window, &liobn, &offset, &size);
586-
587-
iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL,
588-
&iommu_table_lpar_multi_ops);
589-
590-
591-
table_group->tce32_start = offset;
592-
table_group->tce32_size = size;
593-
}
594-
595572
struct iommu_table_ops iommu_table_pseries_ops = {
596573
.set = tce_build_pSeries,
597574
.clear = tce_free_pSeries,
@@ -719,44 +696,92 @@ struct iommu_table_ops iommu_table_lpar_multi_ops = {
719696
* dynamic 64bit DMA window, walking up the device tree.
720697
*/
721698
static struct device_node *pci_dma_find(struct device_node *dn,
722-
const __be32 **dma_window)
699+
struct dynamic_dma_window_prop *prop)
723700
{
724-
const __be32 *dw = NULL;
701+
const __be32 *default_prop = NULL;
702+
const __be32 *ddw_prop = NULL;
703+
struct device_node *rdn = NULL;
704+
bool default_win = false, ddw_win = false;
725705

726706
for ( ; dn && PCI_DN(dn); dn = dn->parent) {
727-
dw = of_get_property(dn, "ibm,dma-window", NULL);
728-
if (dw) {
729-
if (dma_window)
730-
*dma_window = dw;
731-
return dn;
707+
default_prop = of_get_property(dn, "ibm,dma-window", NULL);
708+
if (default_prop) {
709+
rdn = dn;
710+
default_win = true;
711+
}
712+
ddw_prop = of_get_property(dn, DIRECT64_PROPNAME, NULL);
713+
if (ddw_prop) {
714+
rdn = dn;
715+
ddw_win = true;
716+
break;
717+
}
718+
ddw_prop = of_get_property(dn, DMA64_PROPNAME, NULL);
719+
if (ddw_prop) {
720+
rdn = dn;
721+
ddw_win = true;
722+
break;
732723
}
733-
dw = of_get_property(dn, DIRECT64_PROPNAME, NULL);
734-
if (dw)
735-
return dn;
736-
dw = of_get_property(dn, DMA64_PROPNAME, NULL);
737-
if (dw)
738-
return dn;
724+
725+
/* At least found default window, which is the case for normal boot */
726+
if (default_win)
727+
break;
739728
}
740729

741-
return NULL;
730+
/* For PCI devices there will always be a DMA window, either on the device
731+
* or parent bus
732+
*/
733+
WARN_ON(!(default_win | ddw_win));
734+
735+
/* caller doesn't want to get DMA window property */
736+
if (!prop)
737+
return rdn;
738+
739+
/* parse DMA window property. During normal system boot, only default
740+
* DMA window is passed in OF. But, for kdump, a dedicated adapter might
741+
* have both default and DDW in FDT. In this scenario, DDW takes precedence
742+
* over default window.
743+
*/
744+
if (ddw_win) {
745+
struct dynamic_dma_window_prop *p;
746+
747+
p = (struct dynamic_dma_window_prop *)ddw_prop;
748+
prop->liobn = p->liobn;
749+
prop->dma_base = p->dma_base;
750+
prop->tce_shift = p->tce_shift;
751+
prop->window_shift = p->window_shift;
752+
} else if (default_win) {
753+
unsigned long offset, size, liobn;
754+
755+
of_parse_dma_window(rdn, default_prop, &liobn, &offset, &size);
756+
757+
prop->liobn = cpu_to_be32((u32)liobn);
758+
prop->dma_base = cpu_to_be64(offset);
759+
prop->tce_shift = cpu_to_be32(IOMMU_PAGE_SHIFT_4K);
760+
prop->window_shift = cpu_to_be32(order_base_2(size));
761+
}
762+
763+
return rdn;
742764
}
743765

744766
static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
745767
{
746768
struct iommu_table *tbl;
747769
struct device_node *dn, *pdn;
748770
struct pci_dn *ppci;
749-
const __be32 *dma_window = NULL;
771+
struct dynamic_dma_window_prop prop;
750772

751773
dn = pci_bus_to_OF_node(bus);
752774

753775
pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n",
754776
dn);
755777

756-
pdn = pci_dma_find(dn, &dma_window);
778+
pdn = pci_dma_find(dn, &prop);
757779

758-
if (dma_window == NULL)
759-
pr_debug(" no ibm,dma-window property !\n");
780+
/* In PPC architecture, there will always be DMA window on bus or one of the
781+
* parent bus. During reboot, there will be ibm,dma-window property to
782+
* define DMA window. For kdump, there will at least be default window or DDW
783+
* or both.
784+
*/
760785

761786
ppci = PCI_DN(pdn);
762787

@@ -766,13 +791,24 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
766791
if (!ppci->table_group) {
767792
ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
768793
tbl = ppci->table_group->tables[0];
769-
if (dma_window) {
770-
iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
771-
ppci->table_group, dma_window);
772794

773-
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
774-
panic("Failed to initialize iommu table");
775-
}
795+
iommu_table_setparms_common(tbl, ppci->phb->bus->number,
796+
be32_to_cpu(prop.liobn),
797+
be64_to_cpu(prop.dma_base),
798+
1ULL << be32_to_cpu(prop.window_shift),
799+
be32_to_cpu(prop.tce_shift), NULL,
800+
&iommu_table_lpar_multi_ops);
801+
802+
/* Only for normal boot with default window. Doesn't matter even
803+
* if we set these with DDW which is 64bit during kdump, since
804+
* these will not be used during kdump.
805+
*/
806+
ppci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
807+
ppci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
808+
809+
if (!iommu_init_table(tbl, ppci->phb->node, 0, 0))
810+
panic("Failed to initialize iommu table");
811+
776812
iommu_register_group(ppci->table_group,
777813
pci_domain_nr(bus), 0);
778814
pr_debug(" created table: %p\n", ppci->table_group);
@@ -960,6 +996,12 @@ static void find_existing_ddw_windows_named(const char *name)
960996
continue;
961997
}
962998

999+
/* If at the time of system initialization, there are DDWs in OF,
1000+
* it means this is during kexec. DDW could be direct or dynamic.
1001+
* We will just mark DDWs as "dynamic" since this is kdump path,
1002+
* no need to worry about performance. ddw_list_new_entry() will
1003+
* set window->direct = false.
1004+
*/
9631005
window = ddw_list_new_entry(pdn, dma64);
9641006
if (!window) {
9651007
of_node_put(pdn);
@@ -1525,8 +1567,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15251567
{
15261568
struct device_node *pdn, *dn;
15271569
struct iommu_table *tbl;
1528-
const __be32 *dma_window = NULL;
15291570
struct pci_dn *pci;
1571+
struct dynamic_dma_window_prop prop;
15301572

15311573
pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev));
15321574

@@ -1539,7 +1581,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15391581
dn = pci_device_to_OF_node(dev);
15401582
pr_debug(" node is %pOF\n", dn);
15411583

1542-
pdn = pci_dma_find(dn, &dma_window);
1584+
pdn = pci_dma_find(dn, &prop);
15431585
if (!pdn || !PCI_DN(pdn)) {
15441586
printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: "
15451587
"no DMA window found for pci dev=%s dn=%pOF\n",
@@ -1552,8 +1594,20 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
15521594
if (!pci->table_group) {
15531595
pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
15541596
tbl = pci->table_group->tables[0];
1555-
iommu_table_setparms_lpar(pci->phb, pdn, tbl,
1556-
pci->table_group, dma_window);
1597+
1598+
iommu_table_setparms_common(tbl, pci->phb->bus->number,
1599+
be32_to_cpu(prop.liobn),
1600+
be64_to_cpu(prop.dma_base),
1601+
1ULL << be32_to_cpu(prop.window_shift),
1602+
be32_to_cpu(prop.tce_shift), NULL,
1603+
&iommu_table_lpar_multi_ops);
1604+
1605+
/* Only for normal boot with default window. Doesn't matter even
1606+
* if we set these with DDW which is 64bit during kdump, since
1607+
* these will not be used during kdump.
1608+
*/
1609+
pci->table_group->tce32_start = be64_to_cpu(prop.dma_base);
1610+
pci->table_group->tce32_size = 1 << be32_to_cpu(prop.window_shift);
15571611

15581612
iommu_init_table(tbl, pci->phb->node, 0, 0);
15591613
iommu_register_group(pci->table_group,

0 commit comments

Comments
 (0)