[POWERPC] Use 4kB iommu pages even on 64kB-page systems
The 10 Gigabit Ethernet device drivers appear to be able to chew
up all 256MB of TCE mappings on pSeries systems, as evidenced by
numerous error messages:

 iommu_alloc failed, tbl c0000000010d5c48 vaddr c0000000d875eff0 npages 1

Some experimentation indicates that this is essentially because
one 1500-byte Ethernet MTU gets mapped as a 64K DMA region when
the large 64K pages are enabled. Thus, it doesn't take much to
exhaust all of the available DMA mappings for a high-speed card.
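
(For scale: at 64KB per mapping, a 256MB TCE window holds only
256MB / 64KB = 4096 concurrent DMA mappings; at 4KB granularity the
same window holds 65536.)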

This patch changes the iommu allocator to work with its own
distinct page size. Although the patch is long, it's actually
quite simple: it just #defines a distinct IOMMU_PAGE_SIZE and
then uses it in all the places that matter.
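
As a rough sketch of what that amounts to (the header hunk defining
these constants is not among the files expanded below, so the exact
spelling may differ):

  #define IOMMU_PAGE_SHIFT        12
  #define IOMMU_PAGE_SIZE         (1UL << IOMMU_PAGE_SHIFT)
  #define IOMMU_PAGE_MASK         (~((1UL << IOMMU_PAGE_SHIFT) - 1))
  #define IOMMU_PAGE_ALIGN(addr)  (((addr) + IOMMU_PAGE_SIZE - 1) & IOMMU_PAGE_MASK)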

As a side effect, it also dramatically improves network performance
on platforms that use H-calls for iommu translation inserts/removes
(since we no longer make 16 calls for a 1500-byte packet when the
iommu HW is still 4k).
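
(The factor of 16 is simply 64KB / 4KB: a 64K kernel page spans
sixteen 4K hardware TCEs, and before this change even a 1500-byte
buffer ended up mapping a full 64K page.)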

In the future, we might want to make the IOMMU_PAGE_SIZE a variable
in the iommu_table instance, thus allowing support for different HW
page sizes in the iommu itself.
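
A hypothetical shape for such a follow-up (illustrative only, not
part of this patch; the it_page_shift name is invented here):

  struct iommu_table {
          /* ... existing fields ... */
          unsigned int it_page_shift;  /* HW page shift for this table, e.g. 12 */
  };

  /* per-table variants of the fixed constants above */
  #define IOMMU_PAGE_SIZE(tbl)  (1UL << (tbl)->it_page_shift)
  #define IOMMU_PAGE_MASK(tbl)  (~(IOMMU_PAGE_SIZE(tbl) - 1))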

Signed-off-by: Linas Vepstas <linas@austin.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Olof Johansson <olof@lixom.net>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Linas Vepstas authored and paulusmack committed Nov 1, 2006
1 parent dd6c89f commit 5d2efba
Showing 8 changed files with 80 additions and 81 deletions.
77 changes: 45 additions & 32 deletions arch/powerpc/kernel/iommu.c
@@ -47,6 +47,17 @@ static int novmerge = 0;
static int novmerge = 1;
#endif

static inline unsigned long iommu_num_pages(unsigned long vaddr,
unsigned long slen)
{
unsigned long npages;

npages = IOMMU_PAGE_ALIGN(vaddr + slen) - (vaddr & IOMMU_PAGE_MASK);
npages >>= IOMMU_PAGE_SHIFT;

return npages;
}

static int __init setup_iommu(char *str)
{
if (!strcmp(str, "novmerge"))
@@ -178,10 +189,10 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
}

entry += tbl->it_offset; /* Offset into real TCE table */
ret = entry << PAGE_SHIFT; /* Set the return dma address */
ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */

/* Put the TCEs in the HW table */
ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK,
direction);


@@ -203,7 +214,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
unsigned long entry, free_entry;
unsigned long i;

entry = dma_addr >> PAGE_SHIFT;
entry = dma_addr >> IOMMU_PAGE_SHIFT;
free_entry = entry - tbl->it_offset;

if (((free_entry + npages) > tbl->it_size) ||
@@ -270,7 +281,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
/* Init first segment length for backout at failure */
outs->dma_length = 0;

DBG("mapping %d elements:\n", nelems);
DBG("sg mapping %d elements:\n", nelems);

spin_lock_irqsave(&(tbl->it_lock), flags);

@@ -285,9 +296,8 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
}
/* Allocate iommu entries for that segment */
vaddr = (unsigned long)page_address(s->page) + s->offset;
npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
npages >>= PAGE_SHIFT;
entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0);
npages = iommu_num_pages(vaddr, slen);
entry = iommu_range_alloc(tbl, npages, &handle, mask >> IOMMU_PAGE_SHIFT, 0);

DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);

@@ -301,14 +311,14 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,

/* Convert entry to a dma_addr_t */
entry += tbl->it_offset;
dma_addr = entry << PAGE_SHIFT;
dma_addr |= s->offset;
dma_addr = entry << IOMMU_PAGE_SHIFT;
dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);

DBG(" - %lx pages, entry: %lx, dma_addr: %lx\n",
DBG(" - %lu pages, entry: %lx, dma_addr: %lx\n",
npages, entry, dma_addr);

/* Insert into HW table */
ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, direction);

/* If we are in an open segment, try merging */
if (segstart != s) {
@@ -323,7 +333,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
DBG(" can't merge, new segment.\n");
} else {
outs->dma_length += s->length;
DBG(" merged, new len: %lx\n", outs->dma_length);
DBG(" merged, new len: %ux\n", outs->dma_length);
}
}

@@ -367,9 +377,8 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
if (s->dma_length != 0) {
unsigned long vaddr, npages;

vaddr = s->dma_address & PAGE_MASK;
npages = (PAGE_ALIGN(s->dma_address + s->dma_length) - vaddr)
>> PAGE_SHIFT;
vaddr = s->dma_address & IOMMU_PAGE_MASK;
npages = iommu_num_pages(s->dma_address, s->dma_length);
__iommu_free(tbl, vaddr, npages);
s->dma_address = DMA_ERROR_CODE;
s->dma_length = 0;
@@ -398,8 +407,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,

if (sglist->dma_length == 0)
break;
npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
- (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
npages = iommu_num_pages(dma_handle,sglist->dma_length);
__iommu_free(tbl, dma_handle, npages);
sglist++;
}
@@ -532,20 +540,19 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
BUG_ON(direction == DMA_NONE);

uaddr = (unsigned long)vaddr;
npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
npages >>= PAGE_SHIFT;
npages = iommu_num_pages(uaddr, size);

if (tbl) {
dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
mask >> PAGE_SHIFT, 0);
mask >> IOMMU_PAGE_SHIFT, 0);
if (dma_handle == DMA_ERROR_CODE) {
if (printk_ratelimit()) {
printk(KERN_INFO "iommu_alloc failed, "
"tbl %p vaddr %p npages %d\n",
tbl, vaddr, npages);
}
} else
dma_handle |= (uaddr & ~PAGE_MASK);
dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
}

return dma_handle;
@@ -554,11 +561,14 @@ dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
size_t size, enum dma_data_direction direction)
{
unsigned int npages;

BUG_ON(direction == DMA_NONE);

if (tbl)
iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
(dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
if (tbl) {
npages = iommu_num_pages(dma_handle, size);
iommu_free(tbl, dma_handle, npages);
}
}

/* Allocates a contiguous real buffer and creates mappings over it.
@@ -570,11 +580,11 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
{
void *ret = NULL;
dma_addr_t mapping;
unsigned int npages, order;
unsigned int order;
unsigned int nio_pages, io_order;
struct page *page;

size = PAGE_ALIGN(size);
npages = size >> PAGE_SHIFT;
order = get_order(size);

/*
@@ -598,8 +608,10 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
memset(ret, 0, size);

/* Set up tces to cover the allocated range */
mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
mask >> PAGE_SHIFT, order);
nio_pages = size >> IOMMU_PAGE_SHIFT;
io_order = get_iommu_order(size);
mapping = iommu_alloc(tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
mask >> IOMMU_PAGE_SHIFT, io_order);
if (mapping == DMA_ERROR_CODE) {
free_pages((unsigned long)ret, order);
return NULL;
@@ -611,12 +623,13 @@ void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle)
{
unsigned int npages;

if (tbl) {
unsigned int nio_pages;

size = PAGE_ALIGN(size);
nio_pages = size >> IOMMU_PAGE_SHIFT;
iommu_free(tbl, dma_handle, nio_pages);
size = PAGE_ALIGN(size);
npages = size >> PAGE_SHIFT;
iommu_free(tbl, dma_handle, npages);
free_pages((unsigned long)vaddr, get_order(size));
}
}
4 changes: 2 additions & 2 deletions arch/powerpc/kernel/vio.c
@@ -92,9 +92,9 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
&tbl->it_index, &offset, &size);

/* TCE table size - measured in tce entries */
tbl->it_size = size >> PAGE_SHIFT;
tbl->it_size = size >> IOMMU_PAGE_SHIFT;
/* offset for VIO should always be 0 */
tbl->it_offset = offset >> PAGE_SHIFT;
tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
tbl->it_busno = 0;
tbl->it_type = TCE_VB;

11 changes: 2 additions & 9 deletions arch/powerpc/platforms/iseries/iommu.c
@@ -43,9 +43,6 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
u64 rc;
u64 tce, rpn;

index <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

while (npages--) {
rpn = virt_to_abs(uaddr) >> TCE_SHIFT;
tce = (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
@@ -75,9 +72,6 @@ static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages)
{
u64 rc;

npages <<= TCE_PAGE_FACTOR;
index <<= TCE_PAGE_FACTOR;

while (npages--) {
rc = HvCallXm_setTce((u64)tbl->it_index, (u64)index, 0);
if (rc)
@@ -136,10 +130,9 @@ void iommu_table_getparms_iSeries(unsigned long busno,
panic("PCI_DMA: parms->size is zero, parms is 0x%p", parms);

/* itc_size is in pages worth of table, it_size is in # of entries */
tbl->it_size = ((parms->itc_size * TCE_PAGE_SIZE) /
TCE_ENTRY_SIZE) >> TCE_PAGE_FACTOR;
tbl->it_size = (parms->itc_size * TCE_PAGE_SIZE) / TCE_ENTRY_SIZE;
tbl->it_busno = parms->itc_busno;
tbl->it_offset = parms->itc_offset >> TCE_PAGE_FACTOR;
tbl->it_offset = parms->itc_offset;
tbl->it_index = parms->itc_index;
tbl->it_blocksize = 1;
tbl->it_type = virtbus ? TCE_VB : TCE_PCI;
35 changes: 8 additions & 27 deletions arch/powerpc/platforms/pseries/iommu.c
@@ -57,9 +57,6 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
u64 *tcep;
u64 rpn;

index <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

proto_tce = TCE_PCI_READ; // Read allowed

if (direction != DMA_TO_DEVICE)
@@ -82,9 +79,6 @@ static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages)
{
u64 *tcep;

npages <<= TCE_PAGE_FACTOR;
index <<= TCE_PAGE_FACTOR;

tcep = ((u64 *)tbl->it_base) + index;

while (npages--)
@@ -95,7 +89,6 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index)
{
u64 *tcep;

index <<= TCE_PAGE_FACTOR;
tcep = ((u64 *)tbl->it_base) + index;

return *tcep;
@@ -109,9 +102,6 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
u64 proto_tce, tce;
u64 rpn;

tcenum <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -146,7 +136,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
u64 rpn;
long l, limit;

if (TCE_PAGE_FACTOR == 0 && npages == 1)
if (npages == 1)
return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
direction);

@@ -164,9 +154,6 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
__get_cpu_var(tce_page) = tcep;
}

tcenum <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT;
proto_tce = TCE_PCI_READ;
if (direction != DMA_TO_DEVICE)
@@ -207,9 +194,6 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages
{
u64 rc;

tcenum <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

while (npages--) {
rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0);

@@ -229,9 +213,6 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
{
u64 rc;

tcenum <<= TCE_PAGE_FACTOR;
npages <<= TCE_PAGE_FACTOR;

rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);

if (rc && printk_ratelimit()) {
@@ -248,7 +229,6 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum)
u64 rc;
unsigned long tce_ret;

tcenum <<= TCE_PAGE_FACTOR;
rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret);

if (rc && printk_ratelimit()) {
@@ -289,7 +269,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
tbl->it_busno = phb->bus->number;

/* Units of tce entries */
tbl->it_offset = phb->dma_window_base_cur >> PAGE_SHIFT;
tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;

/* Test if we are going over 2GB of DMA space */
if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -300,7 +280,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
phb->dma_window_base_cur += phb->dma_window_size;

/* Set the tce table size - measured in entries */
tbl->it_size = phb->dma_window_size >> PAGE_SHIFT;
tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;

tbl->it_index = 0;
tbl->it_blocksize = 16;
@@ -325,8 +305,8 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
tbl->it_base = 0;
tbl->it_blocksize = 16;
tbl->it_type = TCE_PCI;
tbl->it_offset = offset >> PAGE_SHIFT;
tbl->it_size = size >> PAGE_SHIFT;
tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
tbl->it_size = size >> IOMMU_PAGE_SHIFT;
}

static void iommu_bus_setup_pSeries(struct pci_bus *bus)
@@ -522,8 +502,6 @@ static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev)
const void *dma_window = NULL;
struct pci_dn *pci;

DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, pci_name(dev));

/* dev setup for LPAR is a little tricky, since the device tree might
* contain the dma-window properties per-device and not neccesarily
* for the bus. So we need to search upwards in the tree until we
@@ -532,6 +510,9 @@ static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev)
*/
dn = pci_device_to_OF_node(dev);

DBG("iommu_dev_setup_pSeriesLP, dev %p (%s) %s\n",
dev, pci_name(dev), dn->full_name);

for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
pdn = pdn->parent) {
dma_window = get_property(pdn, "ibm,dma-window", NULL);
1 change: 0 additions & 1 deletion arch/powerpc/sysdev/dart.h
@@ -72,7 +72,6 @@

#define DART_PAGE_SHIFT 12
#define DART_PAGE_SIZE (1 << DART_PAGE_SHIFT)
#define DART_PAGE_FACTOR (PAGE_SHIFT - DART_PAGE_SHIFT)


#endif /* _POWERPC_SYSDEV_DART_H */
(The remaining three changed files are not expanded in this view.)
