Skip to content

Commit 8650b8a

Browse files
author
Christoph Hellwig
committed
nfsd: pNFS block layout driver
Add a small shim between core nfsd and filesystems to translate the somewhat cumbersome pNFS data structures and semantics to something more palatable for Linux filesystems. Thanks to Rick McNeal for the old prototype pNFS blocklayout server code, which gave a lot of inspiration to this version even if no code is left from it. Signed-off-by: Christoph Hellwig <hch@lst.de>
1 parent 7fbc106 commit 8650b8a

File tree

7 files changed

+455
-1
lines changed

7 files changed

+455
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
pNFS block layout server user guide
2+
3+
The Linux NFS server now supports the pNFS block layout extension. In this
4+
case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
5+
to handling all the metadata access to the NFS export also hands out layouts
6+
to the clients to directly access the underlying block devices that are
7+
shared with the client.
8+
9+
To use pNFS block layouts with the Linux NFS server the exported file
10+
system needs to support the pNFS block layouts (currently just XFS), and the
11+
file system must sit on shared storage (typically iSCSI) that is accessible
12+
to the clients in addition to the MDS. As of now the file system needs to
13+
sit directly on the exported volume; striping or concatenation of
14+
volumes on the MDS and clients is not supported yet.
15+
16+
On the server, pNFS block volume support is automatically enabled if the file system
17+
supports it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK
18+
option enabled, the blkmapd daemon from nfs-utils is running, and the
19+
file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).
20+
21+
If the nfsd server needs to fence a non-responding client it calls
22+
/sbin/nfsd-recall-failed with the first argument set to the IP address of
23+
the client, and the second argument set to the device node without the /dev
24+
prefix for the file system to be fenced. Below is an example file that shows
25+
how to translate the device into a serial number from SCSI EVPD 0x80:
26+
27+
# The here-document delimiter is quoted so that the shell creating the file
# does not expand $1, $2, ${...} or the backticks; they must reach the
# generated script verbatim.
cat > /sbin/nfsd-recall-failed << 'EOF'
#!/bin/sh

# $1: IP address of the client to fence
# $2: device node of the file system to be fenced, without the /dev prefix
CLIENT="$1"
DEV="/dev/$2"
EVPD=`sg_inq --page=0x80 ${DEV} | \
	grep "Unit serial number:" | \
	awk -F ': ' '{print $2}'`

echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
EOF

fs/nfsd/Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
1717
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
1818
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
1919
nfs4acl.o nfs4callback.o nfs4recover.o
20-
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
20+
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o

fs/nfsd/blocklayout.c

+189
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/*
2+
* Copyright (c) 2014 Christoph Hellwig.
3+
*/
4+
#include <linux/exportfs.h>
5+
#include <linux/genhd.h>
6+
#include <linux/slab.h>
7+
8+
#include <linux/nfsd/debug.h>
9+
10+
#include "blocklayoutxdr.h"
11+
#include "pnfs.h"
12+
13+
#define NFSDDBG_FACILITY NFSDDBG_PNFS
14+
15+
16+
static int
17+
nfsd4_block_get_device_info_simple(struct super_block *sb,
18+
struct nfsd4_getdeviceinfo *gdp)
19+
{
20+
struct pnfs_block_deviceaddr *dev;
21+
struct pnfs_block_volume *b;
22+
23+
dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24+
sizeof(struct pnfs_block_volume), GFP_KERNEL);
25+
if (!dev)
26+
return -ENOMEM;
27+
gdp->gd_device = dev;
28+
29+
dev->nr_volumes = 1;
30+
b = &dev->volumes[0];
31+
32+
b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33+
b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34+
return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35+
&b->simple.offset);
36+
}
37+
38+
static __be32
39+
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40+
struct nfsd4_getdeviceinfo *gdp)
41+
{
42+
if (sb->s_bdev != sb->s_bdev->bd_contains)
43+
return nfserr_inval;
44+
return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45+
}
46+
47+
static __be32
48+
nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49+
struct nfsd4_layoutget *args)
50+
{
51+
struct nfsd4_layout_seg *seg = &args->lg_seg;
52+
struct super_block *sb = inode->i_sb;
53+
u32 block_size = (1 << inode->i_blkbits);
54+
struct pnfs_block_extent *bex;
55+
struct iomap iomap;
56+
u32 device_generation = 0;
57+
int error;
58+
59+
/*
60+
* We do not attempt to support I/O smaller than the fs block size,
61+
* or not aligned to it.
62+
*/
63+
if (args->lg_minlength < block_size) {
64+
dprintk("pnfsd: I/O too small\n");
65+
goto out_layoutunavailable;
66+
}
67+
if (seg->offset & (block_size - 1)) {
68+
dprintk("pnfsd: I/O misaligned\n");
69+
goto out_layoutunavailable;
70+
}
71+
72+
/*
73+
* Some clients barf on non-zero block numbers for NONE or INVALID
74+
* layouts, so make sure to zero the whole structure.
75+
*/
76+
error = -ENOMEM;
77+
bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78+
if (!bex)
79+
goto out_error;
80+
args->lg_content = bex;
81+
82+
error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83+
&iomap, seg->iomode != IOMODE_READ,
84+
&device_generation);
85+
if (error) {
86+
if (error == -ENXIO)
87+
goto out_layoutunavailable;
88+
goto out_error;
89+
}
90+
91+
if (iomap.length < args->lg_minlength) {
92+
dprintk("pnfsd: extent smaller than minlength\n");
93+
goto out_layoutunavailable;
94+
}
95+
96+
switch (iomap.type) {
97+
case IOMAP_MAPPED:
98+
if (seg->iomode == IOMODE_READ)
99+
bex->es = PNFS_BLOCK_READ_DATA;
100+
else
101+
bex->es = PNFS_BLOCK_READWRITE_DATA;
102+
bex->soff = (iomap.blkno << 9);
103+
break;
104+
case IOMAP_UNWRITTEN:
105+
if (seg->iomode & IOMODE_RW) {
106+
/*
107+
* Crack monkey special case from section 2.3.1.
108+
*/
109+
if (args->lg_minlength == 0) {
110+
dprintk("pnfsd: no soup for you!\n");
111+
goto out_layoutunavailable;
112+
}
113+
114+
bex->es = PNFS_BLOCK_INVALID_DATA;
115+
bex->soff = (iomap.blkno << 9);
116+
break;
117+
}
118+
/*FALLTHRU*/
119+
case IOMAP_HOLE:
120+
if (seg->iomode == IOMODE_READ) {
121+
bex->es = PNFS_BLOCK_NONE_DATA;
122+
break;
123+
}
124+
/*FALLTHRU*/
125+
case IOMAP_DELALLOC:
126+
default:
127+
WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128+
goto out_layoutunavailable;
129+
}
130+
131+
error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132+
if (error)
133+
goto out_error;
134+
bex->foff = iomap.offset;
135+
bex->len = iomap.length;
136+
137+
seg->offset = iomap.offset;
138+
seg->length = iomap.length;
139+
140+
dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141+
return 0;
142+
143+
out_error:
144+
seg->length = 0;
145+
return nfserrno(error);
146+
out_layoutunavailable:
147+
seg->length = 0;
148+
return nfserr_layoutunavailable;
149+
}
150+
151+
static __be32
152+
nfsd4_block_proc_layoutcommit(struct inode *inode,
153+
struct nfsd4_layoutcommit *lcp)
154+
{
155+
loff_t new_size = lcp->lc_last_wr + 1;
156+
struct iattr iattr = { .ia_valid = 0 };
157+
struct iomap *iomaps;
158+
int nr_iomaps;
159+
int error;
160+
161+
nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162+
lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163+
if (nr_iomaps < 0)
164+
return nfserrno(nr_iomaps);
165+
166+
if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167+
timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168+
lcp->lc_mtime = current_fs_time(inode->i_sb);
169+
iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170+
iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171+
172+
if (new_size > i_size_read(inode)) {
173+
iattr.ia_valid |= ATTR_SIZE;
174+
iattr.ia_size = new_size;
175+
}
176+
177+
error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178+
nr_iomaps, &iattr);
179+
kfree(iomaps);
180+
return nfserrno(error);
181+
}
182+
183+
const struct nfsd4_layout_ops bl_layout_ops = {
184+
.proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185+
.encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186+
.proc_layoutget = nfsd4_block_proc_layoutget,
187+
.encode_layoutget = nfsd4_block_encode_layoutget,
188+
.proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189+
};

fs/nfsd/blocklayoutxdr.c

+157
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/*
2+
* Copyright (c) 2014 Christoph Hellwig.
3+
*/
4+
#include <linux/sunrpc/svc.h>
5+
#include <linux/exportfs.h>
6+
#include <linux/nfs4.h>
7+
8+
#include "nfsd.h"
9+
#include "blocklayoutxdr.h"
10+
11+
#define NFSDDBG_FACILITY NFSDDBG_PNFS
12+
13+
14+
__be32
15+
nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16+
struct nfsd4_layoutget *lgp)
17+
{
18+
struct pnfs_block_extent *b = lgp->lg_content;
19+
int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
20+
__be32 *p;
21+
22+
p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23+
if (!p)
24+
return nfserr_toosmall;
25+
26+
*p++ = cpu_to_be32(len);
27+
*p++ = cpu_to_be32(1); /* we always return a single extent */
28+
29+
p = xdr_encode_opaque_fixed(p, &b->vol_id,
30+
sizeof(struct nfsd4_deviceid));
31+
p = xdr_encode_hyper(p, b->foff);
32+
p = xdr_encode_hyper(p, b->len);
33+
p = xdr_encode_hyper(p, b->soff);
34+
*p++ = cpu_to_be32(b->es);
35+
return 0;
36+
}
37+
38+
static int
39+
nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40+
{
41+
__be32 *p;
42+
int len;
43+
44+
switch (b->type) {
45+
case PNFS_BLOCK_VOLUME_SIMPLE:
46+
len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47+
p = xdr_reserve_space(xdr, len);
48+
if (!p)
49+
return -ETOOSMALL;
50+
51+
*p++ = cpu_to_be32(b->type);
52+
*p++ = cpu_to_be32(1); /* single signature */
53+
p = xdr_encode_hyper(p, b->simple.offset);
54+
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55+
break;
56+
default:
57+
return -ENOTSUPP;
58+
}
59+
60+
return len;
61+
}
62+
63+
__be32
64+
nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65+
struct nfsd4_getdeviceinfo *gdp)
66+
{
67+
struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68+
int len = sizeof(__be32), ret, i;
69+
__be32 *p;
70+
71+
p = xdr_reserve_space(xdr, len + sizeof(__be32));
72+
if (!p)
73+
return nfserr_resource;
74+
75+
for (i = 0; i < dev->nr_volumes; i++) {
76+
ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77+
if (ret < 0)
78+
return nfserrno(ret);
79+
len += ret;
80+
}
81+
82+
/*
83+
* Fill in the overall length and number of volumes at the beginning
84+
* of the layout.
85+
*/
86+
*p++ = cpu_to_be32(len);
87+
*p++ = cpu_to_be32(dev->nr_volumes);
88+
return 0;
89+
}
90+
91+
int
92+
nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93+
u32 block_size)
94+
{
95+
struct iomap *iomaps;
96+
u32 nr_iomaps, expected, i;
97+
98+
if (len < sizeof(u32)) {
99+
dprintk("%s: extent array too small: %u\n", __func__, len);
100+
return -EINVAL;
101+
}
102+
103+
nr_iomaps = be32_to_cpup(p++);
104+
expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
105+
if (len != expected) {
106+
dprintk("%s: extent array size mismatch: %u/%u\n",
107+
__func__, len, expected);
108+
return -EINVAL;
109+
}
110+
111+
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112+
if (!iomaps) {
113+
dprintk("%s: failed to allocate extent array\n", __func__);
114+
return -ENOMEM;
115+
}
116+
117+
for (i = 0; i < nr_iomaps; i++) {
118+
struct pnfs_block_extent bex;
119+
120+
memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121+
p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122+
123+
p = xdr_decode_hyper(p, &bex.foff);
124+
if (bex.foff & (block_size - 1)) {
125+
dprintk("%s: unaligned offset %lld\n",
126+
__func__, bex.foff);
127+
goto fail;
128+
}
129+
p = xdr_decode_hyper(p, &bex.len);
130+
if (bex.len & (block_size - 1)) {
131+
dprintk("%s: unaligned length %lld\n",
132+
__func__, bex.foff);
133+
goto fail;
134+
}
135+
p = xdr_decode_hyper(p, &bex.soff);
136+
if (bex.soff & (block_size - 1)) {
137+
dprintk("%s: unaligned disk offset %lld\n",
138+
__func__, bex.soff);
139+
goto fail;
140+
}
141+
bex.es = be32_to_cpup(p++);
142+
if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143+
dprintk("%s: incorrect extent state %d\n",
144+
__func__, bex.es);
145+
goto fail;
146+
}
147+
148+
iomaps[i].offset = bex.foff;
149+
iomaps[i].length = bex.len;
150+
}
151+
152+
*iomapp = iomaps;
153+
return nr_iomaps;
154+
fail:
155+
kfree(iomaps);
156+
return -EINVAL;
157+
}

0 commit comments

Comments
 (0)