diff -up linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.34.noarch/arch/um/os-Linux/mem.c
--- linux-2.6.34.noarch/arch/um/os-Linux/mem.c.orig	2010-09-30 10:14:57.591122000 -0400
+++ linux-2.6.34.noarch/arch/um/os-Linux/mem.c	2010-09-30 10:17:08.383984000 -0400
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include "init.h"
 #include "kern_constants.h"
 #include "os.h"
diff -up linux-2.6.34.noarch/block/genhd.c.orig linux-2.6.34.noarch/block/genhd.c
--- linux-2.6.34.noarch/block/genhd.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/block/genhd.c	2010-09-30 10:17:08.410985000 -0400
@@ -1009,6 +1009,7 @@ static void disk_release(struct device *
 struct class block_class = {
 	.name		= "block",
 };
+EXPORT_SYMBOL(block_class);
 
 static char *block_devnode(struct device *dev, mode_t *mode)
 {
diff -up linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt
--- linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt.orig	2010-09-30 10:17:08.376984000 -0400
+++ linux-2.6.34.noarch/Documentation/filesystems/spnfs.txt	2010-09-30 10:17:08.378989000 -0400
@@ -0,0 +1,211 @@
+(c) 2007 Network Appliance Inc.
+
+spNFS
+-----
+
+An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS).
+
+A file system is mounted by the clients from the MDS, and all file data
+is striped across the DSs.
+
+Identify the machines that will be filling each of these roles.
+
+The spnfs kernel will be installed on all machines: clients, the MDS and DSs.
+
+
+Building and installing the spNFS kernel
+----------------------------------------
+
+Get the spNFS kernel from:
+
+    git://linux-nfs.org/~bhalevy/linux-pnfs.git
+
+Use the pnfs-all-latest branch and add these options to your .config file:
+
+    CONFIG_NETWORK_FILESYSTEMS=y
+    CONFIG_NFS_FS=m
+    CONFIG_NFS_V4=y
+    CONFIG_NFS_V4_1=y
+    CONFIG_PNFS=y
+    CONFIG_NFSD=m
+    CONFIG_PNFSD=y
+    # CONFIG_PNFSD_LOCAL_EXPORT is not set
+    CONFIG_SPNFS=y
+
+By default, spNFS uses whole-file layouts. Layout segments can be enabled
+by adding:
+
+    CONFIG_SPNFS_LAYOUTSEGMENTS=y
+
+to your .config file.
+
+Building and installation of kernel+modules is as usual.
+This kernel should be installed and booted on the client, MDS and DSs.
+
+Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it
+takes over the pnfs export interface.
+
+Building nfs-utils
+------------------
+
+Get the nfs-utils package containing spnfsd from:
+
+    git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git
+
+Follow the standard instructions for building nfs-utils.
+
+After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd
+daemon will only be needed on the MDS.
+
+
+Installation
+------------
+
+The nfs-utils package contains a default spnfsd.conf file in
+utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf.
+
+By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under
+this directory, mount points must be created for each DS to
+be used for pNFS data stripes. These mount points are named by the IP address
+of the corresponding DS. In the sample spnfsd.conf, there are two
+DSs defined (172.16.28.134 and 172.16.28.141).
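+
+For illustration, the DS-related fragment of such a configuration might
+look like the following sketch. DS-Mount-Directory and the DS*_ROOT
+options are the ones described in this document; the remaining option
+names here are illustrative only, so consult the sample spnfsd.conf for
+the authoritative set:
+
+    DS-Mount-Directory = /spnfs
+
+    DS1_IP   = 172.16.28.134
+    DS1_ROOT = /pnfs
+    DS2_IP   = 172.16.28.141
+    DS2_ROOT = /pnfs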
+
+Following the sample spnfsd.conf,
+
+    mkdir /spnfs
+
+on the MDS (corresponding to DS-Mount-Directory). Then
+
+    mkdir /spnfs/172.16.28.134
+    mkdir /spnfs/172.16.28.141
+
+to create the mount points for the DSs.
+
+On the DSs, choose a directory where data stripes will be created by the MDS.
+For the sample file, this directory is /pnfs, so on each DS execute:
+
+    mkdir /pnfs
+
+This directory is specified in the spnfsd.conf file by the DS*_ROOT option
+(where * is replaced by the DS number). DS_ROOT is specified relative to
+the directory being exported by the DSs. In our example, our DSs are exporting
+the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have
+the following entry in /etc/exports:
+
+    / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check)
+
+N.B. If we had created a /exports directory and a /pnfs directory under
+/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs
+(not /exports/pnfs).
+
+It may be useful to add entries to /etc/fstab on the MDS to automatically
+mount the DS_ROOT file systems. For this example, our MDS fstab would
+contain:
+
+    172.16.28.134:/pnfs /spnfs/172.16.28.134 nfs defaults 1 2
+    172.16.28.141:/pnfs /spnfs/172.16.28.141 nfs defaults 1 2
+
+The DS mounts must be performed manually or via fstab at this time (automatic
+mounting, directory creation, etc. are on the todo list). To perform I/O
+through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction
+will eventually be removed).
+
+
+On the MDS, choose a file system to use with spNFS and export it, e.g.:
+
+    / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs)
+
+Make sure nfsd and all supporting processes are running on the MDS and DSs.
+
+
+Running
+-------
+
+If rpc_pipefs is not already mounted (if you're running idmapd it probably is),
+you may want to add the following line to /etc/fstab:
+
+    rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0
+
+to automatically mount rpc_pipefs.
+
+With spnfsd.conf configured for your environment and the DS file systems
+mounted as described above, spnfsd can now be started.
+
+On the MDS, execute spnfsd:
+
+    spnfsd
+
+The executable is located in the directory where it was built, and
+may also have been installed elsewhere depending on how you built nfs-utils.
+It will run in the foreground by default, and in fact will do so despite
+any options suggesting the contrary (it's still a debugging build).
+
+On the client, make sure the nfslayoutdriver module is loaded:
+
+    modprobe nfslayoutdriver
+
+Then mount the file system from the MDS:
+
+    mount -t nfs4 -o minorversion=1 mds:/ /mnt
+
+I/O through the MDS is now supported. To use it, do not load the
+nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1
+(NFSv2 and v3 are not yet supported).
+
+You may now use spNFS by performing file system activities in /mnt.
+If you create files in /mnt, you should see stripe files corresponding to
+new files being created on the DSs. The current implementation names the
+stripe files based on the inode number of the file on the MDS. For example,
+if you create a file foo in /mnt and do an 'ls -li /mnt/foo':
+
+    # ls -li foo
+    1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo
+
+you should see stripe files named 1233 on each DS under /pnfs (per the
+sample). The file /pnfs/1233 on DS1 will contain the first stripe unit of
+data written to foo, DS2 will contain the next stripe unit, etc.
+Removing /mnt/foo will remove the corresponding stripe files on the DSs.
+Other file system operations should behave (mostly :-) as expected.
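+
+As a quick check, write some data to the file from the client (any
+sizeable write will do), e.g.:
+
+    dd if=/dev/zero of=/mnt/foo bs=1M count=4
+
+and the stripe files named after foo's inode number should grow on each
+DS under /pnfs, each holding its share of the stripes.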
+
+
+Layout Segments
+---------------
+
+If the kernel is compiled to support layout segments, there will
+be two files created under /proc/fs/spnfs for controlling layout
+segment functionality.
+
+To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.:
+
+    echo 1 > /proc/fs/spnfs/layoutseg
+
+Layout segments can be disabled (returning to whole-file layouts) by
+writing a '0' to /proc/fs/spnfs/layoutseg:
+
+    echo 0 > /proc/fs/spnfs/layoutseg
+
+When layout segments are enabled, the size of the layouts returned can
+be specified by writing a decimal number (ASCII representation) to
+/proc/fs/spnfs/layoutsegsize:
+
+    echo 1024 > /proc/fs/spnfs/layoutsegsize
+
+The value '0' has a special meaning: it causes the server to return a
+layout that is exactly the size requested by the client:
+
+    echo 0 > /proc/fs/spnfs/layoutsegsize
+
+
+Troubleshooting
+---------------
+
+If you see data being written to the files on the MDS rather than
+the stripe files, make sure the nfslayoutdriver is loaded on the client
+(see above).
+
+If you get a "permission denied" error, make sure mountd is running on the MDS
+(it occasionally fails to start).
+
+Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com
+
+
diff -up linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.34.noarch/drivers/md/dm-ioctl.c
--- linux-2.6.34.noarch/drivers/md/dm-ioctl.c.orig	2010-09-30 10:15:01.214222000 -0400
+++ linux-2.6.34.noarch/drivers/md/dm-ioctl.c	2010-09-30 10:17:08.417985000 -0400
@@ -657,6 +657,12 @@ static int dev_create(struct dm_ioctl *p
 	return r;
 }
 
+int dm_dev_create(struct dm_ioctl *param)
+{
+	return dev_create(param, sizeof(*param));
+}
+EXPORT_SYMBOL(dm_dev_create);
+
 /*
  * Always use UUID for lookups if it's present, otherwise use name or dev.
  */
@@ -751,6 +757,12 @@ static int dev_remove(struct dm_ioctl *p
 	return 0;
 }
 
+int dm_dev_remove(struct dm_ioctl *param)
+{
+	return dev_remove(param, sizeof(*param));
+}
+EXPORT_SYMBOL(dm_dev_remove);
+
 /*
  * Check a string doesn't overrun the chunk of
  * memory we copied from userland.
  */
@@ -923,6 +935,12 @@ static int do_resume(struct dm_ioctl *pa
 	return r;
 }
 
+int dm_do_resume(struct dm_ioctl *param)
+{
+	return do_resume(param);
+}
+EXPORT_SYMBOL(dm_do_resume);
+
 /*
  * Set or unset the suspension state of a device.
  * If the device already is in the requested state we just return its status.
  */
@@ -1200,6 +1218,12 @@ out:
 	return r;
 }
 
+int dm_table_load(struct dm_ioctl *param, size_t param_size)
+{
+	return table_load(param, param_size);
+}
+EXPORT_SYMBOL(dm_table_load);
+
 static int table_clear(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
diff -up linux-2.6.34.noarch/drivers/scsi/hosts.c.orig linux-2.6.34.noarch/drivers/scsi/hosts.c
--- linux-2.6.34.noarch/drivers/scsi/hosts.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/drivers/scsi/hosts.c	2010-09-30 10:17:08.422988000 -0400
@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct
 	put_device(&class_to_shost(dev)->shost_gendev);
 }
 
-static struct class shost_class = {
+struct class shost_class = {
 	.name		= "scsi_host",
 	.dev_release	= scsi_host_cls_release,
 };
diff -up linux-2.6.34.noarch/fs/exofs/exofs.h.orig linux-2.6.34.noarch/fs/exofs/exofs.h
--- linux-2.6.34.noarch/fs/exofs/exofs.h.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exofs/exofs.h	2010-09-30 10:17:08.444986000 -0400
@@ -36,13 +36,9 @@
 #include
 #include
 #include
+#include
 #include "common.h"
 
-/* FIXME: Remove once pnfs hits mainline
- * #include
- */
-#include "pnfs.h"
-
 #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
 
 #ifdef CONFIG_EXOFS_DEBUG
@@ -103,6 +99,7 @@ struct exofs_sb_info {
 struct exofs_i_info {
 	struct inode       vfs_inode;          /* normal in-memory inode */
 	wait_queue_head_t  i_wq;               /* wait queue for inode */
+	spinlock_t         i_layout_lock;      /* lock for layout/return/recall */
 	unsigned long      i_flags;            /* various atomic flags */
 	uint32_t           i_data[EXOFS_IDATA];/*short symlink names and device #s*/
 	uint32_t           i_dir_start_lookup; /* which page to start lookup */
@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si
  */
 #define OBJ_2BCREATED	0	/* object will be created soon*/
 #define OBJ_CREATED	1	/* object has been created on the osd*/
+/* The flags below are not used atomically but reuse the same i_flags */
+#define OBJ_LAYOUT_IS_GIVEN	2	/* inode has given layouts to clients*/
+#define OBJ_IN_LAYOUT_RECALL	3	/* inode is in the middle of a layout recall*/
 
 static inline int obj_2bcreated(struct exofs_i_info *oi)
 {
@@ -304,4 +304,20 @@ extern const struct inode_operations exo
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
 
+/* export.c */
+typedef int (exofs_recall_fn)(struct inode *inode);
+#ifdef CONFIG_PNFSD
+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode,
+			      exofs_recall_fn todo);
+void exofs_init_export(struct super_block *sb);
+#else
+static inline int exofs_inode_recall_layout(struct inode *inode,
+	enum pnfs_iomode iomode, exofs_recall_fn todo)
+{
+	return todo(inode);
+}
+
+static inline void exofs_init_export(struct super_block *sb) {}
+#endif
+
 #endif
diff -up linux-2.6.34.noarch/fs/exofs/export.c.orig linux-2.6.34.noarch/fs/exofs/export.c
--- linux-2.6.34.noarch/fs/exofs/export.c.orig	2010-09-30 10:17:08.447987000 -0400
+++ linux-2.6.34.noarch/fs/exofs/export.c	2010-09-30 10:17:08.449986000 -0400
@@ -0,0 +1,396 @@
+/*
+ * export.c - Implementation of the pnfs_export_operations
+ *
+ * Copyright (C) 2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Boaz Harrosh
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include
+#include "exofs.h"
+
+static int exofs_layout_type(struct super_block *sb)
+{
+	return LAYOUT_OSD2_OBJECTS;
+}
+
+static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid)
+{
+	struct nfsd4_pnfs_deviceid *dev_id =
+		(struct nfsd4_pnfs_deviceid *)pnfs_devid;
+
+	dev_id->sbid = sbid;
+	dev_id->devid = devid;
+}
+
+static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode,
+			    u64 offset, u64 length, void *cookie)
+{
+	struct nfsd4_pnfs_cb_layout cbl;
+	struct pnfsd_cb_ctl cb_ctl;
+	int status;
+
+	memset(&cb_ctl, 0, sizeof(cb_ctl));
+	status = pnfsd_get_cb_op(&cb_ctl);
+	if (unlikely(status)) {
+		EXOFS_ERR("%s: nfsd unloaded!! inode (0x%lx) status=%d\n",
+			  __func__, inode->i_ino, status);
+		goto err;
+	}
+
+	memset(&cbl, 0, sizeof(cbl));
+	cbl.cbl_recall_type = RETURN_FILE;
+	cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS;
+	cbl.cbl_seg.iomode = iomode;
+	cbl.cbl_seg.offset = offset;
+	cbl.cbl_seg.length = length;
+	cbl.cbl_cookie = cookie;
+
+	status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl);
+	pnfsd_put_cb_op(&cb_ctl);
+
+err:
+	return status;
+}
+
+static enum nfsstat4 exofs_layout_get(
+	struct inode *inode,
+	struct exp_xdr_stream *xdr,
+	const struct nfsd4_pnfs_layoutget_arg *args,
+	struct nfsd4_pnfs_layoutget_res *res)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_layout *el = &sbi->layout;
+	struct pnfs_osd_object_cred *creds = NULL;
+	struct pnfs_osd_layout layout;
+	__be32 *start;
+	bool in_recall;
+	int i, err;
+	enum nfsstat4 nfserr;
+
+	res->lg_seg.offset = 0;
+	res->lg_seg.length = NFS4_MAX_UINT64;
+	res->lg_seg.iomode = IOMODE_RW;
+	res->lg_return_on_close = true; /* TODO: unused but will be soon */
+
+	/* skip opaque size, will be filled-in later */
+	start = exp_xdr_reserve_qwords(xdr, 1);
+	if (!start) {
+		nfserr = NFS4ERR_TOOSMALL;
+		goto out;
+	}
+
+	creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL);
+	if (!creds) {
+		nfserr = NFS4ERR_LAYOUTTRYLATER;
+		goto out;
+	}
+
+	/* Fill in a pnfs_osd_layout struct */
+	layout.olo_map = sbi->data_map;
+
+	for (i = 0; i < el->s_numdevs; i++) {
+		struct pnfs_osd_object_cred *cred = &creds[i];
+		osd_id id = exofs_oi_objno(oi);
+		unsigned dev = exofs_layout_od_id(el, id, i);
+
+		set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid,
+			   dev);
+		cred->oc_object_id.oid_partition_id = el->s_pid;
+		cred->oc_object_id.oid_object_id = id;
+		cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ?
+						PNFS_OSD_VERSION_1 :
+						PNFS_OSD_VERSION_2;
+		cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE;
+
+		cred->oc_cap_key.cred_len = 0;
+		cred->oc_cap_key.cred = NULL;
+
+		cred->oc_cap.cred_len = OSD_CAP_LEN;
+		cred->oc_cap.cred = oi->i_cred;
+	}
+
+	layout.olo_comps_index = 0;
+	layout.olo_num_comps = el->s_numdevs;
+	layout.olo_comps = creds;
+
+	err = pnfs_osd_xdr_encode_layout(xdr, &layout);
+	if (err) {
+		nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */
+		goto out;
+	}
+
+	exp_xdr_encode_opaque_len(start, xdr->p);
+
+	spin_lock(&oi->i_layout_lock);
+	in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+	if (!in_recall) {
+		__set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+		nfserr = NFS4_OK;
+	} else {
+		nfserr = NFS4ERR_RECALLCONFLICT;
+	}
+	spin_unlock(&oi->i_layout_lock);
+
+out:
+	kfree(creds);
+	EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n",
+		     inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start));
+	return nfserr;
+}
+
+/* NOTE: inode mutex must NOT be held */
+static int exofs_layout_commit(
+	struct inode *inode,
+	const struct nfsd4_pnfs_layoutcommit_arg *args,
+	struct nfsd4_pnfs_layoutcommit_res *res)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct timespec mtime;
+	loff_t i_size;
+	int in_recall;
+
+	/* In case of a recall we ignore the new size and mtime since they
+	 * are going to be changed again by truncate, and since we cannot take
+	 * the inode lock in that case.
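+	 * (Such a recall can be pending here because exofs_truncate, in the
+	 * inode.c hunk below, recalls any outstanding layout via
+	 * exofs_inode_recall_layout() before doing the actual truncate.)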
+	 */
+	spin_lock(&oi->i_layout_lock);
+	in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+	spin_unlock(&oi->i_layout_lock);
+	if (in_recall) {
+		EXOFS_DBGMSG("(0x%lx) commit was called during recall\n",
+			     inode->i_ino);
+		return 0;
+	}
+
+	/* NOTE: I would love to call inode_setattr here
+	 * but I cannot since this will cause an eventual vmtruncate,
+	 * which will cause a layout_recall. So open code the i_size
+	 * and mtime/atime changes under i_mutex.
+	 */
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL);
+
+	if (args->lc_mtime.seconds) {
+		mtime.tv_sec = args->lc_mtime.seconds;
+		mtime.tv_nsec = args->lc_mtime.nseconds;
+
+		/* layout commit may only make time bigger, since there might
+		 * be reordering of the notifications and it might arrive after
+		 * a local change.
+		 * TODO: if mtime > ctime then we know set_attr did an mtime
+		 * in the future, and we can let this update through
+		 */
+		if (0 <= timespec_compare(&mtime, &inode->i_mtime))
+			mtime = inode->i_mtime;
+	} else {
+		mtime = current_fs_time(inode->i_sb);
+	}
+
+	/* TODO: Will the below work, given that mark_inode_dirty has its own
+	 * time handling?
+	 */
+	inode->i_atime = inode->i_mtime = mtime;
+
+	i_size = i_size_read(inode);
+	if (args->lc_newoffset) {
+		loff_t new_size = args->lc_last_wr + 1;
+
+		if (i_size < new_size) {
+			i_size_write(inode, i_size = new_size);
+			res->lc_size_chg = 1;
+			res->lc_newsize = new_size;
+		}
+	}
+	/* TODO: else { i_size = osd_get_object_length() } */
+
+/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */
+
+	mark_inode_dirty_sync(inode);
+
+	mutex_unlock(&inode->i_mutex);
+	EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n",
+		     inode->i_ino, i_size, args->lc_last_wr);
+	return 0;
+}
+
+static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr)
+{
+	EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx "
+		  "offset=0x%llx length=0x%llx\n",
+		  ioerr->oer_errno, ioerr->oer_iswrite,
+		  _LLU(ioerr->oer_component.oid_object_id),
+		  _LLU(ioerr->oer_comp_offset),
+		  _LLU(ioerr->oer_comp_length));
+}
+
+static int exofs_layout_return(
+	struct inode *inode,
+	const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+	__be32 *p = args->lrf_body;
+	unsigned len = exp_xdr_qwords(args->lrf_body_len);
+
+	EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n",
+		     inode->i_ino, args->lr_cookie, len);
+
+	while (len >= pnfs_osd_ioerr_xdr_sz()) {
+		struct pnfs_osd_ioerr ioerr;
+
+		p = pnfs_osd_xdr_decode_ioerr(&ioerr, p);
+		len -= pnfs_osd_ioerr_xdr_sz();
+		exofs_handle_error(&ioerr);
+	}
+
+	if (args->lr_cookie) {
+		struct exofs_i_info *oi = exofs_i(inode);
+		bool in_recall;
+
+		spin_lock(&oi->i_layout_lock);
+		in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+		__clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+		spin_unlock(&oi->i_layout_lock);
+
+		/* TODO: how to communicate cookie with the waiter */
+		if (in_recall)
+			wake_up(&oi->i_wq); /* wakeup any recalls */
+	}
+
+	return 0;
+}
+
+int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr,
+			  u32 layout_type,
+			  const struct nfsd4_pnfs_deviceid *devid)
+{
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct pnfs_osd_deviceaddr devaddr;
+	const struct osd_dev_info *odi;
+	u64 devno = devid->devid;
+	__be32 *start;
+	int err;
+
+	memset(&devaddr, 0, sizeof(devaddr));
+
+	if (unlikely(devno >= sbi->layout.s_numdevs))
+		return -ENODEV;
+
+	odi = osduld_device_info(sbi->layout.s_ods[devno]);
+
+	devaddr.oda_systemid.len = odi->systemid_len;
+	devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */
+
+	devaddr.oda_osdname.len = odi->osdname_len;
+	devaddr.oda_osdname.data = (void *)odi->osdname; /* !const cast */
+
+	/* skip opaque size, will be filled-in later */
+	start = exp_xdr_reserve_qwords(xdr, 1);
+	if (!start) {
+		err = -E2BIG;
+		goto err;
+	}
+
+	err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr);
+	if (err)
+		goto err;
+
+	exp_xdr_encode_opaque_len(start, xdr->p);
+
+	EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n",
+		     exp_xdr_qbytes(xdr->p - start), devno, odi->osdname);
+	return 0;
+
+err:
+	EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n",
+		     err, exp_xdr_qbytes(xdr->p - start));
+	return err;
+}
+
+struct pnfs_export_operations exofs_pnfs_ops = {
+	.layout_type	 = exofs_layout_type,
+	.layout_get	 = exofs_layout_get,
+	.layout_commit	 = exofs_layout_commit,
+	.layout_return	 = exofs_layout_return,
+	.get_device_info = exofs_get_device_info,
+};
+
+static bool is_layout_returned(struct exofs_i_info *oi)
+{
+	bool layout_given;
+
+	spin_lock(&oi->i_layout_lock);
+	layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+	spin_unlock(&oi->i_layout_lock);
+
+	return !layout_given;
+}
+
+int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode,
+			      exofs_recall_fn todo)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	int layout_given;
+	int error = 0;
+
+	spin_lock(&oi->i_layout_lock);
+	layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags);
+	__set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+	spin_unlock(&oi->i_layout_lock);
+
+	if (!layout_given)
+		goto exec;
+
+	for (;;) {
+		EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n",
+			     inode->i_ino);
+		error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64,
+					 &oi->i_wq);
+		switch (error) {
+		case 0:
+		case -EAGAIN:
+			break;
+		case -ENOENT:
+			goto exec;
+		default:
+			goto err;
+		}
+
+		error = wait_event_interruptible(oi->i_wq,
+						 is_layout_returned(oi));
+		if (error)
+			goto err;
+	}
+
+exec:
+	error = todo(inode);
+
+err:
+	spin_lock(&oi->i_layout_lock);
+	__clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags);
+	spin_unlock(&oi->i_layout_lock);
+	EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error);
+	return error;
+}
+
+void exofs_init_export(struct super_block *sb)
+{
+	sb->s_pnfs_op = &exofs_pnfs_ops;
+}
diff -up linux-2.6.34.noarch/fs/exofs/inode.c.orig linux-2.6.34.noarch/fs/exofs/inode.c
--- linux-2.6.34.noarch/fs/exofs/inode.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exofs/inode.c	2010-09-30 10:17:08.454986000 -0400
@@ -833,7 +833,7 @@ void exofs_truncate(struct inode *inode)
 	if (unlikely(wait_obj_created(oi)))
 		goto fail;
 
-	ret = _do_truncate(inode);
+	ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate);
 	if (ret)
 		goto fail;
 
@@ -964,6 +964,7 @@ static void __oi_init(struct exofs_i_inf
 {
 	init_waitqueue_head(&oi->i_wq);
 	oi->i_flags = 0;
+	spin_lock_init(&oi->i_layout_lock);
 }
 /*
  * Fill in an inode read from the OSD and set it up for use
diff -up linux-2.6.34.noarch/fs/exofs/Kbuild.orig linux-2.6.34.noarch/fs/exofs/Kbuild
--- linux-2.6.34.noarch/fs/exofs/Kbuild.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exofs/Kbuild	2010-09-30 10:17:08.434986000 -0400
@@ -13,4 +13,5 @@
 #
 exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
+exofs-$(CONFIG_PNFSD) += export.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff -up linux-2.6.34.noarch/fs/exofs/Kconfig.orig linux-2.6.34.noarch/fs/exofs/Kconfig
--- linux-2.6.34.noarch/fs/exofs/Kconfig.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exofs/Kconfig	2010-09-30 10:17:08.438994000 -0400
@@ -1,6 +1,7 @@
 config EXOFS_FS
 	tristate "exofs: OSD based file system support"
 	depends on SCSI_OSD_ULD
+	select EXPORTFS_OSD_LAYOUT if PNFSD
 	help
 	  EXOFS is a file system that uses an OSD storage device,
 	  as its backing storage.
diff -up linux-2.6.34.noarch/fs/exofs/super.c.orig linux-2.6.34.noarch/fs/exofs/super.c
--- linux-2.6.34.noarch/fs/exofs/super.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exofs/super.c	2010-09-30 10:17:08.465986000 -0400
@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super
 	sb->s_fs_info = sbi;
 	sb->s_op = &exofs_sops;
 	sb->s_export_op = &exofs_export_ops;
+	exofs_init_export(sb);
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
 	if (IS_ERR(root)) {
 		EXOFS_ERR("ERROR: exofs_iget failed\n");
diff -up linux-2.6.34.noarch/fs/exportfs/expfs.c.orig linux-2.6.34.noarch/fs/exportfs/expfs.c
--- linux-2.6.34.noarch/fs/exportfs/expfs.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exportfs/expfs.c	2010-09-30 10:17:08.489990000 -0400
@@ -16,6 +16,13 @@
 #include
 #include
 
+#if defined(CONFIG_PNFSD)
+struct pnfsd_cb_ctl pnfsd_cb_ctl = {
+	.lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock)
+};
+EXPORT_SYMBOL(pnfsd_cb_ctl);
+#endif /* CONFIG_PNFSD */
+
 #define dprintk(fmt, args...) do{}while(0)
diff -up linux-2.6.34.noarch/fs/exportfs/Makefile.orig linux-2.6.34.noarch/fs/exportfs/Makefile
--- linux-2.6.34.noarch/fs/exportfs/Makefile.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/exportfs/Makefile	2010-09-30 10:17:08.484990000 -0400
@@ -3,4 +3,7 @@
 
 obj-$(CONFIG_EXPORTFS) += exportfs.o
 
-exportfs-objs := expfs.o
+exportfs-y := expfs.o
+exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o
+exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o
+exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o
diff -up linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c
--- linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig	2010-09-30 10:17:08.492991000 -0400
+++ linux-2.6.34.noarch/fs/exportfs/nfs4blocklayoutxdr.c	2010-09-30 10:17:08.494987000 -0400
@@ -0,0 +1,158 @@
+/*
+ * linux/fs/nfsd/nfs4blocklayoutxdr.c
+ *
+ *
+ * Created by Rick McNeal on 3/31/08.
+ * Copyright 2008 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include
+#include
+#include
+#include
+
+static int
+bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+	__be32 *p = exp_xdr_reserve_space(xdr,
+					  12 + 4 + bld->u.simple.bld_sig_len);
+
+	if (!p)
+		return -ETOOSMALL;
+
+	p = exp_xdr_encode_u32(p, 1);
+	p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset);
+	exp_xdr_encode_opaque(p, bld->u.simple.bld_sig,
+			      bld->u.simple.bld_sig_len);
+
+	return 0;
+}
+
+static int
+bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+	__be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1);
+
+	if (!p)
+		return -ETOOSMALL;
+
+	p = exp_xdr_encode_u64(p, bld->u.slice.bld_start);
+	p = exp_xdr_encode_u64(p, bld->u.slice.bld_len);
+	exp_xdr_encode_u32(p, bld->u.slice.bld_index);
+
+	return 0;
+}
+
+static int
+bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+	return -ENOTSUPP;
+}
+
+static int
+bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld)
+{
+	int i;
+	__be32 *p = exp_xdr_reserve_space(xdr,
+					  2 + 1 + bld->u.stripe.bld_stripes);
+
+	p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size);
+	p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes);
+	for (i = 0; i < bld->u.stripe.bld_stripes; i++)
+		p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]);
+
+	return 0;
+}
+
+int
+blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
+			   const struct list_head *volumes)
+{
+	u32 num_vols = 0,
+	    *layoutlen_p = xdr->p;
+	pnfs_blocklayout_devinfo_t *bld;
+	int status = 0;
+	__be32 *p;
+
+	p = exp_xdr_reserve_qwords(xdr, 2);
+	if (!p)
+		return -ETOOSMALL;
+	p += 2;
+
+	/*
+	 * All simple volumes with their signature are required to be listed
+	 * first.
+	 */
+	list_for_each_entry(bld, volumes, bld_list) {
+		num_vols++;
+		p = exp_xdr_reserve_qwords(xdr, 1);
+		if (!p)
+			return -ETOOSMALL;
+		p = exp_xdr_encode_u32(p, bld->bld_type);
+		switch (bld->bld_type) {
+		case PNFS_BLOCK_VOLUME_SIMPLE:
+			status = bl_encode_simple(xdr, bld);
+			break;
+		case PNFS_BLOCK_VOLUME_SLICE:
+			status = bl_encode_slice(xdr, bld);
+			break;
+		case PNFS_BLOCK_VOLUME_CONCAT:
+			status = bl_encode_concat(xdr, bld);
+			break;
+		case PNFS_BLOCK_VOLUME_STRIPE:
+			status = bl_encode_stripe(xdr, bld);
+			break;
+		default:
+			BUG();
+		}
+		if (status)
+			goto error;
+	}
+
+	/* ---- Fill in the overall length and number of volumes ---- */
+	p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4);
+	exp_xdr_encode_u32(p, num_vols);
+
+error:
+	return status;
+}
+EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo);
+
+enum nfsstat4
+blocklayout_encode_layout(struct exp_xdr_stream *xdr,
+			  const struct list_head *bl_head)
+{
+	struct pnfs_blocklayout_layout *b;
+	u32 *layoutlen_p = xdr->p,
+	    extents = 0;
+	__be32 *p;
+
+	/*
+	 * Save spot for opaque block layout length and number of extents,
+	 * fill-in later.
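+	 * (Each extent below carries five u64 fields plus one u32 state,
+	 * hence the 5 * 2 + 1 qwords reserved per extent.)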
+	 */
+	p = exp_xdr_reserve_qwords(xdr, 2);
+	if (!p)
+		return NFS4ERR_TOOSMALL;
+	p += 2;
+
+	list_for_each_entry(b, bl_head, bll_list) {
+		extents++;
+		p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1);
+		if (!p)
+			return NFS4ERR_TOOSMALL;
+		p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid);
+		p = exp_xdr_encode_u64(p, b->bll_vol_id.devid);
+		p = exp_xdr_encode_u64(p, b->bll_foff);
+		p = exp_xdr_encode_u64(p, b->bll_len);
+		p = exp_xdr_encode_u64(p, b->bll_soff);
+		p = exp_xdr_encode_u32(p, b->bll_es);
+	}
+
+	/* ---- Fill in the overall length and number of extents ---- */
+	p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4);
+	exp_xdr_encode_u32(p, extents);
+
+	return NFS4_OK;
+}
+EXPORT_SYMBOL_GPL(blocklayout_encode_layout);
diff -up linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c
--- linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig	2010-09-30 10:17:08.496992000 -0400
+++ linux-2.6.34.noarch/fs/exportfs/nfs4filelayoutxdr.c	2010-09-30 10:17:08.498993000 -0400
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include
+#include
+#include
+#include
+#include
+
+/* We do our own dprintk so filesystems are not dependent on sunrpc */
+#ifdef dprintk
+#undef dprintk
+#endif
+#define dprintk(fmt, args...) do { } while (0)
+
+/* Calculate the XDR length of the GETDEVICEINFO4resok structure
+ * excluding the gdir_notification and the gdir_device_addr da_layout_type.
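+ * The returned count is in units of 4-byte XDR words, not bytes.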
+ */
+static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev)
+{
+	struct pnfs_filelayout_devaddr *fl_addr;
+	struct pnfs_filelayout_multipath *mp;
+	int i, j, nwords;
+
+	/* da_addr_body length, indice length, indices,
+	 * multipath_list4 length */
+	nwords = 1 + 1 + fdev->fl_stripeindices_length + 1;
+	for (i = 0; i < fdev->fl_device_length; i++) {
+		mp = &fdev->fl_device_list[i];
+		nwords++; /* multipath list length */
+		for (j = 0; j < mp->fl_multipath_length; j++) {
+			fl_addr = mp->fl_multipath_list;
+			nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len);
+			nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len);
+		}
+	}
+	dprintk("<-- %s nwords %d\n", __func__, nwords);
+	return nwords;
+}
+
+/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13
+ * on the response stream.
+ * Use linux error codes (not nfs) since these values are being
+ * returned to the file system.
+ */
+int
+filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
+			  const struct pnfs_filelayout_device *fdev)
+{
+	unsigned int i, j, len = 0, opaque_words;
+	u32 *p_in;
+	u32 index_count = fdev->fl_stripeindices_length;
+	u32 dev_count = fdev->fl_device_length;
+	int error = 0;
+	__be32 *p;
+
+	opaque_words = fl_devinfo_xdr_words(fdev);
+	dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n",
+		__func__,
+		index_count,
+		dev_count,
+		opaque_words*4);
+
+	/* check space for opaque length */
+	p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words);
+	if (!p) {
+		error = -ETOOSMALL;
+		goto out;
+	}
+
+	/* Fill in length later */
+	p++;
+
+	/* encode device list indices */
+	p = exp_xdr_encode_u32(p, index_count);
+	for (i = 0; i < index_count; i++)
+		p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]);
+
+	/* encode device list */
+	p = exp_xdr_encode_u32(p, dev_count);
+	for (i = 0; i < dev_count; i++) {
+		struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i];
+
+		p = exp_xdr_encode_u32(p, mp->fl_multipath_length);
+		for (j = 0; j < mp->fl_multipath_length; j++) {
+			struct pnfs_filelayout_devaddr *da =
+				&mp->fl_multipath_list[j];
+
+			/* Encode device info */
+			p = exp_xdr_encode_opaque(p, da->r_netid.data,
+						  da->r_netid.len);
+			p = exp_xdr_encode_opaque(p, da->r_addr.data,
+						  da->r_addr.len);
+		}
+	}
+
+	/* backfill in length. Subtract 4 for da_addr_body size */
+	len = (char *)p - (char *)p_in;
+	exp_xdr_encode_u32(p_in, len - 4);
+
+	error = 0;
+out:
+	dprintk("%s: End err %d xdrlen %d\n",
+		__func__, error, len);
+	return error;
+}
+EXPORT_SYMBOL(filelayout_encode_devinfo);
+
+/* Encodes the loc_body structure from draft 13
+ * on the response stream.
+ * Unlike filelayout_encode_devinfo above, this returns nfs status codes
+ * (enum nfsstat4), which the file system hands back to the pNFS server.
+ */
+enum nfsstat4
+filelayout_encode_layout(struct exp_xdr_stream *xdr,
+			 const struct pnfs_filelayout_layout *flp)
+{
+	u32 len = 0, nfl_util, fhlen, i;
+	u32 *layoutlen_p;
+	enum nfsstat4 nfserr;
+	__be32 *p;
+
+	dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n",
+		__func__,
+		flp->device_id.pnfs_fsid,
+		flp->device_id.pnfs_devid,
+		flp->lg_first_stripe_index,
+		flp->lg_fh_length);
+
+	/* Ensure file system added at least one file handle */
+	if (flp->lg_fh_length <= 0) {
+		dprintk("%s: File Layout has no file handles!!\n", __func__);
+		nfserr = NFS4ERR_LAYOUTUNAVAILABLE;
+		goto out;
+	}
+
+	/* Ensure room for len, devid, util, first_stripe_index,
+	 * pattern_offset, number of filehandles */
+	p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1);
+	if (!p) {
+		nfserr = NFS4ERR_TOOSMALL;
+		goto out;
+	}
+
+	/* save spot for opaque file layout length, fill-in later*/
+	p++;
+
+	/* encode device id */
+	p = exp_xdr_encode_u64(p, flp->device_id.sbid);
+	p = exp_xdr_encode_u64(p, flp->device_id.devid);
+
+	/* set and encode flags */
+	nfl_util = flp->lg_stripe_unit;
+	if (flp->lg_commit_through_mds)
+		nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS;
+	if (flp->lg_stripe_type == STRIPE_DENSE)
+		nfl_util |= NFL4_UFLG_DENSE;
+	p = exp_xdr_encode_u32(p, nfl_util);
+
+	/* encode first stripe index */
+	p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index);
+
+	/* encode striping pattern start */
+	p = exp_xdr_encode_u64(p, flp->lg_pattern_offset);
+
+	/* encode number of file handles */
+	p = exp_xdr_encode_u32(p, flp->lg_fh_length);
+
+	/* encode file handles */
+	for (i = 0; i < flp->lg_fh_length; i++) {
+		fhlen = flp->lg_fh_list[i].fh_size;
+		p = exp_xdr_reserve_space(xdr, 4 + fhlen);
+		if (!p) {
+			nfserr = NFS4ERR_TOOSMALL;
+			goto out;
+		}
+		p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen);
+	}
+
+	/* Set number of bytes encoded = total_bytes_encoded - length var */
+	len = (char *)p - (char *)layoutlen_p;
+	exp_xdr_encode_u32(layoutlen_p, len - 4);
+
+	nfserr = NFS4_OK;
out:
+	dprintk("%s: End err %u xdrlen %d\n",
+		__func__, nfserr, len);
+	return nfserr;
+}
+EXPORT_SYMBOL(filelayout_encode_layout);
diff -up linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c
--- linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig	2010-09-30 10:17:08.501989000 -0400
+++ linux-2.6.34.noarch/fs/exportfs/pnfs_osd_xdr_srv.c	2010-09-30 10:17:08.503988000 -0400
@@ -0,0 +1,289 @@
+/*
+ * pnfs_osd_xdr_enc.c
+ *
+ * Object-Based pNFS Layout XDR layer
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include
+#include
+
+/*
+ * struct pnfs_osd_data_map {
+ *	u32	odm_num_comps;
+ *	u64	odm_stripe_unit;
+ *	u32	odm_group_width;
+ *	u32	odm_group_depth;
+ *	u32	odm_mirror_cnt;
+ *	u32	odm_raid_algorithm;
+ * };
+ */
+static int pnfs_osd_xdr_encode_data_map(
+	struct exp_xdr_stream *xdr,
+	struct pnfs_osd_data_map *data_map)
+{
+	__be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1);
+
+	if (!p)
+		return -E2BIG;
+
+	p = exp_xdr_encode_u32(p, data_map->odm_num_comps);
+	p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit);
+	p = exp_xdr_encode_u32(p, data_map->odm_group_width);
+	p = exp_xdr_encode_u32(p, data_map->odm_group_depth);
+	p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt);
+	p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm);
+
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct pnfs_deviceid	oid_device_id;
+ *	u64	oid_partition_id;
+ *	u64	oid_object_id;
+ * };
+ */
+static inline int pnfs_osd_xdr_encode_objid(
+	struct exp_xdr_stream *xdr,
+	struct pnfs_osd_objid *object_id)
+{
+	__be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2);
+	struct nfsd4_pnfs_deviceid *dev_id =
+		(struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id;
+
+	if (!p)
+		return -E2BIG;
+
+	p = exp_xdr_encode_u64(p, dev_id->sbid);
+	p = exp_xdr_encode_u64(p, dev_id->devid);
+	p = exp_xdr_encode_u64(p, object_id->oid_partition_id);
+	p = exp_xdr_encode_u64(p, object_id->oid_object_id);
+
+	return 0;
+}
+
+/*
+ * enum pnfs_osd_cap_key_sec4 {
+ *	PNFS_OSD_CAP_KEY_SEC_NONE = 0,
+ *	PNFS_OSD_CAP_KEY_SEC_SSV  = 1
+ * };
+ *
+ * struct pnfs_osd_object_cred {
+ *	struct pnfs_osd_objid		oc_object_id;
+ *	u32				oc_osd_version;
+ *	u32				oc_cap_key_sec;
+ *	struct pnfs_osd_opaque_cred	oc_cap_key;
+ *	struct pnfs_osd_opaque_cred	oc_cap;
+ * };
+ */
+static int pnfs_osd_xdr_encode_object_cred(
+	struct exp_xdr_stream *xdr,
+	struct pnfs_osd_object_cred *olo_comp)
+{
+	__be32 *p;
+	int err;
+
+	err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id);
+	if (err)
+		return err;
+
+	p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len);
+	if (!p)
+		return -E2BIG;
+
+	p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version);
+
+	/* No sec for now */
+	p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE);
+	p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */
+
+	exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred,
+			      olo_comp->oc_cap.cred_len);
+
+	return 0;
+}
+
+/*
+ * struct pnfs_osd_layout {
+ *	struct pnfs_osd_data_map	olo_map;
+ *	u32	olo_comps_index;
+ *	u32	olo_num_comps;
+ *	struct pnfs_osd_object_cred	*olo_comps;
+ * };
+ */
+int pnfs_osd_xdr_encode_layout(
+	struct exp_xdr_stream *xdr,
+	struct pnfs_osd_layout *pol)
+{
+	__be32 *p;
+	u32 i;
+	int err;
+
+	err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map);
+	if (err)
+		return err;
+
+	p = exp_xdr_reserve_qwords(xdr, 2);
+	if (!p)
+		return -E2BIG;
+
+	p = exp_xdr_encode_u32(p, pol->olo_comps_index);
+	p = exp_xdr_encode_u32(p, pol->olo_num_comps);
+
+	for (i = 0; i < pol->olo_num_comps; i++) {
+		err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout);
+
+static int _encode_string(struct exp_xdr_stream *xdr,
+			  const struct nfs4_string *str)
+{
+	__be32 *p = exp_xdr_reserve_space(xdr, 4 + str->len);
+
+	if (!p)
+		return -E2BIG;
+	exp_xdr_encode_opaque(p, str->data, str->len);
+	return 0;
+}
+
+/* struct pnfs_osd_deviceaddr {
+ *	struct pnfs_osd_targetid	oda_targetid;
+ *	struct pnfs_osd_targetaddr	oda_targetaddr;
+ *	u8				oda_lun[8];
+ *	struct nfs4_string		oda_systemid;
+ *	struct pnfs_osd_object_cred	oda_root_obj_cred;
+ *	struct nfs4_string		oda_osdname;
+ * };
+ */
+int pnfs_osd_xdr_encode_deviceaddr(
+	struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr)
+{
+	__be32 *p;
+	int err;
+
+	p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun));
+	if (!p)
+		return -E2BIG;
+
+	/* Empty oda_targetid */
+	p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON);
+
+	/* Empty oda_targetaddr for now */
+	p = exp_xdr_encode_u32(p, 0);
+
+	/* oda_lun */
+	exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun));
+
+	err = _encode_string(xdr, &devaddr->oda_systemid);
+	if (err)
+		return err;
+
+	err = pnfs_osd_xdr_encode_object_cred(xdr,
+					      &devaddr->oda_root_obj_cred);
+	if (err)
+		return err;
+
+	err = _encode_string(xdr, &devaddr->oda_osdname);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr);
+
+/*
+ * struct pnfs_osd_layoutupdate {
+ *	u32	dsu_valid;
+ *	s64	dsu_delta;
+ *	u32	olu_ioerr_flag;
+ * };
+ */
+__be32 *
+pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p)
+{
+	lou->dsu_valid = be32_to_cpu(*p++);
+	if (lou->dsu_valid)
+		p = xdr_decode_hyper(p, &lou->dsu_delta);
+	lou->olu_ioerr_flag = be32_to_cpu(*p++);
+	return p;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate);
+
+/*
+ * struct pnfs_osd_objid {
+ *	struct pnfs_deviceid	oid_device_id;
+ *	u64	oid_partition_id;
+ *	u64	oid_object_id;
+ * };
+ */
+static inline __be32 *
+pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+	/* FIXME: p = xdr_decode_fixed(...)
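+	 * for now the fixed-size device id is simply copied out of the
+	 * XDR stream with the memcpy below.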
+	 */
+	memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data));
+	p += XDR_QUADLEN(sizeof(objid->oid_device_id.data));
+
+	p = xdr_decode_hyper(p, &objid->oid_partition_id);
+	p = xdr_decode_hyper(p, &objid->oid_object_id);
+	return p;
+}
+
+/*
+ * struct pnfs_osd_ioerr {
+ *	struct pnfs_osd_objid	oer_component;
+ *	u64	oer_comp_offset;
+ *	u64	oer_comp_length;
+ *	u32	oer_iswrite;
+ *	u32	oer_errno;
+ * };
+ */
+__be32 *
+pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p)
+{
+	p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component);
+	p = xdr_decode_hyper(p, &ioerr->oer_comp_offset);
+	p = xdr_decode_hyper(p, &ioerr->oer_comp_length);
+	ioerr->oer_iswrite = be32_to_cpu(*p++);
+	ioerr->oer_errno = be32_to_cpu(*p++);
+	return p;
+}
+EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr);
diff -up linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.34.noarch/fs/gfs2/ops_fstype.c
--- linux-2.6.34.noarch/fs/gfs2/ops_fstype.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/gfs2/ops_fstype.c	2010-09-30 10:17:08.509988000 -0400
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include "gfs2.h"
 #include "incore.h"
@@ -1146,6 +1147,9 @@ static int fill_super(struct super_block
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
+#if defined(CONFIG_PNFSD)
+	sb->s_pnfs_op = &pnfs_dlm_export_ops;
+#endif /* CONFIG_PNFSD */
 	sb->s_xattr = gfs2_xattr_handlers;
 	sb->s_qcop = &gfs2_quotactl_ops;
 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
diff -up linux-2.6.34.noarch/fs/Kconfig.orig linux-2.6.34.noarch/fs/Kconfig
--- linux-2.6.34.noarch/fs/Kconfig.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/Kconfig	2010-09-30 10:17:08.428989000 -0400
@@ -224,6 +224,31 @@ config LOCKD_V4
 
 config EXPORTFS
 	tristate
 
+config EXPORTFS_FILE_LAYOUT
+	bool
+	depends on PNFSD && EXPORTFS
+	help
+	  Exportfs support for the NFSv4.1 files layout type.
+	  Must be automatically selected by supporting filesystems.
+
+config EXPORTFS_OSD_LAYOUT
+	bool
+	depends on PNFSD && EXPORTFS
+	help
+	  Exportfs support for the NFSv4.1 objects layout type.
+	  Must be automatically selected by supporting osd
+	  filesystems.
+
+	  If unsure, say N.
+
+config EXPORTFS_BLOCK_LAYOUT
+	bool
+	depends on PNFSD && EXPORTFS
+	help
+	  Exportfs support for the NFSv4.1 blocks layout type.
+	  Must be automatically selected by supporting filesystems.
+
+
 config NFS_ACL_SUPPORT
 	tristate
 	select FS_POSIX_ACL
diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c
--- linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig	2010-09-30 10:17:08.528988000 -0400
+++ linux-2.6.34.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c	2010-09-30 10:17:08.529994000 -0400
@@ -0,0 +1,66 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct pipefs_list bl_device_list;
+struct dentry *bl_device_pipe;
+
+ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len)
+{
+	int err;
+	struct pipefs_hdr *msg;
+
+	dprintk("Entering %s...\n", __func__);
+
+	msg = pipefs_readmsg(filp, src, len);
+	if (IS_ERR(msg)) {
+		dprintk("ERROR: unable to read pipefs message.\n");
+		return PTR_ERR(msg);
+	}
+
+	/* now assign the result, which wakes the blocked thread */
+	err = pipefs_assign_upcall_reply(msg, &bl_device_list);
+	if (err) {
+		dprintk("ERROR: failed to assign upcall with id %u\n",
+			msg->msgid);
+		kfree(msg);
+	}
+	return len;
+}
+
+static const struct rpc_pipe_ops bl_pipe_ops = {
+	.upcall		= pipefs_generic_upcall,
+	.downcall	= bl_pipe_downcall,
+	.destroy_msg	= pipefs_generic_destroy_msg,
+};
+
+int bl_pipe_init(void)
+{
+	dprintk("%s: block_device pipefs registering...\n", __func__);
+	bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1);
+	if (IS_ERR(bl_device_pipe))
+		dprintk("ERROR, unable to make block_device pipe\n");
+
+	if (!bl_device_pipe)
+		dprintk("bl_device_pipe is NULL!\n");
+	else
+		dprintk("bl_device_pipe created!\n");
+	pipefs_init_list(&bl_device_list);
+	return 0;
+}
+
+void bl_pipe_exit(void)
+{
+	dprintk("%s: block_device pipefs unregistering...\n", __func__);
+	if (IS_ERR(bl_device_pipe))
+		return;
+	pipefs_closepipe(bl_device_pipe);
+	return;
+}
diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c
--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c.orig	2010-09-30 10:17:08.533988000 -0400
+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.c	2010-09-30 10:17:08.535989000 -0400
@@ -0,0 +1,1160 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ * Fred Isaman
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out of or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include
+#include
+
+#include		/* various write calls */
+#include		/* struct bio */
+#include
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Andy Adamson ");
+MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
+
+/* Callback operations to the pNFS client */
+static struct pnfs_client_operations *pnfs_block_callback_ops;
+
+static void print_page(struct page *page)
+{
+	dprintk("PRINTPAGE page %p\n", page);
+	dprintk(" PagePrivate %d\n", PagePrivate(page));
+	dprintk(" PageUptodate %d\n", PageUptodate(page));
+	dprintk(" PageError %d\n", PageError(page));
+	dprintk(" PageDirty %d\n", PageDirty(page));
+	dprintk(" PageReferenced %d\n", PageReferenced(page));
+	dprintk(" PageLocked %d\n", PageLocked(page));
+	dprintk(" PageWriteback %d\n", PageWriteback(page));
+	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
+	dprintk("\n");
+}
+
+/* Given the be associated with isect, determine if page data needs to be
+ * initialized.
+ */
+static int is_hole(struct pnfs_block_extent *be, sector_t isect)
+{
+	if (be->be_state == PNFS_BLOCK_NONE_DATA)
+		return 1;
+	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+		return 0;
+	else
+		return !is_sector_initialized(be->be_inval, isect);
+}
+
+/* Given the be associated with isect, determine if page data can be
+ * written to disk.
+ */
+static int is_writable(struct pnfs_block_extent *be, sector_t isect)
+{
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA)
+		return 1;
+	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
+		return 0;
+	else
+		return is_sector_initialized(be->be_inval, isect);
+}
+
+static int
+dont_like_caller(struct nfs_page *req)
+{
+	if (atomic_read(&req->wb_complete)) {
+		/* Called by _multi */
+		return 1;
+	} else {
+		/* Called by _one */
+		return 0;
+	}
+}
+
+static enum pnfs_try_status
+bl_commit(struct nfs_write_data *nfs_data,
+	  int sync)
+{
+	dprintk("%s enter\n", __func__);
+	return PNFS_NOT_ATTEMPTED;
+}
+
+/* The data we are handed might be spread across several bios. We need
+ * to track when the last one is finished.
+ */
+struct parallel_io {
+	struct kref refcnt;
+	struct rpc_call_ops call_ops;
+	void (*pnfs_callback) (void *data);
+	void *data;
+};
+
+static inline struct parallel_io *alloc_parallel(void *data)
+{
+	struct parallel_io *rv;
+
+	rv = kmalloc(sizeof(*rv), GFP_KERNEL);
+	if (rv) {
+		rv->data = data;
+		kref_init(&rv->refcnt);
+	}
+	return rv;
+}
+
+static inline void get_parallel(struct parallel_io *p)
+{
+	kref_get(&p->refcnt);
+}
+
+static void destroy_parallel(struct kref *kref)
+{
+	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
+
+	dprintk("%s enter\n", __func__);
+	p->pnfs_callback(p->data);
+	kfree(p);
+}
+
+static inline void put_parallel(struct parallel_io *p)
+{
+	kref_put(&p->refcnt, destroy_parallel);
+}
+
+static struct bio *
+bl_submit_bio(int rw, struct bio *bio)
+{
+	if (bio) {
+		get_parallel(bio->bi_private);
+		dprintk("%s submitting %s bio %u@%llu\n", __func__,
+			rw == READ ? "read" : "write",
+			bio->bi_size, (u64)bio->bi_sector);
+		submit_bio(rw, bio);
+	}
+	return NULL;
+}
+
+static inline void
+bl_done_with_rpage(struct page *page, const int ok)
+{
+	if (ok) {
+		ClearPagePnfsErr(page);
+		SetPageUptodate(page);
+	} else {
+		ClearPageUptodate(page);
+		SetPageError(page);
+		SetPagePnfsErr(page);
+	}
+	/* Page is unlocked via rpc_release. Should really be done here. */
+}
+
+/* This is basically copied from mpage_end_io_read */
+static void bl_end_io_read(struct bio *bio, int err)
+{
+	void *data = bio->bi_private;
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+		bl_done_with_rpage(page, uptodate);
+	} while (bvec >= bio->bi_io_vec);
+	bio_put(bio);
+	put_parallel(data);
+}
+
+static void bl_read_cleanup(struct work_struct *work)
+{
+	struct rpc_task *task;
+	struct nfs_read_data *rdata;
+	dprintk("%s enter\n", __func__);
+	task = container_of(work, struct rpc_task, u.tk_work);
+	rdata = container_of(task, struct nfs_read_data, task);
+	pnfs_block_callback_ops->nfs_readlist_complete(rdata);
+}
+
+static void
+bl_end_par_io_read(void *data)
+{
+	struct nfs_read_data *rdata = data;
+
+	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
+	schedule_work(&rdata->task.u.tk_work);
+}
+
+/* We don't want normal .rpc_call_done callback used, so we replace it
+ * with this stub.
+ */
+static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
+{
+	return;
+}
+
+static enum pnfs_try_status
+bl_read_pagelist(struct nfs_read_data *rdata,
+		 unsigned nr_pages)
+{
+	int i, hole;
+	struct bio *bio = NULL;
+	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	sector_t isect, extent_length = 0;
+	struct parallel_io *par;
+	loff_t f_offset = rdata->args.offset;
+	size_t count = rdata->args.count;
+	struct page **pages = rdata->args.pages;
+	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;
+
+	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
+		nr_pages, f_offset, count);
+
+	if (dont_like_caller(rdata->req)) {
+		dprintk("%s dont_like_caller failed\n", __func__);
+		goto use_mds;
+	}
+	if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) {
+		/* We want to fall back to mds in case of read_page
+		 * after error on read_pages.
+		 */
+		dprintk("%s PG_pnfserr set\n", __func__);
+		goto use_mds;
+	}
+	par = alloc_parallel(rdata);
+	if (!par)
+		goto use_mds;
+	par->call_ops = *rdata->pdata.call_ops;
+	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
+	par->pnfs_callback = bl_end_par_io_read;
+	/* At this point, we can no longer jump to use_mds */
+
+	isect = (sector_t) (f_offset >> 9);
+	/* Code assumes extents are page-aligned */
+	for (i = pg_index; i < nr_pages; i++) {
+		if (!extent_length) {
+			/* We've used up the previous extent */
+			put_extent(be);
+			put_extent(cow_read);
+			bio = bl_submit_bio(READ, bio);
+			/* Get the next one */
+			be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg),
+					     isect, &cow_read);
+			if (!be) {
+				/* Error out this page */
+				bl_done_with_rpage(pages[i], 0);
+				break;
+			}
+			extent_length = be->be_length -
+				(isect - be->be_f_offset);
+			if (cow_read) {
+				sector_t cow_length = cow_read->be_length -
+					(isect - cow_read->be_f_offset);
+				extent_length = min(extent_length, cow_length);
+			}
+		}
+		hole = is_hole(be, isect);
+		if (hole && !cow_read) {
+			bio = bl_submit_bio(READ, bio);
+			/* Fill hole w/ zeroes w/o accessing device */
+			dprintk("%s Zeroing page for hole\n", __func__);
+			zero_user(pages[i], 0,
+				  min_t(int, PAGE_CACHE_SIZE, count));
+			print_page(pages[i]);
+			bl_done_with_rpage(pages[i], 1);
+		} else {
+			struct pnfs_block_extent *be_read;
+
+			be_read = (hole && cow_read) ? cow_read : be;
+			for (;;) {
+				if (!bio) {
+					bio = bio_alloc(GFP_NOIO, nr_pages - i);
+					if (!bio) {
+						/* Error out this page */
+						bl_done_with_rpage(pages[i], 0);
+						break;
+					}
+					bio->bi_sector = isect -
+						be_read->be_f_offset +
+						be_read->be_v_offset;
+					bio->bi_bdev = be_read->be_mdev;
+					bio->bi_end_io = bl_end_io_read;
+					bio->bi_private = par;
+				}
+				if (bio_add_page(bio, pages[i], PAGE_SIZE, 0))
+					break;
+				bio = bl_submit_bio(READ, bio);
+			}
+		}
+		isect += PAGE_CACHE_SIZE >> 9;
+		extent_length -= PAGE_CACHE_SIZE >> 9;
+	}
+	if ((isect << 9) >= rdata->inode->i_size) {
+		rdata->res.eof = 1;
+		rdata->res.count = rdata->inode->i_size - f_offset;
+	} else {
+		rdata->res.count = (isect << 9) - f_offset;
+	}
+	put_extent(be);
+	put_extent(cow_read);
+	bl_submit_bio(READ, bio);
+	put_parallel(par);
+	return PNFS_ATTEMPTED;
+
+ use_mds:
+	dprintk("Giving up and using normal NFS\n");
+	return PNFS_NOT_ATTEMPTED;
+}
+
+static void mark_extents_written(struct pnfs_block_layout *bl,
+				 __u64 offset, __u32 count)
+{
+	sector_t isect, end;
+	struct pnfs_block_extent *be;
+
+	dprintk("%s(%llu, %u)\n", __func__, offset, count);
+	if (count == 0)
+		return;
+	isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9;
+	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
+	end >>= 9;
+	while (isect < end) {
+		sector_t len;
+		be = find_get_extent(bl, isect, NULL);
+		BUG_ON(!be); /* FIXME */
+		len = min(end, be->be_f_offset + be->be_length) - isect;
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
+			mark_for_commit(be, isect, len); /* What if fails? */
+		isect += len;
+		put_extent(be);
+	}
+}
+
+/* STUB - this needs thought */
+static inline void
+bl_done_with_wpage(struct page *page, const int ok)
+{
+	if (!ok) {
+		SetPageError(page);
+		SetPagePnfsErr(page);
+		/* This is an inline copy of nfs_zap_mapping */
+		/* This is oh so fishy, and needs deep thought */
+		if (page->mapping->nrpages != 0) {
+			struct inode *inode = page->mapping->host;
+			spin_lock(&inode->i_lock);
+			NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	/* end_page_writeback called in rpc_release. Should be done here.
*/ +} + +/* This is basically copied from mpage_end_io_read */ +static void bl_end_io_write(struct bio *bio, int err) +{ + void *data = bio->bi_private; + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + + do { + struct page *page = bvec->bv_page; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + bl_done_with_wpage(page, uptodate); + } while (bvec >= bio->bi_io_vec); + bio_put(bio); + put_parallel(data); +} + +/* Function scheduled for call during bl_end_par_io_write, + * it marks sectors as written and extends the commitlist. + */ +static void bl_write_cleanup(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + if (!wdata->task.tk_status) { + /* Marks for LAYOUTCOMMIT */ + /* BUG - this should be called after each bio, not after + * all finish, unless have some way of storing success/failure + */ + mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), + wdata->args.offset, wdata->args.count); + } + pnfs_block_callback_ops->nfs_writelist_complete(wdata); +} + +/* Called when last of bios associated with a bl_write_pagelist call finishes */ +static void +bl_end_par_io_write(void *data) +{ + struct nfs_write_data *wdata = data; + + /* STUB - ignoring error handling */ + wdata->task.tk_status = 0; + wdata->verf.committed = NFS_FILE_SYNC; + INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); + schedule_work(&wdata->task.u.tk_work); +} + +static enum pnfs_try_status +bl_write_pagelist(struct nfs_write_data *wdata, + unsigned nr_pages, + int sync) +{ + int i; + struct bio *bio = NULL; + struct pnfs_block_extent *be = NULL; + sector_t isect, extent_length = 0; + struct parallel_io *par; + loff_t offset = wdata->args.offset; + size_t count = wdata->args.count; + struct page **pages = wdata->args.pages; + int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; + + dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); + if (!wdata->req->wb_lseg) { + dprintk("%s no lseg, falling back to MDS\n", __func__); + return PNFS_NOT_ATTEMPTED; + } + if (dont_like_caller(wdata->req)) { + dprintk("%s dont_like_caller failed\n", __func__); + return PNFS_NOT_ATTEMPTED; + } + /* At this point, wdata->pages is a (sequential) list of nfs_pages. + * We want to write each, and if there is an error remove it from + * list and call + * nfs_retry_request(req) to have it redone using nfs. + * QUEST? Do as block or per req? 
Think have to do per block + * as part of end_bio + */ + par = alloc_parallel(wdata); + if (!par) + return PNFS_NOT_ATTEMPTED; + par->call_ops = *wdata->pdata.call_ops; + par->call_ops.rpc_call_done = bl_rpc_do_nothing; + par->pnfs_callback = bl_end_par_io_write; + /* At this point, have to be more careful with error handling */ + + isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); + for (i = pg_index; i < nr_pages; i++) { + if (!extent_length) { + /* We've used up the previous extent */ + put_extent(be); + bio = bl_submit_bio(WRITE, bio); + /* Get the next one */ + be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), + isect, NULL); + if (!be || !is_writable(be, isect)) { + /* FIXME */ + bl_done_with_wpage(pages[i], 0); + break; + } + extent_length = be->be_length - + (isect - be->be_f_offset); + } + for (;;) { + if (!bio) { + bio = bio_alloc(GFP_NOIO, nr_pages - i); + if (!bio) { + /* Error out this page */ + /* FIXME */ + bl_done_with_wpage(pages[i], 0); + break; + } + bio->bi_sector = isect - be->be_f_offset + + be->be_v_offset; + bio->bi_bdev = be->be_mdev; + bio->bi_end_io = bl_end_io_write; + bio->bi_private = par; + } + if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) + break; + bio = bl_submit_bio(WRITE, bio); + } + isect += PAGE_CACHE_SIZE >> 9; + extent_length -= PAGE_CACHE_SIZE >> 9; + } + wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); + put_extent(be); + bl_submit_bio(WRITE, bio); + put_parallel(par); + return PNFS_ATTEMPTED; +} + +/* FIXME - range ignored */ +static void +release_extents(struct pnfs_block_layout *bl, + struct pnfs_layout_range *range) +{ + int i; + struct pnfs_block_extent *be; + + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + while (!list_empty(&bl->bl_extents[i])) { + be = list_first_entry(&bl->bl_extents[i], + struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + put_extent(be); + } + } + spin_unlock(&bl->bl_ext_lock); +} + +static void +release_inval_marks(struct pnfs_inval_markings *marks) +{ + struct pnfs_inval_tracking *pos, *temp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } + return; +} + +/* Note we are relying on caller locking to prevent nasty races. */ +static void +bl_free_layout(struct pnfs_layout_hdr *lo) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + + dprintk("%s enter\n", __func__); + release_extents(bl, NULL); + release_inval_marks(&bl->bl_inval); + kfree(bl); +} + +static struct pnfs_layout_hdr * +bl_alloc_layout(struct inode *inode) +{ + struct pnfs_block_layout *bl; + + dprintk("%s enter\n", __func__); + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + spin_lock_init(&bl->bl_ext_lock); + INIT_LIST_HEAD(&bl->bl_extents[0]); + INIT_LIST_HEAD(&bl->bl_extents[1]); + INIT_LIST_HEAD(&bl->bl_commit); + bl->bl_count = 0; + bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; + INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; +} + +static void +bl_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("%s enter\n", __func__); + kfree(lseg); +} + +/* Because the generic infrastructure does not correctly merge layouts, + * we pretty much ignore lseg, and store all data layout wide, so we + * can correctly merge. Eventually we should push some correct merge + * behavior up to the generic code, as the current behavior tends to + * cause lots of unnecessary overlapping LAYOUTGET requests. 
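+ * For example, with per-lseg storage, two lsegs covering [0, 4M) and
+ * [2M, 8M) would each hold private copies of the overlapping extents,
+ * and a request touching [3M, 5M) could trigger another LAYOUTGET
+ * even though the range is already covered.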
+ */
+static struct pnfs_layout_segment *
+bl_alloc_lseg(struct pnfs_layout_hdr *lo,
+	      struct nfs4_layoutget_res *lgr)
+{
+	struct pnfs_layout_segment *lseg;
+	int status;
+
+	dprintk("%s enter\n", __func__);
+	lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL);
+	if (!lseg)
+		return NULL;
+	status = nfs4_blk_process_layoutget(lo, lgr);
+	if (status) {
+		/* We don't want to call the full-blown bl_free_lseg,
+		 * since on error extents were not touched.
+		 */
+		/* STUB - we really want to distinguish between 2 error
+		 * conditions here. This lseg failed, but lo data structures
+		 * are OK, or we hosed the lo data structures. The calling
+		 * code probably needs to distinguish this too.
+		 */
+		kfree(lseg);
+		return ERR_PTR(status);
+	}
+	return lseg;
+}
+
+static int
+bl_setup_layoutcommit(struct pnfs_layout_hdr *lo,
+		      struct nfs4_layoutcommit_args *arg)
+{
+	struct nfs_server *nfss = PNFS_NFS_SERVER(lo);
+	struct bl_layoutupdate_data *layoutupdate_data;
+
+	dprintk("%s enter\n", __func__);
+	/* Need to ensure commit is block-size aligned */
+	if (nfss->pnfs_blksize) {
+		u64 mask = nfss->pnfs_blksize - 1;
+		u64 offset = arg->range.offset & mask;
+
+		arg->range.offset -= offset;
+		arg->range.length += offset + mask;
+		arg->range.length &= ~mask;
+	}
+
+	layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data),
+				    GFP_KERNEL);
+	if (unlikely(!layoutupdate_data))
+		return -ENOMEM;
+	INIT_LIST_HEAD(&layoutupdate_data->ranges);
+	arg->layoutdriver_data = layoutupdate_data;
+
+	return 0;
+}
+
+static void
+bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
+		       const struct nfs4_layoutcommit_args *arg)
+{
+	dprintk("%s enter\n", __func__);
+	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
+}
+
+static void
+bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
+			struct nfs4_layoutcommit_args *arg, int status)
+{
+	dprintk("%s enter\n", __func__);
+	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status);
+	kfree(arg->layoutdriver_data);
+}
+
+static void free_blk_mountid(struct block_mount_id *mid)
+{
+	if (mid) {
+		struct pnfs_block_dev *dev;
+		spin_lock(&mid->bm_lock);
+		while (!list_empty(&mid->bm_devlist)) {
+			dev = list_first_entry(&mid->bm_devlist,
+					       struct pnfs_block_dev,
+					       bm_node);
+			list_del(&dev->bm_node);
+			free_block_dev(dev);
+		}
+		spin_unlock(&mid->bm_lock);
+		kfree(mid);
+	}
+}
+
+/* This is mostly copied from the filelayout's get_device_info function.
+ * It seems much of this should be at the generic pnfs level.
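+ * (The reply buffer below is sized from the session's maximum
+ * response size and vmapped, so the decoder can treat the reply
+ * pages as one contiguous region.)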
+ */
+static struct pnfs_block_dev *
+nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
+			struct pnfs_deviceid *d_id,
+			struct list_head *sdlist)
+{
+	struct pnfs_device *dev;
+	struct pnfs_block_dev *rv = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	struct page **pages = NULL;
+	int i, rc;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	max_pages = max_resp_sz >> PAGE_SHIFT;
+	dprintk("%s max_resp_sz %u max_pages %d\n",
+		__func__, max_resp_sz, max_pages);
+
+	dev = kmalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev) {
+		dprintk("%s kmalloc failed\n", __func__);
+		return NULL;
+	}
+
+	pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
+	if (pages == NULL) {
+		kfree(dev);
+		return NULL;
+	}
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
+	}
+
+	/* set dev->area */
+	dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
+	if (!dev->area)
+		goto out_free;
+
+	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
+	dev->layout_type = LAYOUT_BLOCK_VOLUME;
+	dev->dev_notify_types = 0;
+	dev->pages = pages;
+	dev->pgbase = 0;
+	dev->pglen = PAGE_SIZE * max_pages;
+	dev->mincount = 0;
+
+	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
+	rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free;
+
+	rv = nfs4_blk_decode_device(server, dev, sdlist);
+ out_free:
+	if (dev->area != NULL)
+		vunmap(dev->area);
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+	kfree(dev);
+	return rv;
+}
+
+
+/*
+ * Retrieve the list of available devices for the mountpoint.
+ */
+static int
+bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh)
+{
+	struct block_mount_id *b_mt_id = NULL;
+	struct pnfs_mount_type *mtype = NULL;
+	struct pnfs_devicelist *dlist = NULL;
+	struct pnfs_block_dev *bdev;
+	LIST_HEAD(block_disklist);
+	int status = 0, i;
+
+	dprintk("%s enter\n", __func__);
+
+	if (server->pnfs_blksize == 0) {
+		dprintk("%s Server did not return blksize\n", __func__);
+		return -EINVAL;
+	}
+	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL);
+	if (!b_mt_id) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	/* Initialize nfs4 block layout mount id */
+	spin_lock_init(&b_mt_id->bm_lock);
+	INIT_LIST_HEAD(&b_mt_id->bm_devlist);
+
+	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL);
+	if (!dlist) {
+		status = -ENOMEM;
+		goto out_error;
+	}
+	dlist->eof = 0;
+	while (!dlist->eof) {
+		status = pnfs_block_callback_ops->nfs_getdevicelist(
+						server, fh, dlist);
+		if (status)
+			goto out_error;
+		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
+			__func__, dlist->num_devs, dlist->eof);
+		/* For each device returned in dlist, call GETDEVICEINFO, and
+		 * decode the opaque topology encoding to create a flat
+		 * volume topology, matching VOLUME_SIMPLE disk signatures
+		 * to disks in the visible block disk list.
+		 * Construct an LVM meta device from the flat volume topology.
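+		 * For example, a server might encode a striped volume as a
+		 * VOLUME_STRIPE over two VOLUME_SLICEs of VOLUME_SIMPLE
+		 * disks; the simple volumes are matched to local disks by
+		 * signature, and the flattened result is instantiated as a
+		 * single DM meta device.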
+ */
+		for (i = 0; i < dlist->num_devs; i++) {
+			bdev = nfs4_blk_get_deviceinfo(server, fh,
+						       &dlist->dev_id[i],
+						       &block_disklist);
+			if (!bdev)
+				goto out_error;
+			spin_lock(&b_mt_id->bm_lock);
+			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
+			spin_unlock(&b_mt_id->bm_lock);
+		}
+	}
+	dprintk("%s SUCCESS\n", __func__);
+	server->pnfs_ld_data = b_mt_id;
+
+ out_return:
+	kfree(dlist);
+	return status;
+
+ out_error:
+	free_blk_mountid(b_mt_id);
+	kfree(mtype);
+	goto out_return;
+}
+
+static int
+bl_uninitialize_mountpoint(struct nfs_server *server)
+{
+	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
+
+	dprintk("%s enter\n", __func__);
+	free_blk_mountid(b_mt_id);
+	dprintk("%s RETURNS\n", __func__);
+	return 0;
+}
+
+/* STUB - mark intersection of layout and page as bad, so is not
+ * used again.
+ */
+static void mark_bad_read(void)
+{
+	return;
+}
+
+/* Copied from buffer.c */
+static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		/* This happens, due to failed READA attempts. */
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+}
+
+/* Copied from buffer.c */
+static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
+{
+	__end_buffer_read_notouch(bh, uptodate);
+}
+
+/*
+ * map_block: map a requested I/O block (isect) into an offset in the LVM
+ * meta block_device
+ */
+static void
+map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
+{
+	dprintk("%s enter be=%p\n", __func__, be);
+
+	set_buffer_mapped(bh);
+	bh->b_bdev = be->be_mdev;
+	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
+			(be->be_mdev->bd_inode->i_blkbits - 9);
+
+	dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n",
+		__func__, (long)isect,
+		(long)bh->b_blocknr,
+		bh->b_size);
+	return;
+}
+
+/* Given an unmapped page, zero it (or read in page for COW),
+ * and set appropriate flags/markings, but it is safe to not initialize
+ * the range given in [from, to).
+ */
+/* This is loosely based on nobh_write_begin */
+static int
+init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
+		    unsigned from, unsigned to, sector_t **pages_to_mark)
+{
+	struct buffer_head *bh;
+	int inval, ret = -EIO;
+	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
+	sector_t isect;
+
+	dprintk("%s enter, %p\n", __func__, page);
+	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
+	if (!bh) {
+		ret = -ENOMEM;
+		goto cleanup;
+	}
+
+	isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
+	be = find_get_extent(bl, isect, &cow_read);
+	if (!be)
+		goto cleanup;
+	inval = is_hole(be, isect);
+	dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
+	if (inval) {
+		if (be->be_state == PNFS_BLOCK_NONE_DATA) {
+			dprintk("%s PANIC - got NONE_DATA extent %p\n",
+				__func__, be);
+			goto cleanup;
+		}
+		map_block(isect, be, bh);
+		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+	}
+	if (PageUptodate(page)) {
+		/* Do nothing */
+	} else if (inval && !cow_read) {
+		zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
+	} else if (0 < from || PAGE_CACHE_SIZE > to) {
+		struct pnfs_block_extent *read_extent;
+
+		read_extent = (inval && cow_read) ? cow_read : be;
+		map_block(isect, read_extent, bh);
+		lock_buffer(bh);
+		bh->b_end_io = end_buffer_read_nobh;
+		submit_bh(READ, bh);
+		dprintk("%s: Waiting for buffer read\n", __func__);
+		/* XXX Don't really want to hold layout lock here */
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			goto cleanup;
+	}
+	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		/* There is a BUG here if is a short copy after write_begin,
+		 * but I think this is a generic fs bug. The problem is that
+		 * we have marked the page as initialized, but it is possible
+		 * that the section not copied may never get copied.
+		 */
+		ret = mark_initialized_sectors(be->be_inval, isect,
+					       PAGE_CACHE_SECTORS,
+					       pages_to_mark);
+		/* Want to preallocate mem so above can't fail */
+		if (ret)
+			goto cleanup;
+	}
+	SetPageMappedToDisk(page);
+	ret = 0;
+
+cleanup:
+	if (bh)
+		free_buffer_head(bh);
+	put_extent(be);
+	put_extent(cow_read);
+	if (ret) {
+		/* Need to mark layout with bad read...should now
+		 * just use nfs4 for reads and writes.
+		 */
+		mark_bad_read();
+	}
+	return ret;
+}
+
+static int
+bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos,
+	       unsigned count, struct pnfs_fsdata *fsdata)
+{
+	unsigned from, to;
+	int ret;
+	sector_t *pages_to_mark = NULL;
+	struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg);
+
+	dprintk("%s enter, %u@%lld\n", __func__, count, pos);
+	print_page(page);
+	/* The following code assumes blocksize >= PAGE_CACHE_SIZE */
+	if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) {
+		dprintk("%s Can't handle blocksize %llu\n", __func__,
+			(u64)bl->bl_blocksize);
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		return 0;
+	}
+	if (PageMappedToDisk(page)) {
+		/* Basically, this is a flag that says we have
+		 * successfully called write_begin already on this page.
+		 */
+		/* NOTE - there are cache consistency issues here.
+		 * For example, what if the layout is recalled, then regained?
+		 * If the file is closed and reopened, will the page flags
+		 * be reset? If not, we'll have to use layout info instead of
+		 * the page flag.
+		 */
+		return 0;
+	}
+	from = pos & (PAGE_CACHE_SIZE - 1);
+	to = from + count;
+	ret = init_page_for_write(bl, page, from, to, &pages_to_mark);
+	if (ret) {
+		dprintk("%s init page failed with %i\n", __func__, ret);
+		/* Revert back to plain NFS and just continue on with
+		 * write. This assumes there is no request attached, which
+		 * should be true if we get here.
+		 */
+		BUG_ON(PagePrivate(page));
+		put_lseg(fsdata->lseg);
+		fsdata->lseg = NULL;
+		kfree(pages_to_mark);
+		ret = 0;
+	} else {
+		fsdata->private = pages_to_mark;
+	}
+	return ret;
+}
+
+/* CAREFUL - what happens if copied < count??? */
+static int
+bl_write_end(struct inode *inode, struct page *page, loff_t pos,
+	     unsigned count, unsigned copied, struct pnfs_layout_segment *lseg)
+{
+	dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg);
+	print_page(page);
+	if (lseg)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/* Return any memory allocated to fsdata->private, and take advantage
+ * of no page locks to mark pages noted in write_begin as needing
+ * initialization.
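+ * (fsdata->private, when set, is a sorted, ~0-terminated array of
+ * sector numbers built via set_needs_init; each entry names a page
+ * of a partially-initialized block that still needs zeroing.)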
+ */ +static void +bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) +{ + struct page *page; + pgoff_t index; + sector_t *pos; + struct address_space *mapping = filp->f_mapping; + struct pnfs_fsdata *fake_data; + struct pnfs_layout_segment *lseg; + + if (!fsdata) + return; + lseg = fsdata->lseg; + if (!lseg) + return; + pos = fsdata->private; + if (!pos) + return; + dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); + for (; *pos != ~0; pos++) { + index = *pos >> (PAGE_CACHE_SHIFT - 9); + /* XXX How do we properly deal with failures here??? */ + page = grab_cache_page_write_begin(mapping, index, 0); + if (!page) { + printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); + continue; + } + dprintk("%s: Examining block page\n", __func__); + print_page(page); + if (!PageMappedToDisk(page)) { + /* XXX How do we properly deal with failures here??? */ + dprintk("%s Marking block page\n", __func__); + init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, + NULL); + print_page(page); + fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); + if (!fake_data) { + printk(KERN_ERR "%s BUG BUG BUG NoMem\n", + __func__); + unlock_page(page); + continue; + } + get_lseg(lseg); + fake_data->lseg = lseg; + fake_data->bypass_eof = 1; + mapping->a_ops->write_end(filp, mapping, + index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE, + page, fake_data); + /* Note fake_data is freed by nfs_write_end */ + } else + unlock_page(page); + } + kfree(fsdata->private); + fsdata->private = NULL; +} + +static ssize_t +bl_get_stripesize(struct pnfs_layout_hdr *lo) +{ + dprintk("%s enter\n", __func__); + return 0; +} + +/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. + * Should return False if there is a reason requests can not be coalesced, + * otherwise, should default to returning True. 
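+ * (Writes are restricted here to requests sharing an lseg, since a
+ * write pagelist is issued against a single layout segment; reads
+ * carry no such restriction.)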
+ */
+static int
+bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
+	   struct nfs_page *req)
+{
+	dprintk("%s enter\n", __func__);
+	if (pgio->pg_iswrite)
+		return prev->wb_lseg == req->wb_lseg;
+	else
+		return 1;
+}
+
+static struct layoutdriver_io_operations blocklayout_io_operations = {
+	.commit				= bl_commit,
+	.read_pagelist			= bl_read_pagelist,
+	.write_pagelist			= bl_write_pagelist,
+	.write_begin			= bl_write_begin,
+	.write_end			= bl_write_end,
+	.write_end_cleanup		= bl_write_end_cleanup,
+	.alloc_layout			= bl_alloc_layout,
+	.free_layout			= bl_free_layout,
+	.alloc_lseg			= bl_alloc_lseg,
+	.free_lseg			= bl_free_lseg,
+	.setup_layoutcommit		= bl_setup_layoutcommit,
+	.encode_layoutcommit		= bl_encode_layoutcommit,
+	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
+	.initialize_mountpoint		= bl_initialize_mountpoint,
+	.uninitialize_mountpoint	= bl_uninitialize_mountpoint,
+};
+
+static struct layoutdriver_policy_operations blocklayout_policy_operations = {
+	.get_stripesize			= bl_get_stripesize,
+	.pg_test			= bl_pg_test,
+};
+
+static struct pnfs_layoutdriver_type blocklayout_type = {
+	.id = LAYOUT_BLOCK_VOLUME,
+	.name = "LAYOUT_BLOCK_VOLUME",
+	.ld_io_ops = &blocklayout_io_operations,
+	.ld_policy_ops = &blocklayout_policy_operations,
+};
+
+static int __init nfs4blocklayout_init(void)
+{
+	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
+
+	pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type);
+	bl_pipe_init();
+	return 0;
+}
+
+static void __exit nfs4blocklayout_exit(void)
+{
+	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
+		__func__);
+
+	pnfs_unregister_layoutdriver(&blocklayout_type);
+	bl_pipe_exit();
+}
+
+module_init(nfs4blocklayout_init);
+module_exit(nfs4blocklayout_exit);
diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c
--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig	2010-09-30 10:17:08.542991000 -0400
+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdev.c	2010-09-30 10:17:08.544989000 -0400
@@ -0,0 +1,335 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayoutdev.c
+ *
+ * Device operations for the pnfs nfs4 block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ * Fred Isaman
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose.
the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+#include <linux/module.h>
+#include <linux/buffer_head.h> /* __bread */
+
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes)
+{
+	uint32_t *q = p + XDR_QUADLEN(nbytes);
+	if (unlikely(q > end || q < p))
+		return NULL;
+	return p;
+}
+EXPORT_SYMBOL(blk_overflow);
+
+/* Open a block_device by device number. */
+struct block_device *nfs4_blkdev_get(dev_t dev)
+{
+	struct block_device *bd;
+
+	dprintk("%s enter\n", __func__);
+	bd = open_by_devnum(dev, FMODE_READ);
+	if (IS_ERR(bd))
+		goto fail;
+	return bd;
+fail:
+	dprintk("%s failed to open device : %ld\n",
+		__func__, PTR_ERR(bd));
+	return NULL;
+}
+
+/*
+ * Release the block device
+ */
+int nfs4_blkdev_put(struct block_device *bdev)
+{
+	dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
+		MINOR(bdev->bd_dev));
+	bd_release(bdev);
+	return blkdev_put(bdev, FMODE_READ);
+}
+
+/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded
+ * in dev->dev_addr_buf.
+ */
+struct pnfs_block_dev *
+nfs4_blk_decode_device(struct nfs_server *server,
+		       struct pnfs_device *dev,
+		       struct list_head *sdlist)
+{
+	struct pnfs_block_dev *rv = NULL;
+	struct block_device *bd = NULL;
+	struct pipefs_hdr *msg = NULL, *reply = NULL;
+	uint32_t major, minor;
+
+	dprintk("%s enter\n", __func__);
+
+	if (IS_ERR(bl_device_pipe))
+		return NULL;
+	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
+		dev->mincount);
+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area,
+				    dev->mincount);
+	if (IS_ERR(msg)) {
+		dprintk("ERROR: couldn't make pipefs message.\n");
+		goto out_err;
+	}
+	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
+	msg->status = BL_DEVICE_REQUEST_INIT;
+
+	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
+	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
+					      &bl_device_list, 0, 0);
+
+	if (IS_ERR(reply)) {
+		dprintk("ERROR: upcall_waitreply failed\n");
+		goto out_err;
+	}
+	if (reply->status != BL_DEVICE_REQUEST_PROC) {
+		dprintk("%s userspace daemon reported failure: %u\n",
+			__func__, reply->status);
+		goto out_err;
+	}
+	memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t));
+	memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)),
+	       sizeof(uint32_t));
+	bd = nfs4_blkdev_get(MKDEV(major, minor));
+	if (!bd) {
+		dprintk("%s failed to open device %u:%u\n",
+			__func__, major, minor);
+		goto out_err;
+	}
+
+	rv = kzalloc(sizeof(*rv), GFP_KERNEL);
+	if (!rv)
+		goto out_err;
+
+	rv->bm_mdev = bd;
+	memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid));
+	dprintk("%s Created device %s with bd_block_size %u\n",
+		__func__,
+		bd->bd_disk->disk_name,
+		bd->bd_block_size);
+	kfree(reply);
+	kfree(msg);
+	return rv;
+
+out_err:
+	kfree(rv);
+	if (!IS_ERR(reply))
+		kfree(reply);
+	if (!IS_ERR(msg))
+		kfree(msg);
+	return NULL;
+}
+
+/* Map deviceid returned by the server to constructed block_device */
+static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
+					    struct pnfs_deviceid *id)
+{
+	struct block_device *rv = NULL;
+	struct block_mount_id *mid;
+	struct pnfs_block_dev *dev;
+
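+	/* Walk the per-mount device list under bm_lock, matching the
+	 * wire deviceid against each meta device built at mount time.
+	 */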
+	dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
+	mid = BLK_ID(lo);
+	spin_lock(&mid->bm_lock);
+	list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
+		if (memcmp(id->data, dev->bm_mdevid.data,
+			   NFS4_PNFS_DEVICEID4_SIZE) == 0) {
+			rv = dev->bm_mdev;
+			goto out;
+		}
+	}
+ out:
+	spin_unlock(&mid->bm_lock);
+	dprintk("%s returning %p\n", __func__, rv);
+	return rv;
+}
+
+/* Tracks info needed to ensure extents in layout obey constraints of spec */
+struct layout_verification {
+	u32 mode;	/* R or RW */
+	u64 start;	/* Expected start of next non-COW extent */
+	u64 inval;	/* Start of INVAL coverage */
+	u64 cowread;	/* End of COW read coverage */
+};
+
+/* Verify the extent meets the layout requirements of the pnfs-block draft,
+ * section 2.3.1.
+ */
+static int verify_extent(struct pnfs_block_extent *be,
+			 struct layout_verification *lv)
+{
+	if (lv->mode == IOMODE_READ) {
+		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
+		    be->be_state == PNFS_BLOCK_INVALID_DATA)
+			return -EIO;
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	}
+	/* lv->mode == IOMODE_RW */
+	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		if (lv->cowread > lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		lv->inval = lv->start;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (be->be_f_offset != lv->start)
+			return -EIO;
+		lv->start += be->be_length;
+		return 0;
+	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
+		if (be->be_f_offset > lv->start)
+			return -EIO;
+		if (be->be_f_offset < lv->inval)
+			return -EIO;
+		if (be->be_f_offset < lv->cowread)
+			return -EIO;
+		/* It looks like you might want to min this with lv->start,
+		 * but you really don't.
+		 */
+		lv->inval = lv->inval + be->be_length;
+		lv->cowread = be->be_f_offset + be->be_length;
+		return 0;
+	} else
+		return -EIO;
+}
+
+/* XDR decode pnfs_block_layout4 structure */
+int
+nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+			   struct nfs4_layoutget_res *lgr)
+{
+	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
+	uint32_t *p = (uint32_t *)lgr->layout.buf;
+	uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len);
+	int i, status = -EIO;
+	uint32_t count;
+	struct pnfs_block_extent *be = NULL, *save;
+	uint64_t tmp; /* Used by READ_SECTOR */
+	struct layout_verification lv = {
+		.mode = lgr->range.iomode,
+		.start = lgr->range.offset >> 9,
+		.inval = lgr->range.offset >> 9,
+		.cowread = lgr->range.offset >> 9,
+	};
+
+	LIST_HEAD(extents);
+
+	BLK_READBUF(p, end, 4);
+	READ32(count);
+
+	dprintk("%s enter, number of extents %i\n", __func__, count);
+	BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count);
+
+	/* Decode individual extents, putting them in temporary
+	 * staging area until whole layout is decoded to make error
+	 * recovery easier.
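+	 * (Each extent on the wire is a deviceid followed by three 64-bit
+	 * byte values -- file offset, length, volume offset -- plus a
+	 * 32-bit state, which is the 28 + NFS4_PNFS_DEVICEID4_SIZE bytes
+	 * per extent checked by BLK_READBUF above.)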
+ */ + for (i = 0; i < count; i++) { + be = alloc_extent(); + if (!be) { + status = -ENOMEM; + goto out_err; + } + READ_DEVID(&be->be_devid); + be->be_mdev = translate_devid(lo, &be->be_devid); + if (!be->be_mdev) + goto out_err; + /* The next three values are read in as bytes, + * but stored as 512-byte sector lengths + */ + READ_SECTOR(be->be_f_offset); + READ_SECTOR(be->be_length); + READ_SECTOR(be->be_v_offset); + READ32(be->be_state); + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + be->be_inval = &bl->bl_inval; + if (verify_extent(be, &lv)) { + dprintk("%s verify failed\n", __func__); + goto out_err; + } + list_add_tail(&be->be_node, &extents); + } + if (p != end) { + dprintk("%s Undecoded cruft at end of opaque\n", __func__); + be = NULL; + goto out_err; + } + if (lgr->range.offset + lgr->range.length != lv.start << 9) { + dprintk("%s Final length mismatch\n", __func__); + be = NULL; + goto out_err; + } + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + be = NULL; + goto out_err; + } + /* Extents decoded properly, now try to merge them in to + * existing layout extents. + */ + spin_lock(&bl->bl_ext_lock); + list_for_each_entry_safe(be, save, &extents, be_node) { + list_del(&be->be_node); + status = add_and_merge_extent(bl, be); + if (status) { + spin_unlock(&bl->bl_ext_lock); + /* This is a fairly catastrophic error, as the + * entire layout extent lists are now corrupted. + * We should have some way to distinguish this. + */ + be = NULL; + goto out_err; + } + } + spin_unlock(&bl->bl_ext_lock); + status = 0; + out: + dprintk("%s returns %i\n", __func__, status); + return status; + + out_err: + put_extent(be); + while (!list_empty(&extents)) { + be = list_first_entry(&extents, struct pnfs_block_extent, + be_node); + list_del(&be->be_node); + put_extent(be); + } + goto out; +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c --- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-09-30 10:17:08.546994000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-09-30 10:17:08.548993000 -0400 @@ -0,0 +1,120 @@ +/* + * linux/fs/nfs/blocklayout/blocklayoutdm.c + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Fred Isaman + * Andy Adamson + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include <linux/genhd.h> /* gendisk - used in a dprintk*/
+#include <linux/sched.h>
+#include <linux/hash.h>
+
+#include "blocklayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Defines used for calculating memory usage in nfs4_blk_flatten() */
+#define ARGSIZE 24 /* Max bytes needed for linear target arg string */
+#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE)
+#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE)
+#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \
+			    (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE)
+#define roundup8(x) (((x)+7) & ~7)
+#define sizeof8(x) roundup8(sizeof(x))
+
+static int dev_remove(dev_t dev)
+{
+	int ret = 1;
+	struct pipefs_hdr *msg = NULL, *reply = NULL;
+	uint64_t bl_dev;
+	uint32_t major = MAJOR(dev), minor = MINOR(dev);
+
+	dprintk("Entering %s\n", __func__);
+
+	if (IS_ERR(bl_device_pipe))
+		return ret;
+
+	memcpy((void *)&bl_dev, &major, sizeof(uint32_t));
+	memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t));
+	msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev,
+				    sizeof(uint64_t));
+	if (IS_ERR(msg)) {
+		dprintk("ERROR: couldn't make pipefs message.\n");
+		goto out;
+	}
+	msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8);
+	msg->status = BL_DEVICE_REQUEST_INIT;
+
+	reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg,
+					      &bl_device_list, 0, 0);
+	if (IS_ERR(reply)) {
+		dprintk("ERROR: upcall_waitreply failed\n");
+		goto out;
+	}
+
+	if (reply->status == BL_DEVICE_REQUEST_PROC)
+		ret = 0; /*TODO: what to return*/
+out:
+	if (!IS_ERR(reply))
+		kfree(reply);
+	if (!IS_ERR(msg))
+		kfree(msg);
+	return ret;
+}
+
+/*
+ * Release meta device
+ */
+static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
+{
+	int rv;
+
+	dprintk("%s Releasing\n", __func__);
+	/* XXX Check return? */
+	rv = nfs4_blkdev_put(bdev->bm_mdev);
+	dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv);
+
+	rv = dev_remove(bdev->bm_mdev->bd_dev);
+	dprintk("%s Returns %d\n", __func__, rv);
+	return rv;
+}
+
+void free_block_dev(struct pnfs_block_dev *bdev)
+{
+	if (bdev) {
+		if (bdev->bm_mdev) {
+			dprintk("%s Removing DM device: %d:%d\n",
+				__func__,
+				MAJOR(bdev->bm_mdev->bd_dev),
+				MINOR(bdev->bm_mdev->bd_dev));
+			/* XXX Check status ?? */
+			nfs4_blk_metadev_release(bdev);
+		}
+		kfree(bdev);
+	}
+}
diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h
--- linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h.orig	2010-09-30 10:17:08.538988000 -0400
+++ linux-2.6.34.noarch/fs/nfs/blocklayout/blocklayout.h	2010-09-30 10:17:08.539994000 -0400
@@ -0,0 +1,302 @@
+/*
+ * linux/fs/nfs/blocklayout/blocklayout.h
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ * Fred Isaman
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ +#ifndef FS_NFS_NFS4BLOCKLAYOUT_H +#define FS_NFS_NFS4BLOCKLAYOUT_H + +#include +#include +#include /* Needed for struct dm_ioctl*/ + +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) + +#define PG_pnfserr PG_owner_priv_1 +#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) +#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) +#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) + +extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ +extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ +extern int dm_do_resume(struct dm_ioctl *param); +extern int dm_table_load(struct dm_ioctl *param, size_t param_size); + +struct block_mount_id { + spinlock_t bm_lock; /* protects list */ + struct list_head bm_devlist; /* holds pnfs_block_dev */ +}; + +struct pnfs_block_dev { + struct list_head bm_node; + struct pnfs_deviceid bm_mdevid; /* associated devid */ + struct block_device *bm_mdev; /* meta device itself */ +}; + +/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ +struct visible_block_device { + struct list_head vi_node; + struct block_device *vi_bdev; + int vi_mapped; + int vi_put_done; +}; + +enum blk_vol_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ + PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ + PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ + PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ +}; + +/* All disk offset/lengths are stored in 512-byte sectors */ +struct pnfs_blk_volume { + uint32_t bv_type; + sector_t bv_size; + struct pnfs_blk_volume **bv_vols; + int bv_vol_n; + union { + dev_t bv_dev; + sector_t bv_stripe_unit; + sector_t bv_offset; + }; +}; + +/* Since components need not be aligned, cannot use sector_t */ +struct pnfs_blk_sig_comp { + int64_t bs_offset; /* In bytes */ + uint32_t bs_length; /* In bytes */ + char *bs_string; +}; + +/* Maximum number of signatures components in a simple volume */ +# define PNFS_BLOCK_MAX_SIG_COMP 16 + +struct pnfs_blk_sig { + int si_num_comps; + struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; +}; + +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +}; + +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree_t { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + +struct pnfs_inval_markings { + spinlock_t im_lock; + struct my_tree_t 
+	struct my_tree_t	im_tree;	/* Sectors that need LAYOUTCOMMIT */
+	sector_t		im_block_size;	/* Server blocksize in sectors */
+};
+
+struct pnfs_inval_tracking {
+	struct list_head it_link;
+	int		 it_sector;
+	int		 it_tags;
+};
+
+/* sector_t fields are all in 512-byte sectors */
+struct pnfs_block_extent {
+	struct kref	be_refcnt;
+	struct list_head be_node;	/* link into lseg list */
+	struct pnfs_deviceid be_devid;	/* STUB - removable??? */
+	struct block_device *be_mdev;
+	sector_t	be_f_offset;	/* the starting offset in the file */
+	sector_t	be_length;	/* the size of the extent */
+	sector_t	be_v_offset;	/* the starting offset in the volume */
+	enum exstate4	be_state;	/* the state of this extent */
+	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
+};
+
+/* Shortened extent used by LAYOUTCOMMIT */
+struct pnfs_block_short_extent {
+	struct list_head bse_node;
+	struct pnfs_deviceid bse_devid;	/* STUB - removable??? */
+	struct block_device *bse_mdev;
+	sector_t	bse_f_offset;	/* the starting offset in the file */
+	sector_t	bse_length;	/* the size of the extent */
+};
+
+static inline void
+INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
+{
+	spin_lock_init(&marks->im_lock);
+	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+	marks->im_block_size = blocksize;
+	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
+					   blocksize);
+}
+
+enum extentclass4 {
+	RW_EXTENT = 0, /* READWRITE and INVAL */
+	RO_EXTENT = 1, /* READ and NONE */
+	EXTENT_LISTS = 2,
+};
+
+static inline int choose_list(enum exstate4 state)
+{
+	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
+		return RO_EXTENT;
+	else
+		return RW_EXTENT;
+}
+
+struct pnfs_block_layout {
+	struct pnfs_layout_hdr bl_layout;
+	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
+	spinlock_t		bl_ext_lock;   /* Protects list manipulation */
+	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
+	struct list_head	bl_commit;	/* Needs layout commit */
+	unsigned int		bl_count;	/* entries in bl_commit */
+	sector_t		bl_blocksize;  /* Server blocksize in sectors */
+};
+
+/* this struct is communicated between:
+ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
+ */
+struct bl_layoutupdate_data {
+	struct list_head ranges;
+};
+
+#define BLK_ID(lo) ((struct block_mount_id *)(PNFS_NFS_SERVER(lo)->pnfs_ld_data))
+
+static inline struct pnfs_block_layout *
+BLK_LO2EXT(struct pnfs_layout_hdr *lo)
+{
+	return container_of(lo, struct pnfs_block_layout, bl_layout);
+}
+
+static inline struct pnfs_block_layout *
+BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
+{
+	return BLK_LO2EXT(lseg->layout);
+}
+
+uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
+
+#define BLK_READBUF(p, e, nbytes) do { \
+	p = blk_overflow(p, e, nbytes); \
+	if (!p) { \
+		printk(KERN_WARNING \
+		       "%s: reply buffer overflowed in line %d.\n", \
+		       __func__, __LINE__); \
+		goto out_err; \
+	} \
+} while (0)
+
+#define READ32(x)	(x) = ntohl(*p++)
+#define READ64(x)	do { \
+	(x) = (uint64_t)ntohl(*p++) << 32; \
+	(x) |= ntohl(*p++); \
+} while (0)
+#define COPYMEM(x, nbytes) do { \
+	memcpy((x), p, nbytes); \
+	p += XDR_QUADLEN(nbytes); \
+} while (0)
+#define READ_DEVID(x)	COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE)
+#define READ_SECTOR(x) do { \
+	READ64(tmp); \
+	if (tmp & 0x1ff) { \
+		printk(KERN_WARNING \
+		       "%s Value not 512-byte aligned at line %d\n", \
+		       __func__, __LINE__); \
+		goto out_err; \
+	} \
+	(x) = tmp >> 9; \
+} while (0)
+
+#define WRITE32(n)	do { \
+	*p++ = htonl(n); \
+	} while (0)
+#define WRITE64(n)	do { \
+	*p++ = htonl((uint32_t)((n) >> 32)); \
+	*p++ = htonl((uint32_t)(n)); \
+} while (0)
+#define WRITEMEM(ptr, nbytes)	do { \
+	p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
+} while (0)
+#define WRITE_DEVID(x)	WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE)
+
+/* blocklayoutdev.c */
+struct block_device *nfs4_blkdev_get(dev_t dev);
+int nfs4_blkdev_put(struct block_device *bdev);
+struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server,
+					      struct pnfs_device *dev,
+					      struct list_head *sdlist);
+int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
+			       struct nfs4_layoutget_res *lgr);
+int nfs4_blk_create_block_disk_list(struct list_head *);
+void nfs4_blk_destroy_disk_list(struct list_head *);
+/* blocklayoutdm.c */
+int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *);
+void free_block_dev(struct pnfs_block_dev *bdev);
+/* extents.c */
+struct pnfs_block_extent *
+find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
+		struct pnfs_block_extent **cow_read);
+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
+			     sector_t offset, sector_t length,
+			     sector_t **pages);
+void put_extent(struct pnfs_block_extent *be);
+struct pnfs_block_extent *alloc_extent(void);
+struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be);
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect);
+int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+				   struct xdr_stream *xdr,
+				   const struct nfs4_layoutcommit_args *arg);
+void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
+				   const struct nfs4_layoutcommit_args *arg,
+				   int status);
+int add_and_merge_extent(struct pnfs_block_layout *bl,
+			 struct pnfs_block_extent *new);
+int mark_for_commit(struct pnfs_block_extent *be,
+		    sector_t offset, sector_t length);
+
+#include <linux/sunrpc/simple_rpc_pipefs.h>
+
+extern struct pipefs_list bl_device_list;
+extern struct dentry *bl_device_pipe;
+
+int bl_pipe_init(void);
+void bl_pipe_exit(void);
+
+#define BL_DEVICE_UMOUNT		0x0 /* Umount--delete devices */
+#define BL_DEVICE_MOUNT			0x1 /* Mount--create devices*/
+#define BL_DEVICE_REQUEST_INIT		0x0 /* Start request */
+#define BL_DEVICE_REQUEST_PROC		0x1 /* User level process succeeds */
+#define BL_DEVICE_REQUEST_ERR		0x2 /* User level process fails */
+
+#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c
--- linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c.orig	2010-09-30 10:17:08.565989000 -0400
+++ linux-2.6.34.noarch/fs/nfs/blocklayout/extents.c	2010-09-30 10:17:08.567989000 -0400
@@ -0,0 +1,948 @@
+/*
+ * linux/fs/nfs/blocklayout/extents.c
+ *
+ * Module for the NFSv4.1 pNFS block layout driver.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * Andy Adamson
+ * Fred Isaman
+ *
+ * permission is granted to use, copy, create derivative works and
+ * redistribute this software and such derivative works for any purpose,
+ * so long as the name of the university of michigan is not used in
+ * any advertising or publicity pertaining to the use or distribution
+ * of this software without specific, written prior authorization. if
+ * the above copyright notice or any other identification of the
+ * university of michigan is included in any copy of any portion of
+ * this software, then the disclaimer below must also be included.
+ *
+ * this software is provided as is, without representation from the
+ * university of michigan as to its fitness for any purpose, and without
+ * warranty by the university of michigan of any kind, either express
+ * or implied, including without limitation the implied warranties of
+ * merchantability and fitness for a particular purpose. the regents
+ * of the university of michigan shall not be liable for any damages,
+ * including special, indirect, incidental, or consequential damages,
+ * with respect to any claim arising out or in connection with the use
+ * of the software, even if it has been or is hereafter advised of the
+ * possibility of such damages.
+ */
+
+#include "blocklayout.h"
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+/* Bit numbers */
+#define EXTENT_INITIALIZED 0
+#define EXTENT_WRITTEN     1
+#define EXTENT_IN_COMMIT   2
+#define INTERNAL_EXISTS    MY_MAX_TAGS
+#define INTERNAL_MASK      ((1 << INTERNAL_EXISTS) - 1)
+
+/* Returns largest t<=s s.t. t%base==0 */
+static inline sector_t normalize(sector_t s, int base)
+{
+	sector_t tmp = s; /* Since do_div modifies its argument */
+	return s - do_div(tmp, base);
+}
+
+static inline sector_t normalize_up(sector_t s, int base)
+{
+	return normalize(s + base - 1, base);
+}
+
+/* Complete stub using list while determining API wanted */
+
+/* Returns tags, or negative */
+static int32_t _find_entry(struct my_tree_t *tree, u64 s)
+{
+	struct pnfs_inval_tracking *pos;
+
+	dprintk("%s(%llu) enter\n", __func__, s);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector > s)
+			continue;
+		else if (pos->it_sector == s)
+			return pos->it_tags & INTERNAL_MASK;
+		else
+			break;
+	}
+	return -ENOENT;
+}
+
+static inline
+int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag)
+{
+	int32_t tags;
+
+	dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
+	s = normalize(s, tree->mtt_step_size);
+	tags = _find_entry(tree, s);
+	if ((tags < 0) || !(tags & (1 << tag)))
+		return 0;
+	else
+		return 1;
+}
+
+/* Creates entry with tag, or if entry already exists, unions tag to it.
+ * If storage is not NULL, newly created entry will use it.
+ * Returns number of entries added, or negative on error.
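+ * (That is, 1 if a new entry was created, or 0 if an existing entry
+ * at the same normalized sector simply absorbed the tag.)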
+ */ +static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + if (storage) + new = storage; + else { + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + } + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_KERNEL); + if (!storage[i]) + goto out_cleanup; + } + + /* Now need lock - HOW??? */ + + for (s = start; s < end; s += tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + + /* Unlock - HOW??? 
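+	 * (One possibility: marks->im_lock in the enclosing
+	 * pnfs_inval_markings could be held around this loop, since with
+	 * storage[] preallocated, _add_entry makes no allocations here.)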
 */
+	status = 0;
+
+ out_cleanup:
+	for (i = used; i < count; i++) {
+		if (!storage[i])
+			break;
+		kfree(storage[i]);
+	}
+	kfree(storage);
+	return status;
+}
+
+static void set_needs_init(sector_t *array, sector_t offset)
+{
+	sector_t *p = array;
+
+	dprintk("%s enter\n", __func__);
+	if (!p)
+		return;
+	while (*p < offset)
+		p++;
+	if (*p == offset)
+		return;
+	else if (*p == ~0) {
+		*p++ = offset;
+		*p = ~0;
+		return;
+	} else {
+		sector_t *save = p;
+		dprintk("%s Adding %llu\n", __func__, (u64)offset);
+		while (*p != ~0)
+			p++;
+		p++;
+		memmove(save + 1, save, (char *)p - (char *)save);
+		*save = offset;
+		return;
+	}
+}
+
+/* We are relying on page lock to serialize this */
+int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
+{
+	int rv;
+
+	spin_lock(&marks->im_lock);
+	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
+	spin_unlock(&marks->im_lock);
+	return rv;
+}
+
+/* Assume start, end already sector aligned */
+static int
+_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
+{
+	struct pnfs_inval_tracking *pos;
+	u64 expect = 0;
+
+	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
+	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
+		if (pos->it_sector >= end)
+			continue;
+		if (!expect) {
+			if ((pos->it_sector == end - tree->mtt_step_size) &&
+			    (pos->it_tags & (1 << tag))) {
+				expect = pos->it_sector - tree->mtt_step_size;
+				if (expect < start)
+					return 1;
+				continue;
+			} else {
+				return 0;
+			}
+		}
+		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
+			return 0;
+		expect -= tree->mtt_step_size;
+		if (expect < start)
+			return 1;
+	}
+	return 0;
+}
+
+static int is_range_written(struct pnfs_inval_markings *marks,
+			    sector_t start, sector_t end)
+{
+	int rv;
+
+	spin_lock(&marks->im_lock);
+	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
+	spin_unlock(&marks->im_lock);
+	return rv;
+}
+
+/* Marks sectors in [offset, offset+length) as having been initialized.
+ * All lengths are step-aligned, where step is min(pagesize, blocksize).
+ * Notes where partial block is initialized, and helps prepare it for
+ * complete initialization later.
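+ * For example, with 4K pages and an 8K server block, initializing only
+ * the second page of a block tags its sectors EXTENT_INITIALIZED and
+ * reports the first page back through *pages so the caller can zero it
+ * before the block is written out.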
+ */
+/* Currently assumes offset is page-aligned */
+int mark_initialized_sectors(struct pnfs_inval_markings *marks,
+			     sector_t offset, sector_t length,
+			     sector_t **pages)
+{
+	sector_t s, start, end;
+	sector_t *array = NULL; /* Pages to mark */
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n",
+		__func__, (u64)offset, (u64)length);
+	s = max((sector_t) 3,
+		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
+	dprintk("%s set max=%llu\n", __func__, (u64)s);
+	if (pages) {
+		array = kmalloc(s * sizeof(sector_t), GFP_KERNEL);
+		if (!array)
+			goto outerr;
+		array[0] = ~0;
+	}
+
+	start = normalize(offset, marks->im_block_size);
+	end = normalize_up(offset + length, marks->im_block_size);
+	if (_preload_range(&marks->im_tree, start, end - start))
+		goto outerr;
+
+	spin_lock(&marks->im_lock);
+
+	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
+	     s < offset; s += PAGE_CACHE_SECTORS) {
+		dprintk("%s pre-area pages\n", __func__);
+		/* Portion of used block is not initialized */
+		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+			set_needs_init(array, s);
+	}
+	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
+		goto out_unlock;
+	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
+	     s < end; s += PAGE_CACHE_SECTORS) {
+		dprintk("%s post-area pages\n", __func__);
+		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
+			set_needs_init(array, s);
+	}
+
+	spin_unlock(&marks->im_lock);
+
+	if (pages) {
+		if (array[0] == ~0) {
+			kfree(array);
+			*pages = NULL;
+		} else
+			*pages = array;
+	}
+	return 0;
+
+ out_unlock:
+	spin_unlock(&marks->im_lock);
+ outerr:
+	if (pages) {
+		kfree(array);
+		*pages = NULL;
+	}
+	return -ENOMEM;
+}
+
+/* Marks sectors in [offset, offset+length) as having been written to disk.
+ * All lengths should be block aligned.
+ */
+int mark_written_sectors(struct pnfs_inval_markings *marks,
+			 sector_t offset, sector_t length)
+{
+	int status;
+
+	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
+		(u64)offset, (u64)length);
+	spin_lock(&marks->im_lock);
+	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
+	spin_unlock(&marks->im_lock);
+	return status;
+}
+
+static void print_short_extent(struct pnfs_block_short_extent *be)
+{
+	dprintk("PRINT SHORT EXTENT extent %p\n", be);
+	if (be) {
+		dprintk("        be_f_offset %llu\n", (u64)be->bse_f_offset);
+		dprintk("        be_length   %llu\n", (u64)be->bse_length);
+	}
+}
+
+void print_clist(struct list_head *list, unsigned int count)
+{
+	struct pnfs_block_short_extent *be;
+	unsigned int i = 0;
+
+	dprintk("****************\n");
+	dprintk("Extent list looks like:\n");
+	list_for_each_entry(be, list, bse_node) {
+		i++;
+		print_short_extent(be);
+	}
+	if (i != count)
+		dprintk("\n\nExpected %u entries\n\n\n", count);
+	dprintk("****************\n");
+}
+
+/* Note: In theory, we should do more checking that devid's match between
+ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
+ */
+/* Note this is very similar to add_and_merge_extent */
+static void add_to_commitlist(struct pnfs_block_layout *bl,
+			      struct pnfs_block_short_extent *new)
+{
+	struct list_head *clist = &bl->bl_commit;
+	struct pnfs_block_short_extent *old, *save;
+	sector_t end = new->bse_f_offset + new->bse_length;
+
+	dprintk("%s enter\n", __func__);
+	print_short_extent(new);
+	print_clist(clist, bl->bl_count);
+	bl->bl_count++;
+	/* Scan for proper place to insert, extending new to the left
+	 * as much as possible.
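+	 * For example, adding [8, 12) to a commitlist holding [0, 8) and
+	 * [16, 24) on the same mdev extends new leftward into [0, 12); the
+	 * forward scan below then leaves [16, 24) alone, since the two
+	 * ranges never meet.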
+ */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + +/* Note the range described by offset, length is guaranteed to be contained + * within be. + */ +int mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_short_extent *new; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. + */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + /* new will be freed, either by add_to_commitlist if it decides not + * to use it, or after LAYOUTCOMMIT uses it in the commitlist. 
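+ * + * A hedged worked example of the normalization above (bl_blocksize == 8 + * sectors): for offset == 10 and length == 20 (end == 30), the range only + * widens to [8, 32) where is_range_written() confirms the flanking sectors + * [8, 10) and [30, 32) were written; if neither flank is written it shrinks + * to the fully covered blocks, [16, 24).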
+ */ + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +struct pnfs_block_extent * +get_extent(struct pnfs_block_extent *be) +{ + if (be) + kref_get(&be->be_refcnt); + return be; +} + +void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + dprintk("****************\n"); +} + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == + new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See find_get_extent for list constraints. + * + * Refcount on new is already set. If we end up not using it, or we error + * out, we need to put the reference. + * + * Lock is held by caller. + */ +int +add_and_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. 
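+ * + * A hedged worked example (sectors; same state, same device, consistent + * be_v_offset mapping): with an existing extent {f=0, len=16, v=100} and + * new == {f=8, len=16, v=108}, extents_consistent() holds, so new is + * extended to {f=0, len=24, v=100} and the old extent is deleted and + * released.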
+ */ + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) + break; + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be */ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlaps existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + put_extent(be); + } else { + goto out_err; + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { + /* extend new to fully replace be */ + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + put_extent(be); + } else { + goto out_err; + } + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node == list. + */ + list_add(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* STUB - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + put_extent(new); + return -EIO; +} + +/* Returns extent, or NULL. If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two separate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extent that matches. 
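+ * + * A hedged illustration of the cow_read contract: if isect falls both in an + * INVALID_DATA extent (on the RW list) and a READ_DATA extent (on the READ + * list), the INVALID extent is returned and the READ extent comes back via + * *cow_read, presumably so the caller can copy old data while writing new.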
+ */ +struct pnfs_block_extent * +find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + put_extent(be); + else + cow = be; + break; + } + } + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} + +/* Similar to find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + break; + } + } + } + print_bl_extent(ret); + return ret; +} + +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + sector_t start, end; + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + struct bl_layoutupdate_data *bld = arg->layoutdriver_data; + struct list_head *ranges = &bld->ranges; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + start = arg->range.offset >> 9; + end = start + (arg->range.length >> 9); + dprintk("%s set start=%llu, end=%llu\n", + __func__, (u64)start, (u64)end); + + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. 
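+ * + * Per-extent wire encoding below (a summary of this code, not a new + * format): deviceid, then file offset and length scaled from sectors to + * bytes (<< 9), a zero storage offset, and state PNFS_BLOCK_READWRITE_DATA + * -- seven 32-bit words plus the deviceid per extent, with the two words + * reserved at xdr_start back-filled with the encoded byte count and the + * extent count.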
+ */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + WRITE_DEVID(&lce->bse_devid); + WRITE64(lce->bse_f_offset << 9); + WRITE64(lce->bse_length << 9); + WRITE64(0LL); + WRITE32(PNFS_BLOCK_READWRITE_DATA); + list_del(&lce->bse_node); + list_add_tail(&lce->bse_node, ranges); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + +/* Helper function to set_to_rw that initializes a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. + */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + put_extent(prev); + list_del(&be->be_node); + put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) +{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_KERNEL); + e2 = kmalloc(sizeof(*e2), GFP_KERNEL); + e3 = kmalloc(sizeof(*e3), GFP_KERNEL); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if 
(offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. + */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. + */ + put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct bl_layoutupdate_data *bld = arg->layoutdriver_data; + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + + kfree(lce); + } else { + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} diff -up linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile --- linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile.orig 2010-09-30 10:17:08.524988000 -0400 +++ linux-2.6.34.noarch/fs/nfs/blocklayout/Makefile 2010-09-30 10:17:08.525996000 -0400 @@ -0,0 +1,6 @@ +# +# Makefile for the pNFS block layout driver kernel module +# +obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o +blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ + extents.o block-device-discovery-pipe.o diff -up linux-2.6.34.noarch/fs/nfs/callback.h.orig linux-2.6.34.noarch/fs/nfs/callback.h --- linux-2.6.34.noarch/fs/nfs/callback.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/callback.h 2010-09-30 10:17:08.585990000 -0400 @@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta #define RCA4_TYPE_MASK_RDATA_DLG 0 #define RCA4_TYPE_MASK_WDATA_DLG 1 +#define RCA4_TYPE_MASK_DIR_DLG 2 +#define RCA4_TYPE_MASK_FILE_LAYOUT 3 +#define RCA4_TYPE_MASK_BLK_LAYOUT 4 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 +#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 +#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 struct cb_recallanyargs { struct sockaddr *craa_addr; @@ -127,6 +134,39 @@ struct cb_recallslotargs { extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs *args, void *dummy); +struct cb_layoutrecallargs { + struct sockaddr *cbl_addr; + struct nfs_fh cbl_fh; + struct pnfs_layout_range cbl_seg; + struct nfs_fsid cbl_fsid; + uint32_t cbl_recall_type; + uint32_t cbl_layout_type; + uint32_t 
cbl_layoutchanged; + nfs4_stateid cbl_stateid; +}; + +extern unsigned nfs4_callback_layoutrecall( + struct cb_layoutrecallargs *args, + void *dummy); + +struct cb_devicenotifyitem { + uint32_t cbd_notify_type; + uint32_t cbd_layout_type; + struct pnfs_deviceid cbd_dev_id; + uint32_t cbd_immediate; +}; + +/* XXX: Should be dynamic up to max compound size */ +#define NFS4_DEV_NOTIFY_MAXENTRIES 10 +struct cb_devicenotifyargs { + struct sockaddr *addr; + int ndevs; + struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; +}; + +extern unsigned nfs4_callback_devicenotify( + struct cb_devicenotifyargs *args, + void *dummy); #endif /* CONFIG_NFS_V4_1 */ extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); diff -up linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig linux-2.6.34.noarch/fs/nfs/callback_proc.c --- linux-2.6.34.noarch/fs/nfs/callback_proc.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/callback_proc.c 2010-09-30 10:17:08.591990000 -0400 @@ -8,10 +8,15 @@ #include #include #include +#include +#include +#include +#include #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #ifdef NFS_DEBUG #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -62,16 +67,6 @@ out: return res->status; } -static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) -{ -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion > 0) - return nfs41_validate_delegation_stateid; -#endif - return nfs4_validate_delegation_stateid; -} - - __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid, - nfs_validate_delegation_stateid(clp))) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: res = 0; break; @@ -116,24 +110,364 @@ out: int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) { - if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) + if (delegation == NULL || memcmp(delegation->stateid.u.data, + stateid->u.data, + sizeof(delegation->stateid.u.data))) return 0; return 1; } #if defined(CONFIG_NFS_V4_1) +static bool +pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, + const nfs4_stateid stateid) +{ + int seqlock; + bool res; + u32 oldseqid, newseqid; + + do { + seqlock = read_seqbegin(&lo->seqlock); + oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); + newseqid = be32_to_cpu(stateid.u.stateid.seqid); + res = !memcmp(lo->stateid.u.stateid.other, + stateid.u.stateid.other, + NFS4_STATEID_OTHER_SIZE); + if (res) { /* comparing layout stateids */ + if (oldseqid == ~0) + res = (newseqid == 1); + else + res = (newseqid == oldseqid + 1); + } else { /* open stateid */ + res = !memcmp(lo->stateid.u.data, + &zero_stateid, + NFS4_STATEID_SIZE); + if (res) + res = (newseqid == 1); + } + } while (read_seqretry(&lo->seqlock, seqlock)); + + return res; +} + +/* + * Retrieve an inode based on layout recall parameters + * + * Note: caller must iput(inode) to dereference the inode. 
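+ * + * A minimal, hedged caller-side sketch: + * + * ino = nfs_layoutrecall_find_inode(clp, args); + * if (ino) { + * (process the recall against ino) + * iput(ino); + * }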
+ */ +static struct inode * +nfs_layoutrecall_find_inode(struct nfs_client *clp, + const struct cb_layoutrecallargs *args) +{ + struct nfs_inode *nfsi; + struct pnfs_layout_hdr *lo; + struct nfs_server *server; + struct inode *ino = NULL; + + dprintk("%s: Begin recall_type=%d clp %p\n", + __func__, args->cbl_recall_type, clp); + + spin_lock(&clp->cl_lock); + list_for_each_entry(lo, &clp->cl_layouts, layouts) { + nfsi = PNFS_NFS_INODE(lo); + if (!nfsi) + continue; + + dprintk("%s: Searching inode=%lu\n", + __func__, nfsi->vfs_inode.i_ino); + + if (args->cbl_recall_type == RETURN_FILE) { + if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) + continue; + } else if (args->cbl_recall_type == RETURN_FSID) { + server = NFS_SERVER(&nfsi->vfs_inode); + if (server->fsid.major != args->cbl_fsid.major || + server->fsid.minor != args->cbl_fsid.minor) + continue; + } + + /* Make sure client didn't clean up layout without + * telling the server */ + if (!has_layout(nfsi)) + continue; + + ino = igrab(&nfsi->vfs_inode); + dprintk("%s: Found inode=%p\n", __func__, ino); + break; + } + spin_unlock(&clp->cl_lock); + return ino; +} + +struct recall_layout_threadargs { + struct inode *inode; + struct nfs_client *clp; + struct completion started; + struct cb_layoutrecallargs *rl; + int result; +}; + +static int pnfs_recall_layout(void *data) +{ + struct inode *inode, *ino; + struct nfs_client *clp; + struct cb_layoutrecallargs rl; + struct nfs4_layoutreturn *lrp; + struct recall_layout_threadargs *args = + (struct recall_layout_threadargs *)data; + int status = 0; + + daemonize("nfsv4-layoutreturn"); + + dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", + __func__, args->rl->cbl_recall_type, + args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); + + clp = args->clp; + inode = args->inode; + rl = *args->rl; + + /* support whole file layouts only */ + rl.cbl_seg.offset = 0; + rl.cbl_seg.length = NFS4_MAX_UINT64; + + if (rl.cbl_recall_type == RETURN_FILE) { + if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, + rl.cbl_stateid)) + status = pnfs_return_layout(inode, &rl.cbl_seg, + &rl.cbl_stateid, RETURN_FILE, + false); + else + status = cpu_to_be32(NFS4ERR_DELAY); + if (status) + dprintk("%s RETURN_FILE error: %d\n", __func__, status); + else + status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); + args->result = status; + complete(&args->started); + goto out; + } + + status = cpu_to_be32(NFS4_OK); + args->result = status; + complete(&args->started); + args = NULL; + + /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ + while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { + /* FIXME: need to check status on pnfs_return_layout */ + pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); + iput(ino); + } + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (!lrp) { + dprintk("%s: allocation failed. Cannot send last LAYOUTRETURN\n", + __func__); + goto out; + } + + /* send final layoutreturn */ + lrp->args.reclaim = 0; + lrp->args.layout_type = rl.cbl_layout_type; + lrp->args.return_type = rl.cbl_recall_type; + lrp->args.range = rl.cbl_seg; + lrp->args.inode = inode; + nfs4_proc_layoutreturn(lrp, true); + +out: + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); + nfs_put_client(clp); + module_put_and_exit(0); + dprintk("%s: exit status %d\n", __func__, 0); + return 0; +} + +/* + * Asynchronous layout recall! 
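+ * + * A short summary (added for clarity): the recall runs in a kernel thread. + * pnfs_async_return_layout() below spawns pnfs_recall_layout() and blocks + * on data.started until the thread has copied its arguments and posted a + * preliminary result; for RETURN_FILE the thread answers before exiting, + * while for RETURN_FSID/RETURN_ALL it answers early and keeps draining + * matching layouts in the background.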
+ */ +static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, + struct cb_layoutrecallargs *rl) +{ + struct recall_layout_threadargs data = { + .clp = clp, + .inode = inode, + .rl = rl, + }; + struct task_struct *t; + int status = -EAGAIN; + + dprintk("%s: -->\n", __func__); + + /* FIXME: do not allow two concurrent layout recalls */ + if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) + return status; + + init_completion(&data.started); + __module_get(THIS_MODULE); + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out_put_no_client; + + t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); + if (IS_ERR(t)) { + printk(KERN_INFO "NFS: Layout recall callback thread failed " + "for client (clientid %08x/%08x)\n", + (unsigned)(clp->cl_clientid >> 32), + (unsigned)(clp->cl_clientid)); + status = PTR_ERR(t); + goto out_module_put; + } + wait_for_completion(&data.started); + return data.result; +out_module_put: + nfs_put_client(clp); +out_put_no_client: + clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); + module_put(THIS_MODULE); + return status; +} + +static int pnfs_recall_all_layouts(struct nfs_client *clp) +{ + struct cb_layoutrecallargs rl; + struct inode *inode; + int status = 0; + + rl.cbl_recall_type = RETURN_ALL; + rl.cbl_seg.iomode = IOMODE_ANY; + rl.cbl_seg.offset = 0; + rl.cbl_seg.length = NFS4_MAX_UINT64; + + /* we need the inode to get the nfs_server struct */ + inode = nfs_layoutrecall_find_inode(clp, &rl); + if (!inode) + return status; + status = pnfs_async_return_layout(clp, inode, &rl); + iput(inode); + + return status; +} + +__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, + void *dummy) +{ + struct nfs_client *clp; + struct inode *inode = NULL; + __be32 res; + int status; + unsigned int num_client = 0; + + dprintk("%s: -->\n", __func__); + + res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->cbl_addr, 4); + if (clp == NULL) { + dprintk("%s: no client for addr %u.%u.%u.%u\n", + __func__, NIPQUAD(args->cbl_addr)); + goto out; + } + + res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); + do { + struct nfs_client *prev = clp; + num_client++; + /* the callback must come from the MDS personality */ + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + goto loop; + if (args->cbl_recall_type == RETURN_FILE) { + inode = nfs_layoutrecall_find_inode(clp, args); + if (inode != NULL) { + status = pnfs_async_return_layout(clp, inode, + args); + if (status) + res = cpu_to_be32(NFS4ERR_DELAY); + iput(inode); + } + } else { /* _ALL or _FSID */ + /* we need the inode to get the nfs_server struct */ + inode = nfs_layoutrecall_find_inode(clp, args); + if (!inode) + goto loop; + status = pnfs_async_return_layout(clp, inode, args); + if (status) + res = cpu_to_be32(NFS4ERR_DELAY); + iput(inode); + } +loop: + clp = nfs_find_client_next(prev); + nfs_put_client(prev); + } while (clp != NULL); + +out: + dprintk("%s: exit with status = %d numclient %u\n", + __func__, ntohl(res), num_client); + return res; +} + +/* Remove the deviceid(s) from the nfs_client deviceid cache */ +static __be32 pnfs_devicenotify_client(struct nfs_client *clp, + struct cb_devicenotifyargs *args) +{ + uint32_t type; + int i; + + dprintk("%s: --> clp %p\n", __func__, clp); + + for (i = 0; i < args->ndevs; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + type = dev->cbd_notify_type; + if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) + nfs4_delete_device(clp->cl_devid_cache, + &dev->cbd_dev_id); + else 
if (type == NOTIFY_DEVICEID4_CHANGE) + printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " + "not supported\n", __func__); + } + return 0; +} + +__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, + void *dummy) +{ + struct nfs_client *clp; + __be32 res = 0; + unsigned int num_client = 0; + + dprintk("%s: -->\n", __func__); + + res = __constant_htonl(NFS4ERR_INVAL); + clp = nfs_find_client(args->addr, 4); + if (clp == NULL) { + dprintk("%s: no client for addr %u.%u.%u.%u\n", + __func__, NIPQUAD(args->addr)); + goto out; + } + + do { + struct nfs_client *prev = clp; + num_client++; + res = pnfs_devicenotify_client(clp, args); + clp = nfs_find_client_next(prev); + nfs_put_client(prev); + } while (clp != NULL); + +out: + dprintk("%s: exit with status = %d numclient %u\n", + __func__, ntohl(res), num_client); + return res; +} + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) { if (delegation == NULL) return 0; - /* seqid is 4-bytes long */ - if (((u32 *) &stateid->data)[0] != 0) + if (stateid->u.stateid.seqid != 0) return 0; - if (memcmp(&delegation->stateid.data[4], &stateid->data[4], - sizeof(stateid->data)-4)) + if (memcmp(&delegation->stateid.u.stateid.other, + &stateid->u.stateid.other, + NFS4_STATEID_OTHER_SIZE)) return 0; return 1; @@ -335,13 +669,37 @@ out: return status; } +static inline bool +validate_bitmap_values(const unsigned long *mask) +{ + int i; + + if (*mask == 0) + return true; + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || + test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || + test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || + test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || + test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) + return true; + for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; + i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) + if (test_bit(i, mask)) + return true; + for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; + i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) + if (test_bit(i, mask)) + return true; + return false; +} + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) { struct nfs_client *clp; __be32 status; fmode_t flags = 0; - status = htonl(NFS4ERR_OP_NOT_IN_SESSION); + status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); clp = nfs_find_client(args->craa_addr, 4); if (clp == NULL) goto out; @@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb dprintk("NFS: RECALL_ANY callback request from %s\n", rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + status = cpu_to_be32(NFS4ERR_INVAL); + if (!validate_bitmap_values((const unsigned long *) + &args->craa_type_mask)) + return status; + + status = cpu_to_be32(NFS4_OK); if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags = FMODE_READ; if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) &args->craa_type_mask)) flags |= FMODE_WRITE; + if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) + &args->craa_type_mask)) + if (pnfs_recall_all_layouts(clp) == -EAGAIN) + status = cpu_to_be32(NFS4ERR_DELAY); if (flags) nfs_expire_all_delegation_types(clp, flags); - status = htonl(NFS4_OK); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff -up linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.34.noarch/fs/nfs/callback_xdr.c --- linux-2.6.34.noarch/fs/nfs/callback_xdr.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/callback_xdr.c 2010-09-30 10:17:08.597991000 -0400 @@ -22,6 +22,8 @@ #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #if 
defined(CONFIG_NFS_V4_1) +#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 4 + 1 + 3) #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) @@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ p = read_buf(xdr, 16); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid->data, p, 16); + memcpy(stateid->u.data, p, 16); return 0; } @@ -220,6 +222,148 @@ out: #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_layoutrecallargs *args) +{ + __be32 *p; + __be32 status = 0; + + args->cbl_addr = svc_addr(rqstp); + p = read_buf(xdr, 4 * sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + + args->cbl_layout_type = ntohl(*p++); + args->cbl_seg.iomode = ntohl(*p++); + args->cbl_layoutchanged = ntohl(*p++); + args->cbl_recall_type = ntohl(*p++); + + if (likely(args->cbl_recall_type == RETURN_FILE)) { + status = decode_fh(xdr, &args->cbl_fh); + if (unlikely(status != 0)) + goto out; + + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_seg.offset); + p = xdr_decode_hyper(p, &args->cbl_seg.length); + status = decode_stateid(xdr, &args->cbl_stateid); + if (unlikely(status != 0)) + goto out; + } else if (args->cbl_recall_type == RETURN_FSID) { + p = read_buf(xdr, 2 * sizeof(uint64_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_BADXDR); + goto out; + } + p = xdr_decode_hyper(p, &args->cbl_fsid.major); + p = xdr_decode_hyper(p, &args->cbl_fsid.minor); + } + dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " + "fsid %llx-%llx fhsize %d\n", __func__, + args->cbl_layout_type, args->cbl_seg.iomode, + args->cbl_layoutchanged, args->cbl_recall_type, + args->cbl_fsid.major, args->cbl_fsid.minor, + args->cbl_fh.size); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + +static +__be32 decode_devicenotify_args(struct svc_rqst *rqstp, + struct xdr_stream *xdr, + struct cb_devicenotifyargs *args) +{ + __be32 *p; + __be32 status = 0; + u32 tmp; + int n, i; + args->ndevs = 0; + + args->addr = svc_addr(rqstp); + + /* Num of device notifications */ + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + n = ntohl(*p++); + if (n <= 0) + goto out; + + /* XXX: need to possibly return error in this case */ + if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { + dprintk("%s: Processing (%d) notifications out of (%d)\n", + __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); + n = NFS4_DEV_NOTIFY_MAXENTRIES; + } + + /* Decode each dev notification */ + for (i = 0; i < n; i++) { + struct cb_devicenotifyitem *dev = &args->devs[i]; + + p = read_buf(xdr, (4 * sizeof(uint32_t)) + + NFS4_PNFS_DEVICEID4_SIZE); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_RESOURCE); + goto out; + } + + tmp = ntohl(*p++); /* bitmap size */ + if (tmp != 1) { + status = htonl(NFS4ERR_INVAL); + goto out; + } + dev->cbd_notify_type = ntohl(*p++); + if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { + status = htonl(NFS4ERR_INVAL); + goto out; + } + + tmp = ntohl(*p++); /* opaque size */ + if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && + (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || + 
((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && + (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { + status = htonl(NFS4ERR_INVAL); + goto out; + } + dev->cbd_layout_type = ntohl(*p++); + memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); + + if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { + p = read_buf(xdr, sizeof(uint32_t)); + if (unlikely(p == NULL)) { + status = htonl(NFS4ERR_DELAY); + goto out; + } + dev->cbd_immediate = ntohl(*p++); + } else { + dev->cbd_immediate = 0; + } + + args->ndevs++; + + dprintk("%s: type %d layout 0x%x immediate %d\n", + __func__, dev->cbd_notify_type, dev->cbd_layout_type, + dev->cbd_immediate); + } +out: + dprintk("%s: status %d ndevs %d\n", + __func__, ntohl(status), args->ndevs); + return status; +} + static __be32 decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) { @@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in case OP_CB_SEQUENCE: case OP_CB_RECALL_ANY: case OP_CB_RECALL_SLOT: + case OP_CB_LAYOUTRECALL: + case OP_CB_NOTIFY_DEVICEID: *op = &callback_ops[op_nr]; break; - case OP_CB_LAYOUTRECALL: - case OP_CB_NOTIFY_DEVICEID: case OP_CB_NOTIFY: case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: @@ -739,6 +883,18 @@ static struct callback_op callback_ops[] .res_maxsize = CB_OP_RECALL_RES_MAXSZ, }, #if defined(CONFIG_NFS_V4_1) + [OP_CB_LAYOUTRECALL] = { + .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, + .decode_args = + (callback_decode_arg_t)decode_layoutrecall_args, + .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, + }, + [OP_CB_NOTIFY_DEVICEID] = { + .process_op = (callback_process_op_t)nfs4_callback_devicenotify, + .decode_args = + (callback_decode_arg_t)decode_devicenotify_args, + .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, + }, [OP_CB_SEQUENCE] = { .process_op = (callback_process_op_t)nfs4_callback_sequence, .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, diff -up linux-2.6.34.noarch/fs/nfs/client.c.orig linux-2.6.34.noarch/fs/nfs/client.c --- linux-2.6.34.noarch/fs/nfs/client.c.orig 2010-09-30 10:15:17.723710000 -0400 +++ linux-2.6.34.noarch/fs/nfs/client.c 2010-09-30 10:17:08.603991000 -0400 @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -48,6 +49,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_CLIENT @@ -150,11 +152,14 @@ static struct nfs_client *nfs_alloc_clie clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) clp->cl_machine_cred = cred; - +#if defined(CONFIG_NFS_V4_1) + INIT_LIST_HEAD(&clp->cl_layouts); +#endif nfs_fscache_get_client_cookie(clp); return clp; @@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers clp->cl_session = NULL; } - clp->cl_call_sync = _nfs4_call_sync; + clp->cl_mvops = nfs_v4_minor_ops[0]; #endif /* CONFIG_NFS_V4_1 */ } @@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); + nfs_callback_down(clp->cl_mvops->minor_version); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c nfs_free_client(clp); } } +EXPORT_SYMBOL(nfs_put_client); #if defined(CONFIG_IPV6) || 
defined(CONFIG_IPV6_MODULE) /* @@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields, including the port number. */ -static int nfs_sockaddr_cmp(const struct sockaddr *sa1, +int nfs_sockaddr_cmp(const struct sockaddr *sa1, const struct sockaddr *sa2) { if (sa1->sa_family != sa2->sa_family) @@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct } return 0; } +EXPORT_SYMBOL(nfs_sockaddr_cmp); /* * Find a client by IP address and protocol version @@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c return -EPROTONOSUPPORT; return 0; } +EXPORT_SYMBOL(nfs4_check_client_ready); /* * Initialise the timeout values for a connection @@ -865,9 +873,34 @@ error: } /* + * Initialize the pNFS layout driver and setup pNFS related parameters + */ +static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) +{ +#if defined(CONFIG_NFS_V4_1) + struct nfs_client *clp = server->nfs_client; + + if (nfs4_has_session(clp) && + (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { + server->pnfs_blksize = fsinfo->blksize; + set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); + pnfs_set_ds_iosize(server); + } +#endif /* CONFIG_NFS_V4_1 */ +} + +static void nfs4_uninit_pnfs(struct nfs_server *server) +{ +#if defined(CONFIG_NFS_V4_1) + if (server->nfs_client && nfs4_has_session(server->nfs_client)) + unmount_pnfs_layoutdriver(server); +#endif /* CONFIG_NFS_V4_1 */ +} + +/* * Load up the server record from information gained in an fsinfo record */ -static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) +static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct if (server->wsize > NFS_MAX_FILE_IO_SIZE) server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs4_init_pnfs(server, mntfh, fsinfo); + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); @@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, &fsinfo); + nfs_server_set_fsinfo(server, mntfh, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * { dprintk("--> nfs_free_server()\n"); + nfs4_uninit_pnfs(server); spin_lock(&nfs_client_lock); list_del(&server->client_link); list_del(&server->master_link); @@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs return error; } - error = nfs_callback_up(clp->cl_minorversion, + error = nfs_callback_up(clp->cl_mvops->minor_version, clp->cl_rpcclient->cl_xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", @@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { - clp->cl_call_sync = _nfs4_call_sync; - #if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion) { + if (clp->cl_mvops->minor_version) { struct nfs4_session *session = NULL; /* * Create the session and mark it expired. 
@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio return -ENOMEM; clp->cl_session = session; - clp->cl_call_sync = _nfs4_call_sync_session; + /* + * The create session reply races with the server back + * channel probe. Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; } #endif /* CONFIG_NFS_V4_1 */ @@ -1216,7 +1256,7 @@ error: /* * Set up an NFS4 client */ -static int nfs4_set_client(struct nfs_server *server, +int nfs4_set_client(struct nfs_server *server, const char *hostname, const struct sockaddr *addr, const size_t addrlen, @@ -1259,6 +1299,7 @@ error: dprintk("<-- nfs4_set_client() = xerror %d\n", error); return error; } +EXPORT_SYMBOL(nfs4_set_client); /* @@ -1448,7 +1489,7 @@ struct nfs_server *nfs4_create_referral_ data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout, - parent_client->cl_minorversion); + parent_client->cl_mvops->minor_version); if (error < 0) goto error; diff -up linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig linux-2.6.34.noarch/fs/nfsd/bl_com.c --- linux-2.6.34.noarch/fs/nfsd/bl_com.c.orig 2010-09-30 10:17:08.822996000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_com.c 2010-09-30 10:17:08.824003000 -0400 @@ -0,0 +1,292 @@ +#if defined(CONFIG_SPNFS_BLOCK) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); +static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); + +static struct rpc_pipe_ops bl_upcall_ops = { + .upcall = bl_pipe_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +bl_comm_t *bl_comm_global; + +int +nfsd_bl_start(void) +{ + bl_comm_t *bl_comm = NULL; + struct path path; + struct nameidata nd; + int rc; + + dprintk("%s: starting pipe\n", __func__); + if (bl_comm_global) + return -EEXIST; + + path.mnt = rpc_get_mount(); + if (IS_ERR(path.mnt)) + return PTR_ERR(path.mnt); + + /* FIXME: do not abuse rpc_pipefs/nfs */ + rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); + if (rc) + goto err; + + bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); + if (!bl_comm) { + rc = -ENOMEM; + goto err; + } + + /* FIXME: rename to "spnfs_block" */ + bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, + &bl_upcall_ops, 0); + if (IS_ERR(bl_comm->pipe_dentry)) { + rc = -EPIPE; + goto err; + } + mutex_init(&bl_comm->lock); + mutex_init(&bl_comm->pipe_lock); + init_waitqueue_head(&bl_comm->pipe_wq); + + bl_comm_global = bl_comm; + return 0; +err: + rpc_put_mount(); + kfree(bl_comm); + return rc; +} + +void +nfsd_bl_stop(void) +{ + bl_comm_t *c = bl_comm_global; + + dprintk("%s: stopping pipe\n", __func__); + if (!c) + return; + rpc_unlink(c->pipe_dentry); + rpc_put_mount(); + bl_comm_global = NULL; + kfree(c); +} + +static ssize_t +bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, + size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len - msg->copied, + left; + + if (mlen > buflen) + mlen = buflen; + + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + return left; + } + mlen -= left; + msg->copied += mlen; + 
msg->errno = 0; + + return mlen; +} + +static ssize_t +bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + bl_comm_t *bc = (bl_comm_t *)rpci->private; + bl_comm_msg_t *im = &bc->msg; + int ret; + bl_comm_res_t *res; + + + if (mlen == 0) { + im->msg_status = PNFS_BLOCK_FAILURE; + im->msg_res = NULL; + wake_up(&bc->pipe_wq); + return -EFAULT; + } + + if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL) + return -ENOMEM; + + if (copy_from_user(res, src, mlen)) { + kfree(res); + return -EFAULT; + } + + mutex_lock(&bc->pipe_lock); + + ret = mlen; + im->msg_status = res->res_status; + im->msg_res = res; + + wake_up(&bc->pipe_wq); + mutex_unlock(&bc->pipe_lock); + return ret; +} + +static void +bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + bl_comm_msg_t *im = msg->data; + bl_comm_t *bc = container_of(im, struct bl_comm, msg); + + if (msg->errno >= 0) + return; + + mutex_lock(&bc->pipe_lock); + im->msg_status = PNFS_BLOCK_FAILURE; + wake_up(&bc->pipe_wq); + mutex_unlock(&bc->pipe_lock); +} + +int +bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) +{ + struct rpc_pipe_msg msg; + DECLARE_WAITQUEUE(wq, current); + int rval = 1; + bl_comm_msg_t *m = &bc->msg; + + if (bc == NULL) { + dprintk("%s: No pNFS block daemon available\n", __func__); + return 1; + } + + mutex_lock(&bc->lock); + mutex_lock(&bc->pipe_lock); + + memcpy(m, upmsg, sizeof (*m)); + + memset(&msg, 0, sizeof (msg)); + msg.data = m; + msg.len = sizeof (*m); + + add_wait_queue(&bc->pipe_wq, &wq); + rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); + if (rval < 0) { + remove_wait_queue(&bc->pipe_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&bc->pipe_lock); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&bc->pipe_wq, &wq); + mutex_lock(&bc->pipe_lock); + + if (m->msg_status == PNFS_BLOCK_SUCCESS) { + *res = m->msg_res; + rval = 0; + } else + rval = 1; + +out: + mutex_unlock(&bc->pipe_lock); + mutex_unlock(&bc->lock); + return rval; +} + +static ssize_t ctl_write(struct file *file, const char __user *buf, size_t len, + loff_t *offset) +{ + int cmd, + rc; + bl_comm_t *bc = bl_comm_global; + bl_comm_msg_t msg; + bl_comm_res_t *res; + + if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) + return -EFAULT; + switch (cmd) { + case PNFS_BLOCK_CTL_STOP: + msg.msg_type = PNFS_UPCALL_MSG_STOP; + (void) bl_upcall(bc, &msg, &res); + kfree(res); + nfsd_bl_stop(); + break; + + case PNFS_BLOCK_CTL_START: + rc = nfsd_bl_start(); + if (rc != 0) + return rc; + break; + + case PNFS_BLOCK_CTL_VERS: + msg.msg_type = PNFS_UPCALL_MSG_VERS; + msg.u.msg_vers = PNFS_UPCALL_VERS; + if (bl_upcall(bc, &msg, &res)) { + dprintk("%s: Failed to contact pNFS block daemon\n", + __func__); + return 0; + } + kfree(res); + break; + + default: + dprintk("%s: unknown ctl command %d\n", __func__, cmd); + break; + } + return len; +} + +static struct file_operations ctl_ops = { + .write = ctl_write, +}; + +/* + * bl_init_proc -- set up proc interfaces + * + * Creating a pnfs_block directory isn't really required at this point + * since we've only got a single node in that directory. If the need for + * more nodes doesn't present itself shortly this code should revert + * to a single top level node. McNeal 11-Aug-2008. 
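+ * + * A hedged usage sketch (the PNFS_BLOCK_CTL_* opcode values are defined + * elsewhere): a control utility drives the node by writing the command as + * a raw native int, since ctl_write() copies exactly sizeof(int), e.g. + * + * int cmd = PNFS_BLOCK_CTL_START; + * int fd = open("/proc/fs/pnfs_block/ctl", O_WRONLY); + * write(fd, &cmd, sizeof(cmd));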
+ */ +int +bl_init_proc(void) +{ + struct proc_dir_entry *e; + + e = proc_mkdir("fs/pnfs_block", NULL); + if (!e) + return -ENOMEM; + + e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); + if (!e) + return -ENOMEM; + e->proc_fops = &ctl_ops; + + return 0; +} +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.34.noarch/fs/nfsd/bl_ops.c --- linux-2.6.34.noarch/fs/nfsd/bl_ops.c.orig 2010-09-30 10:17:08.827998000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/bl_ops.c 2010-09-30 10:17:08.829998000 -0400 @@ -0,0 +1,1672 @@ +/* + * bl_ops.c + * spNFS + * + * Created by Rick McNeal on 4/1/08. + * Copyright 2008 __MyCompanyName__. All rights reserved. + * + */ + +/* + * Block layout operations. + * + * These functions, with the exception of pnfs_block_enabled, are assigned to + * the super block s_export_op structure. + */ +#if defined(CONFIG_SPNFS_BLOCK) + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pnfsd.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define BL_LAYOUT_HASH_BITS 4 +#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) +#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) +#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) + +#define bl_layout_hashval(id) \ + ((id) & BL_LAYOUT_HASH_MASK) + +#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) +#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) +#define _2SECTS(v) ((v) >> 9) + +#ifndef READ32 +#define READ32(x) (x) = ntohl(*p++) +#define READ64(x) do { \ +(x) = (u64)ntohl(*p++) << 32; \ +(x) |= ntohl(*p++); \ +} while (0) +#endif + + +typedef enum {True, False} boolean_t; +/* ---- block layoutget and commit structure ---- */ +typedef struct bl_layout_rec { + struct list_head blr_hash, + blr_layouts; + dev_t blr_rdev; + struct inode *blr_inode; + int blr_recalled; // debug + u64 blr_orig_size, + blr_commit_size, + blr_ext_size; + spinlock_t blr_lock; // Protects blr_layouts +} bl_layout_rec_t; + +static struct list_head layout_hash; +static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; +static spinlock_t layout_hashtbl_lock; + +/* ---- prototypes ---- */ +static boolean_t device_slice(dev_t devid); +static boolean_t device_dm(dev_t devid); +static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); +static bl_layout_rec_t *layout_inode_find(struct inode *i); +static void layout_inode_del(struct inode *i); +static char *map_state2name(enum pnfs_block_extent_state4 s); +static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); +static void bld_free(pnfs_blocklayout_devinfo_t *bld); +static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes, + dev_t devid, int local_index); +static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, + dev_t devid, int my_loc, int idx); +static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, + struct nfsd4_layout_seg *seg); +struct list_head *layout_cache_iter(bl_layout_rec_t *r, + struct list_head *bl_possible, struct nfsd4_layout_seg *seg); +static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); +static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); +static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); +static void print_bll(pnfs_blocklayout_layout_t *b, char *); +static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, + struct 
list_head *h, struct nfsd4_layout_seg *seg); +static inline void bll_collapse(bl_layout_rec_t *r, + pnfs_blocklayout_layout_t *c); +static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, + enum bl_cache_state state, struct list_head *h); +static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, + enum bl_cache_state c, struct list_head *h); +static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, + enum pnfs_block_extent_state4 *s); +static void extents_setup(struct fiemap_extent_info *fei); +static void extents_count(struct fiemap_extent_info *fei, struct inode *i, + u64 foff, u64 len); +static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, + u64 foff, u64 len); +static boolean_t extents_process(struct fiemap_extent_info *fei, + struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, + pnfs_blocklayout_layout_t *b); +static void extents_cleanup(struct fiemap_extent_info *fei); + +void +nfsd_bl_init(void) +{ + int i; + dprintk("%s loaded\n", __func__); + + spin_lock_init(&layout_hashtbl_lock); + INIT_LIST_HEAD(&layout_hash); + for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++) + INIT_LIST_HEAD(&layout_hashtbl[i]); + bl_init_proc(); +} + +/* + * pnfs_block_enabled -- check to see if this file system should be exported as + * block pnfs + */ +int +pnfs_block_enabled(struct inode *inode, int ex_flags) +{ + bl_comm_msg_t msg; + bl_comm_res_t *res = NULL; + static int bl_comm_once = 0; + + dprintk("--> %s\n", __func__); + /* + * FIXME: Figure out a method to determine if this file system should + * be exported. The following areas need to be checked. + * (1) Validate that this file system was exported as a pNFS + * block-layout + * (2) Has there been successful communication with the + * volume daemon? 
+ */ + /* Check #1 */ +#ifdef notyet + if (!(ex_flags & NFSEXP_PNFS_BLOCK)) { + dprintk("%s: pnfs_block not set in export\n", __func__); + return 0; + } +#endif + + /* Check #2 */ + if (!bl_comm_once) { + msg.msg_type = PNFS_UPCALL_MSG_VERS; + msg.u.msg_vers = PNFS_UPCALL_VERS; + if (bl_upcall(bl_comm_global, &msg, &res)) { + dprintk("%s: Failed to contact pNFS block daemon\n", + __func__); + return 0; + } + if (msg.u.msg_vers != res->u.vers) { + dprintk("%s: vers mismatch, kernel != daemon\n", + __func__); + kfree(res); + return 0; + } + } + bl_comm_once = 1; + + kfree(res); + + dprintk("<-- %s okay\n", __func__); + return 1; +} + +int +bl_layout_type(struct super_block *sb) +{ + return LAYOUT_BLOCK_VOLUME; +} + +int +bl_getdeviceiter(struct super_block *sb, + u32 layout_type, + struct nfsd4_pnfs_dev_iter_res *res) +{ + res->gd_eof = 1; + if (res->gd_cookie) + return -ENOENT; + res->gd_devid = sb->s_dev; + res->gd_verf = 1; + res->gd_cookie = 1; + return 0; +} + +static int +bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_deviceid *devid) +{ + pnfs_blocklayout_devinfo_t *bld_slice_p, + *bld_simple_p, + *bld; + int status = -EIO, + location = 0; + struct list_head volumes; + + dprintk("--> %s\n", __func__); + INIT_LIST_HEAD(&volumes); + + bld_simple_p = bld_simple(&volumes, devid->devid, + location++); + if (!bld_simple_p) + goto out; + bld_slice_p = bld_slice(&volumes, devid->devid, location++, + bld_simple_p->bld_index_loc); + + if (!bld_slice_p) + goto out; + + status = blocklayout_encode_devinfo(xdr, &volumes); + +out: + while (!list_empty(&volumes)) { + bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, + bld_list); + if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE) + kfree(bld->u.simple.bld_sig); + bld_free(bld); + } + + dprintk("<-- %s (rval %d)\n", __func__, status); + return status; +} + +static int +bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_deviceid *devid) +{ + pnfs_blocklayout_devinfo_t *bld = NULL; + int status = -EIO, // default to error + i, + location = 0; + struct list_head volumes; + bl_comm_msg_t msg; + bl_comm_res_t *res; + + dprintk("--> %s\n", __func__); + INIT_LIST_HEAD(&volumes); + + msg.msg_type = PNFS_UPCALL_MSG_DMGET; + msg.u.msg_dev = devid->devid; + if (bl_upcall(bl_comm_global, &msg, &res)) { + dprintk("%s: upcall for DMGET failed\n", __func__); + goto out; + } + + /* + * Don't use bld_alloc() here. If used this will be the first volume + * type added to the list whereas the protocol requires it to be the + * last. 
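+ * + * Illustrative ordering (hedged, for a two-disk stripe): the loop below + * emits simple(0), slice(1), simple(2), slice(3), and the stripe volume is + * appended last as index 4 via list_add_tail(), with bld_stripe_indexs[] + * referring to the slice entries (1 and 3).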
+ */ + bld = kmalloc(sizeof (*bld), GFP_KERNEL); + if (!bld) + goto out; + memset(bld, 0, sizeof (*bld)); + bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE; + bld->u.stripe.bld_stripes = res->u.stripe.num_stripes; + bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL; + dprintk("%s: stripes %d, chunk_size %Lu\n", __func__, + bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL); + + bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes * + sizeof (int), GFP_KERNEL); + if (!bld->u.stripe.bld_stripe_indexs) + goto out; + + for (i = 0; i < bld->u.stripe.bld_stripes; i++) { + dev_t dev; + pnfs_blocklayout_devinfo_t *bldp; + + dev = MKDEV(res->u.stripe.devs[i].major, + res->u.stripe.devs[i].minor); + if (dev == 0) + goto out; + + bldp = bld_simple(&volumes, dev, location++); + if (!bldp) { + dprintk("%s: bld_simple failed\n", __func__); + goto out; + } + bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc); + + if (!bldp) { + dprintk("%s: bld_slice failed\n", __func__); + goto out; + } + bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc; + + } + list_add_tail(&bld->bld_list, &volumes); + status = blocklayout_encode_devinfo(xdr, &volumes); + +out: + while (!list_empty(&volumes)) { + bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t, + bld_list); + switch (bld->bld_type) { + case PNFS_BLOCK_VOLUME_SLICE: + case PNFS_BLOCK_VOLUME_CONCAT: + // No memory to release for these + break; + case PNFS_BLOCK_VOLUME_SIMPLE: + kfree(bld->u.simple.bld_sig); + break; + case PNFS_BLOCK_VOLUME_STRIPE: + kfree(bld->u.stripe.bld_stripe_indexs); + break; + } + bld_free(bld); + } + kfree(res); + dprintk("<-- %s (rval %d)\n", __func__, status); + return status; +} + +/* + * bl_getdeviceinfo -- determine device tree for requested devid + */ +int +bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, + u32 layout_type, + const struct nfsd4_pnfs_deviceid *devid) +{ + if (device_slice(devid->devid) == True) + return bl_getdeviceinfo_slice(sb, xdr, devid); + else if (device_dm(devid->devid) == True) + return bl_getdeviceinfo_dm(sb, xdr, devid); + return -EINVAL; +} + +enum nfsstat4 +bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_layoutget_arg *arg, + struct nfsd4_pnfs_layoutget_res *res) +{ + pnfs_blocklayout_layout_t *b; + bl_layout_rec_t *r; + struct list_head bl_possible, + *bl_candidates = NULL; + boolean_t del_on_error = False; + int adj; + enum nfsstat4 nfserr = NFS4_OK; + + dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n", + __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset), + _2SECTS(res->lg_seg.length), res->lg_seg.iomode); + + if (res->lg_seg.length == 0) { + printk("%s: request length of 0, error condition\n", __func__); + return NFS4ERR_BADLAYOUT; + } + + /* + * Adjust the length as required per spec. + * - The first case is where the length is set to (u64)-1, a cheap way + * to denote the end of the file. + * - The second case is where the I/O mode is read-only, but the + * request extends past the end of the file, so it needs to be + * trimmed. + */ + if ((res->lg_seg.length == NFS4_MAX_UINT64) || + (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) && + (res->lg_seg.iomode == IOMODE_READ))) + res->lg_seg.length = i->i_size - res->lg_seg.offset; + + adj = (res->lg_seg.offset & 511) ? 
res->lg_seg.offset & 511 : 0; + res->lg_seg.offset -= adj; + res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511; + + if (res->lg_seg.iomode != IOMODE_READ) + if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE, + res->lg_seg.offset, res->lg_seg.length)) + return NFS4ERR_IO; + + INIT_LIST_HEAD(&bl_possible); + + if ((r = layout_inode_find(i)) == NULL) { + if (layout_inode_add(i, &r) == False) { + printk("%s: layout_inode_add failed\n", __func__); + return NFS4ERR_IO; + } + del_on_error = True; + } + BUG_ON(!r); + + spin_lock(&r->blr_lock); + + if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) { + /* + * This will send LAYOUTTRYAGAIN error to the client. + */ + dprintk("%s: layout_cache_fill_from() failed\n", __func__); + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + + res->lg_return_on_close = 1; + res->lg_seg.length = 0; + + bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg); + if (!bl_candidates) { + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + + layout_cache_merge(r, bl_candidates); + if (layout_cache_update(r, bl_candidates)) { + /* ---- Failed to allocate memory. ---- */ + dprintk("%s: layout_cache_update() failed\n", __func__); + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + + nfserr = blocklayout_encode_layout(xdr, bl_candidates); + if (nfserr) + dprintk("%s: layoutget xdr routine failed\n", __func__); + +layoutget_cleanup: + if (bl_candidates) { + while (!list_empty(bl_candidates)) { + b = list_entry(bl_candidates->next, + struct pnfs_blocklayout_layout, bll_list); + list_del(&b->bll_list); + kfree(b); + } + } + + spin_unlock(&r->blr_lock); + if (unlikely(nfserr)) { + if (del_on_error == True) + layout_inode_del(i); + res->lg_seg.length = 0; + res->lg_seg.offset = 0; + } + + dprintk("<-- %s (rval %u)\n", __func__, nfserr); + return nfserr; +} + +/* + * bl_layoutcommit -- commit changes, especially size, to file systemj + * + * Currently this routine isn't called and everything is handled within + * nfsd4_layoutcommit(). By not calling this routine the server doesn't + * handle a partial return, a set of extents, of the layout. The extents + * are decoded here, but nothing is done with them. If this routine is + * be called the interface must change to pass the 'dentry' pointer such + * that notify_change() can be called. + */ +int +bl_layoutcommit(struct inode *i, + const struct nfsd4_pnfs_layoutcommit_arg *args, + struct nfsd4_pnfs_layoutcommit_res *res) +{ + bl_layout_rec_t *r; + int status = 0; + u64 lw_plus; + + dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); + r = layout_inode_find(i); + if (r) { + lw_plus = args->lc_last_wr + 1; + if (args->lc_newoffset) { + dprintk(" lc_last_wr %Lu\n", lw_plus); + if (r->blr_orig_size < lw_plus) { + r->blr_orig_size = lw_plus; + res->lc_size_chg = 1; + res->lc_newsize = lw_plus; + } + } + + if (args->lc_up_len) { + int extents, + i; + struct pnfs_blocklayout_layout *b; + __be32 *p = args->lc_up_layout; + + /* + * Client is returning a set of extents which + * should/could be used to update the file system. 
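+			 * For now they are only decoded and logged below;
+			 * nothing is updated in the file system (see the
+			 * routine header above).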
+ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 + */ + READ32(extents); + dprintk(" Client returning %d extents: data size %d\n", + extents, args->lc_up_len); + b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * + extents, GFP_KERNEL); + if (b) { + for (i = 0; i < extents; i++) { + READ64(b[i].bll_vol_id.sbid); + READ64(b[i].bll_vol_id.devid); + READ64(b[i].bll_foff); + READ64(b[i].bll_len); + READ64(b[i].bll_soff); + READ32(b[i].bll_es); + dprintk(" %d: foff %Lu, len %Lu, soff %Lu " + "state %s\n", + i, _2SECTS(b[i].bll_foff), + _2SECTS(b[i].bll_len), + _2SECTS(b[i].bll_soff), + map_state2name(b[i].bll_es)); + } + kfree(b); + } else { + status = -ENOMEM; + } + } + } else + dprintk("%s: Unexpected commit to inode %p\n", __func__, i); + + dprintk("<-- %s (rval %d)\n", __func__, status); + return status; +} + +int +bl_layoutreturn(struct inode *i, + const struct nfsd4_pnfs_layoutreturn_arg *args) +{ + int status = 0; + bl_layout_rec_t *r; + + dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); + + r = layout_inode_find(i); + if (r) { + spin_lock(&r->blr_lock); + layout_cache_del(r, &args->lr_seg); + spin_unlock(&r->blr_lock); + dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", + r->blr_ext_size, i->i_size, r->blr_orig_size); + } + + layout_inode_del(i); + dprintk("<-- %s (rval %d)\n", __func__, status); + return status; +} + +int +bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) +{ + struct super_block *sb; + struct nfsd4_pnfs_cb_layout lr; + bl_layout_rec_t *r; + pnfs_blocklayout_layout_t *b; + u64 adj; + + dprintk("--> %s\n", __func__); + BUG_ON(!len); + switch (type) { + case RETURN_FILE: + sb = inode->i_sb; + dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", + inode->i_sb->s_dev, inode->i_ino, + _2SECTS(offset), _2SECTS(len)); + break; + case RETURN_FSID: + sb = inode->i_sb; + dprintk("%s: recalling layout for fsid x (unimplemented)\n", + __func__); + return 0; + case RETURN_ALL: + /* + * XXX figure out how to get a sb since there's no + * inode ptr + */ + dprintk("%s: recalling all layouts (unimplemented)\n", + __func__); + return 0; + default: + return -EINVAL; + } + +restart: + r = layout_inode_find(inode); + if (r && len && !r->blr_recalled) { + spin_lock(&r->blr_lock); + list_for_each_entry(b, &r->blr_layouts, bll_list) { + if (!r->blr_recalled && !b->bll_recalled && + (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { + b->bll_recalled = 1; + lr.cbl_recall_type = type; + lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; + lr.cbl_seg.clientid = 0; + lr.cbl_seg.offset = 0; + lr.cbl_seg.length = NFS4_MAX_UINT64; + r->blr_recalled = 1; + dprintk(" FULL LAYOUTRECALL\n"); + lr.cbl_seg.iomode = IOMODE_ANY; + + /* + * Currently there are only two cases where the + * layout is being returned. + * (1) Someone is issuing a NFS_WRITE operation + * to this layout. + * (2) The file has been truncated which means + * the layout is immediately made invalid. + * In both cases the client must write any + * uncommitted modifications to the server via + * NFS_WRITE. + */ + lr.cbl_layoutchanged = 1; + + /* + * Need to drop the lock because we'll get a + * layoutreturn which will block waiting for + * the lock. The request will come in on the + * same thread which will cause a deadlock. 
+ */ + spin_unlock(&r->blr_lock); + nfsd_layout_recall_cb(sb, inode, &lr); + adj = MIN(b->bll_len - (offset - b->bll_foff), + len); + offset += adj; + len -= adj; + if (!len) { + spin_lock(&r->blr_lock); + break; + } + /* + * Since layoutreturn will have been called we + * can't assume blr_layouts is still valid, + * so restart. + */ + goto restart; + } + } + spin_unlock(&r->blr_lock); + } + + dprintk("<-- %s\n", __func__); + return 0; +} + +/* + * []------------------------------------------------------------------[] + * | Support functions from here on down. | + * []------------------------------------------------------------------[] + */ + +/* + * bld_simple -- given a dev_t build a simple volume structure + * + * Simple volume contains the device signature and offset to that data in + * the storage volume. + */ +static pnfs_blocklayout_devinfo_t * +bld_simple(struct list_head *volumes, dev_t devid, int local_index) +{ + pnfs_blocklayout_devinfo_t *bld = NULL; + bl_comm_msg_t msg; + bl_comm_res_t *res = NULL; + + msg.msg_type = PNFS_UPCALL_MSG_GETSIG; + msg.u.msg_dev = devid; + if (bl_upcall(bl_comm_global, &msg, &res)) { + dprintk("%s: Failed to get signature information\n", __func__); + goto error; + } + + bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); + if (!bld) + return NULL; + + bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; + bld->u.simple.bld_sig_len = res->u.sig.len; + bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); + if (!bld->u.simple.bld_sig) + goto error; + + memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); + kfree(res); + return bld; + +error: + if (bld) + bld_free(bld); + if (res) + kfree(res); + dprintk("%s: error in bld_simple\n", __func__); + return NULL; +} + +/* + * bld_slice -- given a dev_t build a slice volume structure + * + * A slice volume contains the length of the slice/partition and its offset + * from the beginning of the storage volume. There's also a reference to + * the "simple" volume which contains this slice. + */ +static pnfs_blocklayout_devinfo_t * +bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) +{ + pnfs_blocklayout_devinfo_t *bld; + bl_comm_msg_t msg; + bl_comm_res_t *res; + + dprintk("--> %s\n", __func__); + bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); + if (!bld) + return NULL; + + msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; + msg.u.msg_dev = devid; + if (bl_upcall(bl_comm_global, &msg, &res)) { + dprintk("Upcall to get slice info failed\n"); + bld_free(bld); + return NULL; + } + + bld->bld_devid.devid = devid; + bld->bld_index_loc = my_loc; + bld->u.slice.bld_start = res->u.slice.start * 512LL; + bld->u.slice.bld_len = res->u.slice.length * 512LL; + bld->u.slice.bld_index = simple_loc; + + dprintk("%s: start %Lu, len %Lu\n", __func__, + bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); + + kfree(res); + dprintk("<-- %s (rval %p)\n", __func__, bld); + return bld; +} + +static int +layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, + struct nfsd4_layout_seg *seg) +{ + pnfs_blocklayout_layout_t *n; + + dprintk("--> %s\n", __func__); + + if (!list_empty(&r->blr_layouts)) + if (layout_cache_fill_from_list(r, h, seg) == False) + return -EIO; + + /* + * This deals with two conditions. + * (1) When blr_layouts is empty we need to create the first entry + * (2) When the range requested falls past the end of any current + * layout the residual must be taken care of. 
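+	 *
+	 * For example (in sectors): a request for 0-100 against a cache
+	 * that only covers 0-50 consumes the first half in the list pass
+	 * above and adds the residual 50-100 here as a BLOCK_LAYOUT_NEW
+	 * entry.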
+	 */
+	if (seg->length) {
+		n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
+		if (!n)
+			return -ENOMEM;
+		dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
+			_2SECTS(n->bll_len));
+	}
+
+	dprintk("<-- %s\n", __func__);
+	return 0;
+}
+
+struct list_head *
+layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
+		  struct nfsd4_layout_seg *seg)
+{
+	pnfs_blocklayout_layout_t	*b,
+					*n = NULL;
+	struct list_head		*bl_candidates = NULL;
+	struct fiemap_extent_info	fei;
+	struct inode			*i;
+	dev_t				dev;
+
+	dev = r->blr_rdev;
+	i = r->blr_inode;
+
+	dprintk("--> %s\n", __func__);
+	bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
+	if (!bl_candidates)
+		return NULL;
+	INIT_LIST_HEAD(bl_candidates);
+	extents_setup(&fei);
+
+	list_for_each_entry(b, bl_possible, bll_list) {
+		if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
+
+			extents_count(&fei, i, b->bll_foff, b->bll_len);
+			if (fei.fi_extents_mapped) {
+
+				/*
+				 * Common case here. Got a range which has
+				 * extents. Now get those extents and process
+				 * them into pNFS extents.
+				 */
+				if (extents_get(&fei, i, b->bll_foff,
+						b->bll_len) == False)
+					goto cleanup;
+				if (extents_process(&fei, bl_candidates,
+						    seg, dev, b) == False)
+					goto cleanup;
+				extents_cleanup(&fei);
+
+			} else if (seg->iomode == IOMODE_READ) {
+
+				/*
+				 * Found a hole in a file while reading. No
+				 * problem, just create a pNFS extent for the
+				 * range and let the client know there's no
+				 * backing store.
+				 */
+				n = bll_alloc(b->bll_foff, b->bll_len,
+					      BLOCK_LAYOUT_NEW, bl_candidates);
+				if (!n)
+					goto cleanup;
+				n->bll_es = PNFS_BLOCK_NONE_DATA;
+				n->bll_vol_id.sbid = 0;
+				n->bll_vol_id.devid = dev;
+				seg->length += b->bll_len;
+			} else {
+
+				/*
+				 * There's a problem here. Since the iomode
+				 * is read/write fallocate should have allocated
+				 * any necessary storage for the given range.
+				 */
+				dprintk(" Extent count for RW is 0\n");
+				goto cleanup;
+			}
+
+		} else {
+			n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
+			if (!n)
+				goto cleanup;
+			seg->length += n->bll_len;
+		}
+
+		if (r->blr_ext_size < (b->bll_foff + b->bll_len))
+			r->blr_ext_size = b->bll_foff + b->bll_len;
+	}
+
+	while (!list_empty(bl_possible)) {
+		b = list_entry(bl_possible->next,
+			       struct pnfs_blocklayout_layout, bll_list);
+		list_del(&b->bll_list);
+		kfree(b);
+	}
+
+	b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
+			     bll_list);
+	seg->offset = b->bll_foff;
+	dprintk("<-- %s okay\n", __func__);
+	return bl_candidates;
+
+cleanup:
+	extents_cleanup(&fei);
+	if (bl_candidates) {
+		while (!list_empty(bl_candidates)) {
+			b = list_entry(bl_candidates->next,
+				       struct pnfs_blocklayout_layout, bll_list);
+			list_del(&b->bll_list);
+			kfree(b);
+		}
+		kfree(bl_candidates);
+	}
+	dprintk("<-- %s, error occurred\n", __func__);
+	return NULL;
+}
+
+/*
+ * layout_cache_merge -- collapse layouts which make up a contiguous range.
+ */
+static void
+layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
+{
+	pnfs_blocklayout_layout_t	*b,
+					*p;
+
+	dprintk("--> %s\n", __func__);
+restart:
+	p = NULL;
+	list_for_each_entry(b, h, bll_list) {
+		if (p && (BLL_S_END(p) == b->bll_soff) &&
+		    (p->bll_es == b->bll_es) &&
+		    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
+			/*
+			 * We've got a candidate.
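+			 * Two adjacent storage extents in the same state are
+			 * folded into 'p' here; NONE_DATA (hole) extents are
+			 * instead merged on adjacent file offsets in the
+			 * branch below.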
+			 */
+#ifdef too_verbose
+			dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n",
+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+				_2SECTS(b->bll_soff),
+				_2SECTS(p->bll_foff), _2SECTS(p->bll_len),
+				_2SECTS(b->bll_soff));
+#endif
+
+			if (p->bll_cache_state == BLOCK_LAYOUT_CACHE)
+				p->bll_cache_state = BLOCK_LAYOUT_UPDATE;
+			p->bll_len += b->bll_len;
+			list_del(&b->bll_list);
+			kfree(b);
+			goto restart;
+		} else if (p && (BLL_F_END(p) == b->bll_foff) &&
+			   (p->bll_es == b->bll_es) &&
+			   (b->bll_es == PNFS_BLOCK_NONE_DATA)) {
+			p->bll_len += b->bll_len;
+			list_del(&b->bll_list);
+			kfree(b);
+			goto restart;
+		} else
+			p = b;
+	}
+	dprintk("<-- %s\n", __func__);
+}
+
+static int
+layout_cache_update(bl_layout_rec_t *r, struct list_head *h)
+{
+	pnfs_blocklayout_layout_t	*b,
+					*c,
+					*n;
+	int				status = 0;
+
+	dprintk("--> %s\n", __func__);
+	if (list_empty(&r->blr_layouts)) {
+		/* ---- Just add entries and return ---- */
+		dprintk(" cache empty for inode 0x%x:%lu\n", r->blr_rdev,
+			r->blr_inode->i_ino);
+		list_for_each_entry(b, h, bll_list) {
+			c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE,
+					  &r->blr_layouts);
+			if (!c) {
+				status = -ENOMEM;
+				break;
+			}
+			dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n",
+				_2SECTS(c->bll_foff), _2SECTS(c->bll_len),
+				_2SECTS(c->bll_soff), c->bll_es);
+		}
+		return status;
+	}
+
+	list_for_each_entry(b, h, bll_list) {
+		BUG_ON(!b->bll_vol_id.devid);
+		if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) {
+			boolean_t	found = False;
+			list_for_each_entry(c, &r->blr_layouts, bll_list) {
+				if ((b->bll_soff >= c->bll_soff) &&
+				    (b->bll_soff < BLL_S_END(c)) &&
+				    (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
+					u64	u;
+
+					if ((b->bll_foff < c->bll_foff) ||
+					    (b->bll_foff > BLL_F_END(c)))
+						BUG();
+
+					u = BLL_S_END(b) - BLL_S_END(c);
+					/*
+					 * The updated cache entry has to be
+					 * different than the current.
+					 * Otherwise the cache state for 'b'
+					 * should be BLOCK_LAYOUT_CACHE.
+					 */
+					BUG_ON(BLL_S_END(b) < BLL_S_END(c));
+
+					dprintk(" updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n",
+						_2SECTS(c->bll_foff),
+						_2SECTS(c->bll_len),
+						_2SECTS(c->bll_soff),
+						_2SECTS(c->bll_len + u));
+					c->bll_len += u;
+					bll_collapse(r, c);
+					found = True;
+					break;
+				}
+			}
+
+			if (found == False) {
+				dprintk(" ERROR Expected to find"
+					" %Lu(f):%Lu(l):%Lu(s), but didn't\n",
+					_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+					_2SECTS(b->bll_soff));
+				list_for_each_entry(c, &r->blr_layouts, bll_list)
+					print_bll(c, "Cached");
+				BUG();
+			}
+		} else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
+
+			c = list_first_entry(&r->blr_layouts,
+					     struct pnfs_blocklayout_layout,
+					     bll_list);
+			if (b->bll_foff < c->bll_foff) {
+				/*
+				 * Special case where new entry is before
+				 * first cached entry.
+				 */
+				c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL);
+				if (!c)
+					return -ENOMEM;
+				list_add(&c->bll_list, &r->blr_layouts);
+				dprintk(" new entry at head of list at %Lu, "
+					"len %Lu\n",
+					_2SECTS(c->bll_foff), _2SECTS(c->bll_len));
+			} else {
+				list_for_each_entry(c, &r->blr_layouts,
+						    bll_list) {
+					n = list_entry(c->bll_list.next,
+						       struct pnfs_blocklayout_layout,
+						       bll_list);
+					/*
+					 * This is ugly, but can't think of
+					 * another way to examine this case.
+					 * Consider the following. Need to
+					 * add an entry which starts at 40
+					 * and the cache has the following
+					 * entries:
+					 *    Start   Length
+					 *     10        5
+					 *     30        5
+					 *     50        5
+					 * So, need to look and see if the new
+					 * entry starts after the current
+					 * cache, but before the next one.
+					 * There's a catch in that the next
+					 * entry might not be valid as it's
+					 * really just a pointer to the list
+					 * head.
+					 */
+					if (((b->bll_foff >= BLL_F_END(c)) &&
+					     (c->bll_list.next == &r->blr_layouts)) ||
+					    ((b->bll_foff >= BLL_F_END(c)) &&
+					     (b->bll_foff < n->bll_foff))) {
+
+						n = bll_alloc_dup(b,
+							BLOCK_LAYOUT_CACHE, NULL);
+						if (!n)
+							return -ENOMEM;
+						dprintk(" adding new %Lu:%Lu"
+							" after %Lu:%Lu\n",
+							_2SECTS(n->bll_foff),
+							_2SECTS(n->bll_len),
+							_2SECTS(c->bll_foff),
+							_2SECTS(c->bll_len));
+						list_add(&n->bll_list,
+							 &c->bll_list);
+						break;
+					}
+				}
+			}
+		}
+	}
+	dprintk("<-- %s\n", __func__);
+	return status;
+}
+
+static void
+layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in)
+{
+	struct pnfs_blocklayout_layout	*b,
+					*n;
+	u64				len;
+	struct nfsd4_layout_seg		seg = *seg_in;
+
+	dprintk("--> %s\n", __func__);
+	if (seg.length == NFS4_MAX_UINT64) {
+		r->blr_recalled = 0;
+		dprintk(" Fast return of all layouts\n");
+		while (!list_empty(&r->blr_layouts)) {
+			b = list_entry(r->blr_layouts.next,
+				       struct pnfs_blocklayout_layout, bll_list);
+			dprintk(" foff %Lu, len %Lu, soff %Lu\n",
+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len),
+				_2SECTS(b->bll_soff));
+			list_del(&b->bll_list);
+			kfree(b);
+		}
+		dprintk("<-- %s\n", __func__);
+		return;
+	}
+
+restart:
+	list_for_each_entry(b, &r->blr_layouts, bll_list) {
+		if (seg.offset == b->bll_foff) {
+			/*
+			 * This handles the following three cases:
+			 * (1) return layout matches entire cache layout
+			 * (2) return layout matches beginning portion of cache
+			 * (3) return layout matches entire cache layout and
+			 *     into next entry. Varies from #1 in end case.
+			 */
+			dprintk(" match on offsets, %Lu:%Lu\n",
+				_2SECTS(seg.offset), _2SECTS(seg.length));
+			len = MIN(seg.length, b->bll_len);
+			b->bll_foff += len;
+			b->bll_soff += len;
+			b->bll_len -= len;
+			seg.length -= len;
+			seg.offset += len;
+			if (!b->bll_len) {
+				list_del(&b->bll_list);
+				kfree(b);
+				dprintk(" removing cache line\n");
+				if (!seg.length) {
+					dprintk(" also finished\n");
+					goto complete;
+				}
+				/*
+				 * Since 'b' was freed we can't continue at the
+				 * next entry which is referenced as
+				 * b->bll_list.next by the list_for_each_entry
+				 * macro. Need to restart the loop.
+				 * TODO: Think about creating a dummy 'b' which
+				 * would keep list_for_each_entry() happy.
+				 */
+				goto restart;
+			}
+			if (!seg.length) {
+				dprintk(" finished, but cache line not empty\n");
+				goto complete;
+			}
+		} else if ((seg.offset >= b->bll_foff) &&
+			   (seg.offset < BLL_F_END(b))) {
+			/*
+			 * layout being returned is within this cache line.
+			 */
+			dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n",
+				_2SECTS(seg.offset), _2SECTS(seg.length),
+				_2SECTS(b->bll_foff), _2SECTS(b->bll_len));
+			BUG_ON(!seg.length);
+			if ((seg.offset + seg.length) >= BLL_F_END(b)) {
+				/*
+				 * Layout returned starts in the middle of
+				 * cache entry and just need to trim back
+				 * cache to shorter length.
+				 */
+				dprintk(" trim back cache line\n");
+				len = seg.offset - b->bll_foff;
+				seg.offset += b->bll_len - len;
+				seg.length -= b->bll_len - len;
+				b->bll_len = len;
+				if (!seg.length)
+					return;
+			} else {
+				/*
+				 * Need to split current cache layout because
+				 * chunk is being removed from the middle.
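+				 * e.g. returning 20-30 from a cached 0-100
+				 * extent trims the existing entry to 0-20 and
+				 * inserts a new entry covering 30-100.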
+ */ + dprintk(" split cache line\n"); + len = seg.offset + seg.length; + n = bll_alloc(len, + (b->bll_foff + b->bll_len) - len, + BLOCK_LAYOUT_CACHE, NULL); + n->bll_soff = b->bll_soff + len; + list_add(&n->bll_list, &b->bll_list); + b->bll_len = seg.offset - b->bll_foff; + return; + } + } + } +complete: + if (list_empty(&r->blr_layouts)) + r->blr_recalled = 0; + dprintk("<-- %s\n", __func__); +} + +/* + * layout_cache_fill_from_list -- fills from cache list + * + * NOTE: This routine was only seperated out from layout_cache_file_from() + * to reduce the indentation level which makes the code easier to read. + */ +static inline boolean_t +layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h, + struct nfsd4_layout_seg *seg) +{ + pnfs_blocklayout_layout_t *b, + *n; + enum pnfs_block_extent_state4 s; + + list_for_each_entry(b, &r->blr_layouts, bll_list) { + if (seg->offset < b->bll_foff) { + n = bll_alloc(seg->offset, + MIN(seg->length, b->bll_foff - seg->offset), + BLOCK_LAYOUT_NEW, NULL); + if (!n) + return False; + + list_add(&n->bll_list, h->prev); + dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n", + _2SECTS(n->bll_foff), _2SECTS(n->bll_len), + _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); + seg->offset += n->bll_len; + seg->length -= n->bll_len; + if (!seg->length) + break; + } + + if ((seg->offset >= b->bll_foff) && + (seg->offset < BLL_F_END(b))) { + if (layout_conflict(b, seg->iomode, &s) == False) { + dprintk(" CONFLICT FOUND: " + "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n", + _2SECTS(b->bll_foff), _2SECTS(b->bll_len), + _2SECTS(b->bll_soff), b->bll_es, + seg->iomode); + return False; + } + n = bll_alloc(seg->offset, + MIN(seg->length, BLL_F_END(b) - seg->offset), + BLOCK_LAYOUT_CACHE, h); + dprintk(" CACHE hit: Found %Lu(f):%Lu(l): " + "in %Lu(f):%Lu(l):%Lu(s):%d\n", + _2SECTS(n->bll_foff), _2SECTS(n->bll_len), + _2SECTS(b->bll_foff), _2SECTS(b->bll_len), + _2SECTS(b->bll_soff), b->bll_es); + if (!n) + return False; + + n->bll_soff = b->bll_soff + seg->offset - b->bll_foff; + n->bll_vol_id.sbid = 0; + n->bll_vol_id.devid = b->bll_vol_id.devid; + n->bll_es = s; + seg->offset += n->bll_len; + seg->length -= n->bll_len; + if (!seg->length) + break; + } + } + return True; +} + +static u64 +bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length, + dev_t dev) +{ + pnfs_blocklayout_layout_t *n; + + n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates); + if (!n) + return 0; + n->bll_es = PNFS_BLOCK_NONE_DATA; + n->bll_vol_id.sbid = 0; + n->bll_vol_id.devid = dev; + + return n->bll_len; +} + +static void +extents_setup(struct fiemap_extent_info *fei) +{ + fei->fi_extents_start = NULL; +} + +/* + * extents_count -- Determine the number of extents for a given range. + * + * No need to call set_fs() here because the function + * doesn't use copy_to_user() if it's only counting + * the number of extents needed. + */ +static void +extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len) +{ + dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len)); + fei->fi_flags = FIEMAP_FLAG_SYNC; + fei->fi_extents_max = 0; + fei->fi_extents_start = NULL; + fei->fi_extents_mapped = 0; + i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1); +} + +/* + * extents_get -- Get list of extents for range + * + * extents_count() must have been called before this routine such that + * fi_extents_mapped is known. 
+ */
+static boolean_t
+extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
+{
+	int			m_space,
+				rval;
+	struct fiemap_extent	*fe;
+	mm_segment_t		old_fs = get_fs();
+
+	/*
+	 * Now malloc the correct amount of space
+	 * needed. It's possible for the file to have changed
+	 * between calls which would require more space for
+	 * the extents. If that occurs the last extent will
+	 * not have FIEMAP_EXTENT_LAST set and the error will
+	 * be caught in extents_process().
+	 */
+	m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
+	fe = kmalloc(m_space, GFP_KERNEL);
+	if (!fe)
+		return False;
+	memset(fe, 0, m_space);
+
+	fei->fi_extents_max	= fei->fi_extents_mapped;
+	fei->fi_extents_mapped	= 0;
+	fei->fi_extents_start	= fe;
+
+	set_fs(KERNEL_DS);
+	rval = i->i_op->fiemap(i, fei, foff, len +
+			       (1 << i->i_sb->s_blocksize_bits) - 1);
+	set_fs(old_fs);
+
+	if (rval || !fei->fi_extents_mapped) {
+		dprintk(" No extents. Wanted %d, got %d\n",
+			fei->fi_extents_max, fei->fi_extents_mapped);
+		kfree(fe);
+		fei->fi_extents_start = NULL;
+		return False;
+	} else
+		return True;
+}
+
+/*
+ * extents_process -- runs through the extents returned from the file system
+ * and creates block layout entries.
+ */
+static boolean_t
+extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
+		struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
+{
+	struct fiemap_extent		*fep,
+					*fep_last = NULL;
+	int				i;
+	pnfs_blocklayout_layout_t	*n;
+	u64				last_end,
+					rval;
+
+	dprintk("--> %s\n", __func__);
+	for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
+	     i++, fep++) {
+
+		BUG_ON(!fep->fe_physical);
+		/*
+		 * Deal with corner cases of hole-y files.
+		 */
+		if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
+				 fep->fe_logical)) {
+
+			/*
+			 * If the last extent doesn't end logically
+			 * at the beginning of the current we've got a
+			 * hole and need to create a pNFS extent.
+			 */
+			dprintk(" Got a hole at %Ld:%Ld\n",
+				_2SECTS(fep_last->fe_logical),
+				_2SECTS(fep_last->fe_length));
+			last_end = fep_last->fe_logical + fep_last->fe_length;
+			rval = bll_alloc_holey(bl_candidates, last_end,
+					       fep->fe_logical - last_end, dev);
+			if (!rval)
+				return False;
+			seg->length += rval;
+		}
+
+		n = bll_alloc(fep->fe_logical, fep->fe_length,
+			      BLOCK_LAYOUT_NEW, bl_candidates);
+		if (unlikely(n == NULL)) {
+			dprintk("%s: bll_alloc failed\n", __func__);
+			return False;
+		}
+
+		n->bll_soff = fep->fe_physical;
+		n->bll_es = seg->iomode == IOMODE_READ ?
+			PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA;
+		n->bll_vol_id.sbid = 0;
+		n->bll_vol_id.devid = dev;
+		seg->length += fep->fe_length;
+		print_bll(n, "New extent");
+		fep_last = fep;
+	}
+	dprintk("<-- %s (i=%d)\n", __func__, i);
+
+	return True;
+}
+
+static void
+extents_cleanup(struct fiemap_extent_info *fei)
+{
+	if (fei->fi_extents_start) {
+		kfree(fei->fi_extents_start);
+		fei->fi_extents_start = NULL;
+	}
+}
+
+/*
+ * device_slice -- check to see if device is a slice or DM
+ */
+static boolean_t
+device_slice(dev_t devid)
+{
+	struct block_device	*bd = open_by_devnum(devid, FMODE_READ);
+	boolean_t		rval = False;
+
+	if (bd) {
+		if (bd->bd_disk->minors > 1)
+			rval = True;
+		blkdev_put(bd, FMODE_READ);
+	}
+	return rval;
+}
+
+/*
+ * device_dm -- check to see if device is a Device Mapper volume.
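+ *
+ * The device type is not probed locally; the question is forwarded to the
+ * user-space pNFS block daemon via a PNFS_UPCALL_MSG_DMCHK upcall.
+ *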
+ * Returns True for DM or False if not.
+ */
+static boolean_t
+device_dm(dev_t devid)
+{
+	boolean_t	rval = False;
+	bl_comm_msg_t	msg;
+	bl_comm_res_t	*res;
+
+	msg.msg_type = PNFS_UPCALL_MSG_DMCHK;
+	msg.u.msg_dev = devid;
+	if (bl_upcall(bl_comm_global, &msg, &res)) {
+		dprintk("Failed upcall to check on DM status\n");
+		return False;
+	}
+	if (res->u.dm_vol) {
+		rval = True;
+		dprintk("Device is DM volume\n");
+	} else
+		dprintk("Device is not DM volume\n");
+	kfree(res);
+
+	return rval;
+}
+
+static boolean_t
+layout_inode_add(struct inode *i, bl_layout_rec_t **p)
+{
+	bl_layout_rec_t	*r = NULL;
+
+	if (!i->i_op->fiemap || !i->i_op->fallocate) {
+		printk("pNFS: file system doesn't support required fiemap or "
+		       "fallocate methods\n");
+		return False;
+	}
+
+	r = kmalloc(sizeof (*r), GFP_KERNEL);
+	if (!r)
+		goto error;
+
+	r->blr_rdev = i->i_sb->s_dev;
+	r->blr_inode = i;
+	r->blr_orig_size = i->i_size;
+	r->blr_ext_size = 0;
+	r->blr_recalled = 0;
+	INIT_LIST_HEAD(&r->blr_layouts);
+	spin_lock_init(&r->blr_lock);
+	spin_lock(&layout_hashtbl_lock);
+	list_add_tail(&r->blr_hash, &layout_hash);
+	spin_unlock(&layout_hashtbl_lock);
+	*p = r;
+	return True;
+
+error:
+	if (r)
+		kfree(r);
+	return False;
+}
+
+static bl_layout_rec_t *
+__layout_inode_find(struct inode *i)
+{
+	bl_layout_rec_t	*r;
+
+	if (!list_empty(&layout_hash)) {
+		list_for_each_entry(r, &layout_hash, blr_hash) {
+			if ((r->blr_inode->i_ino == i->i_ino) &&
+			    (r->blr_rdev == i->i_sb->s_dev)) {
+				return r;
+			}
+		}
+	}
+	return NULL;
+}
+
+static bl_layout_rec_t *
+layout_inode_find(struct inode *i)
+{
+	bl_layout_rec_t	*r;
+
+	spin_lock(&layout_hashtbl_lock);
+	r = __layout_inode_find(i);
+	spin_unlock(&layout_hashtbl_lock);
+
+	return r;
+}
+
+static void
+layout_inode_del(struct inode *i)
+{
+	bl_layout_rec_t	*r;
+
+	spin_lock(&layout_hashtbl_lock);
+	r = __layout_inode_find(i);
+	if (r) {
+		spin_lock(&r->blr_lock);
+		if (list_empty(&r->blr_layouts)) {
+			list_del(&r->blr_hash);
+			spin_unlock(&r->blr_lock);
+			kfree(r);
+		} else {
+			spin_unlock(&r->blr_lock);
+		}
+	} else {
+		dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n",
+			__func__, i->i_sb->s_dev, i->i_ino);
+	}
+	spin_unlock(&layout_hashtbl_lock);
+}
+
+/*
+ * map_state2name -- converts a state into an ASCII string.
+ *
+ * Used for debug messages only.
+ */ +static char * +map_state2name(enum pnfs_block_extent_state4 s) +{ + switch (s) { + case PNFS_BLOCK_READWRITE_DATA: return " RW"; + case PNFS_BLOCK_READ_DATA: return " RO"; + case PNFS_BLOCK_INVALID_DATA: return "INVALID"; + case PNFS_BLOCK_NONE_DATA: return " NONE"; + default: + BUG(); + } +} + +static pnfs_blocklayout_devinfo_t * +bld_alloc(struct list_head *volumes, int type) +{ + pnfs_blocklayout_devinfo_t *bld; + + bld = kmalloc(sizeof (*bld), GFP_KERNEL); + if (!bld) + return NULL; + + memset(bld, 0, sizeof (*bld)); + bld->bld_type = type; + list_add_tail(&bld->bld_list, volumes); + + return bld; +} + +static void +bld_free(pnfs_blocklayout_devinfo_t *bld) +{ + list_del(&bld->bld_list); + kfree(bld); +} + +static void +print_bll(pnfs_blocklayout_layout_t *b, char *text) +{ + dprintk(" BLL: %s\n", text); + dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", + _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), + map_state2name(b->bll_es)); +} + +static inline void +bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) +{ + pnfs_blocklayout_layout_t *n; + int dbg_count = 0; + u64 endpoint; + + BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); + while (c->bll_list.next != &r->blr_layouts) { + n = list_entry(c->bll_list.next, + struct pnfs_blocklayout_layout, bll_list); + endpoint = BLL_S_END(c); + if ((n->bll_soff >= c->bll_soff) && + (n->bll_soff < endpoint)) { + if (endpoint < BLL_S_END(n)) { + /* + * The following is possible. + * + * + * Existing: +---+ +---+ + * New: +-----------------------+ + * The client request merge entries together + * but didn't require picking up all of the + * last entry. So, we still need to delete + * the last entry and add the remaining space + * to the new entry. + */ + c->bll_len += BLL_S_END(n) - endpoint; + } + dbg_count++; + list_del(&n->bll_list); + kfree(n); + } else { + break; + } + } + /* ---- Debug only, remove before integration ---- */ + if (dbg_count) + dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", + dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); +} + +static pnfs_blocklayout_layout_t * +bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) +{ + pnfs_blocklayout_layout_t *n = NULL; + + n = kmalloc(sizeof (*n), GFP_KERNEL); + if (n) { + memset(n, 0, sizeof (*n)); + n->bll_foff = offset; + n->bll_len = len; + n->bll_cache_state = state; + if (h) + list_add_tail(&n->bll_list, h); + } + return n; +} + +static pnfs_blocklayout_layout_t * +bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, + struct list_head *h) +{ + pnfs_blocklayout_layout_t *n = NULL; + + n = bll_alloc(b->bll_foff, b->bll_len, c, h); + if (n) { + n->bll_es = b->bll_es; + n->bll_soff = b->bll_soff; + n->bll_vol_id.devid = b->bll_vol_id.devid; + } + return n; +} + +static inline boolean_t +layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, + enum pnfs_block_extent_state4 *s) +{ + /* ---- Normal case ---- */ + *s = b->bll_es; + + switch (b->bll_es) { + case PNFS_BLOCK_READWRITE_DATA: + if (iomode == IOMODE_READ) + *s = PNFS_BLOCK_READ_DATA; + /* ---- Any use is permitted. ---- */ + break; + case PNFS_BLOCK_READ_DATA: + /* ---- Committed as read only data. ---- */ + if (iomode == IOMODE_RW) + return False; + break; + case PNFS_BLOCK_INVALID_DATA: + /* ---- Blocks have been allocated, but not initialized ---- */ + if (iomode == IOMODE_READ) + *s = PNFS_BLOCK_NONE_DATA; + break; + case PNFS_BLOCK_NONE_DATA: + /* ---- Hole-y file. No backing store avail. 
---- */ + if (iomode != IOMODE_READ) + return False; + break; + default: + BUG(); + } + return True; +} + +#endif /* CONFIG_SPNFS_BLOCK */ diff -up linux-2.6.34.noarch/fs/nfs/delegation.c.orig linux-2.6.34.noarch/fs/nfs/delegation.c --- linux-2.6.34.noarch/fs/nfs/delegation.c.orig 2010-09-30 10:15:17.729711000 -0400 +++ linux-2.6.34.noarch/fs/nfs/delegation.c 2010-09-30 10:17:08.609991000 -0400 @@ -104,7 +104,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) continue; - if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) + if (memcmp(state->stateid.u.data, stateid->u.data, + sizeof(state->stateid.u.data)) != 0) continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); @@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct if (delegation != NULL) { spin_lock(&delegation->lock); if (delegation->inode != NULL) { - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); + memcpy(delegation->stateid.u.data, res->delegation.u.data, + sizeof(delegation->stateid.u.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; oldcred = delegation->cred; @@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach if (delegation == NULL) goto nomatch; spin_lock(&delegation->lock); - if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, - sizeof(delegation->stateid.data)) != 0) + if (stateid != NULL && memcmp(delegation->stateid.u.data, + stateid->u.data, + sizeof(delegation->stateid.u.data)) != 0) goto nomatch_unlock; list_del_rcu(&delegation->super_list); delegation->inode = NULL; @@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod delegation = kmalloc(sizeof(*delegation), GFP_NOFS); if (delegation == NULL) return -ENOMEM; - memcpy(delegation->stateid.data, res->delegation.data, - sizeof(delegation->stateid.data)); + memcpy(delegation->stateid.u.data, res->delegation.u.data, + sizeof(delegation->stateid.u.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; delegation->change_attr = nfsi->change_attr; @@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations /* * Asynchronous delegation recall! 
*/ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (!validate_stateid(delegation, stateid)) { + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } @@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); if (delegation != NULL) { - memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); + memcpy(dst->u.data, delegation->stateid.u.data, + sizeof(dst->u.data)); ret = 1; } rcu_read_unlock(); diff -up linux-2.6.34.noarch/fs/nfs/delegation.h.orig linux-2.6.34.noarch/fs/nfs/delegation.h --- linux-2.6.34.noarch/fs/nfs/delegation.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/delegation.h 2010-09-30 10:17:08.615000000 -0400 @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff -up linux-2.6.34.noarch/fs/nfsd/export.c.orig linux-2.6.34.noarch/fs/nfsd/export.c --- linux-2.6.34.noarch/fs/nfsd/export.c.orig 2010-09-30 10:15:18.314726000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/export.c 2010-09-30 10:17:08.834999000 -0400 @@ -17,11 +17,19 @@ #include #include +#include +#if defined(CONFIG_SPNFS) +#include +#if defined(CONFIG_SPNFS_BLOCK) +#include +#endif +#endif #include #include #include "nfsd.h" #include "nfsfh.h" +#include "pnfsd.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); } +#if defined(CONFIG_PNFSD) +static struct pnfsd_cb_operations pnfsd_cb_op = { + .cb_layout_recall = nfsd_layout_recall_cb, + .cb_device_notify = nfsd_device_notify_cb, + + .cb_get_state = nfs4_pnfs_cb_get_state, + .cb_change_state = nfs4_pnfs_cb_change_state, +}; + +#if defined(CONFIG_SPNFS) +static struct pnfs_export_operations spnfs_export_ops = { + .layout_type = spnfs_layout_type, + .get_device_info = spnfs_getdeviceinfo, + .get_device_iter = spnfs_getdeviceiter, + .layout_get = spnfs_layoutget, + .layout_return = spnfs_layoutreturn, +}; + +static struct pnfs_export_operations spnfs_ds_export_ops = { + .get_state = spnfs_get_state, +}; + +#if defined(CONFIG_SPNFS_BLOCK) +static struct pnfs_export_operations bl_export_ops = { + .layout_type = bl_layout_type, + .get_device_info = bl_getdeviceinfo, + .get_device_iter = bl_getdeviceiter, + .layout_get = bl_layoutget, + .layout_return = bl_layoutreturn, +}; +#endif /* CONFIG_SPNFS_BLOCK */ +#endif /* CONFIG_SPNFS */ 
+#endif /* CONFIG_PNFSD */ + static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -395,6 +437,47 @@ static int check_export(struct inode *in return -EINVAL; } +#if !defined(CONFIG_SPNFS) + if (inode->i_sb->s_pnfs_op && + (!inode->i_sb->s_pnfs_op->layout_type || + !inode->i_sb->s_pnfs_op->get_device_info || + !inode->i_sb->s_pnfs_op->layout_get)) { + dprintk("exp_export: export of invalid fs pnfs export ops.\n"); + return -EINVAL; + } +#endif /* CONFIG_SPNFS */ + +#if defined(CONFIG_PNFSD_LOCAL_EXPORT) + if (!inode->i_sb->s_pnfs_op) + pnfsd_lexp_init(inode); + return 0; +#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ + +#if defined(CONFIG_SPNFS) +#if defined(CONFIG_SPNFS_BLOCK) + if (pnfs_block_enabled(inode, *flags)) { + dprintk("set pnfs block export structure... \n"); + inode->i_sb->s_pnfs_op = &bl_export_ops; + } else +#endif /* CONFIG_SPNFS_BLOCK */ + /* + * spnfs_enabled() indicates we're an MDS. + * XXX Better to check an export time option as well. + */ + if (spnfs_enabled()) { + dprintk("set spnfs export structure...\n"); + inode->i_sb->s_pnfs_op = &spnfs_export_ops; + } else { + dprintk("%s spnfs not in use\n", __func__); + + /* + * get_state is needed if we're a DS using spnfs. + * XXX Better to check an export time option instead. + */ + inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; + } +#endif /* CONFIG_SPNFS */ + return 0; } @@ -586,6 +669,8 @@ static int svc_export_parse(struct cache if (exp.ex_uuid == NULL) err = -ENOMEM; } + } else if (strcmp(buf, "pnfs") == 0) { + exp.ex_pnfs = 1; } else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else @@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi seq_printf(m, "%02x", exp->ex_uuid[i]); } } + if (exp->ex_pnfs) + seq_puts(m, ",pnfs"); show_secinfo(m, exp); } seq_puts(m, ")\n"); @@ -687,6 +774,7 @@ static void svc_export_init(struct cache new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_pnfs = 0; } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -699,6 +787,7 @@ static void export_update(struct cache_h new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_pnfs = item->ex_pnfs; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_pathname = item->ex_pathname; @@ -1635,8 +1724,17 @@ nfsd_export_init(void) if (rv) return rv; rv = cache_register(&svc_expkey_cache); - if (rv) + if (rv) { cache_unregister(&svc_export_cache); + goto out; + } +#if defined(CONFIG_PNFSD) + spin_lock(&pnfsd_cb_ctl.lock); + pnfsd_cb_ctl.module = THIS_MODULE; + pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; + spin_unlock(&pnfsd_cb_ctl.lock); +#endif /* CONFIG_PNFSD */ +out: return rv; } @@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) exp_writelock(); +#if defined(CONFIG_PNFSD) + spin_lock(&pnfsd_cb_ctl.lock); + pnfsd_cb_ctl.module = NULL; + pnfsd_cb_ctl.cb_op = NULL; + spin_unlock(&pnfsd_cb_ctl.lock); +#endif /* CONFIG_PNFSD */ cache_unregister(&svc_expkey_cache); cache_unregister(&svc_export_cache); svcauth_unix_purge(); diff -up linux-2.6.34.noarch/fs/nfs/direct.c.orig linux-2.6.34.noarch/fs/nfs/direct.c --- linux-2.6.34.noarch/fs/nfs/direct.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/direct.c 2010-09-30 10:17:08.620991000 -0400 @@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea .rpc_release = nfs_direct_read_release, }; +static long 
nfs_direct_read_execute(struct nfs_read_data *data, + struct rpc_task_setup *task_setup_data, + struct rpc_message *msg) +{ + struct inode *inode = data->inode; + struct rpc_task *task; + + nfs_fattr_init(&data->fattr); + msg->rpc_argp = &data->args; + msg->rpc_resp = &data->res; + + task_setup_data->task = &data->task; + task_setup_data->callback_data = data; + NFS_PROTO(inode)->read_setup(data, msg); + + task = rpc_run_task(task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct read call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + return 0; +} + /* * For each rsize'd chunk of the user's buffer, dispatch an NFS READ * operation. If nfs_readdata_alloc() or get_user_pages() fails, @@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; size_t rsize = NFS_SERVER(inode)->rsize; - struct rpc_task *task; struct rpc_message msg = { .rpc_cred = ctx->cred, }; @@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ data->res.fattr = &data->fattr; data->res.eof = 0; data->res.count = bytes; - nfs_fattr_init(&data->fattr); - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - NFS_PROTO(inode)->read_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - break; - rpc_put_task(task); - - dprintk("NFS: %5u initiated direct read call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); + if (nfs_direct_read_execute(data, &task_setup_data, &msg)) + break; started += bytes; user_addr += bytes; @@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static long nfs_direct_write_execute(struct nfs_write_data *data, + struct rpc_task_setup *task_setup_data, + struct rpc_message *msg); + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct inode *inode = dreq->inode; struct list_head *p; struct nfs_write_data *data; - struct rpc_task *task; struct rpc_message msg = { .rpc_cred = dreq->ctx->cred, }; @@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( * Reuse data->task; data->args should not have changed * since the original request was sent. */ - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - /* - * We're called via an RPC callback, so BKL is already held. 
- */ - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); - - dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - data->args.count, - (unsigned long long)data->args.offset); + nfs_direct_write_execute(data, &task_setup_data, &msg); } if (put_dreq(dreq)) @@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com .rpc_release = nfs_direct_commit_release, }; +static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, + struct nfs_write_data *data, + struct rpc_task_setup *task_setup_data, + struct rpc_message *msg) +{ + struct rpc_task *task; + + NFS_PROTO(data->inode)->commit_setup(data, msg); + + /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ + dreq->commit_data = NULL; + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + + rpc_put_task(task); + return 0; +} + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, @@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - NFS_PROTO(data->inode)->commit_setup(data, &msg); - - /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ - dreq->commit_data = NULL; - - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); - - task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); } static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) @@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri .rpc_release = nfs_direct_write_release, }; +static long nfs_direct_write_execute(struct nfs_write_data *data, + struct rpc_task_setup *task_setup_data, + struct rpc_message *msg) +{ + struct inode *inode = data->inode; + struct rpc_task *task; + + task_setup_data->task = &data->task; + task_setup_data->callback_data = data; + msg->rpc_argp = &data->args; + msg->rpc_resp = &data->res; + NFS_PROTO(inode)->write_setup(data, msg); + + task = rpc_run_task(task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + + rpc_put_task(task); + + dprintk("NFS: %5u initiated direct write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + return 0; +} + /* * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, @@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule struct inode *inode = ctx->path.dentry->d_inode; unsigned long user_addr = (unsigned long)iov->iov_base; size_t count = iov->iov_len; - struct rpc_task *task; struct rpc_message msg = { .rpc_cred = ctx->cred, }; @@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - task_setup_data.task = &data->task; - task_setup_data.callback_data = data; - msg.rpc_argp = &data->args; - msg.rpc_resp = &data->res; - NFS_PROTO(inode)->write_setup(data, &msg); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - break; - rpc_put_task(task); - - dprintk("NFS: %5u initiated direct write call " - "(req %s/%Ld, %zu bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - bytes, - (unsigned long long)data->args.offset); + if (nfs_direct_write_execute(data, &task_setup_data, &msg)) + break; started += bytes; user_addr += bytes; diff -up linux-2.6.34.noarch/fs/nfsd/Kconfig.orig linux-2.6.34.noarch/fs/nfsd/Kconfig --- linux-2.6.34.noarch/fs/nfsd/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/Kconfig 2010-09-30 10:17:08.815000000 -0400 @@ -79,3 +79,52 @@ config NFSD_V4 available from http://linux-nfs.org/. If unsure, say N. + +config PNFSD + bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" + depends on NFSD_V4 && EXPERIMENTAL + select EXPORTFS_FILE_LAYOUT + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) + in the kernel's NFS server. + + Unless you're an NFS developer, say N. + +config PNFSD_LOCAL_EXPORT + bool "Enable pNFS support for exporting local filesystems for debugging purposes" + depends on PNFSD + help + Say Y here if you want your pNFS server to export local file systems + over the files layout type. With this option the MDS (metadata + server) functions also as a single DS (data server). This is mostly + useful for development and debugging purposes. + + If unsure, say N. + +config SPNFS + bool "Provide spNFS server support (EXPERIMENTAL)" + depends on PNFSD + select RPCSEC_GSS_KRB5 + help + Say Y here if you want spNFS server support. + + If unsure, say N. + +config SPNFS_LAYOUTSEGMENTS + bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" + depends on SPNFS + select RPCSEC_GSS_KRB5 + help + Say Y here if you want spNFS to be able to return layout segments. + + If unsure, say N. + +config SPNFS_BLOCK + bool "Provide Block Layout server support (EXPERIMENTAL)" + depends on SPNFS + select EXPORTFS_BLOCK_LAYOUT + help + Say Y here if you want spNFS block layout support + + If unsure, say N. 
diff -up linux-2.6.34.noarch/fs/nfsd/Makefile.orig linux-2.6.34.noarch/fs/nfsd/Makefile --- linux-2.6.34.noarch/fs/nfsd/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/Makefile 2010-09-30 10:17:08.820000000 -0400 @@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o +nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o +nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o +nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o diff -up linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4callback.c --- linux-2.6.34.noarch/fs/nfsd/nfs4callback.c.orig 2010-09-30 10:15:18.320728000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4callback.c 2010-09-30 10:17:08.841998000 -0400 @@ -40,7 +40,6 @@ #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 -#define NFS4_STATEID_SIZE 16 /* Index of predefined Linux callback client operations */ @@ -48,11 +47,17 @@ enum { NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_SEQUENCE, +#if defined(CONFIG_PNFSD) + NFSPROC4_CLNT_CB_LAYOUT, + NFSPROC4_CLNT_CB_DEVICE, +#endif }; enum nfs_cb_opnum4 { OP_CB_RECALL = 4, + OP_CB_LAYOUT = 5, OP_CB_SEQUENCE = 11, + OP_CB_DEVICE = 14, }; #define NFS4_MAXTAGLEN 20 @@ -78,6 +83,19 @@ enum nfs_cb_opnum4 { #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 3 + \ + enc_nfs4_fh_sz + 4) +#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) +#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 6) +#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) /* * Generic encode routines from fs/nfs/nfs4xdr.c @@ -94,6 +112,10 @@ xdr_writemem(__be32 *p, const void *ptr, } #define WRITE32(n) *p++ = htonl(n) +#define WRITE64(n) do { \ + *p++ = htonl((u32)((n) >> 32)); \ + *p++ = htonl((u32)(n)); \ +} while (0) #define WRITEMEM(ptr,nbytes) do { \ p = xdr_writemem(p, ptr, nbytes); \ } while (0) @@ -204,6 +226,16 @@ nfs_cb_stat_to_errno(int stat) */ static void +encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +{ + __be32 *p; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); +} + +static void encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { __be32 * p; @@ -228,10 +260,10 @@ encode_cb_recall(struct xdr_stream *xdr, __be32 *p; int len = dp->dl_fh.fh_size; - RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); + RESERVE_SPACE(4); WRITE32(OP_CB_RECALL); - WRITE32(dp->dl_stateid.si_generation); - WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); + encode_stateid(xdr, &dp->dl_stateid); + RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); WRITE32(0); /* truncate optimization not implemented */ WRITE32(len); WRITEMEM(&dp->dl_fh.fh_base, len); @@ -259,6 +291,111 @@ encode_cb_sequence(struct xdr_stream *xd hdr->nops++; } +#if defined(CONFIG_PNFSD) + +#include "pnfsd.h" + +static void +encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 *p; + + BUG_ON(hdr->minorversion == 0); + + RESERVE_SPACE(20); + WRITE32(OP_CB_LAYOUT); + 
WRITE32(clr->cb.cbl_seg.layout_type); + WRITE32(clr->cb.cbl_seg.iomode); + WRITE32(clr->cb.cbl_layoutchanged); + WRITE32(clr->cb.cbl_recall_type); + if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { + struct nfs4_fsid fsid = clr->cb.cbl_fsid; + + RESERVE_SPACE(16); + WRITE64(fsid.major); + WRITE64(fsid.minor); + dprintk("%s: type %x iomode %d changed %d recall_type %d " + "fsid 0x%llx-0x%llx\n", + __func__, clr->cb.cbl_seg.layout_type, + clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, + clr->cb.cbl_recall_type, fsid.major, fsid.minor); + } else if (clr->cb.cbl_recall_type == RETURN_FILE) { + int len = clr->clr_file->fi_fhlen; + stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; + + RESERVE_SPACE(20 + len); + WRITE32(len); + WRITEMEM(clr->clr_file->fi_fhval, len); + WRITE64(clr->cb.cbl_seg.offset); + WRITE64(clr->cb.cbl_seg.length); + encode_stateid(xdr, cbl_sid); + dprintk("%s: type %x iomode %d changed %d recall_type %d " + "offset %lld length %lld stateid " STATEID_FMT "\n", + __func__, clr->cb.cbl_seg.layout_type, + clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, + clr->cb.cbl_recall_type, + clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, + STATEID_VAL(cbl_sid)); + } else { + dprintk("%s: type %x iomode %d changed %d recall_type %d\n", + __func__, clr->cb.cbl_seg.layout_type, + clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, + clr->cb.cbl_recall_type); + } + hdr->nops++; +} + +static void +encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, + struct nfs4_cb_compound_hdr *hdr) +{ + u32 *p; + int i; + int len = nd->nd_list->cbd_len; + struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; + + dprintk("NFSD %s: --> num %d\n", __func__, len); + + BUG_ON(hdr->minorversion == 0); + + RESERVE_SPACE(8); + WRITE32(OP_CB_DEVICE); + + /* notify4 cnda_changes<>; */ + WRITE32(len); + for (i = 0; i < len; i++) { + dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", + __func__, cbd[i].cbd_notify_type, + cbd[i].cbd_layout_type, + cbd[i].cbd_devid.sbid, + cbd[i].cbd_devid.devid, + cbd[i].cbd_immediate, i); + + BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && + cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); + RESERVE_SPACE(32); + /* bitmap4 notify_mask; */ + WRITE32(1); + WRITE32(cbd[i].cbd_notify_type); + /* opaque notify_vals<>; */ + if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) + WRITE32(24); + else + WRITE32(20); + WRITE32(cbd[i].cbd_layout_type); + WRITE64(cbd[i].cbd_devid.sbid); + WRITE64(cbd[i].cbd_devid.devid); + + if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { + RESERVE_SPACE(4); + WRITE32(cbd[i].cbd_immediate); + } + } + hdr->nops++; +} +#endif /* CONFIG_PNFSD */ + static int nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) { @@ -288,6 +425,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * return 0; } +#if defined(CONFIG_PNFSD) +static int +nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, + struct nfs4_rpc_args *rpc_args) +{ + struct xdr_stream xdr; + struct nfs4_layoutrecall *args = rpc_args->args_op; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = rpc_args->args_seq.cbs_minorversion, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_cb_compound_hdr(&xdr, &hdr); + encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); + encode_cb_layout(&xdr, args, &hdr); + encode_cb_nops(&hdr); + return 0; +} + +static int +nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, + struct nfs4_rpc_args *rpc_args) +{ + struct xdr_stream xdr; + struct nfs4_notify_device *args = rpc_args->args_op; 
+ struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = rpc_args->args_seq.cbs_minorversion, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_cb_compound_hdr(&xdr, &hdr); + encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); + encode_cb_device(&xdr, args, &hdr); + encode_cb_nops(&hdr); + return 0; +} +#endif /* CONFIG_PNFSD */ static int decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ @@ -403,6 +579,48 @@ out: return status; } +#if defined(CONFIG_PNFSD) +static int +nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, + struct nfsd4_cb_sequence *seq) +{ + struct xdr_stream xdr; + struct nfs4_cb_compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_cb_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_cb_sequence(&xdr, seq, rqstp); + if (status) + goto out; + status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); +out: + return status; +} + +static int +nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, + struct nfsd4_cb_sequence *seq) +{ + struct xdr_stream xdr; + struct nfs4_cb_compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_cb_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_cb_sequence(&xdr, seq, rqstp); + if (status) + goto out; + status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); +out: + return status; +} +#endif /* CONFIG_PNFSD */ + /* * RPC procedure tables */ @@ -420,6 +638,10 @@ out: static struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), +#if defined(CONFIG_PNFSD) + PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), + PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), +#endif }; static struct rpc_version nfs_cb_version4 = { @@ -606,10 +828,9 @@ out: * TODO: cb_sequence should support referring call lists, cachethis, multiple * slots, and mark callback channel down on communication errors. */ -static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) +static void nfsd4_cb_prepare_sequence(struct rpc_task *task, + struct nfs4_client *clp) { - struct nfs4_delegation *dp = calldata; - struct nfs4_client *clp = dp->dl_client; struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; u32 minorversion = clp->cl_cb_conn.cb_minorversion; int status = 0; @@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ rpc_call_start(task); } -static void nfsd4_cb_done(struct rpc_task *task, void *calldata) +static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) { struct nfs4_delegation *dp = calldata; - struct nfs4_client *clp = dp->dl_client; + nfsd4_cb_prepare_sequence(task, dp->dl_client); +} +static void nfsd4_cb_done_sequence(struct rpc_task *task, + struct nfs4_client *clp) +{ dprintk("%s: minorversion=%d\n", __func__, clp->cl_cb_conn.cb_minorversion); @@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct struct nfs4_client *clp = dp->dl_client; struct rpc_clnt *current_rpc_client = clp->cl_cb_client; - nfsd4_cb_done(task, calldata); + nfsd4_cb_done_sequence(task, clp); if (current_rpc_client == NULL) { /* We're shutting down; give up. 
 */
@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct
 	if (dp->dl_retries--) {
 		rpc_delay(task, 2*HZ);
 		task->tk_status = 0;
-		rpc_restart_call(task);
+		rpc_restart_call_prepare(task);
 		return;
 	} else {
 		atomic_set(&clp->cl_cb_set, 0);
@@ -704,7 +929,7 @@ }
 
 static const struct rpc_call_ops nfsd4_cb_recall_ops = {
-	.rpc_call_prepare = nfsd4_cb_prepare,
+	.rpc_call_prepare = nfsd4_cb_recall_prepare,
 	.rpc_call_done = nfsd4_cb_recall_done,
 	.rpc_release = nfsd4_cb_recall_release,
 };
@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat
 {
 	queue_work(callback_wq, &dp->dl_recall.cb_work);
 }
+
+#if defined(CONFIG_PNFSD)
+static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutrecall *clr = calldata;
+	nfsd4_cb_prepare_sequence(task, clr->clr_client);
+}
+
+static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutrecall *clr = calldata;
+	struct nfs4_client *clp = clr->clr_client;
+
+	nfsd4_cb_done_sequence(task, clp);
+
+	if (!task->tk_status)
+		return;
+
+	printk("%s: clp %p cb_client %p fp %p failed with status %d\n",
+		__func__,
+		clp,
+		clp->cl_cb_client,
+		clr->clr_file,
+		task->tk_status);
+
+	switch (task->tk_status) {
+	case -EIO:
+		/* Network partition? */
+		atomic_set(&clp->cl_cb_set, 0);
+		warn_no_callback_path(clp, task->tk_status);
+		/* FIXME:
+		 * The pNFS standard states that we should only expire the
+		 * client after at least a "lease time", e.g. lease-time * 2,
+		 * when failing to communicate a recall.
+		 */
+		break;
+	case -NFS4ERR_DELAY:
+		/* Poll the client until it's done with the layout */
+		rpc_delay(task, HZ/100); /* 10 milliseconds */
+		task->tk_status = 0;
+		rpc_restart_call_prepare(task);
+		break;
+	case -NFS4ERR_NOMATCHING_LAYOUT:
+		task->tk_status = 0;
+		nomatching_layout(clr);
+	}
+}
+
+static void nfsd4_cb_layout_release(void *calldata)
+{
+	struct nfs4_layoutrecall *clr = calldata;
+	kfree(clr->clr_args);
+	clr->clr_args = NULL;
+	put_layoutrecall(clr);
+}
+
+static const struct rpc_call_ops nfsd4_cb_layout_ops = {
+	.rpc_call_prepare = nfsd4_cb_layout_prepare,
+	.rpc_call_done = nfsd4_cb_layout_done,
+	.rpc_release = nfsd4_cb_layout_release,
+};
+
+/*
+ * Called with state lock.
+ */
+int
+nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
+{
+	struct nfs4_client *clp = clr->clr_client;
+	struct rpc_clnt *clnt = clp->cl_cb_client;
+	struct nfs4_rpc_args *args;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
+		.rpc_cred = callback_cred
+	};
+	int status;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args) {
+		status = -ENOMEM;
+		goto out;
+	}
+	clr->clr_args = args;
+	args->args_op = clr;
+	msg.rpc_argp = args;
+	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
+				&nfsd4_cb_layout_ops, clr);
+out:
+	if (status) {
+		kfree(args);
+		put_layoutrecall(clr);
+	}
+	dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
+	return status;
+}
+
+static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_notify_device *cbnd = calldata;
+	nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
+}
+
+static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_notify_device *cbnd = calldata;
+	struct nfs4_client *clp = cbnd->nd_client;
+
+	nfsd4_cb_done_sequence(task, clp);
+
+	dprintk("%s: clp %p cb_client %p: status %d\n",
+		__func__,
+		clp,
+		clp->cl_cb_client,
+		task->tk_status);
+
+	if (task->tk_status == -EIO) {
+		/* Network partition?
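+		 * If so, mark the callback path down, exactly as in the
+		 * CB_LAYOUTRECALL -EIO case above.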
*/ + atomic_set(&clp->cl_cb_set, 0); + warn_no_callback_path(clp, task->tk_status); + } +} + +static void nfsd4_cb_device_release(void *calldata) +{ + struct nfs4_notify_device *cbnd = calldata; + kfree(cbnd->nd_args); + cbnd->nd_args = NULL; + kfree(cbnd); +} + +static const struct rpc_call_ops nfsd4_cb_device_ops = { + .rpc_call_prepare = nfsd4_cb_device_prepare, + .rpc_call_done = nfsd4_cb_device_done, + .rpc_release = nfsd4_cb_device_release, +}; + +/* + * Called with state lock. + */ +int +nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) +{ + struct nfs4_client *clp = cbnd->nd_client; + struct rpc_clnt *clnt = clp->cl_cb_client; + struct nfs4_rpc_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], + .rpc_cred = callback_cred + }; + int status = -EIO; + + dprintk("%s: clp %p\n", __func__, clp); + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) { + status = -ENOMEM; + goto out; + } + args->args_op = cbnd; + msg.rpc_argp = args; + + status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, + &nfsd4_cb_device_ops, cbnd); +out: + if (status) + kfree(args); + dprintk("%s: status %d\n", __func__, status); + return status; +} +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-09-30 10:17:08.845997000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsd.c 2010-09-30 10:17:08.863998000 -0400 @@ -0,0 +1,1679 @@ +/****************************************************************************** + * + * (c) 2007 Network Appliance, Inc. All Rights Reserved. + * (c) 2009 NetApp. All Rights Reserved. + * + * NetApp provides this source code under the GPL v2 License. + * The GPL v2 license is available at + * http://opensource.org/licenses/gpl-license.php. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + *****************************************************************************/ + +#include "pnfsd.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +static u32 current_layoutid = 1; + +/* + * Currently used for manipulating the layout state. 
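+ * All of the layout, layout-state and layoutrecall lists below are
+ * walked and modified under this one lock (see BUG_ON_UNLOCKED_LAYOUT).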
+ */
+static DEFINE_SPINLOCK(layout_lock);
+
+#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
+# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
+#else
+# define BUG_ON_UNLOCKED_LAYOUT()
+#endif
+
+/*
+ * Layout state - NFSv4.1 pNFS
+ */
+static struct kmem_cache *pnfs_layout_slab;
+static struct kmem_cache *pnfs_layoutrecall_slab;
+
+/* hash table for nfsd4_pnfs_deviceid.sbid */
+#define SBID_HASH_BITS	8
+#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
+#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
+
+struct sbid_tracker {
+	u64 id;
+	struct super_block *sb;
+	struct list_head hash;
+};
+
+static u64 current_sbid;
+static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
+
+static inline unsigned long
+sbid_hashval(struct super_block *sb)
+{
+	return hash_ptr(sb, SBID_HASH_BITS);
+}
+
+static inline struct sbid_tracker *
+alloc_sbid(void)
+{
+	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
+}
+
+static void
+destroy_sbid(struct sbid_tracker *sbid)
+{
+	spin_lock(&layout_lock);
+	list_del(&sbid->hash);
+	spin_unlock(&layout_lock);
+	kfree(sbid);
+}
+
+void
+nfsd4_free_pnfs_slabs(void)
+{
+	int i;
+	struct sbid_tracker *sbid;
+
+	nfsd4_free_slab(&pnfs_layout_slab);
+	nfsd4_free_slab(&pnfs_layoutrecall_slab);
+
+	for (i = 0; i < SBID_HASH_SIZE; i++) {
+		while (!list_empty(&sbid_hashtbl[i])) {
+			sbid = list_first_entry(&sbid_hashtbl[i],
+						struct sbid_tracker,
+						hash);
+			destroy_sbid(sbid);
+		}
+	}
+}
+
+int
+nfsd4_init_pnfs_slabs(void)
+{
+	int i;
+
+	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
+			sizeof(struct nfs4_layout), 0, 0, NULL);
+	if (pnfs_layout_slab == NULL)
+		return -ENOMEM;
+	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
+			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
+	if (pnfs_layoutrecall_slab == NULL)
+		return -ENOMEM;
+
+	for (i = 0; i < SBID_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&sbid_hashtbl[i]);
+	}
+
+	return 0;
+}
+
+/* XXX: Need to implement the notify types and track which
+ * clients have which devices. */
+void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
+{
+	struct nfs4_client *clp;
+
+	dprintk("%s: -->\n", __func__);
+
+	nfs4_lock_state();
+	/* Indicate that client has a device so we can only notify
+	 * the correct clients */
+	clp = find_confirmed_client(clid);
+	if (clp) {
+		atomic_inc(&clp->cl_deviceref);
+		dprintk("%s: Incr device count (clnt %p) to %d\n",
+			__func__, clp, atomic_read(&clp->cl_deviceref));
+	}
+	nfs4_unlock_state();
+}
+
+/* Clear notifications for this client
+ * XXX: Do we need to loop through and clean up all
+ * krefs when nfsd cleans up the client?
+ */
+void pnfs_clear_device_notify(struct nfs4_client *clp)
+{
+	atomic_dec(&clp->cl_deviceref);
+	dprintk("%s: Decr device count (clnt %p) to %d\n",
+		__func__, clp, atomic_read(&clp->cl_deviceref));
+}
+
+static struct nfs4_layout_state *
+alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp,
+			stateid_t *stateid)
+{
+	struct nfs4_layout_state *new;
+
+	/* FIXME: use a kmem_cache */
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return new;
+	get_nfs4_file(fp);
+	INIT_LIST_HEAD(&new->ls_perfile);
+	INIT_LIST_HEAD(&new->ls_layouts);
+	kref_init(&new->ls_ref);
+	new->ls_client = clp;
+	new->ls_file = fp;
+	new->ls_stateid.si_boot = stateid->si_boot;
+	new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */
+	new->ls_stateid.si_generation = 1;
+	spin_lock(&layout_lock);
+	new->ls_stateid.si_fileid = current_layoutid++;
+	list_add(&new->ls_perfile, &fp->fi_layout_states);
+	spin_unlock(&layout_lock);
+	return new;
+}
+
+static inline void
+get_layout_state(struct nfs4_layout_state *ls)
+{
+	kref_get(&ls->ls_ref);
+}
+
+static void
+destroy_layout_state_common(struct nfs4_layout_state *ls)
+{
+	struct nfs4_file *fp = ls->ls_file;
+
+	dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp,
+		ls->ls_client);
+	BUG_ON(!list_empty(&ls->ls_layouts));
+	kfree(ls);
+	put_nfs4_file(fp);
+}
+
+static void
+destroy_layout_state(struct kref *kref)
+{
+	struct nfs4_layout_state *ls =
+			container_of(kref, struct nfs4_layout_state, ls_ref);
+
+	spin_lock(&layout_lock);
+	list_del(&ls->ls_perfile);
+	spin_unlock(&layout_lock);
+	destroy_layout_state_common(ls);
+}
+
+static void
+destroy_layout_state_locked(struct kref *kref)
+{
+	struct nfs4_layout_state *ls =
+			container_of(kref, struct nfs4_layout_state, ls_ref);
+
+	list_del(&ls->ls_perfile);
+	destroy_layout_state_common(ls);
+}
+
+static inline void
+put_layout_state(struct nfs4_layout_state *ls)
+{
+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
+		atomic_read(&ls->ls_ref.refcount));
+	kref_put(&ls->ls_ref, destroy_layout_state);
+}
+
+static inline void
+put_layout_state_locked(struct nfs4_layout_state *ls)
+{
+	dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls,
+		atomic_read(&ls->ls_ref.refcount));
+	kref_put(&ls->ls_ref, destroy_layout_state_locked);
+}
+
+/*
+ * Search the fp->fi_layout_states list for a layout state with the clientid.
+ * If not found, then this is a 'first open/delegation/lock stateid' from
+ * the client for this file.
+ * Called under the layout_lock.
+ */
+static struct nfs4_layout_state *
+find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_layout_state *ls;
+
+	BUG_ON_UNLOCKED_LAYOUT();
+	list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
+		if (ls->ls_client == clp) {
+			dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
+				__func__, ls,
+				atomic_read(&ls->ls_ref.refcount));
+			get_layout_state(ls);
+			return ls;
+		}
+	}
+	return NULL;
+}
+
+static __be32
+verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
+{
+	struct nfs4_stateid *local = NULL;
+	struct nfs4_delegation *temp = NULL;
+
+	/* check if open or lock stateid */
+	local = find_stateid(stateid, RD_STATE);
+	if (local)
+		return 0;
+	temp = find_delegation_stateid(fp->fi_inode, stateid);
+	if (temp)
+		return 0;
+	return nfserr_bad_stateid;
+}
+
+/*
+ * nfs4_process_layout_stateid()
+ *
+ * We have looked up the nfs4_file corresponding to the current_fh, and
+ * confirmed the clientid.
Pull the few tests from nfs4_preprocess_stateid_op() + * that make sense with a layout stateid. + * + * Called with the state_lock held + * Returns zero and stateid is updated, or error. + * + * Note: the struct nfs4_layout_state pointer is only set by layoutget. + */ +static __be32 +nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp, + stateid_t *stateid, struct nfs4_layout_state **lsp) +{ + struct nfs4_layout_state *ls = NULL; + __be32 status = 0; + + dprintk("--> %s clp %p fp %p \n", __func__, clp, fp); + + dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(stateid)); + + status = nfs4_check_stateid(stateid); + if (status) + goto out; + + /* Is this the first use of this layout ? */ + spin_lock(&layout_lock); + ls = find_get_layout_state(clp, fp); + spin_unlock(&layout_lock); + if (!ls) { + /* Only alloc layout state on layoutget (which sets lsp). */ + if (!lsp) { + dprintk("%s ERROR: Not layoutget & no layout stateid\n", + __func__); + status = nfserr_bad_stateid; + goto out; + } + dprintk("%s Initial stateid for layout: file %p client %p\n", + __func__, fp, clp); + + /* verify input stateid */ + status = verify_stateid(fp, stateid); + if (status) { + dprintk("%s ERROR: invalid open/deleg/lock stateid\n", + __func__); + goto out; + } + ls = alloc_init_layout_state(clp, fp, stateid); + if (!ls) { + dprintk("%s pNFS ERROR: no memory for layout state\n", + __func__); + status = nfserr_resource; + goto out; + } + } else { + dprintk("%s Not initial stateid. Layout state %p file %p\n", + __func__, ls, fp); + + /* BAD STATEID */ + status = nfserr_bad_stateid; + if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque, + sizeof(stateid_opaque_t)) != 0) { + + /* if a LAYOUTGET operation and stateid is a valid + * open/deleg/lock stateid, accept it as a parallel + * initial layout stateid + */ + if (lsp && ((verify_stateid(fp, stateid)) == 0)) { + dprintk("%s parallel initial layout state\n", + __func__); + goto update; + } + + dprintk("%s ERROR bad opaque in stateid 1\n", __func__); + goto out_put; + } + + /* stateid is a valid layout stateid for this file. 
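+		 * Reject a stateid generation newer than the one on record;
+		 * an older or equal generation is accepted and the stored
+		 * stateid is bumped by update_stateid() below.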
*/ + if (stateid->si_generation > ls->ls_stateid.si_generation) { + dprintk("%s bad stateid 1\n", __func__); + goto out_put; + } +update: + update_stateid(&ls->ls_stateid); + dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", + __func__, ls->ls_stateid.si_generation, ls); + } + status = 0; + /* Set the stateid to be encoded */ + memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); + + /* Return the layout state if requested */ + if (lsp) { + get_layout_state(ls); + *lsp = ls; + } + dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, + STATEID_VAL(&ls->ls_stateid)); +out_put: + dprintk("%s PUT LO STATE:\n", __func__); + put_layout_state(ls); +out: + dprintk("<-- %s status %d\n", __func__, htonl(status)); + + return status; +} + +static inline struct nfs4_layout * +alloc_layout(void) +{ + return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); +} + +static inline void +free_layout(struct nfs4_layout *lp) +{ + kmem_cache_free(pnfs_layout_slab, lp); +} + +static void +init_layout(struct nfs4_layout_state *ls, + struct nfs4_layout *lp, + struct nfs4_file *fp, + struct nfs4_client *clp, + struct svc_fh *current_fh, + struct nfsd4_layout_seg *seg) +{ + dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, + ls, lp, clp, fp, fp->fi_inode); + + get_nfs4_file(fp); + lp->lo_client = clp; + lp->lo_file = fp; + get_layout_state(ls); + lp->lo_state = ls; + memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); + spin_lock(&layout_lock); + list_add_tail(&lp->lo_perstate, &ls->ls_layouts); + list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); + list_add_tail(&lp->lo_perfile, &fp->fi_layouts); + spin_unlock(&layout_lock); + dprintk("pNFS %s end\n", __func__); +} + +static void +dequeue_layout(struct nfs4_layout *lp) +{ + BUG_ON_UNLOCKED_LAYOUT(); + list_del(&lp->lo_perclnt); + list_del(&lp->lo_perfile); + list_del(&lp->lo_perstate); +} + +static void +destroy_layout(struct nfs4_layout *lp) +{ + struct nfs4_client *clp; + struct nfs4_file *fp; + struct nfs4_layout_state *ls; + + BUG_ON_UNLOCKED_LAYOUT(); + clp = lp->lo_client; + fp = lp->lo_file; + ls = lp->lo_state; + dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", + __func__, lp, clp, fp, fp->fi_inode, + list_empty(&ls->ls_layouts)); + + kmem_cache_free(pnfs_layout_slab, lp); + /* release references taken by init_layout */ + put_layout_state_locked(ls); + put_nfs4_file(fp); +} + +void fs_layout_return(struct super_block *sb, struct inode *ino, + struct nfsd4_pnfs_layoutreturn *lrp, int flags, + void *recall_cookie) +{ + int ret; + + if (unlikely(!sb->s_pnfs_op->layout_return)) + return; + + lrp->lr_flags = flags; + lrp->args.lr_cookie = recall_cookie; + + if (!ino) /* FSID or ALL */ + ino = sb->s_root->d_inode; + + ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); + dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " + "cookie = %p flags 0x%x status=%d\n", + __func__, ino->i_ino, lrp->args.lr_seg.iomode, + lrp->args.lr_seg.offset, lrp->args.lr_seg.length, + recall_cookie, flags, ret); +} + +static u64 +alloc_init_sbid(struct super_block *sb) +{ + struct sbid_tracker *sbid; + struct sbid_tracker *new = alloc_sbid(); + unsigned long hash_idx = sbid_hashval(sb); + u64 id = 0; + + if (likely(new)) { + spin_lock(&layout_lock); + id = ++current_sbid; + new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); + id = new->id; + BUG_ON(id == 0); + new->sb = sb; + + list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) + if (sbid->sb == sb) { + kfree(new); + id = sbid->id; + spin_unlock(&layout_lock); + 
return id; + } + list_add(&new->hash, &sbid_hashtbl[hash_idx]); + spin_unlock(&layout_lock); + } + return id; +} + +struct super_block * +find_sbid_id(u64 id) +{ + struct sbid_tracker *sbid; + struct super_block *sb = NULL; + unsigned long hash_idx = id & SBID_HASH_MASK; + int pos = 0; + + spin_lock(&layout_lock); + list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { + pos++; + if (sbid->id != id) + continue; + if (pos > 1) + list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); + sb = sbid->sb; + break; + } + spin_unlock(&layout_lock); + return sb; +} + +u64 +find_create_sbid(struct super_block *sb) +{ + struct sbid_tracker *sbid; + unsigned long hash_idx = sbid_hashval(sb); + int pos = 0; + u64 id = 0; + + spin_lock(&layout_lock); + list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { + pos++; + if (sbid->sb != sb) + continue; + if (pos > 1) + list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); + id = sbid->id; + break; + } + spin_unlock(&layout_lock); + + if (!id) + id = alloc_init_sbid(sb); + + return id; +} + +/* + * Create a layoutrecall structure + * An optional layoutrecall can be cloned (except for the layoutrecall lists) + */ +static struct nfs4_layoutrecall * +alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, + struct nfs4_client *clp, + struct nfs4_file *lrfile) +{ + struct nfs4_layoutrecall *clr; + + dprintk("NFSD %s\n", __func__); + clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); + if (clr == NULL) + return clr; + + dprintk("NFSD %s -->\n", __func__); + + memset(clr, 0, sizeof(*clr)); + if (lrfile) + get_nfs4_file(lrfile); + clr->clr_client = clp; + clr->clr_file = lrfile; + clr->cb = *cbl; + + kref_init(&clr->clr_ref); + INIT_LIST_HEAD(&clr->clr_perclnt); + + dprintk("NFSD %s return %p\n", __func__, clr); + return clr; +} + +static void +get_layoutrecall(struct nfs4_layoutrecall *clr) +{ + dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, + atomic_read(&clr->clr_ref.refcount)); + kref_get(&clr->clr_ref); +} + +static void +destroy_layoutrecall(struct kref *kref) +{ + struct nfs4_layoutrecall *clr = + container_of(kref, struct nfs4_layoutrecall, clr_ref); + dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, + clr->clr_file, clr->clr_client); + BUG_ON(!list_empty(&clr->clr_perclnt)); + if (clr->clr_file) + put_nfs4_file(clr->clr_file); + kmem_cache_free(pnfs_layoutrecall_slab, clr); +} + +int +put_layoutrecall(struct nfs4_layoutrecall *clr) +{ + dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, + atomic_read(&clr->clr_ref.refcount)); + return kref_put(&clr->clr_ref, destroy_layoutrecall); +} + +void * +layoutrecall_done(struct nfs4_layoutrecall *clr) +{ + void *recall_cookie = clr->cb.cbl_cookie; + struct nfs4_layoutrecall *parent = clr->parent; + + dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, + atomic_read(&clr->clr_ref.refcount)); + BUG_ON_UNLOCKED_LAYOUT(); + list_del_init(&clr->clr_perclnt); + put_layoutrecall(clr); + + if (parent && !put_layoutrecall(parent)) + recall_cookie = NULL; + + return recall_cookie; +} + +/* + * get_state() and cb_get_state() are + */ +void +release_pnfs_ds_dev_list(struct nfs4_stateid *stp) +{ + struct pnfs_ds_dev_entry *ddp; + + while (!list_empty(&stp->st_pnfs_ds_id)) { + ddp = list_entry(stp->st_pnfs_ds_id.next, + struct pnfs_ds_dev_entry, dd_dev_entry); + list_del(&ddp->dd_dev_entry); + kfree(ddp); + } +} + +static int +nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) +{ + struct pnfs_ds_dev_entry *ddp; + + ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); + if (!ddp) + return -ENOMEM; + + 
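+	/*
+	 * Remember the DS device id on this stateid; the entries are freed
+	 * again by release_pnfs_ds_dev_list() above.
+	 */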
INIT_LIST_HEAD(&ddp->dd_dev_entry); + list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); + ddp->dd_dsid = dsid; + return 0; +} + +/* + * are two octet ranges overlapping? + * start1 last1 + * |-----------------| + * start2 last2 + * |----------------| + */ +static inline int +lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) +{ + u64 start1 = l1->offset; + u64 last1 = last_byte_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 last2 = last_byte_offset(start2, l2->length); + int ret; + + /* if last1 == start2 there's a single byte overlap */ + ret = (last2 >= start1) && (last1 >= start2); + dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, + l1->offset, l1->length, l2->offset, l2->length, ret); + return ret; +} + +static inline int +same_fsid_major(struct nfs4_fsid *fsid, u64 major) +{ + return fsid->major == major; +} + +static inline int +same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) +{ + return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); +} + +/* + * find a layout recall conflicting with the specified layoutget + */ +static int +is_layout_recalled(struct nfs4_client *clp, + struct svc_fh *current_fh, + struct nfsd4_layout_seg *seg) +{ + struct nfs4_layoutrecall *clr; + + spin_lock(&layout_lock); + list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { + if (clr->cb.cbl_seg.layout_type != seg->layout_type) + continue; + if (clr->cb.cbl_recall_type == RETURN_ALL) + goto found; + if (clr->cb.cbl_recall_type == RETURN_FSID) { + if (same_fsid(&clr->cb.cbl_fsid, current_fh)) + goto found; + else + continue; + } + BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); + if (clr->cb.cbl_seg.clientid == seg->clientid && + lo_seg_overlapping(&clr->cb.cbl_seg, seg)) + goto found; + } + spin_unlock(&layout_lock); + return 0; +found: + spin_unlock(&layout_lock); + return 1; +} + +/* + * are two octet ranges overlapping or adjacent? + */ +static inline int +lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + /* is end1 == start2 ranges are adjacent */ + return (end2 >= start1) && (end1 >= start2); +} + +static void +extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) +{ + u64 lo_start = lo->offset; + u64 lo_end = end_offset(lo_start, lo->length); + u64 lg_start = lg->offset; + u64 lg_end = end_offset(lg_start, lg->length); + + /* lo already covers lg? */ + if (lo_start <= lg_start && lg_end <= lo_end) + return; + + /* extend start offset */ + if (lo_start > lg_start) + lo_start = lg_start; + + /* extend end offset */ + if (lo_end < lg_end) + lo_end = lg_end; + + lo->offset = lo_start; + lo->length = (lo_end == NFS4_MAX_UINT64) ? 
+		lo_end : lo_end - lo_start;
+}
+
+static struct nfs4_layout *
+merge_layout(struct nfs4_file *fp,
+	     struct nfs4_client *clp,
+	     struct nfsd4_layout_seg *seg)
+{
+	struct nfs4_layout *lp, *found = NULL;
+
+	spin_lock(&layout_lock);
+	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
+		if (lp->lo_seg.layout_type == seg->layout_type &&
+		    lp->lo_seg.clientid == seg->clientid &&
+		    lp->lo_seg.iomode == seg->iomode &&
+		    lo_seg_mergeable(&lp->lo_seg, seg)) {
+			extend_layout(&lp->lo_seg, seg);
+			found = lp;
+			break;
+		}
+	spin_unlock(&layout_lock);
+
+	/* NULL when nothing was mergeable (note that the list cursor is
+	 * not NULL after a full walk, hence the separate 'found') */
+	return found;
+}
+
+__be32
+nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
+		     struct exp_xdr_stream *xdr)
+{
+	u32 status;
+	__be32 nfserr;
+	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
+	struct super_block *sb = ino->i_sb;
+	int can_merge;
+	struct nfs4_file *fp;
+	struct nfs4_client *clp;
+	struct nfs4_layout *lp = NULL;
+	struct nfs4_layout_state *ls = NULL;
+	struct nfsd4_pnfs_layoutget_arg args = {
+		.lg_minlength = lgp->lg_minlength,
+		.lg_fh = &lgp->lg_fhp->fh_handle,
+	};
+	struct nfsd4_pnfs_layoutget_res res = {
+		.lg_seg = lgp->lg_seg,
+	};
+
+	dprintk("NFSD: %s Begin\n", __func__);
+
+	args.lg_sbid = find_create_sbid(sb);
+	if (!args.lg_sbid) {
+		nfserr = nfserr_layouttrylater;
+		goto out;
+	}
+
+	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
+		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
+
+	nfs4_lock_state();
+	fp = find_alloc_file(ino, lgp->lg_fhp);
+	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
+	dprintk("pNFS %s: fp %p clp %p\n", __func__, fp, clp);
+	if (!fp || !clp) {
+		nfserr = nfserr_inval;
+		goto out_unlock;
+	}
+
+	/* Check decoded layout stateid */
+	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
+	if (nfserr)
+		goto out_unlock;
+
+	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
+		nfserr = nfserr_recallconflict;
+		/* must drop the state lock and references taken above */
+		goto out_unlock;
+	}
+
+	/* pre-alloc layout in case we can't merge after we call
+	 * the file system
+	 */
+	lp = alloc_layout();
+	if (!lp) {
+		nfserr = nfserr_layouttrylater;
+		goto out_unlock;
+	}
+
+	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
+		"iomode %u offset %llu length %llu\n",
+		__func__, lgp->lg_seg.layout_type,
+		exp_xdr_qbytes(xdr->end - xdr->p),
+		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
+
+	/* FIXME: need to eliminate the use of the state lock */
+	nfs4_unlock_state();
+	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
+	nfs4_lock_state();
+
+	dprintk("pNFS %s: post-export status %u "
+		"iomode %u offset %llu length %llu\n",
+		__func__, status, res.lg_seg.iomode,
+		res.lg_seg.offset, res.lg_seg.length);
+
+	/*
+	 * The allowable error codes for the layout_get pNFS export
+	 * operations vector function (from the file system) can be
+	 * expanded as needed to include other errors defined for
+	 * the RFC 5661 LAYOUTGET operation.
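+	 * Any status that is not mapped in the switch below indicates a
+	 * broken filesystem return value, hence the BUG() in the default
+	 * case.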
+ */ + switch (status) { + case 0: + nfserr = NFS4_OK; + break; + case NFS4ERR_ACCESS: + case NFS4ERR_BADIOMODE: + /* No support for LAYOUTIOMODE4_RW layouts */ + case NFS4ERR_BADLAYOUT: + /* No layout matching loga_minlength rules */ + case NFS4ERR_INVAL: + case NFS4ERR_IO: + case NFS4ERR_LAYOUTTRYLATER: + case NFS4ERR_LAYOUTUNAVAILABLE: + case NFS4ERR_LOCKED: + case NFS4ERR_NOSPC: + case NFS4ERR_RECALLCONFLICT: + case NFS4ERR_SERVERFAULT: + case NFS4ERR_TOOSMALL: + /* Requested layout too big for loga_maxcount */ + case NFS4ERR_WRONG_TYPE: + /* Not a regular file */ + nfserr = cpu_to_be32(status); + goto out_freelayout; + default: + BUG(); + nfserr = nfserr_serverfault; + } + + lgp->lg_seg = res.lg_seg; + lgp->lg_roc = res.lg_return_on_close; + + /* SUCCESS! + * Can the new layout be merged into an existing one? + * If so, free unused layout struct + */ + if (can_merge && merge_layout(fp, clp, &res.lg_seg)) + goto out_freelayout; + + /* Can't merge, so let's initialize this new layout */ + init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); +out_unlock: + if (ls) + put_layout_state(ls); + if (fp) + put_nfs4_file(fp); + nfs4_unlock_state(); +out: + dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, + be32_to_cpu(nfserr)); + return nfserr; +out_freelayout: + free_layout(lp); + goto out_unlock; +} + +static void +trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) +{ + u64 lo_start = lo->offset; + u64 lo_end = end_offset(lo_start, lo->length); + u64 lr_start = lr->offset; + u64 lr_end = end_offset(lr_start, lr->length); + + dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, + lo->offset, lo->length, lr->offset, lr->length); + + /* lr fully covers lo? */ + if (lr_start <= lo_start && lo_end <= lr_end) { + lo->length = 0; + goto out; + } + + /* + * split not supported yet. retain layout segment. + * remains must be returned by the client + * on the final layout return. + */ + if (lo_start < lr_start && lr_end < lo_end) { + dprintk("%s: split not supported\n", __func__); + goto out; + } + + if (lo_start < lr_start) + lo_end = lr_start - 1; + else /* lr_end < lo_end */ + lo_start = lr_end + 1; + + lo->offset = lo_start; + lo->length = (lo_end == NFS4_MAX_UINT64) ? 
lo_end : lo_end - lo_start; +out: + dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); +} + +static int +pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, + struct nfsd4_pnfs_layoutreturn *lrp) +{ + int layouts_found = 0; + struct nfs4_layout *lp, *nextlp; + + dprintk("%s: clp %p fp %p\n", __func__, clp, fp); + spin_lock(&layout_lock); + list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { + dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", + __func__, lp, + lp->lo_client, clp, + lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, + lp->lo_seg.iomode, lrp->args.lr_seg.iomode); + if (lp->lo_client != clp || + lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || + (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && + lrp->args.lr_seg.iomode != IOMODE_ANY) || + !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) + continue; + layouts_found++; + trim_layout(&lp->lo_seg, &lrp->args.lr_seg); + if (!lp->lo_seg.length) { + lrp->lrs_present = 0; + dequeue_layout(lp); + destroy_layout(lp); + } + } + spin_unlock(&layout_lock); + + return layouts_found; +} + +static int +pnfs_return_client_layouts(struct nfs4_client *clp, + struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) +{ + int layouts_found = 0; + struct nfs4_layout *lp, *nextlp; + + spin_lock(&layout_lock); + list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { + if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || + (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && + lrp->args.lr_seg.iomode != IOMODE_ANY)) + continue; + + if (lrp->args.lr_return_type == RETURN_FSID && + !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) + continue; + + layouts_found++; + dequeue_layout(lp); + destroy_layout(lp); + } + spin_unlock(&layout_lock); + + return layouts_found; +} + +static int +recall_return_perfect_match(struct nfs4_layoutrecall *clr, + struct nfsd4_pnfs_layoutreturn *lrp, + struct nfs4_file *fp, + struct svc_fh *current_fh) +{ + if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || + clr->cb.cbl_recall_type != lrp->args.lr_return_type) + return 0; + + return (clr->cb.cbl_recall_type == RETURN_FILE && + clr->clr_file == fp && + clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && + clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || + + (clr->cb.cbl_recall_type == RETURN_FSID && + same_fsid(&clr->cb.cbl_fsid, current_fh)) || + + clr->cb.cbl_recall_type == RETURN_ALL; +} + +static int +recall_return_partial_match(struct nfs4_layoutrecall *clr, + struct nfsd4_pnfs_layoutreturn *lrp, + struct nfs4_file *fp, + struct svc_fh *current_fh) +{ + /* iomode matching? */ + if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && + clr->cb.cbl_seg.iomode != IOMODE_ANY && + lrp->args.lr_seg.iomode != IOMODE_ANY) + return 0; + + if (clr->cb.cbl_recall_type == RETURN_ALL || + lrp->args.lr_return_type == RETURN_ALL) + return 1; + + /* fsid matches? */ + if (clr->cb.cbl_recall_type == RETURN_FSID || + lrp->args.lr_return_type == RETURN_FSID) + return same_fsid(&clr->cb.cbl_fsid, current_fh); + + /* file matches, range overlapping? 
*/ + return clr->clr_file == fp && + lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); +} + +int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, + struct nfsd4_pnfs_layoutreturn *lrp) +{ + int status = 0; + int layouts_found = 0; + struct inode *ino = current_fh->fh_dentry->d_inode; + struct nfs4_file *fp = NULL; + struct nfs4_client *clp; + struct nfs4_layoutrecall *clr, *nextclr; + u64 ex_fsid = current_fh->fh_export->ex_fsid; + void *recall_cookie = NULL; + + dprintk("NFSD: %s\n", __func__); + + nfs4_lock_state(); + clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); + if (!clp) + goto out; + + if (lrp->args.lr_return_type == RETURN_FILE) { + fp = find_file(ino); + if (!fp) { + printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " + "ino %p:%lu\n", + __func__, ino, ino ? ino->i_ino : 0L); + goto out; + } + + /* Check the stateid */ + dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); + status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, + NULL); + if (status) + goto out_put_file; + + /* update layouts */ + layouts_found = pnfs_return_file_layouts(clp, fp, lrp); + /* optimize for the all-empty case */ + if (list_empty(&fp->fi_layouts)) + recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; + } else { + layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); + } + + dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " + "return_type %d fsid 0x%llx offset %llu length %llu: " + "layouts_found %d\n", + __func__, clp, fp, lrp->args.lr_seg.layout_type, + lrp->args.lr_seg.iomode, lrp->args.lr_return_type, + ex_fsid, + lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); + + /* update layoutrecalls + * note: for RETURN_{FSID,ALL}, fp may be NULL + */ + spin_lock(&layout_lock); + list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, + clr_perclnt) { + if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type) + continue; + + if (recall_return_perfect_match(clr, lrp, fp, current_fh)) + recall_cookie = layoutrecall_done(clr); + else if (layouts_found && + recall_return_partial_match(clr, lrp, fp, current_fh)) + clr->clr_time = CURRENT_TIME; + } + spin_unlock(&layout_lock); + +out_put_file: + if (fp) + put_nfs4_file(fp); +out: + nfs4_unlock_state(); + + /* call exported filesystem layout_return (ignore return-code) */ + fs_layout_return(sb, ino, lrp, 0, recall_cookie); + + dprintk("pNFS %s: exit status %d \n", __func__, status); + return status; +} + +/* + * PNFS Metadata server export operations callback for get_state + * + * called by the cluster fs when it receives a get_state() from a data + * server. + * returns status, or pnfs_get_state* with pnfs_get_state->status set. + * + */ +int +nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) +{ + struct nfs4_stateid *stp; + int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ + int status = -EINVAL; + struct inode *ino; + struct nfs4_delegation *dl; + stateid_t *stid = (stateid_t *)&arg->stid; + + dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, + STATEID_VAL(stid), arg->ino); + + nfs4_lock_state(); + stp = find_stateid(stid, flags); + if (!stp) { + ino = iget_locked(sb, arg->ino); + if (!ino) + goto out; + + if (ino->i_state & I_NEW) { + iget_failed(ino); + goto out; + } + + dl = find_delegation_stateid(ino, stid); + if (dl) + status = 0; + + iput(ino); + } else { + /* XXX ANDROS: marc removed nfs4_check_fh - how come? 
*/ + + /* arg->devid is the Data server id, set by the cluster fs */ + status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); + if (status) + goto out; + + arg->access = stp->st_access_bmap; + *(clientid_t *)&arg->clid = + stp->st_stateowner->so_client->cl_clientid; + } +out: + nfs4_unlock_state(); + return status; +} + +static int +cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, + stateid_t *lsid) +{ + int found = 0; + struct nfs4_layout *lp; + struct nfs4_layout_state *ls; + + spin_lock(&layout_lock); + list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { + if (lp->lo_file != lrfile) + continue; + + ls = find_get_layout_state(clp, lrfile); + if (!ls) { + /* This shouldn't happen as the file should have a + * layout stateid if it has a layout. + */ + printk(KERN_ERR "%s: file %p has no layout stateid\n", + __func__, lrfile); + WARN_ON(1); + break; + } + update_stateid(&ls->ls_stateid); + memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); + put_layout_state_locked(ls); + found = 1; + break; + } + spin_unlock(&layout_lock); + + return found; +} + +static int +cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) +{ + int found = 0; + struct nfs4_layout *lp; + + /* note: minor version unused */ + spin_lock(&layout_lock); + list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) + if (lp->lo_file->fi_fsid.major == fsid->major) { + found = 1; + break; + } + spin_unlock(&layout_lock); + return found; +} + +static int +cl_has_any_layout(struct nfs4_client *clp) +{ + return !list_empty(&clp->cl_layouts); +} + +static int +cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, + struct nfs4_file *lrfile, stateid_t *lsid) +{ + switch (cbl->cbl_recall_type) { + case RETURN_FILE: + return cl_has_file_layout(clp, lrfile, lsid); + case RETURN_FSID: + return cl_has_fsid_layout(clp, &cbl->cbl_fsid); + default: + return cl_has_any_layout(clp); + } +} + +/* + * Called without the layout_lock. 
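+ * It simulates the layoutreturn the client would have sent: matching
+ * layouts are torn down and the recall cookie is handed back to the
+ * filesystem via fs_layout_return().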
+ */ +void +nomatching_layout(struct nfs4_layoutrecall *clr) +{ + struct nfsd4_pnfs_layoutreturn lr = { + .args.lr_return_type = clr->cb.cbl_recall_type, + .args.lr_seg = clr->cb.cbl_seg, + }; + struct inode *inode; + void *recall_cookie; + + if (clr->clr_file) { + inode = igrab(clr->clr_file->fi_inode); + if (WARN_ON(!inode)) + return; + } else { + inode = NULL; + } + + dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, + clr->clr_client, clr->clr_file); + + if (clr->cb.cbl_recall_type == RETURN_FILE) + pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); + else + pnfs_return_client_layouts(clr->clr_client, &lr, + clr->cb.cbl_fsid.major); + + spin_lock(&layout_lock); + recall_cookie = layoutrecall_done(clr); + spin_unlock(&layout_lock); + + fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, + recall_cookie); + iput(inode); +} + +void pnfs_expire_client(struct nfs4_client *clp) +{ + for (;;) { + struct nfs4_layoutrecall *lrp = NULL; + + spin_lock(&layout_lock); + if (!list_empty(&clp->cl_layoutrecalls)) { + lrp = list_entry(clp->cl_layoutrecalls.next, + struct nfs4_layoutrecall, clr_perclnt); + get_layoutrecall(lrp); + } + spin_unlock(&layout_lock); + if (!lrp) + break; + + dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); + BUG_ON(lrp->clr_client != clp); + nomatching_layout(lrp); + put_layoutrecall(lrp); + } + + for (;;) { + struct nfs4_layout *lp = NULL; + struct inode *inode = NULL; + struct nfsd4_pnfs_layoutreturn lr; + bool empty = false; + + spin_lock(&layout_lock); + if (!list_empty(&clp->cl_layouts)) { + lp = list_entry(clp->cl_layouts.next, + struct nfs4_layout, lo_perclnt); + inode = igrab(lp->lo_file->fi_inode); + memset(&lr, 0, sizeof(lr)); + lr.args.lr_return_type = RETURN_FILE; + lr.args.lr_seg = lp->lo_seg; + empty = list_empty(&lp->lo_file->fi_layouts); + BUG_ON(lp->lo_client != clp); + dequeue_layout(lp); + destroy_layout(lp); /* do not access lp after this */ + } + spin_unlock(&layout_lock); + if (!lp) + break; + + if (WARN_ON(!inode)) + break; + + dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, + lp, clp); + + fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, + empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); + iput(inode); + } +} + +struct create_recall_list_arg { + struct nfsd4_pnfs_cb_layout *cbl; + struct nfs4_file *lrfile; + struct list_head *todolist; + unsigned todo_count; +}; + +/* + * look for matching layout for the given client + * and add a pending layout recall to the todo list + * if found any. + * returns: + * 0 if layouts found or negative error. + */ +static int +lo_recall_per_client(struct nfs4_client *clp, void *p) +{ + stateid_t lsid; + struct nfs4_layoutrecall *pending; + struct create_recall_list_arg *arg = p; + + memset(&lsid, 0, sizeof(lsid)); + if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) + return 0; + + /* Matching put done by layoutreturn */ + pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); + /* out of memory, drain todo queue */ + if (!pending) + return -ENOMEM; + + *(stateid_t *)&pending->cb.cbl_sid = lsid; + list_add(&pending->clr_perclnt, arg->todolist); + arg->todo_count++; + return 0; +} + +/* Create a layoutrecall structure for each client based on the + * original structure. 
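+ * On return, @todolist carries one pending recall per matching client
+ * and *todo_len carries their count.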
+ */
+int
+create_layout_recall_list(struct list_head *todolist, unsigned *todo_len,
+			  struct nfsd4_pnfs_cb_layout *cbl,
+			  struct nfs4_file *lrfile)
+{
+	struct nfs4_client *clp;
+	struct create_recall_list_arg arg = {
+		.cbl = cbl,
+		.lrfile = lrfile,
+		.todolist = todolist,
+	};
+	int status = 0;
+
+	dprintk("%s: -->\n", __func__);
+
+	/* If client given by fs, just do single client */
+	if (cbl->cbl_seg.clientid) {
+		clp = find_confirmed_client(
+				(clientid_t *)&cbl->cbl_seg.clientid);
+		if (!clp) {
+			status = -ENOENT;
+			dprintk("%s: clientid %llx not found\n", __func__,
+				(unsigned long long)cbl->cbl_seg.clientid);
+			goto out;
+		}
+
+		status = lo_recall_per_client(clp, &arg);
+	} else {
+		/* Check all clients for layout matches */
+		status = filter_confirmed_clients(lo_recall_per_client, &arg);
+	}
+
+out:
+	*todo_len = arg.todo_count;
+	dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status);
+	return status;
+}
+
+/*
+ * Recall layouts asynchronously
+ * Called with state lock.
+ */
+static int
+spawn_layout_recall(struct super_block *sb, struct list_head *todolist,
+		    unsigned todo_len)
+{
+	struct nfs4_layoutrecall *pending;
+	struct nfs4_layoutrecall *parent = NULL;
+	int status = 0;
+
+	dprintk("%s: -->\n", __func__);
+
+	if (todo_len > 1) {
+		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
+				     clr_perclnt);
+
+		parent = alloc_init_layoutrecall(&pending->cb, NULL,
+						 pending->clr_file);
+		if (unlikely(!parent)) {
+			/* We want forward progress. If the parent cannot be
+			 * allocated, take the first one as parent but don't
+			 * execute it. Caller must check for -EAGAIN; if so,
+			 * when the partial recalls return,
+			 * nfsd_layout_recall_cb should be called again.
+			 */
+			list_del_init(&pending->clr_perclnt);
+			if (todo_len > 2) {
+				parent = pending;
+			} else {
+				parent = NULL;
+				put_layoutrecall(pending);
+			}
+			--todo_len;
+			status = -ENOMEM;
+		}
+	}
+
+	while (!list_empty(todolist)) {
+		pending = list_entry(todolist->next, struct nfs4_layoutrecall,
+				     clr_perclnt);
+		list_del_init(&pending->clr_perclnt);
+		dprintk("%s: clp %p cb_client %p fp %p\n", __func__,
+			pending->clr_client,
+			pending->clr_client->cl_cb_client,
+			pending->clr_file);
+		if (unlikely(!pending->clr_client->cl_cb_client)) {
+			printk(KERN_INFO
+			       "%s: clientid %08x/%08x has no callback path\n",
+			       __func__,
+			       pending->clr_client->cl_clientid.cl_boot,
+			       pending->clr_client->cl_clientid.cl_id);
+			put_layoutrecall(pending);
+			continue;
+		}
+
+		pending->clr_time = CURRENT_TIME;
+		pending->clr_sb = sb;
+		if (parent) {
+			/* If we created a parent its initial ref count is 1.
+			 * We will need to de-ref it eventually. So we just
+			 * don't increment on behalf of the last one.
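+			 * E.g. with three partial recalls the parent's count
+			 * reaches three (one from kref_init() plus the two
+			 * grabs below), so the put in layoutrecall_done() of
+			 * the last returning child releases it.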
+			 */
+			if (todo_len != 1)
+				get_layoutrecall(parent);
+		}
+		pending->parent = parent;
+		get_layoutrecall(pending);
+		/* Add to list so corresponding layoutreturn can find req */
+		list_add(&pending->clr_perclnt,
+			 &pending->clr_client->cl_layoutrecalls);
+
+		nfsd4_cb_layout(pending);
+		--todo_len;
+	}
+
+	return status;
+}
+
+/*
+ * Spawn a thread to perform a layout recall
+ */
+int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
+			  struct nfsd4_pnfs_cb_layout *cbl)
+{
+	int status;
+	struct nfs4_file *lrfile = NULL;
+	struct list_head todolist;
+	unsigned todo_len = 0;
+
+	dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
+	BUG_ON(!cbl);
+	BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
+	       cbl->cbl_recall_type != RETURN_FSID &&
+	       cbl->cbl_recall_type != RETURN_ALL);
+	BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
+	BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
+	       cbl->cbl_seg.iomode != IOMODE_RW &&
+	       cbl->cbl_seg.iomode != IOMODE_ANY);
+
+	if (nfsd_serv == NULL) {
+		dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
+		return -ENOENT;
+	}
+
+	nfs4_lock_state();
+	status = -ENOENT;
+	if (inode) {
+		lrfile = find_file(inode);
+		if (!lrfile) {
+			dprintk("NFSD nfsd_layout_recall_cb: "
+				"nfs4_file not found\n");
+			goto err;
+		}
+		if (cbl->cbl_recall_type == RETURN_FSID)
+			cbl->cbl_fsid = lrfile->fi_fsid;
+	}
+
+	INIT_LIST_HEAD(&todolist);
+
+	/* If no cookie provided by FS, use a default one */
+	if (!cbl->cbl_cookie)
+		cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
+
+	status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
+	if (list_empty(&todolist)) {
+		status = -ENOENT;
+	} else {
+		/* process todolist even if create_layout_recall_list
+		 * returned an error */
+		int status2 = spawn_layout_recall(sb, &todolist, todo_len);
+		if (status2)
+			status = status2;
+	}
+
+err:
+	nfs4_unlock_state();
+	if (lrfile)
+		put_nfs4_file(lrfile);
+	return (todo_len && status) ? -EAGAIN : status;
+}
+
+struct create_device_notify_list_arg {
+	struct list_head *todolist;
+	struct nfsd4_pnfs_cb_dev_list *ndl;
+};
+
+static int
+create_device_notify_per_cl(struct nfs4_client *clp, void *p)
+{
+	struct nfs4_notify_device *cbnd;
+	struct create_device_notify_list_arg *arg = p;
+
+	if (atomic_read(&clp->cl_deviceref) <= 0)
+		return 0;
+
+	cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL);
+	if (!cbnd)
+		return -ENOMEM;
+
+	cbnd->nd_list = arg->ndl;
+	cbnd->nd_client = clp;
+	list_add(&cbnd->nd_perclnt, arg->todolist);
+	return 0;
+}
+
+/* Create a list of clients to send device notifications. */
+int
+create_device_notify_list(struct list_head *todolist,
+			  struct nfsd4_pnfs_cb_dev_list *ndl)
+{
+	int status;
+	struct create_device_notify_list_arg arg = {
+		.todolist = todolist,
+		.ndl = ndl,
+	};
+
+	nfs4_lock_state();
+	status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
+	nfs4_unlock_state();
+
+	return status;
+}
+
+/*
+ * For each client that has a device, send a device notification.
+ * XXX: Need to track which clients have which devices.
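+ * Until that tracking exists, every client holding a positive
+ * cl_deviceref is notified (see create_device_notify_per_cl above).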
+ */ +int nfsd_device_notify_cb(struct super_block *sb, + struct nfsd4_pnfs_cb_dev_list *ndl) +{ + struct nfs4_notify_device *cbnd; + unsigned int notify_num = 0; + int status2, status = 0; + struct list_head todolist; + + BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); + + dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); + + if (nfsd_serv == NULL) + return -ENOENT; + + INIT_LIST_HEAD(&todolist); + + status = create_device_notify_list(&todolist, ndl); + + while (!list_empty(&todolist)) { + cbnd = list_entry(todolist.next, struct nfs4_notify_device, + nd_perclnt); + list_del_init(&cbnd->nd_perclnt); + status2 = nfsd4_cb_notify_device(cbnd); + pnfs_clear_device_notify(cbnd->nd_client); + if (status2) { + kfree(cbnd); + status = status2; + } + notify_num++; + } + + dprintk("NFSD %s: status %d clients %u\n", + __func__, status, notify_num); + return status; +} diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c --- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-09-30 10:17:08.866999000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-09-30 10:17:08.868998000 -0400 @@ -0,0 +1,461 @@ +/****************************************************************************** + * + * (c) 2007 Network Appliance, Inc. All Rights Reserved. + * (c) 2009 NetApp. All Rights Reserved. + * + * NetApp provides this source code under the GPL v2 License. + * The GPL v2 license is available at + * http://opensource.org/licenses/gpl-license.php. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "nfsfh.h" +#include "nfsd.h" + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Just use a linked list. 
Do not expect more than 32 dlm_device_entries;
+ * the first implementation will just use one device per cluster file system.
+ */
+
+static LIST_HEAD(dlm_device_list);
+static DEFINE_SPINLOCK(dlm_device_list_lock);
+
+struct dlm_device_entry {
+	struct list_head	dlm_dev_list;
+	char			disk_name[DISK_NAME_LEN];
+	int			num_ds;
+	char			ds_list[NFSD_DLM_DS_LIST_MAX];
+};
+
+static struct dlm_device_entry *
+_nfsd4_find_pnfs_dlm_device(char *disk_name)
+{
+	struct dlm_device_entry *dlm_pdev;
+
+	dprintk("--> %s disk name %s\n", __func__, disk_name);
+	spin_lock(&dlm_device_list_lock);
+	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
+		dprintk("%s Look for dlm_pdev %s\n", __func__,
+			dlm_pdev->disk_name);
+		if (!memcmp(dlm_pdev->disk_name, disk_name,
+			    strlen(disk_name))) {
+			spin_unlock(&dlm_device_list_lock);
+			return dlm_pdev;
+		}
+	}
+	spin_unlock(&dlm_device_list_lock);
+	return NULL;
+}
+
+static struct dlm_device_entry *
+nfsd4_find_pnfs_dlm_device(struct super_block *sb)
+{
+	char dname[BDEVNAME_SIZE];
+
+	bdevname(sb->s_bdev, dname);
+	return _nfsd4_find_pnfs_dlm_device(dname);
+}
+
+ssize_t
+nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
+{
+	char *pos = buf;
+	ssize_t size = 0;
+	struct dlm_device_entry *dlm_pdev;
+	int ret = -EINVAL;
+
+	spin_lock(&dlm_device_list_lock);
+	list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
+		int advanced;
+
+		advanced = snprintf(pos, buflen - size, "%s:%s\n",
+				    dlm_pdev->disk_name, dlm_pdev->ds_list);
+		if (advanced >= buflen - size)
+			goto out;
+		size += advanced;
+		pos += advanced;
+	}
+	ret = size;
+
+out:
+	spin_unlock(&dlm_device_list_lock);
+	return ret;
+}
+
+bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
+{
+	char *start = ds_list;
+
+	*num_ds = 0;
+
+	while (*start) {
+		struct sockaddr_storage tempAddr;
+		int ipLen = strcspn(start, ",");
+
+		if (!rpc_pton(start, ipLen,
+			      (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
+			return false;
+		(*num_ds)++;
+		/* don't step past the terminating NUL when there is no
+		 * trailing comma */
+		start += ipLen;
+		if (*start)
+			start++;
+	}
+	return true;
+}
+
+/*
+ * pnfs_dlm_device string format:
+ *	block-device-path:<ds1 ipv4>,<ds2 ipv4>...
+ *
+ * Examples:
+ *	'/dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
+ *	two data servers for the dlm cluster file system mounted on /dev/sda.
+ *
+ *	'/dev/sda:192.168.1.96,192.168.1.100'
+ *	replaces the data server list for /dev/sda.
+ *
+ * Only deviceid == 1 is supported.  A device id could be added to the
+ * pnfs_dlm_device string when needed.
+ *
+ * Only a round-robin stripe index, with each data server appearing once,
+ * is supported.
+ */
+int
+nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
+{
+	struct dlm_device_entry *new, *found;
+	char *bufp = pnfs_dlm_device;
+	char *endp = bufp + strlen(bufp);
+	int err = -ENOMEM;
+
+	dprintk("--> %s len %d\n", __func__, len);
+
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (!new)
+		return err;
+
+	err = -EINVAL;
+	/* disk_name */
+	/* FIXME: need to check for valid disk_name. search superblocks?
+	 * check for a "/dev/" prefix?
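+	 * For now only the length is bounded; a disk name that matches no
+	 * mounted device will simply never be found by
+	 * nfsd4_find_pnfs_dlm_device().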
+	 */
+	len = strcspn(bufp, ":");
+	if (len > DISK_NAME_LEN)
+		goto out_free;
+	memcpy(new->disk_name, bufp, len);
+
+	err = -EINVAL;
+	bufp += len + 1;
+	if (bufp >= endp)
+		goto out_free;
+
+	/* data server list */
+	/* FIXME: need to check for comma separated valid ip format */
+	len = strcspn(bufp, ":");
+	if (len > NFSD_DLM_DS_LIST_MAX)
+		goto out_free;
+	memcpy(new->ds_list, bufp, len);
+
+	/* validate the ips */
+	if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds)))
+		goto out_free;
+
+	dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__,
+		new->disk_name, new->num_ds, new->ds_list);
+
+	found = _nfsd4_find_pnfs_dlm_device(new->disk_name);
+	if (found) {
+		/* FIXME: should compare found->ds_list with new->ds_list
+		 * and if it is different, kick off a CB_NOTIFY change
+		 * deviceid.
+		 */
+		dprintk("%s pnfs_dlm_device %s:%s already in cache, "
+			"replacing ds_list with new ds_list %s\n", __func__,
+			found->disk_name, found->ds_list, new->ds_list);
+		memset(found->ds_list, 0, NFSD_DLM_DS_LIST_MAX);
+		memcpy(found->ds_list, new->ds_list, strlen(new->ds_list));
+		found->num_ds = new->num_ds;
+		kfree(new);
+	} else {
+		dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__,
+			new->disk_name, new->ds_list);
+		spin_lock(&dlm_device_list_lock);
+		list_add(&new->dlm_dev_list, &dlm_device_list);
+		spin_unlock(&dlm_device_list_lock);
+	}
+	dprintk("<-- %s Success\n", __func__);
+	return 0;
+
+out_free:
+	kfree(new);
+	dprintk("<-- %s returns %d\n", __func__, err);
+	return err;
+}
+
+void nfsd4_pnfs_dlm_shutdown(void)
+{
+	struct dlm_device_entry *dlm_pdev, *next;
+
+	dprintk("--> %s\n", __func__);
+
+	spin_lock(&dlm_device_list_lock);
+	list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list,
+				  dlm_dev_list) {
+		list_del(&dlm_pdev->dlm_dev_list);
+		kfree(dlm_pdev);
+	}
+	spin_unlock(&dlm_device_list_lock);
+}
+
+static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb,
+				     u32 layout_type,
+				     struct nfsd4_pnfs_dev_iter_res *res)
+{
+	if (layout_type != LAYOUT_NFSV4_1_FILES) {
+		printk(KERN_ERR "%s: ERROR: layout type isn't 'file' "
+		       "(type: %x)\n", __func__, layout_type);
+		return -ENOTSUPP;
+	}
+
+	res->gd_eof = 1;
+	if (res->gd_cookie)
+		return -ENOENT;
+
+	res->gd_cookie = 1;
+	res->gd_verf = 1;
+	res->gd_devid = 1;
+	return 0;
+}
+
+static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb,
+				     struct exp_xdr_stream *xdr,
+				     u32 layout_type,
+				     const struct nfsd4_pnfs_deviceid *devid)
+{
+	int err, len, i = 0;
+	struct pnfs_filelayout_device fdev;
+	struct pnfs_filelayout_devaddr *daddr;
+	struct dlm_device_entry *dlm_pdev;
+	char *bufp;
+
+	err = -ENOTSUPP;
+	if (layout_type != LAYOUT_NFSV4_1_FILES) {
+		dprintk("%s: ERROR: layout type isn't 'file' "
+			"(type: %x)\n", __func__, layout_type);
+		return err;
+	}
+
+	/* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO
+	 * with a gdia_device_id != 1 is invalid.
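+	 * (gdia_device_id is the deviceid argument of GETDEVICEINFO in
+	 * RFC 5661.)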
+ */
+ err = -EINVAL;
+ if (devid->devid != 1) {
+ dprintk("%s: WARNING: didn't receive a deviceid of "
+ "1 (got: 0x%llx)\n", __func__, devid->devid);
+ return err;
+ }
+
+ /*
+ * If the DS list has not been established, return -EINVAL
+ */
+ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
+ if (!dlm_pdev) {
+ dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
+ sb->s_bdev->bd_disk->disk_name);
+ return err;
+ }
+
+ dprintk("%s: Found disk %s with DS list |%s|\n",
+ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
+
+ memset(&fdev, 0, sizeof(fdev));
+ fdev.fl_device_length = dlm_pdev->num_ds;
+
+ err = -ENOMEM;
+ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
+ fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
+ if (!fdev.fl_device_list) {
+ printk(KERN_ERR "%s: ERROR: unable to allocate a device list "
+ "buffer for %d DSes.\n", __func__,
+ fdev.fl_device_length);
+ fdev.fl_device_length = 0;
+ goto out;
+ }
+
+ /* Set up a simple stripe indices list */
+ fdev.fl_stripeindices_length = fdev.fl_device_length;
+ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
+ fdev.fl_stripeindices_length, GFP_KERNEL);
+ if (!fdev.fl_stripeindices_list) {
+ printk(KERN_ERR "%s: ERROR: unable to allocate a stripeindices "
+ "list buffer for %d DSes.\n", __func__,
+ fdev.fl_stripeindices_length);
+ goto out;
+ }
+ for (i = 0; i < fdev.fl_stripeindices_length; i++)
+ fdev.fl_stripeindices_list[i] = i;
+
+ /* Transfer the data server list with a single multipath entry */
+ bufp = dlm_pdev->ds_list;
+ for (i = 0; i < fdev.fl_device_length; i++) {
+ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
+ if (!daddr) {
+ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
+ "addr buffer.\n", __func__);
+ goto out;
+ }
+
+ daddr->r_netid.data = "tcp";
+ daddr->r_netid.len = 3;
+
+ len = strcspn(bufp, ",");
+ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
+ if (!daddr->r_addr.data) {
+ kfree(daddr);
+ goto out;
+ }
+ memcpy(daddr->r_addr.data, bufp, len);
+ /*
+ * append the port number. interpreted as two more bytes
+ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
+ */
+ memcpy(daddr->r_addr.data + len, ".8.1", 4);
+ daddr->r_addr.len = len + 4;
+
+ fdev.fl_device_list[i].fl_multipath_length = 1;
+ fdev.fl_device_list[i].fl_multipath_list = daddr;
+
+ dprintk("%s: encoding DS |%s|\n", __func__, bufp);
+
+ bufp += len + 1;
+ }
+
+ /* have nfsd encode the device info */
+ err = filelayout_encode_devinfo(xdr, &fdev);
+out:
+ for (i = 0; i < fdev.fl_device_length; i++) {
+ daddr = fdev.fl_device_list[i].fl_multipath_list;
+ if (daddr) {
+ kfree(daddr->r_addr.data);
+ kfree(daddr);
+ }
+ }
+ kfree(fdev.fl_device_list);
+ kfree(fdev.fl_stripeindices_list);
+ dprintk("<-- %s returns %d\n", __func__, err);
+ return err;
+}
+
+static int get_stripe_unit(int blocksize)
+{
+ if (blocksize >= NFSSVC_MAXBLKSIZE)
+ return blocksize;
+ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
+}
+
+/*
+ * Look up inode block device in pnfs_dlm_device list.
+ * Hash on the inode->i_ino and number of data servers.
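+ *
+ * Note: (num_ds - 1) is used as a bit mask, so the mapping is only a
+ * true round robin when num_ds is a power of two; e.g. with num_ds = 4
+ * the mask is 3 and i_ino 1234 lands on first stripe index 2.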
+ */
+static int dlm_ino_hash(struct inode *ino)
+{
+ struct dlm_device_entry *de;
+ u32 hash_mask = 0;
+
+ /* If we can't find the inode's block device in the pnfs_dlm_device
+ * list, then don't hand out a layout
+ */
+ de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
+ if (!de)
+ return -1;
+ hash_mask = de->num_ds - 1;
+ return ino->i_ino & hash_mask;
+}
+
+static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
+ struct exp_xdr_stream *xdr,
+ const struct nfsd4_pnfs_layoutget_arg *args,
+ struct nfsd4_pnfs_layoutget_res *res)
+{
+ struct pnfs_filelayout_layout *layout = NULL;
+ struct knfsd_fh *fhp = NULL;
+ int index;
+ enum nfsstat4 rc = NFS4_OK;
+
+ dprintk("%s: LAYOUT_GET\n", __func__);
+
+ /* DLM exported file systems only support layouts for READ */
+ if (res->lg_seg.iomode == IOMODE_RW)
+ return NFS4ERR_BADIOMODE;
+
+ index = dlm_ino_hash(inode);
+ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
+ inode->i_ino);
+ if (index < 0)
+ return NFS4ERR_LAYOUTUNAVAILABLE;
+
+ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+ /* Always give out whole file layouts */
+ res->lg_seg.offset = 0;
+ res->lg_seg.length = NFS4_MAX_UINT64;
+ /* Always give out READ ONLY layouts */
+ res->lg_seg.iomode = IOMODE_READ;
+
+ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
+ if (layout == NULL) {
+ rc = NFS4ERR_LAYOUTTRYLATER;
+ goto error;
+ }
+
+ /* Set file layout response args */
+ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
+ layout->lg_stripe_type = STRIPE_SPARSE;
+ layout->lg_commit_through_mds = false;
+ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
+ layout->lg_fh_length = 1;
+ layout->device_id.sbid = args->lg_sbid;
+ layout->device_id.devid = 1; /*FSFTEMP*/
+ layout->lg_first_stripe_index = index; /*FSFTEMP*/
+ layout->lg_pattern_offset = 0;
+
+ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
+ if (fhp == NULL) {
+ rc = NFS4ERR_LAYOUTTRYLATER;
+ goto error;
+ }
+
+ memcpy(fhp, args->lg_fh, sizeof(*fhp));
+ pnfs_fh_mark_ds(fhp);
+ layout->lg_fh_list = fhp;
+
+ /* Call nfsd to encode layout */
+ rc = filelayout_encode_layout(xdr, layout);
+exit:
+ kfree(layout);
+ kfree(fhp);
+ return rc;
+
+error:
+ res->lg_seg.length = 0;
+ goto exit;
+}
+
+static int
+nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
+{
+ return LAYOUT_NFSV4_1_FILES;
+}
+
+/* For use by DLM cluster file systems exported by pNFSD */
+const struct pnfs_export_operations pnfs_dlm_export_ops = {
+ .layout_type = nfsd4_pnfs_dlm_layouttype,
+ .get_device_info = nfsd4_pnfs_dlm_getdevinfo,
+ .get_device_iter = nfsd4_pnfs_dlm_getdeviter,
+ .layout_get = nfsd4_pnfs_dlm_layoutget,
+};
+EXPORT_SYMBOL(pnfs_dlm_export_ops);
diff -up linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c
--- linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-09-30 10:17:08.871998000 -0400
+++ linux-2.6.34.noarch/fs/nfsd/nfs4pnfsds.c 2010-09-30 10:17:08.873003000 -0400
@@ -0,0 +1,620 @@
+/*
+* linux/fs/nfsd/nfs4pnfsds.c
+*
+* Copyright (c) 2005 The Regents of the University of Michigan.
+* All rights reserved.
+*
+* Andy Adamson
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*
+* 1. Redistributions of source code must retain the above copyright
+* notice, this list of conditions and the following disclaimer.
+* 2.
Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +* +*/ +#if defined(CONFIG_PNFSD) + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +#include +#include +#include +#include +#include +#include + +#include "nfsd.h" +#include "pnfsd.h" +#include "state.h" + +/* + ******************* + * PNFS + ******************* + */ +/* + * Hash tables for pNFS Data Server state + * + * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using + * this data server (DS). + * + * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained + * from any MDS. + * + * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained + * from any MDS. + * + */ +/* Hash tables for clientid state */ +#define CLIENT_HASH_BITS 4 +#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) +#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) + +#define clientid_hashval(id) \ + ((id) & CLIENT_HASH_MASK) + +/* hash table for pnfs_ds_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head mds_id_tbl; +static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; +static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; + +static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); +static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); + +/* Mutex for data server state. 
Needs to be separate from + * mds state mutex since a node can be both mds and ds */ +static DEFINE_MUTEX(ds_mutex); +static struct thread_info *ds_mutex_owner; + +static void +ds_lock_state(void) +{ + mutex_lock(&ds_mutex); + ds_mutex_owner = current_thread_info(); +} + +static void +ds_unlock_state(void) +{ + BUG_ON(ds_mutex_owner != current_thread_info()); + ds_mutex_owner = NULL; + mutex_unlock(&ds_mutex); +} + +static int +cmp_clid(const clientid_t *cl1, const clientid_t *cl2) +{ + return (cl1->cl_boot == cl2->cl_boot) && + (cl1->cl_id == cl2->cl_id); +} + +void +nfs4_pnfs_state_init(void) +{ + int i; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&mds_clid_hashtbl[i]); + + for (i = 0; i < STATEID_HASH_SIZE; i++) + INIT_LIST_HEAD(&ds_stid_hashtbl[i]); + + INIT_LIST_HEAD(&mds_id_tbl); +} + +static struct pnfs_mds_id * +find_pnfs_mds_id(u32 mdsid) +{ + struct pnfs_mds_id *local = NULL; + + dprintk("pNFSD: %s\n", __func__); + list_for_each_entry(local, &mds_id_tbl, di_hash) { + if (local->di_mdsid == mdsid) + return local; + } + return NULL; +} + +static struct pnfs_ds_clientid * +find_pnfs_ds_clientid(const clientid_t *clid) +{ + struct pnfs_ds_clientid *local = NULL; + unsigned int hashval; + + dprintk("pNFSD: %s\n", __func__); + + hashval = clientid_hashval(clid->cl_id); + list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { + if (cmp_clid(&local->dc_mdsclid, clid)) + return local; + } + return NULL; +} + +static struct pnfs_ds_stateid * +find_pnfs_ds_stateid(stateid_t *stid) +{ + struct pnfs_ds_stateid *local = NULL; + u32 st_id = stid->si_stateownerid; + u32 f_id = stid->si_fileid; + unsigned int hashval; + + dprintk("pNFSD: %s\n", __func__); + + hashval = stateid_hashval(st_id, f_id); + list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) + if ((local->ds_stid.si_stateownerid == st_id) && + (local->ds_stid.si_fileid == f_id) && + (local->ds_stid.si_boot == stid->si_boot)) { + stateid_t *sid = &local->ds_stid; + dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", + __func__, local, local->ds_flags, + STATEID_VAL(sid)); + return local; + } + return NULL; +} + +static void +release_ds_mdsid(struct kref *kref) +{ + struct pnfs_mds_id *mdp = + container_of(kref, struct pnfs_mds_id, di_ref); + dprintk("pNFSD: %s\n", __func__); + + list_del(&mdp->di_hash); + list_del(&mdp->di_mdsclid); + kfree(mdp); +} + +static void +release_ds_clientid(struct kref *kref) +{ + struct pnfs_ds_clientid *dcp = + container_of(kref, struct pnfs_ds_clientid, dc_ref); + struct pnfs_mds_id *mdp; + dprintk("pNFSD: %s\n", __func__); + + mdp = find_pnfs_mds_id(dcp->dc_mdsid); + if (mdp) + put_ds_mdsid(mdp); + + list_del(&dcp->dc_hash); + list_del(&dcp->dc_stateid); + list_del(&dcp->dc_permdsid); + kfree(dcp); +} + +static void +release_ds_stateid(struct kref *kref) +{ + struct pnfs_ds_stateid *dsp = + container_of(kref, struct pnfs_ds_stateid, ds_ref); + struct pnfs_ds_clientid *dcp; + dprintk("pNFS %s: dsp %p\n", __func__, dsp); + + dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid); + if (dcp) + put_ds_clientid(dcp); + + list_del(&dsp->ds_hash); + list_del(&dsp->ds_perclid); + kfree(dsp); +} + +static inline void +put_ds_clientid(struct pnfs_ds_clientid *dcp) +{ + dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, + atomic_read(&dcp->dc_ref.refcount)); + kref_put(&dcp->dc_ref, release_ds_clientid); +} + +static inline void +get_ds_clientid(struct pnfs_ds_clientid *dcp) +{ + dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, + atomic_read(&dcp->dc_ref.refcount)); + 
kref_get(&dcp->dc_ref); +} + +static inline void +put_ds_mdsid(struct pnfs_mds_id *mdp) +{ + dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, + atomic_read(&mdp->di_ref.refcount)); + kref_put(&mdp->di_ref, release_ds_mdsid); +} + +static inline void +get_ds_mdsid(struct pnfs_mds_id *mdp) +{ + dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, + atomic_read(&mdp->di_ref.refcount)); + kref_get(&mdp->di_ref); +} + +static inline void +put_ds_stateid(struct pnfs_ds_stateid *dsp) +{ + dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, + atomic_read(&dsp->ds_ref.refcount)); + kref_put(&dsp->ds_ref, release_ds_stateid); +} + +static inline void +get_ds_stateid(struct pnfs_ds_stateid *dsp) +{ + dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, + atomic_read(&dsp->ds_ref.refcount)); + kref_get(&dsp->ds_ref); +} + +void +nfs4_pnfs_state_shutdown(void) +{ + struct pnfs_ds_stateid *dsp; + int i; + + dprintk("pNFSD %s: -->\n", __func__); + + ds_lock_state(); + for (i = 0; i < STATEID_HASH_SIZE; i++) { + while (!list_empty(&ds_stid_hashtbl[i])) { + dsp = list_entry(ds_stid_hashtbl[i].next, + struct pnfs_ds_stateid, ds_hash); + put_ds_stateid(dsp); + } + } + ds_unlock_state(); +} + +static struct pnfs_mds_id * +alloc_init_mds_id(struct pnfs_get_state *gsp) +{ + struct pnfs_mds_id *mdp; + + dprintk("pNFSD: %s\n", __func__); + + mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); + if (!mdp) + return NULL; + INIT_LIST_HEAD(&mdp->di_hash); + INIT_LIST_HEAD(&mdp->di_mdsclid); + list_add(&mdp->di_hash, &mds_id_tbl); + mdp->di_mdsid = gsp->dsid; + mdp->di_mdsboot = 0; + kref_init(&mdp->di_ref); + return mdp; +} + +static struct pnfs_ds_clientid * +alloc_init_ds_clientid(struct pnfs_get_state *gsp) +{ + struct pnfs_mds_id *mdp; + struct pnfs_ds_clientid *dcp; + clientid_t *clid = (clientid_t *)&gsp->clid; + unsigned int hashval = clientid_hashval(clid->cl_id); + + dprintk("pNFSD: %s\n", __func__); + + mdp = find_pnfs_mds_id(gsp->dsid); + if (!mdp) { + mdp = alloc_init_mds_id(gsp); + if (!mdp) + return NULL; + } else { + get_ds_mdsid(mdp); + } + + dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); + if (!dcp) + return NULL; + + INIT_LIST_HEAD(&dcp->dc_hash); + INIT_LIST_HEAD(&dcp->dc_stateid); + INIT_LIST_HEAD(&dcp->dc_permdsid); + list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); + list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); + dcp->dc_mdsclid = *clid; + kref_init(&dcp->dc_ref); + dcp->dc_mdsid = gsp->dsid; + return dcp; +} + +static struct pnfs_ds_stateid * +alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) +{ + struct pnfs_ds_stateid *dsp; + u32 st_id = stidp->si_stateownerid; + u32 f_id = stidp->si_fileid; + unsigned int hashval; + + dprintk("pNFSD: %s\n", __func__); + + dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); + if (!dsp) + return dsp; + + INIT_LIST_HEAD(&dsp->ds_hash); + INIT_LIST_HEAD(&dsp->ds_perclid); + memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); + fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); + dsp->ds_access = 0; + dsp->ds_status = 0; + dsp->ds_flags = 0L; + kref_init(&dsp->ds_ref); + set_bit(DS_STATEID_NEW, &dsp->ds_flags); + clear_bit(DS_STATEID_VALID, &dsp->ds_flags); + clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); + init_waitqueue_head(&dsp->ds_waitq); + + hashval = stateid_hashval(st_id, f_id); + list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]); + dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); + return dsp; +} + +static int +update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh, + struct pnfs_get_state *gsp) +{ + struct pnfs_ds_clientid *dcp; + int new = 0; + + dprintk("pNFSD: 
%s dsp %p\n", __func__, dsp);
+
+ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
+ if (!dcp) {
+ dcp = alloc_init_ds_clientid(gsp);
+ if (!dcp)
+ return 1;
+ new = 1;
+ }
+ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
+ list_add(&dsp->ds_perclid, &dcp->dc_stateid);
+ if (!new)
+ get_ds_clientid(dcp);
+ }
+
+ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
+ dsp->ds_access = gsp->access;
+ dsp->ds_status = 0;
+ dsp->ds_verifier[0] = gsp->verifier[0];
+ dsp->ds_verifier[1] = gsp->verifier[1];
+ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
+ set_bit(DS_STATEID_VALID, &dsp->ds_flags);
+ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
+ clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
+ return 0;
+}
+
+int
+nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
+{
+ stateid_t *stid = (stateid_t *)&gs->stid;
+ struct pnfs_ds_stateid *dsp;
+
+ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
+ STATEID_VAL(stid));
+
+ ds_lock_state();
+ dsp = find_pnfs_ds_stateid(stid);
+ if (dsp)
+ put_ds_stateid(dsp);
+ ds_unlock_state();
+
+ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
+
+ if (dsp)
+ return 0;
+ return -ENOENT;
+}
+
+/* Retrieves and validates a stateid.
+ * If the stateid exists and its fields match, return it.
+ * If the stateid exists but either the generation or
+ * ownerids don't match, check with the mds to see if it is valid.
+ * If the stateid doesn't exist, the first thread creates an
+ * invalid *marker* stateid, then checks to see if the
+ * stateid exists on the mds. If so, it validates the *marker*
+ * stateid and updates its fields. Subsequent threads that
+ * find the *marker* stateid wait until it is valid or an error
+ * occurs.
+ * Called with the ds state lock (ds_lock_state()) held.
+ */
+static struct pnfs_ds_stateid *
+nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
+{
+ struct inode *ino = cfh->fh_dentry->d_inode;
+ struct super_block *sb;
+ struct pnfs_ds_stateid *dsp = NULL;
+ struct pnfs_get_state gs = {
+ .access = 0,
+ };
+ int status = 0, waiter = 0;
+
+ dprintk("pNFSD: %s -->\n", __func__);
+
+ dsp = find_pnfs_ds_stateid(stidp);
+ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
+ (stidp->si_generation == dsp->ds_stid.si_generation))
+ goto out_noput;
+
+ sb = ino->i_sb;
+ if (!sb || !sb->s_pnfs_op->get_state)
+ goto out_noput;
+
+ /* Invalidate the current state if it exists but doesn't match.
+ * If it is already invalid, another thread is checking state */ + if (dsp) { + if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) + waiter = 1; + } else { + dsp = alloc_init_ds_stateid(cfh, stidp); + if (!dsp) + goto out_noput; + } + + dprintk("pNFSD: %s Starting loop\n", __func__); + get_ds_stateid(dsp); + while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { + ds_unlock_state(); + + /* Another thread is checking the state */ + if (waiter) { + dprintk("pNFSD: %s waiting\n", __func__); + wait_event_interruptible_timeout(dsp->ds_waitq, + (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || + test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), + msecs_to_jiffies(1024)); + dprintk("pNFSD: %s awake\n", __func__); + ds_lock_state(); + if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) + goto out; + + continue; + } + + /* Validate stateid on mds */ + dprintk("pNFSD: %s Checking state on MDS\n", __func__); + memcpy(&gs.stid, stidp, sizeof(stateid_t)); + status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); + dprintk("pNFSD: %s from MDS status %d\n", __func__, status); + ds_lock_state(); + /* if !status and stateid is valid, update id and mark valid */ + if (status || update_ds_stateid(dsp, cfh, &gs)) { + set_bit(DS_STATEID_ERROR, &dsp->ds_flags); + /* remove invalid stateid from list */ + put_ds_stateid(dsp); + wake_up(&dsp->ds_waitq); + goto out; + } + + wake_up(&dsp->ds_waitq); + } +out: + if (dsp) + put_ds_stateid(dsp); +out_noput: + if (dsp) + dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", + __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); + /* If error, return null */ + if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) + dsp = NULL; + dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); + return dsp; +} + +int +nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) +{ + struct pnfs_ds_stateid *dsp; + int status = 0; + + dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, + STATEID_VAL(stateid)); + + /* Must release state lock while verifying stateid on mds */ + nfs4_unlock_state(); + ds_lock_state(); + dsp = nfsv4_ds_get_state(cfh, stateid); + if (dsp) { + get_ds_stateid(dsp); + dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, + STATEID_VAL(&dsp->ds_stid)); + + dprintk("NFSD: %s: dsp %p fh_size %u:%u " + "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " + "gen %x:%x\n", + __func__, dsp, + cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, + ((unsigned *)&cfh->fh_handle.fh_base)[0], + ((unsigned *)&cfh->fh_handle.fh_base)[1], + ((unsigned *)&cfh->fh_handle.fh_base)[2], + ((unsigned *)&cfh->fh_handle.fh_base)[3], + ((unsigned *)&dsp->ds_fh.fh_base)[0], + ((unsigned *)&dsp->ds_fh.fh_base)[1], + ((unsigned *)&dsp->ds_fh.fh_base)[2], + ((unsigned *)&dsp->ds_fh.fh_base)[3], + stateid->si_generation, dsp->ds_stid.si_generation); + } + + if (!dsp || + (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || + (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, + dsp->ds_fh.fh_size) != 0) || + (stateid->si_generation > dsp->ds_stid.si_generation)) + status = nfserr_bad_stateid; + else if (stateid->si_generation < dsp->ds_stid.si_generation) + status = nfserr_old_stateid; + + if (dsp) + put_ds_stateid(dsp); + ds_unlock_state(); + nfs4_lock_state(); + dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); + return status; +} + +void +nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) +{ + struct pnfs_ds_stateid *dsp = NULL; + + dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); + + ds_lock_state(); + if (stateid != NULL) 
{ + dsp = find_pnfs_ds_stateid(stateid); + if (dsp) + get_ds_stateid(dsp); + } + + /* XXX: Should we fetch the stateid or wait if some other + * thread is currently retrieving the stateid ? */ + if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { + *p++ = dsp->ds_verifier[0]; + *p++ = dsp->ds_verifier[1]; + put_ds_stateid(dsp); + } else { + /* must be on MDS */ + ds_unlock_state(); + sb->s_pnfs_op->get_verifier(sb, p); + ds_lock_state(); + p += 2; + } + ds_unlock_state(); + dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); + return; +} + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4proc.c --- linux-2.6.34.noarch/fs/nfsd/nfs4proc.c.orig 2010-09-30 10:15:18.334728000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4proc.c 2010-09-30 10:17:08.878998000 -0400 @@ -34,10 +34,14 @@ */ #include #include +#include +#include +#include #include "cache.h" #include "xdr4.h" #include "vfs.h" +#include "pnfsd.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc * set, (2) sets open->op_stateid, (3) sets open->op_delegation. */ status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); +#if defined(CONFIG_SPNFS) + if (!status && spnfs_enabled()) { + struct inode *inode = cstate->current_fh.fh_dentry->d_inode; + + status = spnfs_open(inode, open); + if (status) { + dprintk( + "nfsd: pNFS could not be enabled for inode: %lu\n", + inode->i_ino); + /* + * XXX When there's a failure then need to indicate to + * future ops that no pNFS is available. Should I save + * the status in the inode? It's kind of a big hammer. + * But there may be no stripes available? + */ + } + } +#endif /* CONFIG_SPNFS */ out: if (open->op_stateowner) { nfs4_get_stateowner(open->op_stateowner); @@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str &access->ac_supported); } +static void +nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) +{ + u32 *p = (u32 *)verf->data; + +#if defined(CONFIG_PNFSD) + if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { + nfs4_ds_get_verifier(NULL, sb, p); + return; + } +#endif /* CONFIG_PNFSD */ + + *p++ = nfssvc_boot.tv_sec; + *p++ = nfssvc_boot.tv_usec; +} + static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_commit *commit) { __be32 status; - u32 *p = (u32 *)commit->co_verf.data; - *p++ = nfssvc_boot.tv_sec; - *p++ = nfssvc_boot.tv_usec; - + nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, + &commit->co_verf); status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, commit->co_count); if (status == nfserr_symlink) @@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru { stateid_t *stateid = &write->wr_stateid; struct file *filp = NULL; - u32 *p; __be32 status = nfs_ok; unsigned long cnt; @@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; - p = (u32 *)write->wr_verifier.data; - *p++ = nfssvc_boot.tv_sec; - *p++ = nfssvc_boot.tv_usec; + nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, + &write->wr_verifier); +#if defined(CONFIG_SPNFS) +#if defined(CONFIG_SPNFS_BLOCK) + if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { + status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, + RETURN_FILE, write->wr_offset, write->wr_buflen); + if (!status) { + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, 
&write->wr_how_written);
+ }
+ } else
+#endif
+
+ if (spnfs_enabled()) {
+ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode,
+ write->wr_offset, write->wr_buflen, write->wr_vlen,
+ rqstp);
+ if (status == nfs_ok) {
+ /* DMXXX: HACK to get the file size set */
+ /* write one byte at offset+length-1 */
+ struct kvec k[1];
+ char zero = 0;
+ unsigned long cnt = 1;
+
+ k[0].iov_base = (void *)&zero;
+ k[0].iov_len = 1;
+ nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset+write->wr_buflen-1, k, 1,
+ &cnt, &write->wr_how_written);
+ }
+ } else /* we're not an MDS */
+ status = nfsd_write(rqstp, &cstate->current_fh, filp,
+ write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ &cnt, &write->wr_how_written);
+#else
 status = nfsd_write(rqstp, &cstate->current_fh, filp,
 write->wr_offset, rqstp->rq_vec, write->wr_vlen,
 &cnt, &write->wr_how_written);
+#endif /* CONFIG_SPNFS */
+
 if (filp)
 fput(filp);
@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str
 return status == nfserr_same ? nfs_ok : status;
 }
+#if defined(CONFIG_PNFSD)
+
+static __be32
+nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp,
+ unsigned int layout_type)
+{
+ int status, type;
+
+ /* check to see if pNFS is supported. */
+ status = nfserr_layoutunavailable;
+ if (exp && exp->ex_pnfs == 0) {
+ dprintk("%s: Underlying file system "
+ "is not exported over pNFS\n", __func__);
+ goto out;
+ }
+ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) {
+ dprintk("%s: Underlying file system "
+ "does not support pNFS\n", __func__);
+ goto out;
+ }
+
+ type = sb->s_pnfs_op->layout_type(sb);
+
+ /* check to see if the requested layout type is supported. */
+ status = nfserr_unknown_layouttype;
+ if (!type)
+ dprintk("BUG: %s: layout_type 0 is reserved and must not be "
+ "used by filesystem\n", __func__);
+ else if (type != layout_type)
+ dprintk("%s: requested layout type %d "
+ "does not match supported type %d\n",
+ __func__, layout_type, type);
+ else
+ status = nfs_ok;
+out:
+ return status;
+}
+
+static __be32
+nfsd4_getdevlist(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_getdevlist *gdlp)
+{
+ struct super_block *sb;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ int status;
+
+ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n",
+ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices,
+ gdlp->gd_cookie, gdlp->gd_verf);
+
+ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (status)
+ goto out;
+
+ status = nfserr_inval;
+ sb = current_fh->fh_dentry->d_inode->i_sb;
+ if (!sb)
+ goto out;
+
+ /* We must be able to encode at least one device */
+ if (!gdlp->gd_maxdevices)
+ goto out;
+
+ /* Ensure underlying file system supports pNFS and,
+ * if so, the requested layout type
+ */
+ status = nfsd4_layout_verify(sb, current_fh->fh_export,
+ gdlp->gd_layout_type);
+ if (status)
+ goto out;
+
+ /* Fail if the underlying file system does not support
+ * getdevicelist */
+ if (!sb->s_pnfs_op->get_device_iter) {
+ status = nfserr_notsupp;
+ goto out;
+ }
+
+ /* Set up arguments so device can be retrieved at encode time */
+ gdlp->gd_fhp = &cstate->current_fh;
+out:
+ return status;
+}
+
+static __be32
+nfsd4_getdevinfo(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_pnfs_getdevinfo *gdp)
+{
+ struct super_block *sb;
+ int status;
+ clientid_t clid;
+
+ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n",
+ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid,
+ gdp->gd_devid.devid, gdp->gd_maxcount);
+
+ status =
nfserr_inval; + sb = find_sbid_id(gdp->gd_devid.sbid); + dprintk("%s: sb %p\n", __func__, sb); + if (!sb) { + status = nfserr_noent; + goto out; + } + + /* Ensure underlying file system supports pNFS and, + * if so, the requested layout type + */ + status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); + if (status) + goto out; + + /* Set up arguments so device can be retrieved at encode time */ + gdp->gd_sb = sb; + + /* Update notifications */ + copy_clientid(&clid, cstate->session); + pnfs_set_device_notify(&clid, gdp->gd_notify_types); +out: + return status; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_pnfs_layoutget *lgp) +{ + int status; + struct super_block *sb; + struct svc_fh *current_fh = &cstate->current_fh; + + status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (status) + goto out; + + status = nfserr_inval; + sb = current_fh->fh_dentry->d_inode->i_sb; + if (!sb) + goto out; + + /* Ensure underlying file system supports pNFS and, + * if so, the requested layout type + */ + status = nfsd4_layout_verify(sb, current_fh->fh_export, + lgp->lg_seg.layout_type); + if (status) + goto out; + + status = nfserr_badiomode; + if (lgp->lg_seg.iomode != IOMODE_READ && + lgp->lg_seg.iomode != IOMODE_RW) { + dprintk("pNFS %s: invalid iomode %d\n", __func__, + lgp->lg_seg.iomode); + goto out; + } + + /* Set up arguments so layout can be retrieved at encode time */ + lgp->lg_fhp = current_fh; + copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session); + status = nfs_ok; +out: + return status; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_pnfs_layoutcommit *lcp) +{ + int status; + struct inode *ino = NULL; + struct iattr ia; + struct super_block *sb; + struct svc_fh *current_fh = &cstate->current_fh; + + dprintk("NFSD: nfsd4_layoutcommit \n"); + status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (status) + goto out; + + status = nfserr_inval; + ino = current_fh->fh_dentry->d_inode; + if (!ino) + goto out; + + status = nfserr_inval; + sb = ino->i_sb; + if (!sb) + goto out; + + /* Ensure underlying file system supports pNFS and, + * if so, the requested layout type + */ + status = nfsd4_layout_verify(sb, current_fh->fh_export, + lcp->args.lc_seg.layout_type); + if (status) + goto out; + + /* This will only extend the file length. Do a quick + * check to see if there is any point in waiting for the update + * locks. + * TODO: Is this correct for all back ends? 
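+ *
+ * For example, with lc_newoffset set and lc_last_wr = 4095 on a file
+ * whose i_size is 1000, the notify_change() fallback below grows the
+ * file to 4096; a commit that does not extend the file is a no-op.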
+ */ + dprintk("%s:new offset: %d new size: %llu old size: %lld\n", + __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, + ino->i_size); + + /* Set clientid from sessionid */ + copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); + lcp->res.lc_size_chg = 0; + if (sb->s_pnfs_op->layout_commit) { + status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); + dprintk("%s:layout_commit result %d\n", __func__, status); + } else { + fh_lock(current_fh); + if ((lcp->args.lc_newoffset == 0) || + ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { + status = 0; + lcp->res.lc_size_chg = 0; + fh_unlock(current_fh); + goto out; + } + + /* Try our best to update the file size */ + dprintk("%s: Modifying file size\n", __func__); + ia.ia_valid = ATTR_SIZE; + ia.ia_size = lcp->args.lc_last_wr + 1; + status = notify_change(current_fh->fh_dentry, &ia); + fh_unlock(current_fh); + dprintk("%s:notify_change result %d\n", __func__, status); + } + + if (!status && lcp->res.lc_size_chg && + EX_ISSYNC(current_fh->fh_export)) { + dprintk("%s: Synchronously writing inode size %llu\n", + __func__, ino->i_size); + write_inode_now(ino, 1); + lcp->res.lc_newsize = i_size_read(ino); + } +out: + return status; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_pnfs_layoutreturn *lrp) +{ + int status; + struct super_block *sb; + struct svc_fh *current_fh = &cstate->current_fh; + + status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (status) + goto out; + + status = nfserr_inval; + sb = current_fh->fh_dentry->d_inode->i_sb; + if (!sb) + goto out; + + /* Ensure underlying file system supports pNFS and, + * if so, the requested layout type + */ + status = nfsd4_layout_verify(sb, current_fh->fh_export, + lrp->args.lr_seg.layout_type); + if (status) + goto out; + + status = nfserr_inval; + if (lrp->args.lr_return_type != RETURN_FILE && + lrp->args.lr_return_type != RETURN_FSID && + lrp->args.lr_return_type != RETURN_ALL) { + dprintk("pNFS %s: invalid return_type %d\n", __func__, + lrp->args.lr_return_type); + goto out; + } + + status = nfserr_inval; + if (lrp->args.lr_seg.iomode != IOMODE_READ && + lrp->args.lr_seg.iomode != IOMODE_RW && + lrp->args.lr_seg.iomode != IOMODE_ANY) { + dprintk("pNFS %s: invalid iomode %d\n", __func__, + lrp->args.lr_seg.iomode); + goto out; + } + + /* Set clientid from sessionid */ + copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); + lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); + status = nfs4_pnfs_return_layout(sb, current_fh, lrp); +out: + dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", + __func__, status, lrp->args.lr_return_type, lrp->lrs_present); + return status; +} +#endif /* CONFIG_PNFSD */ + /* * NULL call. 
*/ @@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_RECLAIM_COMPLETE", }, +#if defined(CONFIG_PNFSD) + [OP_GETDEVICELIST] = { + .op_func = (nfsd4op_func)nfsd4_getdevlist, + .op_name = "OP_GETDEVICELIST", + }, + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdevinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_name = "OP_LAYOUTGET", + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_name = "OP_LAYOUTCOMMIT", + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_name = "OP_LAYOUTRETURN", + }, +#endif /* CONFIG_PNFSD */ }; static const char *nfsd4_op_name(unsigned opnum) diff -up linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4state.c --- linux-2.6.34.noarch/fs/nfsd/nfs4state.c.orig 2010-09-30 10:15:18.345729000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4state.c 2010-09-30 10:17:08.887003000 -0400 @@ -42,6 +42,8 @@ #include "xdr4.h" #include "vfs.h" +#include "pnfsd.h" + #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ @@ -60,8 +62,6 @@ static u64 current_sessionid = 1; #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); -static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); @@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir /* Currently used for almost all code touching nfsv4 state: */ static DEFINE_MUTEX(client_mutex); +struct task_struct *client_mutex_owner; /* * Currently used for the del_recall_lru and file hash table. In an @@ -86,11 +87,21 @@ void nfs4_lock_state(void) { mutex_lock(&client_mutex); + client_mutex_owner = current; +} + +#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) + +void +nfs4_bug_on_unlocked_state(void) +{ + BUG_ON(client_mutex_owner != current); } void nfs4_unlock_state(void) { + client_mutex_owner = NULL; mutex_unlock(&client_mutex); } @@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt static struct list_head del_recall_lru; -static inline void +inline void put_nfs4_file(struct nfs4_file *fi) { if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { @@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) } } -static inline void +inline void get_nfs4_file(struct nfs4_file *fi) { atomic_inc(&fi->fi_ref); @@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega * but we want to remove the lease in any case. */ if (dp->dl_flock) vfs_setlease(filp, F_UNLCK, &dp->dl_flock); + BUG_ON_UNLOCKED_STATE(); + nfs4_unlock_state(); /* allow nested layout recall/return */ nfsd_close(filp); + nfs4_lock_state(); } /* Called under the state lock. */ @@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing * - * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed - * setclientid_confirmed info. + * conf_id_hashtbl[], and conf_str_hashtbl[] hold + * confirmed setclientid_confirmed info. * * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed * setclientid info. 
@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc list_del(&stp->st_hash); list_del(&stp->st_perfile); list_del(&stp->st_perstateowner); + release_pnfs_ds_dev_list(stp); } static void free_generic_stateid(struct nfs4_stateid *stp) @@ -345,7 +360,10 @@ static void release_open_stateid(struct { unhash_generic_stateid(stp); release_stateid_lockowners(stp); + BUG_ON_UNLOCKED_STATE(); + nfs4_unlock_state(); /* allow nested layout recall/return */ nfsd_close(stp->st_vfs_file); + nfs4_lock_state(); free_generic_stateid(stp); } @@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) struct nfs4_delegation *dp; struct list_head reaplist; + BUG_ON_UNLOCKED_STATE(); + INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); while (!list_empty(&clp->cl_delegations)) { @@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); release_openowner(sop); } + pnfs_expire_client(clp); nfsd4_set_callback_client(clp, NULL); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) spin_unlock(&client_lock); } +void expire_client_lock(struct nfs4_client *clp) +{ + nfs4_lock_state(); + expire_client(clp); + nfs4_unlock_state(); +} + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) { memcpy(target->cl_verifier.data, source->data, @@ -859,6 +887,11 @@ static struct nfs4_client *create_client INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); +#if defined(CONFIG_PNFSD) + INIT_LIST_HEAD(&clp->cl_layouts); + INIT_LIST_HEAD(&clp->cl_layoutrecalls); + atomic_set(&clp->cl_deviceref, 0); +#endif /* CONFIG_PNFSD */ INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); clp->cl_time = get_seconds(); @@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl renew_client(clp); } -static struct nfs4_client * +struct nfs4_client * find_confirmed_client(clientid_t *clid) { struct nfs4_client *clp; @@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha return NULL; } +int +filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), + void *arg) +{ + struct nfs4_client *clp, *next; + int i, status = 0; + + for (i = 0; i < CLIENT_HASH_SIZE; i++) + list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], + cl_strhash) { + status = func(clp, arg); + if (status) + break; + } + + return status; +} + static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) { @@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { - /* pNFS is not supported */ +#if defined(CONFIG_PNFSD) + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | + EXCHGID4_FLAG_USE_PNFS_DS; +#else /* CONFIG_PNFSD */ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif /* CONFIG_PNFSD */ /* Referrals are supported, Migration is not. 
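* With CONFIG_PNFSD the server advertises both the pNFS MDS and DS
* roles at EXCHANGE_ID time; CREATE_SESSION later strips the back
* channel from DS-only sessions (see is_ds_only_session()).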
*/
 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq
 struct nfsd4_clid_slot *cs_slot = NULL;
 int status = 0;
+#if defined(CONFIG_PNFSD_LOCAL_EXPORT)
+ /* XXX hack to get local ip address */
+ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local,
+ sizeof(pnfsd_lexp_addr));
+ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen;
+#endif /* CONFIG_PNFSD_LOCAL_EXPORT */
+
 nfs4_lock_state();
 unconf = find_unconfirmed_client(&cr_ses->clientid);
 conf = find_confirmed_client(&cr_ses->clientid);
@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq
 cs_slot->sl_seqid++; /* from 0 to 1 */
 move_to_confirmed(unconf);
- if (cr_ses->flags & SESSION4_BACK_CHAN) {
- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
- svc_xprt_get(rqstp->rq_xprt);
- rpc_copy_addr(
- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
- sa);
- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
- unconf->cl_cb_conn.cb_minorversion =
- cstate->minorversion;
- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
- unconf->cl_cb_seq_nr = 1;
- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn);
- }
+ if (is_ds_only_session(unconf->cl_exchange_flags))
+ cr_ses->flags &= ~SESSION4_BACK_CHAN;
+
+ conf = unconf;
 } else {
 status = nfserr_stale_clientid;
 goto out;
 }
+
+ if (cr_ses->flags & SESSION4_BACK_CHAN) {
+ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt;
+ svc_xprt_get(rqstp->rq_xprt);
+ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa);
+ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
+ conf->cl_cb_conn.cb_minorversion = cstate->minorversion;
+ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
+ conf->cl_cb_seq_nr = 1;
+ nfsd4_probe_callback(conf, &conf->cl_cb_conn);
+ }
+
 /*
 * We do not support RDMA or persistent sessions
 */
@@ -1746,7 +1809,7 @@ out:
 /* OPEN Share state helper functions */
 static inline struct nfs4_file *
-alloc_init_file(struct inode *ino)
+alloc_init_file(struct inode *ino, struct svc_fh *current_fh)
 {
 struct nfs4_file *fp;
 unsigned int hashval = file_hashval(ino);
@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino)
 fp->fi_inode = igrab(ino);
 fp->fi_id = current_fileid++;
 fp->fi_had_conflict = false;
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&fp->fi_layouts);
+ INIT_LIST_HEAD(&fp->fi_layout_states);
+ fp->fi_fsid.major = current_fh->fh_export->ex_fsid;
+ fp->fi_fsid.minor = 0;
+ fp->fi_fhlen = current_fh->fh_handle.fh_size;
+ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval));
+ memcpy(fp->fi_fhval, &current_fh->fh_handle.fh_base,
+ fp->fi_fhlen);
+#endif /* CONFIG_PNFSD */
 spin_lock(&recall_lock);
 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 spin_unlock(&recall_lock);
@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino)
 return NULL;
 }
-static void
+void
 nfsd4_free_slab(struct kmem_cache **slab)
 {
 if (*slab == NULL)
@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void)
 nfsd4_free_slab(&file_slab);
 nfsd4_free_slab(&stateid_slab);
 nfsd4_free_slab(&deleg_slab);
+ nfsd4_free_pnfs_slabs();
 }
 static int
@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void)
 sizeof(struct nfs4_delegation), 0, 0, NULL);
 if (deleg_slab == NULL)
 goto out_nomem;
+ if (nfsd4_init_pnfs_slabs())
+ goto out_nomem;
 return 0;
 out_nomem:
 nfsd4_free_slabs();
@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s
 INIT_LIST_HEAD(&stp->st_perstateowner);
 INIT_LIST_HEAD(&stp->st_lockowners);
 INIT_LIST_HEAD(&stp->st_perfile);
+#if defined(CONFIG_PNFSD)
+ INIT_LIST_HEAD(&stp->st_pnfs_ds_id);
+#endif /* CONFIG_PNFSD */
 list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
list_add(&stp->st_perstateowner, &sop->so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); @@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has { struct nfs4_stateowner *so = NULL; + BUG_ON_UNLOCKED_STATE(); list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { if (same_owner_str(so, &open->op_owner, &open->op_clientid)) return so; @@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has } /* search file_hashtbl[] for file */ -static struct nfs4_file * +struct nfs4_file * find_file(struct inode *ino) { unsigned int hashval = file_hashval(ino); @@ -1945,6 +2025,18 @@ find_file(struct inode *ino) return NULL; } +struct nfs4_file * +find_alloc_file(struct inode *ino, struct svc_fh *current_fh) +{ + struct nfs4_file *fp; + + fp = find_file(ino); + if (fp) + return fp; + + return alloc_init_file(ino, current_fh); +} + static inline int access_valid(u32 x, u32 minorversion) { if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) @@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) goto out; status = nfserr_resource; - fp = alloc_init_file(ino); + fp = alloc_init_file(ino, current_fh); if (fp == NULL) goto out; } @@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; } -static int +int STALE_STATEID(stateid_t *stateid) { if (stateid->si_boot == boot_time) @@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) return 1; } +__be32 +nfs4_check_stateid(stateid_t *stateid) +{ + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + return nfserr_bad_stateid; + if (STALE_STATEID(stateid)) + return nfserr_stale_stateid; + return 0; +} + static inline int access_permit_read(unsigned long access_bmap) { @@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ if (grace_disallows_io(ino)) return nfserr_grace; +#if defined(CONFIG_PNFSD) + if (pnfs_fh_is_ds(¤t_fh->fh_handle)) { + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + status = nfserr_bad_stateid; + else +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + { + dprintk("%s Don't check DS stateid\n", __func__); + return 0; + } +#else /* CONFIG_GFS2_FS_LOCKING_DLM */ + status = nfs4_preprocess_pnfs_ds_stateid(current_fh, + stateid); +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + goto out; + } +#endif /* CONFIG_PNFSD */ + if (nfsd4_has_session(cstate)) flags |= HAS_SESSION; @@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co *stpp = NULL; *sopp = NULL; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { - dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); - return nfserr_bad_stateid; - } - - if (STALE_STATEID(stateid)) - return nfserr_stale_stateid; + status = nfs4_check_stateid(stateid); + if (status) + return status; if (nfsd4_has_session(cstate)) flags |= HAS_SESSION; @@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp if (nfsd4_has_session(cstate)) flags |= HAS_SESSION; nfs4_lock_state(); - status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - goto out; - status = nfserr_stale_stateid; - if (STALE_STATEID(stateid)) + status = nfs4_check_stateid(stateid); + if (status) goto out; status = nfserr_bad_stateid; if (!is_delegation_stateid(stateid)) @@ -3238,26 +3351,6 @@ out: #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? 
end: NFS4_MAX_UINT64; -} - -/* last octet in a range */ -static inline u64 -last_byte_offset(u64 start, u64 len) -{ - u64 end; - - BUG_ON(!len); - end = start + len; - return end > start ? end - 1: NFS4_MAX_UINT64; -} - #define lockownerid_hashval(id) \ ((id) & LOCK_HASH_MASK) @@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; -static struct nfs4_stateid * +struct nfs4_stateid * find_stateid(stateid_t *stid, int flags) { struct nfs4_stateid *local; @@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) return NULL; } -static struct nfs4_delegation * +struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid) { struct nfs4_file *fp; @@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat INIT_LIST_HEAD(&stp->st_perfile); INIT_LIST_HEAD(&stp->st_perstateowner); INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ +#if defined(CONFIG_PNFSD) + INIT_LIST_HEAD(&stp->st_pnfs_ds_id); +#endif /* CONFIG_PNFSD */ list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); list_add(&stp->st_perfile, &fp->fi_stateids); list_add(&stp->st_perstateowner, &sop->so_stateids); @@ -3998,6 +4094,9 @@ nfs4_state_init(void) INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); reclaim_str_hashtbl_size = 0; +#if defined(CONFIG_PNFSD) + nfs4_pnfs_state_init(); +#endif /* CONFIG_PNFSD */ return 0; } @@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) } nfsd4_shutdown_recdir(); + nfs4_pnfs_state_shutdown(); nfs4_init = 0; } diff -up linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c --- linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c.orig 2010-09-30 10:15:18.353734000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfs4xdr.c 2010-09-30 10:17:08.894999000 -0400 @@ -47,9 +47,14 @@ #include #include #include +#include +#include +#include +#include #include "xdr4.h" #include "vfs.h" +#include "pnfsd.h" #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -1234,6 +1239,138 @@ nfsd4_decode_sequence(struct nfsd4_compo DECODE_TAIL; } +#if defined(CONFIG_PNFSD) +static __be32 +nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, + struct nfsd4_pnfs_getdevlist *gdevl) +{ + DECODE_HEAD; + + READ_BUF(16 + sizeof(nfs4_verifier)); + READ32(gdevl->gd_layout_type); + READ32(gdevl->gd_maxdevices); + READ64(gdevl->gd_cookie); + COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_pnfs_getdevinfo *gdev) +{ + u32 num; + DECODE_HEAD; + + READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); + READ64(gdev->gd_devid.sbid); + READ64(gdev->gd_devid.devid); + READ32(gdev->gd_layout_type); + READ32(gdev->gd_maxcount); + READ32(num); + if (num) { + READ_BUF(4); + READ32(gdev->gd_notify_types); + } else { + gdev->gd_notify_types = 0; + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_pnfs_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + READ32(lgp->lg_signal); + READ32(lgp->lg_seg.layout_type); + READ32(lgp->lg_seg.iomode); + READ64(lgp->lg_seg.offset); + READ64(lgp->lg_seg.length); + READ64(lgp->lg_minlength); + nfsd4_decode_stateid(argp, &lgp->lg_sid); + READ_BUF(4); + READ32(lgp->lg_maxcount); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_pnfs_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + 
READ64(lcp->args.lc_seg.offset); + READ64(lcp->args.lc_seg.length); + READ32(lcp->args.lc_reclaim); + nfsd4_decode_stateid(argp, &lcp->lc_sid); + READ_BUF(4); + READ32(lcp->args.lc_newoffset); + if (lcp->args.lc_newoffset) { + READ_BUF(8); + READ64(lcp->args.lc_last_wr); + } else + lcp->args.lc_last_wr = 0; + READ_BUF(4); + READ32(timechange); + if (timechange) { + READ_BUF(12); + READ64(lcp->args.lc_mtime.seconds); + READ32(lcp->args.lc_mtime.nseconds); + } else { + lcp->args.lc_mtime.seconds = 0; + lcp->args.lc_mtime.nseconds = 0; + } + READ_BUF(8); + READ32(lcp->args.lc_seg.layout_type); + /* XXX: saving XDR'ed layout update. Since we don't have the + * current_fh yet, and therefore no export_ops, we can't call + * the layout specific decode routines. File and pVFS2 + * do not use the layout update.... + */ + READ32(lcp->args.lc_up_len); + if (lcp->args.lc_up_len > 0) { + READ_BUF(lcp->args.lc_up_len); + READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_pnfs_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + READ32(lrp->args.lr_reclaim); + READ32(lrp->args.lr_seg.layout_type); + READ32(lrp->args.lr_seg.iomode); + READ32(lrp->args.lr_return_type); + if (lrp->args.lr_return_type == RETURN_FILE) { + READ_BUF(16); + READ64(lrp->args.lr_seg.offset); + READ64(lrp->args.lr_seg.length); + nfsd4_decode_stateid(argp, &lrp->lr_sid); + READ_BUF(4); + READ32(lrp->args.lrf_body_len); + if (lrp->args.lrf_body_len > 0) { + READ_BUF(lrp->args.lrf_body_len); + READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); + } + } + + DECODE_TAIL; +} +#endif /* CONFIG_PNFSD */ + static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { @@ -1335,11 +1472,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#if defined(CONFIG_PNFSD) + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else /* CONFIG_PNFSD */ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif /* CONFIG_PNFSD */ [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2136,6 +2281,36 @@ out_acl: } WRITE64(stat.ino); } +#if defined(CONFIG_PNFSD) + if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { + struct super_block *sb = dentry->d_inode->i_sb; + int type = 0; + + /* Query the filesystem for supported pNFS layout types. + * Currently, we only support one layout type per file system. + * The export_ops->layout_type() returns the pnfs_layouttype4. 
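+ * For instance, a DLM cluster file system exported through
+ * pnfs_dlm_export_ops reports LAYOUT_NFSV4_1_FILES and is encoded as a
+ * one-element list; file systems without a layout_type op encode an
+ * empty list.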
+ */ + buflen -= 4; + if (buflen < 0) /* length */ + goto out_resource; + + if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) + type = sb->s_pnfs_op->layout_type(sb); + if (type) { + if ((buflen -= 4) < 0) /* type */ + goto out_resource; + WRITE32(1); /* length */ + WRITE32(type); /* type */ + } else + WRITE32(0); /* length */ + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + if ((buflen -= 4) < 0) + goto out_resource; + WRITE32(stat.blksize); + } +#endif /* CONFIG_PNFSD */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { WRITE32(3); WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); @@ -2366,6 +2541,10 @@ nfsd4_encode_commit(struct nfsd4_compoun if (!nfserr) { RESERVE_SPACE(8); WRITEMEM(commit->co_verf.data, 8); + dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", + ((u32 *)(&commit->co_verf.data))[0], + ((u32 *)(&commit->co_verf.data))[1]); + ADJUST_ARGS(); } return nfserr; @@ -2620,9 +2799,20 @@ nfsd4_encode_read(struct nfsd4_compoundr } read->rd_vlen = v; +#if defined(CONFIG_SPNFS) + if (spnfs_enabled()) + nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode, + read->rd_offset, &maxcount, read->rd_vlen, + resp->rqstp); + else /* we're not an MDS */ + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); +#else nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); +#endif /* CONFIG_SPNFS */ if (nfserr == nfserr_symlink) nfserr = nfserr_inval; @@ -2926,6 +3116,9 @@ nfsd4_encode_write(struct nfsd4_compound WRITE32(write->wr_bytes_written); WRITE32(write->wr_how_written); WRITEMEM(write->wr_verifier.data, 8); + dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", + ((u32 *)(&write->wr_verifier.data))[0], + ((u32 *)(&write->wr_verifier.data))[1]); ADJUST_ARGS(); } return nfserr; @@ -3069,6 +3262,343 @@ nfsd4_encode_sequence(struct nfsd4_compo return 0; } +#if defined(CONFIG_PNFSD) + +/* Uses the export interface to iterate through the available devices + * and encodes them on the response stream. + */ +static __be32 +nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, + struct nfsd4_pnfs_getdevlist *gdevl, + unsigned int *dev_count) +{ + struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; + __be32 nfserr; + int status; + __be32 *p; + struct nfsd4_pnfs_dev_iter_res res = { + .gd_cookie = gdevl->gd_cookie, + .gd_verf = gdevl->gd_verf, + .gd_eof = 0 + }; + u64 sbid; + + dprintk("%s: Begin\n", __func__); + + sbid = find_create_sbid(sb); + *dev_count = 0; + do { + status = sb->s_pnfs_op->get_device_iter(sb, + gdevl->gd_layout_type, + &res); + if (status) { + if (status == -ENOENT) { + res.gd_eof = 1; + /* return success */ + break; + } + nfserr = nfserrno(status); + goto out_err; + } + + /* Encode device id and layout type */ + RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); + WRITE64((__be64)sbid); + WRITE64(res.gd_devid); /* devid minor */ + ADJUST_ARGS(); + (*dev_count)++; + } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); + gdevl->gd_cookie = res.gd_cookie; + gdevl->gd_verf = res.gd_verf; + gdevl->gd_eof = res.gd_eof; + nfserr = nfs_ok; +out_err: + dprintk("%s: Encoded %u devices\n", __func__, *dev_count); + return nfserr; +} + +/* Encodes the response of get device list. 
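+ * The reply body is laid out as: cookie (8 bytes), verifier (8 bytes),
+ * device count (4 bytes), count 16-byte deviceids, then the eof flag;
+ * cookie, verifier, and count are backfilled after iteration completes.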
+*/ +static __be32 +nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_pnfs_getdevlist *gdevl) +{ + unsigned int dev_count = 0, lead_count; + u32 *p_in = resp->p; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + return nfserr; + + /* Ensure we have room for cookie, verifier, and devlist len, + * which we will backfill in after we encode as many devices as possible + */ + lead_count = 8 + sizeof(nfs4_verifier) + 4; + RESERVE_SPACE(lead_count); + /* skip past these values */ + p += XDR_QUADLEN(lead_count); + ADJUST_ARGS(); + + /* Iterate over as many device ids as possible on the xdr stream */ + nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); + if (nfserr) + goto out_err; + + /* Backfill in cookie, verf and number of devices encoded */ + p = p_in; + WRITE64(gdevl->gd_cookie); + WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); + WRITE32(dev_count); + + /* Skip over devices */ + p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); + ADJUST_ARGS(); + + /* are we at the end of devices? */ + RESERVE_SPACE(4); + WRITE32(gdevl->gd_eof); + ADJUST_ARGS(); + + dprintk("%s: done.\n", __func__); + + nfserr = nfs_ok; +out: + return nfserr; +out_err: + p = p_in; + ADJUST_ARGS(); + goto out; +} + +/* For a given device id, have the file system retrieve and encode the + * associated device. For file layout, the encoding function is + * passed down to the file system. The file system then has the option + * of using this encoding function or one of its own. + * + * Note: the file system must return the XDR size of struct device_addr4 + * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the + * gdir_mincount calculation. + */ +static __be32 +nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_pnfs_getdevinfo *gdev) +{ + struct super_block *sb; + int maxcount = 0, type_notify_len = 12; + __be32 *p, *p_save = NULL, *p_in = resp->p; + struct exp_xdr_stream xdr; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + return nfserr; + + sb = gdev->gd_sb; + + if (gdev->gd_maxcount != 0) { + /* FIXME: this will be bound by the session max response */ + maxcount = svc_max_payload(resp->rqstp); + if (maxcount > gdev->gd_maxcount) + maxcount = gdev->gd_maxcount; + + /* Ensure have room for type and notify field */ + maxcount -= type_notify_len; + if (maxcount < 0) { + nfserr = -ETOOSMALL; + goto toosmall; + } + } + + RESERVE_SPACE(4); + WRITE32(gdev->gd_layout_type); + ADJUST_ARGS(); + + /* If maxcount is 0 then just update notifications */ + if (gdev->gd_maxcount == 0) + goto handle_notifications; + + xdr.p = p_save = resp->p; + xdr.end = resp->end; + if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) + xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); + + nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, + &gdev->gd_devid); + if (nfserr) + goto err; + + /* The file system should never write 0 bytes without + * returning an error + */ + BUG_ON(xdr.p == p_save); + BUG_ON(xdr.p > xdr.end); + + /* Update the xdr stream with the number of bytes encoded + * by the file system. 
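+	 * (the export's get_device_info() advanced xdr.p itself, so the
+	 * ADJUST_ARGS() below just catches resp->p up with it)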
+ */ + p = xdr.p; + ADJUST_ARGS(); + +handle_notifications: + /* Encode supported device notifications */ + RESERVE_SPACE(4); + if (sb->s_pnfs_op->set_device_notify) { + struct pnfs_devnotify_arg dn_args; + + dn_args.dn_layout_type = gdev->gd_layout_type; + dn_args.dn_devid = gdev->gd_devid; + dn_args.dn_notify_types = gdev->gd_notify_types; + nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); + if (nfserr) + goto err; + WRITE32(dn_args.dn_notify_types); + } else { + WRITE32(0); + } + ADJUST_ARGS(); + +out: + return nfserrno(nfserr); +toosmall: + dprintk("%s: maxcount too small\n", __func__); + RESERVE_SPACE(4); + WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); + ADJUST_ARGS(); + goto out; +err: + /* Rewind to the beginning */ + p = p_in; + ADJUST_ARGS(); + if (nfserr == -ETOOSMALL) + goto toosmall; + printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, + __be32 nfserr, + struct nfsd4_pnfs_layoutget *lgp) +{ + int maxcount, leadcount; + struct super_block *sb; + struct exp_xdr_stream xdr; + __be32 *p, *p_save, *p_start = resp->p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + return nfserr; + + sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; + maxcount = PAGE_SIZE; + if (maxcount > lgp->lg_maxcount) + maxcount = lgp->lg_maxcount; + + /* Check for space on xdr stream */ + leadcount = 36 + sizeof(stateid_opaque_t); + RESERVE_SPACE(leadcount); + /* encode layout metadata after file system encodes layout */ + p += XDR_QUADLEN(leadcount); + ADJUST_ARGS(); + + /* Ensure have room for ret_on_close, off, len, iomode, type */ + maxcount -= leadcount; + if (maxcount < 0) { + printk(KERN_ERR "%s: buffer too small\n", __func__); + nfserr = nfserr_toosmall; + goto err; + } + + /* Set xdr info so file system can encode layout */ + xdr.p = p_save = resp->p; + xdr.end = resp->end; + if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) + xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); + + /* Retrieve, encode, and merge layout; process stateid */ + nfserr = nfs4_pnfs_get_layout(lgp, &xdr); + if (nfserr) + goto err; + + /* Ensure file system returned enough bytes for the client + * to access. 
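+	 * A segment shorter than lg_minlength is useless to the client,
+	 * so it is rejected as NFS4ERR_BADLAYOUT rather than returned
+	 * truncated.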
+ */ + if (lgp->lg_seg.length < lgp->lg_minlength) { + nfserr = nfserr_badlayout; + goto err; + } + + /* The file system should never write 0 bytes without + * returning an error + */ + BUG_ON(xdr.p == p_save); + + /* Rewind to beginning and encode attrs */ + resp->p = p_start; + RESERVE_SPACE(4); + WRITE32(lgp->lg_roc); /* return on close */ + ADJUST_ARGS(); + nfsd4_encode_stateid(resp, &lgp->lg_sid); + RESERVE_SPACE(28); + /* Note: response logr_layout array count, always one for now */ + WRITE32(1); + WRITE64(lgp->lg_seg.offset); + WRITE64(lgp->lg_seg.length); + WRITE32(lgp->lg_seg.iomode); + WRITE32(lgp->lg_seg.layout_type); + + /* Update the xdr stream with the number of bytes written + * by the file system + */ + p = xdr.p; + ADJUST_ARGS(); + + return nfs_ok; +err: + resp->p = p_start; + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_pnfs_layoutcommit *lcp) +{ + __be32 *p; + + if (nfserr) + goto out; + + RESERVE_SPACE(4); + WRITE32(lcp->res.lc_size_chg); + ADJUST_ARGS(); + if (lcp->res.lc_size_chg) { + RESERVE_SPACE(8); + WRITE64(lcp->res.lc_newsize); + ADJUST_ARGS(); + } +out: + return nfserr; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_pnfs_layoutreturn *lrp) +{ + __be32 *p; + + if (nfserr) + goto out; + + RESERVE_SPACE(4); + WRITE32(lrp->lrs_present != 0); /* got stateid? */ + ADJUST_ARGS(); + if (lrp->lrs_present) + nfsd4_encode_stateid(resp, &lrp->lr_sid); +out: + return nfserr; +} +#endif /* CONFIG_PNFSD */ + static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { @@ -3129,11 +3659,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#if defined(CONFIG_PNFSD) + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else /* CONFIG_PNFSD */ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif /* CONFIG_PNFSD */ [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff -up linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.34.noarch/fs/nfsd/nfsctl.c --- linux-2.6.34.noarch/fs/nfsd/nfsctl.c.orig 2010-09-30 10:15:18.364728000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsctl.c 2010-09-30 10:17:08.900002000 -0400 @@ -13,10 +13,15 @@ #include #include #include +#include #include "nfsd.h" #include "cache.h" +#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) +#include +#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ + /* * We have a single directory with 9 nodes in it. 
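+ * (CONFIG_PNFSD adds one more, "pnfs_dlm_device", below.)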
*/ @@ -49,6 +54,9 @@ enum { NFSD_Gracetime, NFSD_RecoveryDir, #endif +#ifdef CONFIG_PNFSD + NFSD_pnfs_dlm_device, +#endif }; /* @@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi static ssize_t write_gracetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif +#ifdef CONFIG_PNFSD +static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); +#endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Svc] = write_svc, @@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file [NFSD_Gracetime] = write_gracetime, [NFSD_RecoveryDir] = write_recoverydir, #endif +#ifdef CONFIG_PNFSD + [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, +#endif }; static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) @@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct #endif +#ifdef CONFIG_PNFSD + +static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, + size_t size) +{ + char *mesg = buf; + char *pnfs_dlm_device; + int max_size = NFSD_PNFS_DLM_DEVICE_MAX; + int len, ret = 0; + + if (size > 0) { + ret = -EINVAL; + if (size > max_size || buf[size-1] != '\n') + return ret; + buf[size-1] = 0; + + pnfs_dlm_device = mesg; + len = qword_get(&mesg, pnfs_dlm_device, size); + if (len <= 0) + return ret; + + ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); + } else + return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); + + return ret <= 0 ? ret : strlen(buf); +} + +/** + * write_pnfs_dlm_device - Set or report the current pNFS data server list + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing a block device name, + * a colon, and then a comma separated + * list of pNFS data server IPv4 addresses + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a block device name, a colon, and + * then a comma separated list of pNFS + * data server IPv4 addresses. + * return code is the size in bytes of the string + * On error: return code is a negative errno value + */ +static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) +{ + ssize_t rv; + + mutex_lock(&nfsd_mutex); + rv = __write_pnfs_dlm_device(file, buf, size); + mutex_unlock(&nfsd_mutex); + return rv; +} + +#endif /* CONFIG_PNFSD */ + /*----------------------------------------------------------------------------*/ /* * populating the filesystem. 
@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_
 		[NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
 #endif
+#ifdef CONFIG_PNFSD
+		[NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops,
+					  S_IWUSR|S_IRUSR},
+#endif
 		/* last one */ {""}
 	};
 	return simple_fill_super(sb, 0x6e667364, nfsd_files);
@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi
 }
 #endif
+#if defined(CONFIG_SPNFS_BLOCK)
+int nfsd_bl_init(void);
+#endif
 static int __init init_nfsd(void)
 {
 	int retval;
@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void)
 	retval = create_proc_exports_entry();
 	if (retval)
 		goto out_free_idmap;
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
+	retval = spnfs_init_proc();
+	if (retval != 0)
+		goto out_free_idmap;
+#if defined(CONFIG_SPNFS_BLOCK)
+	nfsd_bl_init();
+#endif /* CONFIG_SPNFS_BLOCK */
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
 	retval = register_filesystem(&nfsd_fs_type);
 	if (retval)
 		goto out_free_all;
@@ -1465,7 +1557,22 @@ out_free_stat:
 static void __exit exit_nfsd(void)
 {
+#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS)
+	/* remove children before their parent, under the same paths that
+	 * spnfs_init_proc() created them */
+#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS)
+	remove_proc_entry("fs/spnfs/layoutseg", NULL);
+	remove_proc_entry("fs/spnfs/layoutsegsize", NULL);
+#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */
+	remove_proc_entry("fs/spnfs/recall", NULL);
+	remove_proc_entry("fs/spnfs/getfh", NULL);
+	remove_proc_entry("fs/spnfs/config", NULL);
+	remove_proc_entry("fs/spnfs/ctl", NULL);
+	remove_proc_entry("fs/spnfs", NULL);
+#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */
+
 	nfsd_export_shutdown();
+	nfsd4_pnfs_dlm_shutdown();
 	nfsd_reply_cache_shutdown();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
diff -up linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig linux-2.6.34.noarch/fs/nfsd/nfsd.h
--- linux-2.6.34.noarch/fs/nfsd/nfsd.h.orig	2010-09-30 10:15:18.370728000 -0400
+++ linux-2.6.34.noarch/fs/nfsd/nfsd.h	2010-09-30 10:17:08.906000000 -0400
@@ -285,11 +285,17 @@ extern time_t nfsd4_grace;
 #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
 	NFSD4_SUPPORTED_ATTRS_WORD0
+#if defined(CONFIG_PNFSD)
+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+	(NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES)
+#else /* CONFIG_PNFSD */
 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
 	NFSD4_SUPPORTED_ATTRS_WORD1
+#endif /* CONFIG_PNFSD */
 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
-	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \
+	 FATTR4_WORD2_LAYOUT_BLKSIZE)
 static inline u32 nfsd_suppattrs0(u32 minorversion)
 {
diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.c
--- linux-2.6.34.noarch/fs/nfsd/nfsfh.c.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/fs/nfsd/nfsfh.c	2010-09-30 10:17:08.911003000 -0400
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include "nfsd.h"
 #include "vfs.h"
 #include "auth.h"
@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s
 static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 {
 	struct knfsd_fh	*fh = &fhp->fh_handle;
+	int fsid_type;
 	struct fid *fid = NULL, sfid;
 	struct svc_export *exp;
 	struct dentry *dentry;
@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct
 		return error;
 	if (fh->fh_auth_type != 0)
 		return error;
-	len = 
key_len(fh->fh_fsid_type) / 4; + fsid_type = pnfs_fh_fsid_type(fh); + len = key_len(fsid_type) / 4; if (len == 0) return error; if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { @@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct data_left -= len; if (data_left < 0) return error; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); + exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); fid = (struct fid *)(fh->fh_auth + len); } else { __u32 tfh[2]; diff -up linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.34.noarch/fs/nfsd/nfsfh.h --- linux-2.6.34.noarch/fs/nfsd/nfsfh.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfsfh.h 2010-09-30 10:17:08.917000000 -0400 @@ -14,6 +14,7 @@ enum nfsd_fsid { FSID_UUID8, FSID_UUID16, FSID_UUID16_INUM, + FSID_MAX }; enum fsid_source { @@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) } } +#if defined(CONFIG_PNFSD) + +/* + * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied + * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how + * to handle a given stateid. + */ +static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) +{ + return fh->fh_fsid_type >= FSID_MAX; +} + +static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) +{ + BUG_ON(fh->fh_version != 1); + BUG_ON(pnfs_fh_is_ds(fh)); + fh->fh_fsid_type += FSID_MAX; +} + +#else /* CONFIG_PNFSD */ + +static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) +{ + return 0; +} + +#endif /* CONFIG_PNFSD */ + +/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */ +static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) +{ + int fsid_type = fh->fh_fsid_type; + + if (pnfs_fh_is_ds(fh)) + return fsid_type - FSID_MAX; + return fsid_type; +} + #endif /* _LINUX_NFSD_FH_INT_H */ diff -up linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.34.noarch/fs/nfsd/nfssvc.c --- linux-2.6.34.noarch/fs/nfsd/nfssvc.c.orig 2010-09-30 10:15:05.063337000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/nfssvc.c 2010-09-30 10:17:08.922000000 -0400 @@ -115,7 +115,7 @@ struct svc_program nfsd_program = { }; -u32 nfsd_supported_minorversion; +u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; int nfsd_vers(int vers, enum vers_op change) { diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.34.noarch/fs/nfsd/pnfsd.h --- linux-2.6.34.noarch/fs/nfsd/pnfsd.h.orig 2010-09-30 10:17:08.924003000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd.h 2010-09-30 10:17:08.926004000 -0400 @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2005 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef LINUX_NFSD_PNFSD_H +#define LINUX_NFSD_PNFSD_H + +#include +#include + +#include "state.h" +#include "xdr4.h" + +/* outstanding layout stateid */ +struct nfs4_layout_state { + struct list_head ls_perfile; + struct list_head ls_layouts; /* list of nfs4_layouts */ + struct kref ls_ref; + struct nfs4_client *ls_client; + struct nfs4_file *ls_file; + stateid_t ls_stateid; +}; + +/* outstanding layout */ +struct nfs4_layout { + struct list_head lo_perfile; /* hash by f_id */ + struct list_head lo_perclnt; /* hash by clientid */ + struct list_head lo_perstate; + struct nfs4_file *lo_file; /* backpointer */ + struct nfs4_client *lo_client; + struct nfs4_layout_state *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +struct pnfs_inval_state { + struct knfsd_fh mdsfh; /* needed only by invalidate all */ + stateid_t stid; + clientid_t clid; + u32 status; +}; + +/* pNFS Data Server state */ +#define DS_STATEID_VALID 0 +#define DS_STATEID_ERROR 1 +#define DS_STATEID_NEW 2 + +struct pnfs_ds_stateid { + struct list_head ds_hash; /* ds_stateid hash entry */ + struct list_head ds_perclid; /* per client hash entry */ + stateid_t ds_stid; + struct knfsd_fh ds_fh; + unsigned long ds_access; + u32 ds_status; /* from MDS */ + u32 ds_verifier[2]; /* from MDS */ + wait_queue_head_t ds_waitq; + unsigned long ds_flags; + struct kref ds_ref; + clientid_t ds_mdsclid; +}; + +struct pnfs_ds_clientid { + struct list_head dc_hash; /* mds_clid_hashtbl entry */ + struct list_head dc_stateid; /* ds_stateid head */ + struct list_head dc_permdsid; /* per mdsid hash entry */ + clientid_t dc_mdsclid; + struct kref dc_ref; + uint32_t dc_mdsid; +}; + +struct pnfs_mds_id { + struct list_head di_hash; /* mds_nodeid list entry */ + struct list_head di_mdsclid; /* mds_clientid head */ + uint32_t di_mdsid; + time_t di_mdsboot; /* mds boot time */ + struct kref di_ref; +}; + +/* notify device request (from exported filesystem) */ +struct nfs4_notify_device { + struct nfsd4_pnfs_cb_dev_list *nd_list; + struct nfs4_client *nd_client; + struct list_head nd_perclnt; + + void *nd_args; /* nfsd internal */ +}; + +u64 find_create_sbid(struct super_block *); +struct super_block *find_sbid_id(u64); +__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); +int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, + struct nfsd4_pnfs_layoutreturn *); +int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); +int nfs4_pnfs_cb_change_state(struct pnfs_get_state *); +void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); +int put_layoutrecall(struct nfs4_layoutrecall *); +void nomatching_layout(struct nfs4_layoutrecall *); +void *layoutrecall_done(struct nfs4_layoutrecall *); +int nfsd4_cb_layout(struct nfs4_layoutrecall *); +int nfsd_layout_recall_cb(struct super_block *, struct inode *, + struct nfsd4_pnfs_cb_layout *); +int nfsd_device_notify_cb(struct super_block *, + struct nfsd4_pnfs_cb_dev_list *); +int 
nfsd4_cb_notify_device(struct nfs4_notify_device *); +void pnfs_set_device_notify(clientid_t *, unsigned int types); +void pnfs_clear_device_notify(struct nfs4_client *); + +#if defined(CONFIG_PNFSD_LOCAL_EXPORT) +extern struct sockaddr pnfsd_lexp_addr; +extern size_t pnfs_lexp_addr_len; + +extern void pnfsd_lexp_init(struct inode *); +#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ + +#endif /* LINUX_NFSD_PNFSD_H */ diff -up linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c --- linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-09-30 10:17:08.928999000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/pnfsd_lexp.c 2010-09-30 10:17:08.930006000 -0400 @@ -0,0 +1,225 @@ +/* + * linux/fs/nfsd/pnfs_lexp.c + * + * pNFS export of local filesystems. + * + * Export local file systems over the files layout type. + * The MDS (metadata server) functions also as a single DS (data server). + * This is mostly useful for development and debugging purposes. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) 2008 Benny Halevy, + * + * Initial implementation was based on the pnfs-gfs2 patches done + * by David M. Richter + */ + +#include +#include + +#include "pnfsd.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct sockaddr pnfsd_lexp_addr; +size_t pnfs_lexp_addr_len; + +static int +pnfsd_lexp_layout_type(struct super_block *sb) +{ + int ret = LAYOUT_NFSV4_1_FILES; + dprintk("<-- %s: return %d\n", __func__, ret); + return ret; +} + +static int +pnfsd_lexp_get_device_iter(struct super_block *sb, + u32 layout_type, + struct nfsd4_pnfs_dev_iter_res *res) +{ + dprintk("--> %s: sb=%p\n", __func__, sb); + + BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); + + res->gd_eof = 1; + if (res->gd_cookie) + return -ENOENT; + res->gd_cookie = 1; + res->gd_verf = 1; + res->gd_devid = 1; + + dprintk("<-- %s: return 0\n", __func__); + return 0; +} + +static int +pnfsd_lexp_get_device_info(struct super_block *sb, + struct exp_xdr_stream *xdr, + u32 layout_type, + const struct nfsd4_pnfs_deviceid *devid) +{ + int err; + struct pnfs_filelayout_device fdev; + struct pnfs_filelayout_multipath fl_devices[1]; + u32 fl_stripe_indices[1] = { 0 }; + struct pnfs_filelayout_devaddr daddr; + /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ + char daddr_buf[8*4 + 2*3 + 10]; + + dprintk("--> %s: sb=%p\n", __func__, sb); + + BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); + + memset(&fdev, '\0', sizeof(fdev)); + + if (devid->devid != 1) { + printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " + "(got: 0x%llx)\n", __func__, devid->devid); + err = -EINVAL; + goto out; + } + + /* count the number of comma-delimited DS IPs */ + fdev.fl_device_length = 1; + fdev.fl_device_list = fl_devices; + + fdev.fl_stripeindices_length = fdev.fl_device_length; + fdev.fl_stripeindices_list = fl_stripe_indices; + + daddr.r_addr.data = daddr_buf; + daddr.r_addr.len = sizeof(daddr_buf); + err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); + if (err < 0) + goto out; + daddr.r_addr.len = err; + switch (pnfsd_lexp_addr.sa_family) { + case AF_INET: + daddr.r_netid.data = "tcp"; + daddr.r_netid.len = 3; + break; + case AF_INET6: + daddr.r_netid.data = "tcp6"; + daddr.r_netid.len = 4; + break; + default: + BUG(); + } + fdev.fl_device_list[0].fl_multipath_length = 1; + 
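	/* the sole multipath entry: the MDS itself, which doubles as
+	 * the only data server, reached at pnfsd_lexp_addr */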
+	fdev.fl_device_list[0].fl_multipath_list = &daddr;
+
+	/* have nfsd encode the device info */
+	err = filelayout_encode_devinfo(xdr, &fdev);
+out:
+	dprintk("<-- %s: return %d\n", __func__, err);
+	return err;
+}
+
+static int get_stripe_unit(int blocksize)
+{
+	if (blocksize < NFSSVC_MAXBLKSIZE)
+		blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
+	dprintk("%s: return %d\n", __func__, blocksize);
+	return blocksize;
+}
+
+static enum nfsstat4
+pnfsd_lexp_layout_get(struct inode *inode,
+		      struct exp_xdr_stream *xdr,
+		      const struct nfsd4_pnfs_layoutget_arg *arg,
+		      struct nfsd4_pnfs_layoutget_res *res)
+{
+	enum nfsstat4 rc = NFS4_OK;
+	struct pnfs_filelayout_layout *layout = NULL;
+	struct knfsd_fh *fhp = NULL;
+
+	dprintk("--> %s: inode=%p\n", __func__, inode);
+
+	res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
+	res->lg_seg.offset = 0;
+	res->lg_seg.length = NFS4_MAX_UINT64;
+
+	layout = kzalloc(sizeof(*layout), GFP_KERNEL);
+	if (layout == NULL) {
+		/* must return an nfsstat4 here, not a negative errno */
+		rc = NFS4ERR_LAYOUTTRYLATER;
+		goto error;
+	}
+
+	/* Set file layout response args */
+	layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
+	layout->lg_stripe_type = STRIPE_SPARSE;
+	layout->lg_commit_through_mds = true;
+	layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
+	layout->lg_fh_length = 1;
+	layout->device_id.sbid = arg->lg_sbid;
+	layout->device_id.devid = 1;		/*FSFTEMP*/
+	layout->lg_first_stripe_index = 0;	/*FSFTEMP*/
+	layout->lg_pattern_offset = 0;
+
+	fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
+	if (fhp == NULL) {
+		rc = NFS4ERR_LAYOUTTRYLATER;
+		goto error;
+	}
+
+	memcpy(fhp, arg->lg_fh, sizeof(*fhp));
+	pnfs_fh_mark_ds(fhp);
+	layout->lg_fh_list = fhp;
+
+	/* Call nfsd to encode layout */
+	rc = filelayout_encode_layout(xdr, layout);
+exit:
+	kfree(layout);
+	kfree(fhp);
+	dprintk("<-- %s: return %d\n", __func__, rc);
+	return rc;
+
+error:
+	res->lg_seg.length = 0;
+	goto exit;
+}
+
+static int
+pnfsd_lexp_layout_commit(struct inode *inode,
+			 const struct nfsd4_pnfs_layoutcommit_arg *args,
+			 struct nfsd4_pnfs_layoutcommit_res *res)
+{
+	dprintk("%s: (unimplemented)\n", __func__);
+
+	return 0;
+}
+
+static int
+pnfsd_lexp_layout_return(struct inode *inode,
+			 const struct nfsd4_pnfs_layoutreturn_arg *args)
+{
+	dprintk("%s: (unimplemented)\n", __func__);
+
+	return 0;
+}
+
+static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh,
+				struct pnfs_get_state *p)
+{
+	return 0;	/* just use the current stateid */
+}
+
+static struct pnfs_export_operations pnfsd_lexp_ops = {
+	.layout_type = pnfsd_lexp_layout_type,
+	.get_device_info = pnfsd_lexp_get_device_info,
+	.get_device_iter = pnfsd_lexp_get_device_iter,
+	.layout_get = pnfsd_lexp_layout_get,
+	.layout_commit = pnfsd_lexp_layout_commit,
+	.layout_return = pnfsd_lexp_layout_return,
+	.get_state = pnfsd_lexp_get_state,
+};
+
+void
+pnfsd_lexp_init(struct inode *inode)
+{
+	dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops);
+	inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops;
+}
diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_com.c
--- linux-2.6.34.noarch/fs/nfsd/spnfs_com.c.orig	2010-09-30 10:17:08.933003000 -0400
+++ linux-2.6.34.noarch/fs/nfsd/spnfs_com.c	2010-09-30 10:17:08.935000000 -0400
@@ -0,0 +1,535 @@
+/*
+ * fs/nfsd/spnfs_com.c
+ *
+ * Communication layer between spNFS kernel and userspace
+ * Based heavily on idmap.c
+ *
+ */
+
+/*
+ * Copyright (c) 2002 The Regents of the University of Michigan.
+ * All rights reserved.
+ * + * Marius Aamodt Eriksen + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include +#include + +#include + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, + char __user *, size_t); +static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, + size_t); +static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); + +static struct rpc_pipe_ops spnfs_upcall_ops = { + .upcall = spnfs_pipe_upcall, + .downcall = spnfs_pipe_downcall, + .destroy_msg = spnfs_pipe_destroy_msg, +}; + +/* evil global variable */ +struct spnfs *global_spnfs; +struct spnfs_config *spnfs_config; +#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS +int spnfs_use_layoutsegments; +uint64_t layoutsegment_size; +#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ + +/* + * Used by spnfs_enabled() + * Tracks if the subsystem has been initialized at some point. It doesn't + * matter if it's not currently initialized. 
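+ * In other words, the flag is sticky: once spnfsd has opened the
+ * pipe, spnfs_enabled() keeps returning true even if spnfsd later
+ * exits.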
+ */
+static int spnfs_enabled_at_some_point;
+
+/* call this to start the ball rolling */
+/* code it like we're going to avoid the global variable in the future */
+int
+nfsd_spnfs_new(void)
+{
+	struct spnfs *spnfs = NULL;
+	struct path path;
+	struct nameidata nd;
+	int rc;
+
+	if (global_spnfs != NULL)
+		return -EEXIST;
+
+	path.mnt = rpc_get_mount();
+	if (IS_ERR(path.mnt))
+		return PTR_ERR(path.mnt);
+
+	/* FIXME: do not abuse rpc_pipefs/nfs */
+	rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd);
+	if (rc)
+		goto err;
+
+	spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL);
+	if (spnfs == NULL) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs,
+					 &spnfs_upcall_ops, 0);
+	if (IS_ERR(spnfs->spnfs_dentry)) {
+		rc = -EPIPE;
+		goto err;
+	}
+
+	mutex_init(&spnfs->spnfs_lock);
+	mutex_init(&spnfs->spnfs_plock);
+	init_waitqueue_head(&spnfs->spnfs_wq);
+
+	global_spnfs = spnfs;
+	spnfs_enabled_at_some_point = 1;
+
+	return 0;
+err:
+	rpc_put_mount();
+	kfree(spnfs);
+	return rc;
+}
+
+/* again, code it like we're going to remove the global variable */
+void
+nfsd_spnfs_delete(void)
+{
+	struct spnfs *spnfs = global_spnfs;
+
+	if (!spnfs)
+		return;
+	rpc_unlink(spnfs->spnfs_dentry);
+	rpc_put_mount();
+	global_spnfs = NULL;
+	kfree(spnfs);
+}
+
+/* RPC pipefs upcall/downcall routines */
+/* looks like this code is invoked by the rpc_pipe code */
+/* to handle upcalls on things we've queued elsewhere */
+/* See nfs_idmap_id for an example of enqueueing */
+static ssize_t
+spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg,
+		  char __user *dst, size_t buflen)
+{
+	char *data = (char *)msg->data + msg->copied;
+	ssize_t mlen = msg->len - msg->copied;
+	ssize_t left;
+
+	if (mlen > buflen)
+		mlen = buflen;
+
+	/* copy_to_user() returns the number of bytes it could NOT copy;
+	 * it never returns a negative value, so report a short copy by
+	 * trimming mlen rather than testing for an impossible error */
+	left = copy_to_user(dst, data, mlen);
+	mlen -= left;
+	msg->copied += mlen;
+	msg->errno = 0;
+	return mlen;
+}
+
+static ssize_t
+spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
+{
+	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
+	struct spnfs *spnfs = (struct spnfs *)rpci->private;
+	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
+	int ret;
+
+	if (mlen != sizeof(struct spnfs_msg))
+		return -ENOSPC;
+
+	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+	if (im_in == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(im_in, src, mlen) != 0) {
+		kfree(im_in);	/* don't leak the message on a fault */
+		return -EFAULT;
+	}
+
+	mutex_lock(&spnfs->spnfs_plock);
+
+	ret = mlen;
+	im->im_status = im_in->im_status;
+	/* If we got an error, terminate now, and wake up pending upcalls */
+	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
+		wake_up(&spnfs->spnfs_wq);
+		goto out;
+	}
+
+	ret = -EINVAL;
+	/* Did we match the current upcall? */
+	/* DMXXX: do not understand the comment above, from original code */
+	/* DMXXX: when do we _not_ match the current upcall? */
+	/* DMXXX: anyway, let's do a simplistic check */
+	if (im_in->im_type == im->im_type) {
+		/* copy the response into the spnfs struct */
+		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
+		ret = mlen;
+	} else
+		dprintk("spnfs: downcall type != upcall type\n");
+
+	wake_up(&spnfs->spnfs_wq);
+/* DMXXX handle rval processing */
+out:
+	mutex_unlock(&spnfs->spnfs_plock);
+	kfree(im_in);
+	return ret;
+}
+
+static void
+spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
+{
+	struct spnfs_msg *im = msg->data;
+	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
+
+	if (msg->errno >= 0)
+		return;
+	mutex_lock(&spnfs->spnfs_plock);
+	im->im_status = SPNFS_STATUS_FAIL;	/* DMXXX */
+	wake_up(&spnfs->spnfs_wq);
+	mutex_unlock(&spnfs->spnfs_plock);
+}
+
+/* generic upcall. called by functions in spnfs_ops.c */
+int
+spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
+	     union spnfs_msg_res *res)
+{
+	struct rpc_pipe_msg msg;
+	struct spnfs_msg *im;
+	DECLARE_WAITQUEUE(wq, current);
+	int ret = -EIO;
+	int rval;
+
+	im = &spnfs->spnfs_im;
+
+	mutex_lock(&spnfs->spnfs_lock);
+	mutex_lock(&spnfs->spnfs_plock);
+
+	memset(im, 0, sizeof(*im));
+	memcpy(im, upmsg, sizeof(*upmsg));
+
+	memset(&msg, 0, sizeof(msg));
+	msg.data = im;
+	msg.len = sizeof(*im);
+
+	add_wait_queue(&spnfs->spnfs_wq, &wq);
+	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
+	if (rval < 0) {
+		remove_wait_queue(&spnfs->spnfs_wq, &wq);
+		goto out;
+	}
+
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	mutex_unlock(&spnfs->spnfs_plock);
+	schedule();
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&spnfs->spnfs_wq, &wq);
+	mutex_lock(&spnfs->spnfs_plock);
+
+	if (im->im_status & SPNFS_STATUS_SUCCESS) {
+		/* copy our result from the upcall */
+		memcpy(res, &im->im_res, sizeof(*res));
+		ret = 0;
+	}
+
+out:
+	memset(im, 0, sizeof(*im));
+	mutex_unlock(&spnfs->spnfs_plock);
+	mutex_unlock(&spnfs->spnfs_lock);
+	return ret;
+}
+
+/*
+ * This is used to determine if the spnfsd daemon has been started at
+ * least once since the system came up. This is used by the export
+ * mechanism to decide if spnfs is in use.
+ *
+ * Returns non-zero if the spnfsd has initialized the communication pipe
+ * at least once.
+ */
+int spnfs_enabled(void)
+{
+	return spnfs_enabled_at_some_point;
+}
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * procfs virtual files for user/kernel space communication:
+ *
+ * ctl - currently just an on/off switch...can be expanded
+ * getfh - fd to fh conversion
+ * recall - recall a layout from the command line, for example:
+ *		echo <path> > /proc/fs/spnfs/recall
+ * config - configuration info, e.g., stripe size, num ds, etc.
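+ *
+ * Example usage (values are illustrative only):
+ *	echo 1 > /proc/fs/spnfs/ctl
+ *	echo "/mnt/foo 0 65536" > /proc/fs/spnfs/recall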
+ */
+
+/*-------------- start ctl -------------------------*/
+static ssize_t ctl_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *offset)
+{
+	int cmd, rc;
+
+	if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int)))
+		return -EFAULT;
+	if (cmd) {
+		rc = nfsd_spnfs_new();
+		if (rc != 0)
+			return rc;
+	} else
+		nfsd_spnfs_delete();
+
+	return count;
+}
+
+static const struct file_operations ctl_ops = {
+	.write = ctl_write,
+};
+/*-------------- end ctl ---------------------------*/
+
+/*-------------- start config -------------------------*/
+static ssize_t config_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offset)
+{
+	static struct spnfs_config cfg;
+
+	/* don't let an oversized write run past the struct */
+	if (count > sizeof(struct spnfs_config))
+		return -EINVAL;
+	if (copy_from_user(&cfg, buf, count))
+		return -EFAULT;
+
+	spnfs_config = &cfg;
+	return count;
+}
+
+static const struct file_operations config_ops = {
+	.write = config_write,
+};
+/*-------------- end config ---------------------------*/
+
+/*-------------- start getfh -----------------------*/
+static int getfh_open(struct inode *inode, struct file *file)
+{
+	file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
+	if (file->private_data == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static ssize_t getfh_read(struct file *file, char __user *buf, size_t count,
+			  loff_t *offset)
+{
+	if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh)))
+		return -EFAULT;
+
+	return count;
+}
+
+static ssize_t getfh_write(struct file *file, const char __user *buf,
+			   size_t count, loff_t *offset)
+{
+	int fd;
+
+	if (copy_from_user((int *)&fd, (int *)buf, sizeof(int)))
+		return -EFAULT;
+	if (spnfs_getfh(fd, file->private_data) != 0)
+		return -EIO;
+
+	return count;
+}
+
+static int getfh_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+	return 0;
+}
+
+static const struct file_operations getfh_ops = {
+	.open = getfh_open,
+	.read = getfh_read,
+	.write = getfh_write,
+	.release = getfh_release,
+};
+/*-------------- end getfh ------------------------*/
+
+/*-------------- start recall layout --------------*/
+static ssize_t recall_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offset)
+{
+	char input[128];
+	char *path, *str, *p;
+	int rc;
+	u64 off = 0, len = 0;
+
+	if (count > 128)
+		return -EINVAL;
+
+	if (copy_from_user(input, buf, count))
+		return -EFAULT;
+
+	/* assumes newline-terminated path */
+	p = memchr(input, '\n', count);
+	if (p == NULL)
+		return -EINVAL;
+	*p = '\0';
+
+	/*
+	 * Scan for path and, optionally, an offset and length
+	 * of a layout segment to be recalled; if there are two
+	 * fields, they're assumed to be path and offset.
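+	 * e.g. "/mnt/foo 4096 65536" recalls 64k of layout starting at
+	 * offset 4096; a missing or zero length means the whole file.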
+ */ + p = input; + path = strsep(&p, " "); + if (path == NULL) + return -EINVAL; + + str = strsep(&p, " "); + if (str != NULL) { + rc = strict_strtoull(str, 10, &off); + if (rc != 0) + return -EINVAL; + + str = strsep(&p, " "); + if (str != NULL) { + rc = strict_strtoull(str, 10, &len); + if (rc != 0) + return -EINVAL; + } + } + + rc = spnfs_test_layoutrecall(path, off, len); + if (rc != 0) + return rc; + + return count; +} + +static const struct file_operations recall_ops = { + .write = recall_write, +}; +/*-------------- end recall layout --------------*/ + + +#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS +/*-------------- start layoutseg -------------------------*/ +static ssize_t layoutseg_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + char cmd[3]; + + if (copy_from_user(cmd, buf, 1)) + return -EFAULT; + if (cmd[0] == '0') + spnfs_use_layoutsegments = 0; + else + spnfs_use_layoutsegments = 1; + + return count; +} + +static const struct file_operations layoutseg_ops = { + .write = layoutseg_write, +}; +/*-------------- end layoutseg ---------------------------*/ + +/*-------------- start layoutsegsize -------------------------*/ +static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + char cmd[50]; + + if (copy_from_user(cmd, buf, 49)) + return -EFAULT; + layoutsegment_size = simple_strtoull(cmd, NULL, 10); + + return count; +} + +static const struct file_operations layoutsegsize_ops = { + .write = layoutsegsize_write, +}; +/*-------------- end layoutsegsize ---------------------------*/ +#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ + +int +spnfs_init_proc(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/spnfs", NULL); + if (!entry) + return -ENOMEM; + + entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &ctl_ops; + + entry = create_proc_entry("fs/spnfs/config", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &config_ops; + + entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &getfh_ops; + + entry = create_proc_entry("fs/spnfs/recall", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &recall_ops; + +#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS + entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &layoutseg_ops; + + entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &layoutsegsize_ops; +#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ + + return 0; +} +#endif /* CONFIG_PROC_FS */ diff -up linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c --- linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c.orig 2010-09-30 10:17:08.938003000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/spnfs_ops.c 2010-09-30 10:17:08.940000000 -0400 @@ -0,0 +1,878 @@ +/* + * fs/nfsd/spnfs_ops.c + * + * Communcation layer between spNFS kernel and userspace + * + */ +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. + +Network Appliance provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pnfsd.h" + +/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ +/* #define CONFIG_SPNFS_TEST 1 */ + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +/* + * The functions that are called from elsewhere in the kernel + * to perform tasks in userspace + * + */ + +#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS +extern int spnfs_use_layoutsegments; +extern uint64_t layoutsegment_size; +#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ +extern struct spnfs *global_spnfs; + +int +spnfs_layout_type(struct super_block *sb) +{ + return LAYOUT_NFSV4_1_FILES; +} + +enum nfsstat4 +spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_layoutget_arg *lg_arg, + struct nfsd4_pnfs_layoutget_res *lg_res) +{ + struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ + struct spnfs_msg *im = NULL; + union spnfs_msg_res *res = NULL; + struct pnfs_filelayout_layout *flp = NULL; + int status, i; + enum nfsstat4 nfserr; + + im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); + if (im == NULL) { + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + + res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); + if (res == NULL) { + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + + im->im_type = SPNFS_TYPE_LAYOUTGET; + im->im_args.layoutget_args.inode = inode->i_ino; + im->im_args.layoutget_args.generation = inode->i_generation; + + /* call function to queue the msg for upcall */ + if (spnfs_upcall(spnfs, im, res) != 0) { + dprintk("failed spnfs upcall: layoutget\n"); + nfserr = NFS4ERR_LAYOUTUNAVAILABLE; + goto layoutget_cleanup; + } + status = res->layoutget_res.status; + if (status != 0) { + /* FIXME? until user mode is fixed, translate system error */ + switch (status) { + case -E2BIG: + case -ETOOSMALL: + nfserr = NFS4ERR_TOOSMALL; + break; + case -ENOMEM: + case -EAGAIN: + case -EINTR: + nfserr = NFS4ERR_LAYOUTTRYLATER; + break; + case -ENOENT: + nfserr = NFS4ERR_BADLAYOUT; + break; + default: + nfserr = NFS4ERR_LAYOUTUNAVAILABLE; + } + dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", + status, nfserr); + goto layoutget_cleanup; + } + + lg_res->lg_return_on_close = 0; +#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) + /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ + /* the amount requested by the client. 
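+	 * (in that case lg_res->lg_seg.length is simply left at the
+	 * value the caller seeded it with)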
*/ + if (spnfs_use_layoutsegments) { + if (layoutsegment_size != 0) + lg_res->lg_seg.length = layoutsegment_size; + } else + lg_res->lg_seg.length = NFS4_MAX_UINT64; +#else + lg_res->lg_seg.length = NFS4_MAX_UINT64; +#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ + + flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); + if (flp == NULL) { + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + flp->device_id.sbid = lg_arg->lg_sbid; + flp->device_id.devid = res->layoutget_res.devid; + flp->lg_layout_type = 1; /* XXX */ + flp->lg_stripe_type = res->layoutget_res.stripe_type; + flp->lg_commit_through_mds = 0; + flp->lg_stripe_unit = res->layoutget_res.stripe_size; + flp->lg_first_stripe_index = 0; + flp->lg_pattern_offset = 0; + flp->lg_fh_length = res->layoutget_res.stripe_count; + + flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), + GFP_KERNEL); + if (flp->lg_fh_list == NULL) { + nfserr = NFS4ERR_LAYOUTTRYLATER; + goto layoutget_cleanup; + } + /* + * FIX: Doing an extra copy here. Should group res.flist's fh_len + * and fh_val into a knfsd_fh structure. + */ + for (i = 0; i < flp->lg_fh_length; i++) { + flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; + memcpy(&flp->lg_fh_list[i].fh_base, + res->layoutget_res.flist[i].fh_val, + res->layoutget_res.flist[i].fh_len); + } + + /* encode the layoutget body */ + nfserr = filelayout_encode_layout(xdr, flp); + +layoutget_cleanup: + if (flp) { + if (flp->lg_fh_list) + kfree(flp->lg_fh_list); + kfree(flp); + } + kfree(im); + kfree(res); + + return nfserr; +} + +int +spnfs_layoutcommit(void) +{ + return 0; +} + +int +spnfs_layoutreturn(struct inode *inode, + const struct nfsd4_pnfs_layoutreturn_arg *args) +{ + return 0; +} + +int +spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) +{ + struct super_block *sb; + struct nfsd4_pnfs_cb_layout lr; + + switch (type) { + case RETURN_FILE: + sb = inode->i_sb; + dprintk("%s: recalling layout for ino = %lu\n", + __func__, inode->i_ino); + break; + case RETURN_FSID: + sb = inode->i_sb; + dprintk("%s: recalling layout for fsid x (unimplemented)\n", + __func__); + return 0; + case RETURN_ALL: + /* XXX figure out how to get a sb since there's no inode ptr */ + dprintk("%s: recalling all layouts (unimplemented)\n", + __func__); + return 0; + default: + return -EINVAL; + } + + lr.cbl_recall_type = type; + lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; + lr.cbl_seg.clientid = 0; + lr.cbl_seg.offset = offset; + lr.cbl_seg.length = len; + lr.cbl_seg.iomode = IOMODE_ANY; + lr.cbl_layoutchanged = 0; + + nfsd_layout_recall_cb(sb, inode, &lr); + + return 0; +} + + +int +spnfs_test_layoutrecall(char *path, u64 offset, u64 len) +{ + struct nameidata nd; + struct inode *inode; + int type, rc; + + dprintk("%s: path=%s, offset=%llu, len=%llu\n", + __func__, path, offset, len); + + if (strcmp(path, "all") == 0) { + inode = NULL; + type = RETURN_ALL; + } else { + rc = path_lookup(path, 0, &nd); + if (rc != 0) + return -ENOENT; + + /* + * XXX todo: add a RETURN_FSID scenario here...maybe if + * inode is a dir... 
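+		 * (a directory inode would presumably map to an fsid-wide
+		 * recall covering every layout under that export)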
+		 */
+
+		inode = nd.path.dentry->d_inode;
+		type = RETURN_FILE;
+	}
+
+	if (len == 0)
+		len = NFS4_MAX_UINT64;
+
+	rc = spnfs_layoutrecall(inode, type, offset, len);
+
+	if (type != RETURN_ALL)
+		path_put(&nd.path);
+	return rc;
+}
+
+int
+spnfs_getdeviceiter(struct super_block *sb,
+		    u32 layout_type,
+		    struct nfsd4_pnfs_dev_iter_res *gd_res)
+{
+	struct spnfs *spnfs = global_spnfs;	/* XXX keep up the pretence */
+	struct spnfs_msg *im = NULL;
+	union spnfs_msg_res *res = NULL;
+	int status = 0;
+
+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+	if (im == NULL) {
+		status = -ENOMEM;
+		goto getdeviceiter_out;
+	}
+
+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+	if (res == NULL) {
+		status = -ENOMEM;
+		goto getdeviceiter_out;
+	}
+
+	im->im_type = SPNFS_TYPE_GETDEVICEITER;
+	im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie;
+	im->im_args.getdeviceiter_args.verf = gd_res->gd_verf;
+
+	/* call function to queue the msg for upcall */
+	status = spnfs_upcall(spnfs, im, res);
+	if (status != 0) {
+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+		status = -EIO;
+		goto getdeviceiter_out;
+	}
+	status = res->getdeviceiter_res.status;
+
+	if (res->getdeviceiter_res.eof)
+		gd_res->gd_eof = 1;
+	else {
+		gd_res->gd_devid = res->getdeviceiter_res.devid;
+		gd_res->gd_cookie = res->getdeviceiter_res.cookie;
+		gd_res->gd_verf = res->getdeviceiter_res.verf;
+		gd_res->gd_eof = 0;
+	}
+
+getdeviceiter_out:
+	kfree(im);
+	kfree(res);
+
+	return status;
+}
+
+#ifdef CONFIG_SPNFS_TEST
+/*
+ * Set up the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the
+ * 1024 encoded stripe indices.
+ *
+ * Skip the devaddr4 length and encode the indices count (1024) in the
+ * rq_res.head and set the rq_res.head length.
+ *
+ * Set the rq_res page_len to 4096 (for the 1024 stripe indices).
+ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the
+ * rq_res head to hold the rest of the getdeviceinfo return.
+ *
+ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and
+ * rq_respages[rq_resused] contains the rq_res.pages.
+ */
+static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info,
+				  const struct pnfs_filelayout_device *fdev)
+{
+	struct nfsd4_compoundres *resp = info->resp;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct xdr_buf *xb = &resp->rqstp->rq_res;
+	__be32 *p;
+
+	p = nfsd4_xdr_reserve_space(resp, 8);
+	p++;					/* Fill in length later */
+	*p++ = cpu_to_be32(fdev->fl_stripeindices_length);	/* 1024 */
+	resp->p = p;
+
+	xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base;
+	xb->pages = &rqstp->rq_respages[rqstp->rq_resused];
+	xb->page_base = 0;
+	xb->page_len = PAGE_SIZE;	/* page of 1024 encoded indices */
+	xb->tail[0].iov_base = resp->p;
+	resp->end = xb->head[0].iov_base + PAGE_SIZE;
+	xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p;
+	return 0;
+}
+/*
+ * Return a stripeindices array of length 1024 to test
+ * the pNFS client multipage getdeviceinfo implementation.
+ *
+ * Encode a page of stripe indices.
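+ * 1024 indices at 4 bytes each is exactly 4096 bytes, one page on
+ * most architectures, which is why they are staged in rq_respages[]
+ * instead of the response head.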
+ */
+static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev,
+				   struct spnfs_device *dev,
+				   struct pnfs_devinfo_arg *info)
+{
+	struct svc_rqst *rqstp = info->xdr.resp->rqstp;
+	__be32 *p;
+	int i, j = 0;
+
+	p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]);
+	fldev->fl_stripeindices_length = 1024;
+	/* round-robin the data servers' device indices into the stripe index list */
+	for (i = 0; i < 1024; i++) {
+		*p++ = cpu_to_be32(j);
+		if (j < dev->dscount - 1)
+			j++;
+		else
+			j = 0;
+	}
+	fldev->fl_stripeindices_list = NULL;
+}
+#endif /* CONFIG_SPNFS_TEST */
+
+int
+spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
+		    u32 layout_type,
+		    const struct nfsd4_pnfs_deviceid *devid)
+{
+	struct spnfs *spnfs = global_spnfs;
+	struct spnfs_msg *im = NULL;
+	union spnfs_msg_res *res = NULL;
+	struct spnfs_device *dev;
+	struct pnfs_filelayout_device *fldev = NULL;
+	struct pnfs_filelayout_multipath *mp = NULL;
+	struct pnfs_filelayout_devaddr *fldap = NULL;
+	int status = 0, i, len;
+
+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+	if (im == NULL) {
+		status = -ENOMEM;
+		goto getdeviceinfo_out;
+	}
+
+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+	if (res == NULL) {
+		status = -ENOMEM;
+		goto getdeviceinfo_out;
+	}
+
+	im->im_type = SPNFS_TYPE_GETDEVICEINFO;
+	/* XXX FIX: figure out what to do about fsid */
+	im->im_args.getdeviceinfo_args.devid = devid->devid;
+
+	/* call function to queue the msg for upcall */
+	status = spnfs_upcall(spnfs, im, res);
+	if (status != 0) {
+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+		status = -EIO;
+		goto getdeviceinfo_out;
+	}
+	status = res->getdeviceinfo_res.status;
+	if (status != 0)
+		goto getdeviceinfo_out;
+
+	dev = &res->getdeviceinfo_res.devinfo;
+
+	/* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */
+	fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL);
+	if (fldev == NULL) {
+		status = -ENOMEM;
+		goto getdeviceinfo_out;
+	}
+
+	/*
+	 * Stripe count is the same as data server count for our purposes
+	 */
+	fldev->fl_stripeindices_length = dev->dscount;
+	fldev->fl_device_length = dev->dscount;
+
+	/* Set stripe indices */
+#ifdef CONFIG_SPNFS_TEST
+	/* XXX: stale interface -- these test helpers still expect the old
+	 * pnfs_devinfo_arg argument and will not build against the
+	 * exp_xdr_stream-based prototype above */
+	spnfs_set_test_indices(fldev, dev, info);
+	fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr;
+#else /* CONFIG_SPNFS_TEST */
+	fldev->fl_stripeindices_list =
+		kmalloc(fldev->fl_stripeindices_length * sizeof(u32),
+			GFP_KERNEL);
+	if (fldev->fl_stripeindices_list == NULL) {
+		status = -ENOMEM;
+		goto getdeviceinfo_out;
+	}
+	for (i = 0; i < fldev->fl_stripeindices_length; i++)
+		fldev->fl_stripeindices_list[i] = i;
+#endif /* CONFIG_SPNFS_TEST */
+
+	/*
+	 * Set the device's data server addresses. No multipath for spnfs,
+	 * so mp length is always 1.
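+	 * Addresses are in the RFC 5661 universal address format, where
+	 * the final two octet groups encode the port (so "10.35.9.16.08.01"
+	 * below means IP 10.35.9.16, port 8 * 256 + 1 = 2049).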
+	 *
+	 */
+	fldev->fl_device_list =
+		kzalloc(fldev->fl_device_length *
+			sizeof(struct pnfs_filelayout_multipath),
+			GFP_KERNEL);	/* zeroed so the cleanup below can
+					 * tell which entries were set up */
+	if (fldev->fl_device_list == NULL) {
+		status = -ENOMEM;
+		goto getdeviceinfo_out;
+	}
+	for (i = 0; i < fldev->fl_device_length; i++) {
+		mp = &fldev->fl_device_list[i];
+		mp->fl_multipath_length = 1;
+		mp->fl_multipath_list =
+			kzalloc(sizeof(struct pnfs_filelayout_devaddr),
+				GFP_KERNEL);
+		if (mp->fl_multipath_list == NULL) {
+			status = -ENOMEM;
+			goto getdeviceinfo_out;
+		}
+		fldap = mp->fl_multipath_list;
+
+		/*
+		 * Copy the netid into the device address, for example: "tcp"
+		 */
+		len = strlen(dev->dslist[i].netid);
+		fldap->r_netid.data = kmalloc(len, GFP_KERNEL);
+		if (fldap->r_netid.data == NULL) {
+			status = -ENOMEM;
+			goto getdeviceinfo_out;
+		}
+		memcpy(fldap->r_netid.data, dev->dslist[i].netid, len);
+		fldap->r_netid.len = len;
+
+		/*
+		 * Copy the network address into the device address,
+		 * for example: "10.35.9.16.08.01"
+		 */
+		len = strlen(dev->dslist[i].addr);
+		fldap->r_addr.data = kmalloc(len, GFP_KERNEL);
+		if (fldap->r_addr.data == NULL) {
+			status = -ENOMEM;
+			goto getdeviceinfo_out;
+		}
+		memcpy(fldap->r_addr.data, dev->dslist[i].addr, len);
+		fldap->r_addr.len = len;
+	}
+
+	/* encode the device data */
+	status = filelayout_encode_devinfo(xdr, fldev);
+
+getdeviceinfo_out:
+	if (fldev) {
+		kfree(fldev->fl_stripeindices_list);
+		if (fldev->fl_device_list) {
+			for (i = 0; i < fldev->fl_device_length; i++) {
+				fldap =
+				    fldev->fl_device_list[i].fl_multipath_list;
+				if (fldap == NULL)	/* never set up */
+					continue;
+				kfree(fldap->r_netid.data);
+				kfree(fldap->r_addr.data);
+				kfree(fldap);
+			}
+			kfree(fldev->fl_device_list);
+		}
+		kfree(fldev);
+	}
+
+	kfree(im);
+	kfree(res);
+
+	return status;
+}
+
+int
+spnfs_setattr(void)
+{
+	return 0;
+}
+
+int
+spnfs_open(struct inode *inode, struct nfsd4_open *open)
+{
+	struct spnfs *spnfs = global_spnfs;	/* keep up the pretence */
+	struct spnfs_msg *im = NULL;
+	union spnfs_msg_res *res = NULL;
+	int status = 0;
+
+	im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
+	if (im == NULL) {
+		status = -ENOMEM;
+		goto open_out;
+	}
+
+	res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL);
+	if (res == NULL) {
+		status = -ENOMEM;
+		goto open_out;
+	}
+
+	im->im_type = SPNFS_TYPE_OPEN;
+	im->im_args.open_args.inode = inode->i_ino;
+	im->im_args.open_args.generation = inode->i_generation;
+	im->im_args.open_args.create = open->op_create;
+	im->im_args.open_args.createmode = open->op_createmode;
+	im->im_args.open_args.truncate = open->op_truncate;
+
+	/* call function to queue the msg for upcall */
+	status = spnfs_upcall(spnfs, im, res);
+	if (status != 0) {
+		dprintk("%s spnfs upcall failure: %d\n", __func__, status);
+		status = -EIO;
+		goto open_out;
+	}
+	status = res->open_res.status;
+
+open_out:
+	kfree(im);
+	kfree(res);
+
+	return status;
+}
+
+int
+spnfs_create(void)
+{
+	return 0;
+}
+
+/*
+ * Invokes the spnfsd with the inode number of the object to remove.
+ * The file has already been removed on the MDS, so all the spnfsd
+ * daemon does is remove the stripes.
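+ * Stripe files are named <inode>.<generation> under each DS root,
+ * the same names that read()/write() below construct with sprintf().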
+ * Returns 0 on success, otherwise an error code. + */ +int +spnfs_remove(unsigned long ino, unsigned long generation) +{ + struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ + struct spnfs_msg *im = NULL; + union spnfs_msg_res *res = NULL; + int status = 0; + + im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); + if (im == NULL) { + status = -ENOMEM; + goto remove_out; + } + + res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); + if (res == NULL) { + status = -ENOMEM; + goto remove_out; + } + + im->im_type = SPNFS_TYPE_REMOVE; + im->im_args.remove_args.inode = ino; + im->im_args.remove_args.generation = generation; + + /* call function to queue the msg for upcall */ + status = spnfs_upcall(spnfs, im, res); + if (status != 0) { + dprintk("%s spnfs upcall failure: %d\n", __func__, status); + status = -EIO; + goto remove_out; + } + status = res->remove_res.status; + +remove_out: + kfree(im); + kfree(res); + + return status; +} + +static int +read_one(struct inode *inode, loff_t offset, size_t len, char *buf, + struct file **filp) +{ + loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; + size_t iolen; + int completed = 0, ds, err; + + while (len > 0) { + tmp = offset; + soff = do_div(tmp, spnfs_config->stripe_size); + snum = tmp; + ds = do_div(tmp, spnfs_config->num_ds); + if (spnfs_config->dense_striping == 0) + soffset = offset; + else { + tmp = snum; + do_div(tmp, spnfs_config->num_ds); + soffset = tmp * spnfs_config->stripe_size + soff; + } + if (len < spnfs_config->stripe_size - soff) + iolen = len; + else + iolen = spnfs_config->stripe_size - soff; + + pos = soffset; + err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); + if (err < 0) + return -EIO; + if (err == 0) + break; + filp[ds]->f_pos = pos; + iolen = err; + completed += iolen; + len -= iolen; + offset += iolen; + bufoffset += iolen; + } + + return completed; +} + +static __be32 +read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, + struct svc_rqst *rqstp) +{ + int i, vnum, err, bytecount = 0; + char path[128]; + struct file *filp[SPNFS_MAX_DATA_SERVERS]; + size_t iolen; + __be32 status = nfs_ok; + + /* + * XXX We should just be doing this at open time, but it gets + * kind of messy storing this info in nfsd's state structures + * and piggybacking its path through the various state handling + * functions. Revisit this.
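+ * + * Striping aside (illustrative numbers only): in read_one() above, with + * stripe_size = 4096 and num_ds = 2, offset 12288 gives soff = 0, + * snum = 3 and ds = 1. Dense striping packs the stripes, so soffset + * becomes (3 / 2) * 4096 + 0 = 4096 on DS1; sparse striping keeps + * soffset equal to the file offset, 12288.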
+ */ + memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); + for (i = 0; i < spnfs_config->num_ds; i++) { + sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], + inode->i_ino, inode->i_generation); + filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); + if (IS_ERR(filp[i])) { + filp[i] = NULL; + status = nfserr_io; + goto read_out; + } + get_file(filp[i]); + } + + for (vnum = 0 ; vnum < vlen ; vnum++) { + iolen = rqstp->rq_vec[vnum].iov_len; + err = read_one(inode, offset + bytecount, iolen, + (char *)rqstp->rq_vec[vnum].iov_base, filp); + if (err < 0) { + status = nfserr_io; + goto read_out; + } + if (err < iolen) { + bytecount += err; + goto read_out; + } + bytecount += rqstp->rq_vec[vnum].iov_len; + } + +read_out: + *lenp = bytecount; + for (i = 0; i < spnfs_config->num_ds; i++) { + if (filp[i]) { + filp_close(filp[i], current->files); + fput(filp[i]); + } + } + return status; +} + +__be32 +spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, + struct svc_rqst *rqstp) +{ + if (spnfs_config) + return read(inode, offset, lenp, vlen, rqstp); + else { + printk(KERN_ERR "Please upgrade to latest spnfsd\n"); + return nfserr_notsupp; + } +} + +static int +write_one(struct inode *inode, loff_t offset, size_t len, char *buf, + struct file **filp) +{ + loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; + size_t iolen; + int completed = 0, ds, err; + + while (len > 0) { + tmp = offset; + soff = do_div(tmp, spnfs_config->stripe_size); + snum = tmp; + ds = do_div(tmp, spnfs_config->num_ds); + if (spnfs_config->dense_striping == 0) + soffset = offset; + else { + tmp = snum; + do_div(tmp, spnfs_config->num_ds); + soffset = tmp * spnfs_config->stripe_size + soff; + } + if (len < spnfs_config->stripe_size - soff) + iolen = len; + else + iolen = spnfs_config->stripe_size - soff; + + pos = soffset; + err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); + if (err < 0) + return -EIO; + filp[ds]->f_pos = pos; + iolen = err; + completed += iolen; + len -= iolen; + offset += iolen; + bufoffset += iolen; + } + + return completed; +} + +static __be32 +write(struct inode *inode, loff_t offset, size_t len, int vlen, + struct svc_rqst *rqstp) +{ + int i, vnum, err, bytecount = 0; + char path[128]; + struct file *filp[SPNFS_MAX_DATA_SERVERS]; + size_t iolen; + __be32 status = nfs_ok; + + /* + * XXX We should just be doing this at open time, but it gets + * kind of messy storing this info in nfsd's state structures + * and piggybacking its path through the various state handling + * functions. Revisit this.
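+ * + * Note that filp[] is indexed by data server number, in the same order + * as spnfs_config->ds_dir[]; write_one() picks the entry via the same + * offset-to-ds mapping illustrated for read_one() above.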
+ */ + memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); + for (i = 0; i < spnfs_config->num_ds; i++) { + sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], + inode->i_ino, inode->i_generation); + filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); + if (IS_ERR(filp[i])) { + filp[i] = NULL; + status = nfserr_io; + goto write_out; + } + get_file(filp[i]); + } + + for (vnum = 0; vnum < vlen; vnum++) { + iolen = rqstp->rq_vec[vnum].iov_len; + err = write_one(inode, offset + bytecount, iolen, + (char *)rqstp->rq_vec[vnum].iov_base, filp); + if (err != iolen) { + dprintk("spnfs_write: err=%d expected %Zd\n", err, iolen); + status = nfserr_io; + goto write_out; + } + bytecount += rqstp->rq_vec[vnum].iov_len; + } + +write_out: + for (i = 0; i < spnfs_config->num_ds; i++) { + if (filp[i]) { + filp_close(filp[i], current->files); + fput(filp[i]); + } + } + + return status; +} + +__be32 +spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, + struct svc_rqst *rqstp) +{ + if (spnfs_config) + return write(inode, offset, len, vlen, rqstp); + else { + printk(KERN_ERR "Please upgrade to latest spnfsd\n"); + return nfserr_notsupp; + } +} + +int +spnfs_commit(void) +{ + return 0; +} + +/* + * Return the state for this object. + * At this time simply return 0 to indicate success and use the existing state + */ +int +spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) +{ + return 0; +} + +/* + * Return the filehandle for the specified file descriptor + */ +int +spnfs_getfh(int fd, struct nfs_fh *fh) +{ + struct file *file; + + file = fget(fd); + if (file == NULL) + return -EIO; + + memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); + fput(file); + return 0; +} diff -up linux-2.6.34.noarch/fs/nfsd/state.h.orig linux-2.6.34.noarch/fs/nfsd/state.h --- linux-2.6.34.noarch/fs/nfsd/state.h.orig 2010-09-30 10:15:18.375737000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/state.h 2010-09-30 10:17:08.964002000 -0400 @@ -242,6 +242,12 @@ struct nfs4_client { u32 cl_cb_seq_nr; struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ /* wait here for slots */ +#if defined(CONFIG_PNFSD) + struct list_head cl_layouts; /* outstanding layouts */ + struct list_head cl_layoutrecalls; /* outstanding layoutrecall + callbacks */ + atomic_t cl_deviceref; /* Num outstanding devs */ +#endif /* CONFIG_PNFSD */ }; static inline void @@ -342,12 +348,31 @@ struct nfs4_file { struct list_head fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; +#if defined(CONFIG_PNFSD) + struct list_head fi_layouts; + struct list_head fi_layout_states; +#endif /* CONFIG_PNFSD */ struct inode *fi_inode; u32 fi_id; /* used with stateowner->so_id * for stateid_hashtbl hash */ bool fi_had_conflict; +#if defined(CONFIG_PNFSD) + /* used by layoutget / layoutrecall */ + struct nfs4_fsid fi_fsid; + u32 fi_fhlen; + u8 fi_fhval[NFS4_FHSIZE]; +#endif /* CONFIG_PNFSD */ }; +#if defined(CONFIG_PNFSD) +/* pNFS Metadata server state */ + +struct pnfs_ds_dev_entry { + struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ + u32 dd_dsid; +}; +#endif /* CONFIG_PNFSD */ + /* * nfs4_stateid can either be an open stateid or (eventually) a lock stateid * @@ -370,6 +395,9 @@ struct nfs4_stateid { struct list_head st_perfile; struct list_head st_perstateowner; struct list_head st_lockowners; +#if defined(CONFIG_PNFSD) + struct list_head st_pnfs_ds_id; +#endif /* CONFIG_PNFSD */ struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; stateid_t
st_stateid; @@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) extern int nfsd4_create_clid_dir(struct nfs4_client *clp); extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); extern void release_session_client(struct nfsd4_session *); +extern void nfsd4_free_slab(struct kmem_cache **); +extern struct nfs4_file *find_file(struct inode *); +extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); +extern void put_nfs4_file(struct nfs4_file *); +extern void get_nfs4_file(struct nfs4_file *); +extern struct nfs4_client *find_confirmed_client(clientid_t *); +extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); +extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); +extern __be32 nfs4_check_stateid(stateid_t *); +extern void expire_client_lock(struct nfs4_client *); +extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); + +#if defined(CONFIG_PNFSD) +extern int nfsd4_init_pnfs_slabs(void); +extern void nfsd4_free_pnfs_slabs(void); +extern void pnfs_expire_client(struct nfs4_client *); +extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); +extern void nfs4_pnfs_state_init(void); +extern void nfs4_pnfs_state_shutdown(void); +extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); +extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); +#else /* CONFIG_PNFSD */ +static inline void nfsd4_free_pnfs_slabs(void) {} +static inline int nfsd4_init_pnfs_slabs(void) { return 0; } +static inline void pnfs_expire_client(struct nfs4_client *clp) {} +static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} +static inline void nfs4_pnfs_state_shutdown(void) {} +#endif /* CONFIG_PNFSD */ static inline void nfs4_put_stateowner(struct nfs4_stateowner *so) @@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown kref_get(&so->so_ref); } +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? 
end - 1 : NFS4_MAX_UINT64; +} + #endif /* NFSD4_STATE_H */ diff -up linux-2.6.34.noarch/fs/nfsd/vfs.c.orig linux-2.6.34.noarch/fs/nfsd/vfs.c --- linux-2.6.34.noarch/fs/nfsd/vfs.c.orig 2010-09-30 10:15:05.090335000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/vfs.c 2010-09-30 10:17:08.970001000 -0400 @@ -37,7 +37,12 @@ #ifdef CONFIG_NFSD_V4 #include #include +#include +#include #endif /* CONFIG_NFSD_V4 */ +#if defined(CONFIG_SPNFS_BLOCK) +#include +#endif #include "nfsd.h" #include "vfs.h" @@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; +#if defined(CONFIG_SPNFS_BLOCK) + if (pnfs_block_enabled(inode, 0)) { + err = bl_layoutrecall(inode, RETURN_FILE, + iap->ia_size, inode->i_size - iap->ia_size); + } +#endif /* CONFIG_SPNFS_BLOCK */ } /* @@ -1703,6 +1714,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru struct inode *fdir, *tdir; __be32 err; int host_err; +#ifdef CONFIG_SPNFS + unsigned long ino = 0; + unsigned long generation = 0; + unsigned int nlink = 0; +#endif /* CONFIG_SPNFS */ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1766,7 +1782,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru if (host_err) goto out_dput_new; +#ifdef CONFIG_SPNFS + /* + * if the target is a preexisting regular file, remember the + * inode number and generation so we can delete the stripes; + * save the link count as well so that the stripes only + * get deleted when the last link is deleted + */ + if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { + ino = ndentry->d_inode->i_ino; + generation = ndentry->d_inode->i_generation; + nlink = ndentry->d_inode->i_nlink; + } +#endif /* CONFIG_SPNFS */ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); +#ifdef CONFIG_SPNFS + if (spnfs_enabled() && (!host_err && ino && nlink == 1)) + spnfs_remove(ino, generation); +#endif /* CONFIG_SPNFS */ + if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1807,6 +1842,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru struct inode *dirp; __be32 err; int host_err; +#if defined(CONFIG_SPNFS) + unsigned long ino; + unsigned long generation; + unsigned int nlink; +#endif /* defined(CONFIG_SPNFS) */ err = nfserr_acces; if (!flen || isdotent(fname, flen)) @@ -1830,6 +1870,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru goto out; } +#if defined(CONFIG_SPNFS) + /* + * Remember the inode number to communicate to the spnfsd + * for removal of stripes; save the link count as well so that + * the stripes only get deleted when the last link is deleted + */ + ino = rdentry->d_inode->i_ino; + generation = rdentry->d_inode->i_generation; + nlink = rdentry->d_inode->i_nlink; +#endif /* defined(CONFIG_SPNFS) */ + if (!type) type = rdentry->d_inode->i_mode & S_IFMT; @@ -1854,6 +1905,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru if (!host_err) host_err = commit_metadata(fhp); +#if defined(CONFIG_SPNFS) + /* + * spnfs: notify spnfsd of removal to destroy stripes + */ +/* + sb = current_fh->fh_dentry->d_inode->i_sb; + if (sb->s_export_op->spnfs_remove) { +*/ + dprintk("%s check if spnfs_enabled\n", __FUNCTION__); + if (spnfs_enabled() && nlink == 1) { + BUG_ON(ino == 0); + dprintk("%s calling spnfs_remove inumber=%ld\n", + __FUNCTION__, ino); + if (spnfs_remove(ino, generation) == 0) { + dprintk("%s spnfs_remove success\n", __FUNCTION__); + } else { + /* XXX How do we make this atomic?
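+ * The unlink on the MDS has already succeeded by this point, so a + * failed upcall can only leave orphaned stripe files behind; all we + * can do is log it.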
+ */ + printk(KERN_WARNING "nfsd: pNFS could not " + "remove stripes for inode: %ld\n", ino); + } + } +#endif /* defined(CONFIG_SPNFS) */ + mnt_drop_write(fhp->fh_export->ex_path.mnt); out_nfserr: err = nfserrno(host_err); diff -up linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig linux-2.6.34.noarch/fs/nfsd/xdr4.h --- linux-2.6.34.noarch/fs/nfsd/xdr4.h.orig 2010-09-30 10:15:18.395731000 -0400 +++ linux-2.6.34.noarch/fs/nfsd/xdr4.h 2010-09-30 10:17:08.978004000 -0400 @@ -37,6 +37,8 @@ #ifndef _LINUX_NFSD_XDR4_H #define _LINUX_NFSD_XDR4_H +#include + #include "state.h" #include "nfsd.h" @@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_pnfs_getdevinfo { + struct nfsd4_pnfs_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request */ + struct super_block *gd_sb; +}; + +struct nfsd4_pnfs_getdevlist { + u32 gd_layout_type; /* request */ + u32 gd_maxdevices; /* request */ + u64 gd_cookie; /* request - response */ + u64 gd_verf; /* request - response */ + struct svc_fh *gd_fhp; /* response */ + u32 gd_eof; /* response */ +}; + +struct nfsd4_pnfs_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_maxcount; /* request */ + struct svc_fh *lg_fhp; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + u32 lg_roc; /* response */ +}; + +struct nfsd4_pnfs_layoutcommit { + struct nfsd4_pnfs_layoutcommit_arg args; + stateid_t lc_sid; /* request */ + struct nfsd4_pnfs_layoutcommit_res res; +}; + +enum layoutreturn_flags { + LR_FLAG_INTERN = 1 << 0, /* internal return */ + LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ +}; + +struct nfsd4_pnfs_layoutreturn { + struct nfsd4_pnfs_layoutreturn_arg args; + u32 lr_flags; + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + struct nfsd4_op { int opnum; __be32 status; @@ -426,6 +473,13 @@ struct nfsd4_op { struct nfsd4_destroy_session destroy_session; struct nfsd4_sequence sequence; struct nfsd4_reclaim_complete reclaim_complete; +#if defined(CONFIG_PNFSD) + struct nfsd4_pnfs_getdevlist pnfs_getdevlist; + struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; + struct nfsd4_pnfs_layoutget pnfs_layoutget; + struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; + struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; +#endif /* CONFIG_PNFSD */ } u; struct nfs4_replay * replay; }; diff -up linux-2.6.34.noarch/fs/nfs/file.c.orig linux-2.6.34.noarch/fs/nfs/file.c --- linux-2.6.34.noarch/fs/nfs/file.c.orig 2010-09-30 10:15:17.741713000 -0400 +++ linux-2.6.34.noarch/fs/nfs/file.c 2010-09-30 10:17:08.626991000 -0400 @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_FILE @@ -388,12 +389,17 @@ static int nfs_write_begin(struct file * pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; int once_thru = 0; + struct pnfs_layout_segment *lseg; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); + pnfs_update_layout(mapping->host, + nfs_file_open_context(file), + 0, NFS4_MAX_UINT64, IOMODE_RW, + &lseg); start: /* * Prevent starvation issues if someone is doing a consistency @@ -402,17 +408,22 @@ start: ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, nfs_wait_bit_killable, TASK_KILLABLE); if (ret) - return ret; + goto out;
page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + if (!page) { + ret = -ENOMEM; + goto out; + } *pagep = page; - ret = nfs_flush_incompatible(file, page); + ret = nfs_flush_incompatible(file, page, lseg); if (ret) { unlock_page(page); page_cache_release(page); + *pagep = NULL; + *fsdata = NULL; + goto out; } else if (!once_thru && nfs_want_read_modify_write(file, page, pos, len)) { once_thru = 1; @@ -421,6 +432,12 @@ start: if (!ret) goto start; } + ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); + out: + if (ret) { + put_lseg(lseg); + *fsdata = NULL; + } return ret; } @@ -430,6 +447,7 @@ static int nfs_write_end(struct file *fi { unsigned offset = pos & (PAGE_CACHE_SIZE - 1); int status; + struct pnfs_layout_segment *lseg; dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, @@ -456,10 +474,17 @@ static int nfs_write_end(struct file *fi zero_user_segment(page, pglen, PAGE_CACHE_SIZE); } - status = nfs_updatepage(file, page, offset, copied); + lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); + status = pnfs_write_end(file, page, pos, len, copied, lseg); + if (status) + goto out; + status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + out: unlock_page(page); page_cache_release(page); + pnfs_write_end_cleanup(file, fsdata); + put_lseg(lseg); if (status < 0) return status; @@ -570,6 +595,8 @@ static int nfs_vm_page_mkwrite(struct vm /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + /* XXX Do we want to call pnfs_update_layout here? */ + lock_page(page); mapping = page->mapping; if (mapping != dentry->d_inode->i_mapping) @@ -580,11 +607,11 @@ static int nfs_vm_page_mkwrite(struct vm if (pagelen == 0) goto out_unlock; - ret = nfs_flush_incompatible(filp, page); + ret = nfs_flush_incompatible(filp, page, NULL); if (ret != 0) goto out_unlock; - ret = nfs_updatepage(filp, page, 0, pagelen); + ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); out_unlock: if (!ret) return VM_FAULT_LOCKED; diff -up linux-2.6.34.noarch/fs/nfs/inode.c.orig linux-2.6.34.noarch/fs/nfs/inode.c --- linux-2.6.34.noarch/fs/nfs/inode.c.orig 2010-09-30 10:15:17.769716000 -0400 +++ linux-2.6.34.noarch/fs/nfs/inode.c 2010-09-30 10:17:08.632991000 -0400 @@ -48,6 +48,7 @@ #include "internal.h" #include "fscache.h" #include "dns_resolve.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { @@ -530,6 +531,68 @@ out: return err; } +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) 
+{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context @@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; - ctx->lockowner = current->files; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; - atomic_set(&ctx->count, 1); + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; } return ctx; } @@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + atomic_inc(&ctx->lock_context.count); return ctx; } +EXPORT_SYMBOL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); @@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); } +EXPORT_SYMBOL(nfs_fattr_init); struct nfs_fattr *nfs_alloc_fattr(void) { @@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode server->fsid = fattr->fsid; /* + * file needs layout commit, server attributes may be stale + */ + if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { + dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", + __func__, inode->i_sb->s_id, inode->i_ino); + return 0; + } + /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; @@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode */ void nfs4_clear_inode(struct inode *inode) { + pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); + /* If we are holding a delegation, return it! 
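+ * (the layout is returned first, by the pnfs_return_layout() call just + * above, and only then the delegation)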
*/ nfs_inode_return_delegation_noreclaim(inode); - /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } #endif @@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup void nfs_destroy_inode(struct inode *inode) { - kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); + struct nfs_inode *nfsi = NFS_I(inode); + + pnfs_destroy_layout(nfsi); + kmem_cache_free(nfs_inode_cachep, nfsi); } static inline void nfs4_init_once(struct nfs_inode *nfsi) @@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct nfsi->delegation = NULL; nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); +#ifdef CONFIG_NFS_V4_1 + init_waitqueue_head(&nfsi->lo_waitq); + nfsi->pnfs_layout_suspend = 0; + nfsi->layout = NULL; +#endif /* CONFIG_NFS_V4_1 */ #endif } @@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) if (err) goto out0; +#ifdef CONFIG_NFS_V4_1 + err = pnfs_initialize(); + if (err) + goto out00; +#endif /* CONFIG_NFS_V4_1 */ + #ifdef CONFIG_PROC_FS rpc_proc_register(&nfs_rpcstat); #endif @@ -1498,6 +1586,10 @@ out: #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif +#ifdef CONFIG_NFS_V4_1 +out00: + pnfs_uninitialize(); +#endif /* CONFIG_NFS_V4_1 */ nfs_destroy_directcache(); out0: nfs_destroy_writepagecache(); @@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif +#ifdef CONFIG_NFS_V4_1 + pnfs_uninitialize(); +#endif unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff -up linux-2.6.34.noarch/fs/nfs/internal.h.orig linux-2.6.34.noarch/fs/nfs/internal.h --- linux-2.6.34.noarch/fs/nfs/internal.h.orig 2010-09-30 10:15:17.775713000 -0400 +++ linux-2.6.34.noarch/fs/nfs/internal.h 2010-09-30 10:17:08.637996000 -0400 @@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern int nfs4_check_client_ready(struct nfs_client *clp); +extern int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2); +extern int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, + const char *ip_addr, + rpc_authflavor_t authflavour, + int proto, const struct rpc_timeout *timeparms, + u32 minorversion); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead extern struct rpc_procinfo nfs4_procedures[]; #endif +extern int nfs4_recover_expired_lease(struct nfs_client *clp); + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); @@ -248,10 +260,31 @@ extern int nfs4_get_rootfh(struct nfs_se #endif /* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); +extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern int pnfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); +extern int pnfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt 
*clnt, + const struct rpc_call_ops *call_ops, + int how, int pnfs); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +extern void nfs_mark_list_commit(struct list_head *head); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *); diff -up linux-2.6.34.noarch/fs/nfs/Kconfig.orig linux-2.6.34.noarch/fs/nfs/Kconfig --- linux-2.6.34.noarch/fs/nfs/Kconfig.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/Kconfig 2010-09-30 10:17:08.515988000 -0400 @@ -79,10 +79,48 @@ config NFS_V4_1 depends on NFS_V4 && EXPERIMENTAL help This option enables support for minor version 1 of the NFSv4 protocol - (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. + (RFC5661) including support for the parallel NFS (pNFS) features + in the kernel's NFS client. Unless you're an NFS developer, say N. +config PNFS_FILE_LAYOUT + tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" + depends on NFS_FS && NFS_V4_1 + default y + help + This option enables support for the pNFS nfs-files layout. + + Unless you're an NFS developer, say N. + +config PNFS_OBJLAYOUT + tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD + help + Say M here if you want your pNFS client to support the Objects Layout Driver. + Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and + upper level driver (SCSI_OSD_ULD). + + If unsure, say N. + +config PNFS_PANLAYOUT + tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" + depends on PNFS_OBJLAYOUT + help + Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. + + If unsure, say N. + +config PNFS_BLOCK + tristate "Provide a pNFS block client (EXPERIMENTAL)" + depends on NFS_FS && NFS_V4_1 + select MD + select BLK_DEV_DM + help + Say M or y here if you want your pNFS client to support the block protocol. + + If unsure, say N.
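+ + Note that selecting this also pulls in the MD and device-mapper + (BLK_DEV_DM) services that the block layout is built on.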
+ config ROOT_NFS bool "Root file system on NFS" depends on NFS_FS=y && IP_PNP diff -up linux-2.6.34.noarch/fs/nfs/Makefile.orig linux-2.6.34.noarch/fs/nfs/Makefile --- linux-2.6.34.noarch/fs/nfs/Makefile.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/Makefile 2010-09-30 10:17:08.520988000 -0400 @@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o +nfs-$(CONFIG_NFS_V4_1) += pnfs.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o + +obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o +nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o + +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ +obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ diff -up linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs3proc.c --- linux-2.6.34.noarch/fs/nfs/nfs3proc.c.orig 2010-09-30 10:15:17.806716000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs3proc.c 2010-09-30 10:17:08.643994000 -0400 @@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c.orig 2010-09-30 10:17:08.652995000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.c 2010-09-30 10:17:08.654992000 -0400 @@ -0,0 +1,768 @@ +/* + * linux/fs/nfs/nfs4filelayout.c + * + * Module for the pnfs nfs4 file layout driver. + * Defines all I/O and Policy interface operations, plus code + * to register itself with the pNFS client. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Dean Hildebrand + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nfs4filelayout.h" +#include "nfs4_fs.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dean Hildebrand "); +MODULE_DESCRIPTION("The NFSv4 file layout driver"); + +/* Callback operations to the pNFS client */ +struct pnfs_client_operations *pnfs_callback_ops; + +/* Forward declaration */ +struct layoutdriver_io_operations filelayout_io_operations; + +int +filelayout_initialize_mountpoint(struct nfs_server *nfss, + const struct nfs_fh *mntfh) +{ + int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, + nfs4_fl_free_deviceid_callback); + if (status) { + printk(KERN_WARNING "%s: deviceid cache could not be " + "initialized\n", __func__); + return status; + } + dprintk("%s: deviceid cache has been initialized successfully\n", + __func__); + return 0; +} + +/* Uninitialize a mountpoint by destroying its device list */ +int +filelayout_uninitialize_mountpoint(struct nfs_server *nfss) +{ + dprintk("--> %s\n", __func__); + + if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) + nfs4_put_deviceid_cache(nfss->nfs_client); + return 0; +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + { + u32 stripe_width; + u64 tmp, off; + u32 unit = flseg->stripe_unit; + + stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; + tmp = off = offset - flseg->pattern_offset; + do_div(tmp, stripe_width); + return tmp * unit + do_div(off, unit); + } + default: + BUG(); + } + + /* We should never get here... just to stop the gcc warning */ + return 0; +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. 
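+ * (args.offset is rewritten to the data server offset before the RPC is + * sent; restoring it matters because rpc_call_done() may resend the RPC.)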
+ */ +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + if (rdata->fldata.orig_offset) { + dprintk("%s new off %llu orig offset %llu\n", __func__, + rdata->args.offset, rdata->fldata.orig_offset); + rdata->args.offset = rdata->fldata.orig_offset; + } + + /* Note this may cause RPC to be resent */ + rdata->pdata.call_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + put_lseg(rdata->pdata.lseg); + rdata->pdata.lseg = NULL; + rdata->pdata.call_ops->rpc_release(data); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (wdata->fldata.orig_offset) { + dprintk("%s new off %llu orig offset %llu\n", __func__, + wdata->args.offset, wdata->fldata.orig_offset); + wdata->args.offset = wdata->fldata.orig_offset; + } + + /* Note this may cause RPC to be resent */ + wdata->pdata.call_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + put_lseg(wdata->pdata.lseg); + wdata->pdata.lseg = NULL; + wdata->pdata.call_ops->rpc_release(data); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = nfs_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = nfs_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + +/* Perform sync or async reads. + * + * An optimization for the NFS file layout driver + * allows the original read/write data structs to be passed in the + * last argument. + * + * TODO: join with write_pagelist? + */ +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) +{ + struct pnfs_layout_segment *lseg = data->pdata.lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 idx; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, nr_pages, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + idx = nfs4_fl_calc_ds_index(lseg, offset); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %s\n", __func__, + htonl(ds->ds_ip_addr), ds->r_addr); + + /* just try the first data server for the index. */ + data->fldata.ds_nfs_client = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, offset); + if (fh) + data->args.fh = fh; + + /* + * Now get the file offset on the dserver + * Set the read offset to this offset, and + * save the original offset in orig_offset + * In the case of async reads, the offset will be reset in the + * call_ops->rpc_call_done() routine. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->fldata.orig_offset = offset; + + /* Perform an asynchronous read */ + nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + + data->pdata.pnfs_error = 0; + + return PNFS_ATTEMPTED; +} + +/* Perform async writes.
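+ * Mirrors filelayout_read_pagelist() above: pick the DS by offset, + * rewrite the offset for dense layouts, then hand off to + * nfs_initiate_write().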
+ */ +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) +{ + struct pnfs_layout_segment *lseg = data->pdata.lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 idx; + struct nfs_fh *fh; + + /* Retrieve the correct rpc_client for the byte range */ + idx = nfs4_fl_calc_ds_index(lseg, offset); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); + + data->fldata.ds_nfs_client = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, offset); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->fldata.orig_offset = offset; + + /* + * Perform an asynchronous write. The offset will be reset in the + * call_ops->rpc_call_done() routine + */ + nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + + data->pdata.pnfs_error = 0; + return PNFS_ATTEMPTED; +} + +/* + * Create a filelayout layout structure and return it. The pNFS client + * will use the pnfs_layout_hdr type to refer to the layout for this + * inode from now on. + */ +static struct pnfs_layout_hdr * +filelayout_alloc_layout(struct inode *inode) +{ + struct nfs4_filelayout *flp; + + dprintk("NFS_FILELAYOUT: allocating layout\n"); + flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); + return flp ? &flp->fl_layout : NULL; +} + +/* Free a filelayout layout structure */ +static void +filelayout_free_layout(struct pnfs_layout_hdr *lo) +{ + dprintk("NFS_FILELAYOUT: freeing layout\n"); + kfree(FILE_LO(lo)); +} + +/* + * filelayout_check_layout() + * + * Make sure layout segment parameters are sane WRT the device. + * + * Notes: + * 1) current code insists that # stripe index = # data servers in ds_list + * which is wrong. + * 2) pattern_offset is ignored and must == 0 which is wrong; + * 3) the pattern_offset needs to be a multiple of the stripe unit. + * 4) stripe unit is multiple of page size + */ + +static int +filelayout_check_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); + struct nfs4_file_layout_dsaddr *dsaddr; + int status = -EINVAL; + struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); + + dprintk("--> %s\n", __func__); + /* find in list or get from server and reference the deviceid */ + dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, &fl->dev_id); + if (dsaddr == NULL) { + dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); + if (dsaddr == NULL) { + dprintk("%s NO device for dev_id %s\n", + __func__, deviceid_fmt(&fl->dev_id)); + goto out; + } + } + if (fl->first_stripe_index < 0 || + fl->first_stripe_index > dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %d\n", + __func__, fl->first_stripe_index); + goto out_put; + } + + if (fl->pattern_offset != 0) { + dprintk("%s Unsupported non-zero pattern_offset %Ld\n", + __func__, fl->pattern_offset); + goto out_put; + } + + if (fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Stripe unit (%u) not page aligned\n", + __func__, fl->stripe_unit); + goto out_put; + } + + /* XXX only support SPARSE packing.
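+ * (That is, we expect either a single fh shared by all data servers or + * one fh per data server; the num_fh check below rejects anything else.)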
+ * Don't support using the MDS open fh. */ + if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { + dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", + __func__, fl->num_fh, dsaddr->ds_num); + goto out_put; + } + + if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { + dprintk("%s Stripe unit (%u) not aligned with rsize %u " + "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, + nfss->wsize); + } + + nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); + + status = 0; +out: + dprintk("--> %s returns %d\n", __func__, status); + return status; +out_put: + nfs4_put_unset_layout_deviceid(lseg, &dsaddr->deviceid, + nfs4_fl_free_deviceid_callback); + goto out; +} + +static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); + +/* Decode layout and store in layoutid. Overwrite any existing layout + * information for this file. + */ +static int +filelayout_set_layout(struct nfs4_filelayout *flo, + struct nfs4_filelayout_segment *fl, + struct nfs4_layoutget_res *lgr) +{ + uint32_t *p = (uint32_t *)lgr->layout.buf; + uint32_t nfl_util; + int i; + + dprintk("%s: set_layout_map Begin\n", __func__); + + memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); + nfl_util = be32_to_cpup(p++); + if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) + fl->commit_through_mds = 1; + if (nfl_util & NFL4_UFLG_DENSE) + fl->stripe_type = STRIPE_DENSE; + else + fl->stripe_type = STRIPE_SPARSE; + fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; + + if (!flo->stripe_unit) + flo->stripe_unit = fl->stripe_unit; + else if (flo->stripe_unit != fl->stripe_unit) { + printk(KERN_NOTICE "%s: updating stripe_unit from %u to %u\n", + __func__, flo->stripe_unit, fl->stripe_unit); + flo->stripe_unit = fl->stripe_unit; + } + + fl->first_stripe_index = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &fl->pattern_offset); + fl->num_fh = be32_to_cpup(p++); + + dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu dev_id %s\n", + __func__, nfl_util, fl->num_fh, fl->first_stripe_index, + fl->pattern_offset, deviceid_fmt(&fl->dev_id)); + + if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { + fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); + if (fl->fh_array) + memset(fl->fh_array, 0, + fl->num_fh * sizeof(struct nfs_fh)); + } else { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), + GFP_KERNEL); + } + if (!fl->fh_array) + return -ENOMEM; + + for (i = 0; i < fl->num_fh; i++) { + /* fh */ + fl->fh_array[i].size = be32_to_cpup(p++); + if (sizeof(struct nfs_fh) < fl->fh_array[i].size) { + printk(KERN_ERR "Too big fh %d received %d\n", + i, fl->fh_array[i].size); + /* Layout is now invalid, pretend it doesn't exist */ + filelayout_free_fh_array(fl); + fl->num_fh = 0; + break; + } + memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); + p += XDR_QUADLEN(fl->fh_array[i].size); + dprintk("DEBUG: %s: fh len %d\n", __func__, + fl->fh_array[i].size); + } + + return 0; +} + +static struct pnfs_layout_segment * +filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, + struct nfs4_layoutget_res *lgr) +{ + struct nfs4_filelayout *flo = FILE_LO(layoutid); + struct pnfs_layout_segment *lseg; + int rc; + + dprintk("--> %s\n", __func__); + lseg = kzalloc(sizeof(struct pnfs_layout_segment) + + sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); + if (!lseg) + return NULL; + + rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); + + if (rc != 0 || filelayout_check_layout(layoutid, lseg)) {
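+ /* bad layout: unwind the partially built lseg */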
+ _filelayout_free_lseg(lseg); + lseg = NULL; + } + return lseg; +} + +static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) +{ + if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) + vfree(fl->fh_array); + else + kfree(fl->fh_array); + + fl->fh_array = NULL; +} + +static void +_filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + filelayout_free_fh_array(LSEG_LD_DATA(lseg)); + kfree(lseg); +} + +static void +filelayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + dprintk("--> %s\n", __func__); + nfs4_put_unset_layout_deviceid(lseg, lseg->deviceid, + nfs4_fl_free_deviceid_callback); + _filelayout_free_lseg(lseg); +} + +/* Allocate a new nfs_write_data struct and initialize */ +static struct nfs_write_data * +filelayout_clone_write_data(struct nfs_write_data *old) +{ + struct nfs_write_data *new; + + new = nfs_commitdata_alloc(); + if (!new) + goto out; + kref_init(&new->refcount); + new->parent = old; + kref_get(&old->refcount); + new->inode = old->inode; + new->cred = old->cred; + new->args.offset = 0; + new->args.count = 0; + new->res.count = 0; + new->res.fattr = &new->fattr; + nfs_fattr_init(&new->fattr); + new->res.verf = &new->verf; + new->args.context = get_nfs_open_context(old->args.context); + new->pdata.lseg = NULL; + new->pdata.call_ops = old->pdata.call_ops; + new->pdata.how = old->pdata.how; +out: + return new; +} + +static void filelayout_commit_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->pdata.call_ops->rpc_call_done(task, data); +} + +static struct rpc_call_ops filelayout_commit_call_ops = { + .rpc_call_prepare = nfs_write_prepare, + .rpc_call_done = filelayout_commit_call_done, + .rpc_release = filelayout_write_release, +}; + +/* + * Execute a COMMIT op to the MDS or to each data server on which a page + * in 'pages' exists. + * Invoke the pnfs_commit_complete callback. + */ +enum pnfs_try_status +filelayout_commit(struct nfs_write_data *data, int sync) +{ + LIST_HEAD(head); + struct nfs_page *req; + loff_t file_offset = 0; + u16 idx, i; + struct list_head **ds_page_list = NULL; + u16 *indices_used; + int num_indices_seen = 0; + const struct rpc_call_ops *call_ops; + struct rpc_clnt *clnt; + struct nfs_write_data **clone_list = NULL; + struct nfs_write_data *dsdata; + struct nfs4_pnfs_ds *ds; + + dprintk("%s data %p sync %d\n", __func__, data, sync); + + /* Alloc room for both in one go */ + ds_page_list = kzalloc((NFS4_PNFS_MAX_MULTI_CNT + 1) * + (sizeof(u16) + sizeof(struct list_head *)), + GFP_KERNEL); + if (!ds_page_list) + goto mem_error; + indices_used = (u16 *) (ds_page_list + NFS4_PNFS_MAX_MULTI_CNT + 1); + /* + * Sort pages based on which ds to send to. + * MDS is given index equal to NFS4_PNFS_MAX_MULTI_CNT. + * Note we are assuming there is only a single lseg in play. + * When that is not true, we could first sort on lseg, then + * sort within each as we do here.
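+ * + * Bookkeeping note: ds_page_list[idx] always points at the last request + * queued for idx, so each bucket stays contiguous in 'head' and the + * list_cut_position() below can split it off in one piece.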
+ */ + while (!list_empty(&data->pages)) { + req = nfs_list_entry(data->pages.next); + nfs_list_remove_request(req); + if (!req->wb_lseg || + ((struct nfs4_filelayout_segment *) + LSEG_LD_DATA(req->wb_lseg))->commit_through_mds) + idx = NFS4_PNFS_MAX_MULTI_CNT; + else { + file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; + idx = nfs4_fl_calc_ds_index(req->wb_lseg, file_offset); + } + if (ds_page_list[idx]) { + /* Already seen this idx */ + list_add(&req->wb_list, ds_page_list[idx]); + } else { + /* New idx not seen so far */ + list_add_tail(&req->wb_list, &head); + indices_used[num_indices_seen++] = idx; + } + ds_page_list[idx] = &req->wb_list; + } + /* Once created, clone must be released via call_op */ + clone_list = kzalloc(num_indices_seen * + sizeof(struct nfs_write_data *), GFP_KERNEL); + if (!clone_list) + goto mem_error; + for (i = 0; i < num_indices_seen - 1; i++) { + clone_list[i] = filelayout_clone_write_data(data); + if (!clone_list[i]) + goto mem_error; + } + clone_list[i] = data; + /* + * Now send off the RPCs to each ds. Note that it is important + * that any RPC to the MDS be sent last (or at least after all + * clones have been made.) + */ + for (i = 0; i < num_indices_seen; i++) { + dsdata = clone_list[i]; + idx = indices_used[i]; + list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); + if (idx == NFS4_PNFS_MAX_MULTI_CNT) { + call_ops = data->pdata.call_ops; + clnt = NFS_CLIENT(dsdata->inode); + ds = NULL; + } else { + struct nfs_fh *fh; + + call_ops = &filelayout_commit_call_ops; + req = nfs_list_entry(dsdata->pages.next); + ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); + if (!ds) { + /* Trigger retry of this chunk through MDS */ + dsdata->task.tk_status = -EIO; + data->pdata.call_ops->rpc_release(dsdata); + continue; + } + clnt = ds->ds_clp->cl_rpcclient; + dsdata->fldata.ds_nfs_client = ds->ds_clp; + file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; + fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); + if (fh) + dsdata->args.fh = fh; + } + dprintk("%s: Initiating commit: %llu USE DS:\n", + __func__, file_offset); + print_ds(ds); + + /* Send COMMIT to data server */ + nfs_initiate_commit(dsdata, clnt, call_ops, sync); + } + kfree(clone_list); + kfree(ds_page_list); + data->pdata.pnfs_error = 0; + return PNFS_ATTEMPTED; + + mem_error: + if (clone_list) { + for (i = 0; i < num_indices_seen - 1; i++) { + if (!clone_list[i]) + break; + data->pdata.call_ops->rpc_release(clone_list[i]); + } + kfree(clone_list); + } + kfree(ds_page_list); + /* One of these will be empty, but doesn't hurt to do both */ + nfs_mark_list_commit(&head); + nfs_mark_list_commit(&data->pages); + data->pdata.call_ops->rpc_release(data); + return PNFS_ATTEMPTED; +} + +/* Return the stripesize for the specified file */ +ssize_t +filelayout_get_stripesize(struct pnfs_layout_hdr *lo) +{ + struct nfs4_filelayout *flo = FILE_LO(lo); + + return flo->stripe_unit; +} + +/* + * filelayout_pg_test().
Called by nfs_can_coalesce_requests() + * + * return 1 : coalesce page + * return 0 : don't coalesce page + */ +int +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + + if (pgio->pg_boundary == 0) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + + do_div(p_stripe, pgio->pg_boundary); + do_div(r_stripe, pgio->pg_boundary); + + return (p_stripe == r_stripe); +} + +struct layoutdriver_io_operations filelayout_io_operations = { + .commit = filelayout_commit, + .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, + .alloc_layout = filelayout_alloc_layout, + .free_layout = filelayout_free_layout, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, + .initialize_mountpoint = filelayout_initialize_mountpoint, + .uninitialize_mountpoint = filelayout_uninitialize_mountpoint, +}; + +struct layoutdriver_policy_operations filelayout_policy_operations = { + .flags = PNFS_USE_RPC_CODE, + .get_stripesize = filelayout_get_stripesize, + .pg_test = filelayout_pg_test, +}; + +struct pnfs_layoutdriver_type filelayout_type = { + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .ld_io_ops = &filelayout_io_operations, + .ld_policy_ops = &filelayout_policy_operations, +}; + +static int __init nfs4filelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n", + __func__); + + /* + * Need to register file_operations struct with global list to indicate + * that NFS4 file layout is a possible pNFS I/O module + */ + pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type); + + return 0; +} + +static void __exit nfs4filelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n", + __func__); + + /* Unregister NFS4 file layout driver with pNFS client*/ + pnfs_unregister_layoutdriver(&filelayout_type); +} + +module_init(nfs4filelayout_init); +module_exit(nfs4filelayout_exit); diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c --- linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c.orig 2010-09-30 10:17:08.661995000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayoutdev.c 2010-09-30 10:17:08.663993000 -0400 @@ -0,0 +1,635 @@ +/* + * linux/fs/nfs/nfs4filelayoutdev.c + * + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Dean Hildebrand + * Garth Goodson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include "nfs4filelayout.h" +#include "internal.h" +#include "nfs4_fs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + dprintk("%s NULL device \n", __func__); + return; + } + dprintk(" ip_addr %x\n", ntohl(ds->ds_ip_addr)); + dprintk(" port %hu\n", ntohs(ds->ds_port)); + dprintk(" client %p\n", ds->ds_clp); + dprintk(" ref count %d\n", atomic_read(&ds->ds_count)); + if (ds->ds_clp) + dprintk(" cl_exchange_flags %x\n", + ds->ds_clp->cl_exchange_flags); + dprintk(" ip:port %s\n", ds->r_addr); +} + +void +print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr) +{ + int i; + + dprintk("%s dsaddr->ds_num %d\n", __func__, + dsaddr->ds_num); + for (i = 0; i < dsaddr->ds_num; i++) + print_ds(dsaddr->ds_list[i]); +} + +/* Debugging function assuming a 64bit major/minor split of the deviceid */ +char * +deviceid_fmt(const struct pnfs_deviceid *dev_id) +{ + static char buf[17]; + uint32_t *p = (uint32_t *)dev_id->data; + uint64_t major, minor; + + p = xdr_decode_hyper(p, &major); + p = xdr_decode_hyper(p, &minor); + + sprintf(buf, "%08llu %08llu", major, minor); + return buf; +} + +/* nfs4_ds_cache_lock is held */ +static inline struct nfs4_pnfs_ds * +_data_server_lookup(u32 ip_addr, u32 port) +{ + struct nfs4_pnfs_ds *ds; + + dprintk("_data_server_lookup: ip_addr=%x port=%hu\n", + ntohl(ip_addr), ntohs(port)); + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) { + if (ds->ds_ip_addr == ip_addr && + ds->ds_port == port) { + return ds; + } + } + return NULL; +} + +/* Create an rpc to the data server defined in 'dev_list' */ +static int +nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_server *tmp; + struct sockaddr_in sin; + struct rpc_clnt *mds_clnt = mds_srv->client; + struct nfs_client *clp = mds_srv->nfs_client; + struct sockaddr *mds_addr; + int err = 0; + + dprintk("--> %s ip:port %s au_flavor %d\n", __func__, + ds->r_addr, mds_clnt->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + /* + * If this DS is also the MDS, use the MDS session only if the + * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role. 
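+ * Reusing the MDS nfs_client avoids a second EXCHANGE_ID and session + * to the same address; cl_count is bumped and the session shared instead.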
+ */ + mds_addr = (struct sockaddr *)&clp->cl_addr; + if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { + printk(KERN_INFO "ip:port %s is not a pNFS Data " + "Server\n", ds->r_addr); + err = -ENODEV; + } else { + atomic_inc(&clp->cl_count); + ds->ds_clp = clp; + dprintk("%s Using MDS Session for DS\n", __func__); + } + goto out; + } + + /* Temporary server for nfs4_set_client */ + tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out; + } + + /* + * Set a retrans, timeout interval, and authflavor equal to the MDS + * values. Use the MDS nfs_client cl_ipaddr field so as to use the + * same co_ownerid as the MDS. + */ + err = nfs4_set_client(tmp, + mds_srv->nfs_client->cl_hostname, + (struct sockaddr *)&sin, + sizeof(struct sockaddr), + mds_srv->nfs_client->cl_ipaddr, + mds_clnt->cl_auth->au_flavor, + IPPROTO_TCP, + mds_clnt->cl_xprt->timeout, + 1 /* minorversion */); + if (err < 0) + goto out_free; + + clp = tmp->nfs_client; + + /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ + dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); + clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; + + err = nfs4_recover_expired_lease(clp); + if (!err) + err = nfs4_check_client_ready(clp); + if (err) + goto out_put; + + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { + printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", + ds->r_addr); + err = -ENODEV; + goto out_put; + } + /* + * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role. + * is_ds_only_session() depends on this. + */ + clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; + /* + * Set DS lease equal to the MDS lease, renewal is scheduled in + * create_session + */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + ds->ds_clp = clp; + + dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + clp->cl_rpcclient); +out_free: + kfree(tmp); +out: + dprintk("%s Returns %d\n", __func__, err); + return err; +out_put: + nfs_put_client(clp); + goto out_free; +} + +static void +destroy_ds(struct nfs4_pnfs_ds *ds) +{ + dprintk("--> %s\n", __func__); + print_ds(ds); + + if (ds->ds_clp) + nfs_put_client(ds->ds_clp); + kfree(ds); +} + +static void +nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) +{ + struct nfs4_pnfs_ds *ds; + int i; + + dprintk("%s: device id=%s\n", __func__, + deviceid_fmt(&dsaddr->deviceid.de_id)); + + for (i = 0; i < dsaddr->ds_num; i++) { + ds = dsaddr->ds_list[i]; + if (ds != NULL) { + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } + } + } + kfree(dsaddr->stripe_indices); + kfree(dsaddr); +} + +void +nfs4_fl_free_deviceid_callback(struct kref *kref) +{ + struct nfs4_deviceid *device = + container_of(kref, struct nfs4_deviceid, de_kref); + struct nfs4_file_layout_dsaddr *dsaddr = + container_of(device, struct nfs4_file_layout_dsaddr, deviceid); + + nfs4_fl_free_deviceid(dsaddr); +} + +static void +nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, + u32 ip_addr, u32 port, char *r_addr, int len) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds; + + *dsp = NULL; + + ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); + if (!ds) + return; + +
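+ /*
+ * Look up ip:port in the global data server cache while holding
+ * nfs4_ds_cache_lock. On a hit, take a ds_count reference on the
+ * cached entry and free the allocation made above; on a miss,
+ * initialize and publish the new entry.
+ */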
spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup(ip_addr, port); + if (tmp_ds == NULL) { + ds->ds_ip_addr = ip_addr; + ds->ds_port = port; + strncpy(ds->r_addr, r_addr, len); + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + *dsp = ds; + dprintk("%s add new data server ip 0x%x\n", __func__, + ds->ds_ip_addr); + spin_unlock(&nfs4_ds_cache_lock); + } else { + atomic_inc(&tmp_ds->ds_count); + *dsp = tmp_ds; + dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_ip_addr, + atomic_read(&tmp_ds->ds_count)); + spin_unlock(&nfs4_ds_cache_lock); + kfree(ds); + } +} + +static struct nfs4_pnfs_ds * +decode_and_add_ds(uint32_t **pp, struct inode *inode) +{ + struct nfs4_pnfs_ds *ds = NULL; + char r_addr[29]; /* max size of ip/port string */ + int len; + u32 ip_addr, port; + int tmp[6]; + uint32_t *p = *pp; + + dprintk("%s enter\n", __func__); + /* check and skip r_netid */ + len = be32_to_cpup(p++); + /* "tcp" */ + if (len != 3) { + printk("%s: ERROR: non-TCP r_netid len %d\n", + __func__, len); + goto out_err; + } + /* + * Read the bytes into a temporary buffer + * XXX: should probably sanity check them + */ + tmp[0] = be32_to_cpup(p++); + + len = be32_to_cpup(p++); + if (len >= sizeof(r_addr)) { + printk("%s: ERROR: Device ip/port too long (%d)\n", + __func__, len); + goto out_err; + } + memcpy(r_addr, p, len); + p += XDR_QUADLEN(len); + *pp = p; + r_addr[len] = '\0'; + sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], + &tmp[2], &tmp[3], &tmp[4], &tmp[5]); + ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); + port = htons((tmp[4] << 8) | (tmp[5])); + + nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); + + dprintk("%s: addr:port string = %s\n", __func__, r_addr); + return ds; +out_err: + dprintk("%s returned NULL\n", __func__); + return NULL; +} + +/* Decode opaque device data and return the result */ +static struct nfs4_file_layout_dsaddr* +decode_device(struct inode *ino, struct pnfs_device *pdev) +{ + int i, dummy; + u32 cnt, num; + u8 *indexp; + uint32_t *p = (u32 *)pdev->area, *indicesp; + struct nfs4_file_layout_dsaddr *dsaddr; + + /* Get the stripe count (number of stripe indices) */ + cnt = be32_to_cpup(p++); + dprintk("%s stripe count %d\n", __func__, cnt); + if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { + printk(KERN_WARNING "%s: stripe count %d greater than " + "supported maximum %d\n", __func__, + cnt, NFS4_PNFS_MAX_STRIPE_CNT); + goto out_err; + } + + /* Check the multipath list count */ + indicesp = p; + p += XDR_QUADLEN(cnt << 2); + num = be32_to_cpup(p++); + dprintk("%s ds_num %u\n", __func__, num); + if (num > NFS4_PNFS_MAX_MULTI_CNT) { + printk(KERN_WARNING "%s: multipath count %d greater than " + "supported maximum %d\n", __func__, + num, NFS4_PNFS_MAX_MULTI_CNT); + goto out_err; + } + dsaddr = kzalloc(sizeof(*dsaddr) + + (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), + GFP_KERNEL); + if (!dsaddr) + goto out_err; + + dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); + if (!dsaddr->stripe_indices) + goto out_err_free; + + dsaddr->stripe_count = cnt; + dsaddr->ds_num = num; + + memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, + NFS4_PNFS_DEVICEID4_SIZE); + + /* Go back and read stripe indices */ + p = indicesp; + indexp = &dsaddr->stripe_indices[0]; + for (i = 0; i < dsaddr->stripe_count; i++) { + dummy = be32_to_cpup(p++); + *indexp = dummy; /* bound by NFS4_PNFS_MAX_MULTI_CNT */ + indexp++; + } + /* Skip already
read multipath list count */ + p++; + + for (i = 0; i < dsaddr->ds_num; i++) { + int j; + + dummy = be32_to_cpup(p++); /* multipath count */ + if (dummy > 1) { + printk(KERN_WARNING + "%s: Multipath count %d not supported, " + "skipping all greater than 1\n", __func__, + dummy); + } + for (j = 0; j < dummy; j++) { + if (j == 0) { + dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); + if (dsaddr->ds_list[i] == NULL) + goto out_err_free; + } else { + u32 len; + /* skip extra multipath */ + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + len = be32_to_cpup(p++); + p += XDR_QUADLEN(len); + continue; + } + } + } + nfs4_init_deviceid_node(&dsaddr->deviceid); + + return dsaddr; + +out_err_free: + nfs4_fl_free_deviceid(dsaddr); +out_err: + dprintk("%s ERROR: returning NULL\n", __func__); + return NULL; +} + +/* + * Decode the opaque device specified in 'dev' + * and add it to the list of available devices. + * If the deviceid is already cached, nfs4_add_get_deviceid will return + * a pointer to the cached struct and throw away the new one. + */ +static struct nfs4_file_layout_dsaddr* +decode_and_add_device(struct inode *inode, struct pnfs_device *dev) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + struct nfs4_deviceid *d; + + dsaddr = decode_device(inode, dev); + if (!dsaddr) { + printk(KERN_WARNING "%s: Could not decode or add device\n", + __func__); + return NULL; + } + + d = nfs4_add_get_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, + &dsaddr->deviceid); + + return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} + +/* + * Retrieve the information for dev_id, add it to the list + * of available devices, and return it. + */ +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) +{ + struct pnfs_device *pdev = NULL; + u32 max_resp_sz; + int max_pages; + struct page **pages = NULL; + struct nfs4_file_layout_dsaddr *dsaddr = NULL; + int rc, i; + struct nfs_server *server = NFS_SERVER(inode); + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + max_pages = max_resp_sz >> PAGE_SHIFT; + dprintk("%s inode %p max_resp_sz %u max_pages %d\n", + __func__, inode, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); + if (pdev == NULL) + return NULL; + + pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); + if (pages == NULL) { + kfree(pdev); + return NULL; + } + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(GFP_KERNEL); + if (!pages[i]) + goto out_free; + } + + /* set pdev->area */ + pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); + if (!pdev->area) + goto out_free; + + memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); + pdev->layout_type = LAYOUT_NFSV4_1_FILES; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = PAGE_SIZE * max_pages; + pdev->mincount = 0; + /* TODO: Update types when CB_NOTIFY_DEVICEID is available */ + pdev->dev_notify_types = 0; + + rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint.
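+ * Note that decode_and_add_device() may return the dsaddr already
+ * cached for this deviceid rather than the newly decoded one.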
+ */ + dsaddr = decode_and_add_device(inode, pdev); +out_free: + if (pdev->area != NULL) + vunmap(pdev->area); + for (i = 0; i < max_pages; i++) + if (pages[i]) + __free_page(pages[i]); + kfree(pages); + kfree(pdev); + dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); + return dsaddr; +} + +struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct pnfs_deviceid *id) +{ + struct nfs4_deviceid *d; + + d = nfs4_find_get_deviceid(clp->cl_devid_cache, id); + dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, + deviceid_fmt(id), d); + return (d == NULL) ? NULL : + container_of(d, struct nfs4_file_layout_dsaddr, deviceid); +} + +/* + * Want res = (offset - layout->pattern_offset) / layout->stripe_unit + * Then: ((res + first_stripe_index) % dsaddr->stripe_count) + */ +static inline u32 +_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + u32 j; + + j = _nfs4_fl_calc_j_index(lseg, offset); + return FILE_DSADDR(lseg)->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, offset); + } else + i = _nfs4_fl_calc_j_index(lseg, offset); + return &flseg->fh_array[i]; +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); + struct nfs4_file_layout_dsaddr *dsaddr; + + dsaddr = FILE_DSADDR(lseg); + if (dsaddr->ds_list[ds_idx] == NULL) { + printk(KERN_ERR "%s: No data server for device id (%s)!!\n", + __func__, deviceid_fmt(&flseg->dev_id)); + return NULL; + } + + if (!dsaddr->ds_list[ds_idx]->ds_clp) { + int err; + + err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), + dsaddr->ds_list[ds_idx]); + if (err) { + printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", + __func__, err); + return NULL; + } + } + dprintk("%s: dev_id=%s, ds_idx=%u\n", + __func__, deviceid_fmt(&flseg->dev_id), ds_idx); + + return dsaddr->ds_list[ds_idx]; +} + diff -up linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h --- linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h.orig 2010-09-30 10:17:08.657991000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4filelayout.h 2010-09-30 10:17:08.658997000 -0400 @@ -0,0 +1,96 @@ +/* + * linux/fs/nfs/nfs4filelayout.h + * + * NFSv4 file layout driver data structures. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved.
+ * + * Dean Hildebrand + */ + +#ifndef FS_NFS_NFS4FILELAYOUT_H +#define FS_NFS_NFS4FILELAYOUT_H + +#include +#include + +#define NFS4_PNFS_DEV_HASH_BITS 5 +#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) +#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) + +#define NFS4_PNFS_MAX_STRIPE_CNT 4096 +#define NFS4_PNFS_MAX_MULTI_CNT 64 /* up to 256 would fit into a u8 stripe_index */ +#define NFS4_PNFS_MAX_MULTI_DS 2 + +#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ + struct nfs4_file_layout_dsaddr, \ + deviceid)) + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + u32 ds_ip_addr; + u32 ds_port; + struct nfs_client *ds_clp; + atomic_t ds_count; + char r_addr[29]; +}; + +struct nfs4_file_layout_dsaddr { + struct nfs4_deviceid deviceid; + u32 stripe_count; + u8 *stripe_indices; + u32 ds_num; + struct nfs4_pnfs_ds *ds_list[1]; +}; + +struct nfs4_pnfs_dev_hlist { + rwlock_t dev_lock; + struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; +}; + +struct nfs4_filelayout_segment { + u32 stripe_type; + u32 commit_through_mds; + u32 stripe_unit; + u32 first_stripe_index; + u64 pattern_offset; + struct pnfs_deviceid dev_id; + unsigned int num_fh; + struct nfs_fh *fh_array; +}; + +struct nfs4_filelayout { + struct pnfs_layout_hdr fl_layout; + u32 stripe_unit; +}; + +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); + +static inline struct nfs4_filelayout * +FILE_LO(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_filelayout, fl_layout); +} + +extern struct pnfs_client_operations *pnfs_callback_ops; + +extern void nfs4_fl_free_deviceid_callback(struct kref *); +extern void print_ds(struct nfs4_pnfs_ds *ds); +char *deviceid_fmt(const struct pnfs_deviceid *dev_id); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs_client *, struct pnfs_deviceid *dev_id); +struct nfs4_file_layout_dsaddr * +get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); + +#endif /* FS_NFS_NFS4FILELAYOUT_H */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.34.noarch/fs/nfs/nfs4_fs.h --- linux-2.6.34.noarch/fs/nfs/nfs4_fs.h.orig 2010-09-30 10:15:17.839715000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4_fs.h 2010-09-30 10:17:08.649992000 -0400 @@ -45,8 +45,28 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, - NFS4CLNT_SESSION_DRAINING, NFS4CLNT_RECALL_SLOT, + NFS4CLNT_LAYOUT_RECALL, +}; + +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + int (*validate_stateid)(struct nfs_delegation *, + const nfs4_stateid *); + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops *state_renewal_ops; }; /* @@ -89,7 +109,6 @@ struct nfs_unique_id { */ struct nfs4_state_owner { struct nfs_unique_id so_owner_id; - struct nfs_client *so_client; struct nfs_server *so_server; struct rb_node so_client_node; @@ -99,7
+118,6 @@ struct nfs4_state_owner { atomic_t so_count; unsigned long so_flags; struct list_head so_states; - struct list_head so_delegations; struct nfs_seqid_counter so_seqid; struct rpc_sequence so_sequence; }; @@ -125,10 +143,20 @@ enum { * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) */ +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ - fl_owner_t ls_owner; /* POSIX lock owner */ #define NFS_LOCK_INITIALIZED 1 int ls_flags; struct nfs_seqid_counter ls_seqid; @@ -136,6 +164,7 @@ struct nfs4_lock_state { struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; + struct nfs4_lock_owner ls_owner; }; /* bits for nfs4_state->flags */ @@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); -extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; -extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; #if defined(CONFIG_NFS_V4_1) -extern int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_session *ds_session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); +extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); extern int nfs4_proc_create_session(struct nfs_client *); extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); #else /* CONFIG_NFS_v4_1 */ -static inline int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_session *ds_session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { @@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru } #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; -extern const u32 nfs4_fsinfo_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[3]; extern const u32 nfs4_fs_locations_bitmap[2]; /* nfs4renewd.c */ @@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, 
struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); +/* write.c */ extern const nfs4_stateid zero_stateid; /* nfs4xdr.c */ diff -up linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.34.noarch/fs/nfs/nfs4proc.c --- linux-2.6.34.noarch/fs/nfs/nfs4proc.c.orig 2010-09-30 10:15:17.855715000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4proc.c 2010-09-30 10:17:08.673994000 -0400 @@ -49,12 +49,14 @@ #include #include #include +#include #include "nfs4_fs.h" #include "delegation.h" #include "internal.h" #include "iostat.h" #include "callback.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -67,7 +69,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, @@ -125,11 +127,16 @@ const u32 nfs4_pathconf_bitmap[2] = { 0 }; -const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE +const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE | FATTR4_WORD0_LEASE_TIME, +#ifdef CONFIG_NFS_V4_1 + FATTR4_WORD1_FS_LAYOUT_TYPES, + FATTR4_WORD2_LAYOUT_BLKSIZE +#else /* CONFIG_NFS_V4_1 */ 0 +#endif /* CONFIG_NFS_V4_1 */ }; const u32 nfs4_fs_locations_bitmap[2] = { @@ -356,7 +363,7 @@ static void nfs41_check_drain_session_co { struct rpc_task *task; - if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); if (task) rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); @@ -370,12 +377,11 @@ static void nfs41_check_drain_session_co complete(&ses->complete); } -static void nfs41_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot_table *tbl; - tbl = &clp->cl_session->fc_slot_table; + tbl = &res->sr_session->fc_slot_table; if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ @@ -385,18 +391,17 @@ static void nfs41_sequence_free_slot(con spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); - nfs41_check_drain_session_complete(clp->cl_session); + nfs41_check_drain_session_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } -static void nfs41_sequence_done(struct nfs_client *clp, - struct 
nfs4_sequence_res *res, - int rpc_status) +static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { unsigned long timestamp; struct nfs4_slot_table *tbl; struct nfs4_slot *slot; + struct nfs_client *clp; /* * sr_status remains 1 if an RPC level error occurred. The server @@ -411,13 +416,16 @@ static void nfs41_sequence_done(struct n if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) goto out; + tbl = &res->sr_session->fc_slot_table; + slot = tbl->slots + res->sr_slotid; + /* Check the SEQUENCE operation status */ - if (res->sr_status == 0) { - tbl = &clp->cl_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; + switch (res->sr_status) { + case 0: /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = res->sr_renewal_time; + clp = res->sr_session->clp; spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal, timestamp)) clp->cl_last_renewal = timestamp; @@ -425,11 +433,39 @@ static void nfs41_sequence_done(struct n /* Check sequence flags */ if (atomic_read(&clp->cl_count) > 1) nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + break; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%d seq=%d: Operation in progress\n", + __func__, res->sr_slotid, slot->seq_nr); + goto out_retry; + default: + /* Just update the slot sequence no. */ + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. */ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(clp, res); + nfs41_sequence_free_slot(res); + return 1; +out_retry: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + rpc_restart_call(task); + /* FIXME: rpc_restart_call() should be made to return success/fail */ + if (RPC_ASSASSINATED(task)) + goto out; + return 0; +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_session == NULL) + return 1; + return nfs41_sequence_done(task, res); } /* @@ -480,12 +516,11 @@ static int nfs41_setup_sequence(struct n if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) return 0; - memset(res, 0, sizeof(*res)); res->sr_slotid = NFS4_MAX_SLOT_TABLE; tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && + if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { /* * The state manager will wait until the slot table is empty. @@ -525,6 +560,7 @@ static int nfs41_setup_sequence(struct n res->sr_session = session; res->sr_slotid = slotid; res->sr_renewal_time = jiffies; + res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain * set to 1 if an rpc level failure occurs. 
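The helpers above settle into one calling convention used by every RPC completion callback converted in this patch: nfs4_sequence_done()/nfs41_sequence_done() return 0 when the task has been restarted (the -NFS4ERR_DELAY path, slot retained) and 1 once the slot has been freed and the reply may be processed. A minimal caller sketch; foo_done() and struct foo_data are hypothetical stand-ins, only the helpers are from this patch:

	static void foo_done(struct rpc_task *task, void *calldata)
	{
		struct foo_data *data = calldata;

		/* 0 means the task was restarted; the reply is not valid yet */
		if (!nfs4_sequence_done(task, &data->res.seq_res))
			return;

		/* ... per-operation completion handling goes here ... */
	}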
@@ -533,33 +569,36 @@ static int nfs41_setup_sequence(struct n return 0; } -int nfs4_setup_sequence(struct nfs_client *clp, +int nfs4_setup_sequence(const struct nfs_server *server, + struct nfs4_session *ds_session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { + struct nfs4_session *session = nfs4_get_session(server); int ret = 0; + if (ds_session) + session = ds_session; + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; + goto out; + } + dprintk("--> %s clp %p session %p sr_slotid %d\n", - __func__, clp, clp->cl_session, res->sr_slotid); + __func__, session->clp, session, res->sr_slotid); - if (!nfs4_has_session(clp)) - goto out; - ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, + ret = nfs41_setup_sequence(session, args, res, cache_reply, task); - if (ret && ret != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = ret; - task->tk_action = NULL; - } out: dprintk("<-- %s status=%d\n", __func__, ret); return ret; } struct nfs41_call_sync_data { - struct nfs_client *clp; + const struct nfs_server *seq_server; struct nfs4_sequence_args *seq_args; struct nfs4_sequence_res *seq_res; int cache_reply; @@ -569,9 +608,9 @@ static void nfs41_call_sync_prepare(stru { struct nfs41_call_sync_data *data = calldata; - dprintk("--> %s data->clp->cl_session %p\n", __func__, - data->clp->cl_session); - if (nfs4_setup_sequence(data->clp, data->seq_args, + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, data->seq_res, data->cache_reply, task)) return; rpc_call_start(task); @@ -587,7 +626,7 @@ static void nfs41_call_sync_done(struct { struct nfs41_call_sync_data *data = calldata; - nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); + nfs41_sequence_done(task, data->seq_res); } struct rpc_call_ops nfs41_call_sync_ops = { @@ -600,8 +639,7 @@ struct rpc_call_ops nfs41_call_priv_sync .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_client *clp, - struct rpc_clnt *clnt, +static int nfs4_call_sync_sequence(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -611,13 +649,13 @@ static int nfs4_call_sync_sequence(struc int ret; struct rpc_task *task; struct nfs41_call_sync_data data = { - .clp = clp, + .seq_server = server, .seq_args = args, .seq_res = res, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = clnt, + .rpc_client = server->client, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -642,10 +680,15 @@ int _nfs4_call_sync_session(struct nfs_s struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server->nfs_client, server->client, - msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); } +#else +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return 1; +} #endif /* CONFIG_NFS_V4_1 */ int _nfs4_call_sync(struct nfs_server *server, @@ -659,18 +702,9 @@ int _nfs4_call_sync(struct nfs_server *s } #define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ + (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ &(res)->seq_res, (cache_reply)) -static void nfs4_sequence_done(const struct nfs_server 
*server, - struct nfs4_sequence_res *res, int rpc_status) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(server->nfs_client)) - nfs41_sequence_done(server->nfs_client, res, rpc_status); -#endif /* CONFIG_NFS_V4_1 */ -} - static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -745,19 +779,14 @@ static struct nfs4_opendata *nfs4_openda p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_EXCL) { - if (nfs4_has_persistent_session(server->nfs_client)) { - /* GUARDED */ - p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); - } else { /* EXCLUSIVE4_1 */ - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; - } - } else if (flags & O_CREAT) { + if (flags & O_CREAT) { + u32 *s; + p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; @@ -851,8 +880,10 @@ static void update_open_stateflags(struc static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) { if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); - memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); + memcpy(state->stateid.u.data, stateid->u.data, + sizeof(state->stateid.u.data)); + memcpy(state->open_stateid.u.data, stateid->u.data, + sizeof(state->open_stateid.u.data)); switch (fmode) { case FMODE_READ: set_bit(NFS_O_RDONLY_STATE, &state->flags); @@ -880,7 +911,8 @@ static void __update_open_stateid(struct */ write_seqlock(&state->seqlock); if (deleg_stateid != NULL) { - memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); + memcpy(state->stateid.u.data, deleg_stateid->u.data, + sizeof(state->stateid.u.data)); set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) @@ -911,7 +943,8 @@ static int update_open_stateid(struct nf if (delegation == NULL) delegation = &deleg_cur->stateid; - else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) + else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, + NFS4_STATEID_SIZE) != 0) goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); @@ -973,7 +1006,8 @@ static struct nfs4_state *nfs4_try_open_ break; } /* Save the delegation */ - memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); + memcpy(stateid.u.data, delegation->stateid.u.data, + sizeof(stateid.u.data)); rcu_read_unlock(); ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); if (ret != 0) @@ -1127,10 +1161,13 @@ static int nfs4_open_recover(struct nfs4 * Check if we need to update the current stateid. 
*/ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && - memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { + memcmp(state->stateid.u.data, state->open_stateid.u.data, + sizeof(state->stateid.u.data)) != 0) { write_seqlock(&state->seqlock); if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); + memcpy(state->stateid.u.data, + state->open_stateid.u.data, + sizeof(state->stateid.u.data)); write_sequnlock(&state->seqlock); } return 0; @@ -1199,8 +1236,8 @@ static int _nfs4_open_delegation_recall( if (IS_ERR(opendata)) return PTR_ERR(opendata); opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; - memcpy(opendata->o_arg.u.delegation.data, stateid->data, - sizeof(opendata->o_arg.u.delegation.data)); + memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, + sizeof(opendata->o_arg.u.delegation.u.data)); ret = nfs4_open_recover(opendata, state); nfs4_opendata_put(opendata); return ret; @@ -1258,8 +1295,8 @@ static void nfs4_open_confirm_done(struc if (RPC_ASSASSINATED(task)) return; if (data->rpc_status == 0) { - memcpy(data->o_res.stateid.data, data->c_res.stateid.data, - sizeof(data->o_res.stateid.data)); + memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, + sizeof(data->o_res.stateid.u.data)); nfs_confirm_seqid(&data->owner->so_seqid, 0); renew_lease(data->o_res.server, data->timestamp); data->rpc_done = 1; @@ -1356,13 +1393,13 @@ static void nfs4_open_prepare(struct rpc } /* Update sequence id. */ data->o_arg.id = sp->so_owner_id.id; - data->o_arg.clientid = sp->so_client->cl_clientid; + data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->o_arg.server->nfs_client, + if (nfs4_setup_sequence(data->o_arg.server, NULL, &data->o_arg.seq_args, &data->o_res.seq_res, 1, task)) return; @@ -1385,8 +1422,8 @@ static void nfs4_open_done(struct rpc_ta data->rpc_status = task->tk_status; - nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) + return; if (RPC_ASSASSINATED(task)) return; @@ -1539,9 +1576,8 @@ static int _nfs4_proc_open(struct nfs4_o return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) +int nfs4_recover_expired_lease(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; unsigned int loop; int ret; @@ -1557,6 +1593,7 @@ static int nfs4_recover_expired_lease(st } return ret; } +EXPORT_SYMBOL(nfs4_recover_expired_lease); /* * OPEN_EXPIRED: @@ -1646,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } - status = nfs4_recover_expired_lease(server); + status = nfs4_recover_expired_lease(server->nfs_client); if (status != 0) goto err_put_state_owner; if (path->dentry->d_inode != NULL) @@ -1773,7 +1810,7 @@ static int _nfs4_do_setattr(struct inode if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - nfs4_copy_stateid(&arg.stateid, state, current->files); + nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1838,7 +1875,8 @@ static void nfs4_close_done(struct rpc_t struct 
nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); - nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; if (RPC_ASSASSINATED(task)) return; /* hmm. we are done with the inode, and in the process of freeing @@ -1858,7 +1896,7 @@ static void nfs4_close_done(struct rpc_t if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -1903,7 +1941,7 @@ static void nfs4_close_prepare(struct rp nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL, &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -2325,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs4_state *state = NULL; int status; + if (pnfs_ld_layoutret_on_setattr(inode)) + pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); + nfs_fattr_init(fattr); /* Search for an existing open(O_WRITE) file */ @@ -2650,8 +2691,9 @@ static int nfs4_proc_unlink_done(struct { struct nfs_removeres *res = task->tk_msg.rpc_resp; - nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; + if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); nfs_post_op_update_inode(dir, res->dir_attr); @@ -3092,18 +3134,31 @@ static int nfs4_proc_pathconf(struct nfs static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); + struct nfs_client *client = server->nfs_client; dprintk("--> %s\n", __func__); - nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); +#ifdef CONFIG_NFS_V4_1 + if (data->pdata.pnfsflags & PNFS_NO_RPC) + return 0; + + /* Is this a DS session */ + if (data->fldata.ds_nfs_client) { + dprintk("%s DS read\n", __func__); + client = data->fldata.ds_nfs_client; + } +#endif /* CONFIG_NFS_V4_1 */ + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; - if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, server->nfs_client); + if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { + nfs_restart_rpc(task, client); return -EAGAIN; } nfs_invalidate_atime(data->inode); - if (task->tk_status > 0) + if (task->tk_status > 0 && client == server->nfs_client) renew_lease(server, data->timestamp); return 0; } @@ -3114,20 +3169,56 @@ static void nfs4_proc_read_setup(struct msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } +static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data) +{ +#ifdef CONFIG_NFS_V4_1 + pnfs_update_last_write(nfsi, data->args.offset, data->res.count); + pnfs_need_layoutcommit(nfsi, data->args.context); +#endif /* CONFIG_NFS_V4_1 */ +} + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_client *client = server->nfs_client; - if 
(nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { - nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + +#ifdef CONFIG_NFS_V4_1 + /* restore original count after retry? */ + if (data->pdata.orig_count) { + dprintk("%s: restoring original count %u\n", __func__, + data->pdata.orig_count); + data->args.count = data->pdata.orig_count; + } + + if (data->pdata.pnfsflags & PNFS_NO_RPC) + return 0; + + /* Is this a DS session */ + if (data->fldata.ds_nfs_client) { + dprintk("%s DS write\n", __func__); + client = data->fldata.ds_nfs_client; + } +#endif /* CONFIG_NFS_V4_1 */ + + if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) { + nfs_restart_rpc(task, client); return -EAGAIN; } + + /* + * MDS write: renew lease + * DS write: update lastbyte written, mark for layout commit + */ if (task->tk_status >= 0) { - renew_lease(NFS_SERVER(inode), data->timestamp); - nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + if (client == server->nfs_client) { + renew_lease(server, data->timestamp); + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); + } else + pnfs4_update_write_done(NFS_I(inode), data); } return 0; } @@ -3140,20 +3231,42 @@ static void nfs4_proc_write_setup(struct data->res.server = server; data->timestamp = jiffies; +#ifdef CONFIG_NFS_V4_1 + /* writes to DS use pnfs vector */ + if (data->fldata.ds_nfs_client) { + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE]; + return; + } +#endif /* CONFIG_NFS_V4_1 */ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; } static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + struct nfs_server *server = NFS_SERVER(data->inode); + struct nfs_client *client = server->nfs_client; + +#ifdef CONFIG_NFS_V4_1 + if (data->pdata.pnfsflags & PNFS_NO_RPC) + return 0; + + /* Is this a DS session */ + if (data->fldata.ds_nfs_client) { + dprintk("%s DS commit\n", __func__); + client = data->fldata.ds_nfs_client; + } +#endif /* CONFIG_NFS_V4_1 */ + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; } - nfs_refresh_inode(inode, data->res.fattr); + if (client == server->nfs_client) + nfs_refresh_inode(inode, data->res.fattr); return 0; } @@ -3163,6 +3276,12 @@ static void nfs4_proc_commit_setup(struc data->args.bitmask = server->cache_consistency_bitmask; data->res.server = server; +#if defined(CONFIG_NFS_V4_1) + if (data->fldata.ds_nfs_client) { + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT]; + return; + } +#endif /* CONFIG_NFS_V4_1 */ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; } @@ -3466,9 +3585,12 @@ static int nfs4_proc_set_acl(struct inod } static int -_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp) { - if (!clp || task->tk_status >= 0) + if (!clp) + clp = server->nfs_client; + + if (task->tk_status >= 0) return 0; switch(task->tk_status) { case 
-NFS4ERR_ADMIN_REVOKED: @@ -3493,8 +3615,9 @@ _nfs4_async_handle_error(struct rpc_task case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - dprintk("%s ERROR %d, Reset session\n", __func__, - task->tk_status); + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); nfs4_schedule_state_recovery(clp); task->tk_status = 0; return -EAGAIN; @@ -3514,6 +3637,8 @@ _nfs4_async_handle_error(struct rpc_task task->tk_status = nfs4_map_errors(task->tk_status); return 0; do_state_recovery: + if (is_ds_only_client(clp)) + return 0; rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) @@ -3522,12 +3647,6 @@ do_state_recovery: return -EAGAIN; } -static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) -{ - return _nfs4_async_handle_error(task, server, server->nfs_client, state); -} - int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred, struct nfs4_setclientid_res *res) @@ -3643,8 +3762,8 @@ static void nfs4_delegreturn_done(struct { struct nfs4_delegreturndata *data = calldata; - nfs4_sequence_done(data->res.server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; switch (task->tk_status) { case -NFS4ERR_STALE_STATEID: @@ -3653,8 +3772,8 @@ static void nfs4_delegreturn_done(struct renew_lease(data->res.server, data->timestamp); break; default: - if (nfs4_async_handle_error(task, data->res.server, NULL) == - -EAGAIN) { + if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) + == -EAGAIN) { nfs_restart_rpc(task, data->res.server->nfs_client); return; } @@ -3674,7 +3793,7 @@ static void nfs4_delegreturn_prepare(str d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server->nfs_client, + if (nfs4_setup_sequence(d_data->res.server, NULL, &d_data->args.seq_args, &d_data->res.seq_res, 1, task)) return; @@ -3894,15 +4013,16 @@ static void nfs4_locku_done(struct rpc_t { struct nfs4_unlockdata *calldata = data; - nfs4_sequence_done(calldata->server, &calldata->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) + return; if (RPC_ASSASSINATED(task)) return; switch (task->tk_status) { case 0: - memcpy(calldata->lsp->ls_stateid.data, - calldata->res.stateid.data, - sizeof(calldata->lsp->ls_stateid.data)); + memcpy(calldata->lsp->ls_stateid.u.data, + calldata->res.stateid.u.data, + sizeof(calldata->lsp->ls_stateid.u. 
+ data)); renew_lease(calldata->server, calldata->timestamp); break; case -NFS4ERR_BAD_STATEID: @@ -3911,7 +4031,7 @@ static void nfs4_locku_done(struct rpc_t case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN) nfs_restart_rpc(task, calldata->server->nfs_client); } @@ -3929,7 +4049,7 @@ static void nfs4_locku_prepare(struct rp return; } calldata->timestamp = jiffies; - if (nfs4_setup_sequence(calldata->server->nfs_client, + if (nfs4_setup_sequence(calldata->server, NULL, &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -4084,7 +4204,8 @@ static void nfs4_lock_prepare(struct rpc } else data->arg.new_lock_owner = 0; data->timestamp = jiffies; - if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, + if (nfs4_setup_sequence(data->server, NULL, + &data->arg.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); @@ -4103,8 +4224,8 @@ static void nfs4_lock_done(struct rpc_ta dprintk("%s: begin!\n", __func__); - nfs4_sequence_done(data->server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; data->rpc_status = task->tk_status; if (RPC_ASSASSINATED(task)) @@ -4116,8 +4237,8 @@ static void nfs4_lock_done(struct rpc_ta goto out; } if (data->rpc_status == 0) { - memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, - sizeof(data->lsp->ls_stateid.data)); + memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data, + sizeof(data->lsp->ls_stateid.u.data)); data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); } @@ -4426,6 +4547,34 @@ out: return err; } +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, @@ -4528,7 +4677,7 @@ int nfs4_proc_exchange_id(struct nfs_cli nfs4_verifier verifier; struct nfs41_exchange_id_args args = { .client = clp, - .flags = clp->cl_exchange_flags, + .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R, }; struct nfs41_exchange_id_res res = { .client = clp, @@ -4576,6 +4725,7 @@ int nfs4_proc_exchange_id(struct nfs_cli dprintk("<-- %s status= %d\n", __func__, status); return status; } +EXPORT_SYMBOL(nfs4_proc_exchange_id); struct nfs4_get_lease_time_data { struct nfs4_get_lease_time_args *args; @@ -4613,7 +4763,8 @@ static void nfs4_get_lease_time_done(str (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; switch 
(task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: @@ -4807,13 +4958,6 @@ struct nfs4_session *nfs4_alloc_session( if (!session) return NULL; - /* - * The create session reply races with the server back - * channel probe. Mark the client NFS_CS_SESSION_INITING - * so that the client back channel can find the - * nfs_client struct - */ - clp->cl_cons_state = NFS_CS_SESSION_INITING; init_completion(&session->complete); tbl = &session->fc_slot_table; @@ -4826,6 +4970,8 @@ struct nfs4_session *nfs4_alloc_session( spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + session->session_state = 1 << NFS4_SESSION_INITING; + session->clp = clp; return session; } @@ -5042,6 +5188,10 @@ int nfs4_init_session(struct nfs_server if (!nfs4_has_session(clp)) return 0; + session = clp->cl_session; + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + rsize = server->rsize; if (rsize == 0) rsize = NFS_MAX_FILE_IO_SIZE; @@ -5049,11 +5199,10 @@ int nfs4_init_session(struct nfs_server if (wsize == 0) wsize = NFS_MAX_FILE_IO_SIZE; - session = clp->cl_session; session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; - ret = nfs4_recover_expired_lease(server); + ret = nfs4_recover_expired_lease(server->nfs_client); if (!ret) ret = nfs4_check_client_ready(clp); return ret; @@ -5062,69 +5211,70 @@ int nfs4_init_session(struct nfs_server /* * Renew the cl_session lease. */ -static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) -{ +struct nfs4_sequence_data { + struct nfs_client *clp; struct nfs4_sequence_args args; struct nfs4_sequence_res res; - - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], - .rpc_argp = &args, - .rpc_resp = &res, - .rpc_cred = cred, - }; - - args.sa_cache_this = 0; - - return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, - &res, args.sa_cache_this, 1); -} +}; static void nfs41_sequence_release(void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; if (atomic_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_state_recovery(clp); + } + return 0; } static void nfs41_sequence_call_done(struct rpc_task *task, void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; - nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (atomic_read(&clp->cl_count) == 1) goto out; - if (_nfs4_async_handle_error(task, NULL, clp, NULL) - == -EAGAIN) { - nfs_restart_rpc(task, clp); + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); return; } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); out: - kfree(task->tk_msg.rpc_argp); - kfree(task->tk_msg.rpc_resp); - dprintk("<-- %s\n", __func__); } static void nfs41_sequence_prepare(struct rpc_task *task, void *data) { - struct nfs_client *clp; + struct
nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; struct nfs4_sequence_args *args; struct nfs4_sequence_res *res; - clp = (struct nfs_client *)data; args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs4_setup_sequence(clp, args, res, 0, task)) + if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) return; rpc_call_start(task); } @@ -5135,32 +5285,67 @@ static const struct rpc_call_ops nfs41_s .rpc_release = nfs41_sequence_release, }; -static int nfs41_proc_async_sequence(struct nfs_client *clp, - struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) { - struct nfs4_sequence_args *args; - struct nfs4_sequence_res *res; + struct nfs4_sequence_data *calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + }; if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; - args = kzalloc(sizeof(*args), GFP_NOFS); - res = kzalloc(sizeof(*res), GFP_NOFS); - if (!args || !res) { - kfree(args); - kfree(res); + return ERR_PTR(-EIO); + calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { nfs_put_client(clp); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - res->sr_slotid = NFS4_MAX_SLOT_TABLE; - msg.rpc_argp = args; - msg.rpc_resp = res; + calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs41_sequence_ops, (void *)clp); + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret = 0; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; } struct nfs4_reclaim_complete_data { @@ -5174,13 +5359,31 @@ static void nfs4_reclaim_complete_prepar struct nfs4_reclaim_complete_data *calldata = data; rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, + if (nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, &calldata->res.seq_res, 0, task)) return; rpc_call_start(task); } +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? 
@@ -5174,13 +5359,31 @@ static void nfs4_reclaim_complete_prepar
 	struct nfs4_reclaim_complete_data *calldata = data;
 
 	rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
-	if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
+	if (nfs41_setup_sequence(calldata->clp->cl_session,
+				&calldata->arg.seq_args,
 				&calldata->res.seq_res, 0, task))
 		return;
 
 	rpc_call_start(task);
 }
 
+static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
+{
+	switch(task->tk_status) {
+	case 0:
+	case -NFS4ERR_COMPLETE_ALREADY:
+	case -NFS4ERR_WRONG_CRED: /* What to do here? */
+		break;
+	case -NFS4ERR_DELAY:
+	case -EKEYEXPIRED:
+		rpc_delay(task, NFS4_POLL_RETRY_MAX);
+		return -EAGAIN;
+	default:
+		nfs4_schedule_state_recovery(clp);
+	}
+	return 0;
+}
+
 static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
 {
 	struct nfs4_reclaim_complete_data *calldata = data;
@@ -5188,32 +5391,13 @@ static void nfs4_reclaim_complete_done(s
 	struct nfs4_sequence_res *res = &calldata->res.seq_res;
 
 	dprintk("--> %s\n", __func__);
-	nfs41_sequence_done(clp, res, task->tk_status);
-	switch (task->tk_status) {
-	case 0:
-	case -NFS4ERR_COMPLETE_ALREADY:
-		break;
-	case -NFS4ERR_BADSESSION:
-	case -NFS4ERR_DEADSESSION:
-		/*
-		 * Handle the session error, but do not retry the operation, as
-		 * we have no way of telling whether the clientid had to be
-		 * reset before we got our reply. If reset, a new wave of
-		 * reclaim operations will follow, containing their own reclaim
-		 * complete. We don't want our retry to get in the way of
-		 * recovery by incorrectly indicating to the server that we're
-		 * done reclaiming state since the process had to be restarted.
-		 */
-		_nfs4_async_handle_error(task, NULL, clp, NULL);
-		break;
-	default:
-		if (_nfs4_async_handle_error(
-				task, NULL, clp, NULL) == -EAGAIN) {
-			rpc_restart_call_prepare(task);
-			return;
-		}
-	}
+	if (!nfs41_sequence_done(task, res))
+		return;
 
+	if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
+		rpc_restart_call_prepare(task);
+		return;
+	}
 	dprintk("<-- %s\n", __func__);
 }
 
@@ -5270,6 +5454,404 @@ out:
 	dprintk("<-- %s status=%d\n", __func__, status);
 	return status;
 }
+
+static void
+nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *ino = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+	if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
+				&lgp->res.seq_res, 0, task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+	struct inode *ino = lgp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &lgp->res.seq_res))
+		return;
+
+	if (RPC_ASSASSINATED(task))
+		return;
+
+	pnfs_get_layout_done(lgp, task->tk_status);
+
+	if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
+		nfs_restart_rpc(task, server->nfs_client);
+
+	lgp->status = task->tk_status;
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutget_release(void *calldata)
+{
+	struct nfs4_layoutget *lgp = calldata;
+
+	dprintk("--> %s\n", __func__);
+	pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL);
+	if (lgp->res.layout.buf != NULL)
+		free_page((unsigned long) lgp->res.layout.buf);
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutget_call_ops = {
+	.rpc_call_prepare = nfs4_layoutget_prepare,
+	.rpc_call_done = nfs4_layoutget_done,
+	.rpc_release = nfs4_layoutget_release,
+};
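/* Editorial note, a sketch and not part of the patch: every completion
 * callback above follows the same idiom, where nfs41_sequence_done() /
 * nfs4_sequence_done() return false while the slot has not been processed
 * yet:
 *
 *	if (!nfs41_sequence_done(task, res))
 *		return;				// sequence not finished, bail
 *	if (handle_errors(task, clp) == -EAGAIN) {
 *		rpc_restart_call_prepare(task);	// requeue via ->rpc_call_prepare
 *		return;
 *	}
 *
 * The handle_errors() helpers queue an rpc_delay() for -NFS4ERR_DELAY and
 * -EKEYEXPIRED and report -EAGAIN; any other error kicks state recovery
 * and lets the task finish.
 */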
+ */ +static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET], + .rpc_argp = &lgp->args, + .rpc_resp = &lgp->res, + }; + struct rpc_task_setup task_setup_data = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs4_layoutget_call_ops, + .callback_data = lgp, + .flags = RPC_TASK_ASYNC, + }; + int status = 0; + + dprintk("--> %s\n", __func__); + + lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); + if (lgp->res.layout.buf == NULL) { + nfs4_layoutget_release(lgp); + return -ENOMEM; + } + + lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE; + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) + goto out; + status = lgp->status; + if (status != 0) + goto out; + status = pnfs_layout_process(lgp); +out: + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); + return status; +} + +int nfs4_proc_layoutget(struct nfs4_layoutget *lgp) +{ + struct nfs_server *server = NFS_SERVER(lgp->args.inode); + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(server, _nfs4_proc_layoutget(lgp), + &exception); + } while (exception.retry); + return err; +} + +static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data) +{ + struct nfs4_layoutcommit_data *ldata = + (struct nfs4_layoutcommit_data *)data; + struct nfs_server *server = NFS_SERVER(ldata->args.inode); + + if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args, + &ldata->res.seq_res, 1, task)) + return; + rpc_call_start(task); +} + +static void +nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_layoutcommit_data *data = + (struct nfs4_layoutcommit_data *)calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + if (RPC_ASSASSINATED(task)) + return; + + if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) + nfs_restart_rpc(task, server->nfs_client); + + data->status = task->tk_status; +} + +static void nfs4_layoutcommit_release(void *lcdata) +{ + struct nfs4_layoutcommit_data *data = + (struct nfs4_layoutcommit_data *)lcdata; + + put_rpccred(data->cred); + pnfs_cleanup_layoutcommit(lcdata); + pnfs_layoutcommit_free(lcdata); + /* Matched by get_layout in pnfs_layoutcommit_inode */ + put_layout(data->args.inode); +} + +static const struct rpc_call_ops nfs4_layoutcommit_ops = { + .rpc_call_prepare = nfs4_layoutcommit_prepare, + .rpc_call_done = nfs4_layoutcommit_done, + .rpc_release = nfs4_layoutcommit_release, +}; + +/* Execute a layoutcommit to the server */ +static int +_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = NFS_CLIENT(data->args.inode), + .rpc_message = &msg, + .callback_ops = &nfs4_layoutcommit_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + int status = 0; + + dprintk("NFS: %4d initiating layoutcommit call. 
+
+/* Execute a layoutcommit to the server */
+static int
+_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+		.rpc_cred = data->cred,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.task = &data->task,
+		.rpc_client = NFS_CLIENT(data->args.inode),
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutcommit_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+	int status = 0;
+
+	dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
+		"type: %d issync %d\n",
+		data->task.tk_pid,
+		data->args.range.length,
+		data->args.range.offset,
+		data->args.lastbytewritten,
+		data->args.layout_type, issync);
+
+	data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (!issync)
+		goto out;
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0)
+		goto out;
+	status = data->status;
+out:
+	dprintk("%s: status %d\n", __func__, status);
+	rpc_put_task(task);
+	return 0;
+}
+
+int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
+{
+	struct nfs4_exception exception = { };
+	struct nfs_server *server = NFS_SERVER(data->args.inode);
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+					    _nfs4_proc_layoutcommit(data, issync),
+					    &exception);
+	} while (exception.retry);
+	return err;
+}
+
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct inode *ino = lrp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+	if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
+				&lrp->res.seq_res, 0, task))
+		return;
+	rpc_call_start(task);
+}
+
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct inode *ino = lrp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+
+	dprintk("--> %s\n", __func__);
+
+	if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+		return;
+
+	if (RPC_ASSASSINATED(task))
+		return;
+
+	if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
+		nfs_restart_rpc(task, server->nfs_client);
+
+	dprintk("<-- %s\n", __func__);
+}
+
+static void nfs4_layoutreturn_release(void *calldata)
+{
+	struct nfs4_layoutreturn *lrp = calldata;
+	struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+
+	dprintk("--> %s return_type %d lo %p\n", __func__,
+		lrp->args.return_type, lo);
+
+	if (lrp->args.return_type == RETURN_FILE) {
+		if (!lrp->res.lrs_present)
+			pnfs_set_layout_stateid(lo, &zero_stateid);
+		pnfs_layout_release(lo, &lrp->args.range);
+	}
+	kfree(calldata);
+	dprintk("<-- %s\n", __func__);
+}
+
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+	.rpc_call_prepare = nfs4_layoutreturn_prepare,
+	.rpc_call_done = nfs4_layoutreturn_done,
+	.rpc_release = nfs4_layoutreturn_release,
+};
+
+int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
+{
+	struct inode *ino = lrp->args.inode;
+	struct nfs_server *server = NFS_SERVER(ino);
+	struct rpc_task *task;
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+		.rpc_argp = &lrp->args,
+		.rpc_resp = &lrp->res,
+	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs4_layoutreturn_call_ops,
+		.callback_data = lrp,
+		.flags = RPC_TASK_ASYNC,
+	};
+	int status = 0;
+
+	dprintk("--> %s\n", __func__);
+	lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	if (!issync)
+		goto out;
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status != 0)
+		goto out;
+	status = task->tk_status;
+out:
+	dprintk("<-- %s\n", __func__);
+	rpc_put_task(task);
+	return status;
+}
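/* Editorial note, a sketch and not part of the patch: LAYOUTCOMMIT and
 * LAYOUTRETURN share the issync convention used above:
 *
 *	task = rpc_run_task(&task_setup_data);
 *	if (IS_ERR(task))
 *		return PTR_ERR(task);
 *	if (!issync)
 *		goto out;	// fire and forget; release callback cleans up
 *	status = nfs4_wait_for_completion_rpc_task(task);
 *	...
 *	rpc_put_task(task);
 *
 * Cleanup always happens in ->rpc_release (which for layoutreturn also
 * resets the layout stateid to zero_stateid when the server reports
 * lrs_present == 0), so the async path only drops its task reference.
 */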
+
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
+{
+	struct nfs_server *server = NFS_SERVER(lrp->args.inode);
+	struct nfs4_exception exception = { };
+	int err;
+	do {
+		err = nfs4_handle_exception(server,
+					    _nfs4_proc_layoutreturn(lrp, issync),
+					    &exception);
+	} while (exception.retry);
+
+	return err;
+}
+
+/*
+ * Retrieve the list of Data Server devices from the MDS.
+ */
+static int _nfs4_getdevicelist(struct nfs_server *server,
+			       const struct nfs_fh *fh,
+			       struct pnfs_devicelist *devlist)
+{
+	struct nfs4_getdevicelist_args args = {
+		.fh = fh,
+		.layoutclass = server->pnfs_curr_ld->id,
+	};
+	struct nfs4_getdevicelist_res res = {
+		.devlist = devlist,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
+	dprintk("<-- %s status=%d\n", __func__, status);
+	return status;
+}
+
+int nfs4_proc_getdevicelist(struct nfs_server *server,
+			    const struct nfs_fh *fh,
+			    struct pnfs_devicelist *devlist)
+{
+	struct nfs4_exception exception = { };
+	int err;
+
+	do {
+		err = nfs4_handle_exception(server,
+					    _nfs4_getdevicelist(server, fh, devlist),
+					    &exception);
+	} while (exception.retry);
+
+	dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n",
+		err, devlist->num_devs);
+
+	return err;
+}
+
+int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
+{
+	struct nfs4_getdeviceinfo_args args = {
+		.pdev = pdev,
+	};
+	struct nfs4_getdeviceinfo_res res = {
+		.pdev = pdev,
+	};
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
+		.rpc_argp = &args,
+		.rpc_resp = &res,
+	};
+	int status;
+
+	dprintk("--> %s\n", __func__);
+	status = nfs4_call_sync(server, &msg, &args, &res, 0);
+	dprintk("<-- %s status=%d\n", __func__, status);
+
+	return status;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5327,28 +5909,30 @@ struct nfs4_state_maintenance_ops nfs41_
 };
 #endif
 
-/*
- * Per minor version reboot and network partition recovery ops
- */
-
-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
-	&nfs40_reboot_recovery_ops,
-#if defined(CONFIG_NFS_V4_1)
-	&nfs41_reboot_recovery_ops,
-#endif
+static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
+	.minor_version = 0,
+	.call_sync = _nfs4_call_sync,
+	.validate_stateid = nfs4_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
+	.state_renewal_ops = &nfs40_state_renewal_ops,
 };
 
-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
-	&nfs40_nograce_recovery_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_nograce_recovery_ops,
-#endif
+static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
+	.minor_version = 1,
+	.call_sync = _nfs4_call_sync_session,
+	.validate_stateid = nfs41_validate_delegation_stateid,
+	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
+	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
+	.state_renewal_ops = &nfs41_state_renewal_ops,
 };
+#endif
 
-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
-	&nfs40_state_renewal_ops,
+const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
+	[0] = &nfs_v4_0_minor_ops,
 #if defined(CONFIG_NFS_V4_1)
-	&nfs41_state_renewal_ops,
+	[1] = &nfs_v4_1_minor_ops,
 #endif
 };
 
@@ -5366,6 +5950,7 @@ const struct nfs_rpc_ops nfs_v4_clientop
 	.dentry_ops	= &nfs4_dentry_operations,
 	.dir_inode_ops	= &nfs4_dir_inode_operations,
 	.file_inode_ops	= &nfs4_file_inode_operations,
+	.file_ops	= 
&nfs_file_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.34.noarch/fs/nfs/nfs4renewd.c --- linux-2.6.34.noarch/fs/nfs/nfs4renewd.c.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4renewd.c 2010-09-30 10:17:08.679993000 -0400 @@ -54,17 +54,17 @@ void nfs4_renew_state(struct work_struct *work) { - struct nfs4_state_maintenance_ops *ops; + const struct nfs4_state_maintenance_ops *ops; struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; long lease; unsigned long last, now; - ops = nfs4_state_renewal_ops[clp->cl_minorversion]; + ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); /* Are there any active superblocks? */ - if (list_empty(&clp->cl_superblocks)) + if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp)) goto out; spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff -up linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig linux-2.6.34.noarch/fs/nfs/nfs4state.c --- linux-2.6.34.noarch/fs/nfs/nfs4state.c.orig 2010-09-30 10:15:17.863715000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4state.c 2010-09-30 10:17:08.685993000 -0400 @@ -48,11 +48,13 @@ #include #include #include +#include #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "internal.h" +#include "pnfs.h" #define OPENOWNER_POOL_SIZE 8 @@ -126,6 +128,11 @@ static int nfs41_setup_state_renewal(str int status; struct nfs_fsinfo fsinfo; + if (is_ds_only_client(clp)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { /* Update lease time and schedule renewal */ @@ -145,7 +152,9 @@ static void nfs4_end_drain_session(struc struct nfs4_session *ses = clp->cl_session; int max_slots; - if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + if (ses == NULL) + return; + if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&ses->fc_slot_table.slot_tbl_lock); max_slots = ses->fc_slot_table.max_slots; while (max_slots--) { @@ -167,7 +176,7 @@ static int nfs4_begin_drain_session(stru struct nfs4_slot_table *tbl = &ses->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { INIT_COMPLETION(ses->complete); spin_unlock(&tbl->slot_tbl_lock); @@ -371,7 +380,6 @@ nfs4_alloc_state_owner(void) return NULL; spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); - INIT_LIST_HEAD(&sp->so_delegations); rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); sp->so_seqid.sequence = &sp->so_sequence; spin_lock_init(&sp->so_sequence.lock); @@ -384,7 +392,7 @@ static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_server->nfs_client; spin_lock(&clp->cl_lock); rb_erase(&sp->so_client_node, &clp->cl_state_owners); @@ -406,7 +414,6 @@ struct nfs4_state_owner *nfs4_get_state_ new = nfs4_alloc_state_owner(); if (new == NULL) return NULL; - new->so_client = clp; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -423,7 +430,7 @@ struct nfs4_state_owner *nfs4_get_state_ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = 
sp->so_server->nfs_client; struct rpc_cred *cred = sp->so_cred; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) @@ -583,8 +590,24 @@ static void __nfs4_close(struct path *pa if (!call_close) { nfs4_put_open_state(state); nfs4_put_state_owner(owner); - } else + } else { + u32 roc_iomode; + struct nfs_inode *nfsi = NFS_I(state->inode); + + if (has_layout(nfsi) && + (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) { + struct pnfs_layout_range range = { + .iomode = roc_iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + pnfs_return_layout(state->inode, &range, NULL, + RETURN_FILE, wait); + } + nfs4_do_close(path, state, gfp_mask, wait); + } } void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode) @@ -602,12 +625,21 @@ void nfs4_close_sync(struct path *path, * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != fl_pid) + continue; + } atomic_inc(&pos->ls_count); return pos; } @@ -619,10 +651,10 @@ __nfs4_find_lock_state(struct nfs4_state * exists, return an uninitialized one. * */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_client; + struct nfs_client *clp = state->owner->so_server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -633,7 +665,18 @@ static struct nfs4_lock_state *nfs4_allo lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + kfree(lsp); + return NULL; + } spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); @@ -643,7 +686,7 @@ static struct nfs4_lock_state *nfs4_allo static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_client; + struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; spin_lock(&clp->cl_lock); nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); @@ -657,13 +700,13 @@ static void nfs4_free_lock_state(struct * exists, return an uninitialized one. 
* */ -static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) { struct nfs4_lock_state *lsp, *new = NULL; for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, pid, type); if (lsp != NULL) break; if (new != NULL) { @@ -674,7 +717,7 @@ static struct nfs4_lock_state *nfs4_get_ break; } spin_unlock(&state->state_lock); - new = nfs4_alloc_lock_state(state, owner); + new = nfs4_alloc_lock_state(state, owner, pid, type); if (new == NULL) return NULL; } @@ -701,6 +744,8 @@ void nfs4_put_lock_state(struct nfs4_loc if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); nfs4_free_lock_state(lsp); } @@ -728,7 +773,12 @@ int nfs4_set_lock_state(struct nfs4_stat if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -740,7 +790,7 @@ int nfs4_set_lock_state(struct nfs4_stat * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) { struct nfs4_lock_state *lsp; int seq; @@ -753,7 +803,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst return; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); spin_unlock(&state->state_lock); @@ -1031,8 +1081,8 @@ restart: * Open state on this file cannot be recovered * All we can do is revert to using the zero stateid. 
*/ - memset(state->stateid.data, 0, - sizeof(state->stateid.data)); + memset(state->stateid.u.data, 0, + sizeof(state->stateid.u.data)); /* Mark the file as being 'closed' */ state->state = 0; break; @@ -1041,11 +1091,11 @@ restart: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); break; case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -1120,8 +1170,7 @@ static void nfs4_state_end_reclaim_reboo if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) return; - nfs4_reclaim_complete(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1211,8 +1260,8 @@ restart: static int nfs4_check_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_maintenance_ops *ops = - nfs4_state_renewal_ops[clp->cl_minorversion]; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; int status = -NFS4ERR_EXPIRED; /* Is the client already known to have an expired lease? */ @@ -1235,8 +1284,8 @@ out: static int nfs4_reclaim_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_recovery_ops *ops = - nfs4_reboot_recovery_ops[clp->cl_minorversion]; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; int status = -ENOENT; cred = ops->get_clid_cred(clp); @@ -1421,6 +1470,7 @@ static void nfs4_state_manager(struct nf } clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state); set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); + pnfs_destroy_all_layouts(clp); } if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { @@ -1444,7 +1494,7 @@ static void nfs4_state_manager(struct nf /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->reboot_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) continue; @@ -1458,7 +1508,7 @@ static void nfs4_state_manager(struct nf /* Now recover expired state... 
*/ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_nograce_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->nograce_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff -up linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.34.noarch/fs/nfs/nfs4xdr.c --- linux-2.6.34.noarch/fs/nfs/nfs4xdr.c.orig 2010-09-30 10:15:17.872720000 -0400 +++ linux-2.6.34.noarch/fs/nfs/nfs4xdr.c 2010-09-30 10:17:08.709998000 -0400 @@ -50,8 +50,10 @@ #include #include #include +#include #include "nfs4_fs.h" #include "internal.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -89,7 +91,7 @@ static int nfs4_stat_to_errno(int); #define encode_getfh_maxsz (op_encode_hdr_maxsz) #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ ((3+NFS4_FHSIZE) >> 2)) -#define nfs4_fattr_bitmap_maxsz 3 +#define nfs4_fattr_bitmap_maxsz 4 #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) @@ -111,7 +113,11 @@ static int nfs4_stat_to_errno(int); #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (encode_getattr_maxsz) -#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) +/* The 5 accounts for the PNFS attributes, and assumes that at most three + * layout types will be returned. + */ +#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 8 + 5) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ @@ -202,14 +208,17 @@ static int nfs4_stat_to_errno(int); #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ - 1 + encode_stateid_maxsz + 8) + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ (8 + decode_lockowner_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) -#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) +#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ + encode_lockowner_maxsz) #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ @@ -217,6 +226,11 @@ static int nfs4_stat_to_errno(int); 4) #define decode_locku_maxsz (op_decode_hdr_maxsz + \ decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) #define encode_access_maxsz (op_encode_hdr_maxsz + 1) #define decode_access_maxsz (op_decode_hdr_maxsz + 2) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ @@ -302,6 +316,35 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) +#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ + encode_verifier_maxsz) +#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \ + decode_verifier_maxsz + \ + XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ + 
NFS4_PNFS_DEVICEID4_SIZE)) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ + XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE)) +#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ + 4 /*layout type */ + \ + 4 /* opaque devaddr4 length */ +\ + 4 /* notification bitmap length */ + \ + 4 /* notification bitmap */) +#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ + encode_stateid_maxsz) +#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ + decode_stateid_maxsz + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) +#define encode_layoutcommit_maxsz (18 + \ + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \ + op_encode_hdr_maxsz + \ + encode_stateid_maxsz) +#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz) +#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 1 /* FIXME: opaque lrf_body always empty at + *the moment */) +#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \ + 1 + decode_stateid_maxsz) #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 @@ -471,6 +514,12 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -685,6 +734,60 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) +#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_getdevicelist_maxsz) +#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_getdevicelist_maxsz) +#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_getdeviceinfo_maxsz) +#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_getdeviceinfo_maxsz) +#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutget_maxsz) +#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutget_maxsz) +#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_layoutcommit_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutcommit_maxsz + \ + decode_getattr_maxsz) +#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_layoutreturn_maxsz) +#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz) +#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz +\ + encode_putfh_maxsz + \ + encode_write_maxsz) +#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_write_maxsz) +#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_commit_maxsz) +#define 
NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_commit_maxsz) const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + compound_encode_hdr_maxsz + @@ -915,7 +1018,7 @@ static void encode_close(struct xdr_stre p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_CLOSE); *p++ = cpu_to_be32(arg->seqid->sequence->counter); - xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -989,6 +1092,35 @@ static void encode_getattr_two(struct xd hdr->replen += decode_getattr_maxsz; } +static void +encode_getattr_three(struct xdr_stream *xdr, + uint32_t bm0, uint32_t bm1, uint32_t bm2, + struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETATTR); + if (bm2) { + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(3); + *p++ = cpu_to_be32(bm0); + *p++ = cpu_to_be32(bm1); + *p = cpu_to_be32(bm2); + } else if (bm1) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); + } else { + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bm0); + } + hdr->nops++; + hdr->replen += decode_getattr_maxsz; +} + static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], @@ -997,8 +1129,11 @@ static void encode_getfattr(struct xdr_s static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) { - encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], - bitmask[1] & nfs4_fsinfo_bitmap[1], hdr); + encode_getattr_three(xdr, + bitmask[0] & nfs4_fsinfo_bitmap[0], + bitmask[1] & nfs4_fsinfo_bitmap[1], + bitmask[2] & nfs4_fsinfo_bitmap[2], + hdr); } static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr) @@ -1042,6 +1177,17 @@ static inline uint64_t nfs4_lock_length( return fl->fl_end - fl->fl_start + 1; } +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, lowner->id); +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 @@ -1058,18 +1204,16 @@ static void encode_lock(struct xdr_strea p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); - p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data, + NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); } else { p = reserve_space(xdr, NFS4_STATEID_SIZE+4); - p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE); *p = 
cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; @@ -1080,15 +1224,12 @@ static void encode_lockt(struct xdr_stre { __be32 *p; - p = reserve_space(xdr, 52); + p = reserve_space(xdr, 24); *p++ = cpu_to_be32(OP_LOCKT); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1101,13 +1242,25 @@ static void encode_locku(struct xdr_stre *p++ = cpu_to_be32(OP_LOCKU); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); *p++ = cpu_to_be32(args->seqid->sequence->counter); - p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, args->stateid->u.data, + NFS4_STATEID_SIZE); p = xdr_encode_hyper(p, args->fl->fl_start); xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { int len = name->len; @@ -1172,7 +1325,7 @@ static inline void encode_createmode(str break; default: clp = arg->server->nfs_client; - if (clp->cl_minorversion > 0) { + if (clp->cl_mvops->minor_version > 0) { if (nfs4_has_persistent_session(clp)) { *p = cpu_to_be32(NFS4_CREATE_GUARDED); encode_attrs(xdr, arg->u.attrs, arg->server); @@ -1251,7 +1404,7 @@ static inline void encode_claim_delegate p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1282,7 +1435,7 @@ static void encode_open_confirm(struct x p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_CONFIRM); - p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; @@ -1294,7 +1447,7 @@ static void encode_open_downgrade(struct p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); - p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE); *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; @@ -1324,17 +1477,17 @@ static void encode_putrootfh(struct xdr_ hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) { nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - xdr_encode_opaque_fixed(p, stateid.data, 
NFS4_STATEID_SIZE); + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE); } else - xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) @@ -1344,7 +1497,7 @@ static void encode_read(struct xdr_strea p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1448,7 +1601,7 @@ encode_setacl(struct xdr_stream *xdr, st p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE); p = reserve_space(xdr, 2*4); *p++ = cpu_to_be32(1); *p = cpu_to_be32(FATTR4_WORD0_ACL); @@ -1479,7 +1632,7 @@ static void encode_setattr(struct xdr_st p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_SETATTR); - xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1523,7 +1676,7 @@ static void encode_write(struct xdr_stre p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1542,7 +1695,7 @@ static void encode_delegreturn(struct xd p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); *p++ = cpu_to_be32(OP_DELEGRETURN); - xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1696,6 +1849,162 @@ static void encode_sequence(struct xdr_s #endif /* CONFIG_NFS_V4_1 */ } +#ifdef CONFIG_NFS_V4_1 +static void +encode_getdevicelist(struct xdr_stream *xdr, + const struct nfs4_getdevicelist_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + nfs4_verifier dummy = { + .data = "dummmmmy", + }; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_GETDEVICELIST); + *p++ = cpu_to_be32(args->layoutclass); + *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); + xdr_encode_hyper(p, 0ULL); /* cookie */ + encode_nfs4_verifier(xdr, &dummy); + hdr->nops++; +} + +static void +encode_getdeviceinfo(struct xdr_stream *xdr, + const struct nfs4_getdeviceinfo_args *args, + struct compound_hdr *hdr) +{ + int has_bitmap = (args->pdev->dev_notify_types != 0); + int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4); + __be32 *p; + + p = reserve_space(xdr, len); + *p++ = cpu_to_be32(OP_GETDEVICEINFO); + p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, + NFS4_PNFS_DEVICEID4_SIZE); + *p++ = cpu_to_be32(args->pdev->layout_type); + *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ + *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ + if (has_bitmap) + *p = cpu_to_be32(args->pdev->dev_notify_types); + hdr->nops++; +} + +static void +encode_layoutget(struct xdr_stream *xdr, + const struct nfs4_layoutget_args *args, + struct compound_hdr *hdr) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); + *p++ = 
cpu_to_be32(OP_LAYOUTGET); + *p++ = cpu_to_be32(0); /* Signal layout available */ + *p++ = cpu_to_be32(args->type); + *p++ = cpu_to_be32(args->range.iomode); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + p = xdr_encode_hyper(p, args->minlength); + pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); + p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->maxcount); + + dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", + __func__, + args->type, + args->range.iomode, + (unsigned long)args->range.offset, + (unsigned long)args->range.length, + args->maxcount); + hdr->nops++; + hdr->replen += decode_layoutget_maxsz; +} + +static int +encode_layoutcommit(struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args, + struct compound_hdr *hdr) +{ + struct layoutdriver_io_operations *ld_io_ops = + NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; + __be32 *p; + + dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, + args->range.length, args->range.offset, args->lastbytewritten, + args->layout_type); + + p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + *p++ = cpu_to_be32(0); /* reclaim */ + p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ + p = xdr_encode_hyper(p, args->lastbytewritten); + *p = cpu_to_be32(args->time_modify_changed != 0); + if (args->time_modify_changed) { + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(args->time_modify.tv_sec); + *p = cpu_to_be32(args->time_modify.tv_nsec); + } + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(args->layout_type); + + if (ld_io_ops->encode_layoutcommit) { + ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, + xdr, args); + } else { + p = reserve_space(xdr, 4); + xdr_encode_opaque(p, NULL, 0); + } + + hdr->nops++; + hdr->replen += decode_layoutcommit_maxsz; + return 0; +} + +static void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ + nfs4_stateid stateid; + __be32 *p; + + p = reserve_space(xdr, 20); + *p++ = cpu_to_be32(OP_LAYOUTRETURN); + *p++ = cpu_to_be32(args->reclaim); + *p++ = cpu_to_be32(args->layout_type); + *p++ = cpu_to_be32(args->range.iomode); + *p = cpu_to_be32(args->return_type); + if (args->return_type == RETURN_FILE) { + struct layoutdriver_io_operations *ld_io_ops = + NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; + + p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); + pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); + p = xdr_encode_opaque_fixed(p, &stateid.u.data, + NFS4_STATEID_SIZE); + dprintk("%s: call %pF\n", __func__, + ld_io_ops->encode_layoutreturn); + if (ld_io_ops->encode_layoutreturn) { + ld_io_ops->encode_layoutreturn( + NFS_I(args->inode)->layout, xdr, args); + } else { + p = reserve_space(xdr, 4); + *p = cpu_to_be32(0); + } + } + hdr->nops++; + hdr->replen += decode_layoutreturn_maxsz; +} +#endif /* CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" ENCODE ROUTINES. 
*/ @@ -1704,7 +2013,7 @@ static u32 nfs4_xdr_minorversion(const s { #if defined(CONFIG_NFS_V4_1) if (args->sa_session) - return args->sa_session->clp->cl_minorversion; + return args->sa_session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -2048,6 +2357,20 @@ static int nfs4_xdr_enc_locku(struct rpc return 0; } +static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); + return 0; +} + /* * Encode a READLINK request */ @@ -2330,7 +2653,7 @@ static int nfs4_xdr_enc_setclientid_conf struct compound_hdr hdr = { .nops = 0, }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, req, &hdr); @@ -2395,7 +2718,7 @@ static int nfs4_xdr_enc_exchange_id(stru { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2413,7 +2736,7 @@ static int nfs4_xdr_enc_create_session(s { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2431,7 +2754,7 @@ static int nfs4_xdr_enc_destroy_session( { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = session->clp->cl_minorversion, + .minorversion = session->clp->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2469,7 +2792,7 @@ static int nfs4_xdr_enc_get_lease_time(s struct compound_hdr hdr = { .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), }; - const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; + const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, req, &hdr); @@ -2499,6 +2822,159 @@ static int nfs4_xdr_enc_reclaim_complete return 0; } +/* + * Encode GETDEVICELIST request + */ +static int +nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, + struct nfs4_getdevicelist_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_getdevicelist(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode GETDEVICEINFO request + */ +static int nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, + struct nfs4_getdeviceinfo_args *args) +{ + struct xdr_stream xdr; + struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + int replen; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_getdeviceinfo(&xdr, args, &hdr); + + /* set up reply kvec. 
Subtract notification bitmap max size (8) + * so that notification bitmap is put in xdr_buf tail */ + replen = (RPC_REPHDRSIZE + auth->au_rslack + + NFS4_dec_getdeviceinfo_sz - 8) << 2; + xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, + args->pdev->pgbase, args->pdev->pglen); + dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", + __func__, replen, args->pdev->pages, + args->pdev->pgbase, args->pdev->pglen); + + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTGET request + */ +static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutget_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, NFS_FH(args->inode), &hdr); + encode_layoutget(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTCOMMIT request + */ +static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutcommit_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_layoutcommit(&xdr, args, &hdr); + encode_getfattr(&xdr, args->bitmask, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode LAYOUTRETURN request + */ +static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, + struct nfs4_layoutreturn_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, NFS_FH(args->inode), &hdr); + encode_layoutreturn(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode a pNFS File Layout Data Server WRITE request + */ +static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, + struct nfs_writeargs *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_write(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} + +/* + * Encode a pNFS File Layout Data Server COMMIT request + */ +static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, + struct nfs_writeargs *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_sequence(&xdr, &args->seq_args, &hdr); + encode_putfh(&xdr, args->fh, &hdr); + encode_commit(&xdr, args, &hdr); + encode_nops(&hdr); + return 0; +} #endif /* CONFIG_NFS_V4_1 */ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -2599,14 +3075,17 @@ static int decode_attr_bitmap(struct xdr goto out_overflow; bmlen = be32_to_cpup(p); - bitmap[0] = bitmap[1] = 0; + bitmap[0] = bitmap[1] = bitmap[2] = 0; p = xdr_inline_decode(xdr, (bmlen << 2)); if (unlikely(!p)) goto out_overflow; if (bmlen > 0) { 
bitmap[0] = be32_to_cpup(p++); - if (bmlen > 1) - bitmap[1] = be32_to_cpup(p); + if (bmlen > 1) { + bitmap[1] = be32_to_cpup(p++); + if (bmlen > 2) + bitmap[2] = be32_to_cpup(p); + } } return 0; out_overflow: @@ -2635,8 +3114,9 @@ static int decode_attr_supported(struct decode_attr_bitmap(xdr, bitmask); bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else - bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); + bitmask[0] = bitmask[1] = bitmask[2] = 0; + dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, + bitmask[0], bitmask[1], bitmask[2]); return 0; } @@ -3565,7 +4045,7 @@ static int decode_opaque_fixed(struct xd static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { - return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); + return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); } static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) @@ -3621,7 +4101,7 @@ out_overflow: static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3647,7 +4127,7 @@ xdr_error: static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3679,7 +4159,7 @@ xdr_error: static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) { __be32 *savep; - uint32_t attrlen, bitmap[2] = {0}; + uint32_t attrlen, bitmap[3] = {0}; int status; if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) @@ -3705,7 +4185,7 @@ static int decode_getfattr(struct xdr_st { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}, + bitmap[3] = {0}, type; int status; umode_t fmode = 0; @@ -3824,24 +4304,101 @@ xdr_error: return status; } - -static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) +#if defined(CONFIG_NFS_V4_1) +/* + * Decode potentially multiple layout types. Currently we only support + * one layout driver per file system. + */ +static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass) { - __be32 *savep; - uint32_t attrlen, bitmap[2]; - int status; + uint32_t *p; + int num; - if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) - goto xdr_error; - if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) - goto xdr_error; - if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) - goto xdr_error; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + num = be32_to_cpup(p); - fsinfo->rtmult = fsinfo->wtmult = 512; /* ??? */ + /* pNFS is not supported by the underlying file system */ + if (num == 0) { + *layoutclass = 0; + return 0; + } - if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) - goto xdr_error; + /* TODO: We will eventually support multiple layout drivers ? 
*/
+	if (num > 1)
+		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
+			"per filesystem not supported\n", __func__);
+
+	/* Decode and set first layout type */
+	p = xdr_inline_decode(xdr, num * 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	*layoutclass = be32_to_cpup(p);
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+/*
+ * The pNFS layout type exported by the file system
+ */
+static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
+				uint32_t *layoutclass)
+{
+	int status = 0;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
+	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
+		return -EIO;
+	if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) {
+		status = decode_pnfs_list(xdr, layoutclass);
+		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
+	}
+	return status;
+}
+
+/*
+ * The preferred block size for layout-directed I/O
+ */
+static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+				      uint32_t *res)
+{
+	__be32 *p;
+
+	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+	*res = 0;
+	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p)) {
+			print_overflow_msg(__func__, xdr);
+			return -EIO;
+		}
+		*res = be32_to_cpup(p);
+		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
+	}
+	return 0;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
+{
+	__be32 *savep;
+	uint32_t attrlen, bitmap[3];
+	int status;
+
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto xdr_error;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto xdr_error;
+
+	fsinfo->rtmult = fsinfo->wtmult = 512;	/* ??? 
*/ + + if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) + goto xdr_error; if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) goto xdr_error; if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) @@ -3850,6 +4407,14 @@ static int decode_fsinfo(struct xdr_stre if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) goto xdr_error; fsinfo->wtpref = fsinfo->wtmax; +#if defined(CONFIG_NFS_V4_1) + status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + if (status) + goto xdr_error; + status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); + if (status) + goto xdr_error; +#endif /* CONFIG_NFS_V4_1 */ status = verify_attr_len(xdr, savep, attrlen); xdr_error: @@ -3973,6 +4538,11 @@ static int decode_locku(struct xdr_strea return status; } +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + static int decode_lookup(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LOOKUP); @@ -4333,7 +4903,7 @@ static int decode_getacl(struct xdr_stre { __be32 *savep; uint32_t attrlen, - bitmap[2] = {0}; + bitmap[3] = {0}; struct kvec *iov = req->rq_rcv_buf.head; int status; @@ -4682,6 +5252,226 @@ out_overflow: #endif /* CONFIG_NFS_V4_1 */ } +#if defined(CONFIG_NFS_V4_1) +/* + * TODO: Need to handle case when EOF != true; + */ +static int decode_getdevicelist(struct xdr_stream *xdr, + struct pnfs_devicelist *res) +{ + __be32 *p; + int status, i; + struct nfs_writeverf verftemp; + + status = decode_op_hdr(xdr, OP_GETDEVICELIST); + if (status) + return status; + + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + /* TODO: Skip cookie for now */ + p += 2; + + /* Read verifier */ + p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); + + res->num_devs = be32_to_cpup(p); + + dprintk("%s: num_dev %d\n", __func__, res->num_devs); + + if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) + return -NFS4ERR_REP_TOO_BIG; + + p = xdr_inline_decode(xdr, + res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); + if (unlikely(!p)) + goto out_overflow; + for (i = 0; i < res->num_devs; i++) + p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, + NFS4_PNFS_DEVICEID4_SIZE); + res->eof = be32_to_cpup(p); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_getdeviceinfo(struct xdr_stream *xdr, + struct pnfs_device *pdev) +{ + __be32 *p; + uint32_t len, type; + int status; + + status = decode_op_hdr(xdr, OP_GETDEVICEINFO); + if (status) { + if (status == -ETOOSMALL) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + pdev->mincount = be32_to_cpup(p); + dprintk("%s: Min count too small. mincnt = %u\n", + __func__, pdev->mincount); + } + return status; + } + + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + type = be32_to_cpup(p++); + if (type != pdev->layout_type) { + dprintk("%s: layout mismatch req: %u pdev: %u\n", + __func__, pdev->layout_type, type); + return -EINVAL; + } + /* + * Get the length of the opaque device_addr4. 
xdr_read_pages places
+	 * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
+	 * and places the remaining xdr data in xdr_buf->tail
+	 */
+	pdev->mincount = be32_to_cpup(p);
+	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
+
+	/* At most one bitmap word */
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	len = be32_to_cpup(p);
+	if (len) {
+		p = xdr_inline_decode(xdr, 4);
+		if (unlikely(!p))
+			goto out_overflow;
+		pdev->dev_notify_types = be32_to_cpup(p);
+	} else
+		pdev->dev_notify_types = 0;
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
+			    struct nfs4_layoutget_res *res)
+{
+	__be32 *p;
+	int status;
+	u32 layout_count, dummy;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTGET);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->return_on_close = be32_to_cpup(p++);
+	p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE);
+	layout_count = be32_to_cpup(p);
+	if (!layout_count) {
+		dprintk("%s: server responded with empty layout array\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	p = xdr_inline_decode(xdr, 24);
+	if (unlikely(!p))
+		goto out_overflow;
+	p = xdr_decode_hyper(p, &res->range.offset);
+	p = xdr_decode_hyper(p, &res->range.length);
+	res->range.iomode = be32_to_cpup(p++);
+	res->type = be32_to_cpup(p++);
+
+	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
+	if (unlikely(status))
+		return status;
+
+	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
+		__func__,
+		(unsigned long)res->range.offset,
+		(unsigned long)res->range.length,
+		res->range.iomode,
+		res->type,
+		res->layout.len);
+
+	/* presumably, nfs4_proc_layoutget allocated a single page */
+	if (res->layout.len > PAGE_SIZE)
+		return -ENOMEM;
+	memcpy(res->layout.buf, p, res->layout.len);
+
+	/* FIXME: the whole layout array should be passed up to the pnfs
+	 * client */
+	if (layout_count > 1) {
+		dprintk("%s: server responded with %d layouts, dropping tail\n",
+			__func__, layout_count);
+
+		while (--layout_count) {
+			p = xdr_inline_decode(xdr, 24);
+			if (unlikely(!p))
+				goto out_overflow;
+			status = decode_opaque_inline(xdr, &dummy, (char **)&p);
+			if (unlikely(status))
+				return status;
+		}
+	}
+
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutreturn(struct xdr_stream *xdr,
+			       struct nfs4_layoutreturn_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+	if (status)
+		return status;
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->lrs_present = be32_to_cpup(p);
+	if (res->lrs_present)
+		status = decode_stateid(xdr, &res->stateid);
+	return status;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_layoutcommit(struct xdr_stream *xdr,
+			       struct rpc_rqst *req,
+			       struct nfs4_layoutcommit_res *res)
+{
+	__be32 *p;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	res->sizechanged = be32_to_cpup(p);
+
+	if (res->sizechanged) {
+		p = xdr_inline_decode(xdr, 8);
+		if (unlikely(!p))
+			goto out_overflow;
+		xdr_decode_hyper(p, &res->newsize);
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+#endif /*
CONFIG_NFS_V4_1 */ + /* * END OF "GENERIC" DECODE ROUTINES. */ @@ -5259,6 +6049,19 @@ out: return status; } +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_release_lockowner(&xdr); + return status; +} + /* * Decode READLINK response */ @@ -5696,6 +6499,186 @@ static int nfs4_xdr_dec_reclaim_complete status = decode_reclaim_complete(&xdr, (void *)NULL); return status; } + +/* + * Decode GETDEVICELIST response + */ +static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_getdevicelist_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + dprintk("encoding getdevicelist!\n"); + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_putfh(&xdr); + if (status != 0) + goto out; + status = decode_getdevicelist(&xdr, res->devlist); +out: + return status; +} + +/* + * Decode GETDEVINFO response + */ +static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_getdeviceinfo_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status != 0) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status != 0) + goto out; + status = decode_getdeviceinfo(&xdr, res->pdev); +out: + return status; +} + +/* + * Decode LAYOUTGET response + */ +static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutget_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutget(&xdr, rqstp, res); +out: + return status; +} + +/* + * Decode LAYOUTRETURN response + */ +static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutreturn_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutreturn(&xdr, res); +out: + return status; +} + +/* + * Decode LAYOUTCOMMIT response + */ +static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs4_layoutcommit_res *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_layoutcommit(&xdr, rqstp, res); + if (status) + goto out; + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); +out: + return status; +} + +/* + * Decode pNFS File Layout 
Data Server WRITE response + */ +static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs_writeres *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_write(&xdr, res); + if (!status) + return res->count; +out: + return status; +} + +/* + * Decode pNFS File Layout Data Server COMMIT response + */ +static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, + struct nfs_writeres *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_sequence(&xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_commit(&xdr, res); +out: + return status; +} #endif /* CONFIG_NFS_V4_1 */ __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) @@ -5866,6 +6849,7 @@ struct rpc_procinfo nfs4_procedures[] = PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), @@ -5873,6 +6857,13 @@ struct rpc_procinfo nfs4_procedures[] = PROC(SEQUENCE, enc_sequence, dec_sequence), PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), + PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), + PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), + PROC(LAYOUTGET, enc_layoutget, dec_layoutget), + PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), + PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), + PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), + PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), #endif /* CONFIG_NFS_V4_1 */ }; diff -up linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild --- linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild.orig 2010-09-30 10:17:08.713997000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/Kbuild 2010-09-30 10:17:08.715994000 -0400 @@ -0,0 +1,11 @@ +# +# Makefile for the pNFS Objects Layout Driver kernel module +# +objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o +obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o + +# +# Panasas pNFS Layout Driver kernel module +# +panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o +obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c --- linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-09-30 10:17:08.717999000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/objio_osd.c 2010-09-30 10:17:08.719998000 -0400 @@ -0,0 +1,1087 @@ +/* + * objio_osd.c + * + * pNFS Objects layout implementation over open-osd initiator library + * + * Copyright (C) 2009 Panasas Inc. + * All rights reserved. 
+ * + * Benny Halevy + * Boaz Harrosh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define _LLU(x) ((unsigned long long)x) + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), +}; + +/* A per mountpoint struct currently for device cache */ +struct objio_mount_type { + struct list_head dev_list; + spinlock_t dev_list_lock; +}; + +struct _dev_ent { + struct list_head list; + struct pnfs_deviceid d_id; + struct osd_dev *od; +}; + +static void _dev_list_remove_all(struct objio_mount_type *omt) +{ + spin_lock(&omt->dev_list_lock); + + while (!list_empty(&omt->dev_list)) { + struct _dev_ent *de = list_entry(omt->dev_list.next, + struct _dev_ent, list); + + list_del_init(&de->list); + osduld_put_device(de->od); + kfree(de); + } + + spin_unlock(&omt->dev_list_lock); +} + +static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, + struct pnfs_deviceid *d_id) +{ + struct list_head *le; + + list_for_each(le, &omt->dev_list) { + struct _dev_ent *de = list_entry(le, struct _dev_ent, list); + + if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) + return de->od; + } + + return NULL; +} + +static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, + struct pnfs_deviceid *d_id) +{ + struct osd_dev *od; + + spin_lock(&omt->dev_list_lock); + od = ___dev_list_find(omt, d_id); + spin_unlock(&omt->dev_list_lock); + return od; +} + +static int _dev_list_add(struct objio_mount_type *omt, + struct pnfs_deviceid *d_id, struct osd_dev *od) +{ + struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); + + if (!de) + return -ENOMEM; + + spin_lock(&omt->dev_list_lock); + + if (___dev_list_find(omt, d_id)) { + kfree(de); + goto out; + } + + de->d_id = *d_id; + de->od = od; + list_add(&de->list, 
&omt->dev_list); + +out: + spin_unlock(&omt->dev_list_lock); + return 0; +} + +struct objio_segment { + struct pnfs_osd_layout *layout; + + unsigned mirrors_p1; + unsigned stripe_unit; + unsigned group_width; /* Data stripe_units without integrity comps */ + u64 group_depth; + unsigned group_count; + + unsigned num_comps; + /* variable length */ + struct osd_dev *ods[1]; +}; + +struct objio_state; +typedef ssize_t (*objio_done_fn)(struct objio_state *ios); + +struct objio_state { + /* Generic layer */ + struct objlayout_io_state ol_state; + + struct objio_segment *objio_seg; + + struct kref kref; + objio_done_fn done; + void *private; + + unsigned long length; + unsigned numdevs; /* Actually used devs in this IO */ + /* A per-device variable array of size numdevs */ + struct _objio_per_comp { + struct bio *bio; + struct osd_request *or; + unsigned long length; + u64 offset; + unsigned dev; + } per_dev[]; +}; + +/* Send and wait for a get_device_info of devices in the layout, + then look them up with the osd_initiator library */ +static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned comp) +{ + struct pnfs_osd_layout *layout = objio_seg->layout; + struct pnfs_osd_deviceaddr *deviceaddr; + struct pnfs_deviceid *d_id; + struct osd_dev *od; + struct osd_dev_info odi; + struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; + int err; + + d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; + + od = _dev_list_find(omt, d_id); + if (od) + return od; + + err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); + if (unlikely(err)) { + dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); + return ERR_PTR(err); + } + + odi.systemid_len = deviceaddr->oda_systemid.len; + if (odi.systemid_len > sizeof(odi.systemid)) { + err = -EINVAL; + goto out; + } else if (odi.systemid_len) + memcpy(odi.systemid, deviceaddr->oda_systemid.data, + odi.systemid_len); + odi.osdname_len = deviceaddr->oda_osdname.len; + odi.osdname = (u8 *)deviceaddr->oda_osdname.data; + + if (!odi.osdname_len && !odi.systemid_len) { + dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", + __func__); + err = -ENODEV; + goto out; + } + + od = osduld_info_lookup(&odi); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + dprintk("%s: osduld_info_lookup => %d\n", __func__, err); + goto out; + } + + _dev_list_add(omt, d_id, od); + +out: + dprintk("%s: return=%d\n", __func__, err); + objlayout_put_deviceinfo(deviceaddr); + return err ? ERR_PTR(err) : od; +} + +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg) +{ + struct pnfs_osd_layout *layout = objio_seg->layout; + unsigned i, num_comps = layout->olo_num_comps; + int err; + + /* lookup all devices */ + for (i = 0; i < num_comps; i++) { + struct osd_dev *od; + + od = _device_lookup(pnfslay, objio_seg, i); + if (unlikely(IS_ERR(od))) { + err = PTR_ERR(od); + goto out; + } + objio_seg->ods[i] = od; + } + objio_seg->num_comps = num_comps; + err = 0; + +out: + dprintk("%s: return=%d\n", __func__, err); + return err; +} + +static int _verify_data_map(struct pnfs_osd_layout *layout) +{ + struct pnfs_osd_data_map *data_map = &layout->olo_map; + u64 stripe_length; + u32 group_width; + +/* FIXME: Only raid0 for now. 
if not go through MDS */
+	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+		printk(KERN_ERR "Only RAID_0 for now\n");
+		return -ENOTSUPP;
+	}
+	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
+		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+			  data_map->odm_num_comps, data_map->odm_mirror_cnt);
+		return -EINVAL;
+	}
+
+	if (data_map->odm_group_width)
+		group_width = data_map->odm_group_width;
+	else
+		group_width = data_map->odm_num_comps /
+				(data_map->odm_mirror_cnt + 1);
+
+	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+	if (stripe_length >= (1ULL << 32)) {
+		printk(KERN_ERR "Total Stripe length(0x%llx)"
+			" >= 32bit is not supported\n", _LLU(stripe_length));
+		return -ENOTSUPP;
+	}
+
+	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+		printk(KERN_ERR "Stripe Unit(0x%llx)"
+			" must be a multiple of PAGE_SIZE(0x%lx)\n",
+			_LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+		return -ENOTSUPP;
+	}
+
+	return 0;
+}
+
+int objio_alloc_lseg(void **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_segment *lseg,
+	struct pnfs_osd_layout *layout)
+{
+	struct objio_segment *objio_seg;
+	int err;
+
+	err = _verify_data_map(layout);
+	if (unlikely(err))
+		return err;
+
+	objio_seg = kzalloc(sizeof(*objio_seg) +
+			(layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
+			GFP_KERNEL);
+	if (!objio_seg)
+		return -ENOMEM;
+
+	objio_seg->layout = layout;
+	err = objio_devices_lookup(pnfslay, objio_seg);
+	if (err)
+		goto free_seg;
+
+	objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
+	objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
+	if (layout->olo_map.odm_group_width) {
+		objio_seg->group_width = layout->olo_map.odm_group_width;
+		objio_seg->group_depth = layout->olo_map.odm_group_depth;
+		objio_seg->group_count = layout->olo_map.odm_num_comps /
+						objio_seg->mirrors_p1 /
+						objio_seg->group_width;
+	} else {
+		objio_seg->group_width = layout->olo_map.odm_num_comps /
+						objio_seg->mirrors_p1;
+		objio_seg->group_depth = -1;
+		objio_seg->group_count = 1;
+	}
+
+	*outp = objio_seg;
+	return 0;
+
+free_seg:
+	dprintk("%s: Error: return %d\n", __func__, err);
+	kfree(objio_seg);
+	*outp = NULL;
+	return err;
+}
+
+void objio_free_lseg(void *p)
+{
+	struct objio_segment *objio_seg = p;
+
+	kfree(objio_seg);
+}
+
+int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
+{
+	struct objio_segment *objio_seg = seg;
+	struct objio_state *ios;
+	const unsigned first_size = sizeof(*ios) +
+				objio_seg->num_comps * sizeof(ios->per_dev[0]);
+	const unsigned sec_size = objio_seg->num_comps *
+					sizeof(ios->ol_state.ioerrs[0]);
+
+	dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
+	ios = kzalloc(first_size + sec_size, GFP_KERNEL);
+	if (unlikely(!ios))
+		return -ENOMEM;
+
+	ios->objio_seg = objio_seg;
+	ios->ol_state.ioerrs = ((void *)ios) + first_size;
+	ios->ol_state.num_comps = objio_seg->num_comps;
+
+	*outp = &ios->ol_state;
+	return 0;
+}
+
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+
+	kfree(ios);
+}
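
[Editor's note] objio_alloc_io_state()/objio_free_io_state() above pack the
generic objlayout_io_state, the per-device array, and the ioerrs array into a
single kzalloc(), then recover the enclosing objio_state with container_of().
The following userspace sketch of that embed-and-recover idiom uses invented
names and is not part of the patch:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct generic_state { int num_comps; };

    struct private_state {
            struct generic_state gen;   /* embedded, never a pointer */
            int per_dev[];              /* flexible array sized at alloc time */
    };

    static struct generic_state *alloc_state(int ndevs)
    {
            /* one allocation covers the header and the per-device tail */
            struct private_state *ps =
                    calloc(1, sizeof(*ps) + ndevs * sizeof(ps->per_dev[0]));

            if (!ps)
                    return NULL;
            ps->gen.num_comps = ndevs;
            return &ps->gen;            /* callers see only the generic part */
    }

    static void free_state(struct generic_state *gen)
    {
            /* recover the enclosing struct, as objio_free_io_state() does */
            free(container_of(gen, struct private_state, gen));
    }

    int main(void)
    {
            struct generic_state *gen = alloc_state(4);

            printf("num_comps=%d\n", gen ? gen->num_comps : -1);
            free_state(gen);
            return 0;
    }
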
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+	switch (oep) {
+	case OSD_ERR_PRI_NO_ERROR:
+		return (enum pnfs_osd_errno)0;
+
+	case OSD_ERR_PRI_CLEAR_PAGES:
+		BUG_ON(1);
+		return 0;
+
+	case OSD_ERR_PRI_RESOURCE:
+		return PNFS_OSD_ERR_RESOURCE;
+	case OSD_ERR_PRI_BAD_CRED:
+		return PNFS_OSD_ERR_BAD_CRED;
+	case OSD_ERR_PRI_NO_ACCESS:
+		return PNFS_OSD_ERR_NO_ACCESS;
+	case OSD_ERR_PRI_UNREACHABLE:
+		return PNFS_OSD_ERR_UNREACHABLE;
+	case OSD_ERR_PRI_NOT_FOUND:
+		return PNFS_OSD_ERR_NOT_FOUND;
+	case OSD_ERR_PRI_NO_SPACE:
+		return PNFS_OSD_ERR_NO_SPACE;
+	default:
+		WARN_ON(1);
+		/* fallthrough */
+	case OSD_ERR_PRI_EIO:
+		return PNFS_OSD_ERR_EIO;
+	}
+}
+
+static void _clear_bio(struct bio *bio)
+{
+	struct bio_vec *bv;
+	unsigned i;
+
+	__bio_for_each_segment(bv, bio, i, 0) {
+		unsigned this_count = bv->bv_len;
+
+		if (likely(PAGE_SIZE == this_count))
+			clear_highpage(bv->bv_page);
+		else
+			zero_user(bv->bv_page, bv->bv_offset, this_count);
+	}
+}
+
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
+	int lin_ret = 0;
+	int i;
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_sense_info osi;
+		struct osd_request *or = ios->per_dev[i].or;
+		int ret;
+
+		if (!or)
+			continue;
+
+		ret = osd_req_decode_sense(or, &osi);
+		if (likely(!ret))
+			continue;
+
+		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
+			/* start read offset is past the end of file */
+			BUG_ON(is_write);
+			_clear_bio(ios->per_dev[i].bio);
+			dprintk("%s: start read offset past end of file "
+				"offset=0x%llx, length=0x%lx\n", __func__,
+				_LLU(ios->per_dev[i].offset),
+				ios->per_dev[i].length);
+
+			continue; /* we recovered */
+		}
+		objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
+					osd_pri_2_pnfs_err(osi.osd_err_pri),
+					ios->per_dev[i].offset,
+					ios->per_dev[i].length,
+					is_write);
+
+		if (osi.osd_err_pri >= oep) {
+			oep = osi.osd_err_pri;
+			lin_ret = ret;
+		}
+	}
+
+	return lin_ret;
+}
+
+/*
+ * Common IO state helpers.
+ */
+static void _io_free(struct objio_state *ios)
+{
+	unsigned i;
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct _objio_per_comp *per_dev = &ios->per_dev[i];
+
+		if (per_dev->or) {
+			osd_end_request(per_dev->or);
+			per_dev->or = NULL;
+		}
+
+		if (per_dev->bio) {
+			bio_put(per_dev->bio);
+			per_dev->bio = NULL;
+		}
+	}
+}
+
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+	unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
+	unsigned max_dev = min_dev + ios->ol_state.num_comps;
+
+	BUG_ON(dev < min_dev || max_dev <= dev);
+	return ios->objio_seg->ods[dev - min_dev];
+}
+
+struct _striping_info {
+	u64 obj_offset;
+	u64 group_length;
+	u64 total_group_length;
+	u64 Major;
+	unsigned dev;
+	unsigned unit_off;
+};
+
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+			      struct _striping_info *si)
+{
+	u32 stripe_unit = ios->objio_seg->stripe_unit;
+	u32 group_width = ios->objio_seg->group_width;
+	u64 group_depth = ios->objio_seg->group_depth;
+	u32 U = stripe_unit * group_width;
+
+	u64 T = U * group_depth;
+	u64 S = T * ios->objio_seg->group_count;
+	u64 M = div64_u64(file_offset, S);
+
+	/*
+	 * G = (L - (M * S)) / T
+	 * H = (L - (M * S)) % T
+	 */
+	u64 LmodU = file_offset - M * S;
+	u32 G = div64_u64(LmodU, T);
+	u64 H = LmodU - G * T;
+
+	u32 N = div_u64(H, U);
+
+	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+	si->obj_offset = si->unit_off + (N * stripe_unit) +
+				 (M * group_depth * stripe_unit);
+
+	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
+	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+	si->dev *= ios->objio_seg->mirrors_p1;
+
+	si->group_length = T - H;
+	si->total_group_length = T;
+	si->Major = M;
+}
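
[Editor's note] The M/G/H/N arithmetic in _calc_stripe_info() above is dense.
The throwaway userspace program below traces one offset through the same
formulas with made-up geometry (64 KiB stripe_unit, group_width 4,
group_depth 16, two groups, no mirroring); note that the variable called
LmodU in the kernel source actually holds L mod S:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            const uint64_t unit  = 64 << 10;        /* stripe_unit: 64 KiB  */
            const uint64_t width = 4, depth = 16, groups = 2;
            const uint64_t U = unit * width;        /* full stripe: 256 KiB */
            const uint64_t T = U * depth;           /* one group: 4 MiB     */
            const uint64_t S = T * groups;          /* full cycle: 8 MiB    */
            const uint64_t L = 9 << 20;             /* file offset: 9 MiB   */

            uint64_t M = L / S;                     /* which cycle          */
            uint64_t G = (L - M * S) / T;           /* which group          */
            uint64_t H = (L - M * S) % T;           /* offset inside group  */
            uint64_t N = H / U;                     /* stripe inside group  */
            uint64_t unit_off = L % unit;
            uint64_t obj_off = unit_off + N * unit + M * depth * unit;
            uint64_t dev = (H - N * U) / unit + G * width;

            /* expect: M=1 G=0 N=4 dev=0 obj_off=1310720 */
            printf("M=%llu G=%llu N=%llu dev=%llu obj_off=%llu\n",
                   (unsigned long long)M, (unsigned long long)G,
                   (unsigned long long)N, (unsigned long long)dev,
                   (unsigned long long)obj_off);
            return 0;
    }
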
+static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
+		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
+{
+	unsigned pg = *cur_pg;
+	struct request_queue *q =
+			osd_request_queue(_io_od(ios, per_dev->dev));
+
+	per_dev->length += cur_len;
+
+	if (per_dev->bio == NULL) {
+		unsigned stripes = ios->ol_state.num_comps /
+						ios->objio_seg->mirrors_p1;
+		unsigned pages_in_stripe = stripes *
+				      (ios->objio_seg->stripe_unit / PAGE_SIZE);
+		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+				    stripes;
+
+		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
+		if (unlikely(!per_dev->bio)) {
+			dprintk("Failed to allocate BIO size=%u\n", bio_size);
+			return -ENOMEM;
+		}
+	}
+
+	while (cur_len > 0) {
+		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+		unsigned added_len;
+
+		BUG_ON(ios->ol_state.nr_pages <= pg);
+		cur_len -= pglen;
+
+		added_len = bio_add_pc_page(q, per_dev->bio,
+					ios->ol_state.pages[pg], pglen, pgbase);
+		if (unlikely(pglen != added_len))
+			return -ENOMEM;
+		pgbase = 0;
+		++pg;
+	}
+	BUG_ON(cur_len);
+
+	*cur_pg = pg;
+	return 0;
+}
+
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+			      struct _striping_info *si, unsigned first_comp,
+			      unsigned *last_pg)
+{
+	unsigned stripe_unit = ios->objio_seg->stripe_unit;
+	unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
+	unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
+	unsigned dev = si->dev;
+	unsigned first_dev = dev - (dev % devs_in_group);
+	unsigned comp = first_comp + (dev - first_dev);
+	unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+	unsigned cur_pg = *last_pg;
+	int ret = 0;
+
+	while (length) {
+		struct _objio_per_comp *per_dev = &ios->per_dev[comp];
+		unsigned cur_len, page_off = 0;
+
+		if (!per_dev->length) {
+			per_dev->dev = dev;
+			if (dev < si->dev) {
+				per_dev->offset = si->obj_offset + stripe_unit -
+								   si->unit_off;
+				cur_len = stripe_unit;
+			} else if (dev == si->dev) {
+				per_dev->offset = si->obj_offset;
+				cur_len = stripe_unit - si->unit_off;
+				page_off = si->unit_off & ~PAGE_MASK;
+				BUG_ON(page_off &&
+				      (page_off != ios->ol_state.pgbase));
+			} else { /* dev > si->dev */
+				per_dev->offset = si->obj_offset - si->unit_off;
+				cur_len = stripe_unit;
+			}
+
+			if (max_comp < comp)
+				max_comp = comp;
+
+			dev += mirrors_p1;
+			dev = (dev % devs_in_group) + first_dev;
+		} else {
+			cur_len = stripe_unit;
+		}
+		if (cur_len >= length)
+			cur_len = length;
+
+		ret = _add_stripe_unit(ios, &cur_pg, page_off, per_dev,
+				       cur_len);
+		if (unlikely(ret))
+			goto out;
+
+		comp += mirrors_p1;
+		comp = (comp % devs_in_group) + first_comp;
+
+		length -= cur_len;
+		ios->length += cur_len;
+	}
+out:
+	ios->numdevs = max_comp + mirrors_p1;
+	*last_pg = cur_pg;
+	return ret;
+}
+
+static int _io_rw_pagelist(struct objio_state *ios)
+{
+	u64 length = ios->ol_state.count;
+	struct _striping_info si;
+	unsigned devs_in_group = ios->objio_seg->group_width *
+				 ios->objio_seg->mirrors_p1;
+	unsigned first_comp = 0;
+	unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps;
+	unsigned last_pg = 0;
+	int ret = 0;
+
+	_calc_stripe_info(ios, ios->ol_state.offset, &si);
+	while (length) {
+		if (length < si.group_length)
+			si.group_length = length;
+
+		ret = _prepare_one_group(ios, si.group_length, &si, first_comp,
+					 &last_pg);
+		if (unlikely(ret))
+			goto out;
+
+		length -= si.group_length;
+
+		si.group_length = si.total_group_length;
+		si.unit_off = 0;
+		++si.Major;
+		si.obj_offset = si.Major * ios->objio_seg->stripe_unit *
+						ios->objio_seg->group_depth;
+
+		si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
+		si.dev %= num_comps;
+
+		first_comp += devs_in_group;
+		first_comp %= num_comps;
+	}
+
+out:
+	if (!ios->length)
+		return ret;
+
+	return 0;
+}
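
[Editor's note] The _io_exec() helper that follows relies on a kref to decide
when the whole scatter of OSD requests has completed: one reference per
in-flight request plus one held by the submitter, with the final kref_put()
firing ios->done() exactly once. A minimal userspace model of that pattern,
with invented names (C11 atomics stand in for the kernel's kref):

    #include <stdatomic.h>
    #include <stdio.h>

    struct toy_io {
            atomic_int refs;
            void (*done)(struct toy_io *);
    };

    static void toy_put(struct toy_io *io)
    {
            if (atomic_fetch_sub(&io->refs, 1) == 1)
                    io->done(io);   /* last reference dropped */
    }

    static void all_done(struct toy_io *io) { printf("io complete\n"); }

    int main(void)
    {
            struct toy_io io = { .done = all_done };
            int i, nreqs = 3;

            atomic_init(&io.refs, 1);               /* submitter's ref     */
            for (i = 0; i < nreqs; i++)
                    atomic_fetch_add(&io.refs, 1);  /* one per async req   */
            /* ...requests would complete on other threads... */
            for (i = 0; i < nreqs; i++)
                    toy_put(&io);
            toy_put(&io);   /* submitter drops its ref; fires all_done()   */
            return 0;
    }
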
+static ssize_t _sync_done(struct objio_state *ios)
+{
+	struct completion *waiting = ios->private;
+
+	complete(waiting);
+	return 0;
+}
+
+static void _last_io(struct kref *kref)
+{
+	struct objio_state *ios = container_of(kref, struct objio_state, kref);
+
+	ios->done(ios);
+}
+
+static void _done_io(struct osd_request *or, void *p)
+{
+	struct objio_state *ios = p;
+
+	kref_put(&ios->kref, _last_io);
+}
+
+static ssize_t _io_exec(struct objio_state *ios)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	ssize_t status = 0; /* sync status */
+	unsigned i;
+	objio_done_fn saved_done_fn = ios->done;
+	bool sync = ios->ol_state.sync;
+
+	if (sync) {
+		ios->done = _sync_done;
+		ios->private = &wait;
+	}
+
+	kref_init(&ios->kref);
+
+	for (i = 0; i < ios->numdevs; i++) {
+		struct osd_request *or = ios->per_dev[i].or;
+
+		if (!or)
+			continue;
+
+		kref_get(&ios->kref);
+		osd_execute_request_async(or, _done_io, ios);
+	}
+
+	kref_put(&ios->kref, _last_io);
+
+	if (sync) {
+		wait_for_completion(&wait);
+		status = saved_done_fn(ios);
+	}
+
+	return status;
+}
+
+/*
+ * read
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+	ssize_t status;
+	int ret = _io_check(ios, false);
+
+	_io_free(ios);
+
+	if (likely(!ret))
+		status = ios->length;
+	else
+		status = ret;
+
+	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
+
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+	struct osd_request *or = NULL;
+	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+	unsigned dev = per_dev->dev;
+	struct pnfs_osd_object_cred *cred =
+			&ios->objio_seg->layout->olo_comps[dev];
+	struct osd_obj_id obj = {
+		.partition = cred->oc_object_id.oid_partition_id,
+		.id = cred->oc_object_id.oid_object_id,
+	};
+	int ret;
+
+	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+	if (unlikely(!or)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	per_dev->or = or;
+
+	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+
+	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+	if (ret) {
+		dprintk("%s: Failed to osd_finalize_request() => %d\n",
+			__func__, ret);
+		goto err;
+	}
+
+	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+		per_dev->length);
+
+err:
+	return ret;
+}
+
+static ssize_t _read_exec(struct objio_state *ios)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+		if (!ios->per_dev[i].length)
+			continue;
+		ret = _read_mirrors(ios, i);
+		if (unlikely(ret))
+			goto err;
+	}
+
+	ios->done = _read_done;
+	return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+	_io_free(ios);
+	return ret;
+}
+
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+	int ret;
+
+	ret = _io_rw_pagelist(ios);
+	if (unlikely(ret))
+		return ret;
+
+	return _read_exec(ios);
+}
+
+/*
+ * write
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+	ssize_t status;
+	int ret = _io_check(ios, true);
+
+	_io_free(ios);
+
+	if (likely(!ret)) {
+		/* FIXME: should be based on the OSD's persistence model
+		 * See OSD2r05 Section 4.13 Data persistence model */
+		ios->ol_state.committed = NFS_UNSTABLE; /* or NFS_FILE_SYNC */
+		status = ios->length;
+	} else {
+		status = ret;
+	}
+
+	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+	return status;
+}
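
[Editor's note] _write_mirrors() below issues the same payload to each of the
mirrors_p1 components, cloning the master bio for every mirror past the
first. Stripped of the bio and OSD details, the control shape is roughly the
following compile-only sketch (hypothetical helper names, not kernel API):

    #include <stddef.h>

    struct mirror_req { const void *buf; size_t len; unsigned long long off; };

    static int fan_out_write(struct mirror_req *master, unsigned mirrors_p1,
                             int (*submit)(unsigned dev, struct mirror_req *))
    {
            unsigned m;
            int err;

            for (m = 0; m < mirrors_p1; m++) {
                    /* every mirror sees identical offset/length/payload */
                    struct mirror_req clone = *master;

                    err = submit(m, &clone);
                    if (err)
                            return err; /* caller unwinds issued requests */
            }
            return 0;
    }
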
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+	unsigned dev = ios->per_dev[cur_comp].dev;
+	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
+	int ret;
+
+	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
+		struct osd_request *or = NULL;
+		struct pnfs_osd_object_cred *cred =
+				&ios->objio_seg->layout->olo_comps[dev];
+		struct osd_obj_id obj = {
+			.partition = cred->oc_object_id.oid_partition_id,
+			.id = cred->oc_object_id.oid_object_id,
+		};
+		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+		struct bio *bio;
+
+		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+		if (unlikely(!or)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		per_dev->or = or;
+
+		if (per_dev != master_dev) {
+			bio = bio_kmalloc(GFP_KERNEL,
+					  master_dev->bio->bi_max_vecs);
+			if (unlikely(!bio)) {
+				dprintk("Failed to allocate BIO size=%u\n",
+					master_dev->bio->bi_max_vecs);
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			__bio_clone(bio, master_dev->bio);
+			bio->bi_bdev = NULL;
+			bio->bi_next = NULL;
+			per_dev->bio = bio;
+			per_dev->dev = dev;
+			per_dev->length = master_dev->length;
+			per_dev->offset = master_dev->offset;
+		} else {
+			bio = master_dev->bio;
+			/* FIXME: bio_set_dir() */
+			bio->bi_rw |= (1 << BIO_RW);
+		}
+
+		osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+
+		ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+		if (ret) {
+			dprintk("%s: Failed to osd_finalize_request() => %d\n",
+				__func__, ret);
+			goto err;
+		}
+
+		dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+			__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+			per_dev->length);
+	}
+
+err:
+	return ret;
+}
+
+static ssize_t _write_exec(struct objio_state *ios)
+{
+	unsigned i;
+	int ret;
+
+	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
+		if (!ios->per_dev[i].length)
+			continue;
+		ret = _write_mirrors(ios, i);
+		if (unlikely(ret))
+			goto err;
+	}
+
+	ios->done = _write_done;
+	return _io_exec(ios); /* In sync mode exec returns the io status */
+
+err:
+	_io_free(ios);
+	return ret;
+}
+
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+	struct objio_state *ios = container_of(ol_state, struct objio_state,
+					       ol_state);
+	int ret;
+
+	/* TODO: ios->stable = stable; */
+	ret = _io_rw_pagelist(ios);
+	if (unlikely(ret))
+		return ret;
+
+	return _write_exec(ios);
+}
+
+/*
+ * Policy Operations
+ */
+
+/*
+ * Return the stripe size for the specified file
+ */
+ssize_t
+objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay)
+{
+	ssize_t sz, maxsz = -1;
+	struct pnfs_layout_segment *lseg;
+
+	list_for_each_entry(lseg, &pnfslay->segs, fi_list) {
+		int n;
+		struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg);
+		struct pnfs_osd_layout *lo =
+			(struct pnfs_osd_layout *)objlseg->pnfs_osd_layout;
+		struct pnfs_osd_data_map *map = &lo->olo_map;
+
+		n = map->odm_group_width;
+		if (n == 0)
+			n = map->odm_num_comps / (map->odm_mirror_cnt + 1);
+
+		switch (map->odm_raid_algorithm) {
+		case PNFS_OSD_RAID_0:
+			break;
+
+		case PNFS_OSD_RAID_4:
+		case PNFS_OSD_RAID_5:
+			n -= 1;
+			break;
+
+		case PNFS_OSD_RAID_PQ:
+			n -= 2;
+			break;
+
+		default:
+			BUG_ON(1);
+		}
+		sz = map->odm_stripe_unit * n;
+		if (sz > maxsz)
+			maxsz = sz;
+	}
+	dprintk("%s: Return %Zx\n", __func__, maxsz);
+	return maxsz;
+}
+
+/*
+ * Get the max [rw]size
+ */
+static ssize_t
+objlayout_get_blocksize(void)
+{
+	ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE;
+
+	return sz;
+}
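
[Editor's note] For the RAID cases in objlayout_get_stripesize() above, the
parity components are subtracted from the group width before multiplying by
the stripe unit. A quick arithmetic check with made-up numbers (a group of
width 5 with a 64 KiB stripe unit):

    #include <stdio.h>

    int main(void)
    {
            unsigned stripe_unit = 64 << 10;        /* 64 KiB */
            unsigned group_width = 5;

            unsigned raid0  = stripe_unit * group_width;       /* 320 KiB */
            unsigned raid5  = stripe_unit * (group_width - 1); /* 256 KiB */
            unsigned raidpq = stripe_unit * (group_width - 2); /* 192 KiB */

            printf("raid0=%u raid5=%u raidpq=%u\n", raid0, raid5, raidpq);
            return 0;
    }
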
+/*
+ * Don't gather across stripes, but rather gather (coalesce) up to
+ * the stripe size.
+ *
+ * FIXME: change interface to use merge_align, merge_count
+ */
+static struct layoutdriver_policy_operations objlayout_policy_operations = {
+	.flags          = PNFS_LAYOUTRET_ON_SETATTR,
+	.get_stripesize = objlayout_get_stripesize,
+	.get_blocksize  = objlayout_get_blocksize,
+};
+
+static struct pnfs_layoutdriver_type objlayout_type = {
+	.id = LAYOUT_OSD2_OBJECTS,
+	.name = "LAYOUT_OSD2_OBJECTS",
+	.ld_io_ops = &objlayout_io_operations,
+	.ld_policy_ops = &objlayout_policy_operations,
+};
+
+void *objio_init_mt(void)
+{
+	struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL);
+
+	if (!omt)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&omt->dev_list);
+	spin_lock_init(&omt->dev_list_lock);
+	return omt;
+}
+
+void objio_fini_mt(void *mountid)
+{
+	_dev_list_remove_all(mountid);
+	kfree(mountid);
+}
+
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy ");
+MODULE_LICENSE("GPL");
+
+static int __init
+objlayout_init(void)
+{
+	pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type);
+	printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+	       __func__);
+	return 0;
+}
+
+static void __exit
+objlayout_exit(void)
+{
+	pnfs_unregister_layoutdriver(&objlayout_type);
+	printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
+	       __func__);
+}
+
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c
--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c.orig	2010-09-30 10:17:08.722997000 -0400
+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.c	2010-09-30 10:17:08.724995000 -0400
@@ -0,0 +1,790 @@
+/*
+ * objlayout.c
+ *
+ * pNFS layout driver for Panasas OSDs
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy
+ * Boaz Harrosh
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "objlayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +struct pnfs_client_operations *pnfs_client_ops; + +/* + * Create a objlayout layout structure for the given inode and return it. + */ +static struct pnfs_layout_hdr * +objlayout_alloc_layout(struct inode *inode) +{ + struct objlayout *objlay; + + objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); + if (objlay) { + spin_lock_init(&objlay->lock); + INIT_LIST_HEAD(&objlay->err_list); + } + dprintk("%s: Return %p\n", __func__, objlay); + return &objlay->pnfs_layout; +} + +/* + * Free an objlayout layout structure + */ +static void +objlayout_free_layout(struct pnfs_layout_hdr *lo) +{ + struct objlayout *objlay = OBJLAYOUT(lo); + + dprintk("%s: objlay %p\n", __func__, objlay); + + WARN_ON(!list_empty(&objlay->err_list)); + kfree(objlay); +} + +/* + * Unmarshall layout and store it in pnfslay. + */ +static struct pnfs_layout_segment * +objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, + struct nfs4_layoutget_res *lgr) +{ + int status; + void *layout = lgr->layout.buf; + struct pnfs_layout_segment *lseg; + struct objlayout_segment *objlseg; + struct pnfs_osd_layout *pnfs_osd_layout; + + dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); + + BUG_ON(!layout); + + status = -ENOMEM; + lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + + pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); + if (!lseg) + goto err; + + objlseg = LSEG_LD_DATA(lseg); + pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; + pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); + + status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, + pnfs_osd_layout); + if (status) + goto err; + + dprintk("%s: Return %p\n", __func__, lseg); + return lseg; + + err: + kfree(lseg); + return ERR_PTR(status); +} + +/* + * Free a layout segement + */ +static void +objlayout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct objlayout_segment *objlseg; + + dprintk("%s: freeing layout segment %p\n", __func__, lseg); + + if (unlikely(!lseg)) + return; + + objlseg = LSEG_LD_DATA(lseg); + objio_free_lseg(objlseg->internal); + kfree(lseg); +} + +/* + * I/O Operations + */ +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? 
end - 1 : NFS4_MAX_UINT64; +} + +static struct objlayout_io_state * +objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct page **pages, + unsigned pgbase, + unsigned nr_pages, + loff_t offset, + size_t count, + struct pnfs_layout_segment *lseg, + void *rpcdata) +{ + struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); + struct objlayout_io_state *state; + u64 lseg_end_offset; + size_t size_nr_pages; + + dprintk("%s: allocating io_state\n", __func__); + if (objio_alloc_io_state(objlseg->internal, &state)) + return NULL; + + BUG_ON(offset < lseg->range.offset); + lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); + BUG_ON(offset >= lseg_end_offset); + if (offset + count > lseg_end_offset) { + count = lseg->range.length - (offset - lseg->range.offset); + dprintk("%s: truncated count %Zd\n", __func__, count); + } + + if (pgbase > PAGE_SIZE) { + unsigned n = pgbase >> PAGE_SHIFT; + + pgbase &= ~PAGE_MASK; + pages += n; + nr_pages -= n; + } + + size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + BUG_ON(nr_pages < size_nr_pages); + if (nr_pages > size_nr_pages) + nr_pages = size_nr_pages; + + INIT_LIST_HEAD(&state->err_list); + state->lseg = lseg; + state->rpcdata = rpcdata; + state->pages = pages; + state->pgbase = pgbase; + state->nr_pages = nr_pages; + state->offset = offset; + state->count = count; + state->sync = 0; + + return state; +} + +static void +objlayout_free_io_state(struct objlayout_io_state *state) +{ + dprintk("%s: freeing io_state\n", __func__); + if (unlikely(!state)) + return; + + objio_free_io_state(state); +} + +/* + * I/O done common code + */ +static void +objlayout_iodone(struct objlayout_io_state *state) +{ + dprintk("%s: state %p status\n", __func__, state); + + if (likely(state->status >= 0)) { + objlayout_free_io_state(state); + } else { + struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); + + spin_lock(&objlay->lock); + objlay->delta_space_valid = OBJ_DSU_INVALID; + list_add(&objlay->err_list, &state->err_list); + spin_unlock(&objlay->lock); + } +} + +/* + * objlayout_io_set_result - Set an osd_error code on a specific osd comp. + * + * The @index component IO failed (error returned from target). Register + * the error for later reporting at layout-return. 
+ */ +void +objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, + int osd_error, u64 offset, u64 length, bool is_write) +{ + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + + BUG_ON(index >= state->num_comps); + if (osd_error) { + struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); + struct pnfs_osd_layout *layout = + (typeof(layout))objlseg->pnfs_osd_layout; + + ioerr->oer_component = layout->olo_comps[index].oc_object_id; + ioerr->oer_comp_offset = offset; + ioerr->oer_comp_length = length; + ioerr->oer_iswrite = is_write; + ioerr->oer_errno = osd_error; + + dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " + "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", + __func__, index, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + } else { + /* User need not call if no error is reported */ + ioerr->oer_errno = 0; + } +} + +static void _rpc_commit_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_client_ops->nfs_commit_complete(wdata); +} + +/* + * Commit data remotely on OSDs + */ +enum pnfs_try_status +objlayout_commit(struct nfs_write_data *wdata, int how) +{ + int status = PNFS_ATTEMPTED; + + INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); + schedule_work(&wdata->task.u.tk_work); + dprintk("%s: Return %d\n", __func__, status); + return status; +} + +/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_read_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + + pnfs_client_ops->nfs_readlist_complete(rdata); +} + +void +objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +{ + int eof = state->eof; + struct nfs_read_data *rdata; + + state->status = status; + dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + rdata = state->rpcdata; + rdata->task.tk_status = status; + if (status >= 0) { + rdata->res.count = status; + rdata->res.eof = eof; + } + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_client_ops->nfs_readlist_complete(rdata); + else { + INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); + schedule_work(&rdata->task.u.tk_work); + } +} + +/* + * Perform sync or async reads. 
+ */ +enum pnfs_try_status +objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) +{ + loff_t offset = rdata->args.offset; + size_t count = rdata->args.count; + struct objlayout_io_state *state; + ssize_t status = 0; + loff_t eof; + + dprintk("%s: Begin inode %p offset %llu count %d\n", + __func__, rdata->inode, offset, (int)count); + + eof = i_size_read(rdata->inode); + if (unlikely(offset + count > eof)) { + if (offset >= eof) { + status = 0; + rdata->res.count = 0; + rdata->res.eof = 1; + goto out; + } + count = eof - offset; + } + + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->args.pages, rdata->args.pgbase, + nr_pages, offset, count, + rdata->pdata.lseg, rdata); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->eof = state->offset + state->count >= eof; + + status = objio_read_pagelist(state); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + rdata->pdata.pnfs_error = status; + return PNFS_ATTEMPTED; +} + +/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). + * This is because the osd completion is called with ints-off from + * the block layer + */ +static void _rpc_write_complete(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + + pnfs_client_ops->nfs_writelist_complete(wdata); +} + +void +objlayout_write_done(struct objlayout_io_state *state, ssize_t status, + bool sync) +{ + struct nfs_write_data *wdata; + + dprintk("%s: Begin\n", __func__); + wdata = state->rpcdata; + state->status = status; + wdata->task.tk_status = status; + if (status >= 0) { + wdata->res.count = status; + wdata->verf.committed = state->committed; + dprintk("%s: Return status %d committed %d\n", + __func__, wdata->task.tk_status, + wdata->verf.committed); + } else + dprintk("%s: Return status %d\n", + __func__, wdata->task.tk_status); + objlayout_iodone(state); + /* must not use state after this point */ + + if (sync) + pnfs_client_ops->nfs_writelist_complete(wdata); + else { + INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); + schedule_work(&wdata->task.u.tk_work); + } +} + +/* + * Perform sync or async writes. 
+ */ +enum pnfs_try_status +objlayout_write_pagelist(struct nfs_write_data *wdata, + unsigned nr_pages, + int how) +{ + struct objlayout_io_state *state; + ssize_t status; + + dprintk("%s: Begin inode %p offset %llu count %u\n", + __func__, wdata->inode, wdata->args.offset, wdata->args.count); + + state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->args.pages, + wdata->args.pgbase, + nr_pages, + wdata->args.offset, + wdata->args.count, + wdata->pdata.lseg, wdata); + if (unlikely(!state)) { + status = -ENOMEM; + goto out; + } + + state->sync = how & FLUSH_SYNC; + + status = objio_write_pagelist(state, how & FLUSH_STABLE); + out: + dprintk("%s: Return status %Zd\n", __func__, status); + wdata->pdata.pnfs_error = status; + return PNFS_ATTEMPTED; +} + +void +objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct pnfs_osd_layoutupdate lou; + __be32 *start; + + dprintk("%s: Begin\n", __func__); + + spin_lock(&objlay->lock); + lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); + lou.dsu_delta = objlay->delta_space_used; + objlay->delta_space_used = 0; + objlay->delta_space_valid = OBJ_DSU_INIT; + lou.olu_ioerr_flag = !list_empty(&objlay->err_list); + spin_unlock(&objlay->lock); + + start = xdr_reserve_space(xdr, 4); + + BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + + dprintk("%s: Return delta_space_used %lld err %d\n", __func__, + lou.dsu_delta, lou.olu_ioerr_flag); +} + +static int +err_prio(u32 oer_errno) +{ + switch (oer_errno) { + case 0: + return 0; + + case PNFS_OSD_ERR_RESOURCE: + return OSD_ERR_PRI_RESOURCE; + case PNFS_OSD_ERR_BAD_CRED: + return OSD_ERR_PRI_BAD_CRED; + case PNFS_OSD_ERR_NO_ACCESS: + return OSD_ERR_PRI_NO_ACCESS; + case PNFS_OSD_ERR_UNREACHABLE: + return OSD_ERR_PRI_UNREACHABLE; + case PNFS_OSD_ERR_NOT_FOUND: + return OSD_ERR_PRI_NOT_FOUND; + case PNFS_OSD_ERR_NO_SPACE: + return OSD_ERR_PRI_NO_SPACE; + default: + WARN_ON(1); + /* fallthrough */ + case PNFS_OSD_ERR_EIO: + return OSD_ERR_PRI_EIO; + } +} + +static void +merge_ioerr(struct pnfs_osd_ioerr *dest_err, + const struct pnfs_osd_ioerr *src_err) +{ + u64 dest_end, src_end; + + if (!dest_err->oer_errno) { + *dest_err = *src_err; + /* accumulated device must be blank */ + memset(&dest_err->oer_component.oid_device_id, 0, + sizeof(dest_err->oer_component.oid_device_id)); + + return; + } + + if (dest_err->oer_component.oid_partition_id != + src_err->oer_component.oid_partition_id) + dest_err->oer_component.oid_partition_id = 0; + + if (dest_err->oer_component.oid_object_id != + src_err->oer_component.oid_object_id) + dest_err->oer_component.oid_object_id = 0; + + if (dest_err->oer_comp_offset > src_err->oer_comp_offset) + dest_err->oer_comp_offset = src_err->oer_comp_offset; + + dest_end = end_offset(dest_err->oer_comp_offset, + dest_err->oer_comp_length); + src_end = end_offset(src_err->oer_comp_offset, + src_err->oer_comp_length); + if (dest_end < src_end) + dest_end = src_end; + + dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; + + if ((src_err->oer_iswrite == dest_err->oer_iswrite) && + (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { + dest_err->oer_errno = src_err->oer_errno; + } else if (src_err->oer_iswrite) { + dest_err->oer_iswrite = true; + dest_err->oer_errno = src_err->oer_errno; + } +} + +static void +encode_accumulated_error(struct objlayout 
*objlay, struct xdr_stream *xdr) +{ + struct objlayout_io_state *state, *tmp; + struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + + for (i = 0; i < state->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + merge_ioerr(&accumulated_err, ioerr); + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } + + BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); +} + +void +objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct objlayout *objlay = OBJLAYOUT(pnfslay); + struct objlayout_io_state *state, *tmp; + __be32 *start, *uninitialized_var(last_xdr); + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + spin_lock(&objlay->lock); + + list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + unsigned i; + int res = 0; + + for (i = 0; i < state->num_comps && !res; i++) { + struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + + if (!ioerr->oer_errno) + continue; + + dprintk("%s: err[%d]: errno=%d is_write=%d " + "dev(%llx:%llx) par=0x%llx obj=0x%llx " + "offset=0x%llx length=0x%llx\n", + __func__, i, ioerr->oer_errno, + ioerr->oer_iswrite, + _DEVID_LO(&ioerr->oer_component.oid_device_id), + _DEVID_HI(&ioerr->oer_component.oid_device_id), + ioerr->oer_component.oid_partition_id, + ioerr->oer_component.oid_object_id, + ioerr->oer_comp_offset, + ioerr->oer_comp_length); + + last_xdr = xdr->p; + res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); + } + if (unlikely(res)) { + /* no space for even one error descriptor */ + BUG_ON(last_xdr == start + 1); + + /* we've encountered a situation with lots and lots of + * errors and no space to encode them all. Use the last + * available slot to report the union of all the + * remaining errors. + */ + xdr_rewind_stream(xdr, last_xdr - + pnfs_osd_ioerr_xdr_sz() / 4); + encode_accumulated_error(objlay, xdr); + goto loop_done; + } + list_del(&state->err_list); + objlayout_free_io_state(state); + } +loop_done: + spin_unlock(&objlay->lock); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + +struct objlayout_deviceinfo { + struct page *page; + struct pnfs_osd_deviceaddr da; /* This must be last */ +}; + +/* Initialize and call nfs_getdeviceinfo, then decode and return a + * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() + * should be called. 
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr)
+{
+	struct objlayout_deviceinfo *odi;
+	struct pnfs_device pd;
+	struct super_block *sb;
+	struct page *page;
+	size_t sz;
+	u32 *p;
+	int err;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	pd.area = page_address(page);
+
+	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+	pd.layout_type = LAYOUT_OSD2_OBJECTS;
+	pd.dev_notify_types = 0;
+	pd.pages = &page;
+	pd.pgbase = 0;
+	pd.pglen = PAGE_SIZE;
+	pd.mincount = 0;
+
+	sb = PNFS_INODE(pnfslay)->i_sb;
+	err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd);
+	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+	if (err)
+		goto err_out;
+
+	p = pd.area;
+	sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
+	odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL);
+	if (!odi) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
+	odi->page = page;
+	*deviceaddr = &odi->da;
+	return 0;
+
+err_out:
+	__free_page(page);
+	return err;
+}
+
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
+						struct objlayout_deviceinfo,
+						da);
+
+	__free_page(odi->page);
+	kfree(odi);
+}
+
+/*
+ * Initialize a mountpoint by retrieving the list of
+ * available devices for it.
+ * The layout driver's private data is stashed in
+ * server->pnfs_ld_data so the pNFS client can refer
+ * to the mount point later on.
+ */
+static int
+objlayout_initialize_mountpoint(struct nfs_server *server,
+				const struct nfs_fh *mntfh)
+{
+	void *data;
+
+	data = objio_init_mt();
+	if (IS_ERR(data)) {
+		printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n",
+		       __func__, PTR_ERR(data));
+		return PTR_ERR(data);
+	}
+	server->pnfs_ld_data = data;
+
+	dprintk("%s: Return data=%p\n", __func__, data);
+	return 0;
+}
+
+/*
+ * Uninitialize a mountpoint
+ */
+static int
+objlayout_uninitialize_mountpoint(struct nfs_server *server)
+{
+	dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data);
+	objio_fini_mt(server->pnfs_ld_data);
+	return 0;
+}
+
+struct layoutdriver_io_operations objlayout_io_operations = {
+	.commit = objlayout_commit,
+	.read_pagelist = objlayout_read_pagelist,
+	.write_pagelist = objlayout_write_pagelist,
+	.alloc_layout = objlayout_alloc_layout,
+	.free_layout = objlayout_free_layout,
+	.alloc_lseg = objlayout_alloc_lseg,
+	.free_lseg = objlayout_free_lseg,
+	.encode_layoutcommit = objlayout_encode_layoutcommit,
+	.encode_layoutreturn = objlayout_encode_layoutreturn,
+	.initialize_mountpoint = objlayout_initialize_mountpoint,
+	.uninitialize_mountpoint = objlayout_uninitialize_mountpoint,
+};
diff -up linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h
--- linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h.orig	2010-09-30 10:17:08.727996000 -0400
+++ linux-2.6.34.noarch/fs/nfs/objlayout/objlayout.h	2010-09-30 10:17:08.729004000 -0400
@@ -0,0 +1,171 @@
+/*
+ * objlayout.h
+ *
+ * Data types and function declarations for interfacing with the
+ * pNFS standard object layout driver.
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy
+ * Boaz Harrosh
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * See the file COPYING included with this distribution for more details.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _OBJLAYOUT_H +#define _OBJLAYOUT_H + +#include +#include +#include + +/* + * in-core layout segment + */ +struct objlayout_segment { + void *internal; /* for provider internal use */ + u8 pnfs_osd_layout[]; +}; + +/* + * per-inode layout + */ +struct objlayout { + struct pnfs_layout_hdr pnfs_layout; + + /* for layout_commit */ + enum osd_delta_space_valid_enum { + OBJ_DSU_INIT = 0, + OBJ_DSU_VALID, + OBJ_DSU_INVALID, + } delta_space_valid; + s64 delta_space_used; /* consumed by write ops */ + + /* for layout_return */ + spinlock_t lock; + struct list_head err_list; +}; + +static inline struct objlayout * +OBJLAYOUT(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct objlayout, pnfs_layout); +} + +/* + * per-I/O operation state + * embedded in objects provider io_state data structure + */ +struct objlayout_io_state { + struct pnfs_layout_segment *lseg; + + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + + void *rpcdata; + int status; /* res */ + int eof; /* res */ + int committed; /* res */ + + /* Error reporting (layout_return) */ + struct list_head err_list; + unsigned num_comps; + /* Pointer to array of error descriptors of size num_comps. + * It should contain as many entries as devices in the osd_layout + * that participate in the I/O. It is up to the io_engine to allocate + * needed space and set num_comps. 
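+ *
+ * For illustration (an assumed configuration, not mandated by this
+ * code): a RAID-0 layout striped over three OSDs would have the
+ * io_engine set num_comps = 3 and allocate ioerrs[0..2], one slot
+ * per device touched by the I/O.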
+ */
+	struct pnfs_osd_ioerr *ioerrs;
+};
+
+/*
+ * RAID engine I/O API
+ */
+extern void *objio_init_mt(void);
+extern void objio_fini_mt(void *mt);
+
+extern int objio_alloc_lseg(void **outp,
+	struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_layout_segment *lseg,
+	struct pnfs_osd_layout *layout);
+extern void objio_free_lseg(void *p);
+
+extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+				    bool stable);
+
+/*
+ * callback API
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+				    unsigned index, int osd_error,
+				    u64 offset, u64 length, bool is_write);
+
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+	struct objlayout *objlay = OBJLAYOUT(state->lseg->layout);
+
+	/* If one of the I/Os errored out and the delta_space_used was
+	 * invalid we render the complete report as invalid. The protocol
+	 * mandates that the DSU be accurate or not reported.
+	 */
+	spin_lock(&objlay->lock);
+	if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
+		objlay->delta_space_valid = OBJ_DSU_VALID;
+		objlay->delta_space_used += space_used;
+	}
+	spin_unlock(&objlay->lock);
+}
+
+extern void objlayout_read_done(struct objlayout_io_state *state,
+				ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+				 ssize_t status, bool sync);
+
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+	struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+
+/*
+ * exported generic objects function vectors
+ */
+extern struct layoutdriver_io_operations objlayout_io_operations;
+extern struct pnfs_client_operations *pnfs_client_ops;
+
+#endif /* _OBJLAYOUT_H */
diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c
--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c.orig	2010-09-30 10:17:08.731997000 -0400
+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.c	2010-09-30 10:17:08.733995000 -0400
@@ -0,0 +1,734 @@
+/*
+ * panfs_shim.c
+ *
+ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack
+ *
+ * Copyright (C) 2007-2009 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the file COPYING included with this distribution for more details.
+ *
+ */
+
+#include
+#include
+#include
+
+#include "objlayout.h"
+#include "panfs_shim.h"
+
+#include
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+struct panfs_export_operations *panfs_export_ops;
+
+void *
+objio_init_mt(void)
+{
+	return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL;
+}
+
+void objio_fini_mt(void *mountid)
+{
+}
+
+static int
+panfs_shim_conv_raid01(struct pnfs_osd_layout *layout,
+		       struct pnfs_osd_data_map *lo_map,
+		       pan_agg_layout_hdr_t *hdr)
+{
+	if (lo_map->odm_mirror_cnt) {
+		hdr->type = PAN_AGG_RAID1;
+		hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1;
+	} else if (layout->olo_num_comps > 1) {
+		hdr->type = PAN_AGG_RAID0;
+		hdr->hdr.raid0.num_comps = layout->olo_num_comps;
+		hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit;
+	} else
+		hdr->type = PAN_AGG_SIMPLE;
+	return 0;
+}
+
+static int
+panfs_shim_conv_raid5(struct pnfs_osd_layout *layout,
+		      struct pnfs_osd_data_map *lo_map,
+		      pan_agg_layout_hdr_t *hdr)
+{
+	if (lo_map->odm_mirror_cnt)
+		goto err;
+
+	if (lo_map->odm_group_width || lo_map->odm_group_depth) {
+		if (!lo_map->odm_group_width || !lo_map->odm_group_depth)
+			goto err;
+
+		hdr->type = PAN_AGG_GRP_RAID5_LEFT;
+		hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps;
+		if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps)
+			goto err;
+		hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit;
+		hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width;
+		hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth;
+		/* this is a guess, the panasas server is not supposed to
+		   hand out layouts otherwise */
+		hdr->hdr.grp_raid5_left.group_layout_policy =
+			PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN;
+	} else {
+		hdr->type = PAN_AGG_RAID5_LEFT;
+		hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps;
+		if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps)
+			goto err;
+		hdr->hdr.raid5_left.stripe_unit2 =
+		hdr->hdr.raid5_left.stripe_unit1 =
+		hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit;
+	}
+
+	return 0;
+err:
+	return -EINVAL;
+}
+
+/*
+ * Convert a pnfs_osd data map into a Panasas aggregation layout header
+ */
+static int
+panfs_shim_conv_pnfs_osd_data_map(
+	struct pnfs_osd_layout *layout,
+	pan_agg_layout_hdr_t *hdr)
+{
+	int status = -EINVAL;
+	struct pnfs_osd_data_map *lo_map = &layout->olo_map;
+
+	if (!layout->olo_num_comps) {
+		dprintk("%s: !!layout.n_comps(%u)\n", __func__,
+			layout->olo_num_comps);
+		goto err;
+	}
+
+	switch (lo_map->odm_raid_algorithm) {
+	case PNFS_OSD_RAID_0:
+		if (layout->olo_num_comps != lo_map->odm_num_comps ||
+		    layout->olo_comps_index) {
+			dprintk("%s: !!PNFS_OSD_RAID_0 "
+				"layout.n_comps(%u) map.n_comps(%u) "
+				"comps_index(%u)\n", __func__,
+				layout->olo_num_comps,
+				lo_map->odm_num_comps,
+				layout->olo_comps_index);
+			goto err;
+		}
+		status = panfs_shim_conv_raid01(layout, lo_map, hdr);
+		break;
+
+	case PNFS_OSD_RAID_5:
+		if (!lo_map->odm_group_width) {
+			if
(layout->olo_num_comps != lo_map->odm_num_comps || + layout->olo_comps_index) { + dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " + "layout.n_comps(%u)!=map.n_comps(%u) " + "|| comps_index(%u)\n", __func__, + layout->olo_num_comps, + lo_map->odm_num_comps, + layout->olo_comps_index); + goto err; + } + } else if ((layout->olo_num_comps != lo_map->odm_num_comps && + layout->olo_num_comps > lo_map->odm_group_width) || + (layout->olo_comps_index % lo_map->odm_group_width)){ + dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " + "layout.n_comps(%u) map.n_comps(%u) " + "comps_index(%u)\n", __func__, + lo_map->odm_group_width, + layout->olo_num_comps, + lo_map->odm_num_comps, + layout->olo_comps_index); + goto err; + } + status = panfs_shim_conv_raid5(layout, lo_map, hdr); + break; + + case PNFS_OSD_RAID_4: + case PNFS_OSD_RAID_PQ: + default: + dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, + lo_map->odm_raid_algorithm); + goto err; + } + + return 0; + +err: + return status; +} + +/* + * Convert pnfs_osd layout into Panasas map and caps type + */ +int +objio_alloc_lseg(void **outp, + struct pnfs_layout_hdr *pnfslay, + struct pnfs_layout_segment *lseg, + struct pnfs_osd_layout *layout) +{ + int i, total_comps; + int status; + struct pnfs_osd_object_cred *lo_comp; + pan_size_t alloc_sz, local_sz; + pan_sm_map_cap_t *mcs = NULL; + u8 *buf; + pan_agg_comp_obj_t *pan_comp; + pan_sm_sec_t *pan_sec; + + status = -EINVAL; + if (layout->olo_num_comps < layout->olo_map.odm_group_width) { + total_comps = layout->olo_comps_index + layout->olo_num_comps; + } else { + /* allocate full map, otherwise SAM gets confused */ + total_comps = layout->olo_map.odm_num_comps; + } + alloc_sz = total_comps * + (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); + for (i = 0; i < layout->olo_num_comps; i++) { + void *p = layout->olo_comps[i].oc_cap.cred; + if (panfs_export_ops->sm_sec_t_get_size_otw( + (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) + goto err; + alloc_sz += local_sz; + } + + status = -ENOMEM; + mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); + if (!mcs) + goto err; + buf = (u8 *)&mcs[1]; + + mcs->offset = lseg->range.offset; + mcs->length = lseg->range.length; +#if 0 + /* FIXME: for now */ + mcs->expiration_time.ts_sec = 0; + mcs->expiration_time.ts_nsec = 0; +#endif + mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; + status = panfs_shim_conv_pnfs_osd_data_map(layout, + &mcs->full_map.layout_hdr); + if (status) + goto err; + + mcs->full_map.components.size = total_comps; + mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; + buf += total_comps * sizeof(pan_agg_comp_obj_t); + + mcs->secs.size = total_comps; + mcs->secs.data = (pan_sm_sec_t *)buf; + buf += total_comps * sizeof(pan_sm_sec_t); + + lo_comp = layout->olo_comps; + pan_comp = mcs->full_map.components.data + layout->olo_comps_index; + pan_sec = mcs->secs.data + layout->olo_comps_index; + for (i = 0; i < layout->olo_num_comps; i++) { + void *p; + pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; + struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; + u64 dev_id = __be64_to_cpup( + (__be64 *)oc_obj_id->oid_device_id.data + 1); + + dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", + __func__, i, + __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), + __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), + oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); + + if (i == 0) { + /* make up mgr_id to calm sam down */ + pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, + &obj_id->dev_id); 
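+			/* The first component seeds the map header's object
+			 * id here; the checks below only verify that every
+			 * other component agrees on grp_id/obj_id and fail
+			 * the conversion on any mismatch.
+			 */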
+ obj_id->grp_id = oc_obj_id->oid_partition_id; + obj_id->obj_id = oc_obj_id->oid_object_id; + } + + if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { + dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", + __func__, i, (u64)obj_id->grp_id, + lo_comp->oc_object_id.oid_partition_id); + status = -EINVAL; + goto err; + } + + if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { + dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", + __func__, i, obj_id->obj_id, + lo_comp->oc_object_id.oid_object_id); + status = -EINVAL; + goto err; + } + + pan_comp->dev_id = dev_id; + if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { + dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", + __func__, i, obj_id->dev_id); + status = -EINVAL; + goto err; + } + if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { + dprintk("%s: degraded maps not supported yet\n", + __func__); + status = -ENOTSUPP; + goto err; + } + pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; + if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { + dprintk("%s: cap key security not supported yet\n", + __func__); + status = -ENOTSUPP; + goto err; + } + + p = lo_comp->oc_cap.cred; + panfs_export_ops->sm_sec_t_unmarshall( + (pan_sm_sec_otw_t *)&p, + pan_sec, + buf, + alloc_sz, + NULL, + &local_sz); + buf += local_sz; + alloc_sz -= local_sz; + + lo_comp++; + pan_comp++; + pan_sec++; + } + + *outp = mcs; + dprintk("%s:Return mcs=%p\n", __func__, mcs); + return 0; + +err: + objio_free_lseg(mcs); + dprintk("%s:Error %d\n", __func__, status); + return status; +} + +/* + * Free a Panasas map and caps type + */ +void +objio_free_lseg(void *p) +{ + kfree(p); +} + +/* + * I/O routines + */ +int +objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) +{ + struct panfs_shim_io_state *p; + + dprintk("%s: allocating io_state\n", __func__); + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + *outp = &p->ol_state; + return 0; +} + +/* + * Free an I/O state + */ +void +objio_free_io_state(struct objlayout_io_state *ol_state) +{ + struct panfs_shim_io_state *state = container_of(ol_state, + struct panfs_shim_io_state, ol_state); + int i; + + dprintk("%s: freeing io_state\n", __func__); + for (i = 0; i < state->ol_state.nr_pages; i++) + kunmap(state->ol_state.pages[i]); + + if (state->ucreds) + panfs_export_ops->ucreds_put(state->ucreds); + kfree(state->sg_list); + kfree(state); +} + +static int +panfs_shim_pages_to_sg( + struct panfs_shim_io_state *state, + struct page **pages, + unsigned int pgbase, + unsigned nr_pages, + size_t count) +{ + unsigned i, n; + pan_sg_entry_t *sg; + + dprintk("%s pgbase %u nr_pages %u count %d " + "pg0 %p flags 0x%x index %llu\n", + __func__, pgbase, nr_pages, (int)count, pages[0], + (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); + + sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); + if (sg == NULL) + return -ENOMEM; + + dprintk("%s sg_list %p pages %p pgbase %u nr_pages %u\n", + __func__, sg, pages, pgbase, nr_pages); + + for (i = 0; i < nr_pages; i++) { + sg[i].buffer = (char *)kmap(pages[i]) + pgbase; + n = PAGE_SIZE - pgbase; + pgbase = 0; + if (n > count) + n = count; + sg[i].chunk_size = n; + count -= n; + if (likely(count)) { + sg[i].next = &sg[i+1]; + } else { + /* we're done */ + sg[i].next = NULL; + break; + } + } + BUG_ON(count); + + state->sg_list = sg; + return 0; +} + +/* + * Callback function for async reads + */ +static void +panfs_shim_read_done( + void *arg1, + void *arg2, + pan_sam_read_res_t *res_p, + pan_status_t rc) +{ + 
struct panfs_shim_io_state *state = arg1; + ssize_t status; + + dprintk("%s: Begin\n", __func__); + if (!res_p) + res_p = &state->u.read.res; + if (rc == PAN_SUCCESS) + rc = res_p->result; + if (rc == PAN_SUCCESS) { + status = res_p->length; + WARN_ON(status < 0); + } else { + status = -panfs_export_ops->convert_rc(rc); + dprintk("%s: pan_sam_read rc %d: status %Zd\n", + __func__, rc, status); + } + dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); + objlayout_read_done(&state->ol_state, status, true); +} + +ssize_t +objio_read_pagelist(struct objlayout_io_state *ol_state) +{ + struct panfs_shim_io_state *state = container_of(ol_state, + struct panfs_shim_io_state, ol_state); + struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); + pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; + ssize_t status = 0; + pan_status_t rc = PAN_SUCCESS; + + dprintk("%s: Begin\n", __func__); + + status = panfs_shim_pages_to_sg(state, ol_state->pages, + ol_state->pgbase, ol_state->nr_pages, + ol_state->count); + if (unlikely(status)) + goto err; + + state->obj_sec.min_security = 0; + state->obj_sec.map_ccaps = mcs; + + rc = panfs_export_ops->ucreds_get(&state->ucreds); + if (unlikely(rc)) { + status = -EACCES; + goto err; + } + + state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; + state->u.read.args.offset = ol_state->offset; + rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, + &state->u.read.args, + &state->obj_sec, + state->sg_list, + state->ucreds, + ol_state->sync ? + NULL : panfs_shim_read_done, + state, NULL, + &state->u.read.res); + if (rc != PAN_ERR_IN_PROGRESS) + panfs_shim_read_done(state, NULL, &state->u.read.res, rc); + err: + dprintk("%s: Return %Zd\n", __func__, status); + return status; +} + +/* + * Callback function for async writes + */ +static void +panfs_shim_write_done( + void *arg1, + void *arg2, + pan_sam_write_res_t *res_p, + pan_status_t rc) +{ + struct panfs_shim_io_state *state = arg1; + ssize_t status; + + dprintk("%s: Begin\n", __func__); + if (!res_p) + res_p = &state->u.write.res; + if (rc == PAN_SUCCESS) + rc = res_p->result; + if (rc == PAN_SUCCESS) { +/* state->ol_state.committed = NFS_FILE_SYNC;*/ + state->ol_state.committed = NFS_UNSTABLE; + status = res_p->length; + WARN_ON(status < 0); + + objlayout_add_delta_space_used(&state->ol_state, + res_p->delta_capacity_used); + } else { + status = -panfs_export_ops->convert_rc(rc); + dprintk("%s: pan_sam_write rc %u: status %Zd\n", + __func__, rc, status); + } + dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); + objlayout_write_done(&state->ol_state, status, true); +} + +ssize_t +objio_write_pagelist(struct objlayout_io_state *ol_state, + bool stable /* unused, PanOSD writes are stable */) +{ + struct panfs_shim_io_state *state = container_of(ol_state, + struct panfs_shim_io_state, ol_state); + struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); + pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; + ssize_t status = 0; + pan_status_t rc = PAN_SUCCESS; + + dprintk("%s: Begin\n", __func__); + + status = panfs_shim_pages_to_sg(state, ol_state->pages, + ol_state->pgbase, ol_state->nr_pages, + ol_state->count); + if (unlikely(status)) + goto err; + + state->obj_sec.min_security = 0; + state->obj_sec.map_ccaps = mcs; + + rc = panfs_export_ops->ucreds_get(&state->ucreds); + if (unlikely(rc)) { + status = -EACCES; + goto err; + } + + state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; + state->u.write.args.offset = ol_state->offset; + rc = 
panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, + &state->u.write.args, + &state->obj_sec, + state->sg_list, + state->ucreds, + ol_state->sync ? + NULL : panfs_shim_write_done, + state, + NULL, + &state->u.write.res); + if (rc != PAN_ERR_IN_PROGRESS) + panfs_shim_write_done(state, NULL, &state->u.write.res, rc); + err: + dprintk("%s: Return %Zd\n", __func__, status); + return status; +} + +int +panfs_shim_register(struct panfs_export_operations *ops) +{ + if (panfs_export_ops) { + printk(KERN_INFO + "%s: panfs already registered (panfs ops %p)\n", + __func__, panfs_export_ops); + return -EINVAL; + } + + printk(KERN_INFO "%s: registering panfs ops %p\n", + __func__, ops); + + panfs_export_ops = ops; + return 0; +} +EXPORT_SYMBOL(panfs_shim_register); + +int +panfs_shim_unregister(void) +{ + if (!panfs_export_ops) { + printk(KERN_INFO "%s: panfs is not registered\n", __func__); + return -EINVAL; + } + + printk(KERN_INFO "%s: unregistering panfs ops %p\n", + __func__, panfs_export_ops); + + panfs_export_ops = NULL; + return 0; +} +EXPORT_SYMBOL(panfs_shim_unregister); + +/* + * Policy Operations + */ + +/* + * Return the stripe size for the specified file + */ +ssize_t +panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) +{ + ssize_t sz, maxsz = -1; + struct pnfs_layout_segment *lseg; + + dprintk("%s: Begin\n", __func__); + + list_for_each_entry(lseg, &pnfslay->segs, fi_list) { + int n; + struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); + struct pnfs_osd_layout *lo = + (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; + struct pnfs_osd_data_map *map = &lo->olo_map; + + n = map->odm_group_width; + if (n == 0) + n = map->odm_num_comps / (map->odm_mirror_cnt + 1); + + switch (map->odm_raid_algorithm) { + case PNFS_OSD_RAID_0: + break; + + case PNFS_OSD_RAID_4: + case PNFS_OSD_RAID_5: + n -= 1; + n *= 8; /* FIXME: until we have 2-D coalescing */ + break; + + case PNFS_OSD_RAID_PQ: + n -= 2; + break; + + default: + BUG_ON(1); + } + sz = map->odm_stripe_unit * n; + if (sz > maxsz) + maxsz = sz; + } + dprintk("%s: Return %Zd\n", __func__, maxsz); + return maxsz; +} + +#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) +#define PANLAYOUT_DEF_STRIPE_WIDTH 9 +#define PANLAYOUT_MAX_STRIPE_WIDTH 11 +#define PANLAYOUT_MAX_GATHER_STRIPES 8 + +/* + * Get the max [rw]size + */ +static ssize_t +panlayout_get_blocksize(void) +{ + ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * + PANLAYOUT_DEF_STRIPE_UNIT * + PANLAYOUT_MAX_GATHER_STRIPES; + dprintk("%s: Return %Zd\n", __func__, sz); + return sz; +} + +static struct layoutdriver_policy_operations panlayout_policy_operations = { +/* + * Don't gather across stripes, but rather gather (coalesce) up to + * the stripe size. 
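+ *
+ * With the defaults above, e.g., panlayout_get_blocksize() reports a
+ * max [rw]size of (PANLAYOUT_MAX_STRIPE_WIDTH - 1) *
+ * PANLAYOUT_DEF_STRIPE_UNIT * PANLAYOUT_MAX_GATHER_STRIPES =
+ * 10 * 64 KiB * 8 = 5 MiB.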
+ *
+ * FIXME: change interface to use merge_align, merge_count
+ */
+	.flags = PNFS_LAYOUTRET_ON_SETATTR,
+	.get_stripesize = panlayout_get_stripesize,
+	.get_blocksize = panlayout_get_blocksize,
+};
+
+#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS)
+
+static struct pnfs_layoutdriver_type panlayout_type = {
+	.id = PNFS_LAYOUT_PANOSD,
+	.name = "PNFS_LAYOUT_PANOSD",
+	.ld_io_ops = &objlayout_io_operations,
+	.ld_policy_ops = &panlayout_policy_operations,
+};
+
+MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs");
+MODULE_AUTHOR("Benny Halevy ");
+MODULE_LICENSE("GPL");
+
+static int __init
+panlayout_init(void)
+{
+	pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type);
+	printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n",
+	       __func__);
+	return 0;
+}
+
+static void __exit
+panlayout_exit(void)
+{
+	pnfs_unregister_layoutdriver(&panlayout_type);
+	printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n",
+	       __func__);
+}
+
+module_init(panlayout_init);
+module_exit(panlayout_exit);
diff -up linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h
--- linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h.orig	2010-09-30 10:17:08.736995000 -0400
+++ linux-2.6.34.noarch/fs/nfs/objlayout/panfs_shim.h	2010-09-30 10:17:08.738995000 -0400
@@ -0,0 +1,482 @@
+/*
+ * panfs_shim.h
+ *
+ * Data types and external function declarations for interfacing with
+ * the panfs (Panasas DirectFlow) I/O stack
+ *
+ * Copyright (C) 2007 Panasas Inc.
+ * All rights reserved.
+ *
+ * Benny Halevy
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the Panasas company nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * See the file COPYING included with this distribution for more details.
+ * + */ + +#ifndef _PANLAYOUT_PANFS_SHIM_H +#define _PANLAYOUT_PANFS_SHIM_H + +typedef s8 pan_int8_t; +typedef u8 pan_uint8_t; +typedef s16 pan_int16_t; +typedef u16 pan_uint16_t; +typedef s32 pan_int32_t; +typedef u32 pan_uint32_t; +typedef s64 pan_int64_t; +typedef u64 pan_uint64_t; + +/* + * from pan_base_types.h + */ +typedef pan_uint64_t pan_rpc_none_t; +typedef pan_uint32_t pan_rpc_arrdim_t; +typedef pan_uint32_t pan_status_t; +typedef pan_uint8_t pan_otw_t; +typedef pan_uint8_t pan_pad_t; + +typedef pan_uint32_t pan_timespec_sec_t; +typedef pan_uint32_t pan_timespec_nsec_t; + +typedef struct pan_timespec_s pan_timespec_t; +struct pan_timespec_s { + pan_timespec_sec_t ts_sec; + pan_timespec_nsec_t ts_nsec; +}; + +/* + * from pan_std_types.h + */ +typedef pan_uint32_t pan_size_t; +typedef int pan_bool_t; + +/* + * from pan_common_error.h + */ +#define PAN_SUCCESS ((pan_status_t)0) +#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) + +/* + * from pan_sg.h + */ +typedef struct pan_sg_entry_s pan_sg_entry_t; +struct pan_sg_entry_s { + void *buffer; /* pointer to memory */ + pan_uint32_t chunk_size; /* size of each chunk (bytes) */ + pan_sg_entry_t *next; +}; + +/* + * from pan_storage.h + */ +typedef pan_uint64_t pan_stor_dev_id_t; +typedef pan_uint32_t pan_stor_obj_grp_id_t; +typedef pan_uint64_t pan_stor_obj_uniq_t; +typedef pan_uint32_t pan_stor_action_t; +typedef pan_uint8_t pan_stor_cap_key_t[20]; + +typedef pan_uint8_t pan_stor_key_type_t; +typedef pan_uint64_t pan_stor_len_t; +typedef pan_int64_t pan_stor_delta_len_t; +typedef pan_uint64_t pan_stor_offset_t; +typedef pan_uint16_t pan_stor_op_t; + +typedef pan_uint16_t pan_stor_sec_level_t; + +struct pan_stor_obj_id_s { + pan_stor_dev_id_t dev_id; + pan_stor_obj_uniq_t obj_id; + pan_stor_obj_grp_id_t grp_id; +}; + +typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; + +#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) +#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) +#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) +#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) +#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) +#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) +#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) +#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) + +/* + * from pan_aggregation_map.h + */ +typedef pan_uint8_t pan_agg_type_t; +typedef pan_uint64_t pan_agg_map_version_t; +typedef pan_uint8_t pan_agg_obj_state_t; +typedef pan_uint8_t pan_agg_comp_state_t; +typedef pan_uint8_t pan_agg_comp_flag_t; + +#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) +#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) +#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) +#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) +#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) +#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) +#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) +#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) +#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) +#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) +#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) +#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) +#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) +#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) +#define PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) +#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) + +struct 
pan_aggregation_map_s { + pan_agg_map_version_t version; + pan_agg_obj_state_t avail_state; + pan_stor_obj_id_t obj_id; +}; + +typedef struct pan_aggregation_map_s pan_aggregation_map_t; + +struct pan_agg_comp_obj_s { + pan_stor_dev_id_t dev_id; + pan_agg_comp_state_t avail_state; + pan_agg_comp_flag_t comp_flags; +}; + +typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; + +struct pan_agg_simple_header_s { + pan_uint8_t unused; +}; + +typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; + +struct pan_agg_raid1_header_s { + pan_uint16_t num_comps; +}; + +typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; + +struct pan_agg_raid0_header_s { + pan_uint16_t num_comps; + pan_uint32_t stripe_unit; +}; + +typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; + +struct pan_agg_raid5_left_header_s { + pan_uint16_t num_comps; + pan_uint32_t stripe_unit0; + pan_uint32_t stripe_unit1; + pan_uint32_t stripe_unit2; +}; + +typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; + +typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; + +struct pan_agg_grp_raid5_left_header_s { + pan_uint16_t num_comps; + pan_uint32_t stripe_unit; + pan_uint16_t rg_width; + pan_uint16_t rg_depth; + pan_uint8_t group_layout_policy; +}; + +#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) +#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) + +#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) +#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) +#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) +#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03) +#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) +#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) +#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) +#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) + +struct pan_agg_layout_hdr_s { + pan_agg_type_t type; + pan_pad_t pad[3]; + union { + pan_uint64_t null; + pan_agg_simple_header_t simple; + pan_agg_raid1_header_t raid1; + pan_agg_raid0_header_t raid0; + pan_agg_raid5_left_header_t raid5_left; + pan_agg_grp_raid5_left_header_t grp_raid5_left; + } hdr; +}; + +typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; + +struct pan_agg_comp_obj_a_s { + pan_rpc_arrdim_t size; + pan_agg_comp_obj_t *data; +}; +typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; + +struct pan_agg_full_map_s { + pan_aggregation_map_t map_hdr; + pan_agg_layout_hdr_t layout_hdr; + pan_agg_comp_obj_a components; +}; + +typedef struct pan_agg_full_map_s pan_agg_full_map_t; + +/* + * from pan_obsd_rpc_types.h + */ +typedef pan_uint8_t pan_obsd_security_key_a[16]; + +typedef pan_uint8_t pan_obsd_capability_key_a[20]; + +typedef pan_uint8_t pan_obsd_key_holder_id_t; + +#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) +#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) + +struct pan_obsd_key_holder_s { + pan_obsd_key_holder_id_t select; + pan_pad_t pad[3]; + union { + pan_obsd_security_key_a basis_key; + pan_obsd_capability_key_a cap_key; + } key; +}; + +typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; + +/* + * from pan_sm_sec.h + */ +typedef pan_uint8_t pan_sm_sec_type_t; +typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; + +struct pan_obsd_capability_generic_otw_t_s { + pan_rpc_arrdim_t size; + pan_uint8_t *data; +}; +typedef struct pan_obsd_capability_generic_otw_t_s + pan_obsd_capability_generic_otw_t; + +struct pan_sm_sec_obsd_s { + pan_obsd_key_holder_t key; + pan_obsd_capability_generic_otw_t cap_otw; + 
pan_sm_sec_otw_allo_mode_t allo_mode; +}; + +typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; + +struct pan_sm_sec_s { + pan_sm_sec_type_t type; + pan_pad_t pad[3]; + union { + pan_rpc_none_t none; + pan_sm_sec_obsd_t obsd; + } variant; +}; + +typedef struct pan_sm_sec_s pan_sm_sec_t; + +struct pan_sm_sec_a_s { + pan_rpc_arrdim_t size; + pan_sm_sec_t *data; +}; +typedef struct pan_sm_sec_a_s pan_sm_sec_a; +typedef pan_otw_t *pan_sm_sec_otw_t; + +/* + * from pan_sm_types.h + */ +typedef pan_uint64_t pan_sm_cap_handle_t; + +struct pan_sm_map_cap_s { + pan_agg_full_map_t full_map; + pan_stor_offset_t offset; + pan_stor_len_t length; + pan_sm_sec_a secs; + pan_sm_cap_handle_t handle; + pan_timespec_t expiration_time; + pan_stor_action_t action_mask; + pan_uint32_t flags; +}; + +typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; + +/* + * from pan_sm_ops.h + */ +typedef pan_rpc_none_t pan_sm_cache_ptr_t; + +/* + * from pan_sam_api.h + */ +typedef pan_uint32_t pan_sam_access_flags_t; + +typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; +struct pan_sam_dev_error_s { + pan_stor_dev_id_t dev_id; + pan_stor_op_t stor_op; + pan_status_t error; +}; + +typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; +struct pan_sam_ext_status_s { + pan_uint32_t available; + pan_uint32_t size; + pan_sam_dev_error_t *errors; +}; + +enum pan_sam_rpc_sec_sel_e { + PAN_SAM_RPC_SEC_DEFAULT, + PAN_SAM_RPC_SEC_ATLEAST, + PAN_SAM_RPC_SEC_EXACTLY +}; +typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; + +typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; +struct pan_sam_obj_sec_s { + pan_stor_sec_level_t min_security; + pan_sm_map_cap_t *map_ccaps; +}; + +typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; +struct pan_sam_rpc_sec_s { + pan_sam_rpc_sec_sel_t selector; +}; + +typedef struct pan_sam_read_args_s pan_sam_read_args_t; +struct pan_sam_read_args_s { + pan_stor_obj_id_t obj_id; + pan_sm_cache_ptr_t obj_ent; + void *return_attr; + void *checksum; + pan_stor_offset_t offset; + pan_uint16_t sm_options; + void *callout; + void *callout_arg; +}; + +typedef struct pan_sam_read_res_s pan_sam_read_res_t; +struct pan_sam_read_res_s { + pan_status_t result; + pan_sam_ext_status_t ext_status; + pan_stor_len_t length; + void *attr; + void *checksum; +}; + +typedef void (*pan_sam_read_cb_t)( + void *user_arg1, + void *user_arg2, + pan_sam_read_res_t *res_p, + pan_status_t status); + +#define PAN_SAM_ACCESS_NONE 0x0000 +#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 + +typedef struct pan_sam_write_args_s pan_sam_write_args_t; +struct pan_sam_write_args_s { + pan_stor_obj_id_t obj_id; + pan_sm_cache_ptr_t obj_ent; + pan_stor_offset_t offset; + void *attr; + void *return_attr; +}; + +typedef struct pan_sam_write_res_s pan_sam_write_res_t; +struct pan_sam_write_res_s { + pan_status_t result; + pan_sam_ext_status_t ext_status; + pan_stor_len_t length; + pan_stor_delta_len_t delta_capacity_used; + pan_bool_t parity_dirty; + void *attr; +}; + +typedef void (*pan_sam_write_cb_t)( + void *user_arg1, + void *user_arg2, + pan_sam_write_res_t *res_p, + pan_status_t status); + +/* + * from pan_mgr_types.h + */ +#define PAN_MGR_ID_TYPE_SHIFT 56 +#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) +#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) + +typedef pan_uint16_t pan_mgr_type_t; +typedef pan_uint64_t pan_mgr_id_t; + +#define PAN_MGR_SM ((pan_mgr_type_t) 2U) +#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) + +/* + * from pan_mgr_types_c.h + */ +#define 
pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ + pan_mgr_id_t _id1, _id2; \ +\ + _id1 = (_mgr_type_); \ + _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ + _id1 &= PAN_MGR_ID_TYPE_MASK; \ + _id2 = (_mgr_uniq_); \ + _id2 &= PAN_MGR_ID_UNIQ_MASK; \ + _id1 |= _id2; \ + *(_mgr_id_p_) = _id1; \ +} + +/* + * from pan_storage_c.h + */ +#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ + ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ + == PAN_MGR_OBSD) + +/* + * pnfs_shim internal definitions + */ + +struct panfs_shim_io_state { + struct objlayout_io_state ol_state; + + pan_sg_entry_t *sg_list; + pan_sam_obj_sec_t obj_sec; + void *ucreds; + union { + struct { + pan_sam_read_args_t args; + pan_sam_read_res_t res; + } read; + struct { + pan_sam_write_args_t args; + pan_sam_write_res_t res; + } write; + } u; +}; + +#endif /* _PANLAYOUT_PANFS_SHIM_H */ diff -up linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c --- linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-09-30 10:17:08.741996000 -0400 +++ linux-2.6.34.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-09-30 10:17:08.743002000 -0400 @@ -0,0 +1,435 @@ +/* + * pnfs_osd_xdr.c + * + * Object-Based pNFS Layout XDR layer + * + * Copyright (C) 2007-2009 Panasas Inc. + * All rights reserved. + * + * Benny Halevy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +/* + * The following implementation is based on these Internet Drafts: + * + * draft-ietf-nfsv4-minorversion-21 + * draft-ietf-nfsv4-pnfs-obj-12 + */ + +/* + * struct pnfs_osd_objid { + * struct pnfs_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + * }; + */ +static inline u32 * +pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) +{ + COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); + READ64(objid->oid_partition_id); + READ64(objid->oid_object_id); + return p; +} + +static inline u32 * +pnfs_osd_xdr_decode_opaque_cred(u32 *p, + struct pnfs_osd_opaque_cred *opaque_cred) +{ + READ32(opaque_cred->cred_len); + COPYMEM(opaque_cred->cred, opaque_cred->cred_len); + return p; +} + +/* + * struct pnfs_osd_object_cred { + * struct pnfs_osd_objid oc_object_id; + * u32 oc_osd_version; + * u32 oc_cap_key_sec; + * struct pnfs_osd_opaque_cred oc_cap_key + * struct pnfs_osd_opaque_cred oc_cap; + * }; + */ +static inline u32 * +pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, + u8 **credp) +{ + u8 *cred; + + p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); + READ32(comp->oc_osd_version); + READ32(comp->oc_cap_key_sec); + + cred = *credp; + comp->oc_cap_key.cred = cred; + p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); + cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); + comp->oc_cap.cred = cred; + p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); + cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); + *credp = cred; + + return p; +} + +/* + * struct pnfs_osd_data_map { + * u32 odm_num_comps; + * u64 odm_stripe_unit; + * u32 odm_group_width; + * u32 odm_group_depth; + * u32 odm_mirror_cnt; + * u32 odm_raid_algorithm; + * }; + */ +static inline u32 * +pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) +{ + READ32(data_map->odm_num_comps); + READ64(data_map->odm_stripe_unit); + READ32(data_map->odm_group_width); + READ32(data_map->odm_group_depth); + READ32(data_map->odm_mirror_cnt); + READ32(data_map->odm_raid_algorithm); + dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " + "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", + __func__, + data_map->odm_num_comps, + (unsigned long long)data_map->odm_stripe_unit, + data_map->odm_group_width, + data_map->odm_group_depth, + data_map->odm_mirror_cnt, + data_map->odm_raid_algorithm); + return p; +} + +struct pnfs_osd_layout * +pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) +{ + int i; + u32 *start = p; + struct pnfs_osd_object_cred *comp; + u8 *cred; + + p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); + READ32(layout->olo_comps_index); + READ32(layout->olo_num_comps); + layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); + comp = layout->olo_comps; + cred = (u8 *)(comp + layout->olo_num_comps); + dprintk("%s: comps_index=%u num_comps=%u\n", + __func__, layout->olo_comps_index, layout->olo_num_comps); + for (i = 0; i < layout->olo_num_comps; i++) { + p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); + dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " + "key_len=%u cap_len=%u\n", + __func__, i, + _DEVID_LO(&comp->oc_object_id.oid_device_id), + _DEVID_HI(&comp->oc_object_id.oid_device_id), + comp->oc_object_id.oid_partition_id, + comp->oc_object_id.oid_object_id, + comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); + comp++; + } + dprintk("%s: xdr_size=%Zd 
end=%p in_core_size=%Zd\n", __func__,
+		(char *)p - (char *)start, cred, (char *)cred - (char *)layout);
+	return layout;
+}
+
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, most
+ * of the actual fields are left inside the rpc buffer and are only
+ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ * should not be freed while the returned information is in use.
+ */
+
+u32 *__xdr_read_calc_nfs4_string(
+	u32 *p, struct nfs4_string *str, u8 **freespace)
+{
+	u32 len;
+	char *data;
+	bool need_copy;
+
+	READ32(len);
+	data = (char *)p;
+
+	if (data[len]) {	/* Not null terminated; we'll need extra space */
+		data = *freespace;
+		*freespace += len + 1;
+		need_copy = true;
+	} else {
+		need_copy = false;
+	}
+
+	if (str) {
+		str->len = len;
+		str->data = data;
+		if (need_copy) {
+			memcpy(data, p, len);
+			data[len] = 0;
+		}
+	}
+
+	p += XDR_QUADLEN(len);
+	return p;
+}
+
+u32 *__xdr_read_calc_u8_opaque(
+	u32 *p, struct nfs4_string *str)
+{
+	u32 len;
+
+	READ32(len);
+
+	if (str) {
+		str->len = len;
+		str->data = (char *)p;
+	}
+
+	p += XDR_QUADLEN(len);
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetid {
+ *	u32			oti_type;
+ *	struct nfs4_string	oti_scsi_device_id;
+ * };
+ */
+u32 *__xdr_read_calc_targetid(
+	u32 *p, struct pnfs_osd_targetid *targetid, u8 **freespace)
+{
+	u32 oti_type;
+
+	READ32(oti_type);
+	if (targetid)
+		targetid->oti_type = oti_type;
+
+	switch (oti_type) {
+	case OBJ_TARGET_SCSI_NAME:
+	case OBJ_TARGET_SCSI_DEVICE_ID:
+		p = __xdr_read_calc_u8_opaque(p,
+			targetid ? &targetid->oti_scsi_device_id : NULL);
+	}
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_net_addr {
+ *	struct nfs4_string	r_netid;
+ *	struct nfs4_string	r_addr;
+ * };
+ */
+u32 *__xdr_read_calc_net_addr(
+	u32 *p, struct pnfs_osd_net_addr *netaddr, u8 **freespace)
+{
+	p = __xdr_read_calc_nfs4_string(p,
+			netaddr ? &netaddr->r_netid : NULL,
+			freespace);
+
+	p = __xdr_read_calc_nfs4_string(p,
+			netaddr ? &netaddr->r_addr : NULL,
+			freespace);
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_targetaddr {
+ *	u32				ota_available;
+ *	struct pnfs_osd_net_addr	ota_netaddr;
+ * };
+ */
+u32 *__xdr_read_calc_targetaddr(
+	u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace)
+{
+	u32 ota_available;
+
+	READ32(ota_available);
+	if (targetaddr)
+		targetaddr->ota_available = ota_available;
+
+	if (ota_available) {
+		p = __xdr_read_calc_net_addr(p,
+			targetaddr ? &targetaddr->ota_netaddr : NULL,
+			freespace);
+	}
+
+	return p;
+}
+
+/*
+ * struct pnfs_osd_deviceaddr {
+ *	struct pnfs_osd_targetid	oda_targetid;
+ *	struct pnfs_osd_targetaddr	oda_targetaddr;
+ *	u8				oda_lun[8];
+ *	struct nfs4_string		oda_systemid;
+ *	struct pnfs_osd_object_cred	oda_root_obj_cred;
+ *	struct nfs4_string		oda_osdname;
+ * };
+ */
+u32 *__xdr_read_calc_deviceaddr(
+	u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace)
+{
+	p = __xdr_read_calc_targetid(p,
+			deviceaddr ? &deviceaddr->oda_targetid : NULL,
+			freespace);
+
+	p = __xdr_read_calc_targetaddr(p,
+			deviceaddr ? &deviceaddr->oda_targetaddr : NULL,
+			freespace);
+
+	if (deviceaddr)
+		COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun));
+	else
+		p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun));
+
+	p = __xdr_read_calc_u8_opaque(p,
+			deviceaddr ?
&deviceaddr->oda_systemid : NULL); + + if (deviceaddr) { + p = pnfs_osd_xdr_decode_object_cred(p, + &deviceaddr->oda_root_obj_cred, freespace); + } else { + *freespace += pnfs_osd_object_cred_incore_sz(p); + p += pnfs_osd_object_cred_xdr_sz(p); + } + + p = __xdr_read_calc_u8_opaque(p, + deviceaddr ? &deviceaddr->oda_osdname : NULL); + + return p; +} + +size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) +{ + u8 *null_freespace = NULL; + size_t sz; + + __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); + sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; + + return sz; +} + +void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) +{ + u8 *freespace = (u8 *)(deviceaddr + 1); + + __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); +} + +/* + * struct pnfs_osd_layoutupdate { + * u32 dsu_valid; + * s64 dsu_delta; + * u32 olu_ioerr_flag; + * }; + */ +int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou) +{ + __be32 *p = xdr_reserve_space(xdr, 16); + + if (!p) + return -E2BIG; + + *p++ = cpu_to_be32(lou->dsu_valid); + if (lou->dsu_valid) + p = xdr_encode_hyper(p, lou->dsu_delta); + *p++ = cpu_to_be32(lou->olu_ioerr_flag); + return 0; +} + +/* + * struct pnfs_osd_objid { + * struct pnfs_deviceid oid_device_id; + * u64 oid_partition_id; + * u64 oid_object_id; + */ +static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, + struct pnfs_osd_objid *object_id) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 32); + if (!p) + return -E2BIG; + + p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, + sizeof(object_id->oid_device_id.data)); + p = xdr_encode_hyper(p, object_id->oid_partition_id); + p = xdr_encode_hyper(p, object_id->oid_object_id); + + return 0; +} + +/* + * struct pnfs_osd_ioerr { + * struct pnfs_osd_objid oer_component; + * u64 oer_comp_offset; + * u64 oer_comp_length; + * u32 oer_iswrite; + * u32 oer_errno; + * }; + */ +int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, + struct pnfs_osd_ioerr *ioerr) +{ + __be32 *p; + int ret; + + ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); + if (ret) + return ret; + + p = xdr_reserve_space(xdr, 24); + if (!p) + return -E2BIG; + + p = xdr_encode_hyper(p, ioerr->oer_comp_offset); + p = xdr_encode_hyper(p, ioerr->oer_comp_length); + *p++ = cpu_to_be32(ioerr->oer_iswrite); + *p = cpu_to_be32(ioerr->oer_errno); + + return 0; +} diff -up linux-2.6.34.noarch/fs/nfs/pagelist.c.orig linux-2.6.34.noarch/fs/nfs/pagelist.c --- linux-2.6.34.noarch/fs/nfs/pagelist.c.orig 2010-09-30 10:15:17.899715000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pagelist.c 2010-09-30 10:17:08.748995000 -0400 @@ -20,6 +20,7 @@ #include #include "internal.h" +#include "pnfs.h" static struct kmem_cache *nfs_page_cachep; @@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) struct nfs_page * nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct page *page, - unsigned int offset, unsigned int count) + unsigned int offset, unsigned int count, + struct pnfs_layout_segment *lseg) { struct nfs_page *req; @@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); + req->wb_lock_context = nfs_get_lock_context(ctx); kref_init(&req->wb_kref); + req->wb_lseg = lseg; + if (lseg) + get_lseg(lseg); return req; } @@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; + struct 
nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context = NULL; + } if (ctx != NULL) { put_nfs_open_context(ctx); req->wb_context = NULL; } + if (req->wb_lseg != NULL) { + put_lseg(req->wb_lseg); + req->wb_lseg = NULL; + } } - /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release @@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d * Return 'true' if this is the case, else return 'false'. */ static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req) + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { if (req->wb_context->cred != prev->wb_context->cred) return 0; - if (req->wb_context->lockowner != prev->wb_context->lockowner) + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) return 0; if (req->wb_context->state != prev->wb_context->state) return 0; @@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str return 0; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return 0; + if (req->wb_lseg != prev->wb_lseg) + return 0; +#ifdef CONFIG_NFS_V4_1 + if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) + return 0; +#endif /* CONFIG_NFS_V4_1 */ return 1; } @@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str if (newlen > desc->pg_bsize) return 0; prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req)) + if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else desc->pg_base = req->wb_pgbase; @@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs * @idx_start: lower bound of page->index to scan * @npages: idx_start + npages sets the upper bound to scan. * @tag: tag to scan for + * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver * * Moves elements from one of the inode request lists. * If the number of requests is set to 0, the entire address_space @@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs */ int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, - unsigned int npages, int tag) + unsigned int npages, int tag, int *use_pnfs) { struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; struct nfs_page *req; @@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, tag); nfs_list_add_request(req, dst); + if (req->wb_lseg) + *use_pnfs = 1; res++; if (res == INT_MAX) goto out; diff -up linux-2.6.34.noarch/fs/nfs/pnfs.c.orig linux-2.6.34.noarch/fs/nfs/pnfs.c --- linux-2.6.34.noarch/fs/nfs/pnfs.c.orig 2010-09-30 10:17:08.752997000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.c 2010-09-30 10:17:08.754995000 -0400 @@ -0,0 +1,2039 @@ +/* + * linux/fs/nfs/pnfs.c + * + * pNFS functions to call and manage layout drivers. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Dean Hildebrand + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "nfs4_fs.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +#define MIN_POOL_LC (4) + +static int pnfs_initialized; + +static void pnfs_free_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range); +static inline void get_layout(struct pnfs_layout_hdr *lo); + +/* Locking: + * + * pnfs_spinlock: + * protects pnfs_modules_tbl. + */ +static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock); + +/* + * pnfs_modules_tbl holds all pnfs modules + */ +static struct list_head pnfs_modules_tbl; +static struct kmem_cache *pnfs_cachep; +static mempool_t *pnfs_layoutcommit_mempool; + +static inline struct nfs4_layoutcommit_data *pnfs_layoutcommit_alloc(void) +{ + struct nfs4_layoutcommit_data *p = + mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS); + if (p) + memset(p, 0, sizeof(*p)); + + return p; +} + +void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *p) +{ + mempool_free(p, pnfs_layoutcommit_mempool); +} + +/* + * struct pnfs_module - One per pNFS device module. + */ +struct pnfs_module { + struct pnfs_layoutdriver_type *pnfs_ld_type; + struct list_head pnfs_tblid; +}; + +int +pnfs_initialize(void) +{ + INIT_LIST_HEAD(&pnfs_modules_tbl); + + pnfs_cachep = kmem_cache_create("nfs4_layoutcommit_data", + sizeof(struct nfs4_layoutcommit_data), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (pnfs_cachep == NULL) + return -ENOMEM; + + pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC, + mempool_alloc_slab, + mempool_free_slab, + pnfs_cachep); + if (pnfs_layoutcommit_mempool == NULL) { + kmem_cache_destroy(pnfs_cachep); + return -ENOMEM; + } + + pnfs_initialized = 1; + return 0; +} + +void pnfs_uninitialize(void) +{ + mempool_destroy(pnfs_layoutcommit_mempool); + kmem_cache_destroy(pnfs_cachep); +} + +/* search pnfs_modules_tbl for right pnfs module */ +static int +find_pnfs(u32 id, struct pnfs_module **module) { + struct pnfs_module *local = NULL; + + dprintk("PNFS: %s: Searching for %u\n", __func__, id); + list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) { + if (local->pnfs_ld_type->id == id) { + *module = local; + return(1); + } + } + return 0; +} + +/* Set cred to indicate we require a layoutcommit + * If we don't even have a layout, we don't need to commit it. 
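+ * + * The rpccred reference taken here is not dropped in this function; it is + * consumed by the next layoutcommit (see pnfs_layoutcommit_inode() below).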
+ */ +void +pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx) +{ + dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx); + spin_lock(&nfsi->vfs_inode.i_lock); + if (has_layout(nfsi) && + !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state)) { + nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred); + __set_bit(NFS_INO_LAYOUTCOMMIT, + &nfsi->layout->state); + nfsi->change_attr++; + spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s: Set layoutcommit\n", __func__); + return; + } + spin_unlock(&nfsi->vfs_inode.i_lock); +} + +/* Update last_write_offset for layoutcommit. + * TODO: We should only use committed extents, but the current nfs + * implementation does not calculate the written range in nfs_commit_done. + * We therefore update this field in writeback_done. + */ +void +pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent) +{ + loff_t end_pos; + + spin_lock(&nfsi->vfs_inode.i_lock); + if (offset < nfsi->layout->write_begin_pos) + nfsi->layout->write_begin_pos = offset; + end_pos = offset + extent - 1; /* I'm being inclusive */ + if (end_pos > nfsi->layout->write_end_pos) + nfsi->layout->write_end_pos = end_pos; + dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n", + __func__, + (unsigned long) extent, + (unsigned long) offset, + (unsigned long) nfsi->layout->write_begin_pos, + (unsigned long) nfsi->layout->write_end_pos); + spin_unlock(&nfsi->vfs_inode.i_lock); +} + +/* Uninitialize a mountpoint in a layout driver */ +void +unmount_pnfs_layoutdriver(struct nfs_server *nfss) +{ + if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint)) + nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss); +} + +/* + * Set the server pnfs module to the first registered pnfs_type. + * Only one pNFS layout driver is supported. + */ +void +set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, + u32 id) +{ + struct pnfs_module *mod = NULL; + + if (server->pnfs_curr_ld) + return; + + if (!find_pnfs(id, &mod)) { + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + find_pnfs(id, &mod); + } + + if (!mod) { + dprintk("%s: No pNFS module found for %u. ", __func__, id); + goto out_err; + } + + server->pnfs_curr_ld = mod->pnfs_ld_type; + if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint( + server, mntfh)) { + printk(KERN_ERR "%s: Error initializing mount point " + "for layout driver %u. ", __func__, id); + goto out_err; + } + + dprintk("%s: pNFS module for %u set\n", __func__, id); + return; + +out_err: + dprintk("Using NFSv4 I/O\n"); + server->pnfs_curr_ld = NULL; +} + +/* Allow I/O module to set its functions structure */ +struct pnfs_client_operations* +pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + struct pnfs_module *pnfs_mod; + struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops; + + if (!pnfs_initialized) { + printk(KERN_ERR "%s Registration failure. 
" + "pNFS not initialized.\n", __func__); + return NULL; + } + + if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_layout and free_layout.\n", __func__); + return NULL; + } + + if (!io_ops->alloc_lseg || !io_ops->free_lseg) { + printk(KERN_ERR "%s Layout driver must provide " + "alloc_lseg and free_lseg.\n", __func__); + return NULL; + } + + if (!io_ops->read_pagelist || !io_ops->write_pagelist || + !io_ops->commit) { + printk(KERN_ERR "%s Layout driver must provide " + "read_pagelist, write_pagelist, and commit.\n", + __func__); + return NULL; + } + + pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); + if (pnfs_mod != NULL) { + dprintk("%s Registering id:%u name:%s\n", + __func__, + ld_type->id, + ld_type->name); + pnfs_mod->pnfs_ld_type = ld_type; + INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); + + spin_lock(&pnfs_spinlock); + list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); + spin_unlock(&pnfs_spinlock); + } + + return &pnfs_ops; +} + +/* Allow I/O module to set its functions structure */ +void +pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) +{ + struct pnfs_module *pnfs_mod; + + if (find_pnfs(ld_type->id, &pnfs_mod)) { + dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); + spin_lock(&pnfs_spinlock); + list_del(&pnfs_mod->pnfs_tblid); + spin_unlock(&pnfs_spinlock); + kfree(pnfs_mod); + } +} + +/* + * pNFS client layout cache + */ +#if defined(CONFIG_SMP) +#define BUG_ON_UNLOCKED_INO(ino) \ + BUG_ON(!spin_is_locked(&ino->i_lock)) +#define BUG_ON_UNLOCKED_LO(lo) \ + BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) +#else /* CONFIG_SMP */ +#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) +#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) +#endif /* CONFIG_SMP */ + +static inline void +get_layout(struct pnfs_layout_hdr *lo) +{ + BUG_ON_UNLOCKED_LO(lo); + lo->refcount++; +} + +static inline void +put_layout_locked(struct pnfs_layout_hdr *lo) +{ + BUG_ON_UNLOCKED_LO(lo); + BUG_ON(lo->refcount <= 0); + + lo->refcount--; + if (!lo->refcount) { + struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); + struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); + + dprintk("%s: freeing layout cache %p\n", __func__, lo); + WARN_ON(!list_empty(&lo->layouts)); + io_ops->free_layout(lo); + nfsi->layout = NULL; + } +} + +void +put_layout(struct inode *inode) +{ + spin_lock(&inode->i_lock); + put_layout_locked(NFS_I(inode)->layout); + spin_unlock(&inode->i_lock); + +} + +void +pnfs_layout_release(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); + + spin_lock(&nfsi->vfs_inode.i_lock); + if (range) + pnfs_free_layout(lo, range); + /* + * Matched in _pnfs_update_layout for layoutget + * and by get_layout in _pnfs_return_layout for layoutreturn + */ + put_layout_locked(lo); + spin_unlock(&nfsi->vfs_inode.i_lock); + wake_up_all(&nfsi->lo_waitq); +} + +void +pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + spin_lock(&nfsi->vfs_inode.i_lock); + lo = nfsi->layout; + if (lo) { + pnfs_free_layout(lo, &range); + WARN_ON(!list_empty(&nfsi->layout->segs)); + WARN_ON(!list_empty(&nfsi->layout->layouts)); + + if (nfsi->layout->refcount != 1) + printk(KERN_WARNING "%s: layout refcount not=1 %d\n", + __func__, nfsi->layout->refcount); + WARN_ON(nfsi->layout->refcount != 1); + + /* Matched by refcount set to 1 in alloc_init_layout 
*/ + put_layout_locked(lo); + } + spin_unlock(&nfsi->vfs_inode.i_lock); +} + +/* + * Called by the state manager to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + struct pnfs_layout_hdr *lo; + + while (!list_empty(&clp->cl_layouts)) { + lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_hdr, + layouts); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->inode->i_ino); + pnfs_destroy_layout(NFS_I(lo->inode)); + } +} + +static inline void +init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) +{ + INIT_LIST_HEAD(&lseg->fi_list); + kref_init(&lseg->kref); + lseg->valid = true; + lseg->layout = lo; +} + +static void +destroy_lseg(struct kref *kref) +{ + struct pnfs_layout_segment *lseg = + container_of(kref, struct pnfs_layout_segment, kref); + + dprintk("--> %s\n", __func__); + /* Matched by get_layout in pnfs_insert_layout */ + put_layout_locked(lseg->layout); + PNFS_LD_IO_OPS(lseg->layout)->free_lseg(lseg); +} + +static void +put_lseg_locked(struct pnfs_layout_segment *lseg) +{ + bool do_wake_up; + struct nfs_inode *nfsi; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->kref.refcount), lseg->valid); + do_wake_up = !lseg->valid; + nfsi = PNFS_NFS_INODE(lseg->layout); + kref_put(&lseg->kref, destroy_lseg); + if (do_wake_up) + wake_up(&nfsi->lo_waitq); +} + +void +put_lseg(struct pnfs_layout_segment *lseg) +{ + bool do_wake_up; + struct nfs_inode *nfsi; + + if (!lseg) + return; + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->kref.refcount), lseg->valid); + do_wake_up = !lseg->valid; + nfsi = PNFS_NFS_INODE(lseg->layout); + spin_lock(&nfsi->vfs_inode.i_lock); + kref_put(&lseg->kref, destroy_lseg); + spin_unlock(&nfsi->vfs_inode.i_lock); + if (do_wake_up) + wake_up(&nfsi->lo_waitq); +} +EXPORT_SYMBOL(put_lseg); + +void get_lseg(struct pnfs_layout_segment *lseg) +{ + kref_get(&lseg->kref); +} +EXPORT_SYMBOL(get_lseg); + +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1 : NFS4_MAX_UINT64; +} + +/* + * is l2 fully contained in l1? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_contained(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (start1 <= start2) && (end1 >= end2); +} + +/* + * are l1 and l2 intersecting? 
+ * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline int +lo_seg_intersecting(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + u64 start1 = l1->offset; + u64 end1 = end_offset(start1, l1->length); + u64 start2 = l2->offset; + u64 end2 = end_offset(start2, l2->length); + + return (end1 == NFS4_MAX_UINT64 || end1 > start2) && + (end2 == NFS4_MAX_UINT64 || end2 > start1); +} + +void +pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid) +{ + write_seqlock(&lo->seqlock); + memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); + write_sequnlock(&lo->seqlock); +} + +void +pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo) +{ + int seq; + + dprintk("--> %s\n", __func__); + + do { + seq = read_seqbegin(&lo->seqlock); + memcpy(dst->u.data, lo->stateid.u.data, + sizeof(lo->stateid.u.data)); + } while (read_seqretry(&lo->seqlock, seq)); + + dprintk("<-- %s\n", __func__); +} + +static void +pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, + struct nfs4_state *state) +{ + int seq; + + dprintk("--> %s\n", __func__); + + write_seqlock(&lo->seqlock); + if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) + do { + seq = read_seqbegin(&state->seqlock); + memcpy(lo->stateid.u.data, state->stateid.u.data, + sizeof(state->stateid.u.data)); + } while (read_seqretry(&state->seqlock, seq)); + write_sequnlock(&lo->seqlock); + dprintk("<-- %s\n", __func__); +} + +/* +* Get layout from server. +* for now, assume that whole file layouts are requested. +* arg->offset: 0 +* arg->length: all ones +*/ +static int +send_layoutget(struct inode *ino, + struct nfs_open_context *ctx, + struct pnfs_layout_range *range, + struct pnfs_layout_segment **lsegpp, + struct pnfs_layout_hdr *lo) +{ + int status; + struct nfs_server *server = NFS_SERVER(ino); + struct nfs4_layoutget *lgp; + + dprintk("--> %s\n", __func__); + + lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); + if (lgp == NULL) { + pnfs_layout_release(lo, NULL); + return -ENOMEM; + } + lgp->args.minlength = NFS4_MAX_UINT64; + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + lgp->args.range.iomode = range->iomode; + lgp->args.range.offset = 0; + lgp->args.range.length = NFS4_MAX_UINT64; + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->lsegpp = lsegpp; + + if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { + struct nfs_open_context *oldctx = ctx; + + if (!oldctx) { + ctx = nfs_find_open_context(ino, NULL, + (range->iomode == IOMODE_READ) ? 
+ FMODE_READ: FMODE_WRITE); + BUG_ON(!ctx); + } + /* Set the layout stateid from the open stateid */ + pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); + if (!oldctx) + put_nfs_open_context(ctx); + } + + /* Retrieve layout information from server */ + status = nfs4_proc_layoutget(lgp); + + dprintk("<-- %s status %d\n", __func__, status); + return status; +} + +/* + * iomode matching rules: + * range lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW false + */ +static inline int +should_free_lseg(struct pnfs_layout_segment *lseg, + struct pnfs_layout_range *range) +{ + return (range->iomode == IOMODE_ANY || + lseg->range.iomode == range->iomode) && + lo_seg_intersecting(&lseg->range, range); +} + +static struct pnfs_layout_segment * +has_layout_to_return(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *out = NULL, *lseg; + dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", + __func__, lo, range->offset, range->length, range->iomode); + + BUG_ON_UNLOCKED_LO(lo); + list_for_each_entry (lseg, &lo->segs, fi_list) + if (should_free_lseg(lseg, range)) { + out = lseg; + break; + } + + dprintk("%s:Return lseg=%p\n", __func__, out); + return out; +} + +static inline bool +_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) +{ + return atomic_read(&lseg->kref.refcount) == 1; +} + + +static void +pnfs_free_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg, *next; + dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", + __func__, lo, range->offset, range->length, range->iomode); + + BUG_ON_UNLOCKED_LO(lo); + list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { + if (!should_free_lseg(lseg, range) || + !_pnfs_can_return_lseg(lseg)) + continue; + dprintk("%s: freeing lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->range.iomode, lseg->range.offset, + lseg->range.length); + list_del(&lseg->fi_list); + put_lseg_locked(lseg); + } + if (list_empty(&lo->segs)) { + struct nfs_client *clp; + + clp = PNFS_NFS_SERVER(lo)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->layouts); + spin_unlock(&clp->cl_lock); + pnfs_set_layout_stateid(lo, &zero_stateid); + } + + dprintk("%s:Return\n", __func__); +} + +static bool +pnfs_return_layout_barrier(struct nfs_inode *nfsi, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_segment *lseg; + bool ret = false; + + spin_lock(&nfsi->vfs_inode.i_lock); + list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { + if (!should_free_lseg(lseg, range)) + continue; + lseg->valid = false; + if (!_pnfs_can_return_lseg(lseg)) { + dprintk("%s: wait on lseg %p refcount %d\n", + __func__, lseg, + atomic_read(&lseg->kref.refcount)); + ret = true; + } + } + spin_unlock(&nfsi->vfs_inode.i_lock); + dprintk("%s:Return %d\n", __func__, ret); + return ret; +} + +static int +return_layout(struct inode *ino, struct pnfs_layout_range *range, + enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, + bool wait) +{ + struct nfs4_layoutreturn *lrp; + struct nfs_server *server = NFS_SERVER(ino); + int status = -ENOMEM; + + dprintk("--> %s\n", __func__); + + BUG_ON(type != RETURN_FILE); + + lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); + if (lrp == NULL) { + if (lo && (type == RETURN_FILE)) + pnfs_layout_release(lo, NULL); + goto out; + } + lrp->args.reclaim = 0; + lrp->args.layout_type = server->pnfs_curr_ld->id; + 
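/* Only RETURN_FILE is supported here; see the BUG_ON above. */ + 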
lrp->args.return_type = type; + lrp->args.range = *range; + lrp->args.inode = ino; + + status = nfs4_proc_layoutreturn(lrp, wait); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + +int +_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, + const nfs4_stateid *stateid, /* optional */ + enum pnfs_layoutreturn_type type, + bool wait) +{ + struct pnfs_layout_hdr *lo = NULL; + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_range arg; + int status = 0; + + dprintk("--> %s type %d\n", __func__, type); + + + arg.iomode = range ? range->iomode : IOMODE_ANY; + arg.offset = 0; + arg.length = NFS4_MAX_UINT64; + + if (type == RETURN_FILE) { + spin_lock(&ino->i_lock); + lo = nfsi->layout; + if (lo && !has_layout_to_return(lo, &arg)) { + lo = NULL; + } + if (!lo) { + spin_unlock(&ino->i_lock); + dprintk("%s: no layout segments to return\n", __func__); + goto out; + } + + /* Reference for layoutreturn matched in pnfs_layout_release */ + get_layout(lo); + + spin_unlock(&ino->i_lock); + + if (pnfs_return_layout_barrier(nfsi, &arg)) { + if (stateid) { /* callback */ + status = -EAGAIN; + goto out_put; + } + dprintk("%s: waiting\n", __func__); + wait_event(nfsi->lo_waitq, + !pnfs_return_layout_barrier(nfsi, &arg)); + } + + if (layoutcommit_needed(nfsi)) { + if (stateid && !wait) { /* callback */ + dprintk("%s: layoutcommit pending\n", __func__); + status = -EAGAIN; + goto out_put; + } + status = pnfs_layoutcommit_inode(ino, wait); + if (status) { + /* Return layout even if layoutcommit fails */ + dprintk("%s: layoutcommit failed, status=%d. " + "Returning layout anyway\n", + __func__, status); + } + } + + if (!stateid) + status = return_layout(ino, &arg, type, lo, wait); + else + pnfs_layout_release(lo, &arg); + } +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +out_put: + put_layout(ino); + goto out; +} + +/* + * cmp two layout segments for sorting into layout cache + */ +static inline s64 +cmp_layout(struct pnfs_layout_range *l1, + struct pnfs_layout_range *l2) +{ + s64 d; + + /* higher offset > lower offset */ + d = l1->offset - l2->offset; + if (d) + return d; + + /* longer length > shorter length */ + d = l1->length - l2->length; + if (d) + return d; + + /* read > read/write */ + return (int)(l1->iomode == IOMODE_READ) - + (int)(l2->iomode == IOMODE_READ); +} + +static void +pnfs_insert_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *lp; + int found = 0; + + dprintk("%s:Begin\n", __func__); + + BUG_ON_UNLOCKED_LO(lo); + if (list_empty(&lo->segs)) { + struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; + + spin_lock(&clp->cl_lock); + BUG_ON(!list_empty(&lo->layouts)); + list_add_tail(&lo->layouts, &clp->cl_layouts); + spin_unlock(&clp->cl_lock); + } + list_for_each_entry (lp, &lo->segs, fi_list) { + if (cmp_layout(&lp->range, &lseg->range) > 0) + continue; + list_add_tail(&lseg->fi_list, &lp->fi_list); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu before " + "lp %p iomode %d offset %llu length %llu\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length, + lp, lp->range.iomode, lp->range.offset, + lp->range.length); + found = 1; + break; + } + if (!found) { + list_add_tail(&lseg->fi_list, &lo->segs); + dprintk("%s: inserted lseg %p " + "iomode %d offset %llu length %llu at tail\n", + __func__, lseg, lseg->range.iomode, + lseg->range.offset, lseg->range.length); + } + get_layout(lo); + + 
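/* The layout reference taken by get_layout() above is dropped by + * put_layout_locked() in destroy_lseg() when the lseg is freed. */ + 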
dprintk("%s:Return\n", __func__); +} + +/* + * Each layoutdriver embeds pnfs_layout_hdr as the first field in it's + * per-layout type layout cache structure and returns it ZEROed + * from layoutdriver_io_ops->alloc_layout + */ +static struct pnfs_layout_hdr * +alloc_init_layout(struct inode *ino) +{ + struct pnfs_layout_hdr *lo; + struct layoutdriver_io_operations *io_ops; + + io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; + lo = io_ops->alloc_layout(ino); + if (!lo) { + printk(KERN_ERR + "%s: out of memory: io_ops->alloc_layout failed\n", + __func__); + return NULL; + } + lo->refcount = 1; + INIT_LIST_HEAD(&lo->layouts); + INIT_LIST_HEAD(&lo->segs); + seqlock_init(&lo->seqlock); + lo->inode = ino; + return lo; +} + +/* + * Retrieve and possibly allocate the inode layout + * + * ino->i_lock must be taken by the caller. + */ +static struct pnfs_layout_hdr * +pnfs_alloc_layout(struct inode *ino) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *new = NULL; + + dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); + + BUG_ON_UNLOCKED_INO(ino); + if (likely(nfsi->layout)) + return nfsi->layout; + + spin_unlock(&ino->i_lock); + new = alloc_init_layout(ino); + spin_lock(&ino->i_lock); + + if (likely(nfsi->layout == NULL)) { /* Won the race? */ + nfsi->layout = new; + } else if (new) { + /* Reference the layout accross i_lock release and grab */ + get_layout(nfsi->layout); + spin_unlock(&ino->i_lock); + NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new); + spin_lock(&ino->i_lock); + put_layout_locked(nfsi->layout); + } + return nfsi->layout; +} + +/* + * iomode matching rules: + * range lseg match + * ----- ----- ----- + * ANY READ true + * ANY RW true + * RW READ false + * RW RW true + * READ READ true + * READ RW true + */ +static inline int +has_matching_lseg(struct pnfs_layout_segment *lseg, + struct pnfs_layout_range *range) +{ + struct pnfs_layout_range range1; + + if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) || + !lo_seg_intersecting(&lseg->range, range)) + return 0; + + /* range1 covers only the first byte in the range */ + range1 = *range; + range1.length = 1; + return lo_seg_contained(&lseg->range, &range1); +} + +/* + * lookup range in layout + */ +static struct pnfs_layout_segment * +pnfs_has_layout(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, + bool take_ref, + bool only_valid) +{ + struct pnfs_layout_segment *lseg, *ret = NULL; + + dprintk("%s:Begin\n", __func__); + + BUG_ON_UNLOCKED_LO(lo); + list_for_each_entry (lseg, &lo->segs, fi_list) { + if (has_matching_lseg(lseg, range) && + (lseg->valid || !only_valid)) { + ret = lseg; + if (take_ref) + get_lseg(ret); + break; + } + if (cmp_layout(range, &lseg->range) > 0) + break; + } + + dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n", + __func__, ret, take_ref, + ret ? atomic_read(&ret->kref.refcount) : 0, + ret ? ret->valid : 0); + return ret; +} + +/* Update the file's layout for the given range and iomode. + * Layout is retreived from the server if needed. + * If lsegpp is given, the appropriate layout segment is referenced and + * returned to the caller. 
+ */ +void +_pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_segment **lsegpp) +{ + struct pnfs_layout_range arg = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + struct nfs_inode *nfsi = NFS_I(ino); + struct pnfs_layout_hdr *lo; + struct pnfs_layout_segment *lseg = NULL; + bool take_ref = (lsegpp != NULL); + + if (take_ref) + *lsegpp = NULL; + spin_lock(&ino->i_lock); + lo = pnfs_alloc_layout(ino); + if (lo == NULL) { + dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); + goto out_unlock; + } + + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); + if (lseg && !lseg->valid) { + if (take_ref) + put_lseg_locked(lseg); + /* someone is cleaning the layout */ + lseg = NULL; + goto out_unlock; + } + + if (lseg) { + dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d\n", + __func__, + lseg, + arg.length, + arg.offset, + arg.iomode); + + goto out_unlock; + } + + /* if get layout already failed once goto out */ + if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) { + if (unlikely(nfsi->pnfs_layout_suspend && + get_seconds() >= nfsi->pnfs_layout_suspend)) { + dprintk("%s: layout_get resumed\n", __func__); + clear_bit(lo_fail_bit(iomode), + &nfsi->layout->state); + nfsi->pnfs_layout_suspend = 0; + } else + goto out_unlock; + } + + /* Reference the layout for layoutget matched in pnfs_layout_release */ + get_layout(lo); + spin_unlock(&ino->i_lock); + + send_layoutget(ino, ctx, &arg, lsegpp, lo); +out: + dprintk("%s end, state 0x%lx lseg %p\n", __func__, + nfsi->layout->state, lseg); + return; +out_unlock: + if (lsegpp) + *lsegpp = lseg; + spin_unlock(&ino->i_lock); + goto out; +} + +void +pnfs_get_layout_done(struct nfs4_layoutget *lgp, int rpc_status) +{ + struct pnfs_layout_segment *lseg = NULL; + struct nfs_inode *nfsi = NFS_I(lgp->args.inode); + time_t suspend = 0; + + dprintk("-->%s\n", __func__); + + lgp->status = rpc_status; + if (likely(!rpc_status)) { + if (unlikely(lgp->res.layout.len <= 0)) { + printk(KERN_ERR + "%s: ERROR Returned layout size is ZERO\n", __func__); + lgp->status = -EIO; + } + goto out; + } + + dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); + switch (rpc_status) { + case -NFS4ERR_BADLAYOUT: + lgp->status = -ENOENT; + /* FALLTHROUGH */ + case -EACCES: /* NFS4ERR_ACCESS */ + /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ + goto out; + + case -NFS4ERR_LAYOUTTRYLATER: + case -NFS4ERR_RECALLCONFLICT: + case -NFS4ERR_OLD_STATEID: + case -EAGAIN: /* NFS4ERR_LOCKED */ + lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ + /* FALLTHROUGH */ + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + goto out; + + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + /* The layout is expected to be returned at this point. 
+ * This should clear the layout stateid as well */ + suspend = get_seconds() + 1; + break; + + case -NFS4ERR_LAYOUTUNAVAILABLE: + lgp->status = -ENOTSUPP; + break; + + case -NFS4ERR_REP_TOO_BIG: + case -NFS4ERR_REP_TOO_BIG_TO_CACHE: + lgp->status = -E2BIG; + break; + + /* Leave the following errors untranslated */ + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_DQUOT: + case -EINVAL: /* NFS4ERR_INVAL */ + case -EIO: /* NFS4ERR_IO */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_MOVED: + case -NFS4ERR_NOSPC: + case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ + case -ESTALE: /* NFS4ERR_STALE */ + case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ + break; + + /* The following errors are our fault and should never happen */ + case -NFS4ERR_BADIOMODE: + case -NFS4ERR_BADXDR: + case -NFS4ERR_REQ_TOO_BIG: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_TYPE: + lgp->status = -EINVAL; + /* FALLTHROUGH */ + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_NOFILEHANDLE: + case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ + case -NFS4ERR_OPENMODE: + case -NFS4ERR_OP_NOT_IN_SESSION: + case -NFS4ERR_TOO_MANY_OPS: + dprintk("%s: error %d: should never happen\n", __func__, + rpc_status); + break; + + /* The following errors are the server's fault */ + default: + dprintk("%s: illegal error %d\n", __func__, rpc_status); + lgp->status = -EIO; + break; + } + + /* remember that get layout failed and suspend trying */ + nfsi->pnfs_layout_suspend = suspend; + set_bit(lo_fail_bit(lgp->args.range.iomode), + &nfsi->layout->state); + dprintk("%s: layout_get suspended until %ld\n", + __func__, suspend); +out: + dprintk("%s end (err:%d) state 0x%lx lseg %p\n", + __func__, lgp->status, nfsi->layout->state, lseg); + return; +} + +int +pnfs_layout_process(struct nfs4_layoutget *lgp) +{ + struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct nfs4_layoutget_res *res = &lgp->res; + struct pnfs_layout_segment *lseg; + struct inode *ino = PNFS_INODE(lo); + int status = 0; + + /* Inject layout blob into I/O device driver */ + lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); + if (!lseg || IS_ERR(lseg)) { + if (!lseg) + status = -ENOMEM; + else + status = PTR_ERR(lseg); + dprintk("%s: Could not allocate layout: error %d\n", + __func__, status); + goto out; + } + + spin_lock(&ino->i_lock); + init_lseg(lo, lseg); + lseg->range = res->range; + if (lgp->lsegpp) { + get_lseg(lseg); + *lgp->lsegpp = lseg; + } + pnfs_insert_layout(lo, lseg); + + if (res->return_on_close) { + lo->roc_iomode |= res->range.iomode; + if (!lo->roc_iomode) + lo->roc_iomode = IOMODE_ANY; + } + + /* Done processing layoutget. 
Set the layout stateid */ + pnfs_set_layout_stateid(lo, &res->stateid); + spin_unlock(&ino->i_lock); +out: + return status; +} + +void +readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, + size_t *count) +{ + struct page *first, *last; + loff_t foff, i_size = i_size_read(inode); + pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + size_t range; + + + first = list_entry((pages)->prev, struct page, lru); + last = list_entry((pages)->next, struct page, lru); + + foff = (loff_t)first->index << PAGE_CACHE_SHIFT; + + range = (last->index - first->index) * PAGE_CACHE_SIZE; + if (last->index == end_index) + range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; + else + range += PAGE_CACHE_SIZE; + dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, + range); + *offset = foff; + *count = range; +} + +void +pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) +{ + struct pnfs_layout_hdr *lo; + struct pnfs_layoutdriver_type *ld; + + pgio->pg_test = NULL; + + lo = NFS_I(inode)->layout; + ld = NFS_SERVER(inode)->pnfs_curr_ld; + if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !lo) + return; + + if (ld->ld_policy_ops) + pgio->pg_test = ld->ld_policy_ops->pg_test; +} + +static u32 +pnfs_getboundary(struct inode *inode) +{ + u32 stripe_size = 0; + struct nfs_server *nfss = NFS_SERVER(inode); + struct layoutdriver_policy_operations *policy_ops; + + if (!nfss->pnfs_curr_ld) + goto out; + + policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; + if (!policy_ops || !policy_ops->get_stripesize) + goto out; + + /* The default is to not gather across stripes */ + if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) + goto out; + + spin_lock(&inode->i_lock); + if (NFS_I(inode)->layout) + stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); + spin_unlock(&inode->i_lock); +out: + return stripe_size; +} + +/* + * rsize is already set by caller to MDS rsize. + */ +void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode, + struct nfs_open_context *ctx, + struct list_head *pages, + size_t *rsize) +{ + struct nfs_server *nfss = NFS_SERVER(inode); + size_t count = 0; + loff_t loff; + + pgio->pg_iswrite = 0; + pgio->pg_boundary = 0; + pgio->pg_test = NULL; + pgio->pg_lseg = NULL; + + if (!pnfs_enabled_sb(nfss)) + return; + + /* Calculate the total read-ahead count */ + readahead_range(inode, pages, &loff, &count); + + if (count > 0) { + _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, + &pgio->pg_lseg); + if (!pgio->pg_lseg) + return; + + *rsize = NFS_SERVER(inode)->ds_rsize; + pgio->pg_boundary = pnfs_getboundary(inode); + if (pgio->pg_boundary) + pnfs_set_pg_test(inode, pgio); + } +} + +void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, + size_t *wsize) +{ + struct nfs_server *server = NFS_SERVER(inode); + + pgio->pg_iswrite = 1; + if (!pnfs_enabled_sb(server)) { + pgio->pg_boundary = 0; + pgio->pg_test = NULL; + return; + } + pgio->pg_boundary = pnfs_getboundary(inode); + pnfs_set_pg_test(inode, pgio); + *wsize = server->ds_wsize; +} + +/* Return I/O buffer size for a layout driver + * This value will determine what size reads and writes + * will be gathered into and sent to the data servers. + * blocksize must be a multiple of the page cache size. 
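+ * + * For example, a driver whose get_blocksize() op returns + * 4 * PAGE_CACHE_SIZE ends up with ds_rsize and ds_wsize set from that + * value via nfs_block_size() in pnfs_set_ds_iosize() below.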
+ */ +unsigned int +pnfs_getiosize(struct nfs_server *server) +{ + if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) + return 0; + return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); +} + +void +pnfs_set_ds_iosize(struct nfs_server *server) +{ + unsigned dssize = pnfs_getiosize(server); + + /* Set buffer size for data servers */ + if (dssize > 0) { + server->ds_rsize = server->ds_wsize = + nfs_block_size(dssize, NULL); + } else { + server->ds_wsize = server->wsize; + server->ds_rsize = server->rsize; + } +} + +static int +pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) +{ + put_lseg(pdata->lseg); + pdata->lseg = NULL; + pdata->call_ops->rpc_call_done(task, data); + if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) + return -EAGAIN; + if (pdata->pnfsflags & PNFS_NO_RPC) { + pdata->call_ops->rpc_release(data); + } else { + /* + * just restore original rpc call ops + * rpc_release will be called later by the rpc scheduling layer. + */ + task->tk_ops = pdata->call_ops; + } + return 0; +} + +/* Post-write completion function + * Invoked by all layout drivers when write_pagelist is done. + * + * NOTE: callers set data->pnfsflags PNFS_NO_RPC + * so that the NFS cleanup routines perform only the page cache + * cleanup. + */ +static void +pnfs_write_retry(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_write_data *wdata; + struct pnfs_layout_range range; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + wdata = container_of(task, struct nfs_write_data, task); + range.iomode = IOMODE_RW; + range.offset = wdata->args.offset; + range.length = wdata->args.count; + _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); + pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), + wdata->pdata.call_ops, wdata->pdata.how); +} + +static void +pnfs_writeback_done(struct nfs_write_data *data) +{ + struct pnfs_call_data *pdata = &data->pdata; + + dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); + + /* update last write offset and need layout commit + * for non-files layout types (files layout calls + * pnfs4_write_done for this) + */ + if ((pdata->pnfsflags & PNFS_NO_RPC) && + data->task.tk_status >= 0 && data->res.count > 0) { + struct nfs_inode *nfsi = NFS_I(data->inode); + + pnfs_update_last_write(nfsi, data->args.offset, data->res.count); + pnfs_need_layoutcommit(nfsi, data->args.context); + } + + if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { + INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); + queue_work(nfsiod_workqueue, &data->task.u.tk_work); + } +} + +static void _pnfs_clear_lseg_from_pages(struct list_head *head) +{ + struct nfs_page *req; + + list_for_each_entry(req, head, wb_list) { + put_lseg(req->wb_lseg); + req->wb_lseg = NULL; + } +} + +/* + * Call the appropriate parallel I/O subsystem write function. + * If no I/O device driver exists, or one does not match the returned + * fstype, then return a positive status for regular NFS processing. + * + * TODO: Are wdata->how and wdata->args.stable always the same value? + * TODO: It seems in NFS, the server may not do a stable write even + * though it was requested (and vice-versa?). To check, it looks + * in data->res.verf->committed. Do we need this ability + * for non-file layout drivers? 
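+ * + * PNFS_ATTEMPTED means the layout driver has taken over the I/O; on + * PNFS_NOT_ATTEMPTED the lseg references are dropped below and the + * caller falls back to the regular NFS write path.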
+ */ +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + struct pnfs_layout_segment *lseg = wdata->req->wb_lseg; + + wdata->pdata.call_ops = call_ops; + wdata->pdata.pnfs_error = 0; + wdata->pdata.how = how; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + get_lseg(lseg); + + if (!pnfs_use_rpc(nfss)) + wdata->pdata.pnfsflags |= PNFS_NO_RPC; + wdata->pdata.lseg = lseg; + trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata, + nfs_page_array_len(wdata->args.pgbase, wdata->args.count), + how); + + if (trypnfs == PNFS_NOT_ATTEMPTED) { + wdata->pdata.pnfsflags &= ~PNFS_NO_RPC; + wdata->pdata.lseg = NULL; + put_lseg(lseg); + _pnfs_clear_lseg_from_pages(&wdata->pages); + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* Post-read completion function. Invoked by all layout drivers when + * read_pagelist is done. + */ +static void +pnfs_read_retry(struct work_struct *work) +{ + struct rpc_task *task; + struct nfs_read_data *rdata; + struct pnfs_layout_range range; + + dprintk("%s enter\n", __func__); + task = container_of(work, struct rpc_task, u.tk_work); + rdata = container_of(task, struct nfs_read_data, task); + range.iomode = IOMODE_RW; + range.offset = rdata->args.offset; + range.length = rdata->args.count; + _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true); + pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode), + rdata->pdata.call_ops); +} + +static void +pnfs_read_done(struct nfs_read_data *data) +{ + struct pnfs_call_data *pdata = &data->pdata; + + dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); + + if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { + INIT_WORK(&data->task.u.tk_work, pnfs_read_retry); + queue_work(nfsiod_workqueue, &data->task.u.tk_work); + } +} + +/* + * Call the appropriate parallel I/O subsystem read function. + * If no I/O device driver exists, or one does not match the returned + * fstype, then return a positive status for regular NFS processing. + */ +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + struct pnfs_layout_segment *lseg = rdata->req->wb_lseg; + enum pnfs_try_status trypnfs; + + rdata->pdata.call_ops = call_ops; + rdata->pdata.pnfs_error = 0; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + get_lseg(lseg); + + if (!pnfs_use_rpc(nfss)) + rdata->pdata.pnfsflags |= PNFS_NO_RPC; + rdata->pdata.lseg = lseg; + trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata, + nfs_page_array_len(rdata->args.pgbase, rdata->args.count)); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + rdata->pdata.pnfsflags &= ~PNFS_NO_RPC; + rdata->pdata.lseg = NULL; + put_lseg(lseg); + _pnfs_clear_lseg_from_pages(&rdata->pages); + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +/* + * This gives the layout driver an opportunity to read in pages "around" + * the data to be written. 
It returns 0 on success, otherwise an error code + * which will either be passed up to the user, or ignored if + * some previous part of write succeeded. + * Note the range [pos, pos+len-1] is entirely within the page. + */ +int _pnfs_write_begin(struct inode *inode, struct page *page, + loff_t pos, unsigned len, + struct pnfs_layout_segment *lseg, + struct pnfs_fsdata **fsdata) +{ + struct pnfs_fsdata *data; + int status = 0; + + dprintk("--> %s: pos=%llu len=%u\n", + __func__, (unsigned long long)pos, len); + data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL); + if (!data) { + status = -ENOMEM; + goto out; + } + data->lseg = lseg; /* refcount passed into data to be managed there */ + status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin( + lseg, page, pos, len, data); + if (status) { + kfree(data); + data = NULL; + } +out: + *fsdata = data; + dprintk("<-- %s: status=%d\n", __func__, status); + return status; +} + +/* Return 0 on success, negative on failure */ +/* CAREFUL - what happens if copied < len??? */ +int _pnfs_write_end(struct inode *inode, struct page *page, + loff_t pos, unsigned len, unsigned copied, + struct pnfs_layout_segment *lseg) +{ + struct nfs_server *nfss = NFS_SERVER(inode); + int status; + + status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, + pos, len, copied, lseg); + return status; +} + +/* pNFS Commit callback function for all layout drivers */ +static void +pnfs_commit_done(struct nfs_write_data *data) +{ + struct pnfs_call_data *pdata = &data->pdata; + + dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); + + if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { + struct pnfs_layout_range range = { + .iomode = IOMODE_RW, + .offset = data->args.offset, + .length = data->args.count, + }; + dprintk("%s: retrying\n", __func__); + _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, + true); + pnfs_initiate_commit(data, NFS_CLIENT(data->inode), + pdata->call_ops, pdata->how, 1); + } +} + +enum pnfs_try_status +pnfs_try_to_commit(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int sync) +{ + struct inode *inode = data->inode; + struct nfs_server *nfss = NFS_SERVER(data->inode); + enum pnfs_try_status trypnfs; + + dprintk("%s: Begin\n", __func__); + + if (!pnfs_use_rpc(nfss)) + data->pdata.pnfsflags |= PNFS_NO_RPC; + /* We need to account for the possibility that + * each nfs_page can point to a different lseg (or be NULL). + * For the immediate case of whole-file-only layouts, we at + * least know there can be only a single lseg. + * We still have to account for the possibility of some being NULL. + * This will be done by passing the buck to the layout driver. + */ + data->pdata.call_ops = call_ops; + data->pdata.pnfs_error = 0; + data->pdata.how = sync; + data->pdata.lseg = NULL; + trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + data->pdata.pnfsflags &= ~PNFS_NO_RPC; + _pnfs_clear_lseg_from_pages(&data->pages); + } else + nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) +{ + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + /* TODO: Maybe we should avoid this by allowing the layout driver + * to directly xdr its layout on the wire. 
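+ * Until then, the driver's cleanup_layoutcommit op (if any) is called + * below with the layout, the encoded arguments and the final status.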
+ */ + if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) + nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( + NFS_I(data->args.inode)->layout, + &data->args, data->status); +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int +pnfs_layoutcommit_setup(struct inode *inode, + struct nfs4_layoutcommit_data *data, + loff_t write_begin_pos, loff_t write_end_pos) +{ + struct nfs_server *nfss = NFS_SERVER(inode); + int result = 0; + + dprintk("--> %s\n", __func__); + + data->args.inode = inode; + data->args.fh = NFS_FH(inode); + data->args.layout_type = nfss->pnfs_curr_ld->id; + data->res.fattr = &data->fattr; + nfs_fattr_init(&data->fattr); + + /* TODO: Need to determine the correct values */ + data->args.time_modify_changed = 0; + + /* Set values from the inode so they can be reset + */ + data->args.range.iomode = IOMODE_RW; + data->args.range.offset = write_begin_pos; + data->args.range.length = write_end_pos - write_begin_pos + 1; + data->args.lastbytewritten = min(write_end_pos, + i_size_read(inode) - 1); + data->args.bitmask = nfss->attr_bitmask; + data->res.server = nfss; + + /* Call layout driver to set the arguments */ + if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) + result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( + NFS_I(inode)->layout, &data->args); + + dprintk("<-- %s Status %d\n", __func__, result); + return result; +} + +/* Issue an async layoutcommit for an inode. + */ +int +pnfs_layoutcommit_inode(struct inode *inode, int sync) +{ + struct nfs4_layoutcommit_data *data; + struct nfs_inode *nfsi = NFS_I(inode); + loff_t write_begin_pos; + loff_t write_end_pos; + + int status = 0; + + dprintk("%s Begin (sync:%d)\n", __func__, sync); + + BUG_ON(!has_layout(nfsi)); + + data = pnfs_layoutcommit_alloc(); + if (!data) + return -ENOMEM; + + spin_lock(&inode->i_lock); + if (!layoutcommit_needed(nfsi)) { + spin_unlock(&inode->i_lock); + goto out_free; + } + + /* Clear layoutcommit properties in the inode so + * new lc info can be generated + */ + write_begin_pos = nfsi->layout->write_begin_pos; + write_end_pos = nfsi->layout->write_end_pos; + data->cred = nfsi->layout->cred; + nfsi->layout->write_begin_pos = 0; + nfsi->layout->write_end_pos = 0; + nfsi->layout->cred = NULL; + __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); + pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); + + /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ + get_layout(NFS_I(inode)->layout); + + spin_unlock(&inode->i_lock); + + /* Set up layout commit args */ + status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, + write_end_pos); + if (status) { + /* The layout driver failed to set up the layoutcommit */ + put_rpccred(data->cred); + put_layout(inode); + goto out_free; + } + status = nfs4_proc_layoutcommit(data, sync); +out: + dprintk("%s end (err:%d)\n", __func__, status); + return status; +out_free: + pnfs_layoutcommit_free(data); + goto out; +} + +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) +{ + if (fsdata) { + /* lseg refcounting handled directly in nfs_write_end */ + kfree(fsdata); + } +} + +/* Callback operations for layout drivers. 
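+ * This table is handed back to layout drivers by + * pnfs_register_layoutdriver() above.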
+ */ +struct pnfs_client_operations pnfs_ops = { + .nfs_getdevicelist = nfs4_proc_getdevicelist, + .nfs_getdeviceinfo = nfs4_proc_getdeviceinfo, + .nfs_readlist_complete = pnfs_read_done, + .nfs_writelist_complete = pnfs_writeback_done, + .nfs_commit_complete = pnfs_commit_done, +}; + +EXPORT_SYMBOL(pnfs_unregister_layoutdriver); +EXPORT_SYMBOL(pnfs_register_layoutdriver); + + +/* Device ID cache. Supports one layout type per struct nfs_client */ +int +nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, + void (*free_callback)(struct kref *)) +{ + struct nfs4_deviceid_cache *c; + + c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); + if (!c) + return -ENOMEM; + spin_lock(&clp->cl_lock); + if (clp->cl_devid_cache != NULL) { + kref_get(&clp->cl_devid_cache->dc_kref); + spin_unlock(&clp->cl_lock); + dprintk("%s [kref [%d]]\n", __func__, + atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); + kfree(c); + } else { + int i; + + spin_lock_init(&c->dc_lock); + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) + INIT_HLIST_HEAD(&c->dc_deviceids[i]); + kref_init(&c->dc_kref); + c->dc_free_callback = free_callback; + clp->cl_devid_cache = c; + spin_unlock(&clp->cl_lock); + dprintk("%s [new]\n", __func__); + } + return 0; +} +EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); + +void +nfs4_init_deviceid_node(struct nfs4_deviceid *d) +{ + INIT_HLIST_NODE(&d->de_node); + kref_init(&d->de_kref); +} +EXPORT_SYMBOL(nfs4_init_deviceid_node); + +/* Called from layoutdriver_io_operations->alloc_lseg */ +void +nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) +{ + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); + l->deviceid = d; +} +EXPORT_SYMBOL(nfs4_set_layout_deviceid); + +/* Called from layoutdriver_io_operations->free_lseg */ +void +nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *l, + struct nfs4_deviceid *d, + void (*free_callback)(struct kref *)) +{ + dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); + l->deviceid = NULL; + kref_put(&d->de_kref, free_callback); +} +EXPORT_SYMBOL(nfs4_put_unset_layout_deviceid); + +/* Find and reference a deviceid */ +struct nfs4_deviceid * +nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) +{ + struct nfs4_deviceid *d; + struct hlist_node *n; + long hash = nfs4_deviceid_hash(id); + + dprintk("--> %s hash %ld\n", __func__, hash); + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { + if (!atomic_inc_not_zero(&d->de_kref.refcount)) { + goto fail; + } else { + rcu_read_unlock(); + return d; + } + } + } +fail: + rcu_read_unlock(); + return NULL; +} +EXPORT_SYMBOL(nfs4_find_get_deviceid); + +/* + * Add and kref_get a deviceid. + * GETDEVICEINFOs for same deviceid can race. 
If deviceid is found, discard new + */ +struct nfs4_deviceid * +nfs4_add_get_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) +{ + struct nfs4_deviceid *d; + struct hlist_node *n; + long hash = nfs4_deviceid_hash(&new->de_id); + + dprintk("--> %s hash %ld\n", __func__, hash); + spin_lock(&c->dc_lock); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { + kref_get(&d->de_kref); + spin_unlock(&c->dc_lock); + dprintk("%s [discard]\n", __func__); + c->dc_free_callback(&new->de_kref); + return d; + } + } + hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); + kref_get(&new->de_kref); + spin_unlock(&c->dc_lock); + dprintk("%s [new]\n", __func__); + return new; +} +EXPORT_SYMBOL(nfs4_add_get_deviceid); + +/* + * Remove a deviceid from its hash bucket: the entry matching @id, or the + * first entry if @id is NULL. Returns 1 if an entry was removed, else 0. + */ +static int +nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, + struct pnfs_deviceid *id) +{ + struct nfs4_deviceid *d; + struct hlist_node *n; + + dprintk("--> %s hash %ld\n", __func__, hash); + spin_lock(&c->dc_lock); + hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { + if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) + continue; + hlist_del_rcu(&d->de_node); + spin_unlock(&c->dc_lock); + synchronize_rcu(); + dprintk("%s [%d]\n", __func__, + atomic_read(&d->de_kref.refcount)); + kref_put(&d->de_kref, c->dc_free_callback); + return 1; + } + spin_unlock(&c->dc_lock); + return 0; +} + +void +nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) +{ + long hash = nfs4_deviceid_hash(id); + + nfs4_remove_deviceid(c, hash, id); +} +EXPORT_SYMBOL(nfs4_delete_device); + +static void +nfs4_free_deviceid_cache(struct kref *kref) +{ + struct nfs4_deviceid_cache *cache = + container_of(kref, struct nfs4_deviceid_cache, dc_kref); + long i; + + for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) + while (nfs4_remove_deviceid(cache, i, NULL)) + ; + kfree(cache); +} + +void +nfs4_put_deviceid_cache(struct nfs_client *clp) +{ + struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; + int refcount; + + dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); + spin_lock(&clp->cl_lock); + refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); + if (refcount == 1) + clp->cl_devid_cache = NULL; + spin_unlock(&clp->cl_lock); + dprintk("%s [%d]\n", __func__, refcount); + kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); +} +EXPORT_SYMBOL(nfs4_put_deviceid_cache); diff -up linux-2.6.34.noarch/fs/nfs/pnfs.h.orig linux-2.6.34.noarch/fs/nfs/pnfs.h --- linux-2.6.34.noarch/fs/nfs/pnfs.h.orig 2010-09-30 10:17:08.757998000 -0400 +++ linux-2.6.34.noarch/fs/nfs/pnfs.h 2010-09-30 10:17:08.759996000 -0400 @@ -0,0 +1,354 @@ +/* + * fs/nfs/pnfs.h + * + * pNFS client data structures. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Dean Hildebrand + */ + +#ifndef FS_NFS_PNFS_H +#define FS_NFS_PNFS_H + +#include + +#ifdef CONFIG_NFS_V4_1 + +#include +#include +#include "iostat.h" + +/* nfs4proc.c */ +extern int nfs4_proc_getdevicelist(struct nfs_server *server, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist); +extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, + struct pnfs_device *dev); +extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); +extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, + int issync); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); + +/* pnfs.c */ +extern const nfs4_stateid zero_stateid; + +void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + struct pnfs_layout_segment **lsegpp); + +int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, + const nfs4_stateid *stateid, /* optional */ + enum pnfs_layoutreturn_type, bool wait); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); +void unmount_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); +int pnfs_initialize(void); +void pnfs_uninitialize(void); +void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *data); +void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); +int pnfs_layoutcommit_inode(struct inode *inode, int sync); +void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); +void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); +unsigned int pnfs_getiosize(struct nfs_server *server); +void pnfs_set_ds_iosize(struct nfs_server *server); +enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, + const struct rpc_call_ops *, int); +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, + struct nfs_open_context *, struct list_head *, + size_t *); +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, + size_t *); +void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); +void pnfs_get_layout_done(struct nfs4_layoutget *, int rpc_status); +int pnfs_layout_process(struct nfs4_layoutget *lgp); +void pnfs_layout_release(struct pnfs_layout_hdr *, struct pnfs_layout_range *range); +void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid); +void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_all_layouts(struct nfs_client *); +void put_layout(struct inode *inode); +void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo); +int _pnfs_write_begin(struct inode *inode, struct page *page, + loff_t pos, unsigned len, + struct pnfs_layout_segment *lseg, + struct pnfs_fsdata **fsdata); +int _pnfs_write_end(struct inode *inode, struct page *page, + loff_t pos, unsigned len, unsigned copied, + struct pnfs_layout_segment *lseg); + +#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ + (srv)->pnfs_curr_ld->ld_io_ops && \ + (srv)->pnfs_curr_ld->ld_io_ops->opname) +#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ + (srv)->pnfs_curr_ld->ld_policy_ops && \ + (srv)->pnfs_curr_ld->ld_policy_ops->opname) + +#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" + +static inline int lo_fail_bit(u32 iomode) +{ + return iomode == IOMODE_RW ? 
+ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; +} + +/* Return true if a layout driver is being used for this mountpoint */ +static inline int pnfs_enabled_sb(struct nfs_server *nfss) +{ + return nfss->pnfs_curr_ld != NULL; +} + +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, + struct pnfs_fsdata *fsdata) +{ + return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || + !fsdata->bypass_eof; +} + +/* Should the pNFS client commit and return the layout upon a setattr */ +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & + PNFS_LAYOUTRET_ON_SETATTR; +} + +/* Should the pNFS client commit and return the layout on close + */ +static inline int +pnfs_layout_roc_iomode(struct nfs_inode *nfsi) +{ + return nfsi->layout->roc_iomode; +} + +static inline int pnfs_write_begin(struct file *filp, struct page *page, + loff_t pos, unsigned len, + struct pnfs_layout_segment *lseg, + void **fsdata) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct nfs_server *nfss = NFS_SERVER(inode); + int status = 0; + + *fsdata = lseg; + if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) + status = _pnfs_write_begin(inode, page, pos, len, lseg, + (struct pnfs_fsdata **) fsdata); + return status; +} + +static inline int pnfs_write_end(struct file *filp, struct page *page, + loff_t pos, unsigned len, unsigned copied, + struct pnfs_layout_segment *lseg) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct nfs_server *nfss = NFS_SERVER(inode); + + if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) + return _pnfs_write_end(inode, page, pos, len, copied, lseg); + else + return 0; +} + +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) +{ + if (fsdata) { + struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); + + if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) + nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); + if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) + pnfs_free_fsdata(fsdata); + } +} + +static inline int pnfs_return_layout(struct inode *ino, + struct pnfs_layout_range *range, + const nfs4_stateid *stateid, /* optional */ + enum pnfs_layoutreturn_type type, + bool wait) +{ + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss) && + (type != RETURN_FILE || has_layout(nfsi))) + return _pnfs_return_layout(ino, range, stateid, type, wait); + + return 0; +} + +static inline void pnfs_update_layout(struct inode *ino, + struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + struct pnfs_layout_segment **lsegpp) +{ + struct nfs_server *nfss = NFS_SERVER(ino); + + if (pnfs_enabled_sb(nfss)) + _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); + else { + if (lsegpp) + *lsegpp = NULL; + } +} + +static inline int pnfs_get_write_status(struct nfs_write_data *data) +{ + return data->pdata.pnfs_error; +} + +static inline int pnfs_get_read_status(struct nfs_read_data *data) +{ + return data->pdata.pnfs_error; +} + +static inline int pnfs_use_rpc(struct nfs_server *nfss) +{ + if (pnfs_enabled_sb(nfss)) + return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); + + return 1; +} + +static inline struct pnfs_layout_segment * +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) +{ + if (fsdata) { + struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); + + if (PNFS_EXISTS_LDIO_OP(nfss, 
write_begin)) + return ((struct pnfs_fsdata *) fsdata)->lseg; + } + return fsdata; +} +#else /* CONFIG_NFS_V4_1 */ + +static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) +{ +} + +static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ +} + +static inline void get_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + +static inline void +pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, + loff_t pos, u64 count, enum pnfs_iomode access_type, + struct pnfs_layout_segment **lsegpp) +{ + if (lsegpp) + *lsegpp = NULL; +} + +static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, + struct pnfs_fsdata *fsdata) +{ + return 1; +} + +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline enum pnfs_try_status +pnfs_try_to_commit(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + +static inline int pnfs_write_begin(struct file *filp, struct page *page, + loff_t pos, unsigned len, + struct pnfs_layout_segment *lseg, + void **fsdata) +{ + *fsdata = NULL; + return 0; +} + +static inline int pnfs_write_end(struct file *filp, struct page *page, + loff_t pos, unsigned len, unsigned copied, + struct pnfs_layout_segment *lseg) +{ + return 0; +} + +static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) +{ +} + +static inline int pnfs_get_write_status(struct nfs_write_data *data) +{ + return 0; +} + +static inline int pnfs_get_read_status(struct nfs_read_data *data) +{ + return 0; +} + +static inline int pnfs_use_rpc(struct nfs_server *nfss) +{ + return 1; +} + +static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) +{ + return 0; +} + +static inline bool +pnfs_ld_layoutret_on_setattr(struct inode *inode) +{ + return false; +} + +static inline int +pnfs_layout_roc_iomode(struct nfs_inode *nfsi) +{ + return 0; +} + +static inline int pnfs_return_layout(struct inode *ino, + struct pnfs_layout_range *range, + const nfs4_stateid *stateid, /* optional */ + enum pnfs_layoutreturn_type type, + bool wait) +{ + return 0; +} + +static inline struct pnfs_layout_segment * +nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) +{ + return NULL; +} + +#endif /* CONFIG_NFS_V4_1 */ + +#endif /* FS_NFS_PNFS_H */ diff -up linux-2.6.34.noarch/fs/nfs/proc.c.orig linux-2.6.34.noarch/fs/nfs/proc.c --- linux-2.6.34.noarch/fs/nfs/proc.c.orig 2010-09-30 10:15:17.904725000 -0400 +++ linux-2.6.34.noarch/fs/nfs/proc.c 2010-09-30 10:17:08.764996000 -0400 @@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru fattr = nfs_alloc_fattr(); status = -ENOMEM; if (fh == NULL || fattr == NULL) - goto out; + goto out_free; status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); @@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru */ if (status == 0) status = nfs_instantiate(dentry, fh, fattr); - +out_free: nfs_free_fattr(fattr); nfs_free_fhandle(fh); out: @@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = 
nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff -up linux-2.6.34.noarch/fs/nfs/read.c.orig linux-2.6.34.noarch/fs/nfs/read.c --- linux-2.6.34.noarch/fs/nfs/read.c.orig 2010-09-30 10:15:17.910723000 -0400 +++ linux-2.6.34.noarch/fs/nfs/read.c 2010-09-30 10:17:08.770996000 -0400 @@ -18,8 +18,12 @@ #include #include #include +#include +#include #include +#include +#include "pnfs.h" #include "nfs4_fs.h" #include "internal.h" @@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c LIST_HEAD(one_request); struct nfs_page *new; unsigned int len; + struct pnfs_layout_segment *lseg; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(ctx, inode, page, 0, len); + pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); + new = nfs_create_request(ctx, inode, page, 0, len, lseg); + put_lseg(lseg); if (IS_ERR(new)) { unlock_page(page); return PTR_ERR(new); @@ -155,24 +162,20 @@ static void nfs_readpage_release(struct nfs_release_request(req); } -/* - * Set up the NFS read request struct - */ -static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset) +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, @@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ .flags = RPC_TASK_ASYNC | swap_flags, }; + /* Set up the initial task struct. 
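+	 * read_setup() fills the rpc_message with the protocol's READ
+	 * procedure; nfs_initiate_read() only drives the RPC itself, and
+	 * the caller supplies the rpc_clnt, which is what lets pNFS aim
+	 * the read at a data server rather than the MDS.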
*/ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL(nfs_initiate_read); + +int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) +{ + if (data->req->wb_lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return pnfs_get_read_status(data); + + return nfs_initiate_read(data, clnt, call_ops); +} + +/* + * Set up the NFS read request struct + */ +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + data->req = req; data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->res.fattr = &data->fattr; data->res.count = count; data->res.eof = 0; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); - - dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } static void @@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp { struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; + struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; +#ifdef CONFIG_NFS_V4_1 + if (data->fldata.ds_nfs_client) { + dprintk("%s DS read\n", __func__); + clp = data->fldata.ds_nfs_client; + } +#endif /* CONFIG_NFS_V4_1 */ if (resp->eof || resp->count == argp->count) return; @@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; - nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); +#ifdef CONFIG_NFS_V4_1 + data->pdata.pnfs_error = -EAGAIN; +#endif /* CONFIG_NFS_V4_1 */ + nfs_restart_rpc(task, clp); } /* @@ -409,13 +446,19 @@ static void nfs_readpage_release_partial void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; + struct nfs4_session *ds_session = NULL; - if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, + if (data->fldata.ds_nfs_client) { + dprintk("%s DS read\n", __func__); + ds_session = data->fldata.ds_nfs_client->cl_session; + } + if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, &data->args.seq_args, &data->res.seq_res, 0, task)) return; rpc_call_start(task); } +EXPORT_SYMBOL(nfs_read_prepare); #endif /* CONFIG_NFS_V4_1 */ static const struct rpc_call_ops nfs_read_partial_ops = { @@ -568,7 +611,8 @@ readpage_async_filler(void *data, struct if (len == 0) return 
nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, inode, page, 0, len); + new = nfs_create_request(desc->ctx, inode, page, 0, len, + desc->pgio->pg_lseg); if (IS_ERR(new)) goto out_error; @@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str if (ret == 0) goto read_complete; /* all pages were read */ +#ifdef CONFIG_NFS_V4_1 + pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); +#endif /* CONFIG_NFS_V4_1 */ if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else @@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); nfs_pageio_complete(&pgio); + put_lseg(pgio.pg_lseg); npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: diff -up linux-2.6.34.noarch/fs/nfs/super.c.orig linux-2.6.34.noarch/fs/nfs/super.c --- linux-2.6.34.noarch/fs/nfs/super.c.orig 2010-09-30 10:15:17.918722000 -0400 +++ linux-2.6.34.noarch/fs/nfs/super.c 2010-09-30 10:17:08.777998000 -0400 @@ -64,6 +64,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -676,6 +677,28 @@ static int nfs_show_options(struct seq_f return 0; } +#ifdef CONFIG_NFS_V4_1 +void show_sessions(struct seq_file *m, struct nfs_server *server) +{ + if (nfs4_has_session(server->nfs_client)) + seq_printf(m, ",sessions"); +} +#else +void show_sessions(struct seq_file *m, struct nfs_server *server) {} +#endif + +#ifdef CONFIG_NFS_V4_1 +void show_pnfs(struct seq_file *m, struct nfs_server *server) +{ + seq_printf(m, ",pnfs="); + if (server->pnfs_curr_ld) + seq_printf(m, "%s", server->pnfs_curr_ld->name); + else + seq_printf(m, "not configured"); +} +#else /* CONFIG_NFS_V4_1 */ +void show_pnfs(struct seq_file *m, struct nfs_server *server) {} +#endif /* CONFIG_NFS_V4_1 */ /* * Present statistical information for this VFS mountpoint @@ -714,6 +737,8 @@ static int nfs_show_stats(struct seq_fil seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); + show_sessions(m, nfss); + show_pnfs(m, nfss); } #endif diff -up linux-2.6.34.noarch/fs/nfs/unlink.c.orig linux-2.6.34.noarch/fs/nfs/unlink.c --- linux-2.6.34.noarch/fs/nfs/unlink.c.orig 2010-09-30 10:15:17.932726000 -0400 +++ linux-2.6.34.noarch/fs/nfs/unlink.c 2010-09-30 10:17:08.783003000 -0400 @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); - if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); diff -up linux-2.6.34.noarch/fs/nfs/write.c.orig linux-2.6.34.noarch/fs/nfs/write.c --- linux-2.6.34.noarch/fs/nfs/write.c.orig 2010-09-30 10:15:05.044337000 -0400 +++ linux-2.6.34.noarch/fs/nfs/write.c 2010-09-30 10:17:08.789996000 -0400 @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -28,6 +29,7 @@ #include "iostat.h" #include "nfs4_fs.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al } return p; } +EXPORT_SYMBOL(nfs_commitdata_alloc); void nfs_commit_free(struct nfs_write_data *p) { @@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da kfree(p->pagevec); mempool_free(p, 
nfs_commit_mempool); } +EXPORT_SYMBOL(nfs_commit_free); struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { @@ -418,6 +422,17 @@ static void nfs_inode_remove_request(str nfs_clear_request(req); nfs_release_request(req); } +static void +nfs_mark_request_nopnfs(struct nfs_page *req) +{ + struct pnfs_layout_segment *lseg = req->wb_lseg; + + if (req->wb_lseg == NULL) + return; + req->wb_lseg = NULL; + put_lseg(lseg); + dprintk(" retry through MDS\n"); +} static void nfs_mark_request_dirty(struct nfs_page *req) @@ -523,7 +538,7 @@ nfs_need_commit(struct nfs_inode *nfsi) * The requests are *not* checked to ensure that they form a contiguous set. */ static int -nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) +nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) { struct nfs_inode *nfsi = NFS_I(inode); int ret; @@ -531,7 +546,8 @@ nfs_scan_commit(struct inode *inode, str if (!nfs_need_commit(nfsi)) return 0; - ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); + ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, + use_pnfs); if (ret > 0) nfsi->ncommit -= ret; if (nfs_need_commit(NFS_I(inode))) @@ -560,7 +576,8 @@ static inline int nfs_scan_commit(struct static struct nfs_page *nfs_try_to_update_request(struct inode *inode, struct page *page, unsigned int offset, - unsigned int bytes) + unsigned int bytes, + struct pnfs_layout_segment *lseg) { struct nfs_page *req; unsigned int rqend; @@ -585,8 +602,8 @@ static struct nfs_page *nfs_try_to_updat * Note: nfs_flush_incompatible() will already * have flushed out requests having wrong owners. */ - if (offset > rqend - || end < req->wb_offset) + if (offset > rqend || end < req->wb_offset || + req->wb_lseg != lseg) goto out_flushme; if (nfs_set_page_tag_locked(req)) @@ -634,16 +651,17 @@ out_err: * already called nfs_flush_incompatible() if necessary. 
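+ * With pNFS, the caller now passes its layout segment down as well: the
+ * request set up here is bound to that lseg, and nfs_try_to_update_request()
+ * flushes (rather than merges) a write that was bound to a different lseg.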
*/ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, - struct page *page, unsigned int offset, unsigned int bytes) + struct page *page, unsigned int offset, unsigned int bytes, + struct pnfs_layout_segment *lseg) { struct inode *inode = page->mapping->host; struct nfs_page *req; int error; - req = nfs_try_to_update_request(inode, page, offset, bytes); + req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); if (req != NULL) goto out; - req = nfs_create_request(ctx, inode, page, offset, bytes); + req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); if (IS_ERR(req)) goto out; error = nfs_inode_add_request(inode, req); @@ -656,23 +674,27 @@ out: } static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, - unsigned int offset, unsigned int count) + unsigned int offset, unsigned int count, + struct pnfs_layout_segment *lseg, + void *fsdata) { struct nfs_page *req; - req = nfs_setup_write_request(ctx, page, offset, count); + req = nfs_setup_write_request(ctx, page, offset, count, lseg); if (IS_ERR(req)) return PTR_ERR(req); nfs_mark_request_dirty(req); /* Update file length */ - nfs_grow_file(page, offset, count); + if (pnfs_grow_ok(lseg, fsdata)) + nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); nfs_mark_request_dirty(req); nfs_clear_page_tag_locked(req); return 0; } -int nfs_flush_incompatible(struct file *file, struct page *page) +int nfs_flush_incompatible(struct file *file, struct page *page, + struct pnfs_layout_segment *lseg) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_page *req; @@ -689,7 +711,10 @@ int nfs_flush_incompatible(struct file * req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid || + req->wb_lseg != lseg; nfs_release_request(req); if (!do_flush) return 0; @@ -716,7 +741,8 @@ static int nfs_write_pageuptodate(struct * things with a page scheduled for an RPC call (e.g. invalidate it). */ int nfs_updatepage(struct file *file, struct page *page, - unsigned int offset, unsigned int count) + unsigned int offset, unsigned int count, + struct pnfs_layout_segment *lseg, void *fsdata) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = page->mapping->host; @@ -741,7 +767,7 @@ int nfs_updatepage(struct file *file, st offset = 0; } - status = nfs_writepage_setup(ctx, page, offset, count); + status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); if (status < 0) nfs_set_pageerror(page); @@ -771,25 +797,21 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. 
- */ -static int nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) +int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .task = &data->task, .rpc_message = &msg, .callback_ops = call_ops, @@ -800,12 +822,62 @@ static int nfs_write_rpcsetup(struct nfs }; int ret = 0; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} +EXPORT_SYMBOL(nfs_initiate_write); + +int pnfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) +{ + if (data->req->wb_lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return pnfs_get_write_status(data); + + return nfs_initiate_write(data, clnt, call_ops, how); +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -813,6 +885,7 @@ static int nfs_write_rpcsetup(struct nfs data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; @@ -825,30 +898,7 @@ static int nfs_write_rpcsetup(struct nfs data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. 
*/
-	NFS_PROTO(inode)->write_setup(data, &msg);
-
-	dprintk("NFS: %5u initiated write call "
-		"(req %s/%lld, %u bytes @ offset %llu)\n",
-		data->task.tk_pid,
-		inode->i_sb->s_id,
-		(long long)NFS_FILEID(inode),
-		count,
-		(unsigned long long)data->args.offset);
-
-	task = rpc_run_task(&task_setup_data);
-	if (IS_ERR(task)) {
-		ret = PTR_ERR(task);
-		goto out;
-	}
-	if (how & FLUSH_SYNC) {
-		ret = rpc_wait_for_completion_task(task);
-		if (ret == 0)
-			ret = task->tk_status;
-	}
-	rpc_put_task(task);
-out:
-	return ret;
+	return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -859,6 +909,7 @@ static void nfs_redirty_request(struct n
 {
 	struct page *page = req->wb_page;
 
+	nfs_mark_request_nopnfs(req);
 	nfs_mark_request_dirty(req);
 	nfs_clear_page_tag_locked(req);
 	nfs_end_page_writeback(page);
@@ -971,6 +1022,10 @@ static void nfs_pageio_init_write(struct
 {
 	size_t wsize = NFS_SERVER(inode)->wsize;
 
+#ifdef CONFIG_NFS_V4_1
+	pnfs_pageio_init_write(pgio, inode, &wsize);
+#endif /* CONFIG_NFS_V4_1 */
+
 	if (wsize < PAGE_CACHE_SIZE)
 		nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
 	else
@@ -1036,13 +1091,27 @@ out:
 void nfs_write_prepare(struct rpc_task *task, void *calldata)
 {
 	struct nfs_write_data *data = calldata;
-	struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
+	struct nfs4_session *ds_session = NULL;
 
-	if (nfs4_setup_sequence(clp, &data->args.seq_args,
+	if (data->fldata.ds_nfs_client) {
+		dprintk("%s DS write\n", __func__);
+		ds_session = data->fldata.ds_nfs_client->cl_session;
+	} else if (data->args.count > NFS_SERVER(data->inode)->wsize) {
+		/* retrying via MDS? */
+		data->pdata.orig_count = data->args.count;
+		data->args.count = NFS_SERVER(data->inode)->wsize;
+		dprintk("%s: trimmed count %u to wsize %u\n", __func__,
+			data->pdata.orig_count, data->args.count);
+	} else
+		data->pdata.orig_count = 0;
+
+	if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session,
+				&data->args.seq_args,
 				&data->res.seq_res, 1, task))
 		return;
 	rpc_call_start(task);
 }
+EXPORT_SYMBOL(nfs_write_prepare);
 #endif /* CONFIG_NFS_V4_1 */
 
 static const struct rpc_call_ops nfs_write_partial_ops = {
@@ -1126,10 +1195,11 @@ int nfs_writeback_done(struct rpc_task *
 	struct nfs_writeargs	*argp = &data->args;
 	struct nfs_writeres	*resp = &data->res;
 	struct nfs_server	*server = NFS_SERVER(data->inode);
+	struct nfs_client	*clp = server->nfs_client;
 	int status;
 
-	dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
-		task->tk_pid, task->tk_status);
+	dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n",
+		task->tk_pid, task->tk_status, resp->count);
 
 	/*
 	 * ->write_done will attempt to use post-op attributes to detect
@@ -1142,6 +1212,13 @@ int nfs_writeback_done(struct rpc_task *
 	if (status != 0)
 		return status;
 	nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+#ifdef CONFIG_NFS_V4_1
+	/* Is this a DS session */
+	if (data->fldata.ds_nfs_client) {
+		dprintk("%s DS write\n", __func__);
+		clp = data->fldata.ds_nfs_client;
+	}
+#endif /* CONFIG_NFS_V4_1 */
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 	if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
@@ -1158,7 +1235,7 @@ int nfs_writeback_done(struct rpc_task *
 		if (time_before(complain, jiffies)) {
 			dprintk("NFS: faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
-				server->nfs_client->cl_hostname,
+				clp->cl_hostname,
 				resp->verf->committed, argp->stable);
 			complain = jiffies + 300 * HZ;
 		}
@@ -1168,6 +1245,9
@@ int nfs_writeback_done(struct rpc_task * if (task->tk_status >= 0 && resp->count < argp->count) { static unsigned long complain; + dprintk("NFS: short write:" + " (resp->count %u) < (argp->count = %u)\n", + resp->count, argp->count); nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); /* Has the server at least made some progress? */ @@ -1184,7 +1264,10 @@ int nfs_writeback_done(struct rpc_task * */ argp->stable = NFS_FILE_SYNC; } - nfs_restart_rpc(task, server->nfs_client); +#ifdef CONFIG_NFS_V4_1 + data->pdata.pnfs_error = -EAGAIN; +#endif /* CONFIG_NFS_V4_1 */ + nfs_restart_rpc(task, clp); return -EAGAIN; } if (time_before(complain, jiffies)) { @@ -1228,40 +1311,73 @@ static void nfs_commitdata_release(void nfs_commit_free(wdata); } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_commit_rpcsetup(struct list_head *head, - struct nfs_write_data *data, - int how) +int nfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = first->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { .task = &data->task, - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .rpc_message = &msg, - .callback_ops = &nfs_commit_ops, + .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, .priority = priority, }; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->commit_setup(data, &msg); + + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL(nfs_initiate_commit); + + +int pnfs_initiate_commit(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how, int pnfs) +{ + if (pnfs && + (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) + return pnfs_get_write_status(data); + + return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_commit_rpcsetup(struct list_head *head, + struct nfs_write_data *data, + int how, int pnfs) +{ + struct nfs_page *first = nfs_list_entry(head->next); + struct inode *inode = first->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ list_splice_init(head, &data->pages); data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = first->wb_context->cred; data->args.fh = NFS_FH(data->inode); /* Note: we always request a commit of the entire inode */ @@ -1272,45 +1388,47 @@ static int nfs_commit_rpcsetup(struct li data->res.fattr = &data->fattr; data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); + kref_init(&data->refcount); + data->parent = NULL; + data->args.context = first->wb_context; /* used by commit done */ - /* Set up the initial task struct. 
*/ - NFS_PROTO(inode)->commit_setup(data, &msg); + return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, + how, pnfs); +} - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +/* Handle memory error during commit */ +void nfs_mark_list_commit(struct list_head *head) +{ + struct nfs_page *req; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_mark_request_commit(req); + dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); + dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + BDI_RECLAIMABLE); + nfs_clear_page_tag_locked(req); + } } +EXPORT_SYMBOL(nfs_mark_list_commit); /* * Commit dirty pages */ static int -nfs_commit_list(struct inode *inode, struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) { struct nfs_write_data *data; - struct nfs_page *req; data = nfs_commitdata_alloc(); - if (!data) goto out_bad; /* Set up the argument struct */ - return nfs_commit_rpcsetup(head, data, how); + return nfs_commit_rpcsetup(head, data, how, pnfs); out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_mark_request_commit(req); - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, - BDI_RECLAIMABLE); - nfs_clear_page_tag_locked(req); - } + nfs_mark_list_commit(head); nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1330,6 +1448,19 @@ static void nfs_commit_done(struct rpc_t return; } +static inline void nfs_commit_cleanup(struct kref *kref) +{ + struct nfs_write_data *data; + + data = container_of(kref, struct nfs_write_data, refcount); + /* Clear lock only when all cloned commits are finished */ + if (data->parent) + kref_put(&data->parent->refcount, nfs_commit_cleanup); + else + nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commitdata_release(data); +} + static void nfs_commit_release(void *calldata) { struct nfs_write_data *data = calldata; @@ -1347,6 +1478,11 @@ static void nfs_commit_release(void *cal req->wb_bytes, (long long)req_offset(req)); if (status < 0) { + if (req->wb_lseg) { + nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + goto next; + } nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); dprintk(", error = %d\n", status); @@ -1363,12 +1499,12 @@ static void nfs_commit_release(void *cal } /* We have a mismatch. 
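+	 * (The verifier returned by this COMMIT differs from the one the
+	 * server sent at WRITE time, so the unstable data may have been
+	 * lost; nfs_mark_request_nopnfs() below also drops the request's
+	 * layout binding so the rewrite goes through the MDS.)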
Write the page again */
 	dprintk(" mismatch\n");
+	nfs_mark_request_nopnfs(req);
 	nfs_mark_request_dirty(req);
 next:
 	nfs_clear_page_tag_locked(req);
 }
-	nfs_commit_clear_lock(NFS_I(data->inode));
-	nfs_commitdata_release(calldata);
+	kref_put(&data->refcount, nfs_commit_cleanup);
 }
 
 static const struct rpc_call_ops nfs_commit_ops = {
@@ -1384,21 +1520,22 @@ int nfs_commit_inode(struct inode
 	LIST_HEAD(head);
 	int may_wait = how & FLUSH_SYNC;
 	int res = 0;
+	int use_pnfs = 0;
 
 	if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
 		goto out_mark_dirty;
 	spin_lock(&inode->i_lock);
-	res = nfs_scan_commit(inode, &head, 0, 0);
+	res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs);
 	spin_unlock(&inode->i_lock);
 	if (res) {
-		int error = nfs_commit_list(inode, &head, how);
+		int error = nfs_commit_list(inode, &head, how, use_pnfs);
 		if (error < 0)
 			return error;
-		if (may_wait)
+		if (may_wait) {
 			wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
 					nfs_wait_bit_killable,
 					TASK_KILLABLE);
-		else
+		} else
 			goto out_mark_dirty;
 	} else
 		nfs_commit_clear_lock(NFS_I(inode));
@@ -1451,7 +1588,18 @@ static int nfs_commit_unstable_pages(str
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return nfs_commit_unstable_pages(inode, wbc);
+	int ret;
+	ret = nfs_commit_unstable_pages(inode, wbc);
+	if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) {
+		int err, sync = wbc->sync_mode;
+
+		if (wbc->nonblocking || wbc->for_background)
+			sync = 0;
+		err = pnfs_layoutcommit_inode(inode, sync);
+		if (err < 0)
+			ret = err;
+	}
+	return ret;
 }
 
 /*
@@ -1459,6 +1607,7 @@ int nfs_write_inode(struct inode,
  */
 int nfs_wb_all(struct inode *inode)
 {
+	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
@@ -1466,7 +1615,8 @@ int nfs_wb_all(struct inode *inode)
 		.range_end = LLONG_MAX,
 	};
 
-	return sync_inode(inode, &wbc);
+	ret = sync_inode(inode, &wbc);
+	return ret;
 }
 
 int nfs_wb_page_cancel(struct inode *inode, struct page *page)
diff -up linux-2.6.34.noarch/include/linux/exportfs.h.orig linux-2.6.34.noarch/include/linux/exportfs.h
--- linux-2.6.34.noarch/include/linux/exportfs.h.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/include/linux/exportfs.h	2010-09-30 10:17:09.002005000 -0400
@@ -2,6 +2,7 @@
 #define LINUX_EXPORTFS_H 1
 
 #include
+#include
 
 struct dentry;
 struct inode;
@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare
 	struct fid *fid, int fh_len, int fh_type,
 	struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen));
 
+#if defined(CONFIG_EXPORTFS_FILE_LAYOUT)
+struct pnfs_filelayout_device;
+struct pnfs_filelayout_layout;
+
+extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr,
+				     const struct pnfs_filelayout_device *fdev);
+extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr,
+					      const struct pnfs_filelayout_layout *flp);
+#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */
+
+#if defined(CONFIG_EXPORTFS_BLOCK_LAYOUT)
+struct list_head;
+
+extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr,
+				      const struct list_head *volumes);
+
+extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr,
+					       const struct list_head *layouts);
+#endif /* defined(CONFIG_EXPORTFS_BLOCK_LAYOUT) */
+
+#if defined(CONFIG_PNFSD)
+#include
+
+struct pnfsd_cb_operations;
+
+struct pnfsd_cb_ctl {
+	spinlock_t lock;
+	struct module *module;
+	const struct pnfsd_cb_operations *cb_op;
+};
+
+/* in expfs.c so that file systems can depend on it */
+extern struct pnfsd_cb_ctl pnfsd_cb_ctl;
+
+static inline int
+pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl)
+{
+	int ret = -ENOENT;
+
+	spin_lock(&pnfsd_cb_ctl.lock);
+	if (!pnfsd_cb_ctl.cb_op)
+		goto out;
+	if (!try_module_get(pnfsd_cb_ctl.module))
+		goto out;
+	ctl->cb_op = pnfsd_cb_ctl.cb_op;
+	ctl->module = pnfsd_cb_ctl.module;
+	ret = 0;
+out:
+	spin_unlock(&pnfsd_cb_ctl.lock);
+	return ret;
+}
+
+static inline void
+pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl)
+{
+	module_put(ctl->module);
+}
+#endif /* CONFIG_PNFSD */
 #endif /* LINUX_EXPORTFS_H */
diff -up linux-2.6.34.noarch/include/linux/exp_xdr.h.orig linux-2.6.34.noarch/include/linux/exp_xdr.h
--- linux-2.6.34.noarch/include/linux/exp_xdr.h.orig	2010-09-30 10:17:08.988005000 -0400
+++ linux-2.6.34.noarch/include/linux/exp_xdr.h	2010-09-30 10:17:08.990007000 -0400
@@ -0,0 +1,141 @@
+#ifndef _LINUX_EXP_XDR_H
+#define _LINUX_EXP_XDR_H
+
+#include
+#include
+#include
+
+struct exp_xdr_stream {
+	__be32 *p;
+	__be32 *end;
+};
+
+/**
+ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes
+ * @nbytes: number of bytes to encode
+ */
+static inline size_t
+exp_xdr_qwords(__u32 nbytes)
+{
+	return DIV_ROUND_UP(nbytes, 4);
+}
+
+/**
+ * exp_xdr_qbytes - Calculate the number of bytes holding qwords
+ * @qwords: number of quad-words to encode
+ */
+static inline size_t
+exp_xdr_qbytes(size_t qwords)
+{
+	return qwords << 2;
+}
+
+/**
+ * exp_xdr_reserve_space - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @nbytes: number of bytes to reserve
+ *
+ * Checks that we have enough buffer space to encode 'nbytes' more
+ * bytes of data. If so, update the xdr stream.
+ */
+static inline __be32 *
+exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes)
+{
+	__be32 *p = xdr->p;
+	__be32 *q;
+
+	/* align nbytes on the next 32-bit boundary */
+	q = p + exp_xdr_qwords(nbytes);
+	if (unlikely(q > xdr->end || q < p))
+		return NULL;
+	xdr->p = q;
+	return p;
+}
+
+/**
+ * exp_xdr_reserve_qwords - Reserve buffer space for sending
+ * @xdr: pointer to exp_xdr_stream
+ * @qwords: number of quad words (u32's) to reserve
+ */
+static inline __be32 *
+exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords)
+{
+	return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords));
+}
+
+/**
+ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ */
+static inline __be32 *
+exp_xdr_encode_u32(__be32 *p, __u32 val)
+{
+	*p = cpu_to_be32(val);
+	return p + 1;
+}
+
+/**
+ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream
+ * @p: pointer to encoding destination
+ * @val: value to encode
+ */
+static inline __be32 *
+exp_xdr_encode_u64(__be32 *p, __u64 val)
+{
+	put_unaligned_be64(val, p);
+	return p + 2;
+}
+
+/**
+ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the array of bytes
+ * @nbytes: number of bytes to encode
+ */
+static inline __be32 *
+exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes)
+{
+	if (likely(nbytes != 0)) {
+		unsigned int qwords = exp_xdr_qwords(nbytes);
+		unsigned int padding = exp_xdr_qbytes(qwords) - nbytes;
+
+		memcpy(p, ptr, nbytes);
+		if (padding != 0)
+			memset((char *)p + nbytes, 0, padding);
+		p += qwords;
+	}
+	return p;
+}
+
+/**
+ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream
+ * @p: pointer to encoding destination
+ * @ptr: pointer to the opaque array
+ * @nbytes: number of bytes to encode
+ *
+ * Encodes the 32-bit opaque size in bytes followed by the opaque value.
+ */
+static inline __be32 *
+exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes)
+{
+	p = exp_xdr_encode_u32(p, nbytes);
+	return exp_xdr_encode_bytes(p, ptr, nbytes);
+}
+
+/**
+ * exp_xdr_encode_opaque_len - Encode the opaque length onto a xdr stream
+ * @lenp: pointer to the opaque length destination
+ * @endp: pointer to the end of the opaque array
+ *
+ * Encodes the 32-bit opaque size in bytes given the start and end pointers
+ */
+static inline __be32 *
+exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp)
+{
+	size_t nbytes = (char *)endp - (char *)(lenp + 1);
+
+	exp_xdr_encode_u32(lenp, nbytes);
+	return lenp + 1 + exp_xdr_qwords(nbytes);
+}
+#endif /* _LINUX_EXP_XDR_H */
diff -up linux-2.6.34.noarch/include/linux/fs.h.orig linux-2.6.34.noarch/include/linux/fs.h
--- linux-2.6.34.noarch/include/linux/fs.h.orig	2010-09-30 10:15:16.980690000 -0400
+++ linux-2.6.34.noarch/include/linux/fs.h	2010-09-30 10:17:09.015004000 -0400
@@ -387,6 +387,7 @@ struct inodes_stat_t {
 #include
 
 struct export_operations;
+struct pnfs_export_operations;
 struct hd_geometry;
 struct iovec;
 struct nameidata;
@@ -1329,6 +1330,7 @@ struct super_block {
 	const struct dquot_operations *dq_op;
 	const struct quotactl_ops *s_qcop;
 	const struct export_operations *s_export_op;
+	const struct pnfs_export_operations *s_pnfs_op;
 	unsigned long s_flags;
 	unsigned long s_magic;
 	struct dentry *s_root;
diff -up linux-2.6.34.noarch/include/linux/nfs4.h.orig linux-2.6.34.noarch/include/linux/nfs4.h
--- linux-2.6.34.noarch/include/linux/nfs4.h.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/include/linux/nfs4.h	2010-09-30 10:17:09.047005000 -0400
@@ -17,7 +17,10 @@
 #define NFS4_BITMAP_SIZE	2
 #define NFS4_VERIFIER_SIZE	8
-#define NFS4_STATEID_SIZE	16
+#define NFS4_CLIENTID_SIZE	8
+#define NFS4_STATEID_SEQID_SIZE	4
+#define NFS4_STATEID_OTHER_SIZE	12
+#define NFS4_STATEID_SIZE	(NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE)
 #define NFS4_FHSIZE		128
 #define NFS4_MAXPATHLEN		PATH_MAX
 #define NFS4_MAXNAMLEN		NAME_MAX
@@ -119,6 +122,13 @@
 #define EXCHGID4_FLAG_MASK_A			0x40070003
 #define EXCHGID4_FLAG_MASK_R			0x80070003
 
+static inline bool
+is_ds_only_session(u32 exchange_flags)
+{
+	u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS;
+	return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS;
+}
+
 #define SEQ4_STATUS_CB_PATH_DOWN		0x00000001
 #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING	0x00000002
 #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED	0x00000004
@@ -166,8 +176,25 @@ struct nfs4_acl {
 	struct nfs4_ace	aces[0];
 };
 
+struct nfs4_fsid {
+	u64 major;
+	u64 minor;
+};
+
 typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier;
-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid;
+typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid;
+
+struct nfs41_stateid {
+	__be32 seqid;
+	char other[NFS4_STATEID_OTHER_SIZE];
+} __attribute__ ((packed));
+
+typedef struct {
+	union {
+		char data[NFS4_STATEID_SIZE];
+		struct nfs41_stateid stateid;
+	} u;
+} nfs4_stateid;
 
 enum nfs_opnum4 {
 	OP_ACCESS = 3,
@@ -471,6 +498,8 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_MODIFY	(1UL << 21)
 #define FATTR4_WORD1_TIME_MODIFY_SET	(1UL << 22)
 #define FATTR4_WORD1_MOUNTED_ON_FILEID	(1UL << 23)
+#define FATTR4_WORD1_FS_LAYOUT_TYPES	(1UL << 30)
+#define FATTR4_WORD2_LAYOUT_BLKSIZE	(1UL << 1)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
@@ -523,6 +552,7 @@ enum {
 	NFSPROC4_CLNT_GETACL,
 	NFSPROC4_CLNT_SETACL,
NFSPROC4_CLNT_FS_LOCATIONS, + NFSPROC4_CLNT_RELEASE_LOCKOWNER, /* nfs41 */ NFSPROC4_CLNT_EXCHANGE_ID, @@ -531,6 +561,13 @@ enum { NFSPROC4_CLNT_SEQUENCE, NFSPROC4_CLNT_GET_LEASE_TIME, NFSPROC4_CLNT_RECLAIM_COMPLETE, + NFSPROC4_CLNT_LAYOUTGET, + NFSPROC4_CLNT_LAYOUTCOMMIT, + NFSPROC4_CLNT_LAYOUTRETURN, + NFSPROC4_CLNT_GETDEVICELIST, + NFSPROC4_CLNT_GETDEVICEINFO, + NFSPROC4_CLNT_PNFS_WRITE, + NFSPROC4_CLNT_PNFS_COMMIT, }; /* nfs41 types */ @@ -549,6 +586,43 @@ enum state_protect_how4 { SP4_SSV = 2 }; +enum pnfs_layouttype { + LAYOUT_NFSV4_1_FILES = 1, + LAYOUT_OSD2_OBJECTS = 2, + LAYOUT_BLOCK_VOLUME = 3, +}; + +/* used for both layout return and recall */ +enum pnfs_layoutreturn_type { + RETURN_FILE = 1, + RETURN_FSID = 2, + RETURN_ALL = 3 +}; + +enum pnfs_iomode { + IOMODE_READ = 1, + IOMODE_RW = 2, + IOMODE_ANY = 3, +}; + +enum pnfs_notify_deviceid_type4 { + NOTIFY_DEVICEID4_CHANGE = 1 << 1, + NOTIFY_DEVICEID4_DELETE = 1 << 2, +}; + +#define NFL4_UFLG_MASK 0x0000003F +#define NFL4_UFLG_DENSE 0x00000001 +#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 +#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 + +/* Encoded in the loh_body field of type layouthint4 */ +enum filelayout_hint_care4 { + NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, + NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, + NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, + NFLH4_CARE_STRIPE_COUNT = 0x00000080 +}; + #endif #endif diff -up linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfs4_pnfs.h --- linux-2.6.34.noarch/include/linux/nfs4_pnfs.h.orig 2010-09-30 10:17:09.057007000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs4_pnfs.h 2010-09-30 10:17:09.059005000 -0400 @@ -0,0 +1,329 @@ +/* + * include/linux/nfs4_pnfs.h + * + * Common data structures needed by the pnfs client and pnfs layout driver. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Dean Hildebrand + */ + +#ifndef LINUX_NFS4_PNFS_H +#define LINUX_NFS4_PNFS_H + +#include + +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + +#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 + +/* Per-layout driver specific registration structure */ +struct pnfs_layoutdriver_type { + const u32 id; + const char *name; + struct layoutdriver_io_operations *ld_io_ops; + struct layoutdriver_policy_operations *ld_policy_ops; +}; + +struct pnfs_fsdata { + int bypass_eof; + struct pnfs_layout_segment *lseg; + void *private; +}; + +#if defined(CONFIG_NFS_V4_1) + +static inline struct nfs_inode * +PNFS_NFS_INODE(struct pnfs_layout_hdr *lo) +{ + return NFS_I(lo->inode); +} + +static inline struct inode * +PNFS_INODE(struct pnfs_layout_hdr *lo) +{ + return lo->inode; +} + +static inline struct nfs_server * +PNFS_NFS_SERVER(struct pnfs_layout_hdr *lo) +{ + return NFS_SERVER(PNFS_INODE(lo)); +} + +static inline struct pnfs_layoutdriver_type * +PNFS_LD(struct pnfs_layout_hdr *lo) +{ + return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; +} + +static inline struct layoutdriver_io_operations * +PNFS_LD_IO_OPS(struct pnfs_layout_hdr *lo) +{ + return PNFS_LD(lo)->ld_io_ops; +} + +static inline struct layoutdriver_policy_operations * +PNFS_LD_POLICY_OPS(struct pnfs_layout_hdr *lo) +{ + return PNFS_LD(lo)->ld_policy_ops; +} + +static inline bool +has_layout(struct nfs_inode *nfsi) +{ + return nfsi->layout != NULL; +} + +static inline bool +layoutcommit_needed(struct nfs_inode *nfsi) +{ + return has_layout(nfsi) && + test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); +} + +extern void put_lseg(struct pnfs_layout_segment *lseg); +extern void get_lseg(struct pnfs_layout_segment *lseg); + +#else /* CONFIG_NFS_V4_1 */ + +static inline bool +has_layout(struct nfs_inode *nfsi) +{ + return false; +} + +static inline bool +layoutcommit_needed(struct nfs_inode *nfsi) +{ + return 0; +} + +#endif /* CONFIG_NFS_V4_1 */ + +struct pnfs_layout_segment { + struct list_head fi_list; + struct pnfs_layout_range range; + struct kref kref; + bool valid; + struct pnfs_layout_hdr *layout; + struct nfs4_deviceid *deviceid; + u8 ld_data[]; /* layout driver private data */ +}; + +static inline void * +LSEG_LD_DATA(struct pnfs_layout_segment *lseg) +{ + return lseg->ld_data; +} + +/* Layout driver I/O operations. + * Either the pagecache or non-pagecache read/write operations must be implemented + */ +struct layoutdriver_io_operations { + /* Functions that use the pagecache. + * If use_pagecache == 1, then these functions must be implemented. + */ + /* read and write pagelist should return just 0 (to indicate that + * the layout code has taken control) or 1 (to indicate that the + * layout code wishes to fall back to normal nfs.) If 0 is returned, + * information can be passed back through nfs_data->res and + * nfs_data->task.tk_status, and the appropriate pnfs done function + * MUST be called. 
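+	 * A minimal sketch of the expected shape (driver_can_handle() is a
+	 * hypothetical helper, not part of this API):
+	 *
+	 *	if (!driver_can_handle(nfs_data))
+	 *		return PNFS_NOT_ATTEMPTED;	- fall back to normal NFS
+	 *	...issue the driver I/O and arrange for the done callback...
+	 *	return PNFS_ATTEMPTED;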
+ */
+	enum pnfs_try_status
+	(*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages);
+	enum pnfs_try_status
+	(*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how);
+	int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page,
+			    loff_t pos, unsigned count,
+			    struct pnfs_fsdata *fsdata);
+	int (*write_end)(struct inode *inode, struct page *page, loff_t pos,
+			 unsigned count, unsigned copied,
+			 struct pnfs_layout_segment *lseg);
+	void (*write_end_cleanup)(struct file *filp,
+				  struct pnfs_fsdata *fsdata);
+
+	/* Consistency ops */
+	/* 2 problems:
+	 * 1) the page list contains nfs_pages, NOT pages
+	 * 2) currently the NFS code doesn't create a page array (as it does with read/write)
+	 */
+	enum pnfs_try_status
+	(*commit) (struct nfs_write_data *nfs_data, int how);
+
+	/* Layout information. For each inode, alloc_layout is executed once to retrieve an
+	 * inode specific layout structure.  Each subsequent layoutget operation results in
+	 * a set_layout call to set the opaque layout in the layout driver.*/
+	struct pnfs_layout_hdr * (*alloc_layout) (struct inode *inode);
+	void (*free_layout) (struct pnfs_layout_hdr *);
+	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
+	void (*free_lseg) (struct pnfs_layout_segment *lseg);
+
+	int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+				   struct nfs4_layoutcommit_args *args);
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutcommit_args *args);
+	void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+				      struct nfs4_layoutcommit_args *args,
+				      int status);
+	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+				     struct xdr_stream *xdr,
+				     const struct nfs4_layoutreturn_args *args);
+
+	/* Registration information for a new mounted file system
+	 */
+	int (*initialize_mountpoint) (struct nfs_server *,
+				      const struct nfs_fh * mntfh);
+	int (*uninitialize_mountpoint) (struct nfs_server *server);
+};
+
+enum layoutdriver_policy_flags {
+	/* Should the full nfs rpc cleanup code be used after io */
+	PNFS_USE_RPC_CODE		= 1 << 0,
+
+	/* Should the NFS req. gather algorithm cross stripe boundaries? */
+	PNFS_GATHER_ACROSS_STRIPES	= 1 << 1,
+
+	/* Should the pNFS client commit and return the layout upon a setattr */
+	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 3,
+};
+
+struct layoutdriver_policy_operations {
+	unsigned flags;
+
+	/* The stripe size of the file system */
+	ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid);
+
+	/* test for nfs page cache coalescing */
+	int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+
+	/* Retrieve the block size of the file system.
+	 * If gather_across_stripes == 1, then the file system will gather
+	 * requests into the block size.
+	 * TODO: Where will the layout driver get this info?  It is hard
+	 * coded in PVFS2.
+	 */
+	ssize_t (*get_blocksize) (void);
+};
+
+/* Should the full nfs rpc cleanup code be used after io */
+static inline int
+pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld)
+{
+	return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE;
+}
+
+/* Should the NFS req. gather algorithm cross stripe boundaries?
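+ * (When the flag is clear, page coalescing stops at each stripe boundary
+ * so a single request does not span data servers.)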
*/ +static inline int +pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) +{ + return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; +} + +struct pnfs_device { + struct pnfs_deviceid dev_id; + unsigned int layout_type; + unsigned int mincount; + struct page **pages; + void *area; + unsigned int pgbase; + unsigned int pglen; + unsigned int dev_notify_types; +}; + +struct pnfs_devicelist { + unsigned int eof; + unsigned int num_devs; + struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; +}; + +/* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_DEVICE_ID_HASH_BITS 5 +#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) +#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_deviceid_hash(struct pnfs_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_DEVICE_ID_HASH_MASK; +} + +struct nfs4_deviceid_cache { + spinlock_t dc_lock; + struct kref dc_kref; + void (*dc_free_callback)(struct kref *); + struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; +}; + +/* Device ID cache node */ +struct nfs4_deviceid { + struct hlist_node de_node; + struct pnfs_deviceid de_id; + struct kref de_kref; +}; + +extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, + void (*free_callback)(struct kref *)); +extern void nfs4_put_deviceid_cache(struct nfs_client *); +extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); +extern struct nfs4_deviceid *nfs4_find_get_deviceid( + struct nfs4_deviceid_cache *, + struct pnfs_deviceid *); +extern struct nfs4_deviceid *nfs4_add_get_deviceid(struct nfs4_deviceid_cache *, + struct nfs4_deviceid *); +extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, + struct nfs4_deviceid *); +extern void nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *, + struct nfs4_deviceid *, + void (*free_callback)(struct kref *)); +extern void nfs4_delete_device(struct nfs4_deviceid_cache *, + struct pnfs_deviceid *); + +/* pNFS client callback functions. + * These operations allow the layout driver to access pNFS client + * specific information or call pNFS client->server operations. + * E.g., getdeviceinfo, I/O callbacks, etc + */ +struct pnfs_client_operations { + int (*nfs_getdevicelist) (struct nfs_server *, + const struct nfs_fh *fh, + struct pnfs_devicelist *devlist); + int (*nfs_getdeviceinfo) (struct nfs_server *, + struct pnfs_device *dev); + + /* Post read callback. */ + void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); + + /* Post write callback. */ + void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); + + /* Post commit callback. 
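+	 * Invoked by the layout driver when its commit processing finishes,
+	 * mirroring the read and write completion callbacks above.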
*/ + void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); + void (*nfs_return_layout) (struct inode *); +}; + +extern struct pnfs_client_operations pnfs_ops; + +extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); +extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); + +#define NFS4_PNFS_MAX_LAYOUTS 4 +#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 + +#endif /* LINUX_NFS4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig linux-2.6.34.noarch/include/linux/nfsd4_block.h --- linux-2.6.34.noarch/include/linux/nfsd4_block.h.orig 2010-09-30 10:17:09.178011000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_block.h 2010-09-30 10:17:09.180010000 -0400 @@ -0,0 +1,101 @@ +#ifndef NFSD4_BLOCK +#define NFSD4_BLOCK + +#include +#include +#include +#include + +#define PNFS_BLOCK_SUCCESS 1 +#define PNFS_BLOCK_FAILURE 0 + +#define PNFS_BLOCK_CTL_START 1 +#define PNFS_BLOCK_CTL_STOP 2 +#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current + * version from kernel via an upcall. + */ + +#define PNFS_UPCALL_MSG_STOP 0 +#define PNFS_UPCALL_MSG_GETSIG 1 +#define PNFS_UPCALL_MSG_GETSLICE 2 +#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume +#define PNFS_UPCALL_MSG_DMGET 4 +#define PNFS_UPCALL_MSG_VERS 5 + +#define PNFS_UPCALL_VERS 8 + +typedef struct stripe_dev { + int major, + minor, + offset; +} stripe_dev_t; + +typedef struct bl_comm_res { + int res_status; + union { + struct { + long long start, + length; + } slice; + struct { + int num_stripes, + stripe_size; + stripe_dev_t devs[]; + } stripe; + struct { + long long sector; + int offset, + len; + char sig[]; + } sig; + int vers, + dm_vol; + } u; +} bl_comm_res_t; + +typedef struct bl_comm_msg { + int msg_type, + msg_status; + union { + dev_t msg_dev; + int msg_vers; + } u; + bl_comm_res_t *msg_res; +} bl_comm_msg_t; + +#ifdef __KERNEL__ + +typedef struct bl_comm { + /* ---- protects access to this structure ---- */ + struct mutex lock; + /* ---- protects access to rpc pipe ---- */ + struct mutex pipe_lock; + struct dentry *pipe_dentry; + wait_queue_head_t pipe_wq; + bl_comm_msg_t msg; +} bl_comm_t; + +int pnfs_block_enabled(struct inode *, int); +int bl_layout_type(struct super_block *sb); +int bl_getdeviceiter(struct super_block *, u32 layout_type, + struct nfsd4_pnfs_dev_iter_res *); +int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, + u32 layout_type, + const struct nfsd4_pnfs_deviceid *); +enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, + const struct nfsd4_pnfs_layoutget_arg *, + struct nfsd4_pnfs_layoutget_res *); +int bl_layoutcommit(struct inode *, + const struct nfsd4_pnfs_layoutcommit_arg *, + struct nfsd4_pnfs_layoutcommit_res *); +int bl_layoutreturn(struct inode *, + const struct nfsd4_pnfs_layoutreturn_arg *); +int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); +int bl_init_proc(void); +int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **); + +extern bl_comm_t *bl_comm_global; // Ugly... 
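+
+/*
+ * Rough usage sketch: callers build a bl_comm_msg_t with one of the
+ * PNFS_UPCALL_MSG_* types and pass it to bl_upcall(), which relays it
+ * to the userspace block layout daemon over the rpc pipe and returns
+ * the daemon's bl_comm_res_t.
+ */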
+#endif /* __KERNEL__ */ + +#endif /* NFSD4_BLOCK */ + diff -up linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h --- linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h.orig 2010-09-30 10:17:09.190013000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd4_spnfs.h 2010-09-30 10:17:09.192012000 -0400 @@ -0,0 +1,345 @@ +/* + * include/linux/nfsd4_spnfs.h + * + * spNFS - simple pNFS implementation with userspace daemon + * + */ + +/****************************************************************************** + +(c) 2007 Network Appliance, Inc. All Rights Reserved. + +Network Appliance provides this source code under the GPL v2 License. +The GPL v2 license is available at +http://opensource.org/licenses/gpl-license.php. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +******************************************************************************/ + +#ifndef NFS_SPNFS_H +#define NFS_SPNFS_H + + +#ifdef __KERNEL__ +#include "exportfs.h" +#include "sunrpc/svc.h" +#include "nfsd/nfsfh.h" +#else +#include +#endif /* __KERNEL__ */ + +#define SPNFS_STATUS_INVALIDMSG 0x01 +#define SPNFS_STATUS_AGAIN 0x02 +#define SPNFS_STATUS_FAIL 0x04 +#define SPNFS_STATUS_SUCCESS 0x08 + +#define SPNFS_TYPE_LAYOUTGET 0x01 +#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 +#define SPNFS_TYPE_LAYOUTRETURN 0x03 +#define SPNFS_TYPE_GETDEVICEITER 0x04 +#define SPNFS_TYPE_GETDEVICEINFO 0x05 +#define SPNFS_TYPE_SETATTR 0x06 +#define SPNFS_TYPE_OPEN 0x07 +#define SPNFS_TYPE_CLOSE 0x08 +#define SPNFS_TYPE_CREATE 0x09 +#define SPNFS_TYPE_REMOVE 0x0a +#define SPNFS_TYPE_COMMIT 0x0b +#define SPNFS_TYPE_READ 0x0c +#define SPNFS_TYPE_WRITE 0x0d + +#define SPNFS_MAX_DEVICES 1 +#define SPNFS_MAX_DATA_SERVERS 16 +#define SPNFS_MAX_IO 512 + +/* layout */ +struct spnfs_msg_layoutget_args { + unsigned long inode; + unsigned long generation; +}; + +struct spnfs_filelayout_list { + u_int32_t fh_len; + unsigned char fh_val[128]; /* DMXXX fix this const */ +}; + +struct spnfs_msg_layoutget_res { + int status; + u_int64_t devid; + u_int64_t stripe_size; + u_int32_t stripe_type; + u_int32_t stripe_count; + struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; +}; + +/* layoutcommit */ +struct spnfs_msg_layoutcommit_args { + unsigned long inode; + unsigned long generation; + u_int64_t file_size; +}; + +struct spnfs_msg_layoutcommit_res { + int status; +}; + +/* layoutreturn */ +/* No op for the daemon */ +/* +struct spnfs_msg_layoutreturn_args { +}; + +struct spnfs_msg_layoutreturn_res { +}; +*/ + +/* getdeviceiter */ +struct spnfs_msg_getdeviceiter_args { + unsigned long inode; + u_int64_t cookie; + u_int64_t verf; +}; + +struct spnfs_msg_getdeviceiter_res { + int status; + u_int64_t devid; + u_int64_t cookie; + u_int64_t verf; + u_int32_t eof; +}; + +/* getdeviceinfo */ +struct 
spnfs_data_server { + u_int32_t dsid; + char netid[5]; + char addr[29]; +}; + +struct spnfs_device { + u_int64_t devid; + int dscount; + struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; +}; + +struct spnfs_msg_getdeviceinfo_args { + u_int64_t devid; +}; + +struct spnfs_msg_getdeviceinfo_res { + int status; + struct spnfs_device devinfo; +}; + +/* setattr */ +struct spnfs_msg_setattr_args { + unsigned long inode; + unsigned long generation; + int file_size; +}; + +struct spnfs_msg_setattr_res { + int status; +}; + +/* open */ +struct spnfs_msg_open_args { + unsigned long inode; + unsigned long generation; + int create; + int createmode; + int truncate; +}; + +struct spnfs_msg_open_res { + int status; +}; + +/* close */ +/* No op for daemon */ +struct spnfs_msg_close_args { + int x; +}; + +struct spnfs_msg_close_res { + int y; +}; + +/* create */ +/* +struct spnfs_msg_create_args { + int x; +}; + +struct spnfs_msg_create_res { + int y; +}; +*/ + +/* remove */ +struct spnfs_msg_remove_args { + unsigned long inode; + unsigned long generation; +}; + +struct spnfs_msg_remove_res { + int status; +}; + +/* commit */ +/* +struct spnfs_msg_commit_args { + int x; +}; + +struct spnfs_msg_commit_res { + int y; +}; +*/ + +/* read */ +struct spnfs_msg_read_args { + unsigned long inode; + unsigned long generation; + loff_t offset; + unsigned long len; +}; + +struct spnfs_msg_read_res { + int status; + char data[SPNFS_MAX_IO]; +}; + +/* write */ +struct spnfs_msg_write_args { + unsigned long inode; + unsigned long generation; + loff_t offset; + unsigned long len; + char data[SPNFS_MAX_IO]; +}; + +struct spnfs_msg_write_res { + int status; +}; + +/* bundle args and responses */ +union spnfs_msg_args { + struct spnfs_msg_layoutget_args layoutget_args; + struct spnfs_msg_layoutcommit_args layoutcommit_args; +/* + struct spnfs_msg_layoutreturn_args layoutreturn_args; +*/ + struct spnfs_msg_getdeviceiter_args getdeviceiter_args; + struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; + struct spnfs_msg_setattr_args setattr_args; + struct spnfs_msg_open_args open_args; + struct spnfs_msg_close_args close_args; +/* + struct spnfs_msg_create_args create_args; +*/ + struct spnfs_msg_remove_args remove_args; +/* + struct spnfs_msg_commit_args commit_args; +*/ + struct spnfs_msg_read_args read_args; + struct spnfs_msg_write_args write_args; +}; + +union spnfs_msg_res { + struct spnfs_msg_layoutget_res layoutget_res; + struct spnfs_msg_layoutcommit_res layoutcommit_res; +/* + struct spnfs_msg_layoutreturn_res layoutreturn_res; +*/ + struct spnfs_msg_getdeviceiter_res getdeviceiter_res; + struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; + struct spnfs_msg_setattr_res setattr_res; + struct spnfs_msg_open_res open_res; + struct spnfs_msg_close_res close_res; +/* + struct spnfs_msg_create_res create_res; +*/ + struct spnfs_msg_remove_res remove_res; +/* + struct spnfs_msg_commit_res commit_res; +*/ + struct spnfs_msg_read_res read_res; + struct spnfs_msg_write_res write_res; +}; + +/* a spnfs message, args and response */ +struct spnfs_msg { + unsigned char im_type; + unsigned char im_status; + union spnfs_msg_args im_args; + union spnfs_msg_res im_res; +}; + +/* spnfs configuration info */ +struct spnfs_config { + unsigned char dense_striping; + int stripe_size; + int num_ds; + char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ +}; + +#if defined(__KERNEL__) && defined(CONFIG_SPNFS) + +#include + +/* pipe mgmt structure. 
messages flow through here */ +struct spnfs { + struct dentry *spnfs_dentry; /* dentry for pipe */ + wait_queue_head_t spnfs_wq; + struct spnfs_msg spnfs_im; /* spnfs message */ + struct mutex spnfs_lock; /* Serializes upcalls */ + struct mutex spnfs_plock; +}; + +struct nfsd4_open; + +int spnfs_layout_type(struct super_block *); +enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_layoutget_arg *, + struct nfsd4_pnfs_layoutget_res *); +int spnfs_layoutcommit(void); +int spnfs_layoutreturn(struct inode *, + const struct nfsd4_pnfs_layoutreturn_arg *); +int spnfs_getdeviceiter(struct super_block *, + u32 layout_type, + struct nfsd4_pnfs_dev_iter_res *); +int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, + u32 layout_type, + const struct nfsd4_pnfs_deviceid *); +int spnfs_setattr(void); +int spnfs_open(struct inode *, struct nfsd4_open *); +int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); +int spnfs_remove(unsigned long, unsigned long); +__be32 spnfs_read(struct inode *, loff_t, unsigned long *, + int, struct svc_rqst *); +__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); +int spnfs_getfh(int, struct nfs_fh *); +int spnfs_test_layoutrecall(char *, u64, u64); +int spnfs_layoutrecall(struct inode *, int, u64, u64); + +int nfsd_spnfs_new(void); +void nfsd_spnfs_delete(void); +int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); +int spnfs_enabled(void); +int spnfs_init_proc(void); + +extern struct spnfs_config *spnfs_config; + +#endif /* __KERNEL__ && CONFIG_SPNFS */ + +#endif /* NFS_SPNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/const.h.orig linux-2.6.34.noarch/include/linux/nfsd/const.h --- linux-2.6.34.noarch/include/linux/nfsd/const.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/const.h 2010-09-30 10:17:09.139009000 -0400 @@ -29,6 +29,7 @@ #ifdef __KERNEL__ #include +#include /* * Largest number of bytes we need to allocate for an NFS diff -up linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig linux-2.6.34.noarch/include/linux/nfsd/debug.h --- linux-2.6.34.noarch/include/linux/nfsd/debug.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/debug.h 2010-09-30 10:17:09.144010000 -0400 @@ -32,6 +32,8 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 #define NFSDDBG_LOCKD 0x0200 +#define NFSDDBG_PNFS 0x0400 +#define NFSDDBG_FILELAYOUT 0x0800 #define NFSDDBG_ALL 0x7FFF #define NFSDDBG_NOCHANGE 0xFFFF diff -up linux-2.6.34.noarch/include/linux/nfsd/export.h.orig linux-2.6.34.noarch/include/linux/nfsd/export.h --- linux-2.6.34.noarch/include/linux/nfsd/export.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/export.h 2010-09-30 10:17:09.149010000 -0400 @@ -100,6 +100,7 @@ struct svc_export { uid_t ex_anon_uid; gid_t ex_anon_gid; int ex_fsid; + int ex_pnfs; unsigned char * ex_uuid; /* 16 byte fsid */ struct nfsd4_fs_locations ex_fslocs; int ex_nflavors; diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h --- linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-09-30 10:17:09.153006000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-09-30 10:17:09.154012000 -0400 @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef NFSD_NFS4LAYOUTXDR_H +#define NFSD_NFS4LAYOUTXDR_H + +#include +#include + +/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ +struct pnfs_filelayout_devaddr { + struct xdr_netobj r_netid; + struct xdr_netobj r_addr; +}; + +/* list of multipath servers */ +struct pnfs_filelayout_multipath { + u32 fl_multipath_length; + struct pnfs_filelayout_devaddr *fl_multipath_list; +}; + +struct pnfs_filelayout_device { + u32 fl_stripeindices_length; + u32 *fl_stripeindices_list; + u32 fl_device_length; + struct pnfs_filelayout_multipath *fl_device_list; +}; + +struct pnfs_filelayout_layout { + u32 lg_layout_type; /* response */ + u32 lg_stripe_type; /* response */ + u32 lg_commit_through_mds; /* response */ + u64 lg_stripe_unit; /* response */ + u64 lg_pattern_offset; /* response */ + u32 lg_first_stripe_index; /* response */ + struct nfsd4_pnfs_deviceid device_id; /* response */ + u32 lg_fh_length; /* response */ + struct knfsd_fh *lg_fh_list; /* response */ +}; + +enum stripetype4 { + STRIPE_SPARSE = 1, + STRIPE_DENSE = 2 +}; + +enum pnfs_block_extent_state4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, + PNFS_BLOCK_NONE_DATA = 3 +}; + +enum pnfs_block_volume_type4 { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; +typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; + +enum bl_cache_state { + BLOCK_LAYOUT_NEW = 0, + BLOCK_LAYOUT_CACHE = 1, + BLOCK_LAYOUT_UPDATE = 2, +}; + +typedef struct pnfs_blocklayout_layout { + struct list_head bll_list; + struct nfsd4_pnfs_deviceid bll_vol_id; + u64 bll_foff; // file offset + u64 bll_len; + u64 bll_soff; // storage offset + int bll_recalled; + enum pnfs_block_extent_state4 bll_es; + enum bl_cache_state bll_cache_state; +} pnfs_blocklayout_layout_t; + +typedef struct pnfs_blocklayout_devinfo { + struct list_head bld_list; + pnfs_block_volume_type4 bld_type; + struct nfsd4_pnfs_deviceid bld_devid; + int bld_index_loc; + union { + 
struct { + u64 bld_offset; + u32 bld_sig_len, + *bld_sig; + } simple; + struct { + u64 bld_start, + bld_len; + u32 bld_index; /* Index of Simple Volume */ + } slice; + struct { + u32 bld_stripes; + u64 bld_chunk_size; + u32 *bld_stripe_indexs; + } stripe; + } u; +} pnfs_blocklayout_devinfo_t; + +#endif /* NFSD_NFS4LAYOUTXDR_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h --- linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-09-30 10:17:09.157010000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-09-30 10:17:09.159008000 -0400 @@ -0,0 +1,54 @@ +/****************************************************************************** + * + * (c) 2007 Network Appliance, Inc. All Rights Reserved. + * (c) 2009 NetApp. All Rights Reserved. + * + * NetApp provides this source code under the GPL v2 License. + * The GPL v2 license is available at + * http://opensource.org/licenses/gpl-license.php. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#include + +/* + * Length of comma separated pnfs data server IPv4 addresses. Enough room for + * 32 addresses. + */ +#define NFSD_DLM_DS_LIST_MAX 512 +/* + * Length of colon separated pnfs dlm device of the form + * disk_name:comma separated data server IPv4 address + */ +#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) + +#ifdef CONFIG_PNFSD + +/* For use by DLM cluster file systems exported by pNFSD */ +extern const struct pnfs_export_operations pnfs_dlm_export_ops; + +int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); + +void nfsd4_pnfs_dlm_shutdown(void); + +ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); + +#else /* CONFIG_PNFSD */ + +static inline void nfsd4_pnfs_dlm_shutdown(void) +{ + return; +} + +#endif /* CONFIG_PNFSD */ diff -up linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h --- linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-09-30 10:17:09.162007000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-09-30 10:17:09.163012000 -0400 @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _LINUX_NFSD_NFSD4_PNFS_H +#define _LINUX_NFSD_NFSD4_PNFS_H + +#include +#include +#include + +struct nfsd4_pnfs_deviceid { + u64 sbid; /* per-superblock unique ID */ + u64 devid; /* filesystem-wide unique device ID */ +}; + +struct nfsd4_pnfs_dev_iter_res { + u64 gd_cookie; /* request/response */ + u64 gd_verf; /* request/response */ + u64 gd_devid; /* response */ + u32 gd_eof; /* response */ +}; + +/* Arguments for set_device_notify */ +struct pnfs_devnotify_arg { + struct nfsd4_pnfs_deviceid dn_devid; /* request */ + u32 dn_layout_type; /* request */ + u32 dn_notify_types; /* request/response */ +}; + +struct nfsd4_layout_seg { + u64 clientid; + u32 layout_type; + u32 iomode; + u64 offset; + u64 length; +}; + +/* Used by layout_get to encode layout (loc_body var in spec) + * Args: + * minlength - min number of accessible bytes given by layout + * fsid - Major part of struct pnfs_deviceid. File system uses this + * to build the deviceid returned in the layout. 
+ * fh - fs can modify the file handle for use on data servers + * seg - layout info requested and layout info returned + * xdr - xdr info + * return_on_close - true if layout to be returned on file close + */ + +struct nfsd4_pnfs_layoutget_arg { + u64 lg_minlength; + u64 lg_sbid; + const struct knfsd_fh *lg_fh; +}; + +struct nfsd4_pnfs_layoutget_res { + struct nfsd4_layout_seg lg_seg; /* request/response */ + u32 lg_return_on_close; +}; + +struct nfsd4_pnfs_layoutcommit_arg { + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct nfstime4 lc_mtime; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ +}; + +struct nfsd4_pnfs_layoutcommit_res { + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ + +struct nfsd4_pnfs_layoutreturn_arg { + u32 lr_return_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + void *lr_cookie; /* fs private */ +}; + +/* pNFS Metadata to Data server state communication */ +struct pnfs_get_state { + u32 dsid; /* request */ + u64 ino; /* request */ + nfs4_stateid stid; /* request/response */ + nfs4_clientid clid; /* response */ + u32 access; /* response */ + u32 stid_gen; /* response */ + u32 verifier[2]; /* response */ +}; + +/* + * pNFS export operations vector. + * + * The filesystem must implement the following methods: + * layout_type + * get_device_info + * layout_get + * + * All other methods are optional and can be set to NULL if not implemented. + */ +struct pnfs_export_operations { + /* Returns the supported pnfs_layouttype4. */ + int (*layout_type) (struct super_block *); + + /* Encode device info onto the xdr stream. */ + int (*get_device_info) (struct super_block *, + struct exp_xdr_stream *, + u32 layout_type, + const struct nfsd4_pnfs_deviceid *); + + /* Retrieve all available devices via an iterator. + * arg->cookie == 0 indicates the beginning of the list, + * otherwise arg->verf is used to verify that the list hasn't changed + * while retrieved. + * + * On output, the filesystem sets the devid based on the current cookie + * and sets res->cookie and res->verf corresponding to the next entry. + * When the last entry in the list is retrieved, res->eof is set to 1. + */ + int (*get_device_iter) (struct super_block *, + u32 layout_type, + struct nfsd4_pnfs_dev_iter_res *); + + int (*set_device_notify) (struct super_block *, + struct pnfs_devnotify_arg *); + + /* Retrieve and encode a layout for inode onto the xdr stream. + * arg->minlength is the minimum number of accessible bytes required + * by the client. + * The maximum number of bytes to encode the layout is given by + * the xdr stream end pointer. + * arg->fsid contains the major part of struct pnfs_deviceid. + * The file system uses this to build the deviceid returned + * in the layout. + * res->seg - layout segment requested and layout info returned. 
+ * res->fh can be modified for use on data servers + * res->return_on_close - true if layout to be returned on file close + * + * return one of the following nfs errors: + * NFS_OK Success + * NFS4ERR_ACCESS Permission error + * NFS4ERR_BADIOMODE Server does not support requested iomode + * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules + * NFS4ERR_INVAL Parameter other than layout is invalid + * NFS4ERR_IO I/O error + * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later + * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file + * NFS4ERR_LOCKED Lock conflict + * NFS4ERR_NOSPC Out-of-space error occurred + * NFS4ERR_RECALLCONFLICT Layout currently unavailable due to + * a conflicting CB_LAYOUTRECALL + * NFS4ERR_SERVERFAULT Server went berserk + * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout + * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) + */ + enum nfsstat4 (*layout_get) (struct inode *, + struct exp_xdr_stream *xdr, + const struct nfsd4_pnfs_layoutget_arg *, + struct nfsd4_pnfs_layoutget_res *); + + /* Commit changes to layout */ + int (*layout_commit) (struct inode *, + const struct nfsd4_pnfs_layoutcommit_arg *, + struct nfsd4_pnfs_layoutcommit_res *); + + /* Returns the layout */ + int (*layout_return) (struct inode *, + const struct nfsd4_pnfs_layoutreturn_arg *); + + /* Can layout segments be merged for this layout type? */ + int (*can_merge_layouts) (u32 layout_type); + + /* pNFS Files layout specific operations */ + + /* Get the write verifier for DS (called on MDS only) */ + void (*get_verifier) (struct super_block *, u32 *p); + /* Call fs on DS only */ + int (*get_state) (struct inode *, struct knfsd_fh *, + struct pnfs_get_state *); +}; + +struct nfsd4_pnfs_cb_layout { + u32 cbl_recall_type; /* request */ + struct nfsd4_layout_seg cbl_seg; /* request */ + u32 cbl_layoutchanged; /* request */ + nfs4_stateid cbl_sid; /* request */ + struct nfs4_fsid cbl_fsid; + void *cbl_cookie; /* fs private */ +}; + +/* layoutrecall request (from exported filesystem) */ +struct nfs4_layoutrecall { + struct kref clr_ref; + struct nfsd4_pnfs_cb_layout cb; /* request */ + struct list_head clr_perclnt; /* on cl_layoutrecalls */ + struct nfs4_client *clr_client; + struct nfs4_file *clr_file; + struct timespec clr_time; /* last activity */ + struct super_block *clr_sb; /* We might not have a file */ + struct nfs4_layoutrecall *parent; /* The initiating recall */ + + void *clr_args; /* nfsd internal */ +}; + +struct nfsd4_pnfs_cb_dev_item { + u32 cbd_notify_type; /* request */ + u32 cbd_layout_type; /* request */ + struct nfsd4_pnfs_deviceid cbd_devid; /* request */ + u32 cbd_immediate; /* request */ +}; + +struct nfsd4_pnfs_cb_dev_list { + u32 cbd_len; /* request */ + struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ +}; + +/* + * callbacks provided by the nfsd + */ +struct pnfsd_cb_operations { + /* Generic callbacks */ + int (*cb_layout_recall) (struct super_block *, struct inode *, + struct nfsd4_pnfs_cb_layout *); + int (*cb_device_notify) (struct super_block *, + struct nfsd4_pnfs_cb_dev_list *); + + /* pNFS Files layout specific callbacks */ + + /* Callback from fs on MDS only */ + int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); + /* Callback from fs on DS only */ + int (*cb_change_state) (struct pnfs_get_state *); +}; + +#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ diff -up linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.34.noarch/include/linux/nfsd/syscall.h --- 
linux-2.6.34.noarch/include/linux/nfsd/syscall.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfsd/syscall.h 2010-09-30 10:17:09.168010000 -0400 @@ -29,6 +29,7 @@ /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ +#define NFSCTL_FD2FH 9 /* get a fh from a fd */ /* SVC */ struct nfsctl_svc { @@ -71,6 +72,11 @@ struct nfsctl_fsparm { int gd_maxlen; }; +/* FD2FH */ +struct nfsctl_fd2fh { + int fd; +}; + /* * This is the argument union. */ @@ -82,6 +88,7 @@ struct nfsctl_arg { struct nfsctl_export u_export; struct nfsctl_fdparm u_getfd; struct nfsctl_fsparm u_getfs; + struct nfsctl_fd2fh u_fd2fh; /* * The following dummy member is needed to preserve binary compatibility * on platforms where alignof(void*)>alignof(int). It's needed because @@ -95,6 +102,7 @@ struct nfsctl_arg { #define ca_export u.u_export #define ca_getfd u.u_getfd #define ca_getfs u.u_getfs +#define ca_fd2fh u.u_fd2fh }; union nfsctl_res { diff -up linux-2.6.34.noarch/include/linux/nfs_fs.h.orig linux-2.6.34.noarch/include/linux/nfs_fs.h --- linux-2.6.34.noarch/include/linux/nfs_fs.h.orig 2010-09-30 10:15:17.949718000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs.h 2010-09-30 10:17:09.071005000 -0400 @@ -72,13 +72,20 @@ struct nfs_access_entry { int mask; }; +struct nfs_lock_context { + atomic_t count; + struct list_head list; + struct nfs_open_context *open_context; + fl_owner_t lockowner; + pid_t pid; +}; + struct nfs4_state; struct nfs_open_context { - atomic_t count; + struct nfs_lock_context lock_context; struct path path; struct rpc_cred *cred; struct nfs4_state *state; - fl_owner_t lockowner; fmode_t mode; unsigned long flags; @@ -97,6 +104,27 @@ struct nfs_delegation; struct posix_acl; +struct pnfs_layout_hdr { + int refcount; + struct list_head layouts; /* other client layouts */ + struct list_head segs; /* layout segments list */ + int roc_iomode;/* return on close iomode, 0=none */ + seqlock_t seqlock; /* Protects the stateid */ + nfs4_stateid stateid; + unsigned long state; +#define NFS_INO_RO_LAYOUT_FAILED 0 /* ro layoutget failed stop trying */ +#define NFS_INO_RW_LAYOUT_FAILED 1 /* rw layoutget failed stop trying */ +#define NFS_INO_LAYOUTCOMMIT 2 /* LAYOUTCOMMIT needed */ + + struct rpc_cred *cred; /* layoutcommit credential */ + /* DH: These vars keep track of the maximum write range + * so the values can be used for layoutcommit. 
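 *
 * (Illustrative sketch, not part of this patch: a write path could widen
 * the tracked range before a later LAYOUTCOMMIT roughly as
 *
 *	if (pos < lo->write_begin_pos)
 *		lo->write_begin_pos = pos;
 *	if (pos + count > lo->write_end_pos)
 *		lo->write_end_pos = pos + count;
 *
 * where "lo" stands for the inode's pnfs_layout_hdr.)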
+ */ + loff_t write_begin_pos; + loff_t write_end_pos; + struct inode *inode; +}; + /* * nfs fs inode data in memory */ @@ -181,6 +209,13 @@ struct nfs_inode { struct nfs_delegation *delegation; fmode_t delegation_state; struct rw_semaphore rwsem; + + /* pNFS layout information */ +#if defined(CONFIG_NFS_V4_1) + wait_queue_head_t lo_waitq; + struct pnfs_layout_hdr *layout; + time_t pnfs_layout_suspend; +#endif /* CONFIG_NFS_V4_1 */ #endif /* CONFIG_NFS_V4*/ #ifdef CONFIG_NFS_FSCACHE struct fscache_cookie *fscache; @@ -353,6 +388,8 @@ extern void nfs_setattr_update_inode(str extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); +extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); +extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); @@ -481,8 +518,12 @@ extern void nfs_unblock_sillyrename(stru extern int nfs_congestion_kb; extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); -extern int nfs_flush_incompatible(struct file *file, struct page *page); -extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); +struct pnfs_layout_segment; +extern int nfs_flush_incompatible(struct file *file, struct page *page, + struct pnfs_layout_segment *lseg); +extern int nfs_updatepage(struct file *, struct page *, + unsigned int offset, unsigned int count, + struct pnfs_layout_segment *lseg, void *fsdata); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); /* @@ -604,6 +645,8 @@ extern void * nfs_root_data(void); #define NFSDBG_CLIENT 0x0200 #define NFSDBG_MOUNT 0x0400 #define NFSDBG_FSCACHE 0x0800 +#define NFSDBG_PNFS 0x1000 +#define NFSDBG_PNFS_LD 0x2000 #define NFSDBG_ALL 0xFFFF #ifdef __KERNEL__ diff -up linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.34.noarch/include/linux/nfs_fs_sb.h --- linux-2.6.34.noarch/include/linux/nfs_fs_sb.h.orig 2010-09-30 10:15:17.959722000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_fs_sb.h 2010-09-30 10:17:09.083008000 -0400 @@ -15,6 +15,7 @@ struct nlm_host; struct nfs4_sequence_args; struct nfs4_sequence_res; struct nfs_server; +struct nfs4_minor_version_ops; /* * The nfs_client identifies our client state to the server. 
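 *
 * (Illustrative sketch, not part of this patch: the nfs_lock_context
 * introduced above is a refcounted lock-owner handle hung off an
 * nfs_open_context; an I/O path would typically bracket a request with
 * it, error handling elided:
 *
 *	l_ctx = nfs_get_lock_context(ctx);
 *	... tag the request, e.g. nfs_readargs.lock_context = l_ctx ...
 *	nfs_put_lock_context(l_ctx);
 *
 * so the lock owner stays pinned for the lifetime of the request.)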
@@ -70,11 +71,7 @@ struct nfs_client { */ char cl_ipaddr[48]; unsigned char cl_id_uniquifier; - int (* cl_call_sync)(struct nfs_server *server, - struct rpc_message *msg, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - int cache_reply); + const struct nfs4_minor_version_ops *cl_mvops; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_1 @@ -85,6 +82,8 @@ struct nfs_client { /* The flags used for obtaining the clientid during EXCHANGE_ID */ u32 cl_exchange_flags; struct nfs4_session *cl_session; /* shared session */ + struct list_head cl_layouts; + struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_FSCACHE @@ -92,6 +91,16 @@ struct nfs_client { #endif }; +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ +#ifdef CONFIG_NFS_V4_1 + return is_ds_only_session(clp->cl_exchange_flags); +#else + return false; +#endif +} + /* * NFS client parameters stored in the superblock. */ @@ -136,7 +145,7 @@ struct nfs_server { #endif #ifdef CONFIG_NFS_V4 - u32 attr_bitmask[2];/* V4 bitmask representing the set + u32 attr_bitmask[3];/* V4 bitmask representing the set of attributes supported on this filesystem */ u32 cache_consistency_bitmask[2]; @@ -148,6 +157,15 @@ struct nfs_server { that are supported on this filesystem */ #endif + +#ifdef CONFIG_NFS_V4_1 + u32 pnfs_blksize; /* layout_blksize attr */ + struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ + void *pnfs_ld_data; /* Per-mount data */ + unsigned int ds_rsize; /* Data server read size */ + unsigned int ds_wsize; /* Data server write size */ +#endif /* CONFIG_NFS_V4_1 */ + void (*destroy)(struct nfs_server *); atomic_t active; /* Keep trace of any activity to this server */ diff -up linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig linux-2.6.34.noarch/include/linux/nfs_iostat.h --- linux-2.6.34.noarch/include/linux/nfs_iostat.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_iostat.h 2010-09-30 10:17:09.110005000 -0400 @@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, NFSIOS_DELAY, + NFSIOS_PNFS_READ, + NFSIOS_PNFS_WRITE, + NFSIOS_PNFS_COMMIT, __NFSIOS_COUNTSMAX, }; diff -up linux-2.6.34.noarch/include/linux/nfs_page.h.orig linux-2.6.34.noarch/include/linux/nfs_page.h --- linux-2.6.34.noarch/include/linux/nfs_page.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_page.h 2010-09-30 10:17:09.122008000 -0400 @@ -39,6 +39,7 @@ struct nfs_page { struct list_head wb_list; /* Defines state of page: */ struct page *wb_page; /* page to read in/write out */ struct nfs_open_context *wb_context; /* File state context info */ + struct nfs_lock_context *wb_lock_context; /* lock context info */ atomic_t wb_complete; /* i/os we're waiting for */ pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ @@ -47,6 +48,7 @@ struct nfs_page { struct kref wb_kref; /* reference count */ unsigned long wb_flags; struct nfs_writeverf wb_verf; /* Commit cookie */ + struct pnfs_layout_segment *wb_lseg; /* pNFS layout info */ }; struct nfs_pageio_descriptor { @@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); int pg_ioflags; int pg_error; + struct pnfs_layout_segment *pg_lseg; +#ifdef CONFIG_NFS_V4_1 + int pg_iswrite; + int pg_boundary; + int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 
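/*
 * (Illustrative note, not part of this patch: pg_test gives a layout
 * driver a veto over request coalescing; the generic pageio code is
 * expected to test, roughly,
 *
 *	if (desc->pg_test && !desc->pg_test(desc, prev, req))
 *		... start a new I/O rather than coalesce ...
 *
 * so that requests are not merged across a stripe or layout-segment
 * boundary.)
 */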
+#endif /* CONFIG_NFS_V4_1 */ }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) @@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque struct inode *inode, struct page *page, unsigned int offset, - unsigned int count); + unsigned int count, + struct pnfs_layout_segment *lseg); extern void nfs_clear_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req); extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, - pgoff_t idx_start, unsigned int npages, int tag); + pgoff_t idx_start, unsigned int npages, int tag, + int *use_pnfs); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), diff -up linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig linux-2.6.34.noarch/include/linux/nfs_xdr.h --- linux-2.6.34.noarch/include/linux/nfs_xdr.h.orig 2010-09-30 10:15:17.965727000 -0400 +++ linux-2.6.34.noarch/include/linux/nfs_xdr.h 2010-09-30 10:17:09.134006000 -0400 @@ -3,6 +3,8 @@ #include #include +#include +#include /* * To change the maximum rsize and wsize supported by the NFS client, adjust @@ -10,7 +12,7 @@ * support a megabyte or more. The default is left at 4096 bytes, which is * reasonable for NFS over UDP. */ -#define NFS_MAX_FILE_IO_SIZE (1048576U) +#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) #define NFS_DEF_FILE_IO_SIZE (4096U) #define NFS_MIN_FILE_IO_SIZE (1024U) @@ -113,6 +115,10 @@ struct nfs_fsinfo { __u32 dtpref; /* pref. readdir transfer size */ __u64 maxfilesize; __u32 lease_time; /* in seconds */ +#if defined(CONFIG_NFS_V4_1) + __u32 layouttype; /* supported pnfs layout driver */ + __u32 blksize; /* preferred pnfs io block size */ +#endif }; struct nfs_fsstat { @@ -185,6 +191,125 @@ struct nfs4_get_lease_time_res { struct nfs4_sequence_res lr_seq_res; }; +#define PNFS_LAYOUT_MAXSIZE 4096 +#define NFS4_PNFS_DEVICEID4_SIZE 16 + +struct pnfs_deviceid { + char data[NFS4_PNFS_DEVICEID4_SIZE]; +}; + +struct nfs4_layoutdriver_data { + __u32 len; + void *buf; +}; + +struct pnfs_layout_range { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfs4_layoutget_args { + __u32 type; + struct pnfs_layout_range range; + __u64 minlength; + __u32 maxcount; + struct inode *inode; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_layoutget_res { + __u32 return_on_close; + struct pnfs_layout_range range; + __u32 type; + nfs4_stateid stateid; + struct nfs4_layoutdriver_data layout; + struct nfs4_sequence_res seq_res; +}; + +struct nfs4_layoutget { + struct nfs4_layoutget_args args; + struct nfs4_layoutget_res res; + struct pnfs_layout_segment **lsegpp; + int status; +}; + +struct nfs4_layoutcommit_args { + nfs4_stateid stateid; + __u64 lastbytewritten; + __u32 time_modify_changed; + struct timespec time_modify; + const u32 *bitmask; + struct nfs_fh *fh; + struct inode *inode; + + /* Values set by layout driver */ + struct pnfs_layout_range range; + __u32 layout_type; + void *layoutdriver_data; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_layoutcommit_res { + __u32 sizechanged; + __u64 newsize; + struct nfs_fattr *fattr; + const struct nfs_server *server; + struct nfs4_sequence_res seq_res; +}; + +struct nfs4_layoutcommit_data { + struct rpc_task task; + struct rpc_cred *cred; + struct nfs_fattr fattr; + struct nfs4_layoutcommit_args args; + struct nfs4_layoutcommit_res res; + int status; +}; + +struct nfs4_layoutreturn_args { + __u32 reclaim; + __u32 layout_type; + __u32 return_type; + struct 
pnfs_layout_range range; + struct inode *inode; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_layoutreturn_res { + struct nfs4_sequence_res seq_res; + u32 lrs_present; + nfs4_stateid stateid; +}; + +struct nfs4_layoutreturn { + struct nfs4_layoutreturn_args args; + struct nfs4_layoutreturn_res res; + struct rpc_cred *cred; + int rpc_status; +}; + +struct nfs4_getdevicelist_args { + const struct nfs_fh *fh; + u32 layoutclass; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_getdevicelist_res { + struct pnfs_devicelist *devlist; + struct nfs4_sequence_res seq_res; +}; + +struct nfs4_getdeviceinfo_args { + struct pnfs_device *pdev; + struct nfs4_sequence_args seq_args; +}; + +struct nfs4_getdeviceinfo_res { + struct pnfs_device *pdev; + struct nfs4_sequence_res seq_res; +}; + /* * Arguments to the open call. */ @@ -196,8 +321,10 @@ struct nfs_openargs { __u64 clientid; __u64 id; union { - struct iattr * attrs; /* UNCHECKED, GUARDED */ - nfs4_verifier verifier; /* EXCLUSIVE */ + struct { + struct iattr * attrs; /* UNCHECKED, GUARDED */ + nfs4_verifier verifier; /* EXCLUSIVE */ + }; nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ fmode_t delegation_type; /* CLAIM_PREVIOUS */ } u; @@ -313,6 +440,10 @@ struct nfs_lockt_res { struct nfs4_sequence_res seq_res; }; +struct nfs_release_lockowner_args { + struct nfs_lowner lock_owner; +}; + struct nfs4_delegreturnargs { const struct nfs_fh *fhandle; const nfs4_stateid *stateid; @@ -332,6 +463,7 @@ struct nfs4_delegreturnres { struct nfs_readargs { struct nfs_fh * fh; struct nfs_open_context *context; + struct nfs_lock_context *lock_context; __u64 offset; __u32 count; unsigned int pgbase; @@ -352,6 +484,7 @@ struct nfs_readres { struct nfs_writeargs { struct nfs_fh * fh; struct nfs_open_context *context; + struct nfs_lock_context *lock_context; __u64 offset; __u32 count; enum nfs3_stable_how stable; @@ -846,7 +979,7 @@ struct nfs4_server_caps_arg { }; struct nfs4_server_caps_res { - u32 attr_bitmask[2]; + u32 attr_bitmask[3]; u32 acl_bitmask; u32 has_links; u32 has_symlinks; @@ -961,6 +1094,27 @@ struct nfs_page; #define NFS_PAGEVEC_SIZE (8U) +#if defined(CONFIG_NFS_V4_1) +/* pnfsflag values */ +#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ + +/* pnfs-specific data needed for read, write, and commit calls */ +struct pnfs_call_data { + struct pnfs_layout_segment *lseg; + const struct rpc_call_ops *call_ops; + u32 orig_count; /* for retry via MDS */ + int pnfs_error; + u8 pnfsflags; + u8 how; /* for FLUSH_STABLE */ +}; + +/* files layout-type specific data for read, write, and commit */ +struct pnfs_fl_call_data { + struct nfs_client *ds_nfs_client; + __u64 orig_offset; +}; +#endif /* CONFIG_NFS_V4_1 */ + struct nfs_read_data { int flags; struct rpc_task task; @@ -976,10 +1130,16 @@ struct nfs_read_data { #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif +#if defined(CONFIG_NFS_V4_1) + struct pnfs_call_data pdata; + struct pnfs_fl_call_data fldata; +#endif /* CONFIG_NFS_V4_1 */ struct page *page_array[NFS_PAGEVEC_SIZE]; }; struct nfs_write_data { + struct kref refcount; /* For pnfs commit splitting */ + struct nfs_write_data *parent; /* For pnfs commit splitting */ int flags; struct rpc_task task; struct inode *inode; @@ -995,6 +1155,10 @@ struct nfs_write_data { #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif +#if defined(CONFIG_NFS_V4_1) + struct pnfs_call_data pdata; + struct pnfs_fl_call_data fldata; +#endif /* CONFIG_NFS_V4_1 */ struct page 
*page_array[NFS_PAGEVEC_SIZE]; }; @@ -1008,6 +1172,7 @@ struct nfs_rpc_ops { const struct dentry_operations *dentry_ops; const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; + const struct file_operations *file_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -1072,6 +1237,7 @@ struct nfs_rpc_ops { extern const struct nfs_rpc_ops nfs_v2_clientops; extern const struct nfs_rpc_ops nfs_v3_clientops; extern const struct nfs_rpc_ops nfs_v4_clientops; +extern const struct nfs_rpc_ops pnfs_v4_clientops; extern struct rpc_version nfs_version2; extern struct rpc_version nfs_version3; extern struct rpc_version nfs_version4; diff -up linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.34.noarch/include/linux/panfs_shim_api.h --- linux-2.6.34.noarch/include/linux/panfs_shim_api.h.orig 2010-09-30 10:17:09.202009000 -0400 +++ linux-2.6.34.noarch/include/linux/panfs_shim_api.h 2010-09-30 10:17:09.204008000 -0400 @@ -0,0 +1,57 @@ +#ifndef _PANFS_SHIM_API_H +#define _PANFS_SHIM_API_H + +/* + * imported panfs functions + */ +struct panfs_export_operations { + int (*convert_rc)(pan_status_t rc); + + int (*sm_sec_t_get_size_otw)( + pan_sm_sec_otw_t *var, + pan_size_t *core_sizep, + pan_size_t *wire_size, + void *buf_end); + + int (*sm_sec_t_unmarshall)( + pan_sm_sec_otw_t *in, + pan_sm_sec_t *out, + void *buf, + pan_size_t size, + pan_size_t *otw_consumed, + pan_size_t *in_core_consumed); + + int (*ucreds_get)(void **ucreds_pp); + + void (*ucreds_put)(void *ucreds); + + int (*sam_read)( + pan_sam_access_flags_t flags, + pan_sam_read_args_t *args_p, + pan_sam_obj_sec_t *obj_sec_p, + pan_sg_entry_t *data_p, + void *ucreds, + pan_sam_read_cb_t closure, + void *user_arg1, + void *user_arg2, + pan_sam_read_res_t *res_p); + + int (*sam_write)( + pan_sam_access_flags_t flags, + pan_sam_write_args_t *args_p, + pan_sam_obj_sec_t *obj_sec_p, + pan_sg_entry_t *data_p, + void *ucreds, + pan_sam_write_cb_t closure, + void *user_arg1, + void *user_arg2, + pan_sam_write_res_t *res_p); +}; + +extern int +panfs_shim_register(struct panfs_export_operations *ops); + +extern int +panfs_shim_unregister(void); + +#endif /* _PANFS_SHIM_API_H */ diff -up linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h --- linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-09-30 10:17:09.214010000 -0400 +++ linux-2.6.34.noarch/include/linux/pnfs_osd_xdr.h 2010-09-30 10:17:09.215014000 -0400 @@ -0,0 +1,439 @@ +/* + * pnfs_osd_xdr.h + * + * pNFS-osd on-the-wire data structures + * + * Copyright (C) 2007-2009 Panasas Inc. + * All rights reserved. + * + * Benny Halevy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * See the file COPYING included with this distribution for more details. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
Neither the name of the Panasas company nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef __PNFS_OSD_XDR_H__ +#define __PNFS_OSD_XDR_H__ + +#include +#include +#include +#include + +#define PNFS_OSD_OSDNAME_MAXSIZE 256 + +/* + * START OF "GENERIC" DECODE ROUTINES. + * These may look a little ugly since they are imported from a "generic" + * set of XDR encode/decode routines which are intended to be shared by + * all of our NFSv4 implementations (OpenBSD, MacOS X...). + * + * If the pain of reading these is too great, it should be a straightforward + * task to translate them into Linux-specific versions which are more + * consistent with the style used in NFSv2/v3... + */ +#define READ32(x) (x) = ntohl(*p++) +#define READ64(x) do { \ + (x) = (u64)ntohl(*p++) << 32; \ + (x) |= ntohl(*p++); \ +} while (0) +#define COPYMEM(x, nbytes) do { \ + memcpy((x), p, nbytes); \ + p += XDR_QUADLEN(nbytes); \ +} while (0) + +/* + * draft-ietf-nfsv4-minorversion-22 + * draft-ietf-nfsv4-pnfs-obj-12 + */ + +/* Layout Structure */ + +enum pnfs_osd_raid_algorithm4 { + PNFS_OSD_RAID_0 = 1, + PNFS_OSD_RAID_4 = 2, + PNFS_OSD_RAID_5 = 3, + PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ +}; + +/* struct pnfs_osd_data_map4 { + * uint32_t odm_num_comps; + * length4 odm_stripe_unit; + * uint32_t odm_group_width; + * uint32_t odm_group_depth; + * uint32_t odm_mirror_cnt; + * pnfs_osd_raid_algorithm4 odm_raid_algorithm; + * }; + */ +struct pnfs_osd_data_map { + u32 odm_num_comps; + u64 odm_stripe_unit; + u32 odm_group_width; + u32 odm_group_depth; + u32 odm_mirror_cnt; + u32 odm_raid_algorithm; +}; + +static inline int +pnfs_osd_data_map_xdr_sz(void) +{ + return 1 + 2 + 1 + 1 + 1 + 1; +} + +static inline size_t +pnfs_osd_data_map_incore_sz(void) +{ + return sizeof(struct pnfs_osd_data_map); +} + +/* struct pnfs_osd_objid4 { + * deviceid4 oid_device_id; + * uint64_t oid_partition_id; + * uint64_t oid_object_id; + * }; + */ +struct pnfs_osd_objid { + struct pnfs_deviceid oid_device_id; + u64 oid_partition_id; + u64 oid_object_id; +}; + +/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ +#define _DEVID_LO(oid_device_id) \ + (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) + +#define _DEVID_HI(oid_device_id) \ + (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) + +static inline int +pnfs_osd_objid_xdr_sz(void) +{ + return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; +} + +static inline size_t +pnfs_osd_objid_incore_sz(void) +{ + return sizeof(struct pnfs_osd_objid); +} + +enum pnfs_osd_version { + PNFS_OSD_MISSING = 0, + PNFS_OSD_VERSION_1 = 1, + PNFS_OSD_VERSION_2 = 2 +}; + +struct pnfs_osd_opaque_cred { + u32 cred_len; + u8 *cred; +}; + +static inline int +pnfs_osd_opaque_cred_xdr_sz(u32 *p) +{ + u32 *start = p; + u32 n; + + READ32(n); + p += XDR_QUADLEN(n); + return p - start; +} + +static inline size_t +pnfs_osd_opaque_cred_incore_sz(u32 *p) +{ + u32 n; + + READ32(n); + return XDR_QUADLEN(n) * 4; +} + +enum pnfs_osd_cap_key_sec { + PNFS_OSD_CAP_KEY_SEC_NONE = 0, + PNFS_OSD_CAP_KEY_SEC_SSV = 1, +}; + +/* struct pnfs_osd_object_cred4 { + * pnfs_osd_objid4 oc_object_id; + * pnfs_osd_version4 oc_osd_version; + * pnfs_osd_cap_key_sec4 oc_cap_key_sec; + * opaque oc_capability_key<>; + * opaque oc_capability<>; + * }; + */ +struct pnfs_osd_object_cred { + struct pnfs_osd_objid oc_object_id; + u32 oc_osd_version; + u32 oc_cap_key_sec; + struct pnfs_osd_opaque_cred oc_cap_key; + struct pnfs_osd_opaque_cred oc_cap; +}; + +static inline int +pnfs_osd_object_cred_xdr_sz(u32 *p) +{ + u32 *start = p; + + p += pnfs_osd_objid_xdr_sz() + 2; + p += pnfs_osd_opaque_cred_xdr_sz(p); + p += pnfs_osd_opaque_cred_xdr_sz(p); + return p - start; +} + +static inline size_t +pnfs_osd_object_cred_incore_sz(u32 *p) +{ + size_t sz = sizeof(struct pnfs_osd_object_cred); + + p += pnfs_osd_objid_xdr_sz() + 2; + sz += pnfs_osd_opaque_cred_incore_sz(p); + p += pnfs_osd_opaque_cred_xdr_sz(p); + sz += pnfs_osd_opaque_cred_incore_sz(p); + return sz; +} + +/* struct pnfs_osd_layout4 { + * pnfs_osd_data_map4 olo_map; + * uint32_t olo_comps_index; + * pnfs_osd_object_cred4 olo_components<>; + * }; + */ +struct pnfs_osd_layout { + struct pnfs_osd_data_map olo_map; + u32 olo_comps_index; + u32 olo_num_comps; + struct pnfs_osd_object_cred *olo_comps; +}; + +static inline int +pnfs_osd_layout_xdr_sz(u32 *p) +{ + u32 *start = p; + u32 n; + + p += pnfs_osd_data_map_xdr_sz() + 1; + READ32(n); + while ((int)(n--) > 0) + p += pnfs_osd_object_cred_xdr_sz(p); + return p - start; +} + +static inline size_t +pnfs_osd_layout_incore_sz(u32 *p) +{ + u32 n; + size_t sz; + + p += pnfs_osd_data_map_xdr_sz() + 1; + READ32(n); + sz = sizeof(struct pnfs_osd_layout); + while ((int)(n--) > 0) { + sz += pnfs_osd_object_cred_incore_sz(p); + p += pnfs_osd_object_cred_xdr_sz(p); + } + return sz; +} + +/* Device Address */ + +enum pnfs_osd_targetid_type { + OBJ_TARGET_ANON = 1, + OBJ_TARGET_SCSI_NAME = 2, + OBJ_TARGET_SCSI_DEVICE_ID = 3, +}; + +/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { + * case OBJ_TARGET_SCSI_NAME: + * string oti_scsi_name<>; + * + * case OBJ_TARGET_SCSI_DEVICE_ID: + * opaque oti_scsi_device_id<>; + * + * default: + * void; + * }; + * + * union pnfs_osd_targetaddr4 switch (bool ota_available) { + * case TRUE: + * netaddr4 ota_netaddr; + * case FALSE: + * void; + * }; + * + * struct pnfs_osd_deviceaddr4 { + * pnfs_osd_targetid4 oda_targetid; + * pnfs_osd_targetaddr4 oda_targetaddr; + * uint64_t oda_lun; + * opaque oda_systemid<>; + * pnfs_osd_object_cred4 oda_root_obj_cred; + * opaque oda_osdname<>; + * }; 
+ */ +struct pnfs_osd_targetid { + u32 oti_type; + struct nfs4_string oti_scsi_device_id; +}; + +enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 }; + +/* struct netaddr4 { + * // see struct rpcb in RFC1833 + * string r_netid<>; // network id + * string r_addr<>; // universal address + * }; + */ +struct pnfs_osd_net_addr { + struct nfs4_string r_netid; + struct nfs4_string r_addr; +}; + +struct pnfs_osd_targetaddr { + u32 ota_available; + struct pnfs_osd_net_addr ota_netaddr; +}; + +enum { + NETWORK_ID_MAX = 16 / 4, + UNIVERSAL_ADDRESS_MAX = 64 / 4, + PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX, +}; + +struct pnfs_osd_deviceaddr { + struct pnfs_osd_targetid oda_targetid; + struct pnfs_osd_targetaddr oda_targetaddr; + u8 oda_lun[8]; + struct nfs4_string oda_systemid; + struct pnfs_osd_object_cred oda_root_obj_cred; + struct nfs4_string oda_osdname; +}; + +enum { + ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4, + PNFS_OSD_DEVICEADDR_MAX = + PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX + + 2 /*oda_lun*/ + + 1 + OSD_SYSTEMID_LEN + + 1 + ODA_OSDNAME_MAX, +}; + +/* LAYOUTCOMMIT: layoutupdate */ + +/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) { + * case TRUE: + * int64_t dsu_delta; + * case FALSE: + * void; + * }; + * + * struct pnfs_osd_layoutupdate4 { + * pnfs_osd_deltaspaceused4 olu_delta_space_used; + * bool olu_ioerr_flag; + * }; + */ +struct pnfs_osd_layoutupdate { + u32 dsu_valid; + s64 dsu_delta; + u32 olu_ioerr_flag; +}; + +/* LAYOUTRETURN: I/O Error Report */ + +enum pnfs_osd_errno { + PNFS_OSD_ERR_EIO = 1, + PNFS_OSD_ERR_NOT_FOUND = 2, + PNFS_OSD_ERR_NO_SPACE = 3, + PNFS_OSD_ERR_BAD_CRED = 4, + PNFS_OSD_ERR_NO_ACCESS = 5, + PNFS_OSD_ERR_UNREACHABLE = 6, + PNFS_OSD_ERR_RESOURCE = 7 +}; + +/* struct pnfs_osd_ioerr4 { + * pnfs_osd_objid4 oer_component; + * length4 oer_comp_offset; + * length4 oer_comp_length; + * bool oer_iswrite; + * pnfs_osd_errno4 oer_errno; + * }; + */ +struct pnfs_osd_ioerr { + struct pnfs_osd_objid oer_component; + u64 oer_comp_offset; + u64 oer_comp_length; + u32 oer_iswrite; + u32 oer_errno; +}; + +static inline unsigned +pnfs_osd_ioerr_xdr_sz(void) +{ + return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1; +} + +/* OSD XDR API */ + +/* Layout helpers */ +extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout( + struct pnfs_osd_layout *layout, u32 *p); + +extern int pnfs_osd_xdr_encode_layout( + struct exp_xdr_stream *xdr, + struct pnfs_osd_layout *layout); + +/* Device Info helpers */ + +/* First pass: calculate total size for space needed */ +extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p); + +/* Note: some strings pointed to inside @deviceaddr might point + * to space inside @p. @p should stay valid while @deviceaddr + * is in use. 
+ * It is assumed that @deviceaddr points to a buffer at least as large as + * the size calculated in the first pass by pnfs_osd_xdr_deviceaddr_incore_sz(). + */ +extern void pnfs_osd_xdr_decode_deviceaddr( + struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); + +/* For Servers */ +extern int pnfs_osd_xdr_encode_deviceaddr( + struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); + +/* layoutupdate (layout_commit) xdr helpers */ +extern int +pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, + struct pnfs_osd_layoutupdate *lou); +extern __be32 * +pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); + +/* osd_ioerror encoding/decoding (layout_return) */ +extern int +pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); +extern __be32 * +pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); + +#endif /* __PNFS_OSD_XDR_H__ */ diff -up linux-2.6.34.noarch/include/linux/posix_acl.h.orig linux-2.6.34.noarch/include/linux/posix_acl.h --- linux-2.6.34.noarch/include/linux/posix_acl.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/posix_acl.h 2010-09-30 10:17:09.227023000 -0400 @@ -8,6 +8,7 @@ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H +#include #include #define ACL_UNDEFINED_ID (-1) diff -up linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h --- linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/sunrpc/msg_prot.h 2010-09-30 10:17:09.233014000 -0400 @@ -14,6 +14,8 @@ /* size of an XDR encoding unit in bytes, i.e. 32bit */ #define XDR_UNIT (4) +#include + /* spec defines authentication flavor as an unsigned 32 bit integer */ typedef u32 rpc_authflavor_t; diff -up linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h --- linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-05-16 17:17:36.000000000 -0400 +++ linux-2.6.34.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-09-30 10:17:09.238025000 -0400 @@ -3,6 +3,7 @@ #ifdef __KERNEL__ +#include #include struct rpc_pipe_msg { @@ -11,6 +12,10 @@ struct rpc_pipe_msg { size_t len; size_t copied; int errno; +#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ +#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ +#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA + u8 flags; }; struct rpc_pipe_ops { diff -up linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h --- linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-09-30 10:17:09.242015000 -0400 +++ linux-2.6.34.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-09-30 10:17:09.244014000 -0400 @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2008 The Regents of the University of Michigan. + * All rights reserved. + * + * David M. Richter + * + * Drawing on work done by Andy Adamson and + * Marius Eriksen . Thanks for the help over the + * years, guys. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * With thanks to CITI's project sponsor and partner, IBM.
+ */
+
+#ifndef _SIMPLE_RPC_PIPEFS_H_
+#define _SIMPLE_RPC_PIPEFS_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#define payload_of(headerp) ((void *)(headerp + 1))
+
+/*
+ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs.
+ * A message may consist of just the header itself, although an optional
+ * data payload following the header allows much more flexibility.
+ *
+ * Messages are created using pipefs_alloc_init_msg() and
+ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an
+ * (optional) data payload.
+ *
+ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data
+ * can be accessed using: struct foo *foop = payload_of(msg)
+ */
+struct pipefs_hdr {
+	u32 msgid;
+	u8  type;
+	u8  flags;
+	u16 totallen; /* length of entire message, including hdr itself */
+	u32 status;
+};
+
+/*
+ * struct pipefs_list -- a list type used to track callers that have made an
+ * upcall and are blocked waiting for a reply.
+ *
+ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply().
+ */
+struct pipefs_list {
+	struct list_head list;
+	spinlock_t list_lock;
+};
+
+
+/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions.
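+ *
+ * A rough end-to-end sketch of a kernel-side caller (everything "foo"
+ * here -- the pipe name, ops table, request struct, message type, and
+ * msgid of 1 -- is hypothetical, the timeout is arbitrary, and error
+ * handling is omitted; it assumes foo_pipe_ops uses the generic
+ * handlers declared below):
+ *
+ *	struct pipefs_list foo_uplist;
+ *	struct dentry *foo_pipe;
+ *	struct pipefs_hdr *msg, *reply;
+ *	struct foo_req req = { ... };
+ *
+ *	pipefs_init_list(&foo_uplist);
+ *	foo_pipe = pipefs_mkpipe("foo", &foo_pipe_ops, 1);
+ *	msg = pipefs_alloc_init_msg(1, FOO_MSG_TYPE, 0, &req, sizeof(req));
+ *	reply = pipefs_queue_upcall_waitreply(foo_pipe, msg, &foo_uplist,
+ *					PIPEFS_AUTOFREE_UPCALL_MSG,
+ *					msecs_to_jiffies(5000));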
+ */
+extern struct dentry *pipefs_mkpipe(const char *name,
+				    const struct rpc_pipe_ops *ops,
+				    int wait_for_open);
+extern void pipefs_closepipe(struct dentry *pipe);
+extern void pipefs_init_list(struct pipefs_list *list);
+extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
+						void *data, u16 datalen);
+extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type,
+						u8 flags, void *data,
+						u16 datalen, u16 padlen);
+extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
+						struct pipefs_hdr *msg,
+						struct pipefs_list *uplist,
+						u8 upflags, u32 timeout);
+extern int pipefs_queue_upcall_noreply(struct dentry *pipe,
+				       struct pipefs_hdr *msg, u8 upflags);
+extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
+				      struct pipefs_list *uplist);
+extern struct pipefs_hdr *pipefs_readmsg(struct file *filp,
+					 const char __user *src, size_t len);
+extern ssize_t pipefs_generic_upcall(struct file *filp,
+				     struct rpc_pipe_msg *rpcmsg,
+				     char __user *dst, size_t buflen);
+extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg);
+
+#endif /* _SIMPLE_RPC_PIPEFS_H_ */
diff -up linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h
--- linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/include/linux/sunrpc/svc_xprt.h	2010-09-30 10:17:09.249016000 -0400
@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con
 	return buf;
 }
+
+/*
+ * Print a network address in a universal format (see RFC 1833 and NFSv4.1)
+ */
+static inline int __svc_print_netaddr(struct sockaddr *addr,
+				      struct xdr_netobj *na)
+{
+	u16 port;
+	ssize_t len;
+
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+		port = ntohs(sin->sin_port);
+
+		len = snprintf(na->data, na->len, "%pI4.%u.%u",
+			       &sin->sin_addr,
+			       port >> 8, port & 0xff);
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+		port = ntohs(sin6->sin6_port);
+
+		len = snprintf(na->data, na->len, "%pI6.%u.%u",
+			       &sin6->sin6_addr,
+			       port >> 8, port & 0xff);
+		break;
+	}
+	default:
+		snprintf(na->data, na->len, "unknown address type: %d",
+			 addr->sa_family);
+		len = -EINVAL;
+		break;
+	}
+	return len;
+}
 #endif /* SUNRPC_SVC_XPRT_H */
diff -up linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.34.noarch/include/linux/sunrpc/xdr.h
--- linux-2.6.34.noarch/include/linux/sunrpc/xdr.h.orig	2010-09-30 10:15:18.029721000 -0400
+++ linux-2.6.34.noarch/include/linux/sunrpc/xdr.h	2010-09-30 10:17:09.254021000 -0400
@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp)
 	return p + 2;
 }
 
+static inline __be32 *
+xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len)
+{
+	memcpy(ptr, p, len);
+	return p + XDR_QUADLEN(len);
+}
+
 /*
  * Adjust kvec to reflect end of xdr'ed data (RPC client XDR)
  */
@@ -197,6 +204,7 @@ struct xdr_stream {
 
 extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
+extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q);
 extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 		unsigned int base, unsigned int len);
 extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
diff -up linux-2.6.34.noarch/net/sunrpc/Makefile.orig linux-2.6.34.noarch/net/sunrpc/Makefile
--- linux-2.6.34.noarch/net/sunrpc/Makefile.orig	2010-05-16 17:17:36.000000000 -0400
+++ linux-2.6.34.noarch/net/sunrpc/Makefile	2010-09-30 10:17:09.263013000 -0400
@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt
 	    svc.o svcsock.o svcauth.o svcauth_unix.o \
 	    addr.o rpcb_clnt.o timer.o xdr.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
-	    svc_xprt.o
+	    svc_xprt.o simple_rpc_pipefs.o
 sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff -up linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c
--- linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c.orig	2010-09-30 10:17:09.267010000 -0400
+++ linux-2.6.34.noarch/net/sunrpc/simple_rpc_pipefs.c	2010-09-30 10:17:09.268015000 -0400
@@ -0,0 +1,424 @@
+/*
+ * net/sunrpc/simple_rpc_pipefs.c
+ *
+ * Copyright (c) 2008 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ * David M. Richter
+ *
+ * Drawing on work done by Andy Adamson and
+ * Marius Eriksen. Thanks for the help over the
+ * years, guys.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * With thanks to CITI's project sponsor and partner, IBM.
+ */
+
+#include
+#include
+#include
+#include
+
+
+/*
+ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs
+ * filesystem.
+ *
+ * If @wait_for_open is non-zero and an upcall is later queued but the userland
+ * end of the pipe has not yet been opened, the upcall will remain queued until
+ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE.
+ */
+struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
+			     int wait_for_open)
+{
+	struct dentry *dir, *pipe;
+	struct vfsmount *mnt;
+
+	mnt = rpc_get_mount();
+	if (IS_ERR(mnt)) {
+		pipe = ERR_CAST(mnt);
+		goto out;
+	}
+	dir = mnt->mnt_root;
+	if (!dir) {
+		pipe = ERR_PTR(-ENOENT);
+		goto out;
+	}
+	pipe = rpc_mkpipe(dir, name, NULL, ops,
+			  wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
+out:
+	return pipe;
+}
+EXPORT_SYMBOL(pipefs_mkpipe);
+
+/*
+ * Shutdown a pipe made by pipefs_mkpipe().
+ * XXX: do we need to retain an extra reference on the mount?
+ */
+void pipefs_closepipe(struct dentry *pipe)
+{
+	rpc_unlink(pipe);
+	rpc_put_mount();
+}
+EXPORT_SYMBOL(pipefs_closepipe);
+
+/*
+ * Initialize a struct pipefs_list -- a way to keep track of callers that
+ * have made an upcall and are blocked awaiting a reply.
+ *
+ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how
+ * to use them.
+ */
+void pipefs_init_list(struct pipefs_list *list)
+{
+	INIT_LIST_HEAD(&list->list);
+	spin_lock_init(&list->list_lock);
+}
+EXPORT_SYMBOL(pipefs_init_list);
+
+/*
+ * Alloc/init a generic pipefs message header and copy into its message body
+ * an arbitrary data payload.
+ *
+ * Instances of struct pipefs_hdr are meant to serve as generic,
+ * general-purpose message headers for easy rpc_pipefs I/O. When an upcall
+ * is made, the struct pipefs_hdr is assigned to a struct rpc_pipe_msg and
+ * delivered therein. And yes, the naming can seem a little confusing at
+ * first:
+ *
+ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
+ * struct pipefs_hdr (possibly with an attached message body). A
+ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
+ * message is delivered and processed.
+ */
+struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
+						void *data, u16 datalen,
+						u16 padlen)
+{
+	u16 totallen;
+	struct pipefs_hdr *msg = NULL;
+
+	totallen = sizeof(*msg) + datalen + padlen;
+	if (totallen > PAGE_SIZE) {
+		msg = ERR_PTR(-E2BIG);
+		goto out;
+	}
+
+	msg = kzalloc(totallen, GFP_KERNEL);
+	if (!msg) {
+		msg = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	msg->msgid = msgid;
+	msg->type = type;
+	msg->flags = flags;
+	msg->totallen = totallen;
+	memcpy(payload_of(msg), data, datalen);
+out:
+	return msg;
+}
+EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
+
+/*
+ * See the description of pipefs_alloc_init_msg_padded().
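+ *
+ * For example, a sketch that sends a hypothetical struct foo_req as the
+ * payload (the struct, the msgid of 1, and FOO_MSG_TYPE are all made up
+ * for illustration):
+ *
+ *	struct foo_req req;
+ *	struct pipefs_hdr *msg;
+ *
+ *	msg = pipefs_alloc_init_msg(1, FOO_MSG_TYPE, 0, &req, sizeof(req));
+ *	if (IS_ERR(msg))
+ *		return PTR_ERR(msg);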
+ */
+struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags,
+					 void *data, u16 datalen)
+{
+	return pipefs_alloc_init_msg_padded(msgid, type, flags, data,
+					    datalen, 0);
+}
+EXPORT_SYMBOL(pipefs_alloc_init_msg);
+
+
+static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg,
+			       struct pipefs_hdr *msg, u8 upflags)
+{
+	memset(rpcmsg, 0, sizeof(*rpcmsg));
+	rpcmsg->data = msg;
+	rpcmsg->len = msg->totallen;
+	rpcmsg->flags = upflags;
+}
+
+static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg,
+						     u8 upflags)
+{
+	struct rpc_pipe_msg *rpcmsg;
+
+	rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL);
+	if (!rpcmsg)
+		return ERR_PTR(-ENOMEM);
+
+	pipefs_init_rpcmsg(rpcmsg, msg, upflags);
+	return rpcmsg;
+}
+
+
+/* represents an upcall that'll block and wait for a reply */
+struct pipefs_upcall {
+	u32 msgid;
+	struct rpc_pipe_msg rpcmsg;
+	struct list_head list;
+	wait_queue_head_t waitq;
+	struct pipefs_hdr *reply;
+};
+
+
+static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall,
+					 struct pipefs_hdr *msg, u8 upflags)
+{
+	upcall->reply = NULL;
+	upcall->msgid = msg->msgid;
+	INIT_LIST_HEAD(&upcall->list);
+	init_waitqueue_head(&upcall->waitq);
+	pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags);
+}
+
+static int __pipefs_queue_upcall_waitreply(struct dentry *pipe,
+					   struct pipefs_upcall *upcall,
+					   struct pipefs_list *uplist,
+					   u32 timeout)
+{
+	int err = 0;
+	DECLARE_WAITQUEUE(wq, current);
+
+	add_wait_queue(&upcall->waitq, &wq);
+	spin_lock(&uplist->list_lock);
+	list_add(&upcall->list, &uplist->list);
+	spin_unlock(&uplist->list_lock);
+
+	err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg);
+	if (err < 0)
+		goto out;
+
+	if (timeout) {
+		/* retval of 0 means timer expired */
+		err = schedule_timeout_uninterruptible(timeout);
+		if (err == 0 && upcall->reply == NULL)
+			err = -ETIMEDOUT;
+	} else {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+
+out:
+	spin_lock(&uplist->list_lock);
+	list_del_init(&upcall->list);
+	spin_unlock(&uplist->list_lock);
+	remove_wait_queue(&upcall->waitq, &wq);
+	return err;
+}
+
+/*
+ * Queue a pipefs msg for an upcall to userspace, place the calling thread
+ * on @uplist, and block the thread to wait for a reply. If @timeout is
+ * nonzero, the thread will be blocked for at most @timeout jiffies.
+ *
+ * (To convert time units into jiffies, consider the functions
+ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and
+ * timespec_to_jiffies().)
+ *
+ * Once a reply is received by your downcall handler, call
+ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall,
+ * assign the reply, and wake the waiting thread.
+ *
+ * The returned pointer may be an ERR_PTR() value and should be checked
+ * with IS_ERR() before the reply message is accessed.
+ *
+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
+ * flag is set in @upflags. See also rpc_pipe_fs.h.
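+ *
+ * A sketch of a typical call, where @pipe, @msg and @uplist are assumed
+ * to have been set up with pipefs_mkpipe(), pipefs_alloc_init_msg() and
+ * pipefs_init_list(), and the 5-second timeout is arbitrary:
+ *
+ *	struct pipefs_hdr *reply;
+ *
+ *	reply = pipefs_queue_upcall_waitreply(pipe, msg, &uplist,
+ *					PIPEFS_AUTOFREE_UPCALL_MSG,
+ *					msecs_to_jiffies(5000));
+ *	if (IS_ERR(reply))
+ *		return PTR_ERR(reply);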
+ */
+struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe,
+						 struct pipefs_hdr *msg,
+						 struct pipefs_list *uplist,
+						 u8 upflags, u32 timeout)
+{
+	int err = 0;
+	struct pipefs_upcall upcall;
+
+	pipefs_init_upcall_waitreply(&upcall, msg, upflags);
+	err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout);
+	if (err < 0) {
+		kfree(upcall.reply);
+		upcall.reply = ERR_PTR(err);
+	}
+
+	return upcall.reply;
+}
+EXPORT_SYMBOL(pipefs_queue_upcall_waitreply);
+
+/*
+ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e.,
+ * no reply is expected).
+ *
+ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg()
+ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG
+ * flag is set in @upflags. See also rpc_pipe_fs.h.
+ */
+int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg,
+				u8 upflags)
+{
+	int err = 0;
+	struct rpc_pipe_msg *rpcmsg;
+
+	upflags |= PIPEFS_AUTOFREE_RPCMSG;
+	rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags);
+	if (IS_ERR(rpcmsg)) {
+		err = PTR_ERR(rpcmsg);
+		goto out;
+	}
+	err = rpc_queue_upcall(pipe->d_inode, rpcmsg);
+out:
+	return err;
+}
+EXPORT_SYMBOL(pipefs_queue_upcall_noreply);
+
+
+static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid,
+						      struct pipefs_list *uplist)
+{
+	struct pipefs_upcall *upcall;
+
+	spin_lock(&uplist->list_lock);
+	list_for_each_entry(upcall, &uplist->list, list)
+		if (upcall->msgid == msgid)
+			goto out;
+	upcall = NULL;
+out:
+	spin_unlock(&uplist->list_lock);
+	return upcall;
+}
+
+/*
+ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall
+ * message and have determined that it is a reply to a waiting upcall,
+ * you can use this function to find the appropriate upcall, assign the result,
+ * and wake the upcall thread.
+ *
+ * The reply message must have the same msgid as the original upcall message.
+ *
+ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg().
+ */
+int pipefs_assign_upcall_reply(struct pipefs_hdr *reply,
+			       struct pipefs_list *uplist)
+{
+	int err = 0;
+	struct pipefs_upcall *upcall;
+
+	upcall = pipefs_find_upcall_msgid(reply->msgid, uplist);
+	if (!upcall) {
+		printk(KERN_ERR "%s: ERROR: have reply but no matching upcall "
+				"for msgid %d\n", __func__, reply->msgid);
+		err = -ENOENT;
+		goto out;
+	}
+	upcall->reply = reply;
+	wake_up(&upcall->waitq);
+out:
+	return err;
+}
+EXPORT_SYMBOL(pipefs_assign_upcall_reply);
+
+/*
+ * Generic method to read in and return a newly allocated message that begins
+ * with a struct pipefs_hdr.
+ */
+struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src,
+				  size_t len)
+{
+	int err = 0, hdrsize;
+	struct pipefs_hdr *msg = NULL;
+
+	hdrsize = sizeof(*msg);
+	if (len < hdrsize) {
+		printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n",
+		       __func__, (int) len, hdrsize);
+		err = -EINVAL;
+		goto out;
+	}
+
+	msg = kzalloc(len, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto out;
+	}
+	if (copy_from_user(msg, src, len))
+		err = -EFAULT;
+out:
+	if (err) {
+		kfree(msg);
+		msg = ERR_PTR(err);
+	}
+	return msg;
+}
+EXPORT_SYMBOL(pipefs_readmsg);
+
+/*
+ * Generic rpc_pipe_ops->upcall() handler implementation.
+ *
+ * Don't call this directly: to make an upcall, use
+ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply().
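+ *
+ * These generic handlers are typically plugged into a pipe's
+ * struct rpc_pipe_ops (a sketch; foo_downcall is a driver-supplied
+ * ->downcall() handler, not something defined in this file):
+ *
+ *	static const struct rpc_pipe_ops foo_pipe_ops = {
+ *		.upcall      = pipefs_generic_upcall,
+ *		.downcall    = foo_downcall,
+ *		.destroy_msg = pipefs_generic_destroy_msg,
+ *	};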
+ */
+ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
+			      char __user *dst, size_t buflen)
+{
+	char *data;
+	ssize_t len, left;
+
+	data = (char *)rpcmsg->data + rpcmsg->copied;
+	len = rpcmsg->len - rpcmsg->copied;
+	if (len > buflen)
+		len = buflen;
+
+	/* copy_to_user() returns the number of bytes it could NOT copy;
+	 * it never returns a negative value. */
+	left = copy_to_user(dst, data, len);
+	if (left == len && len != 0) {
+		rpcmsg->errno = -EFAULT;
+		return -EFAULT;
+	}
+
+	len -= left;
+	rpcmsg->copied += len;
+	rpcmsg->errno = 0;
+	return len;
+}
+EXPORT_SYMBOL(pipefs_generic_upcall);
+
+/*
+ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
+ *
+ * Items are only freed if @rpcmsg->flags has been set appropriately.
+ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
+ */
+void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
+{
+	if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
+		kfree(rpcmsg->data);
+	if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
+		kfree(rpcmsg);
+}
+EXPORT_SYMBOL(pipefs_generic_destroy_msg);
diff -up linux-2.6.34.noarch/net/sunrpc/xdr.c.orig linux-2.6.34.noarch/net/sunrpc/xdr.c
--- linux-2.6.34.noarch/net/sunrpc/xdr.c.orig	2010-09-30 10:15:18.189725000 -0400
+++ linux-2.6.34.noarch/net/sunrpc/xdr.c	2010-09-30 10:17:09.274010000 -0400
@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf,
 {
 	struct kvec *tail;
 	size_t copy;
-	char *p;
 	unsigned int pglen = buf->page_len;
+	unsigned int tailbuf_len;
 
 	tail = buf->tail;
 	BUG_ON (len > pglen);
 
+	tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
+
 	/* Shift the tail first */
-	if (tail->iov_len != 0) {
-		p = (char *)tail->iov_base + len;
+	if (tailbuf_len != 0) {
+		unsigned int free_space = tailbuf_len - tail->iov_len;
+
+		if (len < free_space)
+			free_space = len;
+		tail->iov_len += free_space;
+
+		copy = len;
 		if (tail->iov_len > len) {
-			copy = tail->iov_len - len;
-			memmove(p, tail->iov_base, copy);
+			char *p = (char *)tail->iov_base + len;
+			memmove(p, tail->iov_base, tail->iov_len - len);
 		} else
-			buf->buflen -= len;
-		/* Copy from the inlined pages into the tail */
-		copy = len;
-		if (copy > tail->iov_len)
 			copy = tail->iov_len;
+		/* Copy from the inlined pages into the tail */
 		_copy_from_pages((char *)tail->iov_base,
 				 buf->pages,
 				 buf->page_base + pglen - len,
 				 copy);
@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st
 EXPORT_SYMBOL_GPL(xdr_reserve_space);
 
 /**
+ * xdr_rewind_stream - rewind a stream back to some checkpoint
+ * @xdr: pointer to xdr_stream
+ * @q: a checkpoint previously sampled from @xdr->p
+ *
+ * Restores an xdr stream to an earlier point. @q must be a logical
+ * xdr position in the past, sampled by @q = @xdr->p.
+ */
+__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
+{
+	size_t nbytes = (xdr->p - q) << 2;
+
+	BUG_ON(xdr->p < q);
+	BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
+	xdr->p = q;
+	xdr->iov->iov_len -= nbytes;
+	xdr->buf->len -= nbytes;
+	return q;
+}
+EXPORT_SYMBOL_GPL(xdr_rewind_stream);
+
+/**
 * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
 * @xdr: pointer to xdr_stream
 * @pages: list of pages