From 04b78851f233408e5b62285695520f58991ca8ea Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Jan 28 2011 16:47:59 +0000 Subject: - Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 Signed-off-by: Steve Dickson --- diff --git a/kernel.spec b/kernel.spec index b764187..3d03c74 100644 --- a/kernel.spec +++ b/kernel.spec @@ -23,7 +23,7 @@ Summary: The Linux kernel # # (Uncomment the '#' and both spaces below to set the buildid.) # -%define buildid .pnfs35.2010.08.19 +%define buildid .pnfs35.2010.08.24 ################################################################### # The buildid can also be specified on the rpmbuild command line @@ -831,7 +831,7 @@ Patch13703: btrfs-fix-typo-in-fallocate-to-make-it-honor-actual-size.patch # rhbz#643758 Patch13704: hostap_cs-fix-sleeping-function-called-from-invalid-context.patch -Patch13000: pnfs-all-2.6.35-2010-08-19.patch +Patch13000: pnfs-all-2.6.35-2010-08-24-f14.patch Patch13001: linux-2.6-pnfs-compile.patch Patch13002: linux-2.6.35-inline.patch @@ -1572,7 +1572,7 @@ ApplyPatch block-check-for-proper-length-of-iov-entries-earlier-in-blk_rq_map_us # rhbz#643758 ApplyPatch hostap_cs-fix-sleeping-function-called-from-invalid-context.patch -ApplyPatch pnfs-all-2.6.35-2010-08-19.patch +ApplyPatch pnfs-all-2.6.35-2010-08-24-f14.patch ApplyPatch linux-2.6-pnfs-compile.patch ApplyPatch linux-2.6.35-inline.patch @@ -2628,6 +2628,9 @@ fi * Wed Sep 01 2010 Dave Jones 2.6.35.4-15 - Improved version of the VIA Velocity DMA fix. +* Tue Aug 31 2010 Steve Dickson +- Updated to the latest pNFS tag: pnfs-all-2.6.35-2010-08-24 + * Tue Aug 31 2010 Kyle McMartin 2.6.35.4-14 - efifb-add-more-models.patch: Add patch from Luke Macken to support more Mac models (rhbz#528232) diff --git a/pnfs-all-2.6.35-2010-08-24-f14.patch b/pnfs-all-2.6.35-2010-08-24-f14.patch new file mode 100644 index 0000000..af77993 --- /dev/null +++ b/pnfs-all-2.6.35-2010-08-24-f14.patch @@ -0,0 +1,31778 @@ +diff -up linux-2.6.35.noarch/arch/um/os-Linux/mem.c.orig linux-2.6.35.noarch/arch/um/os-Linux/mem.c +--- linux-2.6.35.noarch/arch/um/os-Linux/mem.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/arch/um/os-Linux/mem.c 2010-08-31 21:11:40.879098120 -0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include "init.h" + #include "kern_constants.h" + #include "os.h" +diff -up linux-2.6.35.noarch/block/genhd.c.orig linux-2.6.35.noarch/block/genhd.c +--- linux-2.6.35.noarch/block/genhd.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/block/genhd.c 2010-08-31 21:11:40.880118142 -0400 +@@ -1009,6 +1009,7 @@ static void disk_release(struct device * + struct class block_class = { + .name = "block", + }; ++EXPORT_SYMBOL(block_class); + + static char *block_devnode(struct device *dev, mode_t *mode) + { +diff -up linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt.orig linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt +--- linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt.orig 2010-08-31 21:11:40.879098120 -0400 ++++ linux-2.6.35.noarch/Documentation/filesystems/spnfs.txt 2010-08-31 21:11:40.879098120 -0400 +@@ -0,0 +1,211 @@ ++(c) 2007 Network Appliance Inc. ++ ++spNFS ++----- ++ ++An spNFS system consists of a Meta Data Server (MDS), a number of Client machines (C) and a number of Data Servers (DS). ++ ++A file system is mounted by the clients from the MDS, and all file data ++is striped across the DSs. ++ ++Identify the machines that will be filling each of these roles. 
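++
++For example, a minimal setup (illustrative addresses; the DS addresses
++match the sample spnfsd.conf used below) might assign the roles as:
++
++    MDS: 172.16.28.100
++    DS1: 172.16.28.134
++    DS2: 172.16.28.141
++    C:   every other machine that mounts the MDS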
++ ++The spnfs kernel will be installed on all machines: clients, the MDS and DSs. ++ ++ ++Building and installing the spNFS kernel ++---------------------------------------- ++ ++Get the spNFS kernel from: ++ ++ git://linux-nfs.org/~bhalevy/linux-pnfs.git ++ ++Use the pnfs-all-latest branch and add these options to your .config file ++ ++ CONFIG_NETWORK_FILESYSTEMS=y ++ CONFIG_NFS_FS=m ++ CONFIG_NFS_V4=y ++ CONFIG_NFS_V4_1=y ++ CONFIG_PNFS=y ++ CONFIG_NFSD=m ++ CONFIG_PNFSD=y ++ # CONFIG_PNFSD_LOCAL_EXPORT is not set ++ CONFIG_SPNFS=y ++ ++By default, spNFS uses whole-file layouts. Layout segments can be enabled ++by adding: ++ ++ CONFIG_SPNFS_LAYOUTSEGMENTS=y ++ ++to your .config file. ++ ++Building and installation of kernel+modules is as usual. ++This kernel should be installed and booted on the client, MDS and DSs. ++ ++Note that CONFIG_PNFSD_LOCAL_EXPORT must be disabled for spnfs as it ++takes over the pnfs export interface. ++ ++Building nfs-utils ++------------------ ++ ++Get the nfs-utils package containing spnfsd from: ++ ++ git://linux-nfs.org/~bhalevy/pnfs-nfs-utils.git ++ ++Follow the standard instructions for building nfs-utils. ++ ++After building, the spnfsd daemon will be located in utils/spnfsd. The spnfsd ++daemon will only be needed on the MDS. ++ ++ ++Installation ++------------ ++ ++The nfs-utils package contains a default spnfsd.conf file in ++utils/spnfsd/spnfsd.conf. Copy this file to /etc/spnfsd.conf. ++ ++By default, the DS-Mount-Directory is set to /spnfs (see spnfsd.conf). Under ++this directory, mount points must be created for each DS to ++be used for pNFS data stripes. These mount points are named by the ip address ++of the corresponding DS. In the sample spnfsd.conf, there are two ++DSs defined (172.16.28.134 and 172.16.28.141). ++ ++Following the sample spnfsd.conf, ++ ++ mkdir /spnfs ++ ++on the MDS (corresponding to DS-Mount-Directory). Then ++ ++ mkdir /spnfs/172.16.28.134 ++ mkdir /spnfs/172.16.28.141 ++ ++to create the mount points for the DSs. ++ ++On the DSs, chose a directory where data stripes will be created by the MDS. ++For the sample file, this directory is /pnfs, so on each DS execute: ++ ++ mkdir /pnfs ++ ++This directory is specified in the spnfsd.conf file by the DS*_ROOT option ++(where * is replaced by the DS number). DS_ROOT is specified relative to ++the directory being exported by the DSs. In our example, our DSs are exporting ++the root directory (/) and therefore our DS_ROOT is /pnfs. On the DSs, we have ++the following entry in /etc/exports: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check) ++ ++N.B. If we had created a /exports directory and a /pnfs directory under ++/exports, and if we were exporting /exports, then DS_ROOT would still be /pnfs ++(not /exports/pnfs). ++ ++It may be useful to add entries to /etc/fstab on the MDS to automatically ++mount the DS_ROOT file systems. For this example, our MDS fstab would ++contain: ++ ++ 172.17.84.128:/pnfs /spnfs/172.17.84.128 nfs defaults 1 2 ++ 172.17.84.122:/pnfs /spnfs/172.17.84.122 nfs defaults 1 2 ++ ++The DS mounts must be performed manually or via fstab at this time (automatic ++mounting, directory creation, etc. are on the todo list). To perform I/O ++through the MDS, the DS mounts MUST use NFSv3 at this time (this restriction ++will eventually be removed). 
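++
++For example, the fstab entries above can request NFSv3 explicitly via
++the standard vers=3 mount option (illustrative; substitute your own DS
++addresses):
++
++    172.17.84.128:/pnfs  /spnfs/172.17.84.128  nfs  vers=3  1 2
++    172.17.84.122:/pnfs  /spnfs/172.17.84.122  nfs  vers=3  1 2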
++ ++ ++On the MDS, choose a file system to use with spNFS and export it, e.g.: ++ ++ / *(rw,fsid=0,insecure,no_root_squash,sync,no_subtree_check,pnfs) ++ ++Make sure nfsd and all supporting processes are running on the MDS and DSs. ++ ++ ++Running ++------- ++ ++If rpc_pipefs is not already mounted (if you're running idmapd it probably is), ++you may want to add the following line to /etc/fstab: ++ ++ rpc_pipefs /var/lib/nfs/rpc_pipefs rpc_pipefs defaults 0 0 ++ ++to automatically mount rpc_pipefs. ++ ++With spnfsd.conf configured for your environment and the mounts mounted as ++described above, spnfsd can now be started. ++ ++On the MDS, execute spnfsd: ++ ++ spnfsd ++ ++The executable is located in the directory where it was built, and ++may also have been installed elsewhere depending on how you built nfs-utils. ++It will run in the foreground by default, and in fact will do so despite ++any options suggesting the contrary (it's still a debugging build). ++ ++On the client, make sure the nfslayoutdriver module is loaded: ++ ++ modprobe nfslayoutdriver ++ ++Then mount the file system from the MDS: ++ ++ mount -t nfs4 -o minorversion=1 mds:/ /mnt ++ ++I/O through the MDS is now supported. To use it, do not load the ++nfslayoutdriver on the client, and mount the MDS using NFSv4 or 4.1 ++(NFSv2 and v3 are not yet supported). ++ ++You may now use spNFS by performing file system activities in /mnt. ++If you create files in /mnt, you should see stripe files corresponding to ++new files being created on the DSs. The current implementation names the ++stripe files based on the inode number of the file on the MDS. For example, ++if you create a file foo in /mnt and do an 'ls -li /mnt/foo': ++ ++ # ls -li foo ++ 1233 -rw-r--r-- 1 root root 0 Nov 29 15:54 foo ++ ++You should see stripe files on each under /pnfs (per the sample) named ++1233. The file /pnfs/1233 on DS1 will contain the first bytes ++of data written to foo, DS2 will contain the next bytes, etc. ++Removing /mnt/foo will remove the corresponding stripe files on the DSs. ++Other file system operations should behave (mostly :-) as expected. ++ ++ ++Layout Segments ++--------------- ++ ++If the kernel is compiled to support layout segments, there will ++be two files created under /proc/fs/spnfs for controlling layout ++segment functionality. ++ ++To enable layout segments, write a '1' to /proc/fs/spnfs/layoutseg, e.g.: ++ ++ echo 1 > /proc/fs/spnfs/layoutseg ++ ++Layout segments can be disabled (returning to whole-file layouts) by ++writing a '0' to /proc/fs/spnfs/layoutseg: ++ ++ echo 0 > /proc/fs/spnfs/layoutseg ++ ++When layout segments are enabled, the size of the layouts returned can ++be specified by writing a decimal number (ascii representation) to ++/proc/fs/spnfs/layoutsegsize: ++ ++ echo 1024 > /proc/fs/spnfs/layoutsegsize ++ ++The value'0' has a special meaning--it causes the server to return a ++layout that is exactly the size requested by the client: ++ ++ echo 0 > /proc/fs/spnfs/layoutsegsize ++ ++ ++Troubleshooting ++--------------- ++ ++If you see data being written to the files on the MDS rather than ++the stripe files, make sure the nfslayoutdriver is loaded on the client ++(see above). ++ ++If you get a "permission denied" error, make sure mountd is running on the mds ++(it occasionally fails to start). 
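++
++A quick end-to-end check of striping (the inode number is from the
++example above; yours will differ):
++
++    client# echo hello > /mnt/foo
++    client# ls -i /mnt/foo
++    1233 /mnt/foo
++    ds# ls /pnfs
++    1233
++
++If the stripe file appears on each DS, layouts are being granted and
++the client is writing through the layout driver.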
++ ++Bugs, enhancements, compliments, complaints to: dmuntz@netapp.com ++ ++ +diff -up linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig linux-2.6.35.noarch/drivers/md/dm-ioctl.c +--- linux-2.6.35.noarch/drivers/md/dm-ioctl.c.orig 2010-08-31 19:12:23.802189085 -0400 ++++ linux-2.6.35.noarch/drivers/md/dm-ioctl.c 2010-08-31 21:11:40.881097886 -0400 +@@ -662,6 +662,12 @@ static int dev_create(struct dm_ioctl *p + return r; + } + ++int dm_dev_create(struct dm_ioctl *param) ++{ ++ return dev_create(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_create); ++ + /* + * Always use UUID for lookups if it's present, otherwise use name or dev. + */ +@@ -757,6 +763,12 @@ static int dev_remove(struct dm_ioctl *p + return 0; + } + ++int dm_dev_remove(struct dm_ioctl *param) ++{ ++ return dev_remove(param, sizeof(*param)); ++} ++EXPORT_SYMBOL(dm_dev_remove); ++ + /* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. +@@ -929,6 +941,12 @@ static int do_resume(struct dm_ioctl *pa + return r; + } + ++int dm_do_resume(struct dm_ioctl *param) ++{ ++ return do_resume(param); ++} ++EXPORT_SYMBOL(dm_do_resume); ++ + /* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. +@@ -1206,6 +1224,12 @@ out: + return r; + } + ++int dm_table_load(struct dm_ioctl *param, size_t param_size) ++{ ++ return table_load(param, param_size); ++} ++EXPORT_SYMBOL(dm_table_load); ++ + static int table_clear(struct dm_ioctl *param, size_t param_size) + { + int r; +diff -up linux-2.6.35.noarch/drivers/scsi/hosts.c.orig linux-2.6.35.noarch/drivers/scsi/hosts.c +--- linux-2.6.35.noarch/drivers/scsi/hosts.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/drivers/scsi/hosts.c 2010-08-31 21:11:40.882097900 -0400 +@@ -49,7 +49,7 @@ static void scsi_host_cls_release(struct + put_device(&class_to_shost(dev)->shost_gendev); + } + +-static struct class shost_class = { ++struct class shost_class = { + .name = "scsi_host", + .dev_release = scsi_host_cls_release, + }; +diff -up linux-2.6.35.noarch/fs/exofs/exofs.h.orig linux-2.6.35.noarch/fs/exofs/exofs.h +--- linux-2.6.35.noarch/fs/exofs/exofs.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exofs/exofs.h 2010-08-31 21:11:40.885114741 -0400 +@@ -36,13 +36,9 @@ + #include + #include + #include ++#include + #include "common.h" + +-/* FIXME: Remove once pnfs hits mainline +- * #include +- */ +-#include "pnfs.h" +- + #define EXOFS_ERR(fmt, a...) 
printk(KERN_ERR "exofs: " fmt, ##a) + + #ifdef CONFIG_EXOFS_DEBUG +@@ -103,6 +99,7 @@ struct exofs_sb_info { + struct exofs_i_info { + struct inode vfs_inode; /* normal in-memory inode */ + wait_queue_head_t i_wq; /* wait queue for inode */ ++ spinlock_t i_layout_lock; /* lock for layout/return/recall */ + unsigned long i_flags; /* various atomic flags */ + uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ + uint32_t i_dir_start_lookup; /* which page to start lookup */ +@@ -166,6 +163,9 @@ static inline unsigned exofs_io_state_si + */ + #define OBJ_2BCREATED 0 /* object will be created soon*/ + #define OBJ_CREATED 1 /* object has been created on the osd*/ ++/* Below are not used atomic but reuse the same i_flags */ ++#define OBJ_LAYOUT_IS_GIVEN 2 /* inode has given layouts to clients*/ ++#define OBJ_IN_LAYOUT_RECALL 3 /* inode is in the middle of a layout recall*/ + + static inline int obj_2bcreated(struct exofs_i_info *oi) + { +@@ -304,4 +304,20 @@ extern const struct inode_operations exo + extern const struct inode_operations exofs_symlink_inode_operations; + extern const struct inode_operations exofs_fast_symlink_inode_operations; + ++/* export.c */ ++typedef int (exofs_recall_fn)(struct inode *inode); ++#ifdef CONFIG_PNFSD ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo); ++void exofs_init_export(struct super_block *sb); ++#else ++static inline int exofs_inode_recall_layout(struct inode *inode, ++ enum pnfs_iomode iomode, exofs_recall_fn todo) ++{ ++ return todo(inode); ++} ++ ++static inline void exofs_init_export(struct super_block *sb) {} ++#endif ++ + #endif +diff -up linux-2.6.35.noarch/fs/exofs/export.c.orig linux-2.6.35.noarch/fs/exofs/export.c +--- linux-2.6.35.noarch/fs/exofs/export.c.orig 2010-08-31 21:11:40.886160609 -0400 ++++ linux-2.6.35.noarch/fs/exofs/export.c 2010-08-31 21:11:40.886160609 -0400 +@@ -0,0 +1,396 @@ ++/* ++ * export.c - Implementation of the pnfs_export_operations ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Boaz Harrosh ++ * ++ * This file is part of exofs. ++ * ++ * exofs is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation. Since it is based on ext2, and the only ++ * valid version of GPL for the Linux kernel is version 2, the only valid ++ * version of GPL for exofs is version 2. ++ * ++ * exofs is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with exofs; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include "exofs.h" ++ ++static int exofs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_OSD2_OBJECTS; ++} ++ ++static void set_dev_id(struct pnfs_deviceid *pnfs_devid, u64 sbid, u64 devid) ++{ ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)pnfs_devid; ++ ++ dev_id->sbid = sbid; ++ dev_id->devid = devid; ++} ++ ++static int cb_layout_recall(struct inode *inode, enum pnfs_iomode iomode, ++ u64 offset, u64 length, void *cookie) ++{ ++ struct nfsd4_pnfs_cb_layout cbl; ++ struct pnfsd_cb_ctl cb_ctl; ++ int status; ++ ++ memset(&cb_ctl, 0, sizeof(cb_ctl)); ++ status = pnfsd_get_cb_op(&cb_ctl); ++ if (unlikely(status)) { ++ EXOFS_ERR("%s: nfsd unloaded!! inode (0x%lx) status=%d\n", ++ __func__, inode->i_ino, status); ++ goto err; ++ } ++ ++ memset(&cbl, 0, sizeof(cbl)); ++ cbl.cbl_recall_type = RETURN_FILE; ++ cbl.cbl_seg.layout_type = LAYOUT_OSD2_OBJECTS; ++ cbl.cbl_seg.iomode = iomode; ++ cbl.cbl_seg.offset = offset; ++ cbl.cbl_seg.length = length; ++ cbl.cbl_cookie = cookie; ++ ++ status = cb_ctl.cb_op->cb_layout_recall(inode->i_sb, inode, &cbl); ++ pnfsd_put_cb_op(&cb_ctl); ++ ++err: ++ return status; ++} ++ ++static enum nfsstat4 exofs_layout_get( ++ struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *args, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; ++ struct exofs_layout *el = &sbi->layout; ++ struct pnfs_osd_object_cred *creds = NULL; ++ struct pnfs_osd_layout layout; ++ __be32 *start; ++ bool in_recall; ++ int i, err; ++ enum nfsstat4 nfserr; ++ ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ res->lg_seg.iomode = IOMODE_RW; ++ res->lg_return_on_close = true; /* TODO: unused but will be soon */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ creds = kcalloc(el->s_numdevs, sizeof(*creds), GFP_KERNEL); ++ if (!creds) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto out; ++ } ++ ++ /* Fill in a pnfs_osd_layout struct */ ++ layout.olo_map = sbi->data_map; ++ ++ for (i = 0; i < el->s_numdevs; i++) { ++ struct pnfs_osd_object_cred *cred = &creds[i]; ++ osd_id id = exofs_oi_objno(oi); ++ unsigned dev = exofs_layout_od_id(el, id, i); ++ ++ set_dev_id(&cred->oc_object_id.oid_device_id, args->lg_sbid, ++ dev); ++ cred->oc_object_id.oid_partition_id = el->s_pid; ++ cred->oc_object_id.oid_object_id = id; ++ cred->oc_osd_version = osd_dev_is_ver1(el->s_ods[dev]) ? 
++ PNFS_OSD_VERSION_1 : ++ PNFS_OSD_VERSION_2; ++ cred->oc_cap_key_sec = PNFS_OSD_CAP_KEY_SEC_NONE; ++ ++ cred->oc_cap_key.cred_len = 0; ++ cred->oc_cap_key.cred = NULL; ++ ++ cred->oc_cap.cred_len = OSD_CAP_LEN; ++ cred->oc_cap.cred = oi->i_cred; ++ } ++ ++ layout.olo_comps_index = 0; ++ layout.olo_num_comps = el->s_numdevs; ++ layout.olo_comps = creds; ++ ++ err = pnfs_osd_xdr_encode_layout(xdr, &layout); ++ if (err) { ++ nfserr = NFS4ERR_TOOSMALL; /* FIXME: Change osd_xdr error codes */ ++ goto out; ++ } ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ if (!in_recall) { ++ __set_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ nfserr = NFS4_OK; ++ } else { ++ nfserr = NFS4ERR_RECALLCONFLICT; ++ } ++ spin_unlock(&oi->i_layout_lock); ++ ++out: ++ kfree(creds); ++ EXOFS_DBGMSG("(0x%lx) nfserr=%u xdr_bytes=%zu\n", ++ inode->i_ino, nfserr, exp_xdr_qbytes(xdr->p - start)); ++ return nfserr; ++} ++ ++/* NOTE: inode mutex must NOT be held */ ++static int exofs_layout_commit( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ struct timespec mtime; ++ loff_t i_size; ++ int in_recall; ++ ++ /* In case of a recall we ignore the new size and mtime since they ++ * are going to be changed again by truncate, and since we cannot take ++ * the inode lock in that case. ++ */ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ if (in_recall) { ++ EXOFS_DBGMSG("(0x%lx) commit was called during recall\n", ++ inode->i_ino); ++ return 0; ++ } ++ ++ /* NOTE: I would love to call inode_setattr here ++ * but i cannot since this will cause an eventual vmtruncate, ++ * which will cause a layout_recall. So open code the i_size ++ * and mtime/atime changes under i_mutex. ++ */ ++ mutex_lock_nested(&inode->i_mutex, I_MUTEX_NORMAL); ++ ++ if (args->lc_mtime.seconds) { ++ mtime.tv_sec = args->lc_mtime.seconds; ++ mtime.tv_nsec = args->lc_mtime.nseconds; ++ ++ /* layout commit may only make time bigger, since there might ++ * be reordering of the notifications and it might arrive after ++ * A local change. ++ * TODO: if mtime > ctime then we know set_attr did an mtime ++ * in the future. and we can let this update through ++ */ ++ if (0 <= timespec_compare(&mtime, &inode->i_mtime)) ++ mtime = inode->i_mtime; ++ } else { ++ mtime = current_fs_time(inode->i_sb); ++ } ++ ++ /* TODO: Will below work? 
since mark_inode_dirty has it's own ++ * Time handling ++ */ ++ inode->i_atime = inode->i_mtime = mtime; ++ ++ i_size = i_size_read(inode); ++ if (args->lc_newoffset) { ++ loff_t new_size = args->lc_last_wr + 1; ++ ++ if (i_size < new_size) { ++ i_size_write(inode, i_size = new_size); ++ res->lc_size_chg = 1; ++ res->lc_newsize = new_size; ++ } ++ } ++ /* TODO: else { i_size = osd_get_object_length() } */ ++ ++/* TODO: exofs does not currently use the osd_xdr part of the layout_commit */ ++ ++ mark_inode_dirty_sync(inode); ++ ++ mutex_unlock(&inode->i_mutex); ++ EXOFS_DBGMSG("(0x%lx) i_size=0x%llx lcp->off=0x%llx\n", ++ inode->i_ino, i_size, args->lc_last_wr); ++ return 0; ++} ++ ++static void exofs_handle_error(struct pnfs_osd_ioerr *ioerr) ++{ ++ EXOFS_ERR("exofs_handle_error: errno=%d is_write=%d obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ ioerr->oer_errno, ioerr->oer_iswrite, ++ _LLU(ioerr->oer_component.oid_object_id), ++ _LLU(ioerr->oer_comp_offset), ++ _LLU(ioerr->oer_comp_length)); ++} ++ ++static int exofs_layout_return( ++ struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ __be32 *p = args->lrf_body; ++ unsigned len = exp_xdr_qwords(args->lrf_body_len); ++ ++ EXOFS_DBGMSG("(0x%lx) cookie %p xdr_len %d\n", ++ inode->i_ino, args->lr_cookie, len); ++ ++ while (len >= pnfs_osd_ioerr_xdr_sz()) { ++ struct pnfs_osd_ioerr ioerr; ++ ++ p = pnfs_osd_xdr_decode_ioerr(&ioerr, p); ++ len -= pnfs_osd_ioerr_xdr_sz(); ++ exofs_handle_error(&ioerr); ++ } ++ ++ if (args->lr_cookie) { ++ struct exofs_i_info *oi = exofs_i(inode); ++ bool in_recall; ++ ++ spin_lock(&oi->i_layout_lock); ++ in_recall = test_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ __clear_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ /* TODO: how to communicate cookie with the waiter */ ++ if (in_recall) ++ wake_up(&oi->i_wq); /* wakeup any recalls */ ++ } ++ ++ return 0; ++} ++ ++int exofs_get_device_info(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct exofs_sb_info *sbi = sb->s_fs_info; ++ struct pnfs_osd_deviceaddr devaddr; ++ const struct osd_dev_info *odi; ++ u64 devno = devid->devid; ++ __be32 *start; ++ int err; ++ ++ memset(&devaddr, 0, sizeof(devaddr)); ++ ++ if (unlikely(devno >= sbi->layout.s_numdevs)) ++ return -ENODEV; ++ ++ odi = osduld_device_info(sbi->layout.s_ods[devno]); ++ ++ devaddr.oda_systemid.len = odi->systemid_len; ++ devaddr.oda_systemid.data = (void *)odi->systemid; /* !const cast */ ++ ++ devaddr.oda_osdname.len = odi->osdname_len ; ++ devaddr.oda_osdname.data = (void *)odi->osdname;/* !const cast */ ++ ++ /* skip opaque size, will be filled-in later */ ++ start = exp_xdr_reserve_qwords(xdr, 1); ++ if (!start) { ++ err = -E2BIG; ++ goto err; ++ } ++ ++ err = pnfs_osd_xdr_encode_deviceaddr(xdr, &devaddr); ++ if (err) ++ goto err; ++ ++ exp_xdr_encode_opaque_len(start, xdr->p); ++ ++ EXOFS_DBGMSG("xdr_bytes=%Zu devno=%lld osdname-%s\n", ++ exp_xdr_qbytes(xdr->p - start), devno, odi->osdname); ++ return 0; ++ ++err: ++ EXOFS_DBGMSG("Error: err=%d at_byte=%zu\n", ++ err, exp_xdr_qbytes(xdr->p - start)); ++ return err; ++} ++ ++struct pnfs_export_operations exofs_pnfs_ops = { ++ .layout_type = exofs_layout_type, ++ .layout_get = exofs_layout_get, ++ .layout_commit = exofs_layout_commit, ++ .layout_return = exofs_layout_return, ++ .get_device_info = exofs_get_device_info, ++}; ++ ++static bool is_layout_returned(struct exofs_i_info *oi) ++{ ++ bool 
layout_given; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ return !layout_given; ++} ++ ++int exofs_inode_recall_layout(struct inode *inode, enum pnfs_iomode iomode, ++ exofs_recall_fn todo) ++{ ++ struct exofs_i_info *oi = exofs_i(inode); ++ int layout_given; ++ int error = 0; ++ ++ spin_lock(&oi->i_layout_lock); ++ layout_given = test_bit(OBJ_LAYOUT_IS_GIVEN, &oi->i_flags); ++ __set_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ ++ if (!layout_given) ++ goto exec; ++ ++ for (;;) { ++ EXOFS_DBGMSG("(0x%lx) has_layout issue a recall\n", ++ inode->i_ino); ++ error = cb_layout_recall(inode, iomode, 0, NFS4_MAX_UINT64, ++ &oi->i_wq); ++ switch (error) { ++ case 0: ++ case -EAGAIN: ++ break; ++ case -ENOENT: ++ goto exec; ++ default: ++ goto err; ++ } ++ ++ error = wait_event_interruptible(oi->i_wq, ++ is_layout_returned(oi)); ++ if (error) ++ goto err; ++ } ++ ++exec: ++ error = todo(inode); ++ ++err: ++ spin_lock(&oi->i_layout_lock); ++ __clear_bit(OBJ_IN_LAYOUT_RECALL, &oi->i_flags); ++ spin_unlock(&oi->i_layout_lock); ++ EXOFS_DBGMSG("(0x%lx) return=>%d\n", inode->i_ino, error); ++ return error; ++} ++ ++void exofs_init_export(struct super_block *sb) ++{ ++ sb->s_pnfs_op = &exofs_pnfs_ops; ++} +diff -up linux-2.6.35.noarch/fs/exofs/inode.c.orig linux-2.6.35.noarch/fs/exofs/inode.c +--- linux-2.6.35.noarch/fs/exofs/inode.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exofs/inode.c 2010-08-31 21:11:40.887170623 -0400 +@@ -863,7 +863,7 @@ void exofs_truncate(struct inode *inode) + if (unlikely(wait_obj_created(oi))) + goto fail; + +- ret = _do_truncate(inode); ++ ret = exofs_inode_recall_layout(inode, IOMODE_ANY, _do_truncate); + if (ret) + goto fail; + +@@ -994,6 +994,7 @@ static void __oi_init(struct exofs_i_inf + { + init_waitqueue_head(&oi->i_wq); + oi->i_flags = 0; ++ spin_lock_init(&oi->i_layout_lock); + } + /* + * Fill in an inode read from the OSD and set it up for use +diff -up linux-2.6.35.noarch/fs/exofs/Kbuild.orig linux-2.6.35.noarch/fs/exofs/Kbuild +--- linux-2.6.35.noarch/fs/exofs/Kbuild.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exofs/Kbuild 2010-08-31 21:11:40.884160557 -0400 +@@ -13,4 +13,5 @@ + # + + exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o ++exofs-$(CONFIG_PNFSD) += export.o + obj-$(CONFIG_EXOFS_FS) += exofs.o +diff -up linux-2.6.35.noarch/fs/exofs/Kconfig.orig linux-2.6.35.noarch/fs/exofs/Kconfig +--- linux-2.6.35.noarch/fs/exofs/Kconfig.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exofs/Kconfig 2010-08-31 21:11:40.884160557 -0400 +@@ -1,6 +1,7 @@ + config EXOFS_FS + tristate "exofs: OSD based file system support" + depends on SCSI_OSD_ULD ++ select EXPORTFS_OSD_LAYOUT if PNFSD + help + EXOFS is a file system that uses an OSD storage device, + as its backing storage. 
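The inode.c hunk above illustrates the calling convention for
exofs_inode_recall_layout(): an operation that invalidates outstanding
client layouts is passed in as an exofs_recall_fn callback, so any
layouts handed out are recalled and waited on before the operation
runs. A minimal sketch of that pattern, using a hypothetical
_do_punch_hole callback in place of _do_truncate:

	/* Sketch only: _do_punch_hole is hypothetical; the wrapper call
	 * mirrors how exofs_truncate() invokes _do_truncate above.
	 */
	static int _do_punch_hole(struct inode *inode)
	{
		/* modify the OSD objects backing the inode here */
		return 0;
	}

	static int example_punch_hole(struct inode *inode)
	{
		/* recalls any granted layouts, waits for their return,
		 * then runs the callback with new grants fenced off */
		return exofs_inode_recall_layout(inode, IOMODE_ANY,
						 _do_punch_hole);
	}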
+diff -up linux-2.6.35.noarch/fs/exofs/super.c.orig linux-2.6.35.noarch/fs/exofs/super.c +--- linux-2.6.35.noarch/fs/exofs/super.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exofs/super.c 2010-08-31 21:11:40.889140759 -0400 +@@ -621,6 +621,7 @@ static int exofs_fill_super(struct super + sb->s_fs_info = sbi; + sb->s_op = &exofs_sops; + sb->s_export_op = &exofs_export_ops; ++ exofs_init_export(sb); + root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); + if (IS_ERR(root)) { + EXOFS_ERR("ERROR: exofs_iget failed\n"); +diff -up linux-2.6.35.noarch/fs/exportfs/expfs.c.orig linux-2.6.35.noarch/fs/exportfs/expfs.c +--- linux-2.6.35.noarch/fs/exportfs/expfs.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exportfs/expfs.c 2010-08-31 21:11:40.890150938 -0400 +@@ -16,6 +16,13 @@ + #include + #include + ++#if defined(CONFIG_PNFSD) ++struct pnfsd_cb_ctl pnfsd_cb_ctl = { ++ .lock = __SPIN_LOCK_UNLOCKED(pnfsd_cb_ctl.lock) ++}; ++EXPORT_SYMBOL(pnfsd_cb_ctl); ++#endif /* CONFIG_PNFSD */ ++ + #define dprintk(fmt, args...) do{}while(0) + + +diff -up linux-2.6.35.noarch/fs/exportfs/Makefile.orig linux-2.6.35.noarch/fs/exportfs/Makefile +--- linux-2.6.35.noarch/fs/exportfs/Makefile.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/exportfs/Makefile 2010-08-31 21:11:40.889140759 -0400 +@@ -3,4 +3,7 @@ + + obj-$(CONFIG_EXPORTFS) += exportfs.o + +-exportfs-objs := expfs.o ++exportfs-y := expfs.o ++exportfs-$(CONFIG_EXPORTFS_FILE_LAYOUT) += nfs4filelayoutxdr.o ++exportfs-$(CONFIG_EXPORTFS_OSD_LAYOUT) += pnfs_osd_xdr_srv.o ++exportfs-$(CONFIG_EXPORTFS_BLOCK_LAYOUT) += nfs4blocklayoutxdr.o +diff -up linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c +--- linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c.orig 2010-08-31 21:11:40.891160475 -0400 ++++ linux-2.6.35.noarch/fs/exportfs/nfs4blocklayoutxdr.c 2010-08-31 21:11:40.891160475 -0400 +@@ -0,0 +1,158 @@ ++/* ++ * linux/fs/nfsd/nfs4blocklayoutxdr.c ++ * ++ * ++ * Created by Rick McNeal on 3/31/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. 
++ * ++ */ ++#include ++#include ++#include ++#include ++ ++static int ++bl_encode_simple(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 12 + 4 + bld->u.simple.bld_sig_len); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u32(p, 1); ++ p = exp_xdr_encode_u64(p, bld->u.simple.bld_offset); ++ exp_xdr_encode_opaque(p, bld->u.simple.bld_sig, ++ bld->u.simple.bld_sig_len); ++ ++ return 0; ++} ++ ++static int ++bl_encode_slice(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2 + 2 + 1); ++ ++ if (!p) ++ return -ETOOSMALL; ++ ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_start); ++ p = exp_xdr_encode_u64(p, bld->u.slice.bld_len); ++ exp_xdr_encode_u32(p, bld->u.slice.bld_index); ++ ++ return 0; ++} ++ ++static int ++bl_encode_concat(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ return -ENOTSUPP; ++} ++ ++static int ++bl_encode_stripe(struct exp_xdr_stream *xdr, pnfs_blocklayout_devinfo_t *bld) ++{ ++ int i; ++ __be32 *p = exp_xdr_reserve_space(xdr, ++ 2 + 1 + bld->u.stripe.bld_stripes); ++ ++ p = exp_xdr_encode_u64(p, bld->u.stripe.bld_chunk_size); ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripes); ++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) ++ p = exp_xdr_encode_u32(p, bld->u.stripe.bld_stripe_indexs[i]); ++ ++ return 0; ++} ++ ++int ++blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes) ++{ ++ u32 num_vols = 0, ++ *layoutlen_p = xdr->p; ++ pnfs_blocklayout_devinfo_t *bld; ++ int status = 0; ++ __be32 *p; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -ETOOSMALL; ++ p += 2; ++ ++ /* ++ * All simple volumes with their signature are required to be listed ++ * first. ++ */ ++ list_for_each_entry(bld, volumes, bld_list) { ++ num_vols++; ++ p = exp_xdr_reserve_qwords(xdr, 1); ++ if (!p) ++ return -ETOOSMALL; ++ p = exp_xdr_encode_u32(p, bld->bld_type); ++ switch (bld->bld_type) { ++ case PNFS_BLOCK_VOLUME_SIMPLE: ++ status = bl_encode_simple(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_SLICE: ++ status = bl_encode_slice(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_CONCAT: ++ status = bl_encode_concat(xdr, bld); ++ break; ++ case PNFS_BLOCK_VOLUME_STRIPE: ++ status = bl_encode_stripe(xdr, bld); ++ break; ++ default: ++ BUG(); ++ } ++ if (status) ++ goto error; ++ } ++ ++ /* ---- Fill in the overall length and number of volumes ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (xdr->p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, num_vols); ++ ++error: ++ return status; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_devinfo); ++ ++enum nfsstat4 ++blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *bl_head) ++{ ++ struct pnfs_blocklayout_layout *b; ++ u32 *layoutlen_p = xdr->p, ++ extents = 0; ++ __be32 *p; ++ ++ /* ++ * Save spot for opaque block layout length and number of extents, ++ * fill-in later. 
++ */ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p += 2; ++ ++ list_for_each_entry(b, bl_head, bll_list) { ++ extents++; ++ p = exp_xdr_reserve_qwords(xdr, 5 * 2 + 1); ++ if (!p) ++ return NFS4ERR_TOOSMALL; ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.sbid); ++ p = exp_xdr_encode_u64(p, b->bll_vol_id.devid); ++ p = exp_xdr_encode_u64(p, b->bll_foff); ++ p = exp_xdr_encode_u64(p, b->bll_len); ++ p = exp_xdr_encode_u64(p, b->bll_soff); ++ p = exp_xdr_encode_u32(p, b->bll_es); ++ } ++ ++ /* ---- Fill in the overall length and number of extents ---- */ ++ p = exp_xdr_encode_u32(layoutlen_p, (p - layoutlen_p - 1) * 4); ++ exp_xdr_encode_u32(p, extents); ++ ++ return NFS4_OK; ++} ++EXPORT_SYMBOL_GPL(blocklayout_encode_layout); +diff -up linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c +--- linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c.orig 2010-08-31 21:11:40.891160475 -0400 ++++ linux-2.6.35.noarch/fs/exportfs/nfs4filelayoutxdr.c 2010-08-31 21:11:40.891160475 -0400 +@@ -0,0 +1,218 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++ ++/* We do our-own dprintk so filesystems are not dependent on sunrpc */ ++#ifdef dprintk ++#undef dprintk ++#endif ++#define dprintk(fmt, args, ...) do { } while (0) ++ ++/* Calculate the XDR length of the GETDEVICEINFO4resok structure ++ * excluding the gdir_notification and the gdir_device_addr da_layout_type. 
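++ * (XDR encodes in 4-byte words; exp_xdr_qwords() converts a byte count
++ * into words, rounding up, so each opaque below costs one length word
++ * plus its padded data words.)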
++ */ ++static int fl_devinfo_xdr_words(const struct pnfs_filelayout_device *fdev) ++{ ++ struct pnfs_filelayout_devaddr *fl_addr; ++ struct pnfs_filelayout_multipath *mp; ++ int i, j, nwords; ++ ++ /* da_addr_body length, indice length, indices, ++ * multipath_list4 length */ ++ nwords = 1 + 1 + fdev->fl_stripeindices_length + 1; ++ for (i = 0; i < fdev->fl_device_length; i++) { ++ mp = &fdev->fl_device_list[i]; ++ nwords++; /* multipath list length */ ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ fl_addr = mp->fl_multipath_list; ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_netid.len); ++ nwords += 1 + exp_xdr_qwords(fl_addr->r_addr.len); ++ } ++ } ++ dprintk("<-- %s nwords %d\n", __func__, nwords); ++ return nwords; ++} ++ ++/* Encodes the nfsv4_1_file_layout_ds_addr4 structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. ++ */ ++int ++filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ unsigned int i, j, len = 0, opaque_words; ++ u32 *p_in; ++ u32 index_count = fdev->fl_stripeindices_length; ++ u32 dev_count = fdev->fl_device_length; ++ int error = 0; ++ __be32 *p; ++ ++ opaque_words = fl_devinfo_xdr_words(fdev); ++ dprintk("%s: Begin indx_cnt: %u dev_cnt: %u total size %u\n", ++ __func__, ++ index_count, ++ dev_count, ++ opaque_words*4); ++ ++ /* check space for opaque length */ ++ p = p_in = exp_xdr_reserve_qwords(xdr, opaque_words); ++ if (!p) { ++ error = -ETOOSMALL; ++ goto out; ++ } ++ ++ /* Fill in length later */ ++ p++; ++ ++ /* encode device list indices */ ++ p = exp_xdr_encode_u32(p, index_count); ++ for (i = 0; i < index_count; i++) ++ p = exp_xdr_encode_u32(p, fdev->fl_stripeindices_list[i]); ++ ++ /* encode device list */ ++ p = exp_xdr_encode_u32(p, dev_count); ++ for (i = 0; i < dev_count; i++) { ++ struct pnfs_filelayout_multipath *mp = &fdev->fl_device_list[i]; ++ ++ p = exp_xdr_encode_u32(p, mp->fl_multipath_length); ++ for (j = 0; j < mp->fl_multipath_length; j++) { ++ struct pnfs_filelayout_devaddr *da = ++ &mp->fl_multipath_list[j]; ++ ++ /* Encode device info */ ++ p = exp_xdr_encode_opaque(p, da->r_netid.data, ++ da->r_netid.len); ++ p = exp_xdr_encode_opaque(p, da->r_addr.data, ++ da->r_addr.len); ++ } ++ } ++ ++ /* backfill in length. Subtract 4 for da_addr_body size */ ++ len = (char *)p - (char *)p_in; ++ exp_xdr_encode_u32(p_in, len - 4); ++ ++ error = 0; ++out: ++ dprintk("%s: End err %d xdrlen %d\n", ++ __func__, error, len); ++ return error; ++} ++EXPORT_SYMBOL(filelayout_encode_devinfo); ++ ++/* Encodes the loc_body structure from draft 13 ++ * on the response stream. ++ * Use linux error codes (not nfs) since these values are being ++ * returned to the file system. 
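++ * (nfl_util packs the stripe unit size together with the
++ * NFL4_UFLG_COMMIT_THRU_MDS and NFL4_UFLG_DENSE flag bits into a
++ * single word; see the encoding below.)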
++ */ ++enum nfsstat4 ++filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp) ++{ ++ u32 len = 0, nfl_util, fhlen, i; ++ u32 *layoutlen_p; ++ enum nfsstat4 nfserr; ++ __be32 *p; ++ ++ dprintk("%s: device_id %llx:%llx fsi %u, numfh %u\n", ++ __func__, ++ flp->device_id.pnfs_fsid, ++ flp->device_id.pnfs_devid, ++ flp->lg_first_stripe_index, ++ flp->lg_fh_length); ++ ++ /* Ensure file system added at least one file handle */ ++ if (flp->lg_fh_length <= 0) { ++ dprintk("%s: File Layout has no file handles!!\n", __func__); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto out; ++ } ++ ++ /* Ensure room for len, devid, util, first_stripe_index, ++ * pattern_offset, number of filehandles */ ++ p = layoutlen_p = exp_xdr_reserve_qwords(xdr, 1+2+2+1+1+2+1); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ ++ /* save spot for opaque file layout length, fill-in later*/ ++ p++; ++ ++ /* encode device id */ ++ p = exp_xdr_encode_u64(p, flp->device_id.sbid); ++ p = exp_xdr_encode_u64(p, flp->device_id.devid); ++ ++ /* set and encode flags */ ++ nfl_util = flp->lg_stripe_unit; ++ if (flp->lg_commit_through_mds) ++ nfl_util |= NFL4_UFLG_COMMIT_THRU_MDS; ++ if (flp->lg_stripe_type == STRIPE_DENSE) ++ nfl_util |= NFL4_UFLG_DENSE; ++ p = exp_xdr_encode_u32(p, nfl_util); ++ ++ /* encode first stripe index */ ++ p = exp_xdr_encode_u32(p, flp->lg_first_stripe_index); ++ ++ /* encode striping pattern start */ ++ p = exp_xdr_encode_u64(p, flp->lg_pattern_offset); ++ ++ /* encode number of file handles */ ++ p = exp_xdr_encode_u32(p, flp->lg_fh_length); ++ ++ /* encode file handles */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ fhlen = flp->lg_fh_list[i].fh_size; ++ p = exp_xdr_reserve_space(xdr, 4 + fhlen); ++ if (!p) { ++ nfserr = NFS4ERR_TOOSMALL; ++ goto out; ++ } ++ p = exp_xdr_encode_opaque(p, &flp->lg_fh_list[i].fh_base, fhlen); ++ } ++ ++ /* Set number of bytes encoded = total_bytes_encoded - length var */ ++ len = (char *)p - (char *)layoutlen_p; ++ exp_xdr_encode_u32(layoutlen_p, len - 4); ++ ++ nfserr = NFS4_OK; ++out: ++ dprintk("%s: End err %u xdrlen %d\n", ++ __func__, nfserr, len); ++ return nfserr; ++} ++EXPORT_SYMBOL(filelayout_encode_layout); +diff -up linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c +--- linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c.orig 2010-08-31 21:11:40.892181210 -0400 ++++ linux-2.6.35.noarch/fs/exportfs/pnfs_osd_xdr_srv.c 2010-08-31 21:11:40.892181210 -0400 +@@ -0,0 +1,289 @@ ++/* ++ * pnfs_osd_xdr_enc.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_data_map( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_data_map *data_map) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 1+2+1+1+1+1); ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, data_map->odm_num_comps); ++ p = exp_xdr_encode_u64(p, data_map->odm_stripe_unit); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_width); ++ p = exp_xdr_encode_u32(p, data_map->odm_group_depth); ++ p = exp_xdr_encode_u32(p, data_map->odm_mirror_cnt); ++ p = exp_xdr_encode_u32(p, data_map->odm_raid_algorithm); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline int pnfs_osd_xdr_encode_objid( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p = exp_xdr_reserve_qwords(xdr, 2+2+2+2); ++ struct nfsd4_pnfs_deviceid *dev_id = ++ (struct nfsd4_pnfs_deviceid *)&object_id->oid_device_id; ++ ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u64(p, dev_id->sbid); ++ p = exp_xdr_encode_u64(p, dev_id->devid); ++ p = exp_xdr_encode_u64(p, object_id->oid_partition_id); ++ p = exp_xdr_encode_u64(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * enum pnfs_osd_cap_key_sec4 { ++ * PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ * PNFS_OSD_CAP_KEY_SEC_SSV = 1 ++ * }; ++ * ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static int pnfs_osd_xdr_encode_object_cred( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_object_cred *olo_comp) ++{ ++ __be32 *p; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_objid(xdr, &olo_comp->oc_object_id); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_space(xdr, 3*4 + 4+olo_comp->oc_cap.cred_len); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, olo_comp->oc_osd_version); ++ ++ /* No sec for now */ ++ p = exp_xdr_encode_u32(p, PNFS_OSD_CAP_KEY_SEC_NONE); ++ p = exp_xdr_encode_u32(p, 0); /* opaque oc_capability_key<> */ ++ ++ exp_xdr_encode_opaque(p, olo_comp->oc_cap.cred, ++ olo_comp->oc_cap.cred_len); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_layout { ++ * struct pnfs_osd_data_map olo_map; ++ * u32 
olo_comps_index; ++ * u32 olo_num_comps; ++ * struct pnfs_osd_object_cred *olo_comps; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_layout( ++ struct exp_xdr_stream *xdr, ++ struct pnfs_osd_layout *pol) ++{ ++ __be32 *p; ++ u32 i; ++ int err; ++ ++ err = pnfs_osd_xdr_encode_data_map(xdr, &pol->olo_map); ++ if (err) ++ return err; ++ ++ p = exp_xdr_reserve_qwords(xdr, 2); ++ if (!p) ++ return -E2BIG; ++ ++ p = exp_xdr_encode_u32(p, pol->olo_comps_index); ++ p = exp_xdr_encode_u32(p, pol->olo_num_comps); ++ ++ for (i = 0; i < pol->olo_num_comps; i++) { ++ err = pnfs_osd_xdr_encode_object_cred(xdr, &pol->olo_comps[i]); ++ if (err) ++ return err; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_layout); ++ ++static int _encode_string(struct exp_xdr_stream *xdr, ++ const struct nfs4_string *str) ++{ ++ __be32 *p = exp_xdr_reserve_space(xdr, 4 + str->len); ++ ++ if (!p) ++ return -E2BIG; ++ exp_xdr_encode_opaque(p, str->data, str->len); ++ return 0; ++} ++ ++/* struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr) ++{ ++ __be32 *p; ++ int err; ++ ++ p = exp_xdr_reserve_space(xdr, 4 + 4 + sizeof(devaddr->oda_lun)); ++ if (!p) ++ return -E2BIG; ++ ++ /* Empty oda_targetid */ ++ p = exp_xdr_encode_u32(p, OBJ_TARGET_ANON); ++ ++ /* Empty oda_targetaddr for now */ ++ p = exp_xdr_encode_u32(p, 0); ++ ++ /* oda_lun */ ++ exp_xdr_encode_bytes(p, devaddr->oda_lun, sizeof(devaddr->oda_lun)); ++ ++ err = _encode_string(xdr, &devaddr->oda_systemid); ++ if (err) ++ return err; ++ ++ err = pnfs_osd_xdr_encode_object_cred(xdr, ++ &devaddr->oda_root_obj_cred); ++ if (err) ++ return err; ++ ++ err = _encode_string(xdr, &devaddr->oda_osdname); ++ if (err) ++ return err; ++ ++ return 0; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_encode_deviceaddr); ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p) ++{ ++ lou->dsu_valid = be32_to_cpu(*p++); ++ if (lou->dsu_valid) ++ p = xdr_decode_hyper(p, &lou->dsu_delta); ++ lou->olu_ioerr_flag = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_layoutupdate); ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline __be32 * ++pnfs_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) ++{ ++ /* FIXME: p = xdr_decode_fixed(...) 
*/ ++ memcpy(objid->oid_device_id.data, p, sizeof(objid->oid_device_id.data)); ++ p += XDR_QUADLEN(sizeof(objid->oid_device_id.data)); ++ ++ p = xdr_decode_hyper(p, &objid->oid_partition_id); ++ p = xdr_decode_hyper(p, &objid->oid_object_id); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++__be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p) ++{ ++ p = pnfs_osd_xdr_decode_objid(p, &ioerr->oer_component); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_offset); ++ p = xdr_decode_hyper(p, &ioerr->oer_comp_length); ++ ioerr->oer_iswrite = be32_to_cpu(*p++); ++ ioerr->oer_errno = be32_to_cpu(*p++); ++ return p; ++} ++EXPORT_SYMBOL(pnfs_osd_xdr_decode_ioerr); +diff -up linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig linux-2.6.35.noarch/fs/gfs2/ops_fstype.c +--- linux-2.6.35.noarch/fs/gfs2/ops_fstype.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/gfs2/ops_fstype.c 2010-08-31 21:11:40.893160525 -0400 +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "gfs2.h" + #include "incore.h" +@@ -1147,6 +1148,9 @@ static int fill_super(struct super_block + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; ++#if defined(CONFIG_PNFSD) ++ sb->s_pnfs_op = &pnfs_dlm_export_ops; ++#endif /* CONFIG_PNFSD */ + sb->s_xattr = gfs2_xattr_handlers; + sb->s_qcop = &gfs2_quotactl_ops; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; +diff -up linux-2.6.35.noarch/fs/Kconfig.orig linux-2.6.35.noarch/fs/Kconfig +--- linux-2.6.35.noarch/fs/Kconfig.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/Kconfig 2010-08-31 21:11:40.883109385 -0400 +@@ -224,6 +224,31 @@ config LOCKD_V4 + config EXPORTFS + tristate + ++config EXPORTFS_FILE_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 files layout type. ++ Must be automatically selected by supporting filesystems. ++ ++config EXPORTFS_OSD_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 objects layout type. ++ Must be automatically selected by supporting osd ++ filesystems. ++ ++ If unsure, say N. ++ ++config EXPORTFS_BLOCK_LAYOUT ++ bool ++ depends on PNFSD && EXPORTFS ++ help ++ Exportfs support for the NFSv4.1 blocks layout type. ++ Must be automatically selected by supporting filesystems. 
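++
++	  A supporting filesystem pulls one of these options in from its
++	  own Kconfig entry; the fs/exofs/Kconfig hunk earlier in this
++	  patch, for example, adds:
++
++	      select EXPORTFS_OSD_LAYOUT if PNFSD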
++ ++ + config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c +--- linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c.orig 2010-08-31 21:11:40.896170526 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/block-device-discovery-pipe.c 2010-08-31 21:11:40.896170526 -0400 +@@ -0,0 +1,66 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pipefs_list bl_device_list; ++struct dentry *bl_device_pipe; ++ ++ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t len) ++{ ++ int err; ++ struct pipefs_hdr *msg; ++ ++ dprintk("Entering %s...\n", __func__); ++ ++ msg = pipefs_readmsg(filp, src, len); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: unable to read pipefs message.\n"); ++ return PTR_ERR(msg); ++ } ++ ++ /* now assign the result, which wakes the blocked thread */ ++ err = pipefs_assign_upcall_reply(msg, &bl_device_list); ++ if (err) { ++ dprintk("ERROR: failed to assign upcall with id %u\n", ++ msg->msgid); ++ kfree(msg); ++ } ++ return len; ++} ++ ++static const struct rpc_pipe_ops bl_pipe_ops = { ++ .upcall = pipefs_generic_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = pipefs_generic_destroy_msg, ++}; ++ ++int bl_pipe_init(void) ++{ ++ dprintk("%s: block_device pipefs registering...\n", __func__); ++ bl_device_pipe = pipefs_mkpipe("bl_device_pipe", &bl_pipe_ops, 1); ++ if (IS_ERR(bl_device_pipe)) ++ dprintk("ERROR, unable to make block_device pipe\n"); ++ ++ if (!bl_device_pipe) ++ dprintk("bl_device_pipe is NULL!\n"); ++ else ++ dprintk("bl_device_pipe created!\n"); ++ pipefs_init_list(&bl_device_list); ++ return 0; ++} ++ ++void bl_pipe_exit(void) ++{ ++ dprintk("%s: block_device pipefs unregistering...\n", __func__); ++ if (IS_ERR(bl_device_pipe)) ++ return ; ++ pipefs_closepipe(bl_device_pipe); ++ return; ++} +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c +--- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c.orig 2010-08-31 21:11:40.897085251 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.c 2010-08-31 21:11:40.898150313 -0400 +@@ -0,0 +1,1160 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. 
++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include ++ ++#include /* various write calls */ ++#include /* struct bio */ ++#include ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Andy Adamson "); ++MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); ++ ++/* Callback operations to the pNFS client */ ++static struct pnfs_client_operations *pnfs_block_callback_ops; ++ ++static void print_page(struct page *page) ++{ ++ dprintk("PRINTPAGE page %p\n", page); ++ dprintk(" PagePrivate %d\n", PagePrivate(page)); ++ dprintk(" PageUptodate %d\n", PageUptodate(page)); ++ dprintk(" PageError %d\n", PageError(page)); ++ dprintk(" PageDirty %d\n", PageDirty(page)); ++ dprintk(" PageReferenced %d\n", PageReferenced(page)); ++ dprintk(" PageLocked %d\n", PageLocked(page)); ++ dprintk(" PageWriteback %d\n", PageWriteback(page)); ++ dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); ++ dprintk("\n"); ++} ++ ++/* Given the be associated with isect, determine if page data needs to be ++ * initialized. ++ */ ++static int is_hole(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_NONE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return !is_sector_initialized(be->be_inval, isect); ++} ++ ++/* Given the be associated with isect, determine if page data can be ++ * written to disk. ++ */ ++static int is_writable(struct pnfs_block_extent *be, sector_t isect) ++{ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) ++ return 1; ++ else if (be->be_state != PNFS_BLOCK_INVALID_DATA) ++ return 0; ++ else ++ return is_sector_initialized(be->be_inval, isect); ++} ++ ++static int ++dont_like_caller(struct nfs_page *req) ++{ ++ if (atomic_read(&req->wb_complete)) { ++ /* Called by _multi */ ++ return 1; ++ } else { ++ /* Called by _one */ ++ return 0; ++ } ++} ++ ++static enum pnfs_try_status ++bl_commit(struct nfs_write_data *nfs_data, ++ int sync) ++{ ++ dprintk("%s enter\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++/* The data we are handed might be spread across several bios. We need ++ * to track when the last one is finished. 
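++ * Each bio takes a reference on the struct via get_parallel(); when the
++ * final put_parallel() drops the refcount to zero, destroy_parallel()
++ * invokes pnfs_callback to complete the containing nfs request.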
++ */ ++struct parallel_io { ++ struct kref refcnt; ++ struct rpc_call_ops call_ops; ++ void (*pnfs_callback) (void *data); ++ void *data; ++}; ++ ++static inline struct parallel_io *alloc_parallel(void *data) ++{ ++ struct parallel_io *rv; ++ ++ rv = kmalloc(sizeof(*rv), GFP_KERNEL); ++ if (rv) { ++ rv->data = data; ++ kref_init(&rv->refcnt); ++ } ++ return rv; ++} ++ ++static inline void get_parallel(struct parallel_io *p) ++{ ++ kref_get(&p->refcnt); ++} ++ ++static void destroy_parallel(struct kref *kref) ++{ ++ struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); ++ ++ dprintk("%s enter\n", __func__); ++ p->pnfs_callback(p->data); ++ kfree(p); ++} ++ ++static inline void put_parallel(struct parallel_io *p) ++{ ++ kref_put(&p->refcnt, destroy_parallel); ++} ++ ++static struct bio * ++bl_submit_bio(int rw, struct bio *bio) ++{ ++ if (bio) { ++ get_parallel(bio->bi_private); ++ dprintk("%s submitting %s bio %u@%llu\n", __func__, ++ rw == READ ? "read" : "write", ++ bio->bi_size, (u64)bio->bi_sector); ++ submit_bio(rw, bio); ++ } ++ return NULL; ++} ++ ++static inline void ++bl_done_with_rpage(struct page *page, const int ok) ++{ ++ if (ok) { ++ ClearPagePnfsErr(page); ++ SetPageUptodate(page); ++ } else { ++ ClearPageUptodate(page); ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ } ++ /* Page is unlocked via rpc_release. Should really be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_read(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_rpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++static void bl_read_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ pnfs_block_callback_ops->nfs_readlist_complete(rdata); ++} ++ ++static void ++bl_end_par_io_read(void *data) ++{ ++ struct nfs_read_data *rdata = data; ++ ++ INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); ++ schedule_work(&rdata->task.u.tk_work); ++} ++ ++/* We don't want normal .rpc_call_done callback used, so we replace it ++ * with this stub. ++ */ ++static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata) ++{ ++ return; ++} ++ ++static enum pnfs_try_status ++bl_read_pagelist(struct nfs_read_data *rdata, ++ unsigned nr_pages) ++{ ++ int i, hole; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL, *cow_read = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t f_offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct page **pages = rdata->args.pages; ++ int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__, ++ nr_pages, f_offset, count); ++ ++ if (dont_like_caller(rdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ goto use_mds; ++ } ++ if ((nr_pages == 1) && PagePnfsErr(rdata->req->wb_page)) { ++ /* We want to fall back to mds in case of read_page ++ * after error on read_pages. 
++ */ ++ dprintk("%s PG_pnfserr set\n", __func__); ++ goto use_mds; ++ } ++ par = alloc_parallel(rdata); ++ if (!par) ++ goto use_mds; ++ par->call_ops = *rdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_read; ++ /* At this point, we can no longer jump to use_mds */ ++ ++ isect = (sector_t) (f_offset >> 9); ++ /* Code assumes extents are page-aligned */ ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ put_extent(cow_read); ++ bio = bl_submit_bio(READ, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(rdata->pdata.lseg), ++ isect, &cow_read); ++ if (!be) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ if (cow_read) { ++ sector_t cow_length = cow_read->be_length - ++ (isect - cow_read->be_f_offset); ++ extent_length = min(extent_length, cow_length); ++ } ++ } ++ hole = is_hole(be, isect); ++ if (hole && !cow_read) { ++ bio = bl_submit_bio(READ, bio); ++ /* Fill hole w/ zeroes w/o accessing device */ ++ dprintk("%s Zeroing page for hole\n", __func__); ++ zero_user(pages[i], 0, ++ min_t(int, PAGE_CACHE_SIZE, count)); ++ print_page(pages[i]); ++ bl_done_with_rpage(pages[i], 1); ++ } else { ++ struct pnfs_block_extent *be_read; ++ ++ be_read = (hole && cow_read) ? cow_read : be; ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ bl_done_with_rpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - ++ be_read->be_f_offset + ++ be_read->be_v_offset; ++ bio->bi_bdev = be_read->be_mdev; ++ bio->bi_end_io = bl_end_io_read; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(READ, bio); ++ } ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ if ((isect << 9) >= rdata->inode->i_size) { ++ rdata->res.eof = 1; ++ rdata->res.count = rdata->inode->i_size - f_offset; ++ } else { ++ rdata->res.count = (isect << 9) - f_offset; ++ } ++ put_extent(be); ++ put_extent(cow_read); ++ bl_submit_bio(READ, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++ ++ use_mds: ++ dprintk("Giving up and using normal NFS\n"); ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static void mark_extents_written(struct pnfs_block_layout *bl, ++ __u64 offset, __u32 count) ++{ ++ sector_t isect, end; ++ struct pnfs_block_extent *be; ++ ++ dprintk("%s(%llu, %u)\n", __func__, offset, count); ++ if (count == 0) ++ return; ++ isect = (offset & (long)(PAGE_CACHE_MASK)) >> 9; ++ end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); ++ end >>= 9; ++ while (isect < end) { ++ sector_t len; ++ be = find_get_extent(bl, isect, NULL); ++ BUG_ON(!be); /* FIXME */ ++ len = min(end, be->be_f_offset + be->be_length) - isect; ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ mark_for_commit(be, isect, len); /* What if fails? 
*/ ++ isect += len; ++ put_extent(be); ++ } ++} ++ ++/* STUB - this needs thought */ ++static inline void ++bl_done_with_wpage(struct page *page, const int ok) ++{ ++ if (!ok) { ++ SetPageError(page); ++ SetPagePnfsErr(page); ++ /* This is an inline copy of nfs_zap_mapping */ ++ /* This is oh so fishy, and needs deep thought */ ++ if (page->mapping->nrpages != 0) { ++ struct inode *inode = page->mapping->host; ++ spin_lock(&inode->i_lock); ++ NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; ++ spin_unlock(&inode->i_lock); ++ } ++ } ++ /* end_page_writeback called in rpc_release. Should be done here. */ ++} ++ ++/* This is basically copied from mpage_end_io_read */ ++static void bl_end_io_write(struct bio *bio, int err) ++{ ++ void *data = bio->bi_private; ++ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); ++ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; ++ ++ do { ++ struct page *page = bvec->bv_page; ++ ++ if (--bvec >= bio->bi_io_vec) ++ prefetchw(&bvec->bv_page->flags); ++ bl_done_with_wpage(page, uptodate); ++ } while (bvec >= bio->bi_io_vec); ++ bio_put(bio); ++ put_parallel(data); ++} ++ ++/* Function scheduled for call during bl_end_par_io_write, ++ * it marks sectors as written and extends the commitlist. ++ */ ++static void bl_write_cleanup(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ if (!wdata->task.tk_status) { ++ /* Marks for LAYOUTCOMMIT */ ++ /* BUG - this should be called after each bio, not after ++ * all finish, unless have some way of storing success/failure ++ */ ++ mark_extents_written(BLK_LSEG2EXT(wdata->pdata.lseg), ++ wdata->args.offset, wdata->args.count); ++ } ++ pnfs_block_callback_ops->nfs_writelist_complete(wdata); ++} ++ ++/* Called when last of bios associated with a bl_write_pagelist call finishes */ ++static void ++bl_end_par_io_write(void *data) ++{ ++ struct nfs_write_data *wdata = data; ++ ++ /* STUB - ignoring error handling */ ++ wdata->task.tk_status = 0; ++ wdata->verf.committed = NFS_FILE_SYNC; ++ INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); ++ schedule_work(&wdata->task.u.tk_work); ++} ++ ++static enum pnfs_try_status ++bl_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int sync) ++{ ++ int i; ++ struct bio *bio = NULL; ++ struct pnfs_block_extent *be = NULL; ++ sector_t isect, extent_length = 0; ++ struct parallel_io *par; ++ loff_t offset = wdata->args.offset; ++ size_t count = wdata->args.count; ++ struct page **pages = wdata->args.pages; ++ int pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; ++ ++ dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); ++ if (!wdata->req->wb_lseg) { ++ dprintk("%s no lseg, falling back to MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ if (dont_like_caller(wdata->req)) { ++ dprintk("%s dont_like_caller failed\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ /* At this point, wdata->pages is a (sequential) list of nfs_pages. ++ * We want to write each, and if there is an error remove it from ++ * list and call ++ * nfs_retry_request(req) to have it redone using nfs. ++ * QUEST? Do as block or per req? 
Think have to do per block ++ * as part of end_bio ++ */ ++ par = alloc_parallel(wdata); ++ if (!par) ++ return PNFS_NOT_ATTEMPTED; ++ par->call_ops = *wdata->pdata.call_ops; ++ par->call_ops.rpc_call_done = bl_rpc_do_nothing; ++ par->pnfs_callback = bl_end_par_io_write; ++ /* At this point, have to be more careful with error handling */ ++ ++ isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> 9); ++ for (i = pg_index; i < nr_pages; i++) { ++ if (!extent_length) { ++ /* We've used up the previous extent */ ++ put_extent(be); ++ bio = bl_submit_bio(WRITE, bio); ++ /* Get the next one */ ++ be = find_get_extent(BLK_LSEG2EXT(wdata->pdata.lseg), ++ isect, NULL); ++ if (!be || !is_writable(be, isect)) { ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ extent_length = be->be_length - ++ (isect - be->be_f_offset); ++ } ++ for (;;) { ++ if (!bio) { ++ bio = bio_alloc(GFP_NOIO, nr_pages - i); ++ if (!bio) { ++ /* Error out this page */ ++ /* FIXME */ ++ bl_done_with_wpage(pages[i], 0); ++ break; ++ } ++ bio->bi_sector = isect - be->be_f_offset + ++ be->be_v_offset; ++ bio->bi_bdev = be->be_mdev; ++ bio->bi_end_io = bl_end_io_write; ++ bio->bi_private = par; ++ } ++ if (bio_add_page(bio, pages[i], PAGE_SIZE, 0)) ++ break; ++ bio = bl_submit_bio(WRITE, bio); ++ } ++ isect += PAGE_CACHE_SIZE >> 9; ++ extent_length -= PAGE_CACHE_SIZE >> 9; ++ } ++ wdata->res.count = (isect << 9) - (offset & (long)PAGE_CACHE_MASK); ++ put_extent(be); ++ bl_submit_bio(WRITE, bio); ++ put_parallel(par); ++ return PNFS_ATTEMPTED; ++} ++ ++/* FIXME - range ignored */ ++static void ++release_extents(struct pnfs_block_layout *bl, ++ struct pnfs_layout_range *range) ++{ ++ int i; ++ struct pnfs_block_extent *be; ++ ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ while (!list_empty(&bl->bl_extents[i])) { ++ be = list_first_entry(&bl->bl_extents[i], ++ struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++} ++ ++static void ++release_inval_marks(struct pnfs_inval_markings *marks) ++{ ++ struct pnfs_inval_tracking *pos, *temp; ++ ++ list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { ++ list_del(&pos->it_link); ++ kfree(pos); ++ } ++ return; ++} ++ ++/* Note we are relying on caller locking to prevent nasty races. */ ++static void ++bl_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ ++ dprintk("%s enter\n", __func__); ++ release_extents(bl, NULL); ++ release_inval_marks(&bl->bl_inval); ++ kfree(bl); ++} ++ ++static struct pnfs_layout_hdr * ++bl_alloc_layout(struct inode *inode) ++{ ++ struct pnfs_block_layout *bl; ++ ++ dprintk("%s enter\n", __func__); ++ bl = kzalloc(sizeof(*bl), GFP_KERNEL); ++ if (!bl) ++ return NULL; ++ spin_lock_init(&bl->bl_ext_lock); ++ INIT_LIST_HEAD(&bl->bl_extents[0]); ++ INIT_LIST_HEAD(&bl->bl_extents[1]); ++ INIT_LIST_HEAD(&bl->bl_commit); ++ bl->bl_count = 0; ++ bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> 9; ++ INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); ++ return &bl->bl_layout; ++} ++ ++static void ++bl_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter\n", __func__); ++ kfree(lseg); ++} ++ ++/* Because the generic infrastructure does not correctly merge layouts, ++ * we pretty much ignore lseg, and store all data layout wide, so we ++ * can correctly merge. 
++ * Eventually we should push some correct merge behavior up to the
++ * generic code, as the current behavior tends to cause lots of
++ * unnecessary overlapping LAYOUTGET requests.
++ */
++static struct pnfs_layout_segment *
++bl_alloc_lseg(struct pnfs_layout_hdr *lo,
++	      struct nfs4_layoutget_res *lgr)
++{
++	struct pnfs_layout_segment *lseg;
++	int status;
++
++	dprintk("%s enter\n", __func__);
++	lseg = kzalloc(sizeof(*lseg) + 0, GFP_KERNEL);
++	if (!lseg)
++		return NULL;
++	status = nfs4_blk_process_layoutget(lo, lgr);
++	if (status) {
++		/* We don't want to call the full-blown bl_free_lseg,
++		 * since on error extents were not touched.
++		 */
++		/* STUB - we really want to distinguish between 2 error
++		 * conditions here. This lseg failed, but lo data structures
++		 * are OK, or we hosed the lo data structures. The calling
++		 * code probably needs to distinguish this too.
++		 */
++		kfree(lseg);
++		return ERR_PTR(status);
++	}
++	return lseg;
++}
++
++static int
++bl_setup_layoutcommit(struct pnfs_layout_hdr *lo,
++		      struct nfs4_layoutcommit_args *arg)
++{
++	struct nfs_server *nfss = PNFS_NFS_SERVER(lo);
++	struct bl_layoutupdate_data *layoutupdate_data;
++
++	dprintk("%s enter\n", __func__);
++	/* Need to ensure commit is block-size aligned */
++	if (nfss->pnfs_blksize) {
++		u64 mask = nfss->pnfs_blksize - 1;
++		u64 offset = arg->range.offset & mask;
++
++		arg->range.offset -= offset;
++		arg->range.length += offset + mask;
++		arg->range.length &= ~mask;
++	}
++
++	layoutupdate_data = kmalloc(sizeof(struct bl_layoutupdate_data),
++				    GFP_KERNEL);
++	if (unlikely(!layoutupdate_data))
++		return -ENOMEM;
++	INIT_LIST_HEAD(&layoutupdate_data->ranges);
++	arg->layoutdriver_data = layoutupdate_data;
++
++	return 0;
++}
++
++static void
++bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
++		       const struct nfs4_layoutcommit_args *arg)
++{
++	dprintk("%s enter\n", __func__);
++	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
++}
++
++static void
++bl_cleanup_layoutcommit(struct pnfs_layout_hdr *lo,
++			struct nfs4_layoutcommit_args *arg, int status)
++{
++	dprintk("%s enter\n", __func__);
++	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), arg, status);
++	kfree(arg->layoutdriver_data);
++}
++
++static void free_blk_mountid(struct block_mount_id *mid)
++{
++	if (mid) {
++		struct pnfs_block_dev *dev;
++		spin_lock(&mid->bm_lock);
++		while (!list_empty(&mid->bm_devlist)) {
++			dev = list_first_entry(&mid->bm_devlist,
++					       struct pnfs_block_dev,
++					       bm_node);
++			list_del(&dev->bm_node);
++			free_block_dev(dev);
++		}
++		spin_unlock(&mid->bm_lock);
++		kfree(mid);
++	}
++}
++
++/* This is mostly copied from the filelayout's get_device_info function.
++ * It seems much of this should be at the generic pnfs level.
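++ *
++ * The reply pages are vmap()ed into dev->area so the XDR decoder can
++ * treat the GETDEVICEINFO response as one contiguous buffer; both the
++ * mapping and the pages are released again before returning.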
++ */ ++static struct pnfs_block_dev * ++nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, ++ struct pnfs_deviceid *d_id, ++ struct list_head *sdlist) ++{ ++ struct pnfs_device *dev; ++ struct pnfs_block_dev *rv = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ int i, rc; ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s max_resp_sz %u max_pages %d\n", ++ __func__, max_resp_sz, max_pages); ++ ++ dev = kmalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) { ++ dprintk("%s kmalloc failed\n", __func__); ++ return NULL; ++ } ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(dev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set dev->area */ ++ dev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!dev->area) ++ goto out_free; ++ ++ memcpy(&dev->dev_id, d_id, sizeof(*d_id)); ++ dev->layout_type = LAYOUT_BLOCK_VOLUME; ++ dev->dev_notify_types = 0; ++ dev->pages = pages; ++ dev->pgbase = 0; ++ dev->pglen = PAGE_SIZE * max_pages; ++ dev->mincount = 0; ++ ++ dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); ++ rc = pnfs_block_callback_ops->nfs_getdeviceinfo(server, dev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ rv = nfs4_blk_decode_device(server, dev, sdlist); ++ out_free: ++ if (dev->area != NULL) ++ vunmap(dev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(dev); ++ return rv; ++} ++ ++ ++/* ++ * Retrieve the list of available devices for the mountpoint. ++ */ ++static int ++bl_initialize_mountpoint(struct nfs_server *server, const struct nfs_fh *fh) ++{ ++ struct block_mount_id *b_mt_id = NULL; ++ struct pnfs_mount_type *mtype = NULL; ++ struct pnfs_devicelist *dlist = NULL; ++ struct pnfs_block_dev *bdev; ++ LIST_HEAD(block_disklist); ++ int status = 0, i; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (server->pnfs_blksize == 0) { ++ dprintk("%s Server did not return blksize\n", __func__); ++ return -EINVAL; ++ } ++ b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_KERNEL); ++ if (!b_mt_id) { ++ status = -ENOMEM; ++ goto out_error; ++ } ++ /* Initialize nfs4 block layout mount id */ ++ spin_lock_init(&b_mt_id->bm_lock); ++ INIT_LIST_HEAD(&b_mt_id->bm_devlist); ++ ++ dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_KERNEL); ++ if (!dlist) ++ goto out_error; ++ dlist->eof = 0; ++ while (!dlist->eof) { ++ status = pnfs_block_callback_ops->nfs_getdevicelist( ++ server, fh, dlist); ++ if (status) ++ goto out_error; ++ dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", ++ __func__, dlist->num_devs, dlist->eof); ++ /* For each device returned in dlist, call GETDEVICEINFO, and ++ * decode the opaque topology encoding to create a flat ++ * volume topology, matching VOLUME_SIMPLE disk signatures ++ * to disks in the visible block disk list. ++ * Construct an LVM meta device from the flat volume topology. 
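++		 * (nfs4_blk_get_deviceinfo() drives this: the actual decode
++		 * and device construction happen in nfs4_blk_decode_device(),
++		 * which upcalls a userspace daemon over pipefs and receives
++		 * the major:minor of the assembled meta device in reply.)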
++		 */
++		for (i = 0; i < dlist->num_devs; i++) {
++			bdev = nfs4_blk_get_deviceinfo(server, fh,
++						       &dlist->dev_id[i],
++						       &block_disklist);
++			if (!bdev)
++				goto out_error;
++			spin_lock(&b_mt_id->bm_lock);
++			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
++			spin_unlock(&b_mt_id->bm_lock);
++		}
++	}
++	dprintk("%s SUCCESS\n", __func__);
++	server->pnfs_ld_data = b_mt_id;
++
++ out_return:
++	kfree(dlist);
++	return status;
++
++ out_error:
++	free_blk_mountid(b_mt_id);
++	kfree(mtype);
++	goto out_return;
++}
++
++static int
++bl_uninitialize_mountpoint(struct nfs_server *server)
++{
++	struct block_mount_id *b_mt_id = server->pnfs_ld_data;
++
++	dprintk("%s enter\n", __func__);
++	free_blk_mountid(b_mt_id);
++	dprintk("%s RETURNS\n", __func__);
++	return 0;
++}
++
++/* STUB - mark intersection of layout and page as bad, so is not
++ * used again.
++ */
++static void mark_bad_read(void)
++{
++	return;
++}
++
++/* Copied from buffer.c */
++static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
++{
++	if (uptodate) {
++		set_buffer_uptodate(bh);
++	} else {
++		/* This happens, due to failed READA attempts. */
++		clear_buffer_uptodate(bh);
++	}
++	unlock_buffer(bh);
++}
++
++/* Copied from buffer.c */
++static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
++{
++	__end_buffer_read_notouch(bh, uptodate);
++}
++
++/*
++ * map_block: map a requested I/O block (isect) into an offset in the LVM
++ * meta block_device
++ */
++static void
++map_block(sector_t isect, struct pnfs_block_extent *be, struct buffer_head *bh)
++{
++	dprintk("%s enter be=%p\n", __func__, be);
++
++	set_buffer_mapped(bh);
++	bh->b_bdev = be->be_mdev;
++	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
++			(be->be_mdev->bd_inode->i_blkbits - 9);
++
++	dprintk("%s isect %ld, bh->b_blocknr %ld, using bsize %Zd\n",
++		__func__, (long)isect,
++		(long)bh->b_blocknr,
++		bh->b_size);
++	return;
++}
++
++/* Given an unmapped page, zero it (or read in page for COW),
++ * and set appropriate flags/markings, but it is safe to not initialize
++ * the range given in [from, to).
++ */
++/* This is loosely based on nobh_write_begin */
++static int
++init_page_for_write(struct pnfs_block_layout *bl, struct page *page,
++		    unsigned from, unsigned to, sector_t **pages_to_mark)
++{
++	struct buffer_head *bh;
++	int inval, ret = -EIO;
++	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
++	sector_t isect;
++
++	dprintk("%s enter, %p\n", __func__, page);
++	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
++	if (!bh) {
++		ret = -ENOMEM;
++		goto cleanup;
++	}
++
++	isect = (sector_t)page->index << (PAGE_CACHE_SHIFT - 9);
++	be = find_get_extent(bl, isect, &cow_read);
++	if (!be)
++		goto cleanup;
++	inval = is_hole(be, isect);
++	dprintk("%s inval=%i, from=%u, to=%u\n", __func__, inval, from, to);
++	if (inval) {
++		if (be->be_state == PNFS_BLOCK_NONE_DATA) {
++			dprintk("%s PANIC - got NONE_DATA extent %p\n",
++				__func__, be);
++			goto cleanup;
++		}
++		map_block(isect, be, bh);
++		unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
++	}
++	if (PageUptodate(page)) {
++		/* Do nothing */
++	} else if (inval & !cow_read) {
++		zero_user_segments(page, 0, from, to, PAGE_CACHE_SIZE);
++	} else if (0 < from || PAGE_CACHE_SIZE > to) {
++		struct pnfs_block_extent *read_extent;
++
++		read_extent = (inval && cow_read) ?
cow_read : be; ++ map_block(isect, read_extent, bh); ++ lock_buffer(bh); ++ bh->b_end_io = end_buffer_read_nobh; ++ submit_bh(READ, bh); ++ dprintk("%s: Waiting for buffer read\n", __func__); ++ /* XXX Don't really want to hold layout lock here */ ++ wait_on_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ goto cleanup; ++ } ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ /* There is a BUG here if is a short copy after write_begin, ++ * but I think this is a generic fs bug. The problem is that ++ * we have marked the page as initialized, but it is possible ++ * that the section not copied may never get copied. ++ */ ++ ret = mark_initialized_sectors(be->be_inval, isect, ++ PAGE_CACHE_SECTORS, ++ pages_to_mark); ++ /* Want to preallocate mem so above can't fail */ ++ if (ret) ++ goto cleanup; ++ } ++ SetPageMappedToDisk(page); ++ ret = 0; ++ ++cleanup: ++ free_buffer_head(bh); ++ put_extent(be); ++ put_extent(cow_read); ++ if (ret) { ++ /* Need to mark layout with bad read...should now ++ * just use nfs4 for reads and writes. ++ */ ++ mark_bad_read(); ++ } ++ return ret; ++} ++ ++static int ++bl_write_begin(struct pnfs_layout_segment *lseg, struct page *page, loff_t pos, ++ unsigned count, struct pnfs_fsdata *fsdata) ++{ ++ unsigned from, to; ++ int ret; ++ sector_t *pages_to_mark = NULL; ++ struct pnfs_block_layout *bl = BLK_LSEG2EXT(lseg); ++ ++ dprintk("%s enter, %u@%lld\n", __func__, count, pos); ++ print_page(page); ++ /* The following code assumes blocksize >= PAGE_CACHE_SIZE */ ++ if (bl->bl_blocksize < (PAGE_CACHE_SIZE >> 9)) { ++ dprintk("%s Can't handle blocksize %llu\n", __func__, ++ (u64)bl->bl_blocksize); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ return 0; ++ } ++ if (PageMappedToDisk(page)) { ++ /* Basically, this is a flag that says we have ++ * successfully called write_begin already on this page. ++ */ ++ /* NOTE - there are cache consistency issues here. ++ * For example, what if the layout is recalled, then regained? ++ * If the file is closed and reopened, will the page flags ++ * be reset? If not, we'll have to use layout info instead of ++ * the page flag. ++ */ ++ return 0; ++ } ++ from = pos & (PAGE_CACHE_SIZE - 1); ++ to = from + count; ++ ret = init_page_for_write(bl, page, from, to, &pages_to_mark); ++ if (ret) { ++ dprintk("%s init page failed with %i", __func__, ret); ++ /* Revert back to plain NFS and just continue on with ++ * write. This assumes there is no request attached, which ++ * should be true if we get here. ++ */ ++ BUG_ON(PagePrivate(page)); ++ put_lseg(fsdata->lseg); ++ fsdata->lseg = NULL; ++ kfree(pages_to_mark); ++ ret = 0; ++ } else { ++ fsdata->private = pages_to_mark; ++ } ++ return ret; ++} ++ ++/* CAREFUL - what happens if copied < count??? */ ++static int ++bl_write_end(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, struct pnfs_layout_segment *lseg) ++{ ++ dprintk("%s enter, %u@%lld, lseg=%p\n", __func__, count, pos, lseg); ++ print_page(page); ++ if (lseg) ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* Return any memory allocated to fsdata->private, and take advantage ++ * of no page locks to mark pages noted in write_begin as needing ++ * initialization. 
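++ *
++ * fsdata->private, when non-NULL, is a ~0-terminated array of 512-byte
++ * sector numbers produced by mark_initialized_sectors().  A minimal
++ * sketch of the walk done below (illustration only, never compiled):
++ */
++#if 0
++static void pages_to_mark_example(sector_t *pos, struct address_space *mapping)
++{
++	for (; *pos != ~0; pos++) {
++		pgoff_t index = *pos >> (PAGE_CACHE_SHIFT - 9);
++		struct page *page =
++			grab_cache_page_write_begin(mapping, index, 0);
++		/* ...init_page_for_write() if !PageMappedToDisk(page)... */
++	}
++}
++#endif
++/* The cleanup itself: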
++ */ ++static void ++bl_write_end_cleanup(struct file *filp, struct pnfs_fsdata *fsdata) ++{ ++ struct page *page; ++ pgoff_t index; ++ sector_t *pos; ++ struct address_space *mapping = filp->f_mapping; ++ struct pnfs_fsdata *fake_data; ++ struct pnfs_layout_segment *lseg; ++ ++ if (!fsdata) ++ return; ++ lseg = fsdata->lseg; ++ if (!lseg) ++ return; ++ pos = fsdata->private; ++ if (!pos) ++ return; ++ dprintk("%s enter with pos=%llu\n", __func__, (u64)(*pos)); ++ for (; *pos != ~0; pos++) { ++ index = *pos >> (PAGE_CACHE_SHIFT - 9); ++ /* XXX How do we properly deal with failures here??? */ ++ page = grab_cache_page_write_begin(mapping, index, 0); ++ if (!page) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", __func__); ++ continue; ++ } ++ dprintk("%s: Examining block page\n", __func__); ++ print_page(page); ++ if (!PageMappedToDisk(page)) { ++ /* XXX How do we properly deal with failures here??? */ ++ dprintk("%s Marking block page\n", __func__); ++ init_page_for_write(BLK_LSEG2EXT(fsdata->lseg), page, ++ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, ++ NULL); ++ print_page(page); ++ fake_data = kzalloc(sizeof(*fake_data), GFP_KERNEL); ++ if (!fake_data) { ++ printk(KERN_ERR "%s BUG BUG BUG NoMem\n", ++ __func__); ++ unlock_page(page); ++ continue; ++ } ++ get_lseg(lseg); ++ fake_data->lseg = lseg; ++ fake_data->bypass_eof = 1; ++ mapping->a_ops->write_end(filp, mapping, ++ index << PAGE_CACHE_SHIFT, ++ PAGE_CACHE_SIZE, ++ PAGE_CACHE_SIZE, ++ page, fake_data); ++ /* Note fake_data is freed by nfs_write_end */ ++ } else ++ unlock_page(page); ++ } ++ kfree(fsdata->private); ++ fsdata->private = NULL; ++} ++ ++static ssize_t ++bl_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("%s enter\n", __func__); ++ return 0; ++} ++ ++/* This is called by nfs_can_coalesce_requests via nfs_pageio_do_add_request. ++ * Should return False if there is a reason requests can not be coalesced, ++ * otherwise, should default to returning True. 
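++ *
++ * Here writes are only coalesced when both requests belong to the same
++ * layout segment; reads are always considered coalescable.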
++ */ ++static int ++bl_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, ++ struct nfs_page *req) ++{ ++ dprintk("%s enter\n", __func__); ++ if (pgio->pg_iswrite) ++ return prev->wb_lseg == req->wb_lseg; ++ else ++ return 1; ++} ++ ++static struct layoutdriver_io_operations blocklayout_io_operations = { ++ .commit = bl_commit, ++ .read_pagelist = bl_read_pagelist, ++ .write_pagelist = bl_write_pagelist, ++ .write_begin = bl_write_begin, ++ .write_end = bl_write_end, ++ .write_end_cleanup = bl_write_end_cleanup, ++ .alloc_layout = bl_alloc_layout, ++ .free_layout = bl_free_layout, ++ .alloc_lseg = bl_alloc_lseg, ++ .free_lseg = bl_free_lseg, ++ .setup_layoutcommit = bl_setup_layoutcommit, ++ .encode_layoutcommit = bl_encode_layoutcommit, ++ .cleanup_layoutcommit = bl_cleanup_layoutcommit, ++ .initialize_mountpoint = bl_initialize_mountpoint, ++ .uninitialize_mountpoint = bl_uninitialize_mountpoint, ++}; ++ ++static struct layoutdriver_policy_operations blocklayout_policy_operations = { ++ .get_stripesize = bl_get_stripesize, ++ .pg_test = bl_pg_test, ++}; ++ ++static struct pnfs_layoutdriver_type blocklayout_type = { ++ .id = LAYOUT_BLOCK_VOLUME, ++ .name = "LAYOUT_BLOCK_VOLUME", ++ .ld_io_ops = &blocklayout_io_operations, ++ .ld_policy_ops = &blocklayout_policy_operations, ++}; ++ ++static int __init nfs4blocklayout_init(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__); ++ ++ pnfs_block_callback_ops = pnfs_register_layoutdriver(&blocklayout_type); ++ bl_pipe_init(); ++ return 0; ++} ++ ++static void __exit nfs4blocklayout_exit(void) ++{ ++ dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", ++ __func__); ++ ++ pnfs_unregister_layoutdriver(&blocklayout_type); ++ bl_pipe_exit(); ++} ++ ++module_init(nfs4blocklayout_init); ++module_exit(nfs4blocklayout_exit); +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c +--- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c.orig 2010-08-31 21:11:40.899160378 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdev.c 2010-08-31 21:11:40.899160378 -0400 +@@ -0,0 +1,335 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdev.c ++ * ++ * Device operations for the pnfs nfs4 file layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#include ++#include /* __bread */ ++ ++#include ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes) ++{ ++ uint32_t *q = p + XDR_QUADLEN(nbytes); ++ if (unlikely(q > end || q < p)) ++ return NULL; ++ return p; ++} ++EXPORT_SYMBOL(blk_overflow); ++ ++/* Open a block_device by device number. */ ++struct block_device *nfs4_blkdev_get(dev_t dev) ++{ ++ struct block_device *bd; ++ ++ dprintk("%s enter\n", __func__); ++ bd = open_by_devnum(dev, FMODE_READ); ++ if (IS_ERR(bd)) ++ goto fail; ++ return bd; ++fail: ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ return NULL; ++} ++ ++/* ++ * Release the block device ++ */ ++int nfs4_blkdev_put(struct block_device *bdev) ++{ ++ dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ bd_release(bdev); ++ return blkdev_put(bdev, FMODE_READ); ++} ++ ++/* Decodes pnfs_block_deviceaddr4 (draft-8) which is XDR encoded ++ * in dev->dev_addr_buf. ++ */ ++struct pnfs_block_dev * ++nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist) ++{ ++ struct pnfs_block_dev *rv = NULL; ++ struct block_device *bd = NULL; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint32_t major, minor; ++ ++ dprintk("%s enter\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return NULL; ++ dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); ++ dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, ++ dev->mincount); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_MOUNT, 0, dev->area, ++ dev->mincount); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out_err; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ dprintk("%s CALLING USERSPACE DAEMON\n", __func__); ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out_err; ++ } ++ if (reply->status != BL_DEVICE_REQUEST_PROC) { ++ dprintk("%s failed to open device: %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ memcpy(&major, (uint32_t *)(payload_of(reply)), sizeof(uint32_t)); ++ memcpy(&minor, (uint32_t *)(payload_of(reply) + sizeof(uint32_t)), ++ sizeof(uint32_t)); ++ bd = nfs4_blkdev_get(MKDEV(major, minor)); ++ if (IS_ERR(bd)) { ++ dprintk("%s failed to open device : %ld\n", ++ __func__, PTR_ERR(bd)); ++ goto out_err; ++ } ++ ++ rv = kzalloc(sizeof(*rv), GFP_KERNEL); ++ if (!rv) ++ goto out_err; ++ ++ rv->bm_mdev = bd; ++ memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct pnfs_deviceid)); ++ dprintk("%s Created device %s with bd_block_size %u\n", ++ __func__, ++ bd->bd_disk->disk_name, ++ bd->bd_block_size); ++ kfree(reply); ++ kfree(msg); ++ return rv; ++ ++out_err: ++ kfree(rv); ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return NULL; ++} ++ ++/* Map deviceid returned by the server to constructed block_device */ ++static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, ++ 
struct pnfs_deviceid *id) ++{ ++ struct block_device *rv = NULL; ++ struct block_mount_id *mid; ++ struct pnfs_block_dev *dev; ++ ++ dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); ++ mid = BLK_ID(lo); ++ spin_lock(&mid->bm_lock); ++ list_for_each_entry(dev, &mid->bm_devlist, bm_node) { ++ if (memcmp(id->data, dev->bm_mdevid.data, ++ NFS4_PNFS_DEVICEID4_SIZE) == 0) { ++ rv = dev->bm_mdev; ++ goto out; ++ } ++ } ++ out: ++ spin_unlock(&mid->bm_lock); ++ dprintk("%s returning %p\n", __func__, rv); ++ return rv; ++} ++ ++/* Tracks info needed to ensure extents in layout obey constraints of spec */ ++struct layout_verification { ++ u32 mode; /* R or RW */ ++ u64 start; /* Expected start of next non-COW extent */ ++ u64 inval; /* Start of INVAL coverage */ ++ u64 cowread; /* End of COW read coverage */ ++}; ++ ++/* Verify the extent meets the layout requirements of the pnfs-block draft, ++ * section 2.3.1. ++ */ ++static int verify_extent(struct pnfs_block_extent *be, ++ struct layout_verification *lv) ++{ ++ if (lv->mode == IOMODE_READ) { ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA || ++ be->be_state == PNFS_BLOCK_INVALID_DATA) ++ return -EIO; ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } ++ /* lv->mode == IOMODE_RW */ ++ if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ if (lv->cowread > lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ lv->inval = lv->start; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { ++ if (be->be_f_offset != lv->start) ++ return -EIO; ++ lv->start += be->be_length; ++ return 0; ++ } else if (be->be_state == PNFS_BLOCK_READ_DATA) { ++ if (be->be_f_offset > lv->start) ++ return -EIO; ++ if (be->be_f_offset < lv->inval) ++ return -EIO; ++ if (be->be_f_offset < lv->cowread) ++ return -EIO; ++ /* It looks like you might want to min this with lv->start, ++ * but you really don't. ++ */ ++ lv->inval = lv->inval + be->be_length; ++ lv->cowread = be->be_f_offset + be->be_length; ++ return 0; ++ } else ++ return -EIO; ++} ++ ++/* XDR decode pnfs_block_layout4 structure */ ++int ++nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct pnfs_block_layout *bl = BLK_LO2EXT(lo); ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t *end = (uint32_t *)((char *)lgr->layout.buf + lgr->layout.len); ++ int i, status = -EIO; ++ uint32_t count; ++ struct pnfs_block_extent *be = NULL, *save; ++ uint64_t tmp; /* Used by READSECTOR */ ++ struct layout_verification lv = { ++ .mode = lgr->range.iomode, ++ .start = lgr->range.offset >> 9, ++ .inval = lgr->range.offset >> 9, ++ .cowread = lgr->range.offset >> 9, ++ }; ++ ++ LIST_HEAD(extents); ++ ++ BLK_READBUF(p, end, 4); ++ READ32(count); ++ ++ dprintk("%s enter, number of extents %i\n", __func__, count); ++ BLK_READBUF(p, end, (28 + NFS4_PNFS_DEVICEID4_SIZE) * count); ++ ++ /* Decode individual extents, putting them in temporary ++ * staging area until whole layout is decoded to make error ++ * recovery easier. 
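++	 * Each extent on the wire is a deviceid followed by three 64-bit
++	 * byte values (file offset, length, volume offset) and a 32-bit
++	 * state word, which is where the (28 + NFS4_PNFS_DEVICEID4_SIZE)
++	 * * count length check above comes from.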
++ */ ++ for (i = 0; i < count; i++) { ++ be = alloc_extent(); ++ if (!be) { ++ status = -ENOMEM; ++ goto out_err; ++ } ++ READ_DEVID(&be->be_devid); ++ be->be_mdev = translate_devid(lo, &be->be_devid); ++ if (!be->be_mdev) ++ goto out_err; ++ /* The next three values are read in as bytes, ++ * but stored as 512-byte sector lengths ++ */ ++ READ_SECTOR(be->be_f_offset); ++ READ_SECTOR(be->be_length); ++ READ_SECTOR(be->be_v_offset); ++ READ32(be->be_state); ++ if (be->be_state == PNFS_BLOCK_INVALID_DATA) ++ be->be_inval = &bl->bl_inval; ++ if (verify_extent(be, &lv)) { ++ dprintk("%s verify failed\n", __func__); ++ goto out_err; ++ } ++ list_add_tail(&be->be_node, &extents); ++ } ++ if (p != end) { ++ dprintk("%s Undecoded cruft at end of opaque\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lgr->range.offset + lgr->range.length != lv.start << 9) { ++ dprintk("%s Final length mismatch\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ if (lv.start < lv.cowread) { ++ dprintk("%s Final uncovered COW extent\n", __func__); ++ be = NULL; ++ goto out_err; ++ } ++ /* Extents decoded properly, now try to merge them in to ++ * existing layout extents. ++ */ ++ spin_lock(&bl->bl_ext_lock); ++ list_for_each_entry_safe(be, save, &extents, be_node) { ++ list_del(&be->be_node); ++ status = add_and_merge_extent(bl, be); ++ if (status) { ++ spin_unlock(&bl->bl_ext_lock); ++ /* This is a fairly catastrophic error, as the ++ * entire layout extent lists are now corrupted. ++ * We should have some way to distinguish this. ++ */ ++ be = NULL; ++ goto out_err; ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ status = 0; ++ out: ++ dprintk("%s returns %i\n", __func__, status); ++ return status; ++ ++ out_err: ++ put_extent(be); ++ while (!list_empty(&extents)) { ++ be = list_first_entry(&extents, struct pnfs_block_extent, ++ be_node); ++ list_del(&be->be_node); ++ put_extent(be); ++ } ++ goto out; ++} +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c +--- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c.orig 2010-08-31 21:11:40.900171086 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayoutdm.c 2010-08-31 21:11:40.900171086 -0400 +@@ -0,0 +1,120 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayoutdm.c ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2007 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Fred Isaman ++ * Andy Adamson ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. 
the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include /* gendisk - used in a dprintk*/ ++#include ++#include ++ ++#include "blocklayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Defines used for calculating memory usage in nfs4_blk_flatten() */ ++#define ARGSIZE 24 /* Max bytes needed for linear target arg string */ ++#define SPECSIZE (sizeof8(struct dm_target_spec) + ARGSIZE) ++#define SPECS_PER_PAGE (PAGE_SIZE / SPECSIZE) ++#define SPEC_HEADER_ADJUST (SPECS_PER_PAGE - \ ++ (PAGE_SIZE - sizeof8(struct dm_ioctl)) / SPECSIZE) ++#define roundup8(x) (((x)+7) & ~7) ++#define sizeof8(x) roundup8(sizeof(x)) ++ ++static int dev_remove(dev_t dev) ++{ ++ int ret = 1; ++ struct pipefs_hdr *msg = NULL, *reply = NULL; ++ uint64_t bl_dev; ++ uint32_t major = MAJOR(dev), minor = MINOR(dev); ++ ++ dprintk("Entering %s\n", __func__); ++ ++ if (IS_ERR(bl_device_pipe)) ++ return ret; ++ ++ memcpy((void *)&bl_dev, &major, sizeof(uint32_t)); ++ memcpy((void *)&bl_dev + sizeof(uint32_t), &minor, sizeof(uint32_t)); ++ msg = pipefs_alloc_init_msg(0, BL_DEVICE_UMOUNT, 0, (void *)&bl_dev, ++ sizeof(uint64_t)); ++ if (IS_ERR(msg)) { ++ dprintk("ERROR: couldn't make pipefs message.\n"); ++ goto out; ++ } ++ msg->msgid = hash_ptr(&msg, sizeof(msg->msgid) * 8); ++ msg->status = BL_DEVICE_REQUEST_INIT; ++ ++ reply = pipefs_queue_upcall_waitreply(bl_device_pipe, msg, ++ &bl_device_list, 0, 0); ++ if (IS_ERR(reply)) { ++ dprintk("ERROR: upcall_waitreply failed\n"); ++ goto out; ++ } ++ ++ if (reply->status == BL_DEVICE_REQUEST_PROC) ++ ret = 0; /*TODO: what to return*/ ++out: ++ if (!IS_ERR(reply)) ++ kfree(reply); ++ if (!IS_ERR(msg)) ++ kfree(msg); ++ return ret; ++} ++ ++/* ++ * Release meta device ++ */ ++static int nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) ++{ ++ int rv; ++ ++ dprintk("%s Releasing\n", __func__); ++ /* XXX Check return? */ ++ rv = nfs4_blkdev_put(bdev->bm_mdev); ++ dprintk("%s nfs4_blkdev_put returns %d\n", __func__, rv); ++ ++ rv = dev_remove(bdev->bm_mdev->bd_dev); ++ dprintk("%s Returns %d\n", __func__, rv); ++ return rv; ++} ++ ++void free_block_dev(struct pnfs_block_dev *bdev) ++{ ++ if (bdev) { ++ if (bdev->bm_mdev) { ++ dprintk("%s Removing DM device: %d:%d\n", ++ __func__, ++ MAJOR(bdev->bm_mdev->bd_dev), ++ MINOR(bdev->bm_mdev->bd_dev)); ++ /* XXX Check status ?? */ ++ nfs4_blk_metadev_release(bdev); ++ } ++ kfree(bdev); ++ } ++} +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h.orig linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h +--- linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h.orig 2010-08-31 21:11:40.898150313 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/blocklayout.h 2010-08-31 21:11:40.899160378 -0400 +@@ -0,0 +1,302 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. ++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++#ifndef FS_NFS_NFS4BLOCKLAYOUT_H ++#define FS_NFS_NFS4BLOCKLAYOUT_H ++ ++#include ++#include ++#include /* Needed for struct dm_ioctl*/ ++ ++#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> 9) ++ ++#define PG_pnfserr PG_owner_priv_1 ++#define PagePnfsErr(page) test_bit(PG_pnfserr, &(page)->flags) ++#define SetPagePnfsErr(page) set_bit(PG_pnfserr, &(page)->flags) ++#define ClearPagePnfsErr(page) clear_bit(PG_pnfserr, &(page)->flags) ++ ++extern int dm_dev_create(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_dev_remove(struct dm_ioctl *param); /* from dm-ioctl.c */ ++extern int dm_do_resume(struct dm_ioctl *param); ++extern int dm_table_load(struct dm_ioctl *param, size_t param_size); ++ ++struct block_mount_id { ++ spinlock_t bm_lock; /* protects list */ ++ struct list_head bm_devlist; /* holds pnfs_block_dev */ ++}; ++ ++struct pnfs_block_dev { ++ struct list_head bm_node; ++ struct pnfs_deviceid bm_mdevid; /* associated devid */ ++ struct block_device *bm_mdev; /* meta device itself */ ++}; ++ ++/* holds visible disks that can be matched against VOLUME_SIMPLE signatures */ ++struct visible_block_device { ++ struct list_head vi_node; ++ struct block_device *vi_bdev; ++ int vi_mapped; ++ int vi_put_done; ++}; ++ ++enum blk_vol_type { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, /* maps to a single LU */ ++ PNFS_BLOCK_VOLUME_SLICE = 1, /* slice of another volume */ ++ PNFS_BLOCK_VOLUME_CONCAT = 2, /* concatenation of multiple volumes */ ++ PNFS_BLOCK_VOLUME_STRIPE = 3 /* striped across multiple volumes */ ++}; ++ ++/* All disk offset/lengths are stored in 512-byte sectors */ ++struct pnfs_blk_volume { ++ uint32_t bv_type; ++ sector_t bv_size; ++ struct pnfs_blk_volume **bv_vols; ++ int bv_vol_n; ++ union { ++ dev_t bv_dev; ++ sector_t bv_stripe_unit; ++ sector_t bv_offset; ++ }; ++}; ++ ++/* Since components need not be aligned, cannot use sector_t */ ++struct pnfs_blk_sig_comp { ++ int64_t bs_offset; /* In bytes */ ++ uint32_t bs_length; /* In bytes */ ++ char *bs_string; ++}; ++ ++/* Maximum number of signatures components in a simple volume */ ++# define PNFS_BLOCK_MAX_SIG_COMP 16 ++ ++struct pnfs_blk_sig { ++ int si_num_comps; ++ struct pnfs_blk_sig_comp si_comps[PNFS_BLOCK_MAX_SIG_COMP]; ++}; ++ ++enum 
exstate4 {
++	PNFS_BLOCK_READWRITE_DATA	= 0,
++	PNFS_BLOCK_READ_DATA		= 1,
++	PNFS_BLOCK_INVALID_DATA		= 2, /* mapped, but data is invalid */
++	PNFS_BLOCK_NONE_DATA		= 3  /* unmapped, it's a hole */
++};
++
++#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */
++
++struct my_tree_t {
++	sector_t		mtt_step_size;	/* Internal sector alignment */
++	struct list_head	mtt_stub;	/* Should be a radix tree */
++};
++
++struct pnfs_inval_markings {
++	spinlock_t	im_lock;
++	struct my_tree_t im_tree;	/* Sectors that need LAYOUTCOMMIT */
++	sector_t	im_block_size;	/* Server blocksize in sectors */
++};
++
++struct pnfs_inval_tracking {
++	struct list_head it_link;
++	int		 it_sector;
++	int		 it_tags;
++};
++
++/* sector_t fields are all in 512-byte sectors */
++struct pnfs_block_extent {
++	struct kref	be_refcnt;
++	struct list_head be_node;	/* link into lseg list */
++	struct pnfs_deviceid be_devid;	/* STUB - removable??? */
++	struct block_device *be_mdev;
++	sector_t	be_f_offset;	/* the starting offset in the file */
++	sector_t	be_length;	/* the size of the extent */
++	sector_t	be_v_offset;	/* the starting offset in the volume */
++	enum exstate4	be_state;	/* the state of this extent */
++	struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */
++};
++
++/* Shortened extent used by LAYOUTCOMMIT */
++struct pnfs_block_short_extent {
++	struct list_head bse_node;
++	struct pnfs_deviceid bse_devid;	/* STUB - removable??? */
++	struct block_device *bse_mdev;
++	sector_t	bse_f_offset;	/* the starting offset in the file */
++	sector_t	bse_length;	/* the size of the extent */
++};
++
++static inline void
++INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
++{
++	spin_lock_init(&marks->im_lock);
++	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
++	marks->im_block_size = blocksize;
++	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
++					   blocksize);
++}
++
++enum extentclass4 {
++	RW_EXTENT	= 0, /* READWRITE and INVAL */
++	RO_EXTENT	= 1, /* READ and NONE */
++	EXTENT_LISTS	= 2,
++};
++
++static inline int choose_list(enum exstate4 state)
++{
++	if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
++		return RO_EXTENT;
++	else
++		return RW_EXTENT;
++}
++
++struct pnfs_block_layout {
++	struct pnfs_layout_hdr bl_layout;
++	struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */
++	spinlock_t		bl_ext_lock;	/* Protects list manipulation */
++	struct list_head	bl_extents[EXTENT_LISTS]; /* R and RW extents */
++	struct list_head	bl_commit;	/* Needs layout commit */
++	unsigned int		bl_count;	/* entries in bl_commit */
++	sector_t		bl_blocksize;	/* Server blocksize in sectors */
++};
++
++/* this struct is communicated between:
++ * bl_setup_layoutcommit && bl_encode_layoutcommit && bl_cleanup_layoutcommit
++ */
++struct bl_layoutupdate_data {
++	struct list_head ranges;
++};
++
++#define BLK_ID(lo) ((struct block_mount_id *)(PNFS_NFS_SERVER(lo)->pnfs_ld_data))
++
++static inline struct pnfs_block_layout *
++BLK_LO2EXT(struct pnfs_layout_hdr *lo)
++{
++	return container_of(lo, struct pnfs_block_layout, bl_layout);
++}
++
++static inline struct pnfs_block_layout *
++BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
++{
++	return BLK_LO2EXT(lseg->layout);
++}
++
++uint32_t *blk_overflow(uint32_t *p, uint32_t *end, size_t nbytes);
++
++#define BLK_READBUF(p, e, nbytes) do { \
++	p = blk_overflow(p, e, nbytes); \
++	if (!p) { \
++		printk(KERN_WARNING \
++		       "%s: reply buffer overflowed in line %d.\n", \
++		       __func__, __LINE__); \
++		goto
out_err; \ ++ } \ ++} while (0) ++ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (uint64_t)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++#define READ_DEVID(x) COPYMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++#define READ_SECTOR(x) do { \ ++ READ64(tmp); \ ++ if (tmp & 0x1ff) { \ ++ printk(KERN_WARNING \ ++ "%s Value not 512-byte aligned at line %d\n", \ ++ __func__, __LINE__); \ ++ goto out_err; \ ++ } \ ++ (x) = tmp >> 9; \ ++} while (0) ++ ++#define WRITE32(n) do { \ ++ *p++ = htonl(n); \ ++ } while (0) ++#define WRITE64(n) do { \ ++ *p++ = htonl((uint32_t)((n) >> 32)); \ ++ *p++ = htonl((uint32_t)(n)); \ ++} while (0) ++#define WRITEMEM(ptr, nbytes) do { \ ++ p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ ++} while (0) ++#define WRITE_DEVID(x) WRITEMEM((x)->data, NFS4_PNFS_DEVICEID4_SIZE) ++ ++/* blocklayoutdev.c */ ++struct block_device *nfs4_blkdev_get(dev_t dev); ++int nfs4_blkdev_put(struct block_device *bdev); ++struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, ++ struct pnfs_device *dev, ++ struct list_head *sdlist); ++int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, ++ struct nfs4_layoutget_res *lgr); ++int nfs4_blk_create_block_disk_list(struct list_head *); ++void nfs4_blk_destroy_disk_list(struct list_head *); ++/* blocklayoutdm.c */ ++int nfs4_blk_flatten(struct pnfs_blk_volume *, int, struct pnfs_block_dev *); ++void free_block_dev(struct pnfs_block_dev *bdev); ++/* extents.c */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read); ++int mark_initialized_sectors(struct pnfs_inval_markings *marks, ++ sector_t offset, sector_t length, ++ sector_t **pages); ++void put_extent(struct pnfs_block_extent *be); ++struct pnfs_block_extent *alloc_extent(void); ++struct pnfs_block_extent *get_extent(struct pnfs_block_extent *be); ++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect); ++int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg); ++void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status); ++int add_and_merge_extent(struct pnfs_block_layout *bl, ++ struct pnfs_block_extent *new); ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length); ++ ++#include ++ ++extern struct pipefs_list bl_device_list; ++extern struct dentry *bl_device_pipe; ++ ++int bl_pipe_init(void); ++void bl_pipe_exit(void); ++ ++#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */ ++#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/ ++#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */ ++#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ ++#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ ++ ++#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c.orig linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c +--- linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c.orig 2010-08-31 21:11:40.901160489 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/extents.c 2010-08-31 21:11:40.901160489 -0400 +@@ -0,0 +1,948 @@ ++/* ++ * linux/fs/nfs/blocklayout/blocklayout.h ++ * ++ * Module for the NFSv4.1 pNFS block layout driver. 
++ * ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * Fred Isaman ++ * ++ * permission is granted to use, copy, create derivative works and ++ * redistribute this software and such derivative works for any purpose, ++ * so long as the name of the university of michigan is not used in ++ * any advertising or publicity pertaining to the use or distribution ++ * of this software without specific, written prior authorization. if ++ * the above copyright notice or any other identification of the ++ * university of michigan is included in any copy of any portion of ++ * this software, then the disclaimer below must also be included. ++ * ++ * this software is provided as is, without representation from the ++ * university of michigan as to its fitness for any purpose, and without ++ * warranty by the university of michigan of any kind, either express ++ * or implied, including without limitation the implied warranties of ++ * merchantability and fitness for a particular purpose. the regents ++ * of the university of michigan shall not be liable for any damages, ++ * including special, indirect, incidental, or consequential damages, ++ * with respect to any claim arising out or in connection with the use ++ * of the software, even if it has been or is hereafter advised of the ++ * possibility of such damages. ++ */ ++ ++#include "blocklayout.h" ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* Bit numbers */ ++#define EXTENT_INITIALIZED 0 ++#define EXTENT_WRITTEN 1 ++#define EXTENT_IN_COMMIT 2 ++#define INTERNAL_EXISTS MY_MAX_TAGS ++#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) ++ ++/* Returns largest t<=s s.t. t%base==0 */ ++static inline sector_t normalize(sector_t s, int base) ++{ ++ sector_t tmp = s; /* Since do_div modifies its argument */ ++ return s - do_div(tmp, base); ++} ++ ++static inline sector_t normalize_up(sector_t s, int base) ++{ ++ return normalize(s + base - 1, base); ++} ++ ++/* Complete stub using list while determine API wanted */ ++ ++/* Returns tags, or negative */ ++static int32_t _find_entry(struct my_tree_t *tree, u64 s) ++{ ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu) enter\n", __func__, s); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) ++ return pos->it_tags & INTERNAL_MASK; ++ else ++ break; ++ } ++ return -ENOENT; ++} ++ ++static inline ++int _has_tag(struct my_tree_t *tree, u64 s, int32_t tag) ++{ ++ int32_t tags; ++ ++ dprintk("%s(%llu, %i) enter\n", __func__, s, tag); ++ s = normalize(s, tree->mtt_step_size); ++ tags = _find_entry(tree, s); ++ if ((tags < 0) || !(tags & (1 << tag))) ++ return 0; ++ else ++ return 1; ++} ++ ++/* Creates entry with tag, or if entry already exists, unions tag to it. ++ * If storage is not NULL, newly created entry will use it. ++ * Returns number of entries added, or negative on error. 
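++ * (That is: 0 when the tag was merged into an existing entry, 1 when a
++ * new entry was inserted, and -ENOMEM when allocation failed.)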
++ */ ++static int _add_entry(struct my_tree_t *tree, u64 s, int32_t tag, ++ struct pnfs_inval_tracking *storage) ++{ ++ int found = 0; ++ struct pnfs_inval_tracking *pos; ++ ++ dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); ++ list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { ++ if (pos->it_sector > s) ++ continue; ++ else if (pos->it_sector == s) { ++ found = 1; ++ break; ++ } else ++ break; ++ } ++ if (found) { ++ pos->it_tags |= (1 << tag); ++ return 0; ++ } else { ++ struct pnfs_inval_tracking *new; ++ if (storage) ++ new = storage; ++ else { ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ } ++ new->it_sector = s; ++ new->it_tags = (1 << tag); ++ list_add(&new->it_link, &pos->it_link); ++ return 1; ++ } ++} ++ ++/* XXXX Really want option to not create */ ++/* Over range, unions tag with existing entries, else creates entry with tag */ ++static int _set_range(struct my_tree_t *tree, int32_t tag, u64 s, u64 length) ++{ ++ u64 i; ++ ++ dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); ++ for (i = normalize(s, tree->mtt_step_size); i < s + length; ++ i += tree->mtt_step_size) ++ if (_add_entry(tree, i, tag, NULL)) ++ return -ENOMEM; ++ return 0; ++} ++ ++/* Ensure that future operations on given range of tree will not malloc */ ++static int _preload_range(struct my_tree_t *tree, u64 offset, u64 length) ++{ ++ u64 start, end, s; ++ int count, i, used = 0, status = -ENOMEM; ++ struct pnfs_inval_tracking **storage; ++ ++ dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); ++ start = normalize(offset, tree->mtt_step_size); ++ end = normalize_up(offset + length, tree->mtt_step_size); ++ count = (int)(end - start) / (int)tree->mtt_step_size; ++ ++ /* Pre-malloc what memory we might need */ ++ storage = kmalloc(sizeof(*storage) * count, GFP_KERNEL); ++ if (!storage) ++ return -ENOMEM; ++ for (i = 0; i < count; i++) { ++ storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), ++ GFP_KERNEL); ++ if (!storage[i]) ++ goto out_cleanup; ++ } ++ ++ /* Now need lock - HOW??? */ ++ ++ for (s = start; s < end; s += tree->mtt_step_size) ++ used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); ++ ++ /* Unlock - HOW??? 
*/
++	status = 0;
++
++ out_cleanup:
++	for (i = used; i < count; i++) {
++		if (!storage[i])
++			break;
++		kfree(storage[i]);
++	}
++	kfree(storage);
++	return status;
++}
++
++static void set_needs_init(sector_t *array, sector_t offset)
++{
++	sector_t *p = array;
++
++	dprintk("%s enter\n", __func__);
++	if (!p)
++		return;
++	while (*p < offset)
++		p++;
++	if (*p == offset)
++		return;
++	else if (*p == ~0) {
++		*p++ = offset;
++		*p = ~0;
++		return;
++	} else {
++		sector_t *save = p;
++		dprintk("%s Adding %llu\n", __func__, (u64)offset);
++		while (*p != ~0)
++			p++;
++		p++;
++		memmove(save + 1, save, (char *)p - (char *)save);
++		*save = offset;
++		return;
++	}
++}
++
++/* We are relying on page lock to serialize this */
++int is_sector_initialized(struct pnfs_inval_markings *marks, sector_t isect)
++{
++	int rv;
++
++	spin_lock(&marks->im_lock);
++	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
++	spin_unlock(&marks->im_lock);
++	return rv;
++}
++
++/* Assume start, end already sector aligned */
++static int
++_range_has_tag(struct my_tree_t *tree, u64 start, u64 end, int32_t tag)
++{
++	struct pnfs_inval_tracking *pos;
++	u64 expect = 0;
++
++	dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
++	list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
++		if (pos->it_sector >= end)
++			continue;
++		if (!expect) {
++			if ((pos->it_sector == end - tree->mtt_step_size) &&
++			    (pos->it_tags & (1 << tag))) {
++				expect = pos->it_sector - tree->mtt_step_size;
++				if (expect < start)
++					return 1;
++				continue;
++			} else {
++				return 0;
++			}
++		}
++		if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
++			return 0;
++		expect -= tree->mtt_step_size;
++		if (expect < start)
++			return 1;
++	}
++	return 0;
++}
++
++static int is_range_written(struct pnfs_inval_markings *marks,
++			    sector_t start, sector_t end)
++{
++	int rv;
++
++	spin_lock(&marks->im_lock);
++	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
++	spin_unlock(&marks->im_lock);
++	return rv;
++}
++
++/* Marks sectors in [offset, offset+length) as having been initialized.
++ * All lengths are step-aligned, where step is min(pagesize, blocksize).
++ * Notes where partial block is initialized, and helps prepare it for
++ * complete initialization later.
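++ */

A minimal userspace sketch of the set_needs_init() bookkeeping above may make
the array convention clearer: the pages-to-initialize array is kept sorted and
terminated by a ~0 sentinel, and a middle insertion shifts the tail (sentinel
included) with memmove(). This is an illustration, not the patch code; the
helper name and fixed array size are chosen here for the example.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    typedef uint64_t sector_t;
    #define SENTINEL (~(sector_t)0)

    /* sorted, sentinel-terminated insert, mirroring set_needs_init() */
    static void needs_init(sector_t *array, sector_t offset)
    {
        sector_t *p = array, *save;

        while (*p < offset)
            p++;
        if (*p == offset)
            return;                 /* already recorded */
        if (*p == SENTINEL) {       /* append at the tail */
            *p++ = offset;
            *p = SENTINEL;
            return;
        }
        save = p;                   /* middle insert: shift the tail */
        while (*p != SENTINEL)
            p++;
        p++;                        /* include the sentinel itself */
        memmove(save + 1, save, (char *)p - (char *)save);
        *save = offset;
    }

    int main(void)
    {
        sector_t a[8] = { 2, 5, SENTINEL };
        sector_t *p;

        needs_init(a, 3);
        for (p = a; *p != SENTINEL; p++)
            printf("%llu ", (unsigned long long)*p);
        printf("\n");               /* prints: 2 3 5 */
        return 0;
    }
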
++/* Currently assumes offset is page-aligned */
++int mark_initialized_sectors(struct pnfs_inval_markings *marks,
++			     sector_t offset, sector_t length,
++			     sector_t **pages)
++{
++	sector_t s, start, end;
++	sector_t *array = NULL; /* Pages to mark */
++
++	dprintk("%s(offset=%llu,len=%llu) enter\n",
++		__func__, (u64)offset, (u64)length);
++	s = max((sector_t) 3,
++		2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
++	dprintk("%s set max=%llu\n", __func__, (u64)s);
++	if (pages) {
++		array = kmalloc(s * sizeof(sector_t), GFP_KERNEL);
++		if (!array)
++			goto outerr;
++		array[0] = ~0;
++	}
++
++	start = normalize(offset, marks->im_block_size);
++	end = normalize_up(offset + length, marks->im_block_size);
++	if (_preload_range(&marks->im_tree, start, end - start))
++		goto outerr;
++
++	spin_lock(&marks->im_lock);
++
++	for (s = normalize_up(start, PAGE_CACHE_SECTORS);
++	     s < offset; s += PAGE_CACHE_SECTORS) {
++		dprintk("%s pre-area pages\n", __func__);
++		/* Portion of used block is not initialized */
++		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
++			set_needs_init(array, s);
++	}
++	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
++		goto out_unlock;
++	for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS);
++	     s < end; s += PAGE_CACHE_SECTORS) {
++		dprintk("%s post-area pages\n", __func__);
++		if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
++			set_needs_init(array, s);
++	}
++
++	spin_unlock(&marks->im_lock);
++
++	if (pages) {
++		if (array[0] == ~0) {
++			kfree(array);
++			*pages = NULL;
++		} else
++			*pages = array;
++	}
++	return 0;
++
++ out_unlock:
++	spin_unlock(&marks->im_lock);
++ outerr:
++	if (pages) {
++		kfree(array);
++		*pages = NULL;
++	}
++	return -ENOMEM;
++}
++
++/* Marks sectors in [offset, offset+length) as having been written to disk.
++ * All lengths should be block aligned.
++ */
++int mark_written_sectors(struct pnfs_inval_markings *marks,
++			 sector_t offset, sector_t length)
++{
++	int status;
++
++	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
++		(u64)offset, (u64)length);
++	spin_lock(&marks->im_lock);
++	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
++	spin_unlock(&marks->im_lock);
++	return status;
++}
++
++static void print_short_extent(struct pnfs_block_short_extent *be)
++{
++	dprintk("PRINT SHORT EXTENT extent %p\n", be);
++	if (be) {
++		dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
++		dprintk(" be_length %llu\n", (u64)be->bse_length);
++	}
++}
++
++void print_clist(struct list_head *list, unsigned int count)
++{
++	struct pnfs_block_short_extent *be;
++	unsigned int i = 0;
++
++	dprintk("****************\n");
++	dprintk("Extent list looks like:\n");
++	list_for_each_entry(be, list, bse_node) {
++		i++;
++		print_short_extent(be);
++	}
++	if (i != count)
++		dprintk("\n\nExpected %u entries\n\n\n", count);
++	dprintk("****************\n");
++}
++
++/* Note: In theory, we should do more checking that devid's match between
++ * old and new, but if they don't, the lists are too corrupt to salvage anyway.
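++ * For example, given matching mdevs, a new range [8,16) abutting an
++ * existing commitlist entry [0,8) is merged with it into a single
++ * entry [0,16); with differing mdevs the two entries are kept separate.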
++ */ ++/* Note this is very similar to add_and_merge_extent */ ++static void add_to_commitlist(struct pnfs_block_layout *bl, ++ struct pnfs_block_short_extent *new) ++{ ++ struct list_head *clist = &bl->bl_commit; ++ struct pnfs_block_short_extent *old, *save; ++ sector_t end = new->bse_f_offset + new->bse_length; ++ ++ dprintk("%s enter\n", __func__); ++ print_short_extent(new); ++ print_clist(clist, bl->bl_count); ++ bl->bl_count++; ++ /* Scan for proper place to insert, extending new to the left ++ * as much as possible. ++ */ ++ list_for_each_entry_safe(old, save, clist, bse_node) { ++ if (new->bse_f_offset < old->bse_f_offset) ++ break; ++ if (end <= old->bse_f_offset + old->bse_length) { ++ /* Range is already in list */ ++ bl->bl_count--; ++ kfree(new); ++ return; ++ } else if (new->bse_f_offset <= ++ old->bse_f_offset + old->bse_length) { ++ /* new overlaps or abuts existing be */ ++ if (new->bse_mdev == old->bse_mdev) { ++ /* extend new to fully replace old */ ++ new->bse_length += new->bse_f_offset - ++ old->bse_f_offset; ++ new->bse_f_offset = old->bse_f_offset; ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ } ++ /* Note that if we never hit the above break, old will not point to a ++ * valid extent. However, in that case &old->bse_node==list. ++ */ ++ list_add_tail(&new->bse_node, &old->bse_node); ++ /* Scan forward for overlaps. If we find any, extend new and ++ * remove the overlapped extent. ++ */ ++ old = list_prepare_entry(new, clist, bse_node); ++ list_for_each_entry_safe_continue(old, save, clist, bse_node) { ++ if (end < old->bse_f_offset) ++ break; ++ /* new overlaps or abuts old */ ++ if (new->bse_mdev == old->bse_mdev) { ++ if (end < old->bse_f_offset + old->bse_length) { ++ /* extend new to fully cover old */ ++ end = old->bse_f_offset + old->bse_length; ++ new->bse_length = end - new->bse_f_offset; ++ } ++ list_del(&old->bse_node); ++ bl->bl_count--; ++ kfree(old); ++ } ++ } ++ dprintk("%s: after merging\n", __func__); ++ print_clist(clist, bl->bl_count); ++} ++ ++/* Note the range described by offset, length is guaranteed to be contained ++ * within be. ++ */ ++int mark_for_commit(struct pnfs_block_extent *be, ++ sector_t offset, sector_t length) ++{ ++ sector_t new_end, end = offset + length; ++ struct pnfs_block_short_extent *new; ++ struct pnfs_block_layout *bl = container_of(be->be_inval, ++ struct pnfs_block_layout, ++ bl_inval); ++ ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return -ENOMEM; ++ ++ mark_written_sectors(be->be_inval, offset, length); ++ /* We want to add the range to commit list, but it must be ++ * block-normalized, and verified that the normalized range has ++ * been entirely written to disk. 
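++ * For example, with an 8-sector block size, a write covering sectors
++ * [10, 29) first normalizes to [8, 32); if the flanking sectors [8, 10)
++ * and [29, 32) have already been written, the full [8, 32) is committed,
++ * otherwise the range shrinks inward to the block boundaries [16, 24).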
++ */ ++ new->bse_f_offset = offset; ++ offset = normalize(offset, bl->bl_blocksize); ++ if (offset < new->bse_f_offset) { ++ if (is_range_written(be->be_inval, offset, new->bse_f_offset)) ++ new->bse_f_offset = offset; ++ else ++ new->bse_f_offset = offset + bl->bl_blocksize; ++ } ++ new_end = normalize_up(end, bl->bl_blocksize); ++ if (end < new_end) { ++ if (is_range_written(be->be_inval, end, new_end)) ++ end = new_end; ++ else ++ end = new_end - bl->bl_blocksize; ++ } ++ if (end <= new->bse_f_offset) { ++ kfree(new); ++ return 0; ++ } ++ new->bse_length = end - new->bse_f_offset; ++ new->bse_devid = be->be_devid; ++ new->bse_mdev = be->be_mdev; ++ ++ spin_lock(&bl->bl_ext_lock); ++ /* new will be freed, either by add_to_commitlist if it decides not ++ * to use it, or after LAYOUTCOMMIT uses it in the commitlist. ++ */ ++ add_to_commitlist(bl, new); ++ spin_unlock(&bl->bl_ext_lock); ++ return 0; ++} ++ ++static void print_bl_extent(struct pnfs_block_extent *be) ++{ ++ dprintk("PRINT EXTENT extent %p\n", be); ++ if (be) { ++ dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); ++ dprintk(" be_length %llu\n", (u64)be->be_length); ++ dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); ++ dprintk(" be_state %d\n", be->be_state); ++ } ++} ++ ++static void ++destroy_extent(struct kref *kref) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = container_of(kref, struct pnfs_block_extent, be_refcnt); ++ dprintk("%s be=%p\n", __func__, be); ++ kfree(be); ++} ++ ++void ++put_extent(struct pnfs_block_extent *be) ++{ ++ if (be) { ++ dprintk("%s enter %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_put(&be->be_refcnt, destroy_extent); ++ } ++} ++ ++struct pnfs_block_extent *alloc_extent(void) ++{ ++ struct pnfs_block_extent *be; ++ ++ be = kmalloc(sizeof(struct pnfs_block_extent), GFP_KERNEL); ++ if (!be) ++ return NULL; ++ INIT_LIST_HEAD(&be->be_node); ++ kref_init(&be->be_refcnt); ++ be->be_inval = NULL; ++ return be; ++} ++ ++struct pnfs_block_extent * ++get_extent(struct pnfs_block_extent *be) ++{ ++ if (be) ++ kref_get(&be->be_refcnt); ++ return be; ++} ++ ++void print_elist(struct list_head *list) ++{ ++ struct pnfs_block_extent *be; ++ dprintk("****************\n"); ++ dprintk("Extent list looks like:\n"); ++ list_for_each_entry(be, list, be_node) { ++ print_bl_extent(be); ++ } ++ dprintk("****************\n"); ++} ++ ++static inline int ++extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) ++{ ++ /* Note this assumes new->be_f_offset >= old->be_f_offset */ ++ return (new->be_state == old->be_state) && ++ ((new->be_state == PNFS_BLOCK_NONE_DATA) || ++ ((new->be_v_offset - old->be_v_offset == ++ new->be_f_offset - old->be_f_offset) && ++ new->be_mdev == old->be_mdev)); ++} ++ ++/* Adds new to appropriate list in bl, modifying new and removing existing ++ * extents as appropriate to deal with overlaps. ++ * ++ * See find_get_extent for list constraints. ++ * ++ * Refcount on new is already set. If end up not using it, or error out, ++ * need to put the reference. ++ * ++ * Lock is held by caller. 
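++ */

The get_extent()/put_extent() pair above follows the usual kref contract:
alloc_extent() starts the count at one (the base reference owned by the extent
list), every lookup takes an extra reference, and the extent is freed on the
final put. Below is a small single-threaded userspace model of that lifetime,
offered only as a sketch: a plain counter stands in for struct kref, whereas
the kernel version is atomic and runs under the layout lock.

    #include <stdio.h>
    #include <stdlib.h>

    struct extent { int refs; };

    static struct extent *extent_alloc(void)
    {
        struct extent *e = malloc(sizeof(*e));

        if (e)
            e->refs = 1;        /* base reference, owned by the list */
        return e;
    }

    static struct extent *extent_get(struct extent *e)
    {
        if (e)
            e->refs++;          /* each lookup takes a reference */
        return e;
    }

    static void extent_put(struct extent *e)
    {
        if (e && --e->refs == 0) {
            printf("freeing extent\n");
            free(e);
        }
    }

    int main(void)
    {
        struct extent *e = extent_alloc();

        extent_get(e);          /* find_get_extent()-style lookup */
        extent_put(e);          /* caller done with the lookup */
        extent_put(e);          /* list drops base ref: freed here */
        return 0;
    }
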
++int
++add_and_merge_extent(struct pnfs_block_layout *bl,
++		     struct pnfs_block_extent *new)
++{
++	struct pnfs_block_extent *be, *tmp;
++	sector_t end = new->be_f_offset + new->be_length;
++	struct list_head *list;
++
++	dprintk("%s enter with be=%p\n", __func__, new);
++	print_bl_extent(new);
++	list = &bl->bl_extents[choose_list(new->be_state)];
++	print_elist(list);
++
++	/* Scan for proper place to insert, extending new to the left
++	 * as much as possible.
++	 */
++	list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
++		if (new->be_f_offset >= be->be_f_offset + be->be_length)
++			break;
++		if (new->be_f_offset >= be->be_f_offset) {
++			if (end <= be->be_f_offset + be->be_length) {
++				/* new is a subset of existing be */
++				if (extents_consistent(be, new)) {
++					dprintk("%s: new is subset, ignoring\n",
++						__func__);
++					put_extent(new);
++					return 0;
++				} else {
++					goto out_err;
++				}
++			} else {
++				/* |<--   be   -->|
++				 *        |<--   new   -->| */
++				if (extents_consistent(be, new)) {
++					/* extend new to fully replace be */
++					new->be_length += new->be_f_offset -
++						be->be_f_offset;
++					new->be_f_offset = be->be_f_offset;
++					new->be_v_offset = be->be_v_offset;
++					dprintk("%s: removing %p\n", __func__, be);
++					list_del(&be->be_node);
++					put_extent(be);
++				} else {
++					goto out_err;
++				}
++			}
++		} else if (end >= be->be_f_offset + be->be_length) {
++			/* new extent overlaps existing be */
++			if (extents_consistent(be, new)) {
++				/* extend new to fully replace be */
++				dprintk("%s: removing %p\n", __func__, be);
++				list_del(&be->be_node);
++				put_extent(be);
++			} else {
++				goto out_err;
++			}
++		} else if (end > be->be_f_offset) {
++			/*        |<--   be   -->|
++			 * |<--   new   -->| */
++			if (extents_consistent(new, be)) {
++				/* extend new to fully replace be */
++				new->be_length += be->be_f_offset + be->be_length -
++					new->be_f_offset - new->be_length;
++				dprintk("%s: removing %p\n", __func__, be);
++				list_del(&be->be_node);
++				put_extent(be);
++			} else {
++				goto out_err;
++			}
++		}
++	}
++	/* Note that if we never hit the above break, be will not point to a
++	 * valid extent. However, in that case &be->be_node==list.
++	 */
++	list_add(&new->be_node, &be->be_node);
++	dprintk("%s: inserting new\n", __func__);
++	print_elist(list);
++	/* STUB - The per-list consistency checks have all been done,
++	 * should now check cross-list consistency.
++	 */
++	return 0;
++
++ out_err:
++	put_extent(new);
++	return -EIO;
++}
++
++/* Returns extent, or NULL. If a second READ extent exists, it is returned
++ * in cow_read, if given.
++ *
++ * The extents are kept in two separate ordered lists, one for READ and NONE,
++ * one for READWRITE and INVALID. Within each list, we assume:
++ * 1. Extents are ordered by file offset.
++ * 2. For any given isect, there is at most one extent that matches.
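++ * For example, looking up isect 12 in a list holding [0,8) and [8,24)
++ * returns the extent [8,24), since 8 <= 12 < 8 + 16.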
++ */ ++struct pnfs_block_extent * ++find_get_extent(struct pnfs_block_layout *bl, sector_t isect, ++ struct pnfs_block_extent **cow_read) ++{ ++ struct pnfs_block_extent *be, *cow, *ret; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ cow = ret = NULL; ++ spin_lock(&bl->bl_ext_lock); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret && ++ (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ if (!ret) ++ ret = be; ++ else if (be->be_state != PNFS_BLOCK_READ_DATA) ++ put_extent(be); ++ else ++ cow = be; ++ break; ++ } ++ } ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ if (cow_read) ++ *cow_read = cow; ++ print_bl_extent(ret); ++ return ret; ++} ++ ++/* Similar to find_get_extent, but called with lock held, and ignores cow */ ++static struct pnfs_block_extent * ++find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) ++{ ++ struct pnfs_block_extent *be, *ret = NULL; ++ int i; ++ ++ dprintk("%s enter with isect %llu\n", __func__, (u64)isect); ++ for (i = 0; i < EXTENT_LISTS; i++) { ++ if (ret) ++ break; ++ list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { ++ if (isect >= be->be_f_offset + be->be_length) ++ break; ++ if (isect >= be->be_f_offset) { ++ /* We have found an extent */ ++ dprintk("%s Get %p (%i)\n", __func__, be, ++ atomic_read(&be->be_refcnt.refcount)); ++ kref_get(&be->be_refcnt); ++ ret = be; ++ break; ++ } ++ } ++ } ++ print_bl_extent(ret); ++ return ret; ++} ++ ++int ++encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *arg) ++{ ++ sector_t start, end; ++ struct pnfs_block_short_extent *lce, *save; ++ unsigned int count = 0; ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct list_head *ranges = &bld->ranges; ++ __be32 *p, *xdr_start; ++ ++ dprintk("%s enter\n", __func__); ++ start = arg->range.offset >> 9; ++ end = start + (arg->range.length >> 9); ++ dprintk("%s set start=%llu, end=%llu\n", ++ __func__, (u64)start, (u64)end); ++ ++ /* BUG - creation of bl_commit is buggy - need to wait for ++ * entire block to be marked WRITTEN before it can be added. 
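++ */

encode_pnfs_block_layoutupdate() below uses the common reserve-then-backfill
XDR idiom: reserve room for the opaque length and the entry count up front,
emit a variable number of entries, then patch the two header words once the
count is known. The following is a userspace sketch of the same idiom only,
with htonl() standing in for cpu_to_be32() and a flat array for the
xdr_stream; the entry values are arbitrary.

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    int main(void)
    {
        uint32_t buf[16];
        uint32_t *p = buf, *hdr;
        uint32_t count = 0, v;

        hdr = p;
        p += 2;                     /* reserve length + count words */
        for (v = 3; v <= 5; v++) {  /* emit a variable number of entries */
            *p++ = htonl(v);
            count++;
        }
        /* backfill: bytes following the length word, then the entry count */
        hdr[0] = htonl((uint32_t)((p - hdr - 1) * 4));
        hdr[1] = htonl(count);

        printf("%u entries, %u bytes\n", count, ntohl(hdr[0]));
        return 0;
    }
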
++ */ ++ spin_lock(&bl->bl_ext_lock); ++ /* Want to adjust for possible truncate */ ++ /* We now want to adjust argument range */ ++ ++ /* XDR encode the ranges found */ ++ xdr_start = xdr_reserve_space(xdr, 8); ++ if (!xdr_start) ++ goto out; ++ list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { ++ p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); ++ if (!p) ++ break; ++ WRITE_DEVID(&lce->bse_devid); ++ WRITE64(lce->bse_f_offset << 9); ++ WRITE64(lce->bse_length << 9); ++ WRITE64(0LL); ++ WRITE32(PNFS_BLOCK_READWRITE_DATA); ++ list_del(&lce->bse_node); ++ list_add_tail(&lce->bse_node, ranges); ++ bl->bl_count--; ++ count++; ++ } ++ xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); ++ xdr_start[1] = cpu_to_be32(count); ++out: ++ spin_unlock(&bl->bl_ext_lock); ++ dprintk("%s found %i ranges\n", __func__, count); ++ return 0; ++} ++ ++/* Helper function to set_to_rw that initialize a new extent */ ++static void ++_prep_new_extent(struct pnfs_block_extent *new, ++ struct pnfs_block_extent *orig, ++ sector_t offset, sector_t length, int state) ++{ ++ kref_init(&new->be_refcnt); ++ /* don't need to INIT_LIST_HEAD(&new->be_node) */ ++ memcpy(&new->be_devid, &orig->be_devid, sizeof(struct pnfs_deviceid)); ++ new->be_mdev = orig->be_mdev; ++ new->be_f_offset = offset; ++ new->be_length = length; ++ new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; ++ new->be_state = state; ++ new->be_inval = orig->be_inval; ++} ++ ++/* Tries to merge be with extent in front of it in list. ++ * Frees storage if not used. ++ */ ++static struct pnfs_block_extent * ++_front_merge(struct pnfs_block_extent *be, struct list_head *head, ++ struct pnfs_block_extent *storage) ++{ ++ struct pnfs_block_extent *prev; ++ ++ if (!storage) ++ goto no_merge; ++ if (&be->be_node == head || be->be_node.prev == head) ++ goto no_merge; ++ prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); ++ if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || ++ !extents_consistent(prev, be)) ++ goto no_merge; ++ _prep_new_extent(storage, prev, prev->be_f_offset, ++ prev->be_length + be->be_length, prev->be_state); ++ list_replace(&prev->be_node, &storage->be_node); ++ put_extent(prev); ++ list_del(&be->be_node); ++ put_extent(be); ++ return storage; ++ ++ no_merge: ++ kfree(storage); ++ return be; ++} ++ ++static u64 ++set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) ++{ ++ u64 rv = offset + length; ++ struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; ++ struct pnfs_block_extent *children[3]; ++ struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; ++ int i = 0, j; ++ ++ dprintk("%s(%llu, %llu)\n", __func__, offset, length); ++ /* Create storage for up to three new extents e1, e2, e3 */ ++ e1 = kmalloc(sizeof(*e1), GFP_KERNEL); ++ e2 = kmalloc(sizeof(*e2), GFP_KERNEL); ++ e3 = kmalloc(sizeof(*e3), GFP_KERNEL); ++ /* BUG - we are ignoring any failure */ ++ if (!e1 || !e2 || !e3) ++ goto out_nosplit; ++ ++ spin_lock(&bl->bl_ext_lock); ++ be = find_get_extent_locked(bl, offset); ++ rv = be->be_f_offset + be->be_length; ++ if (be->be_state != PNFS_BLOCK_INVALID_DATA) { ++ spin_unlock(&bl->bl_ext_lock); ++ goto out_nosplit; ++ } ++ /* Add e* to children, bumping e*'s krefs */ ++ if (be->be_f_offset != offset) { ++ _prep_new_extent(e1, be, be->be_f_offset, ++ offset - be->be_f_offset, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e1; ++ print_bl_extent(e1); ++ } else ++ merge1 = e1; ++ _prep_new_extent(e2, be, offset, ++ min(length, 
be->be_f_offset + be->be_length - offset), ++ PNFS_BLOCK_READWRITE_DATA); ++ children[i++] = e2; ++ print_bl_extent(e2); ++ if (offset + length < be->be_f_offset + be->be_length) { ++ _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, ++ be->be_f_offset + be->be_length - ++ offset - length, ++ PNFS_BLOCK_INVALID_DATA); ++ children[i++] = e3; ++ print_bl_extent(e3); ++ } else ++ merge2 = e3; ++ ++ /* Remove be from list, and insert the e* */ ++ /* We don't get refs on e*, since this list is the base reference ++ * set when init'ed. ++ */ ++ if (i < 3) ++ children[i] = NULL; ++ new = children[0]; ++ list_replace(&be->be_node, &new->be_node); ++ put_extent(be); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); ++ for (j = 1; j < i; j++) { ++ old = new; ++ new = children[j]; ++ list_add(&new->be_node, &old->be_node); ++ } ++ if (merge2) { ++ /* This is a HACK, should just create a _back_merge function */ ++ new = list_entry(new->be_node.next, ++ struct pnfs_block_extent, be_node); ++ new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); ++ } ++ spin_unlock(&bl->bl_ext_lock); ++ ++ /* Since we removed the base reference above, be is now scheduled for ++ * destruction. ++ */ ++ put_extent(be); ++ dprintk("%s returns %llu after split\n", __func__, rv); ++ return rv; ++ ++ out_nosplit: ++ kfree(e1); ++ kfree(e2); ++ kfree(e3); ++ dprintk("%s returns %llu without splitting\n", __func__, rv); ++ return rv; ++} ++ ++void ++clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, ++ const struct nfs4_layoutcommit_args *arg, ++ int status) ++{ ++ struct bl_layoutupdate_data *bld = arg->layoutdriver_data; ++ struct pnfs_block_short_extent *lce, *save; ++ ++ dprintk("%s status %d\n", __func__, status); ++ list_for_each_entry_safe_reverse(lce, save, &bld->ranges, bse_node) { ++ if (likely(!status)) { ++ u64 offset = lce->bse_f_offset; ++ u64 end = offset + lce->bse_length; ++ ++ do { ++ offset = set_to_rw(bl, offset, end - offset); ++ } while (offset < end); ++ ++ kfree(lce); ++ } else { ++ spin_lock(&bl->bl_ext_lock); ++ add_to_commitlist(bl, lce); ++ spin_unlock(&bl->bl_ext_lock); ++ } ++ } ++} +diff -up linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile.orig linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile +--- linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile.orig 2010-08-31 21:11:40.895022280 -0400 ++++ linux-2.6.35.noarch/fs/nfs/blocklayout/Makefile 2010-08-31 21:11:40.895022280 -0400 +@@ -0,0 +1,6 @@ ++# ++# Makefile for the pNFS block layout driver kernel module ++# ++obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o ++blocklayoutdriver-objs := blocklayout.o blocklayoutdev.o blocklayoutdm.o \ ++ extents.o block-device-discovery-pipe.o +diff -up linux-2.6.35.noarch/fs/nfs/callback.h.orig linux-2.6.35.noarch/fs/nfs/callback.h +--- linux-2.6.35.noarch/fs/nfs/callback.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/callback.h 2010-08-31 21:11:40.902160833 -0400 +@@ -111,6 +111,13 @@ extern int nfs41_validate_delegation_sta + + #define RCA4_TYPE_MASK_RDATA_DLG 0 + #define RCA4_TYPE_MASK_WDATA_DLG 1 ++#define RCA4_TYPE_MASK_DIR_DLG 2 ++#define RCA4_TYPE_MASK_FILE_LAYOUT 3 ++#define RCA4_TYPE_MASK_BLK_LAYOUT 4 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MIN 8 ++#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 ++#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 + + struct cb_recallanyargs { + struct sockaddr *craa_addr; +@@ -127,6 +134,39 @@ struct cb_recallslotargs { + extern unsigned nfs4_callback_recallslot(struct cb_recallslotargs 
*args, + void *dummy); + ++struct cb_layoutrecallargs { ++ struct sockaddr *cbl_addr; ++ struct nfs_fh cbl_fh; ++ struct pnfs_layout_range cbl_seg; ++ struct nfs_fsid cbl_fsid; ++ uint32_t cbl_recall_type; ++ uint32_t cbl_layout_type; ++ uint32_t cbl_layoutchanged; ++ nfs4_stateid cbl_stateid; ++}; ++ ++extern unsigned nfs4_callback_layoutrecall( ++ struct cb_layoutrecallargs *args, ++ void *dummy); ++ ++struct cb_devicenotifyitem { ++ uint32_t cbd_notify_type; ++ uint32_t cbd_layout_type; ++ struct pnfs_deviceid cbd_dev_id; ++ uint32_t cbd_immediate; ++}; ++ ++/* XXX: Should be dynamic up to max compound size */ ++#define NFS4_DEV_NOTIFY_MAXENTRIES 10 ++struct cb_devicenotifyargs { ++ struct sockaddr *addr; ++ int ndevs; ++ struct cb_devicenotifyitem devs[NFS4_DEV_NOTIFY_MAXENTRIES]; ++}; ++ ++extern unsigned nfs4_callback_devicenotify( ++ struct cb_devicenotifyargs *args, ++ void *dummy); + #endif /* CONFIG_NFS_V4_1 */ + + extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *res); +diff -up linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig linux-2.6.35.noarch/fs/nfs/callback_proc.c +--- linux-2.6.35.noarch/fs/nfs/callback_proc.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/callback_proc.c 2010-08-31 21:11:40.903160565 -0400 +@@ -8,10 +8,15 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + #include "nfs4_fs.h" + #include "callback.h" + #include "delegation.h" + #include "internal.h" ++#include "pnfs.h" + + #ifdef NFS_DEBUG + #define NFSDBG_FACILITY NFSDBG_CALLBACK +@@ -62,16 +67,6 @@ out: + return res->status; + } + +-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) +-{ +-#if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion > 0) +- return nfs41_validate_delegation_stateid; +-#endif +- return nfs4_validate_delegation_stateid; +-} +- +- + __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) + { + struct nfs_client *clp; +@@ -92,8 +87,7 @@ __be32 nfs4_callback_recall(struct cb_re + inode = nfs_delegation_find_inode(clp, &args->fh); + if (inode != NULL) { + /* Set up a helper thread to actually return the delegation */ +- switch (nfs_async_inode_return_delegation(inode, &args->stateid, +- nfs_validate_delegation_stateid(clp))) { ++ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { + case 0: + res = 0; + break; +@@ -116,24 +110,364 @@ out: + + int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { +- if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (delegation == NULL || memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data))) + return 0; + return 1; + } + + #if defined(CONFIG_NFS_V4_1) + ++static bool ++pnfs_is_next_layout_stateid(const struct pnfs_layout_hdr *lo, ++ const nfs4_stateid stateid) ++{ ++ int seqlock; ++ bool res; ++ u32 oldseqid, newseqid; ++ ++ do { ++ seqlock = read_seqbegin(&lo->seqlock); ++ oldseqid = be32_to_cpu(lo->stateid.u.stateid.seqid); ++ newseqid = be32_to_cpu(stateid.u.stateid.seqid); ++ res = !memcmp(lo->stateid.u.stateid.other, ++ stateid.u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE); ++ if (res) { /* comparing layout stateids */ ++ if (oldseqid == ~0) ++ res = (newseqid == 1); ++ else ++ res = (newseqid == oldseqid + 1); ++ } else { /* open stateid */ ++ res = !memcmp(lo->stateid.u.data, ++ &zero_stateid, ++ 
NFS4_STATEID_SIZE); ++ if (res) ++ res = (newseqid == 1); ++ } ++ } while (read_seqretry(&lo->seqlock, seqlock)); ++ ++ return res; ++} ++ ++/* ++ * Retrieve an inode based on layout recall parameters ++ * ++ * Note: caller must iput(inode) to dereference the inode. ++ */ ++static struct inode * ++nfs_layoutrecall_find_inode(struct nfs_client *clp, ++ const struct cb_layoutrecallargs *args) ++{ ++ struct nfs_inode *nfsi; ++ struct pnfs_layout_hdr *lo; ++ struct nfs_server *server; ++ struct inode *ino = NULL; ++ ++ dprintk("%s: Begin recall_type=%d clp %p\n", ++ __func__, args->cbl_recall_type, clp); ++ ++ spin_lock(&clp->cl_lock); ++ list_for_each_entry(lo, &clp->cl_layouts, layouts) { ++ nfsi = PNFS_NFS_INODE(lo); ++ if (!nfsi) ++ continue; ++ ++ dprintk("%s: Searching inode=%lu\n", ++ __func__, nfsi->vfs_inode.i_ino); ++ ++ if (args->cbl_recall_type == RETURN_FILE) { ++ if (nfs_compare_fh(&args->cbl_fh, &nfsi->fh)) ++ continue; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ server = NFS_SERVER(&nfsi->vfs_inode); ++ if (server->fsid.major != args->cbl_fsid.major || ++ server->fsid.minor != args->cbl_fsid.minor) ++ continue; ++ } ++ ++ /* Make sure client didn't clean up layout without ++ * telling the server */ ++ if (!has_layout(nfsi)) ++ continue; ++ ++ ino = igrab(&nfsi->vfs_inode); ++ dprintk("%s: Found inode=%p\n", __func__, ino); ++ break; ++ } ++ spin_unlock(&clp->cl_lock); ++ return ino; ++} ++ ++struct recall_layout_threadargs { ++ struct inode *inode; ++ struct nfs_client *clp; ++ struct completion started; ++ struct cb_layoutrecallargs *rl; ++ int result; ++}; ++ ++static int pnfs_recall_layout(void *data) ++{ ++ struct inode *inode, *ino; ++ struct nfs_client *clp; ++ struct cb_layoutrecallargs rl; ++ struct nfs4_layoutreturn *lrp; ++ struct recall_layout_threadargs *args = ++ (struct recall_layout_threadargs *)data; ++ int status = 0; ++ ++ daemonize("nfsv4-layoutreturn"); ++ ++ dprintk("%s: recall_type=%d fsid 0x%llx-0x%llx start\n", ++ __func__, args->rl->cbl_recall_type, ++ args->rl->cbl_fsid.major, args->rl->cbl_fsid.minor); ++ ++ clp = args->clp; ++ inode = args->inode; ++ rl = *args->rl; ++ ++ /* support whole file layouts only */ ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ if (rl.cbl_recall_type == RETURN_FILE) { ++ if (pnfs_is_next_layout_stateid(NFS_I(inode)->layout, ++ rl.cbl_stateid)) ++ status = pnfs_return_layout(inode, &rl.cbl_seg, ++ &rl.cbl_stateid, RETURN_FILE, ++ false); ++ else ++ status = cpu_to_be32(NFS4ERR_DELAY); ++ if (status) ++ dprintk("%s RETURN_FILE error: %d\n", __func__, status); ++ else ++ status = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ args->result = status; ++ complete(&args->started); ++ goto out; ++ } ++ ++ status = cpu_to_be32(NFS4_OK); ++ args->result = status; ++ complete(&args->started); ++ args = NULL; ++ ++ /* IMPROVEME: This loop is inefficient, running in O(|s_inodes|^2) */ ++ while ((ino = nfs_layoutrecall_find_inode(clp, &rl)) != NULL) { ++ /* FIXME: need to check status on pnfs_return_layout */ ++ pnfs_return_layout(ino, &rl.cbl_seg, NULL, RETURN_FILE, false); ++ iput(ino); ++ } ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (!lrp) { ++ dprintk("%s: allocation failed. 
Cannot send last LAYOUTRETURN\n", ++ __func__); ++ goto out; ++ } ++ ++ /* send final layoutreturn */ ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = rl.cbl_layout_type; ++ lrp->args.return_type = rl.cbl_recall_type; ++ lrp->args.range = rl.cbl_seg; ++ lrp->args.inode = inode; ++ nfs4_proc_layoutreturn(lrp, true); ++ ++out: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ nfs_put_client(clp); ++ module_put_and_exit(0); ++ dprintk("%s: exit status %d\n", __func__, 0); ++ return 0; ++} ++ ++/* ++ * Asynchronous layout recall! ++ */ ++static int pnfs_async_return_layout(struct nfs_client *clp, struct inode *inode, ++ struct cb_layoutrecallargs *rl) ++{ ++ struct recall_layout_threadargs data = { ++ .clp = clp, ++ .inode = inode, ++ .rl = rl, ++ }; ++ struct task_struct *t; ++ int status = -EAGAIN; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* FIXME: do not allow two concurrent layout recalls */ ++ if (test_and_set_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state)) ++ return status; ++ ++ init_completion(&data.started); ++ __module_get(THIS_MODULE); ++ if (!atomic_inc_not_zero(&clp->cl_count)) ++ goto out_put_no_client; ++ ++ t = kthread_run(pnfs_recall_layout, &data, "%s", "pnfs_recall_layout"); ++ if (IS_ERR(t)) { ++ printk(KERN_INFO "NFS: Layout recall callback thread failed " ++ "for client (clientid %08x/%08x)\n", ++ (unsigned)(clp->cl_clientid >> 32), ++ (unsigned)(clp->cl_clientid)); ++ status = PTR_ERR(t); ++ goto out_module_put; ++ } ++ wait_for_completion(&data.started); ++ return data.result; ++out_module_put: ++ nfs_put_client(clp); ++out_put_no_client: ++ clear_bit(NFS4CLNT_LAYOUT_RECALL, &clp->cl_state); ++ module_put(THIS_MODULE); ++ return status; ++} ++ ++static int pnfs_recall_all_layouts(struct nfs_client *clp) ++{ ++ struct cb_layoutrecallargs rl; ++ struct inode *inode; ++ int status = 0; ++ ++ rl.cbl_recall_type = RETURN_ALL; ++ rl.cbl_seg.iomode = IOMODE_ANY; ++ rl.cbl_seg.offset = 0; ++ rl.cbl_seg.length = NFS4_MAX_UINT64; ++ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, &rl); ++ if (!inode) ++ return status; ++ status = pnfs_async_return_layout(clp, inode, &rl); ++ iput(inode); ++ ++ return status; ++} ++ ++__be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ struct inode *inode = NULL; ++ __be32 res; ++ int status; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); ++ clp = nfs_find_client(args->cbl_addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->cbl_addr)); ++ goto out; ++ } ++ ++ res = cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT); ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ /* the callback must come from the MDS personality */ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) ++ goto loop; ++ if (args->cbl_recall_type == RETURN_FILE) { ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (inode != NULL) { ++ status = pnfs_async_return_layout(clp, inode, ++ args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++ } else { /* _ALL or _FSID */ ++ /* we need the inode to get the nfs_server struct */ ++ inode = nfs_layoutrecall_find_inode(clp, args); ++ if (!inode) ++ goto loop; ++ status = pnfs_async_return_layout(clp, inode, args); ++ if (status) ++ res = cpu_to_be32(NFS4ERR_DELAY); ++ iput(inode); ++ } ++loop: ++ clp = nfs_find_client_next(prev); ++ 
nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ ++/* Remove the deviceid(s) from the nfs_client deviceid cache */ ++static __be32 pnfs_devicenotify_client(struct nfs_client *clp, ++ struct cb_devicenotifyargs *args) ++{ ++ uint32_t type; ++ int i; ++ ++ dprintk("%s: --> clp %p\n", __func__, clp); ++ ++ for (i = 0; i < args->ndevs; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ type = dev->cbd_notify_type; ++ if (type == NOTIFY_DEVICEID4_DELETE && clp->cl_devid_cache) ++ nfs4_delete_device(clp->cl_devid_cache, ++ &dev->cbd_dev_id); ++ else if (type == NOTIFY_DEVICEID4_CHANGE) ++ printk(KERN_ERR "%s: NOTIFY_DEVICEID4_CHANGE " ++ "not supported\n", __func__); ++ } ++ return 0; ++} ++ ++__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, ++ void *dummy) ++{ ++ struct nfs_client *clp; ++ __be32 res = 0; ++ unsigned int num_client = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ res = __constant_htonl(NFS4ERR_INVAL); ++ clp = nfs_find_client(args->addr, 4); ++ if (clp == NULL) { ++ dprintk("%s: no client for addr %u.%u.%u.%u\n", ++ __func__, NIPQUAD(args->addr)); ++ goto out; ++ } ++ ++ do { ++ struct nfs_client *prev = clp; ++ num_client++; ++ res = pnfs_devicenotify_client(clp, args); ++ clp = nfs_find_client_next(prev); ++ nfs_put_client(prev); ++ } while (clp != NULL); ++ ++out: ++ dprintk("%s: exit with status = %d numclient %u\n", ++ __func__, ntohl(res), num_client); ++ return res; ++} ++ + int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) + { + if (delegation == NULL) + return 0; + +- /* seqid is 4-bytes long */ +- if (((u32 *) &stateid->data)[0] != 0) ++ if (stateid->u.stateid.seqid != 0) + return 0; +- if (memcmp(&delegation->stateid.data[4], &stateid->data[4], +- sizeof(stateid->data)-4)) ++ if (memcmp(&delegation->stateid.u.stateid.other, ++ &stateid->u.stateid.other, ++ NFS4_STATEID_OTHER_SIZE)) + return 0; + + return 1; +@@ -335,13 +669,37 @@ out: + return status; + } + ++static inline bool ++validate_bitmap_values(const unsigned long *mask) ++{ ++ int i; ++ ++ if (*mask == 0) ++ return true; ++ if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_WDATA_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_DIR_DLG, mask) || ++ test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, mask) || ++ test_bit(RCA4_TYPE_MASK_BLK_LAYOUT, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OBJ_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OBJ_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ for (i = RCA4_TYPE_MASK_OTHER_LAYOUT_MIN; ++ i <= RCA4_TYPE_MASK_OTHER_LAYOUT_MAX; i++) ++ if (test_bit(i, mask)) ++ return true; ++ return false; ++} ++ + __be32 nfs4_callback_recallany(struct cb_recallanyargs *args, void *dummy) + { + struct nfs_client *clp; + __be32 status; + fmode_t flags = 0; + +- status = htonl(NFS4ERR_OP_NOT_IN_SESSION); ++ status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); + clp = nfs_find_client(args->craa_addr, 4); + if (clp == NULL) + goto out; +@@ -349,16 +707,25 @@ __be32 nfs4_callback_recallany(struct cb + dprintk("NFS: RECALL_ANY callback request from %s\n", + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + ++ status = cpu_to_be32(NFS4ERR_INVAL); ++ if (!validate_bitmap_values((const unsigned long *) ++ &args->craa_type_mask)) ++ return status; ++ ++ status = cpu_to_be32(NFS4_OK); + if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) 
+ flags = FMODE_READ; + if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) + &args->craa_type_mask)) + flags |= FMODE_WRITE; ++ if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) ++ &args->craa_type_mask)) ++ if (pnfs_recall_all_layouts(clp) == -EAGAIN) ++ status = cpu_to_be32(NFS4ERR_DELAY); + + if (flags) + nfs_expire_all_delegation_types(clp, flags); +- status = htonl(NFS4_OK); + out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +diff -up linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig linux-2.6.35.noarch/fs/nfs/callback_xdr.c +--- linux-2.6.35.noarch/fs/nfs/callback_xdr.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/callback_xdr.c 2010-08-31 21:11:40.904160537 -0400 +@@ -22,6 +22,8 @@ + #define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + + #if defined(CONFIG_NFS_V4_1) ++#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) ++#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) + #define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ + 4 + 1 + 3) + #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +@@ -136,7 +138,7 @@ static __be32 decode_stateid(struct xdr_ + p = read_buf(xdr, 16); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); +- memcpy(stateid->data, p, 16); ++ memcpy(stateid->u.data, p, 16); + return 0; + } + +@@ -220,6 +222,148 @@ out: + + #if defined(CONFIG_NFS_V4_1) + ++static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_layoutrecallargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ ++ args->cbl_addr = svc_addr(rqstp); ++ p = read_buf(xdr, 4 * sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ ++ args->cbl_layout_type = ntohl(*p++); ++ args->cbl_seg.iomode = ntohl(*p++); ++ args->cbl_layoutchanged = ntohl(*p++); ++ args->cbl_recall_type = ntohl(*p++); ++ ++ if (likely(args->cbl_recall_type == RETURN_FILE)) { ++ status = decode_fh(xdr, &args->cbl_fh); ++ if (unlikely(status != 0)) ++ goto out; ++ ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_seg.offset); ++ p = xdr_decode_hyper(p, &args->cbl_seg.length); ++ status = decode_stateid(xdr, &args->cbl_stateid); ++ if (unlikely(status != 0)) ++ goto out; ++ } else if (args->cbl_recall_type == RETURN_FSID) { ++ p = read_buf(xdr, 2 * sizeof(uint64_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_BADXDR); ++ goto out; ++ } ++ p = xdr_decode_hyper(p, &args->cbl_fsid.major); ++ p = xdr_decode_hyper(p, &args->cbl_fsid.minor); ++ } ++ dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d " ++ "fsid %llx-%llx fhsize %d\n", __func__, ++ args->cbl_layout_type, args->cbl_seg.iomode, ++ args->cbl_layoutchanged, args->cbl_recall_type, ++ args->cbl_fsid.major, args->cbl_fsid.minor, ++ args->cbl_fh.size); ++out: ++ dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); ++ return status; ++} ++ ++static ++__be32 decode_devicenotify_args(struct svc_rqst *rqstp, ++ struct xdr_stream *xdr, ++ struct cb_devicenotifyargs *args) ++{ ++ __be32 *p; ++ __be32 status = 0; ++ u32 tmp; ++ int n, i; ++ args->ndevs = 0; ++ ++ args->addr = svc_addr(rqstp); ++ ++ /* Num of device notifications */ ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ n = ntohl(*p++); ++ if (n <= 0) ++ goto out; ++ ++ /* XXX: need to 
possibly return error in this case */ ++ if (n > NFS4_DEV_NOTIFY_MAXENTRIES) { ++ dprintk("%s: Processing (%d) notifications out of (%d)\n", ++ __func__, NFS4_DEV_NOTIFY_MAXENTRIES, n); ++ n = NFS4_DEV_NOTIFY_MAXENTRIES; ++ } ++ ++ /* Decode each dev notification */ ++ for (i = 0; i < n; i++) { ++ struct cb_devicenotifyitem *dev = &args->devs[i]; ++ ++ p = read_buf(xdr, (4 * sizeof(uint32_t)) ++ + NFS4_PNFS_DEVICEID4_SIZE); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_RESOURCE); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* bitmap size */ ++ if (tmp != 1) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_notify_type = ntohl(*p++); ++ if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ ++ tmp = ntohl(*p++); /* opaque size */ ++ if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 8)) || ++ ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) && ++ (tmp != NFS4_PNFS_DEVICEID4_SIZE + 4))) { ++ status = htonl(NFS4ERR_INVAL); ++ goto out; ++ } ++ dev->cbd_layout_type = ntohl(*p++); ++ memcpy(dev->cbd_dev_id.data, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ ++ if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { ++ p = read_buf(xdr, sizeof(uint32_t)); ++ if (unlikely(p == NULL)) { ++ status = htonl(NFS4ERR_DELAY); ++ goto out; ++ } ++ dev->cbd_immediate = ntohl(*p++); ++ } else { ++ dev->cbd_immediate = 0; ++ } ++ ++ args->ndevs++; ++ ++ dprintk("%s: type %d layout 0x%x immediate %d\n", ++ __func__, dev->cbd_notify_type, dev->cbd_layout_type, ++ dev->cbd_immediate); ++ } ++out: ++ dprintk("%s: status %d ndevs %d\n", ++ __func__, ntohl(status), args->ndevs); ++ return status; ++} ++ + static __be32 decode_sessionid(struct xdr_stream *xdr, + struct nfs4_sessionid *sid) + { +@@ -574,11 +718,11 @@ preprocess_nfs41_op(int nop, unsigned in + case OP_CB_SEQUENCE: + case OP_CB_RECALL_ANY: + case OP_CB_RECALL_SLOT: ++ case OP_CB_LAYOUTRECALL: ++ case OP_CB_NOTIFY_DEVICEID: + *op = &callback_ops[op_nr]; + break; + +- case OP_CB_LAYOUTRECALL: +- case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY: + case OP_CB_PUSH_DELEG: + case OP_CB_RECALLABLE_OBJ_AVAIL: +@@ -739,6 +883,18 @@ static struct callback_op callback_ops[] + .res_maxsize = CB_OP_RECALL_RES_MAXSZ, + }, + #if defined(CONFIG_NFS_V4_1) ++ [OP_CB_LAYOUTRECALL] = { ++ .process_op = (callback_process_op_t)nfs4_callback_layoutrecall, ++ .decode_args = ++ (callback_decode_arg_t)decode_layoutrecall_args, ++ .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, ++ }, ++ [OP_CB_NOTIFY_DEVICEID] = { ++ .process_op = (callback_process_op_t)nfs4_callback_devicenotify, ++ .decode_args = ++ (callback_decode_arg_t)decode_devicenotify_args, ++ .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ, ++ }, + [OP_CB_SEQUENCE] = { + .process_op = (callback_process_op_t)nfs4_callback_sequence, + .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, +diff -up linux-2.6.35.noarch/fs/nfs/client.c.orig linux-2.6.35.noarch/fs/nfs/client.c +--- linux-2.6.35.noarch/fs/nfs/client.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/client.c 2010-08-31 21:11:40.905170519 -0400 +@@ -39,6 +39,7 @@ + #include + #include + #include ++#include + + #include + +@@ -48,6 +49,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_CLIENT + +@@ -150,11 +152,14 @@ static struct nfs_client 
*nfs_alloc_clie + clp->cl_boot_time = CURRENT_TIME; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; + clp->cl_minorversion = cl_init->minorversion; ++ clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; + #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; +- ++#if defined(CONFIG_NFS_V4_1) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++#endif + nfs_fscache_get_client_cookie(clp); + + return clp; +@@ -178,7 +183,7 @@ static void nfs4_clear_client_minor_vers + clp->cl_session = NULL; + } + +- clp->cl_call_sync = _nfs4_call_sync; ++ clp->cl_mvops = nfs_v4_minor_ops[0]; + #endif /* CONFIG_NFS_V4_1 */ + } + +@@ -188,7 +193,7 @@ static void nfs4_clear_client_minor_vers + static void nfs4_destroy_callback(struct nfs_client *clp) + { + if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) +- nfs_callback_down(clp->cl_minorversion); ++ nfs_callback_down(clp->cl_mvops->minor_version); + } + + static void nfs4_shutdown_client(struct nfs_client *clp) +@@ -251,6 +256,7 @@ void nfs_put_client(struct nfs_client *c + nfs_free_client(clp); + } + } ++EXPORT_SYMBOL(nfs_put_client); + + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + /* +@@ -343,7 +349,7 @@ static int nfs_sockaddr_match_ipaddr(con + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields, including the port number. + */ +-static int nfs_sockaddr_cmp(const struct sockaddr *sa1, ++int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) + { + if (sa1->sa_family != sa2->sa_family) +@@ -357,6 +363,7 @@ static int nfs_sockaddr_cmp(const struct + } + return 0; + } ++EXPORT_SYMBOL(nfs_sockaddr_cmp); + + /* + * Find a client by IP address and protocol version +@@ -548,6 +555,7 @@ int nfs4_check_client_ready(struct nfs_c + return -EPROTONOSUPPORT; + return 0; + } ++EXPORT_SYMBOL(nfs4_check_client_ready); + + /* + * Initialise the timeout values for a connection +@@ -865,9 +873,34 @@ error: + } + + /* ++ * Initialize the pNFS layout driver and setup pNFS related parameters ++ */ ++static void nfs4_init_pnfs(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ struct nfs_client *clp = server->nfs_client; ++ ++ if (nfs4_has_session(clp) && ++ (clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) { ++ server->pnfs_blksize = fsinfo->blksize; ++ set_pnfs_layoutdriver(server, mntfh, fsinfo->layouttype); ++ pnfs_set_ds_iosize(server); ++ } ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++static void nfs4_uninit_pnfs(struct nfs_server *server) ++{ ++#if defined(CONFIG_NFS_V4_1) ++ if (server->nfs_client && nfs4_has_session(server->nfs_client)) ++ unmount_pnfs_layoutdriver(server); ++#endif /* CONFIG_NFS_V4_1 */ ++} ++ ++/* + * Load up the server record from information gained in an fsinfo record + */ +-static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) ++static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) + { + unsigned long max_rpc_payload; + +@@ -897,6 +930,8 @@ static void nfs_server_set_fsinfo(struct + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; ++ nfs4_init_pnfs(server, mntfh, fsinfo); ++ + server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL); + + server->dtsize = nfs_block_size(fsinfo->dtpref, NULL); +@@ -938,7 +973,7 @@ static int nfs_probe_fsinfo(struct nfs_s + if 
(error < 0) + goto out_error; + +- nfs_server_set_fsinfo(server, &fsinfo); ++ nfs_server_set_fsinfo(server, mntfh, &fsinfo); + + /* Get some general file system info */ + if (server->namelen == 0) { +@@ -1016,6 +1051,7 @@ void nfs_free_server(struct nfs_server * + { + dprintk("--> nfs_free_server()\n"); + ++ nfs4_uninit_pnfs(server); + spin_lock(&nfs_client_lock); + list_del(&server->client_link); + list_del(&server->master_link); +@@ -1126,7 +1162,7 @@ static int nfs4_init_callback(struct nfs + return error; + } + +- error = nfs_callback_up(clp->cl_minorversion, ++ error = nfs_callback_up(clp->cl_mvops->minor_version, + clp->cl_rpcclient->cl_xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", +@@ -1143,10 +1179,8 @@ static int nfs4_init_callback(struct nfs + */ + static int nfs4_init_client_minor_version(struct nfs_client *clp) + { +- clp->cl_call_sync = _nfs4_call_sync; +- + #if defined(CONFIG_NFS_V4_1) +- if (clp->cl_minorversion) { ++ if (clp->cl_mvops->minor_version) { + struct nfs4_session *session = NULL; + /* + * Create the session and mark it expired. +@@ -1158,7 +1192,13 @@ static int nfs4_init_client_minor_versio + return -ENOMEM; + + clp->cl_session = session; +- clp->cl_call_sync = _nfs4_call_sync_session; ++ /* ++ * The create session reply races with the server back ++ * channel probe. Mark the client NFS_CS_SESSION_INITING ++ * so that the client back channel can find the ++ * nfs_client struct ++ */ ++ clp->cl_cons_state = NFS_CS_SESSION_INITING; + } + #endif /* CONFIG_NFS_V4_1 */ + +@@ -1216,7 +1256,7 @@ error: + /* + * Set up an NFS4 client + */ +-static int nfs4_set_client(struct nfs_server *server, ++int nfs4_set_client(struct nfs_server *server, + const char *hostname, + const struct sockaddr *addr, + const size_t addrlen, +@@ -1259,6 +1299,7 @@ error: + dprintk("<-- nfs4_set_client() = xerror %d\n", error); + return error; + } ++EXPORT_SYMBOL(nfs4_set_client); + + + /* +@@ -1454,7 +1495,7 @@ struct nfs_server *nfs4_create_referral_ + data->authflavor, + parent_server->client->cl_xprt->prot, + parent_server->client->cl_timeout, +- parent_client->cl_minorversion); ++ parent_client->cl_mvops->minor_version); + if (error < 0) + goto error; + +diff -up linux-2.6.35.noarch/fs/nfsd/bl_com.c.orig linux-2.6.35.noarch/fs/nfsd/bl_com.c +--- linux-2.6.35.noarch/fs/nfsd/bl_com.c.orig 2010-08-31 21:11:40.944171081 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/bl_com.c 2010-08-31 21:11:40.945160531 -0400 +@@ -0,0 +1,292 @@ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++static ssize_t bl_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); ++static void bl_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops bl_upcall_ops = { ++ .upcall = bl_pipe_upcall, ++ .downcall = bl_pipe_downcall, ++ .destroy_msg = bl_pipe_destroy_msg, ++}; ++ ++bl_comm_t *bl_comm_global; ++ ++int ++nfsd_bl_start(void) ++{ ++ bl_comm_t *bl_comm = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ dprintk("%s: starting pipe\n", __func__); ++ if (bl_comm_global) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse 
rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ bl_comm = kzalloc(sizeof (*bl_comm), GFP_KERNEL); ++ if (!bl_comm) { ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ /* FIXME: rename to "spnfs_block" */ ++ bl_comm->pipe_dentry = rpc_mkpipe(nd.path.dentry, "pnfs_block", bl_comm, ++ &bl_upcall_ops, 0); ++ if (IS_ERR(bl_comm->pipe_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ mutex_init(&bl_comm->lock); ++ mutex_init(&bl_comm->pipe_lock); ++ init_waitqueue_head(&bl_comm->pipe_wq); ++ ++ bl_comm_global = bl_comm; ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(bl_comm); ++ return rc; ++} ++ ++void ++nfsd_bl_stop(void) ++{ ++ bl_comm_t *c = bl_comm_global; ++ ++ dprintk("%s: stopping pipe\n", __func__); ++ if (!c) ++ return; ++ rpc_unlink(c->pipe_dentry); ++ rpc_put_mount(); ++ bl_comm_global = NULL; ++ kfree(c); ++} ++ ++static ssize_t ++bl_pipe_upcall(struct file *file, struct rpc_pipe_msg *msg, char __user *dst, ++ size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied, ++ left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ ++ return mlen; ++} ++ ++static ssize_t ++bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ++{ ++ struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); ++ bl_comm_t *bc = (bl_comm_t *)rpci->private; ++ bl_comm_msg_t *im = &bc->msg; ++ int ret; ++ bl_comm_res_t *res; ++ ++ ++ if (mlen == 0) { ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ im->msg_res = NULL; ++ wake_up(&bc->pipe_wq); ++ return -EFAULT; ++ } ++ ++ if ((res = kmalloc(mlen, GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ if (copy_from_user(res, src, mlen)) { ++ kfree(res); ++ return -EFAULT; ++ } ++ ++ mutex_lock(&bc->pipe_lock); ++ ++ ret = mlen; ++ im->msg_status = res->res_status; ++ im->msg_res = res; ++ ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++ return ret; ++} ++ ++static void ++bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) ++{ ++ bl_comm_msg_t *im = msg->data; ++ bl_comm_t *bc = container_of(im, struct bl_comm, msg); ++ ++ if (msg->errno >= 0) ++ return; ++ ++ mutex_lock(&bc->pipe_lock); ++ im->msg_status = PNFS_BLOCK_FAILURE; ++ wake_up(&bc->pipe_wq); ++ mutex_unlock(&bc->pipe_lock); ++} ++ ++int ++bl_upcall(bl_comm_t *bc, bl_comm_msg_t *upmsg, bl_comm_res_t **res) ++{ ++ struct rpc_pipe_msg msg; ++ DECLARE_WAITQUEUE(wq, current); ++ int rval = 1; ++ bl_comm_msg_t *m = &bc->msg; ++ ++ if (bc == NULL) { ++ dprintk("%s: No pNFS block daemon available\n", __func__); ++ return 1; ++ } ++ ++ mutex_lock(&bc->lock); ++ mutex_lock(&bc->pipe_lock); ++ ++ memcpy(m, upmsg, sizeof (*m)); ++ ++ memset(&msg, 0, sizeof (msg)); ++ msg.data = m; ++ msg.len = sizeof (*m); ++ ++ add_wait_queue(&bc->pipe_wq, &wq); ++ rval = rpc_queue_upcall(bc->pipe_dentry->d_inode, &msg); ++ if (rval < 0) { ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ goto out; ++ } ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ mutex_unlock(&bc->pipe_lock); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ remove_wait_queue(&bc->pipe_wq, &wq); ++ mutex_lock(&bc->pipe_lock); ++ ++ if (m->msg_status == PNFS_BLOCK_SUCCESS) { ++ *res = m->msg_res; ++ rval = 0; ++ } else ++ rval = 1; ++ ++out: ++ mutex_unlock(&bc->pipe_lock); ++ mutex_unlock(&bc->lock); ++ return rval; ++} ++ ++static ssize_t ctl_write(struct file *file, const 
char __user *buf, size_t len, ++ loff_t *offset) ++{ ++ int cmd, ++ rc; ++ bl_comm_t *bc = bl_comm_global; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof (int))) ++ return -EFAULT; ++ switch (cmd) { ++ case PNFS_BLOCK_CTL_STOP: ++ msg.msg_type = PNFS_UPCALL_MSG_STOP; ++ (void) bl_upcall(bc, &msg, &res); ++ kfree(res); ++ nfsd_bl_stop(); ++ break; ++ ++ case PNFS_BLOCK_CTL_START: ++ rc = nfsd_bl_start(); ++ if (rc != 0) ++ return rc; ++ break; ++ ++ case PNFS_BLOCK_CTL_VERS: ++ msg.msg_type = PNFS_UPCALL_MSG_VERS; ++ msg.u.msg_vers = PNFS_UPCALL_VERS; ++ if (bl_upcall(bc, &msg, &res)) { ++ dprintk("%s: Failed to contact pNFS block daemon\n", ++ __func__); ++ return 0; ++ } ++ kfree(res); ++ break; ++ ++ default: ++ dprintk("%s: unknown ctl command %d\n", __func__, cmd); ++ break; ++ } ++ return len; ++} ++ ++static struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++ ++/* ++ * bl_init_proc -- set up proc interfaces ++ * ++ * Creating a pnfs_block directory isn't really required at this point ++ * since we've only got a single node in that directory. If the need for ++ * more nodes doesn't present itself shortly this code should revert ++ * to a single top level node. McNeal 11-Aug-2008. ++ */ ++int ++bl_init_proc(void) ++{ ++ struct proc_dir_entry *e; ++ ++ e = proc_mkdir("fs/pnfs_block", NULL); ++ if (!e) ++ return -ENOMEM; ++ ++ e = create_proc_entry("fs/pnfs_block/ctl", 0, NULL); ++ if (!e) ++ return -ENOMEM; ++ e->proc_fops = &ctl_ops; ++ ++ return 0; ++} ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.35.noarch/fs/nfsd/bl_ops.c.orig linux-2.6.35.noarch/fs/nfsd/bl_ops.c +--- linux-2.6.35.noarch/fs/nfsd/bl_ops.c.orig 2010-08-31 21:11:40.946170512 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/bl_ops.c 2010-08-31 21:11:40.946170512 -0400 +@@ -0,0 +1,1672 @@ ++/* ++ * bl_ops.c ++ * spNFS ++ * ++ * Created by Rick McNeal on 4/1/08. ++ * Copyright 2008 __MyCompanyName__. All rights reserved. ++ * ++ */ ++ ++/* ++ * Block layout operations. ++ * ++ * These functions, with the exception of pnfs_block_enabled, are assigned to ++ * the super block s_export_op structure. ++ */ ++#if defined(CONFIG_SPNFS_BLOCK) ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) ++ ++#define BL_LAYOUT_HASH_BITS 4 ++#define BL_LAYOUT_HASH_SIZE (1 << BL_LAYOUT_HASH_BITS) ++#define BL_LAYOUT_HASH_MASK (BL_LAYOUT_HASH_SIZE - 1) ++#define BL_LIST_REQ (sizeof (struct dm_ioctl) + 256) ++ ++#define bl_layout_hashval(id) \ ++ ((id) & BL_LAYOUT_HASH_MASK) ++ ++#define BLL_F_END(p) ((p)->bll_foff + (p)->bll_len) ++#define BLL_S_END(p) ((p)->bll_soff + (p)->bll_len) ++#define _2SECTS(v) ((v) >> 9) ++ ++#ifndef READ32 ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++(x) = (u64)ntohl(*p++) << 32; \ ++(x) |= ntohl(*p++); \ ++} while (0) ++#endif ++ ++ ++typedef enum {True, False} boolean_t; ++/* ---- block layoutget and commit structure ---- */ ++typedef struct bl_layout_rec { ++ struct list_head blr_hash, ++ blr_layouts; ++ dev_t blr_rdev; ++ struct inode *blr_inode; ++ int blr_recalled; // debug ++ u64 blr_orig_size, ++ blr_commit_size, ++ blr_ext_size; ++ spinlock_t blr_lock; // Protects blr_layouts ++} bl_layout_rec_t; ++ ++static struct list_head layout_hash; ++static struct list_head layout_hashtbl[BL_LAYOUT_HASH_SIZE]; ++static spinlock_t layout_hashtbl_lock; ++ ++/* ---- prototypes ---- */ ++static boolean_t device_slice(dev_t devid); ++static boolean_t device_dm(dev_t devid); ++static boolean_t layout_inode_add(struct inode *i, bl_layout_rec_t **); ++static bl_layout_rec_t *layout_inode_find(struct inode *i); ++static void layout_inode_del(struct inode *i); ++static char *map_state2name(enum pnfs_block_extent_state4 s); ++static pnfs_blocklayout_devinfo_t *bld_alloc(struct list_head *volume, int type); ++static void bld_free(pnfs_blocklayout_devinfo_t *bld); ++static pnfs_blocklayout_devinfo_t *bld_simple(struct list_head *volumes, ++ dev_t devid, int local_index); ++static pnfs_blocklayout_devinfo_t *bld_slice(struct list_head *volumes, ++ dev_t devid, int my_loc, int idx); ++static int layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg); ++struct list_head *layout_cache_iter(bl_layout_rec_t *r, ++ struct list_head *bl_possible, struct nfsd4_layout_seg *seg); ++static void layout_cache_merge(bl_layout_rec_t *r, struct list_head *h); ++static int layout_cache_update(bl_layout_rec_t *r, struct list_head *h); ++static void layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg); ++static void print_bll(pnfs_blocklayout_layout_t *b, char *); ++static inline boolean_t layout_cache_fill_from_list(bl_layout_rec_t *r, ++ struct list_head *h, struct nfsd4_layout_seg *seg); ++static inline void bll_collapse(bl_layout_rec_t *r, ++ pnfs_blocklayout_layout_t *c); ++static pnfs_blocklayout_layout_t *bll_alloc(u64 offset, u64 len, ++ enum bl_cache_state state, struct list_head *h); ++static pnfs_blocklayout_layout_t *bll_alloc_dup(pnfs_blocklayout_layout_t *b, ++ enum bl_cache_state c, struct list_head *h); ++static inline boolean_t layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s); ++static void extents_setup(struct fiemap_extent_info *fei); ++static void extents_count(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_get(struct fiemap_extent_info *fei, struct inode *i, ++ u64 foff, u64 len); ++static boolean_t extents_process(struct fiemap_extent_info *fei, ++ struct list_head *bl_candidates, struct nfsd4_layout_seg *, dev_t dev, ++ pnfs_blocklayout_layout_t *b); ++static void extents_cleanup(struct fiemap_extent_info *fei); ++ ++void ++nfsd_bl_init(void) ++{ ++ int i; ++ dprintk("%s loaded\n", 
__func__);
++
++ spin_lock_init(&layout_hashtbl_lock);
++ INIT_LIST_HEAD(&layout_hash);
++ for (i = 0; i < BL_LAYOUT_HASH_SIZE; i++)
++ INIT_LIST_HEAD(&layout_hashtbl[i]);
++ bl_init_proc();
++}
++
++/*
++ * pnfs_block_enabled -- check to see if this file system should be exported as
++ * block pnfs
++ */
++int
++pnfs_block_enabled(struct inode *inode, int ex_flags)
++{
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res = NULL;
++ static int bl_comm_once = 0;
++
++ dprintk("--> %s\n", __func__);
++ /*
++ * FIXME: Figure out a method to determine if this file system should
++ * be exported. The following areas need to be checked.
++ * (1) Validate that this file system was exported as a pNFS
++ * block-layout
++ * (2) Has there been successful communication with the
++ * volume daemon?
++ */
++ /* Check #1 */
++#ifdef notyet
++ if (!(ex_flags & NFSEXP_PNFS_BLOCK)) {
++ dprintk("%s: pnfs_block not set in export\n", __func__);
++ return 0;
++ }
++#endif
++
++ /* Check #2 */
++ if (!bl_comm_once) {
++ msg.msg_type = PNFS_UPCALL_MSG_VERS;
++ msg.u.msg_vers = PNFS_UPCALL_VERS;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("%s: Failed to contact pNFS block daemon\n",
++ __func__);
++ return 0;
++ }
++ if (msg.u.msg_vers != res->u.vers) {
++ dprintk("%s: vers mismatch, kernel != daemon\n",
++ __func__);
++ kfree(res);
++ return 0;
++ }
++ }
++ bl_comm_once = 1;
++
++ kfree(res);
++
++ dprintk("<-- %s okay\n", __func__);
++ return 1;
++}
++
++int
++bl_layout_type(struct super_block *sb)
++{
++ return LAYOUT_BLOCK_VOLUME;
++}
++
++int
++bl_getdeviceiter(struct super_block *sb,
++ u32 layout_type,
++ struct nfsd4_pnfs_dev_iter_res *res)
++{
++ res->gd_eof = 1;
++ if (res->gd_cookie)
++ return -ENOENT;
++ res->gd_devid = sb->s_dev;
++ res->gd_verf = 1;
++ res->gd_cookie = 1;
++ return 0;
++}
++
++static int
++bl_getdeviceinfo_slice(struct super_block *sb, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ pnfs_blocklayout_devinfo_t *bld_slice_p,
++ *bld_simple_p,
++ *bld;
++ int status = -EIO,
++ location = 0;
++ struct list_head volumes;
++
++ dprintk("--> %s\n", __func__);
++ INIT_LIST_HEAD(&volumes);
++
++ bld_simple_p = bld_simple(&volumes, devid->devid,
++ location++);
++ if (!bld_simple_p)
++ goto out;
++ bld_slice_p = bld_slice(&volumes, devid->devid, location++,
++ bld_simple_p->bld_index_loc);
++
++ if (!bld_slice_p)
++ goto out;
++
++ status = blocklayout_encode_devinfo(xdr, &volumes);
++
++out:
++ while (!list_empty(&volumes)) {
++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++ bld_list);
++ if (bld->bld_type == PNFS_BLOCK_VOLUME_SIMPLE)
++ kfree(bld->u.simple.bld_sig);
++ bld_free(bld);
++ }
++
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
++}
++
++static int
++bl_getdeviceinfo_dm(struct super_block *sb, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ pnfs_blocklayout_devinfo_t *bld = NULL;
++ int status = -EIO, // default to error
++ i,
++ location = 0;
++ struct list_head volumes;
++ bl_comm_msg_t msg;
++ bl_comm_res_t *res;
++
++ dprintk("--> %s\n", __func__);
++ INIT_LIST_HEAD(&volumes);
++
++ msg.msg_type = PNFS_UPCALL_MSG_DMGET;
++ msg.u.msg_dev = devid->devid;
++ if (bl_upcall(bl_comm_global, &msg, &res)) {
++ dprintk("%s: upcall for DMGET failed\n", __func__);
++ goto out;
++ }
++
++ /*
++ * Don't use bld_alloc() here. If used this will be the first volume
++ * type added to the list whereas the protocol requires it to be the
++ * last.
++ */
++ bld = kmalloc(sizeof (*bld), GFP_KERNEL);
++ if (!bld)
++ goto out;
++ memset(bld, 0, sizeof (*bld));
++ bld->bld_type = PNFS_BLOCK_VOLUME_STRIPE;
++ bld->u.stripe.bld_stripes = res->u.stripe.num_stripes;
++ bld->u.stripe.bld_chunk_size = res->u.stripe.stripe_size * 512LL;
++ dprintk("%s: stripes %d, chunk_size %Lu\n", __func__,
++ bld->u.stripe.bld_stripes, bld->u.stripe.bld_chunk_size / 512LL);
++
++ bld->u.stripe.bld_stripe_indexs = kmalloc(bld->u.stripe.bld_stripes *
++ sizeof (int), GFP_KERNEL);
++ if (!bld->u.stripe.bld_stripe_indexs)
++ goto out;
++
++ for (i = 0; i < bld->u.stripe.bld_stripes; i++) {
++ dev_t dev;
++ pnfs_blocklayout_devinfo_t *bldp;
++
++ dev = MKDEV(res->u.stripe.devs[i].major,
++ res->u.stripe.devs[i].minor);
++ if (dev == 0)
++ goto out;
++
++ bldp = bld_simple(&volumes, dev, location++);
++ if (!bldp) {
++ dprintk("%s: bld_simple failed\n", __func__);
++ goto out;
++ }
++ bldp = bld_slice(&volumes, dev, location++, bldp->bld_index_loc);
++
++ if (!bldp) {
++ dprintk("%s: bld_slice failed\n", __func__);
++ goto out;
++ }
++ bld->u.stripe.bld_stripe_indexs[i] = bldp->bld_index_loc;
++
++ }
++ list_add_tail(&bld->bld_list, &volumes);
++ status = blocklayout_encode_devinfo(xdr, &volumes);
++
++out:
++ while (!list_empty(&volumes)) {
++ bld = list_entry(volumes.next, pnfs_blocklayout_devinfo_t,
++ bld_list);
++ switch (bld->bld_type) {
++ case PNFS_BLOCK_VOLUME_SLICE:
++ case PNFS_BLOCK_VOLUME_CONCAT:
++ // No memory to release for these
++ break;
++ case PNFS_BLOCK_VOLUME_SIMPLE:
++ kfree(bld->u.simple.bld_sig);
++ break;
++ case PNFS_BLOCK_VOLUME_STRIPE:
++ kfree(bld->u.stripe.bld_stripe_indexs);
++ break;
++ }
++ bld_free(bld);
++ }
++ kfree(res);
++ dprintk("<-- %s (rval %d)\n", __func__, status);
++ return status;
++}
++
++/*
++ * bl_getdeviceinfo -- determine device tree for requested devid
++ */
++int
++bl_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr,
++ u32 layout_type,
++ const struct nfsd4_pnfs_deviceid *devid)
++{
++ if (device_slice(devid->devid) == True)
++ return bl_getdeviceinfo_slice(sb, xdr, devid);
++ else if (device_dm(devid->devid) == True)
++ return bl_getdeviceinfo_dm(sb, xdr, devid);
++ return -EINVAL;
++}
++
++enum nfsstat4
++bl_layoutget(struct inode *i, struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *arg,
++ struct nfsd4_pnfs_layoutget_res *res)
++{
++ pnfs_blocklayout_layout_t *b;
++ bl_layout_rec_t *r;
++ struct list_head bl_possible,
++ *bl_candidates = NULL;
++ boolean_t del_on_error = False;
++ int adj;
++ enum nfsstat4 nfserr = NFS4_OK;
++
++ dprintk("--> %s (inode=[0x%x:%lu], offset=%Lu, len=%Lu, iomode=%d)\n",
++ __func__, i->i_sb->s_dev, i->i_ino, _2SECTS(res->lg_seg.offset),
++ _2SECTS(res->lg_seg.length), res->lg_seg.iomode);
++
++ if (res->lg_seg.length == 0) {
++ printk("%s: request length of 0, error condition\n", __func__);
++ return NFS4ERR_BADLAYOUT;
++ }
++
++ /*
++ * Adjust the length as required per spec.
++ * - First case is where the length is set to (u64)-1. A cheap means to
++ * define the end of the file.
++ * - Second case is where the I/O mode is read-only, but the request is
++ * past the end of the file so the request needs to be trimmed.
++ */
++ if ((res->lg_seg.length == NFS4_MAX_UINT64) ||
++ (((res->lg_seg.offset + res->lg_seg.length) > i->i_size) &&
++ (res->lg_seg.iomode == IOMODE_READ)))
++ res->lg_seg.length = i->i_size - res->lg_seg.offset;
++
++ adj = (res->lg_seg.offset & 511) ? res->lg_seg.offset & 511 : 0;
++ res->lg_seg.offset -= adj;
++ res->lg_seg.length = (res->lg_seg.length + adj + 511) & ~511;
++
++ if (res->lg_seg.iomode != IOMODE_READ)
++ if (i->i_op->fallocate(i, FALLOC_FL_KEEP_SIZE,
++ res->lg_seg.offset, res->lg_seg.length))
++ return NFS4ERR_IO;
++
++ INIT_LIST_HEAD(&bl_possible);
++
++ if ((r = layout_inode_find(i)) == NULL) {
++ if (layout_inode_add(i, &r) == False) {
++ printk("%s: layout_inode_add failed\n", __func__);
++ return NFS4ERR_IO;
++ }
++ del_on_error = True;
++ }
++ BUG_ON(!r);
++
++ spin_lock(&r->blr_lock);
++
++ if (layout_cache_fill_from(r, &bl_possible, &res->lg_seg)) {
++ /*
++ * This will send a LAYOUTTRYLATER error to the client.
++ */
++ dprintk("%s: layout_cache_fill_from() failed\n", __func__);
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ res->lg_return_on_close = 1;
++ res->lg_seg.length = 0;
++
++ bl_candidates = layout_cache_iter(r, &bl_possible, &res->lg_seg);
++ if (!bl_candidates) {
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ layout_cache_merge(r, bl_candidates);
++ if (layout_cache_update(r, bl_candidates)) {
++ /* ---- Failed to allocate memory. ---- */
++ dprintk("%s: layout_cache_update() failed\n", __func__);
++ nfserr = NFS4ERR_LAYOUTTRYLATER;
++ goto layoutget_cleanup;
++ }
++
++ nfserr = blocklayout_encode_layout(xdr, bl_candidates);
++ if (nfserr)
++ dprintk("%s: layoutget xdr routine failed\n", __func__);
++
++layoutget_cleanup:
++ if (bl_candidates) {
++ while (!list_empty(bl_candidates)) {
++ b = list_entry(bl_candidates->next,
++ struct pnfs_blocklayout_layout, bll_list);
++ list_del(&b->bll_list);
++ kfree(b);
++ }
++ }
++
++ spin_unlock(&r->blr_lock);
++ if (unlikely(nfserr)) {
++ if (del_on_error == True)
++ layout_inode_del(i);
++ res->lg_seg.length = 0;
++ res->lg_seg.offset = 0;
++ }
++
++ dprintk("<-- %s (rval %u)\n", __func__, nfserr);
++ return nfserr;
++}
++
++/*
++ * bl_layoutcommit -- commit changes, especially size, to the file system
++ *
++ * Currently this routine isn't called and everything is handled within
++ * nfsd4_layoutcommit(). By not calling this routine the server doesn't
++ * handle a partial return, a set of extents, of the layout. The extents
++ * are decoded here, but nothing is done with them. If this routine is to
++ * be called the interface must change to pass the 'dentry' pointer such
++ * that notify_change() can be called.
++ */
++int
++bl_layoutcommit(struct inode *i,
++ const struct nfsd4_pnfs_layoutcommit_arg *args,
++ struct nfsd4_pnfs_layoutcommit_res *res)
++{
++ bl_layout_rec_t *r;
++ int status = 0;
++ u64 lw_plus;
++
++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino);
++ r = layout_inode_find(i);
++ if (r) {
++ lw_plus = args->lc_last_wr + 1;
++ if (args->lc_newoffset) {
++ dprintk(" lc_last_wr %Lu\n", lw_plus);
++ if (r->blr_orig_size < lw_plus) {
++ r->blr_orig_size = lw_plus;
++ res->lc_size_chg = 1;
++ res->lc_newsize = lw_plus;
++ }
++ }
++
++ if (args->lc_up_len) {
++ int extents,
++ i;
++ struct pnfs_blocklayout_layout *b;
++ __be32 *p = args->lc_up_layout;
++
++ /*
++ * Client is returning a set of extents which
++ * should/could be used to update the file system.
++ * See section 2.3.2 in draft-ietf-nfsv4-pnfs-block-08 ++ */ ++ READ32(extents); ++ dprintk(" Client returning %d extents: data size %d\n", ++ extents, args->lc_up_len); ++ b = kmalloc(sizeof (struct pnfs_blocklayout_layout) * ++ extents, GFP_KERNEL); ++ if (b) { ++ for (i = 0; i < extents; i++) { ++ READ64(b[i].bll_vol_id.sbid); ++ READ64(b[i].bll_vol_id.devid); ++ READ64(b[i].bll_foff); ++ READ64(b[i].bll_len); ++ READ64(b[i].bll_soff); ++ READ32(b[i].bll_es); ++ dprintk(" %d: foff %Lu, len %Lu, soff %Lu " ++ "state %s\n", ++ i, _2SECTS(b[i].bll_foff), ++ _2SECTS(b[i].bll_len), ++ _2SECTS(b[i].bll_soff), ++ map_state2name(b[i].bll_es)); ++ } ++ kfree(b); ++ } else { ++ status = -ENOMEM; ++ } ++ } ++ } else ++ dprintk("%s: Unexpected commit to inode %p\n", __func__, i); ++ ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutreturn(struct inode *i, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ int status = 0; ++ bl_layout_rec_t *r; ++ ++ dprintk("--> %s (ino [0x%x:%lu])\n", __func__, i->i_sb->s_dev, i->i_ino); ++ ++ r = layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ layout_cache_del(r, &args->lr_seg); ++ spin_unlock(&r->blr_lock); ++ dprintk(" ext_size %Lu, i_size %Lu, orig_size %Lu\n", ++ r->blr_ext_size, i->i_size, r->blr_orig_size); ++ } ++ ++ layout_inode_del(i); ++ dprintk("<-- %s (rval %d)\n", __func__, status); ++ return status; ++} ++ ++int ++bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ bl_layout_rec_t *r; ++ pnfs_blocklayout_layout_t *b; ++ u64 adj; ++ ++ dprintk("--> %s\n", __func__); ++ BUG_ON(!len); ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk(" recalling layout [0x%x:%lu], %Lu:%Lu\n", ++ inode->i_sb->s_dev, inode->i_ino, ++ _2SECTS(offset), _2SECTS(len)); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* ++ * XXX figure out how to get a sb since there's no ++ * inode ptr ++ */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++restart: ++ r = layout_inode_find(inode); ++ if (r && len && !r->blr_recalled) { ++ spin_lock(&r->blr_lock); ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (!r->blr_recalled && !b->bll_recalled && ++ (offset >= b->bll_foff) && (offset < BLL_F_END(b))) { ++ b->bll_recalled = 1; ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_BLOCK_VOLUME; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = 0; ++ lr.cbl_seg.length = NFS4_MAX_UINT64; ++ r->blr_recalled = 1; ++ dprintk(" FULL LAYOUTRECALL\n"); ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ ++ /* ++ * Currently there are only two cases where the ++ * layout is being returned. ++ * (1) Someone is issuing a NFS_WRITE operation ++ * to this layout. ++ * (2) The file has been truncated which means ++ * the layout is immediately made invalid. ++ * In both cases the client must write any ++ * uncommitted modifications to the server via ++ * NFS_WRITE. ++ */ ++ lr.cbl_layoutchanged = 1; ++ ++ /* ++ * Need to drop the lock because we'll get a ++ * layoutreturn which will block waiting for ++ * the lock. The request will come in on the ++ * same thread which will cause a deadlock. 
++ */ ++ spin_unlock(&r->blr_lock); ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ adj = MIN(b->bll_len - (offset - b->bll_foff), ++ len); ++ offset += adj; ++ len -= adj; ++ if (!len) { ++ spin_lock(&r->blr_lock); ++ break; ++ } ++ /* ++ * Since layoutreturn will have been called we ++ * can't assume blr_layouts is still valid, ++ * so restart. ++ */ ++ goto restart; ++ } ++ } ++ spin_unlock(&r->blr_lock); ++ } ++ ++ dprintk("<-- %s\n", __func__); ++ return 0; ++} ++ ++/* ++ * []------------------------------------------------------------------[] ++ * | Support functions from here on down. | ++ * []------------------------------------------------------------------[] ++ */ ++ ++/* ++ * bld_simple -- given a dev_t build a simple volume structure ++ * ++ * Simple volume contains the device signature and offset to that data in ++ * the storage volume. ++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_simple(struct list_head *volumes, dev_t devid, int local_index) ++{ ++ pnfs_blocklayout_devinfo_t *bld = NULL; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res = NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSIG; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("%s: Failed to get signature information\n", __func__); ++ goto error; ++ } ++ ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SIMPLE); ++ if (!bld) ++ return NULL; ++ ++ bld->u.simple.bld_offset = (res->u.sig.sector * 512LL) + res->u.sig.offset; ++ bld->u.simple.bld_sig_len = res->u.sig.len; ++ bld->u.simple.bld_sig = kmalloc(res->u.sig.len, GFP_KERNEL); ++ if (!bld->u.simple.bld_sig) ++ goto error; ++ ++ memcpy(bld->u.simple.bld_sig, res->u.sig.sig, res->u.sig.len); ++ kfree(res); ++ return bld; ++ ++error: ++ if (bld) ++ bld_free(bld); ++ if (res) ++ kfree(res); ++ dprintk("%s: error in bld_simple\n", __func__); ++ return NULL; ++} ++ ++/* ++ * bld_slice -- given a dev_t build a slice volume structure ++ * ++ * A slice volume contains the length of the slice/partition and its offset ++ * from the beginning of the storage volume. There's also a reference to ++ * the "simple" volume which contains this slice. ++ */ ++static pnfs_blocklayout_devinfo_t * ++bld_slice(struct list_head *volumes, dev_t devid, int my_loc, int simple_loc) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ dprintk("--> %s\n", __func__); ++ bld = bld_alloc(volumes, PNFS_BLOCK_VOLUME_SLICE); ++ if (!bld) ++ return NULL; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_GETSLICE; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Upcall to get slice info failed\n"); ++ bld_free(bld); ++ return NULL; ++ } ++ ++ bld->bld_devid.devid = devid; ++ bld->bld_index_loc = my_loc; ++ bld->u.slice.bld_start = res->u.slice.start * 512LL; ++ bld->u.slice.bld_len = res->u.slice.length * 512LL; ++ bld->u.slice.bld_index = simple_loc; ++ ++ dprintk("%s: start %Lu, len %Lu\n", __func__, ++ bld->u.slice.bld_start / 512LL, bld->u.slice.bld_len / 512LL); ++ ++ kfree(res); ++ dprintk("<-- %s (rval %p)\n", __func__, bld); ++ return bld; ++} ++ ++static int ++layout_cache_fill_from(bl_layout_rec_t *r, struct list_head *h, ++ struct nfsd4_layout_seg *seg) ++{ ++ pnfs_blocklayout_layout_t *n; ++ ++ dprintk("--> %s\n", __func__); ++ ++ if (!list_empty(&r->blr_layouts)) ++ if (layout_cache_fill_from_list(r, h, seg) == False) ++ return -EIO; ++ ++ /* ++ * This deals with two conditions. 
++ * (1) When blr_layouts is empty we need to create the first entry
++ * (2) When the range requested falls past the end of any current
++ * layout the residual must be taken care of.
++ */
++ if (seg->length) {
++ n = bll_alloc(seg->offset, seg->length, BLOCK_LAYOUT_NEW, h);
++ if (!n)
++ return -ENOMEM;
++ dprintk(" remaining at %Lu, len %Lu\n", _2SECTS(n->bll_foff),
++ _2SECTS(n->bll_len));
++ }
++
++ dprintk("<-- %s\n", __func__);
++ return 0;
++}
++
++struct list_head *
++layout_cache_iter(bl_layout_rec_t *r, struct list_head *bl_possible,
++ struct nfsd4_layout_seg *seg)
++{
++ pnfs_blocklayout_layout_t *b,
++ *n = NULL;
++ struct list_head *bl_candidates = NULL;
++ struct fiemap_extent_info fei;
++ struct inode *i;
++ dev_t dev;
++
++ dev = r->blr_rdev;
++ i = r->blr_inode;
++
++ dprintk("--> %s\n", __func__);
++ bl_candidates = kmalloc(sizeof (*bl_candidates), GFP_KERNEL);
++ if (!bl_candidates)
++ return NULL;
++ INIT_LIST_HEAD(bl_candidates);
++ extents_setup(&fei);
++
++ list_for_each_entry(b, bl_possible, bll_list) {
++ if (b->bll_cache_state == BLOCK_LAYOUT_NEW) {
++
++ extents_count(&fei, i, b->bll_foff, b->bll_len);
++ if (fei.fi_extents_mapped) {
++
++ /*
++ * Common case here. Got a range which has
++ * extents. Now get those extents and process
++ * them into pNFS extents.
++ */
++ if (extents_get(&fei, i, b->bll_foff,
++ b->bll_len) == False)
++ goto cleanup;
++ if (extents_process(&fei, bl_candidates,
++ seg, dev, b) == False)
++ goto cleanup;
++ extents_cleanup(&fei);
++
++ } else if (seg->iomode == IOMODE_READ) {
++
++ /*
++ * Found a hole in a file while reading. No
++ * problem, just create a pNFS extent for the
++ * range and let the client know there's no
++ * backing store.
++ */
++ n = bll_alloc(b->bll_foff, b->bll_len,
++ BLOCK_LAYOUT_NEW, bl_candidates);
++ n->bll_es = PNFS_BLOCK_NONE_DATA;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = dev;
++ seg->length += b->bll_len;
++ } else {
++
++ /*
++ * There's a problem here. Since the iomode
++ * is read/write fallocate should have allocated
++ * any necessary storage for the given range.
++ */
++ dprintk(" Extent count for RW is 0\n");
++ goto cleanup;
++ }
++
++ } else {
++ n = bll_alloc_dup(b, b->bll_cache_state, bl_candidates);
++ seg->length += n->bll_len;
++ }
++
++ if (r->blr_ext_size < (b->bll_foff + b->bll_len))
++ r->blr_ext_size = b->bll_foff + b->bll_len;
++ }
++
++ while (!list_empty(bl_possible)) {
++ b = list_entry(bl_possible->next,
++ struct pnfs_blocklayout_layout, bll_list);
++ list_del(&b->bll_list);
++ kfree(b);
++ }
++
++ b = list_first_entry(bl_candidates, struct pnfs_blocklayout_layout,
++ bll_list);
++ seg->offset = b->bll_foff;
++ dprintk("<-- %s okay\n", __func__);
++ return bl_candidates;
++
++cleanup:
++ extents_cleanup(&fei);
++ if (bl_candidates)
++ kfree(bl_candidates);
++ dprintk("<-- %s, error occurred\n", __func__);
++ return NULL;
++}
++
++/*
++ * layout_cache_merge -- collapse layouts which make up a contiguous range.
++ */
++static void
++layout_cache_merge(bl_layout_rec_t *r, struct list_head *h)
++{
++ pnfs_blocklayout_layout_t *b,
++ *p;
++
++ dprintk("--> %s\n", __func__);
++restart:
++ p = NULL;
++ list_for_each_entry(b, h, bll_list) {
++ if (p && (BLL_S_END(p) == b->bll_soff) &&
++ (p->bll_es == b->bll_es) &&
++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) {
++ /*
++ * We've got a candidate.
++ */ ++#ifdef too_verbose ++ dprintk(" merge %Lu(f):%Lu(l):%Lu(s) into %Lu(f):%Lu(l):%Lu(s)\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff), ++ _2SECTS(p->bll_foff), _2SECTS(p->bll_len), ++ _2SECTS(b->bll_soff)); ++#endif ++ ++ if (p->bll_cache_state == BLOCK_LAYOUT_CACHE) ++ p->bll_cache_state = BLOCK_LAYOUT_UPDATE; ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else if (p && (BLL_F_END(p) == b->bll_foff) && ++ (p->bll_es == b->bll_es) && ++ (b->bll_es == PNFS_BLOCK_NONE_DATA)) { ++ p->bll_len += b->bll_len; ++ list_del(&b->bll_list); ++ kfree(b); ++ goto restart; ++ } else ++ p = b; ++ } ++ dprintk("<-- %s\n", __func__); ++} ++ ++static int ++layout_cache_update(bl_layout_rec_t *r, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *b, ++ *c, ++ *n; ++ boolean_t status = 0; ++ ++ dprintk("--> %s\n", __func__); ++ if (list_empty(&r->blr_layouts)) { ++ /* ---- Just add entries and return ---- */ ++ dprintk(" cache empty for inode 0x%x:%ld\n", r->blr_rdev, ++ r->blr_inode->i_ino); ++ list_for_each_entry(b, h, bll_list) { ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, ++ &r->blr_layouts); ++ if (!c) { ++ status = -ENOMEM; ++ break; ++ } ++ dprintk(" adding %Lu(f):%Lu(l):%Lu(s):%d\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), c->bll_es); ++ } ++ return status; ++ } ++ ++ list_for_each_entry(b, h, bll_list) { ++ BUG_ON(!b->bll_vol_id.devid); ++ if (b->bll_cache_state == BLOCK_LAYOUT_UPDATE) { ++ boolean_t found = False; ++ list_for_each_entry(c, &r->blr_layouts, bll_list) { ++ if ((b->bll_soff >= c->bll_soff) && ++ (b->bll_soff < BLL_S_END(c)) && ++ (b->bll_es != PNFS_BLOCK_NONE_DATA)) { ++ u64 u; ++ ++ if ((b->bll_foff < c->bll_foff) || ++ (b->bll_foff > BLL_F_END(c))) ++ BUG(); ++ ++ u = BLL_S_END(b) - BLL_S_END(c); ++ /* ++ * The updated cache entry has to be ++ * different than the current. ++ * Otherwise the cache state for 'b' ++ * should be BLOCK_LAYOUT_CACHE. ++ */ ++ BUG_ON(BLL_S_END(b) < BLL_S_END(c)); ++ ++ dprintk(" " ++ "updating %Lu(f):%Lu(l):%Lu(s) to len %Lu\n", ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len), ++ _2SECTS(c->bll_soff), ++ _2SECTS(c->bll_len + u)); ++ c->bll_len += u; ++ bll_collapse(r, c); ++ found = True; ++ break; ++ } ++ } ++ ++ if (found == False) { ++ dprintk(" ERROR Expected to find" ++ " %Lu(f):%Lu(l):%Lu(s), but didn't\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_for_each_entry(c, &r->blr_layouts, bll_list) ++ print_bll(c, "Cached"); ++ BUG(); ++ } ++ } else if (b->bll_cache_state == BLOCK_LAYOUT_NEW) { ++ ++ c = list_first_entry(&r->blr_layouts, ++ struct pnfs_blocklayout_layout, bll_list); ++ if (b->bll_foff < c->bll_foff) { ++ /* ++ * Special case where new entry is before ++ * first cached entry. ++ */ ++ c = bll_alloc_dup(b, BLOCK_LAYOUT_CACHE, NULL); ++ list_add(&c->bll_list, &r->blr_layouts); ++ dprintk(" new entry at head of list at %Lu, " ++ "len %Lu\n", ++ _2SECTS(c->bll_foff), _2SECTS(c->bll_len)); ++ } else { ++ list_for_each_entry(c, &r->blr_layouts, ++ bll_list) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, ++ bll_list); ++ /* ++ * This is ugly, but can't think of ++ * another way to examine this case. ++ * Consider the following. 
Need to ++ * add an entry which starts at 40 ++ * and the cache has the following ++ * entries: ++ * Start Length ++ * 10 5 ++ * 30 5 ++ * 50 5 ++ * So, need to look and see if the new ++ * entry starts after the current ++ * cache, but before the next one. ++ * There's a catch in that the next ++ * entry might not be valid as it's ++ * really just a pointer to the list ++ * head. ++ */ ++ if (((b->bll_foff >= ++ BLL_F_END(c)) && ++ (c->bll_list.next == &r->blr_layouts)) || ++ ((b->bll_foff >= ++ BLL_F_END(c)) && ++ (b->bll_foff < n->bll_foff))) { ++ ++ n = bll_alloc_dup(b, ++ BLOCK_LAYOUT_CACHE, NULL); ++ dprintk(" adding new %Lu:%Lu" ++ " after %Lu:%Lu\n", ++ _2SECTS(n->bll_foff), ++ _2SECTS(n->bll_len), ++ _2SECTS(c->bll_foff), ++ _2SECTS(c->bll_len)); ++ list_add(&n->bll_list, ++ &c->bll_list); ++ break; ++ } ++ } ++ } ++ } ++ } ++ dprintk("<-- %s\n", __func__); ++ return status; ++} ++ ++static void ++layout_cache_del(bl_layout_rec_t *r, const struct nfsd4_layout_seg *seg_in) ++{ ++ struct pnfs_blocklayout_layout *b, ++ *n; ++ u64 len; ++ struct nfsd4_layout_seg seg = *seg_in; ++ ++ dprintk("--> %s\n", __func__); ++ if (seg.length == NFS4_MAX_UINT64) { ++ r->blr_recalled = 0; ++ dprintk(" Fast return of all layouts\n"); ++ while (!list_empty(&r->blr_layouts)) { ++ b = list_entry(r->blr_layouts.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ dprintk(" foff %Lu, len %Lu, soff %Lu\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len), ++ _2SECTS(b->bll_soff)); ++ list_del(&b->bll_list); ++ kfree(b); ++ } ++ dprintk("<-- %s\n", __func__); ++ return; ++ } ++ ++restart: ++ list_for_each_entry(b, &r->blr_layouts, bll_list) { ++ if (seg.offset == b->bll_foff) { ++ /* ++ * This handle the following three cases: ++ * (1) return layout matches entire cache layout ++ * (2) return layout matches beginning portion of cache ++ * (3) return layout matches entire cache layout and ++ * into next entry. Varies from #1 in end case. ++ */ ++ dprintk(" match on offsets, %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length)); ++ len = MIN(seg.length, b->bll_len); ++ b->bll_foff += len; ++ b->bll_soff += len; ++ b->bll_len -= len; ++ seg.length -= len; ++ seg.offset += len; ++ if (!b->bll_len) { ++ list_del(&b->bll_list); ++ kfree(b); ++ dprintk(" removing cache line\n"); ++ if (!seg.length) { ++ dprintk(" also finished\n"); ++ goto complete; ++ } ++ /* ++ * Since 'b' was freed we can't continue at the ++ * next entry which is referenced as ++ * b->bll_list.next by the list_for_each_entry ++ * macro. Need to restart the loop. ++ * TODO: Think about creating a dummy 'b' which ++ * would keep list_for_each_entry() happy. ++ */ ++ goto restart; ++ } ++ if (!seg.length) { ++ dprintk(" finished, but cache line not" ++ "empty\n"); ++ goto complete; ++ } ++ } else if ((seg.offset >= b->bll_foff) && ++ (seg.offset < BLL_F_END(b))) { ++ /* ++ * layout being returned is within this cache line. ++ */ ++ dprintk(" layout %Lu:%Lu within cache line %Lu:%Lu\n", ++ _2SECTS(seg.offset), _2SECTS(seg.length), ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len)); ++ BUG_ON(!seg.length); ++ if ((seg.offset + seg.length) >= BLL_F_END(b)) { ++ /* ++ * Layout returned starts in the middle of ++ * cache entry and just need to trim back ++ * cache to shorter length. 
++ */
++ dprintk(" trim back cache line\n");
++ len = seg.offset - b->bll_foff;
++ seg.offset += b->bll_len - len;
++ seg.length -= b->bll_len - len;
++ b->bll_len = len;
++ if (!seg.length)
++ return;
++ } else {
++ /*
++ * Need to split current cache layout because
++ * chunk is being removed from the middle.
++ */
++ dprintk(" split cache line\n");
++ len = seg.offset + seg.length;
++ n = bll_alloc(len,
++ (b->bll_foff + b->bll_len) - len,
++ BLOCK_LAYOUT_CACHE, NULL);
++ n->bll_soff = b->bll_soff + len;
++ list_add(&n->bll_list, &b->bll_list);
++ b->bll_len = seg.offset - b->bll_foff;
++ return;
++ }
++ }
++ }
++complete:
++ if (list_empty(&r->blr_layouts))
++ r->blr_recalled = 0;
++ dprintk("<-- %s\n", __func__);
++}
++
++/*
++ * layout_cache_fill_from_list -- fills from cache list
++ *
++ * NOTE: This routine was only separated out from layout_cache_fill_from()
++ * to reduce the indentation level which makes the code easier to read.
++ */
++static inline boolean_t
++layout_cache_fill_from_list(bl_layout_rec_t *r, struct list_head *h,
++ struct nfsd4_layout_seg *seg)
++{
++ pnfs_blocklayout_layout_t *b,
++ *n;
++ enum pnfs_block_extent_state4 s;
++
++ list_for_each_entry(b, &r->blr_layouts, bll_list) {
++ if (seg->offset < b->bll_foff) {
++ n = bll_alloc(seg->offset,
++ MIN(seg->length, b->bll_foff - seg->offset),
++ BLOCK_LAYOUT_NEW, NULL);
++ if (!n)
++ return False;
++
++ list_add(&n->bll_list, h->prev);
++ dprintk(" new: %Lu:%Lu, added before %Lu:%Lu\n",
++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len));
++ seg->offset += n->bll_len;
++ seg->length -= n->bll_len;
++ if (!seg->length)
++ break;
++ }
++
++ if ((seg->offset >= b->bll_foff) &&
++ (seg->offset < BLL_F_END(b))) {
++ if (layout_conflict(b, seg->iomode, &s) == False) {
++ dprintk(" CONFLICT FOUND: "
++ "%Lu(f):%Lu(l):%Lu(s) state %d, iomode %d\n",
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff), b->bll_es,
++ seg->iomode);
++ return False;
++ }
++ n = bll_alloc(seg->offset,
++ MIN(seg->length, BLL_F_END(b) - seg->offset),
++ BLOCK_LAYOUT_CACHE, h);
++ if (!n)
++ return False;
++
++ dprintk(" CACHE hit: Found %Lu(f):%Lu(l): "
++ "in %Lu(f):%Lu(l):%Lu(s):%d\n",
++ _2SECTS(n->bll_foff), _2SECTS(n->bll_len),
++ _2SECTS(b->bll_foff), _2SECTS(b->bll_len),
++ _2SECTS(b->bll_soff), b->bll_es);
++ n->bll_soff = b->bll_soff + seg->offset - b->bll_foff;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = b->bll_vol_id.devid;
++ n->bll_es = s;
++ seg->offset += n->bll_len;
++ seg->length -= n->bll_len;
++ if (!seg->length)
++ break;
++ }
++ }
++ return True;
++}
++
++static u64
++bll_alloc_holey(struct list_head *bl_candidates, u64 offset, u64 length,
++ dev_t dev)
++{
++ pnfs_blocklayout_layout_t *n;
++
++ n = bll_alloc(offset, length, BLOCK_LAYOUT_NEW, bl_candidates);
++ if (!n)
++ return 0;
++ n->bll_es = PNFS_BLOCK_NONE_DATA;
++ n->bll_vol_id.sbid = 0;
++ n->bll_vol_id.devid = dev;
++
++ return n->bll_len;
++}
++
++static void
++extents_setup(struct fiemap_extent_info *fei)
++{
++ fei->fi_extents_start = NULL;
++}
++
++/*
++ * extents_count -- Determine the number of extents for a given range.
++ *
++ * No need to call set_fs() here because the function
++ * doesn't use copy_to_user() if it's only counting
++ * the number of extents needed.
++ */
++static void
++extents_count(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++ dprintk(" Need fiemap of %Ld:%Ld\n", _2SECTS(foff), _2SECTS(len));
++ fei->fi_flags = FIEMAP_FLAG_SYNC;
++ fei->fi_extents_max = 0;
++ fei->fi_extents_start = NULL;
++ fei->fi_extents_mapped = 0;
++ i->i_op->fiemap(i, fei, foff, len + (1 << i->i_sb->s_blocksize_bits) - 1);
++}
++
++/*
++ * extents_get -- Get list of extents for range
++ *
++ * extents_count() must have been called before this routine such that
++ * fi_extents_mapped is known.
++ */
++static boolean_t
++extents_get(struct fiemap_extent_info *fei, struct inode *i, u64 foff, u64 len)
++{
++ int m_space,
++ rval;
++ struct fiemap_extent *fe;
++ mm_segment_t old_fs = get_fs();
++
++ /*
++ * Now malloc the correct amount of space
++ * needed. It's possible for the file to have changed
++ * between calls which would require more space for
++ * the extents. If that occurs the last extent will
++ * not have FIEMAP_EXTENT_LAST set and the error will
++ * be caught in extents_process().
++ */
++ m_space = fei->fi_extents_mapped * sizeof (struct fiemap_extent);
++ fe = kmalloc(m_space, GFP_KERNEL);
++ if (!fe)
++ return False;
++ memset(fe, 0, m_space);
++
++ fei->fi_extents_max = fei->fi_extents_mapped;
++ fei->fi_extents_mapped = 0;
++ fei->fi_extents_start = fe;
++
++ set_fs(KERNEL_DS);
++ rval = i->i_op->fiemap(i, fei, foff, len +
++ (1 << i->i_sb->s_blocksize_bits) - 1);
++ set_fs(old_fs);
++
++ if (rval || !fei->fi_extents_mapped) {
++ dprintk(" No extents. Wanted %d, got %d\n",
++ fei->fi_extents_max, fei->fi_extents_mapped);
++ kfree(fe);
++ fei->fi_extents_start = NULL;
++ return False;
++ } else
++ return True;
++}
++
++/*
++ * extents_process -- runs through the extents returned from the file system
++ * and creates block layout entries.
++ */
++static boolean_t
++extents_process(struct fiemap_extent_info *fei, struct list_head *bl_candidates,
++ struct nfsd4_layout_seg *seg, dev_t dev, pnfs_blocklayout_layout_t *b)
++{
++ struct fiemap_extent *fep,
++ *fep_last = NULL;
++ int i;
++ pnfs_blocklayout_layout_t *n;
++ u64 last_end,
++ rval;
++
++ dprintk("--> %s\n", __func__);
++ for (fep = fei->fi_extents_start, i = 0; i < fei->fi_extents_mapped;
++ i++, fep++) {
++
++ BUG_ON(!fep->fe_physical);
++ /*
++ * Deal with corner cases of hole-y files.
++ */
++ if (fep_last && ((fep_last->fe_logical + fep_last->fe_length) !=
++ fep->fe_logical)) {
++
++ /*
++ * If the last extent doesn't end logically
++ * at the beginning of the current one we've got
++ * a hole and need to create a pNFS extent.
++ */
++ dprintk(" Got a hole at %Ld:%Ld \n",
++ _2SECTS(fep_last->fe_logical),
++ _2SECTS(fep_last->fe_length));
++ last_end = fep_last->fe_logical + fep_last->fe_length;
++ rval = bll_alloc_holey(bl_candidates, last_end,
++ fep->fe_logical - last_end, dev);
++ if (!rval)
++ return False;
++ seg->length += rval;
++ }
++
++ n = bll_alloc(fep->fe_logical, fep->fe_length,
++ BLOCK_LAYOUT_NEW, bl_candidates);
++ if (unlikely(n == NULL)) {
++ dprintk("%s: bll_alloc failed\n", __func__);
++ return False;
++ }
++
++ n->bll_soff = fep->fe_physical;
++ n->bll_es = seg->iomode == IOMODE_READ ?
++ PNFS_BLOCK_READ_DATA : PNFS_BLOCK_READWRITE_DATA; ++ n->bll_vol_id.sbid = 0; ++ n->bll_vol_id.devid = dev; ++ seg->length += fep->fe_length; ++ print_bll(n, "New extent"); ++ fep_last = fep; ++ } ++ dprintk("<-- %s (i=%d)\n", __func__, i); ++ ++ return True; ++} ++ ++static void ++extents_cleanup(struct fiemap_extent_info *fei) ++{ ++ if (fei->fi_extents_start) { ++ kfree(fei->fi_extents_start); ++ fei->fi_extents_start = NULL; ++ } ++} ++ ++/* ++ * device_slice -- check to see if device is a slice or DM ++ */ ++static boolean_t ++device_slice(dev_t devid) ++{ ++ struct block_device *bd = open_by_devnum(devid, FMODE_READ); ++ boolean_t rval = False; ++ ++ if (bd) { ++ if (bd->bd_disk->minors > 1) ++ rval = True; ++ blkdev_put(bd, FMODE_READ); ++ } ++ return rval; ++} ++ ++/* ++ * device_dm -- check to see if device is a Device Mapper volume. ++ * ++ * Returns 1 for DM or 0 if not ++ */ ++static boolean_t ++device_dm(dev_t devid) ++{ ++ boolean_t rval = False; ++ bl_comm_msg_t msg; ++ bl_comm_res_t *res; ++ ++ msg.msg_type = PNFS_UPCALL_MSG_DMCHK; ++ msg.u.msg_dev = devid; ++ if (bl_upcall(bl_comm_global, &msg, &res)) { ++ dprintk("Failed upcall to check on DM status\n"); ++ } else if (res->u.dm_vol) { ++ rval = True; ++ dprintk("Device is DM volume\n"); ++ } else ++ dprintk("Device is not DM volume\n"); ++ kfree(res); ++ ++ return rval; ++} ++ ++static boolean_t ++layout_inode_add(struct inode *i, bl_layout_rec_t **p) ++{ ++ bl_layout_rec_t *r = NULL; ++ ++ if (!i->i_op->fiemap || !i->i_op->fallocate) { ++ printk("pNFS: file system doesn't support required fiemap or" ++ "fallocate methods\n"); ++ return False; ++ } ++ ++ r = kmalloc(sizeof (*r), GFP_KERNEL); ++ if (!r) ++ goto error; ++ ++ r->blr_rdev = i->i_sb->s_dev; ++ r->blr_inode = i; ++ r->blr_orig_size = i->i_size; ++ r->blr_ext_size = 0; ++ r->blr_recalled = 0; ++ INIT_LIST_HEAD(&r->blr_layouts); ++ spin_lock_init(&r->blr_lock); ++ spin_lock(&layout_hashtbl_lock); ++ list_add_tail(&r->blr_hash, &layout_hash); ++ spin_unlock(&layout_hashtbl_lock); ++ *p = r; ++ return True; ++ ++error: ++ if (r) ++ kfree(r); ++ return False; ++} ++ ++static bl_layout_rec_t * ++__layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ if (!list_empty(&layout_hash)) { ++ list_for_each_entry(r, &layout_hash, blr_hash) { ++ if ((r->blr_inode->i_ino == i->i_ino) && ++ (r->blr_rdev == i->i_sb->s_dev)) { ++ return r; ++ } ++ } ++ } ++ return NULL; ++} ++ ++static bl_layout_rec_t * ++layout_inode_find(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ spin_unlock(&layout_hashtbl_lock); ++ ++ return r; ++} ++ ++static void ++layout_inode_del(struct inode *i) ++{ ++ bl_layout_rec_t *r; ++ ++ spin_lock(&layout_hashtbl_lock); ++ r = __layout_inode_find(i); ++ if (r) { ++ spin_lock(&r->blr_lock); ++ if (list_empty(&r->blr_layouts)) { ++ list_del(&r->blr_hash); ++ spin_unlock(&r->blr_lock); ++ kfree(r); ++ } else { ++ spin_unlock(&r->blr_lock); ++ } ++ } else { ++ dprintk("%s: failed to find inode [0x%x:%lu] in table for delete\n", ++ __func__, i->i_sb->s_dev, i->i_ino); ++ } ++ spin_unlock(&layout_hashtbl_lock); ++} ++ ++/* ++ * map_state2name -- converts state in ascii string. ++ * ++ * Used for debug messages only. 
++ */ ++static char * ++map_state2name(enum pnfs_block_extent_state4 s) ++{ ++ switch (s) { ++ case PNFS_BLOCK_READWRITE_DATA: return " RW"; ++ case PNFS_BLOCK_READ_DATA: return " RO"; ++ case PNFS_BLOCK_INVALID_DATA: return "INVALID"; ++ case PNFS_BLOCK_NONE_DATA: return " NONE"; ++ default: ++ BUG(); ++ } ++} ++ ++static pnfs_blocklayout_devinfo_t * ++bld_alloc(struct list_head *volumes, int type) ++{ ++ pnfs_blocklayout_devinfo_t *bld; ++ ++ bld = kmalloc(sizeof (*bld), GFP_KERNEL); ++ if (!bld) ++ return NULL; ++ ++ memset(bld, 0, sizeof (*bld)); ++ bld->bld_type = type; ++ list_add_tail(&bld->bld_list, volumes); ++ ++ return bld; ++} ++ ++static void ++bld_free(pnfs_blocklayout_devinfo_t *bld) ++{ ++ list_del(&bld->bld_list); ++ kfree(bld); ++} ++ ++static void ++print_bll(pnfs_blocklayout_layout_t *b, char *text) ++{ ++ dprintk(" BLL: %s\n", text); ++ dprintk(" foff %Lu, soff %Lu, len %Lu, state %s\n", ++ _2SECTS(b->bll_foff), _2SECTS(b->bll_soff), _2SECTS(b->bll_len), ++ map_state2name(b->bll_es)); ++} ++ ++static inline void ++bll_collapse(bl_layout_rec_t *r, pnfs_blocklayout_layout_t *c) ++{ ++ pnfs_blocklayout_layout_t *n; ++ int dbg_count = 0; ++ u64 endpoint; ++ ++ BUG_ON(c->bll_es == PNFS_BLOCK_NONE_DATA); ++ while (c->bll_list.next != &r->blr_layouts) { ++ n = list_entry(c->bll_list.next, ++ struct pnfs_blocklayout_layout, bll_list); ++ endpoint = BLL_S_END(c); ++ if ((n->bll_soff >= c->bll_soff) && ++ (n->bll_soff < endpoint)) { ++ if (endpoint < BLL_S_END(n)) { ++ /* ++ * The following is possible. ++ * ++ * ++ * Existing: +---+ +---+ ++ * New: +-----------------------+ ++ * The client request merge entries together ++ * but didn't require picking up all of the ++ * last entry. So, we still need to delete ++ * the last entry and add the remaining space ++ * to the new entry. ++ */ ++ c->bll_len += BLL_S_END(n) - endpoint; ++ } ++ dbg_count++; ++ list_del(&n->bll_list); ++ kfree(n); ++ } else { ++ break; ++ } ++ } ++ /* ---- Debug only, remove before integration ---- */ ++ if (dbg_count) ++ dprintk(" Collapsed %d cache entries between %Lu(s) and %Lu(s)\n", ++ dbg_count, _2SECTS(c->bll_soff), _2SECTS(BLL_S_END(c))); ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc(u64 offset, u64 len, enum bl_cache_state state, struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = kmalloc(sizeof (*n), GFP_KERNEL); ++ if (n) { ++ memset(n, 0, sizeof (*n)); ++ n->bll_foff = offset; ++ n->bll_len = len; ++ n->bll_cache_state = state; ++ if (h) ++ list_add_tail(&n->bll_list, h); ++ } ++ return n; ++} ++ ++static pnfs_blocklayout_layout_t * ++bll_alloc_dup(pnfs_blocklayout_layout_t *b, enum bl_cache_state c, ++ struct list_head *h) ++{ ++ pnfs_blocklayout_layout_t *n = NULL; ++ ++ n = bll_alloc(b->bll_foff, b->bll_len, c, h); ++ if (n) { ++ n->bll_es = b->bll_es; ++ n->bll_soff = b->bll_soff; ++ n->bll_vol_id.devid = b->bll_vol_id.devid; ++ } ++ return n; ++} ++ ++static inline boolean_t ++layout_conflict(pnfs_blocklayout_layout_t *b, u32 iomode, ++ enum pnfs_block_extent_state4 *s) ++{ ++ /* ---- Normal case ---- */ ++ *s = b->bll_es; ++ ++ switch (b->bll_es) { ++ case PNFS_BLOCK_READWRITE_DATA: ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_READ_DATA; ++ /* ---- Any use is permitted. ---- */ ++ break; ++ case PNFS_BLOCK_READ_DATA: ++ /* ---- Committed as read only data. 
---- */ ++ if (iomode == IOMODE_RW) ++ return False; ++ break; ++ case PNFS_BLOCK_INVALID_DATA: ++ /* ---- Blocks have been allocated, but not initialized ---- */ ++ if (iomode == IOMODE_READ) ++ *s = PNFS_BLOCK_NONE_DATA; ++ break; ++ case PNFS_BLOCK_NONE_DATA: ++ /* ---- Hole-y file. No backing store avail. ---- */ ++ if (iomode != IOMODE_READ) ++ return False; ++ break; ++ default: ++ BUG(); ++ } ++ return True; ++} ++ ++#endif /* CONFIG_SPNFS_BLOCK */ +diff -up linux-2.6.35.noarch/fs/nfs/delegation.c.orig linux-2.6.35.noarch/fs/nfs/delegation.c +--- linux-2.6.35.noarch/fs/nfs/delegation.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/delegation.c 2010-08-31 21:11:40.906051757 -0400 +@@ -104,7 +104,8 @@ again: + continue; + if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + continue; +- if (memcmp(state->stateid.data, stateid->data, sizeof(state->stateid.data)) != 0) ++ if (memcmp(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)) != 0) + continue; + get_nfs_open_context(ctx); + spin_unlock(&inode->i_lock); +@@ -133,8 +134,8 @@ void nfs_inode_reclaim_delegation(struct + if (delegation != NULL) { + spin_lock(&delegation->lock); + if (delegation->inode != NULL) { +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + oldcred = delegation->cred; +@@ -187,8 +188,9 @@ static struct nfs_delegation *nfs_detach + if (delegation == NULL) + goto nomatch; + spin_lock(&delegation->lock); +- if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data, +- sizeof(delegation->stateid.data)) != 0) ++ if (stateid != NULL && memcmp(delegation->stateid.u.data, ++ stateid->u.data, ++ sizeof(delegation->stateid.u.data)) != 0) + goto nomatch_unlock; + list_del_rcu(&delegation->super_list); + delegation->inode = NULL; +@@ -216,8 +218,8 @@ int nfs_inode_set_delegation(struct inod + delegation = kmalloc(sizeof(*delegation), GFP_NOFS); + if (delegation == NULL) + return -ENOMEM; +- memcpy(delegation->stateid.data, res->delegation.data, +- sizeof(delegation->stateid.data)); ++ memcpy(delegation->stateid.u.data, res->delegation.u.data, ++ sizeof(delegation->stateid.u.data)); + delegation->type = res->delegation_type; + delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; +@@ -471,9 +473,7 @@ void nfs_expire_unreferenced_delegations + /* + * Asynchronous delegation recall! 
+ */ +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)) ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) + { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; +@@ -481,7 +481,7 @@ int nfs_async_inode_return_delegation(st + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + +- if (!validate_stateid(delegation, stateid)) { ++ if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { + rcu_read_unlock(); + return -ENOENT; + } +@@ -562,7 +562,8 @@ int nfs4_copy_delegation_stateid(nfs4_st + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) { +- memcpy(dst->data, delegation->stateid.data, sizeof(dst->data)); ++ memcpy(dst->u.data, delegation->stateid.u.data, ++ sizeof(dst->u.data)); + ret = 1; + } + rcu_read_unlock(); +diff -up linux-2.6.35.noarch/fs/nfs/delegation.h.orig linux-2.6.35.noarch/fs/nfs/delegation.h +--- linux-2.6.35.noarch/fs/nfs/delegation.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/delegation.h 2010-08-31 21:11:40.907160710 -0400 +@@ -34,9 +34,7 @@ enum { + int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); + int nfs_inode_return_delegation(struct inode *inode); +-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, +- int (*validate_stateid)(struct nfs_delegation *delegation, +- const nfs4_stateid *stateid)); ++int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); + void nfs_inode_return_delegation_noreclaim(struct inode *inode); + + struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); +diff -up linux-2.6.35.noarch/fs/nfsd/export.c.orig linux-2.6.35.noarch/fs/nfsd/export.c +--- linux-2.6.35.noarch/fs/nfsd/export.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/export.c 2010-08-31 21:11:40.948150354 -0400 +@@ -17,11 +17,19 @@ + #include + #include + ++#include ++#if defined(CONFIG_SPNFS) ++#include ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif ++#endif + #include + #include + + #include "nfsd.h" + #include "nfsfh.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_EXPORT + +@@ -352,6 +360,40 @@ static int svc_export_upcall(struct cach + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); + } + ++#if defined(CONFIG_PNFSD) ++static struct pnfsd_cb_operations pnfsd_cb_op = { ++ .cb_layout_recall = nfsd_layout_recall_cb, ++ .cb_device_notify = nfsd_device_notify_cb, ++ ++ .cb_get_state = nfs4_pnfs_cb_get_state, ++ .cb_change_state = nfs4_pnfs_cb_change_state, ++}; ++ ++#if defined(CONFIG_SPNFS) ++static struct pnfs_export_operations spnfs_export_ops = { ++ .layout_type = spnfs_layout_type, ++ .get_device_info = spnfs_getdeviceinfo, ++ .get_device_iter = spnfs_getdeviceiter, ++ .layout_get = spnfs_layoutget, ++ .layout_return = spnfs_layoutreturn, ++}; ++ ++static struct pnfs_export_operations spnfs_ds_export_ops = { ++ .get_state = spnfs_get_state, ++}; ++ ++#if defined(CONFIG_SPNFS_BLOCK) ++static struct pnfs_export_operations bl_export_ops = { ++ .layout_type = bl_layout_type, ++ .get_device_info = bl_getdeviceinfo, ++ .get_device_iter = 
bl_getdeviceiter, ++ .layout_get = bl_layoutget, ++ .layout_return = bl_layoutreturn, ++}; ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_SPNFS */ ++#endif /* CONFIG_PNFSD */ ++ + static struct svc_export *svc_export_update(struct svc_export *new, + struct svc_export *old); + static struct svc_export *svc_export_lookup(struct svc_export *); +@@ -395,6 +437,47 @@ static int check_export(struct inode *in + return -EINVAL; + } + ++#if !defined(CONFIG_SPNFS) ++ if (inode->i_sb->s_pnfs_op && ++ (!inode->i_sb->s_pnfs_op->layout_type || ++ !inode->i_sb->s_pnfs_op->get_device_info || ++ !inode->i_sb->s_pnfs_op->layout_get)) { ++ dprintk("exp_export: export of invalid fs pnfs export ops.\n"); ++ return -EINVAL; ++ } ++#endif /* CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ if (!inode->i_sb->s_pnfs_op) ++ pnfsd_lexp_init(inode); ++ return 0; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, *flags)) { ++ dprintk("set pnfs block export structure... \n"); ++ inode->i_sb->s_pnfs_op = &bl_export_ops; ++ } else ++#endif /* CONFIG_SPNFS_BLOCK */ ++ /* ++ * spnfs_enabled() indicates we're an MDS. ++ * XXX Better to check an export time option as well. ++ */ ++ if (spnfs_enabled()) { ++ dprintk("set spnfs export structure...\n"); ++ inode->i_sb->s_pnfs_op = &spnfs_export_ops; ++ } else { ++ dprintk("%s spnfs not in use\n", __func__); ++ ++ /* ++ * get_state is needed if we're a DS using spnfs. ++ * XXX Better to check an export time option instead. ++ */ ++ inode->i_sb->s_pnfs_op = &spnfs_ds_export_ops; ++ } ++#endif /* CONFIG_SPNFS */ ++ + return 0; + + } +@@ -586,6 +669,8 @@ static int svc_export_parse(struct cache + if (exp.ex_uuid == NULL) + err = -ENOMEM; + } ++ } else if (strcmp(buf, "pnfs") == 0) { ++ exp.ex_pnfs = 1; + } else if (strcmp(buf, "secinfo") == 0) + err = secinfo_parse(&mesg, buf, &exp); + else +@@ -660,6 +745,8 @@ static int svc_export_show(struct seq_fi + seq_printf(m, "%02x", exp->ex_uuid[i]); + } + } ++ if (exp->ex_pnfs) ++ seq_puts(m, ",pnfs"); + show_secinfo(m, exp); + } + seq_puts(m, ")\n"); +@@ -687,6 +774,7 @@ static void svc_export_init(struct cache + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; ++ new->ex_pnfs = 0; + } + + static void export_update(struct cache_head *cnew, struct cache_head *citem) +@@ -699,6 +787,7 @@ static void export_update(struct cache_h + new->ex_anon_uid = item->ex_anon_uid; + new->ex_anon_gid = item->ex_anon_gid; + new->ex_fsid = item->ex_fsid; ++ new->ex_pnfs = item->ex_pnfs; + new->ex_uuid = item->ex_uuid; + item->ex_uuid = NULL; + new->ex_pathname = item->ex_pathname; +@@ -1635,8 +1724,17 @@ nfsd_export_init(void) + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); +- if (rv) ++ if (rv) { + cache_unregister(&svc_export_cache); ++ goto out; ++ } ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = THIS_MODULE; ++ pnfsd_cb_ctl.cb_op = &pnfsd_cb_op; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ ++out: + return rv; + + } +@@ -1664,6 +1762,12 @@ nfsd_export_shutdown(void) + + exp_writelock(); + ++#if defined(CONFIG_PNFSD) ++ spin_lock(&pnfsd_cb_ctl.lock); ++ pnfsd_cb_ctl.module = NULL; ++ pnfsd_cb_ctl.cb_op = NULL; ++ spin_unlock(&pnfsd_cb_ctl.lock); ++#endif /* CONFIG_PNFSD */ + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); + svcauth_unix_purge(); +diff -up 
linux-2.6.35.noarch/fs/nfs/direct.c.orig linux-2.6.35.noarch/fs/nfs/direct.c +--- linux-2.6.35.noarch/fs/nfs/direct.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/direct.c 2010-08-31 21:11:40.908170524 -0400 +@@ -267,6 +267,38 @@ static const struct rpc_call_ops nfs_rea + .rpc_release = nfs_direct_read_release, + }; + ++static long nfs_direct_read_execute(struct nfs_read_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ nfs_fattr_init(&data->fattr); ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ NFS_PROTO(inode)->read_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct read call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each rsize'd chunk of the user's buffer, dispatch an NFS READ + * operation. If nfs_readdata_alloc() or get_user_pages() fails, +@@ -283,7 +315,6 @@ static ssize_t nfs_direct_read_schedule_ + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; + size_t rsize = NFS_SERVER(inode)->rsize; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -343,26 +374,9 @@ static ssize_t nfs_direct_read_schedule_ + data->res.fattr = &data->fattr; + data->res.eof = 0; + data->res.count = bytes; +- nfs_fattr_init(&data->fattr); +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct read call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_read_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +@@ -448,12 +462,15 @@ static void nfs_direct_free_writedata(st + } + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg); ++ + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) + { + struct inode *inode = dreq->inode; + struct list_head *p; + struct nfs_write_data *data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = dreq->ctx->cred, + }; +@@ -487,25 +504,7 @@ static void nfs_direct_write_reschedule( + * Reuse data->task; data->args should not have changed + * since the original request was sent. + */ +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- /* +- * We're called via an RPC callback, so BKL is already held. 
+- */ +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); +- +- dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- data->args.count, +- (unsigned long long)data->args.offset); ++ nfs_direct_write_execute(data, &task_setup_data, &msg); + } + + if (put_dreq(dreq)) +@@ -548,10 +547,31 @@ static const struct rpc_call_ops nfs_com + .rpc_release = nfs_direct_commit_release, + }; + ++static long nfs_direct_commit_execute(struct nfs_direct_req *dreq, ++ struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct rpc_task *task; ++ ++ NFS_PROTO(data->inode)->commit_setup(data, msg); ++ ++ /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ ++ dreq->commit_data = NULL; ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ return 0; ++} ++ + static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) + { + struct nfs_write_data *data = dreq->commit_data; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +@@ -579,16 +599,7 @@ static void nfs_direct_commit_schedule(s + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- NFS_PROTO(data->inode)->commit_setup(data, &msg); +- +- /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ +- dreq->commit_data = NULL; +- +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); +- +- task = rpc_run_task(&task_setup_data); +- if (!IS_ERR(task)) +- rpc_put_task(task); ++ nfs_direct_commit_execute(dreq, data, &task_setup_data, &msg); + } + + static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +@@ -690,6 +701,36 @@ static const struct rpc_call_ops nfs_wri + .rpc_release = nfs_direct_write_release, + }; + ++static long nfs_direct_write_execute(struct nfs_write_data *data, ++ struct rpc_task_setup *task_setup_data, ++ struct rpc_message *msg) ++{ ++ struct inode *inode = data->inode; ++ struct rpc_task *task; ++ ++ task_setup_data->task = &data->task; ++ task_setup_data->callback_data = data; ++ msg->rpc_argp = &data->args; ++ msg->rpc_resp = &data->res; ++ NFS_PROTO(inode)->write_setup(data, msg); ++ ++ task = rpc_run_task(task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ rpc_put_task(task); ++ ++ dprintk("NFS: %5u initiated direct write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ return 0; ++} ++ + /* + * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE + * operation. 
If nfs_writedata_alloc() or get_user_pages() fails, +@@ -705,7 +746,6 @@ static ssize_t nfs_direct_write_schedule + struct inode *inode = ctx->path.dentry->d_inode; + unsigned long user_addr = (unsigned long)iov->iov_base; + size_t count = iov->iov_len; +- struct rpc_task *task; + struct rpc_message msg = { + .rpc_cred = ctx->cred, + }; +@@ -771,24 +811,8 @@ static ssize_t nfs_direct_write_schedule + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- task_setup_data.task = &data->task; +- task_setup_data.callback_data = data; +- msg.rpc_argp = &data->args; +- msg.rpc_resp = &data->res; +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- break; +- rpc_put_task(task); +- +- dprintk("NFS: %5u initiated direct write call " +- "(req %s/%Ld, %zu bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- bytes, +- (unsigned long long)data->args.offset); ++ if (nfs_direct_write_execute(data, &task_setup_data, &msg)) ++ break; + + started += bytes; + user_addr += bytes; +diff -up linux-2.6.35.noarch/fs/nfsd/Kconfig.orig linux-2.6.35.noarch/fs/nfsd/Kconfig +--- linux-2.6.35.noarch/fs/nfsd/Kconfig.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/Kconfig 2010-08-31 21:11:40.943150294 -0400 +@@ -79,3 +79,52 @@ config NFSD_V4 + available from http://linux-nfs.org/. + + If unsure, say N. ++ ++config PNFSD ++ bool "NFSv4.1 server support for Parallel NFS (pNFS) (DEVELOPER ONLY)" ++ depends on NFSD_V4 && EXPERIMENTAL ++ select EXPORTFS_FILE_LAYOUT ++ help ++ This option enables support for the parallel NFS features of the ++ minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) ++ in the kernel's NFS server. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFSD_LOCAL_EXPORT ++ bool "Enable pNFS support for exporting local filesystems for debugging purposes" ++ depends on PNFSD ++ help ++ Say Y here if you want your pNFS server to export local file systems ++ over the files layout type. With this option the MDS (metadata ++ server) functions also as a single DS (data server). This is mostly ++ useful for development and debugging purposes. ++ ++ If unsure, say N. ++ ++config SPNFS ++ bool "Provide spNFS server support (EXPERIMENTAL)" ++ depends on PNFSD ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS server support. ++ ++ If unsure, say N. ++ ++config SPNFS_LAYOUTSEGMENTS ++ bool "Allow spNFS to return partial file layouts (EXPERIMENTAL)" ++ depends on SPNFS ++ select RPCSEC_GSS_KRB5 ++ help ++ Say Y here if you want spNFS to be able to return layout segments. ++ ++ If unsure, say N. ++ ++config SPNFS_BLOCK ++ bool "Provide Block Layout server support (EXPERIMENTAL)" ++ depends on SPNFS ++ select EXPORTFS_BLOCK_LAYOUT ++ help ++ Say Y here if you want spNFS block layout support ++ ++ If unsure, say N. 
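[Editor's note on the direct.c hunks above: they all apply one refactoring pattern. The duplicated RPC setup/run/release boilerplate moves into a single *_execute() helper that returns 0 on success or PTR_ERR(task) on failure, so each rsize/wsize scheduling loop only checks the helper's return value and breaks. The following stand-alone user-space sketch is illustrative only, not part of the patch; every name in it is invented, and NULL stands in for IS_ERR().]

#include <stdio.h>
#include <errno.h>

/* Stand-in for struct rpc_task; illustrative only. */
struct task { int id; };

static struct task *run_task(struct task *t, int fail)
{
	return fail ? NULL : t;		/* NULL models IS_ERR(task) */
}

/* Shape of nfs_direct_{read,write}_execute(): 0 on success, -errno on error. */
static long dispatch_chunk(struct task *t, int fail)
{
	struct task *running = run_task(t, fail);

	if (!running)
		return -EIO;		/* caller breaks out of its loop */
	/* the real helper drops its task reference (rpc_put_task) here */
	printf("task %d dispatched\n", running->id);
	return 0;
}

int main(void)
{
	struct task t = { .id = 1 };
	int chunk;

	for (chunk = 0; chunk < 4; chunk++)
		if (dispatch_chunk(&t, chunk == 2))	/* fail on 3rd chunk */
			break;	/* mirrors "if (nfs_direct_write_execute(...)) break;" */
	return 0;
}
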
+diff -up linux-2.6.35.noarch/fs/nfsd/Makefile.orig linux-2.6.35.noarch/fs/nfsd/Makefile +--- linux-2.6.35.noarch/fs/nfsd/Makefile.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/Makefile 2010-08-31 21:11:40.944171081 -0400 +@@ -11,3 +11,7 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs + nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o + nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ + nfs4acl.o nfs4callback.o nfs4recover.o ++nfsd-$(CONFIG_PNFSD) += nfs4pnfsd.o nfs4pnfsdlm.o nfs4pnfsds.o ++nfsd-$(CONFIG_PNFSD_LOCAL_EXPORT) += pnfsd_lexp.o ++nfsd-$(CONFIG_SPNFS) += spnfs_com.o spnfs_ops.o ++nfsd-$(CONFIG_SPNFS_BLOCK) += bl_com.o bl_ops.o +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4callback.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4callback.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4callback.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4callback.c 2010-08-31 21:11:40.949160374 -0400 +@@ -41,7 +41,6 @@ + + #define NFSPROC4_CB_NULL 0 + #define NFSPROC4_CB_COMPOUND 1 +-#define NFS4_STATEID_SIZE 16 + + /* Index of predefined Linux callback client operations */ + +@@ -49,11 +48,17 @@ enum { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, ++#if defined(CONFIG_PNFSD) ++ NFSPROC4_CLNT_CB_LAYOUT, ++ NFSPROC4_CLNT_CB_DEVICE, ++#endif + }; + + enum nfs_cb_opnum4 { + OP_CB_RECALL = 4, ++ OP_CB_LAYOUT = 5, + OP_CB_SEQUENCE = 11, ++ OP_CB_DEVICE = 14, + }; + + #define NFS4_MAXTAGLEN 20 +@@ -79,6 +84,19 @@ enum nfs_cb_opnum4 { + #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) ++#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 3 + \ ++ enc_nfs4_fh_sz + 4) ++#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) ++#define NFS4_enc_cb_device_sz (cb_compound_enc_hdr_sz + \ ++ cb_sequence_enc_sz + \ ++ 1 + 6) ++#define NFS4_dec_cb_device_sz (cb_compound_dec_hdr_sz + \ ++ cb_sequence_dec_sz + \ ++ op_dec_sz) + + /* + * Generic encode routines from fs/nfs/nfs4xdr.c +@@ -95,6 +113,10 @@ xdr_writemem(__be32 *p, const void *ptr, + } + + #define WRITE32(n) *p++ = htonl(n) ++#define WRITE64(n) do { \ ++ *p++ = htonl((u32)((n) >> 32)); \ ++ *p++ = htonl((u32)(n)); \ ++} while (0) + #define WRITEMEM(ptr,nbytes) do { \ + p = xdr_writemem(p, ptr, nbytes); \ + } while (0) +@@ -205,6 +227,16 @@ nfs_cb_stat_to_errno(int stat) + */ + + static void ++encode_stateid(struct xdr_stream *xdr, stateid_t *sid) ++{ ++ __be32 *p; ++ ++ RESERVE_SPACE(sizeof(stateid_t)); ++ WRITE32(sid->si_generation); ++ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); ++} ++ ++static void + encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) + { + __be32 * p; +@@ -229,10 +261,10 @@ encode_cb_recall(struct xdr_stream *xdr, + __be32 *p; + int len = dp->dl_fh.fh_size; + +- RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); ++ RESERVE_SPACE(4); + WRITE32(OP_CB_RECALL); +- WRITE32(dp->dl_stateid.si_generation); +- WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); ++ encode_stateid(xdr, &dp->dl_stateid); ++ RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); + WRITE32(0); /* truncate optimization not implemented */ + WRITE32(len); + WRITEMEM(&dp->dl_fh.fh_base, len); +@@ -260,6 +292,111 @@ encode_cb_sequence(struct xdr_stream *xd + hdr->nops++; + } + ++#if defined(CONFIG_PNFSD) ++ ++#include "pnfsd.h" ++ ++static void ++encode_cb_layout(struct xdr_stream *xdr, struct nfs4_layoutrecall *clr, 
++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(20); ++ WRITE32(OP_CB_LAYOUT); ++ WRITE32(clr->cb.cbl_seg.layout_type); ++ WRITE32(clr->cb.cbl_seg.iomode); ++ WRITE32(clr->cb.cbl_layoutchanged); ++ WRITE32(clr->cb.cbl_recall_type); ++ if (unlikely(clr->cb.cbl_recall_type == RETURN_FSID)) { ++ struct nfs4_fsid fsid = clr->cb.cbl_fsid; ++ ++ RESERVE_SPACE(16); ++ WRITE64(fsid.major); ++ WRITE64(fsid.minor); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "fsid 0x%llx-0x%llx\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, fsid.major, fsid.minor); ++ } else if (clr->cb.cbl_recall_type == RETURN_FILE) { ++ int len = clr->clr_file->fi_fhlen; ++ stateid_t *cbl_sid = (stateid_t *)&clr->cb.cbl_sid; ++ ++ RESERVE_SPACE(20 + len); ++ WRITE32(len); ++ WRITEMEM(clr->clr_file->fi_fhval, len); ++ WRITE64(clr->cb.cbl_seg.offset); ++ WRITE64(clr->cb.cbl_seg.length); ++ encode_stateid(xdr, cbl_sid); ++ dprintk("%s: type %x iomode %d changed %d recall_type %d " ++ "offset %lld length %lld stateid " STATEID_FMT "\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type, ++ clr->cb.cbl_seg.offset, clr->cb.cbl_seg.length, ++ STATEID_VAL(cbl_sid)); ++ } else { ++ dprintk("%s: type %x iomode %d changed %d recall_type %d\n", ++ __func__, clr->cb.cbl_seg.layout_type, ++ clr->cb.cbl_seg.iomode, clr->cb.cbl_layoutchanged, ++ clr->cb.cbl_recall_type); ++ } ++ hdr->nops++; ++} ++ ++static void ++encode_cb_device(struct xdr_stream *xdr, struct nfs4_notify_device *nd, ++ struct nfs4_cb_compound_hdr *hdr) ++{ ++ u32 *p; ++ int i; ++ int len = nd->nd_list->cbd_len; ++ struct nfsd4_pnfs_cb_dev_item *cbd = nd->nd_list->cbd_list; ++ ++ dprintk("NFSD %s: --> num %d\n", __func__, len); ++ ++ BUG_ON(hdr->minorversion == 0); ++ ++ RESERVE_SPACE(8); ++ WRITE32(OP_CB_DEVICE); ++ ++ /* notify4 cnda_changes<>; */ ++ WRITE32(len); ++ for (i = 0; i < len; i++) { ++ dprintk("%s: nt %d lt %d devid x%llx-x%llx im %d i %d\n", ++ __func__, cbd[i].cbd_notify_type, ++ cbd[i].cbd_layout_type, ++ cbd[i].cbd_devid.sbid, ++ cbd[i].cbd_devid.devid, ++ cbd[i].cbd_immediate, i); ++ ++ BUG_ON(cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_CHANGE && ++ cbd[i].cbd_notify_type != NOTIFY_DEVICEID4_DELETE); ++ RESERVE_SPACE(32); ++ /* bitmap4 notify_mask; */ ++ WRITE32(1); ++ WRITE32(cbd[i].cbd_notify_type); ++ /* opaque notify_vals<>; */ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) ++ WRITE32(24); ++ else ++ WRITE32(20); ++ WRITE32(cbd[i].cbd_layout_type); ++ WRITE64(cbd[i].cbd_devid.sbid); ++ WRITE64(cbd[i].cbd_devid.devid); ++ ++ if (cbd[i].cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) { ++ RESERVE_SPACE(4); ++ WRITE32(cbd[i].cbd_immediate); ++ } ++ } ++ hdr->nops++; ++} ++#endif /* CONFIG_PNFSD */ ++ + static int + nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) + { +@@ -289,6 +426,45 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst * + return 0; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_layoutrecall *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); 
++ encode_cb_layout(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++ ++static int ++nfs4_xdr_enc_cb_device(struct rpc_rqst *req, u32 *p, ++ struct nfs4_rpc_args *rpc_args) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_notify_device *args = rpc_args->args_op; ++ struct nfs4_cb_compound_hdr hdr = { ++ .ident = 0, ++ .minorversion = rpc_args->args_seq.cbs_minorversion, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_cb_compound_hdr(&xdr, &hdr); ++ encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr); ++ encode_cb_device(&xdr, args, &hdr); ++ encode_cb_nops(&hdr); ++ return 0; ++} ++#endif /* CONFIG_PNFSD */ + + static int + decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ +@@ -404,6 +580,48 @@ out: + return status; + } + ++#if defined(CONFIG_PNFSD) ++static int ++nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_LAYOUT); ++out: ++ return status; ++} ++ ++static int ++nfs4_xdr_dec_cb_device(struct rpc_rqst *rqstp, u32 *p, ++ struct nfsd4_cb_sequence *seq) ++{ ++ struct xdr_stream xdr; ++ struct nfs4_cb_compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_cb_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_cb_sequence(&xdr, seq, rqstp); ++ if (status) ++ goto out; ++ status = decode_cb_op_hdr(&xdr, OP_CB_DEVICE); ++out: ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * RPC procedure tables + */ +@@ -421,6 +639,10 @@ out: + static struct rpc_procinfo nfs4_cb_procedures[] = { + PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), + PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), ++#if defined(CONFIG_PNFSD) ++ PROC(CB_LAYOUT, COMPOUND, enc_cb_layout, dec_cb_layout), ++ PROC(CB_DEVICE, COMPOUND, enc_cb_device, dec_cb_device), ++#endif + }; + + static struct rpc_version nfs_cb_version4 = { +@@ -606,10 +828,9 @@ out: + * TODO: cb_sequence should support referring call lists, cachethis, multiple + * slots, and mark callback channel down on communication errors. 
+ */ +-static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_prepare_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) + { +- struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; + struct nfs4_rpc_args *args = task->tk_msg.rpc_argp; + u32 minorversion = clp->cl_cb_conn.cb_minorversion; + int status = 0; +@@ -629,11 +850,15 @@ static void nfsd4_cb_prepare(struct rpc_ + rpc_call_start(task); + } + +-static void nfsd4_cb_done(struct rpc_task *task, void *calldata) ++static void nfsd4_cb_recall_prepare(struct rpc_task *task, void *calldata) + { + struct nfs4_delegation *dp = calldata; +- struct nfs4_client *clp = dp->dl_client; ++ nfsd4_cb_prepare_sequence(task, dp->dl_client); ++} + ++static void nfsd4_cb_done_sequence(struct rpc_task *task, ++ struct nfs4_client *clp) ++{ + dprintk("%s: minorversion=%d\n", __func__, + clp->cl_cb_conn.cb_minorversion); + +@@ -657,7 +882,7 @@ static void nfsd4_cb_recall_done(struct + struct nfs4_client *clp = dp->dl_client; + struct rpc_clnt *current_rpc_client = clp->cl_cb_client; + +- nfsd4_cb_done(task, calldata); ++ nfsd4_cb_done_sequence(task, clp); + + if (current_rpc_client == NULL) { + /* We're shutting down; give up. */ +@@ -688,7 +913,7 @@ static void nfsd4_cb_recall_done(struct + if (dp->dl_retries--) { + rpc_delay(task, 2*HZ); + task->tk_status = 0; +- rpc_restart_call(task); ++ rpc_restart_call_prepare(task); + return; + } else { + atomic_set(&clp->cl_cb_set, 0); +@@ -704,7 +929,7 @@ static void nfsd4_cb_recall_release(void + } + + static const struct rpc_call_ops nfsd4_cb_recall_ops = { +- .rpc_call_prepare = nfsd4_cb_prepare, ++ .rpc_call_prepare = nfsd4_cb_recall_prepare, + .rpc_call_done = nfsd4_cb_recall_done, + .rpc_release = nfsd4_cb_recall_release, + }; +@@ -781,3 +1006,173 @@ void nfsd4_cb_recall(struct nfs4_delegat + { + queue_work(callback_wq, &dp->dl_recall.cb_work); + } ++ ++#if defined(CONFIG_PNFSD) ++static void nfsd4_cb_layout_prepare(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ nfsd4_cb_prepare_sequence(task, clr->clr_client); ++} ++ ++static void nfsd4_cb_layout_done(struct rpc_task *task, void *calldata) ++{ ++ struct nfs4_layoutrecall *clr = calldata; ++ struct nfs4_client *clp = clr->clr_client; ++ ++ nfsd4_cb_done_sequence(task, clp); ++ ++ if (!task->tk_status) ++ return; ++ ++ printk("%s: clp %p cb_client %p fp %p failed with status %d\n", ++ __func__, ++ clp, ++ clp->cl_cb_client, ++ clr->clr_file, ++ task->tk_status); ++ ++ switch (task->tk_status) { ++ case -EIO: ++ /* Network partition? 
*/
++		atomic_set(&clp->cl_cb_set, 0);
++		warn_no_callback_path(clp, task->tk_status);
++		/* FIXME:
++		 * The pNFS standard states that we should only expire
++		 * the client after at least "lease time", e.g. lease-time * 2,
++		 * when failing to communicate a recall.
++		 */
++		break;
++	case -NFS4ERR_DELAY:
++		/* Poll the client until it's done with the layout */
++		rpc_delay(task, HZ/100); /* 10 milliseconds */
++		task->tk_status = 0;
++		rpc_restart_call_prepare(task);
++		break;
++	case -NFS4ERR_NOMATCHING_LAYOUT:
++		task->tk_status = 0;
++		nomatching_layout(clr);
++	}
++}
++
++static void nfsd4_cb_layout_release(void *calldata)
++{
++	struct nfs4_layoutrecall *clr = calldata;
++	kfree(clr->clr_args);
++	clr->clr_args = NULL;
++	put_layoutrecall(clr);
++}
++
++static const struct rpc_call_ops nfsd4_cb_layout_ops = {
++	.rpc_call_prepare = nfsd4_cb_layout_prepare,
++	.rpc_call_done = nfsd4_cb_layout_done,
++	.rpc_release = nfsd4_cb_layout_release,
++};
++
++/*
++ * Called with state lock.
++ */
++int
++nfsd4_cb_layout(struct nfs4_layoutrecall *clr)
++{
++	struct nfs4_client *clp = clr->clr_client;
++	struct rpc_clnt *clnt = clp->cl_cb_client;
++	struct nfs4_rpc_args *args;
++	struct rpc_message msg = {
++		.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_LAYOUT],
++		.rpc_cred = callback_cred
++	};
++	int status;
++
++	args = kzalloc(sizeof(*args), GFP_KERNEL);
++	if (!args) {
++		status = -ENOMEM;
++		goto out;
++	}
++	clr->clr_args = args;
++	args->args_op = clr;
++	msg.rpc_argp = args;
++	status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
++				&nfsd4_cb_layout_ops, clr);
++out:
++	if (status) {
++		kfree(args);
++		put_layoutrecall(clr);
++	}
++	dprintk("NFSD: nfsd4_cb_layout: status %d\n", status);
++	return status;
++}
++
++static void nfsd4_cb_device_prepare(struct rpc_task *task, void *calldata)
++{
++	struct nfs4_notify_device *cbnd = calldata;
++	nfsd4_cb_prepare_sequence(task, cbnd->nd_client);
++}
++
++static void nfsd4_cb_device_done(struct rpc_task *task, void *calldata)
++{
++	struct nfs4_notify_device *cbnd = calldata;
++	struct nfs4_client *clp = cbnd->nd_client;
++
++	nfsd4_cb_done_sequence(task, clp);
++
++	dprintk("%s: clp %p cb_client %p: status %d\n",
++		__func__,
++		clp,
++		clp->cl_cb_client,
++		task->tk_status);
++
++	if (task->tk_status == -EIO) {
++		/* Network partition? */
++		atomic_set(&clp->cl_cb_set, 0);
++		warn_no_callback_path(clp, task->tk_status);
++	}
++}
++
++static void nfsd4_cb_device_release(void *calldata)
++{
++	struct nfs4_notify_device *cbnd = calldata;
++	kfree(cbnd->nd_args);
++	cbnd->nd_args = NULL;
++	kfree(cbnd);
++}
++
++static const struct rpc_call_ops nfsd4_cb_device_ops = {
++	.rpc_call_prepare = nfsd4_cb_device_prepare,
++	.rpc_call_done = nfsd4_cb_device_done,
++	.rpc_release = nfsd4_cb_device_release,
++};
++
++/*
++ * Called with state lock.
++ */ ++int ++nfsd4_cb_notify_device(struct nfs4_notify_device *cbnd) ++{ ++ struct nfs4_client *clp = cbnd->nd_client; ++ struct rpc_clnt *clnt = clp->cl_cb_client; ++ struct nfs4_rpc_args *args; ++ struct rpc_message msg = { ++ .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_DEVICE], ++ .rpc_cred = callback_cred ++ }; ++ int status = -EIO; ++ ++ dprintk("%s: clp %p\n", __func__, clp); ++ ++ args = kzalloc(sizeof(*args), GFP_KERNEL); ++ if (!args) { ++ status = -ENOMEM; ++ goto out; ++ } ++ args->args_op = cbnd; ++ msg.rpc_argp = args; ++ ++ status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, ++ &nfsd4_cb_device_ops, cbnd); ++out: ++ if (status) ++ kfree(args); ++ dprintk("%s: status %d\n", __func__, status); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c.orig 2010-08-31 21:11:40.950150325 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsd.c 2010-08-31 21:11:40.951150172 -0400 +@@ -0,0 +1,1679 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ *****************************************************************************/ ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Globals */ ++static u32 current_layoutid = 1; ++ ++/* ++ * Currently used for manipulating the layout state. 
++ */
++static DEFINE_SPINLOCK(layout_lock);
++
++#if defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_SMP)
++# define BUG_ON_UNLOCKED_LAYOUT() BUG_ON(!spin_is_locked(&layout_lock))
++#else
++# define BUG_ON_UNLOCKED_LAYOUT()
++#endif
++
++/*
++ * Layout state - NFSv4.1 pNFS
++ */
++static struct kmem_cache *pnfs_layout_slab;
++static struct kmem_cache *pnfs_layoutrecall_slab;
++
++/* hash table for nfsd4_pnfs_deviceid.sbid */
++#define SBID_HASH_BITS	8
++#define SBID_HASH_SIZE	(1 << SBID_HASH_BITS)
++#define SBID_HASH_MASK	(SBID_HASH_SIZE - 1)
++
++struct sbid_tracker {
++	u64 id;
++	struct super_block *sb;
++	struct list_head hash;
++};
++
++static u64 current_sbid;
++static struct list_head sbid_hashtbl[SBID_HASH_SIZE];
++
++static inline unsigned long
++sbid_hashval(struct super_block *sb)
++{
++	return hash_ptr(sb, SBID_HASH_BITS);
++}
++
++static inline struct sbid_tracker *
++alloc_sbid(void)
++{
++	return kmalloc(sizeof(struct sbid_tracker), GFP_KERNEL);
++}
++
++static void
++destroy_sbid(struct sbid_tracker *sbid)
++{
++	spin_lock(&layout_lock);
++	list_del(&sbid->hash);
++	spin_unlock(&layout_lock);
++	kfree(sbid);
++}
++
++void
++nfsd4_free_pnfs_slabs(void)
++{
++	int i;
++	struct sbid_tracker *sbid;
++
++	nfsd4_free_slab(&pnfs_layout_slab);
++	nfsd4_free_slab(&pnfs_layoutrecall_slab);
++
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		while (!list_empty(&sbid_hashtbl[i])) {
++			sbid = list_first_entry(&sbid_hashtbl[i],
++						struct sbid_tracker,
++						hash);
++			destroy_sbid(sbid);
++		}
++	}
++}
++
++int
++nfsd4_init_pnfs_slabs(void)
++{
++	int i;
++
++	pnfs_layout_slab = kmem_cache_create("pnfs_layouts",
++			sizeof(struct nfs4_layout), 0, 0, NULL);
++	if (pnfs_layout_slab == NULL)
++		return -ENOMEM;
++	pnfs_layoutrecall_slab = kmem_cache_create("pnfs_layoutrecalls",
++			sizeof(struct nfs4_layoutrecall), 0, 0, NULL);
++	if (pnfs_layoutrecall_slab == NULL)
++		return -ENOMEM;
++
++	for (i = 0; i < SBID_HASH_SIZE; i++) {
++		INIT_LIST_HEAD(&sbid_hashtbl[i]);
++	}
++
++	return 0;
++}
++
++/* XXX: Need to implement the notify types and track which
++ * clients have which devices. */
++void pnfs_set_device_notify(clientid_t *clid, unsigned int types)
++{
++	struct nfs4_client *clp;
++	dprintk("%s: -->\n", __func__);
++
++	nfs4_lock_state();
++	/* Indicate that the client has a device so we only notify
++	 * the correct clients */
++	clp = find_confirmed_client(clid);
++	if (clp) {
++		atomic_inc(&clp->cl_deviceref);
++		dprintk("%s: Incr device count (clnt %p) to %d\n",
++			__func__, clp, atomic_read(&clp->cl_deviceref));
++	}
++	nfs4_unlock_state();
++}
++
++/* Clear notifications for this client
++ * XXX: Do we need to loop through and clean up all
++ * krefs when nfsd cleans up the client?
*/ ++void pnfs_clear_device_notify(struct nfs4_client *clp) ++{ ++ atomic_dec(&clp->cl_deviceref); ++ dprintk("%s: Decr device count (clnt %p) to %d\n", ++ __func__, clp, atomic_read(&clp->cl_deviceref)); ++} ++ ++static struct nfs4_layout_state * ++alloc_init_layout_state(struct nfs4_client *clp, struct nfs4_file *fp, ++ stateid_t *stateid) ++{ ++ struct nfs4_layout_state *new; ++ ++ /* FIXME: use a kmem_cache */ ++ new = kzalloc(sizeof(*new), GFP_KERNEL); ++ if (!new) ++ return new; ++ get_nfs4_file(fp); ++ INIT_LIST_HEAD(&new->ls_perfile); ++ INIT_LIST_HEAD(&new->ls_layouts); ++ kref_init(&new->ls_ref); ++ new->ls_client = clp; ++ new->ls_file = fp; ++ new->ls_stateid.si_boot = stateid->si_boot; ++ new->ls_stateid.si_stateownerid = 0; /* identifies layout stateid */ ++ new->ls_stateid.si_generation = 1; ++ spin_lock(&layout_lock); ++ new->ls_stateid.si_fileid = current_layoutid++; ++ list_add(&new->ls_perfile, &fp->fi_layout_states); ++ spin_unlock(&layout_lock); ++ return new; ++} ++ ++static inline void ++get_layout_state(struct nfs4_layout_state *ls) ++{ ++ kref_get(&ls->ls_ref); ++} ++ ++static void ++destroy_layout_state_common(struct nfs4_layout_state *ls) ++{ ++ struct nfs4_file *fp = ls->ls_file; ++ ++ dprintk("pNFS %s: ls %p fp %p clp %p\n", __func__, ls, fp, ++ ls->ls_client); ++ BUG_ON(!list_empty(&ls->ls_layouts)); ++ kfree(ls); ++ put_nfs4_file(fp); ++} ++ ++static void ++destroy_layout_state(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ spin_lock(&layout_lock); ++ list_del(&ls->ls_perfile); ++ spin_unlock(&layout_lock); ++ destroy_layout_state_common(ls); ++} ++ ++static void ++destroy_layout_state_locked(struct kref *kref) ++{ ++ struct nfs4_layout_state *ls = ++ container_of(kref, struct nfs4_layout_state, ls_ref); ++ ++ list_del(&ls->ls_perfile); ++ destroy_layout_state_common(ls); ++} ++ ++static inline void ++put_layout_state(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state); ++} ++ ++static inline void ++put_layout_state_locked(struct nfs4_layout_state *ls) ++{ ++ dprintk("pNFS %s: ls %p ls_ref %d\n", __func__, ls, ++ atomic_read(&ls->ls_ref.refcount)); ++ kref_put(&ls->ls_ref, destroy_layout_state_locked); ++} ++ ++/* ++ * Search the fp->fi_layout_state list for a layout state with the clientid. ++ * If not found, then this is a 'first open/delegation/lock stateid' from ++ * the client for this file. ++ * Called under the layout_lock. 
++ */
++static struct nfs4_layout_state *
++find_get_layout_state(struct nfs4_client *clp, struct nfs4_file *fp)
++{
++	struct nfs4_layout_state *ls;
++
++	BUG_ON_UNLOCKED_LAYOUT();
++	list_for_each_entry(ls, &fp->fi_layout_states, ls_perfile) {
++		if (ls->ls_client == clp) {
++			dprintk("pNFS %s: before GET ls %p ls_ref %d\n",
++				__func__, ls,
++				atomic_read(&ls->ls_ref.refcount));
++			get_layout_state(ls);
++			return ls;
++		}
++	}
++	return NULL;
++}
++
++static __be32
++verify_stateid(struct nfs4_file *fp, stateid_t *stateid)
++{
++	struct nfs4_stateid *local = NULL;
++	struct nfs4_delegation *temp = NULL;
++
++	/* check if open or lock stateid */
++	local = find_stateid(stateid, RD_STATE);
++	if (local)
++		return 0;
++	temp = find_delegation_stateid(fp->fi_inode, stateid);
++	if (temp)
++		return 0;
++	return nfserr_bad_stateid;
++}
++
++/*
++ * nfs4_process_layout_stateid()
++ *
++ * We have looked up the nfs4_file corresponding to the current_fh, and
++ * confirmed the clientid. Pull the few tests from nfs4_preprocess_stateid_op()
++ * that make sense with a layout stateid.
++ *
++ * Called with the state_lock held
++ * Returns zero and stateid is updated, or error.
++ *
++ * Note: the struct nfs4_layout_state pointer is only set by layoutget.
++ */
++static __be32
++nfs4_process_layout_stateid(struct nfs4_client *clp, struct nfs4_file *fp,
++			stateid_t *stateid, struct nfs4_layout_state **lsp)
++{
++	struct nfs4_layout_state *ls = NULL;
++	__be32 status = 0;
++
++	dprintk("--> %s clp %p fp %p \n", __func__, clp, fp);
++
++	dprintk("%s: operation stateid=" STATEID_FMT "\n", __func__,
++		STATEID_VAL(stateid));
++
++	status = nfs4_check_stateid(stateid);
++	if (status)
++		goto out;
++
++	/* Is this the first use of this layout ? */
++	spin_lock(&layout_lock);
++	ls = find_get_layout_state(clp, fp);
++	spin_unlock(&layout_lock);
++	if (!ls) {
++		/* Only alloc layout state on layoutget (which sets lsp). */
++		if (!lsp) {
++			dprintk("%s ERROR: Not layoutget & no layout stateid\n",
++				__func__);
++			status = nfserr_bad_stateid;
++			goto out;
++		}
++		dprintk("%s Initial stateid for layout: file %p client %p\n",
++			__func__, fp, clp);
++
++		/* verify input stateid */
++		status = verify_stateid(fp, stateid);
++		if (status) {
++			dprintk("%s ERROR: invalid open/deleg/lock stateid\n",
++				__func__);
++			goto out;
++		}
++		ls = alloc_init_layout_state(clp, fp, stateid);
++		if (!ls) {
++			dprintk("%s pNFS ERROR: no memory for layout state\n",
++				__func__);
++			status = nfserr_resource;
++			goto out;
++		}
++	} else {
++		dprintk("%s Not initial stateid. Layout state %p file %p\n",
++			__func__, ls, fp);
++
++		/* BAD STATEID */
++		status = nfserr_bad_stateid;
++		if (memcmp(&ls->ls_stateid.si_opaque, &stateid->si_opaque,
++			   sizeof(stateid_opaque_t)) != 0) {
++
++			/* if a LAYOUTGET operation and stateid is a valid
++			 * open/deleg/lock stateid, accept it as a parallel
++			 * initial layout stateid
++			 */
++			if (lsp && ((verify_stateid(fp, stateid)) == 0)) {
++				dprintk("%s parallel initial layout state\n",
++					__func__);
++				goto update;
++			}
++
++			dprintk("%s ERROR bad opaque in stateid 1\n", __func__);
++			goto out_put;
++		}
++
++		/* stateid is a valid layout stateid for this file.
*/ ++ if (stateid->si_generation > ls->ls_stateid.si_generation) { ++ dprintk("%s bad stateid 1\n", __func__); ++ goto out_put; ++ } ++update: ++ update_stateid(&ls->ls_stateid); ++ dprintk("%s Updated ls_stateid to %d on layoutstate %p\n", ++ __func__, ls->ls_stateid.si_generation, ls); ++ } ++ status = 0; ++ /* Set the stateid to be encoded */ ++ memcpy(stateid, &ls->ls_stateid, sizeof(stateid_t)); ++ ++ /* Return the layout state if requested */ ++ if (lsp) { ++ get_layout_state(ls); ++ *lsp = ls; ++ } ++ dprintk("%s: layout stateid=" STATEID_FMT "\n", __func__, ++ STATEID_VAL(&ls->ls_stateid)); ++out_put: ++ dprintk("%s PUT LO STATE:\n", __func__); ++ put_layout_state(ls); ++out: ++ dprintk("<-- %s status %d\n", __func__, htonl(status)); ++ ++ return status; ++} ++ ++static inline struct nfs4_layout * ++alloc_layout(void) ++{ ++ return kmem_cache_alloc(pnfs_layout_slab, GFP_KERNEL); ++} ++ ++static inline void ++free_layout(struct nfs4_layout *lp) ++{ ++ kmem_cache_free(pnfs_layout_slab, lp); ++} ++ ++static void ++init_layout(struct nfs4_layout_state *ls, ++ struct nfs4_layout *lp, ++ struct nfs4_file *fp, ++ struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ dprintk("pNFS %s: ls %p lp %p clp %p fp %p ino %p\n", __func__, ++ ls, lp, clp, fp, fp->fi_inode); ++ ++ get_nfs4_file(fp); ++ lp->lo_client = clp; ++ lp->lo_file = fp; ++ get_layout_state(ls); ++ lp->lo_state = ls; ++ memcpy(&lp->lo_seg, seg, sizeof(lp->lo_seg)); ++ spin_lock(&layout_lock); ++ list_add_tail(&lp->lo_perstate, &ls->ls_layouts); ++ list_add_tail(&lp->lo_perclnt, &clp->cl_layouts); ++ list_add_tail(&lp->lo_perfile, &fp->fi_layouts); ++ spin_unlock(&layout_lock); ++ dprintk("pNFS %s end\n", __func__); ++} ++ ++static void ++dequeue_layout(struct nfs4_layout *lp) ++{ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del(&lp->lo_perclnt); ++ list_del(&lp->lo_perfile); ++ list_del(&lp->lo_perstate); ++} ++ ++static void ++destroy_layout(struct nfs4_layout *lp) ++{ ++ struct nfs4_client *clp; ++ struct nfs4_file *fp; ++ struct nfs4_layout_state *ls; ++ ++ BUG_ON_UNLOCKED_LAYOUT(); ++ clp = lp->lo_client; ++ fp = lp->lo_file; ++ ls = lp->lo_state; ++ dprintk("pNFS %s: lp %p clp %p fp %p ino %p ls_layouts empty %d\n", ++ __func__, lp, clp, fp, fp->fi_inode, ++ list_empty(&ls->ls_layouts)); ++ ++ kmem_cache_free(pnfs_layout_slab, lp); ++ /* release references taken by init_layout */ ++ put_layout_state_locked(ls); ++ put_nfs4_file(fp); ++} ++ ++void fs_layout_return(struct super_block *sb, struct inode *ino, ++ struct nfsd4_pnfs_layoutreturn *lrp, int flags, ++ void *recall_cookie) ++{ ++ int ret; ++ ++ if (unlikely(!sb->s_pnfs_op->layout_return)) ++ return; ++ ++ lrp->lr_flags = flags; ++ lrp->args.lr_cookie = recall_cookie; ++ ++ if (!ino) /* FSID or ALL */ ++ ino = sb->s_root->d_inode; ++ ++ ret = sb->s_pnfs_op->layout_return(ino, &lrp->args); ++ dprintk("%s: inode %lu iomode=%d offset=0x%llx length=0x%llx " ++ "cookie = %p flags 0x%x status=%d\n", ++ __func__, ino->i_ino, lrp->args.lr_seg.iomode, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, ++ recall_cookie, flags, ret); ++} ++ ++static u64 ++alloc_init_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ struct sbid_tracker *new = alloc_sbid(); ++ unsigned long hash_idx = sbid_hashval(sb); ++ u64 id = 0; ++ ++ if (likely(new)) { ++ spin_lock(&layout_lock); ++ id = ++current_sbid; ++ new->id = (id << SBID_HASH_BITS) | (hash_idx & SBID_HASH_MASK); ++ id = new->id; ++ BUG_ON(id == 0); ++ new->sb = sb; ++ ++ 
list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) ++ if (sbid->sb == sb) { ++ kfree(new); ++ id = sbid->id; ++ spin_unlock(&layout_lock); ++ return id; ++ } ++ list_add(&new->hash, &sbid_hashtbl[hash_idx]); ++ spin_unlock(&layout_lock); ++ } ++ return id; ++} ++ ++struct super_block * ++find_sbid_id(u64 id) ++{ ++ struct sbid_tracker *sbid; ++ struct super_block *sb = NULL; ++ unsigned long hash_idx = id & SBID_HASH_MASK; ++ int pos = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->id != id) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ sb = sbid->sb; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return sb; ++} ++ ++u64 ++find_create_sbid(struct super_block *sb) ++{ ++ struct sbid_tracker *sbid; ++ unsigned long hash_idx = sbid_hashval(sb); ++ int pos = 0; ++ u64 id = 0; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (sbid, &sbid_hashtbl[hash_idx], hash) { ++ pos++; ++ if (sbid->sb != sb) ++ continue; ++ if (pos > 1) ++ list_move(&sbid->hash, &sbid_hashtbl[hash_idx]); ++ id = sbid->id; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ if (!id) ++ id = alloc_init_sbid(sb); ++ ++ return id; ++} ++ ++/* ++ * Create a layoutrecall structure ++ * An optional layoutrecall can be cloned (except for the layoutrecall lists) ++ */ ++static struct nfs4_layoutrecall * ++alloc_init_layoutrecall(struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_client *clp, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ dprintk("NFSD %s\n", __func__); ++ clr = kmem_cache_alloc(pnfs_layoutrecall_slab, GFP_KERNEL); ++ if (clr == NULL) ++ return clr; ++ ++ dprintk("NFSD %s -->\n", __func__); ++ ++ memset(clr, 0, sizeof(*clr)); ++ if (lrfile) ++ get_nfs4_file(lrfile); ++ clr->clr_client = clp; ++ clr->clr_file = lrfile; ++ clr->cb = *cbl; ++ ++ kref_init(&clr->clr_ref); ++ INIT_LIST_HEAD(&clr->clr_perclnt); ++ ++ dprintk("NFSD %s return %p\n", __func__, clr); ++ return clr; ++} ++ ++static void ++get_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ kref_get(&clr->clr_ref); ++} ++ ++static void ++destroy_layoutrecall(struct kref *kref) ++{ ++ struct nfs4_layoutrecall *clr = ++ container_of(kref, struct nfs4_layoutrecall, clr_ref); ++ dprintk("pNFS %s: clr %p fp %p clp %p\n", __func__, clr, ++ clr->clr_file, clr->clr_client); ++ BUG_ON(!list_empty(&clr->clr_perclnt)); ++ if (clr->clr_file) ++ put_nfs4_file(clr->clr_file); ++ kmem_cache_free(pnfs_layoutrecall_slab, clr); ++} ++ ++int ++put_layoutrecall(struct nfs4_layoutrecall *clr) ++{ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ return kref_put(&clr->clr_ref, destroy_layoutrecall); ++} ++ ++void * ++layoutrecall_done(struct nfs4_layoutrecall *clr) ++{ ++ void *recall_cookie = clr->cb.cbl_cookie; ++ struct nfs4_layoutrecall *parent = clr->parent; ++ ++ dprintk("pNFS %s: clr %p clr_ref %d\n", __func__, clr, ++ atomic_read(&clr->clr_ref.refcount)); ++ BUG_ON_UNLOCKED_LAYOUT(); ++ list_del_init(&clr->clr_perclnt); ++ put_layoutrecall(clr); ++ ++ if (parent && !put_layoutrecall(parent)) ++ recall_cookie = NULL; ++ ++ return recall_cookie; ++} ++ ++/* ++ * get_state() and cb_get_state() are ++ */ ++void ++release_pnfs_ds_dev_list(struct nfs4_stateid *stp) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ while (!list_empty(&stp->st_pnfs_ds_id)) { ++ ddp = 
list_entry(stp->st_pnfs_ds_id.next, ++ struct pnfs_ds_dev_entry, dd_dev_entry); ++ list_del(&ddp->dd_dev_entry); ++ kfree(ddp); ++ } ++} ++ ++static int ++nfs4_add_pnfs_ds_dev(struct nfs4_stateid *stp, u32 dsid) ++{ ++ struct pnfs_ds_dev_entry *ddp; ++ ++ ddp = kmalloc(sizeof(*ddp), GFP_KERNEL); ++ if (!ddp) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&ddp->dd_dev_entry); ++ list_add(&ddp->dd_dev_entry, &stp->st_pnfs_ds_id); ++ ddp->dd_dsid = dsid; ++ return 0; ++} ++ ++/* ++ * are two octet ranges overlapping? ++ * start1 last1 ++ * |-----------------| ++ * start2 last2 ++ * |----------------| ++ */ ++static inline int ++lo_seg_overlapping(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 last1 = last_byte_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 last2 = last_byte_offset(start2, l2->length); ++ int ret; ++ ++ /* if last1 == start2 there's a single byte overlap */ ++ ret = (last2 >= start1) && (last1 >= start2); ++ dprintk("%s: l1 %llu:%lld l2 %llu:%lld ret=%d\n", __func__, ++ l1->offset, l1->length, l2->offset, l2->length, ret); ++ return ret; ++} ++ ++static inline int ++same_fsid_major(struct nfs4_fsid *fsid, u64 major) ++{ ++ return fsid->major == major; ++} ++ ++static inline int ++same_fsid(struct nfs4_fsid *fsid, struct svc_fh *current_fh) ++{ ++ return same_fsid_major(fsid, current_fh->fh_export->ex_fsid); ++} ++ ++/* ++ * find a layout recall conflicting with the specified layoutget ++ */ ++static int ++is_layout_recalled(struct nfs4_client *clp, ++ struct svc_fh *current_fh, ++ struct nfsd4_layout_seg *seg) ++{ ++ struct nfs4_layoutrecall *clr; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry (clr, &clp->cl_layoutrecalls, clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != seg->layout_type) ++ continue; ++ if (clr->cb.cbl_recall_type == RETURN_ALL) ++ goto found; ++ if (clr->cb.cbl_recall_type == RETURN_FSID) { ++ if (same_fsid(&clr->cb.cbl_fsid, current_fh)) ++ goto found; ++ else ++ continue; ++ } ++ BUG_ON(clr->cb.cbl_recall_type != RETURN_FILE); ++ if (clr->cb.cbl_seg.clientid == seg->clientid && ++ lo_seg_overlapping(&clr->cb.cbl_seg, seg)) ++ goto found; ++ } ++ spin_unlock(&layout_lock); ++ return 0; ++found: ++ spin_unlock(&layout_lock); ++ return 1; ++} ++ ++/* ++ * are two octet ranges overlapping or adjacent? ++ */ ++static inline int ++lo_seg_mergeable(struct nfsd4_layout_seg *l1, struct nfsd4_layout_seg *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ /* is end1 == start2 ranges are adjacent */ ++ return (end2 >= start1) && (end1 >= start2); ++} ++ ++static void ++extend_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lg) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lg_start = lg->offset; ++ u64 lg_end = end_offset(lg_start, lg->length); ++ ++ /* lo already covers lg? */ ++ if (lo_start <= lg_start && lg_end <= lo_end) ++ return; ++ ++ /* extend start offset */ ++ if (lo_start > lg_start) ++ lo_start = lg_start; ++ ++ /* extend end offset */ ++ if (lo_end < lg_end) ++ lo_end = lg_end; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? 
++		lo_end : lo_end - lo_start;
++}
++
++static struct nfs4_layout *
++merge_layout(struct nfs4_file *fp,
++	     struct nfs4_client *clp,
++	     struct nfsd4_layout_seg *seg)
++{
++	struct nfs4_layout *lp = NULL;
++
++	spin_lock(&layout_lock);
++	list_for_each_entry (lp, &fp->fi_layouts, lo_perfile)
++		if (lp->lo_seg.layout_type == seg->layout_type &&
++		    lp->lo_seg.clientid == seg->clientid &&
++		    lp->lo_seg.iomode == seg->iomode &&
++		    lo_seg_mergeable(&lp->lo_seg, seg)) {
++			extend_layout(&lp->lo_seg, seg);
++			break;
++		}
++	spin_unlock(&layout_lock);
++
++	return lp;
++}
++
++__be32
++nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *lgp,
++		     struct exp_xdr_stream *xdr)
++{
++	u32 status;
++	__be32 nfserr;
++	struct inode *ino = lgp->lg_fhp->fh_dentry->d_inode;
++	struct super_block *sb = ino->i_sb;
++	int can_merge;
++	struct nfs4_file *fp;
++	struct nfs4_client *clp;
++	struct nfs4_layout *lp = NULL;
++	struct nfs4_layout_state *ls = NULL;
++	struct nfsd4_pnfs_layoutget_arg args = {
++		.lg_minlength = lgp->lg_minlength,
++		.lg_fh = &lgp->lg_fhp->fh_handle,
++	};
++	struct nfsd4_pnfs_layoutget_res res = {
++		.lg_seg = lgp->lg_seg,
++	};
++
++	dprintk("NFSD: %s Begin\n", __func__);
++
++	args.lg_sbid = find_create_sbid(sb);
++	if (!args.lg_sbid) {
++		nfserr = nfserr_layouttrylater;
++		goto out;
++	}
++
++	can_merge = sb->s_pnfs_op->can_merge_layouts != NULL &&
++		    sb->s_pnfs_op->can_merge_layouts(lgp->lg_seg.layout_type);
++
++	nfs4_lock_state();
++	fp = find_alloc_file(ino, lgp->lg_fhp);
++	clp = find_confirmed_client((clientid_t *)&lgp->lg_seg.clientid);
++	dprintk("pNFS %s: fp %p clp %p \n", __func__, fp, clp);
++	if (!fp || !clp) {
++		nfserr = nfserr_inval;
++		goto out_unlock;
++	}
++
++	/* Check decoded layout stateid */
++	nfserr = nfs4_process_layout_stateid(clp, fp, &lgp->lg_sid, &ls);
++	if (nfserr)
++		goto out_unlock;
++
++	if (is_layout_recalled(clp, lgp->lg_fhp, &lgp->lg_seg)) {
++		nfserr = nfserr_recallconflict;
++		goto out;
++	}
++
++	/* pre-alloc layout in case we can't merge after we call
++	 * the file system
++	 */
++	lp = alloc_layout();
++	if (!lp) {
++		nfserr = nfserr_layouttrylater;
++		goto out_unlock;
++	}
++
++	dprintk("pNFS %s: pre-export type 0x%x maxcount %Zd "
++		"iomode %u offset %llu length %llu\n",
++		__func__, lgp->lg_seg.layout_type,
++		exp_xdr_qbytes(xdr->end - xdr->p),
++		lgp->lg_seg.iomode, lgp->lg_seg.offset, lgp->lg_seg.length);
++
++	/* FIXME: need to eliminate the use of the state lock */
++	nfs4_unlock_state();
++	status = sb->s_pnfs_op->layout_get(ino, xdr, &args, &res);
++	nfs4_lock_state();
++
++	dprintk("pNFS %s: post-export status %u "
++		"iomode %u offset %llu length %llu\n",
++		__func__, status, res.lg_seg.iomode,
++		res.lg_seg.offset, res.lg_seg.length);
++
++	/*
++	 * The allowable error codes for the layout_get pNFS export
++	 * operations vector function (from the file system) can be
++	 * expanded as needed to include other errors defined for
++	 * the RFC 5661 LAYOUTGET operation.
++ */ ++ switch (status) { ++ case 0: ++ nfserr = NFS4_OK; ++ break; ++ case NFS4ERR_ACCESS: ++ case NFS4ERR_BADIOMODE: ++ /* No support for LAYOUTIOMODE4_RW layouts */ ++ case NFS4ERR_BADLAYOUT: ++ /* No layout matching loga_minlength rules */ ++ case NFS4ERR_INVAL: ++ case NFS4ERR_IO: ++ case NFS4ERR_LAYOUTTRYLATER: ++ case NFS4ERR_LAYOUTUNAVAILABLE: ++ case NFS4ERR_LOCKED: ++ case NFS4ERR_NOSPC: ++ case NFS4ERR_RECALLCONFLICT: ++ case NFS4ERR_SERVERFAULT: ++ case NFS4ERR_TOOSMALL: ++ /* Requested layout too big for loga_maxcount */ ++ case NFS4ERR_WRONG_TYPE: ++ /* Not a regular file */ ++ nfserr = cpu_to_be32(status); ++ goto out_freelayout; ++ default: ++ BUG(); ++ nfserr = nfserr_serverfault; ++ } ++ ++ lgp->lg_seg = res.lg_seg; ++ lgp->lg_roc = res.lg_return_on_close; ++ ++ /* SUCCESS! ++ * Can the new layout be merged into an existing one? ++ * If so, free unused layout struct ++ */ ++ if (can_merge && merge_layout(fp, clp, &res.lg_seg)) ++ goto out_freelayout; ++ ++ /* Can't merge, so let's initialize this new layout */ ++ init_layout(ls, lp, fp, clp, lgp->lg_fhp, &res.lg_seg); ++out_unlock: ++ if (ls) ++ put_layout_state(ls); ++ if (fp) ++ put_nfs4_file(fp); ++ nfs4_unlock_state(); ++out: ++ dprintk("pNFS %s: lp %p exit nfserr %u\n", __func__, lp, ++ be32_to_cpu(nfserr)); ++ return nfserr; ++out_freelayout: ++ free_layout(lp); ++ goto out_unlock; ++} ++ ++static void ++trim_layout(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *lr) ++{ ++ u64 lo_start = lo->offset; ++ u64 lo_end = end_offset(lo_start, lo->length); ++ u64 lr_start = lr->offset; ++ u64 lr_end = end_offset(lr_start, lr->length); ++ ++ dprintk("%s:Begin lo %llu:%lld lr %llu:%lld\n", __func__, ++ lo->offset, lo->length, lr->offset, lr->length); ++ ++ /* lr fully covers lo? */ ++ if (lr_start <= lo_start && lo_end <= lr_end) { ++ lo->length = 0; ++ goto out; ++ } ++ ++ /* ++ * split not supported yet. retain layout segment. ++ * remains must be returned by the client ++ * on the final layout return. ++ */ ++ if (lo_start < lr_start && lr_end < lo_end) { ++ dprintk("%s: split not supported\n", __func__); ++ goto out; ++ } ++ ++ if (lo_start < lr_start) ++ lo_end = lr_start - 1; ++ else /* lr_end < lo_end */ ++ lo_start = lr_end + 1; ++ ++ lo->offset = lo_start; ++ lo->length = (lo_end == NFS4_MAX_UINT64) ? 
lo_end : lo_end - lo_start; ++out: ++ dprintk("%s:End lo %llu:%lld\n", __func__, lo->offset, lo->length); ++} ++ ++static int ++pnfs_return_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ dprintk("%s: clp %p fp %p\n", __func__, clp, fp); ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &fp->fi_layouts, lo_perfile) { ++ dprintk("%s: lp %p client %p,%p lo_type %x,%x iomode %d,%d\n", ++ __func__, lp, ++ lp->lo_client, clp, ++ lp->lo_seg.layout_type, lrp->args.lr_seg.layout_type, ++ lp->lo_seg.iomode, lrp->args.lr_seg.iomode); ++ if (lp->lo_client != clp || ++ lp->lo_seg.layout_type != lrp->args.lr_seg.layout_type || ++ (lp->lo_seg.iomode != lrp->args.lr_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) || ++ !lo_seg_overlapping(&lp->lo_seg, &lrp->args.lr_seg)) ++ continue; ++ layouts_found++; ++ trim_layout(&lp->lo_seg, &lrp->args.lr_seg); ++ if (!lp->lo_seg.length) { ++ lrp->lrs_present = 0; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++pnfs_return_client_layouts(struct nfs4_client *clp, ++ struct nfsd4_pnfs_layoutreturn *lrp, u64 ex_fsid) ++{ ++ int layouts_found = 0; ++ struct nfs4_layout *lp, *nextlp; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (lp, nextlp, &clp->cl_layouts, lo_perclnt) { ++ if (lrp->args.lr_seg.layout_type != lp->lo_seg.layout_type || ++ (lrp->args.lr_seg.iomode != lp->lo_seg.iomode && ++ lrp->args.lr_seg.iomode != IOMODE_ANY)) ++ continue; ++ ++ if (lrp->args.lr_return_type == RETURN_FSID && ++ !same_fsid_major(&lp->lo_file->fi_fsid, ex_fsid)) ++ continue; ++ ++ layouts_found++; ++ dequeue_layout(lp); ++ destroy_layout(lp); ++ } ++ spin_unlock(&layout_lock); ++ ++ return layouts_found; ++} ++ ++static int ++recall_return_perfect_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode || ++ clr->cb.cbl_recall_type != lrp->args.lr_return_type) ++ return 0; ++ ++ return (clr->cb.cbl_recall_type == RETURN_FILE && ++ clr->clr_file == fp && ++ clr->cb.cbl_seg.offset == lrp->args.lr_seg.offset && ++ clr->cb.cbl_seg.length == lrp->args.lr_seg.length) || ++ ++ (clr->cb.cbl_recall_type == RETURN_FSID && ++ same_fsid(&clr->cb.cbl_fsid, current_fh)) || ++ ++ clr->cb.cbl_recall_type == RETURN_ALL; ++} ++ ++static int ++recall_return_partial_match(struct nfs4_layoutrecall *clr, ++ struct nfsd4_pnfs_layoutreturn *lrp, ++ struct nfs4_file *fp, ++ struct svc_fh *current_fh) ++{ ++ /* iomode matching? */ ++ if (clr->cb.cbl_seg.iomode != lrp->args.lr_seg.iomode && ++ clr->cb.cbl_seg.iomode != IOMODE_ANY && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) ++ return 0; ++ ++ if (clr->cb.cbl_recall_type == RETURN_ALL || ++ lrp->args.lr_return_type == RETURN_ALL) ++ return 1; ++ ++ /* fsid matches? */ ++ if (clr->cb.cbl_recall_type == RETURN_FSID || ++ lrp->args.lr_return_type == RETURN_FSID) ++ return same_fsid(&clr->cb.cbl_fsid, current_fh); ++ ++ /* file matches, range overlapping? 
*/ ++ return clr->clr_file == fp && ++ lo_seg_overlapping(&clr->cb.cbl_seg, &lrp->args.lr_seg); ++} ++ ++int nfs4_pnfs_return_layout(struct super_block *sb, struct svc_fh *current_fh, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status = 0; ++ int layouts_found = 0; ++ struct inode *ino = current_fh->fh_dentry->d_inode; ++ struct nfs4_file *fp = NULL; ++ struct nfs4_client *clp; ++ struct nfs4_layoutrecall *clr, *nextclr; ++ u64 ex_fsid = current_fh->fh_export->ex_fsid; ++ void *recall_cookie = NULL; ++ ++ dprintk("NFSD: %s\n", __func__); ++ ++ nfs4_lock_state(); ++ clp = find_confirmed_client((clientid_t *)&lrp->args.lr_seg.clientid); ++ if (!clp) ++ goto out; ++ ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ fp = find_file(ino); ++ if (!fp) { ++ printk(KERN_ERR "%s: RETURN_FILE: no nfs4_file for " ++ "ino %p:%lu\n", ++ __func__, ino, ino ? ino->i_ino : 0L); ++ goto out; ++ } ++ ++ /* Check the stateid */ ++ dprintk("%s PROCESS LO_STATEID inode %p\n", __func__, ino); ++ status = nfs4_process_layout_stateid(clp, fp, &lrp->lr_sid, ++ NULL); ++ if (status) ++ goto out_put_file; ++ ++ /* update layouts */ ++ layouts_found = pnfs_return_file_layouts(clp, fp, lrp); ++ /* optimize for the all-empty case */ ++ if (list_empty(&fp->fi_layouts)) ++ recall_cookie = PNFS_LAST_LAYOUT_NO_RECALLS; ++ } else { ++ layouts_found = pnfs_return_client_layouts(clp, lrp, ex_fsid); ++ } ++ ++ dprintk("pNFS %s: clp %p fp %p layout_type 0x%x iomode %d " ++ "return_type %d fsid 0x%llx offset %llu length %llu: " ++ "layouts_found %d\n", ++ __func__, clp, fp, lrp->args.lr_seg.layout_type, ++ lrp->args.lr_seg.iomode, lrp->args.lr_return_type, ++ ex_fsid, ++ lrp->args.lr_seg.offset, lrp->args.lr_seg.length, layouts_found); ++ ++ /* update layoutrecalls ++ * note: for RETURN_{FSID,ALL}, fp may be NULL ++ */ ++ spin_lock(&layout_lock); ++ list_for_each_entry_safe (clr, nextclr, &clp->cl_layoutrecalls, ++ clr_perclnt) { ++ if (clr->cb.cbl_seg.layout_type != lrp->args.lr_seg.layout_type) ++ continue; ++ ++ if (recall_return_perfect_match(clr, lrp, fp, current_fh)) ++ recall_cookie = layoutrecall_done(clr); ++ else if (layouts_found && ++ recall_return_partial_match(clr, lrp, fp, current_fh)) ++ clr->clr_time = CURRENT_TIME; ++ } ++ spin_unlock(&layout_lock); ++ ++out_put_file: ++ if (fp) ++ put_nfs4_file(fp); ++out: ++ nfs4_unlock_state(); ++ ++ /* call exported filesystem layout_return (ignore return-code) */ ++ fs_layout_return(sb, ino, lrp, 0, recall_cookie); ++ ++ dprintk("pNFS %s: exit status %d \n", __func__, status); ++ return status; ++} ++ ++/* ++ * PNFS Metadata server export operations callback for get_state ++ * ++ * called by the cluster fs when it receives a get_state() from a data ++ * server. ++ * returns status, or pnfs_get_state* with pnfs_get_state->status set. 
++ * ++ */ ++int ++nfs4_pnfs_cb_get_state(struct super_block *sb, struct pnfs_get_state *arg) ++{ ++ struct nfs4_stateid *stp; ++ int flags = LOCK_STATE | OPEN_STATE; /* search both hash tables */ ++ int status = -EINVAL; ++ struct inode *ino; ++ struct nfs4_delegation *dl; ++ stateid_t *stid = (stateid_t *)&arg->stid; ++ ++ dprintk("NFSD: %s sid=" STATEID_FMT " ino %llu\n", __func__, ++ STATEID_VAL(stid), arg->ino); ++ ++ nfs4_lock_state(); ++ stp = find_stateid(stid, flags); ++ if (!stp) { ++ ino = iget_locked(sb, arg->ino); ++ if (!ino) ++ goto out; ++ ++ if (ino->i_state & I_NEW) { ++ iget_failed(ino); ++ goto out; ++ } ++ ++ dl = find_delegation_stateid(ino, stid); ++ if (dl) ++ status = 0; ++ ++ iput(ino); ++ } else { ++ /* XXX ANDROS: marc removed nfs4_check_fh - how come? */ ++ ++ /* arg->devid is the Data server id, set by the cluster fs */ ++ status = nfs4_add_pnfs_ds_dev(stp, arg->dsid); ++ if (status) ++ goto out; ++ ++ arg->access = stp->st_access_bmap; ++ *(clientid_t *)&arg->clid = ++ stp->st_stateowner->so_client->cl_clientid; ++ } ++out: ++ nfs4_unlock_state(); ++ return status; ++} ++ ++static int ++cl_has_file_layout(struct nfs4_client *clp, struct nfs4_file *lrfile, ++ stateid_t *lsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ struct nfs4_layout_state *ls; ++ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) { ++ if (lp->lo_file != lrfile) ++ continue; ++ ++ ls = find_get_layout_state(clp, lrfile); ++ if (!ls) { ++ /* This shouldn't happen as the file should have a ++ * layout stateid if it has a layout. ++ */ ++ printk(KERN_ERR "%s: file %p has no layout stateid\n", ++ __func__, lrfile); ++ WARN_ON(1); ++ break; ++ } ++ update_stateid(&ls->ls_stateid); ++ memcpy(lsid, &ls->ls_stateid, sizeof(stateid_t)); ++ put_layout_state_locked(ls); ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ ++ return found; ++} ++ ++static int ++cl_has_fsid_layout(struct nfs4_client *clp, struct nfs4_fsid *fsid) ++{ ++ int found = 0; ++ struct nfs4_layout *lp; ++ ++ /* note: minor version unused */ ++ spin_lock(&layout_lock); ++ list_for_each_entry(lp, &clp->cl_layouts, lo_perclnt) ++ if (lp->lo_file->fi_fsid.major == fsid->major) { ++ found = 1; ++ break; ++ } ++ spin_unlock(&layout_lock); ++ return found; ++} ++ ++static int ++cl_has_any_layout(struct nfs4_client *clp) ++{ ++ return !list_empty(&clp->cl_layouts); ++} ++ ++static int ++cl_has_layout(struct nfs4_client *clp, struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile, stateid_t *lsid) ++{ ++ switch (cbl->cbl_recall_type) { ++ case RETURN_FILE: ++ return cl_has_file_layout(clp, lrfile, lsid); ++ case RETURN_FSID: ++ return cl_has_fsid_layout(clp, &cbl->cbl_fsid); ++ default: ++ return cl_has_any_layout(clp); ++ } ++} ++ ++/* ++ * Called without the layout_lock. 
++ */ ++void ++nomatching_layout(struct nfs4_layoutrecall *clr) ++{ ++ struct nfsd4_pnfs_layoutreturn lr = { ++ .args.lr_return_type = clr->cb.cbl_recall_type, ++ .args.lr_seg = clr->cb.cbl_seg, ++ }; ++ struct inode *inode; ++ void *recall_cookie; ++ ++ if (clr->clr_file) { ++ inode = igrab(clr->clr_file->fi_inode); ++ if (WARN_ON(!inode)) ++ return; ++ } else { ++ inode = NULL; ++ } ++ ++ dprintk("%s: clp %p fp %p: simulating layout_return\n", __func__, ++ clr->clr_client, clr->clr_file); ++ ++ if (clr->cb.cbl_recall_type == RETURN_FILE) ++ pnfs_return_file_layouts(clr->clr_client, clr->clr_file, &lr); ++ else ++ pnfs_return_client_layouts(clr->clr_client, &lr, ++ clr->cb.cbl_fsid.major); ++ ++ spin_lock(&layout_lock); ++ recall_cookie = layoutrecall_done(clr); ++ spin_unlock(&layout_lock); ++ ++ fs_layout_return(clr->clr_sb, inode, &lr, LR_FLAG_INTERN, ++ recall_cookie); ++ iput(inode); ++} ++ ++void pnfs_expire_client(struct nfs4_client *clp) ++{ ++ for (;;) { ++ struct nfs4_layoutrecall *lrp = NULL; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layoutrecalls)) { ++ lrp = list_entry(clp->cl_layoutrecalls.next, ++ struct nfs4_layoutrecall, clr_perclnt); ++ get_layoutrecall(lrp); ++ } ++ spin_unlock(&layout_lock); ++ if (!lrp) ++ break; ++ ++ dprintk("%s: lrp %p, fp %p\n", __func__, lrp, lrp->clr_file); ++ BUG_ON(lrp->clr_client != clp); ++ nomatching_layout(lrp); ++ put_layoutrecall(lrp); ++ } ++ ++ for (;;) { ++ struct nfs4_layout *lp = NULL; ++ struct inode *inode = NULL; ++ struct nfsd4_pnfs_layoutreturn lr; ++ bool empty = false; ++ ++ spin_lock(&layout_lock); ++ if (!list_empty(&clp->cl_layouts)) { ++ lp = list_entry(clp->cl_layouts.next, ++ struct nfs4_layout, lo_perclnt); ++ inode = igrab(lp->lo_file->fi_inode); ++ memset(&lr, 0, sizeof(lr)); ++ lr.args.lr_return_type = RETURN_FILE; ++ lr.args.lr_seg = lp->lo_seg; ++ empty = list_empty(&lp->lo_file->fi_layouts); ++ BUG_ON(lp->lo_client != clp); ++ dequeue_layout(lp); ++ destroy_layout(lp); /* do not access lp after this */ ++ } ++ spin_unlock(&layout_lock); ++ if (!lp) ++ break; ++ ++ if (WARN_ON(!inode)) ++ break; ++ ++ dprintk("%s: inode %lu lp %p clp %p\n", __func__, inode->i_ino, ++ lp, clp); ++ ++ fs_layout_return(inode->i_sb, inode, &lr, LR_FLAG_EXPIRE, ++ empty ? PNFS_LAST_LAYOUT_NO_RECALLS : NULL); ++ iput(inode); ++ } ++} ++ ++struct create_recall_list_arg { ++ struct nfsd4_pnfs_cb_layout *cbl; ++ struct nfs4_file *lrfile; ++ struct list_head *todolist; ++ unsigned todo_count; ++}; ++ ++/* ++ * look for matching layout for the given client ++ * and add a pending layout recall to the todo list ++ * if found any. ++ * returns: ++ * 0 if layouts found or negative error. ++ */ ++static int ++lo_recall_per_client(struct nfs4_client *clp, void *p) ++{ ++ stateid_t lsid; ++ struct nfs4_layoutrecall *pending; ++ struct create_recall_list_arg *arg = p; ++ ++ memset(&lsid, 0, sizeof(lsid)); ++ if (!cl_has_layout(clp, arg->cbl, arg->lrfile, &lsid)) ++ return 0; ++ ++ /* Matching put done by layoutreturn */ ++ pending = alloc_init_layoutrecall(arg->cbl, clp, arg->lrfile); ++ /* out of memory, drain todo queue */ ++ if (!pending) ++ return -ENOMEM; ++ ++ *(stateid_t *)&pending->cb.cbl_sid = lsid; ++ list_add(&pending->clr_perclnt, arg->todolist); ++ arg->todo_count++; ++ return 0; ++} ++ ++/* Create a layoutrecall structure for each client based on the ++ * original structure. 
*/ ++int ++create_layout_recall_list(struct list_head *todolist, unsigned *todo_len, ++ struct nfsd4_pnfs_cb_layout *cbl, ++ struct nfs4_file *lrfile) ++{ ++ struct nfs4_client *clp; ++ struct create_recall_list_arg arg = { ++ .cbl = cbl, ++ .lrfile = lrfile, ++ .todolist = todolist, ++ }; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ /* If client given by fs, just do single client */ ++ if (cbl->cbl_seg.clientid) { ++ clp = find_confirmed_client( ++ (clientid_t *)&cbl->cbl_seg.clientid); ++ if (!clp) { ++ status = -ENOENT; ++ dprintk("%s: clientid %llx not found\n", __func__, ++ (unsigned long long)cbl->cbl_seg.clientid); ++ goto out; ++ } ++ ++ status = lo_recall_per_client(clp, &arg); ++ } else { ++ /* Check all clients for layout matches */ ++ status = filter_confirmed_clients(lo_recall_per_client, &arg); ++ } ++ ++out: ++ *todo_len = arg.todo_count; ++ dprintk("%s: <-- list len %u status %d\n", __func__, *todo_len, status); ++ return status; ++} ++ ++/* ++ * Recall layouts asynchronously ++ * Called with state lock. ++ */ ++static int ++spawn_layout_recall(struct super_block *sb, struct list_head *todolist, ++ unsigned todo_len) ++{ ++ struct nfs4_layoutrecall *pending; ++ struct nfs4_layoutrecall *parent = NULL; ++ int status = 0; ++ ++ dprintk("%s: -->\n", __func__); ++ ++ if (todo_len > 1) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ ++ parent = alloc_init_layoutrecall(&pending->cb, NULL, ++ pending->clr_file); ++ if (unlikely(!parent)) { ++ /* We want forward progress. If parent cannot be ++ * allocated take the first one as parent but don't ++ * execute it. Caller must check for -EAGAIN, if so ++ * When the partial recalls return, ++ * nfsd_layout_recall_cb should be called again. ++ */ ++ list_del_init(&pending->clr_perclnt); ++ if (todo_len > 2) { ++ parent = pending; ++ } else { ++ parent = NULL; ++ put_layoutrecall(pending); ++ } ++ --todo_len; ++ status = -ENOMEM; ++ } ++ } ++ ++ while (!list_empty(todolist)) { ++ pending = list_entry(todolist->next, struct nfs4_layoutrecall, ++ clr_perclnt); ++ list_del_init(&pending->clr_perclnt); ++ dprintk("%s: clp %p cb_client %p fp %p\n", __func__, ++ pending->clr_client, ++ pending->clr_client->cl_cb_client, ++ pending->clr_file); ++ if (unlikely(!pending->clr_client->cl_cb_client)) { ++ printk(KERN_INFO ++ "%s: clientid %08x/%08x has no callback path\n", ++ __func__, ++ pending->clr_client->cl_clientid.cl_boot, ++ pending->clr_client->cl_clientid.cl_id); ++ put_layoutrecall(pending); ++ continue; ++ } ++ ++ pending->clr_time = CURRENT_TIME; ++ pending->clr_sb = sb; ++ if (parent) { ++ /* If we created a parent its initial ref count is 1. ++ * We will need to de-ref it eventually. So we just ++ * don't increment on behalf of the last one. 
++ */
++ if (todo_len != 1)
++ get_layoutrecall(parent);
++ }
++ pending->parent = parent;
++ get_layoutrecall(pending);
++ /* Add to list so corresponding layoutreturn can find req */
++ list_add(&pending->clr_perclnt,
++ &pending->clr_client->cl_layoutrecalls);
++
++ nfsd4_cb_layout(pending);
++ --todo_len;
++ }
++
++ return status;
++}
++
++/*
++ * Spawn a thread to perform a layout recall
++ *
++ */
++int nfsd_layout_recall_cb(struct super_block *sb, struct inode *inode,
++ struct nfsd4_pnfs_cb_layout *cbl)
++{
++ int status;
++ struct nfs4_file *lrfile = NULL;
++ struct list_head todolist;
++ unsigned todo_len = 0;
++
++ dprintk("NFSD nfsd_layout_recall_cb: inode %p cbl %p\n", inode, cbl);
++ BUG_ON(!cbl);
++ BUG_ON(cbl->cbl_recall_type != RETURN_FILE &&
++ cbl->cbl_recall_type != RETURN_FSID &&
++ cbl->cbl_recall_type != RETURN_ALL);
++ BUG_ON(cbl->cbl_recall_type == RETURN_FILE && !inode);
++ BUG_ON(cbl->cbl_seg.iomode != IOMODE_READ &&
++ cbl->cbl_seg.iomode != IOMODE_RW &&
++ cbl->cbl_seg.iomode != IOMODE_ANY);
++
++ if (nfsd_serv == NULL) {
++ dprintk("NFSD nfsd_layout_recall_cb: nfsd_serv == NULL\n");
++ return -ENOENT;
++ }
++
++ nfs4_lock_state();
++ status = -ENOENT;
++ if (inode) {
++ lrfile = find_file(inode);
++ if (!lrfile) {
++ dprintk("NFSD nfsd_layout_recall_cb: "
++ "nfs4_file not found\n");
++ goto err;
++ }
++ if (cbl->cbl_recall_type == RETURN_FSID)
++ cbl->cbl_fsid = lrfile->fi_fsid;
++ }
++
++ INIT_LIST_HEAD(&todolist);
++
++ /* If no cookie was provided by the FS, use a default one */
++ if (!cbl->cbl_cookie)
++ cbl->cbl_cookie = PNFS_LAST_LAYOUT_NO_RECALLS;
++
++ status = create_layout_recall_list(&todolist, &todo_len, cbl, lrfile);
++ if (list_empty(&todolist)) {
++ status = -ENOENT;
++ } else {
++ /* process todolist even if create_layout_recall_list
++ * returned an error */
++ int status2 = spawn_layout_recall(sb, &todolist, todo_len);
++ if (status2)
++ status = status2;
++ }
++
++err:
++ nfs4_unlock_state();
++ if (lrfile)
++ put_nfs4_file(lrfile);
++ return (todo_len && status) ? -EAGAIN : status;
++}
++
++struct create_device_notify_list_arg {
++ struct list_head *todolist;
++ struct nfsd4_pnfs_cb_dev_list *ndl;
++};
++
++static int
++create_device_notify_per_cl(struct nfs4_client *clp, void *p)
++{
++ struct nfs4_notify_device *cbnd;
++ struct create_device_notify_list_arg *arg = p;
++
++ if (atomic_read(&clp->cl_deviceref) <= 0)
++ return 0;
++
++ cbnd = kmalloc(sizeof(*cbnd), GFP_KERNEL);
++ if (!cbnd)
++ return -ENOMEM;
++
++ cbnd->nd_list = arg->ndl;
++ cbnd->nd_client = clp;
++ list_add(&cbnd->nd_perclnt, arg->todolist);
++ return 0;
++}
++
++/* Create a list of clients to send device notifications. */
++int
++create_device_notify_list(struct list_head *todolist,
++ struct nfsd4_pnfs_cb_dev_list *ndl)
++{
++ int status;
++ struct create_device_notify_list_arg arg = {
++ .todolist = todolist,
++ .ndl = ndl,
++ };
++
++ nfs4_lock_state();
++ status = filter_confirmed_clients(create_device_notify_per_cl, &arg);
++ nfs4_unlock_state();
++
++ return status;
++}
++
++/*
++ * For each client that has a device, send a device notification.
++ * XXX: Need to track which clients have which devices.
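++ * Until such tracking exists, create_device_notify_list() above
++ * conservatively queues every confirmed client whose cl_deviceref
++ * count is positive (see create_device_notify_per_cl()).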
++ */ ++int nfsd_device_notify_cb(struct super_block *sb, ++ struct nfsd4_pnfs_cb_dev_list *ndl) ++{ ++ struct nfs4_notify_device *cbnd; ++ unsigned int notify_num = 0; ++ int status2, status = 0; ++ struct list_head todolist; ++ ++ BUG_ON(!ndl || ndl->cbd_len == 0 || !ndl->cbd_list); ++ ++ dprintk("NFSD %s: cbl %p len %u\n", __func__, ndl, ndl->cbd_len); ++ ++ if (nfsd_serv == NULL) ++ return -ENOENT; ++ ++ INIT_LIST_HEAD(&todolist); ++ ++ status = create_device_notify_list(&todolist, ndl); ++ ++ while (!list_empty(&todolist)) { ++ cbnd = list_entry(todolist.next, struct nfs4_notify_device, ++ nd_perclnt); ++ list_del_init(&cbnd->nd_perclnt); ++ status2 = nfsd4_cb_notify_device(cbnd); ++ pnfs_clear_device_notify(cbnd->nd_client); ++ if (status2) { ++ kfree(cbnd); ++ status = status2; ++ } ++ notify_num++; ++ } ++ ++ dprintk("NFSD %s: status %d clients %u\n", ++ __func__, status, notify_num); ++ return status; ++} +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c.orig 2010-08-31 21:11:40.952150363 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsdlm.c 2010-08-31 21:11:40.952150363 -0400 +@@ -0,0 +1,461 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsfh.h" ++#include "nfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++/* Just use a linked list. 
Do not expect more than 32 dlm_device_entries;
++ * the first implementation will just use one device per cluster file system
++ */
++
++static LIST_HEAD(dlm_device_list);
++static DEFINE_SPINLOCK(dlm_device_list_lock);
++
++struct dlm_device_entry {
++ struct list_head dlm_dev_list;
++ char disk_name[DISK_NAME_LEN];
++ int num_ds;
++ char ds_list[NFSD_DLM_DS_LIST_MAX];
++};
++
++static struct dlm_device_entry *
++_nfsd4_find_pnfs_dlm_device(char *disk_name)
++{
++ struct dlm_device_entry *dlm_pdev;
++
++ dprintk("--> %s disk name %s\n", __func__, disk_name);
++ spin_lock(&dlm_device_list_lock);
++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list) {
++ dprintk("%s Look for dlm_pdev %s\n", __func__,
++ dlm_pdev->disk_name);
++ if (!memcmp(dlm_pdev->disk_name, disk_name, strlen(disk_name))) {
++ spin_unlock(&dlm_device_list_lock);
++ return dlm_pdev;
++ }
++ }
++ spin_unlock(&dlm_device_list_lock);
++ return NULL;
++}
++
++static struct dlm_device_entry *
++nfsd4_find_pnfs_dlm_device(struct super_block *sb) {
++ char dname[BDEVNAME_SIZE];
++
++ bdevname(sb->s_bdev, dname);
++ return _nfsd4_find_pnfs_dlm_device(dname);
++}
++
++ssize_t
++nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen)
++{
++ char *pos = buf;
++ ssize_t size = 0;
++ struct dlm_device_entry *dlm_pdev;
++ int ret = -EINVAL;
++
++ spin_lock(&dlm_device_list_lock);
++ list_for_each_entry(dlm_pdev, &dlm_device_list, dlm_dev_list)
++ {
++ int advanced;
++ advanced = snprintf(pos, buflen - size, "%s:%s\n", dlm_pdev->disk_name, dlm_pdev->ds_list);
++ if (advanced >= buflen - size)
++ goto out;
++ size += advanced;
++ pos += advanced;
++ }
++ ret = size;
++
++out:
++ spin_unlock(&dlm_device_list_lock);
++ return ret;
++}
++
++bool nfsd4_validate_pnfs_dlm_device(char *ds_list, int *num_ds)
++{
++ char *start = ds_list;
++
++ *num_ds = 0;
++
++ while (*start) {
++ struct sockaddr_storage tempAddr;
++ int ipLen = strcspn(start, ",");
++
++ if (!rpc_pton(start, ipLen, (struct sockaddr *)&tempAddr, sizeof(tempAddr)))
++ return false;
++ (*num_ds)++;
++ start += ipLen + 1;
++ }
++ return true;
++}
++
++/*
++ * pnfs_dlm_device string format:
++ * block-device-path:<ds1-ip>,<ds2-ip>,...
++ *
++ * Examples:
++ * '/dev/sda:192.168.1.96,192.168.1.97' creates a data server list with
++ * two data servers for the dlm cluster file system mounted on /dev/sda.
++ *
++ * '/dev/sda:192.168.1.96,192.168.1.100'
++ * replaces the data server list for /dev/sda
++ *
++ * Only deviceid == 1 is supported. A device id can be added to the
++ * pnfs_dlm_device string when needed.
++ *
++ * Only a round-robin stripe index that uses each data server once is
++ * supported.
++ */
++int
++nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len)
++{
++ struct dlm_device_entry *new, *found;
++ char *bufp = pnfs_dlm_device;
++ char *endp = bufp + strlen(bufp);
++ int err = -ENOMEM;
++
++ dprintk("--> %s len %d\n", __func__, len);
++
++ new = kzalloc(sizeof(*new), GFP_KERNEL);
++ if (!new)
++ return err;
++
++ err = -EINVAL;
++ /* disk_name */
++ /* FIXME: need to check for valid disk_name. search superblocks?
++ * check for slash dev slash ?
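++ * (one possible check, untested: resolve the name with kern_path()
++ * and compare bdevname() of the backing block device)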
++ */ ++ len = strcspn(bufp, ":"); ++ if (len > DISK_NAME_LEN) ++ goto out_free; ++ memcpy(new->disk_name, bufp, len); ++ ++ err = -EINVAL; ++ bufp += len + 1; ++ if (bufp >= endp) ++ goto out_free; ++ ++ /* data server list */ ++ /* FIXME: need to check for comma separated valid ip format */ ++ len = strcspn(bufp, ":"); ++ if (len > NFSD_DLM_DS_LIST_MAX) ++ goto out_free; ++ memcpy(new->ds_list, bufp, len); ++ ++ ++ /* validate the ips */ ++ if (!nfsd4_validate_pnfs_dlm_device(new->ds_list, &(new->num_ds))) ++ goto out_free; ++ ++ dprintk("%s disk_name %s num_ds %d ds_list %s\n", __func__, ++ new->disk_name, new->num_ds, new->ds_list); ++ ++ found = _nfsd4_find_pnfs_dlm_device(new->disk_name); ++ if (found) { ++ /* FIXME: should compare found->ds_list with new->ds_list ++ * and if it is different, kick off a CB_NOTIFY change ++ * deviceid. ++ */ ++ dprintk("%s pnfs_dlm_device %s:%s already in cache " ++ " replace ds_list with new ds_list %s\n", __func__, ++ found->disk_name, found->ds_list, new->ds_list); ++ memset(found->ds_list, 0, DISK_NAME_LEN); ++ memcpy(found->ds_list, new->ds_list, strlen(new->ds_list)); ++ found->num_ds = new->num_ds; ++ kfree(new); ++ } else { ++ dprintk("%s Adding pnfs_dlm_device %s:%s\n", __func__, ++ new->disk_name, new->ds_list); ++ spin_lock(&dlm_device_list_lock); ++ list_add(&new->dlm_dev_list, &dlm_device_list); ++ spin_unlock(&dlm_device_list_lock); ++ } ++ dprintk("<-- %s Success\n", __func__); ++ return 0; ++ ++out_free: ++ kfree(new); ++ dprintk("<-- %s returns %d\n", __func__, err); ++ return err; ++} ++ ++void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ struct dlm_device_entry *dlm_pdev, *next; ++ ++ dprintk("--> %s\n", __func__); ++ ++ spin_lock(&dlm_device_list_lock); ++ list_for_each_entry_safe (dlm_pdev, next, &dlm_device_list, ++ dlm_dev_list) { ++ list_del(&dlm_pdev->dlm_dev_list); ++ kfree(dlm_pdev); ++ } ++ spin_unlock(&dlm_device_list_lock); ++} ++ ++static int nfsd4_pnfs_dlm_getdeviter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ printk(KERN_ERR "%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return -ENOTSUPP; ++ } ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ return 0; ++} ++ ++static int nfsd4_pnfs_dlm_getdevinfo(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err, len, i = 0; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_devaddr *daddr; ++ struct dlm_device_entry *dlm_pdev; ++ char *bufp; ++ ++ err = -ENOTSUPP; ++ if (layout_type != LAYOUT_NFSV4_1_FILES) { ++ dprintk("%s: ERROR: layout type isn't 'file' " ++ "(type: %x)\n", __func__, layout_type); ++ return err; ++ } ++ ++ /* We only hand out a deviceid of 1 in LAYOUTGET, so a GETDEVICEINFO ++ * with a gdia_device_id != 1 is invalid. 
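++ * Note that the full wire identifier is the (sbid, devid) pair;
++ * nfsd4_pnfs_dlm_layoutget() below sets device_id.sbid from
++ * args->lg_sbid and hardcodes device_id.devid = 1.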
++ */
++ err = -EINVAL;
++ if (devid->devid != 1) {
++ dprintk("%s: WARNING: didn't receive a deviceid of "
++ "1 (got: 0x%llx)\n", __func__, devid->devid);
++ return err;
++ }
++
++ /*
++ * If the DS list has not been established, return -EINVAL
++ */
++ dlm_pdev = nfsd4_find_pnfs_dlm_device(sb);
++ if (!dlm_pdev) {
++ dprintk("%s: DEBUG: disk %s Not Found\n", __func__,
++ sb->s_bdev->bd_disk->disk_name);
++ return err;
++ }
++
++ dprintk("%s: Found disk %s with DS list |%s|\n",
++ __func__, dlm_pdev->disk_name, dlm_pdev->ds_list);
++
++ memset(&fdev, '\0', sizeof(fdev));
++ fdev.fl_device_length = dlm_pdev->num_ds;
++
++ err = -ENOMEM;
++ len = sizeof(*fdev.fl_device_list) * fdev.fl_device_length;
++ fdev.fl_device_list = kzalloc(len, GFP_KERNEL);
++ if (!fdev.fl_device_list) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device list "
++ "buffer for %d DSes.\n", __func__, fdev.fl_device_length);
++ fdev.fl_device_length = 0;
++ goto out;
++ }
++
++ /* Set up a simple stripe index list */
++ fdev.fl_stripeindices_length = fdev.fl_device_length;
++ fdev.fl_stripeindices_list = kzalloc(sizeof(u32) *
++ fdev.fl_stripeindices_length, GFP_KERNEL);
++
++ if (!fdev.fl_stripeindices_list) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a stripeindices "
++ "list buffer for %d DSes.\n", __func__,
++ fdev.fl_stripeindices_length);
++ goto out;
++ }
++ for (i = 0; i < fdev.fl_stripeindices_length; i++)
++ fdev.fl_stripeindices_list[i] = i;
++
++ /* Transfer the data server list with a single multipath entry */
++ bufp = dlm_pdev->ds_list;
++ for (i = 0; i < fdev.fl_device_length; i++) {
++ daddr = kmalloc(sizeof(*daddr), GFP_KERNEL);
++ if (!daddr) {
++ printk(KERN_ERR "%s: ERROR: unable to kmalloc a device "
++ "addr buffer.\n", __func__);
++ goto out;
++ }
++
++ daddr->r_netid.data = "tcp";
++ daddr->r_netid.len = 3;
++
++ len = strcspn(bufp, ",");
++ daddr->r_addr.data = kmalloc(len + 4, GFP_KERNEL);
++ memcpy(daddr->r_addr.data, bufp, len);
++ /*
++ * Append the port number, interpreted as two more bytes
++ * beyond the quad: ".8.1" -> 0x08.0x01 -> 0x0801 = port 2049.
++ */
++ memcpy(daddr->r_addr.data + len, ".8.1", 4);
++ daddr->r_addr.len = len + 4;
++
++ fdev.fl_device_list[i].fl_multipath_length = 1;
++ fdev.fl_device_list[i].fl_multipath_list = daddr;
++
++ dprintk("%s: encoding DS |%s|\n", __func__, bufp);
++
++ bufp += len + 1;
++ }
++
++ /* have nfsd encode the device info */
++ err = filelayout_encode_devinfo(xdr, &fdev);
++out:
++ for (i = 0; i < fdev.fl_device_length; i++)
++ kfree(fdev.fl_device_list[i].fl_multipath_list);
++ kfree(fdev.fl_device_list);
++ kfree(fdev.fl_stripeindices_list);
++ dprintk("<-- %s returns %d\n", __func__, err);
++ return err;
++}
++
++static int get_stripe_unit(int blocksize)
++{
++ if (blocksize >= NFSSVC_MAXBLKSIZE)
++ return blocksize;
++ return NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize);
++}
++
++/*
++ * Look up inode block device in pnfs_dlm_device list.
++ * Hash on the inode->i_ino and number of data servers.
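++ * For example, with num_ds = 4 the mask is 3, so i_ino 10
++ * (binary 1010) yields stripe index 2. This assumes num_ds is a
++ * power of two; with other counts the mask-based hash never
++ * selects some data servers.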
++ */
++static int dlm_ino_hash(struct inode *ino)
++{
++ struct dlm_device_entry *de;
++ u32 hash_mask = 0;
++
++ /* If we can't find the inode block device in the pnfs_dlm_device list
++ * then don't hand out a layout
++ */
++ de = nfsd4_find_pnfs_dlm_device(ino->i_sb);
++ if (!de)
++ return -1;
++ hash_mask = de->num_ds - 1;
++ return ino->i_ino & hash_mask;
++}
++
++static enum nfsstat4 nfsd4_pnfs_dlm_layoutget(struct inode *inode,
++ struct exp_xdr_stream *xdr,
++ const struct nfsd4_pnfs_layoutget_arg *args,
++ struct nfsd4_pnfs_layoutget_res *res)
++{
++ struct pnfs_filelayout_layout *layout = NULL;
++ struct knfsd_fh *fhp = NULL;
++ int index;
++ enum nfsstat4 rc = NFS4_OK;
++
++ dprintk("%s: LAYOUT_GET\n", __func__);
++
++ /* DLM exported file systems only support layouts for READ */
++ if (res->lg_seg.iomode == IOMODE_RW)
++ return NFS4ERR_BADIOMODE;
++
++ index = dlm_ino_hash(inode);
++ dprintk("%s first stripe index %d i_ino %lu\n", __func__, index,
++ inode->i_ino);
++ if (index < 0)
++ return NFS4ERR_LAYOUTUNAVAILABLE;
++
++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES;
++ /* Always give out whole file layouts */
++ res->lg_seg.offset = 0;
++ res->lg_seg.length = NFS4_MAX_UINT64;
++ /* Always give out READ ONLY layouts */
++ res->lg_seg.iomode = IOMODE_READ;
++
++ layout = kzalloc(sizeof(*layout), GFP_KERNEL);
++ if (layout == NULL) {
++ rc = NFS4ERR_LAYOUTTRYLATER;
++ goto error;
++ }
++
++ /* Set file layout response args */
++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES;
++ layout->lg_stripe_type = STRIPE_SPARSE;
++ layout->lg_commit_through_mds = false;
++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize);
++ layout->lg_fh_length = 1;
++ layout->device_id.sbid = args->lg_sbid;
++ layout->device_id.devid = 1; /*FSFTEMP*/
++ layout->lg_first_stripe_index = index; /*FSFTEMP*/
++ layout->lg_pattern_offset = 0;
++
++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL);
++ if (fhp == NULL) {
++ rc = NFS4ERR_LAYOUTTRYLATER;
++ goto error;
++ }
++
++ memcpy(fhp, args->lg_fh, sizeof(*fhp));
++ pnfs_fh_mark_ds(fhp);
++ layout->lg_fh_list = fhp;
++
++ /* Call nfsd to encode layout */
++ rc = filelayout_encode_layout(xdr, layout);
++exit:
++ kfree(layout);
++ kfree(fhp);
++ return rc;
++
++error:
++ res->lg_seg.length = 0;
++ goto exit;
++}
++
++static int
++nfsd4_pnfs_dlm_layouttype(struct super_block *sb)
++{
++ return LAYOUT_NFSV4_1_FILES;
++}
++
++/* For use by DLM cluster file systems exported by pNFSD */
++const struct pnfs_export_operations pnfs_dlm_export_ops = {
++ .layout_type = nfsd4_pnfs_dlm_layouttype,
++ .get_device_info = nfsd4_pnfs_dlm_getdevinfo,
++ .get_device_iter = nfsd4_pnfs_dlm_getdeviter,
++ .layout_get = nfsd4_pnfs_dlm_layoutget,
++};
++EXPORT_SYMBOL(pnfs_dlm_export_ops);
+diff -up linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c
+--- linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c.orig 2010-08-31 21:11:40.953160412 -0400
++++ linux-2.6.35.noarch/fs/nfsd/nfs4pnfsds.c 2010-08-31 21:11:40.953160412 -0400
+@@ -0,0 +1,620 @@
++/*
++* linux/fs/nfsd/nfs4pnfsds.c
++*
++* Copyright (c) 2005 The Regents of the University of Michigan.
++* All rights reserved.
++*
++* Andy Adamson 
++*
++* Redistribution and use in source and binary forms, with or without
++* modification, are permitted provided that the following conditions
++* are met:
++*
++* 1. Redistributions of source code must retain the above copyright
++* notice, this list of conditions and the following disclaimer.
++* 2.
Redistributions in binary form must reproduce the above copyright ++* notice, this list of conditions and the following disclaimer in the ++* documentation and/or other materials provided with the distribution. ++* 3. Neither the name of the University nor the names of its ++* contributors may be used to endorse or promote products derived ++* from this software without specific prior written permission. ++* ++* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++* ++*/ ++#if defined(CONFIG_PNFSD) ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfsd.h" ++#include "pnfsd.h" ++#include "state.h" ++ ++/* ++ ******************* ++ * PNFS ++ ******************* ++ */ ++/* ++ * Hash tables for pNFS Data Server state ++ * ++ * mds_nodeid: list of struct pnfs_mds_id one per Metadata server (MDS) using ++ * this data server (DS). ++ * ++ * mds_clid_hashtbl[]: uses clientid_hashval(), hash of all clientids obtained ++ * from any MDS. ++ * ++ * ds_stid_hashtbl[]: uses stateid_hashval(), hash of all stateids obtained ++ * from any MDS. ++ * ++ */ ++/* Hash tables for clientid state */ ++#define CLIENT_HASH_BITS 4 ++#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) ++#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) ++ ++#define clientid_hashval(id) \ ++ ((id) & CLIENT_HASH_MASK) ++ ++/* hash table for pnfs_ds_stateid */ ++#define STATEID_HASH_BITS 10 ++#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) ++#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) ++ ++#define stateid_hashval(owner_id, file_id) \ ++ (((owner_id) + (file_id)) & STATEID_HASH_MASK) ++ ++static struct list_head mds_id_tbl; ++static struct list_head mds_clid_hashtbl[CLIENT_HASH_SIZE]; ++static struct list_head ds_stid_hashtbl[STATEID_HASH_SIZE]; ++ ++static inline void put_ds_clientid(struct pnfs_ds_clientid *dcp); ++static inline void put_ds_mdsid(struct pnfs_mds_id *mdp); ++ ++/* Mutex for data server state. 
Needs to be separate from ++ * mds state mutex since a node can be both mds and ds */ ++static DEFINE_MUTEX(ds_mutex); ++static struct thread_info *ds_mutex_owner; ++ ++static void ++ds_lock_state(void) ++{ ++ mutex_lock(&ds_mutex); ++ ds_mutex_owner = current_thread_info(); ++} ++ ++static void ++ds_unlock_state(void) ++{ ++ BUG_ON(ds_mutex_owner != current_thread_info()); ++ ds_mutex_owner = NULL; ++ mutex_unlock(&ds_mutex); ++} ++ ++static int ++cmp_clid(const clientid_t *cl1, const clientid_t *cl2) ++{ ++ return (cl1->cl_boot == cl2->cl_boot) && ++ (cl1->cl_id == cl2->cl_id); ++} ++ ++void ++nfs4_pnfs_state_init(void) ++{ ++ int i; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&mds_clid_hashtbl[i]); ++ ++ for (i = 0; i < STATEID_HASH_SIZE; i++) ++ INIT_LIST_HEAD(&ds_stid_hashtbl[i]); ++ ++ INIT_LIST_HEAD(&mds_id_tbl); ++} ++ ++static struct pnfs_mds_id * ++find_pnfs_mds_id(u32 mdsid) ++{ ++ struct pnfs_mds_id *local = NULL; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ list_for_each_entry(local, &mds_id_tbl, di_hash) { ++ if (local->di_mdsid == mdsid) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_clientid * ++find_pnfs_ds_clientid(const clientid_t *clid) ++{ ++ struct pnfs_ds_clientid *local = NULL; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = clientid_hashval(clid->cl_id); ++ list_for_each_entry(local, &mds_clid_hashtbl[hashval], dc_hash) { ++ if (cmp_clid(&local->dc_mdsclid, clid)) ++ return local; ++ } ++ return NULL; ++} ++ ++static struct pnfs_ds_stateid * ++find_pnfs_ds_stateid(stateid_t *stid) ++{ ++ struct pnfs_ds_stateid *local = NULL; ++ u32 st_id = stid->si_stateownerid; ++ u32 f_id = stid->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ list_for_each_entry(local, &ds_stid_hashtbl[hashval], ds_hash) ++ if ((local->ds_stid.si_stateownerid == st_id) && ++ (local->ds_stid.si_fileid == f_id) && ++ (local->ds_stid.si_boot == stid->si_boot)) { ++ stateid_t *sid = &local->ds_stid; ++ dprintk("NFSD: %s <-- %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, local, local->ds_flags, ++ STATEID_VAL(sid)); ++ return local; ++ } ++ return NULL; ++} ++ ++static void ++release_ds_mdsid(struct kref *kref) ++{ ++ struct pnfs_mds_id *mdp = ++ container_of(kref, struct pnfs_mds_id, di_ref); ++ dprintk("pNFSD: %s\n", __func__); ++ ++ list_del(&mdp->di_hash); ++ list_del(&mdp->di_mdsclid); ++ kfree(mdp); ++} ++ ++static void ++release_ds_clientid(struct kref *kref) ++{ ++ struct pnfs_ds_clientid *dcp = ++ container_of(kref, struct pnfs_ds_clientid, dc_ref); ++ struct pnfs_mds_id *mdp; ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(dcp->dc_mdsid); ++ if (mdp) ++ put_ds_mdsid(mdp); ++ ++ list_del(&dcp->dc_hash); ++ list_del(&dcp->dc_stateid); ++ list_del(&dcp->dc_permdsid); ++ kfree(dcp); ++} ++ ++static void ++release_ds_stateid(struct kref *kref) ++{ ++ struct pnfs_ds_stateid *dsp = ++ container_of(kref, struct pnfs_ds_stateid, ds_ref); ++ struct pnfs_ds_clientid *dcp; ++ dprintk("pNFS %s: dsp %p\n", __func__, dsp); ++ ++ dcp = find_pnfs_ds_clientid(&dsp->ds_mdsclid); ++ if (dcp) ++ put_ds_clientid(dcp); ++ ++ list_del(&dsp->ds_hash); ++ list_del(&dsp->ds_perclid); ++ kfree(dsp); ++} ++ ++static inline void ++put_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_put(&dcp->dc_ref, release_ds_clientid); ++} ++ ++static inline void 
++get_ds_clientid(struct pnfs_ds_clientid *dcp) ++{ ++ dprintk("pNFS %s: dcp %p ref %d\n", __func__, dcp, ++ atomic_read(&dcp->dc_ref.refcount)); ++ kref_get(&dcp->dc_ref); ++} ++ ++static inline void ++put_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_put(&mdp->di_ref, release_ds_mdsid); ++} ++ ++static inline void ++get_ds_mdsid(struct pnfs_mds_id *mdp) ++{ ++ dprintk("pNFS %s: mdp %p ref %d\n", __func__, mdp, ++ atomic_read(&mdp->di_ref.refcount)); ++ kref_get(&mdp->di_ref); ++} ++ ++static inline void ++put_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_put(&dsp->ds_ref, release_ds_stateid); ++} ++ ++static inline void ++get_ds_stateid(struct pnfs_ds_stateid *dsp) ++{ ++ dprintk("pNFS %s: dsp %p ref %d\n", __func__, dsp, ++ atomic_read(&dsp->ds_ref.refcount)); ++ kref_get(&dsp->ds_ref); ++} ++ ++void ++nfs4_pnfs_state_shutdown(void) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int i; ++ ++ dprintk("pNFSD %s: -->\n", __func__); ++ ++ ds_lock_state(); ++ for (i = 0; i < STATEID_HASH_SIZE; i++) { ++ while (!list_empty(&ds_stid_hashtbl[i])) { ++ dsp = list_entry(ds_stid_hashtbl[i].next, ++ struct pnfs_ds_stateid, ds_hash); ++ put_ds_stateid(dsp); ++ } ++ } ++ ds_unlock_state(); ++} ++ ++static struct pnfs_mds_id * ++alloc_init_mds_id(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = kmalloc(sizeof(*mdp), GFP_KERNEL); ++ if (!mdp) ++ return NULL; ++ INIT_LIST_HEAD(&mdp->di_hash); ++ INIT_LIST_HEAD(&mdp->di_mdsclid); ++ list_add(&mdp->di_hash, &mds_id_tbl); ++ mdp->di_mdsid = gsp->dsid; ++ mdp->di_mdsboot = 0; ++ kref_init(&mdp->di_ref); ++ return mdp; ++} ++ ++static struct pnfs_ds_clientid * ++alloc_init_ds_clientid(struct pnfs_get_state *gsp) ++{ ++ struct pnfs_mds_id *mdp; ++ struct pnfs_ds_clientid *dcp; ++ clientid_t *clid = (clientid_t *)&gsp->clid; ++ unsigned int hashval = clientid_hashval(clid->cl_id); ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ mdp = find_pnfs_mds_id(gsp->dsid); ++ if (!mdp) { ++ mdp = alloc_init_mds_id(gsp); ++ if (!mdp) ++ return NULL; ++ } else { ++ get_ds_mdsid(mdp); ++ } ++ ++ dcp = kmalloc(sizeof(*dcp), GFP_KERNEL); ++ if (!dcp) ++ return NULL; ++ ++ INIT_LIST_HEAD(&dcp->dc_hash); ++ INIT_LIST_HEAD(&dcp->dc_stateid); ++ INIT_LIST_HEAD(&dcp->dc_permdsid); ++ list_add(&dcp->dc_hash, &mds_clid_hashtbl[hashval]); ++ list_add(&dcp->dc_permdsid, &mdp->di_mdsclid); ++ dcp->dc_mdsclid = *clid; ++ kref_init(&dcp->dc_ref); ++ dcp->dc_mdsid = gsp->dsid; ++ return dcp; ++} ++ ++static struct pnfs_ds_stateid * ++alloc_init_ds_stateid(struct svc_fh *cfh, stateid_t *stidp) ++{ ++ struct pnfs_ds_stateid *dsp; ++ u32 st_id = stidp->si_stateownerid; ++ u32 f_id = stidp->si_fileid; ++ unsigned int hashval; ++ ++ dprintk("pNFSD: %s\n", __func__); ++ ++ dsp = kmalloc(sizeof(*dsp), GFP_KERNEL); ++ if (!dsp) ++ return dsp; ++ ++ INIT_LIST_HEAD(&dsp->ds_hash); ++ INIT_LIST_HEAD(&dsp->ds_perclid); ++ memcpy(&dsp->ds_stid, stidp, sizeof(stateid_t)); ++ fh_copy_shallow(&dsp->ds_fh, &cfh->fh_handle); ++ dsp->ds_access = 0; ++ dsp->ds_status = 0; ++ dsp->ds_flags = 0L; ++ kref_init(&dsp->ds_ref); ++ set_bit(DS_STATEID_NEW, &dsp->ds_flags); ++ clear_bit(DS_STATEID_VALID, &dsp->ds_flags); ++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ init_waitqueue_head(&dsp->ds_waitq); ++ ++ hashval = stateid_hashval(st_id, f_id); ++ 
list_add(&dsp->ds_hash, &ds_stid_hashtbl[hashval]);
++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp);
++ return dsp;
++}
++
++static int
++update_ds_stateid(struct pnfs_ds_stateid *dsp, struct svc_fh *cfh,
++ struct pnfs_get_state *gsp)
++{
++ struct pnfs_ds_clientid *dcp;
++ int new = 0;
++
++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++ dcp = find_pnfs_ds_clientid((clientid_t *)&gsp->clid);
++ if (!dcp) {
++ dcp = alloc_init_ds_clientid(gsp);
++ if (!dcp)
++ return 1;
++ new = 1;
++ }
++ if (test_bit(DS_STATEID_NEW, &dsp->ds_flags)) {
++ list_add(&dsp->ds_perclid, &dcp->dc_stateid);
++ if (!new)
++ get_ds_clientid(dcp);
++ }
++
++ memcpy(&dsp->ds_stid, &gsp->stid, sizeof(stateid_t));
++ dsp->ds_access = gsp->access;
++ dsp->ds_status = 0;
++ dsp->ds_verifier[0] = gsp->verifier[0];
++ dsp->ds_verifier[1] = gsp->verifier[1];
++ memcpy(&dsp->ds_mdsclid, &gsp->clid, sizeof(clientid_t));
++ set_bit(DS_STATEID_VALID, &dsp->ds_flags);
++ clear_bit(DS_STATEID_ERROR, &dsp->ds_flags);
++ clear_bit(DS_STATEID_NEW, &dsp->ds_flags);
++ return 0;
++}
++
++int
++nfs4_pnfs_cb_change_state(struct pnfs_get_state *gs)
++{
++ stateid_t *stid = (stateid_t *)&gs->stid;
++ struct pnfs_ds_stateid *dsp;
++
++ dprintk("pNFSD: %s stateid=" STATEID_FMT "\n", __func__,
++ STATEID_VAL(stid));
++
++ ds_lock_state();
++ dsp = find_pnfs_ds_stateid(stid);
++ if (dsp)
++ put_ds_stateid(dsp);
++ ds_unlock_state();
++
++ dprintk("pNFSD: %s dsp %p\n", __func__, dsp);
++
++ if (dsp)
++ return 0;
++ return -ENOENT;
++}
++
++/* Retrieves and validates a stateid.
++ * If the stateid exists and its fields match, return it.
++ * If the stateid exists but either the generation or the
++ * ownerids don't match, check with the mds to see if it is valid.
++ * If the stateid doesn't exist, the first thread creates an
++ * invalid *marker* stateid, then checks to see if the
++ * stateid exists on the mds. If so, it validates the *marker*
++ * stateid and updates its fields. Subsequent threads that
++ * find the *marker* stateid wait until it is valid or an error
++ * occurs.
++ * Called with ds_state_lock.
++ */
++static struct pnfs_ds_stateid *
++nfsv4_ds_get_state(struct svc_fh *cfh, stateid_t *stidp)
++{
++ struct inode *ino = cfh->fh_dentry->d_inode;
++ struct super_block *sb;
++ struct pnfs_ds_stateid *dsp = NULL;
++ struct pnfs_get_state gs = {
++ .access = 0,
++ };
++ int status = 0, waiter = 0;
++
++ dprintk("pNFSD: %s -->\n", __func__);
++
++ dsp = find_pnfs_ds_stateid(stidp);
++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags) &&
++ (stidp->si_generation == dsp->ds_stid.si_generation))
++ goto out_noput;
++
++ sb = ino->i_sb;
++ if (!sb || !sb->s_pnfs_op->get_state)
++ goto out_noput;
++
++ /* Invalidate the current state if it exists but doesn't match.
++ * If it is already invalid, another thread is checking state */ ++ if (dsp) { ++ if (!test_and_clear_bit(DS_STATEID_VALID, &dsp->ds_flags)) ++ waiter = 1; ++ } else { ++ dsp = alloc_init_ds_stateid(cfh, stidp); ++ if (!dsp) ++ goto out_noput; ++ } ++ ++ dprintk("pNFSD: %s Starting loop\n", __func__); ++ get_ds_stateid(dsp); ++ while (!test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ ds_unlock_state(); ++ ++ /* Another thread is checking the state */ ++ if (waiter) { ++ dprintk("pNFSD: %s waiting\n", __func__); ++ wait_event_interruptible_timeout(dsp->ds_waitq, ++ (test_bit(DS_STATEID_VALID, &dsp->ds_flags) || ++ test_bit(DS_STATEID_ERROR, &dsp->ds_flags)), ++ msecs_to_jiffies(1024)); ++ dprintk("pNFSD: %s awake\n", __func__); ++ ds_lock_state(); ++ if (test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ goto out; ++ ++ continue; ++ } ++ ++ /* Validate stateid on mds */ ++ dprintk("pNFSD: %s Checking state on MDS\n", __func__); ++ memcpy(&gs.stid, stidp, sizeof(stateid_t)); ++ status = sb->s_pnfs_op->get_state(ino, &cfh->fh_handle, &gs); ++ dprintk("pNFSD: %s from MDS status %d\n", __func__, status); ++ ds_lock_state(); ++ /* if !status and stateid is valid, update id and mark valid */ ++ if (status || update_ds_stateid(dsp, cfh, &gs)) { ++ set_bit(DS_STATEID_ERROR, &dsp->ds_flags); ++ /* remove invalid stateid from list */ ++ put_ds_stateid(dsp); ++ wake_up(&dsp->ds_waitq); ++ goto out; ++ } ++ ++ wake_up(&dsp->ds_waitq); ++ } ++out: ++ if (dsp) ++ put_ds_stateid(dsp); ++out_noput: ++ if (dsp) ++ dprintk("pNFSD: %s <-- dsp %p ds_flags %lx " STATEID_FMT "\n", ++ __func__, dsp, dsp->ds_flags, STATEID_VAL(&dsp->ds_stid)); ++ /* If error, return null */ ++ if (dsp && test_bit(DS_STATEID_ERROR, &dsp->ds_flags)) ++ dsp = NULL; ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return dsp; ++} ++ ++int ++nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *cfh, stateid_t *stateid) ++{ ++ struct pnfs_ds_stateid *dsp; ++ int status = 0; ++ ++ dprintk("pNFSD: %s --> " STATEID_FMT "\n", __func__, ++ STATEID_VAL(stateid)); ++ ++ /* Must release state lock while verifying stateid on mds */ ++ nfs4_unlock_state(); ++ ds_lock_state(); ++ dsp = nfsv4_ds_get_state(cfh, stateid); ++ if (dsp) { ++ get_ds_stateid(dsp); ++ dprintk("pNFSD: %s Found " STATEID_FMT "\n", __func__, ++ STATEID_VAL(&dsp->ds_stid)); ++ ++ dprintk("NFSD: %s: dsp %p fh_size %u:%u " ++ "fh [%08x:%08x:%08x:%08x]:[%08x:%08x:%08x:%08x] " ++ "gen %x:%x\n", ++ __func__, dsp, ++ cfh->fh_handle.fh_size, dsp->ds_fh.fh_size, ++ ((unsigned *)&cfh->fh_handle.fh_base)[0], ++ ((unsigned *)&cfh->fh_handle.fh_base)[1], ++ ((unsigned *)&cfh->fh_handle.fh_base)[2], ++ ((unsigned *)&cfh->fh_handle.fh_base)[3], ++ ((unsigned *)&dsp->ds_fh.fh_base)[0], ++ ((unsigned *)&dsp->ds_fh.fh_base)[1], ++ ((unsigned *)&dsp->ds_fh.fh_base)[2], ++ ((unsigned *)&dsp->ds_fh.fh_base)[3], ++ stateid->si_generation, dsp->ds_stid.si_generation); ++ } ++ ++ if (!dsp || ++ (cfh->fh_handle.fh_size != dsp->ds_fh.fh_size) || ++ (memcmp(&cfh->fh_handle.fh_base, &dsp->ds_fh.fh_base, ++ dsp->ds_fh.fh_size) != 0) || ++ (stateid->si_generation > dsp->ds_stid.si_generation)) ++ status = nfserr_bad_stateid; ++ else if (stateid->si_generation < dsp->ds_stid.si_generation) ++ status = nfserr_old_stateid; ++ ++ if (dsp) ++ put_ds_stateid(dsp); ++ ds_unlock_state(); ++ nfs4_lock_state(); ++ dprintk("pNFSD: %s <-- status %d\n", __func__, be32_to_cpu(status)); ++ return status; ++} ++ ++void ++nfs4_ds_get_verifier(stateid_t *stateid, struct super_block *sb, u32 *p) ++{ ++ struct pnfs_ds_stateid 
*dsp = NULL; ++ ++ dprintk("pNFSD: %s --> stid %p\n", __func__, stateid); ++ ++ ds_lock_state(); ++ if (stateid != NULL) { ++ dsp = find_pnfs_ds_stateid(stateid); ++ if (dsp) ++ get_ds_stateid(dsp); ++ } ++ ++ /* XXX: Should we fetch the stateid or wait if some other ++ * thread is currently retrieving the stateid ? */ ++ if (dsp && test_bit(DS_STATEID_VALID, &dsp->ds_flags)) { ++ *p++ = dsp->ds_verifier[0]; ++ *p++ = dsp->ds_verifier[1]; ++ put_ds_stateid(dsp); ++ } else { ++ /* must be on MDS */ ++ ds_unlock_state(); ++ sb->s_pnfs_op->get_verifier(sb, p); ++ ds_lock_state(); ++ p += 2; ++ } ++ ds_unlock_state(); ++ dprintk("pNFSD: %s <-- dsp %p\n", __func__, dsp); ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4proc.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4proc.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4proc.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4proc.c 2010-08-31 21:11:40.954160471 -0400 +@@ -34,10 +34,14 @@ + */ + #include + #include ++#include ++#include ++#include + + #include "cache.h" + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_PROC + +@@ -372,6 +376,24 @@ nfsd4_open(struct svc_rqst *rqstp, struc + * set, (2) sets open->op_stateid, (3) sets open->op_delegation. + */ + status = nfsd4_process_open2(rqstp, &cstate->current_fh, open); ++#if defined(CONFIG_SPNFS) ++ if (!status && spnfs_enabled()) { ++ struct inode *inode = cstate->current_fh.fh_dentry->d_inode; ++ ++ status = spnfs_open(inode, open); ++ if (status) { ++ dprintk( ++ "nfsd: pNFS could not be enabled for inode: %lu\n", ++ inode->i_ino); ++ /* ++ * XXX When there's a failure then need to indicate to ++ * future ops that no pNFS is available. Should I save ++ * the status in the inode? It's kind of a big hammer. ++ * But there may be no stripes available? 
++ */ ++ } ++ } ++#endif /* CONFIG_SPNFS */ + out: + if (open->op_stateowner) { + nfs4_get_stateowner(open->op_stateowner); +@@ -454,16 +476,30 @@ nfsd4_access(struct svc_rqst *rqstp, str + &access->ac_supported); + } + ++static void ++nfsd4_get_verifier(struct super_block *sb, nfs4_verifier *verf) ++{ ++ u32 *p = (u32 *)verf->data; ++ ++#if defined(CONFIG_PNFSD) ++ if (sb->s_pnfs_op && sb->s_pnfs_op->get_verifier) { ++ nfs4_ds_get_verifier(NULL, sb, p); ++ return; ++ } ++#endif /* CONFIG_PNFSD */ ++ ++ *p++ = nfssvc_boot.tv_sec; ++ *p++ = nfssvc_boot.tv_usec; ++} ++ + static __be32 + nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_commit *commit) + { + __be32 status; + +- u32 *p = (u32 *)commit->co_verf.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; +- ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &commit->co_verf); + status = nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, + commit->co_count); + if (status == nfserr_symlink) +@@ -816,7 +852,6 @@ nfsd4_write(struct svc_rqst *rqstp, stru + { + stateid_t *stateid = &write->wr_stateid; + struct file *filp = NULL; +- u32 *p; + __be32 status = nfs_ok; + unsigned long cnt; + +@@ -838,13 +873,49 @@ nfsd4_write(struct svc_rqst *rqstp, stru + + cnt = write->wr_buflen; + write->wr_how_written = write->wr_stable_how; +- p = (u32 *)write->wr_verifier.data; +- *p++ = nfssvc_boot.tv_sec; +- *p++ = nfssvc_boot.tv_usec; + ++ nfsd4_get_verifier(cstate->current_fh.fh_dentry->d_inode->i_sb, ++ &write->wr_verifier); ++#if defined(CONFIG_SPNFS) ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(cstate->current_fh.fh_dentry->d_inode, 0)) { ++ status = bl_layoutrecall(cstate->current_fh.fh_dentry->d_inode, ++ RETURN_FILE, write->wr_offset, write->wr_buflen); ++ if (!status) { ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++ } ++ } else ++#endif ++ ++ if (spnfs_enabled()) { ++ status = spnfs_write(cstate->current_fh.fh_dentry->d_inode, ++ write->wr_offset, write->wr_buflen, write->wr_vlen, ++ rqstp); ++ if (status == nfs_ok) { ++ /* DMXXX: HACK to get filesize set */ ++ /* write one byte at offset+length-1 */ ++ struct kvec k[1]; ++ char zero = 0; ++ unsigned long cnt = 1; ++ ++ k[0].iov_base = (void *)&zero; ++ k[0].iov_len = 1; ++ nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset+write->wr_buflen-1, k, 1, ++ &cnt, &write->wr_how_written); ++ } ++ } else /* we're not an MDS */ ++ status = nfsd_write(rqstp, &cstate->current_fh, filp, ++ write->wr_offset, rqstp->rq_vec, write->wr_vlen, ++ &cnt, &write->wr_how_written); ++#else + status = nfsd_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, write->wr_vlen, + &cnt, &write->wr_how_written); ++#endif /* CONFIG_SPNFS */ ++ + if (filp) + fput(filp); + +@@ -935,6 +1006,306 @@ nfsd4_verify(struct svc_rqst *rqstp, str + return status == nfserr_same ? nfs_ok : status; + } + ++#if defined(CONFIG_PNFSD) ++ ++static __be32 ++nfsd4_layout_verify(struct super_block *sb, struct svc_export *exp, ++ unsigned int layout_type) ++{ ++ int status, type; ++ ++ /* check to see if pNFS is supported. 
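++ * Both checks below must pass: the export must be flagged
++ * pNFS-capable (ex_pnfs) and the superblock must provide
++ * s_pnfs_op->layout_type.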
*/ ++ status = nfserr_layoutunavailable; ++ if (exp && exp->ex_pnfs == 0) { ++ dprintk("%s: Underlying file system " ++ "is not exported over pNFS\n", __func__); ++ goto out; ++ } ++ if (!sb->s_pnfs_op || !sb->s_pnfs_op->layout_type) { ++ dprintk("%s: Underlying file system " ++ "does not support pNFS\n", __func__); ++ goto out; ++ } ++ ++ type = sb->s_pnfs_op->layout_type(sb); ++ ++ /* check to see if requested layout type is supported. */ ++ status = nfserr_unknown_layouttype; ++ if (!type) ++ dprintk("BUG: %s: layout_type 0 is reserved and must not be " ++ "used by filesystem\n", __func__); ++ else if (type != layout_type) ++ dprintk("%s: requested layout type %d " ++ "does not match supported type %d\n", ++ __func__, layout_type, type); ++ else ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevlist(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevlist *gdlp) ++{ ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ int status; ++ ++ dprintk("%s: type %u maxdevices %u cookie %llu verf %llu\n", ++ __func__, gdlp->gd_layout_type, gdlp->gd_maxdevices, ++ gdlp->gd_cookie, gdlp->gd_verf); ++ ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* We must be able to encode at list one device */ ++ if (!gdlp->gd_maxdevices) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ gdlp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Do nothing if underlying file system does not support ++ * getdevicelist */ ++ if (!sb->s_pnfs_op->get_device_iter) { ++ status = nfserr_notsupp; ++ goto out; ++ } ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdlp->gd_fhp = &cstate->current_fh; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_getdevinfo(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_getdevinfo *gdp) ++{ ++ struct super_block *sb; ++ int status; ++ clientid_t clid; ++ ++ dprintk("%s: layout_type %u dev_id %llx:%llx maxcnt %u\n", ++ __func__, gdp->gd_layout_type, gdp->gd_devid.sbid, ++ gdp->gd_devid.devid, gdp->gd_maxcount); ++ ++ status = nfserr_inval; ++ sb = find_sbid_id(gdp->gd_devid.sbid); ++ dprintk("%s: sb %p\n", __func__, sb); ++ if (!sb) { ++ status = nfserr_noent; ++ goto out; ++ } ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, NULL, gdp->gd_layout_type); ++ if (status) ++ goto out; ++ ++ /* Set up arguments so device can be retrieved at encode time */ ++ gdp->gd_sb = sb; ++ ++ /* Update notifications */ ++ copy_clientid(&clid, cstate->session); ++ pnfs_set_device_notify(&clid, gdp->gd_notify_types); ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutget(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = 
nfsd4_layout_verify(sb, current_fh->fh_export, ++ lgp->lg_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_badiomode; ++ if (lgp->lg_seg.iomode != IOMODE_READ && ++ lgp->lg_seg.iomode != IOMODE_RW) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lgp->lg_seg.iomode); ++ goto out; ++ } ++ ++ /* Set up arguments so layout can be retrieved at encode time */ ++ lgp->lg_fhp = current_fh; ++ copy_clientid((clientid_t *)&lgp->lg_seg.clientid, cstate->session); ++ status = nfs_ok; ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutcommit(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ int status; ++ struct inode *ino = NULL; ++ struct iattr ia; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ dprintk("NFSD: nfsd4_layoutcommit \n"); ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ ino = current_fh->fh_dentry->d_inode; ++ if (!ino) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = ino->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lcp->args.lc_seg.layout_type); ++ if (status) ++ goto out; ++ ++ /* This will only extend the file length. Do a quick ++ * check to see if there is any point in waiting for the update ++ * locks. ++ * TODO: Is this correct for all back ends? ++ */ ++ dprintk("%s:new offset: %d new size: %llu old size: %lld\n", ++ __func__, lcp->args.lc_newoffset, lcp->args.lc_last_wr + 1, ++ ino->i_size); ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lcp->args.lc_seg.clientid, cstate->session); ++ lcp->res.lc_size_chg = 0; ++ if (sb->s_pnfs_op->layout_commit) { ++ status = sb->s_pnfs_op->layout_commit(ino, &lcp->args, &lcp->res); ++ dprintk("%s:layout_commit result %d\n", __func__, status); ++ } else { ++ fh_lock(current_fh); ++ if ((lcp->args.lc_newoffset == 0) || ++ ((lcp->args.lc_last_wr + 1) <= ino->i_size)) { ++ status = 0; ++ lcp->res.lc_size_chg = 0; ++ fh_unlock(current_fh); ++ goto out; ++ } ++ ++ /* Try our best to update the file size */ ++ dprintk("%s: Modifying file size\n", __func__); ++ ia.ia_valid = ATTR_SIZE; ++ ia.ia_size = lcp->args.lc_last_wr + 1; ++ status = notify_change(current_fh->fh_dentry, &ia); ++ fh_unlock(current_fh); ++ dprintk("%s:notify_change result %d\n", __func__, status); ++ } ++ ++ if (!status && lcp->res.lc_size_chg && ++ EX_ISSYNC(current_fh->fh_export)) { ++ dprintk("%s: Synchronously writing inode size %llu\n", ++ __func__, ino->i_size); ++ write_inode_now(ino, 1); ++ lcp->res.lc_newsize = i_size_read(ino); ++ } ++out: ++ return status; ++} ++ ++static __be32 ++nfsd4_layoutreturn(struct svc_rqst *rqstp, ++ struct nfsd4_compound_state *cstate, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ int status; ++ struct super_block *sb; ++ struct svc_fh *current_fh = &cstate->current_fh; ++ ++ status = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (!sb) ++ goto out; ++ ++ /* Ensure underlying file system supports pNFS and, ++ * if so, the requested layout type ++ */ ++ status = nfsd4_layout_verify(sb, current_fh->fh_export, ++ lrp->args.lr_seg.layout_type); ++ if (status) ++ goto out; ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_return_type != RETURN_FILE && ++ 
lrp->args.lr_return_type != RETURN_FSID && ++ lrp->args.lr_return_type != RETURN_ALL) { ++ dprintk("pNFS %s: invalid return_type %d\n", __func__, ++ lrp->args.lr_return_type); ++ goto out; ++ } ++ ++ status = nfserr_inval; ++ if (lrp->args.lr_seg.iomode != IOMODE_READ && ++ lrp->args.lr_seg.iomode != IOMODE_RW && ++ lrp->args.lr_seg.iomode != IOMODE_ANY) { ++ dprintk("pNFS %s: invalid iomode %d\n", __func__, ++ lrp->args.lr_seg.iomode); ++ goto out; ++ } ++ ++ /* Set clientid from sessionid */ ++ copy_clientid((clientid_t *)&lrp->args.lr_seg.clientid, cstate->session); ++ lrp->lrs_present = (lrp->args.lr_return_type == RETURN_FILE); ++ status = nfs4_pnfs_return_layout(sb, current_fh, lrp); ++out: ++ dprintk("pNFS %s: status %d return_type 0x%x lrs_present %d\n", ++ __func__, status, lrp->args.lr_return_type, lrp->lrs_present); ++ return status; ++} ++#endif /* CONFIG_PNFSD */ ++ + /* + * NULL call. + */ +@@ -1317,6 +1688,29 @@ static struct nfsd4_operation nfsd4_ops[ + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_RECLAIM_COMPLETE", + }, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICELIST] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevlist, ++ .op_name = "OP_GETDEVICELIST", ++ }, ++ [OP_GETDEVICEINFO] = { ++ .op_func = (nfsd4op_func)nfsd4_getdevinfo, ++ .op_flags = ALLOWED_WITHOUT_FH, ++ .op_name = "OP_GETDEVICEINFO", ++ }, ++ [OP_LAYOUTGET] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutget, ++ .op_name = "OP_LAYOUTGET", ++ }, ++ [OP_LAYOUTCOMMIT] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutcommit, ++ .op_name = "OP_LAYOUTCOMMIT", ++ }, ++ [OP_LAYOUTRETURN] = { ++ .op_func = (nfsd4op_func)nfsd4_layoutreturn, ++ .op_name = "OP_LAYOUTRETURN", ++ }, ++#endif /* CONFIG_PNFSD */ + }; + + static const char *nfsd4_op_name(unsigned opnum) +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4state.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4state.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4state.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4state.c 2010-08-31 21:11:40.956150336 -0400 +@@ -42,6 +42,8 @@ + #include "xdr4.h" + #include "vfs.h" + ++#include "pnfsd.h" ++ + #define NFSDDBG_FACILITY NFSDDBG_PROC + + /* Globals */ +@@ -60,8 +62,6 @@ static u64 current_sessionid = 1; + #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) + + /* forward declarations */ +-static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +-static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); + static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; + static void nfs4_set_recdir(char *recdir); + +@@ -69,6 +69,7 @@ static void nfs4_set_recdir(char *recdir + + /* Currently used for almost all code touching nfsv4 state: */ + static DEFINE_MUTEX(client_mutex); ++struct task_struct *client_mutex_owner; + + /* + * Currently used for the del_recall_lru and file hash table. 
In an +@@ -86,11 +87,21 @@ void + nfs4_lock_state(void) + { + mutex_lock(&client_mutex); ++ client_mutex_owner = current; ++} ++ ++#define BUG_ON_UNLOCKED_STATE() BUG_ON(client_mutex_owner != current) ++ ++void ++nfs4_bug_on_unlocked_state(void) ++{ ++ BUG_ON(client_mutex_owner != current); + } + + void + nfs4_unlock_state(void) + { ++ client_mutex_owner = NULL; + mutex_unlock(&client_mutex); + } + +@@ -109,7 +120,7 @@ opaque_hashval(const void *ptr, int nbyt + + static struct list_head del_recall_lru; + +-static inline void ++inline void + put_nfs4_file(struct nfs4_file *fi) + { + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { +@@ -120,7 +131,7 @@ put_nfs4_file(struct nfs4_file *fi) + } + } + +-static inline void ++inline void + get_nfs4_file(struct nfs4_file *fi) + { + atomic_inc(&fi->fi_ref); +@@ -230,7 +241,10 @@ nfs4_close_delegation(struct nfs4_delega + * but we want to remove the lease in any case. */ + if (dp->dl_flock) + vfs_setlease(filp, F_UNLCK, &dp->dl_flock); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(filp); ++ nfs4_lock_state(); + } + + /* Called under the state lock. */ +@@ -266,8 +280,8 @@ static DEFINE_SPINLOCK(client_lock); + * reclaim_str_hashtbl[] holds known client info from previous reset/reboot + * used in reboot/reset lease grace period processing + * +- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed +- * setclientid_confirmed info. ++ * conf_id_hashtbl[], and conf_str_hashtbl[] hold ++ * confirmed setclientid_confirmed info. + * + * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed + * setclientid info. +@@ -292,6 +306,7 @@ static void unhash_generic_stateid(struc + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); ++ release_pnfs_ds_dev_list(stp); + } + + static void free_generic_stateid(struct nfs4_stateid *stp) +@@ -345,7 +360,10 @@ static void release_open_stateid(struct + { + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); ++ BUG_ON_UNLOCKED_STATE(); ++ nfs4_unlock_state(); /* allow nested layout recall/return */ + nfsd_close(stp->st_vfs_file); ++ nfs4_lock_state(); + free_generic_stateid(stp); + } + +@@ -739,6 +757,8 @@ expire_client(struct nfs4_client *clp) + struct nfs4_delegation *dp; + struct list_head reaplist; + ++ BUG_ON_UNLOCKED_STATE(); ++ + INIT_LIST_HEAD(&reaplist); + spin_lock(&recall_lock); + while (!list_empty(&clp->cl_delegations)) { +@@ -758,6 +778,7 @@ expire_client(struct nfs4_client *clp) + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); + release_openowner(sop); + } ++ pnfs_expire_client(clp); + nfsd4_set_callback_client(clp, NULL); + if (clp->cl_cb_conn.cb_xprt) + svc_xprt_put(clp->cl_cb_conn.cb_xprt); +@@ -770,6 +791,13 @@ expire_client(struct nfs4_client *clp) + spin_unlock(&client_lock); + } + ++void expire_client_lock(struct nfs4_client *clp) ++{ ++ nfs4_lock_state(); ++ expire_client(clp); ++ nfs4_unlock_state(); ++} ++ + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) + { + memcpy(target->cl_verifier.data, source->data, +@@ -859,6 +887,11 @@ static struct nfs4_client *create_client + INIT_LIST_HEAD(&clp->cl_strhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&clp->cl_layouts); ++ INIT_LIST_HEAD(&clp->cl_layoutrecalls); ++ atomic_set(&clp->cl_deviceref, 0); ++#endif /* CONFIG_PNFSD */ + INIT_LIST_HEAD(&clp->cl_sessions); + 
INIT_LIST_HEAD(&clp->cl_lru); + clp->cl_time = get_seconds(); +@@ -908,7 +941,7 @@ move_to_confirmed(struct nfs4_client *cl + renew_client(clp); + } + +-static struct nfs4_client * ++struct nfs4_client * + find_confirmed_client(clientid_t *clid) + { + struct nfs4_client *clp; +@@ -978,6 +1011,24 @@ find_unconfirmed_client_by_str(const cha + return NULL; + } + ++int ++filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), ++ void *arg) ++{ ++ struct nfs4_client *clp, *next; ++ int i, status = 0; ++ ++ for (i = 0; i < CLIENT_HASH_SIZE; i++) ++ list_for_each_entry_safe (clp, next, &conf_str_hashtbl[i], ++ cl_strhash) { ++ status = func(clp, arg); ++ if (status) ++ break; ++ } ++ ++ return status; ++} ++ + static void + gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid) + { +@@ -1110,8 +1161,12 @@ nfsd4_replay_cache_entry(struct nfsd4_co + static void + nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) + { +- /* pNFS is not supported */ ++#if defined(CONFIG_PNFSD) ++ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS | ++ EXCHGID4_FLAG_USE_PNFS_DS; ++#else /* CONFIG_PNFSD */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; ++#endif /* CONFIG_PNFSD */ + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; +@@ -1301,6 +1356,13 @@ nfsd4_create_session(struct svc_rqst *rq + struct nfsd4_clid_slot *cs_slot = NULL; + int status = 0; + ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++ /* XXX hack to get local ip address */ ++ memcpy(&pnfsd_lexp_addr, &rqstp->rq_xprt->xpt_local, ++ sizeof(pnfsd_lexp_addr)); ++ pnfs_lexp_addr_len = rqstp->rq_xprt->xpt_locallen; ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); +@@ -1340,25 +1402,26 @@ nfsd4_create_session(struct svc_rqst *rq + cs_slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + +- if (cr_ses->flags & SESSION4_BACK_CHAN) { +- unconf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; +- svc_xprt_get(rqstp->rq_xprt); +- rpc_copy_addr( +- (struct sockaddr *)&unconf->cl_cb_conn.cb_addr, +- sa); +- unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); +- unconf->cl_cb_conn.cb_minorversion = +- cstate->minorversion; +- unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog; +- unconf->cl_cb_seq_nr = 1; +- nfsd4_probe_callback(unconf, &unconf->cl_cb_conn); +- } ++ if (is_ds_only_session(unconf->cl_exchange_flags)) ++ cr_ses->flags &= ~SESSION4_BACK_CHAN; ++ + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + ++ if (cr_ses->flags & SESSION4_BACK_CHAN) { ++ conf->cl_cb_conn.cb_xprt = rqstp->rq_xprt; ++ svc_xprt_get(rqstp->rq_xprt); ++ rpc_copy_addr((struct sockaddr *)&conf->cl_cb_conn.cb_addr, sa); ++ conf->cl_cb_conn.cb_addrlen = svc_addr_len(sa); ++ conf->cl_cb_conn.cb_minorversion = cstate->minorversion; ++ conf->cl_cb_conn.cb_prog = cr_ses->callback_prog; ++ conf->cl_cb_seq_nr = 1; ++ nfsd4_probe_callback(conf, &conf->cl_cb_conn); ++ } ++ + /* + * We do not support RDMA or persistent sessions + */ +@@ -1746,7 +1809,7 @@ out: + + /* OPEN Share state helper functions */ + static inline struct nfs4_file * +-alloc_init_file(struct inode *ino) ++alloc_init_file(struct inode *ino, struct svc_fh *current_fh) + { + struct nfs4_file *fp; + unsigned int hashval = file_hashval(ino); +@@ -1760,6 +1823,16 @@ alloc_init_file(struct inode *ino) + fp->fi_inode = igrab(ino); + fp->fi_id = 
current_fileid++; + fp->fi_had_conflict = false; ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&fp->fi_layouts); ++ INIT_LIST_HEAD(&fp->fi_layout_states); ++ fp->fi_fsid.major = current_fh->fh_export->ex_fsid; ++ fp->fi_fsid.minor = 0; ++ fp->fi_fhlen = current_fh->fh_handle.fh_size; ++ BUG_ON(fp->fi_fhlen > sizeof(fp->fi_fhval)); ++ memcpy(fp->fi_fhval, ¤t_fh->fh_handle.fh_base, ++ fp->fi_fhlen); ++#endif /* CONFIG_PNFSD */ + spin_lock(&recall_lock); + list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); +@@ -1768,7 +1841,7 @@ alloc_init_file(struct inode *ino) + return NULL; + } + +-static void ++void + nfsd4_free_slab(struct kmem_cache **slab) + { + if (*slab == NULL) +@@ -1784,6 +1857,7 @@ nfsd4_free_slabs(void) + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); ++ nfsd4_free_pnfs_slabs(); + } + + static int +@@ -1805,6 +1879,8 @@ nfsd4_init_slabs(void) + sizeof(struct nfs4_delegation), 0, 0, NULL); + if (deleg_slab == NULL) + goto out_nomem; ++ if (nfsd4_init_pnfs_slabs()) ++ goto out_nomem; + return 0; + out_nomem: + nfsd4_free_slabs(); +@@ -1878,6 +1954,9 @@ init_stateid(struct nfs4_stateid *stp, s + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); + INIT_LIST_HEAD(&stp->st_perfile); ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &stateid_hashtbl[hashval]); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); +@@ -1919,6 +1998,7 @@ find_openstateowner_str(unsigned int has + { + struct nfs4_stateowner *so = NULL; + ++ BUG_ON_UNLOCKED_STATE(); + list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { + if (same_owner_str(so, &open->op_owner, &open->op_clientid)) + return so; +@@ -1927,7 +2007,7 @@ find_openstateowner_str(unsigned int has + } + + /* search file_hashtbl[] for file */ +-static struct nfs4_file * ++struct nfs4_file * + find_file(struct inode *ino) + { + unsigned int hashval = file_hashval(ino); +@@ -1945,6 +2025,18 @@ find_file(struct inode *ino) + return NULL; + } + ++struct nfs4_file * ++find_alloc_file(struct inode *ino, struct svc_fh *current_fh) ++{ ++ struct nfs4_file *fp; ++ ++ fp = find_file(ino); ++ if (fp) ++ return fp; ++ ++ return alloc_init_file(ino, current_fh); ++} ++ + static inline int access_valid(u32 x, u32 minorversion) + { + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) +@@ -2503,7 +2595,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; + status = nfserr_resource; +- fp = alloc_init_file(ino); ++ fp = alloc_init_file(ino, current_fh); + if (fp == NULL) + goto out; + } +@@ -2730,7 +2822,7 @@ nfs4_check_fh(struct svc_fh *fhp, struct + return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + } + +-static int ++int + STALE_STATEID(stateid_t *stateid) + { + if (stateid->si_boot == boot_time) +@@ -2740,6 +2832,16 @@ STALE_STATEID(stateid_t *stateid) + return 1; + } + ++__be32 ++nfs4_check_stateid(stateid_t *stateid) ++{ ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ return nfserr_bad_stateid; ++ if (STALE_STATEID(stateid)) ++ return nfserr_stale_stateid; ++ return 0; ++} ++ + static inline int + access_permit_read(unsigned long access_bmap) + { +@@ -2848,6 +2950,24 @@ nfs4_preprocess_stateid_op(struct nfsd4_ + if (grace_disallows_io(ino)) + return nfserr_grace; + ++#if defined(CONFIG_PNFSD) ++ if 
(pnfs_fh_is_ds(¤t_fh->fh_handle)) { ++ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) ++ status = nfserr_bad_stateid; ++ else ++#ifdef CONFIG_GFS2_FS_LOCKING_DLM ++ { ++ dprintk("%s Don't check DS stateid\n", __func__); ++ return 0; ++ } ++#else /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ status = nfs4_preprocess_pnfs_ds_stateid(current_fh, ++ stateid); ++#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ ++ goto out; ++ } ++#endif /* CONFIG_PNFSD */ ++ + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + +@@ -2924,13 +3044,9 @@ nfs4_preprocess_seqid_op(struct nfsd4_co + *stpp = NULL; + *sopp = NULL; + +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { +- dprintk("NFSD: preprocess_seqid_op: magic stateid!\n"); +- return nfserr_bad_stateid; +- } +- +- if (STALE_STATEID(stateid)) +- return nfserr_stale_stateid; ++ status = nfs4_check_stateid(stateid); ++ if (status) ++ return status; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; +@@ -3205,11 +3321,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + nfs4_lock_state(); +- status = nfserr_bad_stateid; +- if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) +- goto out; +- status = nfserr_stale_stateid; +- if (STALE_STATEID(stateid)) ++ status = nfs4_check_stateid(stateid); ++ if (status) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) +@@ -3238,26 +3351,6 @@ out: + #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) + #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) + +-static inline u64 +-end_offset(u64 start, u64 len) +-{ +- u64 end; +- +- end = start + len; +- return end >= start ? end: NFS4_MAX_UINT64; +-} +- +-/* last octet in a range */ +-static inline u64 +-last_byte_offset(u64 start, u64 len) +-{ +- u64 end; +- +- BUG_ON(!len); +- end = start + len; +- return end > start ? 
end - 1: NFS4_MAX_UINT64; +-} +- + #define lockownerid_hashval(id) \ + ((id) & LOCK_HASH_MASK) + +@@ -3274,7 +3367,7 @@ static struct list_head lock_ownerid_has + static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; + static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; + +-static struct nfs4_stateid * ++struct nfs4_stateid * + find_stateid(stateid_t *stid, int flags) + { + struct nfs4_stateid *local; +@@ -3303,7 +3396,7 @@ find_stateid(stateid_t *stid, int flags) + return NULL; + } + +-static struct nfs4_delegation * ++struct nfs4_delegation * + find_delegation_stateid(struct inode *ino, stateid_t *stid) + { + struct nfs4_file *fp; +@@ -3436,6 +3529,9 @@ alloc_init_lock_stateid(struct nfs4_stat + INIT_LIST_HEAD(&stp->st_perfile); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ ++#if defined(CONFIG_PNFSD) ++ INIT_LIST_HEAD(&stp->st_pnfs_ds_id); ++#endif /* CONFIG_PNFSD */ + list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); +@@ -3998,6 +4094,9 @@ nfs4_state_init(void) + INIT_LIST_HEAD(&client_lru); + INIT_LIST_HEAD(&del_recall_lru); + reclaim_str_hashtbl_size = 0; ++#if defined(CONFIG_PNFSD) ++ nfs4_pnfs_state_init(); ++#endif /* CONFIG_PNFSD */ + return 0; + } + +@@ -4110,6 +4209,7 @@ __nfs4_state_shutdown(void) + } + + nfsd4_shutdown_recdir(); ++ nfs4_pnfs_state_shutdown(); + nfs4_init = 0; + } + +diff -up linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c.orig linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c +--- linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfs4xdr.c 2010-08-31 21:11:40.959140127 -0400 +@@ -47,9 +47,14 @@ + #include + #include + #include ++#include ++#include ++#include ++#include + + #include "xdr4.h" + #include "vfs.h" ++#include "pnfsd.h" + + #define NFSDDBG_FACILITY NFSDDBG_XDR + +@@ -1244,6 +1249,138 @@ static __be32 nfsd4_decode_reclaim_compl + DECODE_TAIL; + } + ++#if defined(CONFIG_PNFSD) ++static __be32 ++nfsd4_decode_getdevlist(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16 + sizeof(nfs4_verifier)); ++ READ32(gdevl->gd_layout_type); ++ READ32(gdevl->gd_maxdevices); ++ READ64(gdevl->gd_cookie); ++ COPYMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_getdevinfo(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ u32 num; ++ DECODE_HEAD; ++ ++ READ_BUF(12 + sizeof(struct nfsd4_pnfs_deviceid)); ++ READ64(gdev->gd_devid.sbid); ++ READ64(gdev->gd_devid.devid); ++ READ32(gdev->gd_layout_type); ++ READ32(gdev->gd_maxcount); ++ READ32(num); ++ if (num) { ++ READ_BUF(4); ++ READ32(gdev->gd_notify_types); ++ } else { ++ gdev->gd_notify_types = 0; ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(36); ++ READ32(lgp->lg_signal); ++ READ32(lgp->lg_seg.layout_type); ++ READ32(lgp->lg_seg.iomode); ++ READ64(lgp->lg_seg.offset); ++ READ64(lgp->lg_seg.length); ++ READ64(lgp->lg_minlength); ++ nfsd4_decode_stateid(argp, &lgp->lg_sid); ++ READ_BUF(4); ++ READ32(lgp->lg_maxcount); ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ DECODE_HEAD; ++ u32 timechange; ++ ++ READ_BUF(20); ++ 
READ64(lcp->args.lc_seg.offset); ++ READ64(lcp->args.lc_seg.length); ++ READ32(lcp->args.lc_reclaim); ++ nfsd4_decode_stateid(argp, &lcp->lc_sid); ++ READ_BUF(4); ++ READ32(lcp->args.lc_newoffset); ++ if (lcp->args.lc_newoffset) { ++ READ_BUF(8); ++ READ64(lcp->args.lc_last_wr); ++ } else ++ lcp->args.lc_last_wr = 0; ++ READ_BUF(4); ++ READ32(timechange); ++ if (timechange) { ++ READ_BUF(12); ++ READ64(lcp->args.lc_mtime.seconds); ++ READ32(lcp->args.lc_mtime.nseconds); ++ } else { ++ lcp->args.lc_mtime.seconds = 0; ++ lcp->args.lc_mtime.nseconds = 0; ++ } ++ READ_BUF(8); ++ READ32(lcp->args.lc_seg.layout_type); ++ /* XXX: saving XDR'ed layout update. Since we don't have the ++ * current_fh yet, and therefore no export_ops, we can't call ++ * the layout specific decode routines. File and pVFS2 ++ * do not use the layout update.... ++ */ ++ READ32(lcp->args.lc_up_len); ++ if (lcp->args.lc_up_len > 0) { ++ READ_BUF(lcp->args.lc_up_len); ++ READMEM(lcp->args.lc_up_layout, lcp->args.lc_up_len); ++ } ++ ++ DECODE_TAIL; ++} ++ ++static __be32 ++nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ DECODE_HEAD; ++ ++ READ_BUF(16); ++ READ32(lrp->args.lr_reclaim); ++ READ32(lrp->args.lr_seg.layout_type); ++ READ32(lrp->args.lr_seg.iomode); ++ READ32(lrp->args.lr_return_type); ++ if (lrp->args.lr_return_type == RETURN_FILE) { ++ READ_BUF(16); ++ READ64(lrp->args.lr_seg.offset); ++ READ64(lrp->args.lr_seg.length); ++ nfsd4_decode_stateid(argp, &lrp->lr_sid); ++ READ_BUF(4); ++ READ32(lrp->args.lrf_body_len); ++ if (lrp->args.lrf_body_len > 0) { ++ READ_BUF(lrp->args.lrf_body_len); ++ READMEM(lrp->args.lrf_body, lrp->args.lrf_body_len); ++ } ++ } ++ ++ DECODE_TAIL; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) + { +@@ -1345,11 +1482,19 @@ static nfsd4_dec nfsd41_dec_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, +@@ -2146,6 +2291,36 @@ out_acl: + } + WRITE64(stat.ino); + } ++#if defined(CONFIG_PNFSD) ++ if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { ++ struct super_block *sb = dentry->d_inode->i_sb; ++ int type = 0; ++ ++ /* Query the filesystem for supported pNFS layout types. ++ * Currently, we only support one layout type per file system. ++ * The export_ops->layout_type() returns the pnfs_layouttype4. 
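++	 * The attribute itself is a counted array: a pNFS-capable
++	 * export encodes { 1, type } below, while a plain export
++	 * encodes an empty list via WRITE32(0).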
++ */ ++ buflen -= 4; ++ if (buflen < 0) /* length */ ++ goto out_resource; ++ ++ if (sb && sb->s_pnfs_op && sb->s_pnfs_op->layout_type) ++ type = sb->s_pnfs_op->layout_type(sb); ++ if (type) { ++ if ((buflen -= 4) < 0) /* type */ ++ goto out_resource; ++ WRITE32(1); /* length */ ++ WRITE32(type); /* type */ ++ } else ++ WRITE32(0); /* length */ ++ } ++ ++ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { ++ if ((buflen -= 4) < 0) ++ goto out_resource; ++ WRITE32(stat.blksize); ++ } ++#endif /* CONFIG_PNFSD */ + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); +@@ -2376,6 +2551,10 @@ nfsd4_encode_commit(struct nfsd4_compoun + if (!nfserr) { + RESERVE_SPACE(8); + WRITEMEM(commit->co_verf.data, 8); ++ dprintk("NFSD: nfsd4_encode_commit: verifier %x:%x\n", ++ ((u32 *)(&commit->co_verf.data))[0], ++ ((u32 *)(&commit->co_verf.data))[1]); ++ + ADJUST_ARGS(); + } + return nfserr; +@@ -2630,9 +2809,20 @@ nfsd4_encode_read(struct nfsd4_compoundr + } + read->rd_vlen = v; + ++#if defined(CONFIG_SPNFS) ++ if (spnfs_enabled()) ++ nfserr = spnfs_read(read->rd_fhp->fh_dentry->d_inode, ++ read->rd_offset, &maxcount, read->rd_vlen, ++ resp->rqstp); ++ else /* we're not an MDS */ ++ nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, ++ read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, ++ &maxcount); ++#else + nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, + &maxcount); ++#endif /* CONFIG_SPNFS */ + + if (nfserr == nfserr_symlink) + nfserr = nfserr_inval; +@@ -2936,6 +3126,9 @@ nfsd4_encode_write(struct nfsd4_compound + WRITE32(write->wr_bytes_written); + WRITE32(write->wr_how_written); + WRITEMEM(write->wr_verifier.data, 8); ++ dprintk("NFSD: nfsd4_encode_write: verifier %x:%x\n", ++ ((u32 *)(&write->wr_verifier.data))[0], ++ ((u32 *)(&write->wr_verifier.data))[1]); + ADJUST_ARGS(); + } + return nfserr; +@@ -3079,6 +3272,343 @@ nfsd4_encode_sequence(struct nfsd4_compo + return 0; + } + ++#if defined(CONFIG_PNFSD) ++ ++/* Uses the export interface to iterate through the available devices ++ * and encodes them on the response stream. ++ */ ++static __be32 ++nfsd4_encode_devlist_iterator(struct nfsd4_compoundres *resp, ++ struct nfsd4_pnfs_getdevlist *gdevl, ++ unsigned int *dev_count) ++{ ++ struct super_block *sb = gdevl->gd_fhp->fh_dentry->d_inode->i_sb; ++ __be32 nfserr; ++ int status; ++ __be32 *p; ++ struct nfsd4_pnfs_dev_iter_res res = { ++ .gd_cookie = gdevl->gd_cookie, ++ .gd_verf = gdevl->gd_verf, ++ .gd_eof = 0 ++ }; ++ u64 sbid; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ sbid = find_create_sbid(sb); ++ *dev_count = 0; ++ do { ++ status = sb->s_pnfs_op->get_device_iter(sb, ++ gdevl->gd_layout_type, ++ &res); ++ if (status) { ++ if (status == -ENOENT) { ++ res.gd_eof = 1; ++ /* return success */ ++ break; ++ } ++ nfserr = nfserrno(status); ++ goto out_err; ++ } ++ ++ /* Encode device id and layout type */ ++ RESERVE_SPACE(sizeof(struct nfsd4_pnfs_deviceid)); ++ WRITE64((__be64)sbid); ++ WRITE64(res.gd_devid); /* devid minor */ ++ ADJUST_ARGS(); ++ (*dev_count)++; ++ } while (*dev_count < gdevl->gd_maxdevices && !res.gd_eof); ++ gdevl->gd_cookie = res.gd_cookie; ++ gdevl->gd_verf = res.gd_verf; ++ gdevl->gd_eof = res.gd_eof; ++ nfserr = nfs_ok; ++out_err: ++ dprintk("%s: Encoded %u devices\n", __func__, *dev_count); ++ return nfserr; ++} ++ ++/* Encodes the response of get device list. 
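++ * On the wire this is: cookie (8 bytes), verifier, the count of
++ * device ids actually encoded, the device id array itself, and a
++ * trailing eof flag; the count is backfilled once the iterator
++ * has run.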
++*/ ++static __be32 ++nfsd4_encode_getdevlist(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevlist *gdevl) ++{ ++ unsigned int dev_count = 0, lead_count; ++ u32 *p_in = resp->p; ++ __be32 *p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ /* Ensure we have room for cookie, verifier, and devlist len, ++ * which we will backfill in after we encode as many devices as possible ++ */ ++ lead_count = 8 + sizeof(nfs4_verifier) + 4; ++ RESERVE_SPACE(lead_count); ++ /* skip past these values */ ++ p += XDR_QUADLEN(lead_count); ++ ADJUST_ARGS(); ++ ++ /* Iterate over as many device ids as possible on the xdr stream */ ++ nfserr = nfsd4_encode_devlist_iterator(resp, gdevl, &dev_count); ++ if (nfserr) ++ goto out_err; ++ ++ /* Backfill in cookie, verf and number of devices encoded */ ++ p = p_in; ++ WRITE64(gdevl->gd_cookie); ++ WRITEMEM(&gdevl->gd_verf, sizeof(nfs4_verifier)); ++ WRITE32(dev_count); ++ ++ /* Skip over devices */ ++ p += XDR_QUADLEN(dev_count * sizeof(struct nfsd4_pnfs_deviceid)); ++ ADJUST_ARGS(); ++ ++ /* are we at the end of devices? */ ++ RESERVE_SPACE(4); ++ WRITE32(gdevl->gd_eof); ++ ADJUST_ARGS(); ++ ++ dprintk("%s: done.\n", __func__); ++ ++ nfserr = nfs_ok; ++out: ++ return nfserr; ++out_err: ++ p = p_in; ++ ADJUST_ARGS(); ++ goto out; ++} ++ ++/* For a given device id, have the file system retrieve and encode the ++ * associated device. For file layout, the encoding function is ++ * passed down to the file system. The file system then has the option ++ * of using this encoding function or one of its own. ++ * ++ * Note: the file system must return the XDR size of struct device_addr4 ++ * da_addr_body in pnfs_xdr_info.bytes_written on NFS4ERR_TOOSMALL for the ++ * gdir_mincount calculation. ++ */ ++static __be32 ++nfsd4_encode_getdevinfo(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_getdevinfo *gdev) ++{ ++ struct super_block *sb; ++ int maxcount = 0, type_notify_len = 12; ++ __be32 *p, *p_save = NULL, *p_in = resp->p; ++ struct exp_xdr_stream xdr; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = gdev->gd_sb; ++ ++ if (gdev->gd_maxcount != 0) { ++ /* FIXME: this will be bound by the session max response */ ++ maxcount = svc_max_payload(resp->rqstp); ++ if (maxcount > gdev->gd_maxcount) ++ maxcount = gdev->gd_maxcount; ++ ++ /* Ensure have room for type and notify field */ ++ maxcount -= type_notify_len; ++ if (maxcount < 0) { ++ nfserr = -ETOOSMALL; ++ goto toosmall; ++ } ++ } ++ ++ RESERVE_SPACE(4); ++ WRITE32(gdev->gd_layout_type); ++ ADJUST_ARGS(); ++ ++ /* If maxcount is 0 then just update notifications */ ++ if (gdev->gd_maxcount == 0) ++ goto handle_notifications; ++ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ nfserr = sb->s_pnfs_op->get_device_info(sb, &xdr, gdev->gd_layout_type, ++ &gdev->gd_devid); ++ if (nfserr) ++ goto err; ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ BUG_ON(xdr.p > xdr.end); ++ ++ /* Update the xdr stream with the number of bytes encoded ++ * by the file system. 
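++	 * (xdr.p - p_save) quadwords is the da_addr_body the
++	 * client will see.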
++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++handle_notifications: ++ /* Encode supported device notifications */ ++ RESERVE_SPACE(4); ++ if (sb->s_pnfs_op->set_device_notify) { ++ struct pnfs_devnotify_arg dn_args; ++ ++ dn_args.dn_layout_type = gdev->gd_layout_type; ++ dn_args.dn_devid = gdev->gd_devid; ++ dn_args.dn_notify_types = gdev->gd_notify_types; ++ nfserr = sb->s_pnfs_op->set_device_notify(sb, &dn_args); ++ if (nfserr) ++ goto err; ++ WRITE32(dn_args.dn_notify_types); ++ } else { ++ WRITE32(0); ++ } ++ ADJUST_ARGS(); ++ ++out: ++ return nfserrno(nfserr); ++toosmall: ++ dprintk("%s: maxcount too small\n", __func__); ++ RESERVE_SPACE(4); ++ WRITE32((p_save ? (xdr.p - p_save) * 4 : 0) + type_notify_len); ++ ADJUST_ARGS(); ++ goto out; ++err: ++ /* Rewind to the beginning */ ++ p = p_in; ++ ADJUST_ARGS(); ++ if (nfserr == -ETOOSMALL) ++ goto toosmall; ++ printk(KERN_ERR "%s: export ERROR %d\n", __func__, nfserr); ++ goto out; ++} ++ ++static __be32 ++nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, ++ __be32 nfserr, ++ struct nfsd4_pnfs_layoutget *lgp) ++{ ++ int maxcount, leadcount; ++ struct super_block *sb; ++ struct exp_xdr_stream xdr; ++ __be32 *p, *p_save, *p_start = resp->p; ++ ++ dprintk("%s: err %d\n", __func__, nfserr); ++ if (nfserr) ++ return nfserr; ++ ++ sb = lgp->lg_fhp->fh_dentry->d_inode->i_sb; ++ maxcount = PAGE_SIZE; ++ if (maxcount > lgp->lg_maxcount) ++ maxcount = lgp->lg_maxcount; ++ ++ /* Check for space on xdr stream */ ++ leadcount = 36 + sizeof(stateid_opaque_t); ++ RESERVE_SPACE(leadcount); ++ /* encode layout metadata after file system encodes layout */ ++ p += XDR_QUADLEN(leadcount); ++ ADJUST_ARGS(); ++ ++ /* Ensure have room for ret_on_close, off, len, iomode, type */ ++ maxcount -= leadcount; ++ if (maxcount < 0) { ++ printk(KERN_ERR "%s: buffer too small\n", __func__); ++ nfserr = nfserr_toosmall; ++ goto err; ++ } ++ ++ /* Set xdr info so file system can encode layout */ ++ xdr.p = p_save = resp->p; ++ xdr.end = resp->end; ++ if (xdr.end - xdr.p > exp_xdr_qwords(maxcount & ~3)) ++ xdr.end = xdr.p + exp_xdr_qwords(maxcount & ~3); ++ ++ /* Retrieve, encode, and merge layout; process stateid */ ++ nfserr = nfs4_pnfs_get_layout(lgp, &xdr); ++ if (nfserr) ++ goto err; ++ ++ /* Ensure file system returned enough bytes for the client ++ * to access. 
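++	 * The segment may be longer than requested, but anything
++	 * shorter than lg_minlength is rejected as nfserr_badlayout.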
++ */ ++ if (lgp->lg_seg.length < lgp->lg_minlength) { ++ nfserr = nfserr_badlayout; ++ goto err; ++ } ++ ++ /* The file system should never write 0 bytes without ++ * returning an error ++ */ ++ BUG_ON(xdr.p == p_save); ++ ++ /* Rewind to beginning and encode attrs */ ++ resp->p = p_start; ++ RESERVE_SPACE(4); ++ WRITE32(lgp->lg_roc); /* return on close */ ++ ADJUST_ARGS(); ++ nfsd4_encode_stateid(resp, &lgp->lg_sid); ++ RESERVE_SPACE(28); ++ /* Note: response logr_layout array count, always one for now */ ++ WRITE32(1); ++ WRITE64(lgp->lg_seg.offset); ++ WRITE64(lgp->lg_seg.length); ++ WRITE32(lgp->lg_seg.iomode); ++ WRITE32(lgp->lg_seg.layout_type); ++ ++ /* Update the xdr stream with the number of bytes written ++ * by the file system ++ */ ++ p = xdr.p; ++ ADJUST_ARGS(); ++ ++ return nfs_ok; ++err: ++ resp->p = p_start; ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutcommit *lcp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lcp->res.lc_size_chg); ++ ADJUST_ARGS(); ++ if (lcp->res.lc_size_chg) { ++ RESERVE_SPACE(8); ++ WRITE64(lcp->res.lc_newsize); ++ ADJUST_ARGS(); ++ } ++out: ++ return nfserr; ++} ++ ++static __be32 ++nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, ++ struct nfsd4_pnfs_layoutreturn *lrp) ++{ ++ __be32 *p; ++ ++ if (nfserr) ++ goto out; ++ ++ RESERVE_SPACE(4); ++ WRITE32(lrp->lrs_present != 0); /* got stateid? */ ++ ADJUST_ARGS(); ++ if (lrp->lrs_present) ++ nfsd4_encode_stateid(resp, &lrp->lr_sid); ++out: ++ return nfserr; ++} ++#endif /* CONFIG_PNFSD */ ++ + static __be32 + nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) + { +@@ -3139,11 +3669,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, ++#if defined(CONFIG_PNFSD) ++ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdevinfo, ++ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_getdevlist, ++ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, ++ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, ++ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, ++#else /* CONFIG_PNFSD */ + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, ++#endif /* CONFIG_PNFSD */ + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, +diff -up linux-2.6.35.noarch/fs/nfsd/nfsctl.c.orig linux-2.6.35.noarch/fs/nfsd/nfsctl.c +--- linux-2.6.35.noarch/fs/nfsd/nfsctl.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfsctl.c 2010-08-31 21:11:40.960150387 -0400 +@@ -13,10 +13,15 @@ + #include + #include + #include ++#include + + #include "nfsd.h" + #include "cache.h" + ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++#include ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + /* + * We have a single directory with 9 nodes in it. 
+ */ +@@ -49,6 +54,9 @@ enum { + NFSD_Gracetime, + NFSD_RecoveryDir, + #endif ++#ifdef CONFIG_PNFSD ++ NFSD_pnfs_dlm_device, ++#endif + }; + + /* +@@ -74,6 +82,9 @@ static ssize_t write_leasetime(struct fi + static ssize_t write_gracetime(struct file *file, char *buf, size_t size); + static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); + #endif ++#ifdef CONFIG_PNFSD ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size); ++#endif + + static ssize_t (*write_op[])(struct file *, char *, size_t) = { + [NFSD_Svc] = write_svc, +@@ -96,6 +107,9 @@ static ssize_t (*write_op[])(struct file + [NFSD_Gracetime] = write_gracetime, + [NFSD_RecoveryDir] = write_recoverydir, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = write_pnfs_dlm_device, ++#endif + }; + + static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos) +@@ -1349,6 +1363,68 @@ static ssize_t write_recoverydir(struct + + #endif + ++#ifdef CONFIG_PNFSD ++ ++static ssize_t __write_pnfs_dlm_device(struct file *file, char *buf, ++ size_t size) ++{ ++ char *mesg = buf; ++ char *pnfs_dlm_device; ++ int max_size = NFSD_PNFS_DLM_DEVICE_MAX; ++ int len, ret = 0; ++ ++ if (size > 0) { ++ ret = -EINVAL; ++ if (size > max_size || buf[size-1] != '\n') ++ return ret; ++ buf[size-1] = 0; ++ ++ pnfs_dlm_device = mesg; ++ len = qword_get(&mesg, pnfs_dlm_device, size); ++ if (len <= 0) ++ return ret; ++ ++ ret = nfsd4_set_pnfs_dlm_device(pnfs_dlm_device, len); ++ } else ++ return nfsd4_get_pnfs_dlm_device_list(buf, SIMPLE_TRANSACTION_LIMIT); ++ ++ return ret <= 0 ? ret : strlen(buf); ++} ++ ++/** ++ * write_pnfs_dlm_device - Set or report the current pNFS data server list ++ * ++ * Input: ++ * buf: ignored ++ * size: zero ++ * ++ * OR ++ * ++ * Input: ++ * buf: C string containing a block device name, ++ * a colon, and then a comma separated ++ * list of pNFS data server IPv4 addresses ++ * size: non-zero length of C string in @buf ++ * Output: ++ * On success: passed-in buffer filled with '\n'-terminated C ++ * string containing a block device name, a colon, and ++ * then a comma separated list of pNFS ++ * data server IPv4 addresses. ++ * return code is the size in bytes of the string ++ * On error: return code is a negative errno value ++ */ ++static ssize_t write_pnfs_dlm_device(struct file *file, char *buf, size_t size) ++{ ++ ssize_t rv; ++ ++ mutex_lock(&nfsd_mutex); ++ rv = __write_pnfs_dlm_device(file, buf, size); ++ mutex_unlock(&nfsd_mutex); ++ return rv; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ + /*----------------------------------------------------------------------------*/ + /* + * populating the filesystem. 
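A quick usage sketch for the new pnfs_dlm_device node (the device name
and DS addresses below are hypothetical, and the nfsd filesystem is
assumed to be mounted at /proc/fs/nfsd):

	echo "sda:192.168.1.10,192.168.1.11" > /proc/fs/nfsd/pnfs_dlm_device
	cat /proc/fs/nfsd/pnfs_dlm_device

The write must be newline-terminated and no longer than
NFSD_PNFS_DLM_DEVICE_MAX; reading the node back returns the currently
configured device list.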
+@@ -1383,6 +1459,10 @@ static int nfsd_fill_super(struct super_ + [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + #endif ++#ifdef CONFIG_PNFSD ++ [NFSD_pnfs_dlm_device] = {"pnfs_dlm_device", &transaction_ops, ++ S_IWUSR|S_IRUSR}, ++#endif + /* last one */ {""} + }; + return simple_fill_super(sb, 0x6e667364, nfsd_files); +@@ -1421,6 +1501,9 @@ static int create_proc_exports_entry(voi + } + #endif + ++#if defined(CONFIG_SPNFS_BLOCK) ++int nfsd_bl_init(void); ++#endif + static int __init init_nfsd(void) + { + int retval; +@@ -1443,6 +1526,15 @@ static int __init init_nfsd(void) + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ retval = spnfs_init_proc(); ++ if (retval != 0) ++ goto out_free_idmap; ++#if defined(CONFIG_SPNFS_BLOCK) ++ nfsd_bl_init(); ++#endif /* CONFIG_SPNFS_BLOCK */ ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ + retval = register_filesystem(&nfsd_fs_type); + if (retval) + goto out_free_all; +@@ -1465,7 +1557,22 @@ out_free_stat: + + static void __exit exit_nfsd(void) + { ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS) ++ remove_proc_entry("fs/nfs/spnfs/recall", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/getfh", NULL); ++ remove_proc_entry("fs/nfs/spnfs/config", NULL); ++ remove_proc_entry("fs/nfs/spnfs/ctl", NULL); ++ remove_proc_entry("fs/nfs/spnfs", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS */ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ remove_proc_entry("fs/nfs/spnfs/layoutseg", NULL); ++ remove_proc_entry("fs/nfs/spnfs/layoutsegsize", NULL); ++#endif /* CONFIG_PROC_FS && CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ + nfsd_export_shutdown(); ++ nfsd4_pnfs_dlm_shutdown(); + nfsd_reply_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); +diff -up linux-2.6.35.noarch/fs/nfsd/nfsd.h.orig linux-2.6.35.noarch/fs/nfsd/nfsd.h +--- linux-2.6.35.noarch/fs/nfsd/nfsd.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfsd.h 2010-08-31 21:11:40.961160861 -0400 +@@ -285,11 +285,17 @@ extern time_t nfsd4_grace; + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ + NFSD4_SUPPORTED_ATTRS_WORD0 + ++#if defined(CONFIG_PNFSD) ++#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ ++ (NFSD4_SUPPORTED_ATTRS_WORD1 | FATTR4_WORD1_FS_LAYOUT_TYPES) ++#else /* CONFIG_PNFSD */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ + NFSD4_SUPPORTED_ATTRS_WORD1 ++#endif /* CONFIG_PNFSD */ + + #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ +- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) ++ (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT | \ ++ FATTR4_WORD2_LAYOUT_BLKSIZE) + + static inline u32 nfsd_suppattrs0(u32 minorversion) + { +diff -up linux-2.6.35.noarch/fs/nfsd/nfsfh.c.orig linux-2.6.35.noarch/fs/nfsd/nfsfh.c +--- linux-2.6.35.noarch/fs/nfsd/nfsfh.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfsfh.c 2010-08-31 21:11:40.962160533 -0400 +@@ -10,6 +10,7 @@ + #include + + #include ++#include + #include "nfsd.h" + #include "vfs.h" + #include "auth.h" +@@ -139,6 +140,7 @@ static inline __be32 check_pseudo_root(s + static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) + { + struct knfsd_fh *fh = &fhp->fh_handle; ++ int fsid_type; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry 
*dentry; +@@ -159,7 +161,8 @@ static __be32 nfsd_set_fh_dentry(struct + return error; + if (fh->fh_auth_type != 0) + return error; +- len = key_len(fh->fh_fsid_type) / 4; ++ fsid_type = pnfs_fh_fsid_type(fh); ++ len = key_len(fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { +@@ -172,7 +175,7 @@ static __be32 nfsd_set_fh_dentry(struct + data_left -= len; + if (data_left < 0) + return error; +- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); ++ exp = rqst_exp_find(rqstp, fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; +diff -up linux-2.6.35.noarch/fs/nfsd/nfsfh.h.orig linux-2.6.35.noarch/fs/nfsd/nfsfh.h +--- linux-2.6.35.noarch/fs/nfsd/nfsfh.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfsfh.h 2010-08-31 21:11:40.963170389 -0400 +@@ -14,6 +14,7 @@ enum nfsd_fsid { + FSID_UUID8, + FSID_UUID16, + FSID_UUID16_INUM, ++ FSID_MAX + }; + + enum fsid_source { +@@ -205,4 +206,42 @@ fh_unlock(struct svc_fh *fhp) + } + } + ++#if defined(CONFIG_PNFSD) ++ ++/* ++ * fh_fsid_type is overloaded to indicate whether a filehandle was one supplied ++ * to a DS by LAYOUTGET. nfs4_preprocess_stateid_op() uses this to decide how ++ * to handle a given stateid. ++ */ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return fh->fh_fsid_type >= FSID_MAX; ++} ++ ++static inline void pnfs_fh_mark_ds(struct knfsd_fh *fh) ++{ ++ BUG_ON(fh->fh_version != 1); ++ BUG_ON(pnfs_fh_is_ds(fh)); ++ fh->fh_fsid_type += FSID_MAX; ++} ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline int pnfs_fh_is_ds(struct knfsd_fh *fh) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_PNFSD */ ++ ++/* allows fh_verify() to check the real fsid_type (i.e., not overloaded). */ ++static inline int pnfs_fh_fsid_type(struct knfsd_fh *fh) ++{ ++ int fsid_type = fh->fh_fsid_type; ++ ++ if (pnfs_fh_is_ds(fh)) ++ return fsid_type - FSID_MAX; ++ return fsid_type; ++} ++ + #endif /* _LINUX_NFSD_FH_INT_H */ +diff -up linux-2.6.35.noarch/fs/nfsd/nfssvc.c.orig linux-2.6.35.noarch/fs/nfsd/nfssvc.c +--- linux-2.6.35.noarch/fs/nfsd/nfssvc.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/nfssvc.c 2010-08-31 21:11:40.963170389 -0400 +@@ -115,7 +115,7 @@ struct svc_program nfsd_program = { + + }; + +-u32 nfsd_supported_minorversion; ++u32 nfsd_supported_minorversion = NFSD_SUPPORTED_MINOR_VERSION; + + int nfsd_vers(int vers, enum vers_op change) + { +diff -up linux-2.6.35.noarch/fs/nfsd/pnfsd.h.orig linux-2.6.35.noarch/fs/nfsd/pnfsd.h +--- linux-2.6.35.noarch/fs/nfsd/pnfsd.h.orig 2010-08-31 21:11:40.964171061 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/pnfsd.h 2010-08-31 21:11:40.964171061 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (c) 2005 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. 
Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ */ ++ ++#ifndef LINUX_NFSD_PNFSD_H ++#define LINUX_NFSD_PNFSD_H ++ ++#include ++#include ++ ++#include "state.h" ++#include "xdr4.h" ++ ++/* outstanding layout stateid */ ++struct nfs4_layout_state { ++ struct list_head ls_perfile; ++ struct list_head ls_layouts; /* list of nfs4_layouts */ ++ struct kref ls_ref; ++ struct nfs4_client *ls_client; ++ struct nfs4_file *ls_file; ++ stateid_t ls_stateid; ++}; ++ ++/* outstanding layout */ ++struct nfs4_layout { ++ struct list_head lo_perfile; /* hash by f_id */ ++ struct list_head lo_perclnt; /* hash by clientid */ ++ struct list_head lo_perstate; ++ struct nfs4_file *lo_file; /* backpointer */ ++ struct nfs4_client *lo_client; ++ struct nfs4_layout_state *lo_state; ++ struct nfsd4_layout_seg lo_seg; ++}; ++ ++struct pnfs_inval_state { ++ struct knfsd_fh mdsfh; /* needed only by invalidate all */ ++ stateid_t stid; ++ clientid_t clid; ++ u32 status; ++}; ++ ++/* pNFS Data Server state */ ++#define DS_STATEID_VALID 0 ++#define DS_STATEID_ERROR 1 ++#define DS_STATEID_NEW 2 ++ ++struct pnfs_ds_stateid { ++ struct list_head ds_hash; /* ds_stateid hash entry */ ++ struct list_head ds_perclid; /* per client hash entry */ ++ stateid_t ds_stid; ++ struct knfsd_fh ds_fh; ++ unsigned long ds_access; ++ u32 ds_status; /* from MDS */ ++ u32 ds_verifier[2]; /* from MDS */ ++ wait_queue_head_t ds_waitq; ++ unsigned long ds_flags; ++ struct kref ds_ref; ++ clientid_t ds_mdsclid; ++}; ++ ++struct pnfs_ds_clientid { ++ struct list_head dc_hash; /* mds_clid_hashtbl entry */ ++ struct list_head dc_stateid; /* ds_stateid head */ ++ struct list_head dc_permdsid; /* per mdsid hash entry */ ++ clientid_t dc_mdsclid; ++ struct kref dc_ref; ++ uint32_t dc_mdsid; ++}; ++ ++struct pnfs_mds_id { ++ struct list_head di_hash; /* mds_nodeid list entry */ ++ struct list_head di_mdsclid; /* mds_clientid head */ ++ uint32_t di_mdsid; ++ time_t di_mdsboot; /* mds boot time */ ++ struct kref di_ref; ++}; ++ ++/* notify device request (from exported filesystem) */ ++struct nfs4_notify_device { ++ struct nfsd4_pnfs_cb_dev_list *nd_list; ++ struct nfs4_client *nd_client; ++ struct list_head nd_perclnt; ++ ++ void *nd_args; /* nfsd internal */ ++}; ++ ++u64 find_create_sbid(struct super_block *); ++struct super_block *find_sbid_id(u64); ++__be32 nfs4_pnfs_get_layout(struct nfsd4_pnfs_layoutget *, struct exp_xdr_stream *); ++int nfs4_pnfs_return_layout(struct super_block *, struct svc_fh *, ++ struct nfsd4_pnfs_layoutreturn *); ++int nfs4_pnfs_cb_get_state(struct super_block *, struct pnfs_get_state *); ++int 
nfs4_pnfs_cb_change_state(struct pnfs_get_state *); ++void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++int put_layoutrecall(struct nfs4_layoutrecall *); ++void nomatching_layout(struct nfs4_layoutrecall *); ++void *layoutrecall_done(struct nfs4_layoutrecall *); ++int nfsd4_cb_layout(struct nfs4_layoutrecall *); ++int nfsd_layout_recall_cb(struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++int nfsd_device_notify_cb(struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++int nfsd4_cb_notify_device(struct nfs4_notify_device *); ++void pnfs_set_device_notify(clientid_t *, unsigned int types); ++void pnfs_clear_device_notify(struct nfs4_client *); ++ ++#if defined(CONFIG_PNFSD_LOCAL_EXPORT) ++extern struct sockaddr pnfsd_lexp_addr; ++extern size_t pnfs_lexp_addr_len; ++ ++extern void pnfsd_lexp_init(struct inode *); ++#endif /* CONFIG_PNFSD_LOCAL_EXPORT */ ++ ++#endif /* LINUX_NFSD_PNFSD_H */ +diff -up linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c.orig linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c +--- linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c.orig 2010-08-31 21:11:40.965029541 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/pnfsd_lexp.c 2010-08-31 21:11:40.965029541 -0400 +@@ -0,0 +1,225 @@ ++/* ++ * linux/fs/nfsd/pnfs_lexp.c ++ * ++ * pNFS export of local filesystems. ++ * ++ * Export local file systems over the files layout type. ++ * The MDS (metadata server) functions also as a single DS (data server). ++ * This is mostly useful for development and debugging purposes. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * Copyright (C) 2008 Benny Halevy, ++ * ++ * Initial implementation was based on the pnfs-gfs2 patches done ++ * by David M. 
Richter ++ */ ++ ++#include ++#include ++ ++#include "pnfsd.h" ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++struct sockaddr pnfsd_lexp_addr; ++size_t pnfs_lexp_addr_len; ++ ++static int ++pnfsd_lexp_layout_type(struct super_block *sb) ++{ ++ int ret = LAYOUT_NFSV4_1_FILES; ++ dprintk("<-- %s: return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++pnfsd_lexp_get_device_iter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *res) ++{ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ res->gd_eof = 1; ++ if (res->gd_cookie) ++ return -ENOENT; ++ res->gd_cookie = 1; ++ res->gd_verf = 1; ++ res->gd_devid = 1; ++ ++ dprintk("<-- %s: return 0\n", __func__); ++ return 0; ++} ++ ++static int ++pnfsd_lexp_get_device_info(struct super_block *sb, ++ struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ int err; ++ struct pnfs_filelayout_device fdev; ++ struct pnfs_filelayout_multipath fl_devices[1]; ++ u32 fl_stripe_indices[1] = { 0 }; ++ struct pnfs_filelayout_devaddr daddr; ++ /* %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x.%03u.%03u */ ++ char daddr_buf[8*4 + 2*3 + 10]; ++ ++ dprintk("--> %s: sb=%p\n", __func__, sb); ++ ++ BUG_ON(layout_type != LAYOUT_NFSV4_1_FILES); ++ ++ memset(&fdev, '\0', sizeof(fdev)); ++ ++ if (devid->devid != 1) { ++ printk(KERN_ERR "%s: WARNING: didn't receive a deviceid of 1 " ++ "(got: 0x%llx)\n", __func__, devid->devid); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ /* count the number of comma-delimited DS IPs */ ++ fdev.fl_device_length = 1; ++ fdev.fl_device_list = fl_devices; ++ ++ fdev.fl_stripeindices_length = fdev.fl_device_length; ++ fdev.fl_stripeindices_list = fl_stripe_indices; ++ ++ daddr.r_addr.data = daddr_buf; ++ daddr.r_addr.len = sizeof(daddr_buf); ++ err = __svc_print_netaddr(&pnfsd_lexp_addr, &daddr.r_addr); ++ if (err < 0) ++ goto out; ++ daddr.r_addr.len = err; ++ switch (pnfsd_lexp_addr.sa_family) { ++ case AF_INET: ++ daddr.r_netid.data = "tcp"; ++ daddr.r_netid.len = 3; ++ break; ++ case AF_INET6: ++ daddr.r_netid.data = "tcp6"; ++ daddr.r_netid.len = 4; ++ break; ++ default: ++ BUG(); ++ } ++ fdev.fl_device_list[0].fl_multipath_length = 1; ++ fdev.fl_device_list[0].fl_multipath_list = &daddr; ++ ++ /* have nfsd encode the device info */ ++ err = filelayout_encode_devinfo(xdr, &fdev); ++out: ++ dprintk("<-- %s: return %d\n", __func__, err); ++ return err; ++} ++ ++static int get_stripe_unit(int blocksize) ++{ ++ if (blocksize < NFSSVC_MAXBLKSIZE) ++ blocksize = NFSSVC_MAXBLKSIZE - (NFSSVC_MAXBLKSIZE % blocksize); ++ dprintk("%s: return %d\n", __func__, blocksize); ++ return blocksize; ++} ++ ++static enum nfsstat4 ++pnfsd_lexp_layout_get(struct inode *inode, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *arg, ++ struct nfsd4_pnfs_layoutget_res *res) ++{ ++ enum nfsstat4 rc = NFS4_OK; ++ struct pnfs_filelayout_layout *layout = NULL; ++ struct knfsd_fh *fhp = NULL; ++ ++ dprintk("--> %s: inode=%p\n", __func__, inode); ++ ++ res->lg_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ res->lg_seg.offset = 0; ++ res->lg_seg.length = NFS4_MAX_UINT64; ++ ++ layout = kzalloc(sizeof(*layout), GFP_KERNEL); ++ if (layout == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ /* Set file layout response args */ ++ layout->lg_layout_type = LAYOUT_NFSV4_1_FILES; ++ layout->lg_stripe_type = STRIPE_SPARSE; ++ layout->lg_commit_through_mds = true; ++ layout->lg_stripe_unit = get_stripe_unit(inode->i_sb->s_blocksize); ++ 
layout->lg_fh_length = 1; ++ layout->device_id.sbid = arg->lg_sbid; ++ layout->device_id.devid = 1; /*FSFTEMP*/ ++ layout->lg_first_stripe_index = 0; /*FSFTEMP*/ ++ layout->lg_pattern_offset = 0; ++ ++ fhp = kmalloc(sizeof(*fhp), GFP_KERNEL); ++ if (fhp == NULL) { ++ rc = -ENOMEM; ++ goto error; ++ } ++ ++ memcpy(fhp, arg->lg_fh, sizeof(*fhp)); ++ pnfs_fh_mark_ds(fhp); ++ layout->lg_fh_list = fhp; ++ ++ /* Call nfsd to encode layout */ ++ rc = filelayout_encode_layout(xdr, layout); ++exit: ++ kfree(layout); ++ kfree(fhp); ++ dprintk("<-- %s: return %d\n", __func__, rc); ++ return rc; ++ ++error: ++ res->lg_seg.length = 0; ++ goto exit; ++} ++ ++static int ++pnfsd_lexp_layout_commit(struct inode *inode, ++ const struct nfsd4_pnfs_layoutcommit_arg *args, ++ struct nfsd4_pnfs_layoutcommit_res *res) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int ++pnfsd_lexp_layout_return(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ dprintk("%s: (unimplemented)\n", __func__); ++ ++ return 0; ++} ++ ++static int pnfsd_lexp_get_state(struct inode *inode, struct knfsd_fh *fh, ++ struct pnfs_get_state *p) ++{ ++ return 0; /* just use the current stateid */ ++} ++ ++static struct pnfs_export_operations pnfsd_lexp_ops = { ++ .layout_type = pnfsd_lexp_layout_type, ++ .get_device_info = pnfsd_lexp_get_device_info, ++ .get_device_iter = pnfsd_lexp_get_device_iter, ++ .layout_get = pnfsd_lexp_layout_get, ++ .layout_commit = pnfsd_lexp_layout_commit, ++ .layout_return = pnfsd_lexp_layout_return, ++ .get_state = pnfsd_lexp_get_state, ++}; ++ ++void ++pnfsd_lexp_init(struct inode *inode) ++{ ++ dprintk("%s: &pnfsd_lexp_ops=%p\n", __func__, &pnfsd_lexp_ops); ++ inode->i_sb->s_pnfs_op = &pnfsd_lexp_ops; ++} +diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig linux-2.6.35.noarch/fs/nfsd/spnfs_com.c +--- linux-2.6.35.noarch/fs/nfsd/spnfs_com.c.orig 2010-08-31 21:11:40.966160507 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/spnfs_com.c 2010-08-31 21:11:40.966160507 -0400 +@@ -0,0 +1,535 @@ ++/* ++ * fs/nfsd/spnfs_com.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * Based heavily on idmap.c ++ * ++ */ ++ ++/* ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Marius Aamodt Eriksen ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define NFSDDBG_FACILITY NFSDDBG_PROC ++ ++static ssize_t spnfs_pipe_upcall(struct file *, struct rpc_pipe_msg *, ++ char __user *, size_t); ++static ssize_t spnfs_pipe_downcall(struct file *, const char __user *, ++ size_t); ++static void spnfs_pipe_destroy_msg(struct rpc_pipe_msg *); ++ ++static struct rpc_pipe_ops spnfs_upcall_ops = { ++ .upcall = spnfs_pipe_upcall, ++ .downcall = spnfs_pipe_downcall, ++ .destroy_msg = spnfs_pipe_destroy_msg, ++}; ++ ++/* evil global variable */ ++struct spnfs *global_spnfs; ++struct spnfs_config *spnfs_config; ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++int spnfs_use_layoutsegments; ++uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++/* ++ * Used by spnfs_enabled() ++ * Tracks if the subsystem has been initialized at some point. It doesn't ++ * matter if it's not currently initialized. ++ */ ++static int spnfs_enabled_at_some_point; ++ ++/* call this to start the ball rolling */ ++/* code it like we're going to avoid the global variable in the future */ ++int ++nfsd_spnfs_new(void) ++{ ++ struct spnfs *spnfs = NULL; ++ struct path path; ++ struct nameidata nd; ++ int rc; ++ ++ if (global_spnfs != NULL) ++ return -EEXIST; ++ ++ path.mnt = rpc_get_mount(); ++ if (IS_ERR(path.mnt)) ++ return PTR_ERR(path.mnt); ++ ++ /* FIXME: do not abuse rpc_pipefs/nfs */ ++ rc = vfs_path_lookup(path.mnt->mnt_root, path.mnt, "/nfs", 0, &nd); ++ if (rc) ++ goto err; ++ ++ spnfs = kzalloc(sizeof(*spnfs), GFP_KERNEL); ++ if (spnfs == NULL){ ++ rc = -ENOMEM; ++ goto err; ++ } ++ ++ spnfs->spnfs_dentry = rpc_mkpipe(nd.path.dentry, "spnfs", spnfs, ++ &spnfs_upcall_ops, 0); ++ if (IS_ERR(spnfs->spnfs_dentry)) { ++ rc = -EPIPE; ++ goto err; ++ } ++ ++ mutex_init(&spnfs->spnfs_lock); ++ mutex_init(&spnfs->spnfs_plock); ++ init_waitqueue_head(&spnfs->spnfs_wq); ++ ++ global_spnfs = spnfs; ++ spnfs_enabled_at_some_point = 1; ++ ++ return 0; ++err: ++ rpc_put_mount(); ++ kfree(spnfs); ++ return rc; ++} ++ ++/* again, code it like we're going to remove the global variable */ ++void ++nfsd_spnfs_delete(void) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ ++ if (!spnfs) ++ return; ++ rpc_unlink(spnfs->spnfs_dentry); ++ rpc_put_mount(); ++ global_spnfs = NULL; ++ kfree(spnfs); ++} ++ ++/* RPC pipefs upcall/downcall routines */ ++/* looks like this code is invoked by the rpc_pipe code */ ++/* to handle upcalls on things we've queued elsewhere */ ++/* See nfs_idmap_id for an exmaple of enqueueing */ ++static ssize_t ++spnfs_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, ++ char __user *dst, size_t buflen) ++{ ++ char *data = (char *)msg->data + msg->copied; ++ ssize_t mlen = msg->len - msg->copied; ++ ssize_t left; ++ ++ if (mlen > buflen) ++ mlen = buflen; ++ ++ left = copy_to_user(dst, data, mlen); ++ if (left < 0) { ++ msg->errno = left; ++ return left; ++ } ++ mlen -= left; ++ msg->copied += mlen; ++ msg->errno = 0; ++ return mlen; 
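++	/* NB: copy_to_user() returns the number of bytes it could not
++	 * copy (never negative), so the (left < 0) branch above never
++	 * fires; a short copy simply shrinks mlen and msg->copied.
++	 */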
++}
++
++static ssize_t
++spnfs_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
++{
++	struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode);
++	struct spnfs *spnfs = (struct spnfs *)rpci->private;
++	struct spnfs_msg *im_in = NULL, *im = &spnfs->spnfs_im;
++	int ret;
++
++	if (mlen != sizeof(struct spnfs_msg))
++		return -ENOSPC;
++
++	im_in = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL);
++	if (im_in == NULL)
++		return -ENOMEM;
++
++	if (copy_from_user(im_in, src, mlen) != 0) {
++		kfree(im_in);	/* don't leak the message on a bad copy */
++		return -EFAULT;
++	}
++
++	mutex_lock(&spnfs->spnfs_plock);
++
++	ret = mlen;
++	im->im_status = im_in->im_status;
++	/* If we got an error, terminate now, and wake up pending upcalls */
++	if (!(im_in->im_status & SPNFS_STATUS_SUCCESS)) {
++		wake_up(&spnfs->spnfs_wq);
++		goto out;
++	}
++
++	ret = -EINVAL;
++	/* Did we match the current upcall? */
++	/* DMXXX: do not understand the comment above, from original code */
++	/* DMXXX: when do we _not_ match the current upcall? */
++	/* DMXXX: anyway, let's do a simplistic check */
++	if (im_in->im_type == im->im_type) {
++		/* copy the response into the spnfs struct */
++		memcpy(&im->im_res, &im_in->im_res, sizeof(im->im_res));
++		ret = mlen;
++	} else
++		dprintk("spnfs: downcall type != upcall type\n");
++
++	wake_up(&spnfs->spnfs_wq);
++/* DMXXX handle rval processing */
++out:
++	mutex_unlock(&spnfs->spnfs_plock);
++	kfree(im_in);
++	return ret;
++}
++
++static void
++spnfs_pipe_destroy_msg(struct rpc_pipe_msg *msg)
++{
++	struct spnfs_msg *im = msg->data;
++	struct spnfs *spnfs = container_of(im, struct spnfs, spnfs_im);
++
++	if (msg->errno >= 0)
++		return;
++	mutex_lock(&spnfs->spnfs_plock);
++	im->im_status = SPNFS_STATUS_FAIL;	/* DMXXX */
++	wake_up(&spnfs->spnfs_wq);
++	mutex_unlock(&spnfs->spnfs_plock);
++}
++
++/* generic upcall. called by functions in spnfs_ops.c */
++int
++spnfs_upcall(struct spnfs *spnfs, struct spnfs_msg *upmsg,
++		union spnfs_msg_res *res)
++{
++	struct rpc_pipe_msg msg;
++	struct spnfs_msg *im;
++	DECLARE_WAITQUEUE(wq, current);
++	int ret = -EIO;
++	int rval;
++
++	im = &spnfs->spnfs_im;
++
++	mutex_lock(&spnfs->spnfs_lock);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	memset(im, 0, sizeof(*im));
++	memcpy(im, upmsg, sizeof(*upmsg));
++
++	memset(&msg, 0, sizeof(msg));
++	msg.data = im;
++	msg.len = sizeof(*im);
++
++	add_wait_queue(&spnfs->spnfs_wq, &wq);
++	rval = rpc_queue_upcall(spnfs->spnfs_dentry->d_inode, &msg);
++	if (rval < 0) {
++		remove_wait_queue(&spnfs->spnfs_wq, &wq);
++		goto out;
++	}
++
++	set_current_state(TASK_UNINTERRUPTIBLE);
++	mutex_unlock(&spnfs->spnfs_plock);
++	schedule();
++	current->state = TASK_RUNNING;
++	remove_wait_queue(&spnfs->spnfs_wq, &wq);
++	mutex_lock(&spnfs->spnfs_plock);
++
++	if (im->im_status & SPNFS_STATUS_SUCCESS) {
++		/* copy our result from the upcall */
++		memcpy(res, &im->im_res, sizeof(*res));
++		ret = 0;
++	}
++
++out:
++	memset(im, 0, sizeof(*im));
++	mutex_unlock(&spnfs->spnfs_plock);
++	mutex_unlock(&spnfs->spnfs_lock);
++	return ret;
++}
++
++/*
++ * This is used to determine if the spnfsd daemon has been started at
++ * least once since the system came up. This is used by the export
++ * mechanism to decide if spnfs is in use.
++ *
++ * Returns non-zero if the spnfsd has initialized the communication pipe
++ * at least once.
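++ * The flag is deliberately never cleared by nfsd_spnfs_delete(), so
++ * exports keep treating spnfs as enabled across daemon restarts.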
++ */ ++int spnfs_enabled(void) ++{ ++ return spnfs_enabled_at_some_point; ++} ++ ++#ifdef CONFIG_PROC_FS ++ ++/* ++ * procfs virtual files for user/kernel space communication: ++ * ++ * ctl - currently just an on/off switch...can be expanded ++ * getfh - fd to fh conversion ++ * recall - recall a layout from the command line, for example: ++ * echo > /proc/fs/spnfs/recall ++ * config - configuration info, e.g., stripe size, num ds, etc. ++ */ ++ ++/*-------------- start ctl -------------------------*/ ++static ssize_t ctl_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int cmd, rc; ++ ++ if (copy_from_user((int *)&cmd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (cmd) { ++ rc = nfsd_spnfs_new(); ++ if (rc != 0) ++ return rc; ++ } else ++ nfsd_spnfs_delete(); ++ ++ return count; ++} ++ ++static const struct file_operations ctl_ops = { ++ .write = ctl_write, ++}; ++/*-------------- end ctl ---------------------------*/ ++ ++/*-------------- start config -------------------------*/ ++static ssize_t config_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ static struct spnfs_config cfg; ++ ++ if (copy_from_user(&cfg, buf, count)) ++ return -EFAULT; ++ ++ spnfs_config = &cfg; ++ return 0; ++} ++ ++static const struct file_operations config_ops = { ++ .write = config_write, ++}; ++/*-------------- end config ---------------------------*/ ++ ++/*-------------- start getfh -----------------------*/ ++static int getfh_open(struct inode *inode, struct file *file) ++{ ++ file->private_data = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); ++ if (file->private_data == NULL) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static ssize_t getfh_read(struct file *file, char __user *buf, size_t count, ++ loff_t *offset) ++{ ++ if (copy_to_user(buf, file->private_data, sizeof(struct nfs_fh))) ++ return -EFAULT; ++ ++ return count; ++} ++ ++static ssize_t getfh_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ int fd; ++ ++ if (copy_from_user((int *)&fd, (int *)buf, sizeof(int))) ++ return -EFAULT; ++ if (spnfs_getfh(fd, file->private_data) != 0) ++ return -EIO; ++ ++ return count; ++} ++ ++static int getfh_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static const struct file_operations getfh_ops = { ++ .open = getfh_open, ++ .read = getfh_read, ++ .write = getfh_write, ++ .release = getfh_release, ++}; ++/*-------------- end getfh ------------------------*/ ++ ++ ++/*-------------- start recall layout --------------*/ ++static ssize_t recall_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char input[128]; ++ char *path, *str, *p; ++ int rc; ++ u64 off = 0, len = 0; ++ ++ if (count > 128) ++ return -EINVAL; ++ ++ if (copy_from_user(input, buf, count)) ++ return -EFAULT; ++ ++ /* assumes newline-terminated path */ ++ p = memchr(input, '\n', count); ++ if (p == NULL) ++ return -EINVAL; ++ *p = '\0'; ++ ++ /* ++ * Scan for path and, optionally, an offset and length ++ * of a layout segment to be recalled; if there are two ++ * fields, they're assumed to be path and offset. 
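++	 * E.g. (hypothetical path) "echo '/mnt/test/foo 0 4096' >
++	 * /proc/fs/spnfs/recall" recalls the first 4096 bytes of that
++	 * file's layout; a missing or zero length means the whole
++	 * file, and the literal path "all" recalls every layout.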
++ */ ++ p = input; ++ path = strsep(&p, " "); ++ if (path == NULL) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &off); ++ if (rc != 0) ++ return -EINVAL; ++ ++ str = strsep(&p, " "); ++ if (str != NULL) { ++ rc = strict_strtoull(str, 10, &len); ++ if (rc != 0) ++ return -EINVAL; ++ } ++ } ++ ++ rc = spnfs_test_layoutrecall(path, off, len); ++ if (rc != 0) ++ return rc; ++ ++ return count; ++} ++ ++static const struct file_operations recall_ops = { ++ .write = recall_write, ++}; ++/*-------------- end recall layout --------------*/ ++ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++/*-------------- start layoutseg -------------------------*/ ++static ssize_t layoutseg_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[3]; ++ ++ if (copy_from_user(cmd, buf, 1)) ++ return -EFAULT; ++ if (cmd[0] == '0') ++ spnfs_use_layoutsegments = 0; ++ else ++ spnfs_use_layoutsegments = 1; ++ ++ return count; ++} ++ ++static const struct file_operations layoutseg_ops = { ++ .write = layoutseg_write, ++}; ++/*-------------- end layoutseg ---------------------------*/ ++ ++/*-------------- start layoutsegsize -------------------------*/ ++static ssize_t layoutsegsize_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *offset) ++{ ++ char cmd[50]; ++ ++ if (copy_from_user(cmd, buf, 49)) ++ return -EFAULT; ++ layoutsegment_size = simple_strtoull(cmd, NULL, 10); ++ ++ return count; ++} ++ ++static const struct file_operations layoutsegsize_ops = { ++ .write = layoutsegsize_write, ++}; ++/*-------------- end layoutsegsize ---------------------------*/ ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++int ++spnfs_init_proc(void) ++{ ++ struct proc_dir_entry *entry; ++ ++ entry = proc_mkdir("fs/spnfs", NULL); ++ if (!entry) ++ return -ENOMEM; ++ ++ entry = create_proc_entry("fs/spnfs/ctl", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &ctl_ops; ++ ++ entry = create_proc_entry("fs/spnfs/config", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &config_ops; ++ ++ entry = create_proc_entry("fs/spnfs/getfh", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &getfh_ops; ++ ++ entry = create_proc_entry("fs/spnfs/recall", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &recall_ops; ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++ entry = create_proc_entry("fs/spnfs/layoutseg", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutseg_ops; ++ ++ entry = create_proc_entry("fs/spnfs/layoutsegsize", 0, NULL); ++ if (!entry) ++ return -ENOMEM; ++ entry->proc_fops = &layoutsegsize_ops; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ return 0; ++} ++#endif /* CONFIG_PROC_FS */ +diff -up linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c.orig linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c +--- linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c.orig 2010-08-31 21:11:40.967160411 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/spnfs_ops.c 2010-08-31 21:11:40.967160411 -0400 +@@ -0,0 +1,878 @@ ++/* ++ * fs/nfsd/spnfs_ops.c ++ * ++ * Communcation layer between spNFS kernel and userspace ++ * ++ */ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "pnfsd.h" ++ ++/* comment out CONFIG_SPNFS_TEST for non-test behaviour */ ++/* #define CONFIG_SPNFS_TEST 1 */ ++ ++#define NFSDDBG_FACILITY NFSDDBG_PNFS ++ ++/* ++ * The functions that are called from elsewhere in the kernel ++ * to perform tasks in userspace ++ * ++ */ ++ ++#ifdef CONFIG_SPNFS_LAYOUTSEGMENTS ++extern int spnfs_use_layoutsegments; ++extern uint64_t layoutsegment_size; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++extern struct spnfs *global_spnfs; ++ ++int ++spnfs_layout_type(struct super_block *sb) ++{ ++ return LAYOUT_NFSV4_1_FILES; ++} ++ ++enum nfsstat4 ++spnfs_layoutget(struct inode *inode, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *lg_arg, ++ struct nfsd4_pnfs_layoutget_res *lg_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct pnfs_filelayout_layout *flp = NULL; ++ int status, i; ++ enum nfsstat4 nfserr; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ ++ im->im_type = SPNFS_TYPE_LAYOUTGET; ++ im->im_args.layoutget_args.inode = inode->i_ino; ++ im->im_args.layoutget_args.generation = inode->i_generation; ++ ++ /* call function to queue the msg for upcall */ ++ if (spnfs_upcall(spnfs, im, res) != 0) { ++ dprintk("failed spnfs upcall: layoutget\n"); ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ goto layoutget_cleanup; ++ } ++ status = res->layoutget_res.status; ++ if (status != 0) { ++ /* FIXME? until user mode is fixed, translate system error */ ++ switch (status) { ++ case -E2BIG: ++ case -ETOOSMALL: ++ nfserr = NFS4ERR_TOOSMALL; ++ break; ++ case -ENOMEM: ++ case -EAGAIN: ++ case -EINTR: ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ break; ++ case -ENOENT: ++ nfserr = NFS4ERR_BADLAYOUT; ++ break; ++ default: ++ nfserr = NFS4ERR_LAYOUTUNAVAILABLE; ++ } ++ dprintk("spnfs layout_get upcall: status=%d nfserr=%u\n", ++ status, nfserr); ++ goto layoutget_cleanup; ++ } ++ ++ lg_res->lg_return_on_close = 0; ++#if defined(CONFIG_SPNFS_LAYOUTSEGMENTS) ++ /* if spnfs_use_layoutsegments & layoutsegment_size == 0, use */ ++ /* the amount requested by the client. 
*/ ++ if (spnfs_use_layoutsegments) { ++ if (layoutsegment_size != 0) ++ lg_res->lg_seg.length = layoutsegment_size; ++ } else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#else ++ lg_res->lg_seg.length = NFS4_MAX_UINT64; ++#endif /* CONFIG_SPNFS_LAYOUTSEGMENTS */ ++ ++ flp = kmalloc(sizeof(struct pnfs_filelayout_layout), GFP_KERNEL); ++ if (flp == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ flp->device_id.sbid = lg_arg->lg_sbid; ++ flp->device_id.devid = res->layoutget_res.devid; ++ flp->lg_layout_type = 1; /* XXX */ ++ flp->lg_stripe_type = res->layoutget_res.stripe_type; ++ flp->lg_commit_through_mds = 0; ++ flp->lg_stripe_unit = res->layoutget_res.stripe_size; ++ flp->lg_first_stripe_index = 0; ++ flp->lg_pattern_offset = 0; ++ flp->lg_fh_length = res->layoutget_res.stripe_count; ++ ++ flp->lg_fh_list = kmalloc(flp->lg_fh_length * sizeof(struct knfsd_fh), ++ GFP_KERNEL); ++ if (flp->lg_fh_list == NULL) { ++ nfserr = NFS4ERR_LAYOUTTRYLATER; ++ goto layoutget_cleanup; ++ } ++ /* ++ * FIX: Doing an extra copy here. Should group res.flist's fh_len ++ * and fh_val into a knfsd_fh structure. ++ */ ++ for (i = 0; i < flp->lg_fh_length; i++) { ++ flp->lg_fh_list[i].fh_size = res->layoutget_res.flist[i].fh_len; ++ memcpy(&flp->lg_fh_list[i].fh_base, ++ res->layoutget_res.flist[i].fh_val, ++ res->layoutget_res.flist[i].fh_len); ++ } ++ ++ /* encode the layoutget body */ ++ nfserr = filelayout_encode_layout(xdr, flp); ++ ++layoutget_cleanup: ++ if (flp) { ++ if (flp->lg_fh_list) ++ kfree(flp->lg_fh_list); ++ kfree(flp); ++ } ++ kfree(im); ++ kfree(res); ++ ++ return nfserr; ++} ++ ++int ++spnfs_layoutcommit(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutreturn(struct inode *inode, ++ const struct nfsd4_pnfs_layoutreturn_arg *args) ++{ ++ return 0; ++} ++ ++int ++spnfs_layoutrecall(struct inode *inode, int type, u64 offset, u64 len) ++{ ++ struct super_block *sb; ++ struct nfsd4_pnfs_cb_layout lr; ++ ++ switch (type) { ++ case RETURN_FILE: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for ino = %lu\n", ++ __func__, inode->i_ino); ++ break; ++ case RETURN_FSID: ++ sb = inode->i_sb; ++ dprintk("%s: recalling layout for fsid x (unimplemented)\n", ++ __func__); ++ return 0; ++ case RETURN_ALL: ++ /* XXX figure out how to get a sb since there's no inode ptr */ ++ dprintk("%s: recalling all layouts (unimplemented)\n", ++ __func__); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ ++ lr.cbl_recall_type = type; ++ lr.cbl_seg.layout_type = LAYOUT_NFSV4_1_FILES; ++ lr.cbl_seg.clientid = 0; ++ lr.cbl_seg.offset = offset; ++ lr.cbl_seg.length = len; ++ lr.cbl_seg.iomode = IOMODE_ANY; ++ lr.cbl_layoutchanged = 0; ++ ++ nfsd_layout_recall_cb(sb, inode, &lr); ++ ++ return 0; ++} ++ ++ ++int ++spnfs_test_layoutrecall(char *path, u64 offset, u64 len) ++{ ++ struct nameidata nd; ++ struct inode *inode; ++ int type, rc; ++ ++ dprintk("%s: path=%s, offset=%llu, len=%llu\n", ++ __func__, path, offset, len); ++ ++ if (strcmp(path, "all") == 0) { ++ inode = NULL; ++ type = RETURN_ALL; ++ } else { ++ rc = path_lookup(path, 0, &nd); ++ if (rc != 0) ++ return -ENOENT; ++ ++ /* ++ * XXX todo: add a RETURN_FSID scenario here...maybe if ++ * inode is a dir... 
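
Stepping back to spnfs_layoutget() above: the status switch translates
errno values returned by the spnfsd upcall into NFSv4 layout errors. As a
standalone helper the mapping reads (an illustrative refactoring, not part
of this patch):

    static enum nfsstat4 spnfs_lg_status_to_nfserr(int status)
    {
            switch (status) {
            case -E2BIG:
            case -ETOOSMALL:
                    return NFS4ERR_TOOSMALL;
            case -ENOMEM:
            case -EAGAIN:
            case -EINTR:
                    return NFS4ERR_LAYOUTTRYLATER;  /* transient, retry */
            case -ENOENT:
                    return NFS4ERR_BADLAYOUT;
            default:
                    return NFS4ERR_LAYOUTUNAVAILABLE;
            }
    }
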
++ */ ++ ++ inode = nd.path.dentry->d_inode; ++ type = RETURN_FILE; ++ } ++ ++ if (len == 0) ++ len = NFS4_MAX_UINT64; ++ ++ rc = spnfs_layoutrecall(inode, type, offset, len); ++ ++ if (type != RETURN_ALL) ++ path_put(&nd.path); ++ return rc; ++} ++ ++int ++spnfs_getdeviceiter(struct super_block *sb, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *gd_res) ++{ ++ struct spnfs *spnfs = global_spnfs; /* XXX keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceiter_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEITER; ++ im->im_args.getdeviceiter_args.cookie = gd_res->gd_cookie; ++ im->im_args.getdeviceiter_args.verf = gd_res->gd_verf; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceiter_out; ++ } ++ status = res->getdeviceiter_res.status; ++ ++ if (res->getdeviceiter_res.eof) ++ gd_res->gd_eof = 1; ++ else { ++ gd_res->gd_devid = res->getdeviceiter_res.devid; ++ gd_res->gd_cookie = res->getdeviceiter_res.cookie; ++ gd_res->gd_verf = res->getdeviceiter_res.verf; ++ gd_res->gd_eof = 0; ++ } ++ ++getdeviceiter_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++#ifdef CONFIG_SPNFS_TEST ++/* ++ * Setup the rq_res xdr_buf. The svc_rqst rq_respages[1] page contains the ++ * 1024 encoded stripe indices. ++ * ++ * Skip the devaddr4 length and encode the indicies count (1024) in the ++ * rq_res.head and set the rq_res.head length. ++ * ++ * Set the rq_res page_len to 4096 (for the 1024 stripe indices). ++ * Set the rq_res xdr_buf tail base to rq_respages[0] just after the ++ * rq_res head to hold the rest of the getdeviceinfo return. ++ * ++ * So rq_respages[rq_resused - 1] contains the rq_res.head and rq_res.tail and ++ * rq_respages[rq_resused] contains the rq_res.pages. ++ */ ++static int spnfs_test_indices_xdr(struct pnfs_xdr_info *info, ++ const struct pnfs_filelayout_device *fdev) ++{ ++ struct nfsd4_compoundres *resp = info->resp; ++ struct svc_rqst *rqstp = resp->rqstp; ++ struct xdr_buf *xb = &resp->rqstp->rq_res; ++ __be32 *p; ++ ++ p = nfsd4_xdr_reserve_space(resp, 8); ++ p++; /* Fill in length later */ ++ *p++ = cpu_to_be32(fdev->fl_stripeindices_length); /* 1024 */ ++ resp->p = p; ++ ++ xb->head[0].iov_len = (char *)resp->p - (char *)xb->head[0].iov_base; ++ xb->pages = &rqstp->rq_respages[rqstp->rq_resused]; ++ xb->page_base = 0; ++ xb->page_len = PAGE_SIZE; /* page of 1024 encoded indices */ ++ xb->tail[0].iov_base = resp->p; ++ resp->end = xb->head[0].iov_base + PAGE_SIZE; ++ xb->tail[0].iov_len = (char *)resp->end - (char *)resp->p; ++ return 0; ++} ++/* ++ * Return a stripeindices of length 1024 to test ++ * the pNFS client multipage getdeviceinfo implementation. ++ * ++ * Encode a page of stripe indices. 
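
spnfs_getdeviceiter() above implements one step of device iteration: the
cookie/verifier pair is handed back by the server and fed into the next
call until gd_eof is set. A caller loops roughly like this (sketch, error
handling condensed):

    static int spnfs_walk_devices(struct super_block *sb)
    {
            struct nfsd4_pnfs_dev_iter_res res = { .gd_cookie = 0 };
            int status;

            do {
                    status = spnfs_getdeviceiter(sb, LAYOUT_NFSV4_1_FILES,
                                                 &res);
                    if (status != 0)
                            return status;
                    if (!res.gd_eof)
                            dprintk("devid %llu\n",
                                    (unsigned long long)res.gd_devid);
            } while (!res.gd_eof);
            return 0;
    }
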
++ */ ++static void spnfs_set_test_indices(struct pnfs_filelayout_device *fldev, ++ struct spnfs_device *dev, ++ struct pnfs_devinfo_arg *info) ++{ ++ struct svc_rqst *rqstp = info->xdr.resp->rqstp; ++ __be32 *p; ++ int i, j = 0; ++ ++ p = (__be32 *)page_address(rqstp->rq_respages[rqstp->rq_resused]); ++ fldev->fl_stripeindices_length = 1024; ++ /* round-robin the data servers device index into the stripe indicie */ ++ for (i = 0; i < 1024; i++) { ++ *p++ = cpu_to_be32(j); ++ if (j < dev->dscount - 1) ++ j++; ++ else ++ j = 0; ++ } ++ fldev->fl_stripeindices_list = NULL; ++} ++#endif /* CONFIG_SPNFS_TEST */ ++ ++int ++spnfs_getdeviceinfo(struct super_block *sb, struct exp_xdr_stream *xdr, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *devid) ++{ ++ struct spnfs *spnfs = global_spnfs; ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ struct spnfs_device *dev; ++ struct pnfs_filelayout_device *fldev = NULL; ++ struct pnfs_filelayout_multipath *mp = NULL; ++ struct pnfs_filelayout_devaddr *fldap = NULL; ++ int status = 0, i, len; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_GETDEVICEINFO; ++ /* XXX FIX: figure out what to do about fsid */ ++ im->im_args.getdeviceinfo_args.devid = devid->devid; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto getdeviceinfo_out; ++ } ++ status = res->getdeviceinfo_res.status; ++ if (status != 0) ++ goto getdeviceinfo_out; ++ ++ dev = &res->getdeviceinfo_res.devinfo; ++ ++ /* Fill in the device data, i.e., nfs4_1_file_layout_ds_addr4 */ ++ fldev = kzalloc(sizeof(struct pnfs_filelayout_device), GFP_KERNEL); ++ if (fldev == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ ++ /* ++ * Stripe count is the same as data server count for our purposes ++ */ ++ fldev->fl_stripeindices_length = dev->dscount; ++ fldev->fl_device_length = dev->dscount; ++ ++ /* Set stripe indices */ ++#ifdef CONFIG_SPNFS_TEST ++ spnfs_set_test_indices(fldev, dev, info); ++ fldev->fl_enc_stripe_indices = spnfs_test_indices_xdr; ++#else /* CONFIG_SPNFS_TEST */ ++ fldev->fl_stripeindices_list = ++ kmalloc(fldev->fl_stripeindices_length * sizeof(u32), ++ GFP_KERNEL); ++ if (fldev->fl_stripeindices_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_stripeindices_length; i++) ++ fldev->fl_stripeindices_list[i] = i; ++#endif /* CONFIG_SPNFS_TEST */ ++ ++ /* ++ * Set the device's data server addresses No multipath for spnfs, ++ * so mp length is always 1. 
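
The round-robin fill in spnfs_set_test_indices() above spreads the 1024
stripe slots over dev->dscount data servers; the j bookkeeping is
equivalent to a plain modulo:

    for (i = 0; i < 1024; i++)
            *p++ = cpu_to_be32(i % dev->dscount);
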
++ * ++ */ ++ fldev->fl_device_list = ++ kmalloc(fldev->fl_device_length * ++ sizeof(struct pnfs_filelayout_multipath), ++ GFP_KERNEL); ++ if (fldev->fl_device_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ mp = &fldev->fl_device_list[i]; ++ mp->fl_multipath_length = 1; ++ mp->fl_multipath_list = ++ kmalloc(sizeof(struct pnfs_filelayout_devaddr), ++ GFP_KERNEL); ++ if (mp->fl_multipath_list == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ fldap = mp->fl_multipath_list; ++ ++ /* ++ * Copy the netid into the device address, for example: "tcp" ++ */ ++ len = strlen(dev->dslist[i].netid); ++ fldap->r_netid.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_netid.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_netid.data, dev->dslist[i].netid, len); ++ fldap->r_netid.len = len; ++ ++ /* ++ * Copy the network address into the device address, ++ * for example: "10.35.9.16.08.01" ++ */ ++ len = strlen(dev->dslist[i].addr); ++ fldap->r_addr.data = kmalloc(len, GFP_KERNEL); ++ if (fldap->r_addr.data == NULL) { ++ status = -ENOMEM; ++ goto getdeviceinfo_out; ++ } ++ memcpy(fldap->r_addr.data, dev->dslist[i].addr, len); ++ fldap->r_addr.len = len; ++ } ++ ++ /* encode the device data */ ++ status = filelayout_encode_devinfo(xdr, fldev); ++ ++getdeviceinfo_out: ++ if (fldev) { ++ kfree(fldev->fl_stripeindices_list); ++ if (fldev->fl_device_list) { ++ for (i = 0; i < fldev->fl_device_length; i++) { ++ fldap = ++ fldev->fl_device_list[i].fl_multipath_list; ++ kfree(fldap->r_netid.data); ++ kfree(fldap->r_addr.data); ++ kfree(fldap); ++ } ++ kfree(fldev->fl_device_list); ++ } ++ kfree(fldev); ++ } ++ ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_setattr(void) ++{ ++ return 0; ++} ++ ++int ++spnfs_open(struct inode *inode, struct nfsd4_open *open) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto open_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_OPEN; ++ im->im_args.open_args.inode = inode->i_ino; ++ im->im_args.open_args.generation = inode->i_generation; ++ im->im_args.open_args.create = open->op_create; ++ im->im_args.open_args.createmode = open->op_createmode; ++ im->im_args.open_args.truncate = open->op_truncate; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto open_out; ++ } ++ status = res->open_res.status; ++ ++open_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++int ++spnfs_create(void) ++{ ++ return 0; ++} ++ ++/* ++ * Invokes the spnfsd with the inode number of the object to remove. ++ * The file has already been removed on the MDS, so all the spnsfd ++ * daemon does is remove the stripes. 
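
The r_addr copied in spnfs_getdeviceinfo() above is an NFSv4 universal
address: the IPv4 dotted quad followed by the port's high and low bytes in
decimal, as in the "10.35.9.16.08.01" example. A sketch of building one
(helper name illustrative, not part of this patch):

    /* e.g. ip 10.35.9.16, port 2049 (0x0801) -> "10.35.9.16.8.1" */
    static void spnfs_format_uaddr(char *buf, size_t buflen,
                                   __be32 ip, __be16 port)
    {
            u32 a = ntohl(ip);
            u16 p = ntohs(port);

            snprintf(buf, buflen, "%u.%u.%u.%u.%u.%u",
                     (a >> 24) & 0xff, (a >> 16) & 0xff,
                     (a >> 8) & 0xff, a & 0xff,
                     p >> 8, p & 0xff);
    }
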
++ * Returns 0 on success otherwise error code ++ */ ++int ++spnfs_remove(unsigned long ino, unsigned long generation) ++{ ++ struct spnfs *spnfs = global_spnfs; /* keep up the pretence */ ++ struct spnfs_msg *im = NULL; ++ union spnfs_msg_res *res = NULL; ++ int status = 0; ++ ++ im = kmalloc(sizeof(struct spnfs_msg), GFP_KERNEL); ++ if (im == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ res = kmalloc(sizeof(union spnfs_msg_res), GFP_KERNEL); ++ if (res == NULL) { ++ status = -ENOMEM; ++ goto remove_out; ++ } ++ ++ im->im_type = SPNFS_TYPE_REMOVE; ++ im->im_args.remove_args.inode = ino; ++ im->im_args.remove_args.generation = generation; ++ ++ /* call function to queue the msg for upcall */ ++ status = spnfs_upcall(spnfs, im, res); ++ if (status != 0) { ++ dprintk("%s spnfs upcall failure: %d\n", __func__, status); ++ status = -EIO; ++ goto remove_out; ++ } ++ status = res->remove_res.status; ++ ++remove_out: ++ kfree(im); ++ kfree(res); ++ ++ return status; ++} ++ ++static int ++read_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_read(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ if (err == 0) ++ break; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
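
Distilling the do_div() arithmetic in read_one() above (write_one() below
uses the same mapping), each loop iteration covers at most the remainder of
the current stripe unit. Given the file offset, stripe_size and num_ds:

    u64 snum  = offset / stripe_size;     /* global stripe number     */
    u32 soff  = offset % stripe_size;     /* offset within the stripe */
    u32 ds    = snum % num_ds;            /* data server index        */
    /* sparse striping: DS file offset == file offset                 */
    /* dense striping:  (snum / num_ds) * stripe_size + soff          */
    size_t iolen = min_t(size_t, len, stripe_size - soff);
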
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDONLY | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0 ; vnum < vlen ; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = read_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err < 0) { ++ status = nfserr_io; ++ goto read_out; ++ } ++ if (err < iolen) { ++ bytecount += err; ++ goto read_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++read_out: ++ *lenp = bytecount; ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ return status; ++} ++ ++__be32 ++spnfs_read(struct inode *inode, loff_t offset, unsigned long *lenp, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return read(inode, offset, lenp, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++static int ++write_one(struct inode *inode, loff_t offset, size_t len, char *buf, ++ struct file **filp) ++{ ++ loff_t bufoffset = 0, soffset, pos, snum, soff, tmp; ++ size_t iolen; ++ int completed = 0, ds, err; ++ ++ while (len > 0) { ++ tmp = offset; ++ soff = do_div(tmp, spnfs_config->stripe_size); ++ snum = tmp; ++ ds = do_div(tmp, spnfs_config->num_ds); ++ if (spnfs_config->dense_striping == 0) ++ soffset = offset; ++ else { ++ tmp = snum; ++ do_div(tmp, spnfs_config->num_ds); ++ soffset = tmp * spnfs_config->stripe_size + soff; ++ } ++ if (len < spnfs_config->stripe_size - soff) ++ iolen = len; ++ else ++ iolen = spnfs_config->stripe_size - soff; ++ ++ pos = soffset; ++ err = vfs_write(filp[ds], buf + bufoffset, iolen, &pos); ++ if (err < 0) ++ return -EIO; ++ filp[ds]->f_pos = pos; ++ iolen = err; ++ completed += iolen; ++ len -= iolen; ++ offset += iolen; ++ bufoffset += iolen; ++ } ++ ++ return completed; ++} ++ ++static __be32 ++write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ int i, vnum, err, bytecount = 0; ++ char path[128]; ++ struct file *filp[SPNFS_MAX_DATA_SERVERS]; ++ size_t iolen; ++ __be32 status = nfs_ok; ++ ++ /* ++ * XXX We should just be doing this at open time, but it gets ++ * kind of messy storing this info in nfsd's state structures ++ * and piggybacking its path through the various state handling ++ * functions. Revisit this. 
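
Both I/O paths above open one stripe file per data server, named
<ds_dir>/<inode>.<generation> on the corresponding DS mount; note that
filp_open() reports failure as an ERR_PTR() value, so IS_ERR() is the
usual check. With snprintf() guarding the fixed 128-byte buffer (the patch
itself uses sprintf), the path construction is:

    char path[128];

    /* e.g. "<ds_dir>/1234.5678" for inode 1234, generation 5678 */
    snprintf(path, sizeof(path), "%s/%lu.%u",
             spnfs_config->ds_dir[i], inode->i_ino, inode->i_generation);
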
++ */ ++ memset(filp, 0, SPNFS_MAX_DATA_SERVERS * sizeof(struct file *)); ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ sprintf(path, "%s/%ld.%u", spnfs_config->ds_dir[i], ++ inode->i_ino, inode->i_generation); ++ filp[i] = filp_open(path, O_RDWR | O_LARGEFILE, 0); ++ if (filp[i] == NULL) { ++ status = nfserr_io; ++ goto write_out; ++ } ++ get_file(filp[i]); ++ } ++ ++ for (vnum = 0; vnum < vlen; vnum++) { ++ iolen = rqstp->rq_vec[vnum].iov_len; ++ err = write_one(inode, offset + bytecount, iolen, ++ (char *)rqstp->rq_vec[vnum].iov_base, filp); ++ if (err != iolen) { ++ dprintk("spnfs_write: err=%d expected %Zd\n", err, len); ++ status = nfserr_io; ++ goto write_out; ++ } ++ bytecount += rqstp->rq_vec[vnum].iov_len; ++ } ++ ++write_out: ++ for (i = 0; i < spnfs_config->num_ds; i++) { ++ if (filp[i]) { ++ filp_close(filp[i], current->files); ++ fput(filp[i]); ++ } ++ } ++ ++ return status; ++} ++ ++__be32 ++spnfs_write(struct inode *inode, loff_t offset, size_t len, int vlen, ++ struct svc_rqst *rqstp) ++{ ++ if (spnfs_config) ++ return write(inode, offset, len, vlen, rqstp); ++ else { ++ printk(KERN_ERR "Please upgrade to latest spnfsd\n"); ++ return nfserr_notsupp; ++ } ++} ++ ++int ++spnfs_commit(void) ++{ ++ return 0; ++} ++ ++/* ++ * Return the state for this object. ++ * At this time simply return 0 to indicate success and use the existing state ++ */ ++int ++spnfs_get_state(struct inode *inode, struct knfsd_fh *fh, struct pnfs_get_state *arg) ++{ ++ return 0; ++} ++ ++/* ++ * Return the filehandle for the specified file descriptor ++ */ ++int ++spnfs_getfh(int fd, struct nfs_fh *fh) ++{ ++ struct file *file; ++ ++ file = fget(fd); ++ if (file == NULL) ++ return -EIO; ++ ++ memcpy(fh, NFS_FH(file->f_dentry->d_inode), sizeof(struct nfs_fh)); ++ fput(file); ++ return 0; ++} +diff -up linux-2.6.35.noarch/fs/nfsd/state.h.orig linux-2.6.35.noarch/fs/nfsd/state.h +--- linux-2.6.35.noarch/fs/nfsd/state.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/state.h 2010-08-31 21:11:40.968088277 -0400 +@@ -242,6 +242,12 @@ struct nfs4_client { + u32 cl_cb_seq_nr; + struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ + /* wait here for slots */ ++#if defined(CONFIG_PNFSD) ++ struct list_head cl_layouts; /* outstanding layouts */ ++ struct list_head cl_layoutrecalls; /* outstanding layoutrecall ++ callbacks */ ++ atomic_t cl_deviceref; /* Num outstanding devs */ ++#endif /* CONFIG_PNFSD */ + }; + + static inline void +@@ -342,12 +348,31 @@ struct nfs4_file { + struct list_head fi_hash; /* hash by "struct inode *" */ + struct list_head fi_stateids; + struct list_head fi_delegations; ++#if defined(CONFIG_PNFSD) ++ struct list_head fi_layouts; ++ struct list_head fi_layout_states; ++#endif /* CONFIG_PNFSD */ + struct inode *fi_inode; + u32 fi_id; /* used with stateowner->so_id + * for stateid_hashtbl hash */ + bool fi_had_conflict; ++#if defined(CONFIG_PNFSD) ++ /* used by layoutget / layoutrecall */ ++ struct nfs4_fsid fi_fsid; ++ u32 fi_fhlen; ++ u8 fi_fhval[NFS4_FHSIZE]; ++#endif /* CONFIG_PNFSD */ + }; + ++#if defined(CONFIG_PNFSD) ++/* pNFS Metadata server state */ ++ ++struct pnfs_ds_dev_entry { ++ struct list_head dd_dev_entry; /* st_pnfs_ds_id entry */ ++ u32 dd_dsid; ++}; ++#endif /* CONFIG_PNFSD */ ++ + /* + * nfs4_stateid can either be an open stateid or (eventually) a lock stateid + * +@@ -370,6 +395,9 @@ struct nfs4_stateid { + struct list_head st_perfile; + struct list_head st_perstateowner; + struct list_head st_lockowners; ++#if 
defined(CONFIG_PNFSD) ++ struct list_head st_pnfs_ds_id; ++#endif /* CONFIG_PNFSD */ + struct nfs4_stateowner * st_stateowner; + struct nfs4_file * st_file; + stateid_t st_stateid; +@@ -421,6 +449,34 @@ extern void nfsd4_recdir_purge_old(void) + extern int nfsd4_create_clid_dir(struct nfs4_client *clp); + extern void nfsd4_remove_clid_dir(struct nfs4_client *clp); + extern void release_session_client(struct nfsd4_session *); ++extern void nfsd4_free_slab(struct kmem_cache **); ++extern struct nfs4_file *find_file(struct inode *); ++extern struct nfs4_file *find_alloc_file(struct inode *, struct svc_fh *); ++extern void put_nfs4_file(struct nfs4_file *); ++extern void get_nfs4_file(struct nfs4_file *); ++extern struct nfs4_client *find_confirmed_client(clientid_t *); ++extern struct nfs4_stateid *find_stateid(stateid_t *, int flags); ++extern struct nfs4_delegation *find_delegation_stateid(struct inode *, stateid_t *); ++extern __be32 nfs4_check_stateid(stateid_t *); ++extern void expire_client_lock(struct nfs4_client *); ++extern int filter_confirmed_clients(int (* func)(struct nfs4_client *, void *), void *); ++ ++#if defined(CONFIG_PNFSD) ++extern int nfsd4_init_pnfs_slabs(void); ++extern void nfsd4_free_pnfs_slabs(void); ++extern void pnfs_expire_client(struct nfs4_client *); ++extern void release_pnfs_ds_dev_list(struct nfs4_stateid *); ++extern void nfs4_pnfs_state_init(void); ++extern void nfs4_pnfs_state_shutdown(void); ++extern void nfs4_ds_get_verifier(stateid_t *, struct super_block *, u32 *); ++extern int nfs4_preprocess_pnfs_ds_stateid(struct svc_fh *, stateid_t *); ++#else /* CONFIG_PNFSD */ ++static inline void nfsd4_free_pnfs_slabs(void) {} ++static inline int nfsd4_init_pnfs_slabs(void) { return 0; } ++static inline void pnfs_expire_client(struct nfs4_client *clp) {} ++static inline void release_pnfs_ds_dev_list(struct nfs4_stateid *stp) {} ++static inline void nfs4_pnfs_state_shutdown(void) {} ++#endif /* CONFIG_PNFSD */ + + static inline void + nfs4_put_stateowner(struct nfs4_stateowner *so) +@@ -434,4 +490,24 @@ nfs4_get_stateowner(struct nfs4_stateown + kref_get(&so->so_ref); + } + ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ + #endif /* NFSD4_STATE_H */ +diff -up linux-2.6.35.noarch/fs/nfsd/vfs.c.orig linux-2.6.35.noarch/fs/nfsd/vfs.c +--- linux-2.6.35.noarch/fs/nfsd/vfs.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/vfs.c 2010-08-31 21:11:40.969160725 -0400 +@@ -37,7 +37,12 @@ + #ifdef CONFIG_NFSD_V4 + #include + #include ++#include ++#include + #endif /* CONFIG_NFSD_V4 */ ++#if defined(CONFIG_SPNFS_BLOCK) ++#include ++#endif + + #include "nfsd.h" + #include "vfs.h" +@@ -383,6 +388,12 @@ nfsd_setattr(struct svc_rqst *rqstp, str + NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE); + if (err) + goto out; ++#if defined(CONFIG_SPNFS_BLOCK) ++ if (pnfs_block_enabled(inode, 0)) { ++ err = bl_layoutrecall(inode, RETURN_FILE, ++ iap->ia_size, inode->i_size - iap->ia_size); ++ } ++#endif /* CONFIG_SPNFS_BLOCK */ + } + + /* +@@ -1702,6 +1713,11 @@ nfsd_rename(struct svc_rqst *rqstp, stru + struct inode *fdir, *tdir; + __be32 err; + int host_err; ++#ifdef CONFIG_SPNFS ++ unsigned long ino = 0; ++ unsigned long generation = 0; ++ unsigned int nlink = 0; ++#endif /* CONFIG_SPNFS */ + + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); + if (err) +@@ -1765,7 +1781,26 @@ nfsd_rename(struct svc_rqst *rqstp, stru + if (host_err) + goto out_dput_new; + ++#ifdef CONFIG_SPNFS ++ /* ++ * if the target is a preexisting regular file, remember the ++ * inode number and generation so we can delete the stripes; ++ * save the link count as well so that the stripes only get ++ * get deleted when the last link is deleted ++ */ ++ if (ndentry && ndentry->d_inode && S_ISREG(ndentry->d_inode->i_mode)) { ++ ino = ndentry->d_inode->i_ino; ++ generation = ndentry->d_inode->i_generation; ++ nlink = ndentry->d_inode->i_nlink; ++ } ++#endif /* CONFIG_SPNFS */ ++ + host_err = vfs_rename(fdir, odentry, tdir, ndentry); ++#ifdef CONFIG_SPNFS ++ if (spnfs_enabled() && (!host_err && ino && nlink == 1)) ++ spnfs_remove(ino, generation); ++#endif /* CONFIG_SPNFS */ ++ + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) +@@ -1806,6 +1841,11 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + struct inode *dirp; + __be32 err; + int host_err; ++#if defined(CONFIG_SPNFS) ++ unsigned long ino; ++ unsigned long generation; ++ unsigned int nlink; ++#endif /* defined(CONFIG_SPNFS) */ + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) +@@ -1829,6 +1869,17 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + goto out; + } + ++#if defined(CONFIG_SPNFS) ++ /* ++ * Remember the inode number to communicate to the spnfsd ++ * for removal of stripes; save the link count as well so that ++ * the stripes only get get deleted when the last link is deleted ++ */ ++ ino = rdentry->d_inode->i_ino; ++ generation = rdentry->d_inode->i_generation; ++ nlink = rdentry->d_inode->i_nlink; ++#endif /* defined(CONFIG_SPNFS) */ ++ + if (!type) + type = rdentry->d_inode->i_mode & S_IFMT; + +@@ -1853,6 +1904,29 @@ nfsd_unlink(struct svc_rqst *rqstp, stru + if (!host_err) + host_err = commit_metadata(fhp); + ++#if defined(CONFIG_SPNFS) ++ /* ++ * spnfs: notify spnfsd of removal to destroy stripes ++ */ ++/* ++ sb = current_fh->fh_dentry->d_inode->i_sb; ++ if (sb->s_export_op->spnfs_remove) { ++*/ ++ dprintk("%s check if spnfs_enabled\n", __FUNCTION__); ++ if (spnfs_enabled() && nlink == 1) { ++ BUG_ON(ino == 0); ++ dprintk("%s calling spnfs_remove inumber=%ld\n", ++ __FUNCTION__, ino); ++ if (spnfs_remove(ino, generation) == 0) { ++ dprintk("%s spnfs_remove success\n", __FUNCTION__); ++ } else { ++ /* XXX How do 
we make this atomic? */ ++ printk(KERN_WARNING "nfsd: pNFS could not " ++ "remove stripes for inode: %ld\n", ino); ++ } ++ } ++#endif /* defined(CONFIG_SPNFS) */ ++ + mnt_drop_write(fhp->fh_export->ex_path.mnt); + out_nfserr: + err = nfserrno(host_err); +diff -up linux-2.6.35.noarch/fs/nfsd/xdr4.h.orig linux-2.6.35.noarch/fs/nfsd/xdr4.h +--- linux-2.6.35.noarch/fs/nfsd/xdr4.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfsd/xdr4.h 2010-08-31 21:11:40.970160774 -0400 +@@ -37,6 +37,8 @@ + #ifndef _LINUX_NFSD_XDR4_H + #define _LINUX_NFSD_XDR4_H + ++#include ++ + #include "state.h" + #include "nfsd.h" + +@@ -385,6 +387,51 @@ struct nfsd4_reclaim_complete { + u32 rca_one_fs; + }; + ++struct nfsd4_pnfs_getdevinfo { ++ struct nfsd4_pnfs_deviceid gd_devid; /* request */ ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxcount; /* request */ ++ u32 gd_notify_types;/* request */ ++ struct super_block *gd_sb; ++}; ++ ++struct nfsd4_pnfs_getdevlist { ++ u32 gd_layout_type; /* request */ ++ u32 gd_maxdevices; /* request */ ++ u64 gd_cookie; /* request - response */ ++ u64 gd_verf; /* request - response */ ++ struct svc_fh *gd_fhp; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutget { ++ u64 lg_minlength; /* request */ ++ u32 lg_signal; /* request */ ++ u32 lg_maxcount; /* request */ ++ struct svc_fh *lg_fhp; /* request */ ++ stateid_t lg_sid; /* request/response */ ++ struct nfsd4_layout_seg lg_seg; /* request/response */ ++ u32 lg_roc; /* response */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit { ++ struct nfsd4_pnfs_layoutcommit_arg args; ++ stateid_t lc_sid; /* request */ ++ struct nfsd4_pnfs_layoutcommit_res res; ++}; ++ ++enum layoutreturn_flags { ++ LR_FLAG_INTERN = 1 << 0, /* internal return */ ++ LR_FLAG_EXPIRE = 1 << 1, /* return on client expiration */ ++}; ++ ++struct nfsd4_pnfs_layoutreturn { ++ struct nfsd4_pnfs_layoutreturn_arg args; ++ u32 lr_flags; ++ stateid_t lr_sid; /* request/resopnse */ ++ u32 lrs_present; /* response */ ++}; ++ + struct nfsd4_op { + int opnum; + __be32 status; +@@ -426,6 +473,13 @@ struct nfsd4_op { + struct nfsd4_destroy_session destroy_session; + struct nfsd4_sequence sequence; + struct nfsd4_reclaim_complete reclaim_complete; ++#if defined(CONFIG_PNFSD) ++ struct nfsd4_pnfs_getdevlist pnfs_getdevlist; ++ struct nfsd4_pnfs_getdevinfo pnfs_getdevinfo; ++ struct nfsd4_pnfs_layoutget pnfs_layoutget; ++ struct nfsd4_pnfs_layoutcommit pnfs_layoutcommit; ++ struct nfsd4_pnfs_layoutreturn pnfs_layoutreturn; ++#endif /* CONFIG_PNFSD */ + } u; + struct nfs4_replay * replay; + }; +diff -up linux-2.6.35.noarch/fs/nfs/file.c.orig linux-2.6.35.noarch/fs/nfs/file.c +--- linux-2.6.35.noarch/fs/nfs/file.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/file.c 2010-08-31 21:11:40.909160446 -0400 +@@ -36,6 +36,7 @@ + #include "internal.h" + #include "iostat.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_FILE + +@@ -389,12 +390,17 @@ static int nfs_write_begin(struct file * + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int once_thru = 0; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, + file->f_path.dentry->d_name.name, + mapping->host->i_ino, len, (long long) pos); + ++ pnfs_update_layout(mapping->host, ++ nfs_file_open_context(file), ++ 0, NFS4_MAX_UINT64, IOMODE_RW, ++ &lseg); + start: + /* + * Prevent starvation issues if someone is doing a 
consistency +@@ -403,17 +409,22 @@ start: + ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (ret) +- return ret; ++ goto out; + + page = grab_cache_page_write_begin(mapping, index, flags); +- if (!page) +- return -ENOMEM; ++ if (!page) { ++ ret = -ENOMEM; ++ goto out; ++ } + *pagep = page; + +- ret = nfs_flush_incompatible(file, page); ++ ret = nfs_flush_incompatible(file, page, lseg); + if (ret) { + unlock_page(page); + page_cache_release(page); ++ *pagep = NULL; ++ *fsdata = NULL; ++ goto out; + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; +@@ -422,6 +433,12 @@ start: + if (!ret) + goto start; + } ++ ret = pnfs_write_begin(file, page, pos, len, lseg, fsdata); ++ out: ++ if (ret) { ++ put_lseg(lseg); ++ *fsdata = NULL; ++ } + return ret; + } + +@@ -431,6 +448,7 @@ static int nfs_write_end(struct file *fi + { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int status; ++ struct pnfs_layout_segment *lseg; + + dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n", + file->f_path.dentry->d_parent->d_name.name, +@@ -457,10 +475,17 @@ static int nfs_write_end(struct file *fi + zero_user_segment(page, pglen, PAGE_CACHE_SIZE); + } + +- status = nfs_updatepage(file, page, offset, copied); ++ lseg = nfs4_pull_lseg_from_fsdata(file, fsdata); ++ status = pnfs_write_end(file, page, pos, len, copied, lseg); ++ if (status) ++ goto out; ++ status = nfs_updatepage(file, page, offset, copied, lseg, fsdata); + ++ out: + unlock_page(page); + page_cache_release(page); ++ pnfs_write_end_cleanup(file, fsdata); ++ put_lseg(lseg); + + if (status < 0) + return status; +@@ -571,6 +596,8 @@ static int nfs_vm_page_mkwrite(struct vm + /* make sure the cache has finished storing the page */ + nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); + ++ /* XXX Do we want to call pnfs_update_layout here? 
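
The write path changes above follow one reference rule: every layout
segment handed out by pnfs_update_layout() is balanced by put_lseg(), on
success and on every error exit. In outline (not a complete function):

    struct pnfs_layout_segment *lseg;

    pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_RW, &lseg);
    /* ... nfs_flush_incompatible(file, page, lseg); ...              */
    /* ... pnfs_write_begin() / pnfs_write_end() / nfs_updatepage()   */
    put_lseg(lseg);             /* likewise in each error unwind      */
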
*/ ++ + lock_page(page); + mapping = page->mapping; + if (mapping != dentry->d_inode->i_mapping) +@@ -581,11 +608,11 @@ static int nfs_vm_page_mkwrite(struct vm + if (pagelen == 0) + goto out_unlock; + +- ret = nfs_flush_incompatible(filp, page); ++ ret = nfs_flush_incompatible(filp, page, NULL); + if (ret != 0) + goto out_unlock; + +- ret = nfs_updatepage(filp, page, 0, pagelen); ++ ret = nfs_updatepage(filp, page, 0, pagelen, NULL, NULL); + out_unlock: + if (!ret) + return VM_FAULT_LOCKED; +diff -up linux-2.6.35.noarch/fs/nfs/inode.c.orig linux-2.6.35.noarch/fs/nfs/inode.c +--- linux-2.6.35.noarch/fs/nfs/inode.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/inode.c 2010-08-31 21:11:40.910160405 -0400 +@@ -48,6 +48,7 @@ + #include "internal.h" + #include "fscache.h" + #include "dns_resolve.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -278,7 +279,7 @@ nfs_fhget(struct super_block *sb, struct + */ + inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; + if (S_ISREG(inode->i_mode)) { +- inode->i_fop = &nfs_file_operations; ++ inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; + inode->i_data.a_ops = &nfs_file_aops; + inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; + } else if (S_ISDIR(inode->i_mode)) { +@@ -530,6 +531,68 @@ out: + return err; + } + ++static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ atomic_set(&l_ctx->count, 1); ++ l_ctx->lockowner = current->files; ++ l_ctx->pid = current->tgid; ++ INIT_LIST_HEAD(&l_ctx->list); ++} ++ ++static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *pos; ++ ++ list_for_each_entry(pos, &ctx->lock_context.list, list) { ++ if (pos->lockowner != current->files) ++ continue; ++ if (pos->pid != current->tgid) ++ continue; ++ atomic_inc(&pos->count); ++ return pos; ++ } ++ return NULL; ++} ++ ++struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) ++{ ++ struct nfs_lock_context *res, *new = NULL; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ spin_unlock(&inode->i_lock); ++ new = kmalloc(sizeof(*new), GFP_KERNEL); ++ if (new == NULL) ++ return NULL; ++ nfs_init_lock_context(new); ++ spin_lock(&inode->i_lock); ++ res = __nfs_find_lock_context(ctx); ++ if (res == NULL) { ++ list_add_tail(&new->list, &ctx->lock_context.list); ++ new->open_context = ctx; ++ res = new; ++ new = NULL; ++ } ++ } ++ spin_unlock(&inode->i_lock); ++ kfree(new); ++ return res; ++} ++ ++void nfs_put_lock_context(struct nfs_lock_context *l_ctx) ++{ ++ struct nfs_open_context *ctx = l_ctx->open_context; ++ struct inode *inode = ctx->path.dentry->d_inode; ++ ++ if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) ++ return; ++ list_del(&l_ctx->list); ++ spin_unlock(&inode->i_lock); ++ kfree(l_ctx); ++} ++ + /** + * nfs_close_context - Common close_context() routine NFSv2/v3 + * @ctx: pointer to context +@@ -566,11 +629,11 @@ static struct nfs_open_context *alloc_nf + path_get(&ctx->path); + ctx->cred = get_rpccred(cred); + ctx->state = NULL; +- ctx->lockowner = current->files; + ctx->flags = 0; + ctx->error = 0; + ctx->dir_cookie = 0; +- atomic_set(&ctx->count, 1); ++ nfs_init_lock_context(&ctx->lock_context); ++ ctx->lock_context.open_context = ctx; + } + return ctx; + } +@@ -578,15 +641,16 @@ static struct nfs_open_context *alloc_nf + struct nfs_open_context *get_nfs_open_context(struct 
nfs_open_context *ctx) + { + if (ctx != NULL) +- atomic_inc(&ctx->count); ++ atomic_inc(&ctx->lock_context.count); + return ctx; + } ++EXPORT_SYMBOL(get_nfs_open_context); + + static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) + { + struct inode *inode = ctx->path.dentry->d_inode; + +- if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) ++ if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + return; + list_del(&ctx->list); + spin_unlock(&inode->i_lock); +@@ -933,6 +997,7 @@ void nfs_fattr_init(struct nfs_fattr *fa + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); + } ++EXPORT_SYMBOL(nfs_fattr_init); + + struct nfs_fattr *nfs_alloc_fattr(void) + { +@@ -1142,6 +1207,14 @@ static int nfs_update_inode(struct inode + server->fsid = fattr->fsid; + + /* ++ * file needs layout commit, server attributes may be stale ++ */ ++ if (layoutcommit_needed(nfsi) && nfsi->change_attr >= fattr->change_attr) { ++ dprintk("NFS: %s: layoutcommit is needed for file %s/%ld\n", ++ __func__, inode->i_sb->s_id, inode->i_ino); ++ return 0; ++ } ++ /* + * Update the read time so we don't revalidate too often. + */ + nfsi->read_cache_jiffies = fattr->time_start; +@@ -1340,9 +1413,10 @@ static int nfs_update_inode(struct inode + */ + void nfs4_clear_inode(struct inode *inode) + { ++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true); ++ + /* If we are holding a delegation, return it! */ + nfs_inode_return_delegation_noreclaim(inode); +- /* First call standard NFS clear_inode() code */ + nfs_clear_inode(inode); + } + #endif +@@ -1367,7 +1441,10 @@ struct inode *nfs_alloc_inode(struct sup + + void nfs_destroy_inode(struct inode *inode) + { +- kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); ++ struct nfs_inode *nfsi = NFS_I(inode); ++ ++ pnfs_destroy_layout(nfsi); ++ kmem_cache_free(nfs_inode_cachep, nfsi); + } + + static inline void nfs4_init_once(struct nfs_inode *nfsi) +@@ -1377,6 +1454,11 @@ static inline void nfs4_init_once(struct + nfsi->delegation = NULL; + nfsi->delegation_state = 0; + init_rwsem(&nfsi->rwsem); ++#ifdef CONFIG_NFS_V4_1 ++ init_waitqueue_head(&nfsi->lo_waitq); ++ nfsi->pnfs_layout_suspend = 0; ++ nfsi->layout = NULL; ++#endif /* CONFIG_NFS_V4_1 */ + #endif + } + +@@ -1488,6 +1570,12 @@ static int __init init_nfs_fs(void) + if (err) + goto out0; + ++#ifdef CONFIG_NFS_V4_1 ++ err = pnfs_initialize(); ++ if (err) ++ goto out00; ++#endif /* CONFIG_NFS_V4_1 */ ++ + #ifdef CONFIG_PROC_FS + rpc_proc_register(&nfs_rpcstat); + #endif +@@ -1498,6 +1586,10 @@ out: + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++out00: ++ pnfs_uninitialize(); ++#endif /* CONFIG_NFS_V4_1 */ + nfs_destroy_directcache(); + out0: + nfs_destroy_writepagecache(); +@@ -1531,6 +1623,9 @@ static void __exit exit_nfs_fs(void) + #ifdef CONFIG_PROC_FS + rpc_proc_unregister("nfs"); + #endif ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_uninitialize(); ++#endif + unregister_nfs_fs(); + nfs_fs_proc_exit(); + nfsiod_stop(); +diff -up linux-2.6.35.noarch/fs/nfs/internal.h.orig linux-2.6.35.noarch/fs/nfs/internal.h +--- linux-2.6.35.noarch/fs/nfs/internal.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/internal.h 2010-08-31 21:11:40.911118062 -0400 +@@ -139,6 +139,16 @@ extern struct nfs_server *nfs_clone_serv + struct nfs_fattr *); + extern void nfs_mark_client_ready(struct nfs_client *clp, int state); + extern int nfs4_check_client_ready(struct nfs_client *clp); ++extern int nfs_sockaddr_cmp(const 
struct sockaddr *sa1, ++ const struct sockaddr *sa2); ++extern int nfs4_set_client(struct nfs_server *server, ++ const char *hostname, ++ const struct sockaddr *addr, ++ const size_t addrlen, ++ const char *ip_addr, ++ rpc_authflavor_t authflavour, ++ int proto, const struct rpc_timeout *timeparms, ++ u32 minorversion); + #ifdef CONFIG_PROC_FS + extern int __init nfs_fs_proc_init(void); + extern void nfs_fs_proc_exit(void); +@@ -201,6 +211,8 @@ extern const u32 nfs41_maxwrite_overhead + extern struct rpc_procinfo nfs4_procedures[]; + #endif + ++extern int nfs4_recover_expired_lease(struct nfs_client *clp); ++ + /* proc.c */ + void nfs_close_context(struct nfs_open_context *ctx, int is_sync); + +@@ -249,10 +261,31 @@ extern int nfs4_get_rootfh(struct nfs_se + #endif + + /* read.c */ ++extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); ++extern int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops); + extern void nfs_read_prepare(struct rpc_task *task, void *calldata); + + /* write.c */ ++extern int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how); ++extern int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs); + extern void nfs_write_prepare(struct rpc_task *task, void *calldata); ++extern void nfs_mark_list_commit(struct list_head *head); + #ifdef CONFIG_MIGRATION + extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +diff -up linux-2.6.35.noarch/fs/nfs/Kconfig.orig linux-2.6.35.noarch/fs/nfs/Kconfig +--- linux-2.6.35.noarch/fs/nfs/Kconfig.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/Kconfig 2010-08-31 21:11:40.894079166 -0400 +@@ -79,10 +79,48 @@ config NFS_V4_1 + depends on NFS_V4 && EXPERIMENTAL + help + This option enables support for minor version 1 of the NFSv4 protocol +- (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. ++ (RFC5661) including support for the parallel NFS (pNFS) features ++ in the kernel's NFS client. + + Unless you're an NFS developer, say N. + ++config PNFS_FILE_LAYOUT ++ tristate "NFS client support for the pNFS nfs-files layout (DEVELOPER ONLY)" ++ depends on NFS_FS && NFS_V4_1 ++ default y ++ help ++ This option enables support for the pNFS nfs-files layout. ++ ++ Unless you're an NFS developer, say N. ++ ++config PNFS_OBJLAYOUT ++ tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD ++ help ++ Say M here if you want your pNFS client to support the Objects Layout Driver. ++ Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and ++ upper level driver (SCSI_OSD_ULD). ++ ++ If unsure, say N. ++ ++config PNFS_PANLAYOUT ++ tristate "Provide support for the Panasas OSD Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)" ++ depends on PNFS_OBJLAYOUT ++ help ++ Say M or y here if you want your pNFS client to support the Panasas OSD Layout Driver. ++ ++ If unsure, say N. 
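
Taken together with the existing NFSv4.1 option, a files-layout client
build would typically set (illustrative .config fragment; =m versus =y is
the builder's choice):

    CONFIG_NFS_FS=m
    CONFIG_NFS_V4=y
    CONFIG_EXPERIMENTAL=y
    CONFIG_NFS_V4_1=y
    CONFIG_PNFS_FILE_LAYOUT=m
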
++ ++config PNFS_BLOCK ++ tristate "Provide a pNFS block client (EXPERIMENTAL)" ++ depends on NFS_FS && NFS_V4_1 ++ select MD ++ select BLK_DEV_DM ++ help ++ Say M or y here if you want your pNfs client to support the block protocol ++ ++ If unsure, say N. ++ + config ROOT_NFS + bool "Root file system on NFS" + depends on NFS_FS=y && IP_PNP +diff -up linux-2.6.35.noarch/fs/nfs/Makefile.orig linux-2.6.35.noarch/fs/nfs/Makefile +--- linux-2.6.35.noarch/fs/nfs/Makefile.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/Makefile 2010-08-31 21:11:40.895022280 -0400 +@@ -15,5 +15,12 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4x + delegation.o idmap.o \ + callback.o callback_xdr.o callback_proc.o \ + nfs4namespace.o ++nfs-$(CONFIG_NFS_V4_1) += pnfs.o + nfs-$(CONFIG_SYSCTL) += sysctl.o + nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o ++ ++obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o ++nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o ++ ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ ++obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +diff -up linux-2.6.35.noarch/fs/nfs/nfs3proc.c.orig linux-2.6.35.noarch/fs/nfs/nfs3proc.c +--- linux-2.6.35.noarch/fs/nfs/nfs3proc.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/nfs3proc.c 2010-08-31 21:11:40.912160566 -0400 +@@ -833,6 +833,7 @@ const struct nfs_rpc_ops nfs_v3_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs3_proc_get_root, + .getattr = nfs3_proc_getattr, + .setattr = nfs3_proc_setattr, +diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c +--- linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c.orig 2010-08-31 21:11:40.913170556 -0400 ++++ linux-2.6.35.noarch/fs/nfs/nfs4filelayout.c 2010-08-31 21:11:40.914160450 -0400 +@@ -0,0 +1,768 @@ ++/* ++ * linux/fs/nfs/nfs4filelayout.c ++ * ++ * Module for the pnfs nfs4 file layout driver. ++ * Defines all I/O and Policy interface operations, plus code ++ * to register itself with the pNFS client. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "nfs4filelayout.h" ++#include "nfs4_fs.h" ++#include "internal.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Dean Hildebrand "); ++MODULE_DESCRIPTION("The NFSv4 file layout driver"); ++ ++/* Callback operations to the pNFS client */ ++struct pnfs_client_operations *pnfs_callback_ops; ++ ++/* Forward declaration */ ++struct layoutdriver_io_operations filelayout_io_operations; ++ ++int ++filelayout_initialize_mountpoint(struct nfs_server *nfss, ++ const struct nfs_fh *mntfh) ++{ ++ int status = nfs4_alloc_init_deviceid_cache(nfss->nfs_client, ++ nfs4_fl_free_deviceid_callback); ++ if (status) { ++ printk(KERN_WARNING "%s: deviceid cache could not be " ++ "initialized\n", __func__); ++ return status; ++ } ++ dprintk("%s: deviceid cache has been initialized successfully\n", ++ __func__); ++ return 0; ++} ++ ++/* Uninitialize a mountpoint by destroying its device list */ ++int ++filelayout_uninitialize_mountpoint(struct nfs_server *nfss) ++{ ++ dprintk("--> %s\n", __func__); ++ ++ if (nfss->pnfs_curr_ld && nfss->nfs_client->cl_devid_cache) ++ nfs4_put_deviceid_cache(nfss->nfs_client); ++ return 0; ++} ++ ++/* This function is used by the layout driver to calculate the ++ * offset of the file on the dserver based on whether the ++ * layout type is STRIPE_DENSE or STRIPE_SPARSE ++ */ ++static loff_t ++filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ ++ switch (flseg->stripe_type) { ++ case STRIPE_SPARSE: ++ return offset; ++ ++ case STRIPE_DENSE: ++ { ++ u32 stripe_width; ++ u64 tmp, off; ++ u32 unit = flseg->stripe_unit; ++ ++ stripe_width = unit * FILE_DSADDR(lseg)->stripe_count; ++ tmp = off = offset - flseg->pattern_offset; ++ do_div(tmp, stripe_width); ++ return tmp * unit + do_div(off, unit); ++ } ++ default: ++ BUG(); ++ } ++ ++ /* We should never get here... just to stop the gcc warning */ ++ return 0; ++} ++ ++/* ++ * Call ops for the async read/write cases ++ * In the case of dense layouts, the offset needs to be reset to its ++ * original value. 
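
A worked example for the STRIPE_DENSE branch of
filelayout_get_dserver_offset() above, assuming stripe_unit 64k,
stripe_count 4 and pattern_offset 0:

    u32 unit  = 65536;                    /* stripe_unit              */
    u32 width = 262144;                   /* unit * stripe_count      */
    u64 off   = 307200;                   /* file offset, 300k        */
    u64 ds_off = (off / width) * unit     /* 1 * 64k                  */
               + (off % unit);            /* 44k into stripe 4        */
    /* ds_off == 110592 (108k): stripe 4 is the second stripe held
     * by its data server under dense packing. */
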
++ */ ++static void filelayout_read_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ if (rdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ rdata->args.offset, rdata->fldata.orig_offset); ++ rdata->args.offset = rdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ rdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_read_release(void *data) ++{ ++ struct nfs_read_data *rdata = (struct nfs_read_data *)data; ++ ++ put_lseg(rdata->pdata.lseg); ++ rdata->pdata.lseg = NULL; ++ rdata->pdata.call_ops->rpc_release(data); ++} ++ ++static void filelayout_write_call_done(struct rpc_task *task, void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ if (wdata->fldata.orig_offset) { ++ dprintk("%s new off %llu orig offset %llu\n", __func__, ++ wdata->args.offset, wdata->fldata.orig_offset); ++ wdata->args.offset = wdata->fldata.orig_offset; ++ } ++ ++ /* Note this may cause RPC to be resent */ ++ wdata->pdata.call_ops->rpc_call_done(task, data); ++} ++ ++static void filelayout_write_release(void *data) ++{ ++ struct nfs_write_data *wdata = (struct nfs_write_data *)data; ++ ++ put_lseg(wdata->pdata.lseg); ++ wdata->pdata.lseg = NULL; ++ wdata->pdata.call_ops->rpc_release(data); ++} ++ ++struct rpc_call_ops filelayout_read_call_ops = { ++ .rpc_call_prepare = nfs_read_prepare, ++ .rpc_call_done = filelayout_read_call_done, ++ .rpc_release = filelayout_read_release, ++}; ++ ++struct rpc_call_ops filelayout_write_call_ops = { ++ .rpc_call_prepare = nfs_write_prepare, ++ .rpc_call_done = filelayout_write_call_done, ++ .rpc_release = filelayout_write_release, ++}; ++ ++/* Perform sync or async reads. ++ * ++ * An optimization for the NFS file layout driver ++ * allows the original read/write data structs to be passed in the ++ * last argument. ++ * ++ * TODO: join with write_pagelist? ++ */ ++static enum pnfs_try_status ++filelayout_read_pagelist(struct nfs_read_data *data, unsigned nr_pages) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ dprintk("--> %s ino %lu nr_pages %d pgbase %u req %Zu@%llu\n", ++ __func__, data->inode->i_ino, nr_pages, ++ data->args.pgbase, (size_t)data->args.count, offset); ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s USE DS:ip %x %s\n", __func__, ++ htonl(ds->ds_ip_addr), ds->r_addr); ++ ++ /* just try the first data server for the index..*/ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ ++ /* ++ * Now get the file offset on the dserver ++ * Set the read offset to this offset, and ++ * save the original offset in orig_offset ++ * In the case of aync reads, the offset will be reset in the ++ * call_ops->rpc_call_done() routine. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* Perform an asynchronous read */ ++ nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_read_call_ops); ++ ++ data->pdata.pnfs_error = 0; ++ ++ return PNFS_ATTEMPTED; ++} ++ ++/* Perform async writes. 
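
When nfs4_fl_prepare_ds() fails, filelayout_read_pagelist() above returns
PNFS_NOT_ATTEMPTED instead of an error, and the generic layer is expected
to fall back to plain NFS through the MDS. The caller side is roughly
(sketch, assuming this dispatch shape):

    enum pnfs_try_status trypnfs;

    trypnfs = filelayout_read_pagelist(rdata, nr_pages);
    if (trypnfs == PNFS_NOT_ATTEMPTED)
            nfs_initiate_read(rdata, NFS_CLIENT(inode),
                              rdata->pdata.call_ops);  /* via the MDS */
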
*/ ++static enum pnfs_try_status ++filelayout_write_pagelist(struct nfs_write_data *data, unsigned nr_pages, int sync) ++{ ++ struct pnfs_layout_segment *lseg = data->pdata.lseg; ++ struct nfs4_pnfs_ds *ds; ++ loff_t offset = data->args.offset; ++ u32 idx; ++ struct nfs_fh *fh; ++ ++ /* Retrieve the correct rpc_client for the byte range */ ++ idx = nfs4_fl_calc_ds_index(lseg, offset); ++ ds = nfs4_fl_prepare_ds(lseg, idx); ++ if (!ds) { ++ printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); ++ return PNFS_NOT_ATTEMPTED; ++ } ++ dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu %s\n", __func__, ++ data->inode->i_ino, sync, (size_t) data->args.count, offset, ++ htonl(ds->ds_ip_addr), ntohs(ds->ds_port), ds->r_addr); ++ ++ data->fldata.ds_nfs_client = ds->ds_clp; ++ fh = nfs4_fl_select_ds_fh(lseg, offset); ++ if (fh) ++ data->args.fh = fh; ++ /* ++ * Get the file offset on the dserver. Set the write offset to ++ * this offset and save the original offset. ++ */ ++ data->args.offset = filelayout_get_dserver_offset(lseg, offset); ++ data->fldata.orig_offset = offset; ++ ++ /* ++ * Perform an asynchronous write The offset will be reset in the ++ * call_ops->rpc_call_done() routine ++ */ ++ nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, ++ &filelayout_write_call_ops, sync); ++ ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++} ++ ++/* ++ * Create a filelayout layout structure and return it. The pNFS client ++ * will use the pnfs_layout_hdr type to refer to the layout for this ++ * inode from now on. ++ */ ++static struct pnfs_layout_hdr * ++filelayout_alloc_layout(struct inode *inode) ++{ ++ struct nfs4_filelayout *flp; ++ ++ dprintk("NFS_FILELAYOUT: allocating layout\n"); ++ flp = kzalloc(sizeof(struct nfs4_filelayout), GFP_KERNEL); ++ return flp ? &flp->fl_layout : NULL; ++} ++ ++/* Free a filelayout layout structure */ ++static void ++filelayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ dprintk("NFS_FILELAYOUT: freeing layout\n"); ++ kfree(FILE_LO(lo)); ++} ++ ++/* ++ * filelayout_check_layout() ++ * ++ * Make sure layout segment parameters are sane WRT the device. ++ * ++ * Notes: ++ * 1) current code insists that # stripe index = # data servers in ds_list ++ * which is wrong. ++ * 2) pattern_offset is ignored and must == 0 which is wrong; ++ * 3) the pattern_offset needs to be a mutliple of the stripe unit. 
++ * 4) stripe unit is multiple of page size ++ */ ++ ++static int ++filelayout_check_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs4_filelayout_segment *fl = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ int status = -EINVAL; ++ struct nfs_server *nfss = NFS_SERVER(PNFS_INODE(lo)); ++ ++ dprintk("--> %s\n", __func__); ++ /* find in list or get from server and reference the deviceid */ ++ dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, &fl->dev_id); ++ if (dsaddr == NULL) { ++ dsaddr = get_device_info(PNFS_INODE(lo), &fl->dev_id); ++ if (dsaddr == NULL) { ++ dprintk("%s NO device for dev_id %s\n", ++ __func__, deviceid_fmt(&fl->dev_id)); ++ goto out; ++ } ++ } ++ if (fl->first_stripe_index < 0 || ++ fl->first_stripe_index > dsaddr->stripe_count) { ++ dprintk("%s Bad first_stripe_index %d\n", ++ __func__, fl->first_stripe_index); ++ goto out_put; ++ } ++ ++ if (fl->pattern_offset != 0) { ++ dprintk("%s Unsupported no-zero pattern_offset %Ld\n", ++ __func__, fl->pattern_offset); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % PAGE_SIZE) { ++ dprintk("%s Stripe unit (%u) not page aligned\n", ++ __func__, fl->stripe_unit); ++ goto out_put; ++ } ++ ++ /* XXX only support SPARSE packing. Don't support use MDS open fh */ ++ if (!(fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num)) { ++ dprintk("%s num_fh %u not equal to 1 or ds_num %u\n", ++ __func__, fl->num_fh, dsaddr->ds_num); ++ goto out_put; ++ } ++ ++ if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) { ++ dprintk("%s Stripe unit (%u) not aligned with rsize %u " ++ "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize, ++ nfss->wsize); ++ } ++ ++ nfs4_set_layout_deviceid(lseg, &dsaddr->deviceid); ++ ++ status = 0; ++out: ++ dprintk("--> %s returns %d\n", __func__, status); ++ return status; ++out_put: ++ nfs4_put_unset_layout_deviceid(lseg, &dsaddr->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ goto out; ++} ++ ++static void _filelayout_free_lseg(struct pnfs_layout_segment *lseg); ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl); ++ ++/* Decode layout and store in layoutid. Overwrite any existing layout ++ * information for this file. 
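
Condensing filelayout_check_layout() above, a segment is accepted only
when all of the following hold (misalignment with rsize/wsize only logs a
warning):

    bool ok = dsaddr != NULL &&                     /* deviceid resolves */
              fl->first_stripe_index >= 0 &&
              fl->first_stripe_index <= dsaddr->stripe_count &&
              fl->pattern_offset == 0 &&            /* current restriction */
              fl->stripe_unit % PAGE_SIZE == 0 &&
              (fl->num_fh == 1 || fl->num_fh == dsaddr->ds_num);
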
++ */ ++static int ++filelayout_set_layout(struct nfs4_filelayout *flo, ++ struct nfs4_filelayout_segment *fl, ++ struct nfs4_layoutget_res *lgr) ++{ ++ uint32_t *p = (uint32_t *)lgr->layout.buf; ++ uint32_t nfl_util; ++ int i; ++ ++ dprintk("%s: set_layout_map Begin\n", __func__); ++ ++ memcpy(&fl->dev_id, p, NFS4_PNFS_DEVICEID4_SIZE); ++ p += XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE); ++ nfl_util = be32_to_cpup(p++); ++ if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) ++ fl->commit_through_mds = 1; ++ if (nfl_util & NFL4_UFLG_DENSE) ++ fl->stripe_type = STRIPE_DENSE; ++ else ++ fl->stripe_type = STRIPE_SPARSE; ++ fl->stripe_unit = nfl_util & ~NFL4_UFLG_MASK; ++ ++ if (!flo->stripe_unit) ++ flo->stripe_unit = fl->stripe_unit; ++ else if (flo->stripe_unit != fl->stripe_unit) { ++ printk(KERN_NOTICE "%s: updating strip_unit from %u to %u\n", ++ __func__, flo->stripe_unit, fl->stripe_unit); ++ flo->stripe_unit = fl->stripe_unit; ++ } ++ ++ fl->first_stripe_index = be32_to_cpup(p++); ++ p = xdr_decode_hyper(p, &fl->pattern_offset); ++ fl->num_fh = be32_to_cpup(p++); ++ ++ dprintk("%s: nfl_util 0x%X num_fh %u fsi %u po %llu dev_id %s\n", ++ __func__, nfl_util, fl->num_fh, fl->first_stripe_index, ++ fl->pattern_offset, deviceid_fmt(&fl->dev_id)); ++ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) { ++ fl->fh_array = vmalloc(fl->num_fh * sizeof(struct nfs_fh)); ++ if (fl->fh_array) ++ memset(fl->fh_array, 0, ++ fl->num_fh * sizeof(struct nfs_fh)); ++ } else { ++ fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh), ++ GFP_KERNEL); ++ } ++ if (!fl->fh_array) ++ return -ENOMEM; ++ ++ for (i = 0; i < fl->num_fh; i++) { ++ /* fh */ ++ fl->fh_array[i].size = be32_to_cpup(p++); ++ if (sizeof(struct nfs_fh) < fl->fh_array[i].size) { ++ printk(KERN_ERR "Too big fh %d received %d\n", ++ i, fl->fh_array[i].size); ++ /* Layout is now invalid, pretend it doesn't exist */ ++ filelayout_free_fh_array(fl); ++ fl->num_fh = 0; ++ break; ++ } ++ memcpy(fl->fh_array[i].data, p, fl->fh_array[i].size); ++ p += XDR_QUADLEN(fl->fh_array[i].size); ++ dprintk("DEBUG: %s: fh len %d\n", __func__, ++ fl->fh_array[i].size); ++ } ++ ++ return 0; ++} ++ ++static struct pnfs_layout_segment * ++filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutget_res *lgr) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(layoutid); ++ struct pnfs_layout_segment *lseg; ++ int rc; ++ ++ dprintk("--> %s\n", __func__); ++ lseg = kzalloc(sizeof(struct pnfs_layout_segment) + ++ sizeof(struct nfs4_filelayout_segment), GFP_KERNEL); ++ if (!lseg) ++ return NULL; ++ ++ rc = filelayout_set_layout(flo, LSEG_LD_DATA(lseg), lgr); ++ ++ if (rc != 0 || filelayout_check_layout(layoutid, lseg)) { ++ _filelayout_free_lseg(lseg); ++ lseg = NULL; ++ } ++ return lseg; ++} ++ ++static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl) ++{ ++ if (fl->num_fh * sizeof(struct nfs_fh) > 2*PAGE_SIZE) ++ vfree(fl->fh_array); ++ else ++ kfree(fl->fh_array); ++ ++ fl->fh_array = NULL; ++} ++ ++static void ++_filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ filelayout_free_fh_array(LSEG_LD_DATA(lseg)); ++ kfree(lseg); ++} ++ ++static void ++filelayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ dprintk("--> %s\n", __func__); ++ nfs4_put_unset_layout_deviceid(lseg, lseg->deviceid, ++ nfs4_fl_free_deviceid_callback); ++ _filelayout_free_lseg(lseg); ++} ++ ++/* Allocate a new nfs_write_data struct and initialize */ ++static struct nfs_write_data * ++filelayout_clone_write_data(struct nfs_write_data *old) ++{ ++ static 
++/* Allocate a new nfs_write_data struct and initialize */
++static struct nfs_write_data *
++filelayout_clone_write_data(struct nfs_write_data *old)
++{
++	struct nfs_write_data *new;
++
++	new = nfs_commitdata_alloc();
++	if (!new)
++		goto out;
++	kref_init(&new->refcount);
++	new->parent = old;
++	kref_get(&old->refcount);
++	new->inode = old->inode;
++	new->cred = old->cred;
++	new->args.offset = 0;
++	new->args.count = 0;
++	new->res.count = 0;
++	new->res.fattr = &new->fattr;
++	nfs_fattr_init(&new->fattr);
++	new->res.verf = &new->verf;
++	new->args.context = get_nfs_open_context(old->args.context);
++	new->pdata.lseg = NULL;
++	new->pdata.call_ops = old->pdata.call_ops;
++	new->pdata.how = old->pdata.how;
++out:
++	return new;
++}
++
++static void filelayout_commit_call_done(struct rpc_task *task, void *data)
++{
++	struct nfs_write_data *wdata = (struct nfs_write_data *)data;
++
++	wdata->pdata.call_ops->rpc_call_done(task, data);
++}
++
++static struct rpc_call_ops filelayout_commit_call_ops = {
++	.rpc_call_prepare = nfs_write_prepare,
++	.rpc_call_done = filelayout_commit_call_done,
++	.rpc_release = filelayout_write_release,
++};
++
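The clone helper above hands each data server its own nfs_write_data while the original acts as the parent: every clone takes a kref on the parent, so the parent's release (and thus overall commit completion) fires only after the last per-DS commit releases its clone. A minimal userspace model of that reference pattern, using a plain counter in place of kref and invented names throughout:

    /* Parent/child refcount model behind filelayout_clone_write_data(). */
    #include <stdio.h>
    #include <stdlib.h>

    struct wdata {
        int refcount;
        struct wdata *parent;
    };

    static void put(struct wdata *d)
    {
        if (--d->refcount)
            return;
        if (d->parent)
            put(d->parent);           /* child drops its parent reference */
        printf("freeing %p\n", (void *)d);
        free(d);
    }

    static struct wdata *clone_data(struct wdata *parent)
    {
        struct wdata *new = calloc(1, sizeof(*new));
        if (!new)
            return NULL;
        new->refcount = 1;            /* kref_init() */
        new->parent = parent;
        parent->refcount++;           /* kref_get(&old->refcount) */
        return new;
    }

    int main(void)
    {
        struct wdata *parent = calloc(1, sizeof(*parent));
        if (!parent)
            return 1;
        parent->refcount = 1;
        struct wdata *a = clone_data(parent);
        struct wdata *b = clone_data(parent);
        if (!a || !b)
            return 1;
        put(a);                       /* DS 1 commit completes */
        put(b);                       /* DS 2 commit completes */
        put(parent);                  /* parent is released last */
        return 0;
    }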
++ */ ++ for (i = 0; i < num_indices_seen; i++) { ++ dsdata = clone_list[i]; ++ idx = indices_used[i]; ++ list_cut_position(&dsdata->pages, &head, ds_page_list[idx]); ++ if (idx == NFS4_PNFS_MAX_MULTI_CNT) { ++ call_ops = data->pdata.call_ops;; ++ clnt = NFS_CLIENT(dsdata->inode); ++ ds = NULL; ++ } else { ++ struct nfs_fh *fh; ++ ++ call_ops = &filelayout_commit_call_ops; ++ req = nfs_list_entry(dsdata->pages.next); ++ ds = nfs4_fl_prepare_ds(req->wb_lseg, idx); ++ if (!ds) { ++ /* Trigger retry of this chunk through MDS */ ++ dsdata->task.tk_status = -EIO; ++ data->pdata.call_ops->rpc_release(dsdata); ++ continue; ++ } ++ clnt = ds->ds_clp->cl_rpcclient; ++ dsdata->fldata.ds_nfs_client = ds->ds_clp; ++ file_offset = (loff_t)req->wb_index << PAGE_CACHE_SHIFT; ++ fh = nfs4_fl_select_ds_fh(req->wb_lseg, file_offset); ++ if (fh) ++ dsdata->args.fh = fh; ++ } ++ dprintk("%s: Initiating commit: %llu USE DS:\n", ++ __func__, file_offset); ++ print_ds(ds); ++ ++ /* Send COMMIT to data server */ ++ nfs_initiate_commit(dsdata, clnt, call_ops, sync); ++ } ++ kfree(clone_list); ++ kfree(ds_page_list); ++ data->pdata.pnfs_error = 0; ++ return PNFS_ATTEMPTED; ++ ++ mem_error: ++ if (clone_list) { ++ for (i = 0; i < num_indices_seen - 1; i++) { ++ if (!clone_list[i]) ++ break; ++ data->pdata.call_ops->rpc_release(clone_list[i]); ++ } ++ kfree(clone_list); ++ } ++ kfree(ds_page_list); ++ /* One of these will be empty, but doesn't hurt to do both */ ++ nfs_mark_list_commit(&head); ++ nfs_mark_list_commit(&data->pages); ++ data->pdata.call_ops->rpc_release(data); ++ return PNFS_ATTEMPTED; ++} ++ ++/* Return the stripesize for the specified file */ ++ssize_t ++filelayout_get_stripesize(struct pnfs_layout_hdr *lo) ++{ ++ struct nfs4_filelayout *flo = FILE_LO(lo); ++ ++ return flo->stripe_unit; ++} ++ ++/* ++ * filelayout_pg_test(). 
++/*
++ * filelayout_pg_test().  Called by nfs_can_coalesce_requests()
++ *
++ * return 1 :  coalesce page
++ * return 0 :  don't coalesce page
++ */
++int
++filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
++		   struct nfs_page *req)
++{
++	u64 p_stripe, r_stripe;
++
++	if (pgio->pg_boundary == 0)
++		return 1;
++	p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
++	r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
++
++	do_div(p_stripe, pgio->pg_boundary);
++	do_div(r_stripe, pgio->pg_boundary);
++
++	return (p_stripe == r_stripe);
++}
++
++struct layoutdriver_io_operations filelayout_io_operations = {
++	.commit = filelayout_commit,
++	.read_pagelist = filelayout_read_pagelist,
++	.write_pagelist = filelayout_write_pagelist,
++	.alloc_layout = filelayout_alloc_layout,
++	.free_layout = filelayout_free_layout,
++	.alloc_lseg = filelayout_alloc_lseg,
++	.free_lseg = filelayout_free_lseg,
++	.initialize_mountpoint = filelayout_initialize_mountpoint,
++	.uninitialize_mountpoint = filelayout_uninitialize_mountpoint,
++};
++
++struct layoutdriver_policy_operations filelayout_policy_operations = {
++	.flags = PNFS_USE_RPC_CODE,
++	.get_stripesize = filelayout_get_stripesize,
++	.pg_test = filelayout_pg_test,
++};
++
++struct pnfs_layoutdriver_type filelayout_type = {
++	.id = LAYOUT_NFSV4_1_FILES,
++	.name = "LAYOUT_NFSV4_1_FILES",
++	.ld_io_ops = &filelayout_io_operations,
++	.ld_policy_ops = &filelayout_policy_operations,
++};
++
++static int __init nfs4filelayout_init(void)
++{
++	printk(KERN_INFO "%s: NFSv4 File Layout Driver Registering...\n",
++	       __func__);
++
++	/*
++	 * Need to register file_operations struct with global list to indicate
++	 * that NFS4 file layout is a possible pNFS I/O module
++	 */
++	pnfs_callback_ops = pnfs_register_layoutdriver(&filelayout_type);
++
++	return 0;
++}
++
++static void __exit nfs4filelayout_exit(void)
++{
++	printk(KERN_INFO "%s: NFSv4 File Layout Driver Unregistering...\n",
++	       __func__);
++
++	/* Unregister NFS4 file layout driver with pNFS client */
++	pnfs_unregister_layoutdriver(&filelayout_type);
++}
++
++module_init(nfs4filelayout_init);
++module_exit(nfs4filelayout_exit);
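The coalescing rule in filelayout_pg_test() above comes down to one comparison: two pages may share an RPC only when integer division of their byte offsets by pg_boundary (typically the stripe size) yields the same stripe number. Sketched standalone, with an invented boundary value:

    /* Sketch of the stripe-boundary check behind filelayout_pg_test(). */
    #include <stdint.h>
    #include <stdio.h>

    static int can_coalesce(uint64_t prev_off, uint64_t req_off,
                            uint64_t boundary)
    {
        if (boundary == 0)      /* no layout boundary: always coalesce */
            return 1;
        return prev_off / boundary == req_off / boundary;
    }

    int main(void)
    {
        uint64_t boundary = 65536;  /* pg_boundary, i.e. the stripe size */
        printf("%d\n", can_coalesce(4096, 8192, boundary));   /* 1: same stripe */
        printf("%d\n", can_coalesce(61440, 65536, boundary)); /* 0: crosses */
        return 0;
    }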
+diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c
+--- linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c.orig	2010-08-31 21:11:40.915165014 -0400
++++ linux-2.6.35.noarch/fs/nfs/nfs4filelayoutdev.c	2010-08-31 21:11:40.915165014 -0400
+@@ -0,0 +1,635 @@
++/*
++ * linux/fs/nfs/nfs4filelayoutdev.c
++ *
++ * Device operations for the pnfs nfs4 file layout driver.
++ *
++ * Copyright (c) 2002 The Regents of the University of Michigan.
++ * All rights reserved.
++ *
++ * Dean Hildebrand
++ * Garth Goodson
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ *    notice, this list of conditions and the following disclaimer in the
++ *    documentation and/or other materials provided with the distribution.
++ * 3. Neither the name of the University nor the names of its
++ *    contributors may be used to endorse or promote products derived
++ *    from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include
++
++#include
++#include
++#include
++
++#include
++
++#include
++#include
++#include
++#include "nfs4filelayout.h"
++#include "internal.h"
++#include "nfs4_fs.h"
++
++#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
++
++DEFINE_SPINLOCK(nfs4_ds_cache_lock);
++static LIST_HEAD(nfs4_data_server_cache);
++
++void
++print_ds(struct nfs4_pnfs_ds *ds)
++{
++	if (ds == NULL) {
++		dprintk("%s NULL device\n", __func__);
++		return;
++	}
++	dprintk("	ip_addr %x\n", ntohl(ds->ds_ip_addr));
++	dprintk("	port %hu\n", ntohs(ds->ds_port));
++	dprintk("	client %p\n", ds->ds_clp);
++	dprintk("	ref count %d\n", atomic_read(&ds->ds_count));
++	if (ds->ds_clp)
++		dprintk("	cl_exchange_flags %x\n",
++			ds->ds_clp->cl_exchange_flags);
++	dprintk("	ip:port %s\n", ds->r_addr);
++}
++
++void
++print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
++{
++	int i;
++
++	dprintk("%s dsaddr->ds_num %d\n", __func__,
++		dsaddr->ds_num);
++	for (i = 0; i < dsaddr->ds_num; i++)
++		print_ds(dsaddr->ds_list[i]);
++}
++
++/* Debugging function assuming a 64bit major/minor split of the deviceid */
++char *
++deviceid_fmt(const struct pnfs_deviceid *dev_id)
++{
++	static char buf[64];
++	uint32_t *p = (uint32_t *)dev_id->data;
++	uint64_t major, minor;
++
++	p = xdr_decode_hyper(p, &major);
++	p = xdr_decode_hyper(p, &minor);
++
++	snprintf(buf, sizeof(buf), "%08llu %08llu", major, minor);
++	return buf;
++}
++
++/* nfs4_ds_cache_lock is held */
++static inline struct nfs4_pnfs_ds *
++_data_server_lookup(u32 ip_addr, u32 port)
++{
++	struct nfs4_pnfs_ds *ds;
++
++	dprintk("_data_server_lookup: ip_addr=%x port=%hu\n",
++		ntohl(ip_addr), ntohs(port));
++
++	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) {
++		if (ds->ds_ip_addr == ip_addr &&
++		    ds->ds_port == port) {
++			return ds;
++		}
++	}
++	return NULL;
++}
++
++/* Create an rpc client to the data server described by 'ds' */
++static int
++nfs4_pnfs_ds_create(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
++{
++	struct nfs_server *tmp;
++	struct sockaddr_in sin;
++	struct rpc_clnt *mds_clnt = mds_srv->client;
++	struct nfs_client *clp = mds_srv->nfs_client;
++	struct sockaddr *mds_addr;
++	int err = 0;
++
++	dprintk("--> %s ip:port %s au_flavor %d\n", __func__,
++		ds->r_addr, mds_clnt->cl_auth->au_flavor);
++
++	sin.sin_family = AF_INET;
++	sin.sin_addr.s_addr = ds->ds_ip_addr;
++	sin.sin_port = ds->ds_port;
++
++	/*
++	 * If this DS is also the MDS, use the MDS session only if the
++	 * MDS exchangeid flags show the EXCHGID4_FLAG_USE_PNFS_DS pNFS role.
++ */ ++ mds_addr = (struct sockaddr *)&clp->cl_addr; ++ if (nfs_sockaddr_cmp((struct sockaddr *)&sin, mds_addr)) { ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data " ++ "Server\n", ds->r_addr); ++ err = -ENODEV; ++ } else { ++ atomic_inc(&clp->cl_count); ++ ds->ds_clp = clp; ++ dprintk("%s Using MDS Session for DS\n", __func__); ++ } ++ goto out; ++ } ++ ++ /* Temporay server for nfs4_set_client */ ++ tmp = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); ++ if (!tmp) ++ goto out; ++ ++ /* ++ * Set a retrans, timeout interval, and authflavor equual to the MDS ++ * values. Use the MDS nfs_client cl_ipaddr field so as to use the ++ * same co_ownerid as the MDS. ++ */ ++ err = nfs4_set_client(tmp, ++ mds_srv->nfs_client->cl_hostname, ++ (struct sockaddr *)&sin, ++ sizeof(struct sockaddr), ++ mds_srv->nfs_client->cl_ipaddr, ++ mds_clnt->cl_auth->au_flavor, ++ IPPROTO_TCP, ++ mds_clnt->cl_xprt->timeout, ++ 1 /* minorversion */); ++ if (err < 0) ++ goto out_free; ++ ++ clp = tmp->nfs_client; ++ ++ /* Ask for only the EXCHGID4_FLAG_USE_PNFS_DS pNFS role */ ++ dprintk("%s EXCHANGE_ID for clp %p\n", __func__, clp); ++ clp->cl_exchange_flags = EXCHGID4_FLAG_USE_PNFS_DS; ++ ++ err = nfs4_recover_expired_lease(clp); ++ if (!err) ++ err = nfs4_check_client_ready(clp); ++ if (err) ++ goto out_put; ++ ++ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS)) { ++ printk(KERN_INFO "ip:port %s is not a pNFS Data Server\n", ++ ds->r_addr); ++ err = -ENODEV; ++ goto out_put; ++ } ++ /* ++ * Mask the (possibly) returned EXCHGID4_FLAG_USE_PNFS_MDS pNFS role ++ * The is_ds_only_session depends on this. ++ */ ++ clp->cl_exchange_flags &= ~EXCHGID4_FLAG_USE_PNFS_MDS; ++ /* ++ * Set DS lease equal to the MDS lease, renewal is scheduled in ++ * create_session ++ */ ++ spin_lock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; ++ spin_unlock(&mds_srv->nfs_client->cl_lock); ++ clp->cl_last_renewal = jiffies; ++ ++ clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); ++ ds->ds_clp = clp; ++ ++ dprintk("%s: ip=%x, port=%hu, rpcclient %p\n", __func__, ++ ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), ++ clp->cl_rpcclient); ++out_free: ++ kfree(tmp); ++out: ++ dprintk("%s Returns %d\n", __func__, err); ++ return err; ++out_put: ++ nfs_put_client(clp); ++ goto out_free; ++} ++ ++static void ++destroy_ds(struct nfs4_pnfs_ds *ds) ++{ ++ dprintk("--> %s\n", __func__); ++ print_ds(ds); ++ ++ if (ds->ds_clp) ++ nfs_put_client(ds->ds_clp); ++ kfree(ds); ++} ++ ++static void ++nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) ++{ ++ struct nfs4_pnfs_ds *ds; ++ int i; ++ ++ dprintk("%s: device id=%s\n", __func__, ++ deviceid_fmt(&dsaddr->deviceid.de_id)); ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ ds = dsaddr->ds_list[i]; ++ if (ds != NULL) { ++ if (atomic_dec_and_lock(&ds->ds_count, ++ &nfs4_ds_cache_lock)) { ++ list_del_init(&ds->ds_node); ++ spin_unlock(&nfs4_ds_cache_lock); ++ destroy_ds(ds); ++ } ++ } ++ } ++ kfree(dsaddr->stripe_indices); ++ kfree(dsaddr); ++} ++ ++void ++nfs4_fl_free_deviceid_callback(struct kref *kref) ++{ ++ struct nfs4_deviceid *device = ++ container_of(kref, struct nfs4_deviceid, de_kref); ++ struct nfs4_file_layout_dsaddr *dsaddr = ++ container_of(device, struct nfs4_file_layout_dsaddr, deviceid); ++ ++ nfs4_fl_free_deviceid(dsaddr); ++} ++ ++static void ++nfs4_pnfs_ds_add(struct inode *inode, struct nfs4_pnfs_ds **dsp, ++ u32 ip_addr, u32 port, char *r_addr, int len) ++{ ++ 
struct nfs4_pnfs_ds *tmp_ds, *ds; ++ ++ *dsp = NULL; ++ ++ ds = kzalloc(sizeof(*tmp_ds), GFP_KERNEL); ++ if (!ds) ++ return; ++ ++ spin_lock(&nfs4_ds_cache_lock); ++ tmp_ds = _data_server_lookup(ip_addr, port); ++ if (tmp_ds == NULL) { ++ ds->ds_ip_addr = ip_addr; ++ ds->ds_port = port; ++ strncpy(ds->r_addr, r_addr, len); ++ atomic_set(&ds->ds_count, 1); ++ INIT_LIST_HEAD(&ds->ds_node); ++ ds->ds_clp = NULL; ++ list_add(&ds->ds_node, &nfs4_data_server_cache); ++ *dsp = ds; ++ dprintk("%s add new data server ip 0x%x\n", __func__, ++ ds->ds_ip_addr); ++ spin_unlock(&nfs4_ds_cache_lock); ++ } else { ++ atomic_inc(&tmp_ds->ds_count); ++ *dsp = tmp_ds; ++ dprintk("%s data server found ip 0x%x, inc'ed ds_count to %d\n", ++ __func__, tmp_ds->ds_ip_addr, ++ atomic_read(&tmp_ds->ds_count)); ++ spin_unlock(&nfs4_ds_cache_lock); ++ kfree(ds); ++ } ++} ++ ++static struct nfs4_pnfs_ds * ++decode_and_add_ds(uint32_t **pp, struct inode *inode) ++{ ++ struct nfs4_pnfs_ds *ds = NULL; ++ char r_addr[29]; /* max size of ip/port string */ ++ int len; ++ u32 ip_addr, port; ++ int tmp[6]; ++ uint32_t *p = *pp; ++ ++ dprintk("%s enter\n", __func__); ++ /* check and skip r_netid */ ++ len = be32_to_cpup(p++); ++ /* "tcp" */ ++ if (len != 3) { ++ printk("%s: ERROR: non TCP r_netid len %d\n", ++ __func__, len); ++ goto out_err; ++ } ++ /* ++ * Read the bytes into a temporary buffer ++ * XXX: should probably sanity check them ++ */ ++ tmp[0] = be32_to_cpup(p++); ++ ++ len = be32_to_cpup(p++); ++ if (len >= sizeof(r_addr)) { ++ printk("%s: ERROR: Device ip/port too long (%d)\n", ++ __func__, len); ++ goto out_err; ++ } ++ memcpy(r_addr, p, len); ++ p += XDR_QUADLEN(len); ++ *pp = p; ++ r_addr[len] = '\0'; ++ sscanf(r_addr, "%d.%d.%d.%d.%d.%d", &tmp[0], &tmp[1], ++ &tmp[2], &tmp[3], &tmp[4], &tmp[5]); ++ ip_addr = htonl((tmp[0]<<24) | (tmp[1]<<16) | (tmp[2]<<8) | (tmp[3])); ++ port = htons((tmp[4] << 8) | (tmp[5])); ++ ++ nfs4_pnfs_ds_add(inode, &ds, ip_addr, port, r_addr, len); ++ ++ dprintk("%s: addr:port string = %s\n", __func__, r_addr); ++ return ds; ++out_err: ++ dprintk("%s returned NULL\n", __func__); ++ return NULL; ++} ++ ++/* Decode opaque device data and return the result */ ++static struct nfs4_file_layout_dsaddr* ++decode_device(struct inode *ino, struct pnfs_device *pdev) ++{ ++ int i, dummy; ++ u32 cnt, num; ++ u8 *indexp; ++ uint32_t *p = (u32 *)pdev->area, *indicesp; ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ /* Get the stripe count (number of stripe index) */ ++ cnt = be32_to_cpup(p++); ++ dprintk("%s stripe count %d\n", __func__, cnt); ++ if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { ++ printk(KERN_WARNING "%s: stripe count %d greater than " ++ "supported maximum %d\n", __func__, ++ cnt, NFS4_PNFS_MAX_STRIPE_CNT); ++ goto out_err; ++ } ++ ++ /* Check the multipath list count */ ++ indicesp = p; ++ p += XDR_QUADLEN(cnt << 2); ++ num = be32_to_cpup(p++); ++ dprintk("%s ds_num %u\n", __func__, num); ++ if (num > NFS4_PNFS_MAX_MULTI_CNT) { ++ printk(KERN_WARNING "%s: multipath count %d greater than " ++ "supported maximum %d\n", __func__, ++ num, NFS4_PNFS_MAX_MULTI_CNT); ++ goto out_err; ++ } ++ dsaddr = kzalloc(sizeof(*dsaddr) + ++ (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), ++ GFP_KERNEL); ++ if (!dsaddr) ++ goto out_err; ++ ++ dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL); ++ if (!dsaddr->stripe_indices) ++ goto out_err_free; ++ ++ dsaddr->stripe_count = cnt; ++ dsaddr->ds_num = num; ++ ++ memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ ++ /* Go back 
an read stripe indices */ ++ p = indicesp; ++ indexp = &dsaddr->stripe_indices[0]; ++ for (i = 0; i < dsaddr->stripe_count; i++) { ++ dummy = be32_to_cpup(p++); ++ *indexp = dummy; /* bound by NFS4_PNFS_MAX_MULTI_CNT */ ++ indexp++; ++ } ++ /* Skip already read multipath list count */ ++ p++; ++ ++ for (i = 0; i < dsaddr->ds_num; i++) { ++ int j; ++ ++ dummy = be32_to_cpup(p++); /* multipath count */ ++ if (dummy > 1) { ++ printk(KERN_WARNING ++ "%s: Multipath count %d not supported, " ++ "skipping all greater than 1\n", __func__, ++ dummy); ++ } ++ for (j = 0; j < dummy; j++) { ++ if (j == 0) { ++ dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); ++ if (dsaddr->ds_list[i] == NULL) ++ goto out_err_free; ++ } else { ++ u32 len; ++ /* skip extra multipath */ ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ len = be32_to_cpup(p++); ++ p += XDR_QUADLEN(len); ++ continue; ++ } ++ } ++ } ++ nfs4_init_deviceid_node(&dsaddr->deviceid); ++ ++ return dsaddr; ++ ++out_err_free: ++ nfs4_fl_free_deviceid(dsaddr); ++out_err: ++ dprintk("%s ERROR: returning NULL\n", __func__); ++ return NULL; ++} ++ ++/* ++ * Decode the opaque device specified in 'dev' ++ * and add it to the list of available devices. ++ * If the deviceid is already cached, nfs4_add_deviceid will return ++ * a pointer to the cached struct and throw away the new. ++ */ ++static struct nfs4_file_layout_dsaddr* ++decode_and_add_device(struct inode *inode, struct pnfs_device *dev) ++{ ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ struct nfs4_deviceid *d; ++ ++ dsaddr = decode_device(inode, dev); ++ if (!dsaddr) { ++ printk(KERN_WARNING "%s: Could not decode or add device\n", ++ __func__); ++ return NULL; ++ } ++ ++ d = nfs4_add_get_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, ++ &dsaddr->deviceid); ++ ++ return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Retrieve the information for dev_id, add it to the list ++ * of available devices, and return it. 
++ */ ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id) ++{ ++ struct pnfs_device *pdev = NULL; ++ u32 max_resp_sz; ++ int max_pages; ++ struct page **pages = NULL; ++ struct nfs4_file_layout_dsaddr *dsaddr = NULL; ++ int rc, i; ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ /* ++ * Use the session max response size as the basis for setting ++ * GETDEVICEINFO's maxcount ++ */ ++ max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; ++ max_pages = max_resp_sz >> PAGE_SHIFT; ++ dprintk("%s inode %p max_resp_sz %u max_pages %d\n", ++ __func__, inode, max_resp_sz, max_pages); ++ ++ pdev = kzalloc(sizeof(struct pnfs_device), GFP_KERNEL); ++ if (pdev == NULL) ++ return NULL; ++ ++ pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); ++ if (pages == NULL) { ++ kfree(pdev); ++ return NULL; ++ } ++ for (i = 0; i < max_pages; i++) { ++ pages[i] = alloc_page(GFP_KERNEL); ++ if (!pages[i]) ++ goto out_free; ++ } ++ ++ /* set pdev->area */ ++ pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL); ++ if (!pdev->area) ++ goto out_free; ++ ++ memcpy(&pdev->dev_id, dev_id, NFS4_PNFS_DEVICEID4_SIZE); ++ pdev->layout_type = LAYOUT_NFSV4_1_FILES; ++ pdev->pages = pages; ++ pdev->pgbase = 0; ++ pdev->pglen = PAGE_SIZE * max_pages; ++ pdev->mincount = 0; ++ /* TODO: Update types when CB_NOTIFY_DEVICEID is available */ ++ pdev->dev_notify_types = 0; ++ ++ rc = pnfs_callback_ops->nfs_getdeviceinfo(server, pdev); ++ dprintk("%s getdevice info returns %d\n", __func__, rc); ++ if (rc) ++ goto out_free; ++ ++ /* ++ * Found new device, need to decode it and then add it to the ++ * list of known devices for this mountpoint. ++ */ ++ dsaddr = decode_and_add_device(inode, pdev); ++out_free: ++ if (pdev->area != NULL) ++ vunmap(pdev->area); ++ for (i = 0; i < max_pages; i++) ++ __free_page(pages[i]); ++ kfree(pages); ++ kfree(pdev); ++ dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); ++ return dsaddr; ++} ++ ++struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ ++ d = nfs4_find_get_deviceid(clp->cl_devid_cache, id); ++ dprintk("%s device id (%s) nfs4_deviceid %p\n", __func__, ++ deviceid_fmt(id), d); ++ return (d == NULL) ? 
NULL : ++ container_of(d, struct nfs4_file_layout_dsaddr, deviceid); ++} ++ ++/* ++ * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit ++ * Then: ((res + fsi) % dsaddr->stripe_count) ++ */ ++static inline u32 ++_nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u64 tmp; ++ ++ tmp = offset - flseg->pattern_offset; ++ do_div(tmp, flseg->stripe_unit); ++ tmp += flseg->first_stripe_index; ++ return do_div(tmp, FILE_DSADDR(lseg)->stripe_count); ++} ++ ++u32 ++nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ u32 j; ++ ++ j = _nfs4_fl_calc_j_index(lseg, offset); ++ return FILE_DSADDR(lseg)->stripe_indices[j]; ++} ++ ++struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ u32 i; ++ ++ if (flseg->stripe_type == STRIPE_SPARSE) { ++ if (flseg->num_fh == 1) ++ i = 0; ++ else if (flseg->num_fh == 0) ++ return NULL; ++ else ++ i = nfs4_fl_calc_ds_index(lseg, offset); ++ } else ++ i = _nfs4_fl_calc_j_index(lseg, offset); ++ return &flseg->fh_array[i]; ++} ++ ++struct nfs4_pnfs_ds * ++nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) ++{ ++ struct nfs4_filelayout_segment *flseg = LSEG_LD_DATA(lseg); ++ struct nfs4_file_layout_dsaddr *dsaddr; ++ ++ dsaddr = FILE_DSADDR(lseg); ++ if (dsaddr->ds_list[ds_idx] == NULL) { ++ printk(KERN_ERR "%s: No data server for device id (%s)!!\n", ++ __func__, deviceid_fmt(&flseg->dev_id)); ++ return NULL; ++ } ++ ++ if (!dsaddr->ds_list[ds_idx]->ds_clp) { ++ int err; ++ ++ err = nfs4_pnfs_ds_create(PNFS_NFS_SERVER(lseg->layout), ++ dsaddr->ds_list[ds_idx]); ++ if (err) { ++ printk(KERN_ERR "%s nfs4_pnfs_ds_create error %d\n", ++ __func__, err); ++ return NULL; ++ } ++ } ++ dprintk("%s: dev_id=%s, ds_idx=%u\n", ++ __func__, deviceid_fmt(&flseg->dev_id), ds_idx); ++ ++ return dsaddr->ds_list[ds_idx]; ++} ++ +diff -up linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h.orig linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h +--- linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h.orig 2010-08-31 21:11:40.914160450 -0400 ++++ linux-2.6.35.noarch/fs/nfs/nfs4filelayout.h 2010-08-31 21:11:40.914160450 -0400 +@@ -0,0 +1,96 @@ ++/* ++ * pnfs_nfs4filelayout.h ++ * ++ * NFSv4 file layout driver data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_NFS4FILELAYOUT_H ++#define FS_NFS_NFS4FILELAYOUT_H ++ ++#include ++#include ++ ++#define NFS4_PNFS_DEV_HASH_BITS 5 ++#define NFS4_PNFS_DEV_HASH_SIZE (1 << NFS4_PNFS_DEV_HASH_BITS) ++#define NFS4_PNFS_DEV_HASH_MASK (NFS4_PNFS_DEV_HASH_SIZE - 1) ++ ++#define NFS4_PNFS_MAX_STRIPE_CNT 4096 ++#define NFS4_PNFS_MAX_MULTI_CNT 64 /* 256 fit into a u8 stripe_index */ ++#define NFS4_PNFS_MAX_MULTI_DS 2 ++ ++#define FILE_DSADDR(lseg) (container_of(lseg->deviceid, \ ++ struct nfs4_file_layout_dsaddr, \ ++ deviceid)) ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++/* Individual ip address */ ++struct nfs4_pnfs_ds { ++ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ ++ u32 ds_ip_addr; ++ u32 ds_port; ++ struct nfs_client *ds_clp; ++ atomic_t ds_count; ++ char r_addr[29]; ++}; ++ ++struct nfs4_file_layout_dsaddr { ++ struct nfs4_deviceid deviceid; ++ u32 stripe_count; ++ u8 *stripe_indices; ++ u32 ds_num; ++ struct nfs4_pnfs_ds *ds_list[1]; ++}; ++ ++struct nfs4_pnfs_dev_hlist { ++ rwlock_t dev_lock; ++ struct hlist_head dev_list[NFS4_PNFS_DEV_HASH_SIZE]; ++}; ++ ++struct nfs4_filelayout_segment { ++ u32 stripe_type; ++ u32 commit_through_mds; ++ u32 stripe_unit; ++ u32 first_stripe_index; ++ u64 pattern_offset; ++ struct pnfs_deviceid dev_id; ++ unsigned int num_fh; ++ struct nfs_fh *fh_array; ++}; ++ ++struct nfs4_filelayout { ++ struct pnfs_layout_hdr fl_layout; ++ u32 stripe_unit; ++}; ++ ++extern struct nfs_fh * ++nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, loff_t offset); ++ ++static inline struct nfs4_filelayout * ++FILE_LO(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct nfs4_filelayout, fl_layout); ++} ++ ++extern struct pnfs_client_operations *pnfs_callback_ops; ++ ++extern void nfs4_fl_free_deviceid_callback(struct kref *); ++extern void print_ds(struct nfs4_pnfs_ds *ds); ++char *deviceid_fmt(const struct pnfs_deviceid *dev_id); ++u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, loff_t offset); ++struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, ++ u32 ds_idx); ++extern struct nfs4_file_layout_dsaddr * ++nfs4_fl_find_get_deviceid(struct nfs_client *, struct pnfs_deviceid *dev_id); ++struct nfs4_file_layout_dsaddr * ++get_device_info(struct inode *inode, struct pnfs_deviceid *dev_id); ++ ++#endif /* FS_NFS_NFS4FILELAYOUT_H */ +diff -up linux-2.6.35.noarch/fs/nfs/nfs4_fs.h.orig linux-2.6.35.noarch/fs/nfs/nfs4_fs.h +--- linux-2.6.35.noarch/fs/nfs/nfs4_fs.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/nfs4_fs.h 2010-08-31 21:11:40.912160566 -0400 +@@ -45,8 +45,28 @@ enum nfs4_client_state { + NFS4CLNT_RECLAIM_NOGRACE, + NFS4CLNT_DELEGRETURN, + NFS4CLNT_SESSION_RESET, +- NFS4CLNT_SESSION_DRAINING, + NFS4CLNT_RECALL_SLOT, ++ NFS4CLNT_LAYOUT_RECALL, ++}; ++ ++enum nfs4_session_state { ++ NFS4_SESSION_INITING, ++ NFS4_SESSION_DRAINING, ++}; ++ ++struct nfs4_minor_version_ops { ++ u32 minor_version; ++ ++ int (*call_sync)(struct nfs_server *server, ++ struct rpc_message *msg, ++ struct nfs4_sequence_args *args, ++ struct nfs4_sequence_res *res, ++ int cache_reply); ++ int (*validate_stateid)(struct nfs_delegation *, ++ const nfs4_stateid *); ++ const struct nfs4_state_recovery_ops *reboot_recovery_ops; ++ const struct nfs4_state_recovery_ops *nograce_recovery_ops; ++ const struct nfs4_state_maintenance_ops *state_renewal_ops; + }; + + /* +@@ -89,7 +109,6 @@ struct nfs_unique_id { + */ + struct nfs4_state_owner { + 
struct nfs_unique_id so_owner_id; +- struct nfs_client *so_client; + struct nfs_server *so_server; + struct rb_node so_client_node; + +@@ -99,7 +118,6 @@ struct nfs4_state_owner { + atomic_t so_count; + unsigned long so_flags; + struct list_head so_states; +- struct list_head so_delegations; + struct nfs_seqid_counter so_seqid; + struct rpc_sequence so_sequence; + }; +@@ -125,10 +143,20 @@ enum { + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + ++struct nfs4_lock_owner { ++ unsigned int lo_type; ++#define NFS4_ANY_LOCK_TYPE (0U) ++#define NFS4_FLOCK_LOCK_TYPE (1U << 0) ++#define NFS4_POSIX_LOCK_TYPE (1U << 1) ++ union { ++ fl_owner_t posix_owner; ++ pid_t flock_owner; ++ } lo_u; ++}; ++ + struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ +- fl_owner_t ls_owner; /* POSIX lock owner */ + #define NFS_LOCK_INITIALIZED 1 + int ls_flags; + struct nfs_seqid_counter ls_seqid; +@@ -136,6 +164,7 @@ struct nfs4_lock_state { + struct nfs_unique_id ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; ++ struct nfs4_lock_owner ls_owner; + }; + + /* bits for nfs4_state->flags */ +@@ -219,22 +248,34 @@ extern int nfs4_open_revalidate(struct i + extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); + extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, + struct nfs4_fs_locations *fs_locations, struct page *page); ++extern void nfs4_release_lockowner(const struct nfs4_lock_state *); + +-extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; +-extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; + #if defined(CONFIG_NFS_V4_1) +-extern int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return server->nfs_client->cl_session; ++} ++ ++extern int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); + extern void nfs4_destroy_session(struct nfs4_session *session); + extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); ++extern int nfs4_proc_exchange_id(struct nfs_client *, struct rpc_cred *); + extern int nfs4_proc_create_session(struct nfs_client *); + extern int nfs4_proc_destroy_session(struct nfs4_session *); + extern int nfs4_init_session(struct nfs_server *server); + extern int nfs4_proc_get_lease_time(struct nfs_client *clp, + struct nfs_fsinfo *fsinfo); + #else /* CONFIG_NFS_v4_1 */ +-static inline int nfs4_setup_sequence(struct nfs_client *clp, ++static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) ++{ ++ return NULL; ++} ++ ++static inline int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task) + { +@@ -247,12 +288,12 @@ static inline int nfs4_init_session(stru + } + #endif /* CONFIG_NFS_V4_1 */ + +-extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; ++extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; + + extern const u32 nfs4_fattr_bitmap[2]; + extern const u32 nfs4_statfs_bitmap[2]; + extern const u32 nfs4_pathconf_bitmap[2]; +-extern const u32 nfs4_fsinfo_bitmap[2]; ++extern const u32 nfs4_fsinfo_bitmap[3]; + extern const u32 
nfs4_fs_locations_bitmap[2]; + + /* nfs4renewd.c */ +@@ -284,7 +325,7 @@ extern void nfs41_handle_sequence_flag_e + extern void nfs41_handle_recall_slot(struct nfs_client *clp); + extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); + extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); ++extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); + + extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); + extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); +@@ -293,6 +334,7 @@ extern void nfs_increment_lock_seqid(int + extern void nfs_release_seqid(struct nfs_seqid *seqid); + extern void nfs_free_seqid(struct nfs_seqid *seqid); + ++/* write.c */ + extern const nfs4_stateid zero_stateid; + + /* nfs4xdr.c */ +diff -up linux-2.6.35.noarch/fs/nfs/nfs4proc.c.orig linux-2.6.35.noarch/fs/nfs/nfs4proc.c +--- linux-2.6.35.noarch/fs/nfs/nfs4proc.c.orig 2010-08-31 19:12:23.916150362 -0400 ++++ linux-2.6.35.noarch/fs/nfs/nfs4proc.c 2010-08-31 21:11:40.919035214 -0400 +@@ -49,12 +49,14 @@ + #include + #include + #include ++#include + + #include "nfs4_fs.h" + #include "delegation.h" + #include "internal.h" + #include "iostat.h" + #include "callback.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_PROC + +@@ -67,7 +69,7 @@ struct nfs4_opendata; + static int _nfs4_proc_open(struct nfs4_opendata *data); + static int _nfs4_recover_proc_open(struct nfs4_opendata *data); + static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); +-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); ++static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, struct nfs_client *); + static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, +@@ -125,11 +127,16 @@ const u32 nfs4_pathconf_bitmap[2] = { + 0 + }; + +-const u32 nfs4_fsinfo_bitmap[2] = { FATTR4_WORD0_MAXFILESIZE ++const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE + | FATTR4_WORD0_MAXREAD + | FATTR4_WORD0_MAXWRITE + | FATTR4_WORD0_LEASE_TIME, ++#ifdef CONFIG_NFS_V4_1 ++ FATTR4_WORD1_FS_LAYOUT_TYPES, ++ FATTR4_WORD2_LAYOUT_BLKSIZE ++#else /* CONFIG_NFS_V4_1 */ + 0 ++#endif /* CONFIG_NFS_V4_1 */ + }; + + const u32 nfs4_fs_locations_bitmap[2] = { +@@ -356,7 +363,7 @@ static void nfs41_check_drain_session_co + { + struct rpc_task *task; + +- if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { ++ if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); + if (task) + rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); +@@ -370,12 +377,11 @@ static void nfs41_check_drain_session_co + complete(&ses->complete); + } + +-static void nfs41_sequence_free_slot(const struct nfs_client *clp, +- struct nfs4_sequence_res *res) ++static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) + { + struct nfs4_slot_table *tbl; + +- tbl = &clp->cl_session->fc_slot_table; ++ tbl = &res->sr_session->fc_slot_table; + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { + /* just wake up the next guy waiting since + * we 
may have not consumed a slot after all */ +@@ -385,18 +391,17 @@ static void nfs41_sequence_free_slot(con + + spin_lock(&tbl->slot_tbl_lock); + nfs4_free_slot(tbl, res->sr_slotid); +- nfs41_check_drain_session_complete(clp->cl_session); ++ nfs41_check_drain_session_complete(res->sr_session); + spin_unlock(&tbl->slot_tbl_lock); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + } + +-static void nfs41_sequence_done(struct nfs_client *clp, +- struct nfs4_sequence_res *res, +- int rpc_status) ++static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) + { + unsigned long timestamp; + struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; ++ struct nfs_client *clp; + + /* + * sr_status remains 1 if an RPC level error occurred. The server +@@ -411,13 +416,16 @@ static void nfs41_sequence_done(struct n + if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) + goto out; + ++ tbl = &res->sr_session->fc_slot_table; ++ slot = tbl->slots + res->sr_slotid; ++ + /* Check the SEQUENCE operation status */ +- if (res->sr_status == 0) { +- tbl = &clp->cl_session->fc_slot_table; +- slot = tbl->slots + res->sr_slotid; ++ switch (res->sr_status) { ++ case 0: + /* Update the slot's sequence and clientid lease timer */ + ++slot->seq_nr; + timestamp = res->sr_renewal_time; ++ clp = res->sr_session->clp; + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal, timestamp)) + clp->cl_last_renewal = timestamp; +@@ -425,11 +433,39 @@ static void nfs41_sequence_done(struct n + /* Check sequence flags */ + if (atomic_read(&clp->cl_count) > 1) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ++ break; ++ case -NFS4ERR_DELAY: ++ /* The server detected a resend of the RPC call and ++ * returned NFS4ERR_DELAY as per Section 2.10.6.2 ++ * of RFC5661. ++ */ ++ dprintk("%s: slot=%d seq=%d: Operation in progress\n", ++ __func__, res->sr_slotid, slot->seq_nr); ++ goto out_retry; ++ default: ++ /* Just update the slot sequence no. */ ++ ++slot->seq_nr; + } + out: + /* The session may be reset by one of the error handlers. */ + dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); +- nfs41_sequence_free_slot(clp, res); ++ nfs41_sequence_free_slot(res); ++ return 1; ++out_retry: ++ rpc_delay(task, NFS4_POLL_RETRY_MAX); ++ rpc_restart_call(task); ++ /* FIXME: rpc_restart_call() should be made to return success/fail */ ++ if (RPC_ASSASSINATED(task)) ++ goto out; ++ return 0; ++} ++ ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ if (res->sr_session == NULL) ++ return 1; ++ return nfs41_sequence_done(task, res); + } + + /* +@@ -480,12 +516,11 @@ static int nfs41_setup_sequence(struct n + if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) + return 0; + +- memset(res, 0, sizeof(*res)); + res->sr_slotid = NFS4_MAX_SLOT_TABLE; + tbl = &session->fc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); +- if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && ++ if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { + /* + * The state manager will wait until the slot table is empty. +@@ -525,6 +560,7 @@ static int nfs41_setup_sequence(struct n + res->sr_session = session; + res->sr_slotid = slotid; + res->sr_renewal_time = jiffies; ++ res->sr_status_flags = 0; + /* + * sr_status is only set in decode_sequence, and so will remain + * set to 1 if an rpc level failure occurs. 
+@@ -533,33 +569,36 @@ static int nfs41_setup_sequence(struct n + return 0; + } + +-int nfs4_setup_sequence(struct nfs_client *clp, ++int nfs4_setup_sequence(const struct nfs_server *server, ++ struct nfs4_session *ds_session, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply, + struct rpc_task *task) + { ++ struct nfs4_session *session = nfs4_get_session(server); + int ret = 0; + ++ if (ds_session) ++ session = ds_session; ++ if (session == NULL) { ++ args->sa_session = NULL; ++ res->sr_session = NULL; ++ goto out; ++ } ++ + dprintk("--> %s clp %p session %p sr_slotid %d\n", +- __func__, clp, clp->cl_session, res->sr_slotid); ++ __func__, session->clp, session, res->sr_slotid); + +- if (!nfs4_has_session(clp)) +- goto out; +- ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, ++ ret = nfs41_setup_sequence(session, args, res, cache_reply, + task); +- if (ret && ret != -EAGAIN) { +- /* terminate rpc task */ +- task->tk_status = ret; +- task->tk_action = NULL; +- } + out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; + } + + struct nfs41_call_sync_data { +- struct nfs_client *clp; ++ const struct nfs_server *seq_server; + struct nfs4_sequence_args *seq_args; + struct nfs4_sequence_res *seq_res; + int cache_reply; +@@ -569,9 +608,9 @@ static void nfs41_call_sync_prepare(stru + { + struct nfs41_call_sync_data *data = calldata; + +- dprintk("--> %s data->clp->cl_session %p\n", __func__, +- data->clp->cl_session); +- if (nfs4_setup_sequence(data->clp, data->seq_args, ++ dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); ++ ++ if (nfs4_setup_sequence(data->seq_server, NULL, data->seq_args, + data->seq_res, data->cache_reply, task)) + return; + rpc_call_start(task); +@@ -587,7 +626,7 @@ static void nfs41_call_sync_done(struct + { + struct nfs41_call_sync_data *data = calldata; + +- nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); ++ nfs41_sequence_done(task, data->seq_res); + } + + struct rpc_call_ops nfs41_call_sync_ops = { +@@ -600,8 +639,7 @@ struct rpc_call_ops nfs41_call_priv_sync + .rpc_call_done = nfs41_call_sync_done, + }; + +-static int nfs4_call_sync_sequence(struct nfs_client *clp, +- struct rpc_clnt *clnt, ++static int nfs4_call_sync_sequence(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, +@@ -611,13 +649,13 @@ static int nfs4_call_sync_sequence(struc + int ret; + struct rpc_task *task; + struct nfs41_call_sync_data data = { +- .clp = clp, ++ .seq_server = server, + .seq_args = args, + .seq_res = res, + .cache_reply = cache_reply, + }; + struct rpc_task_setup task_setup = { +- .rpc_client = clnt, ++ .rpc_client = server->client, + .rpc_message = msg, + .callback_ops = &nfs41_call_sync_ops, + .callback_data = &data +@@ -642,10 +680,15 @@ int _nfs4_call_sync_session(struct nfs_s + struct nfs4_sequence_res *res, + int cache_reply) + { +- return nfs4_call_sync_sequence(server->nfs_client, server->client, +- msg, args, res, cache_reply, 0); ++ return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); + } + ++#else ++static int nfs4_sequence_done(struct rpc_task *task, ++ struct nfs4_sequence_res *res) ++{ ++ return 1; ++} + #endif /* CONFIG_NFS_V4_1 */ + + int _nfs4_call_sync(struct nfs_server *server, +@@ -659,18 +702,9 @@ int _nfs4_call_sync(struct nfs_server *s + } + + #define nfs4_call_sync(server, msg, args, res, cache_reply) \ +- (server)->nfs_client->cl_call_sync((server), (msg), 
&(args)->seq_args, \ ++ (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ + &(res)->seq_res, (cache_reply)) + +-static void nfs4_sequence_done(const struct nfs_server *server, +- struct nfs4_sequence_res *res, int rpc_status) +-{ +-#ifdef CONFIG_NFS_V4_1 +- if (nfs4_has_session(server->nfs_client)) +- nfs41_sequence_done(server->nfs_client, res, rpc_status); +-#endif /* CONFIG_NFS_V4_1 */ +-} +- + static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) + { + struct nfs_inode *nfsi = NFS_I(dir); +@@ -745,19 +779,14 @@ static struct nfs4_opendata *nfs4_openda + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; +- if (flags & O_EXCL) { +- if (nfs4_has_persistent_session(server->nfs_client)) { +- /* GUARDED */ +- p->o_arg.u.attrs = &p->attrs; +- memcpy(&p->attrs, attrs, sizeof(p->attrs)); +- } else { /* EXCLUSIVE4_1 */ +- u32 *s = (u32 *) p->o_arg.u.verifier.data; +- s[0] = jiffies; +- s[1] = current->pid; +- } +- } else if (flags & O_CREAT) { ++ if (flags & O_CREAT) { ++ u32 *s; ++ + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); ++ s = (u32 *) p->o_arg.u.verifier.data; ++ s[0] = jiffies; ++ s[1] = current->pid; + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; +@@ -851,8 +880,10 @@ static void update_open_stateflags(struc + static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) + { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data)); +- memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data)); ++ memcpy(state->stateid.u.data, stateid->u.data, ++ sizeof(state->stateid.u.data)); ++ memcpy(state->open_stateid.u.data, stateid->u.data, ++ sizeof(state->open_stateid.u.data)); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); +@@ -880,7 +911,8 @@ static void __update_open_stateid(struct + */ + write_seqlock(&state->seqlock); + if (deleg_stateid != NULL) { +- memcpy(state->stateid.data, deleg_stateid->data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, deleg_stateid->u.data, ++ sizeof(state->stateid.u.data)); + set_bit(NFS_DELEGATED_STATE, &state->flags); + } + if (open_stateid != NULL) +@@ -911,7 +943,8 @@ static int update_open_stateid(struct nf + + if (delegation == NULL) + delegation = &deleg_cur->stateid; +- else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0) ++ else if (memcmp(deleg_cur->stateid.u.data, delegation->u.data, ++ NFS4_STATEID_SIZE) != 0) + goto no_delegation_unlock; + + nfs_mark_delegation_referenced(deleg_cur); +@@ -973,7 +1006,8 @@ static struct nfs4_state *nfs4_try_open_ + break; + } + /* Save the delegation */ +- memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); ++ memcpy(stateid.u.data, delegation->stateid.u.data, ++ sizeof(stateid.u.data)); + rcu_read_unlock(); + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); + if (ret != 0) +@@ -1127,10 +1161,13 @@ static int nfs4_open_recover(struct nfs4 + * Check if we need to update the current stateid. 
+ */ + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0 && +- memcmp(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)) != 0) { ++ memcmp(state->stateid.u.data, state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)) != 0) { + write_seqlock(&state->seqlock); + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) +- memcpy(state->stateid.data, state->open_stateid.data, sizeof(state->stateid.data)); ++ memcpy(state->stateid.u.data, ++ state->open_stateid.u.data, ++ sizeof(state->stateid.u.data)); + write_sequnlock(&state->seqlock); + } + return 0; +@@ -1199,8 +1236,8 @@ static int _nfs4_open_delegation_recall( + if (IS_ERR(opendata)) + return PTR_ERR(opendata); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; +- memcpy(opendata->o_arg.u.delegation.data, stateid->data, +- sizeof(opendata->o_arg.u.delegation.data)); ++ memcpy(opendata->o_arg.u.delegation.u.data, stateid->u.data, ++ sizeof(opendata->o_arg.u.delegation.u.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_put(opendata); + return ret; +@@ -1258,8 +1295,8 @@ static void nfs4_open_confirm_done(struc + if (RPC_ASSASSINATED(task)) + return; + if (data->rpc_status == 0) { +- memcpy(data->o_res.stateid.data, data->c_res.stateid.data, +- sizeof(data->o_res.stateid.data)); ++ memcpy(data->o_res.stateid.u.data, data->c_res.stateid.u.data, ++ sizeof(data->o_res.stateid.u.data)); + nfs_confirm_seqid(&data->owner->so_seqid, 0); + renew_lease(data->o_res.server, data->timestamp); + data->rpc_done = 1; +@@ -1356,13 +1393,13 @@ static void nfs4_open_prepare(struct rpc + } + /* Update sequence id. */ + data->o_arg.id = sp->so_owner_id.id; +- data->o_arg.clientid = sp->so_client->cl_clientid; ++ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); + } + data->timestamp = jiffies; +- if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ++ if (nfs4_setup_sequence(data->o_arg.server, NULL, + &data->o_arg.seq_args, + &data->o_res.seq_res, 1, task)) + return; +@@ -1385,8 +1422,8 @@ static void nfs4_open_done(struct rpc_ta + + data->rpc_status = task->tk_status; + +- nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, +- task->tk_status); ++ if (!nfs4_sequence_done(task, &data->o_res.seq_res)) ++ return; + + if (RPC_ASSASSINATED(task)) + return; +@@ -1539,9 +1576,8 @@ static int _nfs4_proc_open(struct nfs4_o + return 0; + } + +-static int nfs4_recover_expired_lease(struct nfs_server *server) ++int nfs4_recover_expired_lease(struct nfs_client *clp) + { +- struct nfs_client *clp = server->nfs_client; + unsigned int loop; + int ret; + +@@ -1557,6 +1593,7 @@ static int nfs4_recover_expired_lease(st + } + return ret; + } ++EXPORT_SYMBOL(nfs4_recover_expired_lease); + + /* + * OPEN_EXPIRED: +@@ -1646,7 +1683,7 @@ static int _nfs4_do_open(struct inode *d + dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); + goto out_err; + } +- status = nfs4_recover_expired_lease(server); ++ status = nfs4_recover_expired_lease(server->nfs_client); + if (status != 0) + goto err_put_state_owner; + if (path->dentry->d_inode != NULL) +@@ -1773,7 +1810,7 @@ static int _nfs4_do_setattr(struct inode + if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { + /* Use that stateid */ + } else if (state != NULL) { +- nfs4_copy_stateid(&arg.stateid, state, current->files); ++ nfs4_copy_stateid(&arg.stateid, state, 
current->files, current->tgid);
+ } else
+ memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
+
+@@ -1838,7 +1875,8 @@ static void nfs4_close_done(struct rpc_t
+ struct nfs4_state *state = calldata->state;
+ struct nfs_server *server = NFS_SERVER(calldata->inode);
+
+- nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status);
++ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
++ return;
+ if (RPC_ASSASSINATED(task))
+ return;
+ /* hmm. we are done with the inode, and in the process of freeing
+@@ -1858,7 +1896,7 @@ static void nfs4_close_done(struct rpc_t
+ if (calldata->arg.fmode == 0)
+ break;
+ default:
+- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
++ if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
+ nfs_release_seqid(calldata->arg.seqid);
+@@ -1903,7 +1941,7 @@ static void nfs4_close_prepare(struct rp
+
+ nfs_fattr_init(calldata->res.fattr);
+ calldata->timestamp = jiffies;
+- if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client,
++ if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), NULL,
+ &calldata->arg.seq_args, &calldata->res.seq_res,
+ 1, task))
+ return;
+@@ -2325,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry,
+ struct nfs4_state *state = NULL;
+ int status;
+
++ if (pnfs_ld_layoutret_on_setattr(inode))
++ pnfs_return_layout(inode, NULL, NULL, RETURN_FILE, true);
++
+ nfs_fattr_init(fattr);
+
+ /* Search for an existing open(O_WRITE) file */
+@@ -2650,8 +2691,9 @@ static int nfs4_proc_unlink_done(struct
+ {
+ struct nfs_removeres *res = task->tk_msg.rpc_resp;
+
+- nfs4_sequence_done(res->server, &res->seq_res, task->tk_status);
+- if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
++ if (!nfs4_sequence_done(task, &res->seq_res))
++ return 0;
++ if (nfs4_async_handle_error(task, res->server, NULL, NULL) == -EAGAIN)
+ return 0;
+ update_changeattr(dir, &res->cinfo);
+ nfs_post_op_update_inode(dir, res->dir_attr);
+@@ -3092,18 +3134,31 @@ static int nfs4_proc_pathconf(struct nfs
+ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+ {
+ struct nfs_server *server = NFS_SERVER(data->inode);
++ struct nfs_client *client = server->nfs_client;
+
+ dprintk("--> %s\n", __func__);
+
+- nfs4_sequence_done(server, &data->res.seq_res, task->tk_status);
++#ifdef CONFIG_NFS_V4_1
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
++ return 0;
++
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS read\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
++
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return -EAGAIN;
+
+- if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
+- nfs_restart_rpc(task, server->nfs_client);
++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++ nfs_restart_rpc(task, client);
+ return -EAGAIN;
+ }
+
+ nfs_invalidate_atime(data->inode);
+- if (task->tk_status > 0)
++ if (task->tk_status > 0 && client == server->nfs_client)
+ renew_lease(server, data->timestamp);
+ return 0;
+ }
+@@ -3114,20 +3169,56 @@ static void nfs4_proc_read_setup(struct
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
+ }
+
++static void pnfs4_update_write_done(struct nfs_inode *nfsi, struct nfs_write_data *data)
++{
++#ifdef CONFIG_NFS_V4_1
++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count);
++ pnfs_need_layoutcommit(nfsi, data->args.context);
++#endif /* CONFIG_NFS_V4_1 */
++}
++
+ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ struct inode *inode = data->inode;
+-
+- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+- task->tk_status);
++ struct nfs_server *server = NFS_SERVER(inode);
++ struct nfs_client *client = server->nfs_client;
+
+- if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
+- nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return -EAGAIN;
++
++#ifdef CONFIG_NFS_V4_1
++ /* restore original count after retry? */
++ if (data->pdata.orig_count) {
++ dprintk("%s: restoring original count %u\n", __func__,
++ data->pdata.orig_count);
++ data->args.count = data->pdata.orig_count;
++ }
++
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
++ return 0;
++
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS write\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
++
++ if (nfs4_async_handle_error(task, server, data->args.context->state, client) == -EAGAIN) {
++ nfs_restart_rpc(task, client);
+ return -EAGAIN;
+ }
++
++ /*
++ * MDS write: renew lease
++ * DS write: update lastbyte written, mark for layout commit
++ */
+ if (task->tk_status >= 0) {
+- renew_lease(NFS_SERVER(inode), data->timestamp);
+- nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++ if (client == server->nfs_client) {
++ renew_lease(server, data->timestamp);
++ nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
++ } else
++ pnfs4_update_write_done(NFS_I(inode), data);
+ }
+ return 0;
+ }
+@@ -3140,20 +3231,42 @@ static void nfs4_proc_write_setup(struct
+ data->res.server = server;
+ data->timestamp = jiffies;
+
++#ifdef CONFIG_NFS_V4_1
++ /* writes to DS use pnfs vector */
++ if (data->fldata.ds_nfs_client) {
++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_WRITE];
++ return;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
+ }
+
+ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
+ {
+ struct inode *inode = data->inode;
+-
+- nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res,
+- task->tk_status);
+- if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
++ struct nfs_server *server = NFS_SERVER(data->inode);
++ struct nfs_client *client = server->nfs_client;
++
++#ifdef CONFIG_NFS_V4_1
++ if (data->pdata.pnfsflags & PNFS_NO_RPC)
++ return 0;
++
++ /* Is this a DS session */
++ if (data->fldata.ds_nfs_client) {
++ dprintk("%s DS commit\n", __func__);
++ client = data->fldata.ds_nfs_client;
++ }
++#endif /* CONFIG_NFS_V4_1 */
++
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return -EAGAIN;
++
++ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL, NULL) == -EAGAIN) {
+ nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
+ return -EAGAIN;
+ }
+- nfs_refresh_inode(inode, data->res.fattr);
++ if (client == server->nfs_client)
++ nfs_refresh_inode(inode, data->res.fattr);
+ return 0;
+ }
+
+@@ -3163,6 +3276,12 @@ static void nfs4_proc_commit_setup(struc
+
+ data->args.bitmask = server->cache_consistency_bitmask;
+ data->res.server = server;
++#if defined(CONFIG_NFS_V4_1)
++ if (data->fldata.ds_nfs_client) {
++ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_PNFS_COMMIT];
++ return;
++ }
++#endif /* CONFIG_NFS_V4_1 */
+ msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
+ }
+
+@@ -3466,9 +3585,12 @@ static int nfs4_proc_set_acl(struct inod
+ }
+
+ static int
+-_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state)
++nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state, struct nfs_client *clp)
+ {
+- if (!clp || task->tk_status >= 0)
++ if (!clp)
++ clp = server->nfs_client;
++
++ if (task->tk_status >= 0)
+ return 0;
+ switch(task->tk_status) {
+ case -NFS4ERR_ADMIN_REVOKED:
+@@ -3493,8 +3615,9 @@ _nfs4_async_handle_error(struct rpc_task
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+- dprintk("%s ERROR %d, Reset session\n", __func__,
+- task->tk_status);
++ dprintk("%s ERROR %d, Reset session. Exchangeid "
++ "flags 0x%x\n", __func__, task->tk_status,
++ clp->cl_exchange_flags);
+ nfs4_schedule_state_recovery(clp);
+ task->tk_status = 0;
+ return -EAGAIN;
+@@ -3514,6 +3637,8 @@ _nfs4_async_handle_error(struct rpc_task
+ task->tk_status = nfs4_map_errors(task->tk_status);
+ return 0;
+ do_state_recovery:
++ if (is_ds_only_client(clp))
++ return 0;
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ nfs4_schedule_state_recovery(clp);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+@@ -3522,12 +3647,6 @@ do_state_recovery:
+ return -EAGAIN;
+ }
+
+-static int
+-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+-{
+- return _nfs4_async_handle_error(task, server, server->nfs_client, state);
+-}
+-
+ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
+ unsigned short port, struct rpc_cred *cred,
+ struct nfs4_setclientid_res *res)
+@@ -3643,8 +3762,8 @@ static void nfs4_delegreturn_done(struct
+ {
+ struct nfs4_delegreturndata *data = calldata;
+
+- nfs4_sequence_done(data->res.server, &data->res.seq_res,
+- task->tk_status);
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return;
+
+ switch (task->tk_status) {
+ case -NFS4ERR_STALE_STATEID:
+@@ -3653,8 +3772,8 @@ static void nfs4_delegreturn_done(struct
+ renew_lease(data->res.server, data->timestamp);
+ break;
+ default:
+- if (nfs4_async_handle_error(task, data->res.server, NULL) ==
+- -EAGAIN) {
++ if (nfs4_async_handle_error(task, data->res.server, NULL, NULL)
++ == -EAGAIN) {
+ nfs_restart_rpc(task, data->res.server->nfs_client);
+ return;
+ }
+@@ -3674,7 +3793,7 @@ static void nfs4_delegreturn_prepare(str
+
+ d_data = (struct nfs4_delegreturndata *)data;
+
+- if (nfs4_setup_sequence(d_data->res.server->nfs_client,
++ if (nfs4_setup_sequence(d_data->res.server, NULL,
+ &d_data->args.seq_args,
+ &d_data->res.seq_res, 1, task))
+ return;
+@@ -3894,15 +4013,16 @@ static void nfs4_locku_done(struct rpc_t
+ {
+ struct nfs4_unlockdata *calldata = data;
+
+- nfs4_sequence_done(calldata->server, &calldata->res.seq_res,
+- task->tk_status);
++ if (!nfs4_sequence_done(task, &calldata->res.seq_res))
++ return;
+ if (RPC_ASSASSINATED(task))
+ return;
+ switch (task->tk_status) {
+ case 0:
+- memcpy(calldata->lsp->ls_stateid.data,
+- calldata->res.stateid.data,
+- sizeof(calldata->lsp->ls_stateid.data));
++ memcpy(calldata->lsp->ls_stateid.u.data,
++ calldata->res.stateid.u.data,
++ sizeof(calldata->lsp->ls_stateid.u.
++ data));
+ renew_lease(calldata->server, calldata->timestamp);
+ break;
+ case -NFS4ERR_BAD_STATEID:
+@@ -3911,7 +4031,7 @@ static void nfs4_locku_done(struct rpc_t
+ case -NFS4ERR_EXPIRED:
+ break;
+ default:
+- if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
++ if (nfs4_async_handle_error(task, calldata->server, NULL, NULL) == -EAGAIN)
+ nfs_restart_rpc(task,
+ calldata->server->nfs_client);
+ }
+@@ -3929,7 +4049,7 @@ static void nfs4_locku_prepare(struct rp
+ return;
+ }
+ calldata->timestamp = jiffies;
+- if (nfs4_setup_sequence(calldata->server->nfs_client,
++ if (nfs4_setup_sequence(calldata->server, NULL,
+ &calldata->arg.seq_args,
+ &calldata->res.seq_res, 1, task))
+ return;
+@@ -4084,7 +4204,8 @@ static void nfs4_lock_prepare(struct rpc
+ } else
+ data->arg.new_lock_owner = 0;
+ data->timestamp = jiffies;
+- if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args,
++ if (nfs4_setup_sequence(data->server, NULL,
++ &data->arg.seq_args,
+ &data->res.seq_res, 1, task))
+ return;
+ rpc_call_start(task);
+@@ -4103,8 +4224,8 @@ static void nfs4_lock_done(struct rpc_ta
+
+ dprintk("%s: begin!\n", __func__);
+
+- nfs4_sequence_done(data->server, &data->res.seq_res,
+- task->tk_status);
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return;
+
+ data->rpc_status = task->tk_status;
+ if (RPC_ASSASSINATED(task))
+@@ -4116,8 +4237,8 @@ static void nfs4_lock_done(struct rpc_ta
+ goto out;
+ }
+ if (data->rpc_status == 0) {
+- memcpy(data->lsp->ls_stateid.data, data->res.stateid.data,
+- sizeof(data->lsp->ls_stateid.data));
++ memcpy(data->lsp->ls_stateid.u.data, data->res.stateid.u.data,
++ sizeof(data->lsp->ls_stateid.u.data));
+ data->lsp->ls_flags |= NFS_LOCK_INITIALIZED;
+ renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
+ }
+@@ -4426,6 +4547,34 @@ out:
+ return err;
+ }
+
++static void nfs4_release_lockowner_release(void *calldata)
++{
++ kfree(calldata);
++}
++
++const struct rpc_call_ops nfs4_release_lockowner_ops = {
++ .rpc_release = nfs4_release_lockowner_release,
++};
++
++void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
++{
++ struct nfs_server *server = lsp->ls_state->owner->so_server;
++ struct nfs_release_lockowner_args *args;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
++ };
++
++ if (server->nfs_client->cl_mvops->minor_version != 0)
++ return;
++ args = kmalloc(sizeof(*args), GFP_NOFS);
++ if (!args)
++ return;
++ args->lock_owner.clientid = server->nfs_client->cl_clientid;
++ args->lock_owner.id = lsp->ls_id.id;
++ msg.rpc_argp = args;
++ rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
++}
++
+ #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
+
+ int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
+@@ -4528,7 +4677,7 @@ int nfs4_proc_exchange_id(struct nfs_cli
+ nfs4_verifier verifier;
+ struct nfs41_exchange_id_args args = {
+ .client = clp,
+- .flags = clp->cl_exchange_flags,
++ .flags = clp->cl_exchange_flags & ~EXCHGID4_FLAG_CONFIRMED_R,
+ };
+ struct nfs41_exchange_id_res res = {
+ .client = clp,
+@@ -4576,6 +4725,7 @@ int nfs4_proc_exchange_id(struct nfs_cli
+ dprintk("<-- %s status= %d\n", __func__, status);
+ return status;
+ }
++EXPORT_SYMBOL(nfs4_proc_exchange_id);
+
+ struct nfs4_get_lease_time_data {
+ struct nfs4_get_lease_time_args *args;
+@@ -4613,7 +4763,8 @@ static void nfs4_get_lease_time_done(str
+ (struct nfs4_get_lease_time_data *)calldata;
+
+ dprintk("--> %s\n", __func__);
+- nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status);
++ if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
++ return;
+ switch (task->tk_status) {
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+@@ -4807,13 +4958,6 @@ struct nfs4_session *nfs4_alloc_session(
+ if (!session)
+ return NULL;
+
+- /*
+- * The create session reply races with the server back
+- * channel probe. Mark the client NFS_CS_SESSION_INITING
+- * so that the client back channel can find the
+- * nfs_client struct
+- */
+- clp->cl_cons_state = NFS_CS_SESSION_INITING;
+ init_completion(&session->complete);
+
+ tbl = &session->fc_slot_table;
+@@ -4826,6 +4970,8 @@ struct nfs4_session *nfs4_alloc_session(
+ spin_lock_init(&tbl->slot_tbl_lock);
+ rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
+
++ session->session_state = 1<<NFS4_SESSION_INITING;
++
+ session->clp = clp;
+ return session;
+ }
+@@ -5042,6 +5188,10 @@ int nfs4_init_session(struct nfs_server
+ if (!nfs4_has_session(clp))
+ return 0;
+
++ session = clp->cl_session;
++ if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
++ return 0;
++
+ rsize = server->rsize;
+ if (rsize == 0)
+ rsize = NFS_MAX_FILE_IO_SIZE;
+@@ -5049,11 +5199,10 @@ int nfs4_init_session(struct nfs_server
+ if (wsize == 0)
+ wsize = NFS_MAX_FILE_IO_SIZE;
+
+- session = clp->cl_session;
+ session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
+ session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
+
+- ret = nfs4_recover_expired_lease(server);
++ ret = nfs4_recover_expired_lease(server->nfs_client);
+ if (!ret)
+ ret = nfs4_check_client_ready(clp);
+ return ret;
+@@ -5062,69 +5211,70 @@ int nfs4_init_session(struct nfs_server
+ /*
+ * Renew the cl_session lease.
+ */
+-static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+-{
++struct nfs4_sequence_data {
++ struct nfs_client *clp;
+ struct nfs4_sequence_args args;
+ struct nfs4_sequence_res res;
+-
+- struct rpc_message msg = {
+- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+- .rpc_argp = &args,
+- .rpc_resp = &res,
+- .rpc_cred = cred,
+- };
+-
+- args.sa_cache_this = 0;
+-
+- return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
+- &res, args.sa_cache_this, 1);
+-}
++};
+
+ static void nfs41_sequence_release(void *data)
+ {
+- struct nfs_client *clp = (struct nfs_client *)data;
++ struct nfs4_sequence_data *calldata = data;
++ struct nfs_client *clp = calldata->clp;
+
+ if (atomic_read(&clp->cl_count) > 1)
+ nfs4_schedule_state_renewal(clp);
+ nfs_put_client(clp);
++ kfree(calldata);
++}
++
++static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
++{
++ switch(task->tk_status) {
++ case -NFS4ERR_DELAY:
++ case -EKEYEXPIRED:
++ rpc_delay(task, NFS4_POLL_RETRY_MAX);
++ return -EAGAIN;
++ default:
++ nfs4_schedule_state_recovery(clp);
++ }
++ return 0;
+ }
+
+ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
+ {
+- struct nfs_client *clp = (struct nfs_client *)data;
++ struct nfs4_sequence_data *calldata = data;
++ struct nfs_client *clp = calldata->clp;
+
+- nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status);
++ if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
++ return;
+
+ if (task->tk_status < 0) {
+ dprintk("%s ERROR %d\n", __func__, task->tk_status);
+ if (atomic_read(&clp->cl_count) == 1)
+ goto out;
+
+- if (_nfs4_async_handle_error(task, NULL, clp, NULL)
+- == -EAGAIN) {
+- nfs_restart_rpc(task, clp);
++ if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
++ rpc_restart_call_prepare(task);
+ return;
+ }
+ }
+ dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
+ out:
+- kfree(task->tk_msg.rpc_argp);
+- kfree(task->tk_msg.rpc_resp);
+-
+ dprintk("<-- %s\n", __func__);
+ }
+
+ static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
+ {
+- struct nfs_client *clp;
++ struct nfs4_sequence_data *calldata = data;
++ struct nfs_client *clp = calldata->clp;
+ struct nfs4_sequence_args *args;
+ struct nfs4_sequence_res *res;
+
+- clp = (struct nfs_client *)data;
+ args = task->tk_msg.rpc_argp;
+ res = task->tk_msg.rpc_resp;
+
+- if (nfs4_setup_sequence(clp, args, res, 0, task))
++ if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
+ return;
+ rpc_call_start(task);
+ }
+@@ -5135,32 +5285,67 @@ static const struct rpc_call_ops nfs41_s
+ .rpc_release = nfs41_sequence_release,
+ };
+
+-static int nfs41_proc_async_sequence(struct nfs_client *clp,
+- struct rpc_cred *cred)
++static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
+ {
+- struct nfs4_sequence_args *args;
+- struct nfs4_sequence_res *res;
++ struct nfs4_sequence_data *calldata;
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
+ .rpc_cred = cred,
+ };
++ struct rpc_task_setup task_setup_data = {
++ .rpc_client = clp->cl_rpcclient,
++ .rpc_message = &msg,
++ .callback_ops = &nfs41_sequence_ops,
++ .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
++ };
+
+ if (!atomic_inc_not_zero(&clp->cl_count))
+- return -EIO;
+- args = kzalloc(sizeof(*args), GFP_NOFS);
+- res = kzalloc(sizeof(*res), GFP_NOFS);
+- if (!args || !res) {
+- kfree(args);
+- kfree(res);
++ return ERR_PTR(-EIO);
++ calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
++ if (calldata == NULL) {
+ nfs_put_client(clp);
+- return -ENOMEM;
++ return ERR_PTR(-ENOMEM);
+ }
+- res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+- msg.rpc_argp = args;
+- msg.rpc_resp = res;
++ calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ msg.rpc_argp = &calldata->args;
++ msg.rpc_resp = &calldata->res;
++ calldata->clp = clp;
++ task_setup_data.callback_data = calldata;
+
+- return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT,
+- &nfs41_sequence_ops, (void *)clp);
++ return rpc_run_task(&task_setup_data);
++}
++
++static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
++{
++ struct rpc_task *task;
++ int ret = 0;
++
++ task = _nfs41_proc_sequence(clp, cred);
++ if (IS_ERR(task))
++ ret = PTR_ERR(task);
++ else
++ rpc_put_task(task);
++ dprintk("<-- %s status=%d\n", __func__, ret);
++ return ret;
++}
++
++static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
++{
++ struct rpc_task *task;
++ int ret;
++
++ task = _nfs41_proc_sequence(clp, cred);
++ if (IS_ERR(task)) {
++ ret = PTR_ERR(task);
++ goto out;
++ }
++ ret = rpc_wait_for_completion_task(task);
++ if (!ret)
++ ret = task->tk_status;
++ rpc_put_task(task);
++out:
++ dprintk("<-- %s status=%d\n", __func__, ret);
++ return ret;
+ }
+
+ struct nfs4_reclaim_complete_data {
+@@ -5174,13 +5359,31 @@ static void nfs4_reclaim_complete_prepar
+ struct nfs4_reclaim_complete_data *calldata = data;
+
+ rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+- if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
++ if (nfs41_setup_sequence(calldata->clp->cl_session,
++ &calldata->arg.seq_args,
+ &calldata->res.seq_res, 0, task))
+ return;
+
+ rpc_call_start(task);
+ }
+
++static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
++{
++ switch(task->tk_status) {
++ case 0:
++ case -NFS4ERR_COMPLETE_ALREADY:
++ case -NFS4ERR_WRONG_CRED: /* What to do here? */
++ break;
++ case -NFS4ERR_DELAY:
++ case -EKEYEXPIRED:
++ rpc_delay(task, NFS4_POLL_RETRY_MAX);
++ return -EAGAIN;
++ default:
++ nfs4_schedule_state_recovery(clp);
++ }
++ return 0;
++}
++
+ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
+ {
+ struct nfs4_reclaim_complete_data *calldata = data;
+@@ -5188,32 +5391,13 @@ static void nfs4_reclaim_complete_done(s
+ struct nfs4_sequence_res *res = &calldata->res.seq_res;
+
+ dprintk("--> %s\n", __func__);
+- nfs41_sequence_done(clp, res, task->tk_status);
+- switch (task->tk_status) {
+- case 0:
+- case -NFS4ERR_COMPLETE_ALREADY:
+- break;
+- case -NFS4ERR_BADSESSION:
+- case -NFS4ERR_DEADSESSION:
+- /*
+- * Handle the session error, but do not retry the operation, as
+- * we have no way of telling whether the clientid had to be
+- * reset before we got our reply. If reset, a new wave of
+- * reclaim operations will follow, containing their own reclaim
+- * complete. We don't want our retry to get on the way of
+- * recovery by incorrectly indicating to the server that we're
+- * done reclaiming state since the process had to be restarted.
+- */
+- _nfs4_async_handle_error(task, NULL, clp, NULL);
+- break;
+- default:
+- if (_nfs4_async_handle_error(
+- task, NULL, clp, NULL) == -EAGAIN) {
+- rpc_restart_call_prepare(task);
+- return;
+- }
+- }
++ if (!nfs41_sequence_done(task, res))
++ return;
+
++ if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
++ rpc_restart_call_prepare(task);
++ return;
++ }
+ dprintk("<-- %s\n", __func__);
+ }
+
+@@ -5270,6 +5454,404 @@ out:
+ dprintk("<-- %s status=%d\n", __func__, status);
+ return status;
+ }
++
++static void
++nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutget *lgp = calldata;
++ struct inode *ino = lgp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++
++ dprintk("--> %s\n", __func__);
++ if (nfs4_setup_sequence(server, NULL, &lgp->args.seq_args,
++ &lgp->res.seq_res, 0, task))
++ return;
++ rpc_call_start(task);
++}
++
++static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutget *lgp = calldata;
++ struct inode *ino = lgp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++
++ dprintk("--> %s\n", __func__);
++
++ if (!nfs4_sequence_done(task, &lgp->res.seq_res))
++ return;
++
++ if (RPC_ASSASSINATED(task))
++ return;
++
++ pnfs_get_layout_done(lgp, task->tk_status);
++
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++ nfs_restart_rpc(task, server->nfs_client);
++
++ lgp->status = task->tk_status;
++ dprintk("<-- %s\n", __func__);
++}
++
++static void nfs4_layoutget_release(void *calldata)
++{
++ struct nfs4_layoutget *lgp = calldata;
++
++ dprintk("--> %s\n", __func__);
++ pnfs_layout_release(NFS_I(lgp->args.inode)->layout, NULL);
++ if (lgp->res.layout.buf != NULL)
++ free_page((unsigned long) lgp->res.layout.buf);
++ kfree(calldata);
++ dprintk("<-- %s\n", __func__);
++}
++
++static const struct rpc_call_ops nfs4_layoutget_call_ops = {
++ .rpc_call_prepare = nfs4_layoutget_prepare,
++ .rpc_call_done = nfs4_layoutget_done,
++ .rpc_release = nfs4_layoutget_release,
++};
++
++/* FIXME: We need to call nfs4_handle_exception
++ * and deal with retries.
++ * Currently we can't since we release lgp and its contents.
++ */
++static int _nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
++{
++ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
++ struct rpc_task *task;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTGET],
++ .rpc_argp = &lgp->args,
++ .rpc_resp = &lgp->res,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .rpc_client = server->client,
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutget_call_ops,
++ .callback_data = lgp,
++ .flags = RPC_TASK_ASYNC,
++ };
++ int status = 0;
++
++ dprintk("--> %s\n", __func__);
++
++ lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS);
++ if (lgp->res.layout.buf == NULL) {
++ nfs4_layoutget_release(lgp);
++ return -ENOMEM;
++ }
++
++ lgp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = lgp->status;
++ if (status != 0)
++ goto out;
++ status = pnfs_layout_process(lgp);
++out:
++ rpc_put_task(task);
++ dprintk("<-- %s status=%d\n", __func__, status);
++ return status;
++}
++
++int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
++{
++ struct nfs_server *server = NFS_SERVER(lgp->args.inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server, _nfs4_proc_layoutget(lgp),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *data)
++{
++ struct nfs4_layoutcommit_data *ldata =
++ (struct nfs4_layoutcommit_data *)data;
++ struct nfs_server *server = NFS_SERVER(ldata->args.inode);
++
++ if (nfs4_setup_sequence(server, NULL, &ldata->args.seq_args,
++ &ldata->res.seq_res, 1, task))
++ return;
++ rpc_call_start(task);
++}
++
++static void
++nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutcommit_data *data =
++ (struct nfs4_layoutcommit_data *)calldata;
++ struct nfs_server *server = NFS_SERVER(data->args.inode);
++
++ if (!nfs4_sequence_done(task, &data->res.seq_res))
++ return;
++
++ if (RPC_ASSASSINATED(task))
++ return;
++
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++ nfs_restart_rpc(task, server->nfs_client);
++
++ data->status = task->tk_status;
++}
++
++static void nfs4_layoutcommit_release(void *lcdata)
++{
++ struct nfs4_layoutcommit_data *data =
++ (struct nfs4_layoutcommit_data *)lcdata;
++
++ put_rpccred(data->cred);
++ pnfs_cleanup_layoutcommit(lcdata);
++ pnfs_layoutcommit_free(lcdata);
++ /* Matched by get_layout in pnfs_layoutcommit_inode */
++ put_layout(data->args.inode);
++}
++
++static const struct rpc_call_ops nfs4_layoutcommit_ops = {
++ .rpc_call_prepare = nfs4_layoutcommit_prepare,
++ .rpc_call_done = nfs4_layoutcommit_done,
++ .rpc_release = nfs4_layoutcommit_release,
++};
++
++/* Execute a layoutcommit to the server */
++static int
++_nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
++{
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
++ .rpc_argp = &data->args,
++ .rpc_resp = &data->res,
++ .rpc_cred = data->cred,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .task = &data->task,
++ .rpc_client = NFS_CLIENT(data->args.inode),
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutcommit_ops,
++ .callback_data = data,
++ .flags = RPC_TASK_ASYNC,
++ };
++ struct rpc_task *task;
++ int status = 0;
++
++ dprintk("NFS: %4d initiating layoutcommit call. %llu@%llu lbw: %llu "
++ "type: %d issync %d\n",
++ data->task.tk_pid,
++ data->args.range.length,
++ data->args.range.offset,
++ data->args.lastbytewritten,
++ data->args.layout_type, issync);
++
++ data->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ if (!issync)
++ goto out;
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = data->status;
++out:
++ dprintk("%s: status %d\n", __func__, status);
++ rpc_put_task(task);
++ return 0;
++}
++
++int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, int issync)
++{
++ struct nfs4_exception exception = { };
++ struct nfs_server *server = NFS_SERVER(data->args.inode);
++ int err;
++
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_proc_layoutcommit(data, issync),
++ &exception);
++ } while (exception.retry);
++ return err;
++}
++
++static void
++nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct inode *ino = lrp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++
++ dprintk("--> %s\n", __func__);
++ if (nfs4_setup_sequence(server, NULL, &lrp->args.seq_args,
++ &lrp->res.seq_res, 0, task))
++ return;
++ rpc_call_start(task);
++}
++
++static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct inode *ino = lrp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++
++ dprintk("--> %s\n", __func__);
++
++ if (!nfs4_sequence_done(task, &lrp->res.seq_res))
++ return;
++
++ if (RPC_ASSASSINATED(task))
++ return;
++
++ if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN)
++ nfs_restart_rpc(task, server->nfs_client);
++
++ dprintk("<-- %s\n", __func__);
++}
++
++static void nfs4_layoutreturn_release(void *calldata)
++{
++ struct nfs4_layoutreturn *lrp = calldata;
++ struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
++
++ dprintk("--> %s return_type %d lo %p\n", __func__,
++ lrp->args.return_type, lo);
++
++ if (lrp->args.return_type == RETURN_FILE) {
++ if (!lrp->res.lrs_present)
++ pnfs_set_layout_stateid(lo, &zero_stateid);
++ pnfs_layout_release(lo, &lrp->args.range);
++ }
++ kfree(calldata);
++ dprintk("<-- %s\n", __func__);
++}
++
++static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
++ .rpc_call_prepare = nfs4_layoutreturn_prepare,
++ .rpc_call_done = nfs4_layoutreturn_done,
++ .rpc_release = nfs4_layoutreturn_release,
++};
++
++int _nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
++{
++ struct inode *ino = lrp->args.inode;
++ struct nfs_server *server = NFS_SERVER(ino);
++ struct rpc_task *task;
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
++ .rpc_argp = &lrp->args,
++ .rpc_resp = &lrp->res,
++ };
++ struct rpc_task_setup task_setup_data = {
++ .rpc_client = server->client,
++ .rpc_message = &msg,
++ .callback_ops = &nfs4_layoutreturn_call_ops,
++ .callback_data = lrp,
++ .flags = RPC_TASK_ASYNC,
++ };
++ int status = 0;
++
++ dprintk("--> %s\n", __func__);
++ lrp->res.seq_res.sr_slotid = NFS4_MAX_SLOT_TABLE;
++ task = rpc_run_task(&task_setup_data);
++ if (IS_ERR(task))
++ return PTR_ERR(task);
++ if (!issync)
++ goto out;
++ status = nfs4_wait_for_completion_rpc_task(task);
++ if (status != 0)
++ goto out;
++ status = task->tk_status;
++out:
++ dprintk("<-- %s\n", __func__);
++ rpc_put_task(task);
++ return status;
++}
++
++int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool issync)
++{
++ struct nfs_server *server = NFS_SERVER(lrp->args.inode);
++ struct nfs4_exception exception = { };
++ int err;
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_proc_layoutreturn(lrp, issync),
++ &exception);
++ } while (exception.retry);
++
++ return err;
++}
++
++/*
++ * Retrieve the list of Data Server devices from the MDS.
++ */
++static int _nfs4_getdevicelist(struct nfs_server *server,
++ const struct nfs_fh *fh,
++ struct pnfs_devicelist *devlist)
++{
++ struct nfs4_getdevicelist_args args = {
++ .fh = fh,
++ .layoutclass = server->pnfs_curr_ld->id,
++ };
++ struct nfs4_getdevicelist_res res = {
++ .devlist = devlist,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
++ .rpc_argp = &args,
++ .rpc_resp = &res,
++ };
++ int status;
++
++ dprintk("--> %s\n", __func__);
++ status = nfs4_call_sync(server, &msg, &args, &res, 0);
++ dprintk("<-- %s status=%d\n", __func__, status);
++ return status;
++}
++
++int nfs4_proc_getdevicelist(struct nfs_server *server,
++ const struct nfs_fh *fh,
++ struct pnfs_devicelist *devlist)
++{
++ struct nfs4_exception exception = { };
++ int err;
++
++ do {
++ err = nfs4_handle_exception(server,
++ _nfs4_getdevicelist(server, fh, devlist),
++ &exception);
++ } while (exception.retry);
++
++ dprintk("nfs4_pnfs_getdevlist: err=%d, num_devs=%u\n",
++ err, devlist->num_devs);
++
++ return err;
++}
++
++int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
++{
++ struct nfs4_getdeviceinfo_args args = {
++ .pdev = pdev,
++ };
++ struct nfs4_getdeviceinfo_res res = {
++ .pdev = pdev,
++ };
++ struct rpc_message msg = {
++ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICEINFO],
++ .rpc_argp = &args,
++ .rpc_resp = &res,
++ };
++ int status;
++
++ dprintk("--> %s\n", __func__);
++ status = nfs4_call_sync(server, &msg, &args, &res, 0);
++ dprintk("<-- %s status=%d\n", __func__, status);
++
++ return status;
++}
++
+ #endif /* CONFIG_NFS_V4_1 */
+
+ struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
+@@ -5327,28 +5909,30 @@ struct nfs4_state_maintenance_ops nfs41_
+ };
+ #endif
+
+-/*
+- * Per minor version reboot and network partition recovery ops
+- */
+-
+-struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = {
+- &nfs40_reboot_recovery_ops,
+-#if defined(CONFIG_NFS_V4_1)
+- &nfs41_reboot_recovery_ops,
+-#endif
++static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
++ .minor_version = 0,
++ .call_sync = _nfs4_call_sync,
++ .validate_stateid = nfs4_validate_delegation_stateid,
++ .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
++ .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
++ .state_renewal_ops = &nfs40_state_renewal_ops,
+ };
+
+-struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
+- &nfs40_nograce_recovery_ops,
+ #if defined(CONFIG_NFS_V4_1)
+- &nfs41_nograce_recovery_ops,
+-#endif
++static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
++ .minor_version = 1,
++ .call_sync = _nfs4_call_sync_session,
++ .validate_stateid = nfs41_validate_delegation_stateid,
++ .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
++ .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
++ .state_renewal_ops = &nfs41_state_renewal_ops,
+ };
++#endif
+
+-struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = {
+- &nfs40_state_renewal_ops,
++const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
++ [0] = &nfs_v4_0_minor_ops,
+ #if defined(CONFIG_NFS_V4_1)
+- &nfs41_state_renewal_ops,
++ [1] = &nfs_v4_1_minor_ops,
+ #endif
+ };
+
+@@ -5366,6 +5950,7 @@ const struct nfs_rpc_ops nfs_v4_clientop
+ .dentry_ops = &nfs4_dentry_operations,
+ .dir_inode_ops = &nfs4_dir_inode_operations,
+ .file_inode_ops = &nfs4_file_inode_operations,
++ .file_ops = &nfs_file_operations,
+ .getroot = nfs4_proc_get_root,
+ .getattr = nfs4_proc_getattr,
+ .setattr = nfs4_proc_setattr,
+diff -up linux-2.6.35.noarch/fs/nfs/nfs4renewd.c.orig linux-2.6.35.noarch/fs/nfs/nfs4renewd.c
+--- linux-2.6.35.noarch/fs/nfs/nfs4renewd.c.orig 2010-08-01 18:11:14.000000000 -0400
++++ linux-2.6.35.noarch/fs/nfs/nfs4renewd.c 2010-08-31 21:11:40.921069706 -0400
+@@ -54,17 +54,17 @@
+ void
+ nfs4_renew_state(struct work_struct *work)
+ {
+- struct nfs4_state_maintenance_ops *ops;
++ const struct nfs4_state_maintenance_ops *ops;
+ struct nfs_client *clp =
+ container_of(work, struct nfs_client, cl_renewd.work);
+ struct rpc_cred *cred;
+ long lease;
+ unsigned long last, now;
+
+- ops = nfs4_state_renewal_ops[clp->cl_minorversion];
++ ops = clp->cl_mvops->state_renewal_ops;
+ dprintk("%s: start\n", __func__);
+ /* Are there any active superblocks? */
+- if (list_empty(&clp->cl_superblocks))
++ if (list_empty(&clp->cl_superblocks) && !is_ds_only_client(clp))
+ goto out;
+ spin_lock(&clp->cl_lock);
+ lease = clp->cl_lease_time;
+diff -up linux-2.6.35.noarch/fs/nfs/nfs4state.c.orig linux-2.6.35.noarch/fs/nfs/nfs4state.c
+--- linux-2.6.35.noarch/fs/nfs/nfs4state.c.orig 2010-08-01 18:11:14.000000000 -0400
++++ linux-2.6.35.noarch/fs/nfs/nfs4state.c 2010-08-31 21:11:40.922160833 -0400
+@@ -48,11 +48,13 @@
+ #include
+ #include
+ #include
++#include
+
+ #include "nfs4_fs.h"
+ #include "callback.h"
+ #include "delegation.h"
+ #include "internal.h"
++#include "pnfs.h"
+
+ #define OPENOWNER_POOL_SIZE 8
+
+@@ -126,6 +128,11 @@ static int nfs41_setup_state_renewal(str
+ int status;
+ struct nfs_fsinfo fsinfo;
+
++ if (is_ds_only_client(clp)) {
++ nfs4_schedule_state_renewal(clp);
++ return 0;
++ }
++
+ status = nfs4_proc_get_lease_time(clp, &fsinfo);
+ if (status == 0) {
+ /* Update lease time and schedule renewal */
+@@ -145,7 +152,9 @@ static void nfs4_end_drain_session(struc
+ struct nfs4_session *ses = clp->cl_session;
+ int max_slots;
+
+- if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
++ if (ses == NULL)
++ return;
++ if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
+ spin_lock(&ses->fc_slot_table.slot_tbl_lock);
+ max_slots = ses->fc_slot_table.max_slots;
+ while (max_slots--) {
+@@ -167,7 +176,7 @@ static int nfs4_begin_drain_session(stru
+ struct nfs4_slot_table *tbl = &ses->fc_slot_table;
+
+ spin_lock(&tbl->slot_tbl_lock);
+- set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);
++ set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
+ if (tbl->highest_used_slotid != -1) {
+ INIT_COMPLETION(ses->complete);
+ spin_unlock(&tbl->slot_tbl_lock);
+@@ -371,7 +380,6 @@ nfs4_alloc_state_owner(void)
+ return NULL;
+ spin_lock_init(&sp->so_lock);
+ INIT_LIST_HEAD(&sp->so_states);
+- INIT_LIST_HEAD(&sp->so_delegations);
+ rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
+ sp->so_seqid.sequence = &sp->so_sequence;
+ spin_lock_init(&sp->so_sequence.lock);
+@@ -384,7 +392,7 @@ static void
+ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
+ {
+ if (!RB_EMPTY_NODE(&sp->so_client_node)) {
+- struct nfs_client *clp = sp->so_client;
++ struct nfs_client *clp = sp->so_server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ rb_erase(&sp->so_client_node, &clp->cl_state_owners);
+@@ -406,7 +414,6 @@ struct nfs4_state_owner *nfs4_get_state_
+ new = nfs4_alloc_state_owner();
+ if (new == NULL)
+ return NULL;
+- new->so_client = clp;
+ new->so_server = server;
+ new->so_cred = cred;
+ spin_lock(&clp->cl_lock);
+@@ -423,7 +430,7 @@ struct nfs4_state_owner *nfs4_get_state_
+
+ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
+ {
+- struct nfs_client *clp = sp->so_client;
++ struct nfs_client *clp = sp->so_server->nfs_client;
+ struct rpc_cred *cred = sp->so_cred;
+
+ if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
+@@ -583,8 +590,24 @@ static void __nfs4_close(struct path *pa
+ if (!call_close) {
+ nfs4_put_open_state(state);
+ nfs4_put_state_owner(owner);
+- } else
++ } else {
++ u32 roc_iomode;
++ struct nfs_inode *nfsi = NFS_I(state->inode);
++
++ if (has_layout(nfsi) &&
++ (roc_iomode = pnfs_layout_roc_iomode(nfsi)) != 0) {
++ struct pnfs_layout_range range = {
++ .iomode = roc_iomode,
++ .offset = 0,
++ .length = NFS4_MAX_UINT64,
++ };
++
++ pnfs_return_layout(state->inode, &range, NULL,
++ RETURN_FILE, wait);
++ }
++
+ nfs4_do_close(path, state, gfp_mask, wait);
++ }
+ }
+
+ void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
+@@ -602,12 +625,21 @@ void nfs4_close_sync(struct path *path,
+ * that is compatible with current->files
+ */
+ static struct nfs4_lock_state *
+-__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+ {
+ struct nfs4_lock_state *pos;
+ list_for_each_entry(pos, &state->lock_states, ls_locks) {
+- if (pos->ls_owner != fl_owner)
++ if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
+ continue;
++ switch (pos->ls_owner.lo_type) {
++ case NFS4_POSIX_LOCK_TYPE:
++ if (pos->ls_owner.lo_u.posix_owner != fl_owner)
++ continue;
++ break;
++ case NFS4_FLOCK_LOCK_TYPE:
++ if (pos->ls_owner.lo_u.flock_owner != fl_pid)
++ continue;
++ }
+ atomic_inc(&pos->ls_count);
+ return pos;
+ }
+@@ -619,10 +651,10 @@ __nfs4_find_lock_state(struct nfs4_state
+ * exists, return an uninitialized one.
+ *
+ */
+-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
++static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
+ {
+ struct nfs4_lock_state *lsp;
+- struct nfs_client *clp = state->owner->so_client;
++ struct nfs_client *clp = state->owner->so_server->nfs_client;
+
+ lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
+ if (lsp == NULL)
+@@ -633,7 +665,18 @@ static struct nfs4_lock_state *nfs4_allo
+ lsp->ls_seqid.sequence = &lsp->ls_sequence;
+ atomic_set(&lsp->ls_count, 1);
+ lsp->ls_state = state;
+- lsp->ls_owner = fl_owner;
++ lsp->ls_owner.lo_type = type;
++ switch (lsp->ls_owner.lo_type) {
++ case NFS4_FLOCK_LOCK_TYPE:
++ lsp->ls_owner.lo_u.flock_owner = fl_pid;
++ break;
++ case NFS4_POSIX_LOCK_TYPE:
++ lsp->ls_owner.lo_u.posix_owner = fl_owner;
++ break;
++ default:
++ kfree(lsp);
++ return NULL;
++ }
+ spin_lock(&clp->cl_lock);
+ nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
+ spin_unlock(&clp->cl_lock);
+@@ -643,7 +686,7 @@ static struct nfs4_lock_state *nfs4_allo
+
+ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
+ {
+- struct nfs_client *clp = lsp->ls_state->owner->so_client;
++ struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
+
+ spin_lock(&clp->cl_lock);
+ nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
+@@ -657,13 +700,13 @@ static void nfs4_free_lock_state(struct
+ * exists, return an uninitialized one.
+ *
+ */
+-static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
++static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
+ {
+ struct nfs4_lock_state *lsp, *new = NULL;
+
+ for(;;) {
+ spin_lock(&state->state_lock);
+- lsp = __nfs4_find_lock_state(state, owner);
++ lsp = __nfs4_find_lock_state(state, owner, pid, type);
+ if (lsp != NULL)
+ break;
+ if (new != NULL) {
+@@ -674,7 +717,7 @@ static struct nfs4_lock_state *nfs4_get_
+ break;
+ }
+ spin_unlock(&state->state_lock);
+- new = nfs4_alloc_lock_state(state, owner);
++ new = nfs4_alloc_lock_state(state, owner, pid, type);
+ if (new == NULL)
+ return NULL;
+ }
+@@ -701,6 +744,8 @@ void nfs4_put_lock_state(struct nfs4_loc
+ if (list_empty(&state->lock_states))
+ clear_bit(LK_STATE_IN_USE, &state->flags);
+ spin_unlock(&state->state_lock);
++ if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
++ nfs4_release_lockowner(lsp);
+ nfs4_free_lock_state(lsp);
+ }
+
+@@ -728,7 +773,12 @@ int nfs4_set_lock_state(struct nfs4_stat
+
+ if (fl->fl_ops != NULL)
+ return 0;
+- lsp = nfs4_get_lock_state(state, fl->fl_owner);
++ if (fl->fl_flags & FL_POSIX)
++ lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
++ else if (fl->fl_flags & FL_FLOCK)
++ lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
++ else
++ return -EINVAL;
+ if (lsp == NULL)
+ return -ENOMEM;
+ fl->fl_u.nfs4_fl.owner = lsp;
+@@ -740,7 +790,7 @@ int nfs4_set_lock_state(struct nfs4_stat
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
+ */
+-void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
++void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
+ {
+ struct nfs4_lock_state *lsp;
+ int seq;
+@@ -753,7 +803,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst
+ return;
+
+ spin_lock(&state->state_lock);
+- lsp = __nfs4_find_lock_state(state, fl_owner);
++ lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
+ if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+ memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
+ spin_unlock(&state->state_lock);
+@@ -1031,8 +1081,8 @@ restart:
+ * Open state on this file cannot be recovered
+ * All we can do is revert to using the zero stateid.
+ */
+- memset(state->stateid.data, 0,
+- sizeof(state->stateid.data));
++ memset(state->stateid.u.data, 0,
++ sizeof(state->stateid.u.data));
+ /* Mark the file as being 'closed' */
+ state->state = 0;
+ break;
+@@ -1041,11 +1091,11 @@ restart:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+- nfs4_state_mark_reclaim_nograce(sp->so_client, state);
++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ break;
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+- nfs4_state_mark_reclaim_nograce(sp->so_client, state);
++ nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+@@ -1120,8 +1170,7 @@ static void nfs4_state_end_reclaim_reboo
+ if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ return;
+
+- nfs4_reclaim_complete(clp,
+- nfs4_reboot_recovery_ops[clp->cl_minorversion]);
++ nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
+
+ for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+@@ -1211,8 +1260,8 @@ restart:
+ static int nfs4_check_lease(struct nfs_client *clp)
+ {
+ struct rpc_cred *cred;
+- struct nfs4_state_maintenance_ops *ops =
+- nfs4_state_renewal_ops[clp->cl_minorversion];
++ const struct nfs4_state_maintenance_ops *ops =
++ clp->cl_mvops->state_renewal_ops;
+ int status = -NFS4ERR_EXPIRED;
+
+ /* Is the client already known to have an expired lease? */
+@@ -1235,8 +1284,8 @@ out:
+ static int nfs4_reclaim_lease(struct nfs_client *clp)
+ {
+ struct rpc_cred *cred;
+- struct nfs4_state_recovery_ops *ops =
+- nfs4_reboot_recovery_ops[clp->cl_minorversion];
++ const struct nfs4_state_recovery_ops *ops =
++ clp->cl_mvops->reboot_recovery_ops;
+ int status = -ENOENT;
+
+ cred = ops->get_clid_cred(clp);
+@@ -1421,6 +1470,7 @@ static void nfs4_state_manager(struct nf
+ }
+ clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
++ pnfs_destroy_all_layouts(clp);
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+@@ -1444,7 +1494,7 @@ static void nfs4_state_manager(struct nf
+ /* First recover reboot state... */
+ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+ status = nfs4_do_reclaim(clp,
+- nfs4_reboot_recovery_ops[clp->cl_minorversion]);
++ clp->cl_mvops->reboot_recovery_ops);
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
+ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+ continue;
+@@ -1458,7 +1508,7 @@ static void nfs4_state_manager(struct nf
+ /* Now recover expired state... */
+ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+ status = nfs4_do_reclaim(clp,
+- nfs4_nograce_recovery_ops[clp->cl_minorversion]);
++ clp->cl_mvops->nograce_recovery_ops);
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
+ test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
+ test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+diff -up linux-2.6.35.noarch/fs/nfs/nfs4xdr.c.orig linux-2.6.35.noarch/fs/nfs/nfs4xdr.c
+--- linux-2.6.35.noarch/fs/nfs/nfs4xdr.c.orig 2010-08-01 18:11:14.000000000 -0400
++++ linux-2.6.35.noarch/fs/nfs/nfs4xdr.c 2010-08-31 21:11:40.925140011 -0400
+@@ -50,8 +50,10 @@
+ #include
+ #include
+ #include
++#include
+ #include "nfs4_fs.h"
+ #include "internal.h"
++#include "pnfs.h"
+
+ #define NFSDBG_FACILITY NFSDBG_XDR
+
+@@ -89,7 +91,7 @@ static int nfs4_stat_to_errno(int);
+ #define encode_getfh_maxsz (op_encode_hdr_maxsz)
+ #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \
+ ((3+NFS4_FHSIZE) >> 2))
+-#define nfs4_fattr_bitmap_maxsz 3
++#define nfs4_fattr_bitmap_maxsz 4
+ #define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
+ #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2))
+ #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2))
+@@ -111,7 +113,11 @@ static int nfs4_stat_to_errno(int);
+ #define encode_restorefh_maxsz (op_encode_hdr_maxsz)
+ #define decode_restorefh_maxsz (op_decode_hdr_maxsz)
+ #define encode_fsinfo_maxsz (encode_getattr_maxsz)
+-#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11)
++/* The 5 accounts for the PNFS attributes, and assumes that at most three
++ * layout types will be returned.
++ */
++#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \
++ nfs4_fattr_bitmap_maxsz + 8 + 5)
+ #define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
+ #define decode_renew_maxsz (op_decode_hdr_maxsz)
+ #define encode_setclientid_maxsz \
+@@ -202,14 +208,17 @@ static int nfs4_stat_to_errno(int);
+ #define encode_link_maxsz (op_encode_hdr_maxsz + \
+ nfs4_name_maxsz)
+ #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
++#define encode_lockowner_maxsz (7)
+ #define encode_lock_maxsz (op_encode_hdr_maxsz + \
+ 7 + \
+- 1 + encode_stateid_maxsz + 8)
++ 1 + encode_stateid_maxsz + 1 + \
++ encode_lockowner_maxsz)
+ #define decode_lock_denied_maxsz \
+ (8 + decode_lockowner_maxsz)
+ #define decode_lock_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+-#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12)
++#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
++ encode_lockowner_maxsz)
+ #define decode_lockt_maxsz (op_decode_hdr_maxsz + \
+ decode_lock_denied_maxsz)
+ #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
+ 4)
+ #define decode_locku_maxsz (op_decode_hdr_maxsz + \
+ decode_stateid_maxsz)
++#define encode_release_lockowner_maxsz \
++ (op_encode_hdr_maxsz + \
++ encode_lockowner_maxsz)
++#define decode_release_lockowner_maxsz \
++ (op_decode_hdr_maxsz)
+ #define encode_access_maxsz (op_encode_hdr_maxsz + 1)
+ #define decode_access_maxsz (op_decode_hdr_maxsz + 2)
+ #define encode_symlink_maxsz (op_encode_hdr_maxsz + \
+@@ -302,6 +316,35 @@ static int nfs4_stat_to_errno(int);
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
+ #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
+ #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
++#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
++ encode_verifier_maxsz)
++#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + 2 + 1 + 1 + \
++ decode_verifier_maxsz + \
++ XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
++ NFS4_PNFS_DEVICEID4_SIZE))
++#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
++ XDR_QUADLEN(NFS4_PNFS_DEVICEID4_SIZE))
++#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
++ 4 /*layout type */ + \
++ 4 /* opaque devaddr4 length */ +\
++ 4 /* notification bitmap length */ + \
++ 4 /* notification bitmap */)
++#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
++ encode_stateid_maxsz)
++#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
++ decode_stateid_maxsz + \
++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
++#define encode_layoutcommit_maxsz (18 + \
++ XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + \
++ op_encode_hdr_maxsz + \
++ encode_stateid_maxsz)
++#define decode_layoutcommit_maxsz (3 + op_decode_hdr_maxsz)
++#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
++ encode_stateid_maxsz + \
++ 1 /* FIXME: opaque lrf_body always empty at
++ *the moment */)
++#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
++ 1 + decode_stateid_maxsz)
+ #else /* CONFIG_NFS_V4_1 */
+ #define encode_sequence_maxsz 0
+ #define decode_sequence_maxsz 0
+@@ -471,6 +514,12 @@ static int nfs4_stat_to_errno(int);
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_locku_maxsz)
++#define NFS4_enc_release_lockowner_sz \
++ (compound_encode_hdr_maxsz + \
++ encode_lockowner_maxsz)
++#define NFS4_dec_release_lockowner_sz \
++ (compound_decode_hdr_maxsz + \
++ decode_lockowner_maxsz)
+ #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+@@ -685,6 +734,60 @@ static int nfs4_stat_to_errno(int);
+ #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_reclaim_complete_maxsz)
++#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_getdevicelist_maxsz)
++#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_getdevicelist_maxsz)
++#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_getdeviceinfo_maxsz)
++#define NFS4_dec_getdeviceinfo_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_getdeviceinfo_maxsz)
++#define NFS4_enc_layoutget_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_layoutget_maxsz)
++#define NFS4_dec_layoutget_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutget_maxsz)
++#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_putfh_maxsz + \
++ encode_layoutcommit_maxsz + \
++ encode_getattr_maxsz)
++#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutcommit_maxsz + \
++ decode_getattr_maxsz)
++#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz + \
++ encode_putfh_maxsz + \
++ encode_layoutreturn_maxsz)
++#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_layoutreturn_maxsz)
++#define NFS4_enc_dswrite_sz (compound_encode_hdr_maxsz + \
++ encode_sequence_maxsz +\
++ encode_putfh_maxsz + \
++ encode_write_maxsz)
++#define NFS4_dec_dswrite_sz (compound_decode_hdr_maxsz + \
++ decode_sequence_maxsz + \
++ decode_putfh_maxsz + \
++ decode_write_maxsz)
++#define NFS4_enc_dscommit_sz (compound_encode_hdr_maxsz + \
++ encode_putfh_maxsz + \
++ encode_commit_maxsz)
++#define NFS4_dec_dscommit_sz (compound_decode_hdr_maxsz + \
++ decode_putfh_maxsz + \
++ decode_commit_maxsz)
+
+ const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
+ compound_encode_hdr_maxsz +
+@@ -915,7 +1018,7 @@ static void encode_close(struct xdr_stre
+ p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_CLOSE);
+ *p++ = cpu_to_be32(arg->seqid->sequence->counter);
+- xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE);
+ hdr->nops++;
+ hdr->replen += decode_close_maxsz;
+ }
+@@ -989,6 +1092,35 @@ static void encode_getattr_two(struct xd
+ hdr->replen += decode_getattr_maxsz;
+ }
+
++static void
++encode_getattr_three(struct xdr_stream *xdr,
++ uint32_t bm0, uint32_t bm1, uint32_t bm2,
++ struct compound_hdr *hdr)
++{
++ __be32 *p;
++
++ p = reserve_space(xdr, 4);
++ *p = cpu_to_be32(OP_GETATTR);
++ if (bm2) {
++ p = reserve_space(xdr, 16);
++ *p++ = cpu_to_be32(3);
++ *p++ = cpu_to_be32(bm0);
++ *p++ = cpu_to_be32(bm1);
++ *p = cpu_to_be32(bm2);
++ } else if (bm1) {
++ p = reserve_space(xdr, 12);
++ *p++ = cpu_to_be32(2);
++ *p++ = cpu_to_be32(bm0);
++ *p = cpu_to_be32(bm1);
++ } else {
++ p = reserve_space(xdr, 8);
++ *p++ = cpu_to_be32(1);
++ *p = cpu_to_be32(bm0);
++ }
++ hdr->nops++;
++ hdr->replen += decode_getattr_maxsz;
++}
++
+ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+ encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+@@ -997,8 +1129,11 @@ static void encode_getfattr(struct xdr_s
+
+ static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+ {
+- encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
+- bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
++ encode_getattr_three(xdr,
++ bitmask[0] & nfs4_fsinfo_bitmap[0],
++ bitmask[1] & nfs4_fsinfo_bitmap[1],
++ bitmask[2] & nfs4_fsinfo_bitmap[2],
++ hdr);
+ }
+
+ static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
+@@ -1042,6 +1177,17 @@ static inline uint64_t nfs4_lock_length(
+ return fl->fl_end - fl->fl_start + 1;
+ }
+
++static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
++{
++ __be32 *p;
++
++ p = reserve_space(xdr, 28);
++ p = xdr_encode_hyper(p, lowner->clientid);
++ *p++ = cpu_to_be32(16);
++ p = xdr_encode_opaque_fixed(p, "lock id:", 8);
++ xdr_encode_hyper(p, lowner->id);
++}
++
+ /*
+ * opcode,type,reclaim,offset,length,new_lock_owner = 32
+ * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
+@@ -1058,18 +1204,16 @@ static void encode_lock(struct xdr_strea
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ *p = cpu_to_be32(args->new_lock_owner);
+ if (args->new_lock_owner){
+- p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
++ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
+- p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
++ p = xdr_encode_opaque_fixed(p, args->open_stateid->u.data,
++ NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
+- p = xdr_encode_hyper(p, args->lock_owner.clientid);
+- *p++ = cpu_to_be32(16);
+- p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+- xdr_encode_hyper(p, args->lock_owner.id);
++ encode_lockowner(xdr, &args->lock_owner);
+ }
+ else {
+ p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
+- p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
++ p = xdr_encode_opaque_fixed(p, args->lock_stateid->u.data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(args->lock_seqid->sequence->counter);
+ }
+ hdr->nops++;
+@@ -1080,15 +1224,12 @@ static void encode_lockt(struct xdr_stre
+ {
+ __be32 *p;
+
+- p = reserve_space(xdr, 52);
++ p = reserve_space(xdr, 24);
+ *p++ = cpu_to_be32(OP_LOCKT);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+- p = xdr_encode_hyper(p, args->lock_owner.clientid);
+- *p++ = cpu_to_be32(16);
+- p = xdr_encode_opaque_fixed(p, "lock id:", 8);
+- xdr_encode_hyper(p, args->lock_owner.id);
++ encode_lockowner(xdr, &args->lock_owner);
+ hdr->nops++;
+ hdr->replen += decode_lockt_maxsz;
+ }
+@@ -1101,13 +1242,25 @@ static void encode_locku(struct xdr_stre
+ *p++ = cpu_to_be32(OP_LOCKU);
+ *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
+ *p++ = cpu_to_be32(args->seqid->sequence->counter);
+- p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
++ p = xdr_encode_opaque_fixed(p, args->stateid->u.data,
++ NFS4_STATEID_SIZE);
+ p = xdr_encode_hyper(p, args->fl->fl_start);
+ xdr_encode_hyper(p, nfs4_lock_length(args->fl));
+ hdr->nops++;
+ hdr->replen += decode_locku_maxsz;
+ }
+
++static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
++{
++ __be32 *p;
++
++ p = reserve_space(xdr, 4);
++ *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
++ encode_lockowner(xdr, lowner);
++ hdr->nops++;
++ hdr->replen += decode_release_lockowner_maxsz;
++}
++
+ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
+ {
+ int len = name->len;
+@@ -1172,7 +1325,7 @@ static inline void encode_createmode(str
+ break;
+ default:
+ clp = arg->server->nfs_client;
+- if (clp->cl_minorversion > 0) {
++ if (clp->cl_mvops->minor_version > 0) {
+ if (nfs4_has_persistent_session(clp)) {
+ *p = cpu_to_be32(NFS4_CREATE_GUARDED);
+ encode_attrs(xdr, arg->u.attrs, arg->server);
+@@ -1251,7 +1404,7 @@ static inline void encode_claim_delegate
+
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
+- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE);
+ encode_string(xdr, name->len, name->name);
+ }
+
+@@ -1282,7 +1435,7 @@ static void encode_open_confirm(struct x
+
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
+- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
+ hdr->nops++;
+ hdr->replen += decode_open_confirm_maxsz;
+@@ -1294,7 +1447,7 @@ static void encode_open_downgrade(struct
+
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
+ *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
+- p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
++ p = xdr_encode_opaque_fixed(p, arg->stateid->u.data, NFS4_STATEID_SIZE);
+ *p = cpu_to_be32(arg->seqid->sequence->counter);
+ encode_share_access(xdr, arg->fmode);
+ hdr->nops++;
+@@ -1324,17 +1477,17 @@ static void encode_putrootfh(struct xdr_
+ hdr->replen += decode_putrootfh_maxsz;
+ }
+
+-static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
++static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
+ {
+ nfs4_stateid stateid;
+ __be32 *p;
+
+ p = reserve_space(xdr, NFS4_STATEID_SIZE);
+ if (ctx->state != NULL) {
+- nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
+- xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
++ nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
++ xdr_encode_opaque_fixed(p, stateid.u.data, NFS4_STATEID_SIZE);
+ } else
+- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE);
+ }
+
+ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
+@@ -1344,7 +1497,7 @@ static void encode_read(struct xdr_strea
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_READ);
+
+- encode_stateid(xdr, args->context);
++ encode_stateid(xdr, args->context, args->lock_context);
+
+ p = reserve_space(xdr, 12);
+ p = xdr_encode_hyper(p, args->offset);
+@@ -1448,7 +1601,7 @@ encode_setacl(struct xdr_stream *xdr, st
+
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+- xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, zero_stateid.u.data, NFS4_STATEID_SIZE);
+ p = reserve_space(xdr, 2*4);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(FATTR4_WORD0_ACL);
+@@ -1479,7 +1632,7 @@ static void encode_setattr(struct xdr_st
+
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+ *p++ = cpu_to_be32(OP_SETATTR);
+- xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, arg->stateid.u.data, NFS4_STATEID_SIZE);
+ hdr->nops++;
+ hdr->replen += decode_setattr_maxsz;
+ encode_attrs(xdr, arg->iap, server);
+@@ -1523,7 +1676,7 @@ static void encode_write(struct xdr_stre
+ p = reserve_space(xdr, 4);
+ *p = cpu_to_be32(OP_WRITE);
+
+- encode_stateid(xdr, args->context);
++ encode_stateid(xdr, args->context, args->lock_context);
+
+ p = reserve_space(xdr, 16);
+ p = xdr_encode_hyper(p, args->offset);
+@@ -1542,7 +1695,7 @@ static void encode_delegreturn(struct xd
+ p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
+
+ *p++ = cpu_to_be32(OP_DELEGRETURN);
+- xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
++ xdr_encode_opaque_fixed(p, stateid->u.data, NFS4_STATEID_SIZE);
+ hdr->nops++;
+ hdr->replen += decode_delegreturn_maxsz;
+ }
+@@ -1696,6 +1849,162 @@ static void encode_sequence(struct xdr_s
+ #endif /* CONFIG_NFS_V4_1 */
+ }
+
++#ifdef CONFIG_NFS_V4_1
++static void
++encode_getdevicelist(struct xdr_stream *xdr,
++ const struct nfs4_getdevicelist_args *args,
++ struct compound_hdr *hdr)
++{
++ __be32 *p;
++ nfs4_verifier dummy = {
++ .data = "dummmmmy",
++ };
++
++ p = reserve_space(xdr, 20);
++ *p++ = cpu_to_be32(OP_GETDEVICELIST);
++ *p++ = cpu_to_be32(args->layoutclass);
++ *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
++ xdr_encode_hyper(p, 0ULL); /* cookie */
++ encode_nfs4_verifier(xdr, &dummy);
++ hdr->nops++;
++}
++
++static void
++encode_getdeviceinfo(struct xdr_stream *xdr,
++ const struct nfs4_getdeviceinfo_args *args,
++ struct compound_hdr *hdr)
++{
++ int has_bitmap = (args->pdev->dev_notify_types != 0);
++ int len = 16 + NFS4_PNFS_DEVICEID4_SIZE + (has_bitmap * 4);
++ __be32 *p;
++
++ p = reserve_space(xdr, len);
++ *p++ = cpu_to_be32(OP_GETDEVICEINFO);
++ p
= xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ *p++ = cpu_to_be32(args->pdev->layout_type); ++ *p++ = cpu_to_be32(args->pdev->pglen + len); /* gdia_maxcount */ ++ *p++ = cpu_to_be32(has_bitmap); /* bitmap length [01] */ ++ if (has_bitmap) ++ *p = cpu_to_be32(args->pdev->dev_notify_types); ++ hdr->nops++; ++} ++ ++static void ++encode_layoutget(struct xdr_stream *xdr, ++ const struct nfs4_layoutget_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTGET); ++ *p++ = cpu_to_be32(0); /* Signal layout available */ ++ *p++ = cpu_to_be32(args->type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ p = xdr_encode_hyper(p, args->minlength); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = xdr_encode_opaque_fixed(p, &stateid.u.data, NFS4_STATEID_SIZE); ++ *p = cpu_to_be32(args->maxcount); ++ ++ dprintk("%s: 1st type:0x%x iomode:%d off:%lu len:%lu mc:%d\n", ++ __func__, ++ args->type, ++ args->range.iomode, ++ (unsigned long)args->range.offset, ++ (unsigned long)args->range.length, ++ args->maxcount); ++ hdr->nops++; ++ hdr->replen += decode_layoutget_maxsz; ++} ++ ++static int ++encode_layoutcommit(struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args, ++ struct compound_hdr *hdr) ++{ ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ __be32 *p; ++ ++ dprintk("%s: %llu@%llu lbw: %llu type: %d\n", __func__, ++ args->range.length, args->range.offset, args->lastbytewritten, ++ args->layout_type); ++ ++ p = reserve_space(xdr, 40 + NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ *p++ = cpu_to_be32(0); /* reclaim */ ++ p = xdr_encode_opaque_fixed(p, args->stateid.u.data, NFS4_STATEID_SIZE); ++ *p++ = cpu_to_be32(1); /* newoffset = TRUE */ ++ p = xdr_encode_hyper(p, args->lastbytewritten); ++ *p = cpu_to_be32(args->time_modify_changed != 0); ++ if (args->time_modify_changed) { ++ p = reserve_space(xdr, 12); ++ *p++ = cpu_to_be32(0); ++ *p++ = cpu_to_be32(args->time_modify.tv_sec); ++ *p = cpu_to_be32(args->time_modify.tv_nsec); ++ } ++ ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(args->layout_type); ++ ++ if (ld_io_ops->encode_layoutcommit) { ++ ld_io_ops->encode_layoutcommit(NFS_I(args->inode)->layout, ++ xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ xdr_encode_opaque(p, NULL, 0); ++ } ++ ++ hdr->nops++; ++ hdr->replen += decode_layoutcommit_maxsz; ++ return 0; ++} ++ ++static void ++encode_layoutreturn(struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args, ++ struct compound_hdr *hdr) ++{ ++ nfs4_stateid stateid; ++ __be32 *p; ++ ++ p = reserve_space(xdr, 20); ++ *p++ = cpu_to_be32(OP_LAYOUTRETURN); ++ *p++ = cpu_to_be32(args->reclaim); ++ *p++ = cpu_to_be32(args->layout_type); ++ *p++ = cpu_to_be32(args->range.iomode); ++ *p = cpu_to_be32(args->return_type); ++ if (args->return_type == RETURN_FILE) { ++ struct layoutdriver_io_operations *ld_io_ops = ++ NFS_SERVER(args->inode)->pnfs_curr_ld->ld_io_ops; ++ ++ p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE); ++ p = xdr_encode_hyper(p, args->range.offset); ++ p = xdr_encode_hyper(p, args->range.length); ++ pnfs_get_layout_stateid(&stateid, NFS_I(args->inode)->layout); ++ p = 
xdr_encode_opaque_fixed(p, &stateid.u.data, ++ NFS4_STATEID_SIZE); ++ dprintk("%s: call %pF\n", __func__, ++ ld_io_ops->encode_layoutreturn); ++ if (ld_io_ops->encode_layoutreturn) { ++ ld_io_ops->encode_layoutreturn( ++ NFS_I(args->inode)->layout, xdr, args); ++ } else { ++ p = reserve_space(xdr, 4); ++ *p = cpu_to_be32(0); ++ } ++ } ++ hdr->nops++; ++ hdr->replen += decode_layoutreturn_maxsz; ++} ++#endif /* CONFIG_NFS_V4_1 */ ++ + /* + * END OF "GENERIC" ENCODE ROUTINES. + */ +@@ -1704,7 +2013,7 @@ static u32 nfs4_xdr_minorversion(const s + { + #if defined(CONFIG_NFS_V4_1) + if (args->sa_session) +- return args->sa_session->clp->cl_minorversion; ++ return args->sa_session->clp->cl_mvops->minor_version; + #endif /* CONFIG_NFS_V4_1 */ + return 0; + } +@@ -2048,6 +2357,20 @@ static int nfs4_xdr_enc_locku(struct rpc + return 0; + } + ++static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = 0, ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_release_lockowner(&xdr, &args->lock_owner, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ + /* + * Encode a READLINK request + */ +@@ -2330,7 +2653,7 @@ static int nfs4_xdr_enc_setclientid_conf + struct compound_hdr hdr = { + .nops = 0, + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2395,7 +2718,7 @@ static int nfs4_xdr_enc_exchange_id(stru + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2413,7 +2736,7 @@ static int nfs4_xdr_enc_create_session(s + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = args->client->cl_minorversion, ++ .minorversion = args->client->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2431,7 +2754,7 @@ static int nfs4_xdr_enc_destroy_session( + { + struct xdr_stream xdr; + struct compound_hdr hdr = { +- .minorversion = session->clp->cl_minorversion, ++ .minorversion = session->clp->cl_mvops->minor_version, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); +@@ -2469,7 +2792,7 @@ static int nfs4_xdr_enc_get_lease_time(s + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->la_seq_args), + }; +- const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 }; ++ const u32 lease_bitmap[3] = { FATTR4_WORD0_LEASE_TIME, 0, 0 }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); +@@ -2499,6 +2822,159 @@ static int nfs4_xdr_enc_reclaim_complete + return 0; + } + ++/* ++ * Encode GETDEVICELIST request ++ */ ++static int ++nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdevicelist_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_getdevicelist(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode GETDEVICEINFO request ++ */ ++static int 
nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_getdeviceinfo_args *args) ++{ ++ struct xdr_stream xdr; ++ struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ int replen; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_getdeviceinfo(&xdr, args, &hdr); ++ ++ /* set up reply kvec. Subtract notification bitmap max size (8) ++ * so that notification bitmap is put in xdr_buf tail */ ++ replen = (RPC_REPHDRSIZE + auth->au_rslack + ++ NFS4_dec_getdeviceinfo_sz - 8) << 2; ++ xdr_inline_pages(&req->rq_rcv_buf, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", ++ __func__, replen, args->pdev->pages, ++ args->pdev->pgbase, args->pdev->pglen); ++ ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTGET request ++ */ ++static int nfs4_xdr_enc_layoutget(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutget_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutget(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTCOMMIT request ++ */ ++static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutcommit_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_layoutcommit(&xdr, args, &hdr); ++ encode_getfattr(&xdr, args->bitmask, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode LAYOUTRETURN request ++ */ ++static int nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req, uint32_t *p, ++ struct nfs4_layoutreturn_args *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, NFS_FH(args->inode), &hdr); ++ encode_layoutreturn(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server WRITE request ++ */ ++static int nfs4_xdr_enc_dswrite(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_write(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} ++ ++/* ++ * Encode a pNFS File Layout Data Server COMMIT request ++ */ ++static int nfs4_xdr_enc_dscommit(struct rpc_rqst *req, uint32_t *p, ++ struct nfs_writeargs *args) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr = { ++ .minorversion = nfs4_xdr_minorversion(&args->seq_args), ++ }; ++ ++ 
xdr_init_encode(&xdr, &req->rq_snd_buf, p); ++ encode_compound_hdr(&xdr, req, &hdr); ++ encode_sequence(&xdr, &args->seq_args, &hdr); ++ encode_putfh(&xdr, args->fh, &hdr); ++ encode_commit(&xdr, args, &hdr); ++ encode_nops(&hdr); ++ return 0; ++} + #endif /* CONFIG_NFS_V4_1 */ + + static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +@@ -2599,14 +3075,17 @@ static int decode_attr_bitmap(struct xdr + goto out_overflow; + bmlen = be32_to_cpup(p); + +- bitmap[0] = bitmap[1] = 0; ++ bitmap[0] = bitmap[1] = bitmap[2] = 0; + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; + if (bmlen > 0) { + bitmap[0] = be32_to_cpup(p++); +- if (bmlen > 1) +- bitmap[1] = be32_to_cpup(p); ++ if (bmlen > 1) { ++ bitmap[1] = be32_to_cpup(p++); ++ if (bmlen > 2) ++ bitmap[2] = be32_to_cpup(p); ++ } + } + return 0; + out_overflow: +@@ -2635,8 +3114,9 @@ static int decode_attr_supported(struct + decode_attr_bitmap(xdr, bitmask); + bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; + } else +- bitmask[0] = bitmask[1] = 0; +- dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); ++ bitmask[0] = bitmask[1] = bitmask[2] = 0; ++ dprintk("%s: bitmask=%08x:%08x:%08x\n", __func__, ++ bitmask[0], bitmask[1], bitmask[2]); + return 0; + } + +@@ -3565,7 +4045,7 @@ static int decode_opaque_fixed(struct xd + + static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) + { +- return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); ++ return decode_opaque_fixed(xdr, stateid->u.data, NFS4_STATEID_SIZE); + } + + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +@@ -3621,7 +4101,7 @@ out_overflow: + static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3647,7 +4127,7 @@ xdr_error: + static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3679,7 +4159,7 @@ xdr_error: + static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf) + { + __be32 *savep; +- uint32_t attrlen, bitmap[2] = {0}; ++ uint32_t attrlen, bitmap[3] = {0}; + int status; + + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) +@@ -3705,7 +4185,7 @@ static int decode_getfattr(struct xdr_st + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}, ++ bitmap[3] = {0}, + type; + int status; + umode_t fmode = 0; +@@ -3824,24 +4304,101 @@ xdr_error: + return status; + } + +- +-static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * Decode potentially multiple layout types. Currently we only support ++ * one layout driver per file system. 
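++ * If the server returns more than one layout type, only the first
++ * one is decoded and used (a warning is logged below).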
++ */
++static int decode_pnfs_list(struct xdr_stream *xdr, uint32_t *layoutclass)
+ {
+-	__be32 *savep;
+-	uint32_t attrlen, bitmap[2];
+-	int status;
++	uint32_t *p;
++	int num;
+
+-	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+-		goto xdr_error;
+-	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+-		goto xdr_error;
+-	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+-		goto xdr_error;
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	num = be32_to_cpup(p);
+
+-	fsinfo->rtmult = fsinfo->wtmult = 512;	/* ??? */
++	/* pNFS is not supported by the underlying file system */
++	if (num == 0) {
++		*layoutclass = 0;
++		return 0;
++	}
+
+-	if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0)
+-		goto xdr_error;
++	/* TODO: We will eventually support multiple layout drivers? */
++	if (num > 1)
++		printk(KERN_INFO "%s: Warning: Multiple pNFS layout drivers "
++			"per filesystem not supported\n", __func__);
++
++	/* Decode and set first layout type */
++	p = xdr_inline_decode(xdr, num * 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	*layoutclass = be32_to_cpup(p);
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
++}
++
++/*
++ * The type of file system exported
++ */
++static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
++				uint32_t *layoutclass)
++{
++	int status = 0;
++
++	dprintk("%s: bitmap is %x\n", __func__, bitmap[1]);
++	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
++		return -EIO;
++	if (likely(bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES)) {
++		status = decode_pnfs_list(xdr, layoutclass);
++		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
++	}
++	return status;
++}
++
++/*
++ * The preferred block size for layout-directed I/O
++ */
++static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
++				      uint32_t *res)
++{
++	__be32 *p;
++
++	dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
++	*res = 0;
++	if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) {
++		p = xdr_inline_decode(xdr, 4);
++		if (unlikely(!p)) {
++			print_overflow_msg(__func__, xdr);
++			return -EIO;
++		}
++		*res = be32_to_cpup(p);
++		bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE;
++	}
++	return 0;
++}
++#endif /* CONFIG_NFS_V4_1 */
++
++static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
++{
++	__be32 *savep;
++	uint32_t attrlen, bitmap[3];
++	int status;
++
++	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
++		goto xdr_error;
++	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
++		goto xdr_error;
++	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
++		goto xdr_error;
++
++	fsinfo->rtmult = fsinfo->wtmult = 512;	/* ??? 
*/ ++ ++ if ((status = decode_attr_lease_time(xdr, bitmap, &fsinfo->lease_time)) != 0) ++ goto xdr_error; + if ((status = decode_attr_maxfilesize(xdr, bitmap, &fsinfo->maxfilesize)) != 0) + goto xdr_error; + if ((status = decode_attr_maxread(xdr, bitmap, &fsinfo->rtmax)) != 0) +@@ -3850,6 +4407,14 @@ static int decode_fsinfo(struct xdr_stre + if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0) + goto xdr_error; + fsinfo->wtpref = fsinfo->wtmax; ++#if defined(CONFIG_NFS_V4_1) ++ status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); ++ if (status) ++ goto xdr_error; ++ status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize); ++ if (status) ++ goto xdr_error; ++#endif /* CONFIG_NFS_V4_1 */ + + status = verify_attr_len(xdr, savep, attrlen); + xdr_error: +@@ -3973,6 +4538,11 @@ static int decode_locku(struct xdr_strea + return status; + } + ++static int decode_release_lockowner(struct xdr_stream *xdr) ++{ ++ return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); ++} ++ + static int decode_lookup(struct xdr_stream *xdr) + { + return decode_op_hdr(xdr, OP_LOOKUP); +@@ -4333,7 +4903,7 @@ static int decode_getacl(struct xdr_stre + { + __be32 *savep; + uint32_t attrlen, +- bitmap[2] = {0}; ++ bitmap[3] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + +@@ -4682,6 +5252,226 @@ out_overflow: + #endif /* CONFIG_NFS_V4_1 */ + } + ++#if defined(CONFIG_NFS_V4_1) ++/* ++ * TODO: Need to handle case when EOF != true; ++ */ ++static int decode_getdevicelist(struct xdr_stream *xdr, ++ struct pnfs_devicelist *res) ++{ ++ __be32 *p; ++ int status, i; ++ struct nfs_writeverf verftemp; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICELIST); ++ if (status) ++ return status; ++ ++ p = xdr_inline_decode(xdr, 8 + 8 + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ ++ /* TODO: Skip cookie for now */ ++ p += 2; ++ ++ /* Read verifier */ ++ p = xdr_decode_opaque_fixed(p, verftemp.verifier, 8); ++ ++ res->num_devs = be32_to_cpup(p); ++ ++ dprintk("%s: num_dev %d\n", __func__, res->num_devs); ++ ++ if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) ++ return -NFS4ERR_REP_TOO_BIG; ++ ++ p = xdr_inline_decode(xdr, ++ res->num_devs * NFS4_PNFS_DEVICEID4_SIZE + 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ for (i = 0; i < res->num_devs; i++) ++ p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, ++ NFS4_PNFS_DEVICEID4_SIZE); ++ res->eof = be32_to_cpup(p); ++ return 0; ++out_overflow: ++ print_overflow_msg(__func__, xdr); ++ return -EIO; ++} ++ ++static int decode_getdeviceinfo(struct xdr_stream *xdr, ++ struct pnfs_device *pdev) ++{ ++ __be32 *p; ++ uint32_t len, type; ++ int status; ++ ++ status = decode_op_hdr(xdr, OP_GETDEVICEINFO); ++ if (status) { ++ if (status == -ETOOSMALL) { ++ p = xdr_inline_decode(xdr, 4); ++ if (unlikely(!p)) ++ goto out_overflow; ++ pdev->mincount = be32_to_cpup(p); ++ dprintk("%s: Min count too small. mincnt = %u\n", ++ __func__, pdev->mincount); ++ } ++ return status; ++ } ++ ++ p = xdr_inline_decode(xdr, 8); ++ if (unlikely(!p)) ++ goto out_overflow; ++ type = be32_to_cpup(p++); ++ if (type != pdev->layout_type) { ++ dprintk("%s: layout mismatch req: %u pdev: %u\n", ++ __func__, pdev->layout_type, type); ++ return -EINVAL; ++ } ++ /* ++ * Get the length of the opaque device_addr4. 
xdr_read_pages places
++ * the opaque device_addr4 in the xdr_buf->pages (pnfs_device->pages)
++ * and places the remaining xdr data in xdr_buf->tail
++ */
++	pdev->mincount = be32_to_cpup(p);
++	xdr_read_pages(xdr, pdev->mincount); /* include space for the length */
++
++	/* At most one bitmap word */
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	len = be32_to_cpup(p);
++	if (len) {
++		p = xdr_inline_decode(xdr, 4);
++		if (unlikely(!p))
++			goto out_overflow;
++		pdev->dev_notify_types = be32_to_cpup(p);
++	} else
++		pdev->dev_notify_types = 0;
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
++}
++
++static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
++			    struct nfs4_layoutget_res *res)
++{
++	__be32 *p;
++	int status;
++	u32 layout_count, dummy;
++
++	status = decode_op_hdr(xdr, OP_LAYOUTGET);
++	if (status)
++		return status;
++	p = xdr_inline_decode(xdr, 8 + NFS4_STATEID_SIZE);
++	if (unlikely(!p))
++		goto out_overflow;
++	res->return_on_close = be32_to_cpup(p++);
++	p = xdr_decode_opaque_fixed(p, res->stateid.u.data, NFS4_STATEID_SIZE);
++	layout_count = be32_to_cpup(p);
++	if (!layout_count) {
++		dprintk("%s: server responded with empty layout array\n",
++			__func__);
++		return -EINVAL;
++	}
++
++	p = xdr_inline_decode(xdr, 24);
++	if (unlikely(!p))
++		goto out_overflow;
++	p = xdr_decode_hyper(p, &res->range.offset);
++	p = xdr_decode_hyper(p, &res->range.length);
++	res->range.iomode = be32_to_cpup(p++);
++	res->type = be32_to_cpup(p++);
++
++	status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
++	if (unlikely(status))
++		return status;
++
++	dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
++		__func__,
++		(unsigned long)res->range.offset,
++		(unsigned long)res->range.length,
++		res->range.iomode,
++		res->type,
++		res->layout.len);
++
++	/* presumably, nfs4_proc_layoutget allocated a single page */
++	if (res->layout.len > PAGE_SIZE)
++		return -ENOMEM;
++	memcpy(res->layout.buf, p, res->layout.len);
++
++	/* FIXME: the whole layout array should be passed up to the pnfs
++	 * client */
++	if (layout_count > 1) {
++		dprintk("%s: server responded with %d layouts, dropping tail\n",
++			__func__, layout_count);
++
++		while (--layout_count) {
++			p = xdr_inline_decode(xdr, 24);
++			if (unlikely(!p))
++				goto out_overflow;
++			status = decode_opaque_inline(xdr, &dummy, (char **)&p);
++			if (unlikely(status))
++				return status;
++		}
++	}
++
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
++}
++
++static int decode_layoutreturn(struct xdr_stream *xdr,
++			       struct nfs4_layoutreturn_res *res)
++{
++	__be32 *p;
++	int status;
++
++	status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
++	if (status)
++		return status;
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	res->lrs_present = be32_to_cpup(p);
++	if (res->lrs_present)
++		status = decode_stateid(xdr, &res->stateid);
++	return status;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
++}
++
++static int decode_layoutcommit(struct xdr_stream *xdr,
++			       struct rpc_rqst *req,
++			       struct nfs4_layoutcommit_res *res)
++{
++	__be32 *p;
++	int status;
++
++	status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
++	if (status)
++		return status;
++
++	p = xdr_inline_decode(xdr, 4);
++	if (unlikely(!p))
++		goto out_overflow;
++	res->sizechanged = be32_to_cpup(p);
++
++	if (res->sizechanged) {
++		p = xdr_inline_decode(xdr, 8);
++		if (unlikely(!p))
++			goto out_overflow;
++		
xdr_decode_hyper(p, &res->newsize);
++	}
++	return 0;
++out_overflow:
++	print_overflow_msg(__func__, xdr);
++	return -EIO;
++}
++#endif /* CONFIG_NFS_V4_1 */
++
+ /*
+  * END OF "GENERIC" DECODE ROUTINES.
+  */
+@@ -5259,6 +6049,19 @@ out:
+ 	return status;
+ }
+ 
++static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (!status)
++		status = decode_release_lockowner(&xdr);
++	return status;
++}
++
+ /*
+  * Decode READLINK response
+  */
+@@ -5696,6 +6499,186 @@ static int nfs4_xdr_dec_reclaim_complete
+ 	status = decode_reclaim_complete(&xdr, (void *)NULL);
+ 	return status;
+ }
++
++/*
++ * Decode GETDEVICELIST response
++ */
++static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, uint32_t *p,
++				      struct nfs4_getdevicelist_res *res)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	dprintk("decoding getdevicelist!\n");
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (status != 0)
++		goto out;
++	status = decode_sequence(&xdr, &res->seq_res, rqstp);
++	if (status != 0)
++		goto out;
++	status = decode_putfh(&xdr);
++	if (status != 0)
++		goto out;
++	status = decode_getdevicelist(&xdr, res->devlist);
++out:
++	return status;
++}
++
++/*
++ * Decode GETDEVICEINFO response
++ */
++static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, uint32_t *p,
++				      struct nfs4_getdeviceinfo_res *res)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (status != 0)
++		goto out;
++	status = decode_sequence(&xdr, &res->seq_res, rqstp);
++	if (status != 0)
++		goto out;
++	status = decode_getdeviceinfo(&xdr, res->pdev);
++out:
++	return status;
++}
++
++/*
++ * Decode LAYOUTGET response
++ */
++static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp, uint32_t *p,
++				  struct nfs4_layoutget_res *res)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(&xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(&xdr);
++	if (status)
++		goto out;
++	status = decode_layoutget(&xdr, rqstp, res);
++out:
++	return status;
++}
++
++/*
++ * Decode LAYOUTRETURN response
++ */
++static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp, uint32_t *p,
++				     struct nfs4_layoutreturn_res *res)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(&xdr, &res->seq_res, rqstp);
++	if (status)
++		goto out;
++	status = decode_putfh(&xdr);
++	if (status)
++		goto out;
++	status = decode_layoutreturn(&xdr, res);
++out:
++	return status;
++}
++
++/*
++ * Decode LAYOUTCOMMIT response
++ */
++static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, uint32_t *p,
++				     struct nfs4_layoutcommit_res *res)
++{
++	struct xdr_stream xdr;
++	struct compound_hdr hdr;
++	int status;
++
++	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
++	status = decode_compound_hdr(&xdr, &hdr);
++	if (status)
++		goto out;
++	status = decode_sequence(&xdr, &res->seq_res, rqstp);
++	if 
(status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_layoutcommit(&xdr, rqstp, res); ++ if (status) ++ goto out; ++ decode_getfattr(&xdr, res->fattr, res->server, ++ !RPC_IS_ASYNC(rqstp->rq_task)); ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server WRITE response ++ */ ++static int nfs4_xdr_dec_dswrite(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_write(&xdr, res); ++ if (!status) ++ return res->count; ++out: ++ return status; ++} ++ ++/* ++ * Decode pNFS File Layout Data Server COMMIT response ++ */ ++static int nfs4_xdr_dec_dscommit(struct rpc_rqst *rqstp, uint32_t *p, ++ struct nfs_writeres *res) ++{ ++ struct xdr_stream xdr; ++ struct compound_hdr hdr; ++ int status; ++ ++ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); ++ status = decode_compound_hdr(&xdr, &hdr); ++ if (status) ++ goto out; ++ status = decode_sequence(&xdr, &res->seq_res, rqstp); ++ if (status) ++ goto out; ++ status = decode_putfh(&xdr); ++ if (status) ++ goto out; ++ status = decode_commit(&xdr, res); ++out: ++ return status; ++} + #endif /* CONFIG_NFS_V4_1 */ + + __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus) +@@ -5866,6 +6849,7 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), + PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), ++ PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), + #if defined(CONFIG_NFS_V4_1) + PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), + PROC(CREATE_SESSION, enc_create_session, dec_create_session), +@@ -5873,6 +6857,13 @@ struct rpc_procinfo nfs4_procedures[] = + PROC(SEQUENCE, enc_sequence, dec_sequence), + PROC(GET_LEASE_TIME, enc_get_lease_time, dec_get_lease_time), + PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), ++ PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), ++ PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), ++ PROC(LAYOUTGET, enc_layoutget, dec_layoutget), ++ PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), ++ PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn), ++ PROC(PNFS_WRITE, enc_dswrite, dec_dswrite), ++ PROC(PNFS_COMMIT, enc_dscommit, dec_dscommit), + #endif /* CONFIG_NFS_V4_1 */ + }; + +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild.orig linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild +--- linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild.orig 2010-08-31 21:11:40.927105113 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/Kbuild 2010-08-31 21:11:40.927105113 -0400 +@@ -0,0 +1,11 @@ ++# ++# Makefile for the pNFS Objects Layout Driver kernel module ++# ++objlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o objio_osd.o ++obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o ++ ++# ++# Panasas pNFS Layout Driver kernel module ++# ++panlayoutdriver-y := pnfs_osd_xdr_cli.o objlayout.o panfs_shim.o ++obj-$(CONFIG_PNFS_PANLAYOUT) += panlayoutdriver.o +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c +--- linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c.orig 2010-08-31 21:11:40.928160660 -0400 ++++ 
linux-2.6.35.noarch/fs/nfs/objlayout/objio_osd.c 2010-08-31 21:11:40.928160660 -0400 +@@ -0,0 +1,1087 @@ ++/* ++ * objio_osd.c ++ * ++ * pNFS Objects layout implementation over open-osd initiator library ++ * ++ * Copyright (C) 2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++#define _LLU(x) ((unsigned long long)x) ++ ++enum { BIO_MAX_PAGES_KMALLOC = ++ (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), ++}; ++ ++/* A per mountpoint struct currently for device cache */ ++struct objio_mount_type { ++ struct list_head dev_list; ++ spinlock_t dev_list_lock; ++}; ++ ++struct _dev_ent { ++ struct list_head list; ++ struct pnfs_deviceid d_id; ++ struct osd_dev *od; ++}; ++ ++static void _dev_list_remove_all(struct objio_mount_type *omt) ++{ ++ spin_lock(&omt->dev_list_lock); ++ ++ while (!list_empty(&omt->dev_list)) { ++ struct _dev_ent *de = list_entry(omt->dev_list.next, ++ struct _dev_ent, list); ++ ++ list_del_init(&de->list); ++ osduld_put_device(de->od); ++ kfree(de); ++ } ++ ++ spin_unlock(&omt->dev_list_lock); ++} ++ ++static struct osd_dev *___dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct list_head *le; ++ ++ list_for_each(le, &omt->dev_list) { ++ struct _dev_ent *de = list_entry(le, struct _dev_ent, list); ++ ++ if (0 == memcmp(&de->d_id, d_id, sizeof(*d_id))) ++ return de->od; ++ } ++ ++ return NULL; ++} ++ ++static struct osd_dev *_dev_list_find(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id) ++{ ++ struct osd_dev *od; ++ ++ spin_lock(&omt->dev_list_lock); ++ od = ___dev_list_find(omt, d_id); ++ spin_unlock(&omt->dev_list_lock); ++ return od; ++} ++ ++static int _dev_list_add(struct objio_mount_type *omt, ++ struct pnfs_deviceid *d_id, struct osd_dev *od) ++{ ++ struct _dev_ent *de = kzalloc(sizeof(*de), GFP_KERNEL); ++ ++ if (!de) ++ return -ENOMEM; ++ ++ spin_lock(&omt->dev_list_lock); ++ ++ if (___dev_list_find(omt, d_id)) { ++ kfree(de); ++ goto out; ++ } ++ ++ de->d_id = *d_id; ++ de->od = od; ++ list_add(&de->list, &omt->dev_list); ++ ++out: ++ spin_unlock(&omt->dev_list_lock); ++ return 0; ++} ++ ++struct objio_segment { ++ struct pnfs_osd_layout *layout; ++ ++ unsigned mirrors_p1; ++ unsigned stripe_unit; ++ unsigned group_width; /* Data stripe_units without integrity comps */ ++ u64 group_depth; ++ unsigned group_count; ++ ++ unsigned num_comps; ++ /* variable length */ ++ struct osd_dev *ods[1]; ++}; ++ ++struct objio_state; ++typedef ssize_t (*objio_done_fn)(struct objio_state *ios); ++ ++struct objio_state { ++ /* Generic layer */ ++ struct objlayout_io_state ol_state; ++ ++ struct objio_segment *objio_seg; ++ ++ struct kref kref; ++ objio_done_fn done; ++ void *private; ++ ++ unsigned long length; ++ unsigned numdevs; /* Actually used devs in this IO */ ++ /* A per-device variable array of size numdevs */ ++ struct _objio_per_comp { ++ struct bio *bio; ++ struct osd_request *or; ++ unsigned long length; ++ u64 offset; ++ unsigned dev; ++ } per_dev[]; ++}; ++ ++/* Send and wait for a get_device_info of devices in the layout, ++ then look them up with the osd_initiator library */ ++static struct osd_dev *_device_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg, unsigned comp) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ struct pnfs_osd_deviceaddr *deviceaddr; ++ struct pnfs_deviceid *d_id; ++ struct osd_dev *od; ++ struct osd_dev_info odi; ++ struct objio_mount_type *omt = PNFS_NFS_SERVER(pnfslay)->pnfs_ld_data; ++ int err; ++ ++ d_id = &layout->olo_comps[comp].oc_object_id.oid_device_id; ++ ++ od = _dev_list_find(omt, d_id); ++ if (od) ++ return od; ++ ++ err = 
objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr); ++ if (unlikely(err)) { ++ dprintk("%s: objlayout_get_deviceinfo=>%d\n", __func__, err); ++ return ERR_PTR(err); ++ } ++ ++ odi.systemid_len = deviceaddr->oda_systemid.len; ++ if (odi.systemid_len > sizeof(odi.systemid)) { ++ err = -EINVAL; ++ goto out; ++ } else if (odi.systemid_len) ++ memcpy(odi.systemid, deviceaddr->oda_systemid.data, ++ odi.systemid_len); ++ odi.osdname_len = deviceaddr->oda_osdname.len; ++ odi.osdname = (u8 *)deviceaddr->oda_osdname.data; ++ ++ if (!odi.osdname_len && !odi.systemid_len) { ++ dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", ++ __func__); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ od = osduld_info_lookup(&odi); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ dprintk("%s: osduld_info_lookup => %d\n", __func__, err); ++ goto out; ++ } ++ ++ _dev_list_add(omt, d_id, od); ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ objlayout_put_deviceinfo(deviceaddr); ++ return err ? ERR_PTR(err) : od; ++} ++ ++static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, ++ struct objio_segment *objio_seg) ++{ ++ struct pnfs_osd_layout *layout = objio_seg->layout; ++ unsigned i, num_comps = layout->olo_num_comps; ++ int err; ++ ++ /* lookup all devices */ ++ for (i = 0; i < num_comps; i++) { ++ struct osd_dev *od; ++ ++ od = _device_lookup(pnfslay, objio_seg, i); ++ if (unlikely(IS_ERR(od))) { ++ err = PTR_ERR(od); ++ goto out; ++ } ++ objio_seg->ods[i] = od; ++ } ++ objio_seg->num_comps = num_comps; ++ err = 0; ++ ++out: ++ dprintk("%s: return=%d\n", __func__, err); ++ return err; ++} ++ ++static int _verify_data_map(struct pnfs_osd_layout *layout) ++{ ++ struct pnfs_osd_data_map *data_map = &layout->olo_map; ++ u64 stripe_length; ++ u32 group_width; ++ ++/* FIXME: Only raid0 for now. 
if not go through MDS */
++	if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
++		printk(KERN_ERR "Only RAID_0 for now\n");
++		return -ENOTSUPP;
++	}
++	if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
++		printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
++			data_map->odm_num_comps, data_map->odm_mirror_cnt);
++		return -EINVAL;
++	}
++
++	if (data_map->odm_group_width)
++		group_width = data_map->odm_group_width;
++	else
++		group_width = data_map->odm_num_comps /
++						(data_map->odm_mirror_cnt + 1);
++
++	stripe_length = (u64)data_map->odm_stripe_unit * group_width;
++	if (stripe_length >= (1ULL << 32)) {
++		printk(KERN_ERR "Total Stripe length(0x%llx)"
++			" >= 32bit is not supported\n", _LLU(stripe_length));
++		return -ENOTSUPP;
++	}
++
++	if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
++		printk(KERN_ERR "Stripe Unit(0x%llx)"
++			" must be Multiples of PAGE_SIZE(0x%lx)\n",
++			_LLU(data_map->odm_stripe_unit), PAGE_SIZE);
++		return -ENOTSUPP;
++	}
++
++	return 0;
++}
++
++int objio_alloc_lseg(void **outp,
++	struct pnfs_layout_hdr *pnfslay,
++	struct pnfs_layout_segment *lseg,
++	struct pnfs_osd_layout *layout)
++{
++	struct objio_segment *objio_seg;
++	int err;
++
++	err = _verify_data_map(layout);
++	if (unlikely(err))
++		return err;
++
++	objio_seg = kzalloc(sizeof(*objio_seg) +
++		(layout->olo_num_comps - 1) * sizeof(objio_seg->ods[0]),
++		GFP_KERNEL);
++	if (!objio_seg)
++		return -ENOMEM;
++
++	objio_seg->layout = layout;
++	err = objio_devices_lookup(pnfslay, objio_seg);
++	if (err)
++		goto free_seg;
++
++	objio_seg->mirrors_p1 = layout->olo_map.odm_mirror_cnt + 1;
++	objio_seg->stripe_unit = layout->olo_map.odm_stripe_unit;
++	if (layout->olo_map.odm_group_width) {
++		objio_seg->group_width = layout->olo_map.odm_group_width;
++		objio_seg->group_depth = layout->olo_map.odm_group_depth;
++		objio_seg->group_count = layout->olo_map.odm_num_comps /
++						objio_seg->mirrors_p1 /
++						objio_seg->group_width;
++	} else {
++		objio_seg->group_width = layout->olo_map.odm_num_comps /
++						objio_seg->mirrors_p1;
++		objio_seg->group_depth = -1;
++		objio_seg->group_count = 1;
++	}
++
++	*outp = objio_seg;
++	return 0;
++
++free_seg:
++	dprintk("%s: Error: return %d\n", __func__, err);
++	kfree(objio_seg);
++	*outp = NULL;
++	return err;
++}
++
++void objio_free_lseg(void *p)
++{
++	struct objio_segment *objio_seg = p;
++
++	kfree(objio_seg);
++}
++
++int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp)
++{
++	struct objio_segment *objio_seg = seg;
++	struct objio_state *ios;
++	const unsigned first_size = sizeof(*ios) +
++				objio_seg->num_comps * sizeof(ios->per_dev[0]);
++	const unsigned sec_size = objio_seg->num_comps *
++					sizeof(ios->ol_state.ioerrs[0]);
++
++	dprintk("%s: num_comps=%d\n", __func__, objio_seg->num_comps);
++	ios = kzalloc(first_size + sec_size, GFP_KERNEL);
++	if (unlikely(!ios))
++		return -ENOMEM;
++
++	ios->objio_seg = objio_seg;
++	ios->ol_state.ioerrs = ((void *)ios) + first_size;
++	ios->ol_state.num_comps = objio_seg->num_comps;
++
++	*outp = &ios->ol_state;
++	return 0;
++}
++
++void objio_free_io_state(struct objlayout_io_state *ol_state)
++{
++	struct objio_state *ios = container_of(ol_state, struct objio_state,
++					       ol_state);
++
++	kfree(ios);
++}
++
++enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
++{
++	switch (oep) {
++	case OSD_ERR_PRI_NO_ERROR:
++		return (enum pnfs_osd_errno)0;
++
++	case OSD_ERR_PRI_CLEAR_PAGES:
++		BUG_ON(1);
++		return 0;
++
++	case OSD_ERR_PRI_RESOURCE:
++		return 
PNFS_OSD_ERR_RESOURCE;
++	case OSD_ERR_PRI_BAD_CRED:
++		return PNFS_OSD_ERR_BAD_CRED;
++	case OSD_ERR_PRI_NO_ACCESS:
++		return PNFS_OSD_ERR_NO_ACCESS;
++	case OSD_ERR_PRI_UNREACHABLE:
++		return PNFS_OSD_ERR_UNREACHABLE;
++	case OSD_ERR_PRI_NOT_FOUND:
++		return PNFS_OSD_ERR_NOT_FOUND;
++	case OSD_ERR_PRI_NO_SPACE:
++		return PNFS_OSD_ERR_NO_SPACE;
++	default:
++		WARN_ON(1);
++		/* fallthrough */
++	case OSD_ERR_PRI_EIO:
++		return PNFS_OSD_ERR_EIO;
++	}
++}
++
++static void _clear_bio(struct bio *bio)
++{
++	struct bio_vec *bv;
++	unsigned i;
++
++	__bio_for_each_segment(bv, bio, i, 0) {
++		unsigned this_count = bv->bv_len;
++
++		if (likely(PAGE_SIZE == this_count))
++			clear_highpage(bv->bv_page);
++		else
++			zero_user(bv->bv_page, bv->bv_offset, this_count);
++	}
++}
++
++static int _io_check(struct objio_state *ios, bool is_write)
++{
++	enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
++	int lin_ret = 0;
++	int i;
++
++	for (i = 0; i < ios->numdevs; i++) {
++		struct osd_sense_info osi;
++		struct osd_request *or = ios->per_dev[i].or;
++		int ret;
++
++		if (!or)
++			continue;
++
++		ret = osd_req_decode_sense(or, &osi);
++		if (likely(!ret))
++			continue;
++
++		if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
++			/* start read offset passed end of file */
++			BUG_ON(is_write);
++			_clear_bio(ios->per_dev[i].bio);
++			dprintk("%s: start read offset passed end of file "
++				"offset=0x%llx, length=0x%lx\n", __func__,
++				_LLU(ios->per_dev[i].offset),
++				ios->per_dev[i].length);
++
++			continue; /* we recovered */
++		}
++		objlayout_io_set_result(&ios->ol_state, ios->per_dev[i].dev,
++					osd_pri_2_pnfs_err(osi.osd_err_pri),
++					ios->per_dev[i].offset,
++					ios->per_dev[i].length,
++					is_write);
++
++		if (osi.osd_err_pri >= oep) {
++			oep = osi.osd_err_pri;
++			lin_ret = ret;
++		}
++	}
++
++	return lin_ret;
++}
++
++/*
++ * Common IO state helpers.
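++ * _io_free() below releases the per-device osd requests and bios
++ * once an I/O (or error) path is finished with them.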
++ */
++static void _io_free(struct objio_state *ios)
++{
++	unsigned i;
++
++	for (i = 0; i < ios->numdevs; i++) {
++		struct _objio_per_comp *per_dev = &ios->per_dev[i];
++
++		if (per_dev->or) {
++			osd_end_request(per_dev->or);
++			per_dev->or = NULL;
++		}
++
++		if (per_dev->bio) {
++			bio_put(per_dev->bio);
++			per_dev->bio = NULL;
++		}
++	}
++}
++
++struct osd_dev * _io_od(struct objio_state *ios, unsigned dev)
++{
++	unsigned min_dev = ios->objio_seg->layout->olo_comps_index;
++	unsigned max_dev = min_dev + ios->ol_state.num_comps;
++
++	BUG_ON(dev < min_dev || max_dev <= dev);
++	return ios->objio_seg->ods[dev - min_dev];
++}
++
++struct _striping_info {
++	u64 obj_offset;
++	u64 group_length;
++	u64 total_group_length;
++	u64 Major;
++	unsigned dev;
++	unsigned unit_off;
++};
++
++static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
++			      struct _striping_info *si)
++{
++	u32 stripe_unit = ios->objio_seg->stripe_unit;
++	u32 group_width = ios->objio_seg->group_width;
++	u64 group_depth = ios->objio_seg->group_depth;
++	u32 U = stripe_unit * group_width;
++
++	u64 T = U * group_depth;
++	u64 S = T * ios->objio_seg->group_count;
++	u64 M = div64_u64(file_offset, S);
++
++	/*
++	G = (L - (M * S)) / T
++	H = (L - (M * S)) % T
++	*/
++	u64 LmodU = file_offset - M * S;
++	u32 G = div64_u64(LmodU, T);
++	u64 H = LmodU - G * T;
++
++	u32 N = div_u64(H, U);
++
++	div_u64_rem(file_offset, stripe_unit, &si->unit_off);
++	si->obj_offset = si->unit_off + (N * stripe_unit) +
++				  (M * group_depth * stripe_unit);
++
++	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
++	si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
++	si->dev *= ios->objio_seg->mirrors_p1;
++
++	si->group_length = T - H;
++	si->total_group_length = T;
++	si->Major = M;
++}
++
++static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
++		unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len)
++{
++	unsigned pg = *cur_pg;
++	struct request_queue *q =
++			osd_request_queue(_io_od(ios, per_dev->dev));
++
++	per_dev->length += cur_len;
++
++	if (per_dev->bio == NULL) {
++		unsigned stripes = ios->ol_state.num_comps /
++						     ios->objio_seg->mirrors_p1;
++		unsigned pages_in_stripe = stripes *
++				      (ios->objio_seg->stripe_unit / PAGE_SIZE);
++		unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
++				    stripes;
++
++		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
++		if (unlikely(!per_dev->bio)) {
++			dprintk("Failed to allocate BIO size=%u\n", bio_size);
++			return -ENOMEM;
++		}
++	}
++
++	while (cur_len > 0) {
++		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
++		unsigned added_len;
++
++		BUG_ON(ios->ol_state.nr_pages <= pg);
++		cur_len -= pglen;
++
++		added_len = bio_add_pc_page(q, per_dev->bio,
++					ios->ol_state.pages[pg], pglen, pgbase);
++		if (unlikely(pglen != added_len))
++			return -ENOMEM;
++		pgbase = 0;
++		++pg;
++	}
++	BUG_ON(cur_len);
++
++	*cur_pg = pg;
++	return 0;
++}
++
++static int _prepare_one_group(struct objio_state *ios, u64 length,
++			      struct _striping_info *si, unsigned first_comp,
++			      unsigned *last_pg)
++{
++	unsigned stripe_unit = ios->objio_seg->stripe_unit;
++	unsigned mirrors_p1 = ios->objio_seg->mirrors_p1;
++	unsigned devs_in_group = ios->objio_seg->group_width * mirrors_p1;
++	unsigned dev = si->dev;
++	unsigned first_dev = dev - (dev % devs_in_group);
++	unsigned comp = first_comp + (dev - first_dev);
++	unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; ++ unsigned cur_pg = *last_pg; ++ int ret = 0; ++ ++ while (length) { ++ struct _objio_per_comp *per_dev = &ios->per_dev[comp]; ++ unsigned cur_len, page_off = 0; ++ ++ if (!per_dev->length) { ++ per_dev->dev = dev; ++ if (dev < si->dev) { ++ per_dev->offset = si->obj_offset + stripe_unit - ++ si->unit_off; ++ cur_len = stripe_unit; ++ } else if (dev == si->dev) { ++ per_dev->offset = si->obj_offset; ++ cur_len = stripe_unit - si->unit_off; ++ page_off = si->unit_off & ~PAGE_MASK; ++ BUG_ON(page_off && ++ (page_off != ios->ol_state.pgbase)); ++ } else { /* dev > si->dev */ ++ per_dev->offset = si->obj_offset - si->unit_off; ++ cur_len = stripe_unit; ++ } ++ ++ if (max_comp < comp) ++ max_comp = comp; ++ ++ dev += mirrors_p1; ++ dev = (dev % devs_in_group) + first_dev; ++ } else { ++ cur_len = stripe_unit; ++ } ++ if (cur_len >= length) ++ cur_len = length; ++ ++ ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, ++ cur_len); ++ if (unlikely(ret)) ++ goto out; ++ ++ comp += mirrors_p1; ++ comp = (comp % devs_in_group) + first_comp; ++ ++ length -= cur_len; ++ ios->length += cur_len; ++ } ++out: ++ ios->numdevs = max_comp + mirrors_p1; ++ *last_pg = cur_pg; ++ return ret; ++} ++ ++static int _io_rw_pagelist(struct objio_state *ios) ++{ ++ u64 length = ios->ol_state.count; ++ struct _striping_info si; ++ unsigned devs_in_group = ios->objio_seg->group_width * ++ ios->objio_seg->mirrors_p1; ++ unsigned first_comp = 0; ++ unsigned num_comps = ios->objio_seg->layout->olo_map.odm_num_comps; ++ unsigned last_pg = 0; ++ int ret = 0; ++ ++ _calc_stripe_info(ios, ios->ol_state.offset, &si); ++ while (length) { ++ if (length < si.group_length) ++ si.group_length = length; ++ ++ ret = _prepare_one_group(ios, si.group_length, &si, first_comp, ++ &last_pg); ++ if (unlikely(ret)) ++ goto out; ++ ++ length -= si.group_length; ++ ++ si.group_length = si.total_group_length; ++ si.unit_off = 0; ++ ++si.Major; ++ si.obj_offset = si.Major * ios->objio_seg->stripe_unit * ++ ios->objio_seg->group_depth; ++ ++ si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group; ++ si.dev %= num_comps; ++ ++ first_comp += devs_in_group; ++ first_comp %= num_comps; ++ } ++ ++out: ++ if (!ios->length) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t _sync_done(struct objio_state *ios) ++{ ++ struct completion *waiting = ios->private; ++ ++ complete(waiting); ++ return 0; ++} ++ ++static void _last_io(struct kref *kref) ++{ ++ struct objio_state *ios = container_of(kref, struct objio_state, kref); ++ ++ ios->done(ios); ++} ++ ++static void _done_io(struct osd_request *or, void *p) ++{ ++ struct objio_state *ios = p; ++ ++ kref_put(&ios->kref, _last_io); ++} ++ ++static ssize_t _io_exec(struct objio_state *ios) ++{ ++ DECLARE_COMPLETION_ONSTACK(wait); ++ ssize_t status = 0; /* sync status */ ++ unsigned i; ++ objio_done_fn saved_done_fn = ios->done; ++ bool sync = ios->ol_state.sync; ++ ++ if (sync) { ++ ios->done = _sync_done; ++ ios->private = &wait; ++ } ++ ++ kref_init(&ios->kref); ++ ++ for (i = 0; i < ios->numdevs; i++) { ++ struct osd_request *or = ios->per_dev[i].or; ++ ++ if (!or) ++ continue; ++ ++ kref_get(&ios->kref); ++ osd_execute_request_async(or, _done_io, ios); ++ } ++ ++ kref_put(&ios->kref, _last_io); ++ ++ if (sync) { ++ wait_for_completion(&wait); ++ status = saved_done_fn(ios); ++ } ++ ++ return status; ++} ++ ++/* ++ * read ++ */ ++static ssize_t _read_done(struct objio_state *ios) ++{ ++ ssize_t status; ++ int ret = _io_check(ios, false); ++ ++ _io_free(ios); 
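++	/* requests and bios are released; report byte count or error below */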
++
++	if (likely(!ret))
++		status = ios->length;
++	else
++		status = ret;
++
++	objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
++	return status;
++}
++
++static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
++{
++	struct osd_request *or = NULL;
++	struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++	unsigned dev = per_dev->dev;
++	struct pnfs_osd_object_cred *cred =
++			&ios->objio_seg->layout->olo_comps[dev];
++	struct osd_obj_id obj = {
++		.partition = cred->oc_object_id.oid_partition_id,
++		.id = cred->oc_object_id.oid_object_id,
++	};
++	int ret;
++
++	or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++	if (unlikely(!or)) {
++		ret = -ENOMEM;
++		goto err;
++	}
++	per_dev->or = or;
++
++	osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
++
++	ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
++	if (ret) {
++		dprintk("%s: Failed to osd_finalize_request() => %d\n",
++			__func__, ret);
++		goto err;
++	}
++
++	dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
++		__func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
++		per_dev->length);
++
++err:
++	return ret;
++}
++
++static ssize_t _read_exec(struct objio_state *ios)
++{
++	unsigned i;
++	int ret;
++
++	for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) {
++		if (!ios->per_dev[i].length)
++			continue;
++		ret = _read_mirrors(ios, i);
++		if (unlikely(ret))
++			goto err;
++	}
++
++	ios->done = _read_done;
++	return _io_exec(ios); /* In sync mode exec returns the io status */
++
++err:
++	_io_free(ios);
++	return ret;
++}
++
++ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
++{
++	struct objio_state *ios = container_of(ol_state, struct objio_state,
++					       ol_state);
++	int ret;
++
++	ret = _io_rw_pagelist(ios);
++	if (unlikely(ret))
++		return ret;
++
++	return _read_exec(ios);
++}
++
++/*
++ * write
++ */
++static ssize_t _write_done(struct objio_state *ios)
++{
++	ssize_t status;
++	int ret = _io_check(ios, true);
++
++	_io_free(ios);
++
++	if (likely(!ret)) {
++		/* FIXME: should be based on the OSD's persistence model
++		 * See OSD2r05 Section 4.13 Data persistence model */
++		ios->ol_state.committed = NFS_UNSTABLE; //NFS_FILE_SYNC;
++		status = ios->length;
++	} else {
++		status = ret;
++	}
++
++	objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
++	return status;
++}
++
++static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
++{
++	struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
++	unsigned dev = ios->per_dev[cur_comp].dev;
++	unsigned last_comp = cur_comp + ios->objio_seg->mirrors_p1;
++	int ret;
++
++	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
++		struct osd_request *or = NULL;
++		struct pnfs_osd_object_cred *cred =
++			&ios->objio_seg->layout->olo_comps[dev];
++		struct osd_obj_id obj = {
++			.partition = cred->oc_object_id.oid_partition_id,
++			.id = cred->oc_object_id.oid_object_id,
++		};
++		struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
++		struct bio *bio;
++
++		or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
++		if (unlikely(!or)) {
++			ret = -ENOMEM;
++			goto err;
++		}
++		per_dev->or = or;
++
++		if (per_dev != master_dev) {
++			bio = bio_kmalloc(GFP_KERNEL,
++					  master_dev->bio->bi_max_vecs);
++			if (unlikely(!bio)) {
++				dprintk("Failed to allocate BIO size=%u\n",
++					master_dev->bio->bi_max_vecs);
++				ret = -ENOMEM;
++				goto err;
++			}
++
++			__bio_clone(bio, master_dev->bio);
++			bio->bi_bdev = NULL;
++			bio->bi_next = NULL;
++			per_dev->bio = bio;
++			per_dev->dev = 
dev; ++ per_dev->length = master_dev->length; ++ per_dev->offset = master_dev->offset; ++ } else { ++ bio = master_dev->bio; ++ /* FIXME: bio_set_dir() */ ++ bio->bi_rw |= (1 << BIO_RW); ++ } ++ ++ osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); ++ ++ ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); ++ if (ret) { ++ dprintk("%s: Faild to osd_finalize_request() => %d\n", ++ __func__, ret); ++ goto err; ++ } ++ ++ dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", ++ __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), ++ per_dev->length); ++ } ++ ++err: ++ return ret; ++} ++ ++static ssize_t _write_exec(struct objio_state *ios) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < ios->numdevs; i += ios->objio_seg->mirrors_p1) { ++ if (!ios->per_dev[i].length) ++ continue; ++ ret = _write_mirrors(ios, i); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ ios->done = _write_done; ++ return _io_exec(ios); /* In sync mode exec returns the io->status */ ++ ++err: ++ _io_free(ios); ++ return ret; ++} ++ ++ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) ++{ ++ struct objio_state *ios = container_of(ol_state, struct objio_state, ++ ol_state); ++ int ret; ++ ++ /* TODO: ios->stable = stable; */ ++ ret = _io_rw_pagelist(ios); ++ if (unlikely(ret)) ++ return ret; ++ ++ return _write_exec(ios); ++} ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++objlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zx\n", __func__, maxsz); ++ return maxsz; ++} ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++objlayout_get_blocksize(void) ++{ ++ ssize_t sz = BIO_MAX_PAGES_KMALLOC * PAGE_SIZE; ++ ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations objlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
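++ * ++ * get_stripesize reports the width of one full stripe in bytes (stripe ++ * unit times the number of data components, parity excluded) so the ++ * generic layer can align its coalescing to stripe boundaries; e.g. with ++ * a hypothetical 64 KiB stripe unit and a 5-component RAID-5 group (one ++ * component holding parity) that is 64 KiB * 4 = 256 KiB. ++ * get_blocksize caps the [rw]size at what a kmalloc'ed bio can carry, ++ * BIO_MAX_PAGES_KMALLOC * PAGE_SIZE.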
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = objlayout_get_stripesize, ++ .get_blocksize = objlayout_get_blocksize, ++}; ++ ++static struct pnfs_layoutdriver_type objlayout_type = { ++ .id = LAYOUT_OSD2_OBJECTS, ++ .name = "LAYOUT_OSD2_OBJECTS", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &objlayout_policy_operations, ++}; ++ ++void *objio_init_mt(void) ++{ ++ struct objio_mount_type *omt = kzalloc(sizeof(*omt), GFP_KERNEL); ++ ++ if (!omt) ++ return ERR_PTR(-ENOMEM); ++ ++ INIT_LIST_HEAD(&omt->dev_list); ++ spin_lock_init(&omt->dev_list_lock); ++ return omt; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++ _dev_list_remove_all(mountid); ++ kfree(mountid); ++} ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++objlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++objlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&objlayout_type); ++ printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(objlayout_init); ++module_exit(objlayout_exit); +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c +--- linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c.orig 2010-08-31 21:11:40.929160846 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.c 2010-08-31 21:11:40.929160846 -0400 +@@ -0,0 +1,790 @@ ++/* ++ * objlayout.c ++ * ++ * pNFS layout driver for Panasas OSDs ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include "objlayout.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct pnfs_client_operations *pnfs_client_ops; ++ ++/* ++ * Create a objlayout layout structure for the given inode and return it. ++ */ ++static struct pnfs_layout_hdr * ++objlayout_alloc_layout(struct inode *inode) ++{ ++ struct objlayout *objlay; ++ ++ objlay = kzalloc(sizeof(struct objlayout), GFP_KERNEL); ++ if (objlay) { ++ spin_lock_init(&objlay->lock); ++ INIT_LIST_HEAD(&objlay->err_list); ++ } ++ dprintk("%s: Return %p\n", __func__, objlay); ++ return &objlay->pnfs_layout; ++} ++ ++/* ++ * Free an objlayout layout structure ++ */ ++static void ++objlayout_free_layout(struct pnfs_layout_hdr *lo) ++{ ++ struct objlayout *objlay = OBJLAYOUT(lo); ++ ++ dprintk("%s: objlay %p\n", __func__, objlay); ++ ++ WARN_ON(!list_empty(&objlay->err_list)); ++ kfree(objlay); ++} ++ ++/* ++ * Unmarshall layout and store it in pnfslay. ++ */ ++static struct pnfs_layout_segment * ++objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, ++ struct nfs4_layoutget_res *lgr) ++{ ++ int status; ++ void *layout = lgr->layout.buf; ++ struct pnfs_layout_segment *lseg; ++ struct objlayout_segment *objlseg; ++ struct pnfs_osd_layout *pnfs_osd_layout; ++ ++ dprintk("%s: Begin pnfslay %p layout %p\n", __func__, pnfslay, layout); ++ ++ BUG_ON(!layout); ++ ++ status = -ENOMEM; ++ lseg = kzalloc(sizeof(*lseg) + sizeof(*objlseg) + ++ pnfs_osd_layout_incore_sz(layout), GFP_KERNEL); ++ if (!lseg) ++ goto err; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ pnfs_osd_layout = (struct pnfs_osd_layout *)objlseg->pnfs_osd_layout; ++ pnfs_osd_xdr_decode_layout(pnfs_osd_layout, layout); ++ ++ status = objio_alloc_lseg(&objlseg->internal, pnfslay, lseg, ++ pnfs_osd_layout); ++ if (status) ++ goto err; ++ ++ dprintk("%s: Return %p\n", __func__, lseg); ++ return lseg; ++ ++ err: ++ kfree(lseg); ++ return ERR_PTR(status); ++} ++ ++/* ++ * Free a layout segement ++ */ ++static void ++objlayout_free_lseg(struct pnfs_layout_segment *lseg) ++{ ++ struct objlayout_segment *objlseg; ++ ++ dprintk("%s: freeing layout segment %p\n", __func__, lseg); ++ ++ if (unlikely(!lseg)) ++ return; ++ ++ objlseg = LSEG_LD_DATA(lseg); ++ objio_free_lseg(objlseg->internal); ++ kfree(lseg); ++} ++ ++/* ++ * I/O Operations ++ */ ++static inline u64 ++end_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ end = start + len; ++ return end >= start ? end : NFS4_MAX_UINT64; ++} ++ ++/* last octet in a range */ ++static inline u64 ++last_byte_offset(u64 start, u64 len) ++{ ++ u64 end; ++ ++ BUG_ON(!len); ++ end = start + len; ++ return end > start ? 
end - 1 : NFS4_MAX_UINT64; ++} ++ ++static struct objlayout_io_state * ++objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ++ struct page **pages, ++ unsigned pgbase, ++ unsigned nr_pages, ++ loff_t offset, ++ size_t count, ++ struct pnfs_layout_segment *lseg, ++ void *rpcdata) ++{ ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(lseg); ++ struct objlayout_io_state *state; ++ u64 lseg_end_offset; ++ size_t size_nr_pages; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ if (objio_alloc_io_state(objlseg->internal, &state)) ++ return NULL; ++ ++ BUG_ON(offset < lseg->range.offset); ++ lseg_end_offset = end_offset(lseg->range.offset, lseg->range.length); ++ BUG_ON(offset >= lseg_end_offset); ++ if (offset + count > lseg_end_offset) { ++ count = lseg->range.length - (offset - lseg->range.offset); ++ dprintk("%s: truncated count %Zd\n", __func__, count); ++ } ++ ++ if (pgbase > PAGE_SIZE) { ++ unsigned n = pgbase >> PAGE_SHIFT; ++ ++ pgbase &= ~PAGE_MASK; ++ pages += n; ++ nr_pages -= n; ++ } ++ ++ size_nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ BUG_ON(nr_pages < size_nr_pages); ++ if (nr_pages > size_nr_pages) ++ nr_pages = size_nr_pages; ++ ++ INIT_LIST_HEAD(&state->err_list); ++ state->lseg = lseg; ++ state->rpcdata = rpcdata; ++ state->pages = pages; ++ state->pgbase = pgbase; ++ state->nr_pages = nr_pages; ++ state->offset = offset; ++ state->count = count; ++ state->sync = 0; ++ ++ return state; ++} ++ ++static void ++objlayout_free_io_state(struct objlayout_io_state *state) ++{ ++ dprintk("%s: freeing io_state\n", __func__); ++ if (unlikely(!state)) ++ return; ++ ++ objio_free_io_state(state); ++} ++ ++/* ++ * I/O done common code ++ */ ++static void ++objlayout_iodone(struct objlayout_io_state *state) ++{ ++ dprintk("%s: state %p status\n", __func__, state); ++ ++ if (likely(state->status >= 0)) { ++ objlayout_free_io_state(state); ++ } else { ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ spin_lock(&objlay->lock); ++ objlay->delta_space_valid = OBJ_DSU_INVALID; ++ list_add(&objlay->err_list, &state->err_list); ++ spin_unlock(&objlay->lock); ++ } ++} ++ ++/* ++ * objlayout_io_set_result - Set an osd_error code on a specific osd comp. ++ * ++ * The @index component IO failed (error returned from target). Register ++ * the error for later reporting at layout-return. 
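++ * ++ * Errors accumulate per component in state->ioerrs[] and are encoded ++ * into the LAYOUTRETURN body by objlayout_encode_layoutreturn(), which ++ * falls back to one merged descriptor when the XDR buffer fills up. ++ * An io-engine completion path would typically call (illustrative ++ * names): ++ * ++ * objlayout_io_set_result(state, comp, osd_err, dev_offset, ++ * dev_length, is_write);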
++ */ ++void ++objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, ++ int osd_error, u64 offset, u64 length, bool is_write) ++{ ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; ++ ++ BUG_ON(index >= state->num_comps); ++ if (osd_error) { ++ struct objlayout_segment *objlseg = LSEG_LD_DATA(state->lseg); ++ struct pnfs_osd_layout *layout = ++ (typeof(layout))objlseg->pnfs_osd_layout; ++ ++ ioerr->oer_component = layout->olo_comps[index].oc_object_id; ++ ioerr->oer_comp_offset = offset; ++ ioerr->oer_comp_length = length; ++ ioerr->oer_iswrite = is_write; ++ ioerr->oer_errno = osd_error; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " ++ "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", ++ __func__, index, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ } else { ++ /* User need not call if no error is reported */ ++ ioerr->oer_errno = 0; ++ } ++} ++ ++static void _rpc_commit_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_commit_complete(wdata); ++} ++ ++/* ++ * Commit data remotely on OSDs ++ */ ++enum pnfs_try_status ++objlayout_commit(struct nfs_write_data *wdata, int how) ++{ ++ int status = PNFS_ATTEMPTED; ++ ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_commit_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ dprintk("%s: Return %d\n", __func__, status); ++ return status; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_read_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_read_data *rdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ rdata = container_of(task, struct nfs_read_data, task); ++ ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++} ++ ++void ++objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) ++{ ++ int eof = state->eof; ++ struct nfs_read_data *rdata; ++ ++ state->status = status; ++ dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); ++ rdata = state->rpcdata; ++ rdata->task.tk_status = status; ++ if (status >= 0) { ++ rdata->res.count = status; ++ rdata->res.eof = eof; ++ } ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_readlist_complete(rdata); ++ else { ++ INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); ++ schedule_work(&rdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async reads. 
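++ * ++ * The byte range is clipped against i_size and state->eof is computed ++ * before the I/O is issued; completion is then reported either inline ++ * (sync) or via the rpc workqueue, because the OSD completion callback ++ * runs with interrupts off.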
++ */ ++enum pnfs_try_status ++objlayout_read_pagelist(struct nfs_read_data *rdata, unsigned nr_pages) ++{ ++ loff_t offset = rdata->args.offset; ++ size_t count = rdata->args.count; ++ struct objlayout_io_state *state; ++ ssize_t status = 0; ++ loff_t eof; ++ ++ dprintk("%s: Begin inode %p offset %llu count %d\n", ++ __func__, rdata->inode, offset, (int)count); ++ ++ eof = i_size_read(rdata->inode); ++ if (unlikely(offset + count > eof)) { ++ if (offset >= eof) { ++ status = 0; ++ rdata->res.count = 0; ++ rdata->res.eof = 1; ++ goto out; ++ } ++ count = eof - offset; ++ } ++ ++ state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, ++ rdata->args.pages, rdata->args.pgbase, ++ nr_pages, offset, count, ++ rdata->pdata.lseg, rdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->eof = state->offset + state->count >= eof; ++ ++ status = objio_read_pagelist(state); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ rdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). ++ * This is because the osd completion is called with ints-off from ++ * the block layer ++ */ ++static void _rpc_write_complete(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++} ++ ++void ++objlayout_write_done(struct objlayout_io_state *state, ssize_t status, ++ bool sync) ++{ ++ struct nfs_write_data *wdata; ++ ++ dprintk("%s: Begin\n", __func__); ++ wdata = state->rpcdata; ++ state->status = status; ++ wdata->task.tk_status = status; ++ if (status >= 0) { ++ wdata->res.count = status; ++ wdata->verf.committed = state->committed; ++ dprintk("%s: Return status %d committed %d\n", ++ __func__, wdata->task.tk_status, ++ wdata->verf.committed); ++ } else ++ dprintk("%s: Return status %d\n", ++ __func__, wdata->task.tk_status); ++ objlayout_iodone(state); ++ /* must not use state after this point */ ++ ++ if (sync) ++ pnfs_client_ops->nfs_writelist_complete(wdata); ++ else { ++ INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); ++ schedule_work(&wdata->task.u.tk_work); ++ } ++} ++ ++/* ++ * Perform sync or async writes. 
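++ * ++ * state->sync is derived from FLUSH_SYNC and the stable flag is handed ++ * to the io-engine; the engine reports the achieved commit level back ++ * through state->committed (currently NFS_UNSTABLE, pending a proper ++ * OSD persistence model).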
++ */ ++enum pnfs_try_status ++objlayout_write_pagelist(struct nfs_write_data *wdata, ++ unsigned nr_pages, ++ int how) ++{ ++ struct objlayout_io_state *state; ++ ssize_t status; ++ ++ dprintk("%s: Begin inode %p offset %llu count %u\n", ++ __func__, wdata->inode, wdata->args.offset, wdata->args.count); ++ ++ state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, ++ wdata->args.pages, ++ wdata->args.pgbase, ++ nr_pages, ++ wdata->args.offset, ++ wdata->args.count, ++ wdata->pdata.lseg, wdata); ++ if (unlikely(!state)) { ++ status = -ENOMEM; ++ goto out; ++ } ++ ++ state->sync = how & FLUSH_SYNC; ++ ++ status = objio_write_pagelist(state, how & FLUSH_STABLE); ++ out: ++ dprintk("%s: Return status %Zd\n", __func__, status); ++ wdata->pdata.pnfs_error = status; ++ return PNFS_ATTEMPTED; ++} ++ ++void ++objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct pnfs_osd_layoutupdate lou; ++ __be32 *start; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ spin_lock(&objlay->lock); ++ lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); ++ lou.dsu_delta = objlay->delta_space_used; ++ objlay->delta_space_used = 0; ++ objlay->delta_space_valid = OBJ_DSU_INIT; ++ lou.olu_ioerr_flag = !list_empty(&objlay->err_list); ++ spin_unlock(&objlay->lock); ++ ++ start = xdr_reserve_space(xdr, 4); ++ ++ BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ ++ dprintk("%s: Return delta_space_used %lld err %d\n", __func__, ++ lou.dsu_delta, lou.olu_ioerr_flag); ++} ++ ++static int ++err_prio(u32 oer_errno) ++{ ++ switch (oer_errno) { ++ case 0: ++ return 0; ++ ++ case PNFS_OSD_ERR_RESOURCE: ++ return OSD_ERR_PRI_RESOURCE; ++ case PNFS_OSD_ERR_BAD_CRED: ++ return OSD_ERR_PRI_BAD_CRED; ++ case PNFS_OSD_ERR_NO_ACCESS: ++ return OSD_ERR_PRI_NO_ACCESS; ++ case PNFS_OSD_ERR_UNREACHABLE: ++ return OSD_ERR_PRI_UNREACHABLE; ++ case PNFS_OSD_ERR_NOT_FOUND: ++ return OSD_ERR_PRI_NOT_FOUND; ++ case PNFS_OSD_ERR_NO_SPACE: ++ return OSD_ERR_PRI_NO_SPACE; ++ default: ++ WARN_ON(1); ++ /* fallthrough */ ++ case PNFS_OSD_ERR_EIO: ++ return OSD_ERR_PRI_EIO; ++ } ++} ++ ++static void ++merge_ioerr(struct pnfs_osd_ioerr *dest_err, ++ const struct pnfs_osd_ioerr *src_err) ++{ ++ u64 dest_end, src_end; ++ ++ if (!dest_err->oer_errno) { ++ *dest_err = *src_err; ++ /* accumulated device must be blank */ ++ memset(&dest_err->oer_component.oid_device_id, 0, ++ sizeof(dest_err->oer_component.oid_device_id)); ++ ++ return; ++ } ++ ++ if (dest_err->oer_component.oid_partition_id != ++ src_err->oer_component.oid_partition_id) ++ dest_err->oer_component.oid_partition_id = 0; ++ ++ if (dest_err->oer_component.oid_object_id != ++ src_err->oer_component.oid_object_id) ++ dest_err->oer_component.oid_object_id = 0; ++ ++ if (dest_err->oer_comp_offset > src_err->oer_comp_offset) ++ dest_err->oer_comp_offset = src_err->oer_comp_offset; ++ ++ dest_end = end_offset(dest_err->oer_comp_offset, ++ dest_err->oer_comp_length); ++ src_end = end_offset(src_err->oer_comp_offset, ++ src_err->oer_comp_length); ++ if (dest_end < src_end) ++ dest_end = src_end; ++ ++ dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; ++ ++ if ((src_err->oer_iswrite == dest_err->oer_iswrite) && ++ (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { ++ dest_err->oer_errno = src_err->oer_errno; ++ } else if (src_err->oer_iswrite) { ++ 
dest_err->oer_iswrite = true; ++ dest_err->oer_errno = src_err->oer_errno; ++ } ++} ++ ++static void ++encode_accumulated_error(struct objlayout *objlay, struct xdr_stream *xdr) ++{ ++ struct objlayout_io_state *state, *tmp; ++ struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ ++ for (i = 0; i < state->num_comps; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ merge_ioerr(&accumulated_err, ioerr); ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++ ++ BUG_ON(pnfs_osd_xdr_encode_ioerr(xdr, &accumulated_err)); ++} ++ ++void ++objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args) ++{ ++ struct objlayout *objlay = OBJLAYOUT(pnfslay); ++ struct objlayout_io_state *state, *tmp; ++ __be32 *start, *uninitialized_var(last_xdr); ++ ++ dprintk("%s: Begin\n", __func__); ++ start = xdr_reserve_space(xdr, 4); ++ BUG_ON(!start); ++ ++ spin_lock(&objlay->lock); ++ ++ list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { ++ unsigned i; ++ int res = 0; ++ ++ for (i = 0; i < state->num_comps && !res; i++) { ++ struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; ++ ++ if (!ioerr->oer_errno) ++ continue; ++ ++ dprintk("%s: err[%d]: errno=%d is_write=%d " ++ "dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "offset=0x%llx length=0x%llx\n", ++ __func__, i, ioerr->oer_errno, ++ ioerr->oer_iswrite, ++ _DEVID_LO(&ioerr->oer_component.oid_device_id), ++ _DEVID_HI(&ioerr->oer_component.oid_device_id), ++ ioerr->oer_component.oid_partition_id, ++ ioerr->oer_component.oid_object_id, ++ ioerr->oer_comp_offset, ++ ioerr->oer_comp_length); ++ ++ last_xdr = xdr->p; ++ res = pnfs_osd_xdr_encode_ioerr(xdr, &state->ioerrs[i]); ++ } ++ if (unlikely(res)) { ++ /* no space for even one error descriptor */ ++ BUG_ON(last_xdr == start + 1); ++ ++ /* we've encountered a situation with lots and lots of ++ * errors and no space to encode them all. Use the last ++ * available slot to report the union of all the ++ * remaining errors. ++ */ ++ xdr_rewind_stream(xdr, last_xdr - ++ pnfs_osd_ioerr_xdr_sz() / 4); ++ encode_accumulated_error(objlay, xdr); ++ goto loop_done; ++ } ++ list_del(&state->err_list); ++ objlayout_free_io_state(state); ++ } ++loop_done: ++ spin_unlock(&objlay->lock); ++ ++ *start = cpu_to_be32((xdr->p - start - 1) * 4); ++ dprintk("%s: Return\n", __func__); ++} ++ ++struct objlayout_deviceinfo { ++ struct page *page; ++ struct pnfs_osd_deviceaddr da; /* This must be last */ ++}; ++ ++/* Initialize and call nfs_getdeviceinfo, then decode and return a ++ * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() ++ * should be called. 
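++ * ++ * The raw XDR page is kept alive together with the decoded structure ++ * (which may point into it), hence the mandatory put. A minimal call ++ * sequence, error handling omitted (illustrative): ++ * ++ * struct pnfs_osd_deviceaddr *da; ++ * if (!objlayout_get_deviceinfo(pnfslay, d_id, &da)) { ++ * ... look up and open the OSDs ... ++ * objlayout_put_deviceinfo(da); ++ * }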
++ */ ++int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi; ++ struct pnfs_device pd; ++ struct super_block *sb; ++ struct page *page; ++ size_t sz; ++ u32 *p; ++ int err; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ pd.area = page_address(page); ++ ++ memcpy(&pd.dev_id, d_id, sizeof(*d_id)); ++ pd.layout_type = LAYOUT_OSD2_OBJECTS; ++ pd.dev_notify_types = 0; ++ pd.pages = &page; ++ pd.pgbase = 0; ++ pd.pglen = PAGE_SIZE; ++ pd.mincount = 0; ++ ++ sb = PNFS_INODE(pnfslay)->i_sb; ++ err = pnfs_client_ops->nfs_getdeviceinfo(PNFS_NFS_SERVER(pnfslay), &pd); ++ dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); ++ if (err) ++ goto err_out; ++ ++ p = pd.area; ++ sz = pnfs_osd_xdr_deviceaddr_incore_sz(p); ++ odi = kzalloc(sz + (sizeof(*odi) - sizeof(odi->da)), GFP_KERNEL); ++ if (!odi) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); ++ odi->page = page; ++ *deviceaddr = &odi->da; ++ return 0; ++ ++err_out: ++ __free_page(page); ++ return err; ++} ++ ++void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) ++{ ++ struct objlayout_deviceinfo *odi = container_of(deviceaddr, ++ struct objlayout_deviceinfo, ++ da); ++ ++ __free_page(odi->page); ++ kfree(odi); ++} ++ ++/* ++ * Initialize a mountpoint by retrieving the list of ++ * available devices for it. ++ * Return the pnfs_mount_type structure so the ++ * pNFS_client can refer to the mount point later on. ++ */ ++static int ++objlayout_initialize_mountpoint(struct nfs_server *server, ++ const struct nfs_fh *mntfh) ++{ ++ void *data; ++ ++ data = objio_init_mt(); ++ if (IS_ERR(data)) { ++ printk(KERN_INFO "%s: objlayout lib not ready err=%ld\n", ++ __func__, PTR_ERR(data)); ++ return PTR_ERR(data); ++ } ++ server->pnfs_ld_data = data; ++ ++ dprintk("%s: Return data=%p\n", __func__, data); ++ return 0; ++} ++ ++/* ++ * Uninitialize a mountpoint ++ */ ++static int ++objlayout_uninitialize_mountpoint(struct nfs_server *server) ++{ ++ dprintk("%s: Begin %p\n", __func__, server->pnfs_ld_data); ++ objio_fini_mt(server->pnfs_ld_data); ++ return 0; ++} ++ ++struct layoutdriver_io_operations objlayout_io_operations = { ++ .commit = objlayout_commit, ++ .read_pagelist = objlayout_read_pagelist, ++ .write_pagelist = objlayout_write_pagelist, ++ .alloc_layout = objlayout_alloc_layout, ++ .free_layout = objlayout_free_layout, ++ .alloc_lseg = objlayout_alloc_lseg, ++ .free_lseg = objlayout_free_lseg, ++ .encode_layoutcommit = objlayout_encode_layoutcommit, ++ .encode_layoutreturn = objlayout_encode_layoutreturn, ++ .initialize_mountpoint = objlayout_initialize_mountpoint, ++ .uninitialize_mountpoint = objlayout_uninitialize_mountpoint, ++}; +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h.orig linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h +--- linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h.orig 2010-08-31 21:11:40.930182141 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/objlayout.h 2010-08-31 21:11:40.930182141 -0400 +@@ -0,0 +1,171 @@ ++/* ++ * objlayout.h ++ * ++ * Data types and function declerations for interfacing with the ++ * pNFS standard object layout driver. ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. 
++ * ++ * Benny Halevy ++ * Boaz Harrosh ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#ifndef _OBJLAYOUT_H ++#define _OBJLAYOUT_H ++ ++#include ++#include ++#include ++ ++/* ++ * in-core layout segment ++ */ ++struct objlayout_segment { ++ void *internal; /* for provider internal use */ ++ u8 pnfs_osd_layout[]; ++}; ++ ++/* ++ * per-inode layout ++ */ ++struct objlayout { ++ struct pnfs_layout_hdr pnfs_layout; ++ ++ /* for layout_commit */ ++ enum osd_delta_space_valid_enum { ++ OBJ_DSU_INIT = 0, ++ OBJ_DSU_VALID, ++ OBJ_DSU_INVALID, ++ } delta_space_valid; ++ s64 delta_space_used; /* consumed by write ops */ ++ ++ /* for layout_return */ ++ spinlock_t lock; ++ struct list_head err_list; ++}; ++ ++static inline struct objlayout * ++OBJLAYOUT(struct pnfs_layout_hdr *lo) ++{ ++ return container_of(lo, struct objlayout, pnfs_layout); ++} ++ ++/* ++ * per-I/O operation state ++ * embedded in objects provider io_state data structure ++ */ ++struct objlayout_io_state { ++ struct pnfs_layout_segment *lseg; ++ ++ struct page **pages; ++ unsigned pgbase; ++ unsigned nr_pages; ++ unsigned long count; ++ loff_t offset; ++ bool sync; ++ ++ void *rpcdata; ++ int status; /* res */ ++ int eof; /* res */ ++ int committed; /* res */ ++ ++ /* Error reporting (layout_return) */ ++ struct list_head err_list; ++ unsigned num_comps; ++ /* Pointer to array of error descriptors of size num_comps. ++ * It should contain as many entries as devices in the osd_layout ++ * that participate in the I/O. It is up to the io_engine to allocate ++ * needed space and set num_comps. 
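++ * ++ * A minimal engine-side setup might look like (illustrative sketch, ++ * assuming one descriptor per participating device): ++ * ++ * state->ioerrs = kcalloc(numdevs, sizeof(*state->ioerrs), ++ * GFP_KERNEL); ++ * state->num_comps = numdevs;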
++ */ ++ struct pnfs_osd_ioerr *ioerrs; ++}; ++ ++/* ++ * Raid engine I/O API ++ */ ++extern void *objio_init_mt(void); ++extern void objio_fini_mt(void *mt); ++ ++extern int objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout); ++extern void objio_free_lseg(void *p); ++ ++extern int objio_alloc_io_state(void *seg, struct objlayout_io_state **outp); ++extern void objio_free_io_state(struct objlayout_io_state *state); ++ ++extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); ++extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable); ++ ++/* ++ * callback API ++ */ ++extern void objlayout_io_set_result(struct objlayout_io_state *state, ++ unsigned index, int osd_error, ++ u64 offset, u64 length, bool is_write); ++ ++static inline void ++objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) ++{ ++ struct objlayout *objlay = OBJLAYOUT(state->lseg->layout); ++ ++ /* If one of the I/Os errored out and the delta_space_used was ++ * invalid we render the complete report as invalid. Protocol mandate ++ * the DSU be accurate or not reported. ++ */ ++ spin_lock(&objlay->lock); ++ if (objlay->delta_space_valid != OBJ_DSU_INVALID) { ++ objlay->delta_space_valid = OBJ_DSU_VALID; ++ objlay->delta_space_used += space_used; ++ } ++ spin_unlock(&objlay->lock); ++} ++ ++extern void objlayout_read_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++extern void objlayout_write_done(struct objlayout_io_state *state, ++ ssize_t status, bool sync); ++ ++extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr); ++extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); ++ ++/* ++ * exported generic objects function vectors ++ */ ++extern struct layoutdriver_io_operations objlayout_io_operations; ++extern struct pnfs_client_operations *pnfs_client_ops; ++ ++#endif /* _OBJLAYOUT_H */ +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c +--- linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c.orig 2010-08-31 21:11:40.931077838 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.c 2010-08-31 21:11:40.931077838 -0400 +@@ -0,0 +1,734 @@ ++/* ++ * panfs_shim.c ++ * ++ * Shim layer for interfacing with the Panasas DirectFlow module I/O stack ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. 
++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "objlayout.h" ++#include "panfs_shim.h" ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++struct panfs_export_operations *panfs_export_ops; ++ ++void * ++objio_init_mt(void) ++{ ++ return panfs_export_ops == NULL ? ERR_PTR(-EAGAIN) : NULL; ++} ++ ++void objio_fini_mt(void *mountid) ++{ ++} ++ ++static int ++panfs_shim_conv_raid01(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) { ++ hdr->type = PAN_AGG_RAID1; ++ hdr->hdr.raid1.num_comps = lo_map->odm_mirror_cnt + 1; ++ } else if (layout->olo_num_comps > 1) { ++ hdr->type = PAN_AGG_RAID0; ++ hdr->hdr.raid0.num_comps = layout->olo_num_comps; ++ hdr->hdr.raid0.stripe_unit = lo_map->odm_stripe_unit; ++ } else ++ hdr->type = PAN_AGG_SIMPLE; ++ return 0; ++} ++ ++static int ++panfs_shim_conv_raid5(struct pnfs_osd_layout *layout, ++ struct pnfs_osd_data_map *lo_map, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ if (lo_map->odm_mirror_cnt) ++ goto err; ++ ++ if (lo_map->odm_group_width || lo_map->odm_group_depth) { ++ if (!lo_map->odm_group_width || !lo_map->odm_group_depth) ++ goto err; ++ ++ hdr->type = PAN_AGG_GRP_RAID5_LEFT; ++ hdr->hdr.grp_raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.grp_raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.grp_raid5_left.stripe_unit = lo_map->odm_stripe_unit; ++ hdr->hdr.grp_raid5_left.rg_width = lo_map->odm_group_width; ++ hdr->hdr.grp_raid5_left.rg_depth = lo_map->odm_group_depth; ++ /* this is a guess, panasas server is not supposed to ++ hand out layotu otherwise */ ++ hdr->hdr.grp_raid5_left.group_layout_policy = ++ PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN; ++ } else { ++ hdr->type = PAN_AGG_RAID5_LEFT; ++ hdr->hdr.raid5_left.num_comps = lo_map->odm_num_comps; ++ if (hdr->hdr.raid5_left.num_comps != lo_map->odm_num_comps) ++ goto err; ++ hdr->hdr.raid5_left.stripe_unit2 = ++ hdr->hdr.raid5_left.stripe_unit1 = ++ hdr->hdr.raid5_left.stripe_unit0 = lo_map->odm_stripe_unit; ++ } ++ ++ return 0; ++err: ++ return -EINVAL; ++} ++ ++/* ++ * Convert a pnfs_osd data map into Panasas aggregation layout header ++ */ ++static int ++panfs_shim_conv_pnfs_osd_data_map( ++ struct pnfs_osd_layout *layout, ++ pan_agg_layout_hdr_t *hdr) ++{ ++ int status = -EINVAL; ++ struct pnfs_osd_data_map *lo_map = &layout->olo_map; ++ ++ if (!layout->olo_num_comps) { ++ dprintk("%s: !!layout.n_comps(%u)\n", __func__, ++ layout->olo_num_comps); ++ goto err; ++ } ++ ++ switch (lo_map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ 
dprintk("%s: !!PNFS_OSD_RAID_0 " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid01(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_5: ++ if (!lo_map->odm_group_width) { ++ if (layout->olo_num_comps != lo_map->odm_num_comps || ++ layout->olo_comps_index) { ++ dprintk("%s: !!PNFS_OSD_RAID_5 !group_width " ++ "layout.n_comps(%u)!=map.n_comps(%u) " ++ "|| comps_index(%u)\n", __func__, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ } else if ((layout->olo_num_comps != lo_map->odm_num_comps && ++ layout->olo_num_comps > lo_map->odm_group_width) || ++ (layout->olo_comps_index % lo_map->odm_group_width)){ ++ dprintk("%s: !!PNFS_OSD_RAID_5 group_width(%u) " ++ "layout.n_comps(%u) map.n_comps(%u) " ++ "comps_index(%u)\n", __func__, ++ lo_map->odm_group_width, ++ layout->olo_num_comps, ++ lo_map->odm_num_comps, ++ layout->olo_comps_index); ++ goto err; ++ } ++ status = panfs_shim_conv_raid5(layout, lo_map, hdr); ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_PQ: ++ default: ++ dprintk("%s: !!PNFS_OSD_RAID_(%d)\n", __func__, ++ lo_map->odm_raid_algorithm); ++ goto err; ++ } ++ ++ return 0; ++ ++err: ++ return status; ++} ++ ++/* ++ * Convert pnfs_osd layout into Panasas map and caps type ++ */ ++int ++objio_alloc_lseg(void **outp, ++ struct pnfs_layout_hdr *pnfslay, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_osd_layout *layout) ++{ ++ int i, total_comps; ++ int status; ++ struct pnfs_osd_object_cred *lo_comp; ++ pan_size_t alloc_sz, local_sz; ++ pan_sm_map_cap_t *mcs = NULL; ++ u8 *buf; ++ pan_agg_comp_obj_t *pan_comp; ++ pan_sm_sec_t *pan_sec; ++ ++ status = -EINVAL; ++ if (layout->olo_num_comps < layout->olo_map.odm_group_width) { ++ total_comps = layout->olo_comps_index + layout->olo_num_comps; ++ } else { ++ /* allocate full map, otherwise SAM gets confused */ ++ total_comps = layout->olo_map.odm_num_comps; ++ } ++ alloc_sz = total_comps * ++ (sizeof(pan_agg_comp_obj_t) + sizeof(pan_sm_sec_t)); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p = layout->olo_comps[i].oc_cap.cred; ++ if (panfs_export_ops->sm_sec_t_get_size_otw( ++ (pan_sm_sec_otw_t *)&p, &local_sz, NULL, NULL)) ++ goto err; ++ alloc_sz += local_sz; ++ } ++ ++ status = -ENOMEM; ++ mcs = kzalloc(sizeof(*mcs) + alloc_sz, GFP_KERNEL); ++ if (!mcs) ++ goto err; ++ buf = (u8 *)&mcs[1]; ++ ++ mcs->offset = lseg->range.offset; ++ mcs->length = lseg->range.length; ++#if 0 ++ /* FIXME: for now */ ++ mcs->expiration_time.ts_sec = 0; ++ mcs->expiration_time.ts_nsec = 0; ++#endif ++ mcs->full_map.map_hdr.avail_state = PAN_AGG_OBJ_STATE_NORMAL; ++ status = panfs_shim_conv_pnfs_osd_data_map(layout, ++ &mcs->full_map.layout_hdr); ++ if (status) ++ goto err; ++ ++ mcs->full_map.components.size = total_comps; ++ mcs->full_map.components.data = (pan_agg_comp_obj_t *)buf; ++ buf += total_comps * sizeof(pan_agg_comp_obj_t); ++ ++ mcs->secs.size = total_comps; ++ mcs->secs.data = (pan_sm_sec_t *)buf; ++ buf += total_comps * sizeof(pan_sm_sec_t); ++ ++ lo_comp = layout->olo_comps; ++ pan_comp = mcs->full_map.components.data + layout->olo_comps_index; ++ pan_sec = mcs->secs.data + layout->olo_comps_index; ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ void *p; ++ pan_stor_obj_id_t *obj_id = &mcs->full_map.map_hdr.obj_id; ++ struct pnfs_osd_objid *oc_obj_id = &lo_comp->oc_object_id; ++ u64 dev_id = 
__be64_to_cpup( ++ (__be64 *)oc_obj_id->oid_device_id.data + 1); ++ ++ dprintk("%s: i=%d deviceid=%Lx:%Lx partition=%Lx object=%Lx\n", ++ __func__, i, ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data), ++ __be64_to_cpup((__be64 *)oc_obj_id->oid_device_id.data + 1), ++ oc_obj_id->oid_partition_id, oc_obj_id->oid_object_id); ++ ++ if (i == 0) { ++ /* make up mgr_id to calm sam down */ ++ pan_mgr_id_construct_artificial(PAN_MGR_SM, 0, ++ &obj_id->dev_id); ++ obj_id->grp_id = oc_obj_id->oid_partition_id; ++ obj_id->obj_id = oc_obj_id->oid_object_id; ++ } ++ ++ if (obj_id->grp_id != lo_comp->oc_object_id.oid_partition_id) { ++ dprintk("%s: i=%d grp_id=0x%Lx oid_partition_id=0x%Lx\n", ++ __func__, i, (u64)obj_id->grp_id, ++ lo_comp->oc_object_id.oid_partition_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ if (obj_id->obj_id != lo_comp->oc_object_id.oid_object_id) { ++ dprintk("%s: i=%d obj_id=0x%Lx oid_object_id=0x%Lx\n", ++ __func__, i, obj_id->obj_id, ++ lo_comp->oc_object_id.oid_object_id); ++ status = -EINVAL; ++ goto err; ++ } ++ ++ pan_comp->dev_id = dev_id; ++ if (!pan_stor_is_device_id_an_obsd_id(pan_comp->dev_id)) { ++ dprintk("%s: i=%d dev_id=0x%Lx not an obsd_id\n", ++ __func__, i, obj_id->dev_id); ++ status = -EINVAL; ++ goto err; ++ } ++ if (lo_comp->oc_osd_version == PNFS_OSD_MISSING) { ++ dprintk("%s: degraded maps not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ pan_comp->avail_state = PAN_AGG_COMP_STATE_NORMAL; ++ if (lo_comp->oc_cap_key_sec != PNFS_OSD_CAP_KEY_SEC_NONE) { ++ dprintk("%s: cap key security not supported yet\n", ++ __func__); ++ status = -ENOTSUPP; ++ goto err; ++ } ++ ++ p = lo_comp->oc_cap.cred; ++ panfs_export_ops->sm_sec_t_unmarshall( ++ (pan_sm_sec_otw_t *)&p, ++ pan_sec, ++ buf, ++ alloc_sz, ++ NULL, ++ &local_sz); ++ buf += local_sz; ++ alloc_sz -= local_sz; ++ ++ lo_comp++; ++ pan_comp++; ++ pan_sec++; ++ } ++ ++ *outp = mcs; ++ dprintk("%s:Return mcs=%p\n", __func__, mcs); ++ return 0; ++ ++err: ++ objio_free_lseg(mcs); ++ dprintk("%s:Error %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Free a Panasas map and caps type ++ */ ++void ++objio_free_lseg(void *p) ++{ ++ kfree(p); ++} ++ ++/* ++ * I/O routines ++ */ ++int ++objio_alloc_io_state(void *seg, struct objlayout_io_state **outp) ++{ ++ struct panfs_shim_io_state *p; ++ ++ dprintk("%s: allocating io_state\n", __func__); ++ p = kzalloc(sizeof(*p), GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ *outp = &p->ol_state; ++ return 0; ++} ++ ++/* ++ * Free an I/O state ++ */ ++void ++objio_free_io_state(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ int i; ++ ++ dprintk("%s: freeing io_state\n", __func__); ++ for (i = 0; i < state->ol_state.nr_pages; i++) ++ kunmap(state->ol_state.pages[i]); ++ ++ if (state->ucreds) ++ panfs_export_ops->ucreds_put(state->ucreds); ++ kfree(state->sg_list); ++ kfree(state); ++} ++ ++static int ++panfs_shim_pages_to_sg( ++ struct panfs_shim_io_state *state, ++ struct page **pages, ++ unsigned int pgbase, ++ unsigned nr_pages, ++ size_t count) ++{ ++ unsigned i, n; ++ pan_sg_entry_t *sg; ++ ++ dprintk("%s pgbase %u nr_pages %u count %d " ++ "pg0 %p flags 0x%x index %llu\n", ++ __func__, pgbase, nr_pages, (int)count, pages[0], ++ (unsigned)pages[0]->flags, (unsigned long long)pages[0]->index); ++ ++ sg = kmalloc(nr_pages * sizeof(*sg), GFP_KERNEL); ++ if (sg == NULL) ++ return -ENOMEM; ++ ++ dprintk("%s sg_list %p pages %p 
pgbase %u nr_pages %u\n", ++ __func__, sg, pages, pgbase, nr_pages); ++ ++ for (i = 0; i < nr_pages; i++) { ++ sg[i].buffer = (char *)kmap(pages[i]) + pgbase; ++ n = PAGE_SIZE - pgbase; ++ pgbase = 0; ++ if (n > count) ++ n = count; ++ sg[i].chunk_size = n; ++ count -= n; ++ if (likely(count)) { ++ sg[i].next = &sg[i+1]; ++ } else { ++ /* we're done */ ++ sg[i].next = NULL; ++ break; ++ } ++ } ++ BUG_ON(count); ++ ++ state->sg_list = sg; ++ return 0; ++} ++ ++/* ++ * Callback function for async reads ++ */ ++static void ++panfs_shim_read_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.read.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++ status = res_p->length; ++ WARN_ON(status < 0); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_read rc %d: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_read_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_read_pagelist(struct objlayout_io_state *ol_state) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.read.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.read.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_read(PAN_SAM_ACCESS_BYPASS_TIMESTAMP, ++ &state->u.read.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_read_done, ++ state, NULL, ++ &state->u.read.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_read_done(state, NULL, &state->u.read.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++/* ++ * Callback function for async writes ++ */ ++static void ++panfs_shim_write_done( ++ void *arg1, ++ void *arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t rc) ++{ ++ struct panfs_shim_io_state *state = arg1; ++ ssize_t status; ++ ++ dprintk("%s: Begin\n", __func__); ++ if (!res_p) ++ res_p = &state->u.write.res; ++ if (rc == PAN_SUCCESS) ++ rc = res_p->result; ++ if (rc == PAN_SUCCESS) { ++/* state->ol_state.committed = NFS_FILE_SYNC;*/ ++ state->ol_state.committed = NFS_UNSTABLE; ++ status = res_p->length; ++ WARN_ON(status < 0); ++ ++ objlayout_add_delta_space_used(&state->ol_state, ++ res_p->delta_capacity_used); ++ } else { ++ status = -panfs_export_ops->convert_rc(rc); ++ dprintk("%s: pan_sam_write rc %u: status %Zd\n", ++ __func__, rc, status); ++ } ++ dprintk("%s: Return status %Zd rc %d\n", __func__, status, rc); ++ objlayout_write_done(&state->ol_state, status, true); ++} ++ ++ssize_t ++objio_write_pagelist(struct objlayout_io_state *ol_state, ++ bool stable /* unused, PanOSD writes are stable */) ++{ ++ struct panfs_shim_io_state *state = container_of(ol_state, ++ struct panfs_shim_io_state, ol_state); ++ struct objlayout_segment *lseg = LSEG_LD_DATA(ol_state->lseg); ++ pan_sm_map_cap_t *mcs = (pan_sm_map_cap_t *)lseg->internal; ++ ssize_t status = 0; ++ pan_status_t rc = PAN_SUCCESS; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ status = panfs_shim_pages_to_sg(state, ol_state->pages, ++ ol_state->pgbase, ol_state->nr_pages, ++ ol_state->count); ++ if (unlikely(status)) ++ goto err; ++ ++ state->obj_sec.min_security = 0; ++ state->obj_sec.map_ccaps = mcs; ++ ++ rc = panfs_export_ops->ucreds_get(&state->ucreds); ++ if (unlikely(rc)) { ++ status = -EACCES; ++ goto err; ++ } ++ ++ state->u.write.args.obj_id = mcs->full_map.map_hdr.obj_id; ++ state->u.write.args.offset = ol_state->offset; ++ rc = panfs_export_ops->sam_write(PAN_SAM_ACCESS_NONE, ++ &state->u.write.args, ++ &state->obj_sec, ++ state->sg_list, ++ state->ucreds, ++ ol_state->sync ? 
++ NULL : panfs_shim_write_done, ++ state, ++ NULL, ++ &state->u.write.res); ++ if (rc != PAN_ERR_IN_PROGRESS) ++ panfs_shim_write_done(state, NULL, &state->u.write.res, rc); ++ err: ++ dprintk("%s: Return %Zd\n", __func__, status); ++ return status; ++} ++ ++int ++panfs_shim_register(struct panfs_export_operations *ops) ++{ ++ if (panfs_export_ops) { ++ printk(KERN_INFO ++ "%s: panfs already registered (panfs ops %p)\n", ++ __func__, panfs_export_ops); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: registering panfs ops %p\n", ++ __func__, ops); ++ ++ panfs_export_ops = ops; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_register); ++ ++int ++panfs_shim_unregister(void) ++{ ++ if (!panfs_export_ops) { ++ printk(KERN_INFO "%s: panfs is not registered\n", __func__); ++ return -EINVAL; ++ } ++ ++ printk(KERN_INFO "%s: unregistering panfs ops %p\n", ++ __func__, panfs_export_ops); ++ ++ panfs_export_ops = NULL; ++ return 0; ++} ++EXPORT_SYMBOL(panfs_shim_unregister); ++ ++/* ++ * Policy Operations ++ */ ++ ++/* ++ * Return the stripe size for the specified file ++ */ ++ssize_t ++panlayout_get_stripesize(struct pnfs_layout_hdr *pnfslay) ++{ ++ ssize_t sz, maxsz = -1; ++ struct pnfs_layout_segment *lseg; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ list_for_each_entry(lseg, &pnfslay->segs, fi_list) { ++ int n; ++ struct objlayout_segment *panlseg = LSEG_LD_DATA(lseg); ++ struct pnfs_osd_layout *lo = ++ (struct pnfs_osd_layout *)panlseg->pnfs_osd_layout; ++ struct pnfs_osd_data_map *map = &lo->olo_map; ++ ++ n = map->odm_group_width; ++ if (n == 0) ++ n = map->odm_num_comps / (map->odm_mirror_cnt + 1); ++ ++ switch (map->odm_raid_algorithm) { ++ case PNFS_OSD_RAID_0: ++ break; ++ ++ case PNFS_OSD_RAID_4: ++ case PNFS_OSD_RAID_5: ++ n -= 1; ++ n *= 8; /* FIXME: until we have 2-D coalescing */ ++ break; ++ ++ case PNFS_OSD_RAID_PQ: ++ n -= 2; ++ break; ++ ++ default: ++ BUG_ON(1); ++ } ++ sz = map->odm_stripe_unit * n; ++ if (sz > maxsz) ++ maxsz = sz; ++ } ++ dprintk("%s: Return %Zd\n", __func__, maxsz); ++ return maxsz; ++} ++ ++#define PANLAYOUT_DEF_STRIPE_UNIT (64*1024) ++#define PANLAYOUT_DEF_STRIPE_WIDTH 9 ++#define PANLAYOUT_MAX_STRIPE_WIDTH 11 ++#define PANLAYOUT_MAX_GATHER_STRIPES 8 ++ ++/* ++ * Get the max [rw]size ++ */ ++static ssize_t ++panlayout_get_blocksize(void) ++{ ++ ssize_t sz = (PANLAYOUT_MAX_STRIPE_WIDTH-1) * ++ PANLAYOUT_DEF_STRIPE_UNIT * ++ PANLAYOUT_MAX_GATHER_STRIPES; ++ dprintk("%s: Return %Zd\n", __func__, sz); ++ return sz; ++} ++ ++static struct layoutdriver_policy_operations panlayout_policy_operations = { ++/* ++ * Don't gather across stripes, but rather gather (coalesce) up to ++ * the stripe size. 
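++ * ++ * With the constants above, get_blocksize yields ++ * (PANLAYOUT_MAX_STRIPE_WIDTH - 1) * PANLAYOUT_DEF_STRIPE_UNIT * ++ * PANLAYOUT_MAX_GATHER_STRIPES = 10 * 64 KiB * 8 = 5 MiB, and ++ * get_stripesize widens RAID-5 stripes by a factor of 8 as a stopgap ++ * until 2-D coalescing is available.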
++ * ++ * FIXME: change interface to use merge_align, merge_count ++ */ ++ .flags = PNFS_LAYOUTRET_ON_SETATTR, ++ .get_stripesize = panlayout_get_stripesize, ++ .get_blocksize = panlayout_get_blocksize, ++}; ++ ++#define PNFS_LAYOUT_PANOSD (NFS4_PNFS_PRIVATE_LAYOUT | LAYOUT_OSD2_OBJECTS) ++ ++static struct pnfs_layoutdriver_type panlayout_type = { ++ .id = PNFS_LAYOUT_PANOSD, ++ .name = "PNFS_LAYOUT_PANOSD", ++ .ld_io_ops = &objlayout_io_operations, ++ .ld_policy_ops = &panlayout_policy_operations, ++}; ++ ++MODULE_DESCRIPTION("pNFS Layout Driver for Panasas OSDs"); ++MODULE_AUTHOR("Benny Halevy "); ++MODULE_LICENSE("GPL"); ++ ++static int __init ++panlayout_init(void) ++{ ++ pnfs_client_ops = pnfs_register_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Registered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++ return 0; ++} ++ ++static void __exit ++panlayout_exit(void) ++{ ++ pnfs_unregister_layoutdriver(&panlayout_type); ++ printk(KERN_INFO "%s: Unregistered Panasas OSD pNFS Layout Driver\n", ++ __func__); ++} ++ ++module_init(panlayout_init); ++module_exit(panlayout_exit); +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h.orig linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h +--- linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h.orig 2010-08-31 21:11:40.932137666 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/panfs_shim.h 2010-08-31 21:11:40.932137666 -0400 +@@ -0,0 +1,482 @@ ++/* ++ * panfs_shim.h ++ * ++ * Data types and external function declerations for interfacing with ++ * panfs (Panasas DirectFlow) I/O stack ++ * ++ * Copyright (C) 2007 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * See the file COPYING included with this distribution for more details. 
++ * ++ */ ++ ++#ifndef _PANLAYOUT_PANFS_SHIM_H ++#define _PANLAYOUT_PANFS_SHIM_H ++ ++typedef s8 pan_int8_t; ++typedef u8 pan_uint8_t; ++typedef s16 pan_int16_t; ++typedef u16 pan_uint16_t; ++typedef s32 pan_int32_t; ++typedef u32 pan_uint32_t; ++typedef s64 pan_int64_t; ++typedef u64 pan_uint64_t; ++ ++/* ++ * from pan_base_types.h ++ */ ++typedef pan_uint64_t pan_rpc_none_t; ++typedef pan_uint32_t pan_rpc_arrdim_t; ++typedef pan_uint32_t pan_status_t; ++typedef pan_uint8_t pan_otw_t; ++typedef pan_uint8_t pan_pad_t; ++ ++typedef pan_uint32_t pan_timespec_sec_t; ++typedef pan_uint32_t pan_timespec_nsec_t; ++ ++typedef struct pan_timespec_s pan_timespec_t; ++struct pan_timespec_s { ++ pan_timespec_sec_t ts_sec; ++ pan_timespec_nsec_t ts_nsec; ++}; ++ ++/* ++ * from pan_std_types.h ++ */ ++typedef pan_uint32_t pan_size_t; ++typedef int pan_bool_t; ++ ++/* ++ * from pan_common_error.h ++ */ ++#define PAN_SUCCESS ((pan_status_t)0) ++#define PAN_ERR_IN_PROGRESS ((pan_status_t)55) ++ ++/* ++ * from pan_sg.h ++ */ ++typedef struct pan_sg_entry_s pan_sg_entry_t; ++struct pan_sg_entry_s { ++ void *buffer; /* pointer to memory */ ++ pan_uint32_t chunk_size; /* size of each chunk (bytes) */ ++ pan_sg_entry_t *next; ++}; ++ ++/* ++ * from pan_storage.h ++ */ ++typedef pan_uint64_t pan_stor_dev_id_t; ++typedef pan_uint32_t pan_stor_obj_grp_id_t; ++typedef pan_uint64_t pan_stor_obj_uniq_t; ++typedef pan_uint32_t pan_stor_action_t; ++typedef pan_uint8_t pan_stor_cap_key_t[20]; ++ ++typedef pan_uint8_t pan_stor_key_type_t; ++typedef pan_uint64_t pan_stor_len_t; ++typedef pan_int64_t pan_stor_delta_len_t; ++typedef pan_uint64_t pan_stor_offset_t; ++typedef pan_uint16_t pan_stor_op_t; ++ ++typedef pan_uint16_t pan_stor_sec_level_t; ++ ++struct pan_stor_obj_id_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_obj_uniq_t obj_id; ++ pan_stor_obj_grp_id_t grp_id; ++}; ++ ++typedef struct pan_stor_obj_id_s pan_stor_obj_id_t; ++ ++#define PAN_STOR_OP_NONE ((pan_stor_op_t) 0U) ++#define PAN_STOR_OP_READ ((pan_stor_op_t) 8U) ++#define PAN_STOR_OP_WRITE ((pan_stor_op_t) 9U) ++#define PAN_STOR_OP_APPEND ((pan_stor_op_t) 10U) ++#define PAN_STOR_OP_GETATTR ((pan_stor_op_t) 11U) ++#define PAN_STOR_OP_SETATTR ((pan_stor_op_t) 12U) ++#define PAN_STOR_OP_FLUSH ((pan_stor_op_t) 13U) ++#define PAN_STOR_OP_CLEAR ((pan_stor_op_t) 14U) ++ ++/* ++ * from pan_aggregation_map.h ++ */ ++typedef pan_uint8_t pan_agg_type_t; ++typedef pan_uint64_t pan_agg_map_version_t; ++typedef pan_uint8_t pan_agg_obj_state_t; ++typedef pan_uint8_t pan_agg_comp_state_t; ++typedef pan_uint8_t pan_agg_comp_flag_t; ++ ++#define PAN_AGG_OBJ_STATE_INVALID ((pan_agg_obj_state_t) 0x00) ++#define PAN_AGG_OBJ_STATE_NORMAL ((pan_agg_obj_state_t) 0x01) ++#define PAN_AGG_OBJ_STATE_DEGRADED ((pan_agg_obj_state_t) 0x02) ++#define PAN_AGG_OBJ_STATE_RECONSTRUCT ((pan_agg_obj_state_t) 0x03) ++#define PAN_AGG_OBJ_STATE_COPYBACK ((pan_agg_obj_state_t) 0x04) ++#define PAN_AGG_OBJ_STATE_UNAVAILABLE ((pan_agg_obj_state_t) 0x05) ++#define PAN_AGG_OBJ_STATE_CREATING ((pan_agg_obj_state_t) 0x06) ++#define PAN_AGG_OBJ_STATE_DELETED ((pan_agg_obj_state_t) 0x07) ++#define PAN_AGG_COMP_STATE_INVALID ((pan_agg_comp_state_t) 0x00) ++#define PAN_AGG_COMP_STATE_NORMAL ((pan_agg_comp_state_t) 0x01) ++#define PAN_AGG_COMP_STATE_UNAVAILABLE ((pan_agg_comp_state_t) 0x02) ++#define PAN_AGG_COMP_STATE_COPYBACK ((pan_agg_comp_state_t) 0x03) ++#define PAN_AGG_COMP_F_NONE ((pan_agg_comp_flag_t) 0x00) ++#define PAN_AGG_COMP_F_ATTR_STORING ((pan_agg_comp_flag_t) 0x01) ++#define 
PAN_AGG_COMP_F_OBJ_CORRUPT_OBS ((pan_agg_comp_flag_t) 0x02) ++#define PAN_AGG_COMP_F_TEMP ((pan_agg_comp_flag_t) 0x04) ++ ++struct pan_aggregation_map_s { ++ pan_agg_map_version_t version; ++ pan_agg_obj_state_t avail_state; ++ pan_stor_obj_id_t obj_id; ++}; ++ ++typedef struct pan_aggregation_map_s pan_aggregation_map_t; ++ ++struct pan_agg_comp_obj_s { ++ pan_stor_dev_id_t dev_id; ++ pan_agg_comp_state_t avail_state; ++ pan_agg_comp_flag_t comp_flags; ++}; ++ ++typedef struct pan_agg_comp_obj_s pan_agg_comp_obj_t; ++ ++struct pan_agg_simple_header_s { ++ pan_uint8_t unused; ++}; ++ ++typedef struct pan_agg_simple_header_s pan_agg_simple_header_t; ++ ++struct pan_agg_raid1_header_s { ++ pan_uint16_t num_comps; ++}; ++ ++typedef struct pan_agg_raid1_header_s pan_agg_raid1_header_t; ++ ++struct pan_agg_raid0_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++}; ++ ++typedef struct pan_agg_raid0_header_s pan_agg_raid0_header_t; ++ ++struct pan_agg_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit0; ++ pan_uint32_t stripe_unit1; ++ pan_uint32_t stripe_unit2; ++}; ++ ++typedef struct pan_agg_raid5_left_header_s pan_agg_raid5_left_header_t; ++ ++typedef struct pan_agg_grp_raid5_left_header_s pan_agg_grp_raid5_left_header_t; ++ ++struct pan_agg_grp_raid5_left_header_s { ++ pan_uint16_t num_comps; ++ pan_uint32_t stripe_unit; ++ pan_uint16_t rg_width; ++ pan_uint16_t rg_depth; ++ pan_uint8_t group_layout_policy; ++}; ++ ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_INVALID ((pan_uint8_t) 0x00) ++#define PAN_AGG_GRP_RAID5_LEFT_POLICY_ROUND_ROBIN ((pan_uint8_t) 0x01) ++ ++#define PAN_AGG_NULL_MAP ((pan_agg_type_t) 0x00) ++#define PAN_AGG_SIMPLE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_RAID1 ((pan_agg_type_t) 0x02) ++#define PAN_AGG_RAID0 ((pan_agg_type_t) 0x03) ++#define PAN_AGG_RAID5_LEFT ((pan_agg_type_t) 0x04) ++#define PAN_AGG_GRP_RAID5_LEFT ((pan_agg_type_t) 0x06) ++#define PAN_AGG_MINTYPE ((pan_agg_type_t) 0x01) ++#define PAN_AGG_MAXTYPE ((pan_agg_type_t) 0x06) ++ ++struct pan_agg_layout_hdr_s { ++ pan_agg_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_uint64_t null; ++ pan_agg_simple_header_t simple; ++ pan_agg_raid1_header_t raid1; ++ pan_agg_raid0_header_t raid0; ++ pan_agg_raid5_left_header_t raid5_left; ++ pan_agg_grp_raid5_left_header_t grp_raid5_left; ++ } hdr; ++}; ++ ++typedef struct pan_agg_layout_hdr_s pan_agg_layout_hdr_t; ++ ++struct pan_agg_comp_obj_a_s { ++ pan_rpc_arrdim_t size; ++ pan_agg_comp_obj_t *data; ++}; ++typedef struct pan_agg_comp_obj_a_s pan_agg_comp_obj_a; ++ ++struct pan_agg_full_map_s { ++ pan_aggregation_map_t map_hdr; ++ pan_agg_layout_hdr_t layout_hdr; ++ pan_agg_comp_obj_a components; ++}; ++ ++typedef struct pan_agg_full_map_s pan_agg_full_map_t; ++ ++/* ++ * from pan_obsd_rpc_types.h ++ */ ++typedef pan_uint8_t pan_obsd_security_key_a[16]; ++ ++typedef pan_uint8_t pan_obsd_capability_key_a[20]; ++ ++typedef pan_uint8_t pan_obsd_key_holder_id_t; ++ ++#define PAN_OBSD_KEY_HOLDER_BASIS_KEY ((pan_obsd_key_holder_id_t) 0x01) ++#define PAN_OBSD_KEY_HOLDER_CAP_KEY ((pan_obsd_key_holder_id_t) 0x02) ++ ++struct pan_obsd_key_holder_s { ++ pan_obsd_key_holder_id_t select; ++ pan_pad_t pad[3]; ++ union { ++ pan_obsd_security_key_a basis_key; ++ pan_obsd_capability_key_a cap_key; ++ } key; ++}; ++ ++typedef struct pan_obsd_key_holder_s pan_obsd_key_holder_t; ++ ++/* ++ * from pan_sm_sec.h ++ */ ++typedef pan_uint8_t pan_sm_sec_type_t; ++typedef pan_uint8_t pan_sm_sec_otw_allo_mode_t; ++ ++struct 
pan_obsd_capability_generic_otw_t_s { ++ pan_rpc_arrdim_t size; ++ pan_uint8_t *data; ++}; ++typedef struct pan_obsd_capability_generic_otw_t_s ++ pan_obsd_capability_generic_otw_t; ++ ++struct pan_sm_sec_obsd_s { ++ pan_obsd_key_holder_t key; ++ pan_obsd_capability_generic_otw_t cap_otw; ++ pan_sm_sec_otw_allo_mode_t allo_mode; ++}; ++ ++typedef struct pan_sm_sec_obsd_s pan_sm_sec_obsd_t; ++ ++struct pan_sm_sec_s { ++ pan_sm_sec_type_t type; ++ pan_pad_t pad[3]; ++ union { ++ pan_rpc_none_t none; ++ pan_sm_sec_obsd_t obsd; ++ } variant; ++}; ++ ++typedef struct pan_sm_sec_s pan_sm_sec_t; ++ ++struct pan_sm_sec_a_s { ++ pan_rpc_arrdim_t size; ++ pan_sm_sec_t *data; ++}; ++typedef struct pan_sm_sec_a_s pan_sm_sec_a; ++typedef pan_otw_t *pan_sm_sec_otw_t; ++ ++/* ++ * from pan_sm_types.h ++ */ ++typedef pan_uint64_t pan_sm_cap_handle_t; ++ ++struct pan_sm_map_cap_s { ++ pan_agg_full_map_t full_map; ++ pan_stor_offset_t offset; ++ pan_stor_len_t length; ++ pan_sm_sec_a secs; ++ pan_sm_cap_handle_t handle; ++ pan_timespec_t expiration_time; ++ pan_stor_action_t action_mask; ++ pan_uint32_t flags; ++}; ++ ++typedef struct pan_sm_map_cap_s pan_sm_map_cap_t; ++ ++/* ++ * from pan_sm_ops.h ++ */ ++typedef pan_rpc_none_t pan_sm_cache_ptr_t; ++ ++/* ++ * from pan_sam_api.h ++ */ ++typedef pan_uint32_t pan_sam_access_flags_t; ++ ++typedef struct pan_sam_dev_error_s pan_sam_dev_error_t; ++struct pan_sam_dev_error_s { ++ pan_stor_dev_id_t dev_id; ++ pan_stor_op_t stor_op; ++ pan_status_t error; ++}; ++ ++typedef struct pan_sam_ext_status_s pan_sam_ext_status_t; ++struct pan_sam_ext_status_s { ++ pan_uint32_t available; ++ pan_uint32_t size; ++ pan_sam_dev_error_t *errors; ++}; ++ ++enum pan_sam_rpc_sec_sel_e { ++ PAN_SAM_RPC_SEC_DEFAULT, ++ PAN_SAM_RPC_SEC_ATLEAST, ++ PAN_SAM_RPC_SEC_EXACTLY ++}; ++typedef enum pan_sam_rpc_sec_sel_e pan_sam_rpc_sec_sel_t; ++ ++typedef struct pan_sam_obj_sec_s pan_sam_obj_sec_t; ++struct pan_sam_obj_sec_s { ++ pan_stor_sec_level_t min_security; ++ pan_sm_map_cap_t *map_ccaps; ++}; ++ ++typedef struct pan_sam_rpc_sec_s pan_sam_rpc_sec_t; ++struct pan_sam_rpc_sec_s { ++ pan_sam_rpc_sec_sel_t selector; ++}; ++ ++typedef struct pan_sam_read_args_s pan_sam_read_args_t; ++struct pan_sam_read_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ void *return_attr; ++ void *checksum; ++ pan_stor_offset_t offset; ++ pan_uint16_t sm_options; ++ void *callout; ++ void *callout_arg; ++}; ++ ++typedef struct pan_sam_read_res_s pan_sam_read_res_t; ++struct pan_sam_read_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ void *attr; ++ void *checksum; ++}; ++ ++typedef void (*pan_sam_read_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p, ++ pan_status_t status); ++ ++#define PAN_SAM_ACCESS_NONE 0x0000 ++#define PAN_SAM_ACCESS_BYPASS_TIMESTAMP 0x0020 ++ ++typedef struct pan_sam_write_args_s pan_sam_write_args_t; ++struct pan_sam_write_args_s { ++ pan_stor_obj_id_t obj_id; ++ pan_sm_cache_ptr_t obj_ent; ++ pan_stor_offset_t offset; ++ void *attr; ++ void *return_attr; ++}; ++ ++typedef struct pan_sam_write_res_s pan_sam_write_res_t; ++struct pan_sam_write_res_s { ++ pan_status_t result; ++ pan_sam_ext_status_t ext_status; ++ pan_stor_len_t length; ++ pan_stor_delta_len_t delta_capacity_used; ++ pan_bool_t parity_dirty; ++ void *attr; ++}; ++ ++typedef void (*pan_sam_write_cb_t)( ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p, ++ pan_status_t status); ++ ++/* ++ * from 
pan_mgr_types.h ++ */ ++#define PAN_MGR_ID_TYPE_SHIFT 56 ++#define PAN_MGR_ID_TYPE_MASK ((pan_mgr_id_t)18374686479671623680ULL) ++#define PAN_MGR_ID_UNIQ_MASK ((pan_mgr_id_t)72057594037927935ULL) ++ ++typedef pan_uint16_t pan_mgr_type_t; ++typedef pan_uint64_t pan_mgr_id_t; ++ ++#define PAN_MGR_SM ((pan_mgr_type_t) 2U) ++#define PAN_MGR_OBSD ((pan_mgr_type_t) 6U) ++ ++/* ++ * from pan_mgr_types_c.h ++ */ ++#define pan_mgr_id_construct_artificial(_mgr_type_, _mgr_uniq_, _mgr_id_p_) { \ ++ pan_mgr_id_t _id1, _id2; \ ++\ ++ _id1 = (_mgr_type_); \ ++ _id1 <<= PAN_MGR_ID_TYPE_SHIFT; \ ++ _id1 &= PAN_MGR_ID_TYPE_MASK; \ ++ _id2 = (_mgr_uniq_); \ ++ _id2 &= PAN_MGR_ID_UNIQ_MASK; \ ++ _id1 |= _id2; \ ++ *(_mgr_id_p_) = _id1; \ ++} ++ ++/* ++ * from pan_storage_c.h ++ */ ++#define pan_stor_is_device_id_an_obsd_id(_device_id_) \ ++ ((((_device_id_) & PAN_MGR_ID_TYPE_MASK) >> PAN_MGR_ID_TYPE_SHIFT) \ ++ == PAN_MGR_OBSD) ++ ++/* ++ * pnfs_shim internal definitions ++ */ ++ ++struct panfs_shim_io_state { ++ struct objlayout_io_state ol_state; ++ ++ pan_sg_entry_t *sg_list; ++ pan_sam_obj_sec_t obj_sec; ++ void *ucreds; ++ union { ++ struct { ++ pan_sam_read_args_t args; ++ pan_sam_read_res_t res; ++ } read; ++ struct { ++ pan_sam_write_args_t args; ++ pan_sam_write_res_t res; ++ } write; ++ } u; ++}; ++ ++#endif /* _PANLAYOUT_PANFS_SHIM_H */ +diff -up linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +--- linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c.orig 2010-08-31 21:11:40.933035641 -0400 ++++ linux-2.6.35.noarch/fs/nfs/objlayout/pnfs_osd_xdr_cli.c 2010-08-31 21:11:40.933035641 -0400 +@@ -0,0 +1,435 @@ ++/* ++ * pnfs_osd_xdr.c ++ * ++ * Object-Based pNFS Layout XDR layer ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS_LD ++ ++/* ++ * The following implementation is based on these Internet Drafts: ++ * ++ * draft-ietf-nfsv4-minorversion-21 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_objid(u32 *p, struct pnfs_osd_objid *objid) ++{ ++ COPYMEM(objid->oid_device_id.data, sizeof(objid->oid_device_id.data)); ++ READ64(objid->oid_partition_id); ++ READ64(objid->oid_object_id); ++ return p; ++} ++ ++static inline u32 * ++pnfs_osd_xdr_decode_opaque_cred(u32 *p, ++ struct pnfs_osd_opaque_cred *opaque_cred) ++{ ++ READ32(opaque_cred->cred_len); ++ COPYMEM(opaque_cred->cred, opaque_cred->cred_len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_object_cred { ++ * struct pnfs_osd_objid oc_object_id; ++ * u32 oc_osd_version; ++ * u32 oc_cap_key_sec; ++ * struct pnfs_osd_opaque_cred oc_cap_key ++ * struct pnfs_osd_opaque_cred oc_cap; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_object_cred(u32 *p, struct pnfs_osd_object_cred *comp, ++ u8 **credp) ++{ ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_objid(p, &comp->oc_object_id); ++ READ32(comp->oc_osd_version); ++ READ32(comp->oc_cap_key_sec); ++ ++ cred = *credp; ++ comp->oc_cap_key.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap_key); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap_key.cred_len)); ++ comp->oc_cap.cred = cred; ++ p = pnfs_osd_xdr_decode_opaque_cred(p, &comp->oc_cap); ++ cred = (u8 *)((u32 *)cred + XDR_QUADLEN(comp->oc_cap.cred_len)); ++ *credp = cred; ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_data_map { ++ * u32 odm_num_comps; ++ * u64 odm_stripe_unit; ++ * u32 odm_group_width; ++ * u32 odm_group_depth; ++ * u32 odm_mirror_cnt; ++ * u32 odm_raid_algorithm; ++ * }; ++ */ ++static inline u32 * ++pnfs_osd_xdr_decode_data_map(u32 *p, struct pnfs_osd_data_map *data_map) ++{ ++ READ32(data_map->odm_num_comps); ++ READ64(data_map->odm_stripe_unit); ++ READ32(data_map->odm_group_width); ++ READ32(data_map->odm_group_depth); ++ READ32(data_map->odm_mirror_cnt); ++ READ32(data_map->odm_raid_algorithm); ++ dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " ++ "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", ++ __func__, ++ data_map->odm_num_comps, ++ (unsigned long long)data_map->odm_stripe_unit, ++ data_map->odm_group_width, ++ data_map->odm_group_depth, ++ data_map->odm_mirror_cnt, ++ data_map->odm_raid_algorithm); ++ return p; ++} ++ ++struct pnfs_osd_layout * ++pnfs_osd_xdr_decode_layout(struct pnfs_osd_layout *layout, u32 *p) ++{ ++ int i; ++ u32 *start = p; ++ struct pnfs_osd_object_cred *comp; ++ u8 *cred; ++ ++ p = pnfs_osd_xdr_decode_data_map(p, &layout->olo_map); ++ READ32(layout->olo_comps_index); ++ READ32(layout->olo_num_comps); ++ layout->olo_comps = (struct pnfs_osd_object_cred *)(layout + 1); ++ comp 
= layout->olo_comps; ++ cred = (u8 *)(comp + layout->olo_num_comps); ++ dprintk("%s: comps_index=%u num_comps=%u\n", ++ __func__, layout->olo_comps_index, layout->olo_num_comps); ++ for (i = 0; i < layout->olo_num_comps; i++) { ++ p = pnfs_osd_xdr_decode_object_cred(p, comp, &cred); ++ dprintk("%s: comp[%d]=dev(%llx:%llx) par=0x%llx obj=0x%llx " ++ "key_len=%u cap_len=%u\n", ++ __func__, i, ++ _DEVID_LO(&comp->oc_object_id.oid_device_id), ++ _DEVID_HI(&comp->oc_object_id.oid_device_id), ++ comp->oc_object_id.oid_partition_id, ++ comp->oc_object_id.oid_object_id, ++ comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); ++ comp++; ++ } ++ dprintk("%s: xdr_size=%Zd end=%p in_core_size=%Zd\n", __func__, ++ (char *)p - (char *)start, cred, (char *)cred - (char *)layout); ++ return layout; ++} ++ ++/* ++ * Get Device Information Decoding ++ * ++ * Note: since Device Information is currently done synchronously, most ++ * of the actual fields are left inside the rpc buffer and are only ++ * pointed to by the pnfs_osd_deviceaddr members. So the read buffer ++ * should not be freed while the returned information is in use. ++ */ ++ ++u32 *__xdr_read_calc_nfs4_string( ++ u32 *p, struct nfs4_string *str, u8 **freespace) ++{ ++ u32 len; ++ char *data; ++ bool need_copy; ++ ++ READ32(len); ++ data = (char *)p; ++ ++ if (data[len]) { /* Not null terminated we'll need extra space */ ++ data = *freespace; ++ *freespace += len + 1; ++ need_copy = true; ++ } else { ++ need_copy = false; ++ } ++ ++ if (str) { ++ str->len = len; ++ str->data = data; ++ if (need_copy) { ++ memcpy(data, p, len); ++ data[len] = 0; ++ } ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++u32 *__xdr_read_calc_u8_opaque( ++ u32 *p, struct nfs4_string *str) ++{ ++ u32 len; ++ ++ READ32(len); ++ ++ if (str) { ++ str->len = len; ++ str->data = (char *)p; ++ } ++ ++ p += XDR_QUADLEN(len); ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetid { ++ * u32 oti_type; ++ * struct nfs4_string oti_scsi_device_id; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetid( ++ u32 *p, struct pnfs_osd_targetid* targetid, u8 **freespace) ++{ ++ u32 oti_type; ++ ++ READ32(oti_type); ++ if (targetid) ++ targetid->oti_type = oti_type; ++ ++ switch (oti_type) { ++ case OBJ_TARGET_SCSI_NAME: ++ case OBJ_TARGET_SCSI_DEVICE_ID: ++ p = __xdr_read_calc_u8_opaque(p, ++ targetid ? &targetid->oti_scsi_device_id : NULL); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_net_addr { ++ * struct nfs4_string r_netid; ++ * struct nfs4_string r_addr; ++ * }; ++ */ ++u32 *__xdr_read_calc_net_addr( ++ u32 *p, struct pnfs_osd_net_addr* netaddr, u8 **freespace) ++{ ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_netid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_nfs4_string(p, ++ netaddr ? &netaddr->r_addr : NULL, ++ freespace); ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_targetaddr { ++ * u32 ota_available; ++ * struct pnfs_osd_net_addr ota_netaddr; ++ * }; ++ */ ++u32 *__xdr_read_calc_targetaddr( ++ u32 *p, struct pnfs_osd_targetaddr *targetaddr, u8 **freespace) ++{ ++ u32 ota_available; ++ ++ READ32(ota_available); ++ if (targetaddr) ++ targetaddr->ota_available = ota_available; ++ ++ if (ota_available) { ++ p = __xdr_read_calc_net_addr(p, ++ targetaddr ? 
&targetaddr->ota_netaddr : NULL, ++ freespace); ++ } ++ ++ return p; ++} ++ ++/* ++ * struct pnfs_osd_deviceaddr { ++ * struct pnfs_osd_targetid oda_targetid; ++ * struct pnfs_osd_targetaddr oda_targetaddr; ++ * u8 oda_lun[8]; ++ * struct nfs4_string oda_systemid; ++ * struct pnfs_osd_object_cred oda_root_obj_cred; ++ * struct nfs4_string oda_osdname; ++ * }; ++ */ ++u32 *__xdr_read_calc_deviceaddr( ++ u32 *p, struct pnfs_osd_deviceaddr *deviceaddr, u8 **freespace) ++{ ++ p = __xdr_read_calc_targetid(p, ++ deviceaddr ? &deviceaddr->oda_targetid : NULL, ++ freespace); ++ ++ p = __xdr_read_calc_targetaddr(p, ++ deviceaddr ? &deviceaddr->oda_targetaddr : NULL, ++ freespace); ++ ++ if (deviceaddr) ++ COPYMEM(deviceaddr->oda_lun, sizeof(deviceaddr->oda_lun)); ++ else ++ p += XDR_QUADLEN(sizeof(deviceaddr->oda_lun)); ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_systemid : NULL); ++ ++ if (deviceaddr) { ++ p = pnfs_osd_xdr_decode_object_cred(p, ++ &deviceaddr->oda_root_obj_cred, freespace); ++ } else { ++ *freespace += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ ++ p = __xdr_read_calc_u8_opaque(p, ++ deviceaddr ? &deviceaddr->oda_osdname : NULL); ++ ++ return p; ++} ++ ++size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p) ++{ ++ u8 *null_freespace = NULL; ++ size_t sz; ++ ++ __xdr_read_calc_deviceaddr(p, NULL, &null_freespace); ++ sz = sizeof(struct pnfs_osd_deviceaddr) + (size_t)null_freespace; ++ ++ return sz; ++} ++ ++void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p) ++{ ++ u8 *freespace = (u8 *)(deviceaddr + 1); ++ ++ __xdr_read_calc_deviceaddr(p, deviceaddr, &freespace); ++} ++ ++/* ++ * struct pnfs_osd_layoutupdate { ++ * u32 dsu_valid; ++ * s64 dsu_delta; ++ * u32 olu_ioerr_flag; ++ * }; ++ */ ++int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou) ++{ ++ __be32 *p = xdr_reserve_space(xdr, 16); ++ ++ if (!p) ++ return -E2BIG; ++ ++ *p++ = cpu_to_be32(lou->dsu_valid); ++ if (lou->dsu_valid) ++ p = xdr_encode_hyper(p, lou->dsu_delta); ++ *p++ = cpu_to_be32(lou->olu_ioerr_flag); ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_objid { ++ * struct pnfs_deviceid oid_device_id; ++ * u64 oid_partition_id; ++ * u64 oid_object_id; ++ */ ++static inline int pnfs_osd_xdr_encode_objid(struct xdr_stream *xdr, ++ struct pnfs_osd_objid *object_id) ++{ ++ __be32 *p; ++ ++ p = xdr_reserve_space(xdr, 32); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, ++ sizeof(object_id->oid_device_id.data)); ++ p = xdr_encode_hyper(p, object_id->oid_partition_id); ++ p = xdr_encode_hyper(p, object_id->oid_object_id); ++ ++ return 0; ++} ++ ++/* ++ * struct pnfs_osd_ioerr { ++ * struct pnfs_osd_objid oer_component; ++ * u64 oer_comp_offset; ++ * u64 oer_comp_length; ++ * u32 oer_iswrite; ++ * u32 oer_errno; ++ * }; ++ */ ++int pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, ++ struct pnfs_osd_ioerr *ioerr) ++{ ++ __be32 *p; ++ int ret; ++ ++ ret = pnfs_osd_xdr_encode_objid(xdr, &ioerr->oer_component); ++ if (ret) ++ return ret; ++ ++ p = xdr_reserve_space(xdr, 24); ++ if (!p) ++ return -E2BIG; ++ ++ p = xdr_encode_hyper(p, ioerr->oer_comp_offset); ++ p = xdr_encode_hyper(p, ioerr->oer_comp_length); ++ *p++ = cpu_to_be32(ioerr->oer_iswrite); ++ *p = cpu_to_be32(ioerr->oer_errno); ++ ++ return 0; ++} +diff -up linux-2.6.35.noarch/fs/nfs/pagelist.c.orig linux-2.6.35.noarch/fs/nfs/pagelist.c +--- 
linux-2.6.35.noarch/fs/nfs/pagelist.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/pagelist.c 2010-08-31 21:11:40.934160524 -0400 +@@ -20,6 +20,7 @@ + #include + + #include "internal.h" ++#include "pnfs.h" + + static struct kmem_cache *nfs_page_cachep; + +@@ -56,7 +57,8 @@ nfs_page_free(struct nfs_page *p) + struct nfs_page * + nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, + struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + +@@ -79,7 +81,11 @@ nfs_create_request(struct nfs_open_conte + req->wb_pgbase = offset; + req->wb_bytes = count; + req->wb_context = get_nfs_open_context(ctx); ++ req->wb_lock_context = nfs_get_lock_context(ctx); + kref_init(&req->wb_kref); ++ req->wb_lseg = lseg; ++ if (lseg) ++ get_lseg(lseg); + return req; + } + +@@ -141,18 +147,26 @@ void nfs_clear_request(struct nfs_page * + { + struct page *page = req->wb_page; + struct nfs_open_context *ctx = req->wb_context; ++ struct nfs_lock_context *l_ctx = req->wb_lock_context; + + if (page != NULL) { + page_cache_release(page); + req->wb_page = NULL; + } ++ if (l_ctx != NULL) { ++ nfs_put_lock_context(l_ctx); ++ req->wb_lock_context = NULL; ++ } + if (ctx != NULL) { + put_nfs_open_context(ctx); + req->wb_context = NULL; + } ++ if (req->wb_lseg != NULL) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } + } + +- + /** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release +@@ -231,11 +245,12 @@ void nfs_pageio_init(struct nfs_pageio_d + * Return 'true' if this is the case, else return 'false'. + */ + static int nfs_can_coalesce_requests(struct nfs_page *prev, +- struct nfs_page *req) ++ struct nfs_page *req, ++ struct nfs_pageio_descriptor *pgio) + { + if (req->wb_context->cred != prev->wb_context->cred) + return 0; +- if (req->wb_context->lockowner != prev->wb_context->lockowner) ++ if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) + return 0; + if (req->wb_context->state != prev->wb_context->state) + return 0; +@@ -245,6 +260,12 @@ static int nfs_can_coalesce_requests(str + return 0; + if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return 0; ++ if (req->wb_lseg != prev->wb_lseg) ++ return 0; ++#ifdef CONFIG_NFS_V4_1 ++ if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) ++ return 0; ++#endif /* CONFIG_NFS_V4_1 */ + return 1; + } + +@@ -277,7 +298,7 @@ static int nfs_pageio_do_add_request(str + if (newlen > desc->pg_bsize) + return 0; + prev = nfs_list_entry(desc->pg_list.prev); +- if (!nfs_can_coalesce_requests(prev, req)) ++ if (!nfs_can_coalesce_requests(prev, req, desc)) + return 0; + } else + desc->pg_base = req->wb_pgbase; +@@ -366,6 +387,7 @@ void nfs_pageio_cond_complete(struct nfs + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * @tag: tag to scan for ++ * @use_pnfs: will be set TRUE if commit needs to be handled by layout driver + * + * Moves elements from one of the inode request lists. 
+ * If the number of requests is set to 0, the entire address_space +@@ -375,7 +397,7 @@ void nfs_pageio_cond_complete(struct nfs + */ + int nfs_scan_list(struct nfs_inode *nfsi, + struct list_head *dst, pgoff_t idx_start, +- unsigned int npages, int tag) ++ unsigned int npages, int tag, int *use_pnfs) + { + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; +@@ -406,6 +428,8 @@ int nfs_scan_list(struct nfs_inode *nfsi + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, tag); + nfs_list_add_request(req, dst); ++ if (req->wb_lseg) ++ *use_pnfs = 1; + res++; + if (res == INT_MAX) + goto out; +diff -up linux-2.6.35.noarch/fs/nfs/pnfs.c.orig linux-2.6.35.noarch/fs/nfs/pnfs.c +--- linux-2.6.35.noarch/fs/nfs/pnfs.c.orig 2010-08-31 21:11:40.936140109 -0400 ++++ linux-2.6.35.noarch/fs/nfs/pnfs.c 2010-08-31 21:11:40.936140109 -0400 +@@ -0,0 +1,2037 @@ ++/* ++ * linux/fs/nfs/pnfs.c ++ * ++ * pNFS functions to call and manage layout drivers. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Dean Hildebrand ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "internal.h" ++#include "nfs4_fs.h" ++#include "pnfs.h" ++ ++#define NFSDBG_FACILITY NFSDBG_PNFS ++ ++#define MIN_POOL_LC (4) ++ ++static int pnfs_initialized; ++ ++static void pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range); ++static inline void get_layout(struct pnfs_layout_hdr *lo); ++ ++/* Locking: ++ * ++ * pnfs_spinlock: ++ * protects pnfs_modules_tbl. 
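++ *
++ * (Editor's note -- illustrative sketch, not part of the original
++ * patch: every update of pnfs_modules_tbl below takes that lock
++ * around the list operation, e.g.
++ *
++ *	spin_lock(&pnfs_spinlock);
++ *	list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl);
++ *	spin_unlock(&pnfs_spinlock);
++ *
++ * as done in pnfs_register_layoutdriver() and
++ * pnfs_unregister_layoutdriver().)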
++ */
++static spinlock_t pnfs_spinlock = __SPIN_LOCK_UNLOCKED(pnfs_spinlock);
++
++/*
++ * pnfs_modules_tbl holds all pnfs modules
++ */
++static struct list_head pnfs_modules_tbl;
++static struct kmem_cache *pnfs_cachep;
++static mempool_t *pnfs_layoutcommit_mempool;
++
++static inline struct nfs4_layoutcommit_data *pnfs_layoutcommit_alloc(void)
++{
++	struct nfs4_layoutcommit_data *p =
++		mempool_alloc(pnfs_layoutcommit_mempool, GFP_NOFS);
++	if (p)
++		memset(p, 0, sizeof(*p));
++
++	return p;
++}
++
++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *p)
++{
++	mempool_free(p, pnfs_layoutcommit_mempool);
++}
++
++/*
++ * struct pnfs_module - One per pNFS device module.
++ */
++struct pnfs_module {
++	struct pnfs_layoutdriver_type *pnfs_ld_type;
++	struct list_head pnfs_tblid;
++};
++
++int
++pnfs_initialize(void)
++{
++	INIT_LIST_HEAD(&pnfs_modules_tbl);
++
++	pnfs_cachep = kmem_cache_create("nfs4_layoutcommit_data",
++					sizeof(struct nfs4_layoutcommit_data),
++					0, SLAB_HWCACHE_ALIGN, NULL);
++	if (pnfs_cachep == NULL)
++		return -ENOMEM;
++
++	pnfs_layoutcommit_mempool = mempool_create(MIN_POOL_LC,
++						   mempool_alloc_slab,
++						   mempool_free_slab,
++						   pnfs_cachep);
++	if (pnfs_layoutcommit_mempool == NULL) {
++		kmem_cache_destroy(pnfs_cachep);
++		return -ENOMEM;
++	}
++
++	pnfs_initialized = 1;
++	return 0;
++}
++
++void pnfs_uninitialize(void)
++{
++	mempool_destroy(pnfs_layoutcommit_mempool);
++	kmem_cache_destroy(pnfs_cachep);
++}
++
++/* search pnfs_modules_tbl for the right pnfs module */
++static int
++find_pnfs(u32 id, struct pnfs_module **module) {
++	struct pnfs_module *local = NULL;
++
++	dprintk("PNFS: %s: Searching for %u\n", __func__, id);
++	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) {
++		if (local->pnfs_ld_type->id == id) {
++			*module = local;
++			return(1);
++		}
++	}
++	return 0;
++}
++
++/* Set cred to indicate we require a layoutcommit
++ * If we don't even have a layout, we don't need to commit it.
++ */
++void
++pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx)
++{
++	dprintk("%s: has_layout=%d ctx=%p\n", __func__, has_layout(nfsi), ctx);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	if (has_layout(nfsi) &&
++	    !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state)) {
++		nfsi->layout->cred = get_rpccred(ctx->state->owner->so_cred);
++		__set_bit(NFS_INO_LAYOUTCOMMIT,
++			  &nfsi->layout->state);
++		nfsi->change_attr++;
++		spin_unlock(&nfsi->vfs_inode.i_lock);
++		dprintk("%s: Set layoutcommit\n", __func__);
++		return;
++	}
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++}
++
++/* Update last_write_offset for layoutcommit.
++ * TODO: We should only use committed extents, but the current nfs
++ * implementation does not calculate the written range in nfs_commit_done.
++ * We therefore update this field in writeback_done.
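++ *
++ * (Editor's worked example for the code below: a 4096-byte write at
++ * offset 8192 gives end_pos = 8192 + 4096 - 1 = 12287, so the range
++ * later covered by layoutcommit is the inclusive byte span
++ * [write_begin_pos, write_end_pos] = [8192, 12287].)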
++ */
++void
++pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent)
++{
++	loff_t end_pos;
++
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	if (offset < nfsi->layout->write_begin_pos)
++		nfsi->layout->write_begin_pos = offset;
++	end_pos = offset + extent - 1; /* I'm being inclusive */
++	if (end_pos > nfsi->layout->write_end_pos)
++		nfsi->layout->write_end_pos = end_pos;
++	dprintk("%s: Wrote %lu@%lu bpos %lu, epos: %lu\n",
++		__func__,
++		(unsigned long) extent,
++		(unsigned long) offset ,
++		(unsigned long) nfsi->layout->write_begin_pos,
++		(unsigned long) nfsi->layout->write_end_pos);
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++}
++
++/* Uninitialize a mountpoint in a layout driver */
++void
++unmount_pnfs_layoutdriver(struct nfs_server *nfss)
++{
++	if (PNFS_EXISTS_LDIO_OP(nfss, uninitialize_mountpoint))
++		nfss->pnfs_curr_ld->ld_io_ops->uninitialize_mountpoint(nfss);
++}
++
++/*
++ * Set the server pnfs module to the first registered pnfs_type.
++ * Only one pNFS layout driver is supported.
++ */
++void
++set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
++		      u32 id)
++{
++	struct pnfs_module *mod = NULL;
++
++	if (server->pnfs_curr_ld)
++		return;
++
++	if (!find_pnfs(id, &mod)) {
++		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
++		find_pnfs(id, &mod);
++	}
++
++	if (!mod) {
++		dprintk("%s: No pNFS module found for %u. ", __func__, id);
++		goto out_err;
++	}
++
++	server->pnfs_curr_ld = mod->pnfs_ld_type;
++	if (mod->pnfs_ld_type->ld_io_ops->initialize_mountpoint(
++		server, mntfh)) {
++		printk(KERN_ERR "%s: Error initializing mount point "
++		       "for layout driver %u. ", __func__, id);
++		goto out_err;
++	}
++
++	dprintk("%s: pNFS module for %u set\n", __func__, id);
++	return;
++
++out_err:
++	dprintk("Using NFSv4 I/O\n");
++	server->pnfs_curr_ld = NULL;
++}
++
++/* Allow I/O module to set its functions structure */
++struct pnfs_client_operations*
++pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
++{
++	struct pnfs_module *pnfs_mod;
++	struct layoutdriver_io_operations *io_ops = ld_type->ld_io_ops;
++
++	if (!pnfs_initialized) {
++		printk(KERN_ERR "%s Registration failure. 
" ++ "pNFS not initialized.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops || !io_ops->alloc_layout || !io_ops->free_layout) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_layout and free_layout.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->alloc_lseg || !io_ops->free_lseg) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "alloc_lseg and free_lseg.\n", __func__); ++ return NULL; ++ } ++ ++ if (!io_ops->read_pagelist || !io_ops->write_pagelist || ++ !io_ops->commit) { ++ printk(KERN_ERR "%s Layout driver must provide " ++ "read_pagelist, write_pagelist, and commit.\n", ++ __func__); ++ return NULL; ++ } ++ ++ pnfs_mod = kmalloc(sizeof(struct pnfs_module), GFP_KERNEL); ++ if (pnfs_mod != NULL) { ++ dprintk("%s Registering id:%u name:%s\n", ++ __func__, ++ ld_type->id, ++ ld_type->name); ++ pnfs_mod->pnfs_ld_type = ld_type; ++ INIT_LIST_HEAD(&pnfs_mod->pnfs_tblid); ++ ++ spin_lock(&pnfs_spinlock); ++ list_add(&pnfs_mod->pnfs_tblid, &pnfs_modules_tbl); ++ spin_unlock(&pnfs_spinlock); ++ } ++ ++ return &pnfs_ops; ++} ++ ++/* Allow I/O module to set its functions structure */ ++void ++pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) ++{ ++ struct pnfs_module *pnfs_mod; ++ ++ if (find_pnfs(ld_type->id, &pnfs_mod)) { ++ dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); ++ spin_lock(&pnfs_spinlock); ++ list_del(&pnfs_mod->pnfs_tblid); ++ spin_unlock(&pnfs_spinlock); ++ kfree(pnfs_mod); ++ } ++} ++ ++/* ++ * pNFS client layout cache ++ */ ++#if defined(CONFIG_SMP) ++#define BUG_ON_UNLOCKED_INO(ino) \ ++ BUG_ON(!spin_is_locked(&ino->i_lock)) ++#define BUG_ON_UNLOCKED_LO(lo) \ ++ BUG_ON_UNLOCKED_INO(PNFS_INODE(lo)) ++#else /* CONFIG_SMP */ ++#define BUG_ON_UNLOCKED_INO(lo) do {} while (0) ++#define BUG_ON_UNLOCKED_LO(lo) do {} while (0) ++#endif /* CONFIG_SMP */ ++ ++static inline void ++get_layout(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ lo->refcount++; ++} ++ ++static inline void ++put_layout_locked(struct pnfs_layout_hdr *lo) ++{ ++ BUG_ON_UNLOCKED_LO(lo); ++ BUG_ON(lo->refcount <= 0); ++ ++ lo->refcount--; ++ if (!lo->refcount) { ++ struct layoutdriver_io_operations *io_ops = PNFS_LD_IO_OPS(lo); ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ dprintk("%s: freeing layout cache %p\n", __func__, lo); ++ WARN_ON(!list_empty(&lo->layouts)); ++ io_ops->free_layout(lo); ++ nfsi->layout = NULL; ++ } ++} ++ ++void ++put_layout(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ put_layout_locked(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++ ++} ++ ++void ++pnfs_layout_release(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct nfs_inode *nfsi = PNFS_NFS_INODE(lo); ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ if (range) ++ pnfs_free_layout(lo, range); ++ /* ++ * Matched in _pnfs_update_layout for layoutget ++ * and by get_layout in _pnfs_return_layout for layoutreturn ++ */ ++ put_layout_locked(lo); ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ wake_up_all(&nfsi->lo_waitq); ++} ++ ++void ++pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ lo = nfsi->layout; ++ if (lo) { ++ pnfs_free_layout(lo, &range); ++ WARN_ON(!list_empty(&nfsi->layout->segs)); ++ WARN_ON(!list_empty(&nfsi->layout->layouts)); ++ ++ if (nfsi->layout->refcount != 1) ++ printk(KERN_WARNING "%s: layout refcount not=1 
%d\n",
++			       __func__, nfsi->layout->refcount);
++		WARN_ON(nfsi->layout->refcount != 1);
++
++		/* Matched by refcount set to 1 in alloc_init_layout */
++		put_layout_locked(lo);
++	}
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++}
++
++/*
++ * Called by the state manager to remove all layouts established under an
++ * expired lease.
++ */
++void
++pnfs_destroy_all_layouts(struct nfs_client *clp)
++{
++	struct pnfs_layout_hdr *lo;
++
++	while (!list_empty(&clp->cl_layouts)) {
++		lo = list_entry(clp->cl_layouts.next, struct pnfs_layout_hdr,
++				layouts);
++		dprintk("%s freeing layout for inode %lu\n", __func__,
++			lo->inode->i_ino);
++		pnfs_destroy_layout(NFS_I(lo->inode));
++	}
++}
++
++static inline void
++init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
++{
++	INIT_LIST_HEAD(&lseg->fi_list);
++	kref_init(&lseg->kref);
++	lseg->valid = true;
++	lseg->layout = lo;
++}
++
++static void
++destroy_lseg(struct kref *kref)
++{
++	struct pnfs_layout_segment *lseg =
++		container_of(kref, struct pnfs_layout_segment, kref);
++
++	dprintk("--> %s\n", __func__);
++	/* Matched by get_layout in pnfs_insert_layout */
++	put_layout_locked(lseg->layout);
++	PNFS_LD_IO_OPS(lseg->layout)->free_lseg(lseg);
++}
++
++static void
++put_lseg_locked(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	kref_put(&lseg->kref, destroy_lseg);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++
++void
++put_lseg(struct pnfs_layout_segment *lseg)
++{
++	bool do_wake_up;
++	struct nfs_inode *nfsi;
++
++	if (!lseg)
++		return;
++
++	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
++		atomic_read(&lseg->kref.refcount), lseg->valid);
++	do_wake_up = !lseg->valid;
++	nfsi = PNFS_NFS_INODE(lseg->layout);
++	spin_lock(&nfsi->vfs_inode.i_lock);
++	kref_put(&lseg->kref, destroy_lseg);
++	spin_unlock(&nfsi->vfs_inode.i_lock);
++	if (do_wake_up)
++		wake_up(&nfsi->lo_waitq);
++}
++EXPORT_SYMBOL(put_lseg);
++
++void get_lseg(struct pnfs_layout_segment *lseg)
++{
++	kref_get(&lseg->kref);
++}
++EXPORT_SYMBOL(get_lseg);
++
++static inline u64
++end_offset(u64 start, u64 len)
++{
++	u64 end;
++
++	end = start + len;
++	return end >= start ? end: NFS4_MAX_UINT64;
++}
++
++/* last octet in a range */
++static inline u64
++last_byte_offset(u64 start, u64 len)
++{
++	u64 end;
++
++	BUG_ON(!len);
++	end = start + len;
++	return end > start ? end - 1: NFS4_MAX_UINT64;
++}
++
++/*
++ * is l2 fully contained in l1?
++ *   start1                             end1
++ *   [----------------------------------)
++ *           start2           end2
++ *           [----------------)
++ */
++static inline int
++lo_seg_contained(struct pnfs_layout_range *l1,
++		 struct pnfs_layout_range *l2)
++{
++	u64 start1 = l1->offset;
++	u64 end1 = end_offset(start1, l1->length);
++	u64 start2 = l2->offset;
++	u64 end2 = end_offset(start2, l2->length);
++
++	return (start1 <= start2) && (end1 >= end2);
++}
++
++/*
++ * are l1 and l2 intersecting? 
++ * start1 end1 ++ * [----------------------------------) ++ * start2 end2 ++ * [----------------) ++ */ ++static inline int ++lo_seg_intersecting(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ u64 start1 = l1->offset; ++ u64 end1 = end_offset(start1, l1->length); ++ u64 start2 = l2->offset; ++ u64 end2 = end_offset(start2, l2->length); ++ ++ return (end1 == NFS4_MAX_UINT64 || end1 > start2) && ++ (end2 == NFS4_MAX_UINT64 || end2 > start1); ++} ++ ++void ++pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid) ++{ ++ write_seqlock(&lo->seqlock); ++ memcpy(lo->stateid.u.data, stateid->u.data, sizeof(lo->stateid.u.data)); ++ write_sequnlock(&lo->seqlock); ++} ++ ++void ++pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ do { ++ seq = read_seqbegin(&lo->seqlock); ++ memcpy(dst->u.data, lo->stateid.u.data, ++ sizeof(lo->stateid.u.data)); ++ } while (read_seqretry(&lo->seqlock, seq)); ++ ++ dprintk("<-- %s\n", __func__); ++} ++ ++static void ++pnfs_layout_from_open_stateid(struct pnfs_layout_hdr *lo, ++ struct nfs4_state *state) ++{ ++ int seq; ++ ++ dprintk("--> %s\n", __func__); ++ ++ write_seqlock(&lo->seqlock); ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) ++ do { ++ seq = read_seqbegin(&state->seqlock); ++ memcpy(lo->stateid.u.data, state->stateid.u.data, ++ sizeof(state->stateid.u.data)); ++ } while (read_seqretry(&state->seqlock, seq)); ++ write_sequnlock(&lo->seqlock); ++ dprintk("<-- %s\n", __func__); ++} ++ ++/* ++* Get layout from server. ++* for now, assume that whole file layouts are requested. ++* arg->offset: 0 ++* arg->length: all ones ++*/ ++static int ++send_layoutget(struct inode *ino, ++ struct nfs_open_context *ctx, ++ struct pnfs_layout_range *range, ++ struct pnfs_layout_segment **lsegpp, ++ struct pnfs_layout_hdr *lo) ++{ ++ int status; ++ struct nfs_server *server = NFS_SERVER(ino); ++ struct nfs4_layoutget *lgp; ++ ++ dprintk("--> %s\n", __func__); ++ ++ lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); ++ if (lgp == NULL) { ++ pnfs_layout_release(lo, NULL); ++ return -ENOMEM; ++ } ++ lgp->args.minlength = NFS4_MAX_UINT64; ++ lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; ++ lgp->args.range.iomode = range->iomode; ++ lgp->args.range.offset = 0; ++ lgp->args.range.length = NFS4_MAX_UINT64; ++ lgp->args.type = server->pnfs_curr_ld->id; ++ lgp->args.inode = ino; ++ lgp->lsegpp = lsegpp; ++ ++ if (!memcmp(lo->stateid.u.data, &zero_stateid, NFS4_STATEID_SIZE)) { ++ struct nfs_open_context *oldctx = ctx; ++ ++ if (!oldctx) { ++ ctx = nfs_find_open_context(ino, NULL, ++ (range->iomode == IOMODE_READ) ? 
++ FMODE_READ: FMODE_WRITE); ++ BUG_ON(!ctx); ++ } ++ /* Set the layout stateid from the open stateid */ ++ pnfs_layout_from_open_stateid(NFS_I(ino)->layout, ctx->state); ++ if (!oldctx) ++ put_nfs_open_context(ctx); ++ } ++ ++ /* Retrieve layout information from server */ ++ status = nfs4_proc_layoutget(lgp); ++ ++ dprintk("<-- %s status %d\n", __func__, status); ++ return status; ++} ++ ++/* ++ * iomode matching rules: ++ * range lseg match ++ * ----- ----- ----- ++ * ANY READ true ++ * ANY RW true ++ * RW READ false ++ * RW RW true ++ * READ READ true ++ * READ RW false ++ */ ++static inline int ++should_free_lseg(struct pnfs_layout_segment *lseg, ++ struct pnfs_layout_range *range) ++{ ++ return (range->iomode == IOMODE_ANY || ++ lseg->range.iomode == range->iomode) && ++ lo_seg_intersecting(&lseg->range, range); ++} ++ ++static struct pnfs_layout_segment * ++has_layout_to_return(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *out = NULL, *lseg; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry (lseg, &lo->segs, fi_list) ++ if (should_free_lseg(lseg, range)) { ++ out = lseg; ++ break; ++ } ++ ++ dprintk("%s:Return lseg=%p\n", __func__, out); ++ return out; ++} ++ ++static inline bool ++_pnfs_can_return_lseg(struct pnfs_layout_segment *lseg) ++{ ++ return atomic_read(&lseg->kref.refcount) == 1; ++} ++ ++ ++static void ++pnfs_free_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg, *next; ++ dprintk("%s:Begin lo %p offset %llu length %llu iomode %d\n", ++ __func__, lo, range->offset, range->length, range->iomode); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ list_for_each_entry_safe (lseg, next, &lo->segs, fi_list) { ++ if (!should_free_lseg(lseg, range) || ++ !_pnfs_can_return_lseg(lseg)) ++ continue; ++ dprintk("%s: freeing lseg %p iomode %d " ++ "offset %llu length %llu\n", __func__, ++ lseg, lseg->range.iomode, lseg->range.offset, ++ lseg->range.length); ++ list_del(&lseg->fi_list); ++ put_lseg_locked(lseg); ++ } ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp; ++ ++ clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ spin_lock(&clp->cl_lock); ++ list_del_init(&lo->layouts); ++ spin_unlock(&clp->cl_lock); ++ pnfs_set_layout_stateid(lo, &zero_stateid); ++ } ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++static bool ++pnfs_return_layout_barrier(struct nfs_inode *nfsi, ++ struct pnfs_layout_range *range) ++{ ++ struct pnfs_layout_segment *lseg; ++ bool ret = false; ++ ++ spin_lock(&nfsi->vfs_inode.i_lock); ++ list_for_each_entry(lseg, &nfsi->layout->segs, fi_list) { ++ if (!should_free_lseg(lseg, range)) ++ continue; ++ lseg->valid = false; ++ if (!_pnfs_can_return_lseg(lseg)) { ++ dprintk("%s: wait on lseg %p refcount %d\n", ++ __func__, lseg, ++ atomic_read(&lseg->kref.refcount)); ++ ret = true; ++ } ++ } ++ spin_unlock(&nfsi->vfs_inode.i_lock); ++ dprintk("%s:Return %d\n", __func__, ret); ++ return ret; ++} ++ ++static int ++return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ enum pnfs_layoutreturn_type type, struct pnfs_layout_hdr *lo, ++ bool wait) ++{ ++ struct nfs4_layoutreturn *lrp; ++ struct nfs_server *server = NFS_SERVER(ino); ++ int status = -ENOMEM; ++ ++ dprintk("--> %s\n", __func__); ++ ++ BUG_ON(type != RETURN_FILE); ++ ++ lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); ++ if (lrp == NULL) { ++ if (lo && (type == RETURN_FILE)) ++ 
pnfs_layout_release(lo, NULL); ++ goto out; ++ } ++ lrp->args.reclaim = 0; ++ lrp->args.layout_type = server->pnfs_curr_ld->id; ++ lrp->args.return_type = type; ++ lrp->args.range = *range; ++ lrp->args.inode = ino; ++ ++ status = nfs4_proc_layoutreturn(lrp, wait); ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++} ++ ++int ++_pnfs_return_layout(struct inode *ino, struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct pnfs_layout_hdr *lo = NULL; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_range arg; ++ int status = 0; ++ ++ dprintk("--> %s type %d\n", __func__, type); ++ ++ ++ arg.iomode = range ? range->iomode : IOMODE_ANY; ++ arg.offset = 0; ++ arg.length = NFS4_MAX_UINT64; ++ ++ if (type == RETURN_FILE) { ++ spin_lock(&ino->i_lock); ++ lo = nfsi->layout; ++ if (lo && !has_layout_to_return(lo, &arg)) { ++ lo = NULL; ++ } ++ if (!lo) { ++ spin_unlock(&ino->i_lock); ++ dprintk("%s: no layout segments to return\n", __func__); ++ goto out; ++ } ++ ++ /* Reference for layoutreturn matched in pnfs_layout_release */ ++ get_layout(lo); ++ ++ spin_unlock(&ino->i_lock); ++ ++ if (pnfs_return_layout_barrier(nfsi, &arg)) { ++ if (stateid) { /* callback */ ++ status = -EAGAIN; ++ goto out_put; ++ } ++ dprintk("%s: waiting\n", __func__); ++ wait_event(nfsi->lo_waitq, ++ !pnfs_return_layout_barrier(nfsi, &arg)); ++ } ++ ++ if (layoutcommit_needed(nfsi)) { ++ if (stateid && !wait) { /* callback */ ++ dprintk("%s: layoutcommit pending\n", __func__); ++ status = -EAGAIN; ++ goto out_put; ++ } ++ status = pnfs_layoutcommit_inode(ino, wait); ++ if (status) { ++ /* Return layout even if layoutcommit fails */ ++ dprintk("%s: layoutcommit failed, status=%d. 
" ++ "Returning layout anyway\n", ++ __func__, status); ++ } ++ } ++ ++ if (!stateid) ++ status = return_layout(ino, &arg, type, lo, wait); ++ else ++ pnfs_layout_release(lo, &arg); ++ } ++out: ++ dprintk("<-- %s status: %d\n", __func__, status); ++ return status; ++out_put: ++ put_layout(ino); ++ goto out; ++} ++ ++/* ++ * cmp two layout segments for sorting into layout cache ++ */ ++static inline s64 ++cmp_layout(struct pnfs_layout_range *l1, ++ struct pnfs_layout_range *l2) ++{ ++ s64 d; ++ ++ /* higher offset > lower offset */ ++ d = l1->offset - l2->offset; ++ if (d) ++ return d; ++ ++ /* longer length > shorter length */ ++ d = l1->length - l2->length; ++ if (d) ++ return d; ++ ++ /* read > read/write */ ++ return (int)(l1->iomode == IOMODE_READ) - ++ (int)(l2->iomode == IOMODE_READ); ++} ++ ++static void ++pnfs_insert_layout(struct pnfs_layout_hdr *lo, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct pnfs_layout_segment *lp; ++ int found = 0; ++ ++ dprintk("%s:Begin\n", __func__); ++ ++ BUG_ON_UNLOCKED_LO(lo); ++ if (list_empty(&lo->segs)) { ++ struct nfs_client *clp = PNFS_NFS_SERVER(lo)->nfs_client; ++ ++ spin_lock(&clp->cl_lock); ++ BUG_ON(!list_empty(&lo->layouts)); ++ list_add_tail(&lo->layouts, &clp->cl_layouts); ++ spin_unlock(&clp->cl_lock); ++ } ++ list_for_each_entry (lp, &lo->segs, fi_list) { ++ if (cmp_layout(&lp->range, &lseg->range) > 0) ++ continue; ++ list_add_tail(&lseg->fi_list, &lp->fi_list); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu before " ++ "lp %p iomode %d offset %llu length %llu\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length, ++ lp, lp->range.iomode, lp->range.offset, ++ lp->range.length); ++ found = 1; ++ break; ++ } ++ if (!found) { ++ list_add_tail(&lseg->fi_list, &lo->segs); ++ dprintk("%s: inserted lseg %p " ++ "iomode %d offset %llu length %llu at tail\n", ++ __func__, lseg, lseg->range.iomode, ++ lseg->range.offset, lseg->range.length); ++ } ++ get_layout(lo); ++ ++ dprintk("%s:Return\n", __func__); ++} ++ ++/* ++ * Each layoutdriver embeds pnfs_layout_hdr as the first field in it's ++ * per-layout type layout cache structure and returns it ZEROed ++ * from layoutdriver_io_ops->alloc_layout ++ */ ++static struct pnfs_layout_hdr * ++alloc_init_layout(struct inode *ino) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct layoutdriver_io_operations *io_ops; ++ ++ io_ops = NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops; ++ lo = io_ops->alloc_layout(ino); ++ if (!lo) { ++ printk(KERN_ERR ++ "%s: out of memory: io_ops->alloc_layout failed\n", ++ __func__); ++ return NULL; ++ } ++ lo->refcount = 1; ++ INIT_LIST_HEAD(&lo->layouts); ++ INIT_LIST_HEAD(&lo->segs); ++ seqlock_init(&lo->seqlock); ++ lo->inode = ino; ++ return lo; ++} ++ ++/* ++ * Retrieve and possibly allocate the inode layout ++ * ++ * ino->i_lock must be taken by the caller. ++ */ ++static struct pnfs_layout_hdr * ++pnfs_alloc_layout(struct inode *ino) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *new = NULL; ++ ++ dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); ++ ++ BUG_ON_UNLOCKED_INO(ino); ++ if (likely(nfsi->layout)) ++ return nfsi->layout; ++ ++ spin_unlock(&ino->i_lock); ++ new = alloc_init_layout(ino); ++ spin_lock(&ino->i_lock); ++ ++ if (likely(nfsi->layout == NULL)) { /* Won the race? 
*/
++		nfsi->layout = new;
++	} else if (new) {
++		/* Reference the layout across i_lock release and grab */
++		get_layout(nfsi->layout);
++		spin_unlock(&ino->i_lock);
++		NFS_SERVER(ino)->pnfs_curr_ld->ld_io_ops->free_layout(new);
++		spin_lock(&ino->i_lock);
++		put_layout_locked(nfsi->layout);
++	}
++	return nfsi->layout;
++}
++
++/*
++ * iomode matching rules:
++ * range	lseg	match
++ * -----	-----	-----
++ * ANY		READ	true
++ * ANY		RW	true
++ * RW		READ	false
++ * RW		RW	true
++ * READ		READ	true
++ * READ		RW	true
++ */
++static inline int
++has_matching_lseg(struct pnfs_layout_segment *lseg,
++		  struct pnfs_layout_range *range)
++{
++	struct pnfs_layout_range range1;
++
++	if ((range->iomode == IOMODE_RW && lseg->range.iomode != IOMODE_RW) ||
++	    !lo_seg_intersecting(&lseg->range, range))
++		return 0;
++
++	/* range1 covers only the first byte in the range */
++	range1 = *range;
++	range1.length = 1;
++	return lo_seg_contained(&lseg->range, &range1);
++}
++
++/*
++ * lookup range in layout
++ */
++static struct pnfs_layout_segment *
++pnfs_has_layout(struct pnfs_layout_hdr *lo,
++		struct pnfs_layout_range *range,
++		bool take_ref,
++		bool only_valid)
++{
++	struct pnfs_layout_segment *lseg, *ret = NULL;
++
++	dprintk("%s:Begin\n", __func__);
++
++	BUG_ON_UNLOCKED_LO(lo);
++	list_for_each_entry (lseg, &lo->segs, fi_list) {
++		if (has_matching_lseg(lseg, range) &&
++		    (lseg->valid || !only_valid)) {
++			ret = lseg;
++			if (take_ref)
++				get_lseg(ret);
++			break;
++		}
++		if (cmp_layout(range, &lseg->range) > 0)
++			break;
++	}
++
++	dprintk("%s:Return lseg %p take_ref %d ref %d valid %d\n",
++		__func__, ret, take_ref,
++		ret ? atomic_read(&ret->kref.refcount) : 0,
++		ret ? ret->valid : 0);
++	return ret;
++}
++
++/* Update the file's layout for the given range and iomode.
++ * Layout is retrieved from the server if needed.
++ * If lsegpp is given, the appropriate layout segment is referenced and
++ * returned to the caller.
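++ *
++ * (Editor's sketch of a typical call, using only names defined in
++ * this patch; the read path in pnfs_pageio_init_read() does
++ * essentially this:
++ *
++ *	struct pnfs_layout_segment *lseg;
++ *
++ *	_pnfs_update_layout(ino, ctx, pos, count, IOMODE_READ, &lseg);
++ *	if (!lseg)
++ *		return;		-- fall back to regular NFSv4 I/O
++ *	...
++ *	put_lseg(lseg);		-- drop the reference taken above
++ *
++ * a returned lseg is referenced and must be released with put_lseg().)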
++ */ ++void ++_pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, ++ u64 count, ++ enum pnfs_iomode iomode, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct pnfs_layout_range arg = { ++ .iomode = iomode, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layout_segment *lseg = NULL; ++ bool take_ref = (lsegpp != NULL); ++ ++ if (take_ref) ++ *lsegpp = NULL; ++ spin_lock(&ino->i_lock); ++ lo = pnfs_alloc_layout(ino); ++ if (lo == NULL) { ++ dprintk("%s ERROR: can't get pnfs_layout_hdr\n", __func__); ++ goto out_unlock; ++ } ++ ++ /* Check to see if the layout for the given range already exists */ ++ lseg = pnfs_has_layout(lo, &arg, take_ref, !take_ref); ++ if (lseg && !lseg->valid) { ++ if (take_ref) ++ put_lseg_locked(lseg); ++ /* someone is cleaning the layout */ ++ lseg = NULL; ++ goto out_unlock; ++ } ++ ++ if (lseg) { ++ dprintk("%s: Using cached lseg %p for %llu@%llu iomode %d)\n", ++ __func__, ++ lseg, ++ arg.length, ++ arg.offset, ++ arg.iomode); ++ ++ goto out_unlock; ++ } ++ ++ /* if get layout already failed once goto out */ ++ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->state)) { ++ if (unlikely(nfsi->pnfs_layout_suspend && ++ get_seconds() >= nfsi->pnfs_layout_suspend)) { ++ dprintk("%s: layout_get resumed\n", __func__); ++ clear_bit(lo_fail_bit(iomode), ++ &nfsi->layout->state); ++ nfsi->pnfs_layout_suspend = 0; ++ } else ++ goto out_unlock; ++ } ++ ++ /* Reference the layout for layoutget matched in pnfs_layout_release */ ++ get_layout(lo); ++ spin_unlock(&ino->i_lock); ++ ++ send_layoutget(ino, ctx, &arg, lsegpp, lo); ++out: ++ dprintk("%s end, state 0x%lx lseg %p\n", __func__, ++ nfsi->layout->state, lseg); ++ return; ++out_unlock: ++ if (lsegpp) ++ *lsegpp = lseg; ++ spin_unlock(&ino->i_lock); ++ goto out; ++} ++ ++void ++pnfs_get_layout_done(struct nfs4_layoutget *lgp, int rpc_status) ++{ ++ struct pnfs_layout_segment *lseg = NULL; ++ struct nfs_inode *nfsi = NFS_I(lgp->args.inode); ++ time_t suspend = 0; ++ ++ dprintk("-->%s\n", __func__); ++ ++ lgp->status = rpc_status; ++ if (likely(!rpc_status)) { ++ if (unlikely(lgp->res.layout.len < 0)) { ++ printk(KERN_ERR ++ "%s: ERROR Returned layout size is ZERO\n", __func__); ++ lgp->status = -EIO; ++ } ++ goto out; ++ } ++ ++ dprintk("%s: ERROR retrieving layout %d\n", __func__, rpc_status); ++ switch (rpc_status) { ++ case -NFS4ERR_BADLAYOUT: ++ lgp->status = -ENOENT; ++ /* FALLTHROUGH */ ++ case -EACCES: /* NFS4ERR_ACCESS */ ++ /* transient error, don't mark with NFS_INO_LAYOUT_FAILED */ ++ goto out; ++ ++ case -NFS4ERR_LAYOUTTRYLATER: ++ case -NFS4ERR_RECALLCONFLICT: ++ case -NFS4ERR_OLD_STATEID: ++ case -EAGAIN: /* NFS4ERR_LOCKED */ ++ lgp->status = -NFS4ERR_DELAY; /* for nfs4_handle_exception */ ++ /* FALLTHROUGH */ ++ case -NFS4ERR_GRACE: ++ case -NFS4ERR_DELAY: ++ goto out; ++ ++ case -NFS4ERR_ADMIN_REVOKED: ++ case -NFS4ERR_DELEG_REVOKED: ++ /* The layout is expected to be returned at this point. 
++ * This should clear the layout stateid as well */ ++ suspend = get_seconds() + 1; ++ break; ++ ++ case -NFS4ERR_LAYOUTUNAVAILABLE: ++ lgp->status = -ENOTSUPP; ++ break; ++ ++ case -NFS4ERR_REP_TOO_BIG: ++ case -NFS4ERR_REP_TOO_BIG_TO_CACHE: ++ lgp->status = -E2BIG; ++ break; ++ ++ /* Leave the following errors untranslated */ ++ case -NFS4ERR_DEADSESSION: ++ case -NFS4ERR_DQUOT: ++ case -EINVAL: /* NFS4ERR_INVAL */ ++ case -EIO: /* NFS4ERR_IO */ ++ case -NFS4ERR_FHEXPIRED: ++ case -NFS4ERR_MOVED: ++ case -NFS4ERR_NOSPC: ++ case -ESERVERFAULT: /* NFS4ERR_SERVERFAULT */ ++ case -ESTALE: /* NFS4ERR_STALE */ ++ case -ETOOSMALL: /* NFS4ERR_TOOSMALL */ ++ break; ++ ++ /* The following errors are our fault and should never happen */ ++ case -NFS4ERR_BADIOMODE: ++ case -NFS4ERR_BADXDR: ++ case -NFS4ERR_REQ_TOO_BIG: ++ case -NFS4ERR_UNKNOWN_LAYOUTTYPE: ++ case -NFS4ERR_WRONG_TYPE: ++ lgp->status = -EINVAL; ++ /* FALLTHROUGH */ ++ case -NFS4ERR_BAD_STATEID: ++ case -NFS4ERR_NOFILEHANDLE: ++ case -ENOTSUPP: /* NFS4ERR_NOTSUPP */ ++ case -NFS4ERR_OPENMODE: ++ case -NFS4ERR_OP_NOT_IN_SESSION: ++ case -NFS4ERR_TOO_MANY_OPS: ++ dprintk("%s: error %d: should never happen\n", __func__, ++ rpc_status); ++ break; ++ ++ /* The following errors are the server's fault */ ++ default: ++ dprintk("%s: illegal error %d\n", __func__, rpc_status); ++ lgp->status = -EIO; ++ break; ++ } ++ ++ /* remember that get layout failed and suspend trying */ ++ nfsi->pnfs_layout_suspend = suspend; ++ set_bit(lo_fail_bit(lgp->args.range.iomode), ++ &nfsi->layout->state); ++ dprintk("%s: layout_get suspended until %ld\n", ++ __func__, suspend); ++out: ++ dprintk("%s end (err:%d) state 0x%lx lseg %p\n", ++ __func__, lgp->status, nfsi->layout->state, lseg); ++ return; ++} ++ ++int ++pnfs_layout_process(struct nfs4_layoutget *lgp) ++{ ++ struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; ++ struct nfs4_layoutget_res *res = &lgp->res; ++ struct pnfs_layout_segment *lseg; ++ struct inode *ino = PNFS_INODE(lo); ++ int status = 0; ++ ++ /* Inject layout blob into I/O device driver */ ++ lseg = PNFS_LD_IO_OPS(lo)->alloc_lseg(lo, res); ++ if (!lseg || IS_ERR(lseg)) { ++ if (!lseg) ++ status = -ENOMEM; ++ else ++ status = PTR_ERR(lseg); ++ dprintk("%s: Could not allocate layout: error %d\n", ++ __func__, status); ++ goto out; ++ } ++ ++ spin_lock(&ino->i_lock); ++ init_lseg(lo, lseg); ++ lseg->range = res->range; ++ if (lgp->lsegpp) { ++ get_lseg(lseg); ++ *lgp->lsegpp = lseg; ++ } ++ pnfs_insert_layout(lo, lseg); ++ ++ if (res->return_on_close) { ++ lo->roc_iomode |= res->range.iomode; ++ if (!lo->roc_iomode) ++ lo->roc_iomode = IOMODE_ANY; ++ } ++ ++ /* Done processing layoutget. 
Set the layout stateid */ ++ pnfs_set_layout_stateid(lo, &res->stateid); ++ spin_unlock(&ino->i_lock); ++out: ++ return status; ++} ++ ++void ++readahead_range(struct inode *inode, struct list_head *pages, loff_t *offset, ++ size_t *count) ++{ ++ struct page *first, *last; ++ loff_t foff, i_size = i_size_read(inode); ++ pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; ++ size_t range; ++ ++ ++ first = list_entry((pages)->prev, struct page, lru); ++ last = list_entry((pages)->next, struct page, lru); ++ ++ foff = (loff_t)first->index << PAGE_CACHE_SHIFT; ++ ++ range = (last->index - first->index) * PAGE_CACHE_SIZE; ++ if (last->index == end_index) ++ range += ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; ++ else ++ range += PAGE_CACHE_SIZE; ++ dprintk("%s foff %lu, range %Zu\n", __func__, (unsigned long)foff, ++ range); ++ *offset = foff; ++ *count = range; ++} ++ ++void ++pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct pnfs_layoutdriver_type *ld; ++ ++ pgio->pg_test = NULL; ++ ++ lo = NFS_I(inode)->layout; ++ ld = NFS_SERVER(inode)->pnfs_curr_ld; ++ if (!pnfs_enabled_sb(NFS_SERVER(inode)) || !lo) ++ return; ++ ++ if (ld->ld_policy_ops) ++ pgio->pg_test = ld->ld_policy_ops->pg_test; ++} ++ ++static u32 ++pnfs_getboundary(struct inode *inode) ++{ ++ u32 stripe_size = 0; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ struct layoutdriver_policy_operations *policy_ops; ++ ++ if (!nfss->pnfs_curr_ld) ++ goto out; ++ ++ policy_ops = nfss->pnfs_curr_ld->ld_policy_ops; ++ if (!policy_ops || !policy_ops->get_stripesize) ++ goto out; ++ ++ /* The default is to not gather across stripes */ ++ if (pnfs_ld_gather_across_stripes(nfss->pnfs_curr_ld)) ++ goto out; ++ ++ spin_lock(&inode->i_lock); ++ if (NFS_I(inode)->layout) ++ stripe_size = policy_ops->get_stripesize(NFS_I(inode)->layout); ++ spin_unlock(&inode->i_lock); ++out: ++ return stripe_size; ++} ++ ++/* ++ * rsize is already set by caller to MDS rsize. ++ */ ++void ++pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, ++ struct inode *inode, ++ struct nfs_open_context *ctx, ++ struct list_head *pages, ++ size_t *rsize) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ size_t count = 0; ++ loff_t loff; ++ ++ pgio->pg_iswrite = 0; ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ pgio->pg_lseg = NULL; ++ ++ if (!pnfs_enabled_sb(nfss)) ++ return; ++ ++ /* Calculate the total read-ahead count */ ++ readahead_range(inode, pages, &loff, &count); ++ ++ if (count > 0) { ++ _pnfs_update_layout(inode, ctx, loff, count, IOMODE_READ, ++ &pgio->pg_lseg); ++ if (!pgio->pg_lseg) ++ return; ++ ++ *rsize = NFS_SERVER(inode)->ds_rsize; ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ if (pgio->pg_boundary) ++ pnfs_set_pg_test(inode, pgio); ++ } ++} ++ ++void ++pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, ++ size_t *wsize) ++{ ++ struct nfs_server *server = NFS_SERVER(inode); ++ ++ pgio->pg_iswrite = 1; ++ if (!pnfs_enabled_sb(server)) { ++ pgio->pg_boundary = 0; ++ pgio->pg_test = NULL; ++ return; ++ } ++ pgio->pg_boundary = pnfs_getboundary(inode); ++ pnfs_set_pg_test(inode, pgio); ++ *wsize = server->ds_wsize; ++} ++ ++/* Return I/O buffer size for a layout driver ++ * This value will determine what size reads and writes ++ * will be gathered into and sent to the data servers. ++ * blocksize must be a multiple of the page cache size. 
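For illustration, a minimal userspace sketch of the clamp-and-round step implied here; the bounds and names are assumptions, since the patch defers the real work to nfs_block_size():

#include <stdio.h>

/* Illustrative only: round a layout driver's preferred block size
 * down to a page multiple and clamp it to assumed sane bounds. */
#define SKETCH_PAGE_SIZE 4096u
#define SKETCH_MIN_IO    4096u
#define SKETCH_MAX_IO    (1024u * 1024u)

static unsigned int sketch_block_size(unsigned int preferred)
{
	unsigned int sz = preferred & ~(SKETCH_PAGE_SIZE - 1); /* page multiple */

	if (sz < SKETCH_MIN_IO)
		sz = SKETCH_MIN_IO;
	if (sz > SKETCH_MAX_IO)
		sz = SKETCH_MAX_IO;
	return sz;
}

int main(void)
{
	/* A driver asking for 70000 bytes ends up with 69632 (17 pages). */
	printf("%u\n", sketch_block_size(70000));
	return 0;
}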
++ */ ++unsigned int ++pnfs_getiosize(struct nfs_server *server) ++{ ++ if (!PNFS_EXISTS_LDPOLICY_OP(server, get_blocksize)) ++ return 0; ++ return server->pnfs_curr_ld->ld_policy_ops->get_blocksize(); ++} ++ ++void ++pnfs_set_ds_iosize(struct nfs_server *server) ++{ ++ unsigned dssize = pnfs_getiosize(server); ++ ++ /* Set buffer size for data servers */ ++ if (dssize > 0) { ++ server->ds_rsize = server->ds_wsize = ++ nfs_block_size(dssize, NULL); ++ } else { ++ server->ds_wsize = server->wsize; ++ server->ds_rsize = server->rsize; ++ } ++} ++ ++static int ++pnfs_call_done(struct pnfs_call_data *pdata, struct rpc_task *task, void *data) ++{ ++ put_lseg(pdata->lseg); ++ pdata->lseg = NULL; ++ pdata->call_ops->rpc_call_done(task, data); ++ if (pdata->pnfs_error == -EAGAIN || task->tk_status == -EAGAIN) ++ return -EAGAIN; ++ if (pdata->pnfsflags & PNFS_NO_RPC) { ++ pdata->call_ops->rpc_release(data); ++ } else { ++ /* ++ * just restore original rpc call ops ++ * rpc_release will be called later by the rpc scheduling layer. ++ */ ++ task->tk_ops = pdata->call_ops; ++ } ++ return 0; ++} ++ ++/* Post-write completion function ++ * Invoked by all layout drivers when write_pagelist is done. ++ * ++ * NOTE: callers set data->pnfsflags PNFS_NO_RPC ++ * so that the NFS cleanup routines perform only the page cache ++ * cleanup. ++ */ ++static void ++pnfs_write_retry(struct work_struct *work) ++{ ++ struct rpc_task *task; ++ struct nfs_write_data *wdata; ++ struct pnfs_layout_range range; ++ ++ dprintk("%s enter\n", __func__); ++ task = container_of(work, struct rpc_task, u.tk_work); ++ wdata = container_of(task, struct nfs_write_data, task); ++ range.iomode = IOMODE_RW; ++ range.offset = wdata->args.offset; ++ range.length = wdata->args.count; ++ _pnfs_return_layout(wdata->inode, &range, NULL, RETURN_FILE, true); ++ pnfs_initiate_write(wdata, NFS_CLIENT(wdata->inode), ++ wdata->pdata.call_ops, wdata->pdata.how); ++} ++ ++static void ++pnfs_writeback_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ /* update last write offset and need layout commit ++ * for non-files layout types (files layout calls ++ * pnfs4_write_done for this) ++ */ ++ if ((pdata->pnfsflags & PNFS_NO_RPC) && ++ data->task.tk_status >= 0 && data->res.count > 0) { ++ struct nfs_inode *nfsi = NFS_I(data->inode); ++ ++ pnfs_update_last_write(nfsi, data->args.offset, data->res.count); ++ pnfs_need_layoutcommit(nfsi, data->args.context); ++ } ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ INIT_WORK(&data->task.u.tk_work, pnfs_write_retry); ++ queue_work(nfsiod_workqueue, &data->task.u.tk_work); ++ } ++} ++ ++static void _pnfs_clear_lseg_from_pages(struct list_head *head) ++{ ++ struct nfs_page *req; ++ ++ list_for_each_entry(req, head, wb_list) { ++ put_lseg(req->wb_lseg); ++ req->wb_lseg = NULL; ++ } ++} ++ ++/* ++ * Call the appropriate parallel I/O subsystem write function. ++ * If no I/O device driver exists, or one does match the returned ++ * fstype, then return a positive status for regular NFS processing. ++ * ++ * TODO: Is wdata->how and wdata->args.stable always the same value? ++ * TODO: It seems in NFS, the server may not do a stable write even ++ * though it was requested (and vice-versa?). To check, it looks ++ * in data->res.verf->committed. Do we need this ability ++ * for non-file layout drivers? 
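To make the retry plumbing above concrete: pnfs_write_retry() recovers the write data from a bare work pointer by walking nested structures. A minimal userspace sketch of that container_of() chain, with invented structure layouts:

#include <stddef.h>
#include <stdio.h>

/* The work item lives inside the task, which lives inside the write
 * data, so the outer structure is recoverable from the work pointer. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_item { int pending; };
struct task      { struct work_item work; };
struct wdata     { int how; struct task task; };

static void retry_worker(struct work_item *w)
{
	struct task  *t = container_of(w, struct task, work);
	struct wdata *d = container_of(t, struct wdata, task);

	printf("resubmitting write, how=%d\n", d->how);
}

int main(void)
{
	struct wdata d = { .how = 1 };

	retry_worker(&d.task.work);
	return 0;
}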
++ */
++enum pnfs_try_status
++pnfs_try_to_write_data(struct nfs_write_data *wdata,
++ const struct rpc_call_ops *call_ops, int how)
++{
++ struct inode *inode = wdata->inode;
++ enum pnfs_try_status trypnfs;
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ struct pnfs_layout_segment *lseg = wdata->req->wb_lseg;
++
++ wdata->pdata.call_ops = call_ops;
++ wdata->pdata.pnfs_error = 0;
++ wdata->pdata.how = how;
++
++ dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
++ inode->i_ino, wdata->args.count, wdata->args.offset, how);
++
++ get_lseg(lseg);
++
++ if (!pnfs_use_rpc(nfss))
++ wdata->pdata.pnfsflags |= PNFS_NO_RPC;
++ wdata->pdata.lseg = lseg;
++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->write_pagelist(wdata,
++ nfs_page_array_len(wdata->args.pgbase, wdata->args.count),
++ how);
++
++ if (trypnfs == PNFS_NOT_ATTEMPTED) {
++ wdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++ wdata->pdata.lseg = NULL;
++ put_lseg(lseg);
++ _pnfs_clear_lseg_from_pages(&wdata->pages);
++ } else {
++ nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
++ }
++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++ return trypnfs;
++}
++
++/* Post-read completion function. Invoked by all layout drivers when
++ * read_pagelist is done.
++ */
++static void
++pnfs_read_retry(struct work_struct *work)
++{
++ struct rpc_task *task;
++ struct nfs_read_data *rdata;
++ struct pnfs_layout_range range;
++
++ dprintk("%s enter\n", __func__);
++ task = container_of(work, struct rpc_task, u.tk_work);
++ rdata = container_of(task, struct nfs_read_data, task);
++ range.iomode = IOMODE_RW;
++ range.offset = rdata->args.offset;
++ range.length = rdata->args.count;
++ _pnfs_return_layout(rdata->inode, &range, NULL, RETURN_FILE, true);
++ pnfs_initiate_read(rdata, NFS_CLIENT(rdata->inode),
++ rdata->pdata.call_ops);
++}
++
++static void
++pnfs_read_done(struct nfs_read_data *data)
++{
++ struct pnfs_call_data *pdata = &data->pdata;
++
++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status);
++
++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) {
++ INIT_WORK(&data->task.u.tk_work, pnfs_read_retry);
++ queue_work(nfsiod_workqueue, &data->task.u.tk_work);
++ }
++}
++
++/*
++ * Call the appropriate parallel I/O subsystem read function.
++ * If no I/O device driver exists, or one does not match the returned
++ * fstype, then return a positive status for regular NFS processing.
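The read side below follows the same contract as the write side. A minimal sketch of the attempt-or-fall-back pattern, with invented names:

#include <stdio.h>

/* If the layout driver declines the I/O (PNFS_NOT_ATTEMPTED in the
 * patch), the caller silently falls back to the ordinary RPC path. */
enum try_status { TRY_ATTEMPTED, TRY_NOT_ATTEMPTED };

static enum try_status driver_read(int have_layout)
{
	return have_layout ? TRY_ATTEMPTED : TRY_NOT_ATTEMPTED;
}

static const char *do_read(int have_layout)
{
	if (driver_read(have_layout) == TRY_ATTEMPTED)
		return "I/O went to the data servers";
	return "I/O fell back to the MDS over regular NFS";
}

int main(void)
{
	printf("%s\n%s\n", do_read(1), do_read(0));
	return 0;
}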
++ */
++enum pnfs_try_status
++pnfs_try_to_read_data(struct nfs_read_data *rdata,
++ const struct rpc_call_ops *call_ops)
++{
++ struct inode *inode = rdata->inode;
++ struct nfs_server *nfss = NFS_SERVER(inode);
++ struct pnfs_layout_segment *lseg = rdata->req->wb_lseg;
++ enum pnfs_try_status trypnfs;
++
++ rdata->pdata.call_ops = call_ops;
++ rdata->pdata.pnfs_error = 0;
++
++ dprintk("%s: Reading ino:%lu %u@%llu\n",
++ __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
++
++ get_lseg(lseg);
++
++ if (!pnfs_use_rpc(nfss))
++ rdata->pdata.pnfsflags |= PNFS_NO_RPC;
++ rdata->pdata.lseg = lseg;
++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->read_pagelist(rdata,
++ nfs_page_array_len(rdata->args.pgbase, rdata->args.count));
++ if (trypnfs == PNFS_NOT_ATTEMPTED) {
++ rdata->pdata.pnfsflags &= ~PNFS_NO_RPC;
++ rdata->pdata.lseg = NULL;
++ put_lseg(lseg);
++ _pnfs_clear_lseg_from_pages(&rdata->pages);
++ } else {
++ nfs_inc_stats(inode, NFSIOS_PNFS_READ);
++ }
++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
++ return trypnfs;
++}
++
++/*
++ * This gives the layout driver an opportunity to read in pages "around"
++ * the data to be written. It returns 0 on success, otherwise an error code
++ * which will either be passed up to the user, or ignored if
++ * some previous part of the write succeeded.
++ * Note the range [pos, pos+len-1] is entirely within the page.
++ */
++int _pnfs_write_begin(struct inode *inode, struct page *page,
++ loff_t pos, unsigned len,
++ struct pnfs_layout_segment *lseg,
++ struct pnfs_fsdata **fsdata)
++{
++ struct pnfs_fsdata *data;
++ int status = 0;
++
++ dprintk("--> %s: pos=%llu len=%u\n",
++ __func__, (unsigned long long)pos, len);
++ data = kzalloc(sizeof(struct pnfs_fsdata), GFP_KERNEL);
++ if (!data) {
++ status = -ENOMEM;
++ goto out;
++ }
++ data->lseg = lseg; /* refcount passed into data to be managed there */
++ status = NFS_SERVER(inode)->pnfs_curr_ld->ld_io_ops->write_begin(
++ lseg, page, pos, len, data);
++ if (status) {
++ kfree(data);
++ data = NULL;
++ }
++out:
++ *fsdata = data;
++ dprintk("<-- %s: status=%d\n", __func__, status);
++ return status;
++}
++
++/* Return 0 on success, negative on failure */
++/* CAREFUL - what happens if copied < len???
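The sketch below only illustrates the read-modify-write idea behind write_begin, namely that a sub-page write forces the surrounding bytes to be valid first; how a short copy (copied < len) is handled is left open here, as in the original comment. Sizes and names are invented:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ 16

static char backing[PAGE_SZ] = "AAAAAAAAAAAAAAA"; /* "on-disk" page */

static void rmw(char *page, unsigned pos, const char *src, unsigned len)
{
	memcpy(page, backing, PAGE_SZ);      /* read-around fill */
	memcpy(page + pos, src, len);        /* the actual write */
	memcpy(backing, page, PAGE_SZ);      /* push out the full page */
}

int main(void)
{
	char page[PAGE_SZ];

	rmw(page, 4, "bbbb", 4);             /* [pos, pos+len-1] in page */
	printf("%.*s\n", PAGE_SZ, backing);  /* AAAAbbbbAAAAAAA */
	return 0;
}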
*/ ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status; ++ ++ status = nfss->pnfs_curr_ld->ld_io_ops->write_end(inode, page, ++ pos, len, copied, lseg); ++ return status; ++} ++ ++/* pNFS Commit callback function for all layout drivers */ ++static void ++pnfs_commit_done(struct nfs_write_data *data) ++{ ++ struct pnfs_call_data *pdata = &data->pdata; ++ ++ dprintk("%s: Begin (status %d)\n", __func__, data->task.tk_status); ++ ++ if (pnfs_call_done(pdata, &data->task, data) == -EAGAIN) { ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_RW, ++ .offset = data->args.offset, ++ .length = data->args.count, ++ }; ++ dprintk("%s: retrying\n", __func__); ++ _pnfs_return_layout(data->inode, &range, NULL, RETURN_FILE, ++ true); ++ pnfs_initiate_commit(data, NFS_CLIENT(data->inode), ++ pdata->call_ops, pdata->how, 1); ++ } ++} ++ ++enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int sync) ++{ ++ struct inode *inode = data->inode; ++ struct nfs_server *nfss = NFS_SERVER(data->inode); ++ enum pnfs_try_status trypnfs; ++ ++ dprintk("%s: Begin\n", __func__); ++ ++ /* We need to account for possibility that ++ * each nfs_page can point to a different lseg (or be NULL). ++ * For the immediate case of whole-file-only layouts, we at ++ * least know there can be only a single lseg. ++ * We still have to account for the possibility of some being NULL. ++ * This will be done by passing the buck to the layout driver. ++ */ ++ data->pdata.call_ops = call_ops; ++ data->pdata.pnfs_error = 0; ++ data->pdata.how = sync; ++ data->pdata.lseg = NULL; ++ trypnfs = nfss->pnfs_curr_ld->ld_io_ops->commit(data, sync); ++ if (trypnfs == PNFS_NOT_ATTEMPTED) { ++ data->pdata.pnfsflags &= ~PNFS_NO_RPC; ++ _pnfs_clear_lseg_from_pages(&data->pages); ++ } else ++ nfs_inc_stats(inode, NFSIOS_PNFS_COMMIT); ++ dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); ++ return trypnfs; ++} ++ ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) ++{ ++ struct nfs_server *nfss = NFS_SERVER(data->args.inode); ++ ++ /* TODO: Maybe we should avoid this by allowing the layout driver ++ * to directly xdr its layout on the wire. ++ */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit) ++ nfss->pnfs_curr_ld->ld_io_ops->cleanup_layoutcommit( ++ NFS_I(data->args.inode)->layout, ++ &data->args, data->status); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. 
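A minimal sketch of the range arithmetic performed in pnfs_layoutcommit_setup() below, using plain integers for loff_t; the commit covers the inclusive range [write_begin_pos, write_end_pos] and lastbytewritten is capped by the current file size:

#include <stdio.h>
#include <stdint.h>

struct lc_args { int64_t offset, length, lastbytewritten; };

static struct lc_args setup(int64_t begin, int64_t end, int64_t i_size)
{
	struct lc_args a;

	a.offset = begin;
	a.length = end - begin + 1;                    /* inclusive range */
	a.lastbytewritten = end < i_size - 1 ? end : i_size - 1;
	return a;
}

int main(void)
{
	/* 8192 dirty bytes, but only bytes up to offset 8999 exist */
	struct lc_args a = setup(4096, 12287, 9000);

	printf("%lld %lld %lld\n", (long long)a.offset,
	       (long long)a.length, (long long)a.lastbytewritten);
	return 0;
}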
++ */ ++static int ++pnfs_layoutcommit_setup(struct inode *inode, ++ struct nfs4_layoutcommit_data *data, ++ loff_t write_begin_pos, loff_t write_end_pos) ++{ ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int result = 0; ++ ++ dprintk("--> %s\n", __func__); ++ ++ data->args.inode = inode; ++ data->args.fh = NFS_FH(inode); ++ data->args.layout_type = nfss->pnfs_curr_ld->id; ++ data->res.fattr = &data->fattr; ++ nfs_fattr_init(&data->fattr); ++ ++ /* TODO: Need to determine the correct values */ ++ data->args.time_modify_changed = 0; ++ ++ /* Set values from inode so it can be reset ++ */ ++ data->args.range.iomode = IOMODE_RW; ++ data->args.range.offset = write_begin_pos; ++ data->args.range.length = write_end_pos - write_begin_pos + 1; ++ data->args.lastbytewritten = min(write_end_pos, ++ i_size_read(inode) - 1); ++ data->args.bitmask = nfss->attr_bitmask; ++ data->res.server = nfss; ++ ++ /* Call layout driver to set the arguments */ ++ if (nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit) ++ result = nfss->pnfs_curr_ld->ld_io_ops->setup_layoutcommit( ++ NFS_I(inode)->layout, &data->args); ++ ++ dprintk("<-- %s Status %d\n", __func__, result); ++ return result; ++} ++ ++/* Issue a async layoutcommit for an inode. ++ */ ++int ++pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ struct nfs4_layoutcommit_data *data; ++ struct nfs_inode *nfsi = NFS_I(inode); ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ ++ int status = 0; ++ ++ dprintk("%s Begin (sync:%d)\n", __func__, sync); ++ ++ BUG_ON(!has_layout(nfsi)); ++ ++ data = pnfs_layoutcommit_alloc(); ++ if (!data) ++ return -ENOMEM; ++ ++ spin_lock(&inode->i_lock); ++ if (!layoutcommit_needed(nfsi)) { ++ spin_unlock(&inode->i_lock); ++ goto out_free; ++ } ++ ++ /* Clear layoutcommit properties in the inode so ++ * new lc info can be generated ++ */ ++ write_begin_pos = nfsi->layout->write_begin_pos; ++ write_end_pos = nfsi->layout->write_end_pos; ++ data->cred = nfsi->layout->cred; ++ nfsi->layout->write_begin_pos = 0; ++ nfsi->layout->write_end_pos = 0; ++ nfsi->layout->cred = NULL; ++ __clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++ pnfs_get_layout_stateid(&data->args.stateid, nfsi->layout); ++ ++ /* Reference for layoutcommit matched in pnfs_layoutcommit_release */ ++ get_layout(NFS_I(inode)->layout); ++ ++ spin_unlock(&inode->i_lock); ++ ++ /* Set up layout commit args */ ++ status = pnfs_layoutcommit_setup(inode, data, write_begin_pos, ++ write_end_pos); ++ if (status) { ++ /* The layout driver failed to setup the layoutcommit */ ++ put_rpccred(data->cred); ++ put_layout(inode); ++ goto out_free; ++ } ++ status = nfs4_proc_layoutcommit(data, sync); ++out: ++ dprintk("%s end (err:%d)\n", __func__, status); ++ return status; ++out_free: ++ pnfs_layoutcommit_free(data); ++ goto out; ++} ++ ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata) ++{ ++ if (fsdata) { ++ /* lseg refcounting handled directly in nfs_Write_end */ ++ kfree(fsdata); ++ } ++} ++ ++/* Callback operations for layout drivers. ++ */ ++struct pnfs_client_operations pnfs_ops = { ++ .nfs_getdevicelist = nfs4_proc_getdevicelist, ++ .nfs_getdeviceinfo = nfs4_proc_getdeviceinfo, ++ .nfs_readlist_complete = pnfs_read_done, ++ .nfs_writelist_complete = pnfs_writeback_done, ++ .nfs_commit_complete = pnfs_commit_done, ++}; ++ ++EXPORT_SYMBOL(pnfs_unregister_layoutdriver); ++EXPORT_SYMBOL(pnfs_register_layoutdriver); ++ ++ ++/* Device ID cache. 
Supports one layout type per struct nfs_client */ ++int ++nfs4_alloc_init_deviceid_cache(struct nfs_client *clp, ++ void (*free_callback)(struct kref *)) ++{ ++ struct nfs4_deviceid_cache *c; ++ ++ c = kzalloc(sizeof(struct nfs4_deviceid_cache), GFP_KERNEL); ++ if (!c) ++ return -ENOMEM; ++ spin_lock(&clp->cl_lock); ++ if (clp->cl_devid_cache != NULL) { ++ kref_get(&clp->cl_devid_cache->dc_kref); ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [kref [%d]]\n", __func__, ++ atomic_read(&clp->cl_devid_cache->dc_kref.refcount)); ++ kfree(c); ++ } else { ++ int i; ++ ++ spin_lock_init(&c->dc_lock); ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE ; i++) ++ INIT_HLIST_HEAD(&c->dc_deviceids[i]); ++ kref_init(&c->dc_kref); ++ c->dc_free_callback = free_callback; ++ clp->cl_devid_cache = c; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [new]\n", __func__); ++ } ++ return 0; ++} ++EXPORT_SYMBOL(nfs4_alloc_init_deviceid_cache); ++ ++void ++nfs4_init_deviceid_node(struct nfs4_deviceid *d) ++{ ++ INIT_HLIST_NODE(&d->de_node); ++ kref_init(&d->de_kref); ++} ++EXPORT_SYMBOL(nfs4_init_deviceid_node); ++ ++/* Called from layoutdriver_io_operations->alloc_lseg */ ++void ++nfs4_set_layout_deviceid(struct pnfs_layout_segment *l, struct nfs4_deviceid *d) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = d; ++} ++EXPORT_SYMBOL(nfs4_set_layout_deviceid); ++ ++/* Called from layoutdriver_io_operations->free_lseg */ ++void ++nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *l, ++ struct nfs4_deviceid *d, ++ void (*free_callback)(struct kref *)) ++{ ++ dprintk("%s [%d]\n", __func__, atomic_read(&d->de_kref.refcount)); ++ l->deviceid = NULL; ++ kref_put(&d->de_kref, free_callback); ++} ++EXPORT_SYMBOL(nfs4_put_unset_layout_deviceid); ++ ++/* Find and reference a deviceid */ ++struct nfs4_deviceid * ++nfs4_find_get_deviceid(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ if (!atomic_inc_not_zero(&d->de_kref.refcount)) { ++ goto fail; ++ } else { ++ rcu_read_unlock(); ++ return d; ++ } ++ } ++ } ++fail: ++ rcu_read_unlock(); ++ return NULL; ++} ++EXPORT_SYMBOL(nfs4_find_get_deviceid); ++ ++/* ++ * Add and kref_get a deviceid. ++ * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new ++ */ ++struct nfs4_deviceid * ++nfs4_add_get_deviceid(struct nfs4_deviceid_cache *c, struct nfs4_deviceid *new) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ long hash = nfs4_deviceid_hash(&new->de_id); ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (!memcmp(&d->de_id, &new->de_id, NFS4_PNFS_DEVICEID4_SIZE)) { ++ kref_get(&d->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [discard]\n", __func__); ++ c->dc_free_callback(&new->de_kref); ++ return d; ++ } ++ } ++ hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); ++ kref_get(&new->de_kref); ++ spin_unlock(&c->dc_lock); ++ dprintk("%s [new]\n", __func__); ++ return new; ++} ++EXPORT_SYMBOL(nfs4_add_get_deviceid); ++ ++/* ++ * Remove the first deviceid from a hash bucket, or return 0 if bucket list ++ * is empty. 
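nfs4_remove_deviceid() below returns 1 or 0 precisely so that teardown can drain a bucket with a bare while loop, as nfs4_free_deviceid_cache() does. A self-contained sketch of the idiom, with a plain singly linked list standing in for the RCU hash bucket:

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

static int remove_first(struct node **bucket)
{
	struct node *n = *bucket;

	if (!n)
		return 0;
	*bucket = n->next;
	printf("freed device %d\n", n->id);
	free(n);
	return 1;
}

int main(void)
{
	struct node *bucket = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));
		n->id = i;
		n->next = bucket;
		bucket = n;
	}
	while (remove_first(&bucket))
		;               /* drain, as the cache teardown does */
	return 0;
}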
++ */ ++static int ++nfs4_remove_deviceid(struct nfs4_deviceid_cache *c, long hash, ++ struct pnfs_deviceid *id) ++{ ++ struct nfs4_deviceid *d; ++ struct hlist_node *n; ++ ++ dprintk("--> %s hash %ld\n", __func__, hash); ++ spin_lock(&c->dc_lock); ++ hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { ++ if (id && memcmp(id, &d->de_id, NFS4_PNFS_DEVICEID4_SIZE)) ++ continue; ++ hlist_del_rcu(&d->de_node); ++ spin_unlock(&c->dc_lock); ++ synchronize_rcu(); ++ dprintk("%s [%d]\n", __func__, ++ atomic_read(&d->de_kref.refcount)); ++ kref_put(&d->de_kref, c->dc_free_callback); ++ return 1; ++ } ++ spin_unlock(&c->dc_lock); ++ return 0; ++} ++ ++void ++nfs4_delete_device(struct nfs4_deviceid_cache *c, struct pnfs_deviceid *id) ++{ ++ long hash = nfs4_deviceid_hash(id); ++ ++ nfs4_remove_deviceid(c, hash, id); ++} ++EXPORT_SYMBOL(nfs4_delete_device); ++ ++static void ++nfs4_free_deviceid_cache(struct kref *kref) ++{ ++ struct nfs4_deviceid_cache *cache = ++ container_of(kref, struct nfs4_deviceid_cache, dc_kref); ++ long i; ++ ++ for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) ++ while (nfs4_remove_deviceid(cache, i, NULL)) ++ ; ++ kfree(cache); ++} ++ ++void ++nfs4_put_deviceid_cache(struct nfs_client *clp) ++{ ++ struct nfs4_deviceid_cache *tmp = clp->cl_devid_cache; ++ int refcount; ++ ++ dprintk("--> %s cl_devid_cache %p\n", __func__, clp->cl_devid_cache); ++ spin_lock(&clp->cl_lock); ++ refcount = atomic_read(&clp->cl_devid_cache->dc_kref.refcount); ++ if (refcount == 1) ++ clp->cl_devid_cache = NULL; ++ spin_unlock(&clp->cl_lock); ++ dprintk("%s [%d]\n", __func__, refcount); ++ kref_put(&tmp->dc_kref, nfs4_free_deviceid_cache); ++} ++EXPORT_SYMBOL(nfs4_put_deviceid_cache); +diff -up linux-2.6.35.noarch/fs/nfs/pnfs.h.orig linux-2.6.35.noarch/fs/nfs/pnfs.h +--- linux-2.6.35.noarch/fs/nfs/pnfs.h.orig 2010-08-31 21:11:40.937150401 -0400 ++++ linux-2.6.35.noarch/fs/nfs/pnfs.h 2010-08-31 21:11:40.937150401 -0400 +@@ -0,0 +1,354 @@ ++/* ++ * fs/nfs/pnfs.h ++ * ++ * pNFS client data structures. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef FS_NFS_PNFS_H ++#define FS_NFS_PNFS_H ++ ++#include ++ ++#ifdef CONFIG_NFS_V4_1 ++ ++#include ++#include ++#include "iostat.h" ++ ++/* nfs4proc.c */ ++extern int nfs4_proc_getdevicelist(struct nfs_server *server, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, ++ struct pnfs_device *dev); ++extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); ++extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, ++ int issync); ++extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool wait); ++ ++/* pnfs.c */ ++extern const nfs4_stateid zero_stateid; ++ ++void _pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp); ++ ++int _pnfs_return_layout(struct inode *, struct pnfs_layout_range *, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type, bool wait); ++void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *mntfh, u32 id); ++void unmount_pnfs_layoutdriver(struct nfs_server *); ++enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, ++ const struct rpc_call_ops *); ++int pnfs_initialize(void); ++void pnfs_uninitialize(void); ++void pnfs_layoutcommit_free(struct nfs4_layoutcommit_data *data); ++void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); ++int pnfs_layoutcommit_inode(struct inode *inode, int sync); ++void pnfs_update_last_write(struct nfs_inode *nfsi, loff_t offset, size_t extent); ++void pnfs_need_layoutcommit(struct nfs_inode *nfsi, struct nfs_open_context *ctx); ++unsigned int pnfs_getiosize(struct nfs_server *server); ++void pnfs_set_ds_iosize(struct nfs_server *server); ++enum pnfs_try_status pnfs_try_to_commit(struct nfs_write_data *, ++ const struct rpc_call_ops *, int); ++void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *, ++ struct nfs_open_context *, struct list_head *, ++ size_t *); ++void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *, ++ size_t *); ++void pnfs_free_fsdata(struct pnfs_fsdata *fsdata); ++void pnfs_get_layout_done(struct nfs4_layoutget *, int rpc_status); ++int pnfs_layout_process(struct nfs4_layoutget *lgp); ++void pnfs_layout_release(struct pnfs_layout_hdr *, struct pnfs_layout_range *range); ++void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, ++ const nfs4_stateid *stateid); ++void pnfs_destroy_layout(struct nfs_inode *); ++void pnfs_destroy_all_layouts(struct nfs_client *); ++void put_layout(struct inode *inode); ++void pnfs_get_layout_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo); ++int _pnfs_write_begin(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata **fsdata); ++int _pnfs_write_end(struct inode *inode, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ ++#define PNFS_EXISTS_LDIO_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_io_ops && \ ++ (srv)->pnfs_curr_ld->ld_io_ops->opname) ++#define PNFS_EXISTS_LDPOLICY_OP(srv, opname) ((srv)->pnfs_curr_ld && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops && \ ++ (srv)->pnfs_curr_ld->ld_policy_ops->opname) ++ ++#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" ++ 
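The lo_fail_bit() helper that follows encodes one failure bit per access mode, which _pnfs_update_layout() tests before issuing another LAYOUTGET. A userspace sketch of the idea; the enum values and bit numbers here are invented for the illustration:

#include <stdio.h>

enum iomode { IOMODE_READ = 1, IOMODE_RW = 2 };

#define RO_FAILED_BIT 0
#define RW_FAILED_BIT 1

static int fail_bit(enum iomode m)
{
	return m == IOMODE_RW ? RW_FAILED_BIT : RO_FAILED_BIT;
}

int main(void)
{
	unsigned long state = 0;

	state |= 1UL << fail_bit(IOMODE_RW);    /* RW layoutget failed */
	printf("rw blocked: %d, read blocked: %d\n",
	       !!(state & (1UL << fail_bit(IOMODE_RW))),
	       !!(state & (1UL << fail_bit(IOMODE_READ))));
	return 0;
}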
++static inline int lo_fail_bit(u32 iomode) ++{ ++ return iomode == IOMODE_RW ? ++ NFS_INO_RW_LAYOUT_FAILED : NFS_INO_RO_LAYOUT_FAILED; ++} ++ ++/* Return true if a layout driver is being used for this mountpoint */ ++static inline int pnfs_enabled_sb(struct nfs_server *nfss) ++{ ++ return nfss->pnfs_curr_ld != NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return !fsdata || ((struct pnfs_layout_segment *)fsdata == lseg) || ++ !fsdata->bypass_eof; ++} ++ ++/* Should the pNFS client commit and return the layout upon a setattr */ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ if (!pnfs_enabled_sb(NFS_SERVER(inode))) ++ return false; ++ return NFS_SERVER(inode)->pnfs_curr_ld->ld_policy_ops->flags & ++ PNFS_LAYOUTRET_ON_SETATTR; ++} ++ ++/* Should the pNFS client commit and return the layout on close ++ */ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout->roc_iomode; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ int status = 0; ++ ++ *fsdata = lseg; ++ if (lseg && PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ status = _pnfs_write_begin(inode, page, pos, len, lseg, ++ (struct pnfs_fsdata **) fsdata); ++ return status; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct nfs_server *nfss = NFS_SERVER(inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end)) ++ return _pnfs_write_end(inode, page, pos, len, copied, lseg); ++ else ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_end_cleanup)) ++ nfss->pnfs_curr_ld->ld_io_ops->write_end_cleanup(filp, fsdata); ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ pnfs_free_fsdata(fsdata); ++ } ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ struct nfs_inode *nfsi = NFS_I(ino); ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss) && ++ (type != RETURN_FILE || has_layout(nfsi))) ++ return _pnfs_return_layout(ino, range, stateid, type, wait); ++ ++ return 0; ++} ++ ++static inline void pnfs_update_layout(struct inode *ino, ++ struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ struct nfs_server *nfss = NFS_SERVER(ino); ++ ++ if (pnfs_enabled_sb(nfss)) ++ _pnfs_update_layout(ino, ctx, pos, count, access_type, lsegpp); ++ else { ++ if (lsegpp) ++ *lsegpp = NULL; ++ } ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return data->pdata.pnfs_error; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ if (pnfs_enabled_sb(nfss)) ++ return pnfs_ld_use_rpc_code(nfss->pnfs_curr_ld); ++ ++ return 1; ++} ++ ++static inline struct 
pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ if (fsdata) { ++ struct nfs_server *nfss = NFS_SERVER(filp->f_dentry->d_inode); ++ ++ if (PNFS_EXISTS_LDIO_OP(nfss, write_begin)) ++ return ((struct pnfs_fsdata *) fsdata)->lseg; ++ } ++ return fsdata; ++} ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++} ++ ++static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) ++{ ++} ++ ++static inline void get_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void put_lseg(struct pnfs_layout_segment *lseg) ++{ ++} ++ ++static inline void ++pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, ++ loff_t pos, u64 count, enum pnfs_iomode access_type, ++ struct pnfs_layout_segment **lsegpp) ++{ ++ if (lsegpp) ++ *lsegpp = NULL; ++} ++ ++static inline int pnfs_grow_ok(struct pnfs_layout_segment *lseg, ++ struct pnfs_fsdata *fsdata) ++{ ++ return 1; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_read_data(struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_write_data(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline enum pnfs_try_status ++pnfs_try_to_commit(struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, int how) ++{ ++ return PNFS_NOT_ATTEMPTED; ++} ++ ++static inline int pnfs_write_begin(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, ++ struct pnfs_layout_segment *lseg, ++ void **fsdata) ++{ ++ *fsdata = NULL; ++ return 0; ++} ++ ++static inline int pnfs_write_end(struct file *filp, struct page *page, ++ loff_t pos, unsigned len, unsigned copied, ++ struct pnfs_layout_segment *lseg) ++{ ++ return 0; ++} ++ ++static inline void pnfs_write_end_cleanup(struct file *filp, void *fsdata) ++{ ++} ++ ++static inline int pnfs_get_write_status(struct nfs_write_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_get_read_status(struct nfs_read_data *data) ++{ ++ return 0; ++} ++ ++static inline int pnfs_use_rpc(struct nfs_server *nfss) ++{ ++ return 1; ++} ++ ++static inline int pnfs_layoutcommit_inode(struct inode *inode, int sync) ++{ ++ return 0; ++} ++ ++static inline bool ++pnfs_ld_layoutret_on_setattr(struct inode *inode) ++{ ++ return false; ++} ++ ++static inline int ++pnfs_layout_roc_iomode(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++static inline int pnfs_return_layout(struct inode *ino, ++ struct pnfs_layout_range *range, ++ const nfs4_stateid *stateid, /* optional */ ++ enum pnfs_layoutreturn_type type, ++ bool wait) ++{ ++ return 0; ++} ++ ++static inline struct pnfs_layout_segment * ++nfs4_pull_lseg_from_fsdata(struct file *filp, void *fsdata) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++#endif /* FS_NFS_PNFS_H */ +diff -up linux-2.6.35.noarch/fs/nfs/proc.c.orig linux-2.6.35.noarch/fs/nfs/proc.c +--- linux-2.6.35.noarch/fs/nfs/proc.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/proc.c 2010-08-31 21:11:40.938160559 -0400 +@@ -443,7 +443,7 @@ nfs_proc_symlink(struct inode *dir, stru + fattr = nfs_alloc_fattr(); + status = -ENOMEM; + if (fh == NULL || fattr == NULL) +- goto out; ++ goto out_free; + + status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); + nfs_mark_for_revalidate(dir); +@@ -455,7 +455,7 @@ nfs_proc_symlink(struct inode *dir, stru + */ + if (status == 0) + 
status = nfs_instantiate(dentry, fh, fattr); +- ++out_free: + nfs_free_fattr(fattr); + nfs_free_fhandle(fh); + out: +@@ -694,6 +694,7 @@ const struct nfs_rpc_ops nfs_v2_clientop + .dentry_ops = &nfs_dentry_operations, + .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, ++ .file_ops = &nfs_file_operations, + .getroot = nfs_proc_get_root, + .getattr = nfs_proc_getattr, + .setattr = nfs_proc_setattr, +diff -up linux-2.6.35.noarch/fs/nfs/read.c.orig linux-2.6.35.noarch/fs/nfs/read.c +--- linux-2.6.35.noarch/fs/nfs/read.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/read.c 2010-08-31 21:11:40.939170402 -0400 +@@ -18,8 +18,12 @@ + #include + #include + #include ++#include ++#include + + #include ++#include ++#include "pnfs.h" + + #include "nfs4_fs.h" + #include "internal.h" +@@ -117,11 +121,14 @@ int nfs_readpage_async(struct nfs_open_c + LIST_HEAD(one_request); + struct nfs_page *new; + unsigned int len; ++ struct pnfs_layout_segment *lseg; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); +- new = nfs_create_request(ctx, inode, page, 0, len); ++ pnfs_update_layout(inode, ctx, 0, NFS4_MAX_UINT64, IOMODE_READ, &lseg); ++ new = nfs_create_request(ctx, inode, page, 0, len, lseg); ++ put_lseg(lseg); + if (IS_ERR(new)) { + unlock_page(page); + return PTR_ERR(new); +@@ -155,24 +162,20 @@ static void nfs_readpage_release(struct + nfs_release_request(req); + } + +-/* +- * Set up the NFS read request struct +- */ +-static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset) ++int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, +@@ -180,9 +183,46 @@ static int nfs_read_rpcsetup(struct nfs_ + .flags = RPC_TASK_ASYNC | swap_flags, + }; + ++ /* Set up the initial task struct. 
*/ ++ NFS_PROTO(inode)->read_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_read); ++ ++int pnfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) ++ return pnfs_get_read_status(data); ++ ++ return nfs_initiate_read(data, clnt, call_ops); ++} ++ ++/* ++ * Set up the NFS read request struct ++ */ ++static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + data->req = req; + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -190,27 +230,14 @@ static int nfs_read_rpcsetup(struct nfs_ + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + + data->res.fattr = &data->fattr; + data->res.count = count; + data->res.eof = 0; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. */ +- NFS_PROTO(inode)->read_setup(data, &msg); +- +- dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ return pnfs_initiate_read(data, NFS_CLIENT(inode), call_ops); + } + + static void +@@ -354,7 +381,14 @@ static void nfs_readpage_retry(struct rp + { + struct nfs_readargs *argp = &data->args; + struct nfs_readres *resp = &data->res; ++ struct nfs_client *clp = NFS_SERVER(data->inode)->nfs_client; + ++#ifdef CONFIG_NFS_V4_1 ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + if (resp->eof || resp->count == argp->count) + return; + +@@ -368,7 +402,10 @@ static void nfs_readpage_retry(struct rp + argp->offset += resp->count; + argp->pgbase += resp->count; + argp->count -= resp->count; +- nfs_restart_rpc(task, NFS_SERVER(data->inode)->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + } + + /* +@@ -409,13 +446,19 @@ static void nfs_readpage_release_partial + void nfs_read_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_read_data *data = calldata; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, + &data->args.seq_args, &data->res.seq_res, + 0, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_read_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + 
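nfs_read_prepare() above picks the data server's session when the request carries one and the MDS session otherwise. A minimal sketch of that selection, with stand-in types:

#include <stdio.h>

struct session { const char *owner; };

static struct session mds = { "MDS" };
static struct session ds  = { "DS"  };

static struct session *pick(struct session *ds_session)
{
	return ds_session ? ds_session : &mds;
}

int main(void)
{
	printf("plain read goes via %s\n", pick(NULL)->owner);
	printf("pNFS read goes via %s\n", pick(&ds)->owner);
	return 0;
}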
static const struct rpc_call_ops nfs_read_partial_ops = { +@@ -568,7 +611,8 @@ readpage_async_filler(void *data, struct + if (len == 0) + return nfs_return_empty_page(page); + +- new = nfs_create_request(desc->ctx, inode, page, 0, len); ++ new = nfs_create_request(desc->ctx, inode, page, 0, len, ++ desc->pgio->pg_lseg); + if (IS_ERR(new)) + goto out_error; + +@@ -624,6 +668,9 @@ int nfs_readpages(struct file *filp, str + if (ret == 0) + goto read_complete; /* all pages were read */ + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_read(&pgio, inode, desc.ctx, pages, &rsize); ++#endif /* CONFIG_NFS_V4_1 */ + if (rsize < PAGE_CACHE_SIZE) + nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); + else +@@ -632,6 +679,7 @@ int nfs_readpages(struct file *filp, str + ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); + + nfs_pageio_complete(&pgio); ++ put_lseg(pgio.pg_lseg); + npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); + read_complete: +diff -up linux-2.6.35.noarch/fs/nfs/super.c.orig linux-2.6.35.noarch/fs/nfs/super.c +--- linux-2.6.35.noarch/fs/nfs/super.c.orig 2010-08-31 19:12:23.918150053 -0400 ++++ linux-2.6.35.noarch/fs/nfs/super.c 2010-08-31 21:11:40.940160289 -0400 +@@ -64,6 +64,7 @@ + #include "iostat.h" + #include "internal.h" + #include "fscache.h" ++#include "pnfs.h" + + #define NFSDBG_FACILITY NFSDBG_VFS + +@@ -676,6 +677,28 @@ static int nfs_show_options(struct seq_f + + return 0; + } ++#ifdef CONFIG_NFS_V4_1 ++void show_sessions(struct seq_file *m, struct nfs_server *server) ++{ ++ if (nfs4_has_session(server->nfs_client)) ++ seq_printf(m, ",sessions"); ++} ++#else ++void show_sessions(struct seq_file *m, struct nfs_server *server) {} ++#endif ++ ++#ifdef CONFIG_NFS_V4_1 ++void show_pnfs(struct seq_file *m, struct nfs_server *server) ++{ ++ seq_printf(m, ",pnfs="); ++ if (server->pnfs_curr_ld) ++ seq_printf(m, "%s", server->pnfs_curr_ld->name); ++ else ++ seq_printf(m, "not configured"); ++} ++#else /* CONFIG_NFS_V4_1 */ ++void show_pnfs(struct seq_file *m, struct nfs_server *server) {} ++#endif /* CONFIG_NFS_V4_1 */ + + /* + * Present statistical information for this VFS mountpoint +@@ -714,6 +737,8 @@ static int nfs_show_stats(struct seq_fil + seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); + seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); + seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); ++ show_sessions(m, nfss); ++ show_pnfs(m, nfss); + } + #endif + +diff -up linux-2.6.35.noarch/fs/nfs/unlink.c.orig linux-2.6.35.noarch/fs/nfs/unlink.c +--- linux-2.6.35.noarch/fs/nfs/unlink.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/unlink.c 2010-08-31 21:11:40.941150860 -0400 +@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task + struct nfs_unlinkdata *data = calldata; + struct nfs_server *server = NFS_SERVER(data->dir); + +- if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, ++ if (nfs4_setup_sequence(server, NULL, &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); +diff -up linux-2.6.35.noarch/fs/nfs/write.c.orig linux-2.6.35.noarch/fs/nfs/write.c +--- linux-2.6.35.noarch/fs/nfs/write.c.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/fs/nfs/write.c 2010-08-31 21:11:40.943150294 -0400 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + +@@ -28,6 +29,7 @@ + #include "iostat.h" + #include "nfs4_fs.h" + #include "fscache.h" ++#include "pnfs.h" + + #define 
NFSDBG_FACILITY NFSDBG_PAGECACHE + +@@ -59,6 +61,7 @@ struct nfs_write_data *nfs_commitdata_al + } + return p; + } ++EXPORT_SYMBOL(nfs_commitdata_alloc); + + void nfs_commit_free(struct nfs_write_data *p) + { +@@ -66,6 +69,7 @@ void nfs_commit_free(struct nfs_write_da + kfree(p->pagevec); + mempool_free(p, nfs_commit_mempool); + } ++EXPORT_SYMBOL(nfs_commit_free); + + struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) + { +@@ -429,6 +433,17 @@ static void nfs_inode_remove_request(str + nfs_clear_request(req); + nfs_release_request(req); + } ++static void ++nfs_mark_request_nopnfs(struct nfs_page *req) ++{ ++ struct pnfs_layout_segment *lseg = req->wb_lseg; ++ ++ if (req->wb_lseg == NULL) ++ return; ++ req->wb_lseg = NULL; ++ put_lseg(lseg); ++ dprintk(" retry through MDS\n"); ++} + + static void + nfs_mark_request_dirty(struct nfs_page *req) +@@ -534,7 +549,7 @@ nfs_need_commit(struct nfs_inode *nfsi) + * The requests are *not* checked to ensure that they form a contiguous set. + */ + static int +-nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages) ++nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int *use_pnfs) + { + struct nfs_inode *nfsi = NFS_I(inode); + int ret; +@@ -542,7 +557,8 @@ nfs_scan_commit(struct inode *inode, str + if (!nfs_need_commit(nfsi)) + return 0; + +- ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); ++ ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT, ++ use_pnfs); + if (ret > 0) + nfsi->ncommit -= ret; + if (nfs_need_commit(NFS_I(inode))) +@@ -571,7 +587,8 @@ static inline int nfs_scan_commit(struct + static struct nfs_page *nfs_try_to_update_request(struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int bytes) ++ unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_page *req; + unsigned int rqend; +@@ -596,8 +613,8 @@ static struct nfs_page *nfs_try_to_updat + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ +- if (offset > rqend +- || end < req->wb_offset) ++ if (offset > rqend || end < req->wb_offset || ++ req->wb_lseg != lseg) + goto out_flushme; + + if (nfs_set_page_tag_locked(req)) +@@ -645,16 +662,17 @@ out_err: + * already called nfs_flush_incompatible() if necessary. 
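The patched nfs_try_to_update_request() above tightens the coalescing test: byte ranges must touch and both requests must carry the same layout segment, otherwise the cached request is flushed first. A self-contained sketch of the predicate, with fields reduced to what the test needs:

#include <stdio.h>

struct req { unsigned off, end; const void *lseg; };

static int can_extend(const struct req *r, unsigned off, unsigned end,
		      const void *lseg)
{
	return !(off > r->end || end < r->off || r->lseg != lseg);
}

int main(void)
{
	int seg_a, seg_b;
	struct req r = { 0, 4096, &seg_a };

	printf("%d %d %d\n",
	       can_extend(&r, 4096, 8192, &seg_a),  /* contiguous, same lseg */
	       can_extend(&r, 9000, 9999, &seg_a),  /* gap: flush first */
	       can_extend(&r, 4096, 8192, &seg_b)); /* other lseg: flush */
	return 0;
}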
+ */ + static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, +- struct page *page, unsigned int offset, unsigned int bytes) ++ struct page *page, unsigned int offset, unsigned int bytes, ++ struct pnfs_layout_segment *lseg) + { + struct inode *inode = page->mapping->host; + struct nfs_page *req; + int error; + +- req = nfs_try_to_update_request(inode, page, offset, bytes); ++ req = nfs_try_to_update_request(inode, page, offset, bytes, lseg); + if (req != NULL) + goto out; +- req = nfs_create_request(ctx, inode, page, offset, bytes); ++ req = nfs_create_request(ctx, inode, page, offset, bytes, lseg); + if (IS_ERR(req)) + goto out; + error = nfs_inode_add_request(inode, req); +@@ -667,23 +685,27 @@ out: + } + + static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, ++ void *fsdata) + { + struct nfs_page *req; + +- req = nfs_setup_write_request(ctx, page, offset, count); ++ req = nfs_setup_write_request(ctx, page, offset, count, lseg); + if (IS_ERR(req)) + return PTR_ERR(req); + nfs_mark_request_dirty(req); + /* Update file length */ +- nfs_grow_file(page, offset, count); ++ if (pnfs_grow_ok(lseg, fsdata)) ++ nfs_grow_file(page, offset, count); + nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + return 0; + } + +-int nfs_flush_incompatible(struct file *file, struct page *page) ++int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct nfs_page *req; +@@ -700,7 +722,10 @@ int nfs_flush_incompatible(struct file * + req = nfs_page_find_request(page); + if (req == NULL) + return 0; +- do_flush = req->wb_page != page || req->wb_context != ctx; ++ do_flush = req->wb_page != page || req->wb_context != ctx || ++ req->wb_lock_context->lockowner != current->files || ++ req->wb_lock_context->pid != current->tgid || ++ req->wb_lseg != lseg; + nfs_release_request(req); + if (!do_flush) + return 0; +@@ -727,7 +752,8 @@ static int nfs_write_pageuptodate(struct + * things with a page scheduled for an RPC call (e.g. invalidate it). + */ + int nfs_updatepage(struct file *file, struct page *page, +- unsigned int offset, unsigned int count) ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void *fsdata) + { + struct nfs_open_context *ctx = nfs_file_open_context(file); + struct inode *inode = page->mapping->host; +@@ -752,7 +778,7 @@ int nfs_updatepage(struct file *file, st + offset = 0; + } + +- status = nfs_writepage_setup(ctx, page, offset, count); ++ status = nfs_writepage_setup(ctx, page, offset, count, lseg, fsdata); + if (status < 0) + nfs_set_pageerror(page); + +@@ -782,25 +808,21 @@ static int flush_task_priority(int how) + return RPC_PRIORITY_NORMAL; + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_write_rpcsetup(struct nfs_page *req, +- struct nfs_write_data *data, +- const struct rpc_call_ops *call_ops, +- unsigned int count, unsigned int offset, +- int how) ++int nfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct inode *inode = req->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = req->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .task = &data->task, + .rpc_message = &msg, + .callback_ops = call_ops, +@@ -811,12 +833,62 @@ static int nfs_write_rpcsetup(struct nfs + }; + int ret = 0; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->write_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated write call " ++ "(req %s/%lld, %u bytes @ offset %llu)\n", ++ data->task.tk_pid, ++ inode->i_sb->s_id, ++ (long long)NFS_FILEID(inode), ++ data->args.count, ++ (unsigned long long)data->args.offset); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ if (how & FLUSH_SYNC) { ++ ret = rpc_wait_for_completion_task(task); ++ if (ret == 0) ++ ret = task->tk_status; ++ } ++ rpc_put_task(task); ++out: ++ return ret; ++} ++EXPORT_SYMBOL(nfs_initiate_write); ++ ++int pnfs_initiate_write(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) ++{ ++ if (data->req->wb_lseg && ++ (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_write(data, clnt, call_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_write_rpcsetup(struct nfs_page *req, ++ struct nfs_write_data *data, ++ const struct rpc_call_ops *call_ops, ++ unsigned int count, unsigned int offset, ++ int how) ++{ ++ struct inode *inode = req->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + data->req = req; + data->inode = inode = req->wb_context->path.dentry->d_inode; +- data->cred = msg.rpc_cred; ++ data->cred = req->wb_context->cred; + + data->args.fh = NFS_FH(inode); + data->args.offset = req_offset(req) + offset; +@@ -824,6 +896,7 @@ static int nfs_write_rpcsetup(struct nfs + data->args.pages = data->pagevec; + data->args.count = count; + data->args.context = get_nfs_open_context(req->wb_context); ++ data->args.lock_context = req->wb_lock_context; + data->args.stable = NFS_UNSTABLE; + if (how & FLUSH_STABLE) { + data->args.stable = NFS_DATA_SYNC; +@@ -836,30 +909,7 @@ static int nfs_write_rpcsetup(struct nfs + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); + +- /* Set up the initial task struct. 
*/ +- NFS_PROTO(inode)->write_setup(data, &msg); +- +- dprintk("NFS: %5u initiated write call " +- "(req %s/%lld, %u bytes @ offset %llu)\n", +- data->task.tk_pid, +- inode->i_sb->s_id, +- (long long)NFS_FILEID(inode), +- count, +- (unsigned long long)data->args.offset); +- +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) { +- ret = PTR_ERR(task); +- goto out; +- } +- if (how & FLUSH_SYNC) { +- ret = rpc_wait_for_completion_task(task); +- if (ret == 0) +- ret = task->tk_status; +- } +- rpc_put_task(task); +-out: +- return ret; ++ return pnfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); + } + + /* If a nfs_flush_* function fails, it should remove reqs from @head and +@@ -870,6 +920,7 @@ static void nfs_redirty_request(struct n + { + struct page *page = req->wb_page; + ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); +@@ -982,6 +1033,10 @@ static void nfs_pageio_init_write(struct + { + size_t wsize = NFS_SERVER(inode)->wsize; + ++#ifdef CONFIG_NFS_V4_1 ++ pnfs_pageio_init_write(pgio, inode, &wsize); ++#endif /* CONFIG_NFS_V4_1 */ ++ + if (wsize < PAGE_CACHE_SIZE) + nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); + else +@@ -1047,13 +1102,27 @@ out: + void nfs_write_prepare(struct rpc_task *task, void *calldata) + { + struct nfs_write_data *data = calldata; +- struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; ++ struct nfs4_session *ds_session = NULL; + +- if (nfs4_setup_sequence(clp, &data->args.seq_args, ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS read\n", __func__); ++ ds_session = data->fldata.ds_nfs_client->cl_session; ++ } else if (data->args.count > NFS_SERVER(data->inode)->wsize) { ++ /* retrying via MDS? 
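When a write built for a data server is retried through the MDS, its count may exceed the MDS wsize, so nfs_write_prepare() stashes the original count and clamps the request, as the code just below does. A sketch of the trim with invented field names:

#include <stdio.h>

struct wargs { unsigned count, orig_count; };

static void trim_for_mds(struct wargs *a, unsigned wsize)
{
	if (a->count > wsize) {
		a->orig_count = a->count;
		a->count = wsize;
	} else {
		a->orig_count = 0;      /* nothing was trimmed */
	}
}

int main(void)
{
	struct wargs a = { 262144, 0 };

	trim_for_mds(&a, 65536);
	printf("count %u (was %u)\n", a.count, a.orig_count);
	return 0;
}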
*/ ++ data->pdata.orig_count = data->args.count; ++ data->args.count = NFS_SERVER(data->inode)->wsize; ++ dprintk("%s: trimmed count %u to wsize %u\n", __func__, ++ data->pdata.orig_count, data->args.count); ++ } else ++ data->pdata.orig_count = 0; ++ ++ if (nfs4_setup_sequence(NFS_SERVER(data->inode), ds_session, ++ &data->args.seq_args, + &data->res.seq_res, 1, task)) + return; + rpc_call_start(task); + } ++EXPORT_SYMBOL(nfs_write_prepare); + #endif /* CONFIG_NFS_V4_1 */ + + static const struct rpc_call_ops nfs_write_partial_ops = { +@@ -1137,10 +1206,11 @@ int nfs_writeback_done(struct rpc_task * + struct nfs_writeargs *argp = &data->args; + struct nfs_writeres *resp = &data->res; + struct nfs_server *server = NFS_SERVER(data->inode); ++ struct nfs_client *clp = server->nfs_client; + int status; + +- dprintk("NFS: %5u nfs_writeback_done (status %d)\n", +- task->tk_pid, task->tk_status); ++ dprintk("NFS: %5u nfs_writeback_done (status %d count %u)\n", ++ task->tk_pid, task->tk_status, resp->count); + + /* + * ->write_done will attempt to use post-op attributes to detect +@@ -1153,6 +1223,13 @@ int nfs_writeback_done(struct rpc_task * + if (status != 0) + return status; + nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); ++#ifdef CONFIG_NFS_V4_1 ++ /* Is this a DS session */ ++ if (data->fldata.ds_nfs_client) { ++ dprintk("%s DS write\n", __func__); ++ clp = data->fldata.ds_nfs_client; ++ } ++#endif /* CONFIG_NFS_V4_1 */ + + #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) + if (resp->verf->committed < argp->stable && task->tk_status >= 0) { +@@ -1169,7 +1246,7 @@ int nfs_writeback_done(struct rpc_task * + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", +- server->nfs_client->cl_hostname, ++ clp->cl_hostname, + resp->verf->committed, argp->stable); + complain = jiffies + 300 * HZ; + } +@@ -1179,6 +1256,9 @@ int nfs_writeback_done(struct rpc_task * + if (task->tk_status >= 0 && resp->count < argp->count) { + static unsigned long complain; + ++ dprintk("NFS: short write:" ++ " (resp->count %u) < (argp->count = %u)\n", ++ resp->count, argp->count); + nfs_inc_stats(data->inode, NFSIOS_SHORTWRITE); + + /* Has the server at least made some progress? */ +@@ -1195,7 +1275,10 @@ int nfs_writeback_done(struct rpc_task * + */ + argp->stable = NFS_FILE_SYNC; + } +- nfs_restart_rpc(task, server->nfs_client); ++#ifdef CONFIG_NFS_V4_1 ++ data->pdata.pnfs_error = -EAGAIN; ++#endif /* CONFIG_NFS_V4_1 */ ++ nfs_restart_rpc(task, clp); + return -EAGAIN; + } + if (time_before(complain, jiffies)) { +@@ -1239,40 +1322,73 @@ static void nfs_commitdata_release(void + nfs_commit_free(wdata); + } + +-/* +- * Set up the argument/result storage required for the RPC call. 
+- */ +-static int nfs_commit_rpcsetup(struct list_head *head, +- struct nfs_write_data *data, +- int how) ++int nfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how) + { +- struct nfs_page *first = nfs_list_entry(head->next); +- struct inode *inode = first->wb_context->path.dentry->d_inode; ++ struct inode *inode = data->inode; + int priority = flush_task_priority(how); + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, +- .rpc_cred = first->wb_context->cred, ++ .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, +- .rpc_client = NFS_CLIENT(inode), ++ .rpc_client = clnt, + .rpc_message = &msg, +- .callback_ops = &nfs_commit_ops, ++ .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC, + .priority = priority, + }; + ++ /* Set up the initial task struct. */ ++ NFS_PROTO(inode)->commit_setup(data, &msg); ++ ++ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++ ++ task = rpc_run_task(&task_setup_data); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ rpc_put_task(task); ++ return 0; ++} ++EXPORT_SYMBOL(nfs_initiate_commit); ++ ++ ++int pnfs_initiate_commit(struct nfs_write_data *data, ++ struct rpc_clnt *clnt, ++ const struct rpc_call_ops *call_ops, ++ int how, int pnfs) ++{ ++ if (pnfs && ++ (pnfs_try_to_commit(data, &nfs_commit_ops, how) == PNFS_ATTEMPTED)) ++ return pnfs_get_write_status(data); ++ ++ return nfs_initiate_commit(data, clnt, &nfs_commit_ops, how); ++} ++ ++/* ++ * Set up the argument/result storage required for the RPC call. ++ */ ++static int nfs_commit_rpcsetup(struct list_head *head, ++ struct nfs_write_data *data, ++ int how, int pnfs) ++{ ++ struct nfs_page *first = nfs_list_entry(head->next); ++ struct inode *inode = first->wb_context->path.dentry->d_inode; ++ + /* Set up the RPC argument and reply structs + * NB: take care not to mess about with data->commit et al. */ + + list_splice_init(head, &data->pages); + + data->inode = inode; +- data->cred = msg.rpc_cred; ++ data->cred = first->wb_context->cred; + + data->args.fh = NFS_FH(data->inode); + /* Note: we always request a commit of the entire inode */ +@@ -1283,45 +1399,47 @@ static int nfs_commit_rpcsetup(struct li + data->res.fattr = &data->fattr; + data->res.verf = &data->verf; + nfs_fattr_init(&data->fattr); ++ kref_init(&data->refcount); ++ data->parent = NULL; ++ data->args.context = first->wb_context; /* used by commit done */ + +- /* Set up the initial task struct. 
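The kref fields initialized just above (data->refcount, data->parent) support cloned commits: each clone pins its parent, and the commit lock is cleared only when the last reference drops, see nfs_commit_cleanup() below. A sketch with a plain counter standing in for struct kref:

#include <stdio.h>

struct commit {
	int refs;
	struct commit *parent;
};

static void commit_put(struct commit *c)
{
	if (--c->refs)
		return;
	if (c->parent)
		commit_put(c->parent);  /* release the clone's hold */
	else
		printf("last reference gone: clear commit lock\n");
}

int main(void)
{
	struct commit parent = { 1, NULL };
	struct commit clone  = { 1, &parent };

	parent.refs++;          /* clone's reference on the parent */
	commit_put(&clone);     /* drops clone, then parent once */
	commit_put(&parent);    /* original reference: lock clears */
	return 0;
}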
*/ +- NFS_PROTO(inode)->commit_setup(data, &msg); ++ return pnfs_initiate_commit(data, NFS_CLIENT(inode), &nfs_commit_ops, ++ how, pnfs); ++} + +- dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); ++/* Handle memory error during commit */ ++void nfs_mark_list_commit(struct list_head *head) ++{ ++ struct nfs_page *req; + +- task = rpc_run_task(&task_setup_data); +- if (IS_ERR(task)) +- return PTR_ERR(task); +- rpc_put_task(task); +- return 0; ++ while (!list_empty(head)) { ++ req = nfs_list_entry(head->next); ++ nfs_list_remove_request(req); ++ nfs_mark_request_commit(req); ++ dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); ++ dec_bdi_stat(req->wb_page->mapping->backing_dev_info, ++ BDI_RECLAIMABLE); ++ nfs_clear_page_tag_locked(req); ++ } + } ++EXPORT_SYMBOL(nfs_mark_list_commit); + + /* + * Commit dirty pages + */ + static int +-nfs_commit_list(struct inode *inode, struct list_head *head, int how) ++nfs_commit_list(struct inode *inode, struct list_head *head, int how, int pnfs) + { + struct nfs_write_data *data; +- struct nfs_page *req; + + data = nfs_commitdata_alloc(); +- + if (!data) + goto out_bad; + + /* Set up the argument struct */ +- return nfs_commit_rpcsetup(head, data, how); ++ return nfs_commit_rpcsetup(head, data, how, pnfs); + out_bad: +- while (!list_empty(head)) { +- req = nfs_list_entry(head->next); +- nfs_list_remove_request(req); +- nfs_mark_request_commit(req); +- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); +- dec_bdi_stat(req->wb_page->mapping->backing_dev_info, +- BDI_RECLAIMABLE); +- nfs_clear_page_tag_locked(req); +- } ++ nfs_mark_list_commit(head); + nfs_commit_clear_lock(NFS_I(inode)); + return -ENOMEM; + } +@@ -1341,6 +1459,19 @@ static void nfs_commit_done(struct rpc_t + return; + } + ++static inline void nfs_commit_cleanup(struct kref *kref) ++{ ++ struct nfs_write_data *data; ++ ++ data = container_of(kref, struct nfs_write_data, refcount); ++ /* Clear lock only when all cloned commits are finished */ ++ if (data->parent) ++ kref_put(&data->parent->refcount, nfs_commit_cleanup); ++ else ++ nfs_commit_clear_lock(NFS_I(data->inode)); ++ nfs_commitdata_release(data); ++} ++ + static void nfs_commit_release(void *calldata) + { + struct nfs_write_data *data = calldata; +@@ -1358,6 +1489,11 @@ static void nfs_commit_release(void *cal + req->wb_bytes, + (long long)req_offset(req)); + if (status < 0) { ++ if (req->wb_lseg) { ++ nfs_mark_request_nopnfs(req); ++ nfs_mark_request_dirty(req); ++ goto next; ++ } + nfs_context_set_write_error(req->wb_context, status); + nfs_inode_remove_request(req); + dprintk(", error = %d\n", status); +@@ -1374,12 +1510,12 @@ static void nfs_commit_release(void *cal + } + /* We have a mismatch. 
Write the page again */ + dprintk(" mismatch\n"); ++ nfs_mark_request_nopnfs(req); + nfs_mark_request_dirty(req); + next: + nfs_clear_page_tag_locked(req); + } +- nfs_commit_clear_lock(NFS_I(data->inode)); +- nfs_commitdata_release(calldata); ++ kref_put(&data->refcount, nfs_commit_cleanup); + } + + static const struct rpc_call_ops nfs_commit_ops = { +@@ -1395,21 +1531,22 @@ int nfs_commit_inode(struct inode *inode + LIST_HEAD(head); + int may_wait = how & FLUSH_SYNC; + int res = 0; ++ int use_pnfs = 0; + + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out_mark_dirty; + spin_lock(&inode->i_lock); +- res = nfs_scan_commit(inode, &head, 0, 0); ++ res = nfs_scan_commit(inode, &head, 0, 0, &use_pnfs); + spin_unlock(&inode->i_lock); + if (res) { +- int error = nfs_commit_list(inode, &head, how); ++ int error = nfs_commit_list(inode, &head, how, use_pnfs); + if (error < 0) + return error; +- if (may_wait) ++ if (may_wait) { + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); +- else ++ } else + goto out_mark_dirty; + } else + nfs_commit_clear_lock(NFS_I(inode)); +@@ -1462,7 +1599,18 @@ static int nfs_commit_unstable_pages(str + + int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) + { +- return nfs_commit_unstable_pages(inode, wbc); ++ int ret; ++ ret = nfs_commit_unstable_pages(inode, wbc); ++ if (ret >= 0 && layoutcommit_needed(NFS_I(inode))) { ++ int err, sync = wbc->sync_mode; ++ ++ if (wbc->nonblocking || wbc->for_background) ++ sync = 0; ++ err = pnfs_layoutcommit_inode(inode, sync); ++ if (err < 0) ++ ret = err; ++ } ++ return ret; + } + + /* +@@ -1470,6 +1618,7 @@ int nfs_write_inode(struct inode *inode, + */ + int nfs_wb_all(struct inode *inode) + { ++ int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, +@@ -1477,7 +1626,8 @@ int nfs_wb_all(struct inode *inode) + .range_end = LLONG_MAX, + }; + +- return sync_inode(inode, &wbc); ++ ret = sync_inode(inode, &wbc); ++ return ret; + } + + int nfs_wb_page_cancel(struct inode *inode, struct page *page) +diff -up linux-2.6.35.noarch/include/linux/exportfs.h.orig linux-2.6.35.noarch/include/linux/exportfs.h +--- linux-2.6.35.noarch/include/linux/exportfs.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/exportfs.h 2010-08-31 21:11:40.973160288 -0400 +@@ -2,6 +2,7 @@ + #define LINUX_EXPORTFS_H 1 + + #include ++#include + + struct dentry; + struct inode; +@@ -175,4 +176,62 @@ extern struct dentry *generic_fh_to_pare + struct fid *fid, int fh_len, int fh_type, + struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)); + ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct pnfs_filelayout_device; ++struct pnfs_filelayout_layout; ++ ++extern int filelayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_device *fdev); ++extern enum nfsstat4 filelayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct pnfs_filelayout_layout *flp); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_EXPORTFS_FILE_LAYOUT) ++struct list_head; ++ ++extern int blocklayout_encode_devinfo(struct exp_xdr_stream *xdr, ++ const struct list_head *volumes); ++ ++extern enum nfsstat4 blocklayout_encode_layout(struct exp_xdr_stream *xdr, ++ const struct list_head *layouts); ++#endif /* defined(CONFIG_EXPORTFS_FILE_LAYOUT) */ ++ ++#if defined(CONFIG_PNFSD) ++#include ++ ++struct pnfsd_cb_operations; ++ ++struct pnfsd_cb_ctl { ++ spinlock_t lock; ++ struct 
module *module; ++ const struct pnfsd_cb_operations *cb_op; ++}; ++ ++/* in expfs.c so that file systems can depend on it */ ++extern struct pnfsd_cb_ctl pnfsd_cb_ctl; ++ ++static inline int ++pnfsd_get_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ int ret = -ENOENT; ++ ++ spin_lock(&pnfsd_cb_ctl.lock); ++ if (!pnfsd_cb_ctl.cb_op) ++ goto out; ++ if (!try_module_get(pnfsd_cb_ctl.module)) ++ goto out; ++ ctl->cb_op = pnfsd_cb_ctl.cb_op; ++ ctl->module = pnfsd_cb_ctl.module; ++ ret = 0; ++out: ++ spin_unlock(&pnfsd_cb_ctl.lock); ++ return ret; ++} ++ ++static inline void ++pnfsd_put_cb_op(struct pnfsd_cb_ctl *ctl) ++{ ++ module_put(ctl->module); ++} ++#endif /* CONFIG_PNFSD */ + #endif /* LINUX_EXPORTFS_H */ +diff -up linux-2.6.35.noarch/include/linux/exp_xdr.h.orig linux-2.6.35.noarch/include/linux/exp_xdr.h +--- linux-2.6.35.noarch/include/linux/exp_xdr.h.orig 2010-08-31 21:11:40.971022995 -0400 ++++ linux-2.6.35.noarch/include/linux/exp_xdr.h 2010-08-31 21:11:40.972150320 -0400 +@@ -0,0 +1,141 @@ ++#ifndef _LINUX_EXP_XDR_H ++#define _LINUX_EXP_XDR_H ++ ++#include ++#include ++#include ++ ++struct exp_xdr_stream { ++ __be32 *p; ++ __be32 *end; ++}; ++ ++/** ++ * exp_xdr_qwords - Calculate the number of quad-words holding nbytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline size_t ++exp_xdr_qwords(__u32 nbytes) ++{ ++ return DIV_ROUND_UP(nbytes, 4); ++} ++ ++/** ++ * exp_xdr_qbytes - Calculate the number of bytes holding qwords ++ * @qwords: number of quad-words to encode ++ */ ++static inline size_t ++exp_xdr_qbytes(size_t qwords) ++{ ++ return qwords << 2; ++} ++ ++/** ++ * exp_xdr_reserve_space - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nbytes: number of bytes to reserve ++ * ++ * Checks that we have enough buffer space to encode 'nbytes' more ++ * bytes of data. If so, update the xdr stream. 
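++ *
++ * Returns a pointer to the reserved slot, or NULL when the stream is
++ * too small.  A typical encoder is therefore expected to look roughly
++ * like this sketch (the fdev field names are illustrative only):
++ *
++ *        p = exp_xdr_reserve_qwords(xdr, 2 + 1);
++ *        if (!p)
++ *                return NFS4ERR_TOOSMALL;
++ *        p = exp_xdr_encode_u64(p, fdev->example_id);
++ *        exp_xdr_encode_u32(p, fdev->example_count);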
++ */ ++static inline __be32 * ++exp_xdr_reserve_space(struct exp_xdr_stream *xdr, size_t nbytes) ++{ ++ __be32 *p = xdr->p; ++ __be32 *q; ++ ++ /* align nbytes on the next 32-bit boundary */ ++ q = p + exp_xdr_qwords(nbytes); ++ if (unlikely(q > xdr->end || q < p)) ++ return NULL; ++ xdr->p = q; ++ return p; ++} ++ ++/** ++ * exp_xdr_reserve_qwords - Reserve buffer space for sending ++ * @xdr: pointer to exp_xdr_stream ++ * @nwords: number of quad words (u32's) to reserve ++ */ ++static inline __be32 * ++exp_xdr_reserve_qwords(struct exp_xdr_stream *xdr, size_t qwords) ++{ ++ return exp_xdr_reserve_space(xdr, exp_xdr_qbytes(qwords)); ++} ++ ++/** ++ * exp_xdr_encode_u32 - Encode an unsigned 32-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u32(__be32 *p, __u32 val) ++{ ++ *p = cpu_to_be32(val); ++ return p + 1; ++} ++ ++/** ++ * exp_xdr_encode_u64 - Encode an unsigned 64-bit value onto a xdr stream ++ * @p: pointer to encoding destination ++ * @val: value to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_u64(__be32 *p, __u64 val) ++{ ++ put_unaligned_be64(val, p); ++ return p + 2; ++} ++ ++/** ++ * exp_xdr_encode_bytes - Encode an array of bytes onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the array of bytes ++ * @nbytes: number of bytes to encode ++ */ ++static inline __be32 * ++exp_xdr_encode_bytes(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ if (likely(nbytes != 0)) { ++ unsigned int qwords = exp_xdr_qwords(nbytes); ++ unsigned int padding = exp_xdr_qbytes(qwords) - nbytes; ++ ++ memcpy(p, ptr, nbytes); ++ if (padding != 0) ++ memset((char *)p + nbytes, 0, padding); ++ p += qwords; ++ } ++ return p; ++} ++ ++/** ++ * exp_xdr_encode_opaque - Encode an opaque type onto a xdr stream ++ * @p: pointer to encoding destination ++ * @ptr: pointer to the opaque array ++ * @nbytes: number of bytes to encode ++ * ++ * Encodes the 32-bit opaque size in bytes followed by the opaque value. 
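++ *
++ * The value itself is zero-padded up to the next 4-byte boundary by
++ * exp_xdr_encode_bytes, so the stream stays word-aligned: a 5-byte
++ * opaque, for example, consumes 4 (length) + 8 (value plus pad) bytes.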
++ */ ++static inline __be32 * ++exp_xdr_encode_opaque(__be32 *p, const void *ptr, __u32 nbytes) ++{ ++ p = exp_xdr_encode_u32(p, nbytes); ++ return exp_xdr_encode_bytes(p, ptr, nbytes); ++} ++ ++/** ++ * exp_xdr_encode_opaque_qlen - Encode the opaque length onto a xdr stream ++ * @lenp: pointer to the opaque length destination ++ * @endp: pointer to the end of the opaque array ++ * ++ * Encodes the 32-bit opaque size in bytes given the start and end pointers ++ */ ++static inline __be32 * ++exp_xdr_encode_opaque_len(__be32 *lenp, const void *endp) ++{ ++ size_t nbytes = (char *)endp - (char *)(lenp + 1); ++ ++ exp_xdr_encode_u32(lenp, nbytes); ++ return lenp + 1 + exp_xdr_qwords(nbytes); ++} ++#endif /* _LINUX_EXP_XDR_H */ +diff -up linux-2.6.35.noarch/include/linux/fs.h.orig linux-2.6.35.noarch/include/linux/fs.h +--- linux-2.6.35.noarch/include/linux/fs.h.orig 2010-08-31 19:12:24.838202665 -0400 ++++ linux-2.6.35.noarch/include/linux/fs.h 2010-08-31 21:11:40.975150080 -0400 +@@ -388,6 +388,7 @@ struct inodes_stat_t { + #include + + struct export_operations; ++struct pnfs_export_operations; + struct hd_geometry; + struct iovec; + struct nameidata; +@@ -1330,6 +1331,7 @@ struct super_block { + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; ++ const struct pnfs_export_operations *s_pnfs_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; +diff -up linux-2.6.35.noarch/include/linux/nfs4.h.orig linux-2.6.35.noarch/include/linux/nfs4.h +--- linux-2.6.35.noarch/include/linux/nfs4.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs4.h 2010-08-31 21:11:40.977150194 -0400 +@@ -17,7 +17,10 @@ + + #define NFS4_BITMAP_SIZE 2 + #define NFS4_VERIFIER_SIZE 8 +-#define NFS4_STATEID_SIZE 16 ++#define NFS4_CLIENTID_SIZE 8 ++#define NFS4_STATEID_SEQID_SIZE 4 ++#define NFS4_STATEID_OTHER_SIZE 12 ++#define NFS4_STATEID_SIZE (NFS4_STATEID_SEQID_SIZE + NFS4_STATEID_OTHER_SIZE) + #define NFS4_FHSIZE 128 + #define NFS4_MAXPATHLEN PATH_MAX + #define NFS4_MAXNAMLEN NAME_MAX +@@ -119,6 +122,13 @@ + #define EXCHGID4_FLAG_MASK_A 0x40070003 + #define EXCHGID4_FLAG_MASK_R 0x80070003 + ++static inline bool ++is_ds_only_session(u32 exchange_flags) ++{ ++ u32 mask = EXCHGID4_FLAG_USE_PNFS_DS | EXCHGID4_FLAG_USE_PNFS_MDS; ++ return (exchange_flags & mask) == EXCHGID4_FLAG_USE_PNFS_DS; ++} ++ + #define SEQ4_STATUS_CB_PATH_DOWN 0x00000001 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING 0x00000002 + #define SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED 0x00000004 +@@ -166,8 +176,25 @@ struct nfs4_acl { + struct nfs4_ace aces[0]; + }; + ++struct nfs4_fsid { ++ u64 major; ++ u64 minor; ++}; ++ + typedef struct { char data[NFS4_VERIFIER_SIZE]; } nfs4_verifier; +-typedef struct { char data[NFS4_STATEID_SIZE]; } nfs4_stateid; ++typedef struct { char data[NFS4_CLIENTID_SIZE]; } nfs4_clientid; ++ ++struct nfs41_stateid { ++ __be32 seqid; ++ char other[NFS4_STATEID_OTHER_SIZE]; ++} __attribute__ ((packed)); ++ ++typedef struct { ++ union { ++ char data[NFS4_STATEID_SIZE]; ++ struct nfs41_stateid stateid; ++ } u; ++} nfs4_stateid; + + enum nfs_opnum4 { + OP_ACCESS = 3, +@@ -471,6 +498,8 @@ enum lock_type4 { + #define FATTR4_WORD1_TIME_MODIFY (1UL << 21) + #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) + #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) ++#define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) ++#define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) + + #define NFSPROC4_NULL 0 + #define NFSPROC4_COMPOUND 1 
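++
++/*
++ * Note: FATTR4_WORD1_FS_LAYOUT_TYPES (word1 bit 30) and
++ * FATTR4_WORD2_LAYOUT_BLKSIZE (word2 bit 1) correspond to NFSv4.1
++ * attributes 62 and 65, which is why the client attr_bitmask grows
++ * from two words to three elsewhere in this patch.
++ */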
+@@ -523,6 +552,7 @@ enum { + NFSPROC4_CLNT_GETACL, + NFSPROC4_CLNT_SETACL, + NFSPROC4_CLNT_FS_LOCATIONS, ++ NFSPROC4_CLNT_RELEASE_LOCKOWNER, + + /* nfs41 */ + NFSPROC4_CLNT_EXCHANGE_ID, +@@ -531,6 +561,13 @@ enum { + NFSPROC4_CLNT_SEQUENCE, + NFSPROC4_CLNT_GET_LEASE_TIME, + NFSPROC4_CLNT_RECLAIM_COMPLETE, ++ NFSPROC4_CLNT_LAYOUTGET, ++ NFSPROC4_CLNT_LAYOUTCOMMIT, ++ NFSPROC4_CLNT_LAYOUTRETURN, ++ NFSPROC4_CLNT_GETDEVICELIST, ++ NFSPROC4_CLNT_GETDEVICEINFO, ++ NFSPROC4_CLNT_PNFS_WRITE, ++ NFSPROC4_CLNT_PNFS_COMMIT, + }; + + /* nfs41 types */ +@@ -549,6 +586,43 @@ enum state_protect_how4 { + SP4_SSV = 2 + }; + ++enum pnfs_layouttype { ++ LAYOUT_NFSV4_1_FILES = 1, ++ LAYOUT_OSD2_OBJECTS = 2, ++ LAYOUT_BLOCK_VOLUME = 3, ++}; ++ ++/* used for both layout return and recall */ ++enum pnfs_layoutreturn_type { ++ RETURN_FILE = 1, ++ RETURN_FSID = 2, ++ RETURN_ALL = 3 ++}; ++ ++enum pnfs_iomode { ++ IOMODE_READ = 1, ++ IOMODE_RW = 2, ++ IOMODE_ANY = 3, ++}; ++ ++enum pnfs_notify_deviceid_type4 { ++ NOTIFY_DEVICEID4_CHANGE = 1 << 1, ++ NOTIFY_DEVICEID4_DELETE = 1 << 2, ++}; ++ ++#define NFL4_UFLG_MASK 0x0000003F ++#define NFL4_UFLG_DENSE 0x00000001 ++#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002 ++#define NFL4_UFLG_STRIPE_UNIT_SIZE_MASK 0xFFFFFFC0 ++ ++/* Encoded in the loh_body field of type layouthint4 */ ++enum filelayout_hint_care4 { ++ NFLH4_CARE_DENSE = NFL4_UFLG_DENSE, ++ NFLH4_CARE_COMMIT_THRU_MDS = NFL4_UFLG_COMMIT_THRU_MDS, ++ NFLH4_CARE_STRIPE_UNIT_SIZE = 0x00000040, ++ NFLH4_CARE_STRIPE_COUNT = 0x00000080 ++}; ++ + #endif + #endif + +diff -up linux-2.6.35.noarch/include/linux/nfs4_pnfs.h.orig linux-2.6.35.noarch/include/linux/nfs4_pnfs.h +--- linux-2.6.35.noarch/include/linux/nfs4_pnfs.h.orig 2010-08-31 21:11:40.978150492 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs4_pnfs.h 2010-08-31 21:11:40.979150216 -0400 +@@ -0,0 +1,329 @@ ++/* ++ * include/linux/nfs4_pnfs.h ++ * ++ * Common data structures needed by the pnfs client and pnfs layout driver. ++ * ++ * Copyright (c) 2002 The Regents of the University of Michigan. ++ * All rights reserved. 
++ * ++ * Dean Hildebrand ++ */ ++ ++#ifndef LINUX_NFS4_PNFS_H ++#define LINUX_NFS4_PNFS_H ++ ++#include ++ ++enum pnfs_try_status { ++ PNFS_ATTEMPTED = 0, ++ PNFS_NOT_ATTEMPTED = 1, ++}; ++ ++#define NFS4_PNFS_GETDEVLIST_MAXNUM 16 ++ ++/* Per-layout driver specific registration structure */ ++struct pnfs_layoutdriver_type { ++ const u32 id; ++ const char *name; ++ struct layoutdriver_io_operations *ld_io_ops; ++ struct layoutdriver_policy_operations *ld_policy_ops; ++}; ++ ++struct pnfs_fsdata { ++ int bypass_eof; ++ struct pnfs_layout_segment *lseg; ++ void *private; ++}; ++ ++#if defined(CONFIG_NFS_V4_1) ++ ++static inline struct nfs_inode * ++PNFS_NFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_I(lo->inode); ++} ++ ++static inline struct inode * ++PNFS_INODE(struct pnfs_layout_hdr *lo) ++{ ++ return lo->inode; ++} ++ ++static inline struct nfs_server * ++PNFS_NFS_SERVER(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo)); ++} ++ ++static inline struct pnfs_layoutdriver_type * ++PNFS_LD(struct pnfs_layout_hdr *lo) ++{ ++ return NFS_SERVER(PNFS_INODE(lo))->pnfs_curr_ld; ++} ++ ++static inline struct layoutdriver_io_operations * ++PNFS_LD_IO_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_io_ops; ++} ++ ++static inline struct layoutdriver_policy_operations * ++PNFS_LD_POLICY_OPS(struct pnfs_layout_hdr *lo) ++{ ++ return PNFS_LD(lo)->ld_policy_ops; ++} ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return nfsi->layout != NULL; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return has_layout(nfsi) && ++ test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->layout->state); ++} ++ ++extern void put_lseg(struct pnfs_layout_segment *lseg); ++extern void get_lseg(struct pnfs_layout_segment *lseg); ++ ++#else /* CONFIG_NFS_V4_1 */ ++ ++static inline bool ++has_layout(struct nfs_inode *nfsi) ++{ ++ return false; ++} ++ ++static inline bool ++layoutcommit_needed(struct nfs_inode *nfsi) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_NFS_V4_1 */ ++ ++struct pnfs_layout_segment { ++ struct list_head fi_list; ++ struct pnfs_layout_range range; ++ struct kref kref; ++ bool valid; ++ struct pnfs_layout_hdr *layout; ++ struct nfs4_deviceid *deviceid; ++ u8 ld_data[]; /* layout driver private data */ ++}; ++ ++static inline void * ++LSEG_LD_DATA(struct pnfs_layout_segment *lseg) ++{ ++ return lseg->ld_data; ++} ++ ++/* Layout driver I/O operations. ++ * Either the pagecache or non-pagecache read/write operations must be implemented ++ */ ++struct layoutdriver_io_operations { ++ /* Functions that use the pagecache. ++ * If use_pagecache == 1, then these functions must be implemented. ++ */ ++ /* read and write pagelist should return just 0 (to indicate that ++ * the layout code has taken control) or 1 (to indicate that the ++ * layout code wishes to fall back to normal nfs.) If 0 is returned, ++ * information can be passed back through nfs_data->res and ++ * nfs_data->task.tk_status, and the appropriate pnfs done function ++ * MUST be called. 
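++ * In terms of enum pnfs_try_status above: PNFS_ATTEMPTED means the
++ * layout driver has taken control and must eventually invoke the
++ * completion callbacks exported through struct pnfs_client_operations
++ * (e.g. nfs_readlist_complete); PNFS_NOT_ATTEMPTED tells the generic
++ * code to fall back to ordinary NFS I/O.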
++ */ ++ enum pnfs_try_status ++ (*read_pagelist) (struct nfs_read_data *nfs_data, unsigned nr_pages); ++ enum pnfs_try_status ++ (*write_pagelist) (struct nfs_write_data *nfs_data, unsigned nr_pages, int how); ++ int (*write_begin) (struct pnfs_layout_segment *lseg, struct page *page, ++ loff_t pos, unsigned count, ++ struct pnfs_fsdata *fsdata); ++ int (*write_end)(struct inode *inode, struct page *page, loff_t pos, ++ unsigned count, unsigned copied, ++ struct pnfs_layout_segment *lseg); ++ void (*write_end_cleanup)(struct file *filp, ++ struct pnfs_fsdata *fsdata); ++ ++ /* Consistency ops */ ++ /* 2 problems: ++ * 1) the page list contains nfs_pages, NOT pages ++ * 2) currently the NFS code doesn't create a page array (as it does with read/write) ++ */ ++ enum pnfs_try_status ++ (*commit) (struct nfs_write_data *nfs_data, int how); ++ ++ /* Layout information. For each inode, alloc_layout is executed once to retrieve an ++ * inode specific layout structure. Each subsequent layoutget operation results in ++ * a set_layout call to set the opaque layout in the layout driver.*/ ++ struct pnfs_layout_hdr * (*alloc_layout) (struct inode *inode); ++ void (*free_layout) (struct pnfs_layout_hdr *); ++ struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); ++ void (*free_lseg) (struct pnfs_layout_segment *lseg); ++ ++ int (*setup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args); ++ void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutcommit_args *args); ++ void (*cleanup_layoutcommit) (struct pnfs_layout_hdr *layoutid, ++ struct nfs4_layoutcommit_args *args, ++ int status); ++ void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, ++ struct xdr_stream *xdr, ++ const struct nfs4_layoutreturn_args *args); ++ ++ /* Registration information for a new mounted file system ++ */ ++ int (*initialize_mountpoint) (struct nfs_server *, ++ const struct nfs_fh * mntfh); ++ int (*uninitialize_mountpoint) (struct nfs_server *server); ++}; ++ ++enum layoutdriver_policy_flags { ++ /* Should the full nfs rpc cleanup code be used after io */ ++ PNFS_USE_RPC_CODE = 1 << 0, ++ ++ /* Should the NFS req. gather algorithm cross stripe boundaries? */ ++ PNFS_GATHER_ACROSS_STRIPES = 1 << 1, ++ ++ /* Should the pNFS client commit and return the layout upon a setattr */ ++ PNFS_LAYOUTRET_ON_SETATTR = 1 << 3, ++}; ++ ++struct layoutdriver_policy_operations { ++ unsigned flags; ++ ++ /* The stripe size of the file system */ ++ ssize_t (*get_stripesize) (struct pnfs_layout_hdr *layoutid); ++ ++ /* test for nfs page cache coalescing */ ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++ ++ /* Retreive the block size of the file system. ++ * If gather_across_stripes == 1, then the file system will gather ++ * requests into the block size. ++ * TODO: Where will the layout driver get this info? It is hard ++ * coded in PVFS2. ++ */ ++ ssize_t (*get_blocksize) (void); ++}; ++ ++/* Should the full nfs rpc cleanup code be used after io */ ++static inline int ++pnfs_ld_use_rpc_code(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_USE_RPC_CODE; ++} ++ ++/* Should the NFS req. gather algorithm cross stripe boundaries? 
*/ ++static inline int ++pnfs_ld_gather_across_stripes(struct pnfs_layoutdriver_type *ld) ++{ ++ return ld->ld_policy_ops->flags & PNFS_GATHER_ACROSS_STRIPES; ++} ++ ++struct pnfs_device { ++ struct pnfs_deviceid dev_id; ++ unsigned int layout_type; ++ unsigned int mincount; ++ struct page **pages; ++ void *area; ++ unsigned int pgbase; ++ unsigned int pglen; ++ unsigned int dev_notify_types; ++}; ++ ++struct pnfs_devicelist { ++ unsigned int eof; ++ unsigned int num_devs; ++ struct pnfs_deviceid dev_id[NFS4_PNFS_GETDEVLIST_MAXNUM]; ++}; ++ ++/* ++ * Device ID RCU cache. A device ID is unique per client ID and layout type. ++ */ ++#define NFS4_DEVICE_ID_HASH_BITS 5 ++#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) ++#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) ++ ++static inline u32 ++nfs4_deviceid_hash(struct pnfs_deviceid *id) ++{ ++ unsigned char *cptr = (unsigned char *)id->data; ++ unsigned int nbytes = NFS4_PNFS_DEVICEID4_SIZE; ++ u32 x = 0; ++ ++ while (nbytes--) { ++ x *= 37; ++ x += *cptr++; ++ } ++ return x & NFS4_DEVICE_ID_HASH_MASK; ++} ++ ++struct nfs4_deviceid_cache { ++ spinlock_t dc_lock; ++ struct kref dc_kref; ++ void (*dc_free_callback)(struct kref *); ++ struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; ++}; ++ ++/* Device ID cache node */ ++struct nfs4_deviceid { ++ struct hlist_node de_node; ++ struct pnfs_deviceid de_id; ++ struct kref de_kref; ++}; ++ ++extern int nfs4_alloc_init_deviceid_cache(struct nfs_client *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_put_deviceid_cache(struct nfs_client *); ++extern void nfs4_init_deviceid_node(struct nfs4_deviceid *); ++extern struct nfs4_deviceid *nfs4_find_get_deviceid( ++ struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++extern struct nfs4_deviceid *nfs4_add_get_deviceid(struct nfs4_deviceid_cache *, ++ struct nfs4_deviceid *); ++extern void nfs4_set_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *); ++extern void nfs4_put_unset_layout_deviceid(struct pnfs_layout_segment *, ++ struct nfs4_deviceid *, ++ void (*free_callback)(struct kref *)); ++extern void nfs4_delete_device(struct nfs4_deviceid_cache *, ++ struct pnfs_deviceid *); ++ ++/* pNFS client callback functions. ++ * These operations allow the layout driver to access pNFS client ++ * specific information or call pNFS client->server operations. ++ * E.g., getdeviceinfo, I/O callbacks, etc ++ */ ++struct pnfs_client_operations { ++ int (*nfs_getdevicelist) (struct nfs_server *, ++ const struct nfs_fh *fh, ++ struct pnfs_devicelist *devlist); ++ int (*nfs_getdeviceinfo) (struct nfs_server *, ++ struct pnfs_device *dev); ++ ++ /* Post read callback. */ ++ void (*nfs_readlist_complete) (struct nfs_read_data *nfs_data); ++ ++ /* Post write callback. */ ++ void (*nfs_writelist_complete) (struct nfs_write_data *nfs_data); ++ ++ /* Post commit callback. 
*/ ++ void (*nfs_commit_complete) (struct nfs_write_data *nfs_data); ++ void (*nfs_return_layout) (struct inode *); ++}; ++ ++extern struct pnfs_client_operations pnfs_ops; ++ ++extern struct pnfs_client_operations *pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); ++extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); ++ ++#define NFS4_PNFS_MAX_LAYOUTS 4 ++#define NFS4_PNFS_PRIVATE_LAYOUT 0x80000000 ++ ++#endif /* LINUX_NFS4_PNFS_H */ +diff -up linux-2.6.35.noarch/include/linux/nfsd4_block.h.orig linux-2.6.35.noarch/include/linux/nfsd4_block.h +--- linux-2.6.35.noarch/include/linux/nfsd4_block.h.orig 2010-08-31 21:11:40.992170438 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd4_block.h 2010-08-31 21:11:40.992170438 -0400 +@@ -0,0 +1,101 @@ ++#ifndef NFSD4_BLOCK ++#define NFSD4_BLOCK ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_BLOCK_SUCCESS 1 ++#define PNFS_BLOCK_FAILURE 0 ++ ++#define PNFS_BLOCK_CTL_START 1 ++#define PNFS_BLOCK_CTL_STOP 2 ++#define PNFS_BLOCK_CTL_VERS 3 /* Allows daemon to request current ++ * version from kernel via an upcall. ++ */ ++ ++#define PNFS_UPCALL_MSG_STOP 0 ++#define PNFS_UPCALL_MSG_GETSIG 1 ++#define PNFS_UPCALL_MSG_GETSLICE 2 ++#define PNFS_UPCALL_MSG_DMCHK 3 // See if dev_t is a DM volume ++#define PNFS_UPCALL_MSG_DMGET 4 ++#define PNFS_UPCALL_MSG_VERS 5 ++ ++#define PNFS_UPCALL_VERS 8 ++ ++typedef struct stripe_dev { ++ int major, ++ minor, ++ offset; ++} stripe_dev_t; ++ ++typedef struct bl_comm_res { ++ int res_status; ++ union { ++ struct { ++ long long start, ++ length; ++ } slice; ++ struct { ++ int num_stripes, ++ stripe_size; ++ stripe_dev_t devs[]; ++ } stripe; ++ struct { ++ long long sector; ++ int offset, ++ len; ++ char sig[]; ++ } sig; ++ int vers, ++ dm_vol; ++ } u; ++} bl_comm_res_t; ++ ++typedef struct bl_comm_msg { ++ int msg_type, ++ msg_status; ++ union { ++ dev_t msg_dev; ++ int msg_vers; ++ } u; ++ bl_comm_res_t *msg_res; ++} bl_comm_msg_t; ++ ++#ifdef __KERNEL__ ++ ++typedef struct bl_comm { ++ /* ---- protects access to this structure ---- */ ++ struct mutex lock; ++ /* ---- protects access to rpc pipe ---- */ ++ struct mutex pipe_lock; ++ struct dentry *pipe_dentry; ++ wait_queue_head_t pipe_wq; ++ bl_comm_msg_t msg; ++} bl_comm_t; ++ ++int pnfs_block_enabled(struct inode *, int); ++int bl_layout_type(struct super_block *sb); ++int bl_getdeviceiter(struct super_block *, u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int bl_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++enum nfsstat4 bl_layoutget(struct inode *, struct exp_xdr_stream *, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int bl_layoutcommit(struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++int bl_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int bl_layoutrecall(struct inode *inode, int type, u64 offset, u64 len); ++int bl_init_proc(void); ++int bl_upcall(bl_comm_t *, bl_comm_msg_t *, bl_comm_res_t **); ++ ++extern bl_comm_t *bl_comm_global; // Ugly... 
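++
++/*
++ * Sketch of the expected calling pattern, inferred from the message
++ * definitions above (return-value convention and reply-buffer
++ * ownership follow the in-kernel implementation; GETSLICE is just one
++ * message type):
++ *
++ *        bl_comm_msg_t msg;
++ *        bl_comm_res_t *res;
++ *
++ *        msg.msg_type = PNFS_UPCALL_MSG_GETSLICE;
++ *        msg.u.msg_dev = dev;
++ *        if (bl_upcall(bl_comm_global, &msg, &res))
++ *                return -EIO;
++ *        if (res->res_status == PNFS_BLOCK_SUCCESS)
++ *                ... use res->u.slice.start and res->u.slice.length ...
++ *        kfree(res);
++ */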
++#endif /* __KERNEL__ */ ++ ++#endif /* NFSD4_BLOCK */ ++ +diff -up linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h.orig linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h +--- linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h.orig 2010-08-31 21:11:40.993150404 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd4_spnfs.h 2010-08-31 21:11:40.993150404 -0400 +@@ -0,0 +1,345 @@ ++/* ++ * include/linux/nfsd4_spnfs.h ++ * ++ * spNFS - simple pNFS implementation with userspace daemon ++ * ++ */ ++ ++/****************************************************************************** ++ ++(c) 2007 Network Appliance, Inc. All Rights Reserved. ++ ++Network Appliance provides this source code under the GPL v2 License. ++The GPL v2 license is available at ++http://opensource.org/licenses/gpl-license.php. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++******************************************************************************/ ++ ++#ifndef NFS_SPNFS_H ++#define NFS_SPNFS_H ++ ++ ++#ifdef __KERNEL__ ++#include "exportfs.h" ++#include "sunrpc/svc.h" ++#include "nfsd/nfsfh.h" ++#else ++#include ++#endif /* __KERNEL__ */ ++ ++#define SPNFS_STATUS_INVALIDMSG 0x01 ++#define SPNFS_STATUS_AGAIN 0x02 ++#define SPNFS_STATUS_FAIL 0x04 ++#define SPNFS_STATUS_SUCCESS 0x08 ++ ++#define SPNFS_TYPE_LAYOUTGET 0x01 ++#define SPNFS_TYPE_LAYOUTCOMMIT 0x02 ++#define SPNFS_TYPE_LAYOUTRETURN 0x03 ++#define SPNFS_TYPE_GETDEVICEITER 0x04 ++#define SPNFS_TYPE_GETDEVICEINFO 0x05 ++#define SPNFS_TYPE_SETATTR 0x06 ++#define SPNFS_TYPE_OPEN 0x07 ++#define SPNFS_TYPE_CLOSE 0x08 ++#define SPNFS_TYPE_CREATE 0x09 ++#define SPNFS_TYPE_REMOVE 0x0a ++#define SPNFS_TYPE_COMMIT 0x0b ++#define SPNFS_TYPE_READ 0x0c ++#define SPNFS_TYPE_WRITE 0x0d ++ ++#define SPNFS_MAX_DEVICES 1 ++#define SPNFS_MAX_DATA_SERVERS 16 ++#define SPNFS_MAX_IO 512 ++ ++/* layout */ ++struct spnfs_msg_layoutget_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_filelayout_list { ++ u_int32_t fh_len; ++ unsigned char fh_val[128]; /* DMXXX fix this const */ ++}; ++ ++struct spnfs_msg_layoutget_res { ++ int status; ++ u_int64_t devid; ++ u_int64_t stripe_size; ++ u_int32_t stripe_type; ++ u_int32_t stripe_count; ++ struct spnfs_filelayout_list flist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++/* layoutcommit */ ++struct spnfs_msg_layoutcommit_args { ++ unsigned long inode; ++ unsigned long generation; ++ u_int64_t file_size; ++}; ++ ++struct spnfs_msg_layoutcommit_res { ++ int status; ++}; ++ ++/* layoutreturn */ ++/* No op for the daemon */ ++/* ++struct spnfs_msg_layoutreturn_args { ++}; ++ ++struct spnfs_msg_layoutreturn_res { ++}; ++*/ ++ ++/* getdeviceiter */ ++struct spnfs_msg_getdeviceiter_args { ++ unsigned long inode; ++ u_int64_t cookie; ++ u_int64_t verf; ++}; ++ ++struct spnfs_msg_getdeviceiter_res { ++ int status; ++ 
u_int64_t devid; ++ u_int64_t cookie; ++ u_int64_t verf; ++ u_int32_t eof; ++}; ++ ++/* getdeviceinfo */ ++struct spnfs_data_server { ++ u_int32_t dsid; ++ char netid[5]; ++ char addr[29]; ++}; ++ ++struct spnfs_device { ++ u_int64_t devid; ++ int dscount; ++ struct spnfs_data_server dslist[SPNFS_MAX_DATA_SERVERS]; ++}; ++ ++struct spnfs_msg_getdeviceinfo_args { ++ u_int64_t devid; ++}; ++ ++struct spnfs_msg_getdeviceinfo_res { ++ int status; ++ struct spnfs_device devinfo; ++}; ++ ++/* setattr */ ++struct spnfs_msg_setattr_args { ++ unsigned long inode; ++ unsigned long generation; ++ int file_size; ++}; ++ ++struct spnfs_msg_setattr_res { ++ int status; ++}; ++ ++/* open */ ++struct spnfs_msg_open_args { ++ unsigned long inode; ++ unsigned long generation; ++ int create; ++ int createmode; ++ int truncate; ++}; ++ ++struct spnfs_msg_open_res { ++ int status; ++}; ++ ++/* close */ ++/* No op for daemon */ ++struct spnfs_msg_close_args { ++ int x; ++}; ++ ++struct spnfs_msg_close_res { ++ int y; ++}; ++ ++/* create */ ++/* ++struct spnfs_msg_create_args { ++ int x; ++}; ++ ++struct spnfs_msg_create_res { ++ int y; ++}; ++*/ ++ ++/* remove */ ++struct spnfs_msg_remove_args { ++ unsigned long inode; ++ unsigned long generation; ++}; ++ ++struct spnfs_msg_remove_res { ++ int status; ++}; ++ ++/* commit */ ++/* ++struct spnfs_msg_commit_args { ++ int x; ++}; ++ ++struct spnfs_msg_commit_res { ++ int y; ++}; ++*/ ++ ++/* read */ ++struct spnfs_msg_read_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++}; ++ ++struct spnfs_msg_read_res { ++ int status; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++/* write */ ++struct spnfs_msg_write_args { ++ unsigned long inode; ++ unsigned long generation; ++ loff_t offset; ++ unsigned long len; ++ char data[SPNFS_MAX_IO]; ++}; ++ ++struct spnfs_msg_write_res { ++ int status; ++}; ++ ++/* bundle args and responses */ ++union spnfs_msg_args { ++ struct spnfs_msg_layoutget_args layoutget_args; ++ struct spnfs_msg_layoutcommit_args layoutcommit_args; ++/* ++ struct spnfs_msg_layoutreturn_args layoutreturn_args; ++*/ ++ struct spnfs_msg_getdeviceiter_args getdeviceiter_args; ++ struct spnfs_msg_getdeviceinfo_args getdeviceinfo_args; ++ struct spnfs_msg_setattr_args setattr_args; ++ struct spnfs_msg_open_args open_args; ++ struct spnfs_msg_close_args close_args; ++/* ++ struct spnfs_msg_create_args create_args; ++*/ ++ struct spnfs_msg_remove_args remove_args; ++/* ++ struct spnfs_msg_commit_args commit_args; ++*/ ++ struct spnfs_msg_read_args read_args; ++ struct spnfs_msg_write_args write_args; ++}; ++ ++union spnfs_msg_res { ++ struct spnfs_msg_layoutget_res layoutget_res; ++ struct spnfs_msg_layoutcommit_res layoutcommit_res; ++/* ++ struct spnfs_msg_layoutreturn_res layoutreturn_res; ++*/ ++ struct spnfs_msg_getdeviceiter_res getdeviceiter_res; ++ struct spnfs_msg_getdeviceinfo_res getdeviceinfo_res; ++ struct spnfs_msg_setattr_res setattr_res; ++ struct spnfs_msg_open_res open_res; ++ struct spnfs_msg_close_res close_res; ++/* ++ struct spnfs_msg_create_res create_res; ++*/ ++ struct spnfs_msg_remove_res remove_res; ++/* ++ struct spnfs_msg_commit_res commit_res; ++*/ ++ struct spnfs_msg_read_res read_res; ++ struct spnfs_msg_write_res write_res; ++}; ++ ++/* a spnfs message, args and response */ ++struct spnfs_msg { ++ unsigned char im_type; ++ unsigned char im_status; ++ union spnfs_msg_args im_args; ++ union spnfs_msg_res im_res; ++}; ++ ++/* spnfs configuration info */ ++struct spnfs_config { ++ unsigned char 
dense_striping; ++ int stripe_size; ++ int num_ds; ++ char ds_dir[SPNFS_MAX_DATA_SERVERS][80]; /* XXX */ ++}; ++ ++#if defined(__KERNEL__) && defined(CONFIG_SPNFS) ++ ++#include ++ ++/* pipe mgmt structure. messages flow through here */ ++struct spnfs { ++ struct dentry *spnfs_dentry; /* dentry for pipe */ ++ wait_queue_head_t spnfs_wq; ++ struct spnfs_msg spnfs_im; /* spnfs message */ ++ struct mutex spnfs_lock; /* Serializes upcalls */ ++ struct mutex spnfs_plock; ++}; ++ ++struct nfsd4_open; ++ ++int spnfs_layout_type(struct super_block *); ++enum nfsstat4 spnfs_layoutget(struct inode *, struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++int spnfs_layoutcommit(void); ++int spnfs_layoutreturn(struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++int spnfs_getdeviceiter(struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++int spnfs_getdeviceinfo(struct super_block *, struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++int spnfs_setattr(void); ++int spnfs_open(struct inode *, struct nfsd4_open *); ++int spnfs_get_state(struct inode *, struct knfsd_fh *, struct pnfs_get_state *); ++int spnfs_remove(unsigned long, unsigned long); ++__be32 spnfs_read(struct inode *, loff_t, unsigned long *, ++ int, struct svc_rqst *); ++__be32 spnfs_write(struct inode *, loff_t, size_t, int, struct svc_rqst *); ++int spnfs_getfh(int, struct nfs_fh *); ++int spnfs_test_layoutrecall(char *, u64, u64); ++int spnfs_layoutrecall(struct inode *, int, u64, u64); ++ ++int nfsd_spnfs_new(void); ++void nfsd_spnfs_delete(void); ++int spnfs_upcall(struct spnfs *, struct spnfs_msg *, union spnfs_msg_res *); ++int spnfs_enabled(void); ++int spnfs_init_proc(void); ++ ++extern struct spnfs_config *spnfs_config; ++ ++#endif /* __KERNEL__ && CONFIG_SPNFS */ ++ ++#endif /* NFS_SPNFS_H */ +diff -up linux-2.6.35.noarch/include/linux/nfsd/const.h.orig linux-2.6.35.noarch/include/linux/nfsd/const.h +--- linux-2.6.35.noarch/include/linux/nfsd/const.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/const.h 2010-08-31 21:11:40.987096445 -0400 +@@ -29,6 +29,7 @@ + #ifdef __KERNEL__ + + #include ++#include + + /* + * Largest number of bytes we need to allocate for an NFS +diff -up linux-2.6.35.noarch/include/linux/nfsd/debug.h.orig linux-2.6.35.noarch/include/linux/nfsd/debug.h +--- linux-2.6.35.noarch/include/linux/nfsd/debug.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/debug.h 2010-08-31 21:11:40.987096445 -0400 +@@ -32,6 +32,8 @@ + #define NFSDDBG_REPCACHE 0x0080 + #define NFSDDBG_XDR 0x0100 + #define NFSDDBG_LOCKD 0x0200 ++#define NFSDDBG_PNFS 0x0400 ++#define NFSDDBG_FILELAYOUT 0x0800 + #define NFSDDBG_ALL 0x7FFF + #define NFSDDBG_NOCHANGE 0xFFFF + +diff -up linux-2.6.35.noarch/include/linux/nfsd/export.h.orig linux-2.6.35.noarch/include/linux/nfsd/export.h +--- linux-2.6.35.noarch/include/linux/nfsd/export.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/export.h 2010-08-31 21:11:40.988138882 -0400 +@@ -100,6 +100,7 @@ struct svc_export { + uid_t ex_anon_uid; + gid_t ex_anon_gid; + int ex_fsid; ++ int ex_pnfs; + unsigned char * ex_uuid; /* 16 byte fsid */ + struct nfsd4_fs_locations ex_fslocs; + int ex_nflavors; +diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h +--- 
linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h.orig 2010-08-31 21:11:40.988138882 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/nfs4layoutxdr.h 2010-08-31 21:11:40.988138882 -0400 +@@ -0,0 +1,132 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef NFSD_NFS4LAYOUTXDR_H ++#define NFSD_NFS4LAYOUTXDR_H ++ ++#include ++#include ++ ++/* the nfsd4_pnfs_devlist dev_addr for the file layout type */ ++struct pnfs_filelayout_devaddr { ++ struct xdr_netobj r_netid; ++ struct xdr_netobj r_addr; ++}; ++ ++/* list of multipath servers */ ++struct pnfs_filelayout_multipath { ++ u32 fl_multipath_length; ++ struct pnfs_filelayout_devaddr *fl_multipath_list; ++}; ++ ++struct pnfs_filelayout_device { ++ u32 fl_stripeindices_length; ++ u32 *fl_stripeindices_list; ++ u32 fl_device_length; ++ struct pnfs_filelayout_multipath *fl_device_list; ++}; ++ ++struct pnfs_filelayout_layout { ++ u32 lg_layout_type; /* response */ ++ u32 lg_stripe_type; /* response */ ++ u32 lg_commit_through_mds; /* response */ ++ u64 lg_stripe_unit; /* response */ ++ u64 lg_pattern_offset; /* response */ ++ u32 lg_first_stripe_index; /* response */ ++ struct nfsd4_pnfs_deviceid device_id; /* response */ ++ u32 lg_fh_length; /* response */ ++ struct knfsd_fh *lg_fh_list; /* response */ ++}; ++ ++enum stripetype4 { ++ STRIPE_SPARSE = 1, ++ STRIPE_DENSE = 2 ++}; ++ ++enum pnfs_block_extent_state4 { ++ PNFS_BLOCK_READWRITE_DATA = 0, ++ PNFS_BLOCK_READ_DATA = 1, ++ PNFS_BLOCK_INVALID_DATA = 2, ++ PNFS_BLOCK_NONE_DATA = 3 ++}; ++ ++enum pnfs_block_volume_type4 { ++ PNFS_BLOCK_VOLUME_SIMPLE = 0, ++ PNFS_BLOCK_VOLUME_SLICE = 1, ++ PNFS_BLOCK_VOLUME_CONCAT = 2, ++ PNFS_BLOCK_VOLUME_STRIPE = 3, ++}; ++typedef enum pnfs_block_volume_type4 pnfs_block_volume_type4; ++ ++enum bl_cache_state { ++ BLOCK_LAYOUT_NEW = 0, ++ BLOCK_LAYOUT_CACHE = 1, ++ BLOCK_LAYOUT_UPDATE = 2, ++}; ++ ++typedef struct pnfs_blocklayout_layout { ++ struct list_head bll_list; ++ struct nfsd4_pnfs_deviceid bll_vol_id; ++ u64 bll_foff; // file offset ++ u64 bll_len; ++ u64 bll_soff; // storage offset ++ int bll_recalled; ++ enum pnfs_block_extent_state4 bll_es; ++ enum bl_cache_state bll_cache_state; ++} pnfs_blocklayout_layout_t; ++ ++typedef struct pnfs_blocklayout_devinfo { ++ struct list_head bld_list; ++ pnfs_block_volume_type4 bld_type; ++ struct nfsd4_pnfs_deviceid bld_devid; ++ int bld_index_loc; ++ union { ++ struct { ++ u64 bld_offset; ++ u32 bld_sig_len, ++ *bld_sig; ++ } simple; ++ struct { ++ u64 bld_start, ++ bld_len; ++ u32 bld_index; /* Index of Simple Volume */ ++ } slice; ++ struct { ++ u32 bld_stripes; ++ u64 bld_chunk_size; ++ u32 *bld_stripe_indexs; ++ } stripe; ++ } u; ++} pnfs_blocklayout_devinfo_t; ++ ++#endif /* NFSD_NFS4LAYOUTXDR_H */ +diff -up linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h +--- linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h.orig 2010-08-31 21:11:40.989160454 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/nfs4pnfsdlm.h 2010-08-31 21:11:40.989160454 -0400 +@@ -0,0 +1,54 @@ ++/****************************************************************************** ++ * ++ * (c) 2007 Network Appliance, Inc. All Rights Reserved. ++ * (c) 2009 NetApp. All Rights Reserved. ++ * ++ * NetApp provides this source code under the GPL v2 License. ++ * The GPL v2 license is available at ++ * http://opensource.org/licenses/gpl-license.php. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR ++ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ++ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ++ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ ******************************************************************************/ ++#include ++ ++/* ++ * Length of comma separated pnfs data server IPv4 addresses. Enough room for ++ * 32 addresses. ++ */ ++#define NFSD_DLM_DS_LIST_MAX 512 ++/* ++ * Length of colon separated pnfs dlm device of the form ++ * disk_name:comma separated data server IPv4 address ++ */ ++#define NFSD_PNFS_DLM_DEVICE_MAX (NFSD_DLM_DS_LIST_MAX + DISK_NAME_LEN + 1) ++ ++#ifdef CONFIG_PNFSD ++ ++/* For use by DLM cluster file systems exported by pNFSD */ ++extern const struct pnfs_export_operations pnfs_dlm_export_ops; ++ ++int nfsd4_set_pnfs_dlm_device(char *pnfs_dlm_device, int len); ++ ++void nfsd4_pnfs_dlm_shutdown(void); ++ ++ssize_t nfsd4_get_pnfs_dlm_device_list(char *buf, ssize_t buflen); ++ ++#else /* CONFIG_PNFSD */ ++ ++static inline void nfsd4_pnfs_dlm_shutdown(void) ++{ ++ return; ++} ++ ++#endif /* CONFIG_PNFSD */ +diff -up linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h +--- linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h.orig 2010-08-31 21:11:40.990180678 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/nfsd4_pnfs.h 2010-08-31 21:11:40.990180678 -0400 +@@ -0,0 +1,271 @@ ++/* ++ * Copyright (c) 2006 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * Andy Adamson ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ * ++ */ ++ ++#ifndef _LINUX_NFSD_NFSD4_PNFS_H ++#define _LINUX_NFSD_NFSD4_PNFS_H ++ ++#include ++#include ++#include ++ ++struct nfsd4_pnfs_deviceid { ++ u64 sbid; /* per-superblock unique ID */ ++ u64 devid; /* filesystem-wide unique device ID */ ++}; ++ ++struct nfsd4_pnfs_dev_iter_res { ++ u64 gd_cookie; /* request/repsonse */ ++ u64 gd_verf; /* request/repsonse */ ++ u64 gd_devid; /* response */ ++ u32 gd_eof; /* response */ ++}; ++ ++/* Arguments for set_device_notify */ ++struct pnfs_devnotify_arg { ++ struct nfsd4_pnfs_deviceid dn_devid; /* request */ ++ u32 dn_layout_type; /* request */ ++ u32 dn_notify_types; /* request/response */ ++}; ++ ++struct nfsd4_layout_seg { ++ u64 clientid; ++ u32 layout_type; ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++/* Used by layout_get to encode layout (loc_body var in spec) ++ * Args: ++ * minlength - min number of accessible bytes given by layout ++ * fsid - Major part of struct pnfs_deviceid. File system uses this ++ * to build the deviceid returned in the layout. ++ * fh - fs can modify the file handle for use on data servers ++ * seg - layout info requested and layout info returned ++ * xdr - xdr info ++ * return_on_close - true if layout to be returned on file close ++ */ ++ ++struct nfsd4_pnfs_layoutget_arg { ++ u64 lg_minlength; ++ u64 lg_sbid; ++ const struct knfsd_fh *lg_fh; ++}; ++ ++struct nfsd4_pnfs_layoutget_res { ++ struct nfsd4_layout_seg lg_seg; /* request/resopnse */ ++ u32 lg_return_on_close; ++}; ++ ++struct nfsd4_pnfs_layoutcommit_arg { ++ struct nfsd4_layout_seg lc_seg; /* request */ ++ u32 lc_reclaim; /* request */ ++ u32 lc_newoffset; /* request */ ++ u64 lc_last_wr; /* request */ ++ struct nfstime4 lc_mtime; /* request */ ++ u32 lc_up_len; /* layout length */ ++ void *lc_up_layout; /* decoded by callback */ ++}; ++ ++struct nfsd4_pnfs_layoutcommit_res { ++ u32 lc_size_chg; /* boolean for response */ ++ u64 lc_newsize; /* response */ ++}; ++ ++#define PNFS_LAST_LAYOUT_NO_RECALLS ((void *)-1) /* used with lr_cookie below */ ++ ++struct nfsd4_pnfs_layoutreturn_arg { ++ u32 lr_return_type; /* request */ ++ struct nfsd4_layout_seg lr_seg; /* request */ ++ u32 lr_reclaim; /* request */ ++ u32 lrf_body_len; /* request */ ++ void *lrf_body; /* request */ ++ void *lr_cookie; /* fs private */ ++}; ++ ++/* pNFS Metadata to Data server state communication */ ++struct pnfs_get_state { ++ u32 dsid; /* request */ ++ u64 ino; /* request */ ++ nfs4_stateid stid; /* request;response */ ++ nfs4_clientid clid; /* response */ ++ u32 access; /* response */ ++ u32 stid_gen; /* response */ ++ u32 verifier[2]; /* response */ ++}; ++ ++/* ++ * pNFS export operations vector. ++ * ++ * The filesystem must implement the following methods: ++ * layout_type ++ * get_device_info ++ * layout_get ++ * ++ * All other methods are optional and can be set to NULL if not implemented. ++ */ ++struct pnfs_export_operations { ++ /* Returns the supported pnfs_layouttype4. */ ++ int (*layout_type) (struct super_block *); ++ ++ /* Encode device info onto the xdr stream. */ ++ int (*get_device_info) (struct super_block *, ++ struct exp_xdr_stream *, ++ u32 layout_type, ++ const struct nfsd4_pnfs_deviceid *); ++ ++ /* Retrieve all available devices via an iterator. ++ * arg->cookie == 0 indicates the beginning of the list, ++ * otherwise arg->verf is used to verify that the list hasn't changed ++ * while retrieved. 
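++ *
++ * A caller is thus expected to loop roughly as follows (sketch;
++ * process() stands in for whatever is done with each device ID):
++ *
++ *        struct nfsd4_pnfs_dev_iter_res res = { .gd_cookie = 0 };
++ *
++ *        do {
++ *                if (sb->s_pnfs_op->get_device_iter(sb, lt, &res))
++ *                        break;
++ *                process(res.gd_devid);
++ *        } while (!res.gd_eof);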
++ * ++ * On output, the filesystem sets the devid based on the current cookie ++ * and sets res->cookie and res->verf corresponding to the next entry. ++ * When the last entry in the list is retrieved, res->eof is set to 1. ++ */ ++ int (*get_device_iter) (struct super_block *, ++ u32 layout_type, ++ struct nfsd4_pnfs_dev_iter_res *); ++ ++ int (*set_device_notify) (struct super_block *, ++ struct pnfs_devnotify_arg *); ++ ++ /* Retrieve and encode a layout for inode onto the xdr stream. ++ * arg->minlength is the minimum number of accessible bytes required ++ * by the client. ++ * The maximum number of bytes to encode the layout is given by ++ * the xdr stream end pointer. ++ * arg->fsid contains the major part of struct pnfs_deviceid. ++ * The file system uses this to build the deviceid returned ++ * in the layout. ++ * res->seg - layout segment requested and layout info returned. ++ * res->fh can be modified the file handle for use on data servers ++ * res->return_on_close - true if layout to be returned on file close ++ * ++ * return one of the following nfs errors: ++ * NFS_OK Success ++ * NFS4ERR_ACCESS Permission error ++ * NFS4ERR_BADIOMODE Server does not support requested iomode ++ * NFS4ERR_BADLAYOUT No layout matching loga_minlength rules ++ * NFS4ERR_INVAL Parameter other than layout is invalid ++ * NFS4ERR_IO I/O error ++ * NFS4ERR_LAYOUTTRYLATER Layout may be retrieved later ++ * NFS4ERR_LAYOUTUNAVAILABLE Layout unavailable for this file ++ * NFS4ERR_LOCKED Lock conflict ++ * NFS4ERR_NOSPC Out-of-space error occured ++ * NFS4ERR_RECALLCONFLICT Layout currently unavialable due to ++ * a conflicting CB_LAYOUTRECALL ++ * NFS4ERR_SERVERFAULT Server went bezerk ++ * NFS4ERR_TOOSMALL loga_maxcount too small to fit layout ++ * NFS4ERR_WRONG_TYPE Wrong file type (not a regular file) ++ */ ++ enum nfsstat4 (*layout_get) (struct inode *, ++ struct exp_xdr_stream *xdr, ++ const struct nfsd4_pnfs_layoutget_arg *, ++ struct nfsd4_pnfs_layoutget_res *); ++ ++ /* Commit changes to layout */ ++ int (*layout_commit) (struct inode *, ++ const struct nfsd4_pnfs_layoutcommit_arg *, ++ struct nfsd4_pnfs_layoutcommit_res *); ++ ++ /* Returns the layout */ ++ int (*layout_return) (struct inode *, ++ const struct nfsd4_pnfs_layoutreturn_arg *); ++ ++ /* Can layout segments be merged for this layout type? 
*/ ++ int (*can_merge_layouts) (u32 layout_type); ++ ++ /* pNFS Files layout specific operations */ ++ ++ /* Get the write verifier for DS (called on MDS only) */ ++ void (*get_verifier) (struct super_block *, u32 *p); ++ /* Call fs on DS only */ ++ int (*get_state) (struct inode *, struct knfsd_fh *, ++ struct pnfs_get_state *); ++}; ++ ++struct nfsd4_pnfs_cb_layout { ++ u32 cbl_recall_type; /* request */ ++ struct nfsd4_layout_seg cbl_seg; /* request */ ++ u32 cbl_layoutchanged; /* request */ ++ nfs4_stateid cbl_sid; /* request */ ++ struct nfs4_fsid cbl_fsid; ++ void *cbl_cookie; /* fs private */ ++}; ++ ++/* layoutrecall request (from exported filesystem) */ ++struct nfs4_layoutrecall { ++ struct kref clr_ref; ++ struct nfsd4_pnfs_cb_layout cb; /* request */ ++ struct list_head clr_perclnt; /* on cl_layoutrecalls */ ++ struct nfs4_client *clr_client; ++ struct nfs4_file *clr_file; ++ struct timespec clr_time; /* last activity */ ++ struct super_block *clr_sb; /* We might not have a file */ ++ struct nfs4_layoutrecall *parent; /* The initiating recall */ ++ ++ void *clr_args; /* nfsd internal */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_item { ++ u32 cbd_notify_type; /* request */ ++ u32 cbd_layout_type; /* request */ ++ struct nfsd4_pnfs_deviceid cbd_devid; /* request */ ++ u32 cbd_immediate; /* request */ ++}; ++ ++struct nfsd4_pnfs_cb_dev_list { ++ u32 cbd_len; /* request */ ++ struct nfsd4_pnfs_cb_dev_item *cbd_list; /* request */ ++}; ++ ++/* ++ * callbacks provided by the nfsd ++ */ ++struct pnfsd_cb_operations { ++ /* Generic callbacks */ ++ int (*cb_layout_recall) (struct super_block *, struct inode *, ++ struct nfsd4_pnfs_cb_layout *); ++ int (*cb_device_notify) (struct super_block *, ++ struct nfsd4_pnfs_cb_dev_list *); ++ ++ /* pNFS Files layout specific callbacks */ ++ ++ /* Callback from fs on MDS only */ ++ int (*cb_get_state) (struct super_block *, struct pnfs_get_state *); ++ /* Callback from fs on DS only */ ++ int (*cb_change_state) (struct pnfs_get_state *); ++}; ++ ++#endif /* _LINUX_NFSD_NFSD4_PNFS_H */ +diff -up linux-2.6.35.noarch/include/linux/nfsd/syscall.h.orig linux-2.6.35.noarch/include/linux/nfsd/syscall.h +--- linux-2.6.35.noarch/include/linux/nfsd/syscall.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfsd/syscall.h 2010-08-31 21:11:40.991160519 -0400 +@@ -29,6 +29,7 @@ + /*#define NFSCTL_GETFH 6 / * get an fh by ino DISCARDED */ + #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ + #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ ++#define NFSCTL_FD2FH 9 /* get a fh from a fd */ + + /* SVC */ + struct nfsctl_svc { +@@ -71,6 +72,11 @@ struct nfsctl_fsparm { + int gd_maxlen; + }; + ++/* FD2FH */ ++struct nfsctl_fd2fh { ++ int fd; ++}; ++ + /* + * This is the argument union. + */ +@@ -82,6 +88,7 @@ struct nfsctl_arg { + struct nfsctl_export u_export; + struct nfsctl_fdparm u_getfd; + struct nfsctl_fsparm u_getfs; ++ struct nfsctl_fd2fh u_fd2fh; + /* + * The following dummy member is needed to preserve binary compatibility + * on platforms where alignof(void*)>alignof(int). 
It's needed because +@@ -95,6 +102,7 @@ struct nfsctl_arg { + #define ca_export u.u_export + #define ca_getfd u.u_getfd + #define ca_getfs u.u_getfs ++#define ca_fd2fh u.u_fd2fh + }; + + union nfsctl_res { +diff -up linux-2.6.35.noarch/include/linux/nfs_fs.h.orig linux-2.6.35.noarch/include/linux/nfs_fs.h +--- linux-2.6.35.noarch/include/linux/nfs_fs.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs_fs.h 2010-08-31 21:11:40.980160309 -0400 +@@ -72,13 +72,20 @@ struct nfs_access_entry { + int mask; + }; + ++struct nfs_lock_context { ++ atomic_t count; ++ struct list_head list; ++ struct nfs_open_context *open_context; ++ fl_owner_t lockowner; ++ pid_t pid; ++}; ++ + struct nfs4_state; + struct nfs_open_context { +- atomic_t count; ++ struct nfs_lock_context lock_context; + struct path path; + struct rpc_cred *cred; + struct nfs4_state *state; +- fl_owner_t lockowner; + fmode_t mode; + + unsigned long flags; +@@ -97,6 +104,27 @@ struct nfs_delegation; + + struct posix_acl; + ++struct pnfs_layout_hdr { ++ int refcount; ++ struct list_head layouts; /* other client layouts */ ++ struct list_head segs; /* layout segments list */ ++ int roc_iomode;/* return on close iomode, 0=none */ ++ seqlock_t seqlock; /* Protects the stateid */ ++ nfs4_stateid stateid; ++ unsigned long state; ++#define NFS_INO_RO_LAYOUT_FAILED 0 /* ro layoutget failed stop trying */ ++#define NFS_INO_RW_LAYOUT_FAILED 1 /* rw layoutget failed stop trying */ ++#define NFS_INO_LAYOUTCOMMIT 2 /* LAYOUTCOMMIT needed */ ++ ++ struct rpc_cred *cred; /* layoutcommit credential */ ++ /* DH: These vars keep track of the maximum write range ++ * so the values can be used for layoutcommit. ++ */ ++ loff_t write_begin_pos; ++ loff_t write_end_pos; ++ struct inode *inode; ++}; ++ + /* + * nfs fs inode data in memory + */ +@@ -181,6 +209,13 @@ struct nfs_inode { + struct nfs_delegation *delegation; + fmode_t delegation_state; + struct rw_semaphore rwsem; ++ ++ /* pNFS layout information */ ++#if defined(CONFIG_NFS_V4_1) ++ wait_queue_head_t lo_waitq; ++ struct pnfs_layout_hdr *layout; ++ time_t pnfs_layout_suspend; ++#endif /* CONFIG_NFS_V4_1 */ + #endif /* CONFIG_NFS_V4*/ + #ifdef CONFIG_NFS_FSCACHE + struct fscache_cookie *fscache; +@@ -353,6 +388,8 @@ extern void nfs_setattr_update_inode(str + extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); + extern void put_nfs_open_context(struct nfs_open_context *ctx); + extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); ++extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); ++extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); + extern u64 nfs_compat_user_ino64(u64 fileid); + extern void nfs_fattr_init(struct nfs_fattr *fattr); + +@@ -481,8 +518,12 @@ extern void nfs_unblock_sillyrename(stru + extern int nfs_congestion_kb; + extern int nfs_writepage(struct page *page, struct writeback_control *wbc); + extern int nfs_writepages(struct address_space *, struct writeback_control *); +-extern int nfs_flush_incompatible(struct file *file, struct page *page); +-extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); ++struct pnfs_layout_segment; ++extern int nfs_flush_incompatible(struct file *file, struct page *page, ++ struct pnfs_layout_segment *lseg); ++extern int nfs_updatepage(struct file *, struct page *, ++ unsigned int offset, unsigned int count, ++ struct pnfs_layout_segment *lseg, void 
*fsdata); + extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); + + /* +@@ -604,6 +645,8 @@ extern void * nfs_root_data(void); + #define NFSDBG_CLIENT 0x0200 + #define NFSDBG_MOUNT 0x0400 + #define NFSDBG_FSCACHE 0x0800 ++#define NFSDBG_PNFS 0x1000 ++#define NFSDBG_PNFS_LD 0x2000 + #define NFSDBG_ALL 0xFFFF + + #ifdef __KERNEL__ +diff -up linux-2.6.35.noarch/include/linux/nfs_fs_sb.h.orig linux-2.6.35.noarch/include/linux/nfs_fs_sb.h +--- linux-2.6.35.noarch/include/linux/nfs_fs_sb.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs_fs_sb.h 2010-08-31 21:11:40.982150104 -0400 +@@ -15,6 +15,7 @@ struct nlm_host; + struct nfs4_sequence_args; + struct nfs4_sequence_res; + struct nfs_server; ++struct nfs4_minor_version_ops; + + /* + * The nfs_client identifies our client state to the server. +@@ -70,11 +71,7 @@ struct nfs_client { + */ + char cl_ipaddr[48]; + unsigned char cl_id_uniquifier; +- int (* cl_call_sync)(struct nfs_server *server, +- struct rpc_message *msg, +- struct nfs4_sequence_args *args, +- struct nfs4_sequence_res *res, +- int cache_reply); ++ const struct nfs4_minor_version_ops *cl_mvops; + #endif /* CONFIG_NFS_V4 */ + + #ifdef CONFIG_NFS_V4_1 +@@ -85,6 +82,8 @@ struct nfs_client { + /* The flags used for obtaining the clientid during EXCHANGE_ID */ + u32 cl_exchange_flags; + struct nfs4_session *cl_session; /* sharred session */ ++ struct list_head cl_layouts; ++ struct nfs4_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ + #endif /* CONFIG_NFS_V4_1 */ + + #ifdef CONFIG_NFS_FSCACHE +@@ -92,6 +91,16 @@ struct nfs_client { + #endif + }; + ++static inline bool ++is_ds_only_client(struct nfs_client *clp) ++{ ++#ifdef CONFIG_NFS_V4_1 ++ return is_ds_only_session(clp->cl_exchange_flags); ++#else ++ return false; ++#endif ++} ++ + /* + * NFS client parameters stored in the superblock. 
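The is_ds_only_client() helper lets shared client code branch on whether this nfs_client represents a pure pNFS data-server connection. A sketch of the intended use, with demo_setup_mds_state() standing in for whatever MDS-only work a caller would do (that name is illustrative, not part of this patch):

    static int demo_init_client(struct nfs_client *clp)
    {
        /* a DS-only session needs no MDS-side state management */
        if (is_ds_only_client(clp))
            return 0;
        return demo_setup_mds_state(clp);
    }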
+ */ +@@ -136,7 +145,7 @@ struct nfs_server { + #endif + + #ifdef CONFIG_NFS_V4 +- u32 attr_bitmask[2];/* V4 bitmask representing the set ++ u32 attr_bitmask[3];/* V4 bitmask representing the set + of attributes supported on this + filesystem */ + u32 cache_consistency_bitmask[2]; +@@ -148,6 +157,15 @@ struct nfs_server { + that are supported on this + filesystem */ + #endif ++ ++#ifdef CONFIG_NFS_V4_1 ++ u32 pnfs_blksize; /* layout_blksize attr */ ++ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ ++ void *pnfs_ld_data; /* Per-mount data */ ++ unsigned int ds_rsize; /* Data server read size */ ++ unsigned int ds_wsize; /* Data server write size */ ++#endif /* CONFIG_NFS_V4_1 */ ++ + void (*destroy)(struct nfs_server *); + + atomic_t active; /* Keep trace of any activity to this server */ +diff -up linux-2.6.35.noarch/include/linux/nfs_iostat.h.orig linux-2.6.35.noarch/include/linux/nfs_iostat.h +--- linux-2.6.35.noarch/include/linux/nfs_iostat.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs_iostat.h 2010-08-31 21:11:40.983160345 -0400 +@@ -113,6 +113,9 @@ enum nfs_stat_eventcounters { + NFSIOS_SHORTREAD, + NFSIOS_SHORTWRITE, + NFSIOS_DELAY, ++ NFSIOS_PNFS_READ, ++ NFSIOS_PNFS_WRITE, ++ NFSIOS_PNFS_COMMIT, + __NFSIOS_COUNTSMAX, + }; + +diff -up linux-2.6.35.noarch/include/linux/nfs_page.h.orig linux-2.6.35.noarch/include/linux/nfs_page.h +--- linux-2.6.35.noarch/include/linux/nfs_page.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs_page.h 2010-08-31 21:11:40.984150297 -0400 +@@ -39,6 +39,7 @@ struct nfs_page { + struct list_head wb_list; /* Defines state of page: */ + struct page *wb_page; /* page to read in/write out */ + struct nfs_open_context *wb_context; /* File state context info */ ++ struct nfs_lock_context *wb_lock_context; /* lock context info */ + atomic_t wb_complete; /* i/os we're waiting for */ + pgoff_t wb_index; /* Offset >> PAGE_CACHE_SHIFT */ + unsigned int wb_offset, /* Offset & ~PAGE_CACHE_MASK */ +@@ -47,6 +48,7 @@ struct nfs_page { + struct kref wb_kref; /* reference count */ + unsigned long wb_flags; + struct nfs_writeverf wb_verf; /* Commit cookie */ ++ struct pnfs_layout_segment *wb_lseg; /* Pnfs layout info */ + }; + + struct nfs_pageio_descriptor { +@@ -60,6 +62,12 @@ struct nfs_pageio_descriptor { + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int pg_ioflags; + int pg_error; ++ struct pnfs_layout_segment *pg_lseg; ++#ifdef CONFIG_NFS_V4_1 ++ int pg_iswrite; ++ int pg_boundary; ++ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); ++#endif /* CONFIG_NFS_V4_1 */ + }; + + #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) +@@ -68,13 +76,15 @@ extern struct nfs_page *nfs_create_reque + struct inode *inode, + struct page *page, + unsigned int offset, +- unsigned int count); ++ unsigned int count, ++ struct pnfs_layout_segment *lseg); + extern void nfs_clear_request(struct nfs_page *req); + extern void nfs_release_request(struct nfs_page *req); + + + extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, +- pgoff_t idx_start, unsigned int npages, int tag); ++ pgoff_t idx_start, unsigned int npages, int tag, ++ int *use_pnfs); + extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, + struct inode *inode, + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), +diff -up linux-2.6.35.noarch/include/linux/nfs_xdr.h.orig 
linux-2.6.35.noarch/include/linux/nfs_xdr.h +--- linux-2.6.35.noarch/include/linux/nfs_xdr.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/nfs_xdr.h 2010-08-31 21:11:40.986150203 -0400 +@@ -3,6 +3,8 @@ + + #include + #include ++#include ++#include + + /* + * To change the maximum rsize and wsize supported by the NFS client, adjust +@@ -10,7 +12,7 @@ + * support a megabyte or more. The default is left at 4096 bytes, which is + * reasonable for NFS over UDP. + */ +-#define NFS_MAX_FILE_IO_SIZE (1048576U) ++#define NFS_MAX_FILE_IO_SIZE (4U * 1048576U) + #define NFS_DEF_FILE_IO_SIZE (4096U) + #define NFS_MIN_FILE_IO_SIZE (1024U) + +@@ -113,6 +115,10 @@ struct nfs_fsinfo { + __u32 dtpref; /* pref. readdir transfer size */ + __u64 maxfilesize; + __u32 lease_time; /* in seconds */ ++#if defined(CONFIG_NFS_V4_1) ++ __u32 layouttype; /* supported pnfs layout driver */ ++ __u32 blksize; /* preferred pnfs io block size */ ++#endif + }; + + struct nfs_fsstat { +@@ -185,6 +191,125 @@ struct nfs4_get_lease_time_res { + struct nfs4_sequence_res lr_seq_res; + }; + ++#define PNFS_LAYOUT_MAXSIZE 4096 ++#define NFS4_PNFS_DEVICEID4_SIZE 16 ++ ++struct pnfs_deviceid { ++ char data[NFS4_PNFS_DEVICEID4_SIZE]; ++}; ++ ++struct nfs4_layoutdriver_data { ++ __u32 len; ++ void *buf; ++}; ++ ++struct pnfs_layout_range { ++ u32 iomode; ++ u64 offset; ++ u64 length; ++}; ++ ++struct nfs4_layoutget_args { ++ __u32 type; ++ struct pnfs_layout_range range; ++ __u64 minlength; ++ __u32 maxcount; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutget_res { ++ __u32 return_on_close; ++ struct pnfs_layout_range range; ++ __u32 type; ++ nfs4_stateid stateid; ++ struct nfs4_layoutdriver_data layout; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutget { ++ struct nfs4_layoutget_args args; ++ struct nfs4_layoutget_res res; ++ struct pnfs_layout_segment **lsegpp; ++ int status; ++}; ++ ++struct nfs4_layoutcommit_args { ++ nfs4_stateid stateid; ++ __u64 lastbytewritten; ++ __u32 time_modify_changed; ++ struct timespec time_modify; ++ const u32 *bitmask; ++ struct nfs_fh *fh; ++ struct inode *inode; ++ ++ /* Values set by layout driver */ ++ struct pnfs_layout_range range; ++ __u32 layout_type; ++ void *layoutdriver_data; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutcommit_res { ++ __u32 sizechanged; ++ __u64 newsize; ++ struct nfs_fattr *fattr; ++ const struct nfs_server *server; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_layoutcommit_data { ++ struct rpc_task task; ++ struct rpc_cred *cred; ++ struct nfs_fattr fattr; ++ struct nfs4_layoutcommit_args args; ++ struct nfs4_layoutcommit_res res; ++ int status; ++}; ++ ++struct nfs4_layoutreturn_args { ++ __u32 reclaim; ++ __u32 layout_type; ++ __u32 return_type; ++ struct pnfs_layout_range range; ++ struct inode *inode; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_layoutreturn_res { ++ struct nfs4_sequence_res seq_res; ++ u32 lrs_present; ++ nfs4_stateid stateid; ++}; ++ ++struct nfs4_layoutreturn { ++ struct nfs4_layoutreturn_args args; ++ struct nfs4_layoutreturn_res res; ++ struct rpc_cred *cred; ++ int rpc_status; ++}; ++ ++struct nfs4_getdevicelist_args { ++ const struct nfs_fh *fh; ++ u32 layoutclass; ++ struct nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdevicelist_res { ++ struct pnfs_devicelist *devlist; ++ struct nfs4_sequence_res seq_res; ++}; ++ ++struct nfs4_getdeviceinfo_args { ++ struct pnfs_device *pdev; ++ struct 
nfs4_sequence_args seq_args; ++}; ++ ++struct nfs4_getdeviceinfo_res { ++ struct pnfs_device *pdev; ++ struct nfs4_sequence_res seq_res; ++}; ++ + /* + * Arguments to the open call. + */ +@@ -196,8 +321,10 @@ struct nfs_openargs { + __u64 clientid; + __u64 id; + union { +- struct iattr * attrs; /* UNCHECKED, GUARDED */ +- nfs4_verifier verifier; /* EXCLUSIVE */ ++ struct { ++ struct iattr * attrs; /* UNCHECKED, GUARDED */ ++ nfs4_verifier verifier; /* EXCLUSIVE */ ++ }; + nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ + } u; +@@ -313,6 +440,10 @@ struct nfs_lockt_res { + struct nfs4_sequence_res seq_res; + }; + ++struct nfs_release_lockowner_args { ++ struct nfs_lowner lock_owner; ++}; ++ + struct nfs4_delegreturnargs { + const struct nfs_fh *fhandle; + const nfs4_stateid *stateid; +@@ -332,6 +463,7 @@ struct nfs4_delegreturnres { + struct nfs_readargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + unsigned int pgbase; +@@ -352,6 +484,7 @@ struct nfs_readres { + struct nfs_writeargs { + struct nfs_fh * fh; + struct nfs_open_context *context; ++ struct nfs_lock_context *lock_context; + __u64 offset; + __u32 count; + enum nfs3_stable_how stable; +@@ -846,7 +979,7 @@ struct nfs4_server_caps_arg { + }; + + struct nfs4_server_caps_res { +- u32 attr_bitmask[2]; ++ u32 attr_bitmask[3]; + u32 acl_bitmask; + u32 has_links; + u32 has_symlinks; +@@ -961,6 +1094,27 @@ struct nfs_page; + + #define NFS_PAGEVEC_SIZE (8U) + ++#if defined(CONFIG_NFS_V4_1) ++/* pnfsflag values */ ++#define PNFS_NO_RPC 0x0001 /* non rpc result callback switch */ ++ ++/* pnfs-specific data needed for read, write, and commit calls */ ++struct pnfs_call_data { ++ struct pnfs_layout_segment *lseg; ++ const struct rpc_call_ops *call_ops; ++ u32 orig_count; /* for retry via MDS */ ++ int pnfs_error; ++ u8 pnfsflags; ++ u8 how; /* for FLUSH_STABLE */ ++}; ++ ++/* files layout-type specific data for read, write, and commit */ ++struct pnfs_fl_call_data { ++ struct nfs_client *ds_nfs_client; ++ __u64 orig_offset; ++}; ++#endif /* CONFIG_NFS_V4_1 */ ++ + struct nfs_read_data { + int flags; + struct rpc_task task; +@@ -976,10 +1130,16 @@ struct nfs_read_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + + struct nfs_write_data { ++ struct kref refcount; /* For pnfs commit splitting */ ++ struct nfs_write_data *parent; /* For pnfs commit splitting */ + int flags; + struct rpc_task task; + struct inode *inode; +@@ -995,6 +1155,10 @@ struct nfs_write_data { + #ifdef CONFIG_NFS_V4 + unsigned long timestamp; /* For lease renewal */ + #endif ++#if defined(CONFIG_NFS_V4_1) ++ struct pnfs_call_data pdata; ++ struct pnfs_fl_call_data fldata; ++#endif /* CONFIG_NFS_V4_1 */ + struct page *page_array[NFS_PAGEVEC_SIZE]; + }; + +@@ -1008,6 +1172,7 @@ struct nfs_rpc_ops { + const struct dentry_operations *dentry_ops; + const struct inode_operations *dir_inode_ops; + const struct inode_operations *file_inode_ops; ++ const struct file_operations *file_ops; + + int (*getroot) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsinfo *); +@@ -1072,6 +1237,7 @@ struct nfs_rpc_ops { + extern const struct nfs_rpc_ops nfs_v2_clientops; + extern const struct nfs_rpc_ops nfs_v3_clientops; + extern const 
struct nfs_rpc_ops nfs_v4_clientops; ++extern const struct nfs_rpc_ops pnfs_v4_clientops; + extern struct rpc_version nfs_version2; + extern struct rpc_version nfs_version3; + extern struct rpc_version nfs_version4; +diff -up linux-2.6.35.noarch/include/linux/panfs_shim_api.h.orig linux-2.6.35.noarch/include/linux/panfs_shim_api.h +--- linux-2.6.35.noarch/include/linux/panfs_shim_api.h.orig 2010-08-31 21:11:40.995160385 -0400 ++++ linux-2.6.35.noarch/include/linux/panfs_shim_api.h 2010-08-31 21:11:40.995160385 -0400 +@@ -0,0 +1,57 @@ ++#ifndef _PANFS_SHIM_API_H ++#define _PANFS_SHIM_API_H ++ ++/* ++ * imported panfs functions ++ */ ++struct panfs_export_operations { ++ int (*convert_rc)(pan_status_t rc); ++ ++ int (*sm_sec_t_get_size_otw)( ++ pan_sm_sec_otw_t *var, ++ pan_size_t *core_sizep, ++ pan_size_t *wire_size, ++ void *buf_end); ++ ++ int (*sm_sec_t_unmarshall)( ++ pan_sm_sec_otw_t *in, ++ pan_sm_sec_t *out, ++ void *buf, ++ pan_size_t size, ++ pan_size_t *otw_consumed, ++ pan_size_t *in_core_consumed); ++ ++ int (*ucreds_get)(void **ucreds_pp); ++ ++ void (*ucreds_put)(void *ucreds); ++ ++ int (*sam_read)( ++ pan_sam_access_flags_t flags, ++ pan_sam_read_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_read_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_read_res_t *res_p); ++ ++ int (*sam_write)( ++ pan_sam_access_flags_t flags, ++ pan_sam_write_args_t *args_p, ++ pan_sam_obj_sec_t *obj_sec_p, ++ pan_sg_entry_t *data_p, ++ void *ucreds, ++ pan_sam_write_cb_t closure, ++ void *user_arg1, ++ void *user_arg2, ++ pan_sam_write_res_t *res_p); ++}; ++ ++extern int ++panfs_shim_register(struct panfs_export_operations *ops); ++ ++extern int ++panfs_shim_unregister(void); ++ ++#endif /* _PANFS_SHIM_API_H */ +diff -up linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h.orig linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h +--- linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h.orig 2010-08-31 21:11:40.996160287 -0400 ++++ linux-2.6.35.noarch/include/linux/pnfs_osd_xdr.h 2010-08-31 21:11:40.996160287 -0400 +@@ -0,0 +1,439 @@ ++/* ++ * pnfs_osd_xdr.h ++ * ++ * pNFS-osd on-the-wire data structures ++ * ++ * Copyright (C) 2007-2009 Panasas Inc. ++ * All rights reserved. ++ * ++ * Benny Halevy ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 ++ * See the file COPYING included with this distribution for more details. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the Panasas company nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ */ ++#ifndef __PNFS_OSD_XDR_H__ ++#define __PNFS_OSD_XDR_H__ ++ ++#include ++#include ++#include ++#include ++ ++#define PNFS_OSD_OSDNAME_MAXSIZE 256 ++ ++/* ++ * START OF "GENERIC" DECODE ROUTINES. ++ * These may look a little ugly since they are imported from a "generic" ++ * set of XDR encode/decode routines which are intended to be shared by ++ * all of our NFSv4 implementations (OpenBSD, MacOS X...). ++ * ++ * If the pain of reading these is too great, it should be a straightforward ++ * task to translate them into Linux-specific versions which are more ++ * consistent with the style used in NFSv2/v3... ++ */ ++#define READ32(x) (x) = ntohl(*p++) ++#define READ64(x) do { \ ++ (x) = (u64)ntohl(*p++) << 32; \ ++ (x) |= ntohl(*p++); \ ++} while (0) ++#define COPYMEM(x, nbytes) do { \ ++ memcpy((x), p, nbytes); \ ++ p += XDR_QUADLEN(nbytes); \ ++} while (0) ++ ++/* ++ * draft-ietf-nfsv4-minorversion-22 ++ * draft-ietf-nfsv4-pnfs-obj-12 ++ */ ++ ++/* Layout Structure */ ++ ++enum pnfs_osd_raid_algorithm4 { ++ PNFS_OSD_RAID_0 = 1, ++ PNFS_OSD_RAID_4 = 2, ++ PNFS_OSD_RAID_5 = 3, ++ PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ ++}; ++ ++/* struct pnfs_osd_data_map4 { ++ * uint32_t odm_num_comps; ++ * length4 odm_stripe_unit; ++ * uint32_t odm_group_width; ++ * uint32_t odm_group_depth; ++ * uint32_t odm_mirror_cnt; ++ * pnfs_osd_raid_algorithm4 odm_raid_algorithm; ++ * }; ++ */ ++struct pnfs_osd_data_map { ++ u32 odm_num_comps; ++ u64 odm_stripe_unit; ++ u32 odm_group_width; ++ u32 odm_group_depth; ++ u32 odm_mirror_cnt; ++ u32 odm_raid_algorithm; ++}; ++ ++static inline int ++pnfs_osd_data_map_xdr_sz(void) ++{ ++ return 1 + 2 + 1 + 1 + 1 + 1; ++} ++ ++static inline size_t ++pnfs_osd_data_map_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_data_map); ++} ++ ++/* struct pnfs_osd_objid4 { ++ * deviceid4 oid_device_id; ++ * uint64_t oid_partition_id; ++ * uint64_t oid_object_id; ++ * }; ++ */ ++struct pnfs_osd_objid { ++ struct pnfs_deviceid oid_device_id; ++ u64 oid_partition_id; ++ u64 oid_object_id; ++}; ++ ++/* For printout. 
I use "dev(%llx:%llx)", _DEVID_LO(), _DEVID_HI BE style */ ++#define _DEVID_LO(oid_device_id) \ ++ (unsigned long long)be64_to_cpup((__be64 *)oid_device_id.data) ++ ++#define _DEVID_HI(oid_device_id) \ ++ (unsigned long long)be64_to_cpup(((__be64 *)oid_device_id.data) + 1) ++ ++static inline int ++pnfs_osd_objid_xdr_sz(void) ++{ ++ return (NFS4_PNFS_DEVICEID4_SIZE / 4) + 2 + 2; ++} ++ ++static inline size_t ++pnfs_osd_objid_incore_sz(void) ++{ ++ return sizeof(struct pnfs_osd_objid); ++} ++ ++enum pnfs_osd_version { ++ PNFS_OSD_MISSING = 0, ++ PNFS_OSD_VERSION_1 = 1, ++ PNFS_OSD_VERSION_2 = 2 ++}; ++ ++struct pnfs_osd_opaque_cred { ++ u32 cred_len; ++ u8 *cred; ++}; ++ ++static inline int ++pnfs_osd_opaque_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ READ32(n); ++ p += XDR_QUADLEN(n); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_opaque_cred_incore_sz(u32 *p) ++{ ++ u32 n; ++ ++ READ32(n); ++ return XDR_QUADLEN(n) * 4; ++} ++ ++enum pnfs_osd_cap_key_sec { ++ PNFS_OSD_CAP_KEY_SEC_NONE = 0, ++ PNFS_OSD_CAP_KEY_SEC_SSV = 1, ++}; ++ ++/* struct pnfs_osd_object_cred4 { ++ * pnfs_osd_objid4 oc_object_id; ++ * pnfs_osd_version4 oc_osd_version; ++ * pnfs_osd_cap_key_sec4 oc_cap_key_sec; ++ * opaque oc_capability_key<>; ++ * opaque oc_capability<>; ++ * }; ++ */ ++struct pnfs_osd_object_cred { ++ struct pnfs_osd_objid oc_object_id; ++ u32 oc_osd_version; ++ u32 oc_cap_key_sec; ++ struct pnfs_osd_opaque_cred oc_cap_key; ++ struct pnfs_osd_opaque_cred oc_cap; ++}; ++ ++static inline int ++pnfs_osd_object_cred_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_object_cred_incore_sz(u32 *p) ++{ ++ size_t sz = sizeof(struct pnfs_osd_object_cred); ++ ++ p += pnfs_osd_objid_xdr_sz() + 2; ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ p += pnfs_osd_opaque_cred_xdr_sz(p); ++ sz += pnfs_osd_opaque_cred_incore_sz(p); ++ return sz; ++} ++ ++/* struct pnfs_osd_layout4 { ++ * pnfs_osd_data_map4 olo_map; ++ * uint32_t olo_comps_index; ++ * pnfs_osd_object_cred4 olo_components<>; ++ * }; ++ */ ++struct pnfs_osd_layout { ++ struct pnfs_osd_data_map olo_map; ++ u32 olo_comps_index; ++ u32 olo_num_comps; ++ struct pnfs_osd_object_cred *olo_comps; ++}; ++ ++static inline int ++pnfs_osd_layout_xdr_sz(u32 *p) ++{ ++ u32 *start = p; ++ u32 n; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ while ((int)(n--) > 0) ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ return p - start; ++} ++ ++static inline size_t ++pnfs_osd_layout_incore_sz(u32 *p) ++{ ++ u32 n; ++ size_t sz; ++ ++ p += pnfs_osd_data_map_xdr_sz() + 1; ++ READ32(n); ++ sz = sizeof(struct pnfs_osd_layout); ++ while ((int)(n--) > 0) { ++ sz += pnfs_osd_object_cred_incore_sz(p); ++ p += pnfs_osd_object_cred_xdr_sz(p); ++ } ++ return sz; ++} ++ ++/* Device Address */ ++ ++enum pnfs_osd_targetid_type { ++ OBJ_TARGET_ANON = 1, ++ OBJ_TARGET_SCSI_NAME = 2, ++ OBJ_TARGET_SCSI_DEVICE_ID = 3, ++}; ++ ++/* union pnfs_osd_targetid4 switch (pnfs_osd_targetid_type4 oti_type) { ++ * case OBJ_TARGET_SCSI_NAME: ++ * string oti_scsi_name<>; ++ * ++ * case OBJ_TARGET_SCSI_DEVICE_ID: ++ * opaque oti_scsi_device_id<>; ++ * ++ * default: ++ * void; ++ * }; ++ * ++ * union pnfs_osd_targetaddr4 switch (bool ota_available) { ++ * case TRUE: ++ * netaddr4 ota_netaddr; ++ * case FALSE: ++ * void; ++ * }; ++ * ++ * struct pnfs_osd_deviceaddr4 { ++ * pnfs_osd_targetid4 oda_targetid; ++ * 
pnfs_osd_targetaddr4 oda_targetaddr;
++ * uint64_t oda_lun;
++ * opaque oda_systemid<>;
++ * pnfs_osd_object_cred4 oda_root_obj_cred;
++ * opaque oda_osdname<>;
++ * };
++ */
++struct pnfs_osd_targetid {
++ u32 oti_type;
++ struct nfs4_string oti_scsi_device_id;
++};
++
++enum { PNFS_OSD_TARGETID_MAX = 1 + PNFS_OSD_OSDNAME_MAXSIZE / 4 };
++
++/* struct netaddr4 {
++ * // see struct rpcb in RFC1833
++ * string r_netid<>; // network id
++ * string r_addr<>; // universal address
++ * };
++ */
++struct pnfs_osd_net_addr {
++ struct nfs4_string r_netid;
++ struct nfs4_string r_addr;
++};
++
++struct pnfs_osd_targetaddr {
++ u32 ota_available;
++ struct pnfs_osd_net_addr ota_netaddr;
++};
++
++enum {
++ NETWORK_ID_MAX = 16 / 4,
++ UNIVERSAL_ADDRESS_MAX = 64 / 4,
++ PNFS_OSD_TARGETADDR_MAX = 3 + NETWORK_ID_MAX + UNIVERSAL_ADDRESS_MAX,
++};
++
++struct pnfs_osd_deviceaddr {
++ struct pnfs_osd_targetid oda_targetid;
++ struct pnfs_osd_targetaddr oda_targetaddr;
++ u8 oda_lun[8];
++ struct nfs4_string oda_systemid;
++ struct pnfs_osd_object_cred oda_root_obj_cred;
++ struct nfs4_string oda_osdname;
++};
++
++enum {
++ ODA_OSDNAME_MAX = PNFS_OSD_OSDNAME_MAXSIZE / 4,
++ PNFS_OSD_DEVICEADDR_MAX =
++ PNFS_OSD_TARGETID_MAX + PNFS_OSD_TARGETADDR_MAX +
++ 2 /*oda_lun*/ +
++ 1 + OSD_SYSTEMID_LEN +
++ 1 + ODA_OSDNAME_MAX,
++};
++
++/* LAYOUTCOMMIT: layoutupdate */
++
++/* union pnfs_osd_deltaspaceused4 switch (bool dsu_valid) {
++ * case TRUE:
++ * int64_t dsu_delta;
++ * case FALSE:
++ * void;
++ * };
++ *
++ * struct pnfs_osd_layoutupdate4 {
++ * pnfs_osd_deltaspaceused4 olu_delta_space_used;
++ * bool olu_ioerr_flag;
++ * };
++ */
++struct pnfs_osd_layoutupdate {
++ u32 dsu_valid;
++ s64 dsu_delta;
++ u32 olu_ioerr_flag;
++};
++
++/* LAYOUTRETURN: I/O Error Report */
++
++enum pnfs_osd_errno {
++ PNFS_OSD_ERR_EIO = 1,
++ PNFS_OSD_ERR_NOT_FOUND = 2,
++ PNFS_OSD_ERR_NO_SPACE = 3,
++ PNFS_OSD_ERR_BAD_CRED = 4,
++ PNFS_OSD_ERR_NO_ACCESS = 5,
++ PNFS_OSD_ERR_UNREACHABLE = 6,
++ PNFS_OSD_ERR_RESOURCE = 7
++};
++
++/* struct pnfs_osd_ioerr4 {
++ * pnfs_osd_objid4 oer_component;
++ * length4 oer_comp_offset;
++ * length4 oer_comp_length;
++ * bool oer_iswrite;
++ * pnfs_osd_errno4 oer_errno;
++ * };
++ */
++struct pnfs_osd_ioerr {
++ struct pnfs_osd_objid oer_component;
++ u64 oer_comp_offset;
++ u64 oer_comp_length;
++ u32 oer_iswrite;
++ u32 oer_errno;
++};
++
++static inline unsigned
++pnfs_osd_ioerr_xdr_sz(void)
++{
++ return pnfs_osd_objid_xdr_sz() + 2 + 2 + 1 + 1;
++}
++
++/* OSD XDR API */
++
++/* Layout helpers */
++extern struct pnfs_osd_layout *pnfs_osd_xdr_decode_layout(
++ struct pnfs_osd_layout *layout, u32 *p);
++
++extern int pnfs_osd_xdr_encode_layout(
++ struct exp_xdr_stream *xdr,
++ struct pnfs_osd_layout *layout);
++
++/* Device Info helpers */
++
++/* First pass calculate total size for space needed */
++extern size_t pnfs_osd_xdr_deviceaddr_incore_sz(u32 *p);
++
++/* Note: some strings pointed to inside @deviceaddr might point
++ * to space inside @p. @p should stay valid while @deviceaddr
++ * is in use.
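Note the two-pass pattern in the device-info helpers: pnfs_osd_xdr_deviceaddr_incore_sz() first sizes the in-core form, then pnfs_osd_xdr_decode_deviceaddr() fills a buffer of that size. A sketch of the intended calling sequence; the wrapper name and GFP flags are illustrative:

    static struct pnfs_osd_deviceaddr *demo_decode_deviceaddr(u32 *p)
    {
        /* first pass: total in-core size, including string space */
        size_t sz = pnfs_osd_xdr_deviceaddr_incore_sz(p);
        struct pnfs_osd_deviceaddr *da = kzalloc(sz, GFP_KERNEL);

        if (!da)
            return NULL;
        /* second pass: decode; some strings may still point into @p,
         * so @p must outlive @da (see the surrounding note) */
        pnfs_osd_xdr_decode_deviceaddr(da, p);
        return da;
    }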
++ * It is assumed that @deviceaddr points to bigger memory of size ++ * calculated in first pass by pnfs_osd_xdr_deviceaddr_incore_sz() ++ */ ++extern void pnfs_osd_xdr_decode_deviceaddr( ++ struct pnfs_osd_deviceaddr *deviceaddr, u32 *p); ++ ++/* For Servers */ ++extern int pnfs_osd_xdr_encode_deviceaddr( ++ struct exp_xdr_stream *xdr, struct pnfs_osd_deviceaddr *devaddr); ++ ++/* layoutupdate (layout_commit) xdr helpers */ ++extern int ++pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, ++ struct pnfs_osd_layoutupdate *lou); ++extern __be32 * ++pnfs_osd_xdr_decode_layoutupdate(struct pnfs_osd_layoutupdate *lou, __be32 *p); ++ ++/* osd_ioerror encoding/decoding (layout_return) */ ++extern int ++pnfs_osd_xdr_encode_ioerr(struct xdr_stream *xdr, struct pnfs_osd_ioerr *ioerr); ++extern __be32 * ++pnfs_osd_xdr_decode_ioerr(struct pnfs_osd_ioerr *ioerr, __be32 *p); ++ ++#endif /* __PNFS_OSD_XDR_H__ */ +diff -up linux-2.6.35.noarch/include/linux/posix_acl.h.orig linux-2.6.35.noarch/include/linux/posix_acl.h +--- linux-2.6.35.noarch/include/linux/posix_acl.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/posix_acl.h 2010-08-31 21:11:40.998160292 -0400 +@@ -8,6 +8,7 @@ + #ifndef __LINUX_POSIX_ACL_H + #define __LINUX_POSIX_ACL_H + ++#include + #include + + #define ACL_UNDEFINED_ID (-1) +diff -up linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h.orig linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h +--- linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/sunrpc/msg_prot.h 2010-08-31 21:11:40.998160292 -0400 +@@ -14,6 +14,8 @@ + /* size of an XDR encoding unit in bytes, i.e. 32bit */ + #define XDR_UNIT (4) + ++#include ++ + /* spec defines authentication flavor as an unsigned 32 bit integer */ + typedef u32 rpc_authflavor_t; + +diff -up linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h +--- linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/sunrpc/rpc_pipe_fs.h 2010-08-31 21:11:40.999138583 -0400 +@@ -3,6 +3,7 @@ + + #ifdef __KERNEL__ + ++#include + #include + + struct rpc_pipe_msg { +@@ -11,6 +12,10 @@ struct rpc_pipe_msg { + size_t len; + size_t copied; + int errno; ++#define PIPEFS_AUTOFREE_RPCMSG 0x01 /* frees rpc_pipe_msg */ ++#define PIPEFS_AUTOFREE_RPCMSG_DATA 0x02 /* frees rpc_pipe_msg->data */ ++#define PIPEFS_AUTOFREE_UPCALL_MSG PIPEFS_AUTOFREE_RPCMSG_DATA ++ u8 flags; + }; + + struct rpc_pipe_ops { +diff -up linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h +--- linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h.orig 2010-08-31 21:11:40.999138583 -0400 ++++ linux-2.6.35.noarch/include/linux/sunrpc/simple_rpc_pipefs.h 2010-08-31 21:11:41.000170411 -0400 +@@ -0,0 +1,111 @@ ++/* ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. 
Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#ifndef _SIMPLE_RPC_PIPEFS_H_ ++#define _SIMPLE_RPC_PIPEFS_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#define payload_of(headerp) ((void *)(headerp + 1)) ++ ++/* ++ * struct pipefs_hdr -- the generic message format for simple_rpc_pipefs. ++ * Messages may simply be the header itself, although having an optional ++ * data payload follow the header allows much more flexibility. ++ * ++ * Messages are created using pipefs_alloc_init_msg() and ++ * pipefs_alloc_init_msg_padded(), both of which accept a pointer to an ++ * (optional) data payload. ++ * ++ * Given a struct pipefs_hdr *msg that has a struct foo payload, the data ++ * can be accessed using: struct foo *foop = payload_of(msg) ++ */ ++struct pipefs_hdr { ++ u32 msgid; ++ u8 type; ++ u8 flags; ++ u16 totallen; /* length of entire message, including hdr itself */ ++ u32 status; ++}; ++ ++/* ++ * struct pipefs_list -- a type of list used for tracking callers who've made an ++ * upcall and are blocked waiting for a reply. ++ * ++ * See pipefs_queue_upcall_waitreply() and pipefs_assign_upcall_reply(). ++ */ ++struct pipefs_list { ++ struct list_head list; ++ spinlock_t list_lock; ++}; ++ ++ ++/* See net/sunrpc/simple_rpc_pipefs.c for more info on using these functions. 
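Tying payload_of() to the allocation helpers declared below, a minimal sketch of composing and reading a message; struct demo_foo and the msgid/type values are invented for illustration:

    struct demo_foo {
        u32 answer;
    };

    static struct pipefs_hdr *demo_make_msg(void)
    {
        struct demo_foo f = { .answer = 42 };

        /* msgid 1, type 0, no flags; returns an ERR_PTR() on failure */
        return pipefs_alloc_init_msg(1, 0, 0, &f, sizeof(f));
    }

    /* receiver side: struct demo_foo *fp = payload_of(msg); */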
*/ ++extern struct dentry *pipefs_mkpipe(const char *name, ++ const struct rpc_pipe_ops *ops, ++ int wait_for_open); ++extern void pipefs_closepipe(struct dentry *pipe); ++extern void pipefs_init_list(struct pipefs_list *list); ++extern struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen); ++extern struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, ++ u8 flags, void *data, ++ u16 datalen, u16 padlen); ++extern struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list ++ *uplist, u8 upflags, ++ u32 timeout); ++extern int pipefs_queue_upcall_noreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, u8 upflags); ++extern int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist); ++extern struct pipefs_hdr *pipefs_readmsg(struct file *filp, ++ const char __user *src, size_t len); ++extern ssize_t pipefs_generic_upcall(struct file *filp, ++ struct rpc_pipe_msg *rpcmsg, ++ char __user *dst, size_t buflen); ++extern void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg); ++ ++#endif /* _SIMPLE_RPC_PIPEFS_H_ */ +diff -up linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h.orig linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h +--- linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/sunrpc/svc_xprt.h 2010-08-31 21:11:41.000170411 -0400 +@@ -166,4 +166,41 @@ static inline char *__svc_print_addr(con + + return buf; + } ++ ++/* ++ * Print a network address in a universal format (see rfc1833 and nfsv4.1) ++ */ ++static inline int __svc_print_netaddr(struct sockaddr *addr, ++ struct xdr_netobj *na) ++{ ++ u16 port; ++ ssize_t len; ++ ++ switch (addr->sa_family) { ++ case AF_INET: { ++ struct sockaddr_in *sin = (struct sockaddr_in *)addr; ++ port = ntohs(sin->sin_port); ++ ++ len = snprintf(na->data, na->len, "%pI4.%u.%u", ++ &sin->sin_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ case AF_INET6: { ++ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; ++ port = ntohs(sin6->sin6_port); ++ ++ len = snprintf(na->data, na->len, "%pI6.%u.%u", ++ &sin6->sin6_addr, ++ port >> 8, port & 0xff); ++ break; ++ } ++ default: ++ snprintf(na->data, na->len, "unknown address type: %d", ++ addr->sa_family); ++ len = -EINVAL; ++ break; ++ } ++ return len; ++} + #endif /* SUNRPC_SVC_XPRT_H */ +diff -up linux-2.6.35.noarch/include/linux/sunrpc/xdr.h.orig linux-2.6.35.noarch/include/linux/sunrpc/xdr.h +--- linux-2.6.35.noarch/include/linux/sunrpc/xdr.h.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/include/linux/sunrpc/xdr.h 2010-08-31 21:11:41.001061972 -0400 +@@ -131,6 +131,13 @@ xdr_decode_hyper(__be32 *p, __u64 *valp) + return p + 2; + } + ++static inline __be32 * ++xdr_decode_opaque_fixed(__be32 *p, void *ptr, unsigned int len) ++{ ++ memcpy(ptr, p, len); ++ return p + XDR_QUADLEN(len); ++} ++ + /* + * Adjust kvec to reflect end of xdr'ed data (RPC client XDR) + */ +@@ -197,6 +204,7 @@ struct xdr_stream { + + extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); + extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes); ++extern __be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q); + extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, + unsigned int base, unsigned int len); + extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p); +diff -up 
linux-2.6.35.noarch/localversion-pnfs.orig linux-2.6.35.noarch/localversion-pnfs +--- linux-2.6.35.noarch/localversion-pnfs.orig 2010-08-31 21:11:41.001061972 -0400 ++++ linux-2.6.35.noarch/localversion-pnfs 2010-08-31 21:11:41.001061972 -0400 +@@ -0,0 +1 @@ ++-pnfs +diff -up linux-2.6.35.noarch/net/sunrpc/Makefile.orig linux-2.6.35.noarch/net/sunrpc/Makefile +--- linux-2.6.35.noarch/net/sunrpc/Makefile.orig 2010-08-01 18:11:14.000000000 -0400 ++++ linux-2.6.35.noarch/net/sunrpc/Makefile 2010-08-31 21:11:41.002196454 -0400 +@@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprt + svc.o svcsock.o svcauth.o svcauth_unix.o \ + addr.o rpcb_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o \ +- svc_xprt.o ++ svc_xprt.o simple_rpc_pipefs.o + sunrpc-$(CONFIG_NFS_V4_1) += backchannel_rqst.o bc_svc.o + sunrpc-$(CONFIG_PROC_FS) += stats.o + sunrpc-$(CONFIG_SYSCTL) += sysctl.o +diff -up linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c +--- linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c.orig 2010-08-31 21:11:41.003170510 -0400 ++++ linux-2.6.35.noarch/net/sunrpc/simple_rpc_pipefs.c 2010-08-31 21:11:41.003170510 -0400 +@@ -0,0 +1,424 @@ ++/* ++ * net/sunrpc/simple_rpc_pipefs.c ++ * ++ * Copyright (c) 2008 The Regents of the University of Michigan. ++ * All rights reserved. ++ * ++ * David M. Richter ++ * ++ * Drawing on work done by Andy Adamson and ++ * Marius Eriksen . Thanks for the help over the ++ * years, guys. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. Neither the name of the University nor the names of its ++ * contributors may be used to endorse or promote products derived ++ * from this software without specific prior written permission. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++ * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR ++ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * With thanks to CITI's project sponsor and partner, IBM. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ ++/* ++ * Make an rpc_pipefs pipe named @name at the root of the mounted rpc_pipefs ++ * filesystem. ++ * ++ * If @wait_for_open is non-zero and an upcall is later queued but the userland ++ * end of the pipe has not yet been opened, the upcall will remain queued until ++ * the pipe is opened; otherwise, the upcall queueing will return with -EPIPE. 
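A sketch of the create/teardown lifecycle this comment describes, for a hypothetical module; demo_pipe_ops is an assumed struct rpc_pipe_ops:

    static struct dentry *demo_pipe;

    static int __init demo_init(void)
    {
        /* wait_for_open != 0: upcalls stay queued until userspace opens us */
        demo_pipe = pipefs_mkpipe("demo", &demo_pipe_ops, 1);
        return IS_ERR(demo_pipe) ? PTR_ERR(demo_pipe) : 0;
    }

    static void __exit demo_exit(void)
    {
        if (!IS_ERR_OR_NULL(demo_pipe))
            pipefs_closepipe(demo_pipe);
    }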
++ */
++struct dentry *pipefs_mkpipe(const char *name, const struct rpc_pipe_ops *ops,
++ int wait_for_open)
++{
++ struct dentry *dir, *pipe;
++ struct vfsmount *mnt;
++
++ mnt = rpc_get_mount();
++ if (IS_ERR(mnt)) {
++ pipe = ERR_CAST(mnt);
++ goto out;
++ }
++ dir = mnt->mnt_root;
++ if (!dir) {
++ pipe = ERR_PTR(-ENOENT);
++ goto out;
++ }
++ pipe = rpc_mkpipe(dir, name, NULL, ops,
++ wait_for_open ? RPC_PIPE_WAIT_FOR_OPEN : 0);
++out:
++ return pipe;
++}
++EXPORT_SYMBOL(pipefs_mkpipe);
++
++/*
++ * Shut down a pipe made by pipefs_mkpipe().
++ * XXX: do we need to retain an extra reference on the mount?
++ */
++void pipefs_closepipe(struct dentry *pipe)
++{
++ rpc_unlink(pipe);
++ rpc_put_mount();
++}
++EXPORT_SYMBOL(pipefs_closepipe);
++
++/*
++ * Initialize a struct pipefs_list -- which is a way to keep track of callers
++ * who're blocked having made an upcall and are awaiting a reply.
++ *
++ * See pipefs_queue_upcall_waitreply() and pipefs_find_upcall_msgid() for how
++ * to use them.
++ */
++inline void pipefs_init_list(struct pipefs_list *list)
++{
++ INIT_LIST_HEAD(&list->list);
++ spin_lock_init(&list->list_lock);
++}
++EXPORT_SYMBOL(pipefs_init_list);
++
++/*
++ * Alloc/init a generic pipefs message header and copy into its message body
++ * an arbitrary data payload.
++ *
++ * struct pipefs_hdr's are meant to serve as generic, general-purpose message
++ * headers for easy rpc_pipefs I/O. When an upcall is made, the
++ * struct pipefs_hdr is assigned to a struct rpc_pipe_msg and delivered
++ * therein. --And yes, the naming can seem a little confusing at first:
++ *
++ * When one thinks of an upcall "message", in simple_rpc_pipefs that's a
++ * struct pipefs_hdr (possibly with an attached message body). A
++ * struct rpc_pipe_msg is actually only the -vehicle- by which the "real"
++ * message is delivered and processed.
++ */
++struct pipefs_hdr *pipefs_alloc_init_msg_padded(u32 msgid, u8 type, u8 flags,
++ void *data, u16 datalen, u16 padlen)
++{
++ u16 totallen;
++ struct pipefs_hdr *msg = NULL;
++
++ totallen = sizeof(*msg) + datalen + padlen;
++ if (totallen > PAGE_SIZE) {
++ msg = ERR_PTR(-E2BIG);
++ goto out;
++ }
++
++ msg = kzalloc(totallen, GFP_KERNEL);
++ if (!msg) {
++ msg = ERR_PTR(-ENOMEM);
++ goto out;
++ }
++
++ msg->msgid = msgid;
++ msg->type = type;
++ msg->flags = flags;
++ msg->totallen = totallen;
++ memcpy(payload_of(msg), data, datalen);
++out:
++ return msg;
++}
++EXPORT_SYMBOL(pipefs_alloc_init_msg_padded);
++
++/*
++ * See the description of pipefs_alloc_init_msg_padded().
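The padded variant is useful when the allocation must be larger than the initial payload, for instance so the same buffer can carry back a longer answer; everything still has to fit in PAGE_SIZE or the helper returns ERR_PTR(-E2BIG). A sketch with illustrative sizes and type value:

    static struct pipefs_hdr *demo_make_padded(u32 msgid, void *q, u16 qlen)
    {
        /* reserve 64 extra zeroed bytes beyond the query payload */
        return pipefs_alloc_init_msg_padded(msgid, 2, 0, q, qlen, 64);
    }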
++ */ ++struct pipefs_hdr *pipefs_alloc_init_msg(u32 msgid, u8 type, u8 flags, ++ void *data, u16 datalen) ++{ ++ return pipefs_alloc_init_msg_padded(msgid, type, flags, data, ++ datalen, 0); ++} ++EXPORT_SYMBOL(pipefs_alloc_init_msg); ++ ++ ++static void pipefs_init_rpcmsg(struct rpc_pipe_msg *rpcmsg, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ memset(rpcmsg, 0, sizeof(*rpcmsg)); ++ rpcmsg->data = msg; ++ rpcmsg->len = msg->totallen; ++ rpcmsg->flags = upflags; ++} ++ ++static struct rpc_pipe_msg *pipefs_alloc_init_rpcmsg(struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ struct rpc_pipe_msg *rpcmsg; ++ ++ rpcmsg = kmalloc(sizeof(*rpcmsg), GFP_KERNEL); ++ if (!rpcmsg) ++ return ERR_PTR(-ENOMEM); ++ ++ pipefs_init_rpcmsg(rpcmsg, msg, upflags); ++ return rpcmsg; ++} ++ ++ ++/* represents an upcall that'll block and wait for a reply */ ++struct pipefs_upcall { ++ u32 msgid; ++ struct rpc_pipe_msg rpcmsg; ++ struct list_head list; ++ wait_queue_head_t waitq; ++ struct pipefs_hdr *reply; ++}; ++ ++ ++static void pipefs_init_upcall_waitreply(struct pipefs_upcall *upcall, ++ struct pipefs_hdr *msg, u8 upflags) ++{ ++ upcall->reply = NULL; ++ upcall->msgid = msg->msgid; ++ INIT_LIST_HEAD(&upcall->list); ++ init_waitqueue_head(&upcall->waitq); ++ pipefs_init_rpcmsg(&upcall->rpcmsg, msg, upflags); ++} ++ ++static int __pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_upcall *upcall, ++ struct pipefs_list *uplist, ++ u32 timeout) ++{ ++ int err = 0; ++ DECLARE_WAITQUEUE(wq, current); ++ ++ add_wait_queue(&upcall->waitq, &wq); ++ spin_lock(&uplist->list_lock); ++ list_add(&upcall->list, &uplist->list); ++ spin_unlock(&uplist->list_lock); ++ ++ err = rpc_queue_upcall(pipe->d_inode, &upcall->rpcmsg); ++ if (err < 0) ++ goto out; ++ ++ if (timeout) { ++ /* retval of 0 means timer expired */ ++ err = schedule_timeout_uninterruptible(timeout); ++ if (err == 0 && upcall->reply == NULL) ++ err = -ETIMEDOUT; ++ } else { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule(); ++ __set_current_state(TASK_RUNNING); ++ } ++ ++out: ++ spin_lock(&uplist->list_lock); ++ list_del_init(&upcall->list); ++ spin_unlock(&uplist->list_lock); ++ remove_wait_queue(&upcall->waitq, &wq); ++ return err; ++} ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace, place the calling thread ++ * on @uplist, and block the thread to wait for a reply. If @timeout is ++ * nonzero, the thread will be blocked for at most @timeout jiffies. ++ * ++ * (To convert time units into jiffies, consider the functions ++ * msecs_to_jiffies(), usecs_to_jiffies(), timeval_to_jiffies(), and ++ * timespec_to_jiffies().) ++ * ++ * Once a reply is received by your downcall handler, call ++ * pipefs_assign_upcall_reply() with @uplist to find the corresponding upcall, ++ * assign the reply, and wake the waiting thread. ++ * ++ * This function's return value pointer may be an error and should be checked ++ * with IS_ERR() before attempting to access the reply message. ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. 
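A sketch of a caller blocking for a reply with the pieces documented above; the pipe, the shared list, and the five-second timeout are assumptions of the example:

    static int demo_ask(struct dentry *pipe, struct pipefs_list *uplist,
                        struct pipefs_hdr *msg)
    {
        struct pipefs_hdr *reply;

        reply = pipefs_queue_upcall_waitreply(pipe, msg, uplist,
                                              PIPEFS_AUTOFREE_UPCALL_MSG,
                                              msecs_to_jiffies(5000));
        if (IS_ERR(reply))
            return PTR_ERR(reply);    /* -ETIMEDOUT, -EPIPE, ... */
        /* ... interpret reply->status / payload_of(reply) ... */
        kfree(reply);
        return 0;
    }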
++ */ ++struct pipefs_hdr *pipefs_queue_upcall_waitreply(struct dentry *pipe, ++ struct pipefs_hdr *msg, ++ struct pipefs_list *uplist, ++ u8 upflags, u32 timeout) ++{ ++ int err = 0; ++ struct pipefs_upcall upcall; ++ ++ pipefs_init_upcall_waitreply(&upcall, msg, upflags); ++ err = __pipefs_queue_upcall_waitreply(pipe, &upcall, uplist, timeout); ++ if (err < 0) { ++ kfree(upcall.reply); ++ upcall.reply = ERR_PTR(err); ++ } ++ ++ return upcall.reply; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_waitreply); ++ ++/* ++ * Queue a pipefs msg for an upcall to userspace and immediately return (i.e., ++ * no reply is expected). ++ * ++ * Callers are responsible for freeing @msg, unless pipefs_generic_destroy_msg() ++ * is used as the ->destroy_msg() callback and the PIPEFS_AUTOFREE_UPCALL_MSG ++ * flag is set in @upflags. See also rpc_pipe_fs.h. ++ */ ++int pipefs_queue_upcall_noreply(struct dentry *pipe, struct pipefs_hdr *msg, ++ u8 upflags) ++{ ++ int err = 0; ++ struct rpc_pipe_msg *rpcmsg; ++ ++ upflags |= PIPEFS_AUTOFREE_RPCMSG; ++ rpcmsg = pipefs_alloc_init_rpcmsg(msg, upflags); ++ if (IS_ERR(rpcmsg)) { ++ err = PTR_ERR(rpcmsg); ++ goto out; ++ } ++ err = rpc_queue_upcall(pipe->d_inode, rpcmsg); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_queue_upcall_noreply); ++ ++ ++static struct pipefs_upcall *pipefs_find_upcall_msgid(u32 msgid, ++ struct pipefs_list *uplist) ++{ ++ struct pipefs_upcall *upcall; ++ ++ spin_lock(&uplist->list_lock); ++ list_for_each_entry(upcall, &uplist->list, list) ++ if (upcall->msgid == msgid) ++ goto out; ++ upcall = NULL; ++out: ++ spin_unlock(&uplist->list_lock); ++ return upcall; ++} ++ ++/* ++ * In your rpc_pipe_ops->downcall() handler, once you've read in a downcall ++ * message and have determined that it is a reply to a waiting upcall, ++ * you can use this function to find the appropriate upcall, assign the result, ++ * and wake the upcall thread. ++ * ++ * The reply message must have the same msgid as the original upcall message's. ++ * ++ * See also pipefs_queue_upcall_waitreply() and pipefs_readmsg(). ++ */ ++int pipefs_assign_upcall_reply(struct pipefs_hdr *reply, ++ struct pipefs_list *uplist) ++{ ++ int err = 0; ++ struct pipefs_upcall *upcall; ++ ++ upcall = pipefs_find_upcall_msgid(reply->msgid, uplist); ++ if (!upcall) { ++ printk(KERN_ERR "%s: ERROR: have reply but no matching upcall " ++ "for msgid %d\n", __func__, reply->msgid); ++ err = -ENOENT; ++ goto out; ++ } ++ upcall->reply = reply; ++ wake_up(&upcall->waitq); ++out: ++ return err; ++} ++EXPORT_SYMBOL(pipefs_assign_upcall_reply); ++ ++/* ++ * Generic method to read-in and return a newly-allocated message which begins ++ * with a struct pipefs_hdr. ++ */ ++struct pipefs_hdr *pipefs_readmsg(struct file *filp, const char __user *src, ++ size_t len) ++{ ++ int err = 0, hdrsize; ++ struct pipefs_hdr *msg = NULL; ++ ++ hdrsize = sizeof(*msg); ++ if (len < hdrsize) { ++ printk(KERN_ERR "%s: ERROR: header is too short (%d vs %d)\n", ++ __func__, (int) len, hdrsize); ++ err = -EINVAL; ++ goto out; ++ } ++ ++ msg = kzalloc(len, GFP_KERNEL); ++ if (!msg) { ++ err = -ENOMEM; ++ goto out; ++ } ++ if (copy_from_user(msg, src, len)) ++ err = -EFAULT; ++out: ++ if (err) { ++ kfree(msg); ++ msg = ERR_PTR(err); ++ } ++ return msg; ++} ++EXPORT_SYMBOL(pipefs_readmsg); ++ ++/* ++ * Generic rpc_pipe_ops->upcall() handler implementation. ++ * ++ * Don't call this directly: to make an upcall, use ++ * pipefs_queue_upcall_waitreply() or pipefs_queue_upcall_noreply(). 
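And the matching rpc_pipe_ops->downcall() side, built from pipefs_readmsg() and pipefs_assign_upcall_reply() above; demo_uplist is assumed to be the same list that was handed to pipefs_queue_upcall_waitreply():

    static struct pipefs_list demo_uplist;    /* pipefs_init_list() at init */

    static ssize_t demo_downcall(struct file *filp, const char __user *src,
                                 size_t len)
    {
        struct pipefs_hdr *reply = pipefs_readmsg(filp, src, len);

        if (IS_ERR(reply))
            return PTR_ERR(reply);
        if (pipefs_assign_upcall_reply(reply, &demo_uplist)) {
            kfree(reply);    /* no matching waiter; drop the reply */
            return -ENOENT;
        }
        return len;
    }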
++ */
++ssize_t pipefs_generic_upcall(struct file *filp, struct rpc_pipe_msg *rpcmsg,
++ char __user *dst, size_t buflen)
++{
++ char *data;
++ ssize_t len, left;
++
++ data = (char *)rpcmsg->data + rpcmsg->copied;
++ len = rpcmsg->len - rpcmsg->copied;
++ if (len > buflen)
++ len = buflen;
++
++ left = copy_to_user(dst, data, len);
++ if (left < 0) {
++ rpcmsg->errno = left;
++ return left;
++ }
++
++ len -= left;
++ rpcmsg->copied += len;
++ rpcmsg->errno = 0;
++ return len;
++}
++EXPORT_SYMBOL(pipefs_generic_upcall);
++
++/*
++ * Generic rpc_pipe_ops->destroy_msg() handler implementation.
++ *
++ * Items are only freed if @rpcmsg->flags has been set appropriately.
++ * See pipefs_queue_upcall_noreply() and rpc_pipe_fs.h.
++ */
++void pipefs_generic_destroy_msg(struct rpc_pipe_msg *rpcmsg)
++{
++ if (rpcmsg->flags & PIPEFS_AUTOFREE_UPCALL_MSG)
++ kfree(rpcmsg->data);
++ if (rpcmsg->flags & PIPEFS_AUTOFREE_RPCMSG)
++ kfree(rpcmsg);
++}
++EXPORT_SYMBOL(pipefs_generic_destroy_msg);
+diff -up linux-2.6.35.noarch/net/sunrpc/xdr.c.orig linux-2.6.35.noarch/net/sunrpc/xdr.c
+--- linux-2.6.35.noarch/net/sunrpc/xdr.c.orig 2010-08-01 18:11:14.000000000 -0400
++++ linux-2.6.35.noarch/net/sunrpc/xdr.c 2010-08-31 21:11:41.004160487 -0400
+@@ -395,24 +395,29 @@ xdr_shrink_pagelen(struct xdr_buf *buf,
+ {
+ struct kvec *tail;
+ size_t copy;
+- char *p;
+ unsigned int pglen = buf->page_len;
++ unsigned int tailbuf_len;
+
+ tail = buf->tail;
+ BUG_ON (len > pglen);
+
++ tailbuf_len = buf->buflen - buf->head->iov_len - buf->page_len;
++
+ /* Shift the tail first */
+- if (tail->iov_len != 0) {
+- p = (char *)tail->iov_base + len;
++ if (tailbuf_len != 0) {
++ unsigned int free_space = tailbuf_len - tail->iov_len;
++
++ if (len < free_space)
++ free_space = len;
++ tail->iov_len += free_space;
++
++ copy = len;
+ if (tail->iov_len > len) {
+- copy = tail->iov_len - len;
+- memmove(p, tail->iov_base, copy);
++ char *p = (char *)tail->iov_base + len;
++ memmove(p, tail->iov_base, tail->iov_len - len);
+ } else
+- buf->buflen -= len;
+- /* Copy from the inlined pages into the tail */
+- copy = len;
+- if (copy > tail->iov_len)
+ copy = tail->iov_len;
++ /* Copy from the inlined pages into the tail */
+ _copy_from_pages((char *)tail->iov_base,
+ buf->pages, buf->page_base + pglen - len,
+ copy);
+@@ -496,6 +501,27 @@ __be32 * xdr_reserve_space(struct xdr_st
+ EXPORT_SYMBOL_GPL(xdr_reserve_space);
+
+ /**
++ * xdr_rewind_stream - rewind a stream back to some checkpoint
++ * @xdr: pointer to xdr_stream
++ * @q: some checkpoint at historical place of @xdr
++ *
++ * Restores an xdr stream to some historical point. @q must be
++ * a logical xdr point in the past that was sampled by @q = @xdr->p.
++ */
++__be32 *xdr_rewind_stream(struct xdr_stream *xdr, __be32 *q)
++{
++ size_t nbytes = (xdr->p - q) << 2;
++
++ BUG_ON(xdr->p < q);
++ BUG_ON(nbytes > xdr->iov->iov_len || nbytes > xdr->buf->len);
++ xdr->p = q;
++ xdr->iov->iov_len -= nbytes;
++ xdr->buf->len -= nbytes;
++ return q;
++}
++EXPORT_SYMBOL_GPL(xdr_rewind_stream);
++
++/**
+ * xdr_write_pages - Insert a list of pages into an XDR buffer for sending
+ * @xdr: pointer to xdr_stream
+ * @pages: list of pages
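xdr_rewind_stream() exists so an encoder can back out of a partially encoded item: sample xdr->p before encoding, and rewind to that checkpoint if the item must be dropped. A sketch; demo_value_rejected() is a made-up late-failure condition:

    static int demo_encode_optional(struct xdr_stream *xdr, u64 value)
    {
        __be32 *chkpt = xdr->p;            /* checkpoint before encoding */
        __be32 *p = xdr_reserve_space(xdr, 12);

        if (!p)
            return -ENOSPC;
        *p++ = cpu_to_be32(1);             /* "present" flag */
        p = xdr_encode_hyper(p, value);
        if (demo_value_rejected(value))    /* made-up late failure */
            xdr_rewind_stream(xdr, chkpt); /* back out the 12 bytes */
        return 0;
    }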