Merge branch 'cifs-3.2' of git://git.samba.org/jlayton/linux into temp-3.2-jeff
author Steve French <smfrench@gmail.com>
Thu, 20 Oct 2011 02:22:41 +0000 (21:22 -0500)
committer Steve French <smfrench@gmail.com>
Thu, 20 Oct 2011 02:22:41 +0000 (21:22 -0500)
fs/cifs/cifsfs.c
fs/cifs/cifsglob.h
fs/cifs/cifspdu.h
fs/cifs/cifsproto.h
fs/cifs/cifssmb.c
fs/cifs/connect.c
fs/cifs/file.c
fs/cifs/transport.c
include/linux/freezer.h

index 96a48ba..f219dcc 100644 (file)
@@ -74,7 +74,7 @@ module_param(cifs_min_small, int, 0);
 MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
                                 "Range: 2 to 256");
 unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0);
+module_param(cifs_max_pending, int, 0444);
 MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
                                   "Default: 50 Range: 2 to 256");
 unsigned short echo_retries = 5;
index 55ebf39..d153d0b 100644 (file)
@@ -291,7 +291,13 @@ struct TCP_Server_Info {
        bool    sec_kerberosu2u;        /* supports U2U Kerberos */
        bool    sec_kerberos;           /* supports plain Kerberos */
        bool    sec_mskerberos;         /* supports legacy MS Kerberos */
+       bool    large_buf;              /* is current buffer large? */
        struct delayed_work     echo; /* echo ping workqueue job */
+       struct kvec *iov;       /* reusable kvec array for receives */
+       unsigned int nr_iov;    /* number of kvecs in array */
+       char    *smallbuf;      /* pointer to current "small" buffer */
+       char    *bigbuf;        /* pointer to current "big" buffer */
+       unsigned int total_read; /* total amount of data read in this pass */
 #ifdef CONFIG_CIFS_FSCACHE
        struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
@@ -650,8 +656,24 @@ static inline void cifs_stats_bytes_read(struct cifs_tcon *tcon,
 struct mid_q_entry;
 
 /*
- * This is the prototype for the mid callback function. When creating one,
- * take special care to avoid deadlocks. Things to bear in mind:
+ * This is the prototype for the mid receive function. This function is for
+ * receiving the rest of the SMB frame, starting with the WordCount (which is
+ * just after the MID in struct smb_hdr). Note:
+ *
+ * - This will be called by cifsd, with no locks held.
+ * - The mid will still be on the pending_mid_q.
+ * - mid->resp_buf will point to the current buffer.
+ *
+ * Returns zero on a successful receive, or an error. The receive state in
+ * the TCP_Server_Info will also be updated.
+ */
+typedef int (mid_receive_t)(struct TCP_Server_Info *server,
+                           struct mid_q_entry *mid);
+
+/*
+ * This is the prototype for the mid callback function. This is called once the
+ * mid has been received off of the socket. When creating one, take special
+ * care to avoid deadlocks. Things to bear in mind:
  *
  * - it will be called by cifsd, with no locks held
  * - the mid will be removed from any lists
@@ -669,9 +691,10 @@ struct mid_q_entry {
        unsigned long when_sent; /* time when smb send finished */
        unsigned long when_received; /* when demux complete (taken off wire) */
 #endif
+       mid_receive_t *receive; /* call receive callback */
        mid_callback_t *callback; /* call completion callback */
        void *callback_data;      /* general purpose pointer for callback */
-       struct smb_hdr *resp_buf;       /* response buffer */
+       struct smb_hdr *resp_buf;       /* pointer to received SMB header */
        int midState;   /* wish this were enum but can not pass to wait_event */
        __u8 command;   /* smb command code */
        bool largeBuf:1;        /* if valid response, is pointer to large buf */
index de3aa28..3c6ef34 100644 (file)
@@ -1089,9 +1089,7 @@ typedef struct smb_com_read_rsp {
        __le16 DataLengthHigh;
        __u64 Reserved2;
        __u16 ByteCount;
-       __u8 Pad;               /* BB check for whether padded to DWORD
-                                  boundary and optimum performance here */
-       char Data[1];
+       /* read response data immediately follows */
 } __attribute__((packed)) READ_RSP;
 
 typedef struct locking_andx_range {
index a1fa9ce..c25d063 100644 (file)
@@ -69,8 +69,9 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
                                        struct TCP_Server_Info *server);
 extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
 extern int cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-                          unsigned int nvec, mid_callback_t *callback,
-                          void *cbdata, bool ignore_pend);
+                          unsigned int nvec, mid_receive_t *receive,
+                          mid_callback_t *callback, void *cbdata,
+                          bool ignore_pend);
 extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
                        struct smb_hdr * /* input */ ,
                        struct smb_hdr * /* out */ ,
@@ -153,6 +154,12 @@ extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
 extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
                                const char *, int);
 
+extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
+extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+                    unsigned int to_read);
+extern int cifs_readv_from_socket(struct TCP_Server_Info *server,
+               struct kvec *iov_orig, unsigned int nr_segs,
+               unsigned int to_read);
 extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
                               struct cifs_sb_info *cifs_sb);
 extern int cifs_match_super(struct super_block *, void *);
@@ -442,6 +449,24 @@ extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
 extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
                        unsigned char *p24);
 
+/* asynchronous read support */
+/*
+ * Per-request state for one asynchronous read. Allocated by
+ * cifs_readdata_alloc() with extra kvecs following the struct, passed to
+ * cifs_call_async() as the mid's callback data, and released by
+ * cifs_readdata_free().
+ */
+struct cifs_readdata {
+       /* open file being read; its netfid goes into the request */
+       struct cifsFileInfo             *cfile;
+       /* mapping whose host inode supplies server_eof during receive */
+       struct address_space            *mapping;
+       /* file offset of the read (sent as OffsetLow/OffsetHigh) */
+       __u64                           offset;
+       /* bytes requested on send; overwritten with bytes read on receive */
+       unsigned int                    bytes;
+       /* pid placed in the SMB header Pid/PidHigh fields */
+       pid_t                           pid;
+       /* 0 or negative errno, set by the receive and callback routines */
+       int                             result;
+       /* pages to fill, linked through page->lru */
+       struct list_head                pages;
+       /* work item that runs cifs_readv_complete() */
+       struct work_struct              work;
+       /* number of iov[] entries currently in use */
+       unsigned int                    nr_iov;
+       /* iov[0] covers the SMB header; one more kvec is allocated per page */
+       struct kvec                     iov[1];
+};
+
+struct cifs_readdata *cifs_readdata_alloc(unsigned int nr_pages);
+void cifs_readdata_free(struct cifs_readdata *rdata);
+int cifs_async_readv(struct cifs_readdata *rdata);
+
 /* asynchronous write support */
 struct cifs_writedata {
        struct kref                     refcount;
index c824c10..aaad4ce 100644 (file)
@@ -33,6 +33,8 @@
 #include <linux/slab.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <asm/uaccess.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -40,6 +42,7 @@
 #include "cifsproto.h"
 #include "cifs_unicode.h"
 #include "cifs_debug.h"
+#include "fscache.h"
 
 #ifdef CONFIG_CIFS_POSIX
 static struct {
@@ -83,6 +86,9 @@ static struct {
 #endif /* CONFIG_CIFS_WEAK_PW_HASH */
 #endif /* CIFS_POSIX */
 
+/* Forward declarations */
+static void cifs_readv_complete(struct work_struct *work);
+
 /* Mark as invalid, all open files on tree connections since they
    were closed when session to server was lost */
 static void mark_open_files_invalid(struct cifs_tcon *pTcon)
@@ -737,7 +743,8 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
        iov.iov_base = smb;
        iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
 
-       rc = cifs_call_async(server, &iov, 1, cifs_echo_callback, server, true);
+       rc = cifs_call_async(server, &iov, 1, NULL, cifs_echo_callback,
+                            server, true);
        if (rc)
                cFYI(1, "Echo request failed: %d", rc);
 
@@ -1374,6 +1381,359 @@ openRetry:
        return rc;
 }
 
+/*
+ * Allocate a zeroed cifs_readdata with room for nr_pages kvecs in addition
+ * to the one embedded in the struct. The page list and the completion work
+ * item are initialized before returning. Returns NULL if allocation fails.
+ */
+struct cifs_readdata *
+cifs_readdata_alloc(unsigned int nr_pages)
+{
+       size_t alloc_size = sizeof(struct cifs_readdata) +
+                           sizeof(struct kvec) * nr_pages;
+       struct cifs_readdata *rd = kzalloc(alloc_size, GFP_KERNEL);
+
+       if (!rd)
+               return NULL;
+
+       INIT_LIST_HEAD(&rd->pages);
+       INIT_WORK(&rd->work, cifs_readv_complete);
+       return rd;
+}
+
+/*
+ * Release a cifs_readdata: hand its cfile to cifsFileInfo_put() (presumably
+ * dropping the reference taken by the submitter -- confirm against callers)
+ * and free the structure itself.
+ */
+void
+cifs_readdata_free(struct cifs_readdata *rdata)
+{
+       struct cifsFileInfo *open_file = rdata->cfile;
+
+       cifsFileInfo_put(open_file);
+       kfree(rdata);
+}
+
+/*
+ * Drain the unread remainder of the current SMB frame from the socket, using
+ * the server's current bigbuf as scratch space, then take the mid off the
+ * pending queue. The mid is marked malformed when rdata->result is non-zero.
+ *
+ * Returns 0 once the frame has been consumed, or the negative value returned
+ * by cifs_read_from_socket() if a read fails (the mid is then left queued).
+ */
+static int
+cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+       struct cifs_readdata *rdata = mid->callback_data;
+       READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+       /* RFC1001 length field does not count its own 4 bytes */
+       unsigned int frame_len = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
+       int left = frame_len - server->total_read;
+       int got;
+
+       for (; left > 0; left -= got) {
+               unsigned int chunk = min_t(unsigned int, left,
+                                       CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+
+               got = cifs_read_from_socket(server, server->bigbuf, chunk);
+               if (got < 0)
+                       return got;
+               server->total_read += got;
+       }
+
+       dequeue_mid(mid, rdata->result);
+       return 0;
+}
+
+/*
+ * Receive routine for an async read: pull the rest of a READ_RSP off the
+ * socket and scatter the payload directly into the pages on rdata->pages.
+ *
+ * Steps:
+ *  - read the remainder of the READ_RSP header into server->smallbuf;
+ *  - map the SMB status to an errno in rdata->result; on error, or on a
+ *    short/invalid header, discard the rest of the frame;
+ *  - read any padding between the header and the data into smallbuf;
+ *  - kmap() each page that will receive data and add it to rdata->iov;
+ *    pages that get no data are unlocked and released here (zero-filled and
+ *    marked uptodate first if they lie beyond the server's EOF);
+ *  - read the payload into the mapped pages, then discard any trailing
+ *    bytes of the frame.
+ *
+ * Pages still on rdata->pages when this returns remain kmapped; they are
+ * unmapped in cifs_readv_complete().
+ *
+ * Returns a negative errno if a socket read fails, the result of
+ * cifs_readv_discard() when the frame had to be drained, or otherwise the
+ * number of payload bytes read (also stored in rdata->bytes).
+ */
+static int
+cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+       int length, len;
+       unsigned int data_offset, remaining, data_len;
+       struct cifs_readdata *rdata = mid->callback_data;
+       READ_RSP *rsp = (READ_RSP *)server->smallbuf;
+       unsigned int rfclen = be32_to_cpu(rsp->hdr.smb_buf_length) + 4;
+       u64 eof;
+       pgoff_t eof_index;
+       struct page *page, *tpage;
+
+       cFYI(1, "%s: mid=%u offset=%llu bytes=%u", __func__,
+               mid->mid, rdata->offset, rdata->bytes);
+
+       /*
+        * read the rest of READ_RSP header (sans Data array), or whatever we
+        * can if there's not enough data. At this point, we've read down to
+        * the Mid.
+        */
+       len = min_t(unsigned int, rfclen, sizeof(*rsp)) -
+                       sizeof(struct smb_hdr) + 1;
+
+       rdata->iov[0].iov_base = server->smallbuf + sizeof(struct smb_hdr) - 1;
+       rdata->iov[0].iov_len = len;
+
+       length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+       if (length < 0)
+               return length;
+       server->total_read += length;
+
+       /* Was the SMB read successful? */
+       rdata->result = map_smb_to_linux_error(&rsp->hdr, false);
+       if (rdata->result != 0) {
+               cFYI(1, "%s: server returned error %d", __func__,
+                       rdata->result);
+               return cifs_readv_discard(server, mid);
+       }
+
+       /* Is there enough to get to the rest of the READ_RSP header? */
+       if (server->total_read < sizeof(READ_RSP)) {
+               cFYI(1, "%s: server returned short header. got=%u expected=%zu",
+                       __func__, server->total_read, sizeof(READ_RSP));
+               rdata->result = -EIO;
+               return cifs_readv_discard(server, mid);
+       }
+
+       data_offset = le16_to_cpu(rsp->DataOffset) + 4;
+       if (data_offset < server->total_read) {
+               /*
+                * win2k8 sometimes sends an offset of 0 when the read
+                * is beyond the EOF. Treat it as if the data starts just after
+                * the header.
+                */
+               cFYI(1, "%s: data offset (%u) inside read response header",
+                       __func__, data_offset);
+               data_offset = server->total_read;
+       } else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+               /* data_offset is beyond the end of smallbuf */
+               cFYI(1, "%s: data offset (%u) beyond end of smallbuf",
+                       __func__, data_offset);
+               rdata->result = -EIO;
+               return cifs_readv_discard(server, mid);
+       }
+
+       cFYI(1, "%s: total_read=%u data_offset=%u", __func__,
+               server->total_read, data_offset);
+
+       len = data_offset - server->total_read;
+       if (len > 0) {
+               /* read any junk before data into the rest of smallbuf */
+               rdata->iov[0].iov_base = server->smallbuf + server->total_read;
+               rdata->iov[0].iov_len = len;
+               length = cifs_readv_from_socket(server, rdata->iov, 1, len);
+               if (length < 0)
+                       return length;
+               server->total_read += length;
+       }
+
+       /* set up first iov for signature check */
+       rdata->iov[0].iov_base = server->smallbuf;
+       rdata->iov[0].iov_len = server->total_read;
+       cFYI(1, "0: iov_base=%p iov_len=%zu",
+               rdata->iov[0].iov_base, rdata->iov[0].iov_len);
+
+       /*
+        * how much data is in the response? Widen to unsigned before the
+        * shift so a high half >= 0x8000 is not shifted into the sign bit
+        * of a promoted int.
+        */
+       data_len = (unsigned int)le16_to_cpu(rsp->DataLengthHigh) << 16;
+       data_len += le16_to_cpu(rsp->DataLength);
+       /*
+        * Compare without adding the two server-supplied values: the sum
+        * could wrap around and let an oversized data_len slip through.
+        */
+       if (data_offset > rfclen || data_len > rfclen - data_offset) {
+               /* data_len is corrupt -- discard frame */
+               rdata->result = -EIO;
+               return cifs_readv_discard(server, mid);
+       }
+
+       /* marshal up the page array */
+       len = 0;
+       remaining = data_len;
+       rdata->nr_iov = 1;
+
+       /* determine the eof that the server (probably) has */
+       eof = CIFS_I(rdata->mapping->host)->server_eof;
+       eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+       cFYI(1, "eof=%llu eof_index=%lu", eof, eof_index);
+
+       list_for_each_entry_safe(page, tpage, &rdata->pages, lru) {
+               if (remaining >= PAGE_CACHE_SIZE) {
+                       /* enough data to fill the page */
+                       rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+                       rdata->iov[rdata->nr_iov].iov_len = PAGE_CACHE_SIZE;
+                       cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+                               rdata->nr_iov, page->index,
+                               rdata->iov[rdata->nr_iov].iov_base,
+                               rdata->iov[rdata->nr_iov].iov_len);
+                       ++rdata->nr_iov;
+                       len += PAGE_CACHE_SIZE;
+                       remaining -= PAGE_CACHE_SIZE;
+               } else if (remaining > 0) {
+                       /* enough for partial page, fill and zero the rest */
+                       rdata->iov[rdata->nr_iov].iov_base = kmap(page);
+                       rdata->iov[rdata->nr_iov].iov_len = remaining;
+                       cFYI(1, "%u: idx=%lu iov_base=%p iov_len=%zu",
+                               rdata->nr_iov, page->index,
+                               rdata->iov[rdata->nr_iov].iov_base,
+                               rdata->iov[rdata->nr_iov].iov_len);
+                       memset(rdata->iov[rdata->nr_iov].iov_base + remaining,
+                               '\0', PAGE_CACHE_SIZE - remaining);
+                       ++rdata->nr_iov;
+                       len += remaining;
+                       remaining = 0;
+               } else if (page->index > eof_index) {
+                       /*
+                        * The VFS will not try to do readahead past the
+                        * i_size, but it's possible that we have outstanding
+                        * writes with gaps in the middle and the i_size hasn't
+                        * caught up yet. Populate those with zeroed out pages
+                        * to prevent the VFS from repeatedly attempting to
+                        * fill them until the writes are flushed.
+                        */
+                       zero_user(page, 0, PAGE_CACHE_SIZE);
+                       list_del(&page->lru);
+                       lru_cache_add_file(page);
+                       flush_dcache_page(page);
+                       SetPageUptodate(page);
+                       unlock_page(page);
+                       page_cache_release(page);
+               } else {
+                       /* no need to hold page hostage */
+                       list_del(&page->lru);
+                       lru_cache_add_file(page);
+                       unlock_page(page);
+                       page_cache_release(page);
+               }
+       }
+
+       /* issue the read if we have any iovecs left to fill */
+       if (rdata->nr_iov > 1) {
+               length = cifs_readv_from_socket(server, &rdata->iov[1],
+                                               rdata->nr_iov - 1, len);
+               if (length < 0)
+                       return length;
+               server->total_read += length;
+       } else {
+               length = 0;
+       }
+
+       rdata->bytes = length;
+
+       cFYI(1, "total_read=%u rfclen=%u remaining=%u", server->total_read,
+               rfclen, remaining);
+
+       /* discard anything left over */
+       if (server->total_read < rfclen)
+               return cifs_readv_discard(server, mid);
+
+       dequeue_mid(mid, false);
+       return length;
+}
+
+/*
+ * Work handler for a finished async read. Every page still on rdata->pages
+ * is moved to the LRU, unmapped and unlocked; on a successful read it is
+ * also flushed, marked uptodate and offered to fscache. Finally the
+ * readdata itself is freed.
+ */
+static void
+cifs_readv_complete(struct work_struct *work)
+{
+       struct cifs_readdata *rdata = container_of(work,
+                                               struct cifs_readdata, work);
+       struct page *pg, *next;
+
+       list_for_each_entry_safe(pg, next, &rdata->pages, lru) {
+               const bool read_ok = (rdata->result == 0);
+
+               list_del(&pg->lru);
+               lru_cache_add_file(pg);
+               kunmap(pg);
+
+               if (read_ok) {
+                       flush_dcache_page(pg);
+                       SetPageUptodate(pg);
+               }
+
+               unlock_page(pg);
+
+               /* only hand good data to the cache, after unlocking */
+               if (read_ok)
+                       cifs_readpage_to_fscache(rdata->mapping->host, pg);
+
+               page_cache_release(pg);
+       }
+       cifs_readdata_free(rdata);
+}
+
+/*
+ * Mid completion callback for async reads. Translates the final mid state
+ * into rdata->result (verifying the signature and accounting the bytes on
+ * a received response), queues the completion work, then deletes the mid
+ * and wakes anyone waiting on the server's request queue.
+ */
+static void
+cifs_readv_callback(struct mid_q_entry *mid)
+{
+       struct cifs_readdata *rdata = mid->callback_data;
+       struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+       struct TCP_Server_Info *server = tcon->ses->server;
+       int state;
+
+       cFYI(1, "%s: mid=%u state=%d result=%d bytes=%u", __func__,
+               mid->mid, mid->midState, rdata->result, rdata->bytes);
+
+       state = mid->midState;
+       if (state == MID_RESPONSE_RECEIVED) {
+               /* result already set, check signature */
+               if ((server->sec_mode &
+                    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) &&
+                   cifs_verify_signature(rdata->iov, rdata->nr_iov,
+                                         server, mid->sequence_number + 1))
+                       cERROR(1, "Unexpected SMB signature");
+
+               /* FIXME: should this be counted toward the initiating task? */
+               task_io_account_read(rdata->bytes);
+               cifs_stats_bytes_read(tcon, rdata->bytes);
+       } else if (state == MID_REQUEST_SUBMITTED ||
+                  state == MID_RETRY_NEEDED) {
+               rdata->result = -EAGAIN;
+       } else {
+               rdata->result = -EIO;
+       }
+
+       queue_work(system_nrt_wq, &rdata->work);
+       DeleteMidQEntry(mid);
+       atomic_dec(&server->inFlight);
+       wake_up(&server->request_q);
+}
+
+/*
+ * cifs_async_readv - send an async read, and set up mid to handle result
+ *
+ * Builds a READ_ANDX request from rdata (offset, byte count, pid, file id)
+ * and submits it through cifs_call_async() with cifs_readv_receive as the
+ * receive routine and cifs_readv_callback as the completion callback.
+ * Returns -EIO if the offset needs more than 32 bits but the session lacks
+ * CAP_LARGE_FILES, otherwise the result of request setup or submission.
+ */
+int
+cifs_async_readv(struct cifs_readdata *rdata)
+{
+       struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
+       READ_REQ *req = NULL;
+       int wct;
+       int rc;
+
+       cFYI(1, "%s: offset=%llu bytes=%u", __func__,
+               rdata->offset, rdata->bytes);
+
+       if (tcon->ses->capabilities & CAP_LARGE_FILES) {
+               wct = 12;
+       } else if ((rdata->offset >> 32) > 0) {
+               /* can not handle this big offset for old */
+               return -EIO;
+       } else {
+               wct = 10; /* old style read */
+       }
+
+       rc = small_smb_init(SMB_COM_READ_ANDX, wct, tcon, (void **)&req);
+       if (rc)
+               return rc;
+
+       req->hdr.Pid = cpu_to_le16((__u16)rdata->pid);
+       req->hdr.PidHigh = cpu_to_le16((__u16)(rdata->pid >> 16));
+
+       req->AndXCommand = 0xFF;        /* none */
+       req->Fid = rdata->cfile->netfid;
+       req->OffsetLow = cpu_to_le32(rdata->offset & 0xFFFFFFFF);
+       if (wct == 12)
+               req->OffsetHigh = cpu_to_le32(rdata->offset >> 32);
+       req->Remaining = 0;
+       req->MaxCount = cpu_to_le16(rdata->bytes & 0xFFFF);
+       req->MaxCountHigh = cpu_to_le32(rdata->bytes >> 16);
+       if (wct == 12) {
+               req->ByteCount = 0;
+       } else {
+               /* old style read */
+               struct smb_com_readx_req *old_req =
+                       (struct smb_com_readx_req *)req;
+
+               old_req->ByteCount = 0;
+       }
+
+       /* 4 for RFC1001 length + 1 for BCC */
+       rdata->iov[0].iov_base = req;
+       rdata->iov[0].iov_len = be32_to_cpu(req->hdr.smb_buf_length) + 4;
+
+       rc = cifs_call_async(tcon->ses->server, rdata->iov, 1,
+                            cifs_readv_receive, cifs_readv_callback,
+                            rdata, false);
+       if (rc == 0)
+               cifs_stats_inc(&tcon->num_reads);
+
+       cifs_small_buf_release(req);
+       return rc;
+}
+
 int
 CIFSSMBRead(const int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes,
            char **buf, int *pbuf_type)
@@ -1834,7 +2194,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
 
        kref_get(&wdata->refcount);
        rc = cifs_call_async(tcon->ses->server, iov, wdata->nr_pages + 1,
-                            cifs_writev_callback, wdata, false);
+                            NULL, cifs_writev_callback, wdata, false);
 
        if (rc == 0)
                cifs_stats_inc(&tcon->num_writes);
index 97a65af..f70d87d 100644 (file)
@@ -320,27 +320,24 @@ requeue_echo:
 }
 
 static bool
-allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
-                bool is_large_buf)
+allocate_buffers(struct TCP_Server_Info *server)
 {
-       char *bbuf = *bigbuf, *sbuf = *smallbuf;
-
-       if (bbuf == NULL) {
-               bbuf = (char *)cifs_buf_get();
-               if (!bbuf) {
+       if (!server->bigbuf) {
+               server->bigbuf = (char *)cifs_buf_get();
+               if (!server->bigbuf) {
                        cERROR(1, "No memory for large SMB response");
                        msleep(3000);
                        /* retry will check if exiting */
                        return false;
                }
-       } else if (is_large_buf) {
+       } else if (server->large_buf) {
                /* we are reusing a dirty large buf, clear its start */
-               memset(bbuf, 0, size);
+               memset(server->bigbuf, 0, sizeof(struct smb_hdr));
        }
 
-       if (sbuf == NULL) {
-               sbuf = (char *)cifs_small_buf_get();
-               if (!sbuf) {
+       if (!server->smallbuf) {
+               server->smallbuf = (char *)cifs_small_buf_get();
+               if (!server->smallbuf) {
                        cERROR(1, "No memory for SMB response");
                        msleep(1000);
                        /* retry will check if exiting */
@@ -349,12 +346,9 @@ allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
                /* beginning of smb buffer is cleared in our buf_get */
        } else {
                /* if existing small buf clear beginning */
-               memset(sbuf, 0, size);
+               memset(server->smallbuf, 0, sizeof(struct smb_hdr));
        }
 
-       *bigbuf = bbuf;
-       *smallbuf = sbuf;
-
        return true;
 }
 
@@ -375,14 +369,72 @@ server_unresponsive(struct TCP_Server_Info *server)
        return false;
 }
 
-static int
-read_from_socket(struct TCP_Server_Info *server, char *buf,
-                unsigned int to_read)
+/*
+ * kvec_array_init - clone a kvec array, and advance into it
+ * @new:       pointer to memory for cloned array
+ * @iov:       pointer to original array
+ * @nr_segs:   number of members in original array
+ * @bytes:     number of bytes to advance into the cloned array
+ *
+ * This function will copy the array provided in iov to a section of memory
+ * and advance the specified number of bytes into the new array. It returns
+ * the number of segments in the new array. "new" must be at least as big as
+ * the original iov array.
+ *
+ * The caller must ensure @bytes is smaller than the total length of @iov;
+ * the skip loop below does not check @nr_segs before dereferencing.
+ */
+static unsigned int
+kvec_array_init(struct kvec *new, struct kvec *iov, unsigned int nr_segs,
+               size_t bytes)
+{
+       size_t base = 0;
+
+       /* skip fully-consumed (and empty) segments */
+       while (bytes || !iov->iov_len) {
+               /* size_t, not int: both operands of min() are size_t */
+               size_t copy = min(bytes, iov->iov_len);
+
+               bytes -= copy;
+               base += copy;
+               if (iov->iov_len == base) {
+                       iov++;
+                       nr_segs--;
+                       base = 0;
+               }
+       }
+       memcpy(new, iov, sizeof(*iov) * nr_segs);
+       /* first remaining segment may be partially consumed */
+       new->iov_base += base;
+       new->iov_len -= base;
+       return nr_segs;
+}
+
+/*
+ * Return the server's scratch kvec array, growing it if it has fewer than
+ * nr_segs entries. On allocation failure NULL is returned and the existing
+ * array (if any) is left untouched.
+ */
+static struct kvec *
+get_server_iovec(struct TCP_Server_Info *server, unsigned int nr_segs)
+{
+       struct kvec *grown;
+
+       /* the cached array is reusable as long as it has enough slots */
+       if (server->iov != NULL && server->nr_iov >= nr_segs)
+               return server->iov;
+
+       grown = kmalloc(sizeof(*grown) * nr_segs, GFP_NOFS);
+       if (grown == NULL)
+               return NULL;
+
+       /* swap in the larger array and drop the old one */
+       kfree(server->iov);
+       server->iov = grown;
+       server->nr_iov = nr_segs;
+       return grown;
+}
+
+int
+cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
+                      unsigned int nr_segs, unsigned int to_read)
 {
        int length = 0;
        int total_read;
+       unsigned int segs;
        struct msghdr smb_msg;
-       struct kvec iov;
+       struct kvec *iov;
+
+       iov = get_server_iovec(server, nr_segs);
+       if (!iov)
+               return -ENOMEM;
 
        smb_msg.msg_control = NULL;
        smb_msg.msg_controllen = 0;
@@ -393,10 +445,11 @@ read_from_socket(struct TCP_Server_Info *server, char *buf,
                        break;
                }
 
-               iov.iov_base = buf + total_read;
-               iov.iov_len = to_read;
-               length = kernel_recvmsg(server->ssocket, &smb_msg, &iov, 1,
-                                       to_read, 0);
+               segs = kvec_array_init(iov, iov_orig, nr_segs, total_read);
+
+               length = kernel_recvmsg(server->ssocket, &smb_msg,
+                                       iov, segs, to_read, 0);
+
                if (server->tcpStatus == CifsExiting) {
                        total_read = -ESHUTDOWN;
                        break;
@@ -426,6 +479,18 @@ read_from_socket(struct TCP_Server_Info *server, char *buf,
        return total_read;
 }
 
+/*
+ * Single-buffer convenience wrapper: read to_read bytes into buf by
+ * describing it as a one-element kvec array for cifs_readv_from_socket().
+ * Returns whatever cifs_readv_from_socket() returns.
+ */
+int
+cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
+                     unsigned int to_read)
+{
+       struct kvec single = {
+               .iov_base = buf,
+               .iov_len = to_read,
+       };
+
+       return cifs_readv_from_socket(server, &single, 1, to_read);
+}
+
 static bool
 is_smb_response(struct TCP_Server_Info *server, unsigned char type)
 {
@@ -471,61 +536,76 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
 }
 
+/*
+ * Look up the pending request that a received SMB header answers. Walks
+ * server->pending_mid_q under GlobalMid_Lock and returns the first entry in
+ * state MID_REQUEST_SUBMITTED whose mid and command match those in buf, or
+ * NULL if there is none. The entry is left on the queue and the lock is
+ * dropped before returning.
+ */
 static struct mid_q_entry *
-find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
-             int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
+find_mid(struct TCP_Server_Info *server, struct smb_hdr *buf)
 {
-       struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
+       struct mid_q_entry *mid;
 
        spin_lock(&GlobalMid_Lock);
-       list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
-               if (mid->mid != buf->Mid ||
-                   mid->midState != MID_REQUEST_SUBMITTED ||
-                   mid->command != buf->Command)
-                       continue;
-
-               if (*length == 0 && check2ndT2(buf) > 0) {
-                       /* We have a multipart transact2 resp */
-                       *is_multi_rsp = true;
-                       if (mid->resp_buf) {
-                               /* merge response - fix up 1st*/
-                               *length = coalesce_t2(buf, mid->resp_buf);
-                               if (*length > 0) {
-                                       *length = 0;
-                                       mid->multiRsp = true;
-                                       break;
-                               }
-                               /* All parts received or packet is malformed. */
-                               mid->multiEnd = true;
-                               goto multi_t2_fnd;
-                       }
-                       if (!is_large_buf) {
-                               /*FIXME: switch to already allocated largebuf?*/
-                               cERROR(1, "1st trans2 resp needs bigbuf");
-                       } else {
-                               /* Have first buffer */
-                               mid->resp_buf = buf;
-                               mid->largeBuf = true;
-                               *bigbuf = NULL;
-                       }
-                       break;
+       list_for_each_entry(mid, &server->pending_mid_q, qhead) {
+               if (mid->mid == buf->Mid &&
+                   mid->midState == MID_REQUEST_SUBMITTED &&
+                   mid->command == buf->Command) {
+                       spin_unlock(&GlobalMid_Lock);
+                       return mid;
                }
-               mid->resp_buf = buf;
-               mid->largeBuf = is_large_buf;
-multi_t2_fnd:
-               if (*length == 0)
-                       mid->midState = MID_RESPONSE_RECEIVED;
-               else
-                       mid->midState = MID_RESPONSE_MALFORMED;
+       }
+       spin_unlock(&GlobalMid_Lock);
+       return NULL;
+}
+
+/*
+ * Take a mid off whatever list it is on and record how its response turned
+ * out: MID_RESPONSE_MALFORMED if malformed is set, MID_RESPONSE_RECEIVED
+ * otherwise. The state change and unlink happen under GlobalMid_Lock. With
+ * CONFIG_CIFS_STATS2 the receive time is stamped first.
+ */
+void
+dequeue_mid(struct mid_q_entry *mid, bool malformed)
+{
 #ifdef CONFIG_CIFS_STATS2
-               mid->when_received = jiffies;
+       mid->when_received = jiffies;
 #endif
-               list_del_init(&mid->qhead);
-               ret = mid;
-               break;
-       }
+       spin_lock(&GlobalMid_Lock);
+       if (!malformed)
+               mid->midState = MID_RESPONSE_RECEIVED;
+       else
+               mid->midState = MID_RESPONSE_MALFORMED;
+       list_del_init(&mid->qhead);
        spin_unlock(&GlobalMid_Lock);
+}
 
-       return ret;
+/*
+ * Attach a fully received response buffer to its mid and dequeue the mid.
+ *
+ * @mid:       pending request the response belongs to
+ * @server:    connection the response arrived on
+ * @buf:       received SMB (in server->smallbuf or server->bigbuf)
+ * @malformed: zero if the response passed validation, non-zero otherwise
+ *
+ * A multi-part transact2 response is handled specially: the first part must
+ * be in the large buffer and is kept on the mid without dequeueing; later
+ * parts are merged into it with coalesce_t2(), and the mid is dequeued only
+ * when that returns a value <= 0 (all parts received, or malformed).
+ *
+ * For ordinary responses the buffer is handed to the mid and the matching
+ * server buffer pointer is cleared so it is not reused.
+ */
+static void
+handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+          struct smb_hdr *buf, int malformed)
+{
+       if (malformed == 0 && check2ndT2(buf) > 0) {
+               mid->multiRsp = true;
+               if (mid->resp_buf) {
+                       /* merge response - fix up 1st*/
+                       malformed = coalesce_t2(buf, mid->resp_buf);
+                       if (malformed > 0)
+                               return;
+
+                       /* All parts received or packet is malformed. */
+                       mid->multiEnd = true;
+                       /* void function: call, then return with no value */
+                       dequeue_mid(mid, malformed);
+                       return;
+               }
+               if (!server->large_buf) {
+                       /*FIXME: switch to already allocated largebuf?*/
+                       cERROR(1, "1st trans2 resp needs bigbuf");
+               } else {
+                       /* Have first buffer */
+                       mid->resp_buf = buf;
+                       mid->largeBuf = true;
+                       server->bigbuf = NULL;
+               }
+               return;
+       }
+       mid->resp_buf = buf;
+       mid->largeBuf = server->large_buf;
+       /* Was previous buf put in mpx struct for multi-rsp? */
+       if (!mid->multiRsp) {
+               /* smb buffer will be freed by user thread */
+               if (server->large_buf)
+                       server->bigbuf = NULL;
+               else
+                       server->smallbuf = NULL;
+       }
+       dequeue_mid(mid, malformed);
 }
 
 static void clean_demultiplex_info(struct TCP_Server_Info *server)
@@ -615,6 +695,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
        }
 
        kfree(server->hostname);
+       kfree(server->iov);
        kfree(server);
 
        length = atomic_dec_return(&tcpSesAllocCount);
@@ -624,17 +705,70 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
 }
 
+/*
+ * Default receive handler: read the remainder of the current frame into
+ * server->smallbuf (or server->bigbuf when it will not fit in a small one),
+ * validate it with checkSMB() and, if a mid was supplied, pass the result on
+ * to handle_mid().
+ *
+ * The first sizeof(struct smb_hdr) - 1 bytes are expected to already be in
+ * server->smallbuf, with server->total_read accounting for them; the read
+ * below resumes at that offset.
+ *
+ * Returns -EAGAIN after kicking off a reconnect when the advertised length
+ * is too big for a large buffer, a negative value if the socket read fails,
+ * and otherwise the checkSMB() result (non-zero is treated as a bad frame
+ * here).
+ */
 static int
+standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+       int length;
+       char *buf = server->smallbuf;
+       struct smb_hdr *smb_buffer = (struct smb_hdr *)buf;
+       unsigned int pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
+
+       /* make sure this will fit in a large buffer */
+       if (pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+               cERROR(1, "SMB response too long (%u bytes)",
+                       pdu_length);
+               cifs_reconnect(server);
+               wake_up(&server->response_q);
+               return -EAGAIN;
+       }
+
+       /*
+        * switch to large buffer if too big for a small one, carrying over
+        * the bytes that have already been read
+        */
+       if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
+               server->large_buf = true;
+               memcpy(server->bigbuf, server->smallbuf, server->total_read);
+               buf = server->bigbuf;
+               smb_buffer = (struct smb_hdr *)buf;
+       }
+
+       /*
+        * now read the rest: the whole frame is pdu_length + 4 bytes and
+        * sizeof(struct smb_hdr) - 1 of them are already in the buffer
+        */
+       length = cifs_read_from_socket(server,
+                         buf + sizeof(struct smb_hdr) - 1,
+                         pdu_length - sizeof(struct smb_hdr) + 1 + 4);
+       if (length < 0)
+               return length;
+       server->total_read += length;
+
+       dump_smb(smb_buffer, server->total_read);
+
+       /*
+        * The header down to the MID was read before this function was
+        * called, and the rest of the frame has now arrived. Check to see
+        * if the rest of the header is OK. We reuse the length var for the
+        * result to avoid a new stack var.
+        *
+        * 48 bytes is enough to display the header and a little bit
+        * into the payload for debugging purposes.
+        */
+       length = checkSMB(smb_buffer, smb_buffer->Mid, server->total_read);
+       if (length != 0)
+               cifs_dump_mem("Bad SMB: ", buf,
+                       min_t(unsigned int, server->total_read, 48));
+
+       /* with no mid to attach to, the caller decides what to do next */
+       if (mid)
+               handle_mid(mid, server, smb_buffer, length);
+
+       return length;
+}
+
+static int
 cifs_demultiplex_thread(void *p)
 {
        int length;
        struct TCP_Server_Info *server = p;
-       unsigned int pdu_length, total_read;
-       char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
+       unsigned int pdu_length;
+       char *buf = NULL;
        struct smb_hdr *smb_buffer = NULL;
        struct task_struct *task_to_wake = NULL;
        struct mid_q_entry *mid_entry;
-       bool isLargeBuf = false;
-       bool isMultiRsp = false;
 
        current->flags |= PF_MEMALLOC;
        cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -649,20 +783,18 @@ cifs_demultiplex_thread(void *p)
                if (try_to_freeze())
                        continue;
 
-               if (!allocate_buffers(&bigbuf, &smallbuf,
-                                     sizeof(struct smb_hdr), isLargeBuf))
+               if (!allocate_buffers(server))
                        continue;
 
-               isLargeBuf = false;
-               isMultiRsp = false;
-               smb_buffer = (struct smb_hdr *)smallbuf;
-               buf = smallbuf;
+               server->large_buf = false;
+               smb_buffer = (struct smb_hdr *)server->smallbuf;
+               buf = server->smallbuf;
                pdu_length = 4; /* enough to get RFC1001 header */
 
-               length = read_from_socket(server, buf, pdu_length);
+               length = cifs_read_from_socket(server, buf, pdu_length);
                if (length < 0)
                        continue;
-               total_read = length;
+               server->total_read = length;
 
                /*
                 * The right amount was read from socket - 4 bytes,
@@ -674,64 +806,42 @@ cifs_demultiplex_thread(void *p)
                if (!is_smb_response(server, buf[0]))
                        continue;
 
-               /* check the length */
-               if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
-                   (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
-                       cERROR(1, "Invalid size SMB length %d pdu_length %d",
-                              4, pdu_length + 4);
+               /* make sure we have enough to get to the MID */
+               if (pdu_length < sizeof(struct smb_hdr) - 1 - 4) {
+                       cERROR(1, "SMB response too short (%u bytes)",
+                               pdu_length);
                        cifs_reconnect(server);
                        wake_up(&server->response_q);
                        continue;
                }
 
-               /* else length ok */
-               if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
-                       isLargeBuf = true;
-                       memcpy(bigbuf, smallbuf, 4);
-                       smb_buffer = (struct smb_hdr *)bigbuf;
-                       buf = bigbuf;
-               }
-
-               length = read_from_socket(server, buf + 4, pdu_length);
+               /* read down to the MID */
+               length = cifs_read_from_socket(server, buf + 4,
+                                       sizeof(struct smb_hdr) - 1 - 4);
                if (length < 0)
                        continue;
-               total_read += length;
+               server->total_read += length;
 
-               dump_smb(smb_buffer, total_read);
+               mid_entry = find_mid(server, smb_buffer);
 
-               /*
-                * We know that we received enough to get to the MID as we
-                * checked the pdu_length earlier. Now check to see
-                * if the rest of the header is OK. We borrow the length
-                * var for the rest of the loop to avoid a new stack var.
-                *
-                * 48 bytes is enough to display the header and a little bit
-                * into the payload for debugging purposes.
-                */
-               length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
-               if (length != 0)
-                       cifs_dump_mem("Bad SMB: ", buf,
-                                     min_t(unsigned int, total_read, 48));
+               if (!mid_entry || !mid_entry->receive)
+                       length = standard_receive3(server, mid_entry);
+               else
+                       length = mid_entry->receive(server, mid_entry);
 
-               server->lstrp = jiffies;
+               if (length < 0)
+                       continue;
 
-               mid_entry = find_cifs_mid(server, smb_buffer, &length,
-                                         isLargeBuf, &isMultiRsp, &bigbuf);
+               if (server->large_buf) {
+                       buf = server->bigbuf;
+                       smb_buffer = (struct smb_hdr *)buf;
+               }
+
+               server->lstrp = jiffies;
                if (mid_entry != NULL) {
-                       mid_entry->callback(mid_entry);
-                       /* Was previous buf put in mpx struct for multi-rsp? */
-                       if (!isMultiRsp) {
-                               /* smb buffer will be freed by user thread */
-                               if (isLargeBuf)
-                                       bigbuf = NULL;
-                               else
-                                       smallbuf = NULL;
-                       }
-               } else if (length != 0) {
-                       /* response sanity checks failed */
-                       continue;
-               } else if (!is_valid_oplock_break(smb_buffer, server) &&
-                          !isMultiRsp) {
+                       if (!mid_entry->multiRsp || mid_entry->multiEnd)
+                               mid_entry->callback(mid_entry);
+               } else if (!is_valid_oplock_break(smb_buffer, server)) {
                        cERROR(1, "No task to wake, unknown frame received! "
                                   "NumMids %d", atomic_read(&midCount));
                        cifs_dump_mem("Received Data is: ", buf,
@@ -745,9 +855,9 @@ cifs_demultiplex_thread(void *p)
        } /* end while !EXITING */
 
        /* buffer usually freed in free_mid - need to free it here on exit */
-       cifs_buf_release(bigbuf);
-       if (smallbuf) /* no sense logging a debug message if NULL */
-               cifs_small_buf_release(smallbuf);
+       cifs_buf_release(server->bigbuf);
+       if (server->smallbuf) /* no sense logging a debug message if NULL */
+               cifs_small_buf_release(server->smallbuf);
 
        task_to_wake = xchg(&server->tsk, NULL);
        clean_demultiplex_info(server);
@@ -2200,16 +2310,16 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
            (new->mnt_cifs_flags & CIFS_MOUNT_MASK))
                return 0;
 
-       if (old->rsize != new->rsize)
-               return 0;
-
        /*
-        * We want to share sb only if we don't specify wsize or specified wsize
-        * is greater or equal than existing one.
+        * We want to share sb only if we don't specify an r/wsize or
+        * specified r/wsize is greater than or equal to existing one.
         */
        if (new->wsize && new->wsize < old->wsize)
                return 0;
 
+       if (new->rsize && new->rsize < old->rsize)
+               return 0;
+
        if (old->mnt_uid != new->mnt_uid || old->mnt_gid != new->mnt_gid)
                return 0;
 
@@ -2647,14 +2757,6 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon,
                                        CIFS_MOUNT_POSIX_PATHS;
                }
 
-               if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) {
-                       if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) {
-                               cifs_sb->rsize = 127 * 1024;
-                               cFYI(DBG2, "larger reads not supported by srv");
-                       }
-               }
-
-
                cFYI(1, "Negotiate caps 0x%x", (int)cap);
 #ifdef CONFIG_CIFS_DEBUG2
                if (cap & CIFS_UNIX_FCNTL_CAP)
@@ -2699,27 +2801,11 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
        spin_lock_init(&cifs_sb->tlink_tree_lock);
        cifs_sb->tlink_tree = RB_ROOT;
 
-       if (pvolume_info->rsize > CIFSMaxBufSize) {
-               cERROR(1, "rsize %d too large, using MaxBufSize",
-                       pvolume_info->rsize);
-               cifs_sb->rsize = CIFSMaxBufSize;
-       } else if ((pvolume_info->rsize) &&
-                       (pvolume_info->rsize <= CIFSMaxBufSize))
-               cifs_sb->rsize = pvolume_info->rsize;
-       else /* default */
-               cifs_sb->rsize = CIFSMaxBufSize;
-
-       if (cifs_sb->rsize < 2048) {
-               cifs_sb->rsize = 2048;
-               /* Windows ME may prefer this */
-               cFYI(1, "readsize set to minimum: 2048");
-       }
-
        /*
-        * Temporarily set wsize for matching superblock. If we end up using
-        * new sb then cifs_negotiate_wsize will later negotiate it downward
-        * if needed.
+        * Temporarily set r/wsize for matching superblock. If we end up using
+        * new sb then client will later negotiate it downward if needed.
         */
+       cifs_sb->rsize = pvolume_info->rsize;
        cifs_sb->wsize = pvolume_info->wsize;
 
        cifs_sb->mnt_uid = pvolume_info->linux_uid;
@@ -2794,29 +2880,41 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 }
 
 /*
- * When the server supports very large writes via POSIX extensions, we can
- * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including
- * the RFC1001 length.
+ * When the server supports very large reads and writes via POSIX extensions,
+ * we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
+ * including the RFC1001 length.
  *
  * Note that this might make for "interesting" allocation problems during
  * writeback however as we have to allocate an array of pointers for the
  * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096.
+ *
+ * For reads, there is a similar problem as we need to allocate an array
+ * of kvecs to handle the receive, though that should only need to be done
+ * once.
  */
 #define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RSIZE ((1<<24) - sizeof(READ_RSP) + 4)
 
 /*
- * When the server doesn't allow large posix writes, only allow a wsize of
- * 2^17-1 minus the size of the WRITE_AND_X header. That allows for a write up
- * to the maximum size described by RFC1002.
+ * When the server doesn't allow large posix reads/writes, only allow an
+ * rsize/wsize of 2^17-1 minus the size of the call header. That allows for a
+ * read or write up to the maximum size described by RFC1002.
  */
 #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4)
+#define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4)
 
 /*
  * The default wsize is 1M. find_get_pages seems to return a maximum of 256
  * pages in a single call. With PAGE_CACHE_SIZE == 4k, this means we can fill
  * a single wsize request with a single call.
  */
-#define CIFS_DEFAULT_WSIZE (1024 * 1024)
+#define CIFS_DEFAULT_IOSIZE (1024 * 1024)
+
+/*
+ * Windows only supports a max of 60k reads. Default to that when posix
+ * extensions aren't in force.
+ */
+#define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
 
 static unsigned int
 cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
@@ -2824,7 +2922,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
        __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
        struct TCP_Server_Info *server = tcon->ses->server;
        unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
-                               CIFS_DEFAULT_WSIZE;
+                               CIFS_DEFAULT_IOSIZE;
 
        /* can server support 24-bit write sizes? (via UNIX extensions) */
        if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -2847,6 +2945,50 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
        return wsize;
 }
 
+/*
+ * Work out the rsize to use for this tcon: the value given in the mount
+ * options if there was one, otherwise a default picked from the server's
+ * advertised capabilities. The result is then clamped to CIFSMaxBufSize when
+ * the server lacks CAP_LARGE_READ_X, and to CIFS_MAX_RSIZE in all cases.
+ */
+static unsigned int
+cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
+{
+       __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+       struct TCP_Server_Info *server = tcon->ses->server;
+       unsigned int rsize, defsize;
+
+       /*
+        * Set default value...
+        *
+        * HACK alert! Ancient servers have very small buffers. Even though
+        * MS-CIFS indicates that servers are only limited by the client's
+        * bufsize for reads, testing against win98se shows that it throws
+        * INVALID_PARAMETER errors if you try to request too large a read.
+        *
+        * If the server advertises a MaxBufferSize of less than one page,
+        * assume that it also can't satisfy reads larger than that either.
+        *
+        * FIXME: Is there a better heuristic for this?
+        *
+        * NOTE(review): the final branch below would wrap if server->maxBuf
+        * were smaller than sizeof(READ_RSP); the CIFSMaxBufSize clamp
+        * further down would then be the effective limit. Confirm maxBuf
+        * can never be that small.
+        */
+       if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_READ_CAP))
+               defsize = CIFS_DEFAULT_IOSIZE;
+       else if (server->capabilities & CAP_LARGE_READ_X)
+               defsize = CIFS_DEFAULT_NON_POSIX_RSIZE;
+       else if (server->maxBuf >= PAGE_CACHE_SIZE)
+               defsize = CIFSMaxBufSize;
+       else
+               defsize = server->maxBuf - sizeof(READ_RSP);
+
+       /* an rsize given in the mount options overrides the default */
+       rsize = pvolume_info->rsize ? pvolume_info->rsize : defsize;
+
+       /*
+        * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
+        * the client's MaxBufferSize.
+        */
+       if (!(server->capabilities & CAP_LARGE_READ_X))
+               rsize = min_t(unsigned int, CIFSMaxBufSize, rsize);
+
+       /* hard limit of CIFS_MAX_RSIZE */
+       rsize = min_t(unsigned int, rsize, CIFS_MAX_RSIZE);
+
+       return rsize;
+}
+
 static int
 is_path_accessible(int xid, struct cifs_tcon *tcon,
                   struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -3040,6 +3182,22 @@ cifs_get_volume_info(char *mount_data, const char *devname)
        return volume_info;
 }
 
+/*
+ * Pick a readahead window, in pages, to go with this superblock's rsize:
+ *
+ *  - rsize covers at least the default window: use the default window as is
+ *  - rsize is smaller than one page: return 0
+ *  - otherwise: round the default window down to a whole number of
+ *    rsize-sized reads, so that ra_pages is a multiple of rsize (in pages)
+ */
+static inline unsigned int
+cifs_ra_pages(struct cifs_sb_info *cifs_sb)
+{
+       unsigned int reads;
+       unsigned int rsize_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+
+       if (rsize_pages >= default_backing_dev_info.ra_pages)
+               return default_backing_dev_info.ra_pages;
+       else if (rsize_pages == 0)
+               return rsize_pages;
+
+       /* number of full rsize reads that fit in the default window */
+       reads = default_backing_dev_info.ra_pages / rsize_pages;
+       return reads * rsize_pages;
+}
+
 int
 cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
 {
@@ -3058,8 +3216,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
        if (rc)
                return rc;
 
-       cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-
 #ifdef CONFIG_CIFS_DFS_UPCALL
 try_mount_again:
        /* cleanup activities if we're chasing a referral */
@@ -3124,14 +3280,11 @@ try_mount_again:
                CIFSSMBQFSAttributeInfo(xid, tcon);
        }
 
-       if ((tcon->unix_ext == 0) && (cifs_sb->rsize > (1024 * 127))) {
-               cifs_sb->rsize = 1024 * 127;
-               cFYI(DBG2, "no very large read support, rsize now 127K");
-       }
-       if (!(tcon->ses->capabilities & CAP_LARGE_READ_X))
-               cifs_sb->rsize = min(cifs_sb->rsize, CIFSMaxBufSize);
-
        cifs_sb->wsize = cifs_negotiate_wsize(tcon, volume_info);
+       cifs_sb->rsize = cifs_negotiate_rsize(tcon, volume_info);
+
+       /* tune readahead according to rsize */
+       cifs_sb->bdi.ra_pages = cifs_ra_pages(cifs_sb);
 
 remote_path_check:
 #ifdef CONFIG_CIFS_DFS_UPCALL
index 852d1f3..a3b545f 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/delay.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
+#include <linux/swap.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -1757,6 +1758,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
        struct smb_com_read_rsp *pSMBr;
        struct cifs_io_parms io_parms;
        char *read_data;
+       unsigned int rsize;
        __u32 pid;
 
        if (!nr_segs)
@@ -1769,6 +1771,9 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
+       /* FIXME: set up handlers for larger reads and/or convert to async */
+       rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
        open_file = file->private_data;
        pTcon = tlink_tcon(open_file->tlink);
 
@@ -1781,7 +1786,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
                cFYI(1, "attempting read on write only file instance");
 
        for (total_read = 0; total_read < len; total_read += bytes_read) {
-               cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize);
+               cur_len = min_t(const size_t, len - total_read, rsize);
                rc = -EAGAIN;
                read_data = NULL;
 
@@ -1873,6 +1878,7 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        unsigned int bytes_read = 0;
        unsigned int total_read;
        unsigned int current_read_size;
+       unsigned int rsize;
        struct cifs_sb_info *cifs_sb;
        struct cifs_tcon *pTcon;
        int xid;
@@ -1885,6 +1891,9 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        xid = GetXid();
        cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 
+       /* FIXME: set up handlers for larger reads and/or convert to async */
+       rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize);
+
        if (file->private_data == NULL) {
                rc = -EBADF;
                FreeXid(xid);
@@ -1904,8 +1913,8 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
        for (total_read = 0, current_offset = read_data;
             read_size > total_read;
             total_read += bytes_read, current_offset += bytes_read) {
-               current_read_size = min_t(uint, read_size - total_read,
-                                         cifs_sb->rsize);
+               current_read_size = min_t(uint, read_size - total_read, rsize);
+
                /* For windows me and 9x we do not want to request more
                than it negotiated since it will refuse the read then */
                if ((pTcon->ses) &&
@@ -2000,82 +2009,24 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
        return rc;
 }
 
-
-static void cifs_copy_cache_pages(struct address_space *mapping,
-       struct list_head *pages, int bytes_read, char *data)
-{
-       struct page *page;
-       char *target;
-
-       while (bytes_read > 0) {
-               if (list_empty(pages))
-                       break;
-
-               page = list_entry(pages->prev, struct page, lru);
-               list_del(&page->lru);
-
-               if (add_to_page_cache_lru(page, mapping, page->index,
-                                     GFP_KERNEL)) {
-                       page_cache_release(page);
-                       cFYI(1, "Add page cache failed");
-                       data += PAGE_CACHE_SIZE;
-                       bytes_read -= PAGE_CACHE_SIZE;
-                       continue;
-               }
-               page_cache_release(page);
-
-               target = kmap_atomic(page, KM_USER0);
-
-               if (PAGE_CACHE_SIZE > bytes_read) {
-                       memcpy(target, data, bytes_read);
-                       /* zero the tail end of this partial page */
-                       memset(target + bytes_read, 0,
-                              PAGE_CACHE_SIZE - bytes_read);
-                       bytes_read = 0;
-               } else {
-                       memcpy(target, data, PAGE_CACHE_SIZE);
-                       bytes_read -= PAGE_CACHE_SIZE;
-               }
-               kunmap_atomic(target, KM_USER0);
-
-               flush_dcache_page(page);
-               SetPageUptodate(page);
-               unlock_page(page);
-               data += PAGE_CACHE_SIZE;
-
-               /* add page to FS-Cache */
-               cifs_readpage_to_fscache(mapping->host, page);
-       }
-       return;
-}
-
 static int cifs_readpages(struct file *file, struct address_space *mapping,
        struct list_head *page_list, unsigned num_pages)
 {
-       int rc = -EACCES;
-       int xid;
-       loff_t offset;
-       struct page *page;
-       struct cifs_sb_info *cifs_sb;
-       struct cifs_tcon *pTcon;
-       unsigned int bytes_read = 0;
-       unsigned int read_size, i;
-       char *smb_read_data = NULL;
-       struct smb_com_read_rsp *pSMBr;
-       struct cifsFileInfo *open_file;
-       struct cifs_io_parms io_parms;
-       int buf_type = CIFS_NO_BUFFER;
-       __u32 pid;
+       int rc;
+       struct list_head tmplist;
+       struct cifsFileInfo *open_file = file->private_data;
+       struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
+       unsigned int rsize = cifs_sb->rsize;
+       pid_t pid;
 
-       xid = GetXid();
-       if (file->private_data == NULL) {
-               rc = -EBADF;
-               FreeXid(xid);
-               return rc;
-       }
-       open_file = file->private_data;
-       cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-       pTcon = tlink_tcon(open_file->tlink);
+       /*
+        * Give up immediately if rsize is too small to read an entire page.
+        * The VFS will fall back to readpage. We should never reach this
+        * point however since we set ra_pages to 0 when the rsize is smaller
+        * than a cache page.
+        */
+       if (unlikely(rsize < PAGE_CACHE_SIZE))
+               return 0;
 
        /*
         * Reads as many pages as possible from fscache. Returns -ENOBUFS
@@ -2084,125 +2035,127 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
        rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
                                         &num_pages);
        if (rc == 0)
-               goto read_complete;
+               return rc;
 
-       cFYI(DBG2, "rpages: num pages %d", num_pages);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
                pid = open_file->pid;
        else
                pid = current->tgid;
 
-       for (i = 0; i < num_pages; ) {
-               unsigned contig_pages;
-               struct page *tmp_page;
-               unsigned long expected_index;
+       rc = 0;
+       INIT_LIST_HEAD(&tmplist);
+
+       cFYI(1, "%s: file=%p mapping=%p num_pages=%u", __func__, file,
+               mapping, num_pages);
 
-               if (list_empty(page_list))
-                       break;
+       /*
+        * Start with the page at end of list and move it to private
+        * list. Do the same with any following pages until we hit
+        * the rsize limit, hit an index discontinuity, or run out of
+        * pages. Issue the async read and then start the loop again
+        * until the list is empty.
+        *
+        * Note that list order is important. The page_list is in
+        * the order of declining indexes. When we put the pages in
+        * the rdata->pages, then we want them in increasing order.
+        */
+       while (!list_empty(page_list)) {
+               unsigned int bytes = PAGE_CACHE_SIZE;
+               unsigned int expected_index;
+               unsigned int nr_pages = 1;
+               loff_t offset;
+               struct page *page, *tpage;
+               struct cifs_readdata *rdata;
 
                page = list_entry(page_list->prev, struct page, lru);
+
+               /*
+                * Lock the page and put it in the cache. Since no one else
+                * should have access to this page, we're safe to simply set
+                * PG_locked without checking it first.
+                */
+               __set_page_locked(page);
+               rc = add_to_page_cache_locked(page, mapping,
+                                             page->index, GFP_KERNEL);
+
+               /* give up if we can't stick it in the cache */
+               if (rc) {
+                       __clear_page_locked(page);
+                       break;
+               }
+
+               /* move first page to the tmplist */
                offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+               list_move_tail(&page->lru, &tmplist);
 
-               /* count adjacent pages that we will read into */
-               contig_pages = 0;
-               expected_index =
-                       list_entry(page_list->prev, struct page, lru)->index;
-               list_for_each_entry_reverse(tmp_page, page_list, lru) {
-                       if (tmp_page->index == expected_index) {
-                               contig_pages++;
-                               expected_index++;
-                       } else
+               /* now try and add more pages onto the request */
+               expected_index = page->index + 1;
+               list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
+                       /* discontinuity ? */
+                       if (page->index != expected_index)
                                break;
+
+                       /* would this page push the read over the rsize? */
+                       if (bytes + PAGE_CACHE_SIZE > rsize)
+                               break;
+
+                       __set_page_locked(page);
+                       if (add_to_page_cache_locked(page, mapping,
+                                               page->index, GFP_KERNEL)) {
+                               __clear_page_locked(page);
+                               break;
+                       }
+                       list_move_tail(&page->lru, &tmplist);
+                       bytes += PAGE_CACHE_SIZE;
+                       expected_index++;
+                       nr_pages++;
                }
-               if (contig_pages + i >  num_pages)
-                       contig_pages = num_pages - i;
-
-               /* for reads over a certain size could initiate async
-                  read ahead */
-
-               read_size = contig_pages * PAGE_CACHE_SIZE;
-               /* Read size needs to be in multiples of one page */
-               read_size = min_t(const unsigned int, read_size,
-                                 cifs_sb->rsize & PAGE_CACHE_MASK);
-               cFYI(DBG2, "rpages: read size 0x%x  contiguous pages %d",
-                               read_size, contig_pages);
-               rc = -EAGAIN;
-               while (rc == -EAGAIN) {
+
+               rdata = cifs_readdata_alloc(nr_pages);
+               if (!rdata) {
+                       /* best to give up if we're out of mem */
+                       list_for_each_entry_safe(page, tpage, &tmplist, lru) {
+                               list_del(&page->lru);
+                               lru_cache_add_file(page);
+                               unlock_page(page);
+                               page_cache_release(page);
+                       }
+                       rc = -ENOMEM;
+                       break;
+               }
+
+               spin_lock(&cifs_file_list_lock);
+               cifsFileInfo_get(open_file);
+               spin_unlock(&cifs_file_list_lock);
+               rdata->cfile = open_file;
+               rdata->mapping = mapping;
+               rdata->offset = offset;
+               rdata->bytes = bytes;
+               rdata->pid = pid;
+               list_splice_init(&tmplist, &rdata->pages);
+
+               do {
                        if (open_file->invalidHandle) {
                                rc = cifs_reopen_file(open_file, true);
                                if (rc != 0)
-                                       break;
+                                       continue;
                        }
-                       io_parms.netfid = open_file->netfid;
-                       io_parms.pid = pid;
-                       io_parms.tcon = pTcon;
-                       io_parms.offset = offset;
-                       io_parms.length = read_size;
-                       rc = CIFSSMBRead(xid, &io_parms, &bytes_read,
-                                        &smb_read_data, &buf_type);
-                       /* BB more RC checks ? */
-                       if (rc == -EAGAIN) {
-                               if (smb_read_data) {
-                                       if (buf_type == CIFS_SMALL_BUFFER)
-                                               cifs_small_buf_release(smb_read_data);
-                                       else if (buf_type == CIFS_LARGE_BUFFER)
-                                               cifs_buf_release(smb_read_data);
-                                       smb_read_data = NULL;
-                               }
-                       }
-               }
-               if ((rc < 0) || (smb_read_data == NULL)) {
-                       cFYI(1, "Read error in readpages: %d", rc);
-                       break;
-               } else if (bytes_read > 0) {
-                       task_io_account_read(bytes_read);
-                       pSMBr = (struct smb_com_read_rsp *)smb_read_data;
-                       cifs_copy_cache_pages(mapping, page_list, bytes_read,
-                               smb_read_data + 4 /* RFC1001 hdr */ +
-                               le16_to_cpu(pSMBr->DataOffset));
-
-                       i +=  bytes_read >> PAGE_CACHE_SHIFT;
-                       cifs_stats_bytes_read(pTcon, bytes_read);
-                       if ((bytes_read & PAGE_CACHE_MASK) != bytes_read) {
-                               i++; /* account for partial page */
-
-                               /* server copy of file can have smaller size
-                                  than client */
-                               /* BB do we need to verify this common case ?
-                                  this case is ok - if we are at server EOF
-                                  we will hit it on next read */
+                       rc = cifs_async_readv(rdata);
+               } while (rc == -EAGAIN);
 
-                               /* break; */
+               if (rc != 0) {
+                       list_for_each_entry_safe(page, tpage, &rdata->pages,
+                                                lru) {
+                               list_del(&page->lru);
+                               lru_cache_add_file(page);
+                               unlock_page(page);
+                               page_cache_release(page);
                        }
-               } else {
-                       cFYI(1, "No bytes read (%d) at offset %lld . "
-                               "Cleaning remaining pages from readahead list",
-                               bytes_read, offset);
-                       /* BB turn off caching and do new lookup on
-                          file size at server? */
+                       cifs_readdata_free(rdata);
                        break;
                }
-               if (smb_read_data) {
-                       if (buf_type == CIFS_SMALL_BUFFER)
-                               cifs_small_buf_release(smb_read_data);
-                       else if (buf_type == CIFS_LARGE_BUFFER)
-                               cifs_buf_release(smb_read_data);
-                       smb_read_data = NULL;
-               }
-               bytes_read = 0;
-       }
-
-/* need to free smb_read_data buf before exit */
-       if (smb_read_data) {
-               if (buf_type == CIFS_SMALL_BUFFER)
-                       cifs_small_buf_release(smb_read_data);
-               else if (buf_type == CIFS_LARGE_BUFFER)
-                       cifs_buf_release(smb_read_data);
-               smb_read_data = NULL;
        }
 
-read_complete:
-       FreeXid(xid);
        return rc;
 }
 
index 33a3fbf..0cc9584 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/wait.h>
 #include <linux/net.h>
 #include <linux/delay.h>
+#include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <linux/mempool.h>
@@ -324,7 +325,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 {
        int error;
 
-       error = wait_event_killable(server->response_q,
+       error = wait_event_freezekillable(server->response_q,
                                    midQ->midState != MID_REQUEST_SUBMITTED);
        if (error < 0)
                return -ERESTARTSYS;
@@ -339,8 +340,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
  */
 int
 cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
-               unsigned int nvec, mid_callback_t *callback, void *cbdata,
-               bool ignore_pend)
+               unsigned int nvec, mid_receive_t *receive,
+               mid_callback_t *callback, void *cbdata, bool ignore_pend)
 {
        int rc;
        struct mid_q_entry *mid;
@@ -374,6 +375,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
                goto out_err;
        }
 
+       mid->receive = receive;
        mid->callback = callback;
        mid->callback_data = cbdata;
        mid->midState = MID_REQUEST_SUBMITTED;
index 1effc8b..3672f73 100644 (file)
@@ -134,10 +134,25 @@ static inline void set_freezable_with_signal(void)
 }
 
 /*
- * Freezer-friendly wrappers around wait_event_interruptible() and
- * wait_event_interruptible_timeout(), originally defined in <linux/wait.h>
+ * Freezer-friendly wrappers around wait_event_interruptible(),
+ * wait_event_killable() and wait_event_interruptible_timeout(), originally
+ * defined in <linux/wait.h>
  */
 
+#define wait_event_freezekillable(wq, condition)                       \
+({                                                                     \
+       int __retval;           /* value the whole expression yields */ \
+       do {                                                            \
+               __retval = wait_event_killable(wq, /* or when freezing */ \
+                               (condition) || freezing(current));      \
+               if (__retval && !freezing(current)) /* nonzero, no freeze */ \
+                       break;                  /* yield it unchanged */ \
+               else if (!(condition))  /* condition is still false */  \
+                       __retval = -ERESTARTSYS;                        \
+       } while (try_to_freeze());      /* repeat while this is nonzero */ \
+       __retval;                                                       \
+})
+
 #define wait_event_freezable(wq, condition)                            \
 ({                                                                     \
        int __retval;                                                   \