Merge tag 'arm-soc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Jan 2012 02:55:56 +0000 (18:55 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 18 Jan 2012 02:55:56 +0000 (18:55 -0800)
ARM: fixes for ARM platforms

Some fallout from the 3.3 merge window, as well as a couple of bug fixes
for older, preexisting bugs that seem valid to include at this time:

* sched_clock changes broke picoxcell; fix included.
* BSYM bugs causing issues with Thumb-2-built kernels on SMP (see the
  sketch after this list).
* Missing module.h include on msm.
* A collection of bug fixes for Samsung platforms that didn't make it
  into the first pull request.
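
For context, a sketch of what BSYM is about (paraphrased, not the
verbatim kernel source): on Thumb-2 kernels, a function address used as
data from assembly (for example a secondary-CPU entry point loaded with
"ldr rX, =symbol") needs bit 0 set so the CPU enters Thumb state. The
macro provides that:

    /* arch/arm/include/asm/unified.h, roughly: */
    #ifdef CONFIG_THUMB2_KERNEL
    #define BSYM(sym)   sym + 1     /* set the Thumb bit on the address */
    #else
    #define BSYM(sym)   sym         /* ARM mode: address used as-is */
    #endif

Applying BSYM() to a C function pointer is wrong, since the compiler
already encodes the Thumb bit; that is what the "remove incorrect BSYM
usage" patches below fix, and why the macro is now made assembly-only.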

* tag 'arm-soc-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc:
  ARM: make BSYM macro assembly only
  ARM: highbank: remove incorrect BSYM usage
  ARM: imx: remove incorrect BSYM usage
  ARM: exynos: remove incorrect BSYM usage
  ARM: ux500: add missing ENDPROC to headsmp.S
  ARM: msm: Add missing ENDPROC to headsmp.S
  ARM: versatile: Add missing ENDPROC to headsmp.S
  ARM: EXYNOS: Invert VCLK polarity for framebuffer on ORIGEN
  ARM: S3C64XX: Fix interrupt configuration for PCA935x on Cragganmore
  ARM: S3C64XX: Fix the memory mapped GPIOs on Cragganmore
  ARM: S3C64XX: Remove hsmmc1 from Cragganmore
  ARM: S3C64XX: Remove unconditional power domain disables
  ARM: SAMSUNG: Declare struct platform_device in plat/s3c64xx-spi.h
  ARM: SAMSUNG: dma-ops.h needs mach/dma.h
  ARM: SAMSUNG: Guard against multiple inclusion of plat/dma.h
  ARM: picoxcell: fix sched_clock() cleanup fallout
  ARM: msm: vreg is a module and so needs module.h

202 files changed:
Documentation/devicetree/bindings/dma/atmel-dma.txt [new file with mode: 0644]
Documentation/dmaengine.txt
MAINTAINERS
arch/arm/include/asm/kprobes.h
arch/arm/include/asm/ptrace.h
arch/arm/include/asm/thread_info.h
arch/arm/kernel/entry-common.S
arch/arm/kernel/ptrace.c
arch/arm/mach-ep93xx/include/mach/dma.h
arch/arm/mach-shmobile/setup-sh7372.c
arch/arm/plat-mxc/include/mach/mx3fb.h
arch/arm/plat-nomadik/include/plat/ste_dma40.h
arch/arm/plat-samsung/dma-ops.c
arch/arm/plat-samsung/include/plat/dma-ops.h
arch/ia64/include/asm/ptrace.h
arch/ia64/kernel/ptrace.c
arch/microblaze/include/asm/ptrace.h
arch/microblaze/kernel/ptrace.c
arch/microblaze/kernel/setup.c
arch/mips/include/asm/ptrace.h
arch/mips/kernel/ptrace.c
arch/powerpc/include/asm/ptrace.h
arch/powerpc/kernel/ptrace.c
arch/s390/include/asm/ptrace.h
arch/s390/kernel/ptrace.c
arch/sh/include/asm/ptrace_32.h
arch/sh/include/asm/ptrace_64.h
arch/sh/kernel/ptrace_32.c
arch/sh/kernel/ptrace_64.c
arch/sparc/include/asm/ptrace.h
arch/sparc/kernel/ptrace_64.c
arch/um/kernel/ptrace.c
arch/x86/ia32/ia32entry.S
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/ptrace.c
arch/x86/kernel/vm86_32.c
arch/x86/um/shared/sysdep/ptrace.h
arch/xtensa/kernel/ptrace.c
block/cfq-iosched.c
drivers/ata/ata_piix.c
drivers/ata/libata-core.c
drivers/ata/libata-transport.c
drivers/ata/pata_bf54x.c
drivers/ata/sata_fsl.c
drivers/dma/Kconfig
drivers/dma/Makefile
drivers/dma/amba-pl08x.c
drivers/dma/at_hdmac.c
drivers/dma/at_hdmac_regs.h
drivers/dma/coh901318.c
drivers/dma/coh901318_lli.c
drivers/dma/coh901318_lli.h
drivers/dma/dmaengine.c
drivers/dma/dw_dmac.c
drivers/dma/dw_dmac_regs.h
drivers/dma/ep93xx_dma.c
drivers/dma/fsldma.c
drivers/dma/imx-dma.c
drivers/dma/imx-sdma.c
drivers/dma/intel_mid_dma.c
drivers/dma/intel_mid_dma_regs.h
drivers/dma/iop-adma.c
drivers/dma/ipu/ipu_idmac.c
drivers/dma/mpc512x_dma.c
drivers/dma/mxs-dma.c
drivers/dma/pch_dma.c
drivers/dma/pl330.c
drivers/dma/shdma.c
drivers/dma/sirf-dma.c [new file with mode: 0644]
drivers/dma/ste_dma40.c
drivers/dma/ste_dma40_ll.h
drivers/dma/timb_dma.c
drivers/dma/txx9dmac.c
drivers/media/video/mx3_camera.c
drivers/media/video/timblogiw.c
drivers/misc/carma/carma-fpga-program.c
drivers/mmc/host/atmel-mci.c
drivers/mmc/host/mmci.c
drivers/mmc/host/mxcmmc.c
drivers/mmc/host/mxs-mmc.c
drivers/mmc/host/sh_mmcif.c
drivers/mmc/host/tmio_mmc_dma.c
drivers/mtd/nand/gpmi-nand/gpmi-lib.c
drivers/net/ethernet/micrel/ks8842.c
drivers/spi/spi-dw-mid.c
drivers/spi/spi-ep93xx.c
drivers/spi/spi-pl022.c
drivers/spi/spi-topcliff-pch.c
drivers/tty/serial/amba-pl011.c
drivers/tty/serial/pch_uart.c
drivers/tty/serial/sh-sci.c
drivers/usb/host/ehci-xilinx-of.c
drivers/usb/musb/ux500_dma.c
drivers/usb/renesas_usbhs/fifo.c
drivers/video/mx3fb.c
drivers/xen/xen-balloon.c
fs/btrfs/Kconfig
fs/btrfs/Makefile
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c [new file with mode: 0644]
fs/btrfs/check-integrity.h [new file with mode: 0644]
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/locking.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/ulist.c [new file with mode: 0644]
fs/btrfs/ulist.h [new file with mode: 0644]
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/namei.c
fs/proc/base.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr.c
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_bmap.c
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fs_subr.c
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_super.c
fs/xfs/xfs_sync.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_vnodeops.c
include/linux/amba/pl08x.h
include/linux/audit.h
include/linux/digsig.h
include/linux/dmaengine.h
include/linux/dw_dmac.h
include/linux/key.h
include/linux/kref.h
include/linux/mtd/gpmi-nand.h [new file with mode: 0644]
include/linux/ptrace.h
include/linux/sh_dma.h
include/linux/sirfsoc_dma.h [new file with mode: 0644]
include/linux/tty_driver.h
include/trace/events/btrfs.h
init/Kconfig
kernel/audit.c
kernel/audit.h
kernel/auditfilter.c
kernel/auditsc.c
kernel/capability.c
kernel/exit.c
kernel/fork.c
kernel/seccomp.c
lib/Kconfig
lib/Makefile
security/integrity/Kconfig
security/integrity/Makefile
security/integrity/ima/ima_audit.c
security/integrity/integrity.h
security/keys/encrypted-keys/encrypted.c
security/keys/encrypted-keys/masterkey_trusted.c
security/keys/gc.c
security/keys/keyring.c
security/keys/trusted.c
security/lsm_audit.c
security/tomoyo/util.c
sound/atmel/abdac.c
sound/atmel/ac97c.c
sound/core/Kconfig
sound/pci/au88x0/au88x0.c
sound/pci/au88x0/au88x0.h
sound/pci/au88x0/au88x0_pcm.c
sound/pci/hda/hda_intel.c
sound/pci/hda/patch_sigmatel.c
sound/pci/oxygen/xonar_wm87x6.c
sound/soc/ep93xx/ep93xx-pcm.c
sound/soc/imx/imx-pcm-dma-mx2.c
sound/soc/mxs/mxs-pcm.c
sound/soc/samsung/dma.c
sound/soc/sh/siu_pcm.c
sound/soc/txx9/txx9aclc.c

diff --git a/Documentation/devicetree/bindings/dma/atmel-dma.txt b/Documentation/devicetree/bindings/dma/atmel-dma.txt
new file mode 100644 (file)
index 0000000..3c046ee
--- /dev/null
@@ -0,0 +1,14 @@
+* Atmel Direct Memory Access Controller (DMA)
+
+Required properties:
+- compatible: Should be "atmel,<chip>-dma"
+- reg: Should contain DMA registers location and length
+- interrupts: Should contain DMA interrupt
+
+Examples:
+
+dma@ffffec00 {
+       compatible = "atmel,at91sam9g45-dma";
+       reg = <0xffffec00 0x200>;
+       interrupts = <21>;
+};
index 94b7e0f..bbe6cb3 100644 (file)
@@ -75,6 +75,10 @@ The slave DMA usage consists of following steps:
    slave_sg    - DMA a list of scatter gather buffers from/to a peripheral
    dma_cyclic  - Perform a cyclic DMA operation from/to a peripheral till the
                  operation is explicitly stopped.
+   interleaved_dma - This is common to slave as well as M2M clients. For slave
+                channels, the address of the device's FIFO may already be known
+                to the driver. Various types of operations can be expressed by
+                setting appropriate values in the 'dma_interleaved_template'
+                members.
 
    A non-NULL return of this transfer API represents a "descriptor" for
    the given transaction.
@@ -89,6 +93,10 @@ The slave DMA usage consists of following steps:
                struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
                size_t period_len, enum dma_data_direction direction);
 
+       struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
+               struct dma_chan *chan, struct dma_interleaved_template *xt,
+               unsigned long flags);
+
    The peripheral driver is expected to have mapped the scatterlist for
    the DMA operation prior to calling device_prep_slave_sg, and must
    keep the scatterlist mapped until the DMA operation has completed.
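
A minimal client-side sketch of the new interleaved API (the helper name
and the frame geometry below are invented for illustration; it assumes
the channel's driver implements device_prep_interleaved_dma):

    #include <linux/dmaengine.h>
    #include <linux/slab.h>

    /* Queue one 240-line frame of 320 16-bit pixels per line from memory
     * to a fixed device FIFO, skipping a 64-byte gap after each line of
     * the source buffer.
     */
    static struct dma_async_tx_descriptor *
    prep_one_frame(struct dma_chan *chan, dma_addr_t src, dma_addr_t fifo)
    {
            struct dma_interleaved_template *xt;
            struct dma_async_tx_descriptor *desc;

            /* one data_chunk describes the per-line pattern */
            xt = kzalloc(sizeof(*xt) + sizeof(struct data_chunk), GFP_KERNEL);
            if (!xt)
                    return NULL;

            xt->src_start   = src;
            xt->dst_start   = fifo;
            xt->dir         = DMA_MEM_TO_DEV;
            xt->src_inc     = true;         /* walk the source buffer */
            xt->src_sgl     = true;         /* honour the per-chunk gap (icg) */
            xt->dst_inc     = false;        /* FIFO address is fixed */
            xt->dst_sgl     = false;
            xt->numf        = 240;          /* number of frames (lines) */
            xt->frame_size  = 1;            /* chunks per frame */
            xt->sgl[0].size = 320 * 2;      /* bytes per line */
            xt->sgl[0].icg  = 64;           /* inter-chunk gap in the source */

            desc = chan->device->device_prep_interleaved_dma(chan, xt,
                                                             DMA_PREP_INTERRUPT);
            kfree(xt);      /* assumes the driver copied the template */
            return desc;
    }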
index 2a90101..341dee3 100644 (file)
@@ -745,6 +745,7 @@ M:  Barry Song <baohua.song@csr.com>
 L:     linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:     Maintained
 F:     arch/arm/mach-prima2/
+F:     drivers/dma/sirf-dma*
 
 ARM/EBSA110 MACHINE SUPPORT
 M:     Russell King <linux@arm.linux.org.uk>
@@ -5846,7 +5847,7 @@ F:        drivers/mmc/host/sdhci-spear.c
 SECURITY SUBSYSTEM
 M:     James Morris <jmorris@namei.org>
 L:     linux-security-module@vger.kernel.org (suggested Cc:)
-T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/security-testing-2.6.git
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git
 W:     http://security.wiki.kernel.org/
 S:     Supported
 F:     security/
index feec867..f82ec22 100644 (file)
@@ -24,7 +24,6 @@
 #define MAX_INSN_SIZE                  2
 #define MAX_STACK_SIZE                 64      /* 32 would probably be OK */
 
-#define regs_return_value(regs)                ((regs)->ARM_r0)
 #define flush_insn_slot(p)             do { } while (0)
 #define kretprobe_blacklist_size       0
 
index 96187ff..451808b 100644 (file)
@@ -189,6 +189,11 @@ static inline int valid_user_regs(struct pt_regs *regs)
        return 0;
 }
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->ARM_r0;
+}
+
 #define instruction_pointer(regs)      (regs)->ARM_pc
 
 #ifdef CONFIG_SMP
index 0f30c3a..d4c24d4 100644 (file)
@@ -129,6 +129,7 @@ extern void vfp_flush_hwstate(struct thread_info *);
 /*
  * thread information flags:
  *  TIF_SYSCALL_TRACE  - syscall trace active
+ *  TIF_SYSCALL_AUDIT  - syscall auditing active
  *  TIF_SIGPENDING     - signal pending
  *  TIF_NEED_RESCHED   - rescheduling necessary
  *  TIF_NOTIFY_RESUME  - callback before returning to user
@@ -139,6 +140,7 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define TIF_NEED_RESCHED       1
 #define TIF_NOTIFY_RESUME      2       /* callback before returning to user */
 #define TIF_SYSCALL_TRACE      8
+#define TIF_SYSCALL_AUDIT      9
 #define TIF_POLLING_NRFLAG     16
 #define TIF_USING_IWMMXT       17
 #define TIF_MEMDIE             18      /* is terminating due to OOM killer */
@@ -149,11 +151,15 @@ extern void vfp_flush_hwstate(struct thread_info *);
 #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
 #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
+#define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT      (1 << TIF_USING_IWMMXT)
 #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
 
+/* Checks for any syscall work in entry-common.S */
+#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT)
+
 /*
  * Change these and you break ASM code in entry-common.S
  */
index b2a27b6..520889c 100644 (file)
@@ -87,7 +87,7 @@ ENTRY(ret_from_fork)
        get_thread_info tsk
        ldr     r1, [tsk, #TI_FLAGS]            @ check for syscall tracing
        mov     why, #1
-       tst     r1, #_TIF_SYSCALL_TRACE         @ are we tracing syscalls?
+       tst     r1, #_TIF_SYSCALL_WORK          @ are we tracing syscalls?
        beq     ret_slow_syscall
        mov     r1, sp
        mov     r0, #1                          @ trace exit [IP = 1]
@@ -443,7 +443,7 @@ ENTRY(vector_swi)
 1:
 #endif
 
-       tst     r10, #_TIF_SYSCALL_TRACE                @ are we tracing syscalls?
+       tst     r10, #_TIF_SYSCALL_WORK         @ are we tracing syscalls?
        bne     __sys_trace
 
        cmp     scno, #NR_syscalls              @ check upper syscall limit
index 483727a..e1d5e19 100644 (file)
@@ -906,11 +906,6 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
 {
        unsigned long ip;
 
-       if (!test_thread_flag(TIF_SYSCALL_TRACE))
-               return scno;
-       if (!(current->ptrace & PT_PTRACED))
-               return scno;
-
        /*
         * Save IP.  IP is used to denote syscall entry/exit:
         *  IP = 0 -> entry, = 1 -> exit
@@ -918,6 +913,17 @@ asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
        ip = regs->ARM_ip;
        regs->ARM_ip = why;
 
+       if (!ip)
+               audit_syscall_exit(regs);
+       else
+               audit_syscall_entry(AUDIT_ARCH_ARMEB, scno, regs->ARM_r0,
+                                   regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+
+       if (!test_thread_flag(TIF_SYSCALL_TRACE))
+               return scno;
+       if (!(current->ptrace & PT_PTRACED))
+               return scno;
+
        current_thread_info()->syscall = scno;
 
        /* the 0x80 provides a way for the tracing parent to distinguish
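
For context: the open-coded audit_context and AUDITSC_RESULT() tests
disappear from the per-arch hooks throughout this merge because
include/linux/audit.h now hides them behind inline wrappers around the
__audit_* slow paths, roughly along these lines (a sketch, not the
verbatim header):

    static inline void audit_syscall_entry(int arch, int major,
                                           unsigned long a0, unsigned long a1,
                                           unsigned long a2, unsigned long a3)
    {
            if (unlikely(current->audit_context))
                    __audit_syscall_entry(arch, major, a0, a1, a2, a3);
    }

    static inline void audit_syscall_exit(void *pt_regs)
    {
            if (unlikely(current->audit_context))
                    __audit_syscall_exit(is_syscall_success(pt_regs),
                                         regs_return_value(pt_regs));
    }

This is also why each architecture now supplies regs_return_value() and,
where success is not encoded in the return register itself, an
is_syscall_success() helper, as the ptrace.h hunks in this merge add.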
index 46d4d87..e82c642 100644 (file)
@@ -37,7 +37,7 @@
  */
 struct ep93xx_dma_data {
        int                             port;
-       enum dma_data_direction         direction;
+       enum dma_transfer_direction     direction;
        const char                      *name;
 };
 
@@ -80,14 +80,14 @@ static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan)
  * channel supports given DMA direction. Only M2P channels have such
  * limitation, for M2M channels the direction is configurable.
  */
-static inline enum dma_data_direction
+static inline enum dma_transfer_direction
 ep93xx_dma_chan_direction(struct dma_chan *chan)
 {
        if (!ep93xx_dma_chan_is_m2p(chan))
                return DMA_NONE;
 
        /* even channels are for TX, odd for RX */
-       return (chan->chan_id % 2 == 0) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+       return (chan->chan_id % 2 == 0) ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
 }
 
 #endif /* __ASM_ARCH_DMA_H */
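
The dma_data_direction to dma_transfer_direction conversions seen here
and in the hunks that follow track the 3.3 dmaengine split between
DMA-mapping directions and slave-transfer directions; the new enum is
assumed to look like this:

    /* include/linux/dmaengine.h (sketch) */
    enum dma_transfer_direction {
            DMA_MEM_TO_MEM,
            DMA_MEM_TO_DEV,         /* was DMA_TO_DEVICE in slave usage */
            DMA_DEV_TO_MEM,         /* was DMA_FROM_DEVICE in slave usage */
            DMA_DEV_TO_DEV,
            DMA_TRANS_NONE,
    };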
index 1ea89be..6fcf304 100644 (file)
@@ -445,31 +445,39 @@ static const struct sh_dmae_slave_config sh7372_dmae_slaves[] = {
        },
 };
 
+#define SH7372_CHCLR 0x220
+
 static const struct sh_dmae_channel sh7372_dmae_channels[] = {
        {
                .offset = 0,
                .dmars = 0,
                .dmars_bit = 0,
+               .chclr_offset = SH7372_CHCLR + 0,
        }, {
                .offset = 0x10,
                .dmars = 0,
                .dmars_bit = 8,
+               .chclr_offset = SH7372_CHCLR + 0x10,
        }, {
                .offset = 0x20,
                .dmars = 4,
                .dmars_bit = 0,
+               .chclr_offset = SH7372_CHCLR + 0x20,
        }, {
                .offset = 0x30,
                .dmars = 4,
                .dmars_bit = 8,
+               .chclr_offset = SH7372_CHCLR + 0x30,
        }, {
                .offset = 0x50,
                .dmars = 8,
                .dmars_bit = 0,
+               .chclr_offset = SH7372_CHCLR + 0x50,
        }, {
                .offset = 0x60,
                .dmars = 8,
                .dmars_bit = 8,
+               .chclr_offset = SH7372_CHCLR + 0x60,
        }
 };
 
@@ -487,6 +495,7 @@ static struct sh_dmae_pdata dma_platform_data = {
        .ts_shift       = ts_shift,
        .ts_shift_num   = ARRAY_SIZE(ts_shift),
        .dmaor_init     = DMAOR_DME,
+       .chclr_present  = 1,
 };
 
 /* Resource order important! */
@@ -494,7 +503,7 @@ static struct resource sh7372_dmae0_resources[] = {
        {
                /* Channel registers and DMAOR */
                .start  = 0xfe008020,
-               .end    = 0xfe00808f,
+               .end    = 0xfe00828f,
                .flags  = IORESOURCE_MEM,
        },
        {
@@ -522,7 +531,7 @@ static struct resource sh7372_dmae1_resources[] = {
        {
                /* Channel registers and DMAOR */
                .start  = 0xfe018020,
-               .end    = 0xfe01808f,
+               .end    = 0xfe01828f,
                .flags  = IORESOURCE_MEM,
        },
        {
@@ -550,7 +559,7 @@ static struct resource sh7372_dmae2_resources[] = {
        {
                /* Channel registers and DMAOR */
                .start  = 0xfe028020,
-               .end    = 0xfe02808f,
+               .end    = 0xfe02828f,
                .flags  = IORESOURCE_MEM,
        },
        {
index ac24c5c..fdbe600 100644 (file)
 #define FB_SYNC_SWAP_RGB       0x04000000
 #define FB_SYNC_CLK_SEL_EN     0x02000000
 
+/*
+ * Specify the way your display is connected. The IPU can arbitrarily
+ * map the internal colors to the external data lines. We only support
+ * the following mappings at the moment.
+ */
+enum disp_data_mapping {
+       /* blue -> d[0..5], green -> d[6..11], red -> d[12..17] */
+       IPU_DISP_DATA_MAPPING_RGB666,
+       /* blue -> d[0..4], green -> d[5..10], red -> d[11..15] */
+       IPU_DISP_DATA_MAPPING_RGB565,
+       /* blue -> d[0..7], green -> d[8..15], red -> d[16..23] */
+       IPU_DISP_DATA_MAPPING_RGB888,
+};
+
 /**
  * struct mx3fb_platform_data - mx3fb platform data
  *
@@ -33,6 +47,7 @@ struct mx3fb_platform_data {
        const char                      *name;
        const struct fb_videomode       *mode;
        int                             num_modes;
+       enum disp_data_mapping          disp_data_fmt;
 };
 
 #endif
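
A hypothetical board-file fragment using the new field (the myboard_*
names are invented for illustration):

    static struct mx3fb_platform_data myboard_fb_pdata = {
            .name          = "CRT-VGA",
            .mode          = myboard_video_modes,
            .num_modes     = ARRAY_SIZE(myboard_video_modes),
            /* 18-bit panel: blue on d[0..5], green d[6..11], red d[12..17] */
            .disp_data_fmt = IPU_DISP_DATA_MAPPING_RGB666,
    };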
index 685c787..fd0ee84 100644 (file)
@@ -113,7 +113,8 @@ struct stedma40_half_channel_info {
  * @dst_dev_type: Dst device type
 * @src_info: Parameters for src half channel
  * @dst_info: Parameters for dst half channel
- *
+ * @use_fixed_channel: if true, use physical channel specified by phy_channel
+ * @phy_channel: physical channel to use, only if use_fixed_channel is true
  *
  * This structure has to be filled by the client drivers.
  * It is recommended to do all dma configurations for clients in the machine.
@@ -129,6 +130,9 @@ struct stedma40_chan_cfg {
        int                                      dst_dev_type;
        struct stedma40_half_channel_info        src_info;
        struct stedma40_half_channel_info        dst_info;
+
+       bool                                     use_fixed_channel;
+       int                                      phy_channel;
 };
 
 /**
@@ -153,6 +157,7 @@ struct stedma40_platform_data {
        struct stedma40_chan_cfg        *memcpy_conf_phy;
        struct stedma40_chan_cfg        *memcpy_conf_log;
        int                              disabled_channels[STEDMA40_MAX_PHYS];
+       bool                             use_esram_lcla;
 };
 
 #ifdef CONFIG_STE_DMA40
@@ -187,7 +192,7 @@ static inline struct
 dma_async_tx_descriptor *stedma40_slave_mem(struct dma_chan *chan,
                                            dma_addr_t addr,
                                            unsigned int size,
-                                           enum dma_data_direction direction,
+                                           enum dma_transfer_direction direction,
                                            unsigned long flags)
 {
        struct scatterlist sg;
@@ -209,7 +214,7 @@ static inline struct
 dma_async_tx_descriptor *stedma40_slave_mem(struct dma_chan *chan,
                                            dma_addr_t addr,
                                            unsigned int size,
-                                           enum dma_data_direction direction,
+                                           enum dma_transfer_direction direction,
                                            unsigned long flags)
 {
        return NULL;
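
A hypothetical client configuration pinning a transfer to a fixed
physical channel with the new fields (the source device type is a
placeholder):

    static struct stedma40_chan_cfg myperiph_dma_cfg = {
            .dir               = STEDMA40_PERIPH_TO_MEM,
            .src_dev_type      = MYBOARD_DMA_DEV_PERIPH,    /* invented */
            .dst_dev_type      = STEDMA40_DEV_DST_MEMORY,
            .use_fixed_channel = true,      /* honour .phy_channel below */
            .phy_channel       = 2,
    };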
index 2cded87..0747c77 100644 (file)
@@ -37,14 +37,14 @@ static unsigned samsung_dmadev_request(enum dma_ch dma_ch,
                                (void *)dma_ch;
        chan = dma_request_channel(mask, pl330_filter, filter_param);
 
-       if (info->direction == DMA_FROM_DEVICE) {
+       if (info->direction == DMA_DEV_TO_MEM) {
                memset(&slave_config, 0, sizeof(struct dma_slave_config));
                slave_config.direction = info->direction;
                slave_config.src_addr = info->fifo;
                slave_config.src_addr_width = info->width;
                slave_config.src_maxburst = 1;
                dmaengine_slave_config(chan, &slave_config);
-       } else if (info->direction == DMA_TO_DEVICE) {
+       } else if (info->direction == DMA_MEM_TO_DEV) {
                memset(&slave_config, 0, sizeof(struct dma_slave_config));
                slave_config.direction = info->direction;
                slave_config.dst_addr = info->fifo;
index 7c6c2a8..71a6827 100644 (file)
@@ -18,7 +18,7 @@
 
 struct samsung_dma_prep_info {
        enum dma_transaction_type cap;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        dma_addr_t buf;
        unsigned long period;
        unsigned long len;
@@ -28,7 +28,7 @@ struct samsung_dma_prep_info {
 
 struct samsung_dma_info {
        enum dma_transaction_type cap;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        enum dma_slave_buswidth width;
        dma_addr_t fifo;
        struct s3c2410_dma_client *client;
index f5cb276..68c98f5 100644 (file)
@@ -246,7 +246,18 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
        return regs->ar_bspstore;
 }
 
-#define regs_return_value(regs) ((regs)->r8)
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return regs->r10 != -1;
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->r8;
+       else
+               return -regs->r8;
+}
 
 /* Conserve space in histogram by encoding slot bits in address
  * bits 2 and 3 rather than bits 0 and 1.
index 8848f43..dad9166 100644 (file)
@@ -1246,15 +1246,8 @@ syscall_trace_enter (long arg0, long arg1, long arg2, long arg3,
        if (test_thread_flag(TIF_RESTORE_RSE))
                ia64_sync_krbs();
 
-       if (unlikely(current->audit_context)) {
-               long syscall;
-               int arch;
 
-               syscall = regs.r15;
-               arch = AUDIT_ARCH_IA64;
-
-               audit_syscall_entry(arch, syscall, arg0, arg1, arg2, arg3);
-       }
+       audit_syscall_entry(AUDIT_ARCH_IA64, regs.r15, arg0, arg1, arg2, arg3);
 
        return 0;
 }
@@ -1268,14 +1261,7 @@ syscall_trace_leave (long arg0, long arg1, long arg2, long arg3,
 {
        int step;
 
-       if (unlikely(current->audit_context)) {
-               int success = AUDITSC_RESULT(regs.r10);
-               long result = regs.r8;
-
-               if (success != AUDITSC_SUCCESS)
-                       result = -result;
-               audit_syscall_exit(success, result);
-       }
+       audit_syscall_exit(&regs);
 
        step = test_thread_flag(TIF_SINGLESTEP);
        if (step || test_thread_flag(TIF_SYSCALL_TRACE))
index 816bee6..94e92c8 100644 (file)
@@ -61,6 +61,11 @@ struct pt_regs {
 #define instruction_pointer(regs)      ((regs)->pc)
 #define profile_pc(regs)               instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->r3;
+}
+
 #else /* __KERNEL__ */
 
 /* pt_regs offsets used by gdbserver etc in ptrace syscalls */
index 043cb58..6eb2aa9 100644 (file)
@@ -147,10 +147,8 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
                 */
                ret = -1L;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(EM_MICROBLAZE, regs->r12,
-                                   regs->r5, regs->r6,
-                                   regs->r7, regs->r8);
+       audit_syscall_entry(EM_MICROBLAZE, regs->r12, regs->r5, regs->r6,
+                           regs->r7, regs->r8);
 
        return ret ?: regs->r12;
 }
@@ -159,8 +157,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->r3), regs->r3);
+       audit_syscall_exit(regs);
 
        step = test_thread_flag(TIF_SINGLESTEP);
        if (step || test_thread_flag(TIF_SYSCALL_TRACE))
index 604cd9d..d4fc1a9 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/cache.h>
 #include <linux/of_platform.h>
 #include <linux/dma-mapping.h>
+#include <linux/cpu.h>
 #include <asm/cacheflush.h>
 #include <asm/entry.h>
 #include <asm/cpuinfo.h>
@@ -226,5 +227,23 @@ static int __init setup_bus_notifier(void)
 
        return 0;
 }
-
 arch_initcall(setup_bus_notifier);
+
+static DEFINE_PER_CPU(struct cpu, cpu_devices);
+
+static int __init topology_init(void)
+{
+       int i, ret;
+
+       for_each_present_cpu(i) {
+               struct cpu *c = &per_cpu(cpu_devices, i);
+
+               ret = register_cpu(c, i);
+               if (ret)
+                       printk(KERN_WARNING "topology_init: register_cpu %d "
+                                               "failed (%d)\n", i, ret);
+       }
+
+       return 0;
+}
+subsys_initcall(topology_init);
index 7b99c67..4b7f525 100644 (file)
@@ -137,7 +137,19 @@ extern int ptrace_set_watch_regs(struct task_struct *child,
  */
 #define user_mode(regs) (((regs)->cp0_status & KU_MASK) == KU_USER)
 
-#define regs_return_value(_regs) ((_regs)->regs[2])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !regs->regs[7];
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->regs[2];
+       else
+               return -regs->regs[2];
+}
+
 #define instruction_pointer(regs) ((regs)->cp0_epc)
 #define profile_pc(regs) instruction_pointer(regs)
 
index 4e6ea1f..7786b60 100644 (file)
@@ -560,10 +560,9 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
        }
 
 out:
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[2],
-                                   regs->regs[4], regs->regs[5],
-                                   regs->regs[6], regs->regs[7]);
+       audit_syscall_entry(audit_arch(), regs->regs[2],
+                           regs->regs[4], regs->regs[5],
+                           regs->regs[6], regs->regs[7]);
 }
 
 /*
@@ -572,9 +571,7 @@ out:
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[7]),
-                                  -regs->regs[2]);
+       audit_syscall_exit(regs);
 
        if (!(current->ptrace & PT_PTRACED))
                return;
index 48223f9..78a2051 100644 (file)
@@ -86,7 +86,18 @@ struct pt_regs {
 #define instruction_pointer(regs) ((regs)->nip)
 #define user_stack_pointer(regs) ((regs)->gpr[1])
 #define kernel_stack_pointer(regs) ((regs)->gpr[1])
-#define regs_return_value(regs) ((regs)->gpr[3])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !(regs->ccr & 0x10000000);
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       if (is_syscall_success(regs))
+               return regs->gpr[3];
+       else
+               return -regs->gpr[3];
+}
 
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);
index 5de73db..5b43325 100644 (file)
@@ -1724,22 +1724,20 @@ long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->gpr[0]);
 
-       if (unlikely(current->audit_context)) {
 #ifdef CONFIG_PPC64
-               if (!is_32bit_task())
-                       audit_syscall_entry(AUDIT_ARCH_PPC64,
-                                           regs->gpr[0],
-                                           regs->gpr[3], regs->gpr[4],
-                                           regs->gpr[5], regs->gpr[6]);
-               else
+       if (!is_32bit_task())
+               audit_syscall_entry(AUDIT_ARCH_PPC64,
+                                   regs->gpr[0],
+                                   regs->gpr[3], regs->gpr[4],
+                                   regs->gpr[5], regs->gpr[6]);
+       else
 #endif
-                       audit_syscall_entry(AUDIT_ARCH_PPC,
-                                           regs->gpr[0],
-                                           regs->gpr[3] & 0xffffffff,
-                                           regs->gpr[4] & 0xffffffff,
-                                           regs->gpr[5] & 0xffffffff,
-                                           regs->gpr[6] & 0xffffffff);
-       }
+               audit_syscall_entry(AUDIT_ARCH_PPC,
+                                   regs->gpr[0],
+                                   regs->gpr[3] & 0xffffffff,
+                                   regs->gpr[4] & 0xffffffff,
+                                   regs->gpr[5] & 0xffffffff,
+                                   regs->gpr[6] & 0xffffffff);
 
        return ret ?: regs->gpr[0];
 }
@@ -1748,9 +1746,7 @@ void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS,
-                                  regs->result);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->result);
index 56da355..aeb77f0 100644 (file)
@@ -541,9 +541,13 @@ struct user_regs_struct
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define user_stack_pointer(regs)((regs)->gprs[15])
-#define regs_return_value(regs)((regs)->gprs[2])
 #define profile_pc(regs) instruction_pointer(regs)
 
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->gprs[2];
+}
+
 int regs_query_register_offset(const char *name);
 const char *regs_query_register_name(unsigned int offset);
 unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset);
index 573bc29..9d82ed4 100644 (file)
@@ -740,20 +740,17 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->gprs[2]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(is_compat_task() ?
-                                       AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
-                                   regs->gprs[2], regs->orig_gpr2,
-                                   regs->gprs[3], regs->gprs[4],
-                                   regs->gprs[5]);
+       audit_syscall_entry(is_compat_task() ?
+                               AUDIT_ARCH_S390 : AUDIT_ARCH_S390X,
+                           regs->gprs[2], regs->orig_gpr2,
+                           regs->gprs[3], regs->gprs[4],
+                           regs->gprs[5]);
        return ret ?: regs->gprs[2];
 }
 
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
 {
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]),
-                                  regs->gprs[2]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->gprs[2]);
index 6c2239c..2d3e906 100644 (file)
@@ -76,7 +76,10 @@ struct pt_dspregs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET         offsetof(struct pt_regs, tra)
-#define regs_return_value(_regs)       ((_regs)->regs[0])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->regs[0];
+}
 
 #endif /* __KERNEL__ */
 
index bf9be77..eb3fcce 100644 (file)
@@ -13,7 +13,10 @@ struct pt_regs {
 #ifdef __KERNEL__
 
 #define MAX_REG_OFFSET         offsetof(struct pt_regs, tregs[7])
-#define regs_return_value(_regs)       ((_regs)->regs[3])
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->regs[3];
+}
 
 #endif /* __KERNEL__ */
 
index 92b3c27..a3e6515 100644 (file)
@@ -518,10 +518,9 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->regs[0]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[3],
-                                   regs->regs[4], regs->regs[5],
-                                   regs->regs[6], regs->regs[7]);
+       audit_syscall_entry(audit_arch(), regs->regs[3],
+                           regs->regs[4], regs->regs[5],
+                           regs->regs[6], regs->regs[7]);
 
        return ret ?: regs->regs[0];
 }
@@ -530,9 +529,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[0]),
-                                  regs->regs[0]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->regs[0]);
index c8f9764..3d0080b 100644 (file)
@@ -536,10 +536,9 @@ asmlinkage long long do_syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->regs[9]);
 
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(audit_arch(), regs->regs[1],
-                                   regs->regs[2], regs->regs[3],
-                                   regs->regs[4], regs->regs[5]);
+       audit_syscall_entry(audit_arch(), regs->regs[1],
+                           regs->regs[2], regs->regs[3],
+                           regs->regs[4], regs->regs[5]);
 
        return ret ?: regs->regs[9];
 }
@@ -548,9 +547,7 @@ asmlinkage void do_syscall_trace_leave(struct pt_regs *regs)
 {
        int step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->regs[9]),
-                                  regs->regs[9]);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->regs[9]);
index a0e1bcf..c00c3b5 100644 (file)
@@ -207,7 +207,15 @@ do {       current_thread_info()->syscall_noerror = 1; \
 #define instruction_pointer(regs) ((regs)->tpc)
 #define instruction_pointer_set(regs, val) ((regs)->tpc = (val))
 #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP])
-#define regs_return_value(regs) ((regs)->u_regs[UREG_I0])
+static inline int is_syscall_success(struct pt_regs *regs)
+{
+       return !(regs->tstate & (TSTATE_XCARRY | TSTATE_ICARRY));
+}
+
+static inline long regs_return_value(struct pt_regs *regs)
+{
+       return regs->u_regs[UREG_I0];
+}
 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *);
 #else
index 96ee50a..9388844 100644 (file)
@@ -1071,32 +1071,22 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->u_regs[UREG_G1]);
 
-       if (unlikely(current->audit_context) && !ret)
-               audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
-                                    AUDIT_ARCH_SPARC :
-                                    AUDIT_ARCH_SPARC64),
-                                   regs->u_regs[UREG_G1],
-                                   regs->u_regs[UREG_I0],
-                                   regs->u_regs[UREG_I1],
-                                   regs->u_regs[UREG_I2],
-                                   regs->u_regs[UREG_I3]);
+       audit_syscall_entry((test_thread_flag(TIF_32BIT) ?
+                            AUDIT_ARCH_SPARC :
+                            AUDIT_ARCH_SPARC64),
+                           regs->u_regs[UREG_G1],
+                           regs->u_regs[UREG_I0],
+                           regs->u_regs[UREG_I1],
+                           regs->u_regs[UREG_I2],
+                           regs->u_regs[UREG_I3]);
 
        return ret;
 }
 
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-#ifdef CONFIG_AUDITSYSCALL
-       if (unlikely(current->audit_context)) {
-               unsigned long tstate = regs->tstate;
-               int result = AUDITSC_SUCCESS;
+       audit_syscall_exit(regs);
 
-               if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
-                       result = AUDITSC_FAILURE;
-
-               audit_syscall_exit(result, regs->u_regs[UREG_I0]);
-       }
-#endif
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->u_regs[UREG_G1]);
 
index c9da32b..06b1903 100644 (file)
@@ -167,17 +167,15 @@ void syscall_trace(struct uml_pt_regs *regs, int entryexit)
        int is_singlestep = (current->ptrace & PT_DTRACE) && entryexit;
        int tracesysgood;
 
-       if (unlikely(current->audit_context)) {
-               if (!entryexit)
-                       audit_syscall_entry(HOST_AUDIT_ARCH,
-                                           UPT_SYSCALL_NR(regs),
-                                           UPT_SYSCALL_ARG1(regs),
-                                           UPT_SYSCALL_ARG2(regs),
-                                           UPT_SYSCALL_ARG3(regs),
-                                           UPT_SYSCALL_ARG4(regs));
-               else audit_syscall_exit(AUDITSC_RESULT(UPT_SYSCALL_RET(regs)),
-                                       UPT_SYSCALL_RET(regs));
-       }
+       if (!entryexit)
+               audit_syscall_entry(HOST_AUDIT_ARCH,
+                                   UPT_SYSCALL_NR(regs),
+                                   UPT_SYSCALL_ARG1(regs),
+                                   UPT_SYSCALL_ARG2(regs),
+                                   UPT_SYSCALL_ARG3(regs),
+                                   UPT_SYSCALL_ARG4(regs));
+       else
+               audit_syscall_exit(regs);
 
        /* Fake a debug trap */
        if (is_singlestep)
index 1106261..e3e7340 100644 (file)
@@ -14,6 +14,7 @@
 #include <asm/segment.h>
 #include <asm/irqflags.h>
 #include <linux/linkage.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -189,7 +190,7 @@ sysexit_from_sys_call:
        movl %ebx,%edx                  /* 3rd arg: 1st syscall arg */
        movl %eax,%esi                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_I386,%edi      /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall number */
        cmpq $(IA32_NR_syscalls-1),%rax
        ja ia32_badsys
@@ -206,12 +207,13 @@ sysexit_from_sys_call:
        TRACE_IRQS_ON
        sti
        movl %eax,%esi          /* second arg, syscall return value */
-       cmpl $0,%eax            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpl $-MAX_ERRNO,%eax   /* is it an error? */
+       jbe 1f
+       movslq %eax, %rsi       /* if error sign extend to 64 bits */
+1:     setbe %al               /* 1 if error, 0 if not */
        movzbl %al,%edi         /* zero-extend that into %edi */
-       inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
-       movl RAX-ARGOFFSET(%rsp),%eax   /* reload syscall return value */
+       call __audit_syscall_exit
+       movq RAX-ARGOFFSET(%rsp),%rax   /* reload syscall return value */
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        cli
        TRACE_IRQS_OFF
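
The sysexit audit change above replaces a plain sign test with a check
against the errno window; in C terms the new test is equivalent to this
sketch:

    #include <linux/err.h>

    /* A syscall failed only if its return value lies in [-MAX_ERRNO, -1];
     * large "negative" values such as mmap() return addresses are
     * successes and must not be reported as errors.
     */
    static inline int syscall_failed(long ret)
    {
            return IS_ERR_VALUE((unsigned long)ret);
    }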
index 4af9fd2..79d97e6 100644 (file)
@@ -42,6 +42,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/err.h>
 #include <asm/thread_info.h>
 #include <asm/irqflags.h>
 #include <asm/errno.h>
@@ -453,7 +454,7 @@ sysenter_audit:
        movl %ebx,%ecx                  /* 3rd arg: 1st syscall arg */
        movl %eax,%edx                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_I386,%eax      /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        pushl_cfi %ebx
        movl PT_EAX(%esp),%eax          /* reload syscall number */
        jmp sysenter_do_call
@@ -464,11 +465,10 @@ sysexit_audit:
        TRACE_IRQS_ON
        ENABLE_INTERRUPTS(CLBR_ANY)
        movl %eax,%edx          /* second arg, syscall return value */
-       cmpl $0,%eax            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpl $-MAX_ERRNO,%eax   /* is it an error ? */
+       setbe %al               /* 1 if so, 0 if not */
        movzbl %al,%eax         /* zero-extend that */
-       inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
+       call __audit_syscall_exit
        DISABLE_INTERRUPTS(CLBR_ANY)
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
index 940ba71..3fe8239 100644 (file)
@@ -55,6 +55,7 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 #include <asm/percpu.h>
+#include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
@@ -548,7 +549,7 @@ badsys:
 #ifdef CONFIG_AUDITSYSCALL
        /*
         * Fast path for syscall audit without full syscall trace.
-        * We just call audit_syscall_entry() directly, and then
+        * We just call __audit_syscall_entry() directly, and then
         * jump back to the normal fast path.
         */
 auditsys:
@@ -558,22 +559,21 @@ auditsys:
        movq %rdi,%rdx                  /* 3rd arg: 1st syscall arg */
        movq %rax,%rsi                  /* 2nd arg: syscall number */
        movl $AUDIT_ARCH_X86_64,%edi    /* 1st arg: audit arch */
-       call audit_syscall_entry
+       call __audit_syscall_entry
        LOAD_ARGS 0             /* reload call-clobbered registers */
        jmp system_call_fastpath
 
        /*
-        * Return fast path for syscall audit.  Call audit_syscall_exit()
+        * Return fast path for syscall audit.  Call __audit_syscall_exit()
         * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
         * masked off.
         */
 sysret_audit:
        movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
-       cmpq $0,%rsi            /* is it < 0? */
-       setl %al                /* 1 if so, 0 if not */
+       cmpq $-MAX_ERRNO,%rsi   /* is it < -MAX_ERRNO? */
+       setbe %al               /* 1 if so, 0 if not */
        movzbl %al,%edi         /* zero-extend that into %edi */
-       inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
-       call audit_syscall_exit
+       call __audit_syscall_exit
        movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
        jmp sysret_check
 #endif /* CONFIG_AUDITSYSCALL */
index 89a04c7..5026738 100644 (file)
@@ -1392,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_enter(regs, regs->orig_ax);
 
-       if (unlikely(current->audit_context)) {
-               if (IS_IA32)
-                       audit_syscall_entry(AUDIT_ARCH_I386,
-                                           regs->orig_ax,
-                                           regs->bx, regs->cx,
-                                           regs->dx, regs->si);
+       if (IS_IA32)
+               audit_syscall_entry(AUDIT_ARCH_I386,
+                                   regs->orig_ax,
+                                   regs->bx, regs->cx,
+                                   regs->dx, regs->si);
 #ifdef CONFIG_X86_64
-               else
-                       audit_syscall_entry(AUDIT_ARCH_X86_64,
-                                           regs->orig_ax,
-                                           regs->di, regs->si,
-                                           regs->dx, regs->r10);
+       else
+               audit_syscall_entry(AUDIT_ARCH_X86_64,
+                                   regs->orig_ax,
+                                   regs->di, regs->si,
+                                   regs->dx, regs->r10);
 #endif
-       }
 
        return ret ?: regs->orig_ax;
 }
@@ -1414,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
 {
        bool step;
 
-       if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
+       audit_syscall_exit(regs);
 
        if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
                trace_sys_exit(regs, regs->ax);
index 863f875..b466cab 100644 (file)
@@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        if (info->flags & VM86_SCREEN_BITMAP)
                mark_screen_rdonly(tsk->mm);
 
-       /*call audit_syscall_exit since we do not exit via the normal paths */
+       /*call __audit_syscall_exit since we do not exit via the normal paths */
+#ifdef CONFIG_AUDITSYSCALL
        if (unlikely(current->audit_context))
-               audit_syscall_exit(AUDITSC_RESULT(0), 0);
+               __audit_syscall_exit(1, 0);
+#endif
 
        __asm__ __volatile__(
                "movl %0,%%esp\n\t"
index 711b162..5ef9344 100644 (file)
@@ -3,3 +3,8 @@
 #else
 #include "ptrace_64.h"
 #endif
+
+static inline long regs_return_value(struct uml_pt_regs *regs)
+{
+       return UPT_SYSCALL_RET(regs);
+}
index a0d042a..2dff698 100644 (file)
@@ -334,8 +334,7 @@ void do_syscall_trace_enter(struct pt_regs *regs)
                do_syscall_trace();
 
 #if 0
-       if (unlikely(current->audit_context))
-               audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
+       audit_syscall_entry(current, AUDIT_ARCH_XTENSA..);
 #endif
 }
 
index 163263d..ee55019 100644 (file)
@@ -3117,18 +3117,17 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-       struct cfq_queue *old_cfqq = cfqd->active_queue;
-
        cfq_log_cfqq(cfqd, cfqq, "preempt");
-       cfq_slice_expired(cfqd, 1);
 
        /*
         * workload type is changed, don't save slice, otherwise preempt
         * doesn't happen
         */
-       if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+       if (cfqq_type(cfqd->active_queue) != cfqq_type(cfqq))
                cfqq->cfqg->saved_workload_slice = 0;
 
+       cfq_slice_expired(cfqd, 1);
+
        /*
         * Put the new queue at the front of the current list,
         * so we know that it will be selected next.
index 69ac373..fdf27b9 100644 (file)
@@ -1117,6 +1117,13 @@ static int piix_broken_suspend(void)
                        },
                },
                {
+                       .ident = "Satellite Pro A120",
+                       .matches = {
+                               DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+                               DMI_MATCH(DMI_PRODUCT_NAME, "Satellite Pro A120"),
+                       },
+               },
+               {
                        .ident = "Portege M500",
                        .matches = {
                                DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
index 11c9aea..c06e0ec 100644 (file)
@@ -4125,6 +4125,8 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
         * device and controller are SATA.
         */
        { "PIONEER DVD-RW  DVRTD08",    NULL,   ATA_HORKAGE_NOSETXFER },
+       { "PIONEER DVD-RW  DVRTD08A",   NULL,   ATA_HORKAGE_NOSETXFER },
+       { "PIONEER DVD-RW  DVR-215",    NULL,   ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVR-212D",   NULL,   ATA_HORKAGE_NOSETXFER },
        { "PIONEER DVD-RW  DVR-216D",   NULL,   ATA_HORKAGE_NOSETXFER },
 
index 9a7f0ea..74aaee3 100644 (file)
@@ -291,6 +291,7 @@ int ata_tport_add(struct device *parent,
                goto tport_err;
        }
 
+       device_enable_async_suspend(dev);
        pm_runtime_set_active(dev);
        pm_runtime_enable(dev);
 
index d6a4677..1e65842 100644 (file)
@@ -251,6 +251,8 @@ static const u32 udma_tenvmin = 20;
 static const u32 udma_tackmin = 20;
 static const u32 udma_tssmin = 50;
 
+#define BFIN_MAX_SG_SEGMENTS 4
+
 /**
  *
  *     Function:       num_clocks_min
@@ -829,79 +831,61 @@ static void bfin_set_devctl(struct ata_port *ap, u8 ctl)
 
 static void bfin_bmdma_setup(struct ata_queued_cmd *qc)
 {
-       unsigned short config = WDSIZE_16;
+       struct ata_port *ap = qc->ap;
+       struct dma_desc_array *dma_desc_cpu = (struct dma_desc_array *)ap->bmdma_prd;
+       void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr;
+       unsigned short config = DMAFLOW_ARRAY | NDSIZE_5 | RESTART | WDSIZE_16 | DMAEN;
        struct scatterlist *sg;
        unsigned int si;
+       unsigned int channel;
+       unsigned int dir;
+       unsigned int size = 0;
 
        dev_dbg(qc->ap->dev, "in atapi dma setup\n");
        /* Program the ATA_CTRL register with dir */
        if (qc->tf.flags & ATA_TFLAG_WRITE) {
-               /* fill the ATAPI DMA controller */
-               set_dma_config(CH_ATAPI_TX, config);
-               set_dma_x_modify(CH_ATAPI_TX, 2);
-               for_each_sg(qc->sg, sg, qc->n_elem, si) {
-                       set_dma_start_addr(CH_ATAPI_TX, sg_dma_address(sg));
-                       set_dma_x_count(CH_ATAPI_TX, sg_dma_len(sg) >> 1);
-               }
+               channel = CH_ATAPI_TX;
+               dir = DMA_TO_DEVICE;
        } else {
+               channel = CH_ATAPI_RX;
+               dir = DMA_FROM_DEVICE;
                config |= WNR;
-               /* fill the ATAPI DMA controller */
-               set_dma_config(CH_ATAPI_RX, config);
-               set_dma_x_modify(CH_ATAPI_RX, 2);
-               for_each_sg(qc->sg, sg, qc->n_elem, si) {
-                       set_dma_start_addr(CH_ATAPI_RX, sg_dma_address(sg));
-                       set_dma_x_count(CH_ATAPI_RX, sg_dma_len(sg) >> 1);
-               }
        }
-}
 
-/**
- *     bfin_bmdma_start - Start an IDE DMA transaction
- *     @qc: Info associated with this ATA transaction.
- *
- *     Note: Original code is ata_bmdma_start().
- */
+       dma_map_sg(ap->dev, qc->sg, qc->n_elem, dir);
 
-static void bfin_bmdma_start(struct ata_queued_cmd *qc)
-{
-       struct ata_port *ap = qc->ap;
-       void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr;
-       struct scatterlist *sg;
-       unsigned int si;
+       /* fill the ATAPI DMA controller */
+       for_each_sg(qc->sg, sg, qc->n_elem, si) {
+               dma_desc_cpu[si].start_addr = sg_dma_address(sg);
+               dma_desc_cpu[si].cfg = config;
+               dma_desc_cpu[si].x_count = sg_dma_len(sg) >> 1;
+               dma_desc_cpu[si].x_modify = 2;
+               size += sg_dma_len(sg);
+       }
 
-       dev_dbg(qc->ap->dev, "in atapi dma start\n");
-       if (!(ap->udma_mask || ap->mwdma_mask))
-               return;
+       /* Set the last descriptor to stop mode */
+       dma_desc_cpu[qc->n_elem - 1].cfg &= ~(DMAFLOW | NDSIZE);
 
-       /* start ATAPI DMA controller*/
-       if (qc->tf.flags & ATA_TFLAG_WRITE) {
-               /*
-                * On blackfin arch, uncacheable memory is not
-                * allocated with flag GFP_DMA. DMA buffer from
-                * common kenel code should be flushed if WB
-                * data cache is enabled. Otherwise, this loop
-                * is an empty loop and optimized out.
-                */
-               for_each_sg(qc->sg, sg, qc->n_elem, si) {
-                       flush_dcache_range(sg_dma_address(sg),
-                               sg_dma_address(sg) + sg_dma_len(sg));
-               }
-               enable_dma(CH_ATAPI_TX);
-               dev_dbg(qc->ap->dev, "enable udma write\n");
+       flush_dcache_range((unsigned int)dma_desc_cpu,
+               (unsigned int)dma_desc_cpu +
+                       qc->n_elem * sizeof(struct dma_desc_array));
 
-               /* Send ATA DMA write command */
-               bfin_exec_command(ap, &qc->tf);
+       /* Enable ATA DMA operation*/
+       set_dma_curr_desc_addr(channel, (unsigned long *)ap->bmdma_prd_dma);
+       set_dma_x_count(channel, 0);
+       set_dma_x_modify(channel, 0);
+       set_dma_config(channel, config);
+
+       SSYNC();
+
+       /* Send ATA DMA command */
+       bfin_exec_command(ap, &qc->tf);
 
+       if (qc->tf.flags & ATA_TFLAG_WRITE) {
                /* set ATA DMA write direction */
                ATAPI_SET_CONTROL(base, (ATAPI_GET_CONTROL(base)
                        | XFER_DIR));
        } else {
-               enable_dma(CH_ATAPI_RX);
-               dev_dbg(qc->ap->dev, "enable udma read\n");
-
-               /* Send ATA DMA read command */
-               bfin_exec_command(ap, &qc->tf);
-
                /* set ATA DMA read direction */
                ATAPI_SET_CONTROL(base, (ATAPI_GET_CONTROL(base)
                        & ~XFER_DIR));
@@ -913,12 +897,28 @@ static void bfin_bmdma_start(struct ata_queued_cmd *qc)
        /* Set ATAPI state machine control in terminate sequence */
        ATAPI_SET_CONTROL(base, ATAPI_GET_CONTROL(base) | END_ON_TERM);
 
-       /* Set transfer length to buffer len */
-       for_each_sg(qc->sg, sg, qc->n_elem, si) {
-               ATAPI_SET_XFER_LEN(base, (sg_dma_len(sg) >> 1));
-       }
+       /* Set transfer length to the total size of sg buffers */
+       ATAPI_SET_XFER_LEN(base, size >> 1);
+}
 
-       /* Enable ATA DMA operation*/
+/**
+ *     bfin_bmdma_start - Start an IDE DMA transaction
+ *     @qc: Info associated with this ATA transaction.
+ *
+ *     Note: Original code is ata_bmdma_start().
+ */
+
+static void bfin_bmdma_start(struct ata_queued_cmd *qc)
+{
+       struct ata_port *ap = qc->ap;
+       void __iomem *base = (void __iomem *)ap->ioaddr.ctl_addr;
+
+       dev_dbg(qc->ap->dev, "in atapi dma start\n");
+
+       if (!(ap->udma_mask || ap->mwdma_mask))
+               return;
+
+       /* start ATAPI transfer*/
        if (ap->udma_mask)
                ATAPI_SET_CONTROL(base, ATAPI_GET_CONTROL(base)
                        | ULTRA_START);
@@ -935,34 +935,23 @@ static void bfin_bmdma_start(struct ata_queued_cmd *qc)
 static void bfin_bmdma_stop(struct ata_queued_cmd *qc)
 {
        struct ata_port *ap = qc->ap;
-       struct scatterlist *sg;
-       unsigned int si;
+       unsigned int dir;
 
        dev_dbg(qc->ap->dev, "in atapi dma stop\n");
+
        if (!(ap->udma_mask || ap->mwdma_mask))
                return;
 
        /* stop ATAPI DMA controller*/
-       if (qc->tf.flags & ATA_TFLAG_WRITE)
+       if (qc->tf.flags & ATA_TFLAG_WRITE) {
+               dir = DMA_TO_DEVICE;
                disable_dma(CH_ATAPI_TX);
-       else {
+       } else {
+               dir = DMA_FROM_DEVICE;
                disable_dma(CH_ATAPI_RX);
-               if (ap->hsm_task_state & HSM_ST_LAST) {
-                       /*
-                        * On blackfin arch, uncacheable memory is not
-                        * allocated with flag GFP_DMA. DMA buffer from
-                        * common kenel code should be invalidated if
-                        * data cache is enabled. Otherwise, this loop
-                        * is an empty loop and optimized out.
-                        */
-                       for_each_sg(qc->sg, sg, qc->n_elem, si) {
-                               invalidate_dcache_range(
-                                       sg_dma_address(sg),
-                                       sg_dma_address(sg)
-                                       + sg_dma_len(sg));
-                       }
-               }
        }
+
+       dma_unmap_sg(ap->dev, qc->sg, qc->n_elem, dir);
 }
 
 /**
@@ -1260,6 +1249,11 @@ static void bfin_port_stop(struct ata_port *ap)
 {
        dev_dbg(ap->dev, "in atapi port stop\n");
        if (ap->udma_mask != 0 || ap->mwdma_mask != 0) {
+               dma_free_coherent(ap->dev,
+                       BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array),
+                       ap->bmdma_prd,
+                       ap->bmdma_prd_dma);
+
                free_dma(CH_ATAPI_RX);
                free_dma(CH_ATAPI_TX);
        }
@@ -1271,14 +1265,29 @@ static int bfin_port_start(struct ata_port *ap)
        if (!(ap->udma_mask || ap->mwdma_mask))
                return 0;
 
+       ap->bmdma_prd = dma_alloc_coherent(ap->dev,
+                               BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array),
+                               &ap->bmdma_prd_dma,
+                               GFP_KERNEL);
+
+       if (ap->bmdma_prd == NULL) {
+               dev_info(ap->dev, "Unable to allocate DMA descriptor array.\n");
+               goto out;
+       }
+
        if (request_dma(CH_ATAPI_RX, "BFIN ATAPI RX DMA") >= 0) {
                if (request_dma(CH_ATAPI_TX,
                        "BFIN ATAPI TX DMA") >= 0)
                        return 0;
 
                free_dma(CH_ATAPI_RX);
+               dma_free_coherent(ap->dev,
+                       BFIN_MAX_SG_SEGMENTS * sizeof(struct dma_desc_array),
+                       ap->bmdma_prd,
+                       ap->bmdma_prd_dma);
        }
 
+out:
        ap->udma_mask = 0;
        ap->mwdma_mask = 0;
        dev_err(ap->dev, "Unable to request ATAPI DMA!"
@@ -1400,7 +1409,7 @@ static irqreturn_t bfin_ata_interrupt(int irq, void *dev_instance)
 
 static struct scsi_host_template bfin_sht = {
        ATA_BASE_SHT(DRV_NAME),
-       .sg_tablesize           = SG_NONE,
+       .sg_tablesize           = BFIN_MAX_SG_SEGMENTS,
        .dma_boundary           = ATA_DMA_BOUNDARY,
 };
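
With SG_NONE gone, bfin_port_start()/bfin_port_stop() above bracket a
coherent descriptor-array allocation around the two channel requests, and
each failure path releases exactly what was already acquired before the
driver falls back to PIO. A standalone sketch of that unwind shape, with
malloc() standing in for dma_alloc_coherent() and always-succeeding stubs
for request_dma()/free_dma():

    #include <stdlib.h>

    static int request_dma(const char *name) { (void)name; return 0; }
    static void free_dma(const char *name)   { (void)name; }

    static int port_start_demo(void **prd_out)
    {
            void *prd = malloc(64);   /* stands in for dma_alloc_coherent() */
            if (!prd)
                    goto out;

            if (request_dma("RX") >= 0) {
                    if (request_dma("TX") >= 0) {
                            *prd_out = prd;  /* success: keep PRD + channels */
                            return 0;
                    }
                    free_dma("RX");          /* TX failed: drop RX ... */
            }
            free(prd);                       /* ... and the PRD table */
    out:
            return -1;                       /* caller clears the DMA masks */
    }

    int main(void)
    {
            void *prd = NULL;
            return port_start_demo(&prd) ? 1 : 0;
    }
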
 
index 5a2c95b..0120b0d 100644 (file)
@@ -140,6 +140,7 @@ enum {
         */
        HCONTROL_ONLINE_PHY_RST = (1 << 31),
        HCONTROL_FORCE_OFFLINE = (1 << 30),
+       HCONTROL_LEGACY = (1 << 28),
        HCONTROL_PARITY_PROT_MOD = (1 << 14),
        HCONTROL_DPATH_PARITY = (1 << 12),
        HCONTROL_SNOOP_ENABLE = (1 << 10),
@@ -1223,6 +1224,10 @@ static int sata_fsl_init_controller(struct ata_host *host)
         * part of the port_start() callback
         */
 
+       /* set the SATA controller to operate in enterprise mode */
+       temp = ioread32(hcr_base + HCONTROL);
+       iowrite32(temp & ~HCONTROL_LEGACY, hcr_base + HCONTROL);
+
        /* ack. any pending IRQs for this controller/port */
        temp = ioread32(hcr_base + HSTATUS);
        if (temp & 0x3F)
@@ -1421,6 +1426,12 @@ static int sata_fsl_resume(struct platform_device *op)
        /* Recover the CHBA register in the host controller cmd register set */
        iowrite32(pp->cmdslot_paddr & 0xffffffff, hcr_base + CHBA);
 
+       iowrite32((ioread32(hcr_base + HCONTROL)
+                               | HCONTROL_ONLINE_PHY_RST
+                               | HCONTROL_SNOOP_ENABLE
+                               | HCONTROL_PMP_ATTACHED),
+                       hcr_base + HCONTROL);
+
        ata_host_resume(host);
        return 0;
 }
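
Both hunks are read-modify-write updates of HCONTROL: controller init clears
the new LEGACY bit so the part runs in enterprise mode, and the resume path
re-asserts bits that suspend may have cleared. A standalone sketch using the
bit positions declared in this diff (HCONTROL_PMP_ATTACHED is omitted because
its value is not shown here):

    #include <stdint.h>
    #include <stdio.h>

    #define HCONTROL_ONLINE_PHY_RST (1u << 31)
    #define HCONTROL_LEGACY         (1u << 28)
    #define HCONTROL_SNOOP_ENABLE   (1u << 10)

    int main(void)
    {
            uint32_t reg = HCONTROL_LEGACY;   /* pretend ioread32(HCONTROL) */

            reg &= ~HCONTROL_LEGACY;          /* init: enterprise mode */
            reg |= HCONTROL_ONLINE_PHY_RST    /* resume: re-assert bits */
                 | HCONTROL_SNOOP_ENABLE;

            printf("HCONTROL = 0x%08x\n", reg);  /* pretend iowrite32() */
            return 0;
    }
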
index 5a99bb3..f1a2749 100644 (file)
@@ -124,7 +124,7 @@ config MV_XOR
 
 config MX3_IPU
        bool "MX3x Image Processing Unit support"
-       depends on SOC_IMX31 || SOC_IMX35
+       depends on ARCH_MXC
        select DMA_ENGINE
        default y
        help
@@ -187,6 +187,13 @@ config TIMB_DMA
        help
          Enable support for the Timberdale FPGA DMA engine.
 
+config SIRF_DMA
+       tristate "CSR SiRFprimaII DMA support"
+       depends on ARCH_PRIMA2
+       select DMA_ENGINE
+       help
+         Enable support for the CSR SiRFprimaII DMA engine.
+
 config ARCH_HAS_ASYNC_TX_FIND_CHANNEL
        bool
 
@@ -201,26 +208,26 @@ config PL330_DMA
          platform_data for a dma-pl330 device.
 
 config PCH_DMA
-       tristate "Intel EG20T PCH / OKI Semi IOH(ML7213/ML7223) DMA support"
+       tristate "Intel EG20T PCH / LAPIS Semicon IOH(ML7213/ML7223/ML7831) DMA"
        depends on PCI && X86
        select DMA_ENGINE
        help
          Enable support for Intel EG20T PCH DMA engine.
 
-         This driver also can be used for OKI SEMICONDUCTOR IOH(Input/
-         Output Hub), ML7213 and ML7223.
-         ML7213 IOH is for IVI(In-Vehicle Infotainment) use and ML7223 IOH is
-         for MP(Media Phone) use.
-         ML7213/ML7223 is companion chip for Intel Atom E6xx series.
-         ML7213/ML7223 is completely compatible for Intel EG20T PCH.
+         This driver can also be used for the LAPIS Semiconductor IOH
+         (Input/Output Hub) devices ML7213, ML7223 and ML7831.
+         ML7213 IOH is for IVI (In-Vehicle Infotainment) use, ML7223 IOH is
+         for MP (Media Phone) use and ML7831 IOH is for general purpose use.
+         ML7213/ML7223/ML7831 are companion chips for the Intel Atom E6xx
+         series and are fully compatible with the Intel EG20T PCH.
 
 config IMX_SDMA
        tristate "i.MX SDMA support"
-       depends on ARCH_MX25 || SOC_IMX31 || SOC_IMX35 || ARCH_MX5
+       depends on ARCH_MXC
        select DMA_ENGINE
        help
          Support the i.MX SDMA engine. This engine is integrated into
-         Freescale i.MX25/31/35/51 chips.
+         Freescale i.MX25/31/35/51/53 chips.
 
 config IMX_DMA
        tristate "i.MX DMA support"
index 30cf3b1..009a222 100644 (file)
@@ -21,6 +21,7 @@ obj-$(CONFIG_IMX_SDMA) += imx-sdma.o
 obj-$(CONFIG_IMX_DMA) += imx-dma.o
 obj-$(CONFIG_MXS_DMA) += mxs-dma.o
 obj-$(CONFIG_TIMB_DMA) += timb_dma.o
+obj-$(CONFIG_SIRF_DMA) += sirf-dma.o
 obj-$(CONFIG_STE_DMA40) += ste_dma40.o ste_dma40_ll.o
 obj-$(CONFIG_PL330_DMA) += pl330.o
 obj-$(CONFIG_PCH_DMA) += pch_dma.o
index 0698695..8a28158 100644 (file)
@@ -854,8 +854,10 @@ static int prep_phy_channel(struct pl08x_dma_chan *plchan,
        int ret;
 
        /* Check if we already have a channel */
-       if (plchan->phychan)
-               return 0;
+       if (plchan->phychan) {
+               ch = plchan->phychan;
+               goto got_channel;
+       }
 
        ch = pl08x_get_phy_channel(pl08x, plchan);
        if (!ch) {
@@ -880,21 +882,22 @@ static int prep_phy_channel(struct pl08x_dma_chan *plchan,
                        return -EBUSY;
                }
                ch->signal = ret;
-
-               /* Assign the flow control signal to this channel */
-               if (txd->direction == DMA_TO_DEVICE)
-                       txd->ccfg |= ch->signal << PL080_CONFIG_DST_SEL_SHIFT;
-               else if (txd->direction == DMA_FROM_DEVICE)
-                       txd->ccfg |= ch->signal << PL080_CONFIG_SRC_SEL_SHIFT;
        }
 
+       plchan->phychan = ch;
        dev_dbg(&pl08x->adev->dev, "allocated physical channel %d and signal %d for xfer on %s\n",
                 ch->id,
                 ch->signal,
                 plchan->name);
 
+got_channel:
+       /* Assign the flow control signal to this channel */
+       if (txd->direction == DMA_MEM_TO_DEV)
+               txd->ccfg |= ch->signal << PL080_CONFIG_DST_SEL_SHIFT;
+       else if (txd->direction == DMA_DEV_TO_MEM)
+               txd->ccfg |= ch->signal << PL080_CONFIG_SRC_SEL_SHIFT;
+
        plchan->phychan_hold++;
-       plchan->phychan = ch;
 
        return 0;
 }
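
The fix above routes both the fresh-allocation and the already-held cases
through a single got_channel label, so the flow-control mux signal is written
into ccfg exactly once on either path (the old early return skipped it when a
channel was being reused). A standalone sketch of that shape, with
illustrative names and a one-slot pool standing in for pl08x_get_phy_channel():

    #include <stdio.h>

    struct phychan { int signal; };
    struct virtchan { struct phychan *phychan; unsigned int ccfg; };

    static struct phychan pool_slot = { .signal = 3 };

    static int prep_phy_channel_demo(struct virtchan *vc)
    {
            struct phychan *ch;

            if (vc->phychan) {
                    ch = vc->phychan;   /* already hold one: reuse it */
                    goto got_channel;
            }

            ch = &pool_slot;            /* stands in for the real allocator */
            vc->phychan = ch;

    got_channel:
            /* the mux signal is applied on both paths */
            vc->ccfg |= (unsigned int)ch->signal << 1;
            return 0;
    }

    int main(void)
    {
            struct virtchan vc = { 0 };
            prep_phy_channel_demo(&vc);   /* allocate path */
            prep_phy_channel_demo(&vc);   /* reuse path */
            printf("ccfg = 0x%x\n", vc.ccfg);
            return 0;
    }
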
@@ -1102,10 +1105,10 @@ static int dma_set_runtime_config(struct dma_chan *chan,
 
        /* Transfer direction */
        plchan->runtime_direction = config->direction;
-       if (config->direction == DMA_TO_DEVICE) {
+       if (config->direction == DMA_MEM_TO_DEV) {
                addr_width = config->dst_addr_width;
                maxburst = config->dst_maxburst;
-       } else if (config->direction == DMA_FROM_DEVICE) {
+       } else if (config->direction == DMA_DEV_TO_MEM) {
                addr_width = config->src_addr_width;
                maxburst = config->src_maxburst;
        } else {
@@ -1136,7 +1139,7 @@ static int dma_set_runtime_config(struct dma_chan *chan,
        cctl |= burst << PL080_CONTROL_SB_SIZE_SHIFT;
        cctl |= burst << PL080_CONTROL_DB_SIZE_SHIFT;
 
-       if (plchan->runtime_direction == DMA_FROM_DEVICE) {
+       if (plchan->runtime_direction == DMA_DEV_TO_MEM) {
                plchan->src_addr = config->src_addr;
                plchan->src_cctl = pl08x_cctl(cctl) | PL080_CONTROL_DST_INCR |
                        pl08x_select_bus(plchan->cd->periph_buses,
@@ -1152,7 +1155,7 @@ static int dma_set_runtime_config(struct dma_chan *chan,
                "configured channel %s (%s) for %s, data width %d, "
                "maxburst %d words, LE, CCTL=0x%08x\n",
                dma_chan_name(chan), plchan->name,
-               (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX",
+               (config->direction == DMA_DEV_TO_MEM) ? "RX" : "TX",
                addr_width,
                maxburst,
                cctl);
@@ -1322,7 +1325,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_dma_memcpy(
 
 static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
                struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct pl08x_dma_chan *plchan = to_pl08x_chan(chan);
@@ -1354,10 +1357,10 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
         */
        txd->direction = direction;
 
-       if (direction == DMA_TO_DEVICE) {
+       if (direction == DMA_MEM_TO_DEV) {
                txd->cctl = plchan->dst_cctl;
                slave_addr = plchan->dst_addr;
-       } else if (direction == DMA_FROM_DEVICE) {
+       } else if (direction == DMA_DEV_TO_MEM) {
                txd->cctl = plchan->src_cctl;
                slave_addr = plchan->src_addr;
        } else {
@@ -1368,10 +1371,10 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
        }
 
        if (plchan->cd->device_fc)
-               tmp = (direction == DMA_TO_DEVICE) ? PL080_FLOW_MEM2PER_PER :
+               tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER_PER :
                        PL080_FLOW_PER2MEM_PER;
        else
-               tmp = (direction == DMA_TO_DEVICE) ? PL080_FLOW_MEM2PER :
+               tmp = (direction == DMA_MEM_TO_DEV) ? PL080_FLOW_MEM2PER :
                        PL080_FLOW_PER2MEM;
 
        txd->ccfg |= tmp << PL080_CONFIG_FLOW_CONTROL_SHIFT;
@@ -1387,7 +1390,7 @@ static struct dma_async_tx_descriptor *pl08x_prep_slave_sg(
                list_add_tail(&dsg->node, &txd->dsg_list);
 
                dsg->len = sg_dma_len(sg);
-               if (direction == DMA_TO_DEVICE) {
+               if (direction == DMA_MEM_TO_DEV) {
                        dsg->src_addr = sg_phys(sg);
                        dsg->dst_addr = slave_addr;
                } else {
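
Running through this and the following drivers is the rename from enum
dma_data_direction to the dmaengine-specific enum dma_transfer_direction:
DMA_TO_DEVICE becomes DMA_MEM_TO_DEV and DMA_FROM_DEVICE becomes
DMA_DEV_TO_MEM. A standalone sketch of the usual direction switch in a
slave-sg prep routine (enum redefined locally; the kernel's version also
carries DEV_TO_DEV and NONE members):

    #include <stdio.h>

    enum dma_transfer_direction {   /* local copy for a standalone build */
            DMA_MEM_TO_MEM,
            DMA_MEM_TO_DEV,
            DMA_DEV_TO_MEM,
    };

    static const char *slave_role(enum dma_transfer_direction dir)
    {
            switch (dir) {
            case DMA_MEM_TO_DEV:
                    return "TX: source walks memory, dest is the FIFO";
            case DMA_DEV_TO_MEM:
                    return "RX: source is the FIFO, dest walks memory";
            default:
                    return "rejected by prep_slave_sg";
            }
    }

    int main(void)
    {
            printf("%s\n", slave_role(DMA_MEM_TO_DEV));
            printf("%s\n", slave_role(DMA_DEV_TO_MEM));
            return 0;
    }
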
index fcfa0a8..97f87b2 100644 (file)
@@ -23,6 +23,8 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
 
 #include "at_hdmac_regs.h"
 
@@ -660,7 +662,7 @@ err_desc_get:
  */
 static struct dma_async_tx_descriptor *
 atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct at_dma_chan      *atchan = to_at_dma_chan(chan);
@@ -678,7 +680,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
        dev_vdbg(chan2dev(chan), "prep_slave_sg (%d): %s f0x%lx\n",
                        sg_len,
-                       direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE",
+                       direction == DMA_MEM_TO_DEV ? "TO DEVICE" : "FROM DEVICE",
                        flags);
 
        if (unlikely(!atslave || !sg_len)) {
@@ -692,7 +694,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
        ctrlb = ATC_IEN;
 
        switch (direction) {
-       case DMA_TO_DEVICE:
+       case DMA_MEM_TO_DEV:
                ctrla |=  ATC_DST_WIDTH(reg_width);
                ctrlb |=  ATC_DST_ADDR_MODE_FIXED
                        | ATC_SRC_ADDR_MODE_INCR
@@ -725,7 +727,7 @@ atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                        total_len += len;
                }
                break;
-       case DMA_FROM_DEVICE:
+       case DMA_DEV_TO_MEM:
                ctrla |=  ATC_SRC_WIDTH(reg_width);
                ctrlb |=  ATC_DST_ADDR_MODE_INCR
                        | ATC_SRC_ADDR_MODE_FIXED
@@ -787,7 +789,7 @@ err_desc_get:
  */
 static int
 atc_dma_cyclic_check_values(unsigned int reg_width, dma_addr_t buf_addr,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        if (period_len > (ATC_BTSIZE_MAX << reg_width))
                goto err_out;
@@ -795,7 +797,7 @@ atc_dma_cyclic_check_values(unsigned int reg_width, dma_addr_t buf_addr,
                goto err_out;
        if (unlikely(buf_addr & ((1 << reg_width) - 1)))
                goto err_out;
-       if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE))))
+       if (unlikely(!(direction & (DMA_DEV_TO_MEM | DMA_MEM_TO_DEV))))
                goto err_out;
 
        return 0;
@@ -810,7 +812,7 @@ err_out:
 static int
 atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
                unsigned int period_index, dma_addr_t buf_addr,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        u32             ctrla;
        unsigned int    reg_width = atslave->reg_width;
@@ -822,7 +824,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
                | period_len >> reg_width;
 
        switch (direction) {
-       case DMA_TO_DEVICE:
+       case DMA_MEM_TO_DEV:
                desc->lli.saddr = buf_addr + (period_len * period_index);
                desc->lli.daddr = atslave->tx_reg;
                desc->lli.ctrla = ctrla;
@@ -833,7 +835,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
                                | ATC_DIF(AT_DMA_PER_IF);
                break;
 
-       case DMA_FROM_DEVICE:
+       case DMA_DEV_TO_MEM:
                desc->lli.saddr = atslave->rx_reg;
                desc->lli.daddr = buf_addr + (period_len * period_index);
                desc->lli.ctrla = ctrla;
@@ -861,7 +863,7 @@ atc_dma_cyclic_fill_desc(struct at_dma_slave *atslave, struct at_desc *desc,
  */
 static struct dma_async_tx_descriptor *
 atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        struct at_dma_chan      *atchan = to_at_dma_chan(chan);
        struct at_dma_slave     *atslave = chan->private;
@@ -872,7 +874,7 @@ atc_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
        unsigned int            i;
 
        dev_vdbg(chan2dev(chan), "prep_dma_cyclic: %s buf@0x%08x - %d (%d/%d)\n",
-                       direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE",
+                       direction == DMA_MEM_TO_DEV ? "TO DEVICE" : "FROM DEVICE",
                        buf_addr,
                        periods, buf_len, period_len);
 
@@ -1175,6 +1177,56 @@ static void atc_free_chan_resources(struct dma_chan *chan)
 
 /*--  Module Management  -----------------------------------------------*/
 
+/* cap_mask is a multi-u32 bitfield, fill it with proper C code. */
+static struct at_dma_platform_data at91sam9rl_config = {
+       .nr_channels = 2,
+};
+static struct at_dma_platform_data at91sam9g45_config = {
+       .nr_channels = 8,
+};
+
+#if defined(CONFIG_OF)
+static const struct of_device_id atmel_dma_dt_ids[] = {
+       {
+               .compatible = "atmel,at91sam9rl-dma",
+               .data = &at91sam9rl_config,
+       }, {
+               .compatible = "atmel,at91sam9g45-dma",
+               .data = &at91sam9g45_config,
+       }, {
+               /* sentinel */
+       }
+};
+
+MODULE_DEVICE_TABLE(of, atmel_dma_dt_ids);
+#endif
+
+static const struct platform_device_id atdma_devtypes[] = {
+       {
+               .name = "at91sam9rl_dma",
+               .driver_data = (unsigned long) &at91sam9rl_config,
+       }, {
+               .name = "at91sam9g45_dma",
+               .driver_data = (unsigned long) &at91sam9g45_config,
+       }, {
+               /* sentinel */
+       }
+};
+
+static inline struct at_dma_platform_data * __init at_dma_get_driver_data(
+                                               struct platform_device *pdev)
+{
+       if (pdev->dev.of_node) {
+               const struct of_device_id *match;
+               match = of_match_node(atmel_dma_dt_ids, pdev->dev.of_node);
+               if (match == NULL)
+                       return NULL;
+               return match->data;
+       }
+       return (struct at_dma_platform_data *)
+                       platform_get_device_id(pdev)->driver_data;
+}
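
at_dma_get_driver_data() above tries an OF compatible match first and falls
back to the platform_device_id table, both of them sentinel-terminated. A
standalone sketch of that sentinel-terminated lookup (the table layout is
illustrative; the compatible strings and channel counts are the ones from
this diff):

    #include <stdio.h>
    #include <string.h>

    struct cfg { int nr_channels; };

    static const struct cfg rl_cfg  = { .nr_channels = 2 };
    static const struct cfg g45_cfg = { .nr_channels = 8 };

    struct match { const char *key; const struct cfg *data; };

    static const struct match dt_ids[] = {
            { "atmel,at91sam9rl-dma",  &rl_cfg  },
            { "atmel,at91sam9g45-dma", &g45_cfg },
            { NULL, NULL }              /* sentinel */
    };

    static const struct cfg *lookup(const struct match *tbl, const char *key)
    {
            for (; tbl->key; tbl++)     /* stop at the sentinel */
                    if (!strcmp(tbl->key, key))
                            return tbl->data;
            return NULL;
    }

    int main(void)
    {
            const struct cfg *c = lookup(dt_ids, "atmel,at91sam9g45-dma");
            printf("channels: %d\n", c ? c->nr_channels : -1);
            return 0;
    }
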
+
 /**
  * at_dma_off - disable DMA controller
  * @atdma: the Atmel HDMAC device
@@ -1193,18 +1245,23 @@ static void at_dma_off(struct at_dma *atdma)
 
 static int __init at_dma_probe(struct platform_device *pdev)
 {
-       struct at_dma_platform_data *pdata;
        struct resource         *io;
        struct at_dma           *atdma;
        size_t                  size;
        int                     irq;
        int                     err;
        int                     i;
+       struct at_dma_platform_data *plat_dat;
 
-       /* get DMA Controller parameters from platform */
-       pdata = pdev->dev.platform_data;
-       if (!pdata || pdata->nr_channels > AT_DMA_MAX_NR_CHANNELS)
-               return -EINVAL;
+       /* setup platform data for each SoC */
+       dma_cap_set(DMA_MEMCPY, at91sam9rl_config.cap_mask);
+       dma_cap_set(DMA_MEMCPY, at91sam9g45_config.cap_mask);
+       dma_cap_set(DMA_SLAVE, at91sam9g45_config.cap_mask);
+
+       /* get DMA parameters from controller type */
+       plat_dat = at_dma_get_driver_data(pdev);
+       if (!plat_dat)
+               return -ENODEV;
 
        io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (!io)
@@ -1215,14 +1272,14 @@ static int __init at_dma_probe(struct platform_device *pdev)
                return irq;
 
        size = sizeof(struct at_dma);
-       size += pdata->nr_channels * sizeof(struct at_dma_chan);
+       size += plat_dat->nr_channels * sizeof(struct at_dma_chan);
        atdma = kzalloc(size, GFP_KERNEL);
        if (!atdma)
                return -ENOMEM;
 
-       /* discover transaction capabilites from the platform data */
-       atdma->dma_common.cap_mask = pdata->cap_mask;
-       atdma->all_chan_mask = (1 << pdata->nr_channels) - 1;
+       /* discover transaction capabilities */
+       atdma->dma_common.cap_mask = plat_dat->cap_mask;
+       atdma->all_chan_mask = (1 << plat_dat->nr_channels) - 1;
 
        size = resource_size(io);
        if (!request_mem_region(io->start, size, pdev->dev.driver->name)) {
@@ -1268,7 +1325,7 @@ static int __init at_dma_probe(struct platform_device *pdev)
 
        /* initialize channels related values */
        INIT_LIST_HEAD(&atdma->dma_common.channels);
-       for (i = 0; i < pdata->nr_channels; i++) {
+       for (i = 0; i < plat_dat->nr_channels; i++) {
                struct at_dma_chan      *atchan = &atdma->chan[i];
 
                atchan->chan_common.device = &atdma->dma_common;
@@ -1313,7 +1370,7 @@ static int __init at_dma_probe(struct platform_device *pdev)
        dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n",
          dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "",
          dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask)  ? "slave " : "",
-         pdata->nr_channels);
+         plat_dat->nr_channels);
 
        dma_async_device_register(&atdma->dma_common);
 
@@ -1495,9 +1552,11 @@ static const struct dev_pm_ops at_dma_dev_pm_ops = {
 static struct platform_driver at_dma_driver = {
        .remove         = __exit_p(at_dma_remove),
        .shutdown       = at_dma_shutdown,
+       .id_table       = atdma_devtypes,
        .driver = {
                .name   = "at_hdmac",
                .pm     = &at_dma_dev_pm_ops,
+               .of_match_table = of_match_ptr(atmel_dma_dt_ids),
        },
 };
 
index aa4c9ae..dcaedfc 100644 (file)
@@ -251,6 +251,7 @@ static inline struct at_dma_chan *to_at_dma_chan(struct dma_chan *dchan)
 /**
  * struct at_dma - internal representation of an Atmel HDMA Controller
  * @chan_common: common dmaengine dma_device object members
+ * @atdma_devtype: identifier of DMA controller compatibility
  * @ch_regs: memory mapped register base
  * @clk: dma controller clock
  * @save_imr: interrupt mask register that is saved on suspend/resume cycle
index 4234f41..d65a718 100644 (file)
@@ -39,7 +39,7 @@ struct coh901318_desc {
        struct scatterlist *sg;
        unsigned int sg_len;
        struct coh901318_lli *lli;
-       enum dma_data_direction dir;
+       enum dma_transfer_direction dir;
        unsigned long flags;
        u32 head_config;
        u32 head_ctrl;
@@ -1034,7 +1034,7 @@ coh901318_prep_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
 static struct dma_async_tx_descriptor *
 coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-                       unsigned int sg_len, enum dma_data_direction direction,
+                       unsigned int sg_len, enum dma_transfer_direction direction,
                        unsigned long flags)
 {
        struct coh901318_chan *cohc = to_coh901318_chan(chan);
@@ -1077,7 +1077,7 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
        ctrl_last |= cohc->runtime_ctrl;
        ctrl |= cohc->runtime_ctrl;
 
-       if (direction == DMA_TO_DEVICE) {
+       if (direction == DMA_MEM_TO_DEV) {
                u32 tx_flags = COH901318_CX_CTRL_PRDD_SOURCE |
                        COH901318_CX_CTRL_SRC_ADDR_INC_ENABLE;
 
@@ -1085,7 +1085,7 @@ coh901318_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                ctrl_chained |= tx_flags;
                ctrl_last |= tx_flags;
                ctrl |= tx_flags;
-       } else if (direction == DMA_FROM_DEVICE) {
+       } else if (direction == DMA_DEV_TO_MEM) {
                u32 rx_flags = COH901318_CX_CTRL_PRDD_DEST |
                        COH901318_CX_CTRL_DST_ADDR_INC_ENABLE;
 
@@ -1274,11 +1274,11 @@ static void coh901318_dma_set_runtimeconfig(struct dma_chan *chan,
        int i = 0;
 
        /* We only support mem to per or per to mem transfers */
-       if (config->direction == DMA_FROM_DEVICE) {
+       if (config->direction == DMA_DEV_TO_MEM) {
                addr = config->src_addr;
                addr_width = config->src_addr_width;
                maxburst = config->src_maxburst;
-       } else if (config->direction == DMA_TO_DEVICE) {
+       } else if (config->direction == DMA_MEM_TO_DEV) {
                addr = config->dst_addr;
                addr_width = config->dst_addr_width;
                maxburst = config->dst_maxburst;
index 9f7e0e6..6c0e2d4 100644 (file)
@@ -7,11 +7,10 @@
  * Author: Per Friden <per.friden@stericsson.com>
  */
 
-#include <linux/dma-mapping.h>
 #include <linux/spinlock.h>
-#include <linux/dmapool.h>
 #include <linux/memory.h>
 #include <linux/gfp.h>
+#include <linux/dmapool.h>
 #include <mach/coh901318.h>
 
 #include "coh901318_lli.h"
@@ -177,18 +176,18 @@ coh901318_lli_fill_single(struct coh901318_pool *pool,
                          struct coh901318_lli *lli,
                          dma_addr_t buf, unsigned int size,
                          dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_eom,
-                         enum dma_data_direction dir)
+                         enum dma_transfer_direction dir)
 {
        int s = size;
        dma_addr_t src;
        dma_addr_t dst;
 
 
-       if (dir == DMA_TO_DEVICE) {
+       if (dir == DMA_MEM_TO_DEV) {
                src = buf;
                dst = dev_addr;
 
-       } else if (dir == DMA_FROM_DEVICE) {
+       } else if (dir == DMA_DEV_TO_MEM) {
 
                src = dev_addr;
                dst = buf;
@@ -215,9 +214,9 @@ coh901318_lli_fill_single(struct coh901318_pool *pool,
 
                lli = coh901318_lli_next(lli);
 
-               if (dir == DMA_TO_DEVICE)
+               if (dir == DMA_MEM_TO_DEV)
                        src += block_size;
-               else if (dir == DMA_FROM_DEVICE)
+               else if (dir == DMA_DEV_TO_MEM)
                        dst += block_size;
        }
 
@@ -234,7 +233,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool,
                      struct scatterlist *sgl, unsigned int nents,
                      dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl,
                      u32 ctrl_last,
-                     enum dma_data_direction dir, u32 ctrl_irq_mask)
+                     enum dma_transfer_direction dir, u32 ctrl_irq_mask)
 {
        int i;
        struct scatterlist *sg;
@@ -249,9 +248,9 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool,
 
        spin_lock(&pool->lock);
 
-       if (dir == DMA_TO_DEVICE)
+       if (dir == DMA_MEM_TO_DEV)
                dst = dev_addr;
-       else if (dir == DMA_FROM_DEVICE)
+       else if (dir == DMA_DEV_TO_MEM)
                src = dev_addr;
        else
                goto err;
@@ -269,7 +268,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool,
                        ctrl_sg = ctrl ? ctrl : ctrl_last;
 
 
-               if (dir == DMA_TO_DEVICE)
+               if (dir == DMA_MEM_TO_DEV)
                        /* increment source address */
                        src = sg_phys(sg);
                else
@@ -293,7 +292,7 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool,
                        lli->src_addr = src;
                        lli->dst_addr = dst;
 
-                       if (dir == DMA_FROM_DEVICE)
+                       if (dir == DMA_DEV_TO_MEM)
                                dst += elem_size;
                        else
                                src += elem_size;
index 7a5c809..abff371 100644 (file)
@@ -97,7 +97,7 @@ coh901318_lli_fill_single(struct coh901318_pool *pool,
                          struct coh901318_lli *lli,
                          dma_addr_t buf, unsigned int size,
                          dma_addr_t dev_addr, u32 ctrl_chained, u32 ctrl_last,
-                         enum dma_data_direction dir);
+                         enum dma_transfer_direction dir);
 
 /**
  * coh901318_lli_fill_sg() - Prepares the lli:s for dma scatter list transfer
@@ -119,6 +119,6 @@ coh901318_lli_fill_sg(struct coh901318_pool *pool,
                      struct scatterlist *sg, unsigned int nents,
                      dma_addr_t dev_addr, u32 ctrl_chained,
                      u32 ctrl, u32 ctrl_last,
-                     enum dma_data_direction dir, u32 ctrl_irq_mask);
+                     enum dma_transfer_direction dir, u32 ctrl_irq_mask);
 
 #endif /* COH901318_LLI_H */
index b48967b..a6c6051 100644 (file)
@@ -693,12 +693,12 @@ int dma_async_device_register(struct dma_device *device)
                !device->device_prep_dma_interrupt);
        BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) &&
                !device->device_prep_dma_sg);
-       BUG_ON(dma_has_cap(DMA_SLAVE, device->cap_mask) &&
-               !device->device_prep_slave_sg);
        BUG_ON(dma_has_cap(DMA_CYCLIC, device->cap_mask) &&
                !device->device_prep_dma_cyclic);
        BUG_ON(dma_has_cap(DMA_SLAVE, device->cap_mask) &&
                !device->device_control);
+       BUG_ON(dma_has_cap(DMA_INTERLEAVE, device->cap_mask) &&
+               !device->device_prep_interleaved_dma);
 
        BUG_ON(!device->device_alloc_chan_resources);
        BUG_ON(!device->device_free_chan_resources);
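
These registration-time BUG_ON()s pair each advertised capability with its
mandatory hook; the hunk drops a duplicated DMA_SLAVE/prep_slave_sg pairing
and adds one for the new DMA_INTERLEAVE capability. A standalone sketch of
the invariant, with a flat bitmask standing in for the kernel's multi-word
cap_mask:

    #include <stdbool.h>
    #include <stdio.h>

    enum { DMA_SLAVE = 1u << 0, DMA_INTERLEAVE = 1u << 1 };

    struct dma_device_demo {
            unsigned int cap_mask;
            int (*device_control)(void);
            int (*device_prep_interleaved_dma)(void);
    };

    static bool caps_consistent(const struct dma_device_demo *d)
    {
            if ((d->cap_mask & DMA_SLAVE) && !d->device_control)
                    return false;
            if ((d->cap_mask & DMA_INTERLEAVE) &&
                !d->device_prep_interleaved_dma)
                    return false;
            return true;
    }

    int main(void)
    {
            struct dma_device_demo d = { .cap_mask = DMA_INTERLEAVE };
            printf("consistent: %s\n", caps_consistent(&d) ? "yes" : "no");
            return 0;
    }
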
index 9bfd6d3..9b592b0 100644 (file)
@@ -166,6 +166,38 @@ dwc_assign_cookie(struct dw_dma_chan *dwc, struct dw_desc *desc)
        return cookie;
 }
 
+static void dwc_initialize(struct dw_dma_chan *dwc)
+{
+       struct dw_dma *dw = to_dw_dma(dwc->chan.device);
+       struct dw_dma_slave *dws = dwc->chan.private;
+       u32 cfghi = DWC_CFGH_FIFO_MODE;
+       u32 cfglo = DWC_CFGL_CH_PRIOR(dwc->priority);
+
+       if (dwc->initialized == true)
+               return;
+
+       if (dws) {
+               /*
+                * We need controller-specific data to set up slave
+                * transfers.
+                */
+               BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev);
+
+               cfghi = dws->cfg_hi;
+               cfglo |= dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK;
+       }
+
+       channel_writel(dwc, CFG_LO, cfglo);
+       channel_writel(dwc, CFG_HI, cfghi);
+
+       /* Enable interrupts */
+       channel_set_bit(dw, MASK.XFER, dwc->mask);
+       channel_set_bit(dw, MASK.BLOCK, dwc->mask);
+       channel_set_bit(dw, MASK.ERROR, dwc->mask);
+
+       dwc->initialized = true;
+}
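
dwc_initialize() moves the CFG programming and interrupt unmasking from
alloc_chan_resources() to first channel start, guarded by the new bool so
repeated starts do not rewrite the registers. A standalone sketch of that
one-shot, flag-guarded setup:

    #include <stdbool.h>
    #include <stdio.h>

    struct chan_demo {
            bool initialized;
            int  cfg_writes;   /* counts simulated register writes */
    };

    static void chan_initialize(struct chan_demo *c)
    {
            if (c->initialized)
                    return;      /* already programmed: nothing to do */
            c->cfg_writes++;     /* stands in for channel_writel(CFG_*) */
            c->initialized = true;
    }

    int main(void)
    {
            struct chan_demo c = { 0 };
            chan_initialize(&c);
            chan_initialize(&c);   /* second call is a no-op */
            printf("cfg writes: %d\n", c.cfg_writes);   /* prints 1 */
            return 0;
    }
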
+
 /*----------------------------------------------------------------------*/
 
 /* Called with dwc->lock held and bh disabled */
@@ -189,6 +221,8 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first)
                return;
        }
 
+       dwc_initialize(dwc);
+
        channel_writel(dwc, LLP, first->txd.phys);
        channel_writel(dwc, CTL_LO,
                        DWC_CTLL_LLP_D_EN | DWC_CTLL_LLP_S_EN);
@@ -696,7 +730,7 @@ err_desc_get:
 
 static struct dma_async_tx_descriptor *
 dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
@@ -720,7 +754,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
        prev = first = NULL;
 
        switch (direction) {
-       case DMA_TO_DEVICE:
+       case DMA_MEM_TO_DEV:
                ctllo = (DWC_DEFAULT_CTLLO(chan->private)
                                | DWC_CTLL_DST_WIDTH(reg_width)
                                | DWC_CTLL_DST_FIX
@@ -777,7 +811,7 @@ slave_sg_todev_fill_desc:
                                goto slave_sg_todev_fill_desc;
                }
                break;
-       case DMA_FROM_DEVICE:
+       case DMA_DEV_TO_MEM:
                ctllo = (DWC_DEFAULT_CTLLO(chan->private)
                                | DWC_CTLL_SRC_WIDTH(reg_width)
                                | DWC_CTLL_DST_INC
@@ -959,10 +993,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
        struct dw_dma_chan      *dwc = to_dw_dma_chan(chan);
        struct dw_dma           *dw = to_dw_dma(chan->device);
        struct dw_desc          *desc;
-       struct dw_dma_slave     *dws;
        int                     i;
-       u32                     cfghi;
-       u32                     cfglo;
        unsigned long           flags;
 
        dev_vdbg(chan2dev(chan), "alloc_chan_resources\n");
@@ -975,26 +1006,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
 
        dwc->completed = chan->cookie = 1;
 
-       cfghi = DWC_CFGH_FIFO_MODE;
-       cfglo = 0;
-
-       dws = chan->private;
-       if (dws) {
-               /*
-                * We need controller-specific data to set up slave
-                * transfers.
-                */
-               BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev);
-
-               cfghi = dws->cfg_hi;
-               cfglo = dws->cfg_lo & ~DWC_CFGL_CH_PRIOR_MASK;
-       }
-
-       cfglo |= DWC_CFGL_CH_PRIOR(dwc->priority);
-
-       channel_writel(dwc, CFG_LO, cfglo);
-       channel_writel(dwc, CFG_HI, cfghi);
-
        /*
         * NOTE: some controllers may have additional features that we
         * need to initialize here, like "scatter-gather" (which
@@ -1026,11 +1037,6 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan)
                i = ++dwc->descs_allocated;
        }
 
-       /* Enable interrupts */
-       channel_set_bit(dw, MASK.XFER, dwc->mask);
-       channel_set_bit(dw, MASK.BLOCK, dwc->mask);
-       channel_set_bit(dw, MASK.ERROR, dwc->mask);
-
        spin_unlock_irqrestore(&dwc->lock, flags);
 
        dev_dbg(chan2dev(chan),
@@ -1058,6 +1064,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan)
        spin_lock_irqsave(&dwc->lock, flags);
        list_splice_init(&dwc->free_list, &list);
        dwc->descs_allocated = 0;
+       dwc->initialized = false;
 
        /* Disable interrupts */
        channel_clear_bit(dw, MASK.XFER, dwc->mask);
@@ -1165,7 +1172,7 @@ EXPORT_SYMBOL(dw_dma_cyclic_stop);
  */
 struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                dma_addr_t buf_addr, size_t buf_len, size_t period_len,
-               enum dma_data_direction direction)
+               enum dma_transfer_direction direction)
 {
        struct dw_dma_chan              *dwc = to_dw_dma_chan(chan);
        struct dw_cyclic_desc           *cdesc;
@@ -1206,7 +1213,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                goto out_err;
        if (unlikely(buf_addr & ((1 << reg_width) - 1)))
                goto out_err;
-       if (unlikely(!(direction & (DMA_TO_DEVICE | DMA_FROM_DEVICE))))
+       if (unlikely(!(direction & (DMA_MEM_TO_DEV | DMA_DEV_TO_MEM))))
                goto out_err;
 
        retval = ERR_PTR(-ENOMEM);
@@ -1228,7 +1235,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                        goto out_err_desc_get;
 
                switch (direction) {
-               case DMA_TO_DEVICE:
+               case DMA_MEM_TO_DEV:
                        desc->lli.dar = dws->tx_reg;
                        desc->lli.sar = buf_addr + (period_len * i);
                        desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan->private)
@@ -1239,7 +1246,7 @@ struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                                        | DWC_CTLL_FC(dws->fc)
                                        | DWC_CTLL_INT_EN);
                        break;
-               case DMA_FROM_DEVICE:
+               case DMA_DEV_TO_MEM:
                        desc->lli.dar = buf_addr + (period_len * i);
                        desc->lli.sar = dws->rx_reg;
                        desc->lli.ctllo = (DWC_DEFAULT_CTLLO(chan->private)
@@ -1335,6 +1342,8 @@ EXPORT_SYMBOL(dw_dma_cyclic_free);
 
 static void dw_dma_off(struct dw_dma *dw)
 {
+       int i;
+
        dma_writel(dw, CFG, 0);
 
        channel_clear_bit(dw, MASK.XFER, dw->all_chan_mask);
@@ -1345,6 +1354,9 @@ static void dw_dma_off(struct dw_dma *dw)
 
        while (dma_readl(dw, CFG) & DW_CFG_DMA_EN)
                cpu_relax();
+
+       for (i = 0; i < dw->dma.chancnt; i++)
+               dw->chan[i].initialized = false;
 }
 
 static int __init dw_probe(struct platform_device *pdev)
@@ -1533,6 +1545,7 @@ static int dw_suspend_noirq(struct device *dev)
 
        dw_dma_off(platform_get_drvdata(pdev));
        clk_disable(dw->clk);
+
        return 0;
 }
 
index c341951..5eef694 100644 (file)
@@ -140,6 +140,7 @@ struct dw_dma_chan {
        u8                      mask;
        u8                      priority;
        bool                    paused;
+       bool                    initialized;
 
        spinlock_t              lock;
 
index b47e2b8..59e7a96 100644 (file)
@@ -246,6 +246,9 @@ static void ep93xx_dma_set_active(struct ep93xx_dma_chan *edmac,
 static struct ep93xx_dma_desc *
 ep93xx_dma_get_active(struct ep93xx_dma_chan *edmac)
 {
+       if (list_empty(&edmac->active))
+               return NULL;
+
        return list_first_entry(&edmac->active, struct ep93xx_dma_desc, node);
 }
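
ep93xx_dma_get_active() now returns NULL instead of letting
list_first_entry() fabricate an entry from an empty list, and the callers
patched below all grow matching NULL checks. A standalone sketch with a bare
pointer standing in for the list head:

    #include <stdio.h>
    #include <stddef.h>

    struct desc { int cookie; };
    struct chan_demo { struct desc *active; };   /* NULL == empty list */

    static struct desc *get_active(struct chan_demo *c)
    {
            if (!c->active)      /* mirrors the list_empty() check */
                    return NULL;
            return c->active;    /* mirrors list_first_entry() */
    }

    int main(void)
    {
            struct chan_demo c = { NULL };

            if (!get_active(&c))
                    printf("empty: caller bails out instead of crashing\n");
            return 0;
    }
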
 
@@ -263,16 +266,22 @@ ep93xx_dma_get_active(struct ep93xx_dma_chan *edmac)
  */
 static bool ep93xx_dma_advance_active(struct ep93xx_dma_chan *edmac)
 {
+       struct ep93xx_dma_desc *desc;
+
        list_rotate_left(&edmac->active);
 
        if (test_bit(EP93XX_DMA_IS_CYCLIC, &edmac->flags))
                return true;
 
+       desc = ep93xx_dma_get_active(edmac);
+       if (!desc)
+               return false;
+
        /*
         * If txd.cookie is set it means that we are back in the first
         * descriptor in the chain and hence done with it.
         */
-       return !ep93xx_dma_get_active(edmac)->txd.cookie;
+       return !desc->txd.cookie;
 }
 
 /*
@@ -327,10 +336,16 @@ static void m2p_hw_shutdown(struct ep93xx_dma_chan *edmac)
 
 static void m2p_fill_desc(struct ep93xx_dma_chan *edmac)
 {
-       struct ep93xx_dma_desc *desc = ep93xx_dma_get_active(edmac);
+       struct ep93xx_dma_desc *desc;
        u32 bus_addr;
 
-       if (ep93xx_dma_chan_direction(&edmac->chan) == DMA_TO_DEVICE)
+       desc = ep93xx_dma_get_active(edmac);
+       if (!desc) {
+               dev_warn(chan2dev(edmac), "M2P: empty descriptor list\n");
+               return;
+       }
+
+       if (ep93xx_dma_chan_direction(&edmac->chan) == DMA_MEM_TO_DEV)
                bus_addr = desc->src_addr;
        else
                bus_addr = desc->dst_addr;
@@ -443,7 +458,7 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac)
                control = (5 << M2M_CONTROL_PWSC_SHIFT);
                control |= M2M_CONTROL_NO_HDSK;
 
-               if (data->direction == DMA_TO_DEVICE) {
+               if (data->direction == DMA_MEM_TO_DEV) {
                        control |= M2M_CONTROL_DAH;
                        control |= M2M_CONTROL_TM_TX;
                        control |= M2M_CONTROL_RSS_SSPTX;
@@ -459,11 +474,7 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac)
                 * This IDE part is totally untested. Values below are taken
         * from the EP93xx User's Guide and might not be correct.
                 */
-               control |= M2M_CONTROL_NO_HDSK;
-               control |= M2M_CONTROL_RSS_IDE;
-               control |= M2M_CONTROL_PW_16;
-
-               if (data->direction == DMA_TO_DEVICE) {
+               if (data->direction == DMA_MEM_TO_DEV) {
                        /* Worst case from the UG */
                        control = (3 << M2M_CONTROL_PWSC_SHIFT);
                        control |= M2M_CONTROL_DAH;
@@ -473,6 +484,10 @@ static int m2m_hw_setup(struct ep93xx_dma_chan *edmac)
                        control |= M2M_CONTROL_SAH;
                        control |= M2M_CONTROL_TM_RX;
                }
+
+               control |= M2M_CONTROL_NO_HDSK;
+               control |= M2M_CONTROL_RSS_IDE;
+               control |= M2M_CONTROL_PW_16;
                break;
 
        default:
@@ -491,7 +506,13 @@ static void m2m_hw_shutdown(struct ep93xx_dma_chan *edmac)
 
 static void m2m_fill_desc(struct ep93xx_dma_chan *edmac)
 {
-       struct ep93xx_dma_desc *desc = ep93xx_dma_get_active(edmac);
+       struct ep93xx_dma_desc *desc;
+
+       desc = ep93xx_dma_get_active(edmac);
+       if (!desc) {
+               dev_warn(chan2dev(edmac), "M2M: empty descriptor list\n");
+               return;
+       }
 
        if (edmac->buffer == 0) {
                writel(desc->src_addr, edmac->regs + M2M_SAR_BASE0);
@@ -669,24 +690,30 @@ static void ep93xx_dma_tasklet(unsigned long data)
 {
        struct ep93xx_dma_chan *edmac = (struct ep93xx_dma_chan *)data;
        struct ep93xx_dma_desc *desc, *d;
-       dma_async_tx_callback callback;
-       void *callback_param;
+       dma_async_tx_callback callback = NULL;
+       void *callback_param = NULL;
        LIST_HEAD(list);
 
        spin_lock_irq(&edmac->lock);
+       /*
+        * If dma_terminate_all() was called before we get to run, the active
+        * list has become empty. If that happens we aren't supposed to do
+        * anything more than call ep93xx_dma_advance_work().
+        */
        desc = ep93xx_dma_get_active(edmac);
-       if (desc->complete) {
-               edmac->last_completed = desc->txd.cookie;
-               list_splice_init(&edmac->active, &list);
+       if (desc) {
+               if (desc->complete) {
+                       edmac->last_completed = desc->txd.cookie;
+                       list_splice_init(&edmac->active, &list);
+               }
+               callback = desc->txd.callback;
+               callback_param = desc->txd.callback_param;
        }
        spin_unlock_irq(&edmac->lock);
 
        /* Pick up the next descriptor from the queue */
        ep93xx_dma_advance_work(edmac);
 
-       callback = desc->txd.callback;
-       callback_param = desc->txd.callback_param;
-
        /* Now we can release all the chained descriptors */
        list_for_each_entry_safe(desc, d, &list, node) {
                /*
@@ -706,13 +733,22 @@ static void ep93xx_dma_tasklet(unsigned long data)
 static irqreturn_t ep93xx_dma_interrupt(int irq, void *dev_id)
 {
        struct ep93xx_dma_chan *edmac = dev_id;
+       struct ep93xx_dma_desc *desc;
        irqreturn_t ret = IRQ_HANDLED;
 
        spin_lock(&edmac->lock);
 
+       desc = ep93xx_dma_get_active(edmac);
+       if (!desc) {
+               dev_warn(chan2dev(edmac),
+                        "got interrupt while active list is empty\n");
+               spin_unlock(&edmac->lock);
+               return IRQ_NONE;
+       }
+
        switch (edmac->edma->hw_interrupt(edmac)) {
        case INTERRUPT_DONE:
-               ep93xx_dma_get_active(edmac)->complete = true;
+               desc->complete = true;
                tasklet_schedule(&edmac->tasklet);
                break;
 
@@ -803,8 +839,8 @@ static int ep93xx_dma_alloc_chan_resources(struct dma_chan *chan)
                        switch (data->port) {
                        case EP93XX_DMA_SSP:
                        case EP93XX_DMA_IDE:
-                               if (data->direction != DMA_TO_DEVICE &&
-                                   data->direction != DMA_FROM_DEVICE)
+                               if (data->direction != DMA_MEM_TO_DEV &&
+                                   data->direction != DMA_DEV_TO_MEM)
                                        return -EINVAL;
                                break;
                        default:
@@ -952,7 +988,7 @@ fail:
  */
 static struct dma_async_tx_descriptor *
 ep93xx_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-                        unsigned int sg_len, enum dma_data_direction dir,
+                        unsigned int sg_len, enum dma_transfer_direction dir,
                         unsigned long flags)
 {
        struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan);
@@ -988,7 +1024,7 @@ ep93xx_dma_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                        goto fail;
                }
 
-               if (dir == DMA_TO_DEVICE) {
+               if (dir == DMA_MEM_TO_DEV) {
                        desc->src_addr = sg_dma_address(sg);
                        desc->dst_addr = edmac->runtime_addr;
                } else {
@@ -1032,7 +1068,7 @@ fail:
 static struct dma_async_tx_descriptor *
 ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
                           size_t buf_len, size_t period_len,
-                          enum dma_data_direction dir)
+                          enum dma_transfer_direction dir)
 {
        struct ep93xx_dma_chan *edmac = to_ep93xx_dma_chan(chan);
        struct ep93xx_dma_desc *desc, *first;
@@ -1065,7 +1101,7 @@ ep93xx_dma_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
                        goto fail;
                }
 
-               if (dir == DMA_TO_DEVICE) {
+               if (dir == DMA_MEM_TO_DEV) {
                        desc->src_addr = dma_addr + offset;
                        desc->dst_addr = edmac->runtime_addr;
                } else {
@@ -1133,12 +1169,12 @@ static int ep93xx_dma_slave_config(struct ep93xx_dma_chan *edmac,
                return -EINVAL;
 
        switch (config->direction) {
-       case DMA_FROM_DEVICE:
+       case DMA_DEV_TO_MEM:
                width = config->src_addr_width;
                addr = config->src_addr;
                break;
 
-       case DMA_TO_DEVICE:
+       case DMA_MEM_TO_DEV:
                width = config->dst_addr_width;
                addr = config->dst_addr;
                break;
index 8a78154..b98070c 100644 (file)
@@ -772,7 +772,7 @@ fail:
  */
 static struct dma_async_tx_descriptor *fsl_dma_prep_slave_sg(
        struct dma_chan *dchan, struct scatterlist *sgl, unsigned int sg_len,
-       enum dma_data_direction direction, unsigned long flags)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        /*
         * This operation is not supported on the Freescale DMA controller
@@ -819,7 +819,7 @@ static int fsl_dma_device_control(struct dma_chan *dchan,
                        return -ENXIO;
 
                /* we set the controller burst size depending on direction */
-               if (config->direction == DMA_TO_DEVICE)
+               if (config->direction == DMA_MEM_TO_DEV)
                        size = config->dst_addr_width * config->dst_maxburst;
                else
                        size = config->src_addr_width * config->src_maxburst;
index 4be55f9..e4383ee 100644 (file)
@@ -107,7 +107,7 @@ static int imxdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
                imx_dma_disable(imxdmac->imxdma_channel);
                return 0;
        case DMA_SLAVE_CONFIG:
-               if (dmaengine_cfg->direction == DMA_FROM_DEVICE) {
+               if (dmaengine_cfg->direction == DMA_DEV_TO_MEM) {
                        imxdmac->per_address = dmaengine_cfg->src_addr;
                        imxdmac->watermark_level = dmaengine_cfg->src_maxburst;
                        imxdmac->word_size = dmaengine_cfg->src_addr_width;
@@ -224,7 +224,7 @@ static void imxdma_free_chan_resources(struct dma_chan *chan)
 
 static struct dma_async_tx_descriptor *imxdma_prep_slave_sg(
                struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
@@ -241,7 +241,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg(
                dma_length += sg->length;
        }
 
-       if (direction == DMA_FROM_DEVICE)
+       if (direction == DMA_DEV_TO_MEM)
                dmamode = DMA_MODE_READ;
        else
                dmamode = DMA_MODE_WRITE;
@@ -271,7 +271,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_slave_sg(
 
 static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        struct imxdma_channel *imxdmac = to_imxdma_chan(chan);
        struct imxdma_engine *imxdma = imxdmac->imxdma;
@@ -317,7 +317,7 @@ static struct dma_async_tx_descriptor *imxdma_prep_dma_cyclic(
        imxdmac->sg_list[periods].page_link =
                ((unsigned long)imxdmac->sg_list | 0x01) & ~0x02;
 
-       if (direction == DMA_FROM_DEVICE)
+       if (direction == DMA_DEV_TO_MEM)
                dmamode = DMA_MODE_READ;
        else
                dmamode = DMA_MODE_WRITE;
index f993955..a8af379 100644 (file)
@@ -247,7 +247,7 @@ struct sdma_engine;
 struct sdma_channel {
        struct sdma_engine              *sdma;
        unsigned int                    channel;
-       enum dma_data_direction         direction;
+       enum dma_transfer_direction             direction;
        enum sdma_peripheral_type       peripheral_type;
        unsigned int                    event_id0;
        unsigned int                    event_id1;
@@ -268,6 +268,8 @@ struct sdma_channel {
        struct dma_async_tx_descriptor  desc;
        dma_cookie_t                    last_completed;
        enum dma_status                 status;
+       unsigned int                    chn_count;
+       unsigned int                    chn_real_count;
 };
 
 #define IMX_DMA_SG_LOOP                (1 << 0)
@@ -503,6 +505,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac)
        struct sdma_buffer_descriptor *bd;
        int i, error = 0;
 
+       sdmac->chn_real_count = 0;
        /*
         * non-loop mode. Iterate over all descriptors, collect
         * errors and call callback function
@@ -512,6 +515,7 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac)
 
                 if (bd->mode.status & (BD_DONE | BD_RROR))
                        error = -EIO;
+                sdmac->chn_real_count += bd->mode.count;
        }
 
        if (error)
@@ -519,9 +523,9 @@ static void mxc_sdma_handle_channel_normal(struct sdma_channel *sdmac)
        else
                sdmac->status = DMA_SUCCESS;
 
+       sdmac->last_completed = sdmac->desc.cookie;
        if (sdmac->desc.callback)
                sdmac->desc.callback(sdmac->desc.callback_param);
-       sdmac->last_completed = sdmac->desc.cookie;
 }
 
 static void mxc_sdma_handle_channel(struct sdma_channel *sdmac)
@@ -650,7 +654,7 @@ static int sdma_load_context(struct sdma_channel *sdmac)
        struct sdma_buffer_descriptor *bd0 = sdma->channel[0].bd;
        int ret;
 
-       if (sdmac->direction == DMA_FROM_DEVICE) {
+       if (sdmac->direction == DMA_DEV_TO_MEM) {
                load_address = sdmac->pc_from_device;
        } else {
                load_address = sdmac->pc_to_device;
@@ -832,17 +836,18 @@ static struct sdma_channel *to_sdma_chan(struct dma_chan *chan)
 
 static dma_cookie_t sdma_tx_submit(struct dma_async_tx_descriptor *tx)
 {
+       unsigned long flags;
        struct sdma_channel *sdmac = to_sdma_chan(tx->chan);
        struct sdma_engine *sdma = sdmac->sdma;
        dma_cookie_t cookie;
 
-       spin_lock_irq(&sdmac->lock);
+       spin_lock_irqsave(&sdmac->lock, flags);
 
        cookie = sdma_assign_cookie(sdmac);
 
        sdma_enable_channel(sdma, sdmac->channel);
 
-       spin_unlock_irq(&sdmac->lock);
+       spin_unlock_irqrestore(&sdmac->lock, flags);
 
        return cookie;
 }
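
sdma_tx_submit() switches from spin_lock_irq() to the save/restore variant so
it stays correct when entered with interrupts already disabled. A toy
userspace model of why restoring the saved state beats blindly re-enabling
(a global bool plays the part of the CPU interrupt flag):

    #include <stdbool.h>
    #include <stdio.h>

    static bool irqs_enabled = true;

    static unsigned long lock_irqsave(void)
    {
            unsigned long was = irqs_enabled;
            irqs_enabled = false;       /* like local_irq_save() */
            return was;
    }

    static void unlock_irqrestore(unsigned long was)
    {
            irqs_enabled = was;         /* restore, never force-enable */
    }

    int main(void)
    {
            unsigned long outer = lock_irqsave();   /* irqs now off */
            unsigned long inner = lock_irqsave();   /* nested section */

            unlock_irqrestore(inner);               /* still off: correct */
            printf("after inner: %s\n", irqs_enabled ? "on" : "off");
            unlock_irqrestore(outer);
            printf("after outer: %s\n", irqs_enabled ? "on" : "off");
            return 0;
    }
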
@@ -911,7 +916,7 @@ static void sdma_free_chan_resources(struct dma_chan *chan)
 
 static struct dma_async_tx_descriptor *sdma_prep_slave_sg(
                struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct sdma_channel *sdmac = to_sdma_chan(chan);
@@ -941,6 +946,7 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg(
                goto err_out;
        }
 
+       sdmac->chn_count = 0;
        for_each_sg(sgl, sg, sg_len, i) {
                struct sdma_buffer_descriptor *bd = &sdmac->bd[i];
                int param;
@@ -957,6 +963,7 @@ static struct dma_async_tx_descriptor *sdma_prep_slave_sg(
                }
 
                bd->mode.count = count;
+               sdmac->chn_count += count;
 
                if (sdmac->word_size > DMA_SLAVE_BUSWIDTH_4_BYTES) {
                        ret =  -EINVAL;
@@ -1008,7 +1015,7 @@ err_out:
 
 static struct dma_async_tx_descriptor *sdma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        struct sdma_channel *sdmac = to_sdma_chan(chan);
        struct sdma_engine *sdma = sdmac->sdma;
@@ -1093,7 +1100,7 @@ static int sdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
                sdma_disable_channel(sdmac);
                return 0;
        case DMA_SLAVE_CONFIG:
-               if (dmaengine_cfg->direction == DMA_FROM_DEVICE) {
+               if (dmaengine_cfg->direction == DMA_DEV_TO_MEM) {
                        sdmac->per_address = dmaengine_cfg->src_addr;
                        sdmac->watermark_level = dmaengine_cfg->src_maxburst;
                        sdmac->word_size = dmaengine_cfg->src_addr_width;
@@ -1102,6 +1109,7 @@ static int sdma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
                        sdmac->watermark_level = dmaengine_cfg->dst_maxburst;
                        sdmac->word_size = dmaengine_cfg->dst_addr_width;
                }
+               sdmac->direction = dmaengine_cfg->direction;
                return sdma_config_channel(sdmac);
        default:
                return -ENOSYS;
@@ -1119,7 +1127,8 @@ static enum dma_status sdma_tx_status(struct dma_chan *chan,
 
        last_used = chan->cookie;
 
-       dma_set_tx_state(txstate, sdmac->last_completed, last_used, 0);
+       dma_set_tx_state(txstate, sdmac->last_completed, last_used,
+                       sdmac->chn_count - sdmac->chn_real_count);
 
        return sdmac->status;
 }
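
The new chn_count/chn_real_count pair lets sdma_tx_status() report a real
residue: bytes queued at prep time minus bytes the hardware's buffer
descriptors reported done. A standalone sketch of the arithmetic:

    #include <stdio.h>

    struct sdma_demo {
            unsigned int chn_count;       /* summed at prep_slave_sg() */
            unsigned int chn_real_count;  /* summed from bd->mode.count */
    };

    static unsigned int residue(const struct sdma_demo *s)
    {
            return s->chn_count - s->chn_real_count;
    }

    int main(void)
    {
            struct sdma_demo s = { .chn_count = 4096, .chn_real_count = 4096 };

            printf("residue: %u bytes\n", residue(&s));  /* 0 when complete */
            return 0;
    }
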
index 19a0c64..74f70aa 100644 (file)
@@ -280,7 +280,8 @@ static void midc_dostart(struct intel_mid_dma_chan *midc,
  * callbacks but must be called with the lock held.
  */
 static void midc_descriptor_complete(struct intel_mid_dma_chan *midc,
-              struct intel_mid_dma_desc *desc)
+               struct intel_mid_dma_desc *desc)
+               __releases(&midc->lock) __acquires(&midc->lock)
 {
        struct dma_async_tx_descriptor  *txd = &desc->txd;
        dma_async_tx_callback callback_txd = NULL;
@@ -311,6 +312,7 @@ static void midc_descriptor_complete(struct intel_mid_dma_chan *midc,
                        pci_pool_free(desc->lli_pool, desc->lli,
                                                desc->lli_phys);
                        pci_pool_destroy(desc->lli_pool);
+                       desc->lli = NULL;
                }
                list_move(&desc->desc_node, &midc->free_list);
                midc->busy = false;
@@ -395,10 +397,10 @@ static int midc_lli_fill_sg(struct intel_mid_dma_chan *midc,
                                                        midc->dma->block_size);
                /*Populate SAR and DAR values*/
                sg_phy_addr = sg_phys(sg);
-               if (desc->dirn ==  DMA_TO_DEVICE) {
+               if (desc->dirn ==  DMA_MEM_TO_DEV) {
                        lli_bloc_desc->sar  = sg_phy_addr;
                        lli_bloc_desc->dar  = mids->dma_slave.dst_addr;
-               } else if (desc->dirn ==  DMA_FROM_DEVICE) {
+               } else if (desc->dirn ==  DMA_DEV_TO_MEM) {
                        lli_bloc_desc->sar  = mids->dma_slave.src_addr;
                        lli_bloc_desc->dar  = sg_phy_addr;
                }
@@ -490,7 +492,9 @@ static enum dma_status intel_mid_dma_tx_status(struct dma_chan *chan,
 
        ret = dma_async_is_complete(cookie, last_complete, last_used);
        if (ret != DMA_SUCCESS) {
+               spin_lock_bh(&midc->lock);
                midc_scan_descriptors(to_middma_device(chan->device), midc);
+               spin_unlock_bh(&midc->lock);
 
                last_complete = midc->completed;
                last_used = chan->cookie;
@@ -566,6 +570,7 @@ static int intel_mid_dma_device_control(struct dma_chan *chan,
                        pci_pool_free(desc->lli_pool, desc->lli,
                                                desc->lli_phys);
                        pci_pool_destroy(desc->lli_pool);
+                       desc->lli = NULL;
                }
                list_move(&desc->desc_node, &midc->free_list);
        }
@@ -632,13 +637,13 @@ static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy(
                if (midc->dma->pimr_mask) {
                        cfg_hi.cfgx.protctl = 0x0; /*default value*/
                        cfg_hi.cfgx.fifo_mode = 1;
-                       if (mids->dma_slave.direction == DMA_TO_DEVICE) {
+                       if (mids->dma_slave.direction == DMA_MEM_TO_DEV) {
                                cfg_hi.cfgx.src_per = 0;
                                if (mids->device_instance == 0)
                                        cfg_hi.cfgx.dst_per = 3;
                                if (mids->device_instance == 1)
                                        cfg_hi.cfgx.dst_per = 1;
-                       } else if (mids->dma_slave.direction == DMA_FROM_DEVICE) {
+                       } else if (mids->dma_slave.direction == DMA_DEV_TO_MEM) {
                                if (mids->device_instance == 0)
                                        cfg_hi.cfgx.src_per = 2;
                                if (mids->device_instance == 1)
@@ -682,11 +687,11 @@ static struct dma_async_tx_descriptor *intel_mid_dma_prep_memcpy(
                ctl_lo.ctlx.sinc = 0;
                ctl_lo.ctlx.dinc = 0;
        } else {
-               if (mids->dma_slave.direction == DMA_TO_DEVICE) {
+               if (mids->dma_slave.direction == DMA_MEM_TO_DEV) {
                        ctl_lo.ctlx.sinc = 0;
                        ctl_lo.ctlx.dinc = 2;
                        ctl_lo.ctlx.tt_fc = 1;
-               } else if (mids->dma_slave.direction == DMA_FROM_DEVICE) {
+               } else if (mids->dma_slave.direction == DMA_DEV_TO_MEM) {
                        ctl_lo.ctlx.sinc = 2;
                        ctl_lo.ctlx.dinc = 0;
                        ctl_lo.ctlx.tt_fc = 2;
@@ -732,7 +737,7 @@ err_desc_get:
  */
 static struct dma_async_tx_descriptor *intel_mid_dma_prep_slave_sg(
                        struct dma_chan *chan, struct scatterlist *sgl,
-                       unsigned int sg_len, enum dma_data_direction direction,
+                       unsigned int sg_len, enum dma_transfer_direction direction,
                        unsigned long flags)
 {
        struct intel_mid_dma_chan *midc = NULL;
@@ -868,7 +873,7 @@ static int intel_mid_dma_alloc_chan_resources(struct dma_chan *chan)
        pm_runtime_get_sync(&mid->pdev->dev);
 
        if (mid->state == SUSPENDED) {
-               if (dma_resume(mid->pdev)) {
+               if (dma_resume(&mid->pdev->dev)) {
                        pr_err("ERR_MDMA: resume failed");
                        return -EFAULT;
                }
@@ -1099,7 +1104,8 @@ static int mid_setup_dma(struct pci_dev *pdev)
                                        LNW_PERIPHRAL_MASK_SIZE);
                if (dma->mask_reg == NULL) {
                        pr_err("ERR_MDMA:Can't map periphral intr space !!\n");
-                       return -ENOMEM;
+                       err = -ENOMEM;
+                       goto err_ioremap;
                }
        } else
                dma->mask_reg = NULL;
@@ -1196,6 +1202,9 @@ static int mid_setup_dma(struct pci_dev *pdev)
 err_engine:
        free_irq(pdev->irq, dma);
 err_irq:
+       if (dma->mask_reg)
+               iounmap(dma->mask_reg);
+err_ioremap:
        pci_pool_destroy(dma->dma_pool);
 err_dma_pool:
        pr_err("ERR_MDMA:setup_dma failed: %d\n", err);
@@ -1337,8 +1346,9 @@ static void __devexit intel_mid_dma_remove(struct pci_dev *pdev)
 *
 * This function is called by OS when a power event occurs
 */
-int dma_suspend(struct pci_dev *pci, pm_message_t state)
+static int dma_suspend(struct device *dev)
 {
+       struct pci_dev *pci = to_pci_dev(dev);
        int i;
        struct middma_device *device = pci_get_drvdata(pci);
        pr_debug("MDMA: dma_suspend called\n");
@@ -1362,8 +1372,9 @@ int dma_suspend(struct pci_dev *pci, pm_message_t state)
 *
 * This function is called by OS when a power event occurs
 */
-int dma_resume(struct pci_dev *pci)
+int dma_resume(struct device *dev)
 {
+       struct pci_dev *pci = to_pci_dev(dev);
        int ret;
        struct middma_device *device = pci_get_drvdata(pci);
 
@@ -1429,6 +1440,8 @@ static const struct dev_pm_ops intel_mid_dma_pm = {
        .runtime_suspend = dma_runtime_suspend,
        .runtime_resume = dma_runtime_resume,
        .runtime_idle = dma_runtime_idle,
+       .suspend = dma_suspend,
+       .resume = dma_resume,
 };
 
 static struct pci_driver intel_mid_dma_pci_driver = {
@@ -1437,8 +1450,6 @@ static struct pci_driver intel_mid_dma_pci_driver = {
        .probe          =       intel_mid_dma_probe,
        .remove         =       __devexit_p(intel_mid_dma_remove),
 #ifdef CONFIG_PM
-       .suspend = dma_suspend,
-       .resume = dma_resume,
        .driver = {
                .pm = &intel_mid_dma_pm,
        },
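
Note: the DMA_TO_DEVICE/DMA_FROM_DEVICE replacements above, repeated across the
other drivers in this series, come from dmaengine's move off the dma-mapping
enum dma_data_direction onto a dedicated enum dma_transfer_direction. A minimal
sketch of the mapping, assuming the 3.3-era <linux/dmaengine.h> layout:

	enum dma_transfer_direction {
		DMA_MEM_TO_MEM,		/* memcpy; replaces DMA_BIDIRECTIONAL */
		DMA_MEM_TO_DEV,		/* slave tx; replaces DMA_TO_DEVICE */
		DMA_DEV_TO_MEM,		/* slave rx; replaces DMA_FROM_DEVICE */
		DMA_DEV_TO_DEV,
		DMA_TRANS_NONE,		/* replaces DMA_NONE */
	};

Slave prep callbacks such as device_prep_slave_sg() and device_prep_dma_cyclic()
now take this type, so a transfer direction can no longer be confused with the
dma-mapping API's data direction.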
index aea5ee8..c83d35b 100644 (file)
@@ -262,7 +262,7 @@ struct intel_mid_dma_desc {
        unsigned int                    lli_length;
        unsigned int                    current_lli;
        dma_addr_t                      next;
-       enum dma_data_direction         dirn;
+       enum dma_transfer_direction             dirn;
        enum dma_status                 status;
        enum dma_slave_buswidth         width; /*width of DMA txn*/
        enum intel_mid_dma_mode         cfg_mode; /*mode configuration*/
@@ -296,6 +296,6 @@ static inline struct intel_mid_dma_slave *to_intel_mid_dma_slave
 }
 
 
-int dma_resume(struct pci_dev *pci);
+int dma_resume(struct device *dev);
 
 #endif /*__INTEL_MID_DMAC_REGS_H__*/
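
Note: the dma_suspend()/dma_resume() rework above is the standard conversion
from legacy struct pci_driver suspend/resume hooks, which take a pci_dev and a
pm_message_t, to dev_pm_ops callbacks, which take a struct device and recover
the pci_dev themselves. A minimal sketch of the pattern (the foo_* names and
the pci_save_state()/pci_restore_state() calls are illustrative, not taken
from this driver):

	static int foo_suspend(struct device *dev)
	{
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_save_state(pdev);		/* quiesce, save config space */
		return 0;
	}

	static int foo_resume(struct device *dev)
	{
		struct pci_dev *pdev = to_pci_dev(dev);

		pci_restore_state(pdev);	/* restore config space */
		return 0;
	}

	static const struct dev_pm_ops foo_pm = {
		.suspend = foo_suspend,
		.resume  = foo_resume,
	};

	static struct pci_driver foo_driver = {
		/* ... */
		.driver = { .pm = &foo_pm },
	};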
index e03f811..04be90b 100644 (file)
@@ -1735,8 +1735,6 @@ static void iop_chan_start_null_xor(struct iop_adma_chan *iop_chan)
        spin_unlock_bh(&iop_chan->lock);
 }
 
-MODULE_ALIAS("platform:iop-adma");
-
 static struct platform_driver iop_adma_driver = {
        .probe          = iop_adma_probe,
        .remove         = __devexit_p(iop_adma_remove),
@@ -1746,19 +1744,9 @@ static struct platform_driver iop_adma_driver = {
        },
 };
 
-static int __init iop_adma_init (void)
-{
-       return platform_driver_register(&iop_adma_driver);
-}
-
-static void __exit iop_adma_exit (void)
-{
-       platform_driver_unregister(&iop_adma_driver);
-       return;
-}
-module_exit(iop_adma_exit);
-module_init(iop_adma_init);
+module_platform_driver(iop_adma_driver);
 
 MODULE_AUTHOR("Intel Corporation");
 MODULE_DESCRIPTION("IOP ADMA Engine Driver");
 MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:iop-adma");
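
Note: module_platform_driver() removes the init/exit boilerplate for drivers
that only register and unregister themselves; it expands to roughly:

	static int __init iop_adma_driver_init(void)
	{
		return platform_driver_register(&iop_adma_driver);
	}
	module_init(iop_adma_driver_init);

	static void __exit iop_adma_driver_exit(void)
	{
		platform_driver_unregister(&iop_adma_driver);
	}
	module_exit(iop_adma_driver_exit);

The same conversion is applied to the mpc_dma driver below.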
index 0e5ef33..6212b16 100644 (file)
@@ -312,7 +312,7 @@ static void ipu_ch_param_set_size(union chan_param_mem *params,
        case IPU_PIX_FMT_RGB565:
                params->ip.bpp  = 2;
                params->ip.pfs  = 4;
-               params->ip.npb  = 7;
+               params->ip.npb  = 15;
                params->ip.sat  = 2;            /* SAT = 32-bit access */
                params->ip.ofs0 = 0;            /* Red bit offset */
                params->ip.ofs1 = 5;            /* Green bit offset */
@@ -422,12 +422,6 @@ static void ipu_ch_param_set_size(union chan_param_mem *params,
        params->pp.nsb = 1;
 }
 
-static void ipu_ch_param_set_burst_size(union chan_param_mem *params,
-                                       uint16_t burst_pixels)
-{
-       params->pp.npb = burst_pixels - 1;
-}
-
 static void ipu_ch_param_set_buffer(union chan_param_mem *params,
                                    dma_addr_t buf0, dma_addr_t buf1)
 {
@@ -690,23 +684,6 @@ static int ipu_init_channel_buffer(struct idmac_channel *ichan,
        ipu_ch_param_set_size(&params, pixel_fmt, width, height, stride_bytes);
        ipu_ch_param_set_buffer(&params, phyaddr_0, phyaddr_1);
        ipu_ch_param_set_rotation(&params, rot_mode);
-       /* Some channels (rotation) have restriction on burst length */
-       switch (channel) {
-       case IDMAC_IC_7:        /* Hangs with burst 8, 16, other values
-                                  invalid - Table 44-30 */
-/*
-               ipu_ch_param_set_burst_size(&params, 8);
- */
-               break;
-       case IDMAC_SDC_0:
-       case IDMAC_SDC_1:
-               /* In original code only IPU_PIX_FMT_RGB565 was setting burst */
-               ipu_ch_param_set_burst_size(&params, 16);
-               break;
-       case IDMAC_IC_0:
-       default:
-               break;
-       }
 
        spin_lock_irqsave(&ipu->lock, flags);
 
@@ -1364,7 +1341,7 @@ static void ipu_gc_tasklet(unsigned long arg)
 /* Allocate and initialise a transfer descriptor. */
 static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan,
                struct scatterlist *sgl, unsigned int sg_len,
-               enum dma_data_direction direction, unsigned long tx_flags)
+               enum dma_transfer_direction direction, unsigned long tx_flags)
 {
        struct idmac_channel *ichan = to_idmac_chan(chan);
        struct idmac_tx_desc *desc = NULL;
@@ -1376,7 +1353,7 @@ static struct dma_async_tx_descriptor *idmac_prep_slave_sg(struct dma_chan *chan
            chan->chan_id != IDMAC_IC_7)
                return NULL;
 
-       if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE) {
+       if (direction != DMA_DEV_TO_MEM && direction != DMA_MEM_TO_DEV) {
                dev_err(chan->device->dev, "Invalid DMA direction %d!\n", direction);
                return NULL;
        }
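
Note: the npb change above is a burst-length fix rather than a cleanup. The
removed ipu_ch_param_set_burst_size() helper shows that the field encodes the
pixel burst minus one:

	params->ip.npb = 16 - 1;	/* 16-pixel bursts; the old 7 meant 8 */

so setting 15 directly for RGB565 gives the same 16-pixel burst that the SDC
channels previously forced through the removed per-channel switch, which is
why those overrides can go away.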
index 8ba4edc..4d6d4cf 100644 (file)
@@ -835,17 +835,7 @@ static struct platform_driver mpc_dma_driver = {
        },
 };
 
-static int __init mpc_dma_init(void)
-{
-       return platform_driver_register(&mpc_dma_driver);
-}
-module_init(mpc_dma_init);
-
-static void __exit mpc_dma_exit(void)
-{
-       platform_driver_unregister(&mpc_dma_driver);
-}
-module_exit(mpc_dma_exit);
+module_platform_driver(mpc_dma_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Piotr Ziecik <kosmo@semihalf.com>");
index fc903c0..b06cd4c 100644 (file)
@@ -44,7 +44,6 @@
 #define HW_APBHX_CTRL0                         0x000
 #define BM_APBH_CTRL0_APB_BURST8_EN            (1 << 29)
 #define BM_APBH_CTRL0_APB_BURST_EN             (1 << 28)
-#define BP_APBH_CTRL0_CLKGATE_CHANNEL          8
 #define BP_APBH_CTRL0_RESET_CHANNEL            16
 #define HW_APBHX_CTRL1                         0x010
 #define HW_APBHX_CTRL2                         0x020
@@ -111,6 +110,7 @@ struct mxs_dma_chan {
        int                             chan_irq;
        struct mxs_dma_ccw              *ccw;
        dma_addr_t                      ccw_phys;
+       int                             desc_count;
        dma_cookie_t                    last_completed;
        enum dma_status                 status;
        unsigned int                    flags;
@@ -130,23 +130,6 @@ struct mxs_dma_engine {
        struct mxs_dma_chan             mxs_chans[MXS_DMA_CHANNELS];
 };
 
-static inline void mxs_dma_clkgate(struct mxs_dma_chan *mxs_chan, int enable)
-{
-       struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
-       int chan_id = mxs_chan->chan.chan_id;
-       int set_clr = enable ? MXS_CLR_ADDR : MXS_SET_ADDR;
-
-       /* enable apbh channel clock */
-       if (dma_is_apbh()) {
-               if (apbh_is_old())
-                       writel(1 << (chan_id + BP_APBH_CTRL0_CLKGATE_CHANNEL),
-                               mxs_dma->base + HW_APBHX_CTRL0 + set_clr);
-               else
-                       writel(1 << chan_id,
-                               mxs_dma->base + HW_APBHX_CTRL0 + set_clr);
-       }
-}
-
 static void mxs_dma_reset_chan(struct mxs_dma_chan *mxs_chan)
 {
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
@@ -165,9 +148,6 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan)
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
        int chan_id = mxs_chan->chan.chan_id;
 
-       /* clkgate needs to be enabled before writing other registers */
-       mxs_dma_clkgate(mxs_chan, 1);
-
        /* set cmd_addr up */
        writel(mxs_chan->ccw_phys,
                mxs_dma->base + HW_APBHX_CHn_NXTCMDAR(chan_id));
@@ -178,9 +158,6 @@ static void mxs_dma_enable_chan(struct mxs_dma_chan *mxs_chan)
 
 static void mxs_dma_disable_chan(struct mxs_dma_chan *mxs_chan)
 {
-       /* disable apbh channel clock */
-       mxs_dma_clkgate(mxs_chan, 0);
-
        mxs_chan->status = DMA_SUCCESS;
 }
 
@@ -268,7 +245,7 @@ static irqreturn_t mxs_dma_int_handler(int irq, void *dev_id)
        /*
         * When both completion and error of termination bits set at the
         * same time, we do not take it as an error.  IOW, it only becomes
-        * an error we need to handler here in case of ether it's (1) an bus
+        * an error we need to handle here in case of either it's (1) a bus
         * error or (2) a termination error with no completion.
         */
        stat2 = ((stat2 >> MXS_DMA_CHANNELS) & stat2) | /* (1) */
@@ -338,10 +315,7 @@ static int mxs_dma_alloc_chan_resources(struct dma_chan *chan)
        if (ret)
                goto err_clk;
 
-       /* clkgate needs to be enabled for reset to finish */
-       mxs_dma_clkgate(mxs_chan, 1);
        mxs_dma_reset_chan(mxs_chan);
-       mxs_dma_clkgate(mxs_chan, 0);
 
        dma_async_tx_descriptor_init(&mxs_chan->desc, chan);
        mxs_chan->desc.tx_submit = mxs_dma_tx_submit;
@@ -377,7 +351,7 @@ static void mxs_dma_free_chan_resources(struct dma_chan *chan)
 
 static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
                struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long append)
 {
        struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan);
@@ -386,7 +360,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
        struct scatterlist *sg;
        int i, j;
        u32 *pio;
-       static int idx;
+       int idx = append ? mxs_chan->desc_count : 0;
 
        if (mxs_chan->status == DMA_IN_PROGRESS && !append)
                return NULL;
@@ -417,7 +391,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
                idx = 0;
        }
 
-       if (direction == DMA_NONE) {
+       if (direction == DMA_TRANS_NONE) {
                ccw = &mxs_chan->ccw[idx++];
                pio = (u32 *) sgl;
 
@@ -450,7 +424,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
                        ccw->bits |= CCW_CHAIN;
                        ccw->bits |= CCW_HALT_ON_TERM;
                        ccw->bits |= CCW_TERM_FLUSH;
-                       ccw->bits |= BF_CCW(direction == DMA_FROM_DEVICE ?
+                       ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ?
                                        MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ,
                                        COMMAND);
 
@@ -462,6 +436,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_slave_sg(
                        }
                }
        }
+       mxs_chan->desc_count = idx;
 
        return &mxs_chan->desc;
 
@@ -472,7 +447,7 @@ err_out:
 
 static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t buf_len,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        struct mxs_dma_chan *mxs_chan = to_mxs_dma_chan(chan);
        struct mxs_dma_engine *mxs_dma = mxs_chan->mxs_dma;
@@ -515,7 +490,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
                ccw->bits |= CCW_IRQ;
                ccw->bits |= CCW_HALT_ON_TERM;
                ccw->bits |= CCW_TERM_FLUSH;
-               ccw->bits |= BF_CCW(direction == DMA_FROM_DEVICE ?
+               ccw->bits |= BF_CCW(direction == DMA_DEV_TO_MEM ?
                                MXS_DMA_CMD_WRITE : MXS_DMA_CMD_READ, COMMAND);
 
                dma_addr += period_len;
@@ -523,6 +498,7 @@ static struct dma_async_tx_descriptor *mxs_dma_prep_dma_cyclic(
 
                i++;
        }
+       mxs_chan->desc_count = i;
 
        return &mxs_chan->desc;
 
@@ -539,8 +515,8 @@ static int mxs_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
 
        switch (cmd) {
        case DMA_TERMINATE_ALL:
-               mxs_dma_disable_chan(mxs_chan);
                mxs_dma_reset_chan(mxs_chan);
+               mxs_dma_disable_chan(mxs_chan);
                break;
        case DMA_PAUSE:
                mxs_dma_pause_chan(mxs_chan);
@@ -580,7 +556,7 @@ static int __init mxs_dma_init(struct mxs_dma_engine *mxs_dma)
 
        ret = clk_prepare_enable(mxs_dma->clk);
        if (ret)
-               goto err_out;
+               return ret;
 
        ret = mxs_reset_block(mxs_dma->base);
        if (ret)
@@ -604,11 +580,8 @@ static int __init mxs_dma_init(struct mxs_dma_engine *mxs_dma)
        writel(MXS_DMA_CHANNELS_MASK << MXS_DMA_CHANNELS,
                mxs_dma->base + HW_APBHX_CTRL1 + MXS_SET_ADDR);
 
-       clk_disable_unprepare(mxs_dma->clk);
-
-       return 0;
-
 err_out:
+       clk_disable_unprepare(mxs_dma->clk);
        return ret;
 }
 
index a6d0e3d..823f581 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Topcliff PCH DMA controller driver
  * Copyright (c) 2010 Intel Corporation
- * Copyright (C) 2011 OKI SEMICONDUCTOR CO., LTD.
+ * Copyright (C) 2011 LAPIS Semiconductor Co., Ltd.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -99,7 +99,7 @@ struct pch_dma_desc {
 struct pch_dma_chan {
        struct dma_chan         chan;
        void __iomem *membase;
-       enum dma_data_direction dir;
+       enum dma_transfer_direction dir;
        struct tasklet_struct   tasklet;
        unsigned long           err_status;
 
@@ -224,7 +224,7 @@ static void pdc_set_dir(struct dma_chan *chan)
                mask_ctl = DMA_MASK_CTL0_MODE & ~(DMA_CTL0_MODE_MASK_BITS <<
                                       (DMA_CTL0_BITS_PER_CH * chan->chan_id));
                val &= mask_mode;
-               if (pd_chan->dir == DMA_TO_DEVICE)
+               if (pd_chan->dir == DMA_MEM_TO_DEV)
                        val |= 0x1 << (DMA_CTL0_BITS_PER_CH * chan->chan_id +
                                       DMA_CTL0_DIR_SHIFT_BITS);
                else
@@ -242,7 +242,7 @@ static void pdc_set_dir(struct dma_chan *chan)
                mask_ctl = DMA_MASK_CTL2_MODE & ~(DMA_CTL0_MODE_MASK_BITS <<
                                                 (DMA_CTL0_BITS_PER_CH * ch));
                val &= mask_mode;
-               if (pd_chan->dir == DMA_TO_DEVICE)
+               if (pd_chan->dir == DMA_MEM_TO_DEV)
                        val |= 0x1 << (DMA_CTL0_BITS_PER_CH * ch +
                                       DMA_CTL0_DIR_SHIFT_BITS);
                else
@@ -607,7 +607,7 @@ static void pd_issue_pending(struct dma_chan *chan)
 
 static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan,
                        struct scatterlist *sgl, unsigned int sg_len,
-                       enum dma_data_direction direction, unsigned long flags)
+                       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct pch_dma_chan *pd_chan = to_pd_chan(chan);
        struct pch_dma_slave *pd_slave = chan->private;
@@ -623,9 +623,9 @@ static struct dma_async_tx_descriptor *pd_prep_slave_sg(struct dma_chan *chan,
                return NULL;
        }
 
-       if (direction == DMA_FROM_DEVICE)
+       if (direction == DMA_DEV_TO_MEM)
                reg = pd_slave->rx_reg;
-       else if (direction == DMA_TO_DEVICE)
+       else if (direction == DMA_MEM_TO_DEV)
                reg = pd_slave->tx_reg;
        else
                return NULL;
@@ -1018,6 +1018,8 @@ static void __devexit pch_dma_remove(struct pci_dev *pdev)
 #define PCI_DEVICE_ID_ML7223_DMA2_4CH  0x800E
 #define PCI_DEVICE_ID_ML7223_DMA3_4CH  0x8017
 #define PCI_DEVICE_ID_ML7223_DMA4_4CH  0x803B
+#define PCI_DEVICE_ID_ML7831_DMA1_8CH  0x8810
+#define PCI_DEVICE_ID_ML7831_DMA2_4CH  0x8815
 
 DEFINE_PCI_DEVICE_TABLE(pch_dma_id_table) = {
        { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_EG20T_PCH_DMA_8CH), 8 },
@@ -1030,6 +1032,8 @@ DEFINE_PCI_DEVICE_TABLE(pch_dma_id_table) = {
        { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA2_4CH), 4}, /* Video SPI */
        { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA3_4CH), 4}, /* Security */
        { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7223_DMA4_4CH), 4}, /* FPGA */
+       { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7831_DMA1_8CH), 8}, /* UART */
+       { PCI_VDEVICE(ROHM, PCI_DEVICE_ID_ML7831_DMA2_4CH), 4}, /* SPI */
        { 0, },
 };
 
@@ -1057,7 +1061,7 @@ static void __exit pch_dma_exit(void)
 module_init(pch_dma_init);
 module_exit(pch_dma_exit);
 
-MODULE_DESCRIPTION("Intel EG20T PCH / OKI SEMICONDUCTOR ML7213 IOH "
+MODULE_DESCRIPTION("Intel EG20T PCH / LAPIS Semicon ML7213/ML7223/ML7831 IOH "
                   "DMA controller driver");
 MODULE_AUTHOR("Yong Wang <yong.y.wang@intel.com>");
 MODULE_LICENSE("GPL v2");
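
Note: in the pch_dma id table above, the second initializer of each entry
lands in pci_device_id.driver_data and carries the channel count for that
device. A minimal sketch of how a probe routine reads it back (foo_probe is
illustrative, not this driver's code):

	static int foo_probe(struct pci_dev *pdev,
			     const struct pci_device_id *id)
	{
		unsigned int nr_channels = id->driver_data;

		/* size the per-device channel array from nr_channels */
		dev_info(&pdev->dev, "%u DMA channels\n", nr_channels);
		return 0;
	}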
index 09adcfc..b8ec03e 100644 (file)
@@ -350,14 +350,14 @@ static int pl330_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd, unsigned
        case DMA_SLAVE_CONFIG:
                slave_config = (struct dma_slave_config *)arg;
 
-               if (slave_config->direction == DMA_TO_DEVICE) {
+               if (slave_config->direction == DMA_MEM_TO_DEV) {
                        if (slave_config->dst_addr)
                                pch->fifo_addr = slave_config->dst_addr;
                        if (slave_config->dst_addr_width)
                                pch->burst_sz = __ffs(slave_config->dst_addr_width);
                        if (slave_config->dst_maxburst)
                                pch->burst_len = slave_config->dst_maxburst;
-               } else if (slave_config->direction == DMA_FROM_DEVICE) {
+               } else if (slave_config->direction == DMA_DEV_TO_MEM) {
                        if (slave_config->src_addr)
                                pch->fifo_addr = slave_config->src_addr;
                        if (slave_config->src_addr_width)
@@ -621,7 +621,7 @@ static inline int get_burst_len(struct dma_pl330_desc *desc, size_t len)
 
 static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
                struct dma_chan *chan, dma_addr_t dma_addr, size_t len,
-               size_t period_len, enum dma_data_direction direction)
+               size_t period_len, enum dma_transfer_direction direction)
 {
        struct dma_pl330_desc *desc;
        struct dma_pl330_chan *pch = to_pchan(chan);
@@ -636,14 +636,14 @@ static struct dma_async_tx_descriptor *pl330_prep_dma_cyclic(
        }
 
        switch (direction) {
-       case DMA_TO_DEVICE:
+       case DMA_MEM_TO_DEV:
                desc->rqcfg.src_inc = 1;
                desc->rqcfg.dst_inc = 0;
                desc->req.rqtype = MEMTODEV;
                src = dma_addr;
                dst = pch->fifo_addr;
                break;
-       case DMA_FROM_DEVICE:
+       case DMA_DEV_TO_MEM:
                desc->rqcfg.src_inc = 0;
                desc->rqcfg.dst_inc = 1;
                desc->req.rqtype = DEVTOMEM;
@@ -710,7 +710,7 @@ pl330_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dst,
 
 static struct dma_async_tx_descriptor *
 pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flg)
 {
        struct dma_pl330_desc *first, *desc = NULL;
@@ -759,7 +759,7 @@ pl330_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                else
                        list_add_tail(&desc->node, &first->node);
 
-               if (direction == DMA_TO_DEVICE) {
+               if (direction == DMA_MEM_TO_DEV) {
                        desc->rqcfg.src_inc = 1;
                        desc->rqcfg.dst_inc = 0;
                        desc->req.rqtype = MEMTODEV;
@@ -834,17 +834,7 @@ pl330_probe(struct amba_device *adev, const struct amba_id *id)
 
        amba_set_drvdata(adev, pdmac);
 
-#ifdef CONFIG_PM_RUNTIME
-       /* to use the runtime PM helper functions */
-       pm_runtime_enable(&adev->dev);
-
-       /* enable the power domain */
-       if (pm_runtime_get_sync(&adev->dev)) {
-               dev_err(&adev->dev, "failed to get runtime pm\n");
-               ret = -ENODEV;
-               goto probe_err1;
-       }
-#else
+#ifndef CONFIG_PM_RUNTIME
        /* enable dma clk */
        clk_enable(pdmac->clk);
 #endif
@@ -977,10 +967,7 @@ static int __devexit pl330_remove(struct amba_device *adev)
        res = &adev->res;
        release_mem_region(res->start, resource_size(res));
 
-#ifdef CONFIG_PM_RUNTIME
-       pm_runtime_put(&adev->dev);
-       pm_runtime_disable(&adev->dev);
-#else
+#ifndef CONFIG_PM_RUNTIME
        clk_disable(pdmac->clk);
 #endif
 
index 81809c2..54043cd 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/interrupt.h>
 #include <linux/dmaengine.h>
 #include <linux/delay.h>
-#include <linux/dma-mapping.h>
 #include <linux/platform_device.h>
 #include <linux/pm_runtime.h>
 #include <linux/sh_dma.h>
@@ -57,6 +56,15 @@ static LIST_HEAD(sh_dmae_devices);
 static unsigned long sh_dmae_slave_used[BITS_TO_LONGS(SH_DMA_SLAVE_NUMBER)];
 
 static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan, bool all);
+static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan);
+
+static void chclr_write(struct sh_dmae_chan *sh_dc, u32 data)
+{
+       struct sh_dmae_device *shdev = to_sh_dev(sh_dc);
+
+       __raw_writel(data, shdev->chan_reg +
+                    shdev->pdata->channel[sh_dc->id].chclr_offset);
+}
 
 static void sh_dmae_writel(struct sh_dmae_chan *sh_dc, u32 data, u32 reg)
 {
@@ -129,6 +137,15 @@ static int sh_dmae_rst(struct sh_dmae_device *shdev)
 
        dmaor = dmaor_read(shdev) & ~(DMAOR_NMIF | DMAOR_AE | DMAOR_DME);
 
+       if (shdev->pdata->chclr_present) {
+               int i;
+               for (i = 0; i < shdev->pdata->channel_num; i++) {
+                       struct sh_dmae_chan *sh_chan = shdev->chan[i];
+                       if (sh_chan)
+                               chclr_write(sh_chan, 0);
+               }
+       }
+
        dmaor_write(shdev, dmaor | shdev->pdata->dmaor_init);
 
        dmaor = dmaor_read(shdev);
@@ -139,6 +156,10 @@ static int sh_dmae_rst(struct sh_dmae_device *shdev)
                dev_warn(shdev->common.dev, "Can't initialize DMAOR.\n");
                return -EIO;
        }
+       if (shdev->pdata->dmaor_init & ~dmaor)
+               dev_warn(shdev->common.dev,
+                        "DMAOR=0x%x hasn't latched the initial value 0x%x.\n",
+                        dmaor, shdev->pdata->dmaor_init);
        return 0;
 }
 
@@ -259,8 +280,6 @@ static int dmae_set_dmars(struct sh_dmae_chan *sh_chan, u16 val)
        return 0;
 }
 
-static void sh_chan_xfer_ld_queue(struct sh_dmae_chan *sh_chan);
-
 static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx)
 {
        struct sh_desc *desc = tx_to_sh_desc(tx), *chunk, *last = desc, *c;
@@ -340,6 +359,8 @@ static dma_cookie_t sh_dmae_tx_submit(struct dma_async_tx_descriptor *tx)
                                sh_chan_xfer_ld_queue(sh_chan);
                        sh_chan->pm_state = DMAE_PM_ESTABLISHED;
                }
+       } else {
+               sh_chan->pm_state = DMAE_PM_PENDING;
        }
 
        spin_unlock_irq(&sh_chan->desc_lock);
@@ -479,19 +500,19 @@ static void sh_dmae_free_chan_resources(struct dma_chan *chan)
  * @sh_chan:   DMA channel
  * @flags:     DMA transfer flags
  * @dest:      destination DMA address, incremented when direction equals
- *             DMA_FROM_DEVICE or DMA_BIDIRECTIONAL
+ *             DMA_DEV_TO_MEM
  * @src:       source DMA address, incremented when direction equals
- *             DMA_TO_DEVICE or DMA_BIDIRECTIONAL
+ *             DMA_MEM_TO_DEV
  * @len:       DMA transfer length
  * @first:     if NULL, set to the current descriptor and cookie set to -EBUSY
  * @direction: needed for slave DMA to decide which address to keep constant,
- *             equals DMA_BIDIRECTIONAL for MEMCPY
+ *             equals DMA_MEM_TO_MEM for MEMCPY
  * Returns 0 or an error
  * Locks: called with desc_lock held
  */
 static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan,
        unsigned long flags, dma_addr_t *dest, dma_addr_t *src, size_t *len,
-       struct sh_desc **first, enum dma_data_direction direction)
+       struct sh_desc **first, enum dma_transfer_direction direction)
 {
        struct sh_desc *new;
        size_t copy_size;
@@ -531,9 +552,9 @@ static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan,
        new->direction = direction;
 
        *len -= copy_size;
-       if (direction == DMA_BIDIRECTIONAL || direction == DMA_TO_DEVICE)
+       if (direction == DMA_MEM_TO_MEM || direction == DMA_MEM_TO_DEV)
                *src += copy_size;
-       if (direction == DMA_BIDIRECTIONAL || direction == DMA_FROM_DEVICE)
+       if (direction == DMA_MEM_TO_MEM || direction == DMA_DEV_TO_MEM)
                *dest += copy_size;
 
        return new;
@@ -546,12 +567,12 @@ static struct sh_desc *sh_dmae_add_desc(struct sh_dmae_chan *sh_chan,
  * converted to scatter-gather to guarantee consistent locking and a correct
  * list manipulation. For slave DMA direction carries the usual meaning, and,
  * logically, the SG list is RAM and the addr variable contains slave address,
- * e.g., the FIFO I/O register. For MEMCPY direction equals DMA_BIDIRECTIONAL
+ * e.g., the FIFO I/O register. For MEMCPY direction equals DMA_MEM_TO_MEM
  * and the SG list contains only one element and points at the source buffer.
  */
 static struct dma_async_tx_descriptor *sh_dmae_prep_sg(struct sh_dmae_chan *sh_chan,
        struct scatterlist *sgl, unsigned int sg_len, dma_addr_t *addr,
-       enum dma_data_direction direction, unsigned long flags)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct scatterlist *sg;
        struct sh_desc *first = NULL, *new = NULL /* compiler... */;
@@ -592,7 +613,7 @@ static struct dma_async_tx_descriptor *sh_dmae_prep_sg(struct sh_dmae_chan *sh_c
                        dev_dbg(sh_chan->dev, "Add SG #%d@%p[%d], dma %llx\n",
                                i, sg, len, (unsigned long long)sg_addr);
 
-                       if (direction == DMA_FROM_DEVICE)
+                       if (direction == DMA_DEV_TO_MEM)
                                new = sh_dmae_add_desc(sh_chan, flags,
                                                &sg_addr, addr, &len, &first,
                                                direction);
@@ -646,13 +667,13 @@ static struct dma_async_tx_descriptor *sh_dmae_prep_memcpy(
        sg_dma_address(&sg) = dma_src;
        sg_dma_len(&sg) = len;
 
-       return sh_dmae_prep_sg(sh_chan, &sg, 1, &dma_dest, DMA_BIDIRECTIONAL,
+       return sh_dmae_prep_sg(sh_chan, &sg, 1, &dma_dest, DMA_MEM_TO_MEM,
                               flags);
 }
 
 static struct dma_async_tx_descriptor *sh_dmae_prep_slave_sg(
        struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len,
-       enum dma_data_direction direction, unsigned long flags)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct sh_dmae_slave *param;
        struct sh_dmae_chan *sh_chan;
@@ -996,7 +1017,7 @@ static void dmae_do_tasklet(unsigned long data)
        spin_lock_irq(&sh_chan->desc_lock);
        list_for_each_entry(desc, &sh_chan->ld_queue, node) {
                if (desc->mark == DESC_SUBMITTED &&
-                   ((desc->direction == DMA_FROM_DEVICE &&
+                   ((desc->direction == DMA_DEV_TO_MEM &&
                      (desc->hw.dar + desc->hw.tcr) == dar_buf) ||
                     (desc->hw.sar + desc->hw.tcr) == sar_buf)) {
                        dev_dbg(sh_chan->dev, "done #%d@%p dst %u\n",
@@ -1225,6 +1246,8 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
 
        platform_set_drvdata(pdev, shdev);
 
+       shdev->common.dev = &pdev->dev;
+
        pm_runtime_enable(&pdev->dev);
        pm_runtime_get_sync(&pdev->dev);
 
@@ -1254,7 +1277,6 @@ static int __init sh_dmae_probe(struct platform_device *pdev)
        shdev->common.device_prep_slave_sg = sh_dmae_prep_slave_sg;
        shdev->common.device_control = sh_dmae_control;
 
-       shdev->common.dev = &pdev->dev;
        /* Default transfer size of 32 bytes requires 32-byte alignment */
        shdev->common.copy_align = LOG2_DEFAULT_XFER_SIZE;
 
@@ -1435,22 +1457,17 @@ static int sh_dmae_runtime_resume(struct device *dev)
 #ifdef CONFIG_PM
 static int sh_dmae_suspend(struct device *dev)
 {
-       struct sh_dmae_device *shdev = dev_get_drvdata(dev);
-       int i;
-
-       for (i = 0; i < shdev->pdata->channel_num; i++) {
-               struct sh_dmae_chan *sh_chan = shdev->chan[i];
-               if (sh_chan->descs_allocated)
-                       sh_chan->pm_error = pm_runtime_put_sync(dev);
-       }
-
        return 0;
 }
 
 static int sh_dmae_resume(struct device *dev)
 {
        struct sh_dmae_device *shdev = dev_get_drvdata(dev);
-       int i;
+       int i, ret;
+
+       ret = sh_dmae_rst(shdev);
+       if (ret < 0)
+               dev_err(dev, "Failed to reset!\n");
 
        for (i = 0; i < shdev->pdata->channel_num; i++) {
                struct sh_dmae_chan *sh_chan = shdev->chan[i];
@@ -1459,9 +1476,6 @@ static int sh_dmae_resume(struct device *dev)
                if (!sh_chan->descs_allocated)
                        continue;
 
-               if (!sh_chan->pm_error)
-                       pm_runtime_get_sync(dev);
-
                if (param) {
                        const struct sh_dmae_slave_config *cfg = param->config;
                        dmae_set_dmars(sh_chan, cfg->mid_rid);
diff --git a/drivers/dma/sirf-dma.c b/drivers/dma/sirf-dma.c
new file mode 100644 (file)
index 0000000..2333810
--- /dev/null
@@ -0,0 +1,707 @@
+/*
+ * DMA controller driver for CSR SiRFprimaII
+ *
+ * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
+ *
+ * Licensed under GPLv2 or later.
+ */
+
+#include <linux/module.h>
+#include <linux/dmaengine.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/of_irq.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/sirfsoc_dma.h>
+
+#define SIRFSOC_DMA_DESCRIPTORS                 16
+#define SIRFSOC_DMA_CHANNELS                    16
+
+#define SIRFSOC_DMA_CH_ADDR                     0x00
+#define SIRFSOC_DMA_CH_XLEN                     0x04
+#define SIRFSOC_DMA_CH_YLEN                     0x08
+#define SIRFSOC_DMA_CH_CTRL                     0x0C
+
+#define SIRFSOC_DMA_WIDTH_0                     0x100
+#define SIRFSOC_DMA_CH_VALID                    0x140
+#define SIRFSOC_DMA_CH_INT                      0x144
+#define SIRFSOC_DMA_INT_EN                      0x148
+#define SIRFSOC_DMA_CH_LOOP_CTRL                0x150
+
+#define SIRFSOC_DMA_MODE_CTRL_BIT               4
+#define SIRFSOC_DMA_DIR_CTRL_BIT                5
+
+/* the xlen and dma_width registers are in units of 4-byte words */
+#define SIRFSOC_DMA_WORD_LEN                   4
+
+struct sirfsoc_dma_desc {
+       struct dma_async_tx_descriptor  desc;
+       struct list_head                node;
+
+       /* SiRFprimaII 2D-DMA parameters */
+
+       int             xlen;           /* DMA xlen */
+       int             ylen;           /* DMA ylen */
+       int             width;          /* DMA width */
+       int             dir;
+       bool            cyclic;         /* is loop DMA? */
+       u32             addr;           /* DMA buffer address */
+};
+
+struct sirfsoc_dma_chan {
+       struct dma_chan                 chan;
+       struct list_head                free;
+       struct list_head                prepared;
+       struct list_head                queued;
+       struct list_head                active;
+       struct list_head                completed;
+       dma_cookie_t                    completed_cookie;
+       unsigned long                   happened_cyclic;
+       unsigned long                   completed_cyclic;
+
+       /* Lock for this structure */
+       spinlock_t                      lock;
+
+       int                             mode;
+};
+
+struct sirfsoc_dma {
+       struct dma_device               dma;
+       struct tasklet_struct           tasklet;
+       struct sirfsoc_dma_chan         channels[SIRFSOC_DMA_CHANNELS];
+       void __iomem                    *base;
+       int                             irq;
+};
+
+#define DRV_NAME       "sirfsoc_dma"
+
+/* Convert struct dma_chan to struct sirfsoc_dma_chan */
+static inline
+struct sirfsoc_dma_chan *dma_chan_to_sirfsoc_dma_chan(struct dma_chan *c)
+{
+       return container_of(c, struct sirfsoc_dma_chan, chan);
+}
+
+/* Convert struct dma_chan to struct sirfsoc_dma */
+static inline struct sirfsoc_dma *dma_chan_to_sirfsoc_dma(struct dma_chan *c)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(c);
+       return container_of(schan, struct sirfsoc_dma, channels[c->chan_id]);
+}
+
+/* Execute all queued DMA descriptors */
+static void sirfsoc_dma_execute(struct sirfsoc_dma_chan *schan)
+{
+       struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
+       int cid = schan->chan.chan_id;
+       struct sirfsoc_dma_desc *sdesc = NULL;
+
+       /*
+        * the lock is already held by the callers, so we don't take
+        * it again
+        */
+
+       sdesc = list_first_entry(&schan->queued, struct sirfsoc_dma_desc,
+               node);
+       /* Move the first queued descriptor to active list */
+       list_move_tail(&schan->queued, &schan->active);
+
+       /* Start the DMA transfer */
+       writel_relaxed(sdesc->width, sdma->base + SIRFSOC_DMA_WIDTH_0 +
+               cid * 4);
+       writel_relaxed(cid | (schan->mode << SIRFSOC_DMA_MODE_CTRL_BIT) |
+               (sdesc->dir << SIRFSOC_DMA_DIR_CTRL_BIT),
+               sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_CTRL);
+       writel_relaxed(sdesc->xlen, sdma->base + cid * 0x10 +
+               SIRFSOC_DMA_CH_XLEN);
+       writel_relaxed(sdesc->ylen, sdma->base + cid * 0x10 +
+               SIRFSOC_DMA_CH_YLEN);
+       writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) |
+               (1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
+
+       /*
+        * writel has an implicit memory write barrier to make sure data is
+        * flushed into memory before starting DMA
+        */
+       writel(sdesc->addr >> 2, sdma->base + cid * 0x10 + SIRFSOC_DMA_CH_ADDR);
+
+       if (sdesc->cyclic) {
+               writel((1 << cid) | 1 << (cid + 16) |
+                       readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL),
+                       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+               schan->happened_cyclic = schan->completed_cyclic = 0;
+       }
+}
+
+/* Interrupt handler */
+static irqreturn_t sirfsoc_dma_irq(int irq, void *data)
+{
+       struct sirfsoc_dma *sdma = data;
+       struct sirfsoc_dma_chan *schan;
+       struct sirfsoc_dma_desc *sdesc = NULL;
+       u32 is;
+       int ch;
+
+       is = readl(sdma->base + SIRFSOC_DMA_CH_INT);
+       while ((ch = fls(is) - 1) >= 0) {
+               is &= ~(1 << ch);
+               writel_relaxed(1 << ch, sdma->base + SIRFSOC_DMA_CH_INT);
+               schan = &sdma->channels[ch];
+
+               spin_lock(&schan->lock);
+
+               sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc,
+                       node);
+               if (!sdesc->cyclic) {
+                       /* Execute queued descriptors */
+                       list_splice_tail_init(&schan->active, &schan->completed);
+                       if (!list_empty(&schan->queued))
+                               sirfsoc_dma_execute(schan);
+               } else
+                       schan->happened_cyclic++;
+
+               spin_unlock(&schan->lock);
+       }
+
+       /* Schedule tasklet */
+       tasklet_schedule(&sdma->tasklet);
+
+       return IRQ_HANDLED;
+}
+
+/* process completed descriptors */
+static void sirfsoc_dma_process_completed(struct sirfsoc_dma *sdma)
+{
+       dma_cookie_t last_cookie = 0;
+       struct sirfsoc_dma_chan *schan;
+       struct sirfsoc_dma_desc *sdesc;
+       struct dma_async_tx_descriptor *desc;
+       unsigned long flags;
+       unsigned long happened_cyclic;
+       LIST_HEAD(list);
+       int i;
+
+       for (i = 0; i < sdma->dma.chancnt; i++) {
+               schan = &sdma->channels[i];
+
+               /* Get all completed descriptors */
+               spin_lock_irqsave(&schan->lock, flags);
+               if (!list_empty(&schan->completed)) {
+                       list_splice_tail_init(&schan->completed, &list);
+                       spin_unlock_irqrestore(&schan->lock, flags);
+
+                       /* Execute callbacks and run dependencies */
+                       list_for_each_entry(sdesc, &list, node) {
+                               desc = &sdesc->desc;
+
+                               if (desc->callback)
+                                       desc->callback(desc->callback_param);
+
+                               last_cookie = desc->cookie;
+                               dma_run_dependencies(desc);
+                       }
+
+                       /* Free descriptors */
+                       spin_lock_irqsave(&schan->lock, flags);
+                       list_splice_tail_init(&list, &schan->free);
+                       schan->completed_cookie = last_cookie;
+                       spin_unlock_irqrestore(&schan->lock, flags);
+               } else {
+                       /* for cyclic channel, desc is always in active list */
+                       sdesc = list_first_entry(&schan->active, struct sirfsoc_dma_desc,
+                               node);
+
+                       if (!sdesc || !sdesc->cyclic) {
+                               /* without active cyclic DMA */
+                               spin_unlock_irqrestore(&schan->lock, flags);
+                               continue;
+                       }
+
+                       /* cyclic DMA */
+                       happened_cyclic = schan->happened_cyclic;
+                       spin_unlock_irqrestore(&schan->lock, flags);
+
+                       desc = &sdesc->desc;
+                       while (happened_cyclic != schan->completed_cyclic) {
+                               if (desc->callback)
+                                       desc->callback(desc->callback_param);
+                               schan->completed_cyclic++;
+                       }
+               }
+       }
+}
+
+/* DMA Tasklet */
+static void sirfsoc_dma_tasklet(unsigned long data)
+{
+       struct sirfsoc_dma *sdma = (void *)data;
+
+       sirfsoc_dma_process_completed(sdma);
+}
+
+/* Submit descriptor to hardware */
+static dma_cookie_t sirfsoc_dma_tx_submit(struct dma_async_tx_descriptor *txd)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(txd->chan);
+       struct sirfsoc_dma_desc *sdesc;
+       unsigned long flags;
+       dma_cookie_t cookie;
+
+       sdesc = container_of(txd, struct sirfsoc_dma_desc, desc);
+
+       spin_lock_irqsave(&schan->lock, flags);
+
+       /* Move descriptor to queue */
+       list_move_tail(&sdesc->node, &schan->queued);
+
+       /* Update cookie */
+       cookie = schan->chan.cookie + 1;
+       if (cookie <= 0)
+               cookie = 1;
+
+       schan->chan.cookie = cookie;
+       sdesc->desc.cookie = cookie;
+
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       return cookie;
+}
+
+static int sirfsoc_dma_slave_config(struct sirfsoc_dma_chan *schan,
+       struct dma_slave_config *config)
+{
+       unsigned long flags;
+
+       if ((config->src_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES) ||
+               (config->dst_addr_width != DMA_SLAVE_BUSWIDTH_4_BYTES))
+               return -EINVAL;
+
+       spin_lock_irqsave(&schan->lock, flags);
+       schan->mode = (config->src_maxburst == 4 ? 1 : 0);
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       return 0;
+}
+
+static int sirfsoc_dma_terminate_all(struct sirfsoc_dma_chan *schan)
+{
+       struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(&schan->chan);
+       int cid = schan->chan.chan_id;
+       unsigned long flags;
+
+       writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_INT_EN) &
+               ~(1 << cid), sdma->base + SIRFSOC_DMA_INT_EN);
+       writel_relaxed(1 << cid, sdma->base + SIRFSOC_DMA_CH_VALID);
+
+       writel_relaxed(readl_relaxed(sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL)
+               & ~((1 << cid) | 1 << (cid + 16)),
+                       sdma->base + SIRFSOC_DMA_CH_LOOP_CTRL);
+
+       spin_lock_irqsave(&schan->lock, flags);
+       list_splice_tail_init(&schan->active, &schan->free);
+       list_splice_tail_init(&schan->queued, &schan->free);
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       return 0;
+}
+
+static int sirfsoc_dma_control(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
+       unsigned long arg)
+{
+       struct dma_slave_config *config;
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+
+       switch (cmd) {
+       case DMA_TERMINATE_ALL:
+               return sirfsoc_dma_terminate_all(schan);
+       case DMA_SLAVE_CONFIG:
+               config = (struct dma_slave_config *)arg;
+               return sirfsoc_dma_slave_config(schan, config);
+
+       default:
+               break;
+       }
+
+       return -ENOSYS;
+}
+
+/* Alloc channel resources */
+static int sirfsoc_dma_alloc_chan_resources(struct dma_chan *chan)
+{
+       struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       struct sirfsoc_dma_desc *sdesc;
+       unsigned long flags;
+       LIST_HEAD(descs);
+       int i;
+
+       /* Alloc descriptors for this channel */
+       for (i = 0; i < SIRFSOC_DMA_DESCRIPTORS; i++) {
+               sdesc = kzalloc(sizeof(*sdesc), GFP_KERNEL);
+               if (!sdesc) {
+                       dev_notice(sdma->dma.dev, "Memory allocation error. "
+                               "Allocated only %u descriptors\n", i);
+                       break;
+               }
+
+               dma_async_tx_descriptor_init(&sdesc->desc, chan);
+               sdesc->desc.flags = DMA_CTRL_ACK;
+               sdesc->desc.tx_submit = sirfsoc_dma_tx_submit;
+
+               list_add_tail(&sdesc->node, &descs);
+       }
+
+       /* Return error only if no descriptors were allocated */
+       if (i == 0)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&schan->lock, flags);
+
+       list_splice_tail_init(&descs, &schan->free);
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       return i;
+}
+
+/* Free channel resources */
+static void sirfsoc_dma_free_chan_resources(struct dma_chan *chan)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       struct sirfsoc_dma_desc *sdesc, *tmp;
+       unsigned long flags;
+       LIST_HEAD(descs);
+
+       spin_lock_irqsave(&schan->lock, flags);
+
+       /* Channel must be idle */
+       BUG_ON(!list_empty(&schan->prepared));
+       BUG_ON(!list_empty(&schan->queued));
+       BUG_ON(!list_empty(&schan->active));
+       BUG_ON(!list_empty(&schan->completed));
+
+       /* Move data */
+       list_splice_tail_init(&schan->free, &descs);
+
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       /* Free descriptors */
+       list_for_each_entry_safe(sdesc, tmp, &descs, node)
+               kfree(sdesc);
+}
+
+/* Send pending descriptor to hardware */
+static void sirfsoc_dma_issue_pending(struct dma_chan *chan)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       unsigned long flags;
+
+       spin_lock_irqsave(&schan->lock, flags);
+
+       if (list_empty(&schan->active) && !list_empty(&schan->queued))
+               sirfsoc_dma_execute(schan);
+
+       spin_unlock_irqrestore(&schan->lock, flags);
+}
+
+/* Check request completion status */
+static enum dma_status
+sirfsoc_dma_tx_status(struct dma_chan *chan, dma_cookie_t cookie,
+       struct dma_tx_state *txstate)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       unsigned long flags;
+       dma_cookie_t last_used;
+       dma_cookie_t last_complete;
+
+       spin_lock_irqsave(&schan->lock, flags);
+       last_used = schan->chan.cookie;
+       last_complete = schan->completed_cookie;
+       spin_unlock_irqrestore(&schan->lock, flags);
+
+       dma_set_tx_state(txstate, last_complete, last_used, 0);
+       return dma_async_is_complete(cookie, last_complete, last_used);
+}
+
+static struct dma_async_tx_descriptor *sirfsoc_dma_prep_interleaved(
+       struct dma_chan *chan, struct dma_interleaved_template *xt,
+       unsigned long flags)
+{
+       struct sirfsoc_dma *sdma = dma_chan_to_sirfsoc_dma(chan);
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       struct sirfsoc_dma_desc *sdesc = NULL;
+       unsigned long iflags;
+       int ret;
+
+       if ((xt->dir != DMA_MEM_TO_DEV) && (xt->dir != DMA_DEV_TO_MEM)) {
+               ret = -EINVAL;
+               goto err_dir;
+       }
+
+       /* Get free descriptor */
+       spin_lock_irqsave(&schan->lock, iflags);
+       if (!list_empty(&schan->free)) {
+               sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
+                       node);
+               list_del(&sdesc->node);
+       }
+       spin_unlock_irqrestore(&schan->lock, iflags);
+
+       if (!sdesc) {
+               /* try to free completed descriptors */
+               sirfsoc_dma_process_completed(sdma);
+               ret = 0;
+               goto no_desc;
+       }
+
+       /* Place descriptor in prepared list */
+       spin_lock_irqsave(&schan->lock, iflags);
+
+       /*
+        * Number of chunks in a frame can only be 1 for prima2
+        * and ylen (number of frames - 1) must be at least 0
+        */
+       if ((xt->frame_size == 1) && (xt->numf > 0)) {
+               sdesc->cyclic = 0;
+               sdesc->xlen = xt->sgl[0].size / SIRFSOC_DMA_WORD_LEN;
+               sdesc->width = (xt->sgl[0].size + xt->sgl[0].icg) /
+                               SIRFSOC_DMA_WORD_LEN;
+               sdesc->ylen = xt->numf - 1;
+               if (xt->dir == DMA_MEM_TO_DEV) {
+                       sdesc->addr = xt->src_start;
+                       sdesc->dir = 1;
+               } else {
+                       sdesc->addr = xt->dst_start;
+                       sdesc->dir = 0;
+               }
+
+               list_add_tail(&sdesc->node, &schan->prepared);
+       } else {
+               pr_err("sirfsoc DMA Invalid xfer\n");
+               ret = -EINVAL;
+               goto err_xfer;
+       }
+       spin_unlock_irqrestore(&schan->lock, iflags);
+
+       return &sdesc->desc;
+err_xfer:
+       spin_unlock_irqrestore(&schan->lock, iflags);
+no_desc:
+err_dir:
+       return ERR_PTR(ret);
+}
+
+static struct dma_async_tx_descriptor *
+sirfsoc_dma_prep_cyclic(struct dma_chan *chan, dma_addr_t addr,
+       size_t buf_len, size_t period_len,
+       enum dma_transfer_direction direction)
+{
+       struct sirfsoc_dma_chan *schan = dma_chan_to_sirfsoc_dma_chan(chan);
+       struct sirfsoc_dma_desc *sdesc = NULL;
+       unsigned long iflags;
+
+       /*
+        * we only support cyclic transfers with 2 periods.
+        * If the X-length is set to 0, the channel operates in loop mode.
+        * The DMA address keeps increasing until reaching the end of a loop
+        * area whose size is defined by (DMA_WIDTH x (Y_LENGTH + 1)). Then
+        * the DMA address goes back to the beginning of this area.
+        * In loop mode, the DMA data region is divided into two parts, BUFA
+        * and BUFB. DMA controller generates interrupts twice in each loop:
+        * when the DMA address reaches the end of BUFA or the end of
+        * BUFB.
+        */
+       if (buf_len !=  2 * period_len)
+               return ERR_PTR(-EINVAL);
+
+       /* Get free descriptor */
+       spin_lock_irqsave(&schan->lock, iflags);
+       if (!list_empty(&schan->free)) {
+               sdesc = list_first_entry(&schan->free, struct sirfsoc_dma_desc,
+                       node);
+               list_del(&sdesc->node);
+       }
+       spin_unlock_irqrestore(&schan->lock, iflags);
+
+       if (!sdesc)
+               return 0;
+
+       /* Place descriptor in prepared list */
+       spin_lock_irqsave(&schan->lock, iflags);
+       sdesc->addr = addr;
+       sdesc->cyclic = 1;
+       sdesc->xlen = 0;
+       sdesc->ylen = buf_len / SIRFSOC_DMA_WORD_LEN - 1;
+       sdesc->width = 1;
+       list_add_tail(&sdesc->node, &schan->prepared);
+       spin_unlock_irqrestore(&schan->lock, iflags);
+
+       return &sdesc->desc;
+}
+
+/*
+ * The DMA controller consists of 16 independent DMA channels.
+ * Each channel is allocated to a different function.
+ */
+bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id)
+{
+       unsigned int ch_nr = (unsigned int) chan_id;
+
+       if (ch_nr == chan->chan_id +
+               chan->device->dev_id * SIRFSOC_DMA_CHANNELS)
+               return true;
+
+       return false;
+}
+EXPORT_SYMBOL(sirfsoc_dma_filter_id);
+
+static int __devinit sirfsoc_dma_probe(struct platform_device *op)
+{
+       struct device_node *dn = op->dev.of_node;
+       struct device *dev = &op->dev;
+       struct dma_device *dma;
+       struct sirfsoc_dma *sdma;
+       struct sirfsoc_dma_chan *schan;
+       struct resource res;
+       ulong regs_start, regs_size;
+       u32 id;
+       int ret, i;
+
+       sdma = devm_kzalloc(dev, sizeof(*sdma), GFP_KERNEL);
+       if (!sdma) {
+               dev_err(dev, "Memory exhausted!\n");
+               return -ENOMEM;
+       }
+
+       if (of_property_read_u32(dn, "cell-index", &id)) {
+               dev_err(dev, "Fail to get DMAC index\n");
+               ret = -ENODEV;
+               goto free_mem;
+       }
+
+       sdma->irq = irq_of_parse_and_map(dn, 0);
+       if (sdma->irq == NO_IRQ) {
+               dev_err(dev, "Error mapping IRQ!\n");
+               ret = -EINVAL;
+               goto free_mem;
+       }
+
+       ret = of_address_to_resource(dn, 0, &res);
+       if (ret) {
+               dev_err(dev, "Error parsing memory region!\n");
+               goto free_mem;
+       }
+
+       regs_start = res.start;
+       regs_size = resource_size(&res);
+
+       sdma->base = devm_ioremap(dev, regs_start, regs_size);
+       if (!sdma->base) {
+               dev_err(dev, "Error mapping memory region!\n");
+               ret = -ENOMEM;
+               goto irq_dispose;
+       }
+
+       ret = devm_request_irq(dev, sdma->irq, &sirfsoc_dma_irq, 0, DRV_NAME,
+               sdma);
+       if (ret) {
+               dev_err(dev, "Error requesting IRQ!\n");
+               ret = -EINVAL;
+               goto unmap_mem;
+       }
+
+       dma = &sdma->dma;
+       dma->dev = dev;
+       dma->chancnt = SIRFSOC_DMA_CHANNELS;
+
+       dma->device_alloc_chan_resources = sirfsoc_dma_alloc_chan_resources;
+       dma->device_free_chan_resources = sirfsoc_dma_free_chan_resources;
+       dma->device_issue_pending = sirfsoc_dma_issue_pending;
+       dma->device_control = sirfsoc_dma_control;
+       dma->device_tx_status = sirfsoc_dma_tx_status;
+       dma->device_prep_interleaved_dma = sirfsoc_dma_prep_interleaved;
+       dma->device_prep_dma_cyclic = sirfsoc_dma_prep_cyclic;
+
+       INIT_LIST_HEAD(&dma->channels);
+       dma_cap_set(DMA_SLAVE, dma->cap_mask);
+       dma_cap_set(DMA_CYCLIC, dma->cap_mask);
+       dma_cap_set(DMA_INTERLEAVE, dma->cap_mask);
+       dma_cap_set(DMA_PRIVATE, dma->cap_mask);
+
+       for (i = 0; i < dma->chancnt; i++) {
+               schan = &sdma->channels[i];
+
+               schan->chan.device = dma;
+               schan->chan.cookie = 1;
+               schan->completed_cookie = schan->chan.cookie;
+
+               INIT_LIST_HEAD(&schan->free);
+               INIT_LIST_HEAD(&schan->prepared);
+               INIT_LIST_HEAD(&schan->queued);
+               INIT_LIST_HEAD(&schan->active);
+               INIT_LIST_HEAD(&schan->completed);
+
+               spin_lock_init(&schan->lock);
+               list_add_tail(&schan->chan.device_node, &dma->channels);
+       }
+
+       tasklet_init(&sdma->tasklet, sirfsoc_dma_tasklet, (unsigned long)sdma);
+
+       /* Register DMA engine */
+       dev_set_drvdata(dev, sdma);
+       ret = dma_async_device_register(dma);
+       if (ret)
+               goto free_irq;
+
+       dev_info(dev, "initialized SIRFSOC DMAC driver\n");
+
+       return 0;
+
+free_irq:
+       devm_free_irq(dev, sdma->irq, sdma);
+irq_dispose:
+       irq_dispose_mapping(sdma->irq);
+unmap_mem:
+       iounmap(sdma->base);
+free_mem:
+       devm_kfree(dev, sdma);
+       return ret;
+}
+
+static int __devexit sirfsoc_dma_remove(struct platform_device *op)
+{
+       struct device *dev = &op->dev;
+       struct sirfsoc_dma *sdma = dev_get_drvdata(dev);
+
+       dma_async_device_unregister(&sdma->dma);
+       devm_free_irq(dev, sdma->irq, sdma);
+       irq_dispose_mapping(sdma->irq);
+       iounmap(sdma->base);
+       devm_kfree(dev, sdma);
+       return 0;
+}
+
+static struct of_device_id sirfsoc_dma_match[] = {
+       { .compatible = "sirf,prima2-dmac", },
+       {},
+};
+
+static struct platform_driver sirfsoc_dma_driver = {
+       .probe          = sirfsoc_dma_probe,
+       .remove         = __devexit_p(sirfsoc_dma_remove),
+       .driver = {
+               .name = DRV_NAME,
+               .owner = THIS_MODULE,
+               .of_match_table = sirfsoc_dma_match,
+       },
+};
+
+module_platform_driver(sirfsoc_dma_driver);
+
+MODULE_AUTHOR("Rongjun Ying <rongjun.ying@csr.com>, "
+       "Barry Song <baohua.song@csr.com>");
+MODULE_DESCRIPTION("SIRFSOC DMA control driver");
+MODULE_LICENSE("GPL v2");
index 13259ca..cc5ecbc 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/platform_device.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
 #include <linux/err.h>
 #include <linux/amba/bus.h>
 
@@ -32,6 +34,9 @@
 /* Maximum iterations taken before giving up suspending a channel */
 #define D40_SUSPEND_MAX_IT 500
 
+/* Milliseconds */
+#define DMA40_AUTOSUSPEND_DELAY        100
+
 /* Hardware requirement on LCLA alignment */
 #define LCLA_ALIGNMENT 0x40000
 
@@ -62,6 +67,55 @@ enum d40_command {
        D40_DMA_SUSPENDED       = 3
 };
 
+/*
+ * These are the registers that have to be saved and later restored
+ * when the DMA hw is powered off.
+ * TODO: Add save/restore of D40_DREG_GCC on dma40 v3 or later, if that works.
+ */
+static u32 d40_backup_regs[] = {
+       D40_DREG_LCPA,
+       D40_DREG_LCLA,
+       D40_DREG_PRMSE,
+       D40_DREG_PRMSO,
+       D40_DREG_PRMOE,
+       D40_DREG_PRMOO,
+};
+
+#define BACKUP_REGS_SZ ARRAY_SIZE(d40_backup_regs)
+
+/* TODO: Check if all these registers have to be saved/restored on dma40 v3 */
+static u32 d40_backup_regs_v3[] = {
+       D40_DREG_PSEG1,
+       D40_DREG_PSEG2,
+       D40_DREG_PSEG3,
+       D40_DREG_PSEG4,
+       D40_DREG_PCEG1,
+       D40_DREG_PCEG2,
+       D40_DREG_PCEG3,
+       D40_DREG_PCEG4,
+       D40_DREG_RSEG1,
+       D40_DREG_RSEG2,
+       D40_DREG_RSEG3,
+       D40_DREG_RSEG4,
+       D40_DREG_RCEG1,
+       D40_DREG_RCEG2,
+       D40_DREG_RCEG3,
+       D40_DREG_RCEG4,
+};
+
+#define BACKUP_REGS_SZ_V3 ARRAY_SIZE(d40_backup_regs_v3)
+
+static u32 d40_backup_regs_chan[] = {
+       D40_CHAN_REG_SSCFG,
+       D40_CHAN_REG_SSELT,
+       D40_CHAN_REG_SSPTR,
+       D40_CHAN_REG_SSLNK,
+       D40_CHAN_REG_SDCFG,
+       D40_CHAN_REG_SDELT,
+       D40_CHAN_REG_SDPTR,
+       D40_CHAN_REG_SDLNK,
+};
+
 /**
  * struct d40_lli_pool - Structure for keeping LLIs in memory
  *
@@ -96,7 +150,7 @@ struct d40_lli_pool {
  * during a transfer.
  * @node: List entry.
  * @is_in_client_list: true if the client owns this descriptor.
- * the previous one.
+ * @cyclic: true if this is a cyclic job
  *
  * This descriptor is used for both logical and physical transfers.
  */
@@ -143,6 +197,7 @@ struct d40_lcla_pool {
  * channels.
  *
  * @lock: A lock protecting this entity.
+ * @reserved: True if the channel is reserved, e.g. used by the secure world.
  * @num: The physical channel number of this entity.
  * @allocated_src: Bit mapped to show which src event line's are mapped to
  * this physical channel. Can also be free or physically allocated.
@@ -152,6 +207,7 @@ struct d40_lcla_pool {
  */
 struct d40_phy_res {
        spinlock_t lock;
+       bool       reserved;
        int        num;
        u32        allocated_src;
        u32        allocated_dst;
@@ -185,7 +241,6 @@ struct d40_base;
  * @src_def_cfg: Default cfg register setting for src.
  * @dst_def_cfg: Default cfg register setting for dst.
  * @log_def: Default logical channel settings.
- * @lcla: Space for one dst src pair for logical channel transfers.
  * @lcpa: Pointer to dst and src lcpa settings.
  * @runtime_addr: runtime configured address.
  * @runtime_direction: runtime configured direction.
@@ -217,7 +272,7 @@ struct d40_chan {
        struct d40_log_lli_full         *lcpa;
        /* Runtime reconfiguration */
        dma_addr_t                      runtime_addr;
-       enum dma_data_direction         runtime_direction;
+       enum dma_transfer_direction     runtime_direction;
 };
 
 /**
@@ -241,6 +296,7 @@ struct d40_chan {
  * @dma_both: dma_device channels that can do both memcpy and slave transfers.
  * @dma_slave: dma_device channels that can only do slave transfers.
  * @dma_memcpy: dma_device channels that can only do memcpy transfers.
+ * @phy_chans: Room for all possible physical channels in system.
  * @log_chans: Room for all possible logical channels in system.
  * @lookup_log_chans: Used to map interrupt number to logical channel. Points
  * to log_chans entries.
@@ -248,12 +304,20 @@ struct d40_chan {
  * to phy_chans entries.
  * @plat_data: Pointer to provided platform_data which is the driver
  * configuration.
+ * @lcpa_regulator: Pointer to hold the regulator for the esram bank for lcla.
  * @phy_res: Vector containing all physical channels.
  * @lcla_pool: lcla pool settings and data.
  * @lcpa_base: The virtual mapped address of LCPA.
  * @phy_lcpa: The physical address of the LCPA.
  * @lcpa_size: The size of the LCPA area.
  * @desc_slab: cache for descriptors.
+ * @reg_val_backup: Here the values of some hardware registers are stored
+ * before the DMA is powered off. They are restored when the power is back on.
+ * @reg_val_backup_v3: Backup of registers that only exist on dma40 v3 and
+ * later.
+ * @reg_val_backup_chan: Backup data for standard channel parameter registers.
+ * @gcc_pwr_off_mask: GCC mask written at power-off; keeps the clocks used by
+ * reserved channels enabled.
+ * @initialized: true if the dma has been initialized.
  */
 struct d40_base {
        spinlock_t                       interrupt_lock;
@@ -275,6 +339,7 @@ struct d40_base {
        struct d40_chan                 **lookup_log_chans;
        struct d40_chan                 **lookup_phy_chans;
        struct stedma40_platform_data    *plat_data;
+       struct regulator                 *lcpa_regulator;
        /* Physical half channels */
        struct d40_phy_res               *phy_res;
        struct d40_lcla_pool              lcla_pool;
@@ -282,6 +347,11 @@ struct d40_base {
        dma_addr_t                        phy_lcpa;
        resource_size_t                   lcpa_size;
        struct kmem_cache                *desc_slab;
+       u32                               reg_val_backup[BACKUP_REGS_SZ];
+       u32                               reg_val_backup_v3[BACKUP_REGS_SZ_V3];
+       u32                              *reg_val_backup_chan;
+       u16                               gcc_pwr_off_mask;
+       bool                              initialized;
 };
 
 /**
@@ -479,13 +549,14 @@ static struct d40_desc *d40_desc_get(struct d40_chan *d40c)
                struct d40_desc *d;
                struct d40_desc *_d;
 
-               list_for_each_entry_safe(d, _d, &d40c->client, node)
+               list_for_each_entry_safe(d, _d, &d40c->client, node) {
                        if (async_tx_test_ack(&d->txd)) {
                                d40_desc_remove(d);
                                desc = d;
                                memset(desc, 0, sizeof(*desc));
                                break;
                        }
+               }
        }
 
        if (!desc)
@@ -536,6 +607,7 @@ static void d40_log_lli_to_lcxa(struct d40_chan *chan, struct d40_desc *desc)
        bool cyclic = desc->cyclic;
        int curr_lcla = -EINVAL;
        int first_lcla = 0;
+       bool use_esram_lcla = chan->base->plat_data->use_esram_lcla;
        bool linkback;
 
        /*
@@ -608,11 +680,16 @@ static void d40_log_lli_to_lcxa(struct d40_chan *chan, struct d40_desc *desc)
                                       &lli->src[lli_current],
                                       next_lcla, flags);
 
-               dma_sync_single_range_for_device(chan->base->dev,
-                                       pool->dma_addr, lcla_offset,
-                                       2 * sizeof(struct d40_log_lli),
-                                       DMA_TO_DEVICE);
-
+               /*
+                * Cache maintenance is not needed if lcla is
+                * mapped in esram
+                */
+               if (!use_esram_lcla) {
+                       dma_sync_single_range_for_device(chan->base->dev,
+                                               pool->dma_addr, lcla_offset,
+                                               2 * sizeof(struct d40_log_lli),
+                                               DMA_TO_DEVICE);
+               }
                curr_lcla = next_lcla;
 
                if (curr_lcla == -EINVAL || curr_lcla == first_lcla) {
@@ -740,7 +817,61 @@ static int d40_sg_2_dmalen(struct scatterlist *sgl, int sg_len,
        return len;
 }
 
-/* Support functions for logical channels */
+
+#ifdef CONFIG_PM
+static void dma40_backup(void __iomem *baseaddr, u32 *backup,
+                        u32 *regaddr, int num, bool save)
+{
+       int i;
+
+       for (i = 0; i < num; i++) {
+               void __iomem *addr = baseaddr + regaddr[i];
+
+               if (save)
+                       backup[i] = readl_relaxed(addr);
+               else
+                       writel_relaxed(backup[i], addr);
+       }
+}
+
+static void d40_save_restore_registers(struct d40_base *base, bool save)
+{
+       int i;
+
+       /* Save/Restore channel specific registers */
+       for (i = 0; i < base->num_phy_chans; i++) {
+               void __iomem *addr;
+               int idx;
+
+               if (base->phy_res[i].reserved)
+                       continue;
+
+               addr = base->virtbase + D40_DREG_PCBASE + i * D40_DREG_PCDELTA;
+               idx = i * ARRAY_SIZE(d40_backup_regs_chan);
+
+               dma40_backup(addr, &base->reg_val_backup_chan[idx],
+                            d40_backup_regs_chan,
+                            ARRAY_SIZE(d40_backup_regs_chan),
+                            save);
+       }
+
+       /* Save/Restore global registers */
+       dma40_backup(base->virtbase, base->reg_val_backup,
+                    d40_backup_regs, ARRAY_SIZE(d40_backup_regs),
+                    save);
+
+       /* Save/Restore registers only existing on dma40 v3 and later */
+       if (base->rev >= 3)
+               dma40_backup(base->virtbase, base->reg_val_backup_v3,
+                            d40_backup_regs_v3,
+                            ARRAY_SIZE(d40_backup_regs_v3),
+                            save);
+}
+#else
+static void d40_save_restore_registers(struct d40_base *base, bool save)
+{
+}
+#endif
 
 static int d40_channel_execute_command(struct d40_chan *d40c,
                                       enum d40_command command)
@@ -973,6 +1104,10 @@ static void d40_config_write(struct d40_chan *d40c)
                /* Set LIDX for lcla */
                writel(lidx, chanbase + D40_CHAN_REG_SSELT);
                writel(lidx, chanbase + D40_CHAN_REG_SDELT);
+
+               /* Clear LNK which will be used by d40_chan_has_events() */
+               writel(0, chanbase + D40_CHAN_REG_SSLNK);
+               writel(0, chanbase + D40_CHAN_REG_SDLNK);
        }
 }
 
@@ -1013,6 +1148,7 @@ static int d40_pause(struct d40_chan *d40c)
        if (!d40c->busy)
                return 0;
 
+       pm_runtime_get_sync(d40c->base->dev);
        spin_lock_irqsave(&d40c->lock, flags);
 
        res = d40_channel_execute_command(d40c, D40_DMA_SUSPEND_REQ);
@@ -1025,7 +1161,8 @@ static int d40_pause(struct d40_chan *d40c)
                                                                  D40_DMA_RUN);
                }
        }
-
+       pm_runtime_mark_last_busy(d40c->base->dev);
+       pm_runtime_put_autosuspend(d40c->base->dev);
        spin_unlock_irqrestore(&d40c->lock, flags);
        return res;
 }
@@ -1039,7 +1176,7 @@ static int d40_resume(struct d40_chan *d40c)
                return 0;
 
        spin_lock_irqsave(&d40c->lock, flags);
-
+       pm_runtime_get_sync(d40c->base->dev);
        if (d40c->base->rev == 0)
                if (chan_is_logical(d40c)) {
                        res = d40_channel_execute_command(d40c,
@@ -1057,6 +1194,8 @@ static int d40_resume(struct d40_chan *d40c)
        }
 
 no_suspend:
+       pm_runtime_mark_last_busy(d40c->base->dev);
+       pm_runtime_put_autosuspend(d40c->base->dev);
        spin_unlock_irqrestore(&d40c->lock, flags);
        return res;
 }
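
The pause and resume paths above bracket all register access with runtime-PM
references; the pattern, in sketch form (device pointer assumed), is:

	pm_runtime_get_sync(dev);		/* power the controller up before touching it */
	/* ... issue SUSPEND_REQ/RUN commands, access channel registers ... */
	pm_runtime_mark_last_busy(dev);		/* restart the autosuspend timer */
	pm_runtime_put_autosuspend(dev);	/* may suspend after DMA40_AUTOSUSPEND_DELAY ms */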
@@ -1129,7 +1268,10 @@ static struct d40_desc *d40_queue_start(struct d40_chan *d40c)
        d40d = d40_first_queued(d40c);
 
        if (d40d != NULL) {
-               d40c->busy = true;
+               if (!d40c->busy)
+                       d40c->busy = true;
+
+               pm_runtime_get_sync(d40c->base->dev);
 
                /* Remove from queue */
                d40_desc_remove(d40d);
@@ -1190,6 +1332,8 @@ static void dma_tc_handle(struct d40_chan *d40c)
 
                if (d40_queue_start(d40c) == NULL)
                        d40c->busy = false;
+               pm_runtime_mark_last_busy(d40c->base->dev);
+               pm_runtime_put_autosuspend(d40c->base->dev);
        }
 
        d40c->pending_tx++;
@@ -1405,11 +1549,16 @@ static int d40_validate_conf(struct d40_chan *d40c,
        return res;
 }
 
-static bool d40_alloc_mask_set(struct d40_phy_res *phy, bool is_src,
-                              int log_event_line, bool is_log)
+static bool d40_alloc_mask_set(struct d40_phy_res *phy,
+                              bool is_src, int log_event_line, bool is_log,
+                              bool *first_user)
 {
        unsigned long flags;
        spin_lock_irqsave(&phy->lock, flags);
+
+       *first_user = ((phy->allocated_src | phy->allocated_dst)
+                       == D40_ALLOC_FREE);
+
        if (!is_log) {
                /* Physical interrupts are masked per physical full channel */
                if (phy->allocated_src == D40_ALLOC_FREE &&
@@ -1490,7 +1639,7 @@ out:
        return is_free;
 }
 
-static int d40_allocate_channel(struct d40_chan *d40c)
+static int d40_allocate_channel(struct d40_chan *d40c, bool *first_phy_user)
 {
        int dev_type;
        int event_group;
@@ -1526,7 +1675,8 @@ static int d40_allocate_channel(struct d40_chan *d40c)
                        for (i = 0; i < d40c->base->num_phy_chans; i++) {
 
                                if (d40_alloc_mask_set(&phys[i], is_src,
-                                                      0, is_log))
+                                                      0, is_log,
+                                                      first_phy_user))
                                        goto found_phy;
                        }
                } else
@@ -1536,7 +1686,8 @@ static int d40_allocate_channel(struct d40_chan *d40c)
                                        if (d40_alloc_mask_set(&phys[i],
                                                               is_src,
                                                               0,
-                                                              is_log))
+                                                              is_log,
+                                                              first_phy_user))
                                                goto found_phy;
                                }
                        }
@@ -1552,6 +1703,25 @@ found_phy:
        /* Find logical channel */
        for (j = 0; j < d40c->base->num_phy_chans; j += 8) {
                int phy_num = j + event_group * 2;
+
+               if (d40c->dma_cfg.use_fixed_channel) {
+                       i = d40c->dma_cfg.phy_channel;
+
+                       if ((i != phy_num) && (i != phy_num + 1)) {
+                               dev_err(chan2dev(d40c),
+                                       "invalid fixed phy channel %d\n", i);
+                               return -EINVAL;
+                       }
+
+                       if (d40_alloc_mask_set(&phys[i], is_src, event_line,
+                                              is_log, first_phy_user))
+                               goto found_log;
+
+                       dev_err(chan2dev(d40c),
+                               "could not allocate fixed phy channel %d\n", i);
+                       return -EINVAL;
+               }
+
                /*
                 * Spread logical channels across all available physical rather
                 * than pack every logical channel at the first available phy
@@ -1560,13 +1730,15 @@ found_phy:
                if (is_src) {
                        for (i = phy_num; i < phy_num + 2; i++) {
                                if (d40_alloc_mask_set(&phys[i], is_src,
-                                                      event_line, is_log))
+                                                      event_line, is_log,
+                                                      first_phy_user))
                                        goto found_log;
                        }
                } else {
                        for (i = phy_num + 1; i >= phy_num; i--) {
                                if (d40_alloc_mask_set(&phys[i], is_src,
-                                                      event_line, is_log))
+                                                      event_line, is_log,
+                                                      first_phy_user))
                                        goto found_log;
                        }
                }
@@ -1643,10 +1815,11 @@ static int d40_free_dma(struct d40_chan *d40c)
                return -EINVAL;
        }
 
+       pm_runtime_get_sync(d40c->base->dev);
        res = d40_channel_execute_command(d40c, D40_DMA_SUSPEND_REQ);
        if (res) {
                chan_err(d40c, "suspend failed\n");
-               return res;
+               goto out;
        }
 
        if (chan_is_logical(d40c)) {
@@ -1664,13 +1837,11 @@ static int d40_free_dma(struct d40_chan *d40c)
                        if (d40_chan_has_events(d40c)) {
                                res = d40_channel_execute_command(d40c,
                                                                  D40_DMA_RUN);
-                               if (res) {
+                               if (res)
                                        chan_err(d40c,
                                                "Executing RUN command\n");
-                                       return res;
-                               }
                        }
-                       return 0;
+                       goto out;
                }
        } else {
                (void) d40_alloc_mask_free(phy, is_src, 0);
@@ -1680,13 +1851,23 @@ static int d40_free_dma(struct d40_chan *d40c)
        res = d40_channel_execute_command(d40c, D40_DMA_STOP);
        if (res) {
                chan_err(d40c, "Failed to stop channel\n");
-               return res;
+               goto out;
        }
+
+       if (d40c->busy) {
+               pm_runtime_mark_last_busy(d40c->base->dev);
+               pm_runtime_put_autosuspend(d40c->base->dev);
+       }
+
+       d40c->busy = false;
        d40c->phy_chan = NULL;
        d40c->configured = false;
        d40c->base->lookup_phy_chans[phy->num] = NULL;
+out:
 
-       return 0;
+       pm_runtime_mark_last_busy(d40c->base->dev);
+       pm_runtime_put_autosuspend(d40c->base->dev);
+       return res;
 }
 
 static bool d40_is_paused(struct d40_chan *d40c)
@@ -1855,7 +2036,7 @@ err:
 }
 
 static dma_addr_t
-d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction)
+d40_get_dev_addr(struct d40_chan *chan, enum dma_transfer_direction direction)
 {
        struct stedma40_platform_data *plat = chan->base->plat_data;
        struct stedma40_chan_cfg *cfg = &chan->dma_cfg;
@@ -1864,9 +2045,9 @@ d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction)
        if (chan->runtime_addr)
                return chan->runtime_addr;
 
-       if (direction == DMA_FROM_DEVICE)
+       if (direction == DMA_DEV_TO_MEM)
                addr = plat->dev_rx[cfg->src_dev_type];
-       else if (direction == DMA_TO_DEVICE)
+       else if (direction == DMA_MEM_TO_DEV)
                addr = plat->dev_tx[cfg->dst_dev_type];
 
        return addr;
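
For reference, the dma_data_direction to dma_transfer_direction conversion
applied throughout this series maps, from the peripheral's point of view:

	DMA_TO_DEVICE   -> DMA_MEM_TO_DEV	/* memory to peripheral */
	DMA_FROM_DEVICE -> DMA_DEV_TO_MEM	/* peripheral to memory */
	DMA_NONE        -> DMA_TRANS_NONE	/* no data transfer (e.g. PIO-word descriptors) */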
@@ -1875,7 +2056,7 @@ d40_get_dev_addr(struct d40_chan *chan, enum dma_data_direction direction)
 static struct dma_async_tx_descriptor *
 d40_prep_sg(struct dma_chan *dchan, struct scatterlist *sg_src,
            struct scatterlist *sg_dst, unsigned int sg_len,
-           enum dma_data_direction direction, unsigned long dma_flags)
+           enum dma_transfer_direction direction, unsigned long dma_flags)
 {
        struct d40_chan *chan = container_of(dchan, struct d40_chan, chan);
        dma_addr_t src_dev_addr = 0;
@@ -1902,9 +2083,9 @@ d40_prep_sg(struct dma_chan *dchan, struct scatterlist *sg_src,
        if (direction != DMA_NONE) {
                dma_addr_t dev_addr = d40_get_dev_addr(chan, direction);
 
-               if (direction == DMA_FROM_DEVICE)
+               if (direction == DMA_DEV_TO_MEM)
                        src_dev_addr = dev_addr;
-               else if (direction == DMA_TO_DEVICE)
+               else if (direction == DMA_MEM_TO_DEV)
                        dst_dev_addr = dev_addr;
        }
 
@@ -2011,14 +2192,15 @@ static int d40_alloc_chan_resources(struct dma_chan *chan)
                        goto fail;
                }
        }
-       is_free_phy = (d40c->phy_chan == NULL);
 
-       err = d40_allocate_channel(d40c);
+       err = d40_allocate_channel(d40c, &is_free_phy);
        if (err) {
                chan_err(d40c, "Failed to allocate channel\n");
+               d40c->configured = false;
                goto fail;
        }
 
+       pm_runtime_get_sync(d40c->base->dev);
        /* Fill in basic CFG register values */
        d40_phy_cfg(&d40c->dma_cfg, &d40c->src_def_cfg,
                    &d40c->dst_def_cfg, chan_is_logical(d40c));
@@ -2038,6 +2220,12 @@ static int d40_alloc_chan_resources(struct dma_chan *chan)
                          D40_LCPA_CHAN_SIZE + D40_LCPA_CHAN_DST_DELTA;
        }
 
+       dev_dbg(chan2dev(d40c), "allocated %s channel (phy %d%s)\n",
+                chan_is_logical(d40c) ? "logical" : "physical",
+                d40c->phy_chan->num,
+                d40c->dma_cfg.use_fixed_channel ? ", fixed" : "");
+
        /*
         * Only write channel configuration to the DMA if the physical
         * resource is free. In case of multiple logical channels
@@ -2046,6 +2234,8 @@ static int d40_alloc_chan_resources(struct dma_chan *chan)
        if (is_free_phy)
                d40_config_write(d40c);
 fail:
+       pm_runtime_mark_last_busy(d40c->base->dev);
+       pm_runtime_put_autosuspend(d40c->base->dev);
        spin_unlock_irqrestore(&d40c->lock, flags);
        return err;
 }
@@ -2108,10 +2298,10 @@ d40_prep_memcpy_sg(struct dma_chan *chan,
 static struct dma_async_tx_descriptor *d40_prep_slave_sg(struct dma_chan *chan,
                                                         struct scatterlist *sgl,
                                                         unsigned int sg_len,
-                                                        enum dma_data_direction direction,
+                                                        enum dma_transfer_direction direction,
                                                         unsigned long dma_flags)
 {
-       if (direction != DMA_FROM_DEVICE && direction != DMA_TO_DEVICE)
+       if (direction != DMA_DEV_TO_MEM && direction != DMA_MEM_TO_DEV)
                return NULL;
 
        return d40_prep_sg(chan, sgl, sgl, sg_len, direction, dma_flags);
@@ -2120,7 +2310,7 @@ static struct dma_async_tx_descriptor *d40_prep_slave_sg(struct dma_chan *chan,
 static struct dma_async_tx_descriptor *
 dma40_prep_dma_cyclic(struct dma_chan *chan, dma_addr_t dma_addr,
                     size_t buf_len, size_t period_len,
-                    enum dma_data_direction direction)
+                    enum dma_transfer_direction direction)
 {
        unsigned int periods = buf_len / period_len;
        struct dma_async_tx_descriptor *txd;
@@ -2269,7 +2459,7 @@ static int d40_set_runtime_config(struct dma_chan *chan,
        dst_addr_width = config->dst_addr_width;
        dst_maxburst = config->dst_maxburst;
 
-       if (config->direction == DMA_FROM_DEVICE) {
+       if (config->direction == DMA_DEV_TO_MEM) {
                dma_addr_t dev_addr_rx =
                        d40c->base->plat_data->dev_rx[cfg->src_dev_type];
 
@@ -2292,7 +2482,7 @@ static int d40_set_runtime_config(struct dma_chan *chan,
                if (dst_maxburst == 0)
                        dst_maxburst = src_maxburst;
 
-       } else if (config->direction == DMA_TO_DEVICE) {
+       } else if (config->direction == DMA_MEM_TO_DEV) {
                dma_addr_t dev_addr_tx =
                        d40c->base->plat_data->dev_tx[cfg->dst_dev_type];
 
@@ -2357,7 +2547,7 @@ static int d40_set_runtime_config(struct dma_chan *chan,
                "configured channel %s for %s, data width %d/%d, "
                "maxburst %d/%d elements, LE, no flow control\n",
                dma_chan_name(chan),
-               (config->direction == DMA_FROM_DEVICE) ? "RX" : "TX",
+               (config->direction == DMA_DEV_TO_MEM) ? "RX" : "TX",
                src_addr_width, dst_addr_width,
                src_maxburst, dst_maxburst);
 
@@ -2519,6 +2709,72 @@ failure1:
        return err;
 }
 
+/* Suspend/resume functionality */
+#ifdef CONFIG_PM
+static int dma40_pm_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct d40_base *base = platform_get_drvdata(pdev);
+       int ret = 0;
+       if (!pm_runtime_suspended(dev))
+               return -EBUSY;
+
+       if (base->lcpa_regulator)
+               ret = regulator_disable(base->lcpa_regulator);
+       return ret;
+}
+
+static int dma40_runtime_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct d40_base *base = platform_get_drvdata(pdev);
+
+       d40_save_restore_registers(base, true);
+
+       /* Don't disable/enable clocks for v1 due to HW bugs */
+       if (base->rev != 1)
+               writel_relaxed(base->gcc_pwr_off_mask,
+                              base->virtbase + D40_DREG_GCC);
+
+       return 0;
+}
+
+static int dma40_runtime_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct d40_base *base = platform_get_drvdata(pdev);
+
+       if (base->initialized)
+               d40_save_restore_registers(base, false);
+
+       writel_relaxed(D40_DREG_GCC_ENABLE_ALL,
+                      base->virtbase + D40_DREG_GCC);
+       return 0;
+}
+
+static int dma40_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct d40_base *base = platform_get_drvdata(pdev);
+       int ret = 0;
+
+       if (base->lcpa_regulator)
+               ret = regulator_enable(base->lcpa_regulator);
+
+       return ret;
+}
+
+static const struct dev_pm_ops dma40_pm_ops = {
+       .suspend                = dma40_pm_suspend,
+       .runtime_suspend        = dma40_runtime_suspend,
+       .runtime_resume         = dma40_runtime_resume,
+       .resume                 = dma40_resume,
+};
+#define DMA40_PM_OPS   (&dma40_pm_ops)
+#else
+#define DMA40_PM_OPS   NULL
+#endif
+
 /* Initialization functions. */
 
 static int __init d40_phy_res_init(struct d40_base *base)
@@ -2527,6 +2783,7 @@ static int __init d40_phy_res_init(struct d40_base *base)
        int num_phy_chans_avail = 0;
        u32 val[2];
        int odd_even_bit = -2;
+       int gcc = D40_DREG_GCC_ENA;
 
        val[0] = readl(base->virtbase + D40_DREG_PRSME);
        val[1] = readl(base->virtbase + D40_DREG_PRSMO);
@@ -2538,9 +2795,17 @@ static int __init d40_phy_res_init(struct d40_base *base)
                        /* Mark security only channels as occupied */
                        base->phy_res[i].allocated_src = D40_ALLOC_PHY;
                        base->phy_res[i].allocated_dst = D40_ALLOC_PHY;
+                       base->phy_res[i].reserved = true;
+                       gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(i),
+                                                      D40_DREG_GCC_SRC);
+                       gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(i),
+                                                      D40_DREG_GCC_DST);
+
                } else {
                        base->phy_res[i].allocated_src = D40_ALLOC_FREE;
                        base->phy_res[i].allocated_dst = D40_ALLOC_FREE;
+                       base->phy_res[i].reserved = false;
                        num_phy_chans_avail++;
                }
                spin_lock_init(&base->phy_res[i].lock);
@@ -2552,6 +2817,11 @@ static int __init d40_phy_res_init(struct d40_base *base)
 
                base->phy_res[chan].allocated_src = D40_ALLOC_PHY;
                base->phy_res[chan].allocated_dst = D40_ALLOC_PHY;
+               base->phy_res[chan].reserved = true;
+               gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(chan),
+                                              D40_DREG_GCC_SRC);
+               gcc |= D40_DREG_GCC_EVTGRP_ENA(D40_PHYS_TO_GROUP(chan),
+                                              D40_DREG_GCC_DST);
                num_phy_chans_avail--;
        }
 
@@ -2572,6 +2842,15 @@ static int __init d40_phy_res_init(struct d40_base *base)
                val[0] = val[0] >> 2;
        }
 
+       /*
+        * To keep things simple, enable all clocks initially.
+        * The clocks will be managed later, after channel allocation.
+        * The clocks for the event lines on which reserved channels exist
+        * are not managed here.
+        */
+       writel(D40_DREG_GCC_ENABLE_ALL, base->virtbase + D40_DREG_GCC);
+       base->gcc_pwr_off_mask = gcc;
+
        return num_phy_chans_avail;
 }
 
@@ -2699,10 +2978,15 @@ static struct d40_base * __init d40_hw_detect_init(struct platform_device *pdev)
                        goto failure;
        }
 
-       base->lcla_pool.alloc_map = kzalloc(num_phy_chans *
-                                           sizeof(struct d40_desc *) *
-                                           D40_LCLA_LINK_PER_EVENT_GRP,
+       base->reg_val_backup_chan = kmalloc(base->num_phy_chans *
+                                           sizeof(d40_backup_regs_chan),
                                            GFP_KERNEL);
+       if (!base->reg_val_backup_chan)
+               goto failure;
+
+       base->lcla_pool.alloc_map =
+               kzalloc(num_phy_chans * sizeof(struct d40_desc *)
+                       * D40_LCLA_LINK_PER_EVENT_GRP, GFP_KERNEL);
        if (!base->lcla_pool.alloc_map)
                goto failure;
 
@@ -2741,9 +3025,9 @@ failure:
 static void __init d40_hw_init(struct d40_base *base)
 {
 
-       static const struct d40_reg_val dma_init_reg[] = {
+       static struct d40_reg_val dma_init_reg[] = {
                /* Clock every part of the DMA block from start */
-               { .reg = D40_DREG_GCC,    .val = 0x0000ff01},
+               { .reg = D40_DREG_GCC,    .val = D40_DREG_GCC_ENABLE_ALL},
 
                /* Interrupts on all logical channels */
                { .reg = D40_DREG_LCMIS0, .val = 0xFFFFFFFF},
@@ -2943,11 +3227,31 @@ static int __init d40_probe(struct platform_device *pdev)
                d40_err(&pdev->dev, "Failed to ioremap LCPA region\n");
                goto failure;
        }
+       /* If lcla has to be located in ESRAM, we don't need to allocate it */
+       if (base->plat_data->use_esram_lcla) {
+               res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+                                                       "lcla_esram");
+               if (!res) {
+                       ret = -ENOENT;
+                       d40_err(&pdev->dev,
+                               "No \"lcla_esram\" memory resource\n");
+                       goto failure;
+               }
+               base->lcla_pool.base = ioremap(res->start,
+                                               resource_size(res));
+               if (!base->lcla_pool.base) {
+                       ret = -ENOMEM;
+                       d40_err(&pdev->dev, "Failed to ioremap LCLA region\n");
+                       goto failure;
+               }
+               writel(res->start, base->virtbase + D40_DREG_LCLA);
 
-       ret = d40_lcla_allocate(base);
-       if (ret) {
-               d40_err(&pdev->dev, "Failed to allocate LCLA area\n");
-               goto failure;
+       } else {
+               ret = d40_lcla_allocate(base);
+               if (ret) {
+                       d40_err(&pdev->dev, "Failed to allocate LCLA area\n");
+                       goto failure;
+               }
        }
 
        spin_lock_init(&base->lcla_pool.lock);
@@ -2960,6 +3264,32 @@ static int __init d40_probe(struct platform_device *pdev)
                goto failure;
        }
 
+       pm_runtime_irq_safe(base->dev);
+       pm_runtime_set_autosuspend_delay(base->dev, DMA40_AUTOSUSPEND_DELAY);
+       pm_runtime_use_autosuspend(base->dev);
+       pm_runtime_enable(base->dev);
+       pm_runtime_resume(base->dev);
+
+       if (base->plat_data->use_esram_lcla) {
+
+               base->lcpa_regulator = regulator_get(base->dev, "lcla_esram");
+               if (IS_ERR(base->lcpa_regulator)) {
+                       d40_err(&pdev->dev, "Failed to get lcpa_regulator\n");
+                       base->lcpa_regulator = NULL;
+                       goto failure;
+               }
+
+               ret = regulator_enable(base->lcpa_regulator);
+               if (ret) {
+                       d40_err(&pdev->dev,
+                               "Failed to enable lcpa_regulator\n");
+                       regulator_put(base->lcpa_regulator);
+                       base->lcpa_regulator = NULL;
+                       goto failure;
+               }
+       }
+
+       base->initialized = true;
        err = d40_dmaengine_init(base, num_reserved_chans);
        if (err)
                goto failure;
@@ -2976,6 +3306,11 @@ failure:
                if (base->virtbase)
                        iounmap(base->virtbase);
 
+               if (base->lcla_pool.base && base->plat_data->use_esram_lcla) {
+                       iounmap(base->lcla_pool.base);
+                       base->lcla_pool.base = NULL;
+               }
+
                if (base->lcla_pool.dma_addr)
                        dma_unmap_single(base->dev, base->lcla_pool.dma_addr,
                                         SZ_1K * base->num_phy_chans,
@@ -2998,6 +3333,11 @@ failure:
                        clk_put(base->clk);
                }
 
+               if (base->lcpa_regulator) {
+                       regulator_disable(base->lcpa_regulator);
+                       regulator_put(base->lcpa_regulator);
+               }
+
                kfree(base->lcla_pool.alloc_map);
                kfree(base->lookup_log_chans);
                kfree(base->lookup_phy_chans);
@@ -3013,6 +3353,7 @@ static struct platform_driver d40_driver = {
        .driver = {
                .owner = THIS_MODULE,
                .name  = D40_NAME,
+               .pm = DMA40_PM_OPS,
        },
 };
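
A common alternative to filling in struct dev_pm_ops by hand is the helper
macros from linux/pm.h; assuming they are available in this tree, the
dma40_pm_ops table above could be written as below (note that
SET_SYSTEM_SLEEP_PM_OPS also wires up the hibernation callbacks, which the
hand-rolled table leaves unset):

	static const struct dev_pm_ops dma40_pm_ops = {
		SET_SYSTEM_SLEEP_PM_OPS(dma40_pm_suspend, dma40_resume)
		SET_RUNTIME_PM_OPS(dma40_runtime_suspend,
				   dma40_runtime_resume, NULL)
	};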
 
index b44c455..8d3d490 100644 (file)
@@ -16,6 +16,8 @@
 
 #define D40_TYPE_TO_GROUP(type) (type / 16)
 #define D40_TYPE_TO_EVENT(type) (type % 16)
+#define D40_GROUP_SIZE 8
+#define D40_PHYS_TO_GROUP(phys) ((phys & (D40_GROUP_SIZE - 1)) / 2)
 
 /* Most bits of the CFG register are the same in log as in phy mode */
 #define D40_SREG_CFG_MST_POS           15
 
 /* DMA Register Offsets */
 #define D40_DREG_GCC           0x000
+#define D40_DREG_GCC_ENA       0x1
+/* This assumes that there are only 4 event groups */
+#define D40_DREG_GCC_ENABLE_ALL        0xff01
+#define D40_DREG_GCC_EVTGRP_POS 8
+#define D40_DREG_GCC_SRC 0
+#define D40_DREG_GCC_DST 1
+#define D40_DREG_GCC_EVTGRP_ENA(x, y) \
+       (1 << (D40_DREG_GCC_EVTGRP_POS + 2 * x + y))
+
 #define D40_DREG_PRTYP         0x004
 #define D40_DREG_PRSME         0x008
 #define D40_DREG_PRSMO         0x00C
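
As a worked example of the macros above: for physical channel 5, the group is
D40_PHYS_TO_GROUP(5) == (5 & 7) / 2 == 2, so the clock-enable bits are:

	u32 src = D40_DREG_GCC_EVTGRP_ENA(2, D40_DREG_GCC_SRC);	/* 1 << 12 */
	u32 dst = D40_DREG_GCC_EVTGRP_ENA(2, D40_DREG_GCC_DST);	/* 1 << 13 */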
index a4a398f..a6f9c16 100644 (file)
@@ -90,7 +90,7 @@ struct timb_dma_chan {
        struct list_head        queue;
        struct list_head        free_list;
        unsigned int            bytes_per_line;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction     direction;
        unsigned int            descs; /* Descriptors to allocate */
        unsigned int            desc_elems; /* number of elems per descriptor */
 };
@@ -166,10 +166,10 @@ static void __td_unmap_desc(struct timb_dma_chan *td_chan, const u8 *dma_desc,
 
        if (single)
                dma_unmap_single(chan2dev(&td_chan->chan), addr, len,
-                       td_chan->direction);
+                       DMA_TO_DEVICE);
        else
                dma_unmap_page(chan2dev(&td_chan->chan), addr, len,
-                       td_chan->direction);
+                       DMA_TO_DEVICE);
 }
 
 static void __td_unmap_descs(struct timb_dma_desc *td_desc, bool single)
@@ -235,7 +235,7 @@ static void __td_start_dma(struct timb_dma_chan *td_chan)
                "td_chan: %p, chan: %d, membase: %p\n",
                td_chan, td_chan->chan.chan_id, td_chan->membase);
 
-       if (td_chan->direction == DMA_FROM_DEVICE) {
+       if (td_chan->direction == DMA_DEV_TO_MEM) {
 
                /* descriptor address */
                iowrite32(0, td_chan->membase + TIMBDMA_OFFS_RX_DHAR);
@@ -278,7 +278,7 @@ static void __td_finish(struct timb_dma_chan *td_chan)
                txd->cookie);
 
        /* make sure to stop the transfer */
-       if (td_chan->direction == DMA_FROM_DEVICE)
+       if (td_chan->direction == DMA_DEV_TO_MEM)
                iowrite32(0, td_chan->membase + TIMBDMA_OFFS_RX_ER);
 /* Currently no support for stopping DMA transfers
        else
@@ -558,7 +558,7 @@ static void td_issue_pending(struct dma_chan *chan)
 
 static struct dma_async_tx_descriptor *td_prep_slave_sg(struct dma_chan *chan,
        struct scatterlist *sgl, unsigned int sg_len,
-       enum dma_data_direction direction, unsigned long flags)
+       enum dma_transfer_direction direction, unsigned long flags)
 {
        struct timb_dma_chan *td_chan =
                container_of(chan, struct timb_dma_chan, chan);
@@ -606,7 +606,7 @@ static struct dma_async_tx_descriptor *td_prep_slave_sg(struct dma_chan *chan,
        }
 
        dma_sync_single_for_device(chan2dmadev(chan), td_desc->txd.phys,
-               td_desc->desc_list_len, DMA_TO_DEVICE);
+               td_desc->desc_list_len, DMA_MEM_TO_DEV);
 
        return &td_desc->txd;
 }
@@ -775,8 +775,8 @@ static int __devinit td_probe(struct platform_device *pdev)
                td_chan->descs = pchan->descriptors;
                td_chan->desc_elems = pchan->descriptor_elements;
                td_chan->bytes_per_line = pchan->bytes_per_line;
-               td_chan->direction = pchan->rx ? DMA_FROM_DEVICE :
-                       DMA_TO_DEVICE;
+               td_chan->direction = pchan->rx ? DMA_DEV_TO_MEM :
+                       DMA_MEM_TO_DEV;
 
                td_chan->membase = td->membase +
                        (i / 2) * TIMBDMA_INSTANCE_OFFSET +
@@ -841,17 +841,7 @@ static struct platform_driver td_driver = {
        .remove = __exit_p(td_remove),
 };
 
-static int __init td_init(void)
-{
-       return platform_driver_register(&td_driver);
-}
-module_init(td_init);
-
-static void __exit td_exit(void)
-{
-       platform_driver_unregister(&td_driver);
-}
-module_exit(td_exit);
+module_platform_driver(td_driver);
 
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("Timberdale DMA controller driver");
index cbd83e3..6122c36 100644 (file)
@@ -845,7 +845,7 @@ txx9dmac_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 
 static struct dma_async_tx_descriptor *
 txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags)
 {
        struct txx9dmac_chan *dc = to_txx9dmac_chan(chan);
@@ -860,9 +860,9 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
 
        BUG_ON(!ds || !ds->reg_width);
        if (ds->tx_reg)
-               BUG_ON(direction != DMA_TO_DEVICE);
+               BUG_ON(direction != DMA_MEM_TO_DEV);
        else
-               BUG_ON(direction != DMA_FROM_DEVICE);
+               BUG_ON(direction != DMA_DEV_TO_MEM);
        if (unlikely(!sg_len))
                return NULL;
 
@@ -882,7 +882,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                mem = sg_dma_address(sg);
 
                if (__is_dmac64(ddev)) {
-                       if (direction == DMA_TO_DEVICE) {
+                       if (direction == DMA_MEM_TO_DEV) {
                                desc->hwdesc.SAR = mem;
                                desc->hwdesc.DAR = ds->tx_reg;
                        } else {
@@ -891,7 +891,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                        }
                        desc->hwdesc.CNTR = sg_dma_len(sg);
                } else {
-                       if (direction == DMA_TO_DEVICE) {
+                       if (direction == DMA_MEM_TO_DEV) {
                                desc->hwdesc32.SAR = mem;
                                desc->hwdesc32.DAR = ds->tx_reg;
                        } else {
@@ -900,7 +900,7 @@ txx9dmac_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
                        }
                        desc->hwdesc32.CNTR = sg_dma_len(sg);
                }
-               if (direction == DMA_TO_DEVICE) {
+               if (direction == DMA_MEM_TO_DEV) {
                        sai = ds->reg_width;
                        dai = 0;
                } else {
index 0cb461d..7452277 100644 (file)
@@ -287,7 +287,7 @@ static void mx3_videobuf_queue(struct vb2_buffer *vb)
                sg_dma_len(sg)          = new_size;
 
                txd = ichan->dma_chan.device->device_prep_slave_sg(
-                       &ichan->dma_chan, sg, 1, DMA_FROM_DEVICE,
+                       &ichan->dma_chan, sg, 1, DMA_DEV_TO_MEM,
                        DMA_PREP_INTERRUPT);
                if (!txd)
                        goto error;
index 0a2d75f..4ed1c7c 100644 (file)
@@ -565,7 +565,7 @@ static void buffer_queue(struct videobuf_queue *vq, struct videobuf_buffer *vb)
        spin_unlock_irq(&fh->queue_lock);
 
        desc = fh->chan->device->device_prep_slave_sg(fh->chan,
-               buf->sg, sg_elems, DMA_FROM_DEVICE,
+               buf->sg, sg_elems, DMA_DEV_TO_MEM,
                DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
        if (!desc) {
                spin_lock_irq(&fh->queue_lock);
index eb5cd28..a2d25e4 100644 (file)
@@ -513,7 +513,7 @@ static noinline int fpga_program_dma(struct fpga_dev *priv)
         * transaction, and then put it under external control
         */
        memset(&config, 0, sizeof(config));
-       config.direction = DMA_TO_DEVICE;
+       config.direction = DMA_MEM_TO_DEV;
        config.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
        config.dst_maxburst = fpga_fifo_size(priv->regs) / 2 / 4;
        ret = chan->device->device_control(chan, DMA_SLAVE_CONFIG,
index a7ee502..fcfe1eb 100644 (file)
@@ -823,6 +823,7 @@ atmci_prepare_data_dma(struct atmel_mci *host, struct mmc_data *data)
        struct scatterlist              *sg;
        unsigned int                    i;
        enum dma_data_direction         direction;
+       enum dma_transfer_direction     slave_dirn;
        unsigned int                    sglen;
        u32 iflags;
 
@@ -860,16 +861,19 @@ atmci_prepare_data_dma(struct atmel_mci *host, struct mmc_data *data)
        if (host->caps.has_dma)
                atmci_writel(host, ATMCI_DMA, ATMCI_DMA_CHKSIZE(3) | ATMCI_DMAEN);
 
-       if (data->flags & MMC_DATA_READ)
+       if (data->flags & MMC_DATA_READ) {
                direction = DMA_FROM_DEVICE;
-       else
+               slave_dirn = DMA_DEV_TO_MEM;
+       } else {
                direction = DMA_TO_DEVICE;
+               slave_dirn = DMA_MEM_TO_DEV;
+       }
 
        sglen = dma_map_sg(chan->device->dev, data->sg,
                        data->sg_len, direction);
 
        desc = chan->device->device_prep_slave_sg(chan,
-                       data->sg, sglen, direction,
+                       data->sg, sglen, slave_dirn,
                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc)
                goto unmap_exit;
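
The conversion above (repeated in the mmc and spi drivers that follow) keeps
two direction values side by side: dma_data_direction still describes how the
buffer is mapped for the CPU and caches, while dma_transfer_direction
describes the slave transfer itself. In sketch form, with hypothetical names:

	enum dma_data_direction     buf_dir  = DMA_FROM_DEVICE; /* for dma_map_sg() */
	enum dma_transfer_direction xfer_dir = DMA_DEV_TO_MEM;  /* for the slave descriptor */

	sglen = dma_map_sg(chan->device->dev, data->sg, data->sg_len, buf_dir);
	desc = chan->device->device_prep_slave_sg(chan, data->sg, sglen,
						  xfer_dir, DMA_PREP_INTERRUPT);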
index ece03b4..0d955ff 100644 (file)
@@ -374,6 +374,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data,
        struct dma_chan *chan;
        struct dma_device *device;
        struct dma_async_tx_descriptor *desc;
+       enum dma_data_direction buffer_dirn;
        int nr_sg;
 
        /* Check if next job is already prepared */
@@ -387,10 +388,12 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data,
        }
 
        if (data->flags & MMC_DATA_READ) {
-               conf.direction = DMA_FROM_DEVICE;
+               conf.direction = DMA_DEV_TO_MEM;
+               buffer_dirn = DMA_FROM_DEVICE;
                chan = host->dma_rx_channel;
        } else {
-               conf.direction = DMA_TO_DEVICE;
+               conf.direction = DMA_MEM_TO_DEV;
+               buffer_dirn = DMA_TO_DEVICE;
                chan = host->dma_tx_channel;
        }
 
@@ -403,7 +406,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data,
                return -EINVAL;
 
        device = chan->device;
-       nr_sg = dma_map_sg(device->dev, data->sg, data->sg_len, conf.direction);
+       nr_sg = dma_map_sg(device->dev, data->sg, data->sg_len, buffer_dirn);
        if (nr_sg == 0)
                return -EINVAL;
 
@@ -426,7 +429,7 @@ static int mmci_dma_prep_data(struct mmci_host *host, struct mmc_data *data,
  unmap_exit:
        if (!next)
                dmaengine_terminate_all(chan);
-       dma_unmap_sg(device->dev, data->sg, data->sg_len, conf.direction);
+       dma_unmap_sg(device->dev, data->sg, data->sg_len, buffer_dirn);
        return -ENOMEM;
 }
 
index 7088b40..4184b79 100644 (file)
@@ -218,6 +218,7 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
        unsigned int blksz = data->blksz;
        unsigned int datasize = nob * blksz;
        struct scatterlist *sg;
+       enum dma_transfer_direction slave_dirn;
        int i, nents;
 
        if (data->flags & MMC_DATA_STREAM)
@@ -240,10 +241,13 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
                }
        }
 
-       if (data->flags & MMC_DATA_READ)
+       if (data->flags & MMC_DATA_READ) {
                host->dma_dir = DMA_FROM_DEVICE;
-       else
+               slave_dirn = DMA_DEV_TO_MEM;
+       } else {
                host->dma_dir = DMA_TO_DEVICE;
+               slave_dirn = DMA_MEM_TO_DEV;
+       }
 
        nents = dma_map_sg(host->dma->device->dev, data->sg,
                                     data->sg_len,  host->dma_dir);
@@ -251,7 +255,7 @@ static int mxcmci_setup_data(struct mxcmci_host *host, struct mmc_data *data)
                return -EINVAL;
 
        host->desc = host->dma->device->device_prep_slave_sg(host->dma,
-               data->sg, data->sg_len, host->dma_dir,
+               data->sg, data->sg_len, slave_dirn,
                DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
        if (!host->desc) {
index 4e2e019..382c835 100644 (file)
@@ -154,6 +154,7 @@ struct mxs_mmc_host {
        struct dma_chan                 *dmach;
        struct mxs_dma_data             dma_data;
        unsigned int                    dma_dir;
+       enum dma_transfer_direction     slave_dirn;
        u32                             ssp_pio_words[SSP_PIO_NUM];
 
        unsigned int                    version;
@@ -324,7 +325,7 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
        }
 
        desc = host->dmach->device->device_prep_slave_sg(host->dmach,
-                               sgl, sg_len, host->dma_dir, append);
+                               sgl, sg_len, host->slave_dirn, append);
        if (desc) {
                desc->callback = mxs_mmc_dma_irq_callback;
                desc->callback_param = host;
@@ -356,6 +357,7 @@ static void mxs_mmc_bc(struct mxs_mmc_host *host)
        host->ssp_pio_words[1] = cmd0;
        host->ssp_pio_words[2] = cmd1;
        host->dma_dir = DMA_NONE;
+       host->slave_dirn = DMA_TRANS_NONE;
        desc = mxs_mmc_prep_dma(host, 0);
        if (!desc)
                goto out;
@@ -395,6 +397,7 @@ static void mxs_mmc_ac(struct mxs_mmc_host *host)
        host->ssp_pio_words[1] = cmd0;
        host->ssp_pio_words[2] = cmd1;
        host->dma_dir = DMA_NONE;
+       host->slave_dirn = DMA_TRANS_NONE;
        desc = mxs_mmc_prep_dma(host, 0);
        if (!desc)
                goto out;
@@ -433,6 +436,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host)
        int i;
 
        unsigned short dma_data_dir, timeout;
+       enum dma_transfer_direction slave_dirn;
        unsigned int data_size = 0, log2_blksz;
        unsigned int blocks = data->blocks;
 
@@ -448,9 +452,11 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host)
 
        if (data->flags & MMC_DATA_WRITE) {
                dma_data_dir = DMA_TO_DEVICE;
+               slave_dirn = DMA_MEM_TO_DEV;
                read = 0;
        } else {
                dma_data_dir = DMA_FROM_DEVICE;
+               slave_dirn = DMA_DEV_TO_MEM;
                read = BM_SSP_CTRL0_READ;
        }
 
@@ -510,6 +516,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host)
        host->ssp_pio_words[1] = cmd0;
        host->ssp_pio_words[2] = cmd1;
        host->dma_dir = DMA_NONE;
+       host->slave_dirn = DMA_TRANS_NONE;
        desc = mxs_mmc_prep_dma(host, 0);
        if (!desc)
                goto out;
@@ -518,6 +525,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_host *host)
        WARN_ON(host->data != NULL);
        host->data = data;
        host->dma_dir = dma_data_dir;
+       host->slave_dirn = slave_dirn;
        desc = mxs_mmc_prep_dma(host, 1);
        if (!desc)
                goto out;
index 4a2c5b2..f5d8b53 100644 (file)
@@ -286,7 +286,7 @@ static void sh_mmcif_start_dma_rx(struct sh_mmcif_host *host)
        if (ret > 0) {
                host->dma_active = true;
                desc = chan->device->device_prep_slave_sg(chan, sg, ret,
-                       DMA_FROM_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+                       DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        }
 
        if (desc) {
@@ -335,7 +335,7 @@ static void sh_mmcif_start_dma_tx(struct sh_mmcif_host *host)
        if (ret > 0) {
                host->dma_active = true;
                desc = chan->device->device_prep_slave_sg(chan, sg, ret,
-                       DMA_TO_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+                       DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        }
 
        if (desc) {
index 86f259c..7a6e6cc 100644 (file)
@@ -77,7 +77,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
        ret = dma_map_sg(chan->device->dev, sg, host->sg_len, DMA_FROM_DEVICE);
        if (ret > 0)
                desc = chan->device->device_prep_slave_sg(chan, sg, ret,
-                       DMA_FROM_DEVICE, DMA_CTRL_ACK);
+                       DMA_DEV_TO_MEM, DMA_CTRL_ACK);
 
        if (desc) {
                cookie = dmaengine_submit(desc);
@@ -158,7 +158,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
        ret = dma_map_sg(chan->device->dev, sg, host->sg_len, DMA_TO_DEVICE);
        if (ret > 0)
                desc = chan->device->device_prep_slave_sg(chan, sg, ret,
-                       DMA_TO_DEVICE, DMA_CTRL_ACK);
+                       DMA_MEM_TO_DEV, DMA_CTRL_ACK);
 
        if (desc) {
                cookie = dmaengine_submit(desc);
index 2a56fc6..7f68042 100644 (file)
@@ -827,7 +827,7 @@ int gpmi_send_command(struct gpmi_nand_data *this)
        pio[1] = pio[2] = 0;
        desc = channel->device->device_prep_slave_sg(channel,
                                        (struct scatterlist *)pio,
-                                       ARRAY_SIZE(pio), DMA_NONE, 0);
+                                       ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
        if (!desc) {
                pr_err("step 1 error\n");
                return -1;
@@ -839,7 +839,7 @@ int gpmi_send_command(struct gpmi_nand_data *this)
        sg_init_one(sgl, this->cmd_buffer, this->command_length);
        dma_map_sg(this->dev, sgl, 1, DMA_TO_DEVICE);
        desc = channel->device->device_prep_slave_sg(channel,
-                                       sgl, 1, DMA_TO_DEVICE, 1);
+                                       sgl, 1, DMA_MEM_TO_DEV, 1);
        if (!desc) {
                pr_err("step 2 error\n");
                return -1;
@@ -872,7 +872,7 @@ int gpmi_send_data(struct gpmi_nand_data *this)
        pio[1] = 0;
        desc = channel->device->device_prep_slave_sg(channel,
                                        (struct scatterlist *)pio,
-                                       ARRAY_SIZE(pio), DMA_NONE, 0);
+                                       ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
        if (!desc) {
                pr_err("step 1 error\n");
                return -1;
@@ -881,7 +881,7 @@ int gpmi_send_data(struct gpmi_nand_data *this)
        /* [2] send DMA request */
        prepare_data_dma(this, DMA_TO_DEVICE);
        desc = channel->device->device_prep_slave_sg(channel, &this->data_sgl,
-                                               1, DMA_TO_DEVICE, 1);
+                                               1, DMA_MEM_TO_DEV, 1);
        if (!desc) {
                pr_err("step 2 error\n");
                return -1;
@@ -908,7 +908,7 @@ int gpmi_read_data(struct gpmi_nand_data *this)
        pio[1] = 0;
        desc = channel->device->device_prep_slave_sg(channel,
                                        (struct scatterlist *)pio,
-                                       ARRAY_SIZE(pio), DMA_NONE, 0);
+                                       ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
        if (!desc) {
                pr_err("step 1 error\n");
                return -1;
@@ -917,7 +917,7 @@ int gpmi_read_data(struct gpmi_nand_data *this)
        /* [2] : send DMA request */
        prepare_data_dma(this, DMA_FROM_DEVICE);
        desc = channel->device->device_prep_slave_sg(channel, &this->data_sgl,
-                                               1, DMA_FROM_DEVICE, 1);
+                                               1, DMA_DEV_TO_MEM, 1);
        if (!desc) {
                pr_err("step 2 error\n");
                return -1;
@@ -964,7 +964,7 @@ int gpmi_send_page(struct gpmi_nand_data *this,
 
        desc = channel->device->device_prep_slave_sg(channel,
                                        (struct scatterlist *)pio,
-                                       ARRAY_SIZE(pio), DMA_NONE, 0);
+                                       ARRAY_SIZE(pio), DMA_TRANS_NONE, 0);
        if (!desc) {
                pr_err("step 2 error\n");
                return -1;
@@ -998,7 +998,8 @@ int gpmi_read_page(struct gpmi_nand_data *this,
                | BF_GPMI_CTRL0_XFER_COUNT(0);
        pio[1] = 0;
        desc = channel->device->device_prep_slave_sg(channel,
-                               (struct scatterlist *)pio, 2, DMA_NONE, 0);
+                               (struct scatterlist *)pio, 2,
+                               DMA_TRANS_NONE, 0);
        if (!desc) {
                pr_err("step 1 error\n");
                return -1;
@@ -1027,7 +1028,7 @@ int gpmi_read_page(struct gpmi_nand_data *this,
        pio[5] = auxiliary;
        desc = channel->device->device_prep_slave_sg(channel,
                                        (struct scatterlist *)pio,
-                                       ARRAY_SIZE(pio), DMA_NONE, 1);
+                                       ARRAY_SIZE(pio), DMA_TRANS_NONE, 1);
        if (!desc) {
                pr_err("step 2 error\n");
                return -1;
@@ -1045,7 +1046,8 @@ int gpmi_read_page(struct gpmi_nand_data *this,
                | BF_GPMI_CTRL0_XFER_COUNT(geo->page_size);
        pio[1] = 0;
        desc = channel->device->device_prep_slave_sg(channel,
-                               (struct scatterlist *)pio, 2, DMA_NONE, 1);
+                               (struct scatterlist *)pio, 2,
+                               DMA_TRANS_NONE, 1);
        if (!desc) {
                pr_err("step 3 error\n");
                return -1;
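
For reference, the conversions in this series move the dmaengine slave API from enum dma_data_direction over to the new, dmaengine-specific enum dma_transfer_direction; DMA_TRANS_NONE replaces the earlier (ab)use of DMA_NONE for descriptors that only carry PIO command words, as in the GPMI hunks above. A sketch of the new enum as it appears around this release:

/* sketch of the 3.3-era definition; see include/linux/dmaengine.h
 * for the authoritative version */
enum dma_transfer_direction {
	DMA_MEM_TO_MEM,
	DMA_MEM_TO_DEV,
	DMA_DEV_TO_MEM,
	DMA_DEV_TO_DEV,
	DMA_TRANS_NONE,
};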
index 75ec87a..0a85690 100644 (file)
@@ -459,7 +459,7 @@ static int ks8842_tx_frame_dma(struct sk_buff *skb, struct net_device *netdev)
                sg_dma_len(&ctl->sg) += 4 - sg_dma_len(&ctl->sg) % 4;
 
        ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan,
-               &ctl->sg, 1, DMA_TO_DEVICE,
+               &ctl->sg, 1, DMA_MEM_TO_DEV,
                DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
        if (!ctl->adesc)
                return NETDEV_TX_BUSY;
@@ -571,7 +571,7 @@ static int __ks8842_start_new_rx_dma(struct net_device *netdev)
                sg_dma_len(sg) = DMA_BUFFER_SIZE;
 
                ctl->adesc = ctl->chan->device->device_prep_slave_sg(ctl->chan,
-                       sg, 1, DMA_FROM_DEVICE,
+                       sg, 1, DMA_DEV_TO_MEM,
                        DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_SRC_UNMAP);
 
                if (!ctl->adesc)
index e743a45..8418eb0 100644 (file)
@@ -131,7 +131,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change)
        rxchan = dws->rxchan;
 
        /* 2. Prepare the TX dma transfer */
-       txconf.direction = DMA_TO_DEVICE;
+       txconf.direction = DMA_MEM_TO_DEV;
        txconf.dst_addr = dws->dma_addr;
        txconf.dst_maxburst = LNW_DMA_MSIZE_16;
        txconf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
@@ -147,13 +147,13 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change)
        txdesc = txchan->device->device_prep_slave_sg(txchan,
                                &dws->tx_sgl,
                                1,
-                               DMA_TO_DEVICE,
+                               DMA_MEM_TO_DEV,
                                DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP);
        txdesc->callback = dw_spi_dma_done;
        txdesc->callback_param = dws;
 
        /* 3. Prepare the RX dma transfer */
-       rxconf.direction = DMA_FROM_DEVICE;
+       rxconf.direction = DMA_DEV_TO_MEM;
        rxconf.src_addr = dws->dma_addr;
        rxconf.src_maxburst = LNW_DMA_MSIZE_16;
        rxconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
@@ -169,7 +169,7 @@ static int mid_spi_dma_transfer(struct dw_spi *dws, int cs_change)
        rxdesc = rxchan->device->device_prep_slave_sg(rxchan,
                                &dws->rx_sgl,
                                1,
-                               DMA_FROM_DEVICE,
+                               DMA_DEV_TO_MEM,
                                DMA_PREP_INTERRUPT | DMA_COMPL_SKIP_DEST_UNMAP);
        rxdesc->callback = dw_spi_dma_done;
        rxdesc->callback_param = dws;
index 0a282e5..d46e55c 100644 (file)
@@ -551,6 +551,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir)
        struct dma_async_tx_descriptor *txd;
        enum dma_slave_buswidth buswidth;
        struct dma_slave_config conf;
+       enum dma_transfer_direction slave_dirn;
        struct scatterlist *sg;
        struct sg_table *sgt;
        struct dma_chan *chan;
@@ -573,6 +574,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir)
 
                conf.src_addr = espi->sspdr_phys;
                conf.src_addr_width = buswidth;
+               slave_dirn = DMA_DEV_TO_MEM;
        } else {
                chan = espi->dma_tx;
                buf = t->tx_buf;
@@ -580,6 +582,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir)
 
                conf.dst_addr = espi->sspdr_phys;
                conf.dst_addr_width = buswidth;
+               slave_dirn = DMA_MEM_TO_DEV;
        }
 
        ret = dmaengine_slave_config(chan, &conf);
@@ -631,7 +634,7 @@ ep93xx_spi_dma_prepare(struct ep93xx_spi *espi, enum dma_data_direction dir)
                return ERR_PTR(-ENOMEM);
 
        txd = chan->device->device_prep_slave_sg(chan, sgt->sgl, nents,
-                                                dir, DMA_CTRL_ACK);
+                                                slave_dirn, DMA_CTRL_ACK);
        if (!txd) {
                dma_unmap_sg(chan->device->dev, sgt->sgl, sgt->nents, dir);
                return ERR_PTR(-ENOMEM);
@@ -979,7 +982,7 @@ static int ep93xx_spi_setup_dma(struct ep93xx_spi *espi)
        dma_cap_set(DMA_SLAVE, mask);
 
        espi->dma_rx_data.port = EP93XX_DMA_SSP;
-       espi->dma_rx_data.direction = DMA_FROM_DEVICE;
+       espi->dma_rx_data.direction = DMA_DEV_TO_MEM;
        espi->dma_rx_data.name = "ep93xx-spi-rx";
 
        espi->dma_rx = dma_request_channel(mask, ep93xx_spi_dma_filter,
@@ -990,7 +993,7 @@ static int ep93xx_spi_setup_dma(struct ep93xx_spi *espi)
        }
 
        espi->dma_tx_data.port = EP93XX_DMA_SSP;
-       espi->dma_tx_data.direction = DMA_TO_DEVICE;
+       espi->dma_tx_data.direction = DMA_MEM_TO_DEV;
        espi->dma_tx_data.name = "ep93xx-spi-tx";
 
        espi->dma_tx = dma_request_channel(mask, ep93xx_spi_dma_filter,
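
Note the split the ep93xx hunks make explicit: dma_map_sg() and dma_unmap_sg() continue to take enum dma_data_direction (the error path above still unmaps with dir), while device_prep_slave_sg() now takes the transfer direction. A hypothetical helper capturing that mapping:

#include <linux/dmaengine.h>
#include <linux/dma-mapping.h>

/* hypothetical helper mirroring the ep93xx conversion: derive the
 * dmaengine transfer direction from the DMA-mapping direction that
 * dma_map_sg()/dma_unmap_sg() still require */
static enum dma_transfer_direction
to_transfer_direction(enum dma_data_direction dir)
{
	return dir == DMA_FROM_DEVICE ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV;
}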
index f1f5efb..2f9cb43 100644 (file)
@@ -900,11 +900,11 @@ static int configure_dma(struct pl022 *pl022)
 {
        struct dma_slave_config rx_conf = {
                .src_addr = SSP_DR(pl022->phybase),
-               .direction = DMA_FROM_DEVICE,
+               .direction = DMA_DEV_TO_MEM,
        };
        struct dma_slave_config tx_conf = {
                .dst_addr = SSP_DR(pl022->phybase),
-               .direction = DMA_TO_DEVICE,
+               .direction = DMA_MEM_TO_DEV,
        };
        unsigned int pages;
        int ret;
@@ -1041,7 +1041,7 @@ static int configure_dma(struct pl022 *pl022)
        rxdesc = rxchan->device->device_prep_slave_sg(rxchan,
                                      pl022->sgt_rx.sgl,
                                      rx_sglen,
-                                     DMA_FROM_DEVICE,
+                                     DMA_DEV_TO_MEM,
                                      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!rxdesc)
                goto err_rxdesc;
@@ -1049,7 +1049,7 @@ static int configure_dma(struct pl022 *pl022)
        txdesc = txchan->device->device_prep_slave_sg(txchan,
                                      pl022->sgt_tx.sgl,
                                      tx_sglen,
-                                     DMA_TO_DEVICE,
+                                     DMA_MEM_TO_DEV,
                                      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!txdesc)
                goto err_txdesc;
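
The converted drivers all follow the same slave-DMA sequence: configure the channel, prepare a descriptor with the new transfer direction, submit it, and kick the engine. A condensed sketch against the 3.3-era API, with fifo_phys, the burst size and the scatterlist as placeholders:

#include <linux/dmaengine.h>

/* condensed slave-TX sequence as used by the drivers in this series */
static int example_start_tx(struct dma_chan *chan, struct scatterlist *sgl,
			    unsigned int nents, dma_addr_t fifo_phys)
{
	struct dma_slave_config conf = {
		.direction	= DMA_MEM_TO_DEV,
		.dst_addr	= fifo_phys,
		.dst_addr_width	= DMA_SLAVE_BUSWIDTH_4_BYTES,
		.dst_maxburst	= 16,
	};
	struct dma_async_tx_descriptor *desc;

	if (dmaengine_slave_config(chan, &conf))
		return -EINVAL;

	desc = chan->device->device_prep_slave_sg(chan, sgl, nents,
						  DMA_MEM_TO_DEV,
						  DMA_PREP_INTERRUPT |
						  DMA_CTRL_ACK);
	if (!desc)
		return -EBUSY;

	dmaengine_submit(desc);
	dma_async_issue_pending(chan);
	return 0;
}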
index 7086583..2a6429d 100644 (file)
@@ -1079,7 +1079,7 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw)
        }
        sg = dma->sg_rx_p;
        desc_rx = dma->chan_rx->device->device_prep_slave_sg(dma->chan_rx, sg,
-                                       num, DMA_FROM_DEVICE,
+                                       num, DMA_DEV_TO_MEM,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc_rx) {
                dev_err(&data->master->dev, "%s:device_prep_slave_sg Failed\n",
@@ -1124,7 +1124,7 @@ static void pch_spi_handle_dma(struct pch_spi_data *data, int *bpw)
        }
        sg = dma->sg_tx_p;
        desc_tx = dma->chan_tx->device->device_prep_slave_sg(dma->chan_tx,
-                                       sg, num, DMA_TO_DEVICE,
+                                       sg, num, DMA_MEM_TO_DEV,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc_tx) {
                dev_err(&data->master->dev, "%s:device_prep_slave_sg Failed\n",
index 6958594..9ae0240 100644 (file)
@@ -268,7 +268,7 @@ static void pl011_dma_probe_initcall(struct uart_amba_port *uap)
        struct dma_slave_config tx_conf = {
                .dst_addr = uap->port.mapbase + UART01x_DR,
                .dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
-               .direction = DMA_TO_DEVICE,
+               .direction = DMA_MEM_TO_DEV,
                .dst_maxburst = uap->fifosize >> 1,
        };
        struct dma_chan *chan;
@@ -301,7 +301,7 @@ static void pl011_dma_probe_initcall(struct uart_amba_port *uap)
                struct dma_slave_config rx_conf = {
                        .src_addr = uap->port.mapbase + UART01x_DR,
                        .src_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE,
-                       .direction = DMA_FROM_DEVICE,
+                       .direction = DMA_DEV_TO_MEM,
                        .src_maxburst = uap->fifosize >> 1,
                };
 
@@ -480,7 +480,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
                return -EBUSY;
        }
 
-       desc = dma_dev->device_prep_slave_sg(chan, &dmatx->sg, 1, DMA_TO_DEVICE,
+       desc = dma_dev->device_prep_slave_sg(chan, &dmatx->sg, 1, DMA_MEM_TO_DEV,
                                             DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                dma_unmap_sg(dma_dev->dev, &dmatx->sg, 1, DMA_TO_DEVICE);
@@ -676,7 +676,7 @@ static int pl011_dma_rx_trigger_dma(struct uart_amba_port *uap)
                &uap->dmarx.sgbuf_b : &uap->dmarx.sgbuf_a;
        dma_dev = rxchan->device;
        desc = rxchan->device->device_prep_slave_sg(rxchan, &sgbuf->sg, 1,
-                                       DMA_FROM_DEVICE,
+                                       DMA_DEV_TO_MEM,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        /*
         * If the DMA engine is busy and cannot prepare a
index de0f613..17ae657 100644 (file)
@@ -764,7 +764,7 @@ static int dma_handle_rx(struct eg20t_port *priv)
        sg_dma_address(sg) = priv->rx_buf_dma;
 
        desc = priv->chan_rx->device->device_prep_slave_sg(priv->chan_rx,
-                       sg, 1, DMA_FROM_DEVICE,
+                       sg, 1, DMA_DEV_TO_MEM,
                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 
        if (!desc)
@@ -923,7 +923,7 @@ static unsigned int dma_handle_tx(struct eg20t_port *priv)
        }
 
        desc = priv->chan_tx->device->device_prep_slave_sg(priv->chan_tx,
-                                       priv->sg_tx_p, nent, DMA_TO_DEVICE,
+                                       priv->sg_tx_p, nent, DMA_MEM_TO_DEV,
                                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                dev_err(priv->port.dev, "%s:device_prep_slave_sg Failed\n",
index 9e62349..7508579 100644 (file)
@@ -1339,7 +1339,7 @@ static void sci_submit_rx(struct sci_port *s)
                struct dma_async_tx_descriptor *desc;
 
                desc = chan->device->device_prep_slave_sg(chan,
-                       sg, 1, DMA_FROM_DEVICE, DMA_PREP_INTERRUPT);
+                       sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT);
 
                if (desc) {
                        s->desc_rx[i] = desc;
@@ -1454,7 +1454,7 @@ static void work_fn_tx(struct work_struct *work)
        BUG_ON(!sg_dma_len(sg));
 
        desc = chan->device->device_prep_slave_sg(chan,
-                       sg, s->sg_len_tx, DMA_TO_DEVICE,
+                       sg, s->sg_len_tx, DMA_MEM_TO_DEV,
                        DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                /* switch to PIO */
index 32793ce..9c2cc46 100644 (file)
@@ -183,7 +183,7 @@ static int __devinit ehci_hcd_xilinx_of_probe(struct platform_device *op)
        }
 
        irq = irq_of_parse_and_map(dn, 0);
-       if (irq == NO_IRQ) {
+       if (!irq) {
                printk(KERN_ERR "%s: irq_of_parse_and_map failed\n", __FILE__);
                rv = -EBUSY;
                goto err_irq;
index a163632..97cb459 100644 (file)
@@ -84,7 +84,7 @@ static bool ux500_configure_channel(struct dma_channel *channel,
        struct musb_hw_ep *hw_ep = ux500_channel->hw_ep;
        struct dma_chan *dma_chan = ux500_channel->dma_chan;
        struct dma_async_tx_descriptor *dma_desc;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        struct scatterlist sg;
        struct dma_slave_config slave_conf;
        enum dma_slave_buswidth addr_width;
@@ -104,7 +104,7 @@ static bool ux500_configure_channel(struct dma_channel *channel,
        sg_dma_address(&sg) = dma_addr;
        sg_dma_len(&sg) = len;
 
-       direction = ux500_channel->is_tx ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+       direction = ux500_channel->is_tx ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
        addr_width = (len & 0x3) ? DMA_SLAVE_BUSWIDTH_1_BYTE :
                                        DMA_SLAVE_BUSWIDTH_4_BYTES;
 
index b51fcd8..72339bd 100644 (file)
@@ -772,10 +772,10 @@ static void usbhsf_dma_prepare_tasklet(unsigned long data)
        struct dma_async_tx_descriptor *desc;
        struct dma_chan *chan = usbhsf_dma_chan_get(fifo, pkt);
        struct device *dev = usbhs_priv_to_dev(priv);
-       enum dma_data_direction dir;
+       enum dma_transfer_direction dir;
        dma_cookie_t cookie;
 
-       dir = usbhs_pipe_is_dir_in(pipe) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+       dir = usbhs_pipe_is_dir_in(pipe) ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV;
 
        sg_init_table(&sg, 1);
        sg_set_page(&sg, virt_to_page(pkt->dma),
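
The single-entry scatterlist setup used here is a common shortcut for a virtually contiguous, already-mapped buffer; a sketch of the pattern (names illustrative):

#include <linux/scatterlist.h>
#include <linux/mm.h>

/* sketch: describe one pre-mapped, virtually contiguous buffer with a
 * single scatterlist entry, as the tasklet above does */
static void fill_single_sg(struct scatterlist *sg, void *buf, size_t len,
			   dma_addr_t dma_handle)
{
	sg_init_table(sg, 1);
	sg_set_page(sg, virt_to_page(buf), len, offset_in_page(buf));
	sg_dma_address(sg) = dma_handle;
	sg_dma_len(sg) = len;
}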
index e3406ab..727a514 100644 (file)
@@ -245,6 +245,7 @@ struct mx3fb_data {
 
        uint32_t                h_start_width;
        uint32_t                v_start_width;
+       enum disp_data_mapping  disp_data_fmt;
 };
 
 struct dma_chan_request {
@@ -287,11 +288,14 @@ static void mx3fb_write_reg(struct mx3fb_data *mx3fb, u32 value, unsigned long r
        __raw_writel(value, mx3fb->reg_base + reg);
 }
 
-static const uint32_t di_mappings[] = {
-       0x1600AAAA, 0x00E05555, 0x00070000, 3,  /* RGB888 */
-       0x0005000F, 0x000B000F, 0x0011000F, 1,  /* RGB666 */
-       0x0011000F, 0x000B000F, 0x0005000F, 1,  /* BGR666 */
-       0x0004003F, 0x000A000F, 0x000F003F, 1   /* RGB565 */
+struct di_mapping {
+       uint32_t b0, b1, b2;
+};
+
+static const struct di_mapping di_mappings[] = {
+       [IPU_DISP_DATA_MAPPING_RGB666] = { 0x0005000f, 0x000b000f, 0x0011000f },
+       [IPU_DISP_DATA_MAPPING_RGB565] = { 0x0004003f, 0x000a000f, 0x000f003f },
+       [IPU_DISP_DATA_MAPPING_RGB888] = { 0x00070000, 0x000f0000, 0x00170000 },
 };
 
 static void sdc_fb_init(struct mx3fb_info *fbi)
@@ -334,7 +338,7 @@ static void sdc_enable_channel(struct mx3fb_info *mx3_fbi)
        /* This enables the channel */
        if (mx3_fbi->cookie < 0) {
                mx3_fbi->txd = dma_chan->device->device_prep_slave_sg(dma_chan,
-                     &mx3_fbi->sg[0], 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT);
+                     &mx3_fbi->sg[0], 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT);
                if (!mx3_fbi->txd) {
                        dev_err(mx3fb->dev, "Cannot allocate descriptor on %d\n",
                                dma_chan->chan_id);
@@ -425,7 +429,6 @@ static int sdc_set_window_pos(struct mx3fb_data *mx3fb, enum ipu_channel channel
  * @pixel_clk:         desired pixel clock frequency in Hz.
  * @width:             width of panel in pixels.
  * @height:            height of panel in pixels.
- * @pixel_fmt:         pixel format of buffer as FOURCC ASCII code.
  * @h_start_width:     number of pixel clocks between the HSYNC signal pulse
  *                     and the start of valid data.
  * @h_sync_width:      width of the HSYNC signal in units of pixel clocks.
@@ -442,7 +445,6 @@ static int sdc_set_window_pos(struct mx3fb_data *mx3fb, enum ipu_channel channel
 static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel,
                          uint32_t pixel_clk,
                          uint16_t width, uint16_t height,
-                         enum pixel_fmt pixel_fmt,
                          uint16_t h_start_width, uint16_t h_sync_width,
                          uint16_t h_end_width, uint16_t v_start_width,
                          uint16_t v_sync_width, uint16_t v_end_width,
@@ -453,6 +455,7 @@ static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel,
        uint32_t old_conf;
        uint32_t div;
        struct clk *ipu_clk;
+       const struct di_mapping *map;
 
        dev_dbg(mx3fb->dev, "panel size = %d x %d", width, height);
 
@@ -540,36 +543,10 @@ static int sdc_init_panel(struct mx3fb_data *mx3fb, enum ipu_panel panel,
                sig.Vsync_pol << DI_D3_VSYNC_POL_SHIFT;
        mx3fb_write_reg(mx3fb, old_conf, DI_DISP_SIG_POL);
 
-       switch (pixel_fmt) {
-       case IPU_PIX_FMT_RGB24:
-               mx3fb_write_reg(mx3fb, di_mappings[0], DI_DISP3_B0_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[1], DI_DISP3_B1_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[2], DI_DISP3_B2_MAP);
-               mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) |
-                            ((di_mappings[3] - 1) << 12), DI_DISP_ACC_CC);
-               break;
-       case IPU_PIX_FMT_RGB666:
-               mx3fb_write_reg(mx3fb, di_mappings[4], DI_DISP3_B0_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[5], DI_DISP3_B1_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[6], DI_DISP3_B2_MAP);
-               mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) |
-                            ((di_mappings[7] - 1) << 12), DI_DISP_ACC_CC);
-               break;
-       case IPU_PIX_FMT_BGR666:
-               mx3fb_write_reg(mx3fb, di_mappings[8], DI_DISP3_B0_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[9], DI_DISP3_B1_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[10], DI_DISP3_B2_MAP);
-               mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) |
-                            ((di_mappings[11] - 1) << 12), DI_DISP_ACC_CC);
-               break;
-       default:
-               mx3fb_write_reg(mx3fb, di_mappings[12], DI_DISP3_B0_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[13], DI_DISP3_B1_MAP);
-               mx3fb_write_reg(mx3fb, di_mappings[14], DI_DISP3_B2_MAP);
-               mx3fb_write_reg(mx3fb, mx3fb_read_reg(mx3fb, DI_DISP_ACC_CC) |
-                            ((di_mappings[15] - 1) << 12), DI_DISP_ACC_CC);
-               break;
-       }
+       map = &di_mappings[mx3fb->disp_data_fmt];
+       mx3fb_write_reg(mx3fb, map->b0, DI_DISP3_B0_MAP);
+       mx3fb_write_reg(mx3fb, map->b1, DI_DISP3_B1_MAP);
+       mx3fb_write_reg(mx3fb, map->b2, DI_DISP3_B2_MAP);
 
        spin_unlock_irqrestore(&mx3fb->lock, lock_flags);
 
@@ -780,8 +757,6 @@ static int __set_par(struct fb_info *fbi, bool lock)
                if (sdc_init_panel(mx3fb, mode,
                                   (PICOS2KHZ(fbi->var.pixclock)) * 1000UL,
                                   fbi->var.xres, fbi->var.yres,
-                                  (fbi->var.sync & FB_SYNC_SWAP_RGB) ?
-                                  IPU_PIX_FMT_BGR666 : IPU_PIX_FMT_RGB666,
                                   fbi->var.left_margin,
                                   fbi->var.hsync_len,
                                   fbi->var.right_margin +
@@ -1117,7 +1092,7 @@ static int mx3fb_pan_display(struct fb_var_screeninfo *var,
                async_tx_ack(mx3_fbi->txd);
 
        txd = dma_chan->device->device_prep_slave_sg(dma_chan, sg +
-               mx3_fbi->cur_ipu_buf, 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT);
+               mx3_fbi->cur_ipu_buf, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT);
        if (!txd) {
                dev_err(fbi->device,
                        "Error preparing a DMA transaction descriptor.\n");
@@ -1349,6 +1324,12 @@ static int init_fb_chan(struct mx3fb_data *mx3fb, struct idmac_channel *ichan)
        const struct fb_videomode *mode;
        int ret, num_modes;
 
+       if (mx3fb_pdata->disp_data_fmt >= ARRAY_SIZE(di_mappings)) {
+               dev_err(dev, "Illegal display data format %d\n",
+                               mx3fb_pdata->disp_data_fmt);
+               return -EINVAL;
+       }
+
        ichan->client = mx3fb;
        irq = ichan->eof_irq;
 
@@ -1402,6 +1383,8 @@ static int init_fb_chan(struct mx3fb_data *mx3fb, struct idmac_channel *ichan)
        mx3fbi->mx3fb           = mx3fb;
        mx3fbi->blank           = FB_BLANK_NORMAL;
 
+       mx3fb->disp_data_fmt    = mx3fb_pdata->disp_data_fmt;
+
        init_completion(&mx3fbi->flip_cmpl);
        disable_irq(ichan->eof_irq);
        dev_dbg(mx3fb->dev, "disabling irq %d\n", ichan->eof_irq);
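
With pixel_fmt gone from sdc_init_panel(), boards now select the mapping once through platform data. A hypothetical board-file fragment, assuming the mx3fb_platform_data declaration from mach/mx3fb.h that this series extends with disp_data_fmt:

#include <mach/mx3fb.h>

/* hypothetical board file: choose the display data mapping up front
 * instead of deriving it from a FOURCC pixel format at runtime */
static struct mx3fb_platform_data mx3fb_pdata = {
	.name		= "Sharp-QVGA",	/* assumed mode name */
	.disp_data_fmt	= IPU_DISP_DATA_MAPPING_RGB666,
};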
index 3832e30..596e6a7 100644 (file)
@@ -221,7 +221,7 @@ static int register_balloon(struct device *dev)
 {
        int i, error;
 
-       error = bus_register(&balloon_subsys);
+       error = subsys_system_register(&balloon_subsys, NULL);
        if (error)
                return error;
 
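
bus_register() is replaced because balloon_subsys is a system subsystem: subsys_system_register() registers the bus and additionally creates the /sys/devices/system/<name> root device such subsystems expect. A minimal sketch of the pattern, with illustrative names:

#include <linux/device.h>
#include <linux/init.h>

/* illustrative subsystem; subsys_system_register() wires up both the
 * bus and the /sys/devices/system/example root device */
static struct bus_type example_subsys = {
	.name		= "example",
	.dev_name	= "example",
};

static int __init example_init(void)
{
	return subsys_system_register(&example_subsys, NULL);
}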
index ecb9fd3..d33f01c 100644 (file)
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
          Linux website <http://acl.bestbits.at/>.
 
          If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+       bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+       depends on BTRFS_FS
+       help
+         Adds code that examines all block write requests (including
+         writes of the super block). The goal is to verify that the
+         state of the filesystem on disk is always consistent, i.e.
+         that it remains intact even after a power loss or kernel
+         panic.
+
+         If the integrity check tool is included and activated via
+         the mount options, it consumes a significant amount of
+         kernel memory and additional CPU cycles. Enabling this
+         functionality is not intended for normal use.
+
+         In most cases, unless you are a btrfs developer who needs
+         to verify the integrity of (super)-block write requests
+         during the run of a regression test, say N
index c0ddfd2..0c4fa2b 100644 (file)
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o
+          reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
index 22c64ff..b9a8432 100644 (file)
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
        struct list_head list;
-       u64 inum;
-       u64 root;
-       u64 extent_data_item_offset;
+       u64 root_id;
+       struct btrfs_key key;
+       int level;
+       int count;
+       u64 parent;
+       u64 wanted_disk_byte;
 };
 
-struct __shared_ref {
-       struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+                           struct btrfs_key *key, int level, u64 parent,
+                           u64 wanted_disk_byte, int count)
+{
+       struct __prelim_ref *ref;
+
+       /* in case we're adding delayed refs, we're holding the refs spinlock */
+       ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+       if (!ref)
+               return -ENOMEM;
+
+       ref->root_id = root_id;
+       if (key)
+               ref->key = *key;
+       else
+               memset(&ref->key, 0, sizeof(ref->key));
+
+       ref->level = level;
+       ref->count = count;
+       ref->parent = parent;
+       ref->wanted_disk_byte = wanted_disk_byte;
+       list_add_tail(&ref->list, head);
+
+       return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+                               struct ulist *parents,
+                               struct extent_buffer *eb, int level,
+                               u64 wanted_objectid, u64 wanted_disk_byte)
+{
+       int ret;
+       int slot;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
        u64 disk_byte;
-};
+
+add_parent:
+       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (ret < 0)
+               return ret;
+
+       if (level != 0)
+               return 0;
+
+       /*
+        * if the current leaf is full of EXTENT_DATA items, we must
+        * check the next one if that holds a reference as well.
+        * ref->count cannot be used to skip this check.
+        * repeat this until we don't find any additional EXTENT_DATA items.
+        */
+       while (1) {
+               ret = btrfs_next_leaf(root, path);
+               if (ret < 0)
+                       return ret;
+               if (ret)
+                       return 0;
+
+               eb = path->nodes[0];
+               for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+                       btrfs_item_key_to_cpu(eb, &key, slot);
+                       if (key.objectid != wanted_objectid ||
+                           key.type != BTRFS_EXTENT_DATA_KEY)
+                               return 0;
+                       fi = btrfs_item_ptr(eb, slot,
+                                               struct btrfs_file_extent_item);
+                       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+                       if (disk_byte == wanted_disk_byte)
+                               goto add_parent;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+                                       struct __prelim_ref *ref,
+                                       struct ulist *parents)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root;
+       struct btrfs_key root_key;
+       struct btrfs_key key = {0};
+       struct extent_buffer *eb;
+       int ret = 0;
+       int root_level;
+       int level = ref->level;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       root_key.objectid = ref->root_id;
+       root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root_key.offset = (u64)-1;
+       root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
+
+       rcu_read_lock();
+       root_level = btrfs_header_level(root->node);
+       rcu_read_unlock();
+
+       if (root_level + 1 == level)
+               goto out;
+
+       path->lowest_level = level;
+       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+                "%d for key (%llu %u %llu)\n",
+                (unsigned long long)ref->root_id, level, ref->count, ret,
+                (unsigned long long)ref->key.objectid, ref->key.type,
+                (unsigned long long)ref->key.offset);
+       if (ret < 0)
+               goto out;
+
+       eb = path->nodes[level];
+       if (!eb) {
+               WARN_ON(1);
+               ret = 1;
+               goto out;
+       }
+
+       if (level == 0) {
+               if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret)
+                               goto out;
+                       eb = path->nodes[0];
+               }
+
+               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+       }
+
+       /* the last two parameters will only be used for level == 0 */
+       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+                               ref->wanted_disk_byte);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+                                  struct list_head *head)
+{
+       int err;
+       int ret = 0;
+       struct __prelim_ref *ref;
+       struct __prelim_ref *ref_safe;
+       struct __prelim_ref *new_ref;
+       struct ulist *parents;
+       struct ulist_node *node;
+
+       parents = ulist_alloc(GFP_NOFS);
+       if (!parents)
+               return -ENOMEM;
+
+       /*
+        * _safe allows us to insert directly after the current item without
+        * iterating over the newly inserted items.
+        * we're also allowed to re-assign ref during iteration.
+        */
+       list_for_each_entry_safe(ref, ref_safe, head, list) {
+               if (ref->parent)        /* already direct */
+                       continue;
+               if (ref->count == 0)
+                       continue;
+               err = __resolve_indirect_ref(fs_info, ref, parents);
+               if (err) {
+                       if (ret == 0)
+                               ret = err;
+                       continue;
+               }
+
+               /* we put the first parent into the ref at hand */
+               node = ulist_next(parents, NULL);
+               ref->parent = node ? node->val : 0;
+
+               /* additional parents require new refs being added here */
+               while ((node = ulist_next(parents, node))) {
+                       new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+                       if (!new_ref) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+                       memcpy(new_ref, ref, sizeof(*ref));
+                       new_ref->parent = node->val;
+                       list_add(&new_ref->list, &ref->list);
+               }
+               ulist_reinit(parents);
+       }
+
+       ulist_free(parents);
+       return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+       struct list_head *pos1;
+
+       list_for_each(pos1, head) {
+               struct list_head *n2;
+               struct list_head *pos2;
+               struct __prelim_ref *ref1;
+
+               ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+               if (mode == 1 && ref1->key.type == 0)
+                       continue;
+               for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+                    pos2 = n2, n2 = pos2->next) {
+                       struct __prelim_ref *ref2;
+
+                       ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+                       if (mode == 1) {
+                               if (memcmp(&ref1->key, &ref2->key,
+                                          sizeof(ref1->key)) ||
+                                   ref1->level != ref2->level ||
+                                   ref1->root_id != ref2->root_id)
+                                       continue;
+                               ref1->count += ref2->count;
+                       } else {
+                               if (ref1->parent != ref2->parent)
+                                       continue;
+                               ref1->count += ref2->count;
+                       }
+                       list_del(&ref2->list);
+                       kfree(ref2);
+               }
+
+       }
+       return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+                             struct btrfs_key *info_key,
+                             struct list_head *prefs)
+{
+       struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+       struct rb_node *n = &head->node.rb_node;
+       int sgn;
+       int ret;
+
+       if (extent_op && extent_op->update_key)
+               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+       while ((n = rb_prev(n))) {
+               struct btrfs_delayed_ref_node *node;
+               node = rb_entry(n, struct btrfs_delayed_ref_node,
+                               rb_node);
+               if (node->bytenr != head->node.bytenr)
+                       break;
+               WARN_ON(node->is_head);
+
+               if (node->seq > seq)
+                       continue;
+
+               switch (node->action) {
+               case BTRFS_ADD_DELAYED_EXTENT:
+               case BTRFS_UPDATE_DELAYED_HEAD:
+                       WARN_ON(1);
+                       continue;
+               case BTRFS_ADD_DELAYED_REF:
+                       sgn = 1;
+                       break;
+               case BTRFS_DROP_DELAYED_REF:
+                       sgn = -1;
+                       break;
+               default:
+                       BUG_ON(1);
+               }
+               switch (node->type) {
+               case BTRFS_TREE_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, 0, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, ref->parent,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+                                              ref->parent, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+                            struct btrfs_path *path, u64 bytenr,
+                            struct btrfs_key *info_key, int *info_level,
+                            struct list_head *prefs)
+{
+       int ret;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       unsigned long ptr;
+       unsigned long end;
+       struct btrfs_extent_item *ei;
+       u64 flags;
+       u64 item_size;
+
+       /*
+        * enumerate all inline refs
+        */
+       leaf = path->nodes[0];
+       slot = path->slots[0] - 1;
+
+       item_size = btrfs_item_size_nr(leaf, slot);
+       BUG_ON(item_size < sizeof(*ei));
+
+       ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+       flags = btrfs_extent_flags(leaf, ei);
+
+       ptr = (unsigned long)(ei + 1);
+       end = (unsigned long)ei + item_size;
+
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               struct btrfs_tree_block_info *info;
+               struct btrfs_disk_key disk_key;
+
+               info = (struct btrfs_tree_block_info *)ptr;
+               *info_level = btrfs_tree_block_level(leaf, info);
+               btrfs_tree_block_key(leaf, info, &disk_key);
+               btrfs_disk_key_to_cpu(info_key, &disk_key);
+               ptr += sizeof(struct btrfs_tree_block_info);
+               BUG_ON(ptr > end);
+       } else {
+               BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+       }
+
+       while (ptr < end) {
+               struct btrfs_extent_inline_ref *iref;
+               u64 offset;
+               int type;
+
+               iref = (struct btrfs_extent_inline_ref *)ptr;
+               type = btrfs_extent_inline_ref_type(leaf, iref);
+               offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+               switch (type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               *info_level + 1, offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+                                              bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, offset, info_key,
+                                              *info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+                                               count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+               ptr += btrfs_extent_inline_ref_size(type);
+       }
+
+       return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+                           struct btrfs_path *path, u64 bytenr,
+                           struct btrfs_key *info_key, int info_level,
+                           struct list_head *prefs)
+{
+       struct btrfs_root *extent_root = fs_info->extent_root;
+       int ret;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+
+       while (1) {
+               ret = btrfs_next_item(extent_root, path);
+               if (ret < 0)
+                       break;
+               if (ret) {
+                       ret = 0;
+                       break;
+               }
+
+               slot = path->slots[0];
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+
+               if (key.objectid != bytenr)
+                       break;
+               if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+                       continue;
+               if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+                       break;
+
+               switch (key.type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               info_level + 1, key.offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_shared_data_ref);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+                                               bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, key.offset, info_key,
+                                               info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_extent_data_ref);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                               bytenr, count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, keyed backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info, u64 bytenr,
+                            u64 seq, struct ulist *refs, struct ulist *roots)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct btrfs_key info_key = { 0 };
+       struct btrfs_delayed_ref_root *delayed_refs = NULL;
+       struct btrfs_delayed_ref_head *head = NULL;
+       int info_level = 0;
+       int ret;
+       struct list_head prefs_delayed;
+       struct list_head prefs;
+       struct __prelim_ref *ref;
+
+       INIT_LIST_HEAD(&prefs);
+       INIT_LIST_HEAD(&prefs_delayed);
+
+       key.objectid = bytenr;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * grab both a lock on the path and a lock on the delayed ref head.
+        * We need both to get a consistent picture of how the refs look
+        * at a specified point in time
+        */
+again:
+       ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0);
+
+       /*
+        * check whether updates for this ref are queued and lock the head
+        */
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (head) {
+               if (!mutex_trylock(&head->mutex)) {
+                       atomic_inc(&head->node.refs);
+                       spin_unlock(&delayed_refs->lock);
+
+                       btrfs_release_path(path);
+
+                       /*
+                        * Mutex was contended, block until it's
+                        * released and try again
+                        */
+                       mutex_lock(&head->mutex);
+                       mutex_unlock(&head->mutex);
+                       btrfs_put_delayed_ref(&head->node);
+                       goto again;
+               }
+               ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+               if (ret)
+                       goto out;
+       }
+       spin_unlock(&delayed_refs->lock);
+
+       if (path->slots[0]) {
+               struct extent_buffer *leaf;
+               int slot;
+
+               leaf = path->nodes[0];
+               slot = path->slots[0] - 1;
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid == bytenr &&
+                   key.type == BTRFS_EXTENT_ITEM_KEY) {
+                       ret = __add_inline_refs(fs_info, path, bytenr,
+                                               &info_key, &info_level, &prefs);
+                       if (ret)
+                               goto out;
+                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                                              info_level, &prefs);
+                       if (ret)
+                               goto out;
+               }
+       }
+       btrfs_release_path(path);
+
+       /*
+        * when adding the delayed refs above, the info_key might not have
+        * been known yet. Go over the list and replace the missing keys
+        */
+       list_for_each_entry(ref, &prefs_delayed, list) {
+               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+                       memcpy(&ref->key, &info_key, sizeof(ref->key));
+       }
+       list_splice_init(&prefs_delayed, &prefs);
+
+       ret = __merge_refs(&prefs, 1);
+       if (ret)
+               goto out;
+
+       ret = __resolve_indirect_refs(fs_info, &prefs);
+       if (ret)
+               goto out;
+
+       ret = __merge_refs(&prefs, 2);
+       if (ret)
+               goto out;
+
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               if (ref->count < 0)
+                       WARN_ON(1);
+               if (ref->count && ref->root_id && ref->parent == 0) {
+                       /* no parent == root of tree */
+                       ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               if (ref->count && ref->parent) {
+                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               kfree(ref);
+       }
+
+out:
+       if (head)
+               mutex_unlock(&head->mutex);
+       btrfs_free_path(path);
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+       while (!list_empty(&prefs_delayed)) {
+               ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+                                      list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+
+       return ret;
+}
+
+/*
+ * Finds all leaves with a reference to the specified combination of bytenr
+ * and offset. The leaves are collected in the leafs ulist, which must be
+ * freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+       struct ulist *tmp;
+       int ret;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *leafs = ulist_alloc(GFP_NOFS);
+       if (!*leafs) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ulist_free(tmp);
+
+       if (ret < 0 && ret != -ENOENT) {
+               ulist_free(*leafs);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and, in turn, walking the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots)
+{
+       struct ulist *tmp;
+       struct ulist_node *node = NULL;
+       int ret;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *roots = ulist_alloc(GFP_NOFS);
+       if (!*roots) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       while (1) {
+               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+                                       tmp, *roots);
+               if (ret < 0 && ret != -ENOENT) {
+                       ulist_free(tmp);
+                       ulist_free(*roots);
+                       return ret;
+               }
+               node = ulist_next(tmp, node);
+               if (!node)
+                       break;
+               bytenr = node->val;
+       }
+
+       ulist_free(tmp);
+       return 0;
+}
+
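
As a usage sketch of the new interface (a hypothetical caller, not part of this patch), resolving every root that references a given extent looks like this:

/* hypothetical caller: print every root holding a reference to the
 * extent at bytenr; error handling trimmed */
static void print_extent_roots(struct btrfs_trans_handle *trans,
			       struct btrfs_fs_info *fs_info,
			       u64 bytenr, u64 num_bytes, u64 seq)
{
	struct ulist *roots = NULL;
	struct ulist_node *node = NULL;

	if (btrfs_find_all_roots(trans, fs_info, bytenr, num_bytes, seq,
				 &roots))
		return;

	while ((node = ulist_next(roots, node)))
		pr_debug("extent %llu referenced from root %llu\n",
			 (unsigned long long)bytenr,
			 (unsigned long long)node->val);
	ulist_free(roots);
}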
 
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
                        struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
        if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
            found_key->objectid > logical ||
-           found_key->objectid + found_key->offset <= logical)
+           found_key->objectid + found_key->offset <= logical) {
+               pr_debug("logical %llu is not within any extent\n",
+                        (unsigned long long)logical);
                return -ENOENT;
+       }
 
        eb = path->nodes[0];
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(eb, ei);
 
+       pr_debug("logical %llu is at position %llu within the extent (%llu "
+                "EXTENT_ITEM %llu) flags %#llx size %u\n",
+                (unsigned long long)logical,
+                (unsigned long long)(logical - found_key->objectid),
+                (unsigned long long)found_key->objectid,
+                (unsigned long long)found_key->offset,
+                (unsigned long long)flags, item_size);
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                return BTRFS_EXTENT_FLAG_TREE_BLOCK;
        if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
        return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-                               u64 extent_data_item_offset, u64 root)
-{
-       struct __data_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->inum = inum;
-       ref->extent_data_item_offset = extent_data_item_offset;
-       ref->root = root;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-                               struct btrfs_extent_data_ref *dref)
-{
-       return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-                               btrfs_extent_data_ref_offset(eb, dref),
-                               btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-       struct __shared_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->disk_byte = disk_byte;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-                                          u64 logical, u64 inum,
-                                          u64 extent_data_item_offset,
-                                          u64 extent_offset,
-                                          struct btrfs_path *path,
-                                          struct list_head *data_refs,
-                                          iterate_extent_inodes_t *iterate,
-                                          void *ctx)
-{
-       u64 ref_root;
-       u32 item_size;
-       struct btrfs_key key;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_extent_inline_ref *eiref;
-       struct __data_ref *ref;
-       int ret;
-       int type;
-       int last;
-       unsigned long ptr = 0;
-
-       WARN_ON(!list_empty(data_refs));
-       ret = extent_from_logical(fs_info, logical, path, &key);
-       if (ret & BTRFS_EXTENT_FLAG_DATA)
-               ret = -EIO;
-       if (ret < 0)
-               goto out;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       ret = 0;
-       ref_root = 0;
-       /*
-        * as done in iterate_extent_inodes, we first build a list of refs to
-        * iterate, then free the path and then iterate them to avoid deadlocks.
-        */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last < 0) {
-                       ret = last;
-                       goto out;
-               }
-               if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-                   type == BTRFS_SHARED_BLOCK_REF_KEY) {
-                       ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __data_list_add(data_refs, inum,
-                                               extent_data_item_offset,
-                                               ref_root);
-               }
-       } while (!ret && !last);
-
-       btrfs_release_path(path);
-
-       if (ref_root == 0) {
-               printk(KERN_ERR "btrfs: failed to find tree block ref "
-                       "for shared data backref %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
-       }
-
-out:
-       while (!list_empty(data_refs)) {
-               ref = list_first_entry(data_refs, struct __data_ref, list);
-               list_del(&ref->list);
-               if (!ret)
-                       ret = iterate(ref->inum, extent_offset +
-                                       ref->extent_data_item_offset,
-                                       ref->root, ctx);
-               kfree(ref);
-       }
-
-       return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-                                   u64 logical, u64 orig_extent_item_objectid,
-                                   u64 extent_offset, struct btrfs_path *path,
-                                   struct list_head *data_refs,
-                                   iterate_extent_inodes_t *iterate,
-                                   void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+                               struct btrfs_path *path, u64 logical,
+                               u64 orig_extent_item_objectid,
+                               u64 extent_item_pos, u64 root,
+                               iterate_extent_inodes_t *iterate, void *ctx)
 {
        u64 disk_byte;
        struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
        struct extent_buffer *eb;
        int slot;
        int nritems;
-       int ret;
-       int found = 0;
+       int ret = 0;
+       int extent_type;
+       u64 data_offset;
+       u64 data_len;
 
        eb = read_tree_block(fs_info->tree_root, logical,
                                fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               if (!fi) {
-                       free_extent_buffer(eb);
-                       return -EIO;
-               }
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
                disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid) {
-                       if (found)
-                               break;
-                       else
-                               continue;
-               }
-               ++found;
-               ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-                                                       key.objectid,
-                                                       key.offset,
-                                                       extent_offset, path,
-                                                       data_refs,
-                                                       iterate, ctx);
-               if (ret)
-                       break;
-       }
+               if (disk_byte != orig_extent_item_objectid)
+                       continue;
 
-       if (!found) {
-               printk(KERN_ERR "btrfs: failed to follow shared data backref "
-                       "to parent %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
+               data_offset = btrfs_file_extent_offset(eb, fi);
+               data_len = btrfs_file_extent_num_bytes(eb, fi);
+
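+               /*
+                * extent_item_pos is the offset of the queried byte within
+                * the extent. A file extent item references that byte only
+                * if its window [data_offset, data_offset + data_len)
+                * covers it; key.offset + (extent_item_pos - data_offset)
+                * is then the matching offset within the file.
+                */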
+               if (extent_item_pos < data_offset ||
+                   extent_item_pos >= data_offset + data_len)
+                       continue;
+
+               pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
+                               "root %llu\n", orig_extent_item_objectid,
+                               key.objectid, key.offset, root);
+               ret = iterate(key.objectid,
+                               key.offset + (extent_item_pos - data_offset),
+                               root, ctx);
+               if (ret) {
+                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       break;
+               }
        }
 
        free_extent_buffer(eb);
+
        return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * the path is guaranteed to be released when iterate() is called.
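+ * internally, all leafs holding file extent items for the extent are
+ * collected first; for each leaf, the set of roots referencing it is
+ * resolved, and iterate() is called for every matching item in every
+ * (leaf, root) combination.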
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                                struct btrfs_path *path,
-                               u64 extent_item_objectid,
-                               u64 extent_offset,
+                               u64 extent_item_objectid, u64 extent_item_pos,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
-       unsigned long ptr = 0;
-       int last;
        int ret;
-       int type;
-       u64 logical;
-       u32 item_size;
-       struct btrfs_extent_inline_ref *eiref;
-       struct btrfs_extent_data_ref *dref;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_key key;
        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-       struct __data_ref *ref_d;
-       struct __shared_ref *ref_s;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       /* first we iterate the inline refs, ... */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last == -ENOENT) {
-                       ret = 0;
-                       break;
-               }
-               if (last < 0) {
-                       ret = last;
-                       break;
-               }
+       struct btrfs_trans_handle *trans;
+       struct ulist *refs;
+       struct ulist *roots;
+       struct ulist_node *ref_node = NULL;
+       struct ulist_node *root_node = NULL;
+       struct seq_list seq_elem;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       pr_debug("resolving all inodes for extent %llu\n",
+                       extent_item_objectid);
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+       spin_unlock(&delayed_refs->lock);
+
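+       /*
+        * Holding a delayed-ref sequence element keeps the view of the
+        * extent's references stable: delayed refs added after this point
+        * carry a higher sequence number and are not considered by the
+        * backref resolution below.
+        */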
+       ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+                                  extent_item_pos, seq_elem.seq,
+                                  &refs);
 
-               if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-                       logical = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __shared_list_add(&shared_refs, logical);
-               }
-       } while (!ret && !last);
+       if (ret)
+               goto out;
 
-       /* ... then we proceed to in-tree references and ... */
-       while (!ret) {
-               ++path->slots[0];
-               if (path->slots[0] > btrfs_header_nritems(eb)) {
-                       ret = btrfs_next_leaf(fs_info->extent_root, path);
-                       if (ret) {
-                               if (ret == 1)
-                                       ret = 0; /* we're done */
-                               break;
-                       }
-                       eb = path->nodes[0];
-               }
-               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-               if (key.objectid != extent_item_objectid)
+       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+                                               seq_elem.seq, &roots);
+               if (ret)
                        break;
-               if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = btrfs_item_ptr(eb, path->slots[0],
-                                               struct btrfs_extent_data_ref);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-                       ret = __shared_list_add(&shared_refs, key.offset);
+               while (!ret && (root_node = ulist_next(roots, root_node))) {
+                       pr_debug("root %llu references leaf %llu\n",
+                                       root_node->val, ref_node->val);
+                       ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+                                               extent_item_objectid,
+                                               extent_item_pos, root_node->val,
+                                               iterate, ctx);
                }
        }
 
-       btrfs_release_path(path);
-
-       /*
-        * ... only at the very end we can process the refs we found. this is
-        * because the iterator function we call is allowed to make tree lookups
-        * and we have to avoid deadlocks. additionally, we need more tree
-        * lookups ourselves for shared data refs.
-        */
-       while (!list_empty(&data_refs)) {
-               ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-               list_del(&ref_d->list);
-               if (!ret)
-                       ret = iterate(ref_d->inum, extent_offset +
-                                       ref_d->extent_data_item_offset,
-                                       ref_d->root, ctx);
-               kfree(ref_d);
-       }
-
-       while (!list_empty(&shared_refs)) {
-               ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-                                       list);
-               list_del(&ref_s->list);
-               if (!ret)
-                       ret = __iter_shared_inline_ref(fs_info,
-                                                       ref_s->disk_byte,
-                                                       extent_item_objectid,
-                                                       extent_offset, path,
-                                                       &data_refs,
-                                                       iterate, ctx);
-               kfree(ref_s);
-       }
-
+       ulist_free(refs);
+       ulist_free(roots);
+out:
+       btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+       btrfs_end_transaction(trans, fs_info->extent_root);
        return ret;
 }
 
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
        int ret;
-       u64 offset;
+       u64 extent_item_pos;
        struct btrfs_key found_key;
 
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
+       btrfs_release_path(path);
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = -EINVAL;
        if (ret < 0)
                return ret;
 
-       offset = logical - found_key.objectid;
+       extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       offset, iterate, ctx);
+                                       extent_item_pos, iterate, ctx);
 
        return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
                for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
                        name_len = btrfs_inode_ref_name_len(eb, iref);
                        /* path must be released before calling iterate()! */
+                       pr_debug("following ref at offset %u for inode %llu in "
+                                "tree %llu\n", cur,
+                                (unsigned long long)found_key.objectid,
+                                (unsigned long long)fs_root->objectid);
                        ret = iterate(parent, iref, eb, ctx);
                        if (ret) {
                                free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                return PTR_ERR(fspath);
 
        if (fspath > fspath_min) {
+               pr_debug("path resolved: %s\n", fspath);
                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                ++ipath->fspath->elem_cnt;
                ipath->fspath->bytes_left = fspath - fspath_min;
        } else {
+               pr_debug("missed path, not enough space. missing bytes: %lu, "
+                        "constructed so far: %s\n",
+                        (unsigned long)(fspath_min - fspath), fspath_min);
                ++ipath->fspath->elem_missed;
                ipath->fspath->bytes_missing += fspath_min - fspath;
                ipath->fspath->bytes_left = 0;
index 9261883..d00dfa9 100644 (file)
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
        struct btrfs_path               *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
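+/*
+ * Collects, in a ulist that the caller must free, the ids of all roots
+ * that reference the extent at bytenr, as seen at the given delayed-ref
+ * sequence number.
+ */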
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
                                        struct btrfs_path *path);
index 634608d..9b9b15f 100644 (file)
@@ -51,6 +51,9 @@ struct btrfs_inode {
        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;
 
+       /* held while doing delalloc reservations */
+       struct mutex delalloc_mutex;
+
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644 (file)
index 0000000..ad0b3ba
--- /dev/null
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases in which the btrfs kernel
+ * code executes write requests to the disk that would bring the
+ * file system into an inconsistent state. In such a state, a power
+ * loss or kernel panic would cause the data on disk to be lost or
+ * at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified, and an error is printed when one of them is
+ * violated:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks either were present when the
+ *        file system was mounted (i.e., they were referenced
+ *        by the super block), or they have been written since
+ *        then, their write completion callback was called, and
+ *        a FLUSH request to the device where these blocks are
+ *        located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number equal to the parent's generation.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled in by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. Unless you
+ * are a btrfs developer who needs to verify the integrity of
+ * (super)-block write requests, do not enable the config
+ * option BTRFS_FS_CHECK_INTEGRITY that includes and compiles
+ * the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
+                                                        * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE                    0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION                0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE                 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE                        0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH                       0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH                       0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE                             0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE                                0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE                                0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES                   0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE                    0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES                          0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS               0x00001000
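+
+/*
+ * For example, the print mask 263 (0x107) used in the header comment
+ * above enables SUPERBLOCK_WRITE (0x01), ROOT_CHUNK_LOG_TREE_LOCATION
+ * (0x02), TREE_AFTER_SB_WRITE (0x04) and INITIAL_TREE (0x100).
+ */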
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+       u32 magic_num;          /* only used for debug purposes */
+       unsigned int is_metadata:1;     /* if it is metadata, not file data */
+       unsigned int is_superblock:1;   /* if it is one of the superblocks */
+       unsigned int is_iodone:1;       /* if is done by lower subsystem */
+       unsigned int iodone_w_error:1;  /* error was indicated to endio */
+       unsigned int never_written:1;   /* block was added because it was
+                                        * referenced, not because it was
+                                        * written */
+       unsigned int mirror_num:2;      /* large enough to hold
+                                        * BTRFS_SUPER_MIRROR_MAX */
+       struct btrfsic_dev_state *dev_state;
+       u64 dev_bytenr;         /* key, physical byte num on disk */
+       u64 logical_bytenr;     /* logical byte num on disk */
+       u64 generation;
+       struct btrfs_disk_key disk_key; /* extra info to print in case of
+                                        * issues, will not always be correct */
+       struct list_head collision_resolving_node;      /* list node */
+       struct list_head all_blocks_node;       /* list node */
+
+       /* the following two lists contain block_link items */
+       struct list_head ref_to_list;   /* list */
+       struct list_head ref_from_list; /* list */
+       struct btrfsic_block *next_in_same_bio;
+       void *orig_bio_bh_private;
+       union {
+               bio_end_io_t *bio;
+               bh_end_io_t *bh;
+       } orig_bio_bh_end_io;
+       int submit_bio_bh_rw;
+       u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and are required
+ * because each block object can refer to, and can be referred to by,
+ * multiple other blocks. The key used to look them up in the
+ * hashtable is the dev_bytenr of the referred-to block combined with
+ * the dev_bytenr of the referring block. The fact that they are
+ * searchable via a hashtable and that a ref_cnt is maintained is not
+ * required for the btrfs integrity check algorithm itself; it is
+ * only used to make the output more readable in case an error is
+ * detected (an error is defined as a write operation to a block
+ * while that block is still referenced).
+ */
+struct btrfsic_block_link {
+       u32 magic_num;          /* only used for debug purposes */
+       u32 ref_cnt;
+       struct list_head node_ref_to;   /* list node */
+       struct list_head node_ref_from; /* list node */
+       struct list_head collision_resolving_node;      /* list node */
+       struct btrfsic_block *block_ref_to;
+       struct btrfsic_block *block_ref_from;
+       u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+       u32 magic_num;          /* only used for debug purposes */
+       struct block_device *bdev;
+       struct btrfsic_state *state;
+       struct list_head collision_resolving_node;      /* list node */
+       struct btrfsic_block dummy_block_for_bio_bh_flush;
+       u64 last_flush_gen;
+       char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+       struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+       u64 start;              /* virtual bytenr */
+       u64 dev_bytenr;         /* physical bytenr on device */
+       u32 len;
+       struct btrfsic_dev_state *dev;
+       char *data;
+       struct buffer_head *bh; /* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space; refer to btrfsic_process_metablock(). */
+struct btrfsic_stack_frame {
+       u32 magic;
+       u32 nr;
+       int error;
+       int i;
+       int limit_nesting;
+       int num_copies;
+       int mirror_num;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx *block_ctx;
+       struct btrfsic_block *next_block;
+       struct btrfsic_block_data_ctx next_block_ctx;
+       struct btrfs_header *hdr;
+       struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+       u32 print_mask;
+       int include_extent_data;
+       int csum_size;
+       struct list_head all_blocks_list;
+       struct btrfsic_block_hashtable block_hashtable;
+       struct btrfsic_block_link_hashtable block_link_hashtable;
+       struct btrfs_root *root;
+       u64 max_superblock_generation;
+       struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+                                    struct btrfsic_block *block,
+                                    struct btrfsic_block_data_ctx *block_ctx,
+                                    struct btrfs_header *hdr,
+                                    int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx
+               *block_ctx, u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+                                     struct btrfsic_block *block,
+                                     struct btrfsic_block_data_ctx *block_ctx,
+                                     u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr, u8 *mapped_data,
+                                         unsigned int len, struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const block,
+               struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+                                             const struct btrfsic_block *block,
+                                             int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+       b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+       b->dev_state = NULL;
+       b->dev_bytenr = 0;
+       b->logical_bytenr = 0;
+       b->generation = BTRFSIC_GENERATION_UNKNOWN;
+       b->disk_key.objectid = 0;
+       b->disk_key.type = 0;
+       b->disk_key.offset = 0;
+       b->is_metadata = 0;
+       b->is_superblock = 0;
+       b->is_iodone = 0;
+       b->iodone_w_error = 0;
+       b->never_written = 0;
+       b->mirror_num = 0;
+       b->next_in_same_bio = NULL;
+       b->orig_bio_bh_private = NULL;
+       b->orig_bio_bh_end_io.bio = NULL;
+       INIT_LIST_HEAD(&b->collision_resolving_node);
+       INIT_LIST_HEAD(&b->all_blocks_node);
+       INIT_LIST_HEAD(&b->ref_to_list);
+       INIT_LIST_HEAD(&b->ref_from_list);
+       b->submit_bio_bh_rw = 0;
+       b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+       struct btrfsic_block *b;
+
+       b = kzalloc(sizeof(*b), GFP_NOFS);
+       if (NULL != b)
+               btrfsic_block_init(b);
+
+       return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+       BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+       kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+       l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+       l->ref_cnt = 1;
+       INIT_LIST_HEAD(&l->node_ref_to);
+       INIT_LIST_HEAD(&l->node_ref_from);
+       INIT_LIST_HEAD(&l->collision_resolving_node);
+       l->block_ref_to = NULL;
+       l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+       struct btrfsic_block_link *l;
+
+       l = kzalloc(sizeof(*l), GFP_NOFS);
+       if (NULL != l)
+               btrfsic_block_link_init(l);
+
+       return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+       BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+       kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+       ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+       ds->bdev = NULL;
+       ds->state = NULL;
+       ds->name[0] = '\0';
+       INIT_LIST_HEAD(&ds->collision_resolving_node);
+       ds->last_flush_gen = 0;
+       btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+       ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+       ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+       struct btrfsic_dev_state *ds;
+
+       ds = kzalloc(sizeof(*ds), GFP_NOFS);
+       if (NULL != ds)
+               btrfsic_dev_state_init(ds);
+
+       return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+       BUG_ON(!(NULL == ds ||
+                BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+       kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
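+/*
+ * Blocks are hashed by XOR-ing the upper bits of their device byte
+ * number with the block_device pointer value, masked to the table
+ * size; collisions are resolved by chaining on
+ * collision_resolving_node.
+ */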
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(b->dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+            (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+       list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+       list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)bdev))) &
+            (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_block *const b =
+                   list_entry(elem, struct btrfsic_block,
+                              collision_resolving_node);
+
+               if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+                       return b;
+       }
+
+       return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+            ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+            & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+       BUG_ON(NULL == l->block_ref_to);
+       BUG_ON(NULL == l->block_ref_from);
+       list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+       list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+            ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+            ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+            ((unsigned int)((uintptr_t)bdev_ref_from))) &
+            (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_block_link *const l =
+                   list_entry(elem, struct btrfsic_block_link,
+                              collision_resolving_node);
+
+               BUG_ON(NULL == l->block_ref_to);
+               BUG_ON(NULL == l->block_ref_from);
+               if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+                   l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+                   l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+                   l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+                       return l;
+       }
+
+       return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)((uintptr_t)ds->bdev)) &
+            (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+       list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+       list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)((uintptr_t)bdev)) &
+            (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_dev_state *const ds =
+                   list_entry(elem, struct btrfsic_dev_state,
+                              collision_resolving_node);
+
+               if (ds->bdev == bdev)
+                       return ds;
+       }
+
+       return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices)
+{
+       int ret;
+       struct btrfs_super_block *selected_super;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+       struct btrfsic_dev_state *selected_dev_state = NULL;
+       int pass;
+
+       BUG_ON(NULL == state);
+       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       if (NULL == selected_super) {
+               printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+               return -1;
+       }
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               int i;
+               struct btrfsic_dev_state *dev_state;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               dev_state = btrfsic_dev_state_lookup(device->bdev);
+               BUG_ON(NULL == dev_state);
+               for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+                       ret = btrfsic_process_superblock_dev_mirror(
+                                       state, dev_state, device, i,
+                                       &selected_dev_state, selected_super);
+                       if (0 != ret && 0 == i) {
+                               kfree(selected_super);
+                               return ret;
+                       }
+               }
+       }
+
+       if (NULL == state->latest_superblock) {
+               printk(KERN_INFO "btrfsic: no superblock found!\n");
+               kfree(selected_super);
+               return -1;
+       }
+
+       state->csum_size = btrfs_super_csum_size(selected_super);
+
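+       /*
+        * Walk the three trees the selected superblock points at
+        * directly: pass 0 is the root tree, pass 1 the chunk tree and
+        * pass 2 the log tree (skipped when no log root is recorded).
+        */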
+       for (pass = 0; pass < 3; pass++) {
+               int num_copies;
+               int mirror_num;
+               u64 next_bytenr;
+
+               switch (pass) {
+               case 0:
+                       next_bytenr = btrfs_super_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 1:
+                       next_bytenr = btrfs_super_chunk_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 2:
+                       next_bytenr = btrfs_super_log_root(selected_super);
+                       if (0 == next_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+
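+               /* process every on-disk copy (mirror) of this tree root */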
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+                       struct btrfs_header *hdr;
+
+                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                               &tmp_next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO "btrfsic:"
+                                      " btrfsic_map_block(root @%llu,"
+                                      " mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               kfree(selected_super);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       &state->block_hashtable);
+                       BUG_ON(NULL == next_block);
+
+                       l = btrfsic_block_link_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       state->latest_superblock->dev_state->
+                                       bdev,
+                                       state->latest_superblock->dev_bytenr,
+                                       &state->block_link_hashtable);
+                       BUG_ON(NULL == l);
+
+                       ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: read @logical %llu failed!\n",
+                                      (unsigned long long)
+                                      tmp_next_block_ctx.start);
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               kfree(selected_super);
+                               return -1;
+                       }
+
+                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+                       ret = btrfsic_process_metablock(state,
+                                                       next_block,
+                                                       &tmp_next_block_ctx,
+                                                       hdr,
+                                                       BTRFS_MAX_LEVEL + 3, 1);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+               }
+       }
+
+       kfree(selected_super);
+       return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super)
+{
+       struct btrfs_super_block *super_tmp;
+       u64 dev_bytenr;
+       struct buffer_head *bh;
+       struct btrfsic_block *superblock_tmp;
+       int pass;
+       struct block_device *const superblock_bdev = device->bdev;
+
+       /* super block bytenr is always the unmapped device bytenr */
+       dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
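+       /* __bread() works in units of whole 4096 byte blocks; the low
+        * bits of dev_bytenr locate the superblock within the block
+        * that is read. */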
+       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (NULL == bh)
+               return -1;
+       super_tmp = (struct btrfs_super_block *)
+           (bh->b_data + (dev_bytenr & 4095));
+
+       if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+           strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+                   sizeof(super_tmp->magic)) ||
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+               brelse(bh);
+               return 0;
+       }
+
+       superblock_tmp =
+           btrfsic_block_hashtable_lookup(superblock_bdev,
+                                          dev_bytenr,
+                                          &state->block_hashtable);
+       if (NULL == superblock_tmp) {
+               superblock_tmp = btrfsic_block_alloc();
+               if (NULL == superblock_tmp) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       brelse(bh);
+                       return -1;
+               }
+               /* for superblock, only the dev_bytenr makes sense */
+               superblock_tmp->dev_bytenr = dev_bytenr;
+               superblock_tmp->dev_state = dev_state;
+               superblock_tmp->logical_bytenr = dev_bytenr;
+               superblock_tmp->generation = btrfs_super_generation(super_tmp);
+               superblock_tmp->is_metadata = 1;
+               superblock_tmp->is_superblock = 1;
+               superblock_tmp->is_iodone = 1;
+               superblock_tmp->never_written = 0;
+               superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+                              " @%llu (%s/%llu/%d)\n",
+                              superblock_bdev, device->name,
+                              (unsigned long long)dev_bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              superblock_mirror_num);
+               list_add(&superblock_tmp->all_blocks_node,
+                        &state->all_blocks_list);
+               btrfsic_block_hashtable_add(superblock_tmp,
+                                           &state->block_hashtable);
+       }
+
+       /* select the one with the highest generation field */
+       if (btrfs_super_generation(super_tmp) >
+           state->max_superblock_generation ||
+           0 == state->max_superblock_generation) {
+               memcpy(selected_super, super_tmp, sizeof(*selected_super));
+               *selected_dev_state = dev_state;
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_tmp);
+               state->latest_superblock = superblock_tmp;
+       }
+
+       for (pass = 0; pass < 3; pass++) {
+               u64 next_bytenr;
+               int num_copies;
+               int mirror_num;
+               const char *additional_string = NULL;
+               struct btrfs_disk_key tmp_disk_key;
+
+               tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+               tmp_disk_key.offset = 0;
+               switch (pass) {
+               case 0:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       additional_string = "initial root ";
+                       next_bytenr = btrfs_super_root(super_tmp);
+                       break;
+               case 1:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       additional_string = "initial chunk ";
+                       next_bytenr = btrfs_super_chunk_root(super_tmp);
+                       break;
+               case 2:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       additional_string = "initial log ";
+                       next_bytenr = btrfs_super_log_root(super_tmp);
+                       if (0 == next_bytenr)
+                               continue;
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+
+                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                             &tmp_next_block_ctx,
+                                             mirror_num)) {
+                               printk(KERN_INFO "btrfsic: btrfsic_map_block("
+                                      "bytenr @%llu, mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       additional_string, 1, 1, 0,
+                                       mirror_num, NULL);
+                       if (NULL == next_block) {
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block->disk_key = tmp_disk_key;
+                       next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       l = btrfsic_block_link_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       next_block, superblock_tmp,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                       if (NULL == l) {
+                               brelse(bh);
+                               return -1;
+                       }
+               }
+       }
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+               btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+       brelse(bh);
+       return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+       struct btrfsic_stack_frame *sf;
+
+       sf = kzalloc(sizeof(*sf), GFP_NOFS);
+       if (NULL == sf)
+               printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+       else
+               sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+       return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+       BUG_ON(!(NULL == sf ||
+                BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+       kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const first_block,
+               struct btrfsic_block_data_ctx *const first_block_ctx,
+               struct btrfs_header *const first_hdr,
+               int first_limit_nesting, int force_iodone_flag)
+{
+       struct btrfsic_stack_frame initial_stack_frame = { 0 };
+       struct btrfsic_stack_frame *sf;
+       struct btrfsic_stack_frame *next_stack;
+
+       sf = &initial_stack_frame;
+       sf->error = 0;
+       sf->i = -1;
+       sf->limit_nesting = first_limit_nesting;
+       sf->block = first_block;
+       sf->block_ctx = first_block_ctx;
+       sf->next_block = NULL;
+       sf->hdr = first_hdr;
+       sf->prev = NULL;
+
+continue_with_new_stack_frame:
+       sf->block->generation = le64_to_cpu(sf->hdr->generation);
+       if (0 == sf->hdr->level) {
+               struct btrfs_leaf *const leafhdr =
+                   (struct btrfs_leaf *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "leaf %llu items %d generation %llu"
+                                      " owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.owner));
+               }
+
+continue_with_current_leaf_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
+                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       u8 type;
+                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+                       type = disk_key->type;
+
+                       if (BTRFS_ROOT_ITEM_KEY == type) {
+                               const struct btrfs_root_item *const root_item =
+                                   (struct btrfs_root_item *)
+                                   (sf->block_ctx->data +
+                                    offsetof(struct btrfs_leaf, items) +
+                                    item_offset);
+                               const u64 next_bytenr =
+                                   le64_to_cpu(root_item->bytenr);
+
+                               sf->error =
+                                   btrfsic_create_link_to_next_block(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               next_bytenr,
+                                               sf->limit_nesting,
+                                               &sf->next_block_ctx,
+                                               &sf->next_block,
+                                               force_iodone_flag,
+                                               &sf->num_copies,
+                                               &sf->mirror_num,
+                                               disk_key,
+                                               le64_to_cpu(root_item->
+                                               generation));
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+
+                               if (NULL != sf->next_block) {
+                                       struct btrfs_header *const next_hdr =
+                                           (struct btrfs_header *)
+                                           sf->next_block_ctx.data;
+
+                                       next_stack =
+                                           btrfsic_stack_frame_alloc();
+                                       if (NULL == next_stack) {
+                                               btrfsic_release_block_ctx(
+                                                               &sf->
+                                                               next_block_ctx);
+                                               goto one_stack_frame_backwards;
+                                       }
+
+                                       next_stack->i = -1;
+                                       next_stack->block = sf->next_block;
+                                       next_stack->block_ctx =
+                                           &sf->next_block_ctx;
+                                       next_stack->next_block = NULL;
+                                       next_stack->hdr = next_hdr;
+                                       next_stack->limit_nesting =
+                                           sf->limit_nesting - 1;
+                                       next_stack->prev = sf;
+                                       sf = next_stack;
+                                       goto continue_with_new_stack_frame;
+                               }
+                       } else if (BTRFS_EXTENT_DATA_KEY == type &&
+                                  state->include_extent_data) {
+                               sf->error = btrfsic_handle_extent_data(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               item_offset,
+                                               force_iodone_flag);
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+                       }
+
+                       goto continue_with_current_leaf_stack_frame;
+               }
+       } else {
+               struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "node %llu level %d items %d"
+                                      " generation %llu owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      nodehdr->header.level, sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.owner));
+               }
+
+continue_with_current_node_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_key_ptr *disk_key_ptr =
+                           nodehdr->ptrs + sf->i;
+                       const u64 next_bytenr =
+                           le64_to_cpu(disk_key_ptr->blockptr);
+
+                       sf->error = btrfsic_create_link_to_next_block(
+                                       state,
+                                       sf->block,
+                                       sf->block_ctx,
+                                       next_bytenr,
+                                       sf->limit_nesting,
+                                       &sf->next_block_ctx,
+                                       &sf->next_block,
+                                       force_iodone_flag,
+                                       &sf->num_copies,
+                                       &sf->mirror_num,
+                                       &disk_key_ptr->key,
+                                       le64_to_cpu(disk_key_ptr->generation));
+                       if (sf->error)
+                               goto one_stack_frame_backwards;
+
+                       if (NULL != sf->next_block) {
+                               struct btrfs_header *const next_hdr =
+                                   (struct btrfs_header *)
+                                   sf->next_block_ctx.data;
+
+                               next_stack = btrfsic_stack_frame_alloc();
+                               if (NULL == next_stack)
+                                       goto one_stack_frame_backwards;
+
+                               next_stack->i = -1;
+                               next_stack->block = sf->next_block;
+                               next_stack->block_ctx = &sf->next_block_ctx;
+                               next_stack->next_block = NULL;
+                               next_stack->hdr = next_hdr;
+                               next_stack->limit_nesting =
+                                   sf->limit_nesting - 1;
+                               next_stack->prev = sf;
+                               sf = next_stack;
+                               goto continue_with_new_stack_frame;
+                       }
+
+                       goto continue_with_current_node_stack_frame;
+               }
+       }
+
+one_stack_frame_backwards:
+       if (NULL != sf->prev) {
+               struct btrfsic_stack_frame *const prev = sf->prev;
+
+               /* the one for the initial block is freed in the caller */
+               btrfsic_release_block_ctx(sf->block_ctx);
+
+               if (sf->error) {
+                       prev->error = sf->error;
+                       btrfsic_stack_frame_free(sf);
+                       sf = prev;
+                       goto one_stack_frame_backwards;
+               }
+
+               btrfsic_stack_frame_free(sf);
+               sf = prev;
+               goto continue_with_new_stack_frame;
+       } else {
+               BUG_ON(&initial_stack_frame != sf);
+       }
+
+       return sf->error;
+}
+
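+/*
+ * Look up or create the block at next_bytenr for the mirror selected
+ * by *mirror_nump, and look up or create the link from the referencing
+ * block to it. The child block is read and *next_blockp is set (which
+ * makes the caller descend into it) only when the link is new and
+ * limit_nesting is still positive. *mirror_nump is incremented so that
+ * the caller revisits the same item for the remaining mirrors.
+ */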
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation)
+{
+       struct btrfsic_block *next_block = NULL;
+       int ret;
+       struct btrfsic_block_link *l;
+       int did_alloc_block_link;
+       int block_was_created;
+
+       *next_blockp = NULL;
+       if (0 == *num_copiesp) {
+               *num_copiesp =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, *num_copiesp);
+               *mirror_nump = 1;
+       }
+
+       if (*mirror_nump > *num_copiesp)
+               return 0;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+               printk(KERN_INFO
+                      "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+                      *mirror_nump);
+       ret = btrfsic_map_block(state, next_bytenr,
+                               BTRFSIC_BLOCK_SIZE,
+                               next_block_ctx, *mirror_nump);
+       if (ret) {
+               printk(KERN_INFO
+                      "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+                      (unsigned long long)next_bytenr, *mirror_nump);
+               btrfsic_release_block_ctx(next_block_ctx);
+               *next_blockp = NULL;
+               return -1;
+       }
+
+       next_block = btrfsic_block_lookup_or_add(state,
+                                                next_block_ctx, "referenced ",
+                                                1, force_iodone_flag,
+                                                !force_iodone_flag,
+                                                *mirror_nump,
+                                                &block_was_created);
+       if (NULL == next_block) {
+               btrfsic_release_block_ctx(next_block_ctx);
+               *next_blockp = NULL;
+               return -1;
+       }
+       if (block_was_created) {
+               l = NULL;
+               next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+       } else {
+               if (next_block->logical_bytenr != next_bytenr &&
+                   !(!next_block->is_metadata &&
+                     0 == next_block->logical_bytenr)) {
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c,"
+                              " bytenr mismatch (!= stored %llu).\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, next_block),
+                              (unsigned long long)next_block->logical_bytenr);
+               } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c.\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, next_block));
+               next_block->logical_bytenr = next_bytenr;
+
+               next_block->mirror_num = *mirror_nump;
+               l = btrfsic_block_link_hashtable_lookup(
+                               next_block_ctx->dev->bdev,
+                               next_block_ctx->dev_bytenr,
+                               block_ctx->dev->bdev,
+                               block_ctx->dev_bytenr,
+                               &state->block_link_hashtable);
+       }
+
+       next_block->disk_key = *disk_key;
+       if (NULL == l) {
+               l = btrfsic_block_link_alloc();
+               if (NULL == l) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       btrfsic_release_block_ctx(next_block_ctx);
+                       *next_blockp = NULL;
+                       return -1;
+               }
+
+               did_alloc_block_link = 1;
+               l->block_ref_to = next_block;
+               l->block_ref_from = block;
+               l->ref_cnt = 1;
+               l->parent_generation = parent_generation;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+
+               list_add(&l->node_ref_to, &block->ref_to_list);
+               list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+               btrfsic_block_link_hashtable_add(l,
+                                                &state->block_link_hashtable);
+       } else {
+               did_alloc_block_link = 0;
+               if (0 == limit_nesting) {
+                       l->ref_cnt++;
+                       l->parent_generation = parent_generation;
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_add_link(state, l);
+               }
+       }
+
+       if (limit_nesting > 0 && did_alloc_block_link) {
+               ret = btrfsic_read_block(state, next_block_ctx);
+               if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                       printk(KERN_INFO
+                              "btrfsic: read block @logical %llu failed!\n",
+                              (unsigned long long)next_bytenr);
+                       btrfsic_release_block_ctx(next_block_ctx);
+                       *next_blockp = NULL;
+                       return -1;
+               }
+
+               *next_blockp = next_block;
+       } else {
+               *next_blockp = NULL;
+       }
+       (*mirror_nump)++;
+
+       return 0;
+}
+
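+/*
+ * Handle a file extent item found in a leaf: for a regular extent,
+ * walk the referenced data in BTRFSIC_BLOCK_SIZE chunks and, for every
+ * mirror of every chunk, register the data block and a link from the
+ * metadata block that references it. Non-regular extents and holes
+ * (disk_bytenr == 0) are ignored.
+ */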
+static int btrfsic_handle_extent_data(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u32 item_offset, int force_iodone_flag)
+{
+       int ret;
+       struct btrfs_file_extent_item *file_extent_item =
+           (struct btrfs_file_extent_item *)(block_ctx->data +
+                                             offsetof(struct btrfs_leaf,
+                                                      items) + item_offset);
+       u64 next_bytenr =
+           le64_to_cpu(file_extent_item->disk_bytenr) +
+           le64_to_cpu(file_extent_item->offset);
+       u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
+       u64 generation = le64_to_cpu(file_extent_item->generation);
+       struct btrfsic_block_link *l;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+               printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+                      " offset = %llu, num_bytes = %llu\n",
+                      file_extent_item->type,
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->disk_bytenr),
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->offset),
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->num_bytes));
+       if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
+           ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
+               return 0;
+       while (num_bytes > 0) {
+               u32 chunk_len;
+               int num_copies;
+               int mirror_num;
+
+               if (num_bytes > BTRFSIC_BLOCK_SIZE)
+                       chunk_len = BTRFSIC_BLOCK_SIZE;
+               else
+                       chunk_len = num_bytes;
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block_data_ctx next_block_ctx;
+                       struct btrfsic_block *next_block;
+                       int block_was_created;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "btrfsic_handle_extent_data("
+                                      "mirror_num=%d)\n", mirror_num);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                               printk(KERN_INFO
+                                      "\tdisk_bytenr = %llu, num_bytes %u\n",
+                                      (unsigned long long)next_bytenr,
+                                      chunk_len);
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               chunk_len, &next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state,
+                                       &next_block_ctx,
+                                       "referenced ",
+                                       0,
+                                       force_iodone_flag,
+                                       !force_iodone_flag,
+                                       mirror_num,
+                                       &block_was_created);
+                       if (NULL == next_block) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&next_block_ctx);
+                               return -1;
+                       }
+                       if (!block_was_created) {
+                               if (next_block->logical_bytenr != next_bytenr &&
+                                   !(!next_block->is_metadata &&
+                                     0 == next_block->logical_bytenr)) {
+                                       printk(KERN_INFO
+                                              "Referenced block"
+                                              " @%llu (%s/%llu/%d)"
+                                              " found in hash table, D,"
+                                              " bytenr mismatch"
+                                              " (!= stored %llu).\n",
+                                              (unsigned long long)next_bytenr,
+                                              next_block_ctx.dev->name,
+                                              (unsigned long long)
+                                              next_block_ctx.dev_bytenr,
+                                              mirror_num,
+                                              (unsigned long long)
+                                              next_block->logical_bytenr);
+                               }
+                               next_block->logical_bytenr = next_bytenr;
+                               next_block->mirror_num = mirror_num;
+                       }
+
+                       l = btrfsic_block_link_lookup_or_add(state,
+                                                            &next_block_ctx,
+                                                            next_block, block,
+                                                            generation);
+                       btrfsic_release_block_ctx(&next_block_ctx);
+                       if (NULL == l)
+                               return -1;
+               }
+
+               next_bytenr += chunk_len;
+               num_bytes -= chunk_len;
+       }
+
+       return 0;
+}
+
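+/*
+ * Translate a logical bytenr into the device and physical offset of
+ * the given mirror and fill in a block data context for it. No data
+ * is read here; that is done by btrfsic_read_block().
+ */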
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num)
+{
+       int ret;
+       u64 length;
+       struct btrfs_bio *multi = NULL;
+       struct btrfs_device *device;
+
+       length = len;
+       ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+                             bytenr, &length, &multi, mirror_num);
+       if (ret) {
+               printk(KERN_INFO
+                      "btrfsic: btrfs_map_block(logical @%llu) failed!\n",
+                      (unsigned long long)bytenr);
+               return ret;
+       }
+
+       device = multi->stripes[0].dev;
+       block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+       block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+       block_ctx_out->start = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->data = NULL;
+       block_ctx_out->bh = NULL;
+
+       kfree(multi);
+       if (NULL == block_ctx_out->dev) {
+               ret = -ENXIO;
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+       }
+
+       return ret;
+}
+
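+/*
+ * Like btrfsic_map_block(), but for superblock copies the mapping is
+ * trivial: the logical and device bytenr are identical and the block
+ * device is already known.
+ */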
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out)
+{
+       block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+       block_ctx_out->dev_bytenr = bytenr;
+       block_ctx_out->start = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->data = NULL;
+       block_ctx_out->bh = NULL;
+       if (NULL != block_ctx_out->dev) {
+               return 0;
+       } else {
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+               return -ENXIO;
+       }
+}
+
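+/* Release the buffer head (if any) that backs a block data context. */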
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+       if (NULL != block_ctx->bh) {
+               brelse(block_ctx->bh);
+               block_ctx->bh = NULL;
+       }
+}
+
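+/*
+ * Read the single 4096 byte, 4096 byte aligned block behind a block
+ * data context through a buffer head and point ->data at its contents.
+ * Returns block_ctx->len on success, -1 on error.
+ */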
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx)
+{
+       block_ctx->bh = NULL;
+       if (block_ctx->dev_bytenr & 4095) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with unaligned bytenr %llu\n",
+                      (unsigned long long)block_ctx->dev_bytenr);
+               return -1;
+       }
+       if (block_ctx->len > 4096) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with too large size %d\n",
+                      block_ctx->len);
+               return -1;
+       }
+
+       block_ctx->bh = __bread(block_ctx->dev->bdev,
+                               block_ctx->dev_bytenr >> 12, 4096);
+       if (NULL == block_ctx->bh)
+               return -1;
+       block_ctx->data = block_ctx->bh->b_data;
+
+       return block_ctx->len;
+}
+
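+/*
+ * Dump the complete state database: every known block together with
+ * all of its ref-to and ref-from links.
+ */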
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+       struct list_head *elem_all;
+
+       BUG_ON(NULL == state);
+
+       printk(KERN_INFO "all_blocks_list:\n");
+       list_for_each(elem_all, &state->all_blocks_list) {
+               const struct btrfsic_block *const b_all =
+                   list_entry(elem_all, struct btrfsic_block,
+                              all_blocks_node);
+               struct list_head *elem_ref_to;
+               struct list_head *elem_ref_from;
+
+               printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+                      btrfsic_get_block_type(state, b_all),
+                      (unsigned long long)b_all->logical_bytenr,
+                      b_all->dev_state->name,
+                      (unsigned long long)b_all->dev_bytenr,
+                      b_all->mirror_num);
+
+               list_for_each(elem_ref_to, &b_all->ref_to_list) {
+                       const struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " refers %u* to"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+               }
+
+               list_for_each(elem_ref_from, &b_all->ref_from_list) {
+                       const struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_from,
+                                      struct btrfsic_block_link,
+                                      node_ref_from);
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_from),
+                              (unsigned long long)
+                              l->block_ref_from->logical_bytenr,
+                              l->block_ref_from->dev_state->name,
+                              (unsigned long long)
+                              l->block_ref_from->dev_bytenr,
+                              l->block_ref_from->mirror_num);
+               }
+
+               printk(KERN_INFO "\n");
+       }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size)
+{
+       struct btrfs_header *h;
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       int fail = 0;
+       int crc_fail = 0;
+
+       h = (struct btrfs_header *)data;
+
+       if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+               fail++;
+
+       crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, csum);
+       if (memcmp(csum, h->csum, state->csum_size))
+               crc_fail++;
+
+       return fail || crc_fail;
+}
+
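+/*
+ * Called for every block that is about to be written. If the block is
+ * already known, warn about unsafe overwrites (blocks still referenced
+ * by the most recent superblock, or whose previous write has not
+ * completed yet) and drop its old reference links. In either case the
+ * bio/bh end_io handler is redirected to the btrfsic one so that IO
+ * completion can be recorded, and metadata content is (re)processed
+ * via btrfsic_process_metablock() or
+ * btrfsic_process_written_superblock().
+ */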
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr,
+                                         u8 *mapped_data, unsigned int len,
+                                         struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw)
+{
+       int is_metadata;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx block_ctx;
+       int ret;
+       struct btrfsic_state *state = dev_state->state;
+       struct block_device *bdev = dev_state->bdev;
+
+       WARN_ON(len > PAGE_SIZE);
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+       if (NULL != bio_is_patched)
+               *bio_is_patched = 0;
+
+       block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+                                              &state->block_hashtable);
+       if (NULL != block) {
+               u64 bytenr;
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               if (block->is_superblock) {
+                       bytenr = le64_to_cpu(((struct btrfs_super_block *)
+                                             mapped_data)->bytenr);
+                       is_metadata = 1;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+                               printk(KERN_INFO
+                                      "[before new superblock is written]:\n");
+                               btrfsic_dump_tree_sub(state, block, 0);
+                       }
+               }
+               if (is_metadata) {
+                       if (!block->is_superblock) {
+                               bytenr = le64_to_cpu(((struct btrfs_header *)
+                                                     mapped_data)->bytenr);
+                               btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+                                                              dev_state,
+                                                              dev_bytenr,
+                                                              mapped_data);
+                       }
+                       if (block->logical_bytenr != bytenr) {
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c,"
+                                      " bytenr mismatch"
+                                      " (!= stored %llu).\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block),
+                                      (unsigned long long)
+                                      block->logical_bytenr);
+                               block->logical_bytenr = bytenr;
+                       } else if (state->print_mask &
+                                  BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               } else {
+                       bytenr = block->logical_bytenr;
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               }
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "ref_to_list: %cE, ref_from_list: %cE\n",
+                              list_empty(&block->ref_to_list) ? ' ' : '!',
+                              list_empty(&block->ref_from_list) ? ' ' : '!');
+               if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), old(gen=%llu,"
+                              " objectid=%llu, type=%d, offset=%llu),"
+                              " new(gen=%llu),"
+                              " which is referenced by most recent superblock"
+                              " (superblockgen=%llu)!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.objectid),
+                              block->disk_key.type,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.offset),
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+                       btrfsic_dump_tree(state);
+               }
+
+               if (!block->is_iodone && !block->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation));
+                       /* it would not be safe to go on */
+                       btrfsic_dump_tree(state);
+                       return;
+               }
+
+               /*
+                * Clear all references of this block. Do not free
+                * the block itself even if it is not referenced anymore
+                * because it still carries valuable information
+                * like whether it was ever written and IO completed.
+                */
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &block->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+                       l->ref_cnt--;
+                       if (0 == l->ref_cnt) {
+                               list_del(&l->node_ref_to);
+                               list_del(&l->node_ref_from);
+                               btrfsic_block_link_hashtable_remove(l);
+                               btrfsic_block_link_free(l);
+                       }
+               }
+
+               if (block->is_superblock)
+                       ret = btrfsic_map_superblock(state, bytenr, len,
+                                                    bdev, &block_ctx);
+               else
+                       ret = btrfsic_map_block(state, bytenr, len,
+                                               &block_ctx, 0);
+               if (ret) {
+                       printk(KERN_INFO
+                              "btrfsic: btrfsic_map_block(root @%llu)"
+                              " failed!\n", (unsigned long long)bytenr);
+                       return;
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors:
+                * use the same device state that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               if (is_metadata || state->include_extent_data) {
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       if (NULL != bio) {
+                               block->is_iodone = 0;
+                               BUG_ON(NULL == bio_is_patched);
+                               if (!*bio_is_patched) {
+                                       block->orig_bio_bh_private =
+                                           bio->bi_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           bio->bi_end_io;
+                                       block->next_in_same_bio = NULL;
+                                       bio->bi_private = block;
+                                       bio->bi_end_io = btrfsic_bio_end_io;
+                                       *bio_is_patched = 1;
+                               } else {
+                                       struct btrfsic_block *chained_block =
+                                           (struct btrfsic_block *)
+                                           bio->bi_private;
+
+                                       BUG_ON(NULL == chained_block);
+                                       block->orig_bio_bh_private =
+                                           chained_block->orig_bio_bh_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           chained_block->orig_bio_bh_end_io.
+                                           bio;
+                                       block->next_in_same_bio = chained_block;
+                                       bio->bi_private = block;
+                               }
+                       } else if (NULL != bh) {
+                               block->is_iodone = 0;
+                               block->orig_bio_bh_private = bh->b_private;
+                               block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                               block->next_in_same_bio = NULL;
+                               bh->b_private = block;
+                               bh->b_end_io = btrfsic_bh_end_io;
+                       } else {
+                               block->is_iodone = 1;
+                               block->orig_bio_bh_private = NULL;
+                               block->orig_bio_bh_end_io.bio = NULL;
+                               block->next_in_same_bio = NULL;
+                       }
+               }
+
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (is_metadata) {
+                       block->logical_bytenr = bytenr;
+                       block->is_metadata = 1;
+                       if (block->is_superblock) {
+                               ret = btrfsic_process_written_superblock(
+                                               state,
+                                               block,
+                                               (struct btrfs_super_block *)
+                                               mapped_data);
+                               if (state->print_mask &
+                                   BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+                                       printk(KERN_INFO
+                                       "[after new superblock is written]:\n");
+                                       btrfsic_dump_tree_sub(state, block, 0);
+                               }
+                       } else {
+                               block->mirror_num = 0;  /* unknown */
+                               ret = btrfsic_process_metablock(
+                                               state,
+                                               block,
+                                               &block_ctx,
+                                               (struct btrfs_header *)
+                                               block_ctx.data,
+                                               0, 0);
+                       }
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_process_metablock"
+                                      "(root @%llu) failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               } else {
+                       block->is_metadata = 0;
+                       block->mirror_num = 0;  /* unknown */
+                       block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       if (!state->include_extent_data
+                           && list_empty(&block->ref_from_list)) {
+                               /*
+                                * disk block is overwritten with extent
+                                * data (not meta data) and we are configured
+                                * to not include extent data: take the
+                                * chance and free the block's memory
+                                */
+                               btrfsic_block_hashtable_remove(block);
+                               list_del(&block->all_blocks_node);
+                               btrfsic_block_free(block);
+                       }
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       } else {
+               /* block has not been found in hash table */
+               u64 bytenr;
+
+               if (!is_metadata) {
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "Written block (%s/%llu/?)"
+                                      " !found in hash table, D.\n",
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+                       if (!state->include_extent_data)
+                               return; /* ignore that written D block */
+
+                       /* this is getting ugly for the
+                        * include_extent_data case... */
+                       bytenr = 0;     /* unknown */
+                       block_ctx.start = bytenr;
+                       block_ctx.len = len;
+                       block_ctx.bh = NULL;
+               } else {
+                       bytenr = le64_to_cpu(((struct btrfs_header *)
+                                             mapped_data)->bytenr);
+                       btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+                                                      dev_bytenr,
+                                                      mapped_data);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/?)"
+                                      " !found in hash table, M.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+
+                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+                                               0);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+                               return;
+                       }
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors:
+                * use the same device state that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               block = btrfsic_block_alloc();
+               if (NULL == block) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       btrfsic_release_block_ctx(&block_ctx);
+                       return;
+               }
+               block->dev_state = dev_state;
+               block->dev_bytenr = dev_bytenr;
+               block->logical_bytenr = bytenr;
+               block->is_metadata = is_metadata;
+               block->never_written = 0;
+               block->iodone_w_error = 0;
+               block->mirror_num = 0;  /* unknown */
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (NULL != bio) {
+                       block->is_iodone = 0;
+                       BUG_ON(NULL == bio_is_patched);
+                       if (!*bio_is_patched) {
+                               block->orig_bio_bh_private = bio->bi_private;
+                               block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                               block->next_in_same_bio = NULL;
+                               bio->bi_private = block;
+                               bio->bi_end_io = btrfsic_bio_end_io;
+                               *bio_is_patched = 1;
+                       } else {
+                               struct btrfsic_block *chained_block =
+                                   (struct btrfsic_block *)
+                                   bio->bi_private;
+
+                               BUG_ON(NULL == chained_block);
+                               block->orig_bio_bh_private =
+                                   chained_block->orig_bio_bh_private;
+                               block->orig_bio_bh_end_io.bio =
+                                   chained_block->orig_bio_bh_end_io.bio;
+                               block->next_in_same_bio = chained_block;
+                               bio->bi_private = block;
+                       }
+               } else if (NULL != bh) {
+                       block->is_iodone = 0;
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               } else {
+                       block->is_iodone = 1;
+                       block->orig_bio_bh_private = NULL;
+                       block->orig_bio_bh_end_io.bio = NULL;
+                       block->next_in_same_bio = NULL;
+               }
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "New written %c-block @%llu (%s/%llu/%d)\n",
+                              is_metadata ? 'M' : 'D',
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+               list_add(&block->all_blocks_node, &state->all_blocks_list);
+               btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+               if (is_metadata) {
+                       ret = btrfsic_process_metablock(state, block,
+                                                       &block_ctx,
+                                                       (struct btrfs_header *)
+                                                       block_ctx.data, 0, 0);
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: process_metablock(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+}
+
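+/*
+ * Replacement bio end_io handler: restore the original private data
+ * and end_io, mark every block that was part of this bio as iodone
+ * (recording any IO error), bump the device's flush generation when a
+ * REQ_FLUSH completes, clear the flush generation of blocks written
+ * with REQ_FUA, and finally chain to the original handler.
+ */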
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+       struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+       int iodone_w_error;
+
+       /* mutex is not held! This is not safe if IO has not yet completed
+        * on umount */
+       iodone_w_error = 0;
+       if (bio_error_status)
+               iodone_w_error = 1;
+
+       BUG_ON(NULL == block);
+       bp->bi_private = block->orig_bio_bh_private;
+       bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+       do {
+               struct btrfsic_block *next_block;
+               struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+               if ((dev_state->state->print_mask &
+                    BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                       printk(KERN_INFO
+                              "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+                              bio_error_status,
+                              btrfsic_get_block_type(dev_state->state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+               next_block = block->next_in_same_bio;
+               block->iodone_w_error = iodone_w_error;
+               if (block->submit_bio_bh_rw & REQ_FLUSH) {
+                       dev_state->last_flush_gen++;
+                       if ((dev_state->state->print_mask &
+                            BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                               printk(KERN_INFO
+                                      "bio_end_io() new %s flush_gen=%llu\n",
+                                      dev_state->name,
+                                      (unsigned long long)
+                                      dev_state->last_flush_gen);
+               }
+               if (block->submit_bio_bh_rw & REQ_FUA)
+                       block->flush_gen = 0; /* FUA completed means block is
+                                              * on disk */
+               block->is_iodone = 1; /* for FLUSH, this releases the block */
+               block = next_block;
+       } while (NULL != block);
+
+       bp->bi_end_io(bp, bio_error_status);
+}
+
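+/* Buffer head counterpart of btrfsic_bio_end_io(). */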
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+       struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+       int iodone_w_error = !uptodate;
+       struct btrfsic_dev_state *dev_state;
+
+       BUG_ON(NULL == block);
+       dev_state = block->dev_state;
+       if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+               printk(KERN_INFO
+                      "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+                      iodone_w_error,
+                      btrfsic_get_block_type(dev_state->state, block),
+                      (unsigned long long)block->logical_bytenr,
+                      block->dev_state->name,
+                      (unsigned long long)block->dev_bytenr,
+                      block->mirror_num);
+
+       block->iodone_w_error = iodone_w_error;
+       if (block->submit_bio_bh_rw & REQ_FLUSH) {
+               dev_state->last_flush_gen++;
+               if ((dev_state->state->print_mask &
+                    BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                       printk(KERN_INFO
+                              "bh_end_io() new %s flush_gen=%llu\n",
+                              dev_state->name,
+                              (unsigned long long)dev_state->last_flush_gen);
+       }
+       if (block->submit_bio_bh_rw & REQ_FUA)
+               block->flush_gen = 0; /* FUA completed means block is on disk */
+
+       bh->b_private = block->orig_bio_bh_private;
+       bh->b_end_io = block->orig_bio_bh_end_io.bh;
+       block->is_iodone = 1; /* for FLUSH, this releases the block */
+       bh->b_end_io(bh, uptodate);
+}
+
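+/*
+ * A new superblock has been written: remember the highest superblock
+ * generation seen so far and, for the root, chunk and log tree roots
+ * it points to, (re)create the block and link entries for every
+ * mirror.
+ */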
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const superblock,
+               struct btrfs_super_block *const super_hdr)
+{
+       int pass;
+
+       superblock->generation = btrfs_super_generation(super_hdr);
+       if (!(superblock->generation > state->max_superblock_generation ||
+             0 == state->max_superblock_generation)) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO
+                              "btrfsic: superblock @%llu (%s/%llu/%d)"
+                              " with old gen %llu <= %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+       } else {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO
+                              "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+                              " with new gen %llu > %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_hdr);
+               state->latest_superblock = superblock;
+       }
+
+       for (pass = 0; pass < 3; pass++) {
+               int ret;
+               u64 next_bytenr;
+               struct btrfsic_block *next_block;
+               struct btrfsic_block_data_ctx tmp_next_block_ctx;
+               struct btrfsic_block_link *l;
+               int num_copies;
+               int mirror_num;
+               const char *additional_string = NULL;
+               struct btrfs_disk_key tmp_disk_key;
+
+               tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+               tmp_disk_key.offset = 0;
+
+               switch (pass) {
+               case 0:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       additional_string = "root ";
+                       next_bytenr = btrfs_super_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 1:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       additional_string = "chunk ";
+                       next_bytenr = btrfs_super_chunk_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 2:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       additional_string = "log ";
+                       next_bytenr = btrfs_super_log_root(super_hdr);
+                       if (0 == next_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       int was_created;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "btrfsic_process_written_superblock("
+                                      "mirror_num=%d)\n", mirror_num);
+                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                               &tmp_next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state,
+                                       &tmp_next_block_ctx,
+                                       additional_string,
+                                       1, 0, 1,
+                                       mirror_num,
+                                       &was_created);
+                       if (NULL == next_block) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               return -1;
+                       }
+
+                       next_block->disk_key = tmp_disk_key;
+                       if (was_created)
+                               next_block->generation =
+                                   BTRFSIC_GENERATION_UNKNOWN;
+                       l = btrfsic_block_link_lookup_or_add(
+                                       state,
+                                       &tmp_next_block_ctx,
+                                       next_block,
+                                       superblock,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                       if (NULL == l)
+                               return -1;
+               }
+       }
+
+       if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+               WARN_ON(1);
+               btrfsic_dump_tree(state);
+       }
+
+       return 0;
+}
+
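+/*
+ * Recursively walk all blocks that the given block refers to and
+ * complain if any referenced block was never written, is not yet
+ * iodone, carries a generation that does not match the generation
+ * recorded in the referencing link, or is still sitting in a disk
+ * write cache that has not been flushed since the block was written.
+ * Returns 0 if all referenced blocks are consistent, -1 otherwise.
+ */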
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level)
+{
+       struct list_head *elem_ref_to;
+       int ret = 0;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /*
+                * Note that this situation can happen and does not
+                * indicate an error in regular cases. It happens
+                * when disk blocks are freed and later reused.
+                * The check-integrity module is not aware of any
+                * block free operations, it just recognizes block
+                * write operations. Therefore it keeps the linkage
+                * information for a block until a block is
+                * rewritten. This can temporarily cause incorrect
+                * and even circular linkage information. This
+                * causes no harm unless such blocks are referenced
+                * by the most recent super block.
+                */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 1).\n");
+
+               return ret;
+       }
+
+       /*
+        * Recursion is acceptable here because the amount of stack space
+        * used per level is very small and the maximum recursion depth
+        * is limited.
+        */
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " %u* refers to %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+               if (l->block_ref_to->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is never written!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (!l->block_ref_to->is_iodone) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (l->parent_generation !=
+                          l->block_ref_to->generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->parent_generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->block_ref_to->generation) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " with generation %llu !="
+                              " parent generation %llu!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)l->block_ref_to->generation,
+                              (unsigned long long)l->parent_generation);
+                       ret = -1;
+               } else if (l->block_ref_to->flush_gen >
+                          l->block_ref_to->dev_state->last_flush_gen) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not flushed out of disk's write cache"
+                              " (block flush_gen=%llu,"
+                              " dev->flush_gen=%llu)!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)
+                              l->block_ref_to->flush_gen,
+                              (unsigned long long)
+                              l->block_ref_to->dev_state->last_flush_gen);
+                       ret = -1;
+               } else if (-1 == btrfsic_check_all_ref_blocks(state,
+                                                             l->block_ref_to,
+                                                             recursion_level +
+                                                             1)) {
+                       ret = -1;
+               }
+       }
+
+       return ret;
+}
+
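+/*
+ * Follow the ref_from links recursively and return 1 if the given
+ * block is (directly or transitively) referenced by the most recently
+ * written superblock, 0 otherwise.
+ */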
+static int btrfsic_is_block_ref_by_superblock(
+               const struct btrfsic_state *state,
+               const struct btrfsic_block *block,
+               int recursion_level)
+{
+       struct list_head *elem_ref_from;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /* refer to comment at "abort cyclic linkage (case 1)" */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 2).\n");
+
+               return 0;
+       }
+
+       /*
+        * Recursion is acceptable here because the amount of stack space
+        * used per level is very small and the maximum recursion depth
+        * is limited.
+        */
+       list_for_each(elem_ref_from, &block->ref_from_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_from, struct btrfsic_block_link,
+                              node_ref_from);
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_from),
+                              (unsigned long long)
+                              l->block_ref_from->logical_bytenr,
+                              l->block_ref_from->dev_state->name,
+                              (unsigned long long)
+                              l->block_ref_from->dev_bytenr,
+                              l->block_ref_from->mirror_num);
+               if (l->block_ref_from->is_superblock &&
+                   state->latest_superblock->dev_bytenr ==
+                   l->block_ref_from->dev_bytenr &&
+                   state->latest_superblock->dev_state->bdev ==
+                   l->block_ref_from->dev_state->bdev)
+                       return 1;
+               else if (btrfsic_is_block_ref_by_superblock(state,
+                                                           l->block_ref_from,
+                                                           recursion_level +
+                                                           1))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       printk(KERN_INFO
+              "Add %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, l->block_ref_from),
+              (unsigned long long)l->block_ref_from->logical_bytenr,
+              l->block_ref_from->dev_state->name,
+              (unsigned long long)l->block_ref_from->dev_bytenr,
+              l->block_ref_from->mirror_num,
+              btrfsic_get_block_type(state, l->block_ref_to),
+              (unsigned long long)l->block_ref_to->logical_bytenr,
+              l->block_ref_to->dev_state->name,
+              (unsigned long long)l->block_ref_to->dev_bytenr,
+              l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       printk(KERN_INFO
+              "Rem %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, l->block_ref_from),
+              (unsigned long long)l->block_ref_from->logical_bytenr,
+              l->block_ref_from->dev_state->name,
+              (unsigned long long)l->block_ref_from->dev_bytenr,
+              l->block_ref_from->mirror_num,
+              btrfsic_get_block_type(state, l->block_ref_to),
+              (unsigned long long)l->block_ref_to->logical_bytenr,
+              l->block_ref_to->dev_state->name,
+              (unsigned long long)l->block_ref_to->dev_bytenr,
+              l->block_ref_to->mirror_num);
+}
+
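+/*
+ * Map a block to the one-character type tag used in the messages
+ * above: 'S' is the most recently written superblock, 's' any other
+ * superblock copy, 'M' a metadata block and 'D' a data block.
+ */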
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block)
+{
+       if (block->is_superblock &&
+           state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+           state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+               return 'S';
+       else if (block->is_superblock)
+               return 's';
+       else if (block->is_metadata)
+               return 'M';
+       else
+               return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+       btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level)
+{
+       struct list_head *elem_ref_to;
+       int indent_add;
+       static char buf[80];
+       int cursor_position;
+
+       /*
+        * It would be better to fill an on-stack buffer with a complete
+        * line and dump it all at once when it is time to print the
+        * newline character.
+        */
+
+       /*
+        * Recursion is acceptable here because the amount of stack space
+        * used per level is very small and the maximum recursion depth
+        * is limited.
+        */
+       indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+                            btrfsic_get_block_type(state, block),
+                            (unsigned long long)block->logical_bytenr,
+                            block->dev_state->name,
+                            (unsigned long long)block->dev_bytenr,
+                            block->mirror_num);
+       if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+               printk("[...]\n");
+               return;
+       }
+       printk("%s", buf);
+       indent_level += indent_add;
+       if (list_empty(&block->ref_to_list)) {
+               printk("\n");
+               return;
+       }
+       if (block->mirror_num > 1 &&
+           !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+               printk(" [...]\n");
+               return;
+       }
+
+       cursor_position = indent_level;
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               while (cursor_position < indent_level) {
+                       printk(" ");
+                       cursor_position++;
+               }
+               if (l->ref_cnt > 1)
+                       indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+               else
+                       indent_add = sprintf(buf, " --> ");
+               if (indent_level + indent_add >
+                   BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+                       printk("[...]\n");
+                       cursor_position = 0;
+                       continue;
+               }
+
+               printk("%s", buf);
+
+               btrfsic_dump_tree_sub(state, l->block_ref_to,
+                                     indent_level + indent_add);
+               cursor_position = 0;
+       }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation)
+{
+       struct btrfsic_block_link *l;
+
+       l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+                                               next_block_ctx->dev_bytenr,
+                                               from_block->dev_state->bdev,
+                                               from_block->dev_bytenr,
+                                               &state->block_link_hashtable);
+       if (NULL == l) {
+               l = btrfsic_block_link_alloc();
+               if (NULL == l) {
+                       printk(KERN_INFO
+                              "btrfsic: error, kmalloc failed!\n");
+                       return NULL;
+               }
+
+               l->block_ref_to = next_block;
+               l->block_ref_from = from_block;
+               l->ref_cnt = 1;
+               l->parent_generation = parent_generation;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+
+               list_add(&l->node_ref_to, &from_block->ref_to_list);
+               list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+               btrfsic_block_link_hashtable_add(l,
+                                                &state->block_link_hashtable);
+       } else {
+               l->ref_cnt++;
+               l->parent_generation = parent_generation;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+       }
+
+       return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created)
+{
+       struct btrfsic_block *block;
+
+       block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+                                              block_ctx->dev_bytenr,
+                                              &state->block_hashtable);
+       if (NULL == block) {
+               struct btrfsic_dev_state *dev_state;
+
+               block = btrfsic_block_alloc();
+               if (NULL == block) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       return NULL;
+               }
+               dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+               if (NULL == dev_state) {
+                       printk(KERN_INFO
+                              "btrfsic: error, lookup dev_state failed!\n");
+                       btrfsic_block_free(block);
+                       return NULL;
+               }
+               block->dev_state = dev_state;
+               block->dev_bytenr = block_ctx->dev_bytenr;
+               block->logical_bytenr = block_ctx->start;
+               block->is_metadata = is_metadata;
+               block->is_iodone = is_iodone;
+               block->never_written = never_written;
+               block->mirror_num = mirror_num;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "New %s%c-block @%llu (%s/%llu/%d)\n",
+                              additional_string,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              mirror_num);
+               list_add(&block->all_blocks_node, &state->all_blocks_list);
+               btrfsic_block_hashtable_add(block, &state->block_hashtable);
+               if (NULL != was_created)
+                       *was_created = 1;
+       } else {
+               if (NULL != was_created)
+                       *was_created = 0;
+       }
+
+       return block;
+}
+
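+/*
+ * Verify that the logical bytenr stored inside a written block really
+ * maps (on some mirror) to the dev+physical bytenr that the block is
+ * being submitted to; print all mirror mappings and WARN otherwise.
+ */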
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data)
+{
+       int num_copies;
+       int mirror_num;
+       int ret;
+       struct btrfsic_block_data_ctx block_ctx;
+       int match = 0;
+
+       num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                     bytenr, PAGE_SIZE);
+
+       for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                       &block_ctx, mirror_num);
+               if (ret) {
+                       printk(KERN_INFO "btrfsic:"
+                              " btrfsic_map_block(logical @%llu,"
+                              " mirror %d) failed!\n",
+                              (unsigned long long)bytenr, mirror_num);
+                       continue;
+               }
+
+               if (dev_state->bdev == block_ctx.dev->bdev &&
+                   dev_bytenr == block_ctx.dev_bytenr) {
+                       match++;
+                       btrfsic_release_block_ctx(&block_ctx);
+                       break;
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+
+       if (!match) {
+               printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+                      " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+                      " phys_bytenr=%llu)!\n",
+                      (unsigned long long)bytenr, dev_state->name,
+                      (unsigned long long)dev_bytenr);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                               &block_ctx, mirror_num);
+                       if (ret)
+                               continue;
+
+                       printk(KERN_INFO "Read logical bytenr @%llu maps to"
+                              " (%s/%llu/%d)\n",
+                              (unsigned long long)bytenr,
+                              block_ctx.dev->name,
+                              (unsigned long long)block_ctx.dev_bytenr,
+                              mirror_num);
+               }
+               WARN_ON(1);
+       }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev)
+{
+       struct btrfsic_dev_state *ds;
+
+       ds = btrfsic_dev_state_hashtable_lookup(bdev,
+                                               &btrfsic_dev_state_hashtable);
+       return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized)
+               return submit_bh(rw, bh);
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bh() might also be called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+       /* Only called to write the superblock (incl. FLUSH/FUA) */
+       if (NULL != dev_state &&
+           (rw & WRITE) && bh->b_size > 0) {
+               u64 dev_bytenr;
+
+               dev_bytenr = 4096 * bh->b_blocknr;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+                              " size=%lu, data=%p, bdev=%p)\n",
+                              rw, (unsigned long)bh->b_blocknr,
+                              (unsigned long long)dev_bytenr,
+                              (unsigned long)bh->b_size,
+                              bh->b_data, bh->b_bdev);
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             bh->b_data, bh->b_size, NULL,
+                                             NULL, bh, rw);
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x) FLUSH, bdev=%p\n",
+                              rw, bh->b_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bh(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
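+                       /*
+                        * Track this FLUSH via the per-device dummy
+                        * block: when it completes, the end_io handler
+                        * bumps last_flush_gen, which marks every block
+                        * with an older flush_gen as flushed to disk.
+                        */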
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+       return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized) {
+               submit_bio(rw, bio);
+               return;
+       }
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bio() is also called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+       if (NULL != dev_state &&
+           (rw & WRITE) && NULL != bio->bi_io_vec) {
+               unsigned int i;
+               u64 dev_bytenr;
+               int bio_is_patched;
+
+               dev_bytenr = 512 * bio->bi_sector;
+               bio_is_patched = 0;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x, bi_vcnt=%u,"
+                              " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+                              rw, bio->bi_vcnt,
+                              (unsigned long)bio->bi_sector,
+                              (unsigned long long)dev_bytenr,
+                              bio->bi_bdev);
+
+               for (i = 0; i < bio->bi_vcnt; i++) {
+                       u8 *mapped_data;
+
+                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                            BTRFSIC_PRINT_MASK_VERBOSE) ==
+                           (dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "#%u: page=%p, mapped=%p, len=%u,"
+                                      " offset=%u\n",
+                                      i, bio->bi_io_vec[i].bv_page,
+                                      mapped_data,
+                                      bio->bi_io_vec[i].bv_len,
+                                      bio->bi_io_vec[i].bv_offset);
+                       btrfsic_process_written_block(dev_state, dev_bytenr,
+                                                     mapped_data,
+                                                     bio->bi_io_vec[i].bv_len,
+                                                     bio, &bio_is_patched,
+                                                     NULL, rw);
+                       kunmap(bio->bi_io_vec[i].bv_page);
+                       dev_bytenr += bio->bi_io_vec[i].bv_len;
+               }
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x) FLUSH, bdev=%p\n",
+                              rw, bio->bi_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bio(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       block->orig_bio_bh_private = bio->bi_private;
+                       block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                       block->next_in_same_bio = NULL;
+                       bio->bi_private = block;
+                       bio->bi_end_io = btrfsic_bio_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+
+       submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask)
+{
+       int ret;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       state = kzalloc(sizeof(*state), GFP_NOFS);
+       if (NULL == state) {
+               printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+               return -1;
+       }
+
+       if (!btrfsic_is_initialized) {
+               mutex_init(&btrfsic_mutex);
+               btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+               btrfsic_is_initialized = 1;
+       }
+       mutex_lock(&btrfsic_mutex);
+       state->root = root;
+       state->print_mask = print_mask;
+       state->include_extent_data = including_extent_data;
+       state->csum_size = 0;
+       INIT_LIST_HEAD(&state->all_blocks_list);
+       btrfsic_block_hashtable_init(&state->block_hashtable);
+       btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+       state->max_superblock_generation = 0;
+       state->latest_superblock = NULL;
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+               char *p;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_alloc();
+               if (NULL == ds) {
+                       printk(KERN_INFO
+                              "btrfs check-integrity: kmalloc() failed!\n");
+                       mutex_unlock(&btrfsic_mutex);
+                       return -1;
+               }
+               ds->bdev = device->bdev;
+               ds->state = state;
+               bdevname(ds->bdev, ds->name);
+               ds->name[BDEVNAME_SIZE - 1] = '\0';
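+               /* keep only the last path component of the device name */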
+               for (p = ds->name; *p != '\0'; p++)
+                       ;
+               while (p > ds->name && *p != '/')
+                       p--;
+               if (*p == '/')
+                       p++;
+               strlcpy(ds->name, p, sizeof(ds->name));
+               btrfsic_dev_state_hashtable_add(ds,
+                                               &btrfsic_dev_state_hashtable);
+       }
+
+       ret = btrfsic_process_superblock(state, fs_devices);
+       if (0 != ret) {
+               mutex_unlock(&btrfsic_mutex);
+               btrfsic_unmount(root, fs_devices);
+               return ret;
+       }
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+               btrfsic_dump_database(state);
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+               btrfsic_dump_tree(state);
+
+       mutex_unlock(&btrfsic_mutex);
+       return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices)
+{
+       struct list_head *elem_all;
+       struct list_head *tmp_all;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       if (!btrfsic_is_initialized)
+               return;
+
+       mutex_lock(&btrfsic_mutex);
+
+       state = NULL;
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_hashtable_lookup(
+                               device->bdev,
+                               &btrfsic_dev_state_hashtable);
+               if (NULL != ds) {
+                       state = ds->state;
+                       btrfsic_dev_state_hashtable_remove(ds);
+                       btrfsic_dev_state_free(ds);
+               }
+       }
+
+       if (NULL == state) {
+               printk(KERN_INFO
+                      "btrfsic: error, cannot find state information"
+                      " on umount!\n");
+               mutex_unlock(&btrfsic_mutex);
+               return;
+       }
+
+       /*
+        * Don't care about keeping the lists' state up to date,
+        * just free all memory that was allocated dynamically.
+        * Free the blocks and the block_links.
+        */
+       list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+               struct btrfsic_block *const b_all =
+                   list_entry(elem_all, struct btrfsic_block,
+                              all_blocks_node);
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &b_all->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+
+                       l->ref_cnt--;
+                       if (0 == l->ref_cnt)
+                               btrfsic_block_link_free(l);
+               }
+
+               if (b_all->is_iodone)
+                       btrfsic_block_free(b_all);
+               else
+                       printk(KERN_INFO "btrfs: attempt to free %c-block"
+                              " @%llu (%s/%llu/%d) on umount which is"
+                              " not yet iodone!\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num);
+       }
+
+       mutex_unlock(&btrfsic_mutex);
+
+       kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644 (file)
index 0000000..8b59175
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices);
+
+#endif
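
The header above is the entire interception surface: with
CONFIG_BTRFS_FS_CHECK_INTEGRITY disabled, the two submit wrappers fall
back to the plain block-layer helpers, so call sites can use them
unconditionally. A minimal sketch of the intended call pattern follows;
the exact mount-path integration point and the error handling shown
here are assumptions for illustration, not part of this patch:

        /* at mount time, after the devices have been opened (sketch) */
        if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
                ret = btrfsic_mount(tree_root, fs_devices,
                                    btrfs_test_opt(tree_root,
                                        CHECK_INTEGRITY_INCLUDING_EXTENT_DATA),
                                    fs_info->check_integrity_print_mask);
                if (ret)
                        printk(KERN_WARNING
                               "btrfs: cannot initialize integrity check module (%d)\n",
                               ret);
        }

        /* in the write paths, call the wrappers instead of submit_bh()
         * and submit_bio() so that every write is seen by the checker */
        btrfsic_submit_bio(WRITE_FLUSH_FUA, bio);

        /* at umount time, tear the state down again */
        btrfsic_unmount(root, fs_devices);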
index dede441..0639a55 100644 (file)
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
-                                    buf->start, 0);
+                                    buf->start, 0, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-               ret = btrfs_inc_ref(trans, root, cow, 1);
+               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
        else
-               ret = btrfs_inc_ref(trans, root, cow, 0);
+               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 
        if (ret)
                return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-                       ret = btrfs_inc_ref(trans, root, buf, 1);
+                       ret = btrfs_inc_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
-                               ret = btrfs_dec_ref(trans, root, buf, 0);
+                               ret = btrfs_dec_ref(trans, root, buf, 0, 1);
                                BUG_ON(ret);
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                                BUG_ON(ret);
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
                }
                if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
-                       ret = btrfs_dec_ref(trans, root, buf, 1);
+                       ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
                }
                clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size);
+                                    level, search_start, empty_size, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                free_extent_buffer(mid);
 
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                /* once for the root ptr */
                free_extent_buffer(mid);
                return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        if (wret)
                                ret = wret;
                        root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1);
+                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
                        free_extent_buffer(right);
                        right = NULL;
                } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (wret)
                        ret = wret;
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                free_extent_buffer(mid);
                mid = NULL;
        } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                   root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0);
+                                  level, root->node->start, 0, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, level, c->start, 0);
+                                       &disk_key, level, c->start, 0, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -2970,7 +2970,7 @@ again:
 
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0);
+                                       &disk_key, 0, l->start, 0, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        root_sub_used(root, leaf->len);
 
-       btrfs_free_tree_block(trans, root, leaf, 0, 1);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
        return 0;
 }
 /*
index 6738503..27ebe61 100644 (file)
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orphan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
        __le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+       /*
+        * profiles to operate on, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 profiles;
+
+       /* usage filter */
+       __le64 usage;
+
+       /* devid filter */
+       __le64 devid;
+
+       /* devid subset filter [pstart..pend) */
+       __le64 pstart;
+       __le64 pend;
+
+       /* btrfs virtual address space subset filter [vstart..vend) */
+       __le64 vstart;
+       __le64 vend;
+
+       /*
+        * profile to convert to, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 target;
+
+       /* BTRFS_BALANCE_ARGS_* */
+       __le64 flags;
+
+       __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+       /* BTRFS_BALANCE_* */
+       __le64 flags;
+
+       struct btrfs_disk_balance_args data;
+       struct btrfs_disk_balance_args meta;
+       struct btrfs_disk_balance_args sys;
+
+       __le64 unused[4];
+} __attribute__ ((__packed__));
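+
+/*
+ * The item lives in the root tree (see BTRFS_BALANCE_OBJECTID above
+ * and BTRFS_BALANCE_ITEM_KEY further below); a sketch of the key, with
+ * the zero offset being an assumption for illustration:
+ *
+ *     key.objectid = BTRFS_BALANCE_OBJECTID;
+ *     key.type = BTRFS_BALANCE_ITEM_KEY;
+ *     key.offset = 0;
+ */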
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP     (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES       5
+#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES            5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
+                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
+                                        BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
+                                        BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_DUP |     \
+                                        BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for the restriper to be able to tell when chunks of
+ * type SINGLE are available.  This "extended" profile format is used
+ * in fs_info->avail_*_alloc_bits (in-memory) and in the balance item
+ * fields (on-disk).  The corresponding on-disk bit in chunk.type is
+ * reserved to avoid remapping between the two formats in the future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
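+
+/*
+ * Example (illustration only): a data chunk allocated as SINGLE is
+ * tracked in the extended format as
+ * BTRFS_BLOCK_GROUP_DATA | BTRFS_AVAIL_ALLOC_BIT_SINGLE, while its
+ * on-disk chunk.type carries only BTRFS_BLOCK_GROUP_DATA.
+ */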
 
 struct btrfs_block_group_item {
        __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-       unsigned long mount_opt:20;
+       unsigned long mount_opt:21;
        unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
 
+       /*
+        * these three are in extended format (availability of single
+        * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+        * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+        */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;
-       u64 data_alloc_profile;
-       u64 metadata_alloc_profile;
-       u64 system_alloc_profile;
+
+       /* restriper state */
+       spinlock_t balance_lock;
+       struct mutex balance_mutex;
+       atomic_t balance_running;
+       atomic_t balance_pause_req;
+       atomic_t balance_cancel_req;
+       struct btrfs_balance_control *balance_ctl;
+       wait_queue_head_t balance_wait_q;
 
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       u32 check_integrity_print_mask;
+#endif
+
        /* filesystem state */
        u64 fs_state;
 
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY     216
 #define BTRFS_CHUNK_ITEM_KEY   228
 
+#define BTRFS_BALANCE_ITEM_KEY 248
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG                (1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE    (1 << 17)
 #define BTRFS_MOUNT_RECOVERY           (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE       (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY    (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+                                    struct btrfs_balance_item *bi,
+                                    struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+                                        struct btrfs_balance_item *bi,
+                                        struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+                              struct btrfs_disk_balance_args *disk)
+{
+       memset(cpu, 0, sizeof(*cpu));
+
+       cpu->profiles = le64_to_cpu(disk->profiles);
+       cpu->usage = le64_to_cpu(disk->usage);
+       cpu->devid = le64_to_cpu(disk->devid);
+       cpu->pstart = le64_to_cpu(disk->pstart);
+       cpu->pend = le64_to_cpu(disk->pend);
+       cpu->vstart = le64_to_cpu(disk->vstart);
+       cpu->vend = le64_to_cpu(disk->vend);
+       cpu->target = le64_to_cpu(disk->target);
+       cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+                              struct btrfs_balance_args *cpu)
+{
+       memset(disk, 0, sizeof(*disk));
+
+       disk->profiles = cpu_to_le64(cpu->profiles);
+       disk->usage = cpu_to_le64(cpu->usage);
+       disk->devid = cpu_to_le64(cpu->devid);
+       disk->pstart = cpu_to_le64(cpu->pstart);
+       disk->pend = cpu_to_le64(cpu->pend);
+       disk->vstart = cpu_to_le64(cpu->vstart);
+       disk->vend = cpu_to_le64(cpu->vend);
+       disk->target = cpu_to_le64(cpu->target);
+       disk->flags = cpu_to_le64(cpu->flags);
+}
+
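
The accessors and the endianness converters are meant to be used back to back when a balance item is loaded from a leaf. A hedged sketch of that pattern (btrfs_item_ptr and the surrounding item lookup are assumed from the existing ctree.h helpers):

/* sketch: copy the data args of a balance item out of a leaf, in CPU order */
static void read_balance_data_args(struct extent_buffer *leaf, int slot,
				   struct btrfs_balance_args *out)
{
	struct btrfs_balance_item *bi;
	struct btrfs_disk_balance_args disk_ba;

	bi = btrfs_item_ptr(leaf, slot, struct btrfs_balance_item);
	btrfs_balance_data(leaf, bi, &disk_ba);		/* read_eb_member copy */
	btrfs_disk_balance_args_to_cpu(out, &disk_ba);	/* le64 -> cpu */
}
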
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
        return btrfs_item_size(eb, e) - offset;
 }
 
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
 }
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size);
+                                       u64 hint, u64 empty_size, int for_cow);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref);
+                          u64 parent, int last_ref, int for_cow);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                                  u64 search_end, struct btrfs_key *ins,
                                  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset);
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset);
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+       ++p->slots[0];
+       if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+               return btrfs_next_leaf(root, p);
+       return 0;
+}
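
btrfs_next_item() above advances within the current leaf and only falls back to btrfs_next_leaf() at a leaf boundary, so callers can iterate items without caring where leaves end. A hedged sketch of the usual loop, assuming key, found_key, path and ret are already declared and the search miss (ret == 1) case is handled elsewhere:

/* sketch: walk items forward from an initial btrfs_search_slot() hit */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
while (ret == 0) {
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	if (found_key.objectid != key.objectid)
		break;				/* walked past our object */
	/* ... process the item at path->slots[0] ... */
	ret = btrfs_next_item(root, path);	/* > 0: no more items */
}
btrfs_release_path(path);
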
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref);
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+       kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
 }
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+       u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+       if (extended)
+               mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & mask)
+               return 0;
+       /* true if zero or exactly one bit set */
+       return (flags & (~flags + 1)) == flags;
+}
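
The last line relies on the two's-complement identity flags & (~flags + 1), which isolates the lowest set bit and therefore equals flags only when at most one bit is set. A few worked cases, assuming only the flag definitions earlier in this header:

/* sketch: expected results (type bits are masked off before the check) */
WARN_ON(profile_is_valid(BTRFS_BLOCK_GROUP_DATA, 0) != 1);	 /* SINGLE */
WARN_ON(profile_is_valid(BTRFS_BLOCK_GROUP_DATA |
			 BTRFS_BLOCK_GROUP_RAID1, 0) != 1);	 /* one bit */
WARN_ON(profile_is_valid(BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID1, 0) != 0);	 /* two bits */
WARN_ON(profile_is_valid(BTRFS_AVAIL_ALLOC_BIT_SINGLE, 0) != 0); /* not extended */
WARN_ON(profile_is_valid(BTRFS_AVAIL_ALLOC_BIT_SINGLE, 1) != 1); /* extended */
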
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
index 9c1eccc..fe4cd0f 100644 (file)
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                             item->key.objectid,
+                                             num_bytes, 1);
                item->bytes_reserved = num_bytes;
+       }
 
        return ret;
 }
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                     item->key.objectid, item->bytes_reserved,
+                                     0);
        btrfs_block_rsv_release(root, rsv,
                                item->bytes_reserved);
 }
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
        struct btrfs_block_rsv *dst_rsv;
        u64 num_bytes;
        int ret;
-       int release = false;
+       bool release = false;
 
        src_rsv = trans->block_rsv;
        dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
                 */
                if (ret == -EAGAIN)
                        ret = -ENOSPC;
-               if (!ret)
+               if (!ret) {
                        node->bytes_reserved = num_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delayed_inode",
+                                                     btrfs_ino(inode),
+                                                     num_bytes, 1);
+               }
                return ret;
        } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
         * reservation here.  I think it may be time for a documentation page on
         * how block rsvs work.
         */
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                             btrfs_ino(inode), num_bytes, 1);
                node->bytes_reserved = num_bytes;
+       }
 
-       if (release)
+       if (release) {
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), num_bytes, 0);
                btrfs_block_rsv_release(root, src_rsv, num_bytes);
+       }
 
        return ret;
 }
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                     node->inode_id, node->bytes_reserved, 0);
        btrfs_block_rsv_release(root, rsv,
                                node->bytes_reserved);
        node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                goto release_node;
        }
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
-       /*
-        * we have reserved enough space when we start a new transaction,
-        * so reserving metadata failure is impossible
-        */
-       BUG_ON(ret);
-
        delayed_item->key.objectid = btrfs_ino(dir);
        btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
        delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        dir_item->type = type;
        memcpy((char *)(dir_item + 1), name, name_len);
 
+       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+       /*
+        * we have reserved enough space when we start a new transaction,
+        * so reserving metadata failure is impossible
+        */
+       BUG_ON(ret);
+
        mutex_lock(&delayed_node->mutex);
        ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
        if (unlikely(ret)) {
index 125cf76..66e4f29 100644 (file)
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
                return -1;
        if (ref1->type > ref2->type)
                return 1;
+       /* merging of sequenced refs is not allowed */
+       if (ref1->seq < ref2->seq)
+               return -1;
+       if (ref1->seq > ref2->seq)
+               return 1;
        if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
            ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 
 /*
  * find a head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
  */
 static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                                  u64 bytenr,
-                                 struct btrfs_delayed_ref_node **last)
+                                 struct btrfs_delayed_ref_node **last,
+                                 int return_bigger)
 {
-       struct rb_node *n = root->rb_node;
+       struct rb_node *n;
        struct btrfs_delayed_ref_node *entry;
-       int cmp;
+       int cmp = 0;
 
+again:
+       n = root->rb_node;
+       entry = NULL;
        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                else
                        return entry;
        }
+       if (entry && return_bigger) {
+               if (cmp > 0) {
+                       n = rb_next(&entry->rb_node);
+                       if (!n)
+                               n = rb_first(root);
+                       entry = rb_entry(n, struct btrfs_delayed_ref_node,
+                                        rb_node);
+                       bytenr = entry->bytenr;
+                       return_bigger = 0;
+                       goto again;
+               }
+               return entry;
+       }
        return NULL;
 }
 
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq)
+{
+       struct seq_list *elem;
+
+       assert_spin_locked(&delayed_refs->lock);
+       if (list_empty(&delayed_refs->seq_head))
+               return 0;
+
+       elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+       if (seq >= elem->seq) {
+               pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+                        seq, elem->seq, delayed_refs);
+               return 1;
+       }
+       return 0;
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 start)
 {
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                node = rb_first(&delayed_refs->root);
        } else {
                ref = NULL;
-               find_ref_head(&delayed_refs->root, start, &ref);
+               find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
                if (ref) {
-                       struct btrfs_delayed_ref_node *tmp;
-
-                       node = rb_prev(&ref->rb_node);
-                       while (node) {
-                               tmp = rb_entry(node,
-                                              struct btrfs_delayed_ref_node,
-                                              rb_node);
-                               if (tmp->bytenr < start)
-                                       break;
-                               ref = tmp;
-                               node = rb_prev(&ref->rb_node);
-                       }
                        node = &ref->rb_node;
                } else
                        node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
  * this does all the dirty work in terms of maintaining the correct
  * overall modification count.
  */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_trans_handle *trans,
                                        struct btrfs_delayed_ref_node *ref,
                                        u64 bytenr, u64 num_bytes,
                                        int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
        ref->action  = 0;
        ref->is_head = 1;
        ref->in_tree = 1;
+       ref->seq = 0;
 
        head_ref = btrfs_delayed_node_to_head(ref);
        head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed tree ref into the rbtree.
  */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
-                                        u64 ref_root, int level, int action)
+                                        u64 ref_root, int level, int action,
+                                        int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_tree_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-       }
        full_ref->level = level;
 
        trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed data ref into the rbtree.
  */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
                                         u64 ref_root, u64 owner, u64 offset,
-                                        int action)
+                                        int action, int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_data_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_data_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_DATA_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-       }
+
        full_ref->objectid = owner;
        full_ref->offset = offset;
 
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root,  int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_tree_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 0);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 0);
        BUG_ON(ret);
 
-       ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, level, action);
+       ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, level, action,
+                                  for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_data_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 1);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 1);
        BUG_ON(ret);
 
-       ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, owner, offset, action);
+       ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, owner, offset,
+                                  action, for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op)
 {
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
                                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
                                   extent_op->is_data);
        BUG_ON(ret);
 
+       if (waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+       ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
index e287e3b..d8f244d 100644 (file)
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
        /* the size of the extent */
        u64 num_bytes;
 
+       /* seq number to keep track of insertion order */
+       u64 seq;
+
        /* ref count on this data structure */
        atomic_t refs;
 
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
 
 struct btrfs_delayed_tree_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        int level;
 };
 
 struct btrfs_delayed_data_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        u64 objectid;
        u64 offset;
 };
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
        int flushing;
 
        u64 run_delayed_start;
+
+       /*
+        * seq number of delayed refs. We need to know if a backref was being
+        * added before the currently processed ref or afterwards.
+        */
+       u64 seq;
+
+       /*
+        * seq_head holds a list of all seq numbers that are currently in
+        * use by a backref walk. While walking backrefs (btrfs_find_all_roots,
+        * qgroups), which might take some time, no newer ref must be
+        * processed, as it might influence the outcome of the walk.
+        */
+       struct list_head seq_head;
+
+       /*
+        * when the only refs we have in the list must not be processed, we want
+        * to wait for more refs to show up or for the end of backref walking.
+        */
+       wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
        }
 }
 
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root, int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
 
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
+
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+};
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       ++delayed_refs->seq;
+       return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       elem->seq = delayed_refs->seq;
+       list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       spin_lock(&delayed_refs->lock);
+       list_del(&elem->list);
+       wake_up(&delayed_refs->seq_wait);
+       spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+       if (for_cow)
+               return 0;
+
+       if (rootid == BTRFS_FS_TREE_OBJECTID)
+               return 1;
+
+       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+               return 1;
+
+       return 0;
+}
+
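
A backref walker is expected to pin the current sequence number for the duration of the walk, so that btrfs_check_delayed_seq() holds newer refs back, and to drop it afterwards to wake anyone blocked waiting for more refs. A hedged sketch of that usage (the walk itself and the delayed_refs pointer are assumed):

/* sketch: hold newer delayed refs back while a backref walk is running */
struct seq_list elem;

spin_lock(&delayed_refs->lock);
btrfs_get_delayed_seq(delayed_refs, &elem);	/* record the current seq */
spin_unlock(&delayed_refs->lock);

/* ... walk backrefs; refs with seq >= elem.seq stay queued meanwhile ... */

btrfs_put_delayed_seq(delayed_refs, &elem);	/* unlink and wake waiters */
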
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
index d852566..7aa9cd3 100644 (file)
@@ -43,6 +43,7 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "check-integrity.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1143,7 +1144,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->orphan_item_inserted = 0;
        root->orphan_cleanup_state = 0;
 
-       root->fs_info = fs_info;
        root->objectid = objectid;
        root->last_trans = 0;
        root->highest_objectid = 0;
@@ -1217,6 +1217,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        return 0;
 }
 
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+       if (root)
+               root->fs_info = fs_info;
+       return root;
+}
+
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
 {
@@ -1224,7 +1232,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        struct btrfs_root *tree_root = fs_info->tree_root;
        struct extent_buffer *leaf;
 
-       root = kzalloc(sizeof(*root), GFP_NOFS);
+       root = btrfs_alloc_root(fs_info);
        if (!root)
                return ERR_PTR(-ENOMEM);
 
@@ -1244,7 +1252,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        root->ref_cows = 0;
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-                                     BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+                                     BTRFS_TREE_LOG_OBJECTID, NULL,
+                                     0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@ -1318,7 +1327,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
        u32 blocksize;
        int ret = 0;
 
-       root = kzalloc(sizeof(*root), GFP_NOFS);
+       root = btrfs_alloc_root(fs_info);
        if (!root)
                return ERR_PTR(-ENOMEM);
        if (location->offset == (u64)-1) {
@@ -1874,9 +1883,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 }
 
 
-struct btrfs_root *open_ctree(struct super_block *sb,
-                             struct btrfs_fs_devices *fs_devices,
-                             char *options)
+int open_ctree(struct super_block *sb,
+              struct btrfs_fs_devices *fs_devices,
+              char *options)
 {
        u32 sectorsize;
        u32 nodesize;
@@ -1888,8 +1897,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        struct btrfs_key location;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
-       struct btrfs_root *tree_root = btrfs_sb(sb);
-       struct btrfs_fs_info *fs_info = tree_root->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *tree_root;
        struct btrfs_root *extent_root;
        struct btrfs_root *csum_root;
        struct btrfs_root *chunk_root;
@@ -1900,16 +1909,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        int num_backups_tried = 0;
        int backup_index = 0;
 
-       extent_root = fs_info->extent_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       csum_root = fs_info->csum_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       chunk_root = fs_info->chunk_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       dev_root = fs_info->dev_root =
-               kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+       tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+       extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
+       csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
+       chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+       dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
 
-       if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+       if (!tree_root || !extent_root || !csum_root ||
+           !chunk_root || !dev_root) {
                err = -ENOMEM;
                goto fail;
        }
@@ -1998,6 +2005,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       fs_info->check_integrity_print_mask = 0;
+#endif
+
+       spin_lock_init(&fs_info->balance_lock);
+       mutex_init(&fs_info->balance_mutex);
+       atomic_set(&fs_info->balance_running, 0);
+       atomic_set(&fs_info->balance_pause_req, 0);
+       atomic_set(&fs_info->balance_cancel_req, 0);
+       fs_info->balance_ctl = NULL;
+       init_waitqueue_head(&fs_info->balance_wait_q);
 
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2285,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
           BTRFS_UUID_SIZE);
 
-       mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_chunk_tree(chunk_root);
-       mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
@@ -2318,9 +2334,6 @@ retry_root_backup:
 
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
@@ -2353,6 +2366,19 @@ retry_root_backup:
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+               ret = btrfsic_mount(tree_root, fs_devices,
+                                   btrfs_test_opt(tree_root,
+                                       CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+                                   1 : 0,
+                                   fs_info->check_integrity_print_mask);
+               if (ret)
+                       printk(KERN_WARNING "btrfs: failed to initialize"
+                              " integrity check module %s\n", sb->s_id);
+       }
+#endif
+
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2368,7 +2394,7 @@ retry_root_backup:
                     btrfs_level_size(tree_root,
                                      btrfs_super_log_root_level(disk_super));
 
-               log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+               log_tree_root = btrfs_alloc_root(fs_info);
                if (!log_tree_root) {
                        err = -ENOMEM;
                        goto fail_trans_kthread;
@@ -2423,13 +2449,17 @@ retry_root_backup:
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+
+               if (!err)
+                       err = btrfs_recover_balance(fs_info->tree_root);
+
                if (err) {
                        close_ctree(tree_root);
-                       return ERR_PTR(err);
+                       return err;
                }
        }
 
-       return tree_root;
+       return 0;
 
 fail_trans_kthread:
        kthread_stop(fs_info->transaction_kthread);
@@ -2475,8 +2505,7 @@ fail_srcu:
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
        btrfs_close_devices(fs_info->fs_devices);
-       free_fs_info(fs_info);
-       return ERR_PTR(err);
+       return err;
 
 recovery_tree_root:
        if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2631,7 +2660,7 @@ static int write_dev_supers(struct btrfs_device *device,
                 * we fua the first super.  The others we allow
                 * to go down lazy.
                 */
-               ret = submit_bh(WRITE_FUA, bh);
+               ret = btrfsic_submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
@@ -2708,7 +2737,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
        device->flush_bio = bio;
 
        bio_get(bio);
-       submit_bio(WRITE_FLUSH, bio);
+       btrfsic_submit_bio(WRITE_FLUSH, bio);
 
        return 0;
 }
@@ -2972,6 +3001,9 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       /* pause restriper - we want to resume on mount */
+       btrfs_pause_balance(root->fs_info);
+
        btrfs_scrub_cancel(root);
 
        /* wait for any defraggers to finish */
@@ -2979,7 +3011,7 @@ int close_ctree(struct btrfs_root *root)
                   (atomic_read(&fs_info->defrag_running) == 0));
 
        /* clear out the rbtree of defraggable inodes */
-       btrfs_run_defrag_inodes(root->fs_info);
+       btrfs_run_defrag_inodes(fs_info);
 
        /*
         * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3008,8 +3040,8 @@ int close_ctree(struct btrfs_root *root)
 
        btrfs_put_block_group_cache(fs_info);
 
-       kthread_stop(root->fs_info->transaction_kthread);
-       kthread_stop(root->fs_info->cleaner_kthread);
+       kthread_stop(fs_info->transaction_kthread);
+       kthread_stop(fs_info->cleaner_kthread);
 
        fs_info->closing = 2;
        smp_mb();
@@ -3027,14 +3059,14 @@ int close_ctree(struct btrfs_root *root)
        free_extent_buffer(fs_info->extent_root->commit_root);
        free_extent_buffer(fs_info->tree_root->node);
        free_extent_buffer(fs_info->tree_root->commit_root);
-       free_extent_buffer(root->fs_info->chunk_root->node);
-       free_extent_buffer(root->fs_info->chunk_root->commit_root);
-       free_extent_buffer(root->fs_info->dev_root->node);
-       free_extent_buffer(root->fs_info->dev_root->commit_root);
-       free_extent_buffer(root->fs_info->csum_root->node);
-       free_extent_buffer(root->fs_info->csum_root->commit_root);
+       free_extent_buffer(fs_info->chunk_root->node);
+       free_extent_buffer(fs_info->chunk_root->commit_root);
+       free_extent_buffer(fs_info->dev_root->node);
+       free_extent_buffer(fs_info->dev_root->commit_root);
+       free_extent_buffer(fs_info->csum_root->node);
+       free_extent_buffer(fs_info->csum_root->commit_root);
 
-       btrfs_free_block_groups(root->fs_info);
+       btrfs_free_block_groups(fs_info);
 
        del_fs_roots(fs_info);
 
@@ -3054,14 +3086,17 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+               btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
        bdi_destroy(&fs_info->bdi);
        cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-       free_fs_info(fs_info);
-
        return 0;
 }
 
index c99d0a8..e4bc474 100644 (file)
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
                                                   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
-                             struct btrfs_fs_devices *fs_devices,
-                             char *options);
+int open_ctree(struct super_block *sb,
+              struct btrfs_fs_devices *fs_devices,
+              char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
index 1b8dc33..5f77166 100644 (file)
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
                                       u64 root_objectid, u32 generation,
                                       int check_generation)
 {
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root;
        struct inode *inode;
        struct btrfs_key key;
index f5fbe57..700879e 100644 (file)
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
 
-       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-                BTRFS_BLOCK_GROUP_METADATA;
+       flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        }
        return ret;
 }
@@ -2233,6 +2236,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
 
                /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+
+               if (ref && ref->seq &&
+                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                       /*
+                        * there are still refs with lower seq numbers in the
+                        * process of being added. Don't run this ref yet.
+                        */
+                       list_del_init(&locked_ref->cluster);
+                       mutex_unlock(&locked_ref->mutex);
+                       locked_ref = NULL;
+                       delayed_refs->num_heads_ready++;
+                       spin_unlock(&delayed_refs->lock);
+                       cond_resched();
+                       spin_lock(&delayed_refs->lock);
+                       continue;
+               }
+
+               /*
                 * record the must insert reserved flag before we
                 * drop the spin lock.
                 */
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                extent_op = locked_ref->extent_op;
                locked_ref->extent_op = NULL;
 
-               /*
-                * locked_ref is the head node, so we have to go one
-                * node back for any delayed ref updates
-                */
-               ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
                         * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                BUG_ON(ret);
                                kfree(extent_op);
 
-                               cond_resched();
-                               spin_lock(&delayed_refs->lock);
-                               continue;
+                               goto next;
                        }
 
                        list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               /*
+                * we modified num_entries, but as we're currently running
+                * delayed refs, skip
+                *     wake_up(&delayed_refs->seq_wait);
+                * here.
+                */
                spin_unlock(&delayed_refs->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                btrfs_put_delayed_ref(ref);
                kfree(extent_op);
                count++;
-
+next:
+               do_chunk_alloc(trans, root->fs_info->extent_root,
+                              2 * 1024 * 1024,
+                              btrfs_get_alloc_profile(root, 0),
+                              CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
        return count;
 }
 
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+                       unsigned long num_refs)
+{
+       struct list_head *first_seq = delayed_refs->seq_head.next;
+
+       spin_unlock(&delayed_refs->lock);
+       pr_debug("waiting for more refs (num %ld, first %p)\n",
+                num_refs, first_seq);
+       wait_event(delayed_refs->seq_wait,
+                  num_refs != delayed_refs->num_entries ||
+                  delayed_refs->seq_head.next != first_seq);
+       pr_debug("done waiting for more refs (num %ld, first %p)\n",
+                delayed_refs->num_entries, delayed_refs->seq_head.next);
+       spin_lock(&delayed_refs->lock);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
        int ret;
+       u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
+       unsigned long num_refs = 0;
+       int consider_waiting;
 
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
+       do_chunk_alloc(trans, root->fs_info->extent_root,
+                      2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+                      CHUNK_ALLOC_NO_FORCE);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
+       consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
                 * of refs to process starting at the first one we are able to
                 * lock
                 */
+               delayed_start = delayed_refs->run_delayed_start;
                ret = btrfs_find_ref_cluster(trans, &cluster,
                                             delayed_refs->run_delayed_start);
                if (ret)
                        break;
 
+               if (delayed_start >= delayed_refs->run_delayed_start) {
+                       if (consider_waiting == 0) {
+                               /*
+                                * btrfs_find_ref_cluster looped. let's do one
+                                * more cycle. if we don't run any delayed ref
+                                * during that cycle (because all of them are
+                                * blocked) and if the number of refs doesn't
+                                * change, we avoid busy waiting.
+                                */
+                               consider_waiting = 1;
+                               num_refs = delayed_refs->num_entries;
+                       } else {
+                               wait_for_more_refs(delayed_refs, num_refs);
+                               /*
+                                * after waiting, things have changed. we
+                                * dropped the lock and someone else might have
+                                * run some refs, built new clusters and so on.
+                                * therefore, we restart staleness detection.
+                                */
+                               consider_waiting = 0;
+                       }
+               }
+
                ret = run_clustered_refs(trans, root, &cluster);
                BUG_ON(ret < 0);
 
@@ -2348,6 +2424,11 @@ again:
 
                if (count == 0)
                        break;
+
+               if (ret || delayed_refs->run_delayed_start == 0) {
+                       /* refs were run, let's reset staleness detection */
+                       consider_waiting = 0;
+               }
        }
 
        if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        extent_op->update_key = 0;
        extent_op->is_data = is_data ? 1 : 0;
 
-       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+                                         num_bytes, extent_op);
        if (ret)
                kfree(extent_op);
        return ret;
@@ -2590,7 +2672,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          int full_backref, int inc)
+                          int full_backref, int inc, int for_cow)
 {
        u64 bytenr;
        u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, int);
 
        ref_root = btrfs_header_owner(buf);
        nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset);
+                                          key.offset, for_cow);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                                          parent, ref_root, level - 1, 0,
+                                          for_cow);
                        if (ret)
                                goto fail;
                }
@@ -2659,15 +2742,15 @@ fail:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
-       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-                               BTRFS_BLOCK_GROUP_SYSTEM |
-                               BTRFS_BLOCK_GROUP_METADATA);
+       found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        found->total_bytes = total_bytes;
        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-       u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-                                  BTRFS_BLOCK_GROUP_RAID1 |
-                                  BTRFS_BLOCK_GROUP_RAID10 |
-                                  BTRFS_BLOCK_GROUP_DUP);
-       if (extra_flags) {
-               if (flags & BTRFS_BLOCK_GROUP_DATA)
-                       fs_info->avail_data_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_METADATA)
-                       fs_info->avail_metadata_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                       fs_info->avail_system_alloc_bits |= extra_flags;
-       }
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits |= extra_flags;
 }
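
The "chunk -> extended profile" conversion above exists because the on-disk chunk format encodes the single profile as the absence of any RAID bit, while the in-memory avail_*_alloc_bits words give it an explicit bit so it can be set and cleared like the rest. A standalone sketch of the round trip; the bit values are made up for illustration and stand in for the real masks in ctree.h.

/* Userspace sketch of chunk<->extended profile conversion.
 * Bit assignments are illustrative, not the on-disk btrfs values. */
#include <stdio.h>
#include <stdint.h>

#define BG_RAID0        (1ULL << 0)
#define BG_RAID1        (1ULL << 1)
#define BG_DUP          (1ULL << 2)
#define BG_RAID10       (1ULL << 3)
#define PROFILE_MASK    (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
#define AVAIL_SINGLE    (1ULL << 4) /* extended-only "single" bit */

static uint64_t chunk_to_extended(uint64_t flags)
{
        if ((flags & PROFILE_MASK) == 0)
                flags |= AVAIL_SINGLE; /* make "single" explicit */
        return flags;
}

static uint64_t extended_to_chunk(uint64_t flags)
{
        return flags & ~AVAIL_SINGLE;  /* drop the synthetic bit */
}

int main(void)
{
        uint64_t single = chunk_to_extended(0);
        uint64_t raid1 = chunk_to_extended(BG_RAID1);

        printf("single: ext=%#llx chunk=%#llx\n",
               (unsigned long long)single,
               (unsigned long long)extended_to_chunk(single));
        printf("raid1:  ext=%#llx chunk=%#llx\n",
               (unsigned long long)raid1,
               (unsigned long long)extended_to_chunk(raid1));
        return 0;
}
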
 
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns the reduced profile in chunk format.  If a profile change is
+ * in progress (either running or paused), this picks the target profile
+ * (if it's already available); otherwise it falls back to plain reducing.
+ */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
 
+       /* pick restriper's target profile if it's available */
+       spin_lock(&root->fs_info->balance_lock);
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+                   (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                   (flags & bctl->data.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                          (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->sys.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                          (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->meta.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       flags = tgt;
+                       goto out;
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
+
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
        if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
             (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP)))
+            (flags & BTRFS_BLOCK_GROUP_DUP))) {
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       }
+
+out:
+       /* extended -> chunk profile */
+       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        return flags;
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits &
-                        root->fs_info->data_alloc_profile;
+               flags |= root->fs_info->avail_data_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits &
-                        root->fs_info->system_alloc_profile;
+               flags |= root->fs_info->avail_system_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits &
-                        root->fs_info->metadata_alloc_profile;
+               flags |= root->fs_info->avail_metadata_alloc_bits;
+
        return btrfs_reduce_alloc_profile(root, flags);
 }
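
Restriper fast path aside, the reduction itself is plain bit math over the device count: RAID0/RAID1 need at least two devices, RAID10 at least four, and when several profiles survive, the hunk above drops RAID0 in favour of anything more redundant. A hedged standalone rendering; the bit values are invented, and the DUP/RAID1 pruning order is an assumption since those branches sit outside the visible hunk.

/* Standalone sketch of profile reduction by available device count. */
#include <stdio.h>
#include <stdint.h>

#define BG_RAID0   (1ULL << 0)
#define BG_RAID1   (1ULL << 1)
#define BG_DUP     (1ULL << 2)
#define BG_RAID10  (1ULL << 3)

static uint64_t reduce_profile(uint64_t flags, unsigned num_devices)
{
        if (num_devices == 1)
                flags &= ~(BG_RAID1 | BG_RAID0);
        if (num_devices < 4)
                flags &= ~BG_RAID10;

        /* keep the most redundant remaining profile (assumed order) */
        if ((flags & BG_DUP) && (flags & (BG_RAID0 | BG_RAID1 | BG_RAID10)))
                flags &= ~BG_DUP;
        if ((flags & BG_RAID1) && (flags & BG_RAID10))
                flags &= ~BG_RAID1;
        if ((flags & BG_RAID0) &&
            (flags & (BG_RAID1 | BG_RAID10 | BG_DUP)))
                flags &= ~BG_RAID0;
        return flags;
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)
               reduce_profile(BG_RAID0 | BG_RAID10, 2)); /* -> RAID0 */
        printf("%#llx\n", (unsigned long long)
               reduce_profile(BG_RAID0 | BG_RAID1, 1));  /* -> 0 (single) */
        return 0;
}
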
 
@@ -3191,6 +3310,8 @@ commit_trans:
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 1);
        spin_unlock(&data_sinfo->lock);
 
        return 0;
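
These trace_btrfs_space_reservation calls arrive in reserve/release pairs keyed by an object id (here the space_info pointer cast to u64), with a trailing flag of 1 for reserve and 0 for release, so a trace consumer can keep a per-object running balance. A toy model of that paired accounting, with printf standing in for the tracing machinery:

/* Toy model of paired reserve/release trace accounting. */
#include <stdio.h>
#include <stdint.h>

static long long balance; /* one object only, for brevity */

static void trace_space_reservation(const char *type, uint64_t obj,
                                    uint64_t bytes, int reserve)
{
        balance += reserve ? (long long)bytes : -(long long)bytes;
        printf("%s obj=%#llx %s %llu bytes (balance %lld)\n",
               type, (unsigned long long)obj,
               reserve ? "reserve" : "release",
               (unsigned long long)bytes, balance);
}

int main(void)
{
        uint64_t sinfo = 0xdeadbeef; /* stand-in for (u64)data_sinfo */

        trace_space_reservation("space_info", sinfo, 4096, 1);
        trace_space_reservation("space_info", sinfo, 4096, 0);
        return 0;
}
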
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        data_sinfo = BTRFS_I(inode)->space_info;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
-
-       /*
-        * we have two similar checks here, one based on percentage
-        * and once based on a hard number of 256MB.  The idea
-        * is that if we have a good amount of free
-        * room, don't allocate a chunk.  A good mount is
-        * less than 80% utilized of the chunks we have allocated,
-        * or more than 256MB free
-        */
-       if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
-               return 0;
-
-       if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-               return 0;
-
        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-       /* 256MB or 5% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+       /* 256MB or 2% of the FS */
+       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+       /* system chunks need a much smaller threshold */
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               thresh = 32 * 1024 * 1024;
 
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
                return 0;
        return 1;
 }
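
After this rewrite, should_alloc_chunk comes down to two numbers: a size floor of max(256MB, 2% of the filesystem), flattened to 32MB for system chunks, and an 80% utilization gate via div_factor(num_bytes, 8) (div_factor(num, f) is num * f / 10 in btrfs, div_factor_fine divides by 100). A worked standalone version of just that arithmetic, with invented sizes:

/* Worked sketch of the new chunk-allocation threshold math. */
#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
        return num * factor / 10;
}

static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

static int should_alloc(uint64_t fs_total, uint64_t num_bytes,
                        uint64_t bytes_used, int is_system)
{
        uint64_t thresh;

        /* 256MB or 2% of the FS, 32MB flat for system chunks */
        thresh = div_factor_fine(fs_total, 2);
        if (thresh < 256ULL * 1024 * 1024)
                thresh = 256ULL * 1024 * 1024;
        if (is_system)
                thresh = 32 * 1024 * 1024;

        /* big space_info that is still under 80% used: don't allocate */
        if (num_bytes > thresh && bytes_used < div_factor(num_bytes, 8))
                return 0;
        return 1;
}

int main(void)
{
        uint64_t gib = 1024ULL * 1024 * 1024;

        /* 100GB fs, 10GB space_info at 50% used -> 0 (no new chunk) */
        printf("%d\n", should_alloc(100 * gib, 10 * gib, 5 * gib, 0));
        /* same space_info at 90% used -> 1 (allocate) */
        printf("%d\n", should_alloc(100 * gib, 10 * gib, 9 * gib, 0));
        return 0;
}
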
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        int wait_for_alloc = 0;
        int ret = 0;
 
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       BUG_ON(!profile_is_valid(flags, 0));
 
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                                     orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
@@ -3649,6 +3764,10 @@ again:
 
                if (used + num_bytes < space_info->total_bytes + avail) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                                     orig_bytes, 1);
                        ret = 0;
                } else {
                        wait_ordered = true;
@@ -3755,7 +3874,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
        spin_unlock(&block_rsv->lock);
 }
 
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+                                   struct btrfs_block_rsv *block_rsv,
                                    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
        struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3911,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
                if (num_bytes) {
                        spin_lock(&space_info->lock);
                        space_info->bytes_may_use -= num_bytes;
+                       trace_btrfs_space_reservation(fs_info, "space_info",
+                                                     (u64)space_info,
+                                                     num_bytes, 0);
                        space_info->reservation_progress++;
                        spin_unlock(&space_info->lock);
                }
@@ -3947,7 +4070,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
        if (global_rsv->full || global_rsv == block_rsv ||
            block_rsv->space_info != global_rsv->space_info)
                global_rsv = NULL;
-       block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+       block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+                               num_bytes);
 }
 
 /*
@@ -4006,11 +4130,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
                num_bytes = sinfo->total_bytes - num_bytes;
                block_rsv->reserved += num_bytes;
                sinfo->bytes_may_use += num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             (u64)sinfo, num_bytes, 1);
        }
 
        if (block_rsv->reserved >= block_rsv->size) {
                num_bytes = block_rsv->reserved - block_rsv->size;
                sinfo->bytes_may_use -= num_bytes;
+               trace_btrfs_space_reservation(fs_info, "space_info",
+                                             (u64)sinfo, num_bytes, 0);
                sinfo->reservation_progress++;
                block_rsv->reserved = block_rsv->size;
                block_rsv->full = 1;
@@ -4045,7 +4173,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-       block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+       block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+                               (u64)-1);
        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
        WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4191,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
        if (!trans->bytes_reserved)
                return;
 
+       trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
+                                     trans->bytes_reserved, 0);
        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
        trans->bytes_reserved = 0;
 }
@@ -4079,6 +4210,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
         * when we are truly done with the orphan item.
         */
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 1);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
@@ -4086,6 +4219,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+       trace_btrfs_space_reservation(root->fs_info, "orphan",
+                                     btrfs_ino(inode), num_bytes, 0);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
@@ -4213,12 +4348,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        /* Need to be holding the i_mutex here if we aren't free space cache */
        if (btrfs_is_free_space_inode(root, inode))
                flush = 0;
-       else
-               WARN_ON(!mutex_is_locked(&inode->i_mutex));
 
        if (flush && btrfs_transaction_in_commit(root->fs_info))
                schedule_timeout(1);
 
+       mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
        num_bytes = ALIGN(num_bytes, root->sectorsize);
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4400,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
                if (dropped)
                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
-               if (to_free)
+               if (to_free) {
                        btrfs_block_rsv_release(root, block_rsv, to_free);
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delalloc",
+                                                     btrfs_ino(inode),
+                                                     to_free, 0);
+               }
+               mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                return ret;
        }
 
@@ -4278,7 +4418,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
        spin_unlock(&BTRFS_I(inode)->lock);
+       mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
 
+       if (to_reserve)
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), to_reserve, 1);
        block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
        return 0;
@@ -4308,6 +4452,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+       trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                     btrfs_ino(inode), to_free, 0);
        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
                                to_free);
 }
@@ -4562,7 +4708,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                        cache->reserved += num_bytes;
                        space_info->bytes_reserved += num_bytes;
                        if (reserve == RESERVE_ALLOC) {
-                               BUG_ON(space_info->bytes_may_use < num_bytes);
+                               trace_btrfs_space_reservation(cache->fs_info,
+                                                             "space_info",
+                                                             (u64)space_info,
+                                                             num_bytes, 0);
                                space_info->bytes_may_use -= num_bytes;
                        }
                }
@@ -4928,6 +5077,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
 
        delayed_refs->num_entries--;
+       if (waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
 
        /*
         * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5106,17 @@ out:
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref)
+                          u64 parent, int last_ref, int for_cow)
 {
        struct btrfs_block_group_cache *cache = NULL;
        int ret;
 
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-                                               parent, root->root_key.objectid,
-                                               btrfs_header_level(buf),
-                                               BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       buf->start, buf->len,
+                                       parent, root->root_key.objectid,
+                                       btrfs_header_level(buf),
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                BUG_ON(ret);
        }
 
@@ -4999,12 +5151,12 @@ out:
        btrfs_put_block_group(cache);
 }
 
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
        /*
         * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5168,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                btrfs_pin_extent(root, bytenr, num_bytes, 1);
                ret = 0;
        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL);
+                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
                BUG_ON(ret);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-                                       parent, root_objectid, owner,
-                                       offset, BTRFS_DROP_DELAYED_REF, NULL);
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                               num_bytes,
+                                               parent, root_objectid, owner,
+                                               offset, BTRFS_DROP_DELAYED_REF,
+                                               NULL, for_cow);
                BUG_ON(ret);
        }
        return ret;
@@ -5146,6 +5301,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        ins->objectid = 0;
        ins->offset = 0;
 
+       trace_find_free_extent(orig_root, num_bytes, empty_size, data);
+
        space_info = __find_space_info(root->fs_info, data);
        if (!space_info) {
                printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5452,6 @@ alloc:
                if (unlikely(block_group->ro))
                        goto loop;
 
-               spin_lock(&block_group->free_space_ctl->tree_lock);
-               if (cached &&
-                   block_group->free_space_ctl->free_space <
-                   num_bytes + empty_cluster + empty_size) {
-                       spin_unlock(&block_group->free_space_ctl->tree_lock);
-                       goto loop;
-               }
-               spin_unlock(&block_group->free_space_ctl->tree_lock);
-
                /*
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
@@ -5331,6 +5479,8 @@ alloc:
                        if (offset) {
                                /* we have a block, we're done */
                                spin_unlock(&last_ptr->refill_lock);
+                               trace_btrfs_reserve_extent_cluster(root,
+                                       block_group, search_start, num_bytes);
                                goto checks;
                        }
 
@@ -5349,8 +5499,15 @@ refill_cluster:
                         * plenty of times and not have found
                         * anything, so we are likely way too
                         * fragmented for the clustering stuff to find
-                        * anything.  */
-                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                        * anything.
+                        *
+                        * However, if the cluster is taken from the
+                        * current block group, release the cluster
+                        * first, so that we stand a better chance of
+                        * succeeding in the unclustered
+                        * allocation.  */
+                       if (loop >= LOOP_NO_EMPTY_SIZE &&
+                           last_ptr->block_group != block_group) {
                                spin_unlock(&last_ptr->refill_lock);
                                goto unclustered_alloc;
                        }
@@ -5361,6 +5518,11 @@ refill_cluster:
                         */
                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+                       if (loop >= LOOP_NO_EMPTY_SIZE) {
+                               spin_unlock(&last_ptr->refill_lock);
+                               goto unclustered_alloc;
+                       }
+
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
@@ -5377,6 +5539,9 @@ refill_cluster:
                                if (offset) {
                                        /* we found one, proceed */
                                        spin_unlock(&last_ptr->refill_lock);
+                                       trace_btrfs_reserve_extent_cluster(root,
+                                               block_group, search_start,
+                                               num_bytes);
                                        goto checks;
                                }
                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5566,15 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               spin_lock(&block_group->free_space_ctl->tree_lock);
+               if (cached &&
+                   block_group->free_space_ctl->free_space <
+                   num_bytes + empty_cluster + empty_size) {
+                       spin_unlock(&block_group->free_space_ctl->tree_lock);
+                       goto loop;
+               }
+               spin_unlock(&block_group->free_space_ctl->tree_lock);
+
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
                /*
@@ -5438,9 +5612,6 @@ checks:
                        goto loop;
                }
 
-               ins->objectid = search_start;
-               ins->offset = num_bytes;
-
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@ -5457,6 +5628,8 @@ checks:
                ins->objectid = search_start;
                ins->offset = num_bytes;
 
+               trace_btrfs_reserve_extent(orig_root, block_group,
+                                          search_start, num_bytes);
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@ -5842,9 +6015,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 
        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
-       ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-                                        0, root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL);
+       ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+                                        ins->offset, 0,
+                                        root_objectid, owner, offset,
+                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
        return ret;
 }
 
@@ -5997,10 +6171,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        return ERR_PTR(-ENOSPC);
 }
 
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+                           struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
        block_rsv_add_bytes(block_rsv, blocksize, 0);
-       block_rsv_release_bytes(block_rsv, NULL, 0);
+       block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
 
 /*
@@ -6014,7 +6189,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size)
+                                       u64 hint, u64 empty_size, int for_cow)
 {
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6205,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
                                   empty_size, hint, (u64)-1, &ins, 0);
        if (ret) {
-               unuse_block_rsv(block_rsv, blocksize);
+               unuse_block_rsv(root->fs_info, block_rsv, blocksize);
                return ERR_PTR(ret);
        }
 
@@ -6058,10 +6233,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                extent_op->update_flags = 1;
                extent_op->is_data = 0;
 
-               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+               ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+                                       ins.objectid,
                                        ins.offset, parent, root_objectid,
                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op);
+                                       extent_op, for_cow);
                BUG_ON(ret);
        }
        return buf;
@@ -6078,6 +6254,7 @@ struct walk_control {
        int keep_locks;
        int reada_slot;
        int reada_count;
+       int for_reloc;
 };
 
 #define DROP_REFERENCE 1
@@ -6216,9 +6393,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
        /* wc->stage == UPDATE_BACKREF */
        if (!(wc->flags[level] & flag)) {
                BUG_ON(!path->locks[level]);
-               ret = btrfs_inc_ref(trans, root, eb, 1);
+               ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
                BUG_ON(ret);
-               ret = btrfs_dec_ref(trans, root, eb, 0);
+               ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
                BUG_ON(ret);
                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
                                                  eb->len, flag, 0);
@@ -6362,7 +6539,7 @@ skip:
                }
 
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                                       root->root_key.objectid, level - 1, 0);
+                               root->root_key.objectid, level - 1, 0, 0);
                BUG_ON(ret);
        }
        btrfs_tree_unlock(next);
@@ -6436,9 +6613,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
        if (wc->refs[level] == 1) {
                if (level == 0) {
                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-                               ret = btrfs_dec_ref(trans, root, eb, 1);
+                               ret = btrfs_dec_ref(trans, root, eb, 1,
+                                                   wc->for_reloc);
                        else
-                               ret = btrfs_dec_ref(trans, root, eb, 0);
+                               ret = btrfs_dec_ref(trans, root, eb, 0,
+                                                   wc->for_reloc);
                        BUG_ON(ret);
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6644,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                               btrfs_header_owner(path->nodes[level + 1]));
        }
 
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
 out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
@@ -6549,7 +6728,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * blocks are properly updated.
  */
 void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref)
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc)
 {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -6637,6 +6817,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = update_ref;
        wc->keep_locks = 0;
+       wc->for_reloc = for_reloc;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6721,6 +6902,7 @@ out:
  * drop subtree rooted at tree block 'node'.
  *
  * NOTE: this function will unlock and release tree block 'node'
+ * only used by the relocation code
  */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
@@ -6765,6 +6947,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        wc->stage = DROP_REFERENCE;
        wc->update_ref = 0;
        wc->keep_locks = 1;
+       wc->for_reloc = 1;
        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
        while (1) {
@@ -6792,6 +6975,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               /* pick restriper's target profile and return */
+               if (flags & BTRFS_BLOCK_GROUP_DATA &&
+                   bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+                          bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+                          bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       /* extended -> chunk profile */
+                       tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+                       return tgt;
+               }
+       }
+
        /*
         * we add in the count of missing devices because we want
         * to make sure that any RAID levels on a degraded FS
@@ -7085,7 +7291,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                 * space to fit our block group in.
                 */
                if (device->total_bytes > device->bytes_used + min_free) {
-                       ret = find_free_dev_extent(NULL, device, min_free,
+                       ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
                                dev_nr++;
@@ -7447,6 +7653,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        BUG_ON(ret);
+       update_global_block_rsv(root->fs_info);
 
        spin_lock(&cache->space_info->lock);
        cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7673,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits &= ~extra_flags;
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start)
 {
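
clear_avail_alloc_bits is the exact inverse of set_avail_alloc_bits from earlier in this patch and leans on the same explicit single bit. Extending the earlier avail-bits sketch with the clearing side (same invented bit values):

/* Clearing side of the earlier avail-bits sketch (same fake bits). */
#include <stdint.h>

#define PROFILE_MASK  0x0FULL
#define AVAIL_SINGLE  (1ULL << 4)

static uint64_t avail_bits; /* e.g. fs_info->avail_data_alloc_bits */

static void set_avail(uint64_t flags)
{
        uint64_t extra = flags & PROFILE_MASK;

        if (!extra)
                extra = AVAIL_SINGLE;   /* chunk -> extended profile */
        avail_bits |= extra;
}

static void clear_avail(uint64_t flags)
{
        uint64_t extra = flags & PROFILE_MASK;

        if (!extra)
                extra = AVAIL_SINGLE;
        avail_bits &= ~extra;
}

int main(void)
{
        set_avail(0);      /* single profile becomes an explicit bit */
        clear_avail(0);    /* last single block group removed */
        return avail_bits != 0; /* exit 0: bits balanced */
}
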
@@ -7476,6 +7699,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int index;
        int factor;
 
        root = root->fs_info->extent_root;
@@ -7491,6 +7715,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        free_excluded_extents(root, block_group);
 
        memcpy(&key, &block_group->key, sizeof(key));
+       index = get_block_group_index(block_group);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7790,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       if (list_empty(&block_group->space_info->block_groups[index]))
+               clear_avail_alloc_bits(root->fs_info, block_group->flags);
        up_write(&block_group->space_info->groups_sem);
 
        if (block_group->cached == BTRFS_CACHE_STARTED)
index 49f3c9d..9d09a4f 100644 (file)
@@ -18,6 +18,7 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
+#include "check-integrity.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        }
        bio->bi_bdev = dev->bdev;
        bio_add_page(bio, page, length, start-page_offset(page));
-       submit_bio(WRITE_SYNC, bio);
+       btrfsic_submit_bio(WRITE_SYNC, bio);
        wait_for_completion(&compl);
 
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
                                           mirror_num, bio_flags, start);
        else
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
 
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
                ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        atomic_set(&eb->blocking_writers, 0);
        atomic_set(&eb->spinning_readers, 0);
        atomic_set(&eb->spinning_writers, 0);
+       eb->lock_nested = 0;
        init_waitqueue_head(&eb->write_lock_wq);
        init_waitqueue_head(&eb->read_lock_wq);
 
index 7604c30..bc6a042 100644 (file)
@@ -129,6 +129,7 @@ struct extent_buffer {
        struct list_head leak_list;
        struct rcu_head rcu_head;
        atomic_t refs;
+       pid_t lock_owner;
 
        /* count of read lock holders on the extent buffer */
        atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
        atomic_t blocking_readers;
        atomic_t spinning_readers;
        atomic_t spinning_writers;
+       int lock_nested;
 
        /* protects write locks */
        rwlock_t lock;
index 034d985..859ba2d 100644 (file)
@@ -678,7 +678,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                new_key.objectid,
-                                               start - extent_offset);
+                                               start - extent_offset, 0);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
@@ -753,7 +753,7 @@ next_slot:
                                                disk_bytenr, num_bytes, 0,
                                                root->root_key.objectid,
                                                key.objectid, key.offset -
-                                               extent_offset);
+                                               extent_offset, 0);
                                BUG_ON(ret);
                                inode_sub_bytes(inode,
                                                extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
                                           root->root_key.objectid,
-                                          ino, orig_offset);
+                                          ino, orig_offset, 0);
                BUG_ON(ret);
 
                if (split == start) {
@@ -989,7 +989,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset);
+                                       ino, orig_offset, 0);
                BUG_ON(ret);
        }
        other_start = 0;
@@ -1006,7 +1006,7 @@ again:
                del_nr++;
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        0, root->root_key.objectid,
-                                       ino, orig_offset);
+                                       ino, orig_offset, 0);
                BUG_ON(ret);
        }
        if (del_nr == 0) {
@@ -1274,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                                   dirty_pages);
                if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
                        btrfs_btree_balance_dirty(root, 1);
-               btrfs_throttle(root);
 
                pos += copied;
                num_written += copied;
index 9a897bf..d20ff87 100644 (file)
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
        io_ctl_unmap_page(io_ctl);
 
        for (i = 0; i < io_ctl->num_pages; i++) {
-               ClearPageChecked(io_ctl->pages[i]);
-               unlock_page(io_ctl->pages[i]);
-               page_cache_release(io_ctl->pages[i]);
+               if (io_ctl->pages[i]) {
+                       ClearPageChecked(io_ctl->pages[i]);
+                       unlock_page(io_ctl->pages[i]);
+                       page_cache_release(io_ctl->pages[i]);
+               }
        }
 }
 
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        if (!num_entries)
                return 0;
 
-       io_ctl_init(&io_ctl, inode, root);
+       ret = io_ctl_init(&io_ctl, inode, root);
+       if (ret)
+               return ret;
+
        ret = readahead_cache(inode);
        if (ret)
                goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        struct io_ctl io_ctl;
        struct list_head bitmap_list;
        struct btrfs_key key;
-       u64 start, end, len;
+       u64 start, extent_start, extent_end, len;
        int entries = 0;
        int bitmaps = 0;
        int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        if (!i_size_read(inode))
                return -1;
 
-       io_ctl_init(&io_ctl, inode, root);
+       ret = io_ctl_init(&io_ctl, inode, root);
+       if (ret)
+               return -1;
 
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                                     struct btrfs_free_cluster,
                                     block_group_list);
 
-       /*
-        * We shouldn't have switched the pinned extents yet so this is the
-        * right one
-        */
-       unpin = root->fs_info->pinned_extents;
-
        /* Lock all pages first so we can lock the extent safely. */
        io_ctl_prepare_pages(&io_ctl, inode, 0);
 
        lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
                         0, &cached_state, GFP_NOFS);
 
-       /*
-        * When searching for pinned extents, we need to start at our start
-        * offset.
-        */
-       if (block_group)
-               start = block_group->key.objectid;
-
        node = rb_first(&ctl->free_space_offset);
        if (!node && cluster) {
                node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
         * We want to add any pinned extents to our free space cache
         * so we don't leak the space
         */
+
+       /*
+        * We shouldn't have switched the pinned extents yet so this is the
+        * right one
+        */
+       unpin = root->fs_info->pinned_extents;
+
+       if (block_group)
+               start = block_group->key.objectid;
+
        while (block_group && (start < block_group->key.objectid +
                               block_group->key.offset)) {
-               ret = find_first_extent_bit(unpin, start, &start, &end,
+               ret = find_first_extent_bit(unpin, start,
+                                           &extent_start, &extent_end,
                                            EXTENT_DIRTY);
                if (ret) {
                        ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                }
 
                /* This pinned extent is out of our range */
-               if (start >= block_group->key.objectid +
+               if (extent_start >= block_group->key.objectid +
                    block_group->key.offset)
                        break;
 
-               len = block_group->key.objectid +
-                       block_group->key.offset - start;
-               len = min(len, end + 1 - start);
+               extent_start = max(extent_start, start);
+               extent_end = min(block_group->key.objectid +
+                                block_group->key.offset, extent_end + 1);
+               len = extent_end - extent_start;
 
                entries++;
-               ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
                if (ret)
                        goto out_nospc;
 
-               start = end + 1;
+               start = extent_end;
        }
 
        /* Write out the bitmaps */
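
The rewritten loop clamps each pinned extent against the block group before logging it: extent_start is raised to the current cursor and extent_end is capped at the group's end, so len can no longer be computed from a start/end pair that has drifted out of range. A standalone rendition of the clamp with one worked range; all numbers are invented:

/* Standalone sketch of clamping a found extent to a block-group range. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t bg_start = 1000, bg_len = 500;   /* group: [1000, 1500) */
        uint64_t start = bg_start;                /* write-out cursor */
        uint64_t extent_start = 900;              /* found pinned extent */
        uint64_t extent_end = 1199;               /* inclusive last byte */
        uint64_t len;

        extent_start = extent_start > start ? extent_start : start;
        extent_end = (extent_end + 1 < bg_start + bg_len) ?
                     extent_end + 1 : bg_start + bg_len;
        len = extent_end - extent_start;

        /* entry covers [1000, 1200): 200 bytes, cursor moves to 1200 */
        printf("entry: start=%llu len=%llu next=%llu\n",
               (unsigned long long)extent_start,
               (unsigned long long)len,
               (unsigned long long)extent_end);
        return 0;
}
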
@@ -2283,23 +2289,23 @@ out:
 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
                                struct btrfs_free_space *entry,
                                struct btrfs_free_cluster *cluster,
-                               u64 offset, u64 bytes, u64 min_bytes)
+                               u64 offset, u64 bytes,
+                               u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        unsigned long next_zero;
        unsigned long i;
-       unsigned long search_bits;
-       unsigned long total_bits;
+       unsigned long want_bits;
+       unsigned long min_bits;
        unsigned long found_bits;
        unsigned long start = 0;
        unsigned long total_found = 0;
        int ret;
-       bool found = false;
 
        i = offset_to_bit(entry->offset, block_group->sectorsize,
                          max_t(u64, offset, entry->offset));
-       search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-       total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+       want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+       min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
 
 again:
        found_bits = 0;
@@ -2308,7 +2314,7 @@ again:
             i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
                next_zero = find_next_zero_bit(entry->bitmap,
                                               BITS_PER_BITMAP, i);
-               if (next_zero - i >= search_bits) {
+               if (next_zero - i >= min_bits) {
                        found_bits = next_zero - i;
                        break;
                }
@@ -2318,10 +2324,9 @@ again:
        if (!found_bits)
                return -ENOSPC;
 
-       if (!found) {
+       if (!total_found) {
                start = i;
                cluster->max_size = 0;
-               found = true;
        }
 
        total_found += found_bits;
@@ -2329,13 +2334,8 @@ again:
        if (cluster->max_size < found_bits * block_group->sectorsize)
                cluster->max_size = found_bits * block_group->sectorsize;
 
-       if (total_found < total_bits) {
-               i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-               if (i - start > total_bits * 2) {
-                       total_found = 0;
-                       cluster->max_size = 0;
-                       found = false;
-               }
+       if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+               i = next_zero + 1;
                goto again;
        }
 
@@ -2346,28 +2346,31 @@ again:
                                 &entry->offset_index, 1);
        BUG_ON(ret);
 
+       trace_btrfs_setup_cluster(block_group, cluster,
+                                 total_found * block_group->sectorsize, 1);
        return 0;
 }
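
btrfs_bitmap_cluster now accepts any run of at least min_bits and keeps scanning until both the total (want_bits) and the largest contiguous run (cont1_bytes) are satisfied, rather than resetting when it crosses a large gap. A compact userspace scan of the same shape over a plain 32-bit map, with naive helpers standing in for find_next_bit/find_next_zero_bit:

/* Userspace sketch of the bitmap run scan: collect runs >= min_bits
 * until the total and the largest run meet their targets. */
#include <stdio.h>
#include <stdint.h>

static int next_set(uint32_t map, int from)
{
        for (; from < 32; from++)
                if (map & (1u << from))
                        return from;
        return 32;
}

static int next_zero(uint32_t map, int from)
{
        for (; from < 32; from++)
                if (!(map & (1u << from)))
                        return from;
        return 32;
}

int main(void)
{
        uint32_t map = 0x0F0700F0; /* runs of 4, 3, 4 set bits */
        int min_bits = 3, want_bits = 10, cont1 = 4;
        int total_found = 0, max_run = 0;
        int i = next_set(map, 0);

        while (i < 32) {
                int nz = next_zero(map, i);
                int run = nz - i;

                if (run >= min_bits) {
                        total_found += run;
                        if (run > max_run)
                                max_run = run;
                }
                i = next_set(map, nz + 1);
        }
        printf("total=%d max_run=%d ok=%d\n", total_found, max_run,
               total_found >= want_bits && max_run >= cont1);
        return 0;
}
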
 
 /*
  * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least 'bytes' total bytes, at least one
+ * extent of cont1_bytes, and other extents of at least min_bytes.
  */
 static noinline int
 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                        struct btrfs_free_cluster *cluster,
                        struct list_head *bitmaps, u64 offset, u64 bytes,
-                       u64 min_bytes)
+                       u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *first = NULL;
        struct btrfs_free_space *entry = NULL;
-       struct btrfs_free_space *prev = NULL;
        struct btrfs_free_space *last;
        struct rb_node *node;
        u64 window_start;
        u64 window_free;
        u64 max_extent;
-       u64 max_gap = 128 * 1024;
+       u64 total_size = 0;
 
        entry = tree_search_offset(ctl, offset, 0, 1);
        if (!entry)
@@ -2377,8 +2380,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
         * We don't want bitmaps, so just move along until we find a normal
         * extent entry.
         */
-       while (entry->bitmap) {
-               if (list_empty(&entry->list))
+       while (entry->bitmap || entry->bytes < min_bytes) {
+               if (entry->bitmap && list_empty(&entry->list))
                        list_add_tail(&entry->list, bitmaps);
                node = rb_next(&entry->offset_index);
                if (!node)
@@ -2391,12 +2394,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
        max_extent = entry->bytes;
        first = entry;
        last = entry;
-       prev = entry;
 
-       while (window_free <= min_bytes) {
-               node = rb_next(&entry->offset_index);
-               if (!node)
-                       return -ENOSPC;
+       for (node = rb_next(&entry->offset_index); node;
+            node = rb_next(&entry->offset_index)) {
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
 
                if (entry->bitmap) {
@@ -2405,26 +2405,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
                        continue;
                }
 
-               /*
-                * we haven't filled the empty size and the window is
-                * very large.  reset and try again
-                */
-               if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-                   entry->offset - window_start > (min_bytes * 2)) {
-                       first = entry;
-                       window_start = entry->offset;
-                       window_free = entry->bytes;
-                       last = entry;
+               if (entry->bytes < min_bytes)
+                       continue;
+
+               last = entry;
+               window_free += entry->bytes;
+               if (entry->bytes > max_extent)
                        max_extent = entry->bytes;
-               } else {
-                       last = entry;
-                       window_free += entry->bytes;
-                       if (entry->bytes > max_extent)
-                               max_extent = entry->bytes;
-               }
-               prev = entry;
        }
 
+       if (window_free < bytes || max_extent < cont1_bytes)
+               return -ENOSPC;
+
        cluster->window_start = first->offset;
 
        node = &first->offset_index;
@@ -2438,17 +2430,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 
                entry = rb_entry(node, struct btrfs_free_space, offset_index);
                node = rb_next(&entry->offset_index);
-               if (entry->bitmap)
+               if (entry->bitmap || entry->bytes < min_bytes)
                        continue;
 
                rb_erase(&entry->offset_index, &ctl->free_space_offset);
                ret = tree_insert_offset(&cluster->root, entry->offset,
                                         &entry->offset_index, 0);
+               total_size += entry->bytes;
                BUG_ON(ret);
        } while (node && entry != last);
 
        cluster->max_size = max_extent;
-
+       trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
        return 0;
 }
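
Likewise, setup_cluster_no_bitmap no longer resets its window on gaps: it walks the extent entries once, skips bitmaps and anything under min_bytes, accumulates window_free and max_extent, and only then applies the window_free >= bytes and max_extent >= cont1_bytes checks. The same accumulate-then-check shape over a plain array:

/* Accumulate-then-check sketch of the rewritten extent window scan. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* free-extent sizes in bytes; 0 marks a bitmap entry to skip */
        uint64_t entries[] = { 4096, 0, 65536, 2048, 131072, 8192 };
        uint64_t min_bytes = 4096, bytes = 196608, cont1_bytes = 65536;
        uint64_t window_free = 0, max_extent = 0;
        unsigned i;

        for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
                if (entries[i] == 0 || entries[i] < min_bytes)
                        continue;       /* bitmap or too small */
                window_free += entries[i];
                if (entries[i] > max_extent)
                        max_extent = entries[i];
        }

        if (window_free < bytes || max_extent < cont1_bytes)
                printf("-ENOSPC\n");
        else
                printf("cluster: free=%llu max=%llu\n",
                       (unsigned long long)window_free,
                       (unsigned long long)max_extent);
        return 0;
}
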
 
@@ -2460,7 +2453,7 @@ static noinline int
 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
                     struct btrfs_free_cluster *cluster,
                     struct list_head *bitmaps, u64 offset, u64 bytes,
-                    u64 min_bytes)
+                    u64 cont1_bytes, u64 min_bytes)
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *entry;
@@ -2485,7 +2478,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
                if (entry->bytes < min_bytes)
                        continue;
                ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-                                          bytes, min_bytes);
+                                          bytes, cont1_bytes, min_bytes);
                if (!ret)
                        return 0;
        }
@@ -2499,7 +2492,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes + empty_size free bytes.
  * We might not find them all in one contiguous area.
  *
  * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2508,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
        struct btrfs_free_space *entry, *tmp;
        LIST_HEAD(bitmaps);
        u64 min_bytes;
+       u64 cont1_bytes;
        int ret;
 
-       /* for metadata, allow allocates with more holes */
+       /*
+        * Choose the minimum extent size we'll require for this
+        * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+        * For metadata, allow allocations with smaller extents.  For
+        * data, keep it dense.
+        */
        if (btrfs_test_opt(root, SSD_SPREAD)) {
-               min_bytes = bytes + empty_size;
+               cont1_bytes = min_bytes = bytes + empty_size;
        } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-               /*
-                * we want to do larger allocations when we are
-                * flushing out the delayed refs, it helps prevent
-                * making more work as we go along.
-                */
-               if (trans->transaction->delayed_refs.flushing)
-                       min_bytes = max(bytes, (bytes + empty_size) >> 1);
-               else
-                       min_bytes = max(bytes, (bytes + empty_size) >> 4);
-       } else
-               min_bytes = max(bytes, (bytes + empty_size) >> 2);
+               cont1_bytes = bytes;
+               min_bytes = block_group->sectorsize;
+       } else {
+               cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+               min_bytes = block_group->sectorsize;
+       }
 
        spin_lock(&ctl->tree_lock);
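
The sizing policy above has three cases, and the chosen pair feeds straight into the two setup helpers below. A hedged standalone version with one worked call; the sector size and request sizes are invented:

/* Sketch of the new cluster sizing policy from the hunk above. */
#include <stdio.h>
#include <stdint.h>

struct limits { uint64_t cont1_bytes; uint64_t min_bytes; };

static struct limits cluster_limits(int ssd_spread, int is_metadata,
                                    uint64_t bytes, uint64_t empty_size,
                                    uint64_t sectorsize)
{
        struct limits l;

        if (ssd_spread) {               /* no fragmentation at all */
                l.cont1_bytes = l.min_bytes = bytes + empty_size;
        } else if (is_metadata) {       /* small extents are fine */
                l.cont1_bytes = bytes;
                l.min_bytes = sectorsize;
        } else {                        /* data: keep it dense */
                uint64_t quarter = (bytes + empty_size) >> 2;

                l.cont1_bytes = bytes > quarter ? bytes : quarter;
                l.min_bytes = sectorsize;
        }
        return l;
}

int main(void)
{
        struct limits l = cluster_limits(0, 0, 65536, 65536, 4096);

        /* data: cont1 = max(64K, 32K) = 64K, min = one 4K sector */
        printf("cont1=%llu min=%llu\n",
               (unsigned long long)l.cont1_bytes,
               (unsigned long long)l.min_bytes);
        return 0;
}
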
 
@@ -2539,7 +2533,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
         * If we know we don't have enough space to make a cluster don't even
         * bother doing all the work to try and find one.
         */
-       if (ctl->free_space < min_bytes) {
+       if (ctl->free_space < bytes) {
                spin_unlock(&ctl->tree_lock);
                return -ENOSPC;
        }
@@ -2552,11 +2546,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                goto out;
        }
 
+       trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+                                min_bytes);
+
+       INIT_LIST_HEAD(&bitmaps);
        ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-                                     bytes, min_bytes);
+                                     bytes + empty_size,
+                                     cont1_bytes, min_bytes);
        if (ret)
                ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-                                          offset, bytes, min_bytes);
+                                          offset, bytes + empty_size,
+                                          cont1_bytes, min_bytes);
 
        /* Clear our temporary list */
        list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2567,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
                list_add_tail(&cluster->block_group_list,
                              &block_group->cluster_list);
                cluster->block_group = block_group;
+       } else {
+               trace_btrfs_failed_cluster_setup(block_group);
        }
 out:
        spin_unlock(&cluster->lock);
@@ -2588,17 +2590,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
        cluster->block_group = NULL;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+                      u64 *total_trimmed, u64 start, u64 bytes,
+                      u64 reserved_start, u64 reserved_bytes)
 {
-       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-       struct btrfs_free_space *entry = NULL;
+       struct btrfs_space_info *space_info = block_group->space_info;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
-       u64 bytes = 0;
-       u64 actually_trimmed;
-       int ret = 0;
+       int ret;
+       int update = 0;
+       u64 trimmed = 0;
 
-       *trimmed = 0;
+       spin_lock(&space_info->lock);
+       spin_lock(&block_group->lock);
+       if (!block_group->ro) {
+               block_group->reserved += reserved_bytes;
+               space_info->bytes_reserved += reserved_bytes;
+               update = 1;
+       }
+       spin_unlock(&block_group->lock);
+       spin_unlock(&space_info->lock);
+
+       ret = btrfs_error_discard_extent(fs_info->extent_root,
+                                        start, bytes, &trimmed);
+       if (!ret)
+               *total_trimmed += trimmed;
+
+       btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+       if (update) {
+               spin_lock(&space_info->lock);
+               spin_lock(&block_group->lock);
+               if (block_group->ro)
+                       space_info->bytes_readonly += reserved_bytes;
+               block_group->reserved -= reserved_bytes;
+               space_info->bytes_reserved -= reserved_bytes;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&space_info->lock);
+       }
+
+       return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+                         u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+       struct btrfs_free_space *entry;
+       struct rb_node *node;
+       int ret = 0;
+       u64 extent_start;
+       u64 extent_bytes;
+       u64 bytes;
 
        while (start < end) {
                spin_lock(&ctl->tree_lock);
@@ -2609,81 +2651,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
                }
 
                entry = tree_search_offset(ctl, start, 0, 1);
-               if (!entry)
-                       entry = tree_search_offset(ctl,
-                                                  offset_to_bitmap(ctl, start),
-                                                  1, 1);
-
-               if (!entry || entry->offset >= end) {
+               if (!entry) {
                        spin_unlock(&ctl->tree_lock);
                        break;
                }
 
-               if (entry->bitmap) {
-                       ret = search_bitmap(ctl, entry, &start, &bytes);
-                       if (!ret) {
-                               if (start >= end) {
-                                       spin_unlock(&ctl->tree_lock);
-                                       break;
-                               }
-                               bytes = min(bytes, end - start);
-                               bitmap_clear_bits(ctl, entry, start, bytes);
-                               if (entry->bytes == 0)
-                                       free_bitmap(ctl, entry);
-                       } else {
-                               start = entry->offset + BITS_PER_BITMAP *
-                                       block_group->sectorsize;
+               /* skip bitmaps */
+               while (entry->bitmap) {
+                       node = rb_next(&entry->offset_index);
+                       if (!node) {
                                spin_unlock(&ctl->tree_lock);
-                               ret = 0;
-                               continue;
+                               goto out;
                        }
-               } else {
-                       start = entry->offset;
-                       bytes = min(entry->bytes, end - start);
-                       unlink_free_space(ctl, entry);
-                       kmem_cache_free(btrfs_free_space_cachep, entry);
+                       entry = rb_entry(node, struct btrfs_free_space,
+                                        offset_index);
                }
 
+               if (entry->offset >= end) {
+                       spin_unlock(&ctl->tree_lock);
+                       break;
+               }
+
+               extent_start = entry->offset;
+               extent_bytes = entry->bytes;
+               start = max(start, extent_start);
+               bytes = min(extent_start + extent_bytes, end) - start;
+               if (bytes < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       goto next;
+               }
+
+               unlink_free_space(ctl, entry);
+               kmem_cache_free(btrfs_free_space_cachep, entry);
+
                spin_unlock(&ctl->tree_lock);
 
-               if (bytes >= minlen) {
-                       struct btrfs_space_info *space_info;
-                       int update = 0;
-
-                       space_info = block_group->space_info;
-                       spin_lock(&space_info->lock);
-                       spin_lock(&block_group->lock);
-                       if (!block_group->ro) {
-                               block_group->reserved += bytes;
-                               space_info->bytes_reserved += bytes;
-                               update = 1;
-                       }
-                       spin_unlock(&block_group->lock);
-                       spin_unlock(&space_info->lock);
-
-                       ret = btrfs_error_discard_extent(fs_info->extent_root,
-                                                        start,
-                                                        bytes,
-                                                        &actually_trimmed);
-
-                       btrfs_add_free_space(block_group, start, bytes);
-                       if (update) {
-                               spin_lock(&space_info->lock);
-                               spin_lock(&block_group->lock);
-                               if (block_group->ro)
-                                       space_info->bytes_readonly += bytes;
-                               block_group->reserved -= bytes;
-                               space_info->bytes_reserved -= bytes;
-                               spin_unlock(&space_info->lock);
-                               spin_unlock(&block_group->lock);
-                       }
+               ret = do_trimming(block_group, total_trimmed, start, bytes,
+                                 extent_start, extent_bytes);
+               if (ret)
+                       break;
+next:
+               start += bytes;
 
-                       if (ret)
-                               break;
-                       *trimmed += actually_trimmed;
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+       }
+out:
+       return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+                       u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+       struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+       struct btrfs_free_space *entry;
+       int ret = 0;
+       int ret2;
+       u64 bytes;
+       u64 offset = offset_to_bitmap(ctl, start);
+
+       while (offset < end) {
+               bool next_bitmap = false;
+
+               spin_lock(&ctl->tree_lock);
+
+               if (ctl->free_space < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       break;
+               }
+
+               entry = tree_search_offset(ctl, offset, 1, 0);
+               if (!entry) {
+                       spin_unlock(&ctl->tree_lock);
+                       next_bitmap = true;
+                       goto next;
+               }
+
+               bytes = minlen;
+               ret2 = search_bitmap(ctl, entry, &start, &bytes);
+               if (ret2 || start >= end) {
+                       spin_unlock(&ctl->tree_lock);
+                       next_bitmap = true;
+                       goto next;
+               }
+
+               bytes = min(bytes, end - start);
+               if (bytes < minlen) {
+                       spin_unlock(&ctl->tree_lock);
+                       goto next;
+               }
+
+               bitmap_clear_bits(ctl, entry, start, bytes);
+               if (entry->bytes == 0)
+                       free_bitmap(ctl, entry);
+
+               spin_unlock(&ctl->tree_lock);
+
+               ret = do_trimming(block_group, total_trimmed, start, bytes,
+                                 start, bytes);
+               if (ret)
+                       break;
+next:
+               if (next_bitmap) {
+                       offset += BITS_PER_BITMAP * ctl->unit;
+               } else {
+                       start += bytes;
+                       if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+                               offset += BITS_PER_BITMAP * ctl->unit;
                }
-               start += bytes;
-               bytes = 0;
 
                if (fatal_signal_pending(current)) {
                        ret = -ERESTARTSYS;
@@ -2696,6 +2775,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
        return ret;
 }
 
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+                          u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+       int ret;
+
+       *trimmed = 0;
+
+       ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+       if (ret)
+               return ret;
+
+       ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+       return ret;
+}
+
 /*
  * Find the left-most item in the cache tree, and then return the
  * smallest inode number in the item.
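
A note on the sizing policy above: cont1_bytes is (roughly) the contiguous
chunk the cluster must contain, min_bytes the smallest free-space entry still
considered for it.  A standalone userspace sketch of how the three branches
work out (the helper name, MAX macro and 4096-byte sectorsize are stand-ins,
not kernel code):

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* mirrors the three policy branches in btrfs_find_space_cluster() */
static void cluster_policy(int ssd_spread, int metadata,
                           uint64_t bytes, uint64_t empty_size,
                           uint64_t sectorsize,
                           uint64_t *cont1_bytes, uint64_t *min_bytes)
{
        if (ssd_spread) {
                /* SSD_SPREAD: the whole request must be one extent */
                *cont1_bytes = *min_bytes = bytes + empty_size;
        } else if (metadata) {
                /* metadata: one extent covers the request, rest may be small */
                *cont1_bytes = bytes;
                *min_bytes = sectorsize;
        } else {
                /* data: largest extent covers at least 1/4 of the request */
                *cont1_bytes = MAX(bytes, (bytes + empty_size) >> 2);
                *min_bytes = sectorsize;
        }
}

int main(void)
{
        uint64_t c, m;

        /* data allocation: bytes = 1 MiB, empty_size = 7 MiB */
        cluster_policy(0, 0, 1 << 20, 7 << 20, 4096, &c, &m);
        /* prints cont1=2097152 (8 MiB >> 2), min=4096 */
        printf("data: cont1=%llu min=%llu\n",
               (unsigned long long)c, (unsigned long long)m);
        return 0;
}
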
index f8962a9..213ffa8 100644
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
                                          trans->bytes_reserved);
        if (ret)
                goto out;
+       trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+                                     trans->bytes_reserved, 1);
 again:
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
 out_put:
        iput(inode);
 out_release:
+       trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
+                                     trans->bytes_reserved, 0);
        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
        trans->block_rsv = rsv;
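
The (u64)trans value in the two new tracepoints is a cookie: the same number
is emitted with reserve=1 when the ino_cache reservation is taken and with
reserve=0 when it is released, so a trace consumer can pair the events.  A toy
pairing example (struct layout and field names invented; real consumers parse
text from tracefs):

#include <stdio.h>
#include <string.h>

/* hypothetical pre-parsed event; the real source is trace_pipe text */
struct rsv_event {
        const char *type;               /* e.g. "ino_cache" */
        unsigned long long id;          /* the (u64)trans cookie */
        unsigned long long bytes;
        int reserve;                    /* 1 = reserve, 0 = release */
};

/* net outstanding bytes for one (type, id) pair */
static long long outstanding(const struct rsv_event *ev, int n,
                             const char *type, unsigned long long id)
{
        long long total = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (strcmp(ev[i].type, type) != 0 || ev[i].id != id)
                        continue;
                total += ev[i].reserve ? (long long)ev[i].bytes
                                       : -(long long)ev[i].bytes;
        }
        return total;   /* back to 0 once the release has fired */
}

int main(void)
{
        struct rsv_event log[] = {
                { "ino_cache", 0xffff880012345678ULL, 540672, 1 },
                { "ino_cache", 0xffff880012345678ULL, 540672, 0 },
        };

        printf("outstanding: %lld\n",
               outstanding(log, 2, "ino_cache", 0xffff880012345678ULL));
        return 0;
}
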
index 81b235a..0da19a0 100644
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root)
 {
+       struct btrfs_block_rsv *block_rsv;
        int ret;
 
        if (!list_empty(&root->orphan_list) ||
            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
                return;
 
+       spin_lock(&root->orphan_lock);
+       if (!list_empty(&root->orphan_list)) {
+               spin_unlock(&root->orphan_lock);
+               return;
+       }
+
+       if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+               spin_unlock(&root->orphan_lock);
+               return;
+       }
+
+       block_rsv = root->orphan_block_rsv;
+       root->orphan_block_rsv = NULL;
+       spin_unlock(&root->orphan_lock);
+
        if (root->orphan_item_inserted &&
            btrfs_root_refs(&root->root_item) > 0) {
                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
                root->orphan_item_inserted = 0;
        }
 
-       if (root->orphan_block_rsv) {
-               WARN_ON(root->orphan_block_rsv->size > 0);
-               btrfs_free_block_rsv(root, root->orphan_block_rsv);
-               root->orphan_block_rsv = NULL;
+       if (block_rsv) {
+               WARN_ON(block_rsv->size > 0);
+               btrfs_free_block_rsv(root, block_rsv);
        }
 }
 
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                continue;
                        }
                        nr_truncate++;
-                       /*
-                        * Need to hold the imutex for reservation purposes, not
-                        * a huge deal here but I have a WARN_ON in
-                        * btrfs_delalloc_reserve_space to catch offenders.
-                        */
-                       mutex_lock(&inode->i_mutex);
                        ret = btrfs_truncate(inode);
-                       mutex_unlock(&inode->i_mutex);
                } else {
                        nr_unlink++;
                }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
                BUG_ON(!root->fs_info->enospc_unlink);
                root->fs_info->enospc_unlink = 0;
        }
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
-       int encoding;
        int ret;
        int err = 0;
        u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                found_type = btrfs_key_type(&found_key);
-               encoding = 0;
 
                if (found_key.objectid != ino)
                        break;
@@ -3072,10 +3078,6 @@ search_again:
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        extent_type = btrfs_file_extent_type(leaf, fi);
-                       encoding = btrfs_file_extent_compression(leaf, fi);
-                       encoding |= btrfs_file_extent_encryption(leaf, fi);
-                       encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
                        if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                                item_end +=
                                    btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
                if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
                        u64 num_dec;
                        extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-                       if (!del_item && !encoding) {
+                       if (!del_item) {
                                u64 orig_num_bytes =
                                        btrfs_file_extent_num_bytes(leaf, fi);
                                extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                               ino, extent_offset);
+                                               ino, extent_offset, 0);
                        BUG_ON(ret);
                }
 
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                ret = btrfs_update_inode(trans, root, inode);
-               btrfs_end_transaction_throttle(trans, root);
+               btrfs_end_transaction(trans, root);
        } else {
 
                /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        }
 out_unlock:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        }
 out_unlock:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
 
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 fail:
        if (drop_inode) {
                inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 out_fail:
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
                        }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
-                       WARN_ON(1);
+                       BUG();
                        if (!trans) {
                                kunmap(page);
                                free_extent_map(em);
@@ -6402,10 +6404,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_start;
        u64 page_end;
 
-       /* Need this to keep space reservations serialized */
-       mutex_lock(&inode->i_mutex);
        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-       mutex_unlock(&inode->i_mutex);
        if (!ret)
                ret = btrfs_update_time(vma->vm_file);
        if (ret) {
@@ -6494,8 +6493,8 @@ out_unlock:
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
+       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
        return ret;
 }
 
@@ -6668,7 +6667,7 @@ end_trans:
                        err = ret;
 
                nr = trans->blocks_used;
-               ret = btrfs_end_transaction_throttle(trans, root);
+               ret = btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(root, nr);
        }
 
@@ -6749,6 +6748,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        extent_io_tree_init(&ei->io_tree, &inode->i_data);
        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
        mutex_init(&ei->log_mutex);
+       mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7074,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                btrfs_end_log_trans(root);
        }
 out_fail:
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
 out_notrans:
        if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7246,7 @@ out_unlock:
        if (!err)
                d_instantiate(dentry, inode);
        nr = trans->blocks_used;
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
index 5441ff1..ab62001 100644
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        struct btrfs_trans_handle *trans;
        unsigned int flags, oldflags;
        int ret;
+       u64 ip_oldflags;
+       unsigned int i_oldflags;
 
        if (btrfs_root_readonly(root))
                return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 
        mutex_lock(&inode->i_mutex);
 
+       ip_oldflags = ip->flags;
+       i_oldflags = inode->i_flags;
+
        flags = btrfs_mask_flags(inode->i_mode, flags);
        oldflags = btrfs_flags_to_ioctl(ip->flags);
        if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }
 
-       trans = btrfs_join_transaction(root);
-       BUG_ON(IS_ERR(trans));
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+               goto out_drop;
+       }
 
        btrfs_update_iflags(inode);
        inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, inode);
-       BUG_ON(ret);
 
        btrfs_end_transaction(trans, root);
+ out_drop:
+       if (ret) {
+               ip->flags = ip_oldflags;
+               inode->i_flags = i_oldflags;
+       }
 
        mnt_drop_write_file(file);
-
-       ret = 0;
  out_unlock:
        mutex_unlock(&inode->i_mutex);
        return ret;
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
 
 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 {
-       struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
        struct btrfs_device *device;
        struct request_queue *q;
        struct fstrim_range range;
        u64 minlen = ULLONG_MAX;
        u64 num_devices = 0;
-       u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+       u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 
        range.len = min(range.len, total_bytes - range.start);
        range.minlen = max(range.minlen, minlen);
-       ret = btrfs_trim_fs(root, &range);
+       ret = btrfs_trim_fs(fs_info->tree_root, &range);
        if (ret < 0)
                return ret;
 
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
                return PTR_ERR(trans);
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                                     0, objectid, NULL, 0, 0, 0);
+                                     0, objectid, NULL, 0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
                return 0;
        file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
 
-       mutex_lock(&inode->i_mutex);
        ret = btrfs_delalloc_reserve_space(inode,
                                           num_pages << PAGE_CACHE_SHIFT);
-       mutex_unlock(&inode->i_mutex);
        if (ret)
                return ret;
 again:
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-       mutex_lock(&root->fs_info->volume_mutex);
        sizestr = vol_args->name;
        devstr = strchr(sizestr, ':');
        if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
                       (unsigned long long)devid);
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (!strcmp(sizestr, "max"))
                new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                new_size = memparse(sizestr, NULL);
                if (new_size == 0) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
        }
 
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
        if (mod < 0) {
                if (new_size > old_size) {
                        ret = -EINVAL;
-                       goto out_unlock;
+                       goto out_free;
                }
                new_size = old_size - new_size;
        } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
 
        if (new_size < 256 * 1024 * 1024) {
                ret = -EINVAL;
-               goto out_unlock;
+               goto out_free;
        }
        if (new_size > device->bdev->bd_inode->i_size) {
                ret = -EFBIG;
-               goto out_unlock;
+               goto out_free;
        }
 
        do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                trans = btrfs_start_transaction(root, 0);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
-                       goto out_unlock;
+                       goto out_free;
                }
                ret = btrfs_grow_device(trans, device, new_size);
                btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                ret = btrfs_shrink_device(device, new_size);
        }
 
-out_unlock:
-       mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_init_new_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       mutex_lock(&root->fs_info->volume_mutex);
+       if (root->fs_info->balance_ctl) {
+               printk(KERN_INFO "btrfs: balance in progress\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
        vol_args = memdup_user(arg, sizeof(*vol_args));
-       if (IS_ERR(vol_args))
-               return PTR_ERR(vol_args);
+       if (IS_ERR(vol_args)) {
+               ret = PTR_ERR(vol_args);
+               goto out;
+       }
 
        vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
        ret = btrfs_rm_device(root, vol_args->name);
 
        kfree(vol_args);
+out:
+       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
 }
 
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                                        disko, diskl, 0,
                                                        root->root_key.objectid,
                                                        btrfs_ino(inode),
-                                                       new_key.offset - datao);
+                                                       new_key.offset - datao,
+                                                       0);
                                        BUG_ON(ret);
                                }
                        } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 {
        int ret = 0;
        int size;
-       u64 extent_offset;
+       u64 extent_item_pos;
        struct btrfs_ioctl_logical_ino_args *loi;
        struct btrfs_data_container *inodes = NULL;
        struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
        }
 
        ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+       btrfs_release_path(path);
 
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = -ENOENT;
        if (ret < 0)
                goto out;
 
-       extent_offset = loi->logical - key.objectid;
+       extent_item_pos = loi->logical - key.objectid;
        ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-                                       extent_offset, build_ino_list, inodes);
+                                       extent_item_pos, build_ino_list,
+                                       inodes);
 
        if (ret < 0)
                goto out;
@@ -3034,6 +3075,163 @@ out:
        return ret;
 }
 
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+       bargs->flags = bctl->flags;
+
+       if (atomic_read(&fs_info->balance_running))
+               bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+       if (atomic_read(&fs_info->balance_pause_req))
+               bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+       if (atomic_read(&fs_info->balance_cancel_req))
+               bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+       memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+       memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+       memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+       if (lock) {
+               spin_lock(&fs_info->balance_lock);
+               memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+               spin_unlock(&fs_info->balance_lock);
+       } else {
+               memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+       }
+}
+
+static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs;
+       struct btrfs_balance_control *bctl;
+       int ret;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (fs_info->sb->s_flags & MS_RDONLY)
+               return -EROFS;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       if (arg) {
+               bargs = memdup_user(arg, sizeof(*bargs));
+               if (IS_ERR(bargs)) {
+                       ret = PTR_ERR(bargs);
+                       goto out;
+               }
+
+               if (bargs->flags & BTRFS_BALANCE_RESUME) {
+                       if (!fs_info->balance_ctl) {
+                               ret = -ENOTCONN;
+                               goto out_bargs;
+                       }
+
+                       bctl = fs_info->balance_ctl;
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->flags |= BTRFS_BALANCE_RESUME;
+                       spin_unlock(&fs_info->balance_lock);
+
+                       goto do_balance;
+               }
+       } else {
+               bargs = NULL;
+       }
+
+       if (fs_info->balance_ctl) {
+               ret = -EINPROGRESS;
+               goto out_bargs;
+       }
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out_bargs;
+       }
+
+       bctl->fs_info = fs_info;
+       if (arg) {
+               memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+               memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+               memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+               bctl->flags = bargs->flags;
+       } else {
+               /* balance everything - no filters */
+               bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+       }
+
+do_balance:
+       ret = btrfs_balance(bctl, bargs);
+       /*
+        * bctl is freed in __cancel_balance, or in free_fs_info if the
+        * restriper stayed paused all the way until unmount
+        */
+       if (arg) {
+               if (copy_to_user(arg, bargs, sizeof(*bargs)))
+                       ret = -EFAULT;
+       }
+
+out_bargs:
+       kfree(bargs);
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       switch (cmd) {
+       case BTRFS_BALANCE_CTL_PAUSE:
+               return btrfs_pause_balance(root->fs_info);
+       case BTRFS_BALANCE_CTL_CANCEL:
+               return btrfs_cancel_balance(root->fs_info);
+       }
+
+       return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+                                        void __user *arg)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ioctl_balance_args *bargs;
+       int ret = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               ret = -ENOTCONN;
+               goto out;
+       }
+
+       bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+       if (!bargs) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       update_ioctl_balance_args(fs_info, 1, bargs);
+
+       if (copy_to_user(arg, bargs, sizeof(*bargs)))
+               ret = -EFAULT;
+
+       kfree(bargs);
+out:
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
                cmd, unsigned long arg)
 {
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_DEV_INFO:
                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
-               return btrfs_balance(root->fs_info->dev_root);
+               return btrfs_ioctl_balance(root, NULL);
        case BTRFS_IOC_CLONE:
                return btrfs_ioctl_clone(file, arg, 0, 0, 0);
        case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
                return btrfs_ioctl_scrub_cancel(root, argp);
        case BTRFS_IOC_SCRUB_PROGRESS:
                return btrfs_ioctl_scrub_progress(root, argp);
+       case BTRFS_IOC_BALANCE_V2:
+               return btrfs_ioctl_balance(root, argp);
+       case BTRFS_IOC_BALANCE_CTL:
+               return btrfs_ioctl_balance_ctl(root, arg);
+       case BTRFS_IOC_BALANCE_PROGRESS:
+               return btrfs_ioctl_balance_progress(root, argp);
        }
 
        return -ENOTTY;
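
For completeness, the fitrim path above (now feeding the trim code split into
trim_no_bitmap() and trim_bitmaps()) is reached from userspace via the generic
FITRIM ioctl; the kernel clamps range.len to the filesystem size and raises
range.minlen to the devices' discard granularity, as the hunk shows.  A
minimal caller (mount point taken from argv) might look like:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
        struct fstrim_range range = {
                .start = 0,
                .len = (__u64)-1,       /* clamped by the kernel */
                .minlen = 0,            /* raised to discard granularity */
        };
        int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

        if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                return 1;
        }
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        return 0;
}
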
index 252ae99..4f69028 100644
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
        __u64 reserved[124];                    /* pad to 1k */
 };
 
+/* balance control ioctl modes */
+#define BTRFS_BALANCE_CTL_PAUSE                1
+#define BTRFS_BALANCE_CTL_CANCEL       2
+
+/*
+ * this is packed, because it should be exactly the same as its disk
+ * byte order counterpart (struct btrfs_disk_balance_args)
+ */
+struct btrfs_balance_args {
+       __u64 profiles;
+       __u64 usage;
+       __u64 devid;
+       __u64 pstart;
+       __u64 pend;
+       __u64 vstart;
+       __u64 vend;
+
+       __u64 target;
+
+       __u64 flags;
+
+       __u64 unused[8];
+} __attribute__ ((__packed__));
+
+/* report balance progress to userspace */
+struct btrfs_balance_progress {
+       __u64 expected;         /* estimated # of chunks that will be
+                                * relocated to fulfill the request */
+       __u64 considered;       /* # of chunks we have considered so far */
+       __u64 completed;        /* # of chunks relocated so far */
+};
+
+#define BTRFS_BALANCE_STATE_RUNNING    (1ULL << 0)
+#define BTRFS_BALANCE_STATE_PAUSE_REQ  (1ULL << 1)
+#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
+
+struct btrfs_ioctl_balance_args {
+       __u64 flags;                            /* in/out */
+       __u64 state;                            /* out */
+
+       struct btrfs_balance_args data;         /* in/out */
+       struct btrfs_balance_args meta;         /* in/out */
+       struct btrfs_balance_args sys;          /* in/out */
+
+       struct btrfs_balance_progress stat;     /* out */
+
+       __u64 unused[72];                       /* pad to 1k */
+};
+
 #define BTRFS_INO_LOOKUP_PATH_MAX 4080
 struct btrfs_ioctl_ino_lookup_args {
        __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
                                 struct btrfs_ioctl_dev_info_args)
 #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
                               struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
+                                  struct btrfs_ioctl_balance_args)
+#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
+#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
+                                       struct btrfs_ioctl_balance_args)
 #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
                                        struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
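
A sketch of how userspace would drive the new balance interface, assuming a
copy of the header above (as later shipped with btrfs-progs); the balance type
bits live in the kernel's volumes.h in this series, so they are restated here:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "ioctl.h"                      /* the ABI definitions above */

/* restated from the kernel's volumes.h */
#define BTRFS_BALANCE_DATA      (1ULL << 0)
#define BTRFS_BALANCE_SYSTEM    (1ULL << 1)
#define BTRFS_BALANCE_METADATA  (1ULL << 2)

int main(int argc, char **argv)
{
        struct btrfs_ioctl_balance_args args;
        int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&args, 0, sizeof(args)); /* zeroed filters match everything */
        args.flags = BTRFS_BALANCE_DATA | BTRFS_BALANCE_SYSTEM |
                     BTRFS_BALANCE_METADATA;    /* i.e. a full balance */

        if (ioctl(fd, BTRFS_IOC_BALANCE_V2, &args) < 0) {
                perror("BTRFS_IOC_BALANCE_V2");
                return 1;
        }
        printf("relocated %llu of %llu considered chunks\n",
               (unsigned long long)args.stat.completed,
               (unsigned long long)args.stat.considered);
        return 0;
}
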
index d77b67c..5e178d8 100644
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
  */
 void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        if (rw == BTRFS_WRITE_LOCK) {
                if (atomic_read(&eb->blocking_writers) == 0) {
                        WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
  */
 void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
                BUG_ON(atomic_read(&eb->blocking_writers) != 1);
                write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
 again:
+       read_lock(&eb->lock);
+       if (atomic_read(&eb->blocking_writers) &&
+           current->pid == eb->lock_owner) {
+               /*
+                * This extent is already write-locked by our thread. We allow
+                * an additional read lock to be added because it's for the same
+                * thread. btrfs_find_all_roots() depends on this as it may be
+                * called on a partly (write-)locked tree.
+                */
+               BUG_ON(eb->lock_nested);
+               eb->lock_nested = 1;
+               read_unlock(&eb->lock);
+               return;
+       }
+       read_unlock(&eb->lock);
        wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
        read_lock(&eb->lock);
        if (atomic_read(&eb->blocking_writers)) {
                read_unlock(&eb->lock);
-               wait_event(eb->write_lock_wq,
-                          atomic_read(&eb->blocking_writers) == 0);
                goto again;
        }
        atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
        }
        atomic_inc(&eb->write_locks);
        atomic_inc(&eb->spinning_writers);
+       eb->lock_owner = current->pid;
        return 1;
 }
 
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       eb->lock_nested = 0;
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->spinning_readers) == 0);
        atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+       if (eb->lock_nested) {
+               read_lock(&eb->lock);
+               if (eb->lock_nested && current->pid == eb->lock_owner) {
+                       eb->lock_nested = 0;
+                       read_unlock(&eb->lock);
+                       return;
+               }
+               read_unlock(&eb->lock);
+       }
        btrfs_assert_tree_read_locked(eb);
        WARN_ON(atomic_read(&eb->blocking_readers) == 0);
        if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
        WARN_ON(atomic_read(&eb->spinning_writers));
        atomic_inc(&eb->spinning_writers);
        atomic_inc(&eb->write_locks);
+       eb->lock_owner = current->pid;
        return 0;
 }
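
The lock_nested/lock_owner pair above exists so the thread that already holds
a blocking write lock can take one recursive read lock on the same extent
buffer without deadlocking against itself.  A stripped-down userspace model of
the idea (purely illustrative; gettid() needs glibc 2.30+, and the write-lock
side that sets lock_owner is omitted):

#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>             /* gettid() */

struct ebuf {
        pthread_rwlock_t lock;
        pid_t lock_owner;       /* tid of the blocking write-lock holder */
        int lock_nested;        /* one level of read recursion outstanding */
};

static void ebuf_read_lock(struct ebuf *eb)
{
        /*
         * If our own thread holds the blocking write lock, don't touch
         * the rwlock at all: record the recursion, exactly one level
         * deep, and unwind it again in ebuf_read_unlock().
         */
        if (eb->lock_owner == gettid()) {
                eb->lock_nested = 1;
                return;
        }
        pthread_rwlock_rdlock(&eb->lock);
}

static void ebuf_read_unlock(struct ebuf *eb)
{
        if (eb->lock_nested && eb->lock_owner == gettid()) {
                eb->lock_nested = 0;
                return;
        }
        pthread_rwlock_unlock(&eb->lock);
}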
 
index cfb5543..8c1aae2 100644
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
                                           num_bytes, parent,
                                           btrfs_header_owner(leaf),
-                                          key.objectid, key.offset);
+                                          key.objectid, key.offset, 1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                        parent, btrfs_header_owner(leaf),
-                                       key.objectid, key.offset);
+                                       key.objectid, key.offset, 1);
                BUG_ON(ret);
        }
        if (dirty)
@@ -1778,21 +1778,23 @@ again:
 
                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0);
+                                       src->root_key.objectid, level - 1, 0,
+                                       1);
                BUG_ON(ret);
                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0);
+                                       0, 1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
                                        path->nodes[level]->start,
-                                       src->root_key.objectid, level - 1, 0);
+                                       src->root_key.objectid, level - 1, 0,
+                                       1);
                BUG_ON(ret);
 
                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
                                        0, dest->root_key.objectid, level - 1,
-                                       0);
+                                       0, 1);
                BUG_ON(ret);
 
                btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
                } else {
                        list_del_init(&reloc_root->root_list);
                }
-               btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+               btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
        }
 
        if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                                                node->eb->start, blocksize,
                                                upper->eb->start,
                                                btrfs_header_owner(upper->eb),
-                                               node->level, 0);
+                                               node->level, 0, 1);
                        BUG_ON(ret);
 
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
        while (index <= last_index) {
-               mutex_lock(&inode->i_mutex);
                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
-               mutex_unlock(&inode->i_mutex);
                if (ret)
                        goto out;
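
The extra 0/1 argument appended to btrfs_inc_extent_ref() and
btrfs_free_extent() throughout this series is the for_cow flag from the
delayed-ref rework; relocation passes 1 because its reference updates come
from COWing shared trees.  Reconstructed from the call sites, the signatures
are roughly:

struct btrfs_trans_handle;
struct btrfs_root;
typedef unsigned long long u64;         /* the kernel's u64, spelled out */

int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 owner, u64 offset,
                         int for_cow);
int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent,
                      u64 root_objectid, u64 owner, u64 offset,
                      int for_cow);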
 
index ddf2c90..9770cc5 100644
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "check-integrity.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        u8 ref_level;
        unsigned long ptr = 0;
        const int bufsize = 4096;
-       u64 extent_offset;
+       u64 extent_item_pos;
 
        path = btrfs_alloc_path();
 
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        if (ret < 0)
                goto out;
 
-       extent_offset = swarn.logical - found_key.objectid;
+       extent_item_pos = swarn.logical - found_key.objectid;
        swarn.extent_item_size = found_key.offset;
 
        eb = path->nodes[0];
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+       btrfs_release_path(path);
 
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
        } else {
                swarn.path = path;
                iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       extent_offset,
+                                       extent_item_pos,
                                        scrub_print_warning_inode, &swarn);
        }
 
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        bio_add_page(bio, page, PAGE_SIZE, 0);
        bio->bi_end_io = scrub_fixup_end_io;
        bio->bi_private = &complete;
-       submit_bio(rw, bio);
+       btrfsic_submit_bio(rw, bio);
 
        /* this will also unplug the queue */
        wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
 
-       submit_bio(READ, sbio->bio);
+       btrfsic_submit_bio(READ, sbio->bio);
 
        return 0;
 }
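
btrfsic_submit_bio() is the integrity checker's drop-in replacement for
submit_bio(); when CONFIG_BTRFS_FS_CHECK_INTEGRITY is disabled it costs
nothing, along the lines of this paraphrase of check-integrity.h (not the
verbatim header):

struct bio;

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
/* record the write, verify invariants, then hand the bio to the block layer */
void btrfsic_submit_bio(int rw, struct bio *bio);
#else
#define btrfsic_submit_bio submit_bio   /* compiles away entirely */
#endif
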
index ae488aa..3ce97b2 100644
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 
 static void btrfs_put_super(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       int ret;
-
-       ret = close_ctree(root);
-       sb->s_fs_info = NULL;
-
-       (void)ret; /* FIXME: need to fix VFS to return error? */
+       (void)close_ctree(btrfs_sb(sb)->tree_root);
+       /* FIXME: need to fix VFS to return error? */
+       /* AV: return it _where_?  ->put_super() can be triggered by any number
+        * of async events, up to and including delivery of SIGKILL to the
+        * last process that kept it busy.  Or segfault in the aforementioned
+        * process...  Whom would you report that to?
+        */
 }
 
 enum {
@@ -163,8 +163,11 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-       Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_check_integrity, Opt_check_integrity_including_extent_data,
+       Opt_check_integrity_print_mask,
+       Opt_err,
 };
 
 static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
        {Opt_inode_cache, "inode_cache"},
        {Opt_no_space_cache, "nospace_cache"},
        {Opt_recovery, "recovery"},
+       {Opt_skip_balance, "skip_balance"},
+       {Opt_check_integrity, "check_int"},
+       {Opt_check_integrity_including_extent_data, "check_int_data"},
+       {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
        {Opt_err, NULL},
 };
 
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        printk(KERN_INFO "btrfs: enabling auto recovery");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
+               case Opt_skip_balance:
+                       btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+                       break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+               case Opt_check_integrity_including_extent_data:
+                       printk(KERN_INFO "btrfs: enabling check integrity"
+                              " including extent data\n");
+                       btrfs_set_opt(info->mount_opt,
+                                     CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity:
+                       printk(KERN_INFO "btrfs: enabling check integrity\n");
+                       btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+                       break;
+               case Opt_check_integrity_print_mask:
+                       intarg = 0;
+                       match_int(&args[0], &intarg);
+                       if (intarg) {
+                               info->check_integrity_print_mask = intarg;
+                               printk(KERN_INFO "btrfs:"
+                                      " check_integrity_print_mask 0x%x\n",
+                                      info->check_integrity_print_mask);
+                       }
+                       break;
+#else
+               case Opt_check_integrity_including_extent_data:
+               case Opt_check_integrity:
+               case Opt_check_integrity_print_mask:
+                       printk(KERN_ERR "btrfs: support for check_integrity*"
+                              " not compiled in!\n");
+                       ret = -EINVAL;
+                       goto out;
+#endif
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
 static struct dentry *get_default_root(struct super_block *sb,
                                       u64 subvol_objectid)
 {
-       struct btrfs_root *root = sb->s_fs_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        struct btrfs_root *new_root;
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
         * will mount by default if we haven't been given a specific subvolume
         * to mount.
         */
-       dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+       dir_id = btrfs_super_root_dir(fs_info->super_copy);
        di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
        if (IS_ERR(di)) {
                btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
                 */
                btrfs_free_path(path);
                dir_id = BTRFS_FIRST_FREE_OBJECTID;
-               new_root = root->fs_info->fs_root;
+               new_root = fs_info->fs_root;
                goto setup_root;
        }
 
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
        btrfs_free_path(path);
 
 find_root:
-       new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+       new_root = btrfs_read_fs_root_no_name(fs_info, &location);
        if (IS_ERR(new_root))
                return ERR_CAST(new_root);
 
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
 {
        struct inode *inode;
        struct dentry *root_dentry;
-       struct btrfs_root *tree_root;
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_key key;
        int err;
 
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
        sb->s_flags |= MS_POSIXACL;
 #endif
 
-       tree_root = open_ctree(sb, fs_devices, (char *)data);
-
-       if (IS_ERR(tree_root)) {
+       err = open_ctree(sb, fs_devices, (char *)data);
+       if (err) {
                printk("btrfs: open_ctree failed\n");
-               return PTR_ERR(tree_root);
+               return err;
        }
-       sb->s_fs_info = tree_root;
 
        key.objectid = BTRFS_FIRST_FREE_OBJECTID;
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;
-       inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+       inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
 
        save_mount_options(sb, data);
        cleancache_init_fs(sb);
+       sb->s_flags |= MS_ACTIVE;
        return 0;
 
 fail_close:
-       close_ctree(tree_root);
+       close_ctree(fs_info->tree_root);
        return err;
 }
 
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
        struct btrfs_trans_handle *trans;
-       struct btrfs_root *root = btrfs_sb(sb);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        int ret;
 
        trace_btrfs_sync_fs(wait);
 
        if (!wait) {
-               filemap_flush(root->fs_info->btree_inode->i_mapping);
+               filemap_flush(fs_info->btree_inode->i_mapping);
                return 0;
        }
 
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
-       struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
+       struct btrfs_root *root = info->tree_root;
        char *compress_type;
 
        if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(root, INODE_MAP_CACHE))
                seq_puts(seq, ",inode_cache");
+       if (btrfs_test_opt(root, SKIP_BALANCE))
+               seq_puts(seq, ",skip_balance");
        return 0;
 }
 
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-       struct btrfs_root *test_root = data;
-       struct btrfs_root *root = btrfs_sb(s);
+       struct btrfs_fs_info *p = data;
+       struct btrfs_fs_info *fs_info = btrfs_sb(s);
 
-       /*
-        * If this super block is going away, return false as it
-        * can't match as an existing super block.
-        */
-       if (!atomic_read(&s->s_active))
-               return 0;
-       return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+       return fs_info->fs_devices == p->fs_devices;
 }
 
 static int btrfs_set_super(struct super_block *s, void *data)
 {
-       s->s_fs_info = data;
-
-       return set_anon_super(s, data);
+       int err = set_anon_super(s, data);
+       if (!err)
+               s->s_fs_info = data;
+       return err;
 }
 
 /*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        if (!fs_info)
                return ERR_PTR(-ENOMEM);
 
-       fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-       if (!fs_info->tree_root) {
-               error = -ENOMEM;
-               goto error_fs_info;
-       }
-       fs_info->tree_root->fs_info = fs_info;
        fs_info->fs_devices = fs_devices;
 
        fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
        }
 
        bdev = fs_devices->latest_bdev;
-       s = sget(fs_type, btrfs_test_super, btrfs_set_super,
-                fs_info->tree_root);
+       s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
        if (IS_ERR(s)) {
                error = PTR_ERR(s);
                goto error_close_devices;
        }
 
        if (s->s_root) {
-               if ((flags ^ s->s_flags) & MS_RDONLY) {
-                       deactivate_locked_super(s);
-                       error = -EBUSY;
-                       goto error_close_devices;
-               }
-
                btrfs_close_devices(fs_devices);
                free_fs_info(fs_info);
+               if ((flags ^ s->s_flags) & MS_RDONLY)
+                       error = -EBUSY;
        } else {
                char b[BDEVNAME_SIZE];
 
                s->s_flags = flags | MS_NOSEC;
                strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-               btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+               btrfs_sb(s)->bdev_holder = fs_type;
                error = btrfs_fill_super(s, fs_devices, data,
                                         flags & MS_SILENT ? 1 : 0);
-               if (error) {
-                       deactivate_locked_super(s);
-                       return ERR_PTR(error);
-               }
-
-               s->s_flags |= MS_ACTIVE;
        }
 
-       root = get_default_root(s, subvol_objectid);
-       if (IS_ERR(root)) {
+       root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+       if (IS_ERR(root))
                deactivate_locked_super(s);
-               return root;
-       }
 
        return root;
 
@@ -977,7 +997,8 @@ error_fs_info:
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       struct btrfs_root *root = fs_info->tree_root;
        int ret;
 
        ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                ret =  btrfs_commit_super(root);
                WARN_ON(ret);
        } else {
-               if (root->fs_info->fs_devices->rw_devices == 0)
+               if (fs_info->fs_devices->rw_devices == 0)
                        return -EACCES;
 
-               if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
+               if (btrfs_super_log_root(fs_info->super_copy) != 0)
                        return -EINVAL;
 
-               ret = btrfs_cleanup_fs_roots(root->fs_info);
+               ret = btrfs_cleanup_fs_roots(fs_info);
                WARN_ON(ret);
 
                /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-       struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-       struct btrfs_super_block *disk_super = root->fs_info->super_copy;
-       struct list_head *head = &root->fs_info->space_info;
+       struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+       struct btrfs_super_block *disk_super = fs_info->super_copy;
+       struct list_head *head = &fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
        u64 total_free_data = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
-       __be32 *fsid = (__be32 *)root->fs_info->fsid;
+       __be32 *fsid = (__be32 *)fs_info->fsid;
        int ret;
 
        /* holding chunk_mutex to avoid allocating new chunks */
-       mutex_lock(&root->fs_info->chunk_mutex);
+       mutex_lock(&fs_info->chunk_mutex);
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
        buf->f_bavail = total_free_data;
-       ret = btrfs_calc_avail_data_space(root, &total_free_data);
+       ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
        if (ret) {
-               mutex_unlock(&root->fs_info->chunk_mutex);
+               mutex_unlock(&fs_info->chunk_mutex);
                return ret;
        }
        buf->f_bavail += total_free_data;
        buf->f_bavail = buf->f_bavail >> bits;
-       mutex_unlock(&root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
 
        /* We treat it as constant endianness (it doesn't matter _which_)
           because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        return 0;
 }
 
+static void btrfs_kill_super(struct super_block *sb)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       kill_anon_super(sb);
+       free_fs_info(fs_info);
+}
+
 static struct file_system_type btrfs_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "btrfs",
        .mount          = btrfs_mount,
-       .kill_sb        = kill_anon_super,
+       .kill_sb        = btrfs_kill_super,
        .fs_flags       = FS_REQUIRES_DEV,
 };
 
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 
 static int btrfs_freeze(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       mutex_lock(&root->fs_info->transaction_kthread_mutex);
-       mutex_lock(&root->fs_info->cleaner_mutex);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       mutex_lock(&fs_info->transaction_kthread_mutex);
+       mutex_lock(&fs_info->cleaner_mutex);
        return 0;
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-       struct btrfs_root *root = btrfs_sb(sb);
-       mutex_unlock(&root->fs_info->cleaner_mutex);
-       mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+       mutex_unlock(&fs_info->cleaner_mutex);
+       mutex_unlock(&fs_info->transaction_kthread_mutex);
        return 0;
 }
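
The super.c hunks above are all fallout of a single conversion: sb->s_fs_info now holds the btrfs_fs_info itself instead of the tree root, and btrfs_sb() hands that back directly. A minimal sketch of the accessor after the conversion (the real definition lives in ctree.h; its exact shape here is inferred from the call sites above):

	static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
	{
		/* s_fs_info is populated by btrfs_set_super() once
		 * set_anon_super() has succeeded, and torn down again in
		 * btrfs_kill_super() via free_fs_info() */
		return sb->s_fs_info;
	}

Callers that still need a root then go through fs_info->tree_root, as btrfs_sync_fs() and btrfs_remount() do above.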
 
index 81376d9..287a672 100644 (file)
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
                BUG_ON(!list_empty(&transaction->list));
+               WARN_ON(transaction->delayed_refs.root.rb_node);
+               WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -108,8 +110,11 @@ loop:
        cur_trans->delayed_refs.num_heads = 0;
        cur_trans->delayed_refs.flushing = 0;
        cur_trans->delayed_refs.run_delayed_start = 0;
+       cur_trans->delayed_refs.seq = 1;
+       init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
+       INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
        }
 
        if (num_bytes) {
+               trace_btrfs_space_reservation(root->fs_info, "transaction",
+                                             (u64)h, num_bytes, 1);
                h->block_rsv = &root->fs_info->trans_block_rsv;
                h->bytes_reserved = num_bytes;
        }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       while (count < 4) {
+       while (count < 2) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
                if (cur &&
                    trans->transaction->delayed_refs.num_heads_ready > 64) {
                        trans->delayed_ref_updates = 0;
-
-                       /*
-                        * do a full flush if the transaction is trying
-                        * to close
-                        */
-                       if (trans->transaction->delayed_refs.flushing)
-                               cur = 0;
                        btrfs_run_delayed_refs(trans, root, cur);
                } else {
                        break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 
                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
-                       btrfs_drop_snapshot(root, NULL, 0);
+                       btrfs_drop_snapshot(root, NULL, 0, 0);
                else
-                       btrfs_drop_snapshot(root, NULL, 1);
+                       btrfs_drop_snapshot(root, NULL, 1, 0);
        }
        return 0;
 }
index 3568374..cb877e0 100644 (file)
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
                                ret = btrfs_inc_extent_ref(trans, root,
                                                ins.objectid, ins.offset,
                                                0, root->root_key.objectid,
-                                               key->objectid, offset);
+                                               key->objectid, offset, 0);
                                BUG_ON(ret);
                        } else {
                                /*
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644 (file)
index 0000000..12f5147
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include "ulist.h"
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ *
+ * A sample usage for ulists is the enumeration of directed graphs without
+ * visiting a node twice. The pseudo-code could look like this:
+ *
+ * ulist = ulist_alloc();
+ * ulist_add(ulist, root);
+ * elem = NULL;
+ *
+ * while ((elem = ulist_next(ulist, elem))) {
+ *     for (all child nodes n in elem)
+ *             ulist_add(ulist, n);
+ *     do something useful with the node;
+ * }
+ * ulist_free(ulist);
+ *
+ * This assumes the graph nodes are addressable by u64. This stems from the
+ * usage for tree enumeration in btrfs, where the logical addresses are
+ * 64 bit.
+ *
+ * It is also useful for tree enumeration, which could otherwise be done
+ * elegantly with recursion but is ruled out by kernel stack limitations. The
+ * loop would be similar to the above.
+ */
+
+/**
+ * ulist_init - freshly initialize a ulist
+ * @ulist:     the ulist to initialize
+ *
+ * Note: don't use this function to init an already used ulist; use
+ * ulist_reinit instead.
+ */
+void ulist_init(struct ulist *ulist)
+{
+       ulist->nnodes = 0;
+       ulist->nodes = ulist->int_nodes;
+       ulist->nodes_alloced = ULIST_SIZE;
+}
+EXPORT_SYMBOL(ulist_init);
+
+/**
+ * ulist_fini - free up additionally allocated memory for the ulist
+ * @ulist:     the ulist from which to free the additional memory
+ *
+ * This is useful in cases where the base 'struct ulist' has been statically
+ * allocated.
+ */
+void ulist_fini(struct ulist *ulist)
+{
+       /*
+        * The first ULIST_SIZE elements are stored inline in struct ulist.
+        * They only need to be freed if more elements were allocated.
+        */
+       if (ulist->nodes_alloced > ULIST_SIZE)
+               kfree(ulist->nodes);
+       ulist->nodes_alloced = 0;       /* in case ulist_fini is called twice */
+}
+EXPORT_SYMBOL(ulist_fini);
+
+/**
+ * ulist_reinit - prepare a ulist for reuse
+ * @ulist:     ulist to be reused
+ *
+ * Free up all additional memory allocated for the list elements and reinit
+ * the ulist.
+ */
+void ulist_reinit(struct ulist *ulist)
+{
+       ulist_fini(ulist);
+       ulist_init(ulist);
+}
+EXPORT_SYMBOL(ulist_reinit);
+
+/**
+ * ulist_alloc - dynamically allocate a ulist
+ * @gfp_mask:  allocation flags to use for the base allocation
+ *
+ * The allocated ulist will be returned in an initialized state.
+ */
+struct ulist *ulist_alloc(unsigned long gfp_mask)
+{
+       struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
+
+       if (!ulist)
+               return NULL;
+
+       ulist_init(ulist);
+
+       return ulist;
+}
+EXPORT_SYMBOL(ulist_alloc);
+
+/**
+ * ulist_free - free dynamically allocated ulist
+ * @ulist:     ulist to free
+ *
+ * It is not necessary to call ulist_fini beforehand.
+ */
+void ulist_free(struct ulist *ulist)
+{
+       if (!ulist)
+               return;
+       ulist_fini(ulist);
+       kfree(ulist);
+}
+EXPORT_SYMBOL(ulist_free);
+
+/**
+ * ulist_add - add an element to the ulist
+ * @ulist:     ulist to add the element to
+ * @val:       value to add to ulist
+ * @aux:       auxiliary value to store along with val
+ * @gfp_mask:  flags to use for allocation
+ *
+ * Note: locking must be provided by the caller. In the case of rwlocks,
+ *       write locking is needed.
+ *
+ * Add an element to a ulist. The @val will only be added if it doesn't
+ * already exist. If it is added, the auxiliary value @aux is stored along with
+ * it. In case @val already exists in the ulist, @aux is ignored, even if
+ * it differs from the already stored value.
+ *
+ * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
+ * inserted.
+ * In case of allocation failure -ENOMEM is returned and the ulist stays
+ * unaltered.
+ */
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+             unsigned long gfp_mask)
+{
+       int i;
+
+       for (i = 0; i < ulist->nnodes; ++i) {
+               if (ulist->nodes[i].val == val)
+                       return 0;
+       }
+
+       if (ulist->nnodes >= ulist->nodes_alloced) {
+               u64 new_alloced = ulist->nodes_alloced + 128;
+               struct ulist_node *new_nodes;
+               void *old = NULL;
+
+               /*
+                * if nodes_alloced == ULIST_SIZE no memory has been allocated
+                * yet, so pass NULL to krealloc
+                */
+               if (ulist->nodes_alloced > ULIST_SIZE)
+                       old = ulist->nodes;
+
+               new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
+                                    gfp_mask);
+               if (!new_nodes)
+                       return -ENOMEM;
+
+               if (!old)
+                       memcpy(new_nodes, ulist->int_nodes,
+                              sizeof(ulist->int_nodes));
+
+               ulist->nodes = new_nodes;
+               ulist->nodes_alloced = new_alloced;
+       }
+       ulist->nodes[ulist->nnodes].val = val;
+       ulist->nodes[ulist->nnodes].aux = aux;
+       ++ulist->nnodes;
+
+       return 1;
+}
+EXPORT_SYMBOL(ulist_add);
+
+/**
+ * ulist_next - iterate ulist
+ * @ulist:     ulist to iterate
+ * @prev:      previously returned element or %NULL to start iteration
+ *
+ * Note: locking must be provided by the caller. In the case of rwlocks,
+ *       only read locking is needed.
+ *
+ * This function is used to iterate a ulist. The iteration is started with
+ * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * end is reached. No guarantee is made with respect to the order in which
+ * the elements are returned: they need not come back in order of addition,
+ * nor in ascending order.
+ * It is allowed to call ulist_add during an enumeration. Newly added items
+ * are guaranteed to show up in the running enumeration.
+ */
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+{
+       int next;
+
+       if (ulist->nnodes == 0)
+               return NULL;
+
+       if (!prev)
+               return &ulist->nodes[0];
+
+       next = (prev - ulist->nodes) + 1;
+       if (next < 0 || next >= ulist->nnodes)
+               return NULL;
+
+       return &ulist->nodes[next];
+}
+EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644 (file)
index 0000000..2e25dec
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 STRATO AG
+ * written by Arne Jansen <sensille@gmx.net>
+ * Distributed under the GNU GPL license version 2.
+ *
+ */
+
+#ifndef __ULIST__
+#define __ULIST__
+
+/*
+ * ulist is a generic data structure to hold a collection of unique u64
+ * values. The only operations it supports are adding to the list and
+ * enumerating it.
+ * It is possible to store an auxiliary value along with the key.
+ *
+ * The implementation is preliminary and can probably be sped up
+ * significantly. A first step would be to store the values in an rbtree
+ * as soon as ULIST_SIZE is exceeded.
+ */
+
+/*
+ * number of elements statically allocated inside struct ulist
+ */
+#define ULIST_SIZE 16
+
+/*
+ * element of the list
+ */
+struct ulist_node {
+       u64 val;                /* value to store */
+       unsigned long aux;      /* auxiliary value saved along with the val */
+};
+
+struct ulist {
+       /*
+        * number of elements stored in list
+        */
+       unsigned long nnodes;
+
+       /*
+        * number of nodes we already have room for
+        */
+       unsigned long nodes_alloced;
+
+       /*
+        * pointer to the array storing the elements. The first ULIST_SIZE
+        * elements are stored inline. In that case it points to int_nodes.
+        * After exceeding ULIST_SIZE, dynamic memory is allocated.
+        */
+       struct ulist_node *nodes;
+
+       /*
+        * inline storage space for the first ULIST_SIZE entries
+        */
+       struct ulist_node int_nodes[ULIST_SIZE];
+};
+
+void ulist_init(struct ulist *ulist);
+void ulist_fini(struct ulist *ulist);
+void ulist_reinit(struct ulist *ulist);
+struct ulist *ulist_alloc(unsigned long gfp_mask);
+void ulist_free(struct ulist *ulist);
+int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
+             unsigned long gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+
+#endif
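
Taken together with the implementation above, these declarations make the intended pattern easy to sketch. The following is a hedged usage example of the graph walk described in the ulist.c header comment; node_child_count() and node_child() are hypothetical caller-side helpers, not part of the ulist API:

	#include "ulist.h"

	/* Walk a graph of u64-addressable nodes, visiting each node once.
	 * node_child_count()/node_child() are hypothetical helpers that
	 * enumerate the children of a node. */
	static int walk_graph(u64 root_node)
	{
		struct ulist *seen;
		struct ulist_node *elem = NULL;
		unsigned long i;
		int ret;

		seen = ulist_alloc(GFP_NOFS);
		if (!seen)
			return -ENOMEM;

		ret = ulist_add(seen, root_node, 0, GFP_NOFS);
		if (ret < 0)
			goto out;

		/* items added during the walk show up later in the same walk */
		while ((elem = ulist_next(seen, elem)) != NULL) {
			for (i = 0; i < node_child_count(elem->val); i++) {
				ret = ulist_add(seen, node_child(elem->val, i),
						0, GFP_NOFS);
				if (ret < 0)
					goto out;
			}
		}
		ret = 0;
	out:
		ulist_free(seen);
		return ret;
	}

Note that ulist_add() returning 1 (newly inserted) versus 0 (already present) is exactly what makes the visited-set bookkeeping free here: duplicates are silently skipped.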
index f4b839f..0b4e2af 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -32,6 +33,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "async-thread.h"
+#include "check-integrity.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
                        sync_pending = 0;
                }
 
-               submit_bio(cur->bi_rw, cur);
+               btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
        u64 devid;
        u64 transid;
 
-       mutex_lock(&uuid_mutex);
-
        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);
 
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                goto error;
        }
 
+       mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
        brelse(bh);
 error_close:
+       mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
 error:
-       mutex_unlock(&uuid_mutex);
        return ret;
 }
 
@@ -829,7 +830,6 @@ out:
 
 /*
  * find_free_dev_extent - find free space in the specified device
- * @trans:     transaction handler
  * @device:    the device which we search the free space in
  * @num_bytes: the size of the free space that we need
  * @start:     store the start of the free space.
@@ -848,8 +848,7 @@ out:
  * But if we don't find suitable free space, it is used to store the size of
  * the max free space.
  */
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
 {
        struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;
 
-       ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+       ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        bool clear_super = false;
 
        mutex_lock(&uuid_mutex);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
-       mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
 error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
 /*
  * does all the dirty work required for changing file system's UUID.
  */
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
 {
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        }
 
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
-       mutex_lock(&root->fs_info->volume_mutex);
 
        devices = &root->fs_info->fs_devices->devices;
        /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
        if (seeding_dev) {
                sb->s_flags &= ~MS_RDONLY;
-               ret = btrfs_prepare_sprout(trans, root);
+               ret = btrfs_prepare_sprout(root);
                BUG_ON(ret);
        }
 
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
-out:
-       mutex_unlock(&root->fs_info->volume_mutex);
+
        return ret;
 error:
        blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
-       goto out;
+       return ret;
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
        return ret;
 }
 
+static int insert_balance_item(struct btrfs_root *root,
+                              struct btrfs_balance_control *bctl)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     sizeof(*item));
+       if (ret)
+               goto out;
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+       btrfs_set_balance_data(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+       btrfs_set_balance_meta(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+       btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+       btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+       btrfs_mark_buffer_dirty(leaf);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+static int del_balance_item(struct btrfs_root *root)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret, err;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       if (ret > 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       ret = btrfs_del_item(trans, root, path);
+out:
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+       /*
+        * Turn on soft mode for chunk types that were being converted.
+        */
+       if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+       if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+       /*
+        * Turn on the usage filter if it is not already in use.  The idea is
+        * that chunks that we have already balanced should be
+        * reasonably full.  Don't do it for chunks that are being
+        * converted - that will keep us from relocating unconverted
+        * (albeit full) chunks.
+        */
+       if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->data.usage = 90;
+       }
+       if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->sys.usage = 90;
+       }
+       if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+               bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+               bctl->meta.usage = 90;
+       }
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+       BUG_ON(fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+       BUG_ON(!fs_info->balance_ctl);
+
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = NULL;
+       spin_unlock(&fs_info->balance_lock);
+
+       kfree(bctl);
+}
+
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_profile,
+                                struct btrfs_balance_args *bargs)
+{
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->profiles & chunk_profile)
+               return 0;
+
+       return 1;
+}
+
+static u64 div_factor_fine(u64 num, int factor)
+{
+       if (factor <= 0)
+               return 0;
+       if (factor >= 100)
+               return num;
+
+       num *= factor;
+       do_div(num, 100);
+       return num;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_block_group_cache *cache;
+       u64 chunk_used, user_thresh;
+       int ret = 1;
+
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       chunk_used = btrfs_block_group_used(&cache->item);
+
+       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (chunk_used < user_thresh)
+               ret = 0;
+
+       btrfs_put_block_group(cache);
+       return ret;
+}
+
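As a worked example of the usage filter: for a 1 GiB chunk (cache->key.offset == 1073741824) and bargs->usage == 90, div_factor_fine() computes 1073741824 * 90 / 100 = 966367641 bytes after do_div truncation. chunk_usage_filter() returns 0 (balance the chunk) only while chunk_used is below that threshold, so chunks that are at least 90% full are filtered out, which is precisely what update_balance_args() above relies on when it arms the 90% usage filter for resumed balances.
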
+static int chunk_devid_filter(struct extent_buffer *leaf,
+                             struct btrfs_chunk *chunk,
+                             struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       int i;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       struct btrfs_stripe *stripe;
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       u64 stripe_offset;
+       u64 stripe_length;
+       int factor;
+       int i;
+
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+               return 0;
+
+       if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+            BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
+               factor = 2;
+       else
+               factor = 1;
+       factor = num_stripes / factor;
+
+       for (i = 0; i < num_stripes; i++) {
+               stripe = btrfs_stripe_nr(chunk, i);
+               if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+                       continue;
+
+               stripe_offset = btrfs_stripe_offset(leaf, stripe);
+               stripe_length = btrfs_chunk_length(leaf, chunk);
+               do_div(stripe_length, factor);
+
+               if (stripe_offset < bargs->pend &&
+                   stripe_offset + stripe_length > bargs->pstart)
+                       return 0;
+       }
+
+       return 1;
+}
+
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+{
+       if (chunk_offset < bargs->vend &&
+           chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+               /* at least part of the chunk is inside this vrange */
+               return 0;
+
+       return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_profile,
+                                    struct btrfs_balance_args *bargs)
+{
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+               return 0;
+
+       chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       if (chunk_profile == 0)
+               chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (bargs->target & chunk_profile)
+               return 1;
+
+       return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+                               struct extent_buffer *leaf,
+                               struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+       struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+       struct btrfs_balance_args *bargs = NULL;
+       u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+       /* type filter */
+       if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+             (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+               return 0;
+       }
+
+       if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+               bargs = &bctl->data;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+               bargs = &bctl->sys;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+               bargs = &bctl->meta;
+
+       /* profiles filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+           chunk_profiles_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       /* usage filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+           chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+           chunk_devid_filter(leaf, chunk, bargs)) {
+               return 0;
+       }
+
+       /* drange filter, makes sense only with devid filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+           chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* vrange filter */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+           chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+               return 0;
+       }
+
+       /* soft profile changing mode */
+       if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+           chunk_soft_convert_filter(chunk_type, bargs)) {
+               return 0;
+       }
+
+       return 1;
+}
+
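should_balance_chunk() is pure policy: the type filter first routes the chunk to the matching btrfs_balance_args, then each armed filter gets a veto. A hedged sketch of how a caller would arm two of these filters (the field and flag names are the ones consumed above; the surrounding ioctl plumbing is assumed, not shown):

	/* Balance only data chunks that are under 50% full and that have a
	 * stripe on device 2 -- arming the usage and devid filters checked
	 * by should_balance_chunk(). */
	static struct btrfs_balance_control *
	make_example_bctl(struct btrfs_fs_info *fs_info)
	{
		struct btrfs_balance_control *bctl;

		bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
		if (!bctl)
			return NULL;

		bctl->fs_info = fs_info;
		bctl->flags = BTRFS_BALANCE_DATA;	/* type filter */
		bctl->data.flags = BTRFS_BALANCE_ARGS_USAGE |
				   BTRFS_BALANCE_ARGS_DEVID;
		bctl->data.usage = 50;	/* chunk_usage_filter() threshold */
		bctl->data.devid = 2;	/* chunk_devid_filter() target */
		return bctl;
	}
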
 static u64 div_factor(u64 num, int factor)
 {
        if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
        return num;
 }
 
-int btrfs_balance(struct btrfs_root *dev_root)
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 {
-       int ret;
-       struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct list_head *devices;
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
+       struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
-
-       if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       mutex_lock(&dev_root->fs_info->volume_mutex);
-       dev_root = dev_root->fs_info->dev_root;
+       struct btrfs_trans_handle *trans;
+       struct extent_buffer *leaf;
+       int slot;
+       int ret;
+       int enospc_errors = 0;
+       bool counting = true;
 
        /* step one make some room on all the devices */
+       devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                old_size = device->total_bytes;
                size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
                ret = -ENOMEM;
                goto error;
        }
+
+       /* zero out stat counters */
+       spin_lock(&fs_info->balance_lock);
+       memset(&bctl->stat, 0, sizeof(bctl->stat));
+       spin_unlock(&fs_info->balance_lock);
+again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
 
        while (1) {
+               if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+                   atomic_read(&fs_info->balance_cancel_req)) {
+                       ret = -ECANCELED;
+                       goto error;
+               }
+
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0)
                        goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
                 * failed
                 */
                if (ret == 0)
-                       break;
+                       BUG(); /* FIXME break ? */
 
                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
-               if (ret)
+               if (ret) {
+                       ret = 0;
                        break;
+               }
+
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
                if (found_key.objectid != key.objectid)
                        break;
 
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
                if (found_key.offset == 0)
                        break;
 
+               chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+               if (!counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.considered++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+
+               ret = should_balance_chunk(chunk_root, leaf, chunk,
+                                          found_key.offset);
                btrfs_release_path(path);
+               if (!ret)
+                       goto loop;
+
+               if (counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.expected++;
+                       spin_unlock(&fs_info->balance_lock);
+                       goto loop;
+               }
+
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
                if (ret && ret != -ENOSPC)
                        goto error;
+               if (ret == -ENOSPC) {
+                       enospc_errors++;
+               } else {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.completed++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+loop:
                key.offset = found_key.offset - 1;
        }
-       ret = 0;
+
+       if (counting) {
+               btrfs_release_path(path);
+               counting = false;
+               goto again;
+       }
 error:
        btrfs_free_path(path);
-       mutex_unlock(&dev_root->fs_info->volume_mutex);
+       if (enospc_errors) {
+               printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+                      enospc_errors);
+               if (!ret)
+                       ret = -ENOSPC;
+       }
+
        return ret;
 }
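
The restructured __btrfs_balance() above is a two-pass walk over the chunk tree, newest chunk first. The first pass runs with counting == true and only tallies bctl->stat.expected for every chunk that passes should_balance_chunk(); the goto again then restarts the walk with counting == false, and only this second pass calls btrfs_relocate_chunk(). Between chunks, pause requests are honored only in the second pass and cancel requests in both, returning -ECANCELED so the pause/cancel paths below can take effect promptly; ENOSPC from individual relocations is accumulated in enospc_errors rather than aborting the walk.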
 
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+       /* cancel requested || normal exit path */
+       return atomic_read(&fs_info->balance_cancel_req) ||
+               (atomic_read(&fs_info->balance_pause_req) == 0 &&
+                atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       int ret;
+
+       unset_balance_control(fs_info);
+       ret = del_balance_item(fs_info->tree_root);
+       BUG_ON(ret);
+}
+
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs);
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs)
+{
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       u64 allowed;
+       int ret;
+
+       if (btrfs_fs_closing(fs_info) ||
+           atomic_read(&fs_info->balance_pause_req) ||
+           atomic_read(&fs_info->balance_cancel_req)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * In the case of mixed block groups, both data and metadata must be
+        * picked, and identical options must be given for both of them.
+        */
+       allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+       if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+           (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+               if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+                   !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+                   memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+                       printk(KERN_ERR "btrfs: with mixed groups data and "
+                              "metadata balance options must be the same\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * Profile changing sanity checks.  Skip them if a simple
+        * balance is requested.
+        */
+       if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+             BTRFS_BALANCE_ARGS_CONVERT))
+               goto do_balance;
+
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       if (fs_info->fs_devices->num_devices == 1)
+               allowed |= BTRFS_BLOCK_GROUP_DUP;
+       else if (fs_info->fs_devices->num_devices < 4)
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+       else
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10);
+
+       if (!profile_is_valid(bctl->data.target, 1) ||
+           bctl->data.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "data profile %llu\n",
+                      (unsigned long long)bctl->data.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!profile_is_valid(bctl->meta.target, 1) ||
+           bctl->meta.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "metadata profile %llu\n",
+                      (unsigned long long)bctl->meta.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if (!profile_is_valid(bctl->sys.target, 1) ||
+           bctl->sys.target & ~allowed) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "system profile %llu\n",
+                      (unsigned long long)bctl->sys.target);
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
+               printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* allow to reduce meta or sys integrity only if force set */
+       allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                       BTRFS_BLOCK_GROUP_RAID10;
+       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_system_alloc_bits & allowed) &&
+            !(bctl->sys.target & allowed)) ||
+           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_metadata_alloc_bits & allowed) &&
+            !(bctl->meta.target & allowed))) {
+               if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                       printk(KERN_INFO "btrfs: force reducing metadata "
+                              "integrity\n");
+               } else {
+                       printk(KERN_ERR "btrfs: balance will reduce metadata "
+                              "integrity, use force if you want this\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+
+do_balance:
+       ret = insert_balance_item(fs_info->tree_root, bctl);
+       if (ret && ret != -EEXIST)
+               goto out;
+
+       if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+               BUG_ON(ret == -EEXIST);
+               set_balance_control(bctl);
+       } else {
+               BUG_ON(ret != -EEXIST);
+               spin_lock(&fs_info->balance_lock);
+               update_balance_args(bctl);
+               spin_unlock(&fs_info->balance_lock);
+       }
+
+       atomic_inc(&fs_info->balance_running);
+       mutex_unlock(&fs_info->balance_mutex);
+
+       ret = __btrfs_balance(fs_info);
+
+       mutex_lock(&fs_info->balance_mutex);
+       atomic_dec(&fs_info->balance_running);
+
+       if (bargs) {
+               memset(bargs, 0, sizeof(*bargs));
+               update_ioctl_balance_args(fs_info, 0, bargs);
+       }
+
+       if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+           balance_need_close(fs_info)) {
+               __cancel_balance(fs_info);
+       }
+
+       wake_up(&fs_info->balance_wait_q);
+
+       return ret;
+out:
+       if (bctl->flags & BTRFS_BALANCE_RESUME)
+               __cancel_balance(fs_info);
+       else
+               kfree(bctl);
+       return ret;
+}
+
+static int balance_kthread(void *data)
+{
+       struct btrfs_balance_control *bctl =
+                       (struct btrfs_balance_control *)data;
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       int ret = 0;
+
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+
+       set_balance_control(bctl);
+
+       if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+               printk(KERN_INFO "btrfs: force skipping balance\n");
+       } else {
+               printk(KERN_INFO "btrfs: continuing balance\n");
+               ret = btrfs_balance(bctl, NULL);
+       }
+
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+}
+
+int btrfs_recover_balance(struct btrfs_root *tree_root)
+{
+       struct task_struct *tsk;
+       struct btrfs_balance_control *bctl;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+
+       ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out_bctl;
+       if (ret > 0) { /* ret = -ENOENT; */
+               ret = 0;
+               goto out_bctl;
+       }
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+       bctl->fs_info = tree_root->fs_info;
+       bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+
+       btrfs_balance_data(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+       btrfs_balance_meta(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+       btrfs_balance_sys(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+       tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+       if (IS_ERR(tsk))
+               ret = PTR_ERR(tsk);
+       else
+               goto out;
+
+out_bctl:
+       kfree(bctl);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+       int ret = 0;
+
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+
+       if (atomic_read(&fs_info->balance_running)) {
+               atomic_inc(&fs_info->balance_pause_req);
+               mutex_unlock(&fs_info->balance_mutex);
+
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+
+               mutex_lock(&fs_info->balance_mutex);
+               /* we are good with balance_ctl ripped off from under us */
+               BUG_ON(atomic_read(&fs_info->balance_running));
+               atomic_dec(&fs_info->balance_pause_req);
+       } else {
+               ret = -ENOTCONN;
+       }
+
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+
+       atomic_inc(&fs_info->balance_cancel_req);
+       /*
+        * if balance is running, just wait and return; the balance item
+        * is deleted in btrfs_balance in that case
+        */
+       if (atomic_read(&fs_info->balance_running)) {
+               mutex_unlock(&fs_info->balance_mutex);
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+               mutex_lock(&fs_info->balance_mutex);
+       } else {
+               /* __cancel_balance needs volume_mutex */
+               mutex_unlock(&fs_info->balance_mutex);
+               mutex_lock(&fs_info->volume_mutex);
+               mutex_lock(&fs_info->balance_mutex);
+
+               if (fs_info->balance_ctl)
+                       __cancel_balance(fs_info);
+
+               mutex_unlock(&fs_info->volume_mutex);
+       }
+
+       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_cancel_req);
+       mutex_unlock(&fs_info->balance_mutex);
+       return 0;
+}
+
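The pause/cancel pair above implements a small handshake around balance_mutex: balance_running counts the balancer itself (incremented in btrfs_balance() before the mutex is dropped for the long-running __btrfs_balance()), while balance_pause_req and balance_cancel_req are request counters that the balance loop polls between chunks. A waiter bumps the relevant request counter, drops balance_mutex, and sleeps on balance_wait_q until balance_running returns to zero. Pausing leaves balance_ctl and the on-disk balance item in place for a later resume via btrfs_recover_balance(), whereas cancelling tears both down through __cancel_balance(), taking volume_mutex first if the balance was not actually running.
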
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
        return ret;
 }
 
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
                           struct btrfs_key *key,
                           struct btrfs_chunk *chunk, int item_size)
 {
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                max_stripe_size = 1024 * 1024 * 1024;
                max_chunk_size = 10 * max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-               max_stripe_size = 256 * 1024 * 1024;
+               /* for larger filesystems, use larger metadata chunks */
+               if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+                       max_stripe_size = 1024 * 1024 * 1024;
+               else
+                       max_stripe_size = 256 * 1024 * 1024;
                max_chunk_size = max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-               max_stripe_size = 8 * 1024 * 1024;
+               max_stripe_size = 32 * 1024 * 1024;
                max_chunk_size = 2 * max_stripe_size;
        } else {
                printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                if (total_avail == 0)
                        continue;
 
-               ret = find_free_dev_extent(trans, device,
+               ret = find_free_dev_extent(device,
                                           max_stripe_size * dev_stripes,
                                           &dev_offset, &max_avail);
                if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
 
        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-               ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+               ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
                                             item_size);
                BUG_ON(ret);
        }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
                return ret;
 
        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                       (fs_info->metadata_alloc_profile &
-                        fs_info->avail_metadata_alloc_bits);
+                               fs_info->avail_metadata_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
        sys_chunk_offset = chunk_offset + chunk_size;
 
        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                       (fs_info->system_alloc_profile &
-                        fs_info->avail_system_alloc_bits);
+                               fs_info->avail_system_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
 
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
        u64 stripe_nr;
        u64 stripe_nr_orig;
        u64 stripe_nr_end;
-       int stripes_allocated = 8;
-       int stripes_required = 1;
        int stripe_index;
        int i;
+       int ret = 0;
        int num_stripes;
        int max_errors = 0;
        struct btrfs_bio *bbio = NULL;
 
-       if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
-               stripes_allocated = 1;
-again:
-       if (bbio_ret) {
-               bbio = kzalloc(btrfs_bio_size(stripes_allocated),
-                               GFP_NOFS);
-               if (!bbio)
-                       return -ENOMEM;
-
-               atomic_set(&bbio->error, 0);
-       }
-
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, *length);
        read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
        if (mirror_num > map->num_stripes)
                mirror_num = 0;
 
-       /* if our btrfs_bio struct is too small, back off and try again */
-       if (rw & REQ_WRITE) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP)) {
-                       stripes_required = map->num_stripes;
-                       max_errors = 1;
-               } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                       stripes_required = map->sub_stripes;
-                       max_errors = 1;
-               }
-       }
-       if (rw & REQ_DISCARD) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                                BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP |
-                                BTRFS_BLOCK_GROUP_RAID10)) {
-                       stripes_required = map->num_stripes;
-               }
-       }
-       if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
-           stripes_allocated < stripes_required) {
-               stripes_allocated = map->num_stripes;
-               free_extent_map(em);
-               kfree(bbio);
-               goto again;
-       }
        stripe_nr = offset;
        /*
         * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
 
        if (rw & REQ_DISCARD)
                *length = min_t(u64, em->len - offset, *length);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                             BTRFS_BLOCK_GROUP_RAID1 |
-                             BTRFS_BLOCK_GROUP_RAID10 |
-                             BTRFS_BLOCK_GROUP_DUP)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
                                map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
        }
        BUG_ON(stripe_index >= map->num_stripes);
 
+       bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+       if (!bbio) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       atomic_set(&bbio->error, 0);
+
        if (rw & REQ_DISCARD) {
+               int factor = 0;
+               int sub_stripes = 0;
+               u64 stripes_per_dev = 0;
+               u32 remaining_stripes = 0;
+
+               if (map->type &
+                   (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+                       if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                               sub_stripes = 1;
+                       else
+                               sub_stripes = map->sub_stripes;
+
+                       factor = map->num_stripes / sub_stripes;
+                       stripes_per_dev = div_u64_rem(stripe_nr_end -
+                                                     stripe_nr_orig,
+                                                     factor,
+                                                     &remaining_stripes);
+               }
+
                for (i = 0; i < num_stripes; i++) {
                        bbio->stripes[i].physical =
                                map->stripes[stripe_index].physical +
                                stripe_offset + stripe_nr * map->stripe_len;
                        bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-                       if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-                               u64 stripes;
-                               u32 last_stripe = 0;
-                               int j;
-
-                               div_u64_rem(stripe_nr_end - 1,
-                                           map->num_stripes,
-                                           &last_stripe);
-
-                               for (j = 0; j < map->num_stripes; j++) {
-                                       u32 test;
-
-                                       div_u64_rem(stripe_nr_end - 1 - j,
-                                                   map->num_stripes, &test);
-                                       if (test == stripe_index)
-                                               break;
-                               }
-                               stripes = stripe_nr_end - 1 - j;
-                               do_div(stripes, map->num_stripes);
-                               bbio->stripes[i].length = map->stripe_len *
-                                       (stripes - stripe_nr + 1);
-
-                               if (i == 0) {
-                                       bbio->stripes[i].length -=
-                                               stripe_offset;
-                                       stripe_offset = 0;
-                               }
-                               if (stripe_index == last_stripe)
-                                       bbio->stripes[i].length -=
-                                               stripe_end_offset;
-                       } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                               u64 stripes;
-                               int j;
-                               int factor = map->num_stripes /
-                                            map->sub_stripes;
-                               u32 last_stripe = 0;
-
-                               div_u64_rem(stripe_nr_end - 1,
-                                           factor, &last_stripe);
-                               last_stripe *= map->sub_stripes;
-
-                               for (j = 0; j < factor; j++) {
-                                       u32 test;
-
-                                       div_u64_rem(stripe_nr_end - 1 - j,
-                                                   factor, &test);
-
-                                       if (test ==
-                                           stripe_index / map->sub_stripes)
-                                               break;
-                               }
-                               stripes = stripe_nr_end - 1 - j;
-                               do_div(stripes, factor);
-                               bbio->stripes[i].length = map->stripe_len *
-                                       (stripes - stripe_nr + 1);
-
-                               if (i < map->sub_stripes) {
+                       if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                                        BTRFS_BLOCK_GROUP_RAID10)) {
+                               bbio->stripes[i].length = stripes_per_dev *
+                                                         map->stripe_len;
+                               if (i / sub_stripes < remaining_stripes)
+                                       bbio->stripes[i].length +=
+                                               map->stripe_len;
+                               if (i < sub_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_offset;
-                                       if (i == map->sub_stripes - 1)
-                                               stripe_offset = 0;
-                               }
-                               if (stripe_index >= last_stripe &&
-                                   stripe_index <= (last_stripe +
-                                                    map->sub_stripes - 1)) {
+                               if ((i / sub_stripes + 1) %
+                                   sub_stripes == remaining_stripes)
                                        bbio->stripes[i].length -=
                                                stripe_end_offset;
-                               }
+                               if (i == sub_stripes - 1)
+                                       stripe_offset = 0;
                        } else
                                bbio->stripes[i].length = *length;
 
@@ -3155,15 +3806,22 @@ again:
                        stripe_index++;
                }
        }
-       if (bbio_ret) {
-               *bbio_ret = bbio;
-               bbio->num_stripes = num_stripes;
-               bbio->max_errors = max_errors;
-               bbio->mirror_num = mirror_num;
+
+       if (rw & REQ_WRITE) {
+               if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                                BTRFS_BLOCK_GROUP_RAID10 |
+                                BTRFS_BLOCK_GROUP_DUP)) {
+                       max_errors = 1;
+               }
        }
+
+       *bbio_ret = bbio;
+       bbio->num_stripes = num_stripes;
+       bbio->max_errors = max_errors;
+       bbio->mirror_num = mirror_num;
 out:
        free_extent_map(em);
-       return 0;
+       return ret;
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
        /* don't bother with additional async steps for reads, right now */
        if (!(rw & REQ_WRITE)) {
                bio_get(bio);
-               submit_bio(rw, bio);
+               btrfsic_submit_bio(rw, bio);
                bio_put(bio);
                return 0;
        }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                        if (async_submit)
                                schedule_bio(root, dev, rw, bio);
                        else
-                               submit_bio(rw, bio);
+                               btrfsic_submit_bio(rw, bio);
                } else {
                        bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
                        bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
        struct btrfs_fs_devices *fs_devices;
        int ret;
 
-       mutex_lock(&uuid_mutex);
+       BUG_ON(!mutex_is_locked(&uuid_mutex));
 
        fs_devices = root->fs_info->fs_devices->seed;
        while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
        fs_devices->seed = root->fs_info->fs_devices->seed;
        root->fs_info->fs_devices->seed = fs_devices;
 out:
-       mutex_unlock(&uuid_mutex);
        return ret;
 }
 
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
        if (!path)
                return -ENOMEM;
 
+       mutex_lock(&uuid_mutex);
+       lock_chunks(root);
+
        /* first we search for all of the device items, and then we
         * read in all of the chunk items.  This way we can create chunk
         * mappings that reference all of the devices that are found
@@ -3799,6 +4459,9 @@ again:
        }
        ret = 0;
 error:
+       unlock_chunks(root);
+       mutex_unlock(&uuid_mutex);
+
        btrfs_free_path(path);
        return ret;
 }
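With the locking hoisted into btrfs_read_chunk_tree() above, open_seed_devices() now merely asserts that uuid_mutex is held rather than taking it per call. A sketch of that caller-holds-the-lock convention, with a plain boolean standing in for the kernel's mutex_is_locked():

	#include <assert.h>
	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t uuid_mutex = PTHREAD_MUTEX_INITIALIZER;
	static bool uuid_mutex_held;	/* stand-in for mutex_is_locked() */

	/* The helper no longer locks; it only insists the caller already
	 * did, so one lock/unlock pair covers the whole chunk-tree scan. */
	static void open_seed_devices_like(void)
	{
		assert(uuid_mutex_held);
		/* ... walk the seed-device list ... */
	}

	static void read_chunk_tree_like(void)
	{
		pthread_mutex_lock(&uuid_mutex);
		uuid_mutex_held = true;

		open_seed_devices_like();	/* possibly called many times */

		uuid_mutex_held = false;
		pthread_mutex_unlock(&uuid_mutex);
	}

	int main(void)
	{
		read_chunk_tree_like();
		return 0;
	}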
index 78f2d4d..19ac950 100644 (file)
@@ -186,6 +186,51 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                            (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA             (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM           (1ULL << 1)
+#define BTRFS_BALANCE_METADATA         (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK                (BTRFS_BALANCE_DATA |       \
+                                        BTRFS_BALANCE_SYSTEM |     \
+                                        BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE            (1ULL << 3)
+#define BTRFS_BALANCE_RESUME           (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES    (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE       (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID       (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE      (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE      (1ULL << 4)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate a chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT     (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT                (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+       struct btrfs_fs_info *fs_info;
+
+       struct btrfs_balance_args data;
+       struct btrfs_balance_args meta;
+       struct btrfs_balance_args sys;
+
+       u64 flags;
+
+       struct btrfs_balance_progress stat;
+};
+
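Read together, these bits describe a balance request: BTRFS_BALANCE_* selects which chunk classes to operate on, the ARGS bits record which per-class filters are in use, and CONVERT plus SOFT drive profile changes. As a hedged sketch, a soft conversion of data chunks to RAID1 could be assembled like this; the btrfs_balance_args field names and the RAID1 profile bit are assumptions for illustration, not taken from this hunk:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Minimal stand-ins; the real definitions live in the btrfs
	 * headers and in the hunk above.  Field names and the RAID1 bit
	 * are assumed here. */
	#define BTRFS_BALANCE_DATA		(1ULL << 0)
	#define BTRFS_BALANCE_ARGS_CONVERT	(1ULL << 8)
	#define BTRFS_BALANCE_ARGS_SOFT		(1ULL << 9)
	#define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)	/* assumed */

	struct balance_args_sketch {
		uint64_t profiles, usage, devid;
		uint64_t pstart, pend, vstart, vend;
		uint64_t target, flags;
	};

	struct balance_control_sketch {
		struct balance_args_sketch data, meta, sys;
		uint64_t flags;
	};

	int main(void)
	{
		struct balance_control_sketch bctl;

		/* Soft-convert data chunks to RAID1: chunks that already
		 * carry the target profile are skipped, not rewritten. */
		memset(&bctl, 0, sizeof(bctl));
		bctl.flags = BTRFS_BALANCE_DATA;
		bctl.data.target = BTRFS_BLOCK_GROUP_RAID1;
		bctl.data.flags = BTRFS_BALANCE_ARGS_CONVERT |
				  BTRFS_BALANCE_ARGS_SOFT;

		printf("flags 0x%llx, data.flags 0x%llx\n",
		       (unsigned long long)bctl.flags,
		       (unsigned long long)bctl.data.flags);
		return 0;
	}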
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length);
 
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                        struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
 #endif
index 3848b04..e7a5659 100644 (file)
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
 out:
-       btrfs_end_transaction_throttle(trans, root);
+       btrfs_end_transaction(trans, root);
        return ret;
 }
 
index c283a1e..208c6aa 100644 (file)
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
 
 static char *getname_flags(const char __user *filename, int flags, int *empty)
 {
-       char *tmp, *result;
-
-       result = ERR_PTR(-ENOMEM);
-       tmp = __getname();
-       if (tmp)  {
-               int retval = do_getname(filename, tmp);
-
-               result = tmp;
-               if (retval < 0) {
-                       if (retval == -ENOENT && empty)
-                               *empty = 1;
-                       if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-                               __putname(tmp);
-                               result = ERR_PTR(retval);
-                       }
+       char *result = __getname();
+       int retval;
+
+       if (!result)
+               return ERR_PTR(-ENOMEM);
+
+       retval = do_getname(filename, result);
+       if (retval < 0) {
+               if (retval == -ENOENT && empty)
+                       *empty = 1;
+               if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+                       __putname(result);
+                       return ERR_PTR(retval);
                }
        }
        audit_getname(result);
index 5485a53..9cde9ed 100644 (file)
@@ -198,65 +198,7 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
        return result;
 }
 
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
-       struct mm_struct *mm;
-
-       mm = get_task_mm(task);
-       if (!mm)
-               return ERR_PTR(-EINVAL);
-
-       /*
-        * A task can always look at itself, in case it chooses
-        * to use system calls instead of load instructions.
-        */
-       if (task == current)
-               return mm;
-
-       /*
-        * If current is actively ptrace'ing, and would also be
-        * permitted to freshly attach with ptrace now, permit it.
-        */
-       if (task_is_stopped_or_traced(task)) {
-               int match;
-               rcu_read_lock();
-               match = (ptrace_parent(task) == current);
-               rcu_read_unlock();
-               if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-                       return mm;
-       }
-
-       /*
-        * No one else is allowed.
-        */
-       mmput(mm);
-       return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
-       struct mm_struct *mm;
-       int err;
-
-       /*
-        * Avoid racing if task exec's as we might get a new mm but validate
-        * against old credentials.
-        */
-       err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-       if (err)
-               return ERR_PTR(err);
-
-       mm = __check_mem_permission(task);
-       mutex_unlock(&task->signal->cred_guard_mutex);
-
-       return mm;
-}
-
-struct mm_struct *mm_for_maps(struct task_struct *task)
+static struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
 {
        struct mm_struct *mm;
        int err;
@@ -267,7 +209,7 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
 
        mm = get_task_mm(task);
        if (mm && mm != current->mm &&
-                       !ptrace_may_access(task, PTRACE_MODE_READ)) {
+                       !ptrace_may_access(task, mode)) {
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }
@@ -276,6 +218,11 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
        return mm;
 }
 
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+       return mm_access(task, PTRACE_MODE_READ);
+}
+
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
        int res = 0;
@@ -752,38 +699,39 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-       file->private_data = (void*)((long)current->self_exec_id);
+       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+       struct mm_struct *mm;
+
+       if (!task)
+               return -ESRCH;
+
+       mm = mm_access(task, PTRACE_MODE_ATTACH);
+       put_task_struct(task);
+
+       if (IS_ERR(mm))
+               return PTR_ERR(mm);
+
        /* OK to pass negative loff_t, we can catch out-of-range */
        file->f_mode |= FMODE_UNSIGNED_OFFSET;
+       file->private_data = mm;
+
        return 0;
 }
 
 static ssize_t mem_read(struct file * file, char __user * buf,
                        size_t count, loff_t *ppos)
 {
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+       int ret;
        char *page;
        unsigned long src = *ppos;
-       int ret = -ESRCH;
-       struct mm_struct *mm;
+       struct mm_struct *mm = file->private_data;
 
-       if (!task)
-               goto out_no_task;
+       if (!mm)
+               return 0;
 
-       ret = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
-               goto out;
-
-       mm = check_mem_permission(task);
-       ret = PTR_ERR(mm);
-       if (IS_ERR(mm))
-               goto out_free;
-
-       ret = -EIO;
-       if (file->private_data != (void*)((long)current->self_exec_id))
-               goto out_put;
+               return -ENOMEM;
 
        ret = 0;
  
@@ -810,13 +758,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
        }
        *ppos = src;
 
-out_put:
-       mmput(mm);
-out_free:
        free_page((unsigned long) page);
-out:
-       put_task_struct(task);
-out_no_task:
        return ret;
 }
 
@@ -825,27 +767,15 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
 {
        int copied;
        char *page;
-       struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
        unsigned long dst = *ppos;
-       struct mm_struct *mm;
+       struct mm_struct *mm = file->private_data;
 
-       copied = -ESRCH;
-       if (!task)
-               goto out_no_task;
+       if (!mm)
+               return 0;
 
-       copied = -ENOMEM;
        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page)
-               goto out_task;
-
-       mm = check_mem_permission(task);
-       copied = PTR_ERR(mm);
-       if (IS_ERR(mm))
-               goto out_free;
-
-       copied = -EIO;
-       if (file->private_data != (void *)((long)current->self_exec_id))
-               goto out_mm;
+               return -ENOMEM;
 
        copied = 0;
        while (count > 0) {
@@ -869,13 +799,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
        }
        *ppos = dst;
 
-out_mm:
-       mmput(mm);
-out_free:
        free_page((unsigned long) page);
-out_task:
-       put_task_struct(task);
-out_no_task:
        return copied;
 }
 
@@ -895,11 +819,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
        return file->f_pos;
 }
 
+static int mem_release(struct inode *inode, struct file *file)
+{
+       struct mm_struct *mm = file->private_data;
+
+       mmput(mm);
+       return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
        .llseek         = mem_lseek,
        .read           = mem_read,
        .write          = mem_write,
        .open           = mem_open,
+       .release        = mem_release,
 };
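The net effect of this refactor is a tidier lifecycle for /proc/pid/mem: permission is checked once at open time with the stricter PTRACE_MODE_ATTACH (the file permits writes), the mm reference is parked in file->private_data, and the new release hook drops it, so read and write need no task lookup or re-check. A userspace sketch of the acquire-at-open/drop-at-release shape, with a toy counter standing in for the mm reference and mmput():

	#include <stdio.h>
	#include <stdlib.h>

	struct mm_ref {
		int users;		/* stands in for mm_users/mmput() */
	};

	struct open_file {
		struct mm_ref *mm;	/* what the kernel keeps in
					 * file->private_data */
	};

	/* Permission is checked once, at open time, and the object it
	 * grants access to stays pinned for the lifetime of the file. */
	static struct open_file *mem_open_like(struct mm_ref *mm,
					       int may_attach)
	{
		struct open_file *f;

		if (!may_attach)
			return NULL;	/* kernel: return PTR_ERR(mm) */
		f = malloc(sizeof(*f));
		if (!f)
			return NULL;
		mm->users++;
		f->mm = mm;
		return f;
	}

	static void mem_release_like(struct open_file *f)
	{
		f->mm->users--;		/* kernel: mmput() */
		free(f);
	}

	int main(void)
	{
		struct mm_ref mm = { .users = 1 };
		struct open_file *f = mem_open_like(&mm, 1);

		if (f)
			mem_release_like(f);
		printf("users back to %d\n", mm.users);
		return 0;
	}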
 
 static ssize_t environ_read(struct file *file, char __user *buf,
@@ -1199,9 +1132,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
        ssize_t length;
        uid_t loginuid;
 
-       if (!capable(CAP_AUDIT_CONTROL))
-               return -EPERM;
-
        rcu_read_lock();
        if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
                rcu_read_unlock();
@@ -1230,7 +1160,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
                goto out_free_page;
 
        }
-       length = audit_set_loginuid(current, loginuid);
+       length = audit_set_loginuid(loginuid);
        if (likely(length == 0))
                length = count;
 
index 574d4ee..74b9baf 100644 (file)
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
        xfs_fsize_t             bsize;
 
        bsize = ioend->io_offset + ioend->io_size;
-       isize = MAX(ip->i_size, ip->i_new_size);
-       isize = MIN(isize, bsize);
+       isize = MIN(i_size_read(VFS_I(ip)), bsize);
        return isize > ip->i_d.di_size ? isize : 0;
 }
 
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.  The
- * current in-memory file size is i_size.  If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated.  If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
+ * Update on-disk file size now that data has been written to disk.
  *
  * This function does not block as blocking on the inode lock in IO completion
  * can lead to IO completion order dependency deadlocks. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
        struct xfs_ioend        *ioend = iocb->private;
 
        /*
+        * While the generic direct I/O code updates the inode size, it does
+        * so only after the end_io handler is called, which means our
+        * end_io handler thinks the on-disk size is outside the in-core
+        * size.  To prevent this just update it a little bit earlier here.
+        */
+       if (offset + size > i_size_read(ioend->io_inode))
+               i_size_write(ioend->io_inode, offset + size);
+
+       /*
         * blockdev_direct_IO can return an error even after the I/O
         * completion handler was called.  Thus we need to protect
         * against double-freeing.
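The comment above captures the ordering constraint: the on-disk size computed in the completion handler must never exceed the in-core size, so the in-core size is raised first. A toy sketch of an end_io callback honoring that rule:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t in_core_size;	/* stands in for i_size_read/write */

	/* End-of-I/O callback for a direct write: pull the in-core size
	 * up to cover the bytes just written, so the on-disk size update
	 * that follows never runs ahead of it. */
	static void end_io_direct_write(uint64_t offset, uint64_t size)
	{
		if (offset + size > in_core_size)
			in_core_size = offset + size;
		/* ... update the on-disk size, bounded by in_core_size ... */
	}

	int main(void)
	{
		in_core_size = 4096;
		end_io_direct_write(4096, 8192);	/* extending write */
		printf("in-core size now %llu\n",
		       (unsigned long long)in_core_size);
		return 0;
	}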
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
 
        if (to > inode->i_size) {
                /*
-                * punch out the delalloc blocks we have already allocated. We
-                * don't call xfs_setattr() to do this as we may be in the
-                * middle of a multi-iovec write and so the vfs inode->i_size
-                * will not match the xfs ip->i_size and so it will zero too
-                * much. Hence we jus truncate the page cache to zero what is
-                * necessary and punch the delalloc blocks directly.
+                * Punch out the delalloc blocks we have already allocated.
+                *
+                * Don't bother with xfs_setattr given that nothing can have
+                * made it to disk yet as the page is still locked at this
+                * point.
                 */
                struct xfs_inode        *ip = XFS_I(inode);
                xfs_fileoff_t           start_fsb;
index 1e5d97f..08b9ac6 100644 (file)
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
        if (error)
                goto out;
 
-       /*
-        * Commit the last in the sequence of transactions.
-        */
-       xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
        error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
index c1b55e5..d25eafd 100644 (file)
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
        dp = args->dp;
        mp = dp->i_mount;
        dp->i_d.di_forkoff = forkoff;
-       dp->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-       dp->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
 
        ifp = dp->i_afp;
        ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
        ASSERT(ip->i_d.di_anextents == 0);
        ASSERT(ip->i_afp == NULL);
 
-       ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
                                (args->op_flags & XFS_DA_OP_ADDNAME) ||
                                !(mp->m_flags & XFS_MOUNT_ATTR2) ||
                                dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
-               dp->i_afp->if_ext_max =
-                       XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
-               dp->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
                xfs_trans_log_inode(args->trans, dp,
                                        XFS_ILOG_CORE | XFS_ILOG_ADATA);
        }
index d0ab788..188ef2f 100644 (file)
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
 }
 
 /*
-* Update the record referred to by cur to the value given
+ * Check if the inode needs to be converted to btree format.
+ */
+static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) >
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Check if the inode should be converted to extent format.
+ */
+static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
+{
+       return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+               XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                       XFS_IFORK_MAXEXT(ip, whichfork);
+}
+
+/*
+ * Update the record referred to by cur to the value given
  * by [off, bno, len, state].
  * This either works (return 0) or gets an EFSCORRUPTED error.
  */
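Both predicates compare the fork's extent count against the maximum number of records the in-core fork can hold, replacing the cached if_ext_max field removed throughout this patch. A standalone sketch, assuming XFS_IFORK_MAXEXT() is fork-size divided by record-size (the same value the deleted asserts were recomputing):

	#include <stdbool.h>
	#include <stdio.h>

	enum fork_fmt { FMT_EXTENTS, FMT_BTREE };

	/* Assumed shape of XFS_IFORK_MAXEXT(): the most extent records
	 * the in-core fork can hold before it must become a btree. */
	static int fork_maxext(int fork_size_bytes, int rec_size_bytes)
	{
		return fork_size_bytes / rec_size_bytes;
	}

	static bool needs_btree(enum fork_fmt fmt, int nextents, int maxext)
	{
		return fmt == FMT_EXTENTS && nextents > maxext;
	}

	static bool wants_extents(enum fork_fmt fmt, int nextents, int maxext)
	{
		return fmt == FMT_BTREE && nextents <= maxext;
	}

	int main(void)
	{
		int maxext = fork_maxext(96, 16);   /* 6 records fit inline */

		printf("7 extents, extent fork -> convert to btree:  %d\n",
		       needs_btree(FMT_EXTENTS, 7, maxext));
		printf("6 extents, btree fork  -> convert to extents: %d\n",
		       wants_extents(FMT_BTREE, 6, maxext));
		return 0;
	}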
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                        bma->firstblock, bma->flist,
                                        &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                bma->firstblock, bma->flist, &bma->cur, 1,
                                &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
-               if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-                   bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) {
+
+               if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                        error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
                                        bma->firstblock, bma->flist, &bma->cur,
                                        1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
        }
 
        /* convert to a btree if necessary */
-       if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
                int     tmp_logflags;   /* partial log flag return val */
 
                ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        /*
         * Make space in the inode incore.
         */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
            ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
                uint    dfl_forkoff = xfs_default_attroffset(ip) >> 3;
 
-               if (dfl_forkoff > ip->i_d.di_forkoff) {
+               if (dfl_forkoff > ip->i_d.di_forkoff)
                        ip->i_d.di_forkoff = dfl_forkoff;
-                       ip->i_df.if_ext_max =
-                               XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
-                       ip->i_afp->if_ext_max =
-                               XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
-               }
        }
 }
 
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
        int                     error;          /* error return value */
 
        ASSERT(XFS_IFORK_Q(ip) == 0);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
 
        mp = ip->i_mount;
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
                error = XFS_ERROR(EINVAL);
                goto error1;
        }
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
-       ip->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_afp->if_flags = XFS_IFEXTENTS;
        logflags = 0;
        xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
                } else
                        spin_unlock(&mp->m_sb_lock);
        }
-       if ((error = xfs_bmap_finish(&tp, &flist, &committed)))
+
+       error = xfs_bmap_finish(&tp, &flist, &committed);
+       if (error)
                goto error2;
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
-       return error;
+       return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 error2:
        xfs_bmap_cancel(&flist);
 error1:
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
 error0:
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
        return error;
 }
 
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
        xfs_bmbt_irec_t s;              /* internal version of extent */
 
 #ifndef DEBUG
-       if (whichfork == XFS_DATA_FORK) {
-               return S_ISREG(ip->i_d.di_mode) ?
-                       (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
-                       (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
-       }
+       if (whichfork == XFS_DATA_FORK)
+               return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
 #endif /* !DEBUG */
        if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
                return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
        xfs_bmbt_get_all(ep, &s);
        rval = s.br_startoff == 0 && s.br_blockcount == 1;
        if (rval && whichfork == XFS_DATA_FORK)
-               ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize);
+               ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
        return rval;
 }
 
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
        XFS_STATS_INC(xs_blk_mapr);
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
        if (!(ifp->if_flags & XFS_IFEXTENTS)) {
                error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
                return XFS_ERROR(EIO);
 
        ifp = XFS_IFORK_PTR(ip, whichfork);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
 
        XFS_STATS_INC(xs_blk_mapw);
 
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
        /*
         * Transform from btree to extents, give it cur.
         */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-           XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+       if (xfs_bmap_wants_extents(ip, whichfork)) {
                int             tmp_logflags = 0;
 
                ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
                if (error)
                        goto error0;
        }
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
-              XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max);
+              XFS_IFORK_NEXTENTS(ip, whichfork) >
+               XFS_IFORK_MAXEXT(ip, whichfork));
        error = 0;
 error0:
        /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
 
        ASSERT(len > 0);
        ASSERT(nexts >= 0);
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        if (!(ifp->if_flags & XFS_IFEXTENTS) &&
            (error = xfs_iread_extents(tp, ip, whichfork)))
                return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
                 */
                if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
                    XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-                   XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max &&
+                   XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
+                       XFS_IFORK_MAXEXT(ip, whichfork) &&
                    del.br_startoff > got.br_startoff &&
                    del.br_startoff + del.br_blockcount <
                    got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
                }
        }
        *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
+
        /*
         * Convert to a btree if necessary.
         */
-       if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
+       if (xfs_bmap_needs_btree(ip, whichfork)) {
                ASSERT(cur == NULL);
                error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
                        &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
        /*
         * transform from btree to extents, give it cur
         */
-       else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
-                XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
+       else if (xfs_bmap_wants_extents(ip, whichfork)) {
                ASSERT(cur != NULL);
                error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
                        whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
        /*
         * transform from extents to local?
         */
-       ASSERT(ifp->if_ext_max ==
-              XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
        error = 0;
 error0:
        /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
        if (startblock == HOLESTARTBLOCK) {
                mp = ip->i_mount;
                out->bmv_block = -1;
-               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size));
+               fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
                fixlen -= out->bmv_offset;
                if (prealloced && out->bmv_offset + out->bmv_length == end) {
                        /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
                        fixlen = XFS_MAXIOFFSET(mp);
                } else {
                        prealloced = 0;
-                       fixlen = ip->i_size;
+                       fixlen = XFS_ISIZE(ip);
                }
        }
 
@@ -5551,7 +5541,7 @@ xfs_getbmap(
 
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
-               if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) {
+               if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
                        error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
                        if (error)
                                goto out_unlock_iolock;
index 654dc6f..dd974a5 100644 (file)
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
 
        /* Check temp in extent form to max in target */
        if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max)
+           XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
                return EINVAL;
 
        /* Check target in extent form to max in temp */
        if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
-           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
+           XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
+                       XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
                return EINVAL;
 
        /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
         * (a common defrag case) which will occur when the temp inode is in
         * extent format...
         */
-       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-           ((XFS_IFORK_BOFF(ip) &&
-             tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
-            XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
-               return EINVAL;
+       if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(ip) &&
+                   tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+                       return EINVAL;
+               if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
 
        /* Reciprocal target->temp btree format checks */
-       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-           ((XFS_IFORK_BOFF(tip) &&
-             ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
-            XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
-               return EINVAL;
+       if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+               if (XFS_IFORK_BOFF(tip) &&
+                   ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+                       return EINVAL;
+
+               if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
+                   XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
+                       return EINVAL;
+       }
 
        return 0;
 }
@@ -349,16 +358,6 @@ xfs_swap_extents(
        *tifp = *tempifp;       /* struct copy */
 
        /*
-        * Fix the in-memory data fork values that are dependent on the fork
-        * offset in the inode. We can't assume they remain the same as attr2
-        * has dynamic fork offsets.
-        */
-       ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
-       tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
-                                       (uint)sizeof(xfs_bmbt_rec_t);
-
-       /*
         * Fix the on-disk inode values
         */
        tmp = (__uint64_t)ip->i_d.di_nblocks;
index f675f3d..7e5bc87 100644 (file)
@@ -327,7 +327,7 @@ xfs_file_aio_read(
                                mp->m_rtdev_targp : mp->m_ddev_targp;
                if ((iocb->ki_pos & target->bt_smask) ||
                    (size & target->bt_smask)) {
-                       if (iocb->ki_pos == ip->i_size)
+                       if (iocb->ki_pos == i_size_read(inode))
                                return 0;
                        return -XFS_ERROR(EINVAL);
                }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
        return ret;
 }
 
-STATIC void
-xfs_aio_write_isize_update(
-       struct inode    *inode,
-       loff_t          *ppos,
-       ssize_t         bytes_written)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             isize = i_size_read(inode);
-
-       if (bytes_written > 0)
-               XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
-       if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
-                                       *ppos > isize))
-               *ppos = isize;
-
-       if (*ppos > ip->i_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               if (*ppos > ip->i_size)
-                       ip->i_size = *ppos;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred.  In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             new_size)
-{
-       if (new_size == ip->i_new_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               if (new_size == ip->i_new_size)
-                       ip->i_new_size = 0;
-               if (ip->i_d.di_size > ip->i_size)
-                       ip->i_d.di_size = ip->i_size;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
 /*
  * xfs_file_splice_write() does not use xfs_rw_ilock() because
  * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
 
@@ -489,19 +443,12 @@ xfs_file_splice_write(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
 
-       new_size = *ppos + count;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
        trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
 
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+       if (ret > 0)
+               XFS_STATS_ADD(xs_write_bytes, ret);
 
-       xfs_aio_write_isize_update(inode, ppos, ret);
-       xfs_aio_write_newsize_update(ip, new_size);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -689,28 +636,26 @@ out_lock:
 /*
  * Common pre-write limit and setup checks.
  *
- * Returns with iolock held according to @iolock.
+ * Called with the iolock held either shared or exclusive according to
+ * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
+ * if called for a direct write beyond i_size.
  */
 STATIC ssize_t
 xfs_file_aio_write_checks(
        struct file             *file,
        loff_t                  *pos,
        size_t                  *count,
-       xfs_fsize_t             *new_sizep,
        int                     *iolock)
 {
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
        int                     error = 0;
 
        xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-       *new_sizep = 0;
 restart:
        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
        if (error) {
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-               *iolock = 0;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }
 
@@ -720,36 +665,21 @@ restart:
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
-        * write. There is no need to issue zeroing if another in-flght IO ends
-        * at or before this one If zeronig is needed and we are currently
-        * holding the iolock shared, we need to update it to exclusive which
-        * involves dropping all locks and relocking to maintain correct locking
-        * order. If we do this, restart the function to ensure all checks and
-        * values are still valid.
+        * write.  If zeroing is needed and we are currently holding the
+        * iolock shared, we need to update it to exclusive which involves
+        * dropping all locks and relocking to maintain correct locking order.
+        * If we do this, restart the function to ensure all checks and values
+        * are still valid.
         */
-       if ((ip->i_new_size && *pos > ip->i_new_size) ||
-           (!ip->i_new_size && *pos > ip->i_size)) {
+       if (*pos > i_size_read(inode)) {
                if (*iolock == XFS_IOLOCK_SHARED) {
                        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
                        goto restart;
                }
-               error = -xfs_zero_eof(ip, *pos, ip->i_size);
+               error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
        }
-
-       /*
-        * If this IO extends beyond EOF, we may need to update ip->i_new_size.
-        * We have already zeroed space beyond EOF (if necessary).  Only update
-        * ip->i_new_size if this IO ends beyond any other in-flight writes.
-        */
-       new_size = *pos + *count;
-       if (new_size > ip->i_size) {
-               if (new_size > ip->i_new_size)
-                       ip->i_new_size = new_size;
-               *new_sizep = new_size;
-       }
-
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;
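The restart label above implements a shared-to-exclusive upgrade: rwsem-style locks cannot upgrade in place, so the code drops everything, re-takes the iolock exclusive, and replays the checks, since i_size may have moved while no lock was held. A compressed userspace sketch of the pattern:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
	static long isize = 4096;

	/* Caller holds iolock, shared iff *excl is false.  Returns with
	 * it held, possibly upgraded to exclusive. */
	static void write_checks(long pos, bool *excl)
	{
	restart:
		if (pos > isize) {
			if (!*excl) {
				pthread_rwlock_unlock(&iolock);
				pthread_rwlock_wrlock(&iolock);
				*excl = true;
				/* re-validate: isize may have changed
				 * while no lock was held */
				goto restart;
			}
			/* zero [isize, pos) -- safe under exclusive lock */
		}
	}

	int main(void)
	{
		bool excl = false;

		pthread_rwlock_rdlock(&iolock);
		write_checks(8192, &excl);
		pthread_rwlock_unlock(&iolock);
		return 0;
	}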
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
        const struct iovec      *iovp,
        unsigned long           nr_segs,
        loff_t                  pos,
-       size_t                  ocount,
-       xfs_fsize_t             *new_size,
-       int                     *iolock)
+       size_t                  ocount)
 {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
        ssize_t                 ret = 0;
        size_t                  count = ocount;
        int                     unaligned_io = 0;
+       int                     iolock;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
-       *iolock = 0;
        if ((pos & target->bt_smask) || (count & target->bt_smask))
                return -XFS_ERROR(EINVAL);
 
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
         * EOF zeroing cases and fill out the new inode size as appropriate.
         */
        if (unaligned_io || mapping->nrpages)
-               *iolock = XFS_IOLOCK_EXCL;
+               iolock = XFS_IOLOCK_EXCL;
        else
-               *iolock = XFS_IOLOCK_SHARED;
-       xfs_rw_ilock(ip, *iolock);
+               iolock = XFS_IOLOCK_SHARED;
+       xfs_rw_ilock(ip, iolock);
 
        /*
         * Recheck if there are cached pages that need invalidate after we got
         * the iolock to protect against other threads adding new pages while
         * we were waiting for the iolock.
         */
-       if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) {
-               xfs_rw_iunlock(ip, *iolock);
-               *iolock = XFS_IOLOCK_EXCL;
-               xfs_rw_ilock(ip, *iolock);
+       if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
+               xfs_rw_iunlock(ip, iolock);
+               iolock = XFS_IOLOCK_EXCL;
+               xfs_rw_ilock(ip, iolock);
        }
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
-               return ret;
+               goto out;
 
        if (mapping->nrpages) {
                ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
                                                        FI_REMAPF_LOCKED);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
        /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
         */
        if (unaligned_io)
                inode_dio_wait(inode);
-       else if (*iolock == XFS_IOLOCK_EXCL) {
+       else if (iolock == XFS_IOLOCK_EXCL) {
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               *iolock = XFS_IOLOCK_SHARED;
+               iolock = XFS_IOLOCK_SHARED;
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
        ret = generic_file_direct_write(iocb, iovp,
                        &nr_segs, pos, &iocb->ki_pos, count, ocount);
 
+out:
+       xfs_rw_iunlock(ip, iolock);
+
        /* No fallback to buffered IO on errors for XFS. */
        ASSERT(ret < 0 || ret == count);
        return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
        const struct iovec      *iovp,
        unsigned long           nr_segs,
        loff_t                  pos,
-       size_t                  ocount,
-       xfs_fsize_t             *new_size,
-       int                     *iolock)
+       size_t                  ocount)
 {
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
        int                     enospc = 0;
+       int                     iolock = XFS_IOLOCK_EXCL;
        size_t                  count = ocount;
 
-       *iolock = XFS_IOLOCK_EXCL;
-       xfs_rw_ilock(ip, *iolock);
+       xfs_rw_ilock(ip, iolock);
 
-       ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
+       ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
        if (ret)
-               return ret;
+               goto out;
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
         * page locks and retry *once*
         */
        if (ret == -ENOSPC && !enospc) {
-               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-               if (ret)
-                       return ret;
                enospc = 1;
-               goto write_retry;
+               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+               if (!ret)
+                       goto write_retry;
        }
+
        current->backing_dev_info = NULL;
+out:
+       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
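The write_retry path above encodes a retry-once policy: a buffered write that hits ENOSPC may succeed after flushing dirty pages releases delayed-allocation reservations, but the flush is attempted only once so the loop cannot livelock. A toy model:

	#include <errno.h>
	#include <stdio.h>

	static int attempts;

	/* Toy write that fails with ENOSPC until the "page cache" has
	 * been flushed once. */
	static int toy_write(void)
	{
		return attempts++ == 0 ? -ENOSPC : 42;
	}

	static int flush_dirty_pages(void)
	{
		return 0;	/* pretend we freed some reservations */
	}

	static int buffered_write(void)
	{
		int enospc = 0;
		int ret;

	write_retry:
		ret = toy_write();
		if (ret == -ENOSPC && !enospc) {
			enospc = 1;
			if (!flush_dirty_pages())
				goto write_retry;
		}
		return ret;
	}

	int main(void)
	{
		printf("wrote %d bytes\n", buffered_write());
		return 0;
	}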
 
@@ -930,9 +861,7 @@ xfs_file_aio_write(
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 ret;
-       int                     iolock;
        size_t                  ocount = 0;
-       xfs_fsize_t             new_size = 0;
 
        XFS_STATS_INC(xs_write_calls);
 
@@ -951,33 +880,22 @@ xfs_file_aio_write(
                return -EIO;
 
        if (unlikely(file->f_flags & O_DIRECT))
-               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &new_size, &iolock);
+               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
        else
                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &new_size, &iolock);
-
-       xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+                                                 ocount);
 
-       if (ret <= 0)
-               goto out_unlock;
+       if (ret > 0) {
+               ssize_t err;
 
-       /* Handle various SYNC-type writes */
-       if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-               loff_t end = pos + ret - 1;
-               int error;
+               XFS_STATS_ADD(xs_write_bytes, ret);
 
-               xfs_rw_iunlock(ip, iolock);
-               error = xfs_file_fsync(file, pos, end,
-                                     (file->f_flags & __O_SYNC) ? 0 : 1);
-               xfs_rw_ilock(ip, iolock);
-               if (error)
-                       ret = error;
+               /* Handle various SYNC-type writes */
+               err = generic_write_sync(file, pos, ret);
+               if (err < 0)
+                       ret = err;
        }
 
-out_unlock:
-       xfs_aio_write_newsize_update(ip, new_size);
-       xfs_rw_iunlock(ip, iolock);
        return ret;
 }
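
The open-coded O_DSYNC/IS_SYNC handling, which had to drop and retake the iolock around xfs_file_fsync(), is replaced by the VFS helper. For reference, generic_write_sync() of that era reduces to roughly this (a simplified sketch of fs/sync.c, not part of the patch):

    int generic_write_sync(struct file *file, loff_t pos, loff_t count)
    {
            if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
                    return 0;
            return vfs_fsync_range(file, pos, pos + count - 1,
                                   (file->f_flags & __O_SYNC) ? 0 : 1);
    }

Calling it after the iolock has already been dropped avoids the unlock/relock dance entirely.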
 
index ed88ed1..652b875 100644
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
 
        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
                return -filemap_fdatawait_range(mapping, first,
-                                       last == -1 ? ip->i_size - 1 : last);
+                                       last == -1 ? XFS_ISIZE(ip) - 1 : last);
        }
        return 0;
 }
index 3960a06..8c3e463 100644
@@ -77,7 +77,7 @@ xfs_inode_alloc(
 
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(completion_done(&ip->i_flush));
+       ASSERT(!xfs_isiflocked(ip));
        ASSERT(ip->i_ino == 0);
 
        mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
        ip->i_update_core = 0;
        ip->i_delayed_blks = 0;
        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-       ip->i_size = 0;
-       ip->i_new_size = 0;
 
        return ip;
 }
@@ -150,7 +148,7 @@ xfs_inode_free(
        /* asserts to verify all state is correct here */
        ASSERT(atomic_read(&ip->i_pincount) == 0);
        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(completion_done(&ip->i_flush));
+       ASSERT(!xfs_isiflocked(ip));
 
        /*
         * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
 
        *ipp = ip;
 
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
        /*
         * If we have a real type for an on-disk inode, we can set ops(&unlock)
         * now.  If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
        return 0;
 }
 #endif
+
+void
+__xfs_iflock(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
+}
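
__xfs_iflock() is the slow path of the new bit-based flush lock (the fast-path helpers land in xfs_inode.h further down in this patch). Two details matter in hand-rolled wait loops like this: prepare_to_wait_exclusive() queues the task before the flag is re-tested, which closes the race with a concurrent wake_up_bit(), and the exclusive wait means the releaser wakes only one contender at a time. The releaser half, as it appears later in the patch:

    /* releaser side (see xfs_ifunlock() in xfs_inode.h below) */
    xfs_iflags_clear(ip, XFS_IFLOCK);
    wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);

The waiter loops here until its own xfs_iflock_nowait() test-and-set wins.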
index 9dda7cc..b210224 100644
@@ -299,11 +299,8 @@ xfs_iformat(
 {
        xfs_attr_shortform_t    *atp;
        int                     size;
-       int                     error;
+       int                     error = 0;
        xfs_fsize_t             di_size;
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
-       error = 0;
 
        if (unlikely(be32_to_cpu(dip->di_nextents) +
                     be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
                        return XFS_ERROR(EFSCORRUPTED);
                }
                ip->i_d.di_size = 0;
-               ip->i_size = 0;
                ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
                break;
 
@@ -409,10 +405,10 @@ xfs_iformat(
        }
        if (!XFS_DFORK_Q(dip))
                return 0;
+
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
-       ip->i_afp->if_ext_max =
-               XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
+
        switch (dip->di_aformat) {
        case XFS_DINODE_FMT_LOCAL:
                atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
         * or the number of extents is greater than the number of
         * blocks.
         */
-       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
-           || XFS_BMDR_SPACE_CALC(nrecs) >
-                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
-           || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+       if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
+                       XFS_IFORK_MAXEXT(ip, whichfork) ||
+                    XFS_BMDR_SPACE_CALC(nrecs) >
+                       XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
+                    XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
                xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
                        (unsigned long long) ip->i_ino);
                XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
                 * with the uninitialized part of it.
                 */
                ip->i_d.di_mode = 0;
-               /*
-                * Initialize the per-fork minima and maxima for a new
-                * inode here.  xfs_iformat will do it for old inodes.
-                */
-               ip->i_df.if_ext_max =
-                       XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        }
 
        /*
@@ -861,7 +852,6 @@ xfs_iread(
        }
 
        ip->i_delayed_blks = 0;
-       ip->i_size = ip->i_d.di_size;
 
        /*
         * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
        }
 
        ip->i_d.di_size = 0;
-       ip->i_size = 0;
        ip->i_d.di_nextents = 0;
        ASSERT(ip->i_d.di_nblocks == 0);
 
@@ -1166,52 +1155,6 @@ xfs_ialloc(
 }
 
 /*
- * Check to make sure that there are no blocks allocated to the
- * file beyond the size of the file.  We don't check this for
- * files with fixed size extents or real time extents, but we
- * at least do it for regular files.
- */
-#ifdef DEBUG
-STATIC void
-xfs_isize_check(
-       struct xfs_inode        *ip,
-       xfs_fsize_t             isize)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           map_first;
-       int                     nimaps;
-       xfs_bmbt_irec_t         imaps[2];
-       int                     error;
-
-       if (!S_ISREG(ip->i_d.di_mode))
-               return;
-
-       if (XFS_IS_REALTIME_INODE(ip))
-               return;
-
-       if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
-               return;
-
-       nimaps = 2;
-       map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       /*
-        * The filesystem could be shutting down, so bmapi may return
-        * an error.
-        */
-       error = xfs_bmapi_read(ip, map_first,
-                        (XFS_B_TO_FSB(mp,
-                              (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
-                        imaps, &nimaps, XFS_BMAPI_ENTIRE);
-       if (error)
-               return;
-       ASSERT(nimaps == 1);
-       ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
-}
-#else  /* DEBUG */
-#define xfs_isize_check(ip, isize)
-#endif /* DEBUG */
-
-/*
  * Free up the underlying blocks past new_size.  The new size must be smaller
  * than the current size.  This routine can be used both for the attribute and
  * data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
        int                     done = 0;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-       ASSERT(new_size <= ip->i_size);
+       ASSERT(new_size <= XFS_ISIZE(ip));
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(ip->i_itemp != NULL);
        ASSERT(ip->i_itemp->ili_lock_flags == 0);
        ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
 
+       trace_xfs_itruncate_extents_start(ip, new_size);
+
        /*
         * Since it is possible for space to become allocated beyond
         * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
                        goto out;
        }
 
+       /*
+        * Always re-log the inode so that our permanent transaction can keep
+        * on rolling it forward in the log.
+        */
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       trace_xfs_itruncate_extents_end(ip, new_size);
+
 out:
        *tpp = tp;
        return error;
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
        goto out;
 }
 
-int
-xfs_itruncate_data(
-       struct xfs_trans        **tpp,
-       struct xfs_inode        *ip,
-       xfs_fsize_t             new_size)
-{
-       int                     error;
-
-       trace_xfs_itruncate_data_start(ip, new_size);
-
-       /*
-        * The first thing we do is set the size to new_size permanently on
-        * disk.  This way we don't have to worry about anyone ever being able
-        * to look at the data being freed even in the face of a crash.
-        * What we're getting around here is the case where we free a block, it
-        * is allocated to another file, it is written to, and then we crash.
-        * If the new data gets written to the file but the log buffers
-        * containing the free and reallocation don't, then we'd end up with
-        * garbage in the blocks being freed.  As long as we make the new_size
-        * permanent before actually freeing any blocks it doesn't matter if
-        * they get written to.
-        */
-       if (ip->i_d.di_nextents > 0) {
-               /*
-                * If we are not changing the file size then do not update
-                * the on-disk file size - we may be called from
-                * xfs_inactive_free_eofblocks().  If we update the on-disk
-                * file size and then the system crashes before the contents
-                * of the file are flushed to disk then the files may be
-                * full of holes (ie NULL files bug).
-                */
-               if (ip->i_size != new_size) {
-                       ip->i_d.di_size = new_size;
-                       ip->i_size = new_size;
-                       xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-               }
-       }
-
-       error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
-       if (error)
-               return error;
-
-       /*
-        * If we are not changing the file size then do not update the on-disk
-        * file size - we may be called from xfs_inactive_free_eofblocks().
-        * If we update the on-disk file size and then the system crashes
-        * before the contents of the file are flushed to disk then the files
-        * may be full of holes (ie NULL files bug).
-        */
-       xfs_isize_check(ip, new_size);
-       if (ip->i_size != new_size) {
-               ip->i_d.di_size = new_size;
-               ip->i_size = new_size;
-       }
-
-       ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
-       ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
-
-       /*
-        * Always re-log the inode so that our permanent transaction can keep
-        * on rolling it forward in the log.
-        */
-       xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
-
-       trace_xfs_itruncate_data_end(ip, new_size);
-       return 0;
-}
-
 /*
  * This is called when the inode's link count goes to 0.
  * We place the on-disk inode on a list in the AGI.  It
@@ -1824,8 +1709,7 @@ xfs_ifree(
        ASSERT(ip->i_d.di_nlink == 0);
        ASSERT(ip->i_d.di_nextents == 0);
        ASSERT(ip->i_d.di_anextents == 0);
-       ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) ||
-              (!S_ISREG(ip->i_d.di_mode)));
+       ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
        ASSERT(ip->i_d.di_nblocks == 0);
 
        /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
        ip->i_d.di_flags = 0;
        ip->i_d.di_dmevmask = 0;
        ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
-       ip->i_df.if_ext_max =
-               XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
        ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
        /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
  * once someone is waiting for it to be unpinned.
  */
 static void
-xfs_iunpin_nowait(
+xfs_iunpin(
        struct xfs_inode        *ip)
 {
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
 
 }
 
+static void
+__xfs_iunpin_wait(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
+
+       xfs_iunpin(ip);
+
+       do {
+               prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_ipincount(ip))
+                       io_schedule();
+       } while (xfs_ipincount(ip));
+       finish_wait(wq, &wait.wait);
+}
+
 void
 xfs_iunpin_wait(
        struct xfs_inode        *ip)
 {
-       if (xfs_ipincount(ip)) {
-               xfs_iunpin_nowait(ip);
-               wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
-       }
+       if (xfs_ipincount(ip))
+               __xfs_iunpin_wait(ip);
 }
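
__xfs_iunpin_wait() uses the same bit-waitqueue idiom but with a plain (non-exclusive) prepare_to_wait(), since every waiter may proceed once the pin count reaches zero; the waker side is the wake_up_bit() added to xfs_inode_item_unpin() below. The generic shape of both loops, with still_blocked() standing in for xfs_ipincount() here and xfs_isiflocked() in __xfs_iflock():

    DEFINE_WAIT_BIT(wait, &ip->i_flags, BIT_NR);  /* BIT_NR per caller */
    wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, BIT_NR);

    do {
            prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
            if (still_blocked(ip))          /* re-check after queueing */
                    io_schedule();          /* sleep until wake_up_bit() */
    } while (still_blocked(ip));
    finish_wait(wq, &wait.wait);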
 
 /*
@@ -2510,9 +2407,9 @@ xfs_iflush(
        XFS_STATS_INC(xs_iflush_count);
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-              ip->i_d.di_nextents > ip->i_df.if_ext_max);
+              ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
         * out for us if they occur after the log force completes.
         */
        if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
-               xfs_iunpin_nowait(ip);
+               xfs_iunpin(ip);
                xfs_ifunlock(ip);
                return EAGAIN;
        }
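
xfs_iflush() is entered with the flush lock already held and must release it on every path, including the EAGAIN bail-out above. With the bit-based helpers this patch introduces, a non-blocking caller looks roughly like this (a sketch of the protocol, not a quotation):

    if (!xfs_iflock_nowait(ip))
            return EAGAIN;          /* another thread is flushing */
    error = xfs_iflush(ip, flags);  /* lock released on error or when
                                       the flush I/O completes */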
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
 #endif
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-              ip->i_d.di_nextents > ip->i_df.if_ext_max);
+              ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
index f0e6b15..2f27b74 100644
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
        struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
-       unsigned char           if_ext_max;     /* max # of extent records */
        union {
                xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
                xfs_ext_irec_t  *if_ext_irec;   /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
        ((w) == XFS_DATA_FORK ? \
                ((ip)->i_d.di_nextents = (n)) : \
                ((ip)->i_d.di_anextents = (n)))
-
+#define XFS_IFORK_MAXEXT(ip, w) \
+       (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
 
 
 #ifdef __KERNEL__
 
-struct bhv_desc;
 struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot;
 
-typedef struct dm_attrs_s {
-       __uint32_t      da_dmevmask;    /* DMIG event mask */
-       __uint16_t      da_dmstate;     /* DMIG state info */
-       __uint16_t      da_pad;         /* DMIG extra padding */
-} dm_attrs_t;
-
 typedef struct xfs_inode {
        /* Inode linking and identification information. */
        struct xfs_mount        *i_mount;       /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
        struct xfs_inode_log_item *i_itemp;     /* logging information */
        mrlock_t                i_lock;         /* inode lock */
        mrlock_t                i_iolock;       /* inode IO lock */
-       struct completion       i_flush;        /* inode flush completion q */
        atomic_t                i_pincount;     /* inode pin count */
-       wait_queue_head_t       i_ipin_wait;    /* inode pinning wait queue */
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
        /* Miscellaneous state. */
-       unsigned short          i_flags;        /* see defined flags below */
+       unsigned long           i_flags;        /* see defined flags below */
        unsigned char           i_update_core;  /* timestamps/size is dirty */
        unsigned int            i_delayed_blks; /* count of delay alloc blks */
 
        xfs_icdinode_t          i_d;            /* most of ondisk inode */
 
-       xfs_fsize_t             i_size;         /* in-memory size */
-       xfs_fsize_t             i_new_size;     /* size when write completes */
-
        /* VFS inode */
        struct inode            i_vnode;        /* embedded VFS inode */
 } xfs_inode_t;
 
-#define XFS_ISIZE(ip)  S_ISREG((ip)->i_d.di_mode) ? \
-                               (ip)->i_size : (ip)->i_d.di_size;
-
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 }
 
 /*
+ * For regular files we only update the on-disk file size when actually
+ * writing data back to disk.  Until then only the copy in the VFS inode
+ * is up to date.
+ */
+static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
+{
+       if (S_ISREG(ip->i_d.di_mode))
+               return i_size_read(VFS_I(ip));
+       return ip->i_d.di_size;
+}
+
+/*
  * i_flags helper functions
  */
 static inline void
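
With the old macro gone (note its trailing semicolon and unparenthesized ternary in the removed lines above, both classic macro hazards), XFS_ISIZE() becomes the single way to ask how large a file currently is: the VFS i_size for regular files, the on-disk di_size for everything else. A typical call site, mirroring the conversions throughout this patch:

    /* e.g. in the allocator paths converted in xfs_iomap.c (sketch): */
    if (offset + count > XFS_ISIZE(ip))
            /* extending write: EOF alignment / preallocation applies */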
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
        return ret;
 }
 
+static inline int
+xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
+{
+       int ret;
+
+       spin_lock(&ip->i_flags_lock);
+       ret = ip->i_flags & flags;
+       if (!ret)
+               ip->i_flags |= flags;
+       spin_unlock(&ip->i_flags_lock);
+       return ret;
+}
+
 /*
  * Project quota id helpers (previously projid was 16bit only
  * and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
 }
 
 /*
- * Manage the i_flush queue embedded in the inode.  This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
-       wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
-       return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
-       complete(&ip->i_flush);
-}
-
-/*
  * In-core inode flags.
  */
-#define XFS_IRECLAIM           0x0001  /* started reclaiming this inode */
-#define XFS_ISTALE             0x0002  /* inode has been staled */
-#define XFS_IRECLAIMABLE       0x0004  /* inode can be reclaimed */
-#define XFS_INEW               0x0008  /* inode has just been allocated */
-#define XFS_IFILESTREAM                0x0010  /* inode is in a filestream directory */
-#define XFS_ITRUNCATED         0x0020  /* truncated down so flush-on-close */
-#define XFS_IDIRTY_RELEASE     0x0040  /* dirty release already seen */
+#define XFS_IRECLAIM           (1 << 0) /* started reclaiming this inode */
+#define XFS_ISTALE             (1 << 1) /* inode has been staled */
+#define XFS_IRECLAIMABLE       (1 << 2) /* inode can be reclaimed */
+#define XFS_INEW               (1 << 3) /* inode has just been allocated */
+#define XFS_IFILESTREAM                (1 << 4) /* inode is in a filestream dir. */
+#define XFS_ITRUNCATED         (1 << 5) /* truncated down so flush-on-close */
+#define XFS_IDIRTY_RELEASE     (1 << 6) /* dirty release already seen */
+#define __XFS_IFLOCK_BIT       7        /* inode is being flushed right now */
+#define XFS_IFLOCK             (1 << __XFS_IFLOCK_BIT)
+#define __XFS_IPINNED_BIT      8        /* wakeup key for zero pin count */
+#define XFS_IPINNED            (1 << __XFS_IPINNED_BIT)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
         XFS_IFILESTREAM);
 
 /*
+ * Synchronize processes attempting to flush the in-core inode back to disk.
+ */
+
+extern void __xfs_iflock(struct xfs_inode *ip);
+
+static inline int xfs_iflock_nowait(struct xfs_inode *ip)
+{
+       return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
+}
+
+static inline void xfs_iflock(struct xfs_inode *ip)
+{
+       if (!xfs_iflock_nowait(ip))
+               __xfs_iflock(ip);
+}
+
+static inline void xfs_ifunlock(struct xfs_inode *ip)
+{
+       xfs_iflags_clear(ip, XFS_IFLOCK);
+       wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
+}
+
+static inline int xfs_isiflocked(struct xfs_inode *ip)
+{
+       return xfs_iflags_test(ip, XFS_IFLOCK);
+}
+
+/*
  * Flags for inode locking.
  * Bit ranges: 1<<1  - 1<<16-1 -- iolock/ilock modes (bitfield)
  *             1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int         xfs_ifree(struct xfs_trans *, xfs_inode_t *,
                           struct xfs_bmap_free *);
 int            xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
                                      int, xfs_fsize_t);
-int            xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
-                                  xfs_fsize_t);
 int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
index cfd6c7f..91d71dc 100644
@@ -79,8 +79,6 @@ xfs_inode_item_size(
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               ASSERT(ip->i_df.if_ext_max ==
-                      XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
                iip->ili_format.ilf_fields &=
                        ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
                          XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
        trace_xfs_inode_unpin(ip, _RET_IP_);
        ASSERT(atomic_read(&ip->i_pincount) > 0);
        if (atomic_dec_and_test(&ip->i_pincount))
-               wake_up(&ip->i_ipin_wait);
+               wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
 }
 
 /*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
         * If a flush is not in progress anymore, chances are that the
         * inode was taken off the AIL. So, just get out.
         */
-       if (completion_done(&ip->i_flush) ||
+       if (!xfs_isiflocked(ip) ||
            !(lip->li_flags & XFS_LI_IN_AIL)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
        struct xfs_inode        *ip = iip->ili_inode;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-       ASSERT(!completion_done(&ip->i_flush));
+       ASSERT(xfs_isiflocked(ip));
 
        /*
         * Since we were able to lock the inode's flush lock and
index 9afa282..246c7d5 100644
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
        xfs_fileoff_t   *last_fsb)
 {
        xfs_fileoff_t   new_last_fsb = 0;
-       xfs_extlen_t    align;
+       xfs_extlen_t    align = 0;
        int             eof, error;
 
-       if (XFS_IS_REALTIME_INODE(ip))
-               ;
-       /*
-        * If mounted with the "-o swalloc" option, roundup the allocation
-        * request to a stripe width boundary if the file size is >=
-        * stripe width and we are allocating past the allocation eof.
-        */
-       else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
-               (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
-               new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
-       /*
-        * Roundup the allocation request to a stripe unit (m_dalign) boundary
-        * if the file size is >= stripe unit size, and we are allocating past
-        * the allocation eof.
-        */
-       else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
-               new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
+       if (!XFS_IS_REALTIME_INODE(ip)) {
+               /*
+                * Round up the allocation request to a stripe unit
+                * (m_dalign) boundary if the file size is >= stripe unit
+                * size, and we are allocating past the allocation eof.
+                *
+                * If mounted with the "-o swalloc" option the alignment is
+                * increased from the stripe unit size to the stripe width.
+                */
+               if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+                       align = mp->m_swidth;
+               else if (mp->m_dalign)
+                       align = mp->m_dalign;
+
+               if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
+                       new_last_fsb = roundup_64(*last_fsb, align);
+       }
 
        /*
         * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
 
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       if ((offset + count) > ip->i_size) {
+       if ((offset + count) > XFS_ISIZE(ip)) {
                error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
                if (error)
                        goto error_out;
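
A concrete worked example of the rewritten alignment helper (hypothetical geometry): with a stripe unit m_dalign of 16 filesystem blocks, no swalloc, and a file size above the threshold, a request whose *last_fsb works out to 70 is padded via roundup_64(70, 16) to 80, ending the allocation on a stripe-unit boundary; mounted with -o swalloc and an m_swidth of 64, the same request would round up to 128 instead.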
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
        xfs_trans_ijoin(tp, ip, 0);
 
        bmapi_flag = 0;
-       if (offset < ip->i_size || extsz)
+       if (offset < XFS_ISIZE(ip) || extsz)
                bmapi_flag |= XFS_BMAPI_PREALLOC;
 
        /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
        int             found_delalloc = 0;
 
        *prealloc = 0;
-       if ((offset + count) <= ip->i_size)
+       if (offset + count <= XFS_ISIZE(ip))
                return 0;
 
        /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
                 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
                 * ensure we always pass in a non-zero value.
                 */
-               alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1;
+               alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
                                        rounddown_pow_of_two(alloc_blocks));
 
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
                         * back....
                         */
                        nimaps = 1;
-                       end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
+                       end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
                        error = xfs_bmap_last_offset(NULL, ip, &last_block,
                                                        XFS_DATA_FORK);
                        if (error)
index f9babd1..ab30253 100644
@@ -750,6 +750,7 @@ xfs_setattr_size(
        struct xfs_mount        *mp = ip->i_mount;
        struct inode            *inode = VFS_I(ip);
        int                     mask = iattr->ia_valid;
+       xfs_off_t               oldsize, newsize;
        struct xfs_trans        *tp;
        int                     error;
        uint                    lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
                lock_flags |= XFS_IOLOCK_EXCL;
        xfs_ilock(ip, lock_flags);
 
+       oldsize = inode->i_size;
+       newsize = iattr->ia_size;
+
        /*
         * Short circuit the truncate case for zero length files.
         */
-       if (iattr->ia_size == 0 &&
-           ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+       if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
                if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
                        goto out_unlock;
 
@@ -807,14 +810,14 @@ xfs_setattr_size(
         * the inode to the transaction, because the inode cannot be unlocked
         * once it is a part of the transaction.
         */
-       if (iattr->ia_size > ip->i_size) {
+       if (newsize > oldsize) {
                /*
                 * Do the first part of growing a file: zero any data in the
                 * last block that is beyond the old EOF.  We need to do this
                 * before the inode is joined to the transaction to modify
                 * i_size.
                 */
-               error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+               error = xfs_zero_eof(ip, newsize, oldsize);
                if (error)
                        goto out_unlock;
        }
@@ -833,8 +836,8 @@ xfs_setattr_size(
         * here and prevents waiting for other data not within the range we
         * care about here.
         */
-       if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-               error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0,
+       if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+               error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
                                        FI_NONE);
                if (error)
                        goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
         */
        inode_dio_wait(inode);
 
-       error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
-                                    xfs_get_blocks);
+       error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
        if (error)
                goto out_unlock;
 
@@ -857,7 +859,7 @@ xfs_setattr_size(
        if (error)
                goto out_trans_cancel;
 
-       truncate_setsize(inode, iattr->ia_size);
+       truncate_setsize(inode, newsize);
 
        commit_flags = XFS_TRANS_RELEASE_LOG_RES;
        lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
         * these flags set.  For all other operations the VFS set these flags
         * explicitly if it wants a timestamp update.
         */
-       if (iattr->ia_size != ip->i_size &&
-           (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+       if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
                iattr->ia_ctime = iattr->ia_mtime =
                        current_fs_time(inode->i_sb);
                mask |= ATTR_CTIME | ATTR_MTIME;
        }
 
-       if (iattr->ia_size > ip->i_size) {
-               ip->i_d.di_size = iattr->ia_size;
-               ip->i_size = iattr->ia_size;
-       } else if (iattr->ia_size <= ip->i_size ||
-                  (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
-               error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+       /*
+        * The first thing we do is set the size to new_size permanently on
+        * disk.  This way we don't have to worry about anyone ever being able
+        * to look at the data being freed even in the face of a crash.
+        * What we're getting around here is the case where we free a block, it
+        * is allocated to another file, it is written to, and then we crash.
+        * If the new data gets written to the file but the log buffers
+        * containing the free and reallocation don't, then we'd end up with
+        * garbage in the blocks being freed.  As long as we make the new size
+        * permanent before actually freeing any blocks it doesn't matter if
+        * they get written to.
+        */
+       ip->i_d.di_size = newsize;
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       if (newsize <= oldsize) {
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                if (error)
                        goto out_trans_abort;
 
index 5cc3dde..eafbcff 100644
@@ -31,6 +31,7 @@
 #include "xfs_mount.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, 0);
 
-       error = xfs_itruncate_data(&tp, ip, 0);
+       ip->i_d.di_size = 0;
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
        if (error) {
                xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
                                     XFS_TRANS_ABORT);
                goto out_unlock;
        }
 
+       ASSERT(ip->i_d.di_nextents == 0);
+
        xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
        error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 
index 281961c..ee5b695 100644
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
        /* xfs inode */
        atomic_set(&ip->i_pincount, 0);
        spin_lock_init(&ip->i_flags_lock);
-       init_waitqueue_head(&ip->i_ipin_wait);
-       /*
-        * Because we want to use a counting completion, complete
-        * the flush completion once to allow a single access to
-        * the flush completion without blocking.
-        */
-       init_completion(&ip->i_flush);
-       complete(&ip->i_flush);
 
        mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
                     "xfsino", ip->i_ino);
index 72c01a1..40b75ee 100644
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
                return 1;
 
        /*
-        * do some unlocked checks first to avoid unnecessary lock traffic.
-        * The first is a flush lock check, the second is a already in reclaim
-        * check. Only do these checks if we are not going to block on locks.
+        * If we are asked for non-blocking operation, do unlocked checks to
+        * see if the inode is already being flushed or in reclaim, to avoid
+        * lock traffic.
         */
        if ((flags & SYNC_TRYLOCK) &&
-           (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+           __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
                return 1;
-       }
 
        /*
         * The radix tree lock here protects a thread in xfs_iget from racing
index a9d5b1e..6b6df58 100644
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(xfs_fsize_t, size)
-               __field(xfs_fsize_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
                __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
                __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
                __entry->flags = flags;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
                  "offset 0x%llx count 0x%zx ioflags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count,
                  __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __field(dev_t, dev)
                __field(xfs_ino_t, ino)
                __field(loff_t, size)
-               __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
                __field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
                __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
                __entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
                __entry->startblock = irec ? irec->br_startblock : 0;
                __entry->blockcount = irec ? irec->br_blockcount : 0;
        ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count %zd type %s "
-                 "startoff 0x%llx startblock %lld blockcount 0x%llx",
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
+                 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->size,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count,
                  __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
                __field(xfs_ino_t, ino)
                __field(loff_t, isize)
                __field(loff_t, disize)
-               __field(loff_t, new_size)
                __field(loff_t, offset)
                __field(size_t, count)
        ),
        TP_fast_assign(
                __entry->dev = VFS_I(ip)->i_sb->s_dev;
                __entry->ino = ip->i_ino;
-               __entry->isize = ip->i_size;
+               __entry->isize = VFS_I(ip)->i_size;
                __entry->disize = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
                __entry->offset = offset;
                __entry->count = count;
        ),
-       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
                  "offset 0x%llx count %zd",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __entry->isize,
                  __entry->disize,
-                 __entry->new_size,
                  __entry->offset,
                  __entry->count)
 );
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
 DEFINE_EVENT(xfs_itrunc_class, name, \
        TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
        TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
 
 TRACE_EVENT(xfs_pagecache_inval,
        TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                __field(xfs_ino_t, ino)
                __field(int, format)
                __field(int, nex)
-               __field(int, max_nex)
                __field(int, broot_size)
                __field(int, fork_off)
        ),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
                __entry->ino = ip->i_ino;
                __entry->format = ip->i_d.di_format;
                __entry->nex = ip->i_d.di_nextents;
-               __entry->max_nex = ip->i_df.if_ext_max;
                __entry->broot_size = ip->i_df.if_broot_bytes;
                __entry->fork_off = XFS_IFORK_BOFF(ip);
        ),
        TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
-                 "Max in-fork extents %d, broot size %d, fork offset %d",
+                 "broot size %d, fork offset %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->ino,
                  __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
                  __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
                  __entry->nex,
-                 __entry->max_nex,
                  __entry->broot_size,
                  __entry->fork_off)
 )
index f2fea86..0cf52da 100644
@@ -175,7 +175,7 @@ xfs_free_eofblocks(
         * Figure out if there are any blocks beyond the end
         * of the file.  If not, then there is nothing to do.
         */
-       end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
        last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
        if (last_fsb <= end_fsb)
                return 0;
@@ -226,7 +226,14 @@ xfs_free_eofblocks(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
 
-               error = xfs_itruncate_data(&tp, ip, ip->i_size);
+               /*
+                * Do not update the on-disk file size.  If we update the
+                * on-disk file size and then the system crashes before the
+                * contents of the file are flushed to disk then the files
+                * may be full of holes (i.e. the NULL files bug).
+                */
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
+                                             XFS_ISIZE(ip));
                if (error) {
                        /*
                         * If we get an error at this point we simply don't
@@ -540,8 +547,8 @@ xfs_release(
                return 0;
 
        if ((S_ISREG(ip->i_d.di_mode) &&
-            ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-              ip->i_delayed_blks > 0)) &&
+            (VFS_I(ip)->i_size > 0 ||
+             (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
             (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
            (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -618,7 +625,7 @@ xfs_inactive(
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
-           ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+           ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            S_ISREG(ip->i_d.di_mode));
 
@@ -632,12 +639,12 @@ xfs_inactive(
 
        if (ip->i_d.di_nlink != 0) {
                if ((S_ISREG(ip->i_d.di_mode) &&
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
-                     (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-                    (!(ip->i_d.di_flags &
+                   (VFS_I(ip)->i_size > 0 ||
+                    (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
+                   (ip->i_df.if_flags & XFS_IFEXTENTS) &&
+                   (!(ip->i_d.di_flags &
                                (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-                     (ip->i_delayed_blks != 0)))) {
+                    ip->i_delayed_blks != 0))) {
                        error = xfs_free_eofblocks(mp, ip, 0);
                        if (error)
                                return VN_INACTIVE_CACHE;
@@ -670,13 +677,18 @@ xfs_inactive(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
 
-               error = xfs_itruncate_data(&tp, ip, 0);
+               ip->i_d.di_size = 0;
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+               error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
                if (error) {
                        xfs_trans_cancel(tp,
                                XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                        return VN_INACTIVE_CACHE;
                }
+
+               ASSERT(ip->i_d.di_nextents == 0);
        } else if (S_ISLNK(ip->i_d.di_mode)) {
 
                /*
@@ -1961,11 +1973,11 @@ xfs_zero_remaining_bytes(
         * since nothing can read beyond eof.  The space will
         * be zeroed when the file is extended anyway.
         */
-       if (startoff >= ip->i_size)
+       if (startoff >= XFS_ISIZE(ip))
                return 0;
 
-       if (endoff > ip->i_size)
-               endoff = ip->i_size;
+       if (endoff > XFS_ISIZE(ip))
+               endoff = XFS_ISIZE(ip);
 
        bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2272,7 @@ xfs_change_file_space(
                bf->l_start += offset;
                break;
        case 2: /*SEEK_END*/
-               bf->l_start += ip->i_size;
+               bf->l_start += XFS_ISIZE(ip);
                break;
        default:
                return XFS_ERROR(EINVAL);
@@ -2277,7 +2289,7 @@ xfs_change_file_space(
        bf->l_whence = 0;
 
        startoffset = bf->l_start;
-       fsize = ip->i_size;
+       fsize = XFS_ISIZE(ip);
 
        /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
index 9eabffb..033f6aa 100644
@@ -134,7 +134,7 @@ struct pl08x_txd {
        struct dma_async_tx_descriptor tx;
        struct list_head node;
        struct list_head dsg_list;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        dma_addr_t llis_bus;
        struct pl08x_lli *llis_va;
        /* Default cctl value for LLIs */
@@ -197,7 +197,7 @@ struct pl08x_dma_chan {
        dma_addr_t dst_addr;
        u32 src_cctl;
        u32 dst_cctl;
-       enum dma_data_direction runtime_direction;
+       enum dma_transfer_direction runtime_direction;
        dma_cookie_t lc;
        struct list_head pend_list;
        struct pl08x_txd *at;
index 426ab9f..9ff7a2c 100644
@@ -26,6 +26,7 @@
 
 #include <linux/types.h>
 #include <linux/elf-em.h>
+#include <linux/ptrace.h>
 
 /* The netlink messages for the audit system are divided into blocks:
  * 1000 - 1099 are for commanding the audit system
  * AUDIT_UNUSED_BITS is updated if need be. */
 #define AUDIT_UNUSED_BITS      0x07FFFC00
 
+/* AUDIT_FIELD_COMPARE rule list */
+#define AUDIT_COMPARE_UID_TO_OBJ_UID   1
+#define AUDIT_COMPARE_GID_TO_OBJ_GID   2
+#define AUDIT_COMPARE_EUID_TO_OBJ_UID  3
+#define AUDIT_COMPARE_EGID_TO_OBJ_GID  4
+#define AUDIT_COMPARE_AUID_TO_OBJ_UID  5
+#define AUDIT_COMPARE_SUID_TO_OBJ_UID  6
+#define AUDIT_COMPARE_SGID_TO_OBJ_GID  7
+#define AUDIT_COMPARE_FSUID_TO_OBJ_UID 8
+#define AUDIT_COMPARE_FSGID_TO_OBJ_GID 9
+
+#define AUDIT_COMPARE_UID_TO_AUID      10
+#define AUDIT_COMPARE_UID_TO_EUID      11
+#define AUDIT_COMPARE_UID_TO_FSUID     12
+#define AUDIT_COMPARE_UID_TO_SUID      13
+
+#define AUDIT_COMPARE_AUID_TO_FSUID    14
+#define AUDIT_COMPARE_AUID_TO_SUID     15
+#define AUDIT_COMPARE_AUID_TO_EUID     16
+
+#define AUDIT_COMPARE_EUID_TO_SUID     17
+#define AUDIT_COMPARE_EUID_TO_FSUID    18
+
+#define AUDIT_COMPARE_SUID_TO_FSUID    19
+
+#define AUDIT_COMPARE_GID_TO_EGID      20
+#define AUDIT_COMPARE_GID_TO_FSGID     21
+#define AUDIT_COMPARE_GID_TO_SGID      22
+
+#define AUDIT_COMPARE_EGID_TO_FSGID    23
+#define AUDIT_COMPARE_EGID_TO_SGID     24
+#define AUDIT_COMPARE_SGID_TO_FSGID    25
+
+#define AUDIT_MAX_FIELD_COMPARE                AUDIT_COMPARE_SGID_TO_FSGID
 
 /* Rule fields */
                                /* These are useful when checking the
 #define AUDIT_PERM     106
 #define AUDIT_DIR      107
 #define AUDIT_FILETYPE 108
+#define AUDIT_OBJ_UID  109
+#define AUDIT_OBJ_GID  110
+#define AUDIT_FIELD_COMPARE    111
 
 #define AUDIT_ARG0      200
 #define AUDIT_ARG1      (AUDIT_ARG0+1)
@@ -408,28 +446,24 @@ struct audit_field {
        void                            *lsm_rule;
 };
 
-#define AUDITSC_INVALID 0
-#define AUDITSC_SUCCESS 1
-#define AUDITSC_FAILURE 2
-#define AUDITSC_RESULT(x) ( ((long)(x))<0?AUDITSC_FAILURE:AUDITSC_SUCCESS )
 extern int __init audit_register_class(int class, unsigned *list);
 extern int audit_classify_syscall(int abi, unsigned syscall);
 extern int audit_classify_arch(int arch);
 #ifdef CONFIG_AUDITSYSCALL
 /* These are defined in auditsc.c */
                                /* Public API */
-extern void audit_finish_fork(struct task_struct *child);
 extern int  audit_alloc(struct task_struct *task);
-extern void audit_free(struct task_struct *task);
-extern void audit_syscall_entry(int arch,
-                               int major, unsigned long a0, unsigned long a1,
-                               unsigned long a2, unsigned long a3);
-extern void audit_syscall_exit(int failed, long return_code);
+extern void __audit_free(struct task_struct *task);
+extern void __audit_syscall_entry(int arch,
+                                 int major, unsigned long a0, unsigned long a1,
+                                 unsigned long a2, unsigned long a3);
+extern void __audit_syscall_exit(int ret_success, long ret_value);
 extern void __audit_getname(const char *name);
 extern void audit_putname(const char *name);
 extern void __audit_inode(const char *name, const struct dentry *dentry);
 extern void __audit_inode_child(const struct dentry *dentry,
                                const struct inode *parent);
+extern void __audit_seccomp(unsigned long syscall);
 extern void __audit_ptrace(struct task_struct *t);
 
 static inline int audit_dummy_context(void)
@@ -437,6 +471,27 @@ static inline int audit_dummy_context(void)
        void *p = current->audit_context;
        return !p || *(int *)p;
 }
+static inline void audit_free(struct task_struct *task)
+{
+       if (unlikely(task->audit_context))
+               __audit_free(task);
+}
+static inline void audit_syscall_entry(int arch, int major, unsigned long a0,
+                                      unsigned long a1, unsigned long a2,
+                                      unsigned long a3)
+{
+       if (unlikely(!audit_dummy_context()))
+               __audit_syscall_entry(arch, major, a0, a1, a2, a3);
+}
+static inline void audit_syscall_exit(void *pt_regs)
+{
+       if (unlikely(current->audit_context)) {
+               int success = is_syscall_success(pt_regs);
+               int return_code = regs_return_value(pt_regs);
+
+               __audit_syscall_exit(success, return_code);
+       }
+}
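
These wrappers move the common no-audit-context test inline, so the fast path costs one branch while the real work stays out of line in auditsc.c. Success and return value are now derived from the register frame via is_syscall_success()/regs_return_value() (hence the new <linux/ptrace.h> include), which shrinks the arch syscall-exit hooks to roughly this (an abridged sketch of a typical arch-side caller, following the x86 naming):

    void syscall_trace_leave(struct pt_regs *regs)
    {
            audit_syscall_exit(regs);   /* replaces the (failed, code) pair */
            /* ... ptrace/tracehook reporting ... */
    }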
 static inline void audit_getname(const char *name)
 {
        if (unlikely(!audit_dummy_context()))
@@ -453,6 +508,12 @@ static inline void audit_inode_child(const struct dentry *dentry,
 }
 void audit_core_dumps(long signr);
 
+static inline void audit_seccomp(unsigned long syscall)
+{
+       if (unlikely(!audit_dummy_context()))
+               __audit_seccomp(syscall);
+}
+
 static inline void audit_ptrace(struct task_struct *t)
 {
        if (unlikely(!audit_dummy_context()))
@@ -463,17 +524,16 @@ static inline void audit_ptrace(struct task_struct *t)
 extern unsigned int audit_serial(void);
 extern int auditsc_get_stamp(struct audit_context *ctx,
                              struct timespec *t, unsigned int *serial);
-extern int  audit_set_loginuid(struct task_struct *task, uid_t loginuid);
+extern int  audit_set_loginuid(uid_t loginuid);
 #define audit_get_loginuid(t) ((t)->loginuid)
 #define audit_get_sessionid(t) ((t)->sessionid)
 extern void audit_log_task_context(struct audit_buffer *ab);
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
-extern int audit_bprm(struct linux_binprm *bprm);
-extern void audit_socketcall(int nargs, unsigned long *args);
-extern int audit_sockaddr(int len, void *addr);
+extern int __audit_bprm(struct linux_binprm *bprm);
+extern void __audit_socketcall(int nargs, unsigned long *args);
+extern int __audit_sockaddr(int len, void *addr);
 extern void __audit_fd_pair(int fd1, int fd2);
-extern int audit_set_macxattr(const char *name);
 extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
 extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec *abs_timeout);
 extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
@@ -499,6 +559,23 @@ static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid
        if (unlikely(!audit_dummy_context()))
                __audit_ipc_set_perm(qbytes, uid, gid, mode);
 }
+static inline int audit_bprm(struct linux_binprm *bprm)
+{
+       if (unlikely(!audit_dummy_context()))
+               return __audit_bprm(bprm);
+       return 0;
+}
+static inline void audit_socketcall(int nargs, unsigned long *args)
+{
+       if (unlikely(!audit_dummy_context()))
+               __audit_socketcall(nargs, args);
+}
+static inline int audit_sockaddr(int len, void *addr)
+{
+       if (unlikely(!audit_dummy_context()))
+               return __audit_sockaddr(len, addr);
+       return 0;
+}
 static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
 {
        if (unlikely(!audit_dummy_context()))
@@ -544,12 +621,11 @@ static inline void audit_mmap_fd(int fd, int flags)
 
 extern int audit_n_rules;
 extern int audit_signals;
-#else
-#define audit_finish_fork(t)
+#else /* CONFIG_AUDITSYSCALL */
 #define audit_alloc(t) ({ 0; })
 #define audit_free(t) do { ; } while (0)
 #define audit_syscall_entry(ta,a,b,c,d,e) do { ; } while (0)
-#define audit_syscall_exit(f,r) do { ; } while (0)
+#define audit_syscall_exit(r) do { ; } while (0)
 #define audit_dummy_context() 1
 #define audit_getname(n) do { ; } while (0)
 #define audit_putname(n) do { ; } while (0)
@@ -558,6 +634,7 @@ extern int audit_signals;
 #define audit_inode(n,d) do { (void)(d); } while (0)
 #define audit_inode_child(i,p) do { ; } while (0)
 #define audit_core_dumps(i) do { ; } while (0)
+#define audit_seccomp(i) do { ; } while (0)
 #define auditsc_get_stamp(c,t,s) (0)
 #define audit_get_loginuid(t) (-1)
 #define audit_get_sessionid(t) (-1)
@@ -568,7 +645,6 @@ extern int audit_signals;
 #define audit_socketcall(n,a) ((void)0)
 #define audit_fd_pair(n,a) ((void)0)
 #define audit_sockaddr(len, addr) ({ 0; })
-#define audit_set_macxattr(n) do { ; } while (0)
 #define audit_mq_open(o,m,a) ((void)0)
 #define audit_mq_sendrecv(d,l,p,t) ((void)0)
 #define audit_mq_notify(d,n) ((void)0)
@@ -579,7 +655,7 @@ extern int audit_signals;
 #define audit_ptrace(t) ((void)0)
 #define audit_n_rules 0
 #define audit_signals 0
-#endif
+#endif /* CONFIG_AUDITSYSCALL */
 
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
index efae755..b01558b 100644
@@ -46,7 +46,7 @@ struct signature_hdr {
        char            mpi[0];
 } __packed;
 
-#if defined(CONFIG_DIGSIG) || defined(CONFIG_DIGSIG_MODULE)
+#if defined(CONFIG_SIGNATURE) || defined(CONFIG_SIGNATURE_MODULE)
 
 int digsig_verify(struct key *keyring, const char *sig, int siglen,
                                        const char *digest, int digestlen);
@@ -59,6 +59,6 @@ static inline int digsig_verify(struct key *keyring, const char *sig,
        return -EOPNOTSUPP;
 }
 
-#endif /* CONFIG_DIGSIG */
+#endif /* CONFIG_SIGNATURE */
 
 #endif /* _DIGSIG_H */
index 75f53f8..679b349 100644
@@ -23,7 +23,6 @@
 
 #include <linux/device.h>
 #include <linux/uio.h>
-#include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
 #include <linux/bitmap.h>
 #include <asm/page.h>
@@ -72,11 +71,93 @@ enum dma_transaction_type {
        DMA_ASYNC_TX,
        DMA_SLAVE,
        DMA_CYCLIC,
+       DMA_INTERLEAVE,
+/* last transaction type for creation of the capabilities mask */
+       DMA_TX_TYPE_END,
 };
 
-/* last transaction type for creation of the capabilities mask */
-#define DMA_TX_TYPE_END (DMA_CYCLIC + 1)
+/**
+ * enum dma_transfer_direction - dma transfer mode and direction indicator
+ * @DMA_MEM_TO_MEM: Async/Memcpy mode
+ * @DMA_MEM_TO_DEV: Slave mode & From Memory to Device
+ * @DMA_DEV_TO_MEM: Slave mode & From Device to Memory
+ * @DMA_DEV_TO_DEV: Slave mode & From Device to Device
+ */
+enum dma_transfer_direction {
+       DMA_MEM_TO_MEM,
+       DMA_MEM_TO_DEV,
+       DMA_DEV_TO_MEM,
+       DMA_DEV_TO_DEV,
+       DMA_TRANS_NONE,
+};
+
+/**
+ * Interleaved Transfer Request
+ * ----------------------------
+ * A chunk is a collection of contiguous bytes to be transferred.
+ * The gap (in bytes) between two chunks is called the inter-chunk-gap (ICG).
+ * ICGs may or may not change between chunks.
+ * A FRAME is the smallest series of contiguous {chunk,icg} pairs
+ *  that, when repeated an integral number of times, specifies the transfer.
+ * A transfer template is a specification of a Frame, the number of times
+ *  it is to be repeated and other per-transfer attributes.
+ *
+ * Practically, a client driver would have ready a template for each
+ *  type of transfer it is going to need during its lifetime and
+ *  set only 'src_start' and 'dst_start' before submitting the requests.
+ *
+ *
+ *  |      Frame-1        |       Frame-2       | ~ |       Frame-'numf'  |
+ *  |====....==.===...=...|====....==.===...=...| ~ |====....==.===...=...|
+ *
+ *    ==  Chunk size
+ *    ... ICG
+ */
+
+/**
+ * struct data_chunk - Element of scatter-gather list that makes a frame.
+ * @size: Number of bytes to read from source.
+ *       size_dst := fn(op, size_src), so it doesn't mean much for the destination.
+ * @icg: Number of bytes to jump after last src/dst address of this
+ *      chunk and before first src/dst address for next chunk.
+ *      Ignored for dst (assumed 0) if dst_inc is true and dst_sgl is false.
+ *      Ignored for src (assumed 0) if src_inc is true and src_sgl is false.
+ */
+struct data_chunk {
+       size_t size;
+       size_t icg;
+};
 
+/**
+ * struct dma_interleaved_template - Template conveying the transfer pattern
+ *      and attributes to the DMAC.
+ * @src_start: Bus address of source for the first chunk.
+ * @dst_start: Bus address of destination for the first chunk.
+ * @dir: Specifies the type of Source and Destination.
+ * @src_inc: If the source address increments after reading from it.
+ * @dst_inc: If the destination address increments after writing to it.
+ * @src_sgl: If the 'icg' of sgl[] applies to Source (scattered read).
+ *             Otherwise, source is read contiguously (icg ignored).
+ *             Ignored if src_inc is false.
+ * @dst_sgl: If the 'icg' of sgl[] applies to Destination (scattered write).
+ *             Otherwise, destination is filled contiguously (icg ignored).
+ *             Ignored if dst_inc is false.
+ * @numf: Number of frames in this template.
+ * @frame_size: Number of chunks in a frame, i.e., size of sgl[].
+ * @sgl: Array of {chunk,icg} pairs that make up a frame.
+ */
+struct dma_interleaved_template {
+       dma_addr_t src_start;
+       dma_addr_t dst_start;
+       enum dma_transfer_direction dir;
+       bool src_inc;
+       bool dst_inc;
+       bool src_sgl;
+       bool dst_sgl;
+       size_t numf;
+       size_t frame_size;
+       struct data_chunk sgl[0];
+};
 
 /**
  * enum dma_ctrl_flags - DMA flags to augment operation preparation,
@@ -269,7 +350,7 @@ enum dma_slave_buswidth {
  * struct, if applicable.
  */
 struct dma_slave_config {
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        dma_addr_t src_addr;
        dma_addr_t dst_addr;
        enum dma_slave_buswidth src_addr_width;
@@ -433,6 +514,7 @@ struct dma_tx_state {
  * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
  *     The function takes a buffer of size buf_len. The callback function will
  *     be called after period_len bytes have been transferred.
+ * @device_prep_interleaved_dma: prepare a transfer described by a generic
+ *     interleaved template
  * @device_control: manipulate all pending operations on a channel, returns
  *     zero or error code
  * @device_tx_status: poll for transaction completion, the optional
@@ -492,11 +574,14 @@ struct dma_device {
 
        struct dma_async_tx_descriptor *(*device_prep_slave_sg)(
                struct dma_chan *chan, struct scatterlist *sgl,
-               unsigned int sg_len, enum dma_data_direction direction,
+               unsigned int sg_len, enum dma_transfer_direction direction,
                unsigned long flags);
        struct dma_async_tx_descriptor *(*device_prep_dma_cyclic)(
                struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len,
-               size_t period_len, enum dma_data_direction direction);
+               size_t period_len, enum dma_transfer_direction direction);
+       struct dma_async_tx_descriptor *(*device_prep_interleaved_dma)(
+               struct dma_chan *chan, struct dma_interleaved_template *xt,
+               unsigned long flags);
        int (*device_control)(struct dma_chan *chan, enum dma_ctrl_cmd cmd,
                unsigned long arg);
 
@@ -522,7 +607,7 @@ static inline int dmaengine_slave_config(struct dma_chan *chan,
 
 static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single(
        struct dma_chan *chan, void *buf, size_t len,
-       enum dma_data_direction dir, unsigned long flags)
+       enum dma_transfer_direction dir, unsigned long flags)
 {
        struct scatterlist sg;
        sg_init_one(&sg, buf, len);
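For illustration, a hedged sketch of how a client driver might use the new interleaved API for a strided read; the channel and the src_phys/dst_phys bus addresses are assumptions, and whether the template may be freed immediately after prep can depend on the driver:

	static struct dma_async_tx_descriptor *
	prep_strided_read(struct dma_chan *chan, dma_addr_t src_phys,
			  dma_addr_t dst_phys)
	{
		struct dma_interleaved_template *xt;
		struct dma_async_tx_descriptor *tx;

		/* One frame = a 64-byte chunk plus a 192-byte ICG on the
		 * source side; 480 frames land contiguously at the
		 * destination. */
		xt = kzalloc(sizeof(*xt) + sizeof(struct data_chunk),
			     GFP_KERNEL);
		if (!xt)
			return NULL;
		xt->src_start	= src_phys;
		xt->dst_start	= dst_phys;
		xt->dir		= DMA_MEM_TO_MEM;
		xt->src_inc	= true;
		xt->dst_inc	= true;
		xt->src_sgl	= true;		/* icg applies to the source */
		xt->dst_sgl	= false;	/* destination is contiguous */
		xt->numf	= 480;
		xt->frame_size	= 1;
		xt->sgl[0].size	= 64;
		xt->sgl[0].icg	= 192;

		tx = chan->device->device_prep_interleaved_dma(chan, xt,
							       DMA_CTRL_ACK);
		kfree(xt);
		return tx;
	}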
index 4bfe0a2..f2c64f9 100644 (file)
@@ -127,7 +127,7 @@ struct dw_cyclic_desc {
 
 struct dw_cyclic_desc *dw_dma_cyclic_prep(struct dma_chan *chan,
                dma_addr_t buf_addr, size_t buf_len, size_t period_len,
-               enum dma_data_direction direction);
+               enum dma_transfer_direction direction);
 void dw_dma_cyclic_free(struct dma_chan *chan);
 int dw_dma_cyclic_start(struct dma_chan *chan);
 void dw_dma_cyclic_stop(struct dma_chan *chan);
index 183a6af..bfc014c 100644 (file)
@@ -293,6 +293,9 @@ static inline bool key_is_instantiated(const struct key *key)
        (rcu_dereference_protected((KEY)->payload.rcudata,              \
                                   rwsem_is_locked(&((struct key *)(KEY))->sem)))
 
+#define rcu_assign_keypointer(KEY, PAYLOAD)                            \
+       (rcu_assign_pointer((KEY)->payload.rcudata, PAYLOAD))
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table key_sysctls[];
 #endif
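For illustration, a sketch of the writer side this macro serves; updates are assumed to be serialised by the key's semaphore, matching the rwsem_is_locked() check in rcu_dereference_key() above:

	static void publish_payload(struct key *key, void *new_payload)
	{
		down_write(&key->sem);	/* excludes concurrent updaters */
		rcu_assign_keypointer(key, new_payload);
		up_write(&key->sem);
	}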
index abc0120..9c07dce 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <linux/bug.h>
 #include <linux/atomic.h>
+#include <linux/kernel.h>
 
 struct kref {
        atomic_t refcount;
diff --git a/include/linux/mtd/gpmi-nand.h b/include/linux/mtd/gpmi-nand.h
new file mode 100644 (file)
index 0000000..69b6dbf
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2011 Freescale Semiconductor, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef __MACH_MXS_GPMI_NAND_H__
+#define __MACH_MXS_GPMI_NAND_H__
+
+/* The size of the resources is fixed. */
+#define GPMI_NAND_RES_SIZE     6
+
+/* Resource names for the GPMI NAND driver. */
+#define GPMI_NAND_GPMI_REGS_ADDR_RES_NAME  "GPMI NAND GPMI Registers"
+#define GPMI_NAND_GPMI_INTERRUPT_RES_NAME  "GPMI NAND GPMI Interrupt"
+#define GPMI_NAND_BCH_REGS_ADDR_RES_NAME   "GPMI NAND BCH Registers"
+#define GPMI_NAND_BCH_INTERRUPT_RES_NAME   "GPMI NAND BCH Interrupt"
+#define GPMI_NAND_DMA_CHANNELS_RES_NAME    "GPMI NAND DMA Channels"
+#define GPMI_NAND_DMA_INTERRUPT_RES_NAME   "GPMI NAND DMA Interrupt"
+
+/**
+ * struct gpmi_nand_platform_data - GPMI NAND driver platform data.
+ *
+ * This structure communicates platform-specific information to the GPMI NAND
+ * driver that can't be expressed as resources.
+ *
+ * @platform_init:           A pointer to a function the driver will call to
+ *                           initialize the platform (e.g., set up the pin mux).
+ * @min_prop_delay_in_ns:    Minimum propagation delay of GPMI signals to and
+ *                           from the NAND Flash device, in nanoseconds.
+ * @max_prop_delay_in_ns:    Maximum propagation delay of GPMI signals to and
+ *                           from the NAND Flash device, in nanoseconds.
+ * @max_chip_count:          The maximum number of chips for which the driver
+ *                           should configure the hardware. This value most
+ *                           likely reflects the number of pins that are
+ *                           connected to a NAND Flash device. If this is
+ *                           greater than the SoC hardware can support, the
+ *                           driver will print a message and fail to initialize.
+ * @partitions:              An optional pointer to an array of partition
+ *                           descriptions.
+ * @partition_count:         The number of elements in the partitions array.
+ */
+struct gpmi_nand_platform_data {
+       /* SoC hardware information. */
+       int             (*platform_init)(void);
+
+       /* NAND Flash information. */
+       unsigned int    min_prop_delay_in_ns;
+       unsigned int    max_prop_delay_in_ns;
+       unsigned int    max_chip_count;
+
+       /* Medium information. */
+       struct          mtd_partition *partitions;
+       unsigned        partition_count;
+};
+#endif
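For illustration, a hedged sketch of a board file filling in this platform data; the delay values and the pinmux hook are made-up examples:

	static int board_gpmi_pinmux_init(void)
	{
		/* configure the GPMI pin mux here */
		return 0;
	}

	static struct gpmi_nand_platform_data board_gpmi_pdata = {
		.platform_init		= board_gpmi_pinmux_init,
		.min_prop_delay_in_ns	= 5,
		.max_prop_delay_in_ns	= 9,
		.max_chip_count		= 1,
	};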
index a27e56c..c2f1f6a 100644 (file)
 
 #include <linux/compiler.h>            /* For unlikely.  */
 #include <linux/sched.h>               /* For struct task_struct.  */
+#include <linux/err.h>                 /* for IS_ERR_VALUE */
 
 
 extern long arch_ptrace(struct task_struct *child, long request,
@@ -266,6 +267,15 @@ static inline void ptrace_release_task(struct task_struct *task)
 #define force_successful_syscall_return() do { } while (0)
 #endif
 
+#ifndef is_syscall_success
+/*
+ * On most systems we can tell if a syscall is a success based on whether the
+ * retval is an error value.  Some systems, like ia64 and powerpc, have
+ * different indicators of success/failure and must define their own.
+ */
+#define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs))))
+#endif
+
 /*
  * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__.
  *
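For illustration, the kind of arch-independent wrapper this helper enables (a sketch, not necessarily the exact code added elsewhere in this series): a syscall-exit hook can derive both the success flag and the return value from pt_regs alone:

	static inline void audit_syscall_exit(struct pt_regs *regs)
	{
		if (unlikely(current->audit_context)) {
			int success = is_syscall_success(regs);
			long return_code = regs_return_value(regs);

			__audit_syscall_exit(success, return_code);
		}
	}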
index cb2dd11..8cd7fe5 100644 (file)
@@ -30,7 +30,7 @@ struct sh_desc {
        struct sh_dmae_regs hw;
        struct list_head node;
        struct dma_async_tx_descriptor async_tx;
-       enum dma_data_direction direction;
+       enum dma_transfer_direction direction;
        dma_cookie_t cookie;
        size_t partial;
        int chunks;
@@ -48,6 +48,7 @@ struct sh_dmae_channel {
        unsigned int    offset;
        unsigned int    dmars;
        unsigned int    dmars_bit;
+       unsigned int    chclr_offset;
 };
 
 struct sh_dmae_pdata {
@@ -68,6 +69,7 @@ struct sh_dmae_pdata {
        unsigned int dmaor_is_32bit:1;
        unsigned int needs_tend_set:1;
        unsigned int no_dmars:1;
+       unsigned int chclr_present:1;
 };
 
 /* DMA register */
diff --git a/include/linux/sirfsoc_dma.h b/include/linux/sirfsoc_dma.h
new file mode 100644 (file)
index 0000000..29d9593
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef _SIRFSOC_DMA_H_
+#define _SIRFSOC_DMA_H_
+
+bool sirfsoc_dma_filter_id(struct dma_chan *chan, void *chan_id);
+
+#endif
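For illustration, a hedged sketch of requesting a channel by hardware id through this filter; the id value is an assumption:

	static struct dma_chan *get_sirf_chan(void)
	{
		unsigned long chan_id = 12;	/* illustrative hardware id */
		dma_cap_mask_t mask;

		dma_cap_zero(mask);
		dma_cap_set(DMA_SLAVE, mask);
		return dma_request_channel(mask, sirfsoc_dma_filter_id,
					   (void *)chan_id);
	}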
index ecdaeb9..5cf6850 100644 (file)
@@ -312,7 +312,6 @@ struct tty_driver {
         */
        struct tty_struct **ttys;
        struct ktermios **termios;
-       struct ktermios **termios_locked;
        void *driver_state;
 
        /*
index b31702a..84f3001 100644 (file)
@@ -16,6 +16,8 @@ struct btrfs_delayed_ref_node;
 struct btrfs_delayed_tree_ref;
 struct btrfs_delayed_data_ref;
 struct btrfs_delayed_ref_head;
+struct btrfs_block_group_cache;
+struct btrfs_free_cluster;
 struct map_lookup;
 struct extent_buffer;
 
@@ -44,6 +46,17 @@ struct extent_buffer;
        obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||                \
              (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
 
+#define BTRFS_GROUP_FLAGS      \
+       { BTRFS_BLOCK_GROUP_DATA,       "DATA"}, \
+       { BTRFS_BLOCK_GROUP_SYSTEM,     "SYSTEM"}, \
+       { BTRFS_BLOCK_GROUP_METADATA,   "METADATA"}, \
+       { BTRFS_BLOCK_GROUP_RAID0,      "RAID0"}, \
+       { BTRFS_BLOCK_GROUP_RAID1,      "RAID1"}, \
+       { BTRFS_BLOCK_GROUP_DUP,        "DUP"}, \
+       { BTRFS_BLOCK_GROUP_RAID10,     "RAID10"}
+
+#define BTRFS_UUID_SIZE 16
+
 TRACE_EVENT(btrfs_transaction_commit,
 
        TP_PROTO(struct btrfs_root *root),
@@ -621,6 +634,34 @@ TRACE_EVENT(btrfs_cow_block,
                  __entry->cow_level)
 );
 
+TRACE_EVENT(btrfs_space_reservation,
+
+       TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val,
+                u64 bytes, int reserve),
+
+       TP_ARGS(fs_info, type, val, bytes, reserve),
+
+       TP_STRUCT__entry(
+               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
+               __string(       type,   type                    )
+               __field(        u64,    val                     )
+               __field(        u64,    bytes                   )
+               __field(        int,    reserve                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+               __assign_str(type, type);
+               __entry->val            = val;
+               __entry->bytes          = bytes;
+               __entry->reserve        = reserve;
+       ),
+
+       TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type),
+                 __entry->val, __entry->reserve ? "reserve" : "release",
+                 __entry->bytes)
+);
+
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
        TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
@@ -659,6 +700,168 @@ DEFINE_EVENT(btrfs__reserved_extent,  btrfs_reserved_extent_free,
        TP_ARGS(root, start, len)
 );
 
+TRACE_EVENT(find_free_extent,
+
+       TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size,
+                u64 data),
+
+       TP_ARGS(root, num_bytes, empty_size, data),
+
+       TP_STRUCT__entry(
+               __field(        u64,    root_objectid           )
+               __field(        u64,    num_bytes               )
+               __field(        u64,    empty_size              )
+               __field(        u64,    data                    )
+       ),
+
+       TP_fast_assign(
+               __entry->root_objectid  = root->root_key.objectid;
+               __entry->num_bytes      = num_bytes;
+               __entry->empty_size     = empty_size;
+               __entry->data           = data;
+       ),
+
+       TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
+                 "flags = %Lu(%s)", show_root_type(__entry->root_objectid),
+                 __entry->num_bytes, __entry->empty_size, __entry->data,
+                 __print_flags((unsigned long)__entry->data, "|",
+                                BTRFS_GROUP_FLAGS))
+);
+
+DECLARE_EVENT_CLASS(btrfs__reserve_extent,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len),
+
+       TP_STRUCT__entry(
+               __field(        u64,    root_objectid           )
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    len                     )
+       ),
+
+       TP_fast_assign(
+               __entry->root_objectid  = root->root_key.objectid;
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = start;
+               __entry->len            = len;
+       ),
+
+       TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
+                 "start = %Lu, len = %Lu",
+                 show_root_type(__entry->root_objectid), __entry->bg_objectid,
+                 __entry->flags, __print_flags((unsigned long)__entry->flags,
+                                               "|", BTRFS_GROUP_FLAGS),
+                 __entry->start, __entry->len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
+
+       TP_PROTO(struct btrfs_root *root,
+                struct btrfs_block_group_cache *block_group, u64 start,
+                u64 len),
+
+       TP_ARGS(root, block_group, start, len)
+);
+
+TRACE_EVENT(btrfs_find_cluster,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start,
+                u64 bytes, u64 empty_size, u64 min_bytes),
+
+       TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    bytes                   )
+               __field(        u64,    empty_size              )
+               __field(        u64,    min_bytes               )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = start;
+               __entry->bytes          = bytes;
+               __entry->empty_size     = empty_size;
+               __entry->min_bytes      = min_bytes;
+       ),
+
+       TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
+                 " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid,
+                 __entry->flags,
+                 __print_flags((unsigned long)__entry->flags, "|",
+                               BTRFS_GROUP_FLAGS), __entry->start,
+                 __entry->bytes, __entry->empty_size,  __entry->min_bytes)
+);
+
+TRACE_EVENT(btrfs_failed_cluster_setup,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group),
+
+       TP_ARGS(block_group),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+       ),
+
+       TP_printk("block_group = %Lu", __entry->bg_objectid)
+);
+
+TRACE_EVENT(btrfs_setup_cluster,
+
+       TP_PROTO(struct btrfs_block_group_cache *block_group,
+                struct btrfs_free_cluster *cluster, u64 size, int bitmap),
+
+       TP_ARGS(block_group, cluster, size, bitmap),
+
+       TP_STRUCT__entry(
+               __field(        u64,    bg_objectid             )
+               __field(        u64,    flags                   )
+               __field(        u64,    start                   )
+               __field(        u64,    max_size                )
+               __field(        u64,    size                    )
+               __field(        int,    bitmap                  )
+       ),
+
+       TP_fast_assign(
+               __entry->bg_objectid    = block_group->key.objectid;
+               __entry->flags          = block_group->flags;
+               __entry->start          = cluster->window_start;
+               __entry->max_size       = cluster->max_size;
+               __entry->size           = size;
+               __entry->bitmap         = bitmap;
+       ),
+
+       TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
+                 "size = %Lu, max_size = %Lu, bitmap = %d",
+                 __entry->bg_objectid,
+                 __entry->flags,
+                 __print_flags((unsigned long)__entry->flags, "|",
+                               BTRFS_GROUP_FLAGS), __entry->start,
+                 __entry->size, __entry->max_size, __entry->bitmap)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
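For illustration, a hedged sketch of how fs code might emit the new reservation tracepoint; the "space_info" label and the flags/num_bytes arguments are assumptions, not taken from this patch:

	/* reserve = 1 when taking bytes from the pool, 0 when releasing */
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 1);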
index 6ac2236..3f42cd6 100644 (file)
@@ -355,7 +355,7 @@ config AUDIT
 
 config AUDITSYSCALL
        bool "Enable system-call auditing support"
-       depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH)
+       depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || ARM)
        default y if SECURITY_SELINUX
        help
          Enable low-overhead system-call auditing infrastructure that
@@ -372,6 +372,20 @@ config AUDIT_TREE
        depends on AUDITSYSCALL
        select FSNOTIFY
 
+config AUDIT_LOGINUID_IMMUTABLE
+       bool "Make audit loginuid immutable"
+       depends on AUDIT
+       help
+         This option selects whether a task setting its loginuid requires
+         CAP_AUDIT_CONTROL, or whether the task needs no special permission
+         but may only set its loginuid if it was never previously set.  On
+         systems which use systemd or a similar central process to restart
+         login services this should be set to true.  On older systems in
+         which an admin would typically have to directly stop and start
+         processes this should be set to false.  Setting this to true allows
+         one to drop potentially dangerous capabilities from the login tasks,
+         but may not be backwards compatible with older init systems.
+
 source "kernel/irq/Kconfig"
 
 menu "RCU Subsystem"
index 57e3f51..bb0eb5b 100644 (file)
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
        }
 
        *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
-       audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+       audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
                         pid, uid, auid, ses);
        if (sid) {
                rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1423,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
        char *p, *pathname;
 
        if (prefix)
-               audit_log_format(ab, " %s", prefix);
+               audit_log_format(ab, "%s", prefix);
 
        /* We will allow 11 spaces for ' (deleted)' to be appended */
        pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
index 91e7071..8167668 100644 (file)
@@ -36,12 +36,8 @@ enum audit_state {
        AUDIT_DISABLED,         /* Do not create per-task audit_context.
                                 * No syscall-specific audit records can
                                 * be generated. */
-       AUDIT_SETUP_CONTEXT,    /* Create the per-task audit_context,
-                                * but don't necessarily fill it in at
-                                * syscall entry time (i.e., filter
-                                * instead). */
        AUDIT_BUILD_CONTEXT,    /* Create the per-task audit_context,
-                                * and always fill it in at syscall
+                                * and fill it in at syscall
                                 * entry time.  This makes a full
                                 * syscall record available if some
                                 * other part of the kernel decides it
index f8277c8..a6c3f1a 100644 (file)
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
        switch(listnr) {
        default:
                goto exit_err;
-       case AUDIT_FILTER_USER:
-       case AUDIT_FILTER_TYPE:
 #ifdef CONFIG_AUDITSYSCALL
        case AUDIT_FILTER_ENTRY:
+               if (rule->action == AUDIT_ALWAYS)
+                       goto exit_err;
        case AUDIT_FILTER_EXIT:
        case AUDIT_FILTER_TASK:
 #endif
+       case AUDIT_FILTER_USER:
+       case AUDIT_FILTER_TYPE:
                ;
        }
        if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
                                goto exit_free;
                        break;
                case AUDIT_FILETYPE:
-                       if ((f->val & ~S_IFMT) > S_IFMT)
+                       if (f->val & ~S_IFMT)
                                goto exit_free;
                        break;
                case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                case AUDIT_ARG1:
                case AUDIT_ARG2:
                case AUDIT_ARG3:
+               case AUDIT_OBJ_UID:
+               case AUDIT_OBJ_GID:
                        break;
                case AUDIT_ARCH:
                        entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                goto exit_free;
                        break;
                case AUDIT_FILTERKEY:
-                       err = -EINVAL;
                        if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                goto exit_free;
                        break;
                case AUDIT_FILETYPE:
-                       if ((f->val & ~S_IFMT) > S_IFMT)
+                       if (f->val & ~S_IFMT)
+                               goto exit_free;
+                       break;
+               case AUDIT_FIELD_COMPARE:
+                       if (f->val > AUDIT_MAX_FIELD_COMPARE)
                                goto exit_free;
                        break;
                default:
index e7fe2b0..caaea6e 100644 (file)
 
 #include "audit.h"
 
+/* flags stating the success of a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES    20
+ * for saving names from getname().  If we get more names we will allocate
+ * them dynamically and add each one to the list anchored by names_list. */
+#define AUDIT_NAMES    5
 
 /* Indicates that audit should log the full pathname. */
 #define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
  *
  * Further, in fs/namei.c:path_lookup() we store the inode and device. */
 struct audit_names {
+       struct list_head list;          /* audit_context->names_list */
        const char      *name;
-       int             name_len;       /* number of name's characters to log */
-       unsigned        name_put;       /* call __putname() for this name */
        unsigned long   ino;
        dev_t           dev;
        umode_t         mode;
@@ -113,6 +118,14 @@ struct audit_names {
        u32             osid;
        struct audit_cap_data fcap;
        unsigned int    fcap_ver;
+       int             name_len;       /* number of name's characters to log */
+       bool            name_put;       /* call __putname() for this name */
+       /*
+        * This audit_names entry was allocated dynamically rather than
+        * taken from the preallocated array in the audit context, so it
+        * must be freed on syscall exit.
+        */
+       bool            should_free;
 };
 
 struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
        long                return_code;/* syscall return code */
        u64                 prio;
        int                 return_valid; /* return code is valid */
-       int                 name_count;
-       struct audit_names  names[AUDIT_NAMES];
+       /*
+        * The names_list is the list of all audit_names collected during this
+        * syscall.  The first AUDIT_NAMES entries in the names_list will
+        * actually be from the preallocated_names array for performance
+        * reasons.  Except during allocation they should never be referenced
+        * through the preallocated_names array and should only be found/used
+        * by walking the names_list.
+        */
+       struct audit_names  preallocated_names[AUDIT_NAMES];
+       int                 name_count; /* total records in names_list */
+       struct list_head    names_list; /* anchor for struct audit_names->list */
        char *              filterkey;  /* key for rule that triggered record */
        struct path         pwd;
        struct audit_context *previous; /* For nested syscalls */
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
        }
 }
 
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
 {
-       unsigned index = which & ~S_IFMT;
-       umode_t mode = which & S_IFMT;
+       struct audit_names *n;
+       umode_t mode = (umode_t)val;
 
        if (unlikely(!ctx))
                return 0;
 
-       if (index >= ctx->name_count)
-               return 0;
-       if (ctx->names[index].ino == -1)
-               return 0;
-       if ((ctx->names[index].mode ^ mode) & S_IFMT)
-               return 0;
-       return 1;
+       list_for_each_entry(n, &ctx->names_list, list) {
+               if ((n->ino != -1) &&
+                   ((n->mode & S_IFMT) == mode))
+                       return 1;
+       }
+
+       return 0;
 }
 
 /*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
        return 0;
 }
 
+static int audit_compare_id(uid_t uid1,
+                           struct audit_names *name,
+                           unsigned long name_offset,
+                           struct audit_field *f,
+                           struct audit_context *ctx)
+{
+       struct audit_names *n;
+       unsigned long addr;
+       uid_t uid2;
+       int rc;
+
+       BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+
+       if (name) {
+               addr = (unsigned long)name;
+               addr += name_offset;
+
+               uid2 = *(uid_t *)addr;
+               rc = audit_comparator(uid1, f->op, uid2);
+               if (rc)
+                       return rc;
+       }
+
+       if (ctx) {
+               list_for_each_entry(n, &ctx->names_list, list) {
+                       addr = (unsigned long)n;
+                       addr += name_offset;
+
+                       uid2 = *(uid_t *)addr;
+
+                       rc = audit_comparator(uid1, f->op, uid2);
+                       if (rc)
+                               return rc;
+               }
+       }
+       return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+                              const struct cred *cred,
+                              struct audit_field *f,
+                              struct audit_context *ctx,
+                              struct audit_names *name)
+{
+       switch (f->val) {
+       /* process to file object comparisons */
+       case AUDIT_COMPARE_UID_TO_OBJ_UID:
+               return audit_compare_id(cred->uid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_GID_TO_OBJ_GID:
+               return audit_compare_id(cred->gid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+               return audit_compare_id(cred->euid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+               return audit_compare_id(cred->egid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+               return audit_compare_id(tsk->loginuid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+               return audit_compare_id(cred->suid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+               return audit_compare_id(cred->sgid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+               return audit_compare_id(cred->fsuid,
+                                       name, offsetof(struct audit_names, uid),
+                                       f, ctx);
+       case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+               return audit_compare_id(cred->fsgid,
+                                       name, offsetof(struct audit_names, gid),
+                                       f, ctx);
+       /* uid comparisons */
+       case AUDIT_COMPARE_UID_TO_AUID:
+               return audit_comparator(cred->uid, f->op, tsk->loginuid);
+       case AUDIT_COMPARE_UID_TO_EUID:
+               return audit_comparator(cred->uid, f->op, cred->euid);
+       case AUDIT_COMPARE_UID_TO_SUID:
+               return audit_comparator(cred->uid, f->op, cred->suid);
+       case AUDIT_COMPARE_UID_TO_FSUID:
+               return audit_comparator(cred->uid, f->op, cred->fsuid);
+       /* auid comparisons */
+       case AUDIT_COMPARE_AUID_TO_EUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->euid);
+       case AUDIT_COMPARE_AUID_TO_SUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->suid);
+       case AUDIT_COMPARE_AUID_TO_FSUID:
+               return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+       /* euid comparisons */
+       case AUDIT_COMPARE_EUID_TO_SUID:
+               return audit_comparator(cred->euid, f->op, cred->suid);
+       case AUDIT_COMPARE_EUID_TO_FSUID:
+               return audit_comparator(cred->euid, f->op, cred->fsuid);
+       /* suid comparisons */
+       case AUDIT_COMPARE_SUID_TO_FSUID:
+               return audit_comparator(cred->suid, f->op, cred->fsuid);
+       /* gid comparisons */
+       case AUDIT_COMPARE_GID_TO_EGID:
+               return audit_comparator(cred->gid, f->op, cred->egid);
+       case AUDIT_COMPARE_GID_TO_SGID:
+               return audit_comparator(cred->gid, f->op, cred->sgid);
+       case AUDIT_COMPARE_GID_TO_FSGID:
+               return audit_comparator(cred->gid, f->op, cred->fsgid);
+       /* egid comparisons */
+       case AUDIT_COMPARE_EGID_TO_SGID:
+               return audit_comparator(cred->egid, f->op, cred->sgid);
+       case AUDIT_COMPARE_EGID_TO_FSGID:
+               return audit_comparator(cred->egid, f->op, cred->fsgid);
+       /* sgid comparison */
+       case AUDIT_COMPARE_SGID_TO_FSGID:
+               return audit_comparator(cred->sgid, f->op, cred->fsgid);
+       default:
+               WARN(1, "Missing AUDIT_COMPARE define.  Report as a bug\n");
+               return 0;
+       }
+       return 0;
+}
+
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule.  Return 1 on match, 0
  * otherwise.
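For illustration, the rule-field shape that reaches audit_field_compare(); this is a sketch, assuming the usual Audit_equal comparator from the audit operator enum:

	static struct audit_field example_field = {
		.type	= AUDIT_FIELD_COMPARE,
		.op	= Audit_equal,
		.val	= AUDIT_COMPARE_EUID_TO_OBJ_UID, /* euid vs. owner uid */
	};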
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                              bool task_creation)
 {
        const struct cred *cred;
-       int i, j, need_sid = 1;
+       int i, need_sid = 1;
        u32 sid;
 
        cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
 
        for (i = 0; i < rule->field_count; i++) {
                struct audit_field *f = &rule->fields[i];
+               struct audit_names *n;
                int result = 0;
 
                switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                        }
                        break;
                case AUDIT_DEVMAJOR:
-                       if (name)
-                               result = audit_comparator(MAJOR(name->dev),
-                                                         f->op, f->val);
-                       else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(MAJOR(ctx->names[j].dev),  f->op, f->val)) {
+                       if (name) {
+                               if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+                                   audit_comparator(MAJOR(name->rdev), f->op, f->val))
+                                       ++result;
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+                                           audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
                        }
                        break;
                case AUDIT_DEVMINOR:
-                       if (name)
-                               result = audit_comparator(MINOR(name->dev),
-                                                         f->op, f->val);
-                       else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+                       if (name) {
+                               if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+                                   audit_comparator(MINOR(name->rdev), f->op, f->val))
+                                       ++result;
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+                                           audit_comparator(MINOR(n->rdev), f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
                        if (name)
                                result = (name->ino == f->val);
                        else if (ctx) {
-                               for (j = 0; j < ctx->name_count; j++) {
-                                       if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->ino, f->op, f->val)) {
+                                               ++result;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case AUDIT_OBJ_UID:
+                       if (name) {
+                               result = audit_comparator(name->uid, f->op, f->val);
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->uid, f->op, f->val)) {
+                                               ++result;
+                                               break;
+                                       }
+                               }
+                       }
+                       break;
+               case AUDIT_OBJ_GID:
+                       if (name) {
+                               result = audit_comparator(name->gid, f->op, f->val);
+                       } else if (ctx) {
+                               list_for_each_entry(n, &ctx->names_list, list) {
+                                       if (audit_comparator(n->gid, f->op, f->val)) {
                                                ++result;
                                                break;
                                        }
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                                                   name->osid, f->type, f->op,
                                                   f->lsm_rule, ctx);
                                } else if (ctx) {
-                                       for (j = 0; j < ctx->name_count; j++) {
-                                               if (security_audit_rule_match(
-                                                     ctx->names[j].osid,
-                                                     f->type, f->op,
-                                                     f->lsm_rule, ctx)) {
+                                       list_for_each_entry(n, &ctx->names_list, list) {
+                                               if (security_audit_rule_match(n->osid, f->type,
+                                                                             f->op, f->lsm_rule,
+                                                                             ctx)) {
                                                        ++result;
                                                        break;
                                                }
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
                case AUDIT_FILETYPE:
                        result = audit_match_filetype(ctx, f->val);
                        break;
+               case AUDIT_FIELD_COMPARE:
+                       result = audit_field_compare(tsk, cred, f, ctx, name);
+                       break;
                }
-
                if (!result)
                        return 0;
        }
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
        return AUDIT_BUILD_CONTEXT;
 }
 
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_names entry, check the inode hash table to see if any
+ * rule matches.  Called holding the RCU read lock to protect the use of
+ * audit_inode_hash.
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+                                  struct audit_names *n,
+                                  struct audit_context *ctx)
+{
+       int word, bit;
+       int h = audit_hash_ino((u32)n->ino);
+       struct list_head *list = &audit_inode_hash[h];
+       struct audit_entry *e;
+       enum audit_state state;
+
+       word = AUDIT_WORD(ctx->major);
+       bit  = AUDIT_BIT(ctx->major);
+
+       if (list_empty(list))
+               return 0;
+
+       list_for_each_entry_rcu(e, list, list) {
+               if ((e->rule.mask[word] & bit) == bit &&
+                   audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+                       ctx->current_state = state;
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
  * collected during syscall processing.  We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
  * Regarding audit_state, same rules apply as for audit_filter_syscall().
  */
 void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 {
-       int i;
-       struct audit_entry *e;
-       enum audit_state state;
+       struct audit_names *n;
 
        if (audit_pid && tsk->tgid == audit_pid)
                return;
 
        rcu_read_lock();
-       for (i = 0; i < ctx->name_count; i++) {
-               int word = AUDIT_WORD(ctx->major);
-               int bit  = AUDIT_BIT(ctx->major);
-               struct audit_names *n = &ctx->names[i];
-               int h = audit_hash_ino((u32)n->ino);
-               struct list_head *list = &audit_inode_hash[h];
-
-               if (list_empty(list))
-                       continue;
 
-               list_for_each_entry_rcu(e, list, list) {
-                       if ((e->rule.mask[word] & bit) == bit &&
-                           audit_filter_rules(tsk, &e->rule, ctx, n,
-                                              &state, false)) {
-                               rcu_read_unlock();
-                               ctx->current_state = state;
-                               return;
-                       }
-               }
+       list_for_each_entry(n, &ctx->names_list, list) {
+               if (audit_filter_inode_name(tsk, n, ctx))
+                       break;
        }
        rcu_read_unlock();
 }
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 {
        struct audit_context *context = tsk->audit_context;
 
-       if (likely(!context))
+       if (!context)
                return NULL;
        context->return_valid = return_valid;
 
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 
 static inline void audit_free_names(struct audit_context *context)
 {
-       int i;
+       struct audit_names *n, *next;
 
 #if AUDIT_DEBUG == 2
        if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
                       context->serial, context->major, context->in_syscall,
                       context->name_count, context->put_count,
                       context->ino_count);
-               for (i = 0; i < context->name_count; i++) {
-                       printk(KERN_ERR "names[%d] = %p = %s\n", i,
-                              context->names[i].name,
-                              context->names[i].name ?: "(null)");
+               list_for_each_entry(n, &context->names_list, list) {
+                       printk(KERN_ERR "name = %p = %s\n",
+                              n->name, n->name ?: "(null)");
                }
                dump_stack();
                return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
        context->ino_count  = 0;
 #endif
 
-       for (i = 0; i < context->name_count; i++) {
-               if (context->names[i].name && context->names[i].name_put)
-                       __putname(context->names[i].name);
+       list_for_each_entry_safe(n, next, &context->names_list, list) {
+               list_del(&n->list);
+               if (n->name && n->name_put)
+                       __putname(n->name);
+               if (n->should_free)
+                       kfree(n);
        }
        context->name_count = 0;
        path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
                return NULL;
        audit_zero_context(context, state);
        INIT_LIST_HEAD(&context->killed_trees);
+       INIT_LIST_HEAD(&context->names_list);
        return context;
 }
 
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
                return 0; /* Return if not auditing. */
 
        state = audit_filter_task(tsk, &key);
-       if (likely(state == AUDIT_DISABLED))
+       if (state == AUDIT_DISABLED)
                return 0;
 
        if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
                while (vma) {
                        if ((vma->vm_flags & VM_EXECUTABLE) &&
                            vma->vm_file) {
-                               audit_log_d_path(ab, "exe=",
+                               audit_log_d_path(ab, " exe=",
                                                 &vma->vm_file->f_path);
                                break;
                        }
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
                                  struct audit_buffer **ab,
                                  struct audit_aux_data_execve *axi)
 {
-       int i;
-       size_t len, len_sent = 0;
+       int i, len;
+       size_t len_sent = 0;
        const char __user *p;
        char *buf;
 
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
        audit_log_end(ab);
 }
 
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+                          int record_num, int *call_panic)
+{
+       struct audit_buffer *ab;
+       ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+       if (!ab)
+               return; /* audit_panic has been called */
+
+       audit_log_format(ab, "item=%d", record_num);
+
+       if (n->name) {
+               switch (n->name_len) {
+               case AUDIT_NAME_FULL:
+                       /* log the full path */
+                       audit_log_format(ab, " name=");
+                       audit_log_untrustedstring(ab, n->name);
+                       break;
+               case 0:
+                       /* name was specified as a relative path and the
+                        * directory component is the cwd */
+                       audit_log_d_path(ab, " name=", &context->pwd);
+                       break;
+               default:
+                       /* log the name's directory component */
+                       audit_log_format(ab, " name=");
+                       audit_log_n_untrustedstring(ab, n->name,
+                                                   n->name_len);
+               }
+       } else
+               audit_log_format(ab, " name=(null)");
+
+       if (n->ino != (unsigned long)-1) {
+               audit_log_format(ab, " inode=%lu"
+                                " dev=%02x:%02x mode=%#ho"
+                                " ouid=%u ogid=%u rdev=%02x:%02x",
+                                n->ino,
+                                MAJOR(n->dev),
+                                MINOR(n->dev),
+                                n->mode,
+                                n->uid,
+                                n->gid,
+                                MAJOR(n->rdev),
+                                MINOR(n->rdev));
+       }
+       if (n->osid != 0) {
+               char *ctx = NULL;
+               u32 len;
+               if (security_secid_to_secctx(
+                       n->osid, &ctx, &len)) {
+                       audit_log_format(ab, " osid=%u", n->osid);
+                       *call_panic = 2;
+               } else {
+                       audit_log_format(ab, " obj=%s", ctx);
+                       security_release_secctx(ctx, len);
+               }
+       }
+
+       audit_log_fcaps(ab, n);
+
+       audit_log_end(ab);
+}
+
 static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
 {
        const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
        struct audit_buffer *ab;
        struct audit_aux_data *aux;
        const char *tty;
+       struct audit_names *n;
 
        /* tsk == current */
        context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
        if (context->pwd.dentry && context->pwd.mnt) {
                ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
                if (ab) {
-                       audit_log_d_path(ab, "cwd=", &context->pwd);
+                       audit_log_d_path(ab, " cwd=", &context->pwd);
                        audit_log_end(ab);
                }
        }
-       for (i = 0; i < context->name_count; i++) {
-               struct audit_names *n = &context->names[i];
 
-               ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
-               if (!ab)
-                       continue; /* audit_panic has been called */
-
-               audit_log_format(ab, "item=%d", i);
-
-               if (n->name) {
-                       switch(n->name_len) {
-                       case AUDIT_NAME_FULL:
-                               /* log the full path */
-                               audit_log_format(ab, " name=");
-                               audit_log_untrustedstring(ab, n->name);
-                               break;
-                       case 0:
-                               /* name was specified as a relative path and the
-                                * directory component is the cwd */
-                               audit_log_d_path(ab, "name=", &context->pwd);
-                               break;
-                       default:
-                               /* log the name's directory component */
-                               audit_log_format(ab, " name=");
-                               audit_log_n_untrustedstring(ab, n->name,
-                                                           n->name_len);
-                       }
-               } else
-                       audit_log_format(ab, " name=(null)");
-
-               if (n->ino != (unsigned long)-1) {
-                       audit_log_format(ab, " inode=%lu"
-                                        " dev=%02x:%02x mode=%#ho"
-                                        " ouid=%u ogid=%u rdev=%02x:%02x",
-                                        n->ino,
-                                        MAJOR(n->dev),
-                                        MINOR(n->dev),
-                                        n->mode,
-                                        n->uid,
-                                        n->gid,
-                                        MAJOR(n->rdev),
-                                        MINOR(n->rdev));
-               }
-               if (n->osid != 0) {
-                       char *ctx = NULL;
-                       u32 len;
-                       if (security_secid_to_secctx(
-                               n->osid, &ctx, &len)) {
-                               audit_log_format(ab, " osid=%u", n->osid);
-                               call_panic = 2;
-                       } else {
-                               audit_log_format(ab, " obj=%s", ctx);
-                               security_release_secctx(ctx, len);
-                       }
-               }
-
-               audit_log_fcaps(ab, n);
-
-               audit_log_end(ab);
-       }
+       i = 0;
+       list_for_each_entry(n, &context->names_list, list)
+               audit_log_name(context, n, i++, &call_panic);
 
        /* Send end of event record to help user space know we are finished */
        ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
  *
  * Called from copy_process and do_exit
  */
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
 {
        struct audit_context *context;
 
        context = audit_get_context(tsk, 0, 0);
-       if (likely(!context))
+       if (!context)
                return;
 
        /* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
  * will only be written if another part of the kernel requests that it
  * be written).
  */
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
                         unsigned long a1, unsigned long a2,
                         unsigned long a3, unsigned long a4)
 {
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
        struct audit_context *context = tsk->audit_context;
        enum audit_state     state;
 
-       if (unlikely(!context))
+       if (!context)
                return;
 
        /*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
                context->prio = 0;
                state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
        }
-       if (likely(state == AUDIT_DISABLED))
+       if (state == AUDIT_DISABLED)
                return;
 
        context->serial     = 0;
@@ -1658,30 +1861,9 @@ void audit_syscall_entry(int arch, int major,
        context->ppid       = 0;
 }
 
-void audit_finish_fork(struct task_struct *child)
-{
-       struct audit_context *ctx = current->audit_context;
-       struct audit_context *p = child->audit_context;
-       if (!p || !ctx)
-               return;
-       if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
-               return;
-       p->arch = ctx->arch;
-       p->major = ctx->major;
-       memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
-       p->ctime = ctx->ctime;
-       p->dummy = ctx->dummy;
-       p->in_syscall = ctx->in_syscall;
-       p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
-       p->ppid = current->pid;
-       p->prio = ctx->prio;
-       p->current_state = ctx->current_state;
-}
-
 /**
- * audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * __audit_syscall_exit - deallocate audit context after a system call
+ * @success: success/failure flag
+ * @return_code: syscall return value
  *
  * Tear down after system call.  If the audit context has been marked as
  * auditable (either because of the AUDIT_RECORD_CONTEXT state from
@@ -1689,14 +1871,18 @@ void audit_finish_fork(struct task_struct *child)
  * message), then write out the syscall information.  In all cases,
  * free the names stored from getname().
  */
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
 {
        struct task_struct *tsk = current;
        struct audit_context *context;
 
-       context = audit_get_context(tsk, valid, return_code);
+       if (success)
+               success = AUDITSC_SUCCESS;
+       else
+               success = AUDITSC_FAILURE;
 
-       if (likely(!context))
+       context = audit_get_context(tsk, success, return_code);
+       if (!context)
                return;
 
        if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2007,30 @@ retry:
 #endif
 }
 
+static struct audit_names *audit_alloc_name(struct audit_context *context)
+{
+       struct audit_names *aname;
+
+       if (context->name_count < AUDIT_NAMES) {
+               aname = &context->preallocated_names[context->name_count];
+               memset(aname, 0, sizeof(*aname));
+       } else {
+               aname = kzalloc(sizeof(*aname), GFP_NOFS);
+               if (!aname)
+                       return NULL;
+               aname->should_free = true;
+       }
+
+       aname->ino = (unsigned long)-1;
+       list_add_tail(&aname->list, &context->names_list);
+
+       context->name_count++;
+#if AUDIT_DEBUG
+       context->ino_count++;
+#endif
+       return aname;
+}
+
 /**
  * audit_getname - add a name to the list
  * @name: name to add
@@ -1831,9 +2041,7 @@ retry:
 void __audit_getname(const char *name)
 {
        struct audit_context *context = current->audit_context;
-
-       if (IS_ERR(name) || !name)
-               return;
+       struct audit_names *n;
 
        if (!context->in_syscall) {
 #if AUDIT_DEBUG == 2
@@ -1843,13 +2051,15 @@ void __audit_getname(const char *name)
 #endif
                return;
        }
-       BUG_ON(context->name_count >= AUDIT_NAMES);
-       context->names[context->name_count].name = name;
-       context->names[context->name_count].name_len = AUDIT_NAME_FULL;
-       context->names[context->name_count].name_put = 1;
-       context->names[context->name_count].ino  = (unsigned long)-1;
-       context->names[context->name_count].osid = 0;
-       ++context->name_count;
+
+       n = audit_alloc_name(context);
+       if (!n)
+               return;
+
+       n->name = name;
+       n->name_len = AUDIT_NAME_FULL;
+       n->name_put = true;
+
        if (!context->pwd.dentry)
                get_fs_pwd(current->fs, &context->pwd);
 }
@@ -1871,12 +2081,13 @@ void audit_putname(const char *name)
                printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
                       __FILE__, __LINE__, context->serial, name);
                if (context->name_count) {
+                       struct audit_names *n;
-                       int i;
+                       int i = 0;
-                       for (i = 0; i < context->name_count; i++)
-                               printk(KERN_ERR "name[%d] = %p = %s\n", i,
-                                      context->names[i].name,
-                                      context->names[i].name ?: "(null)");
+
+                       list_for_each_entry(n, &context->names_list, list)
+                               printk(KERN_ERR "name[%d] = %p = %s\n", i++,
+                                      n->name, n->name ?: "(null)");
                }
 #endif
                __putname(name);
        }
@@ -1897,39 +2108,11 @@ void audit_putname(const char *name)
 #endif
 }
 
-static int audit_inc_name_count(struct audit_context *context,
-                               const struct inode *inode)
-{
-       if (context->name_count >= AUDIT_NAMES) {
-               if (inode)
-                       printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
-                              "dev=%02x:%02x, inode=%lu\n",
-                              MAJOR(inode->i_sb->s_dev),
-                              MINOR(inode->i_sb->s_dev),
-                              inode->i_ino);
-
-               else
-                       printk(KERN_DEBUG "name_count maxed, losing inode data\n");
-               return 1;
-       }
-       context->name_count++;
-#if AUDIT_DEBUG
-       context->ino_count++;
-#endif
-       return 0;
-}
-
-
 static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
 {
        struct cpu_vfs_cap_data caps;
        int rc;
 
-       memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
-       memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
-       name->fcap.fE = 0;
-       name->fcap_ver = 0;
-
        if (!dentry)
                return 0;
 
@@ -1969,30 +2152,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
  */
 void __audit_inode(const char *name, const struct dentry *dentry)
 {
-       int idx;
        struct audit_context *context = current->audit_context;
        const struct inode *inode = dentry->d_inode;
+       struct audit_names *n;
 
        if (!context->in_syscall)
                return;
-       if (context->name_count
-           && context->names[context->name_count-1].name
-           && context->names[context->name_count-1].name == name)
-               idx = context->name_count - 1;
-       else if (context->name_count > 1
-                && context->names[context->name_count-2].name
-                && context->names[context->name_count-2].name == name)
-               idx = context->name_count - 2;
-       else {
-               /* FIXME: how much do we care about inodes that have no
-                * associated name? */
-               if (audit_inc_name_count(context, inode))
-                       return;
-               idx = context->name_count - 1;
-               context->names[idx].name = NULL;
+
+       list_for_each_entry_reverse(n, &context->names_list, list) {
+               if (n->name && (n->name == name))
+                       goto out;
        }
+
+       /* unable to find the name from a previous getname() */
+       n = audit_alloc_name(context);
+       if (!n)
+               return;
+out:
        handle_path(dentry);
-       audit_copy_inode(&context->names[idx], dentry, inode);
+       audit_copy_inode(n, dentry, inode);
 }
 
 /**
@@ -2011,11 +2189,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
 void __audit_inode_child(const struct dentry *dentry,
                         const struct inode *parent)
 {
-       int idx;
        struct audit_context *context = current->audit_context;
        const char *found_parent = NULL, *found_child = NULL;
        const struct inode *inode = dentry->d_inode;
        const char *dname = dentry->d_name.name;
+       struct audit_names *n;
        int dirlen = 0;
 
        if (!context->in_syscall)
@@ -2025,9 +2203,7 @@ void __audit_inode_child(const struct dentry *dentry,
                handle_one(inode);
 
        /* parent is more likely, look for it first */
-       for (idx = 0; idx < context->name_count; idx++) {
-               struct audit_names *n = &context->names[idx];
-
+       list_for_each_entry(n, &context->names_list, list) {
                if (!n->name)
                        continue;
 
@@ -2040,9 +2216,7 @@ void __audit_inode_child(const struct dentry *dentry,
        }
 
        /* no matching parent, look for matching child */
-       for (idx = 0; idx < context->name_count; idx++) {
-               struct audit_names *n = &context->names[idx];
-
+       list_for_each_entry(n, &context->names_list, list) {
                if (!n->name)
                        continue;
 
@@ -2060,34 +2234,29 @@ void __audit_inode_child(const struct dentry *dentry,
 
 add_names:
        if (!found_parent) {
-               if (audit_inc_name_count(context, parent))
+               n = audit_alloc_name(context);
+               if (!n)
                        return;
-               idx = context->name_count - 1;
-               context->names[idx].name = NULL;
-               audit_copy_inode(&context->names[idx], NULL, parent);
+               audit_copy_inode(n, NULL, parent);
        }
 
        if (!found_child) {
-               if (audit_inc_name_count(context, inode))
+               n = audit_alloc_name(context);
+               if (!n)
                        return;
-               idx = context->name_count - 1;
 
                /* Re-use the name belonging to the slot for a matching parent
                 * directory. All names for this context are relinquished in
                 * audit_free_names() */
                if (found_parent) {
-                       context->names[idx].name = found_parent;
-                       context->names[idx].name_len = AUDIT_NAME_FULL;
+                       n->name = found_parent;
+                       n->name_len = AUDIT_NAME_FULL;
                        /* don't call __putname() */
-                       context->names[idx].name_put = 0;
-               } else {
-                       context->names[idx].name = NULL;
+                       n->name_put = false;
                }
 
                if (inode)
-                       audit_copy_inode(&context->names[idx], NULL, inode);
-               else
-                       context->names[idx].ino = (unsigned long)-1;
+                       audit_copy_inode(n, NULL, inode);
        }
 }
 EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2290,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
 static atomic_t session_id = ATOMIC_INIT(0);
 
 /**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
  * @loginuid: loginuid value
  *
  * Returns 0 on success, -EPERM if the loginuid may not be set.
  *
  * Called (set) from fs/proc/base.c::proc_loginuid_write().
  */
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(uid_t loginuid)
 {
-       unsigned int sessionid = atomic_inc_return(&session_id);
+       struct task_struct *task = current;
        struct audit_context *context = task->audit_context;
+       unsigned int sessionid;
+
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+       if (task->loginuid != -1)
+               return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+       if (!capable(CAP_AUDIT_CONTROL))
+               return -EPERM;
+#endif  /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
 
+       sessionid = atomic_inc_return(&session_id);
        if (context && context->in_syscall) {
                struct audit_buffer *ab;
 
@@ -2271,14 +2449,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo
        context->ipc.has_perm = 1;
 }
 
-int audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
 {
        struct audit_aux_data_execve *ax;
        struct audit_context *context = current->audit_context;
 
-       if (likely(!audit_enabled || !context || context->dummy))
-               return 0;
-
        ax = kmalloc(sizeof(*ax), GFP_KERNEL);
        if (!ax)
                return -ENOMEM;
@@ -2299,13 +2474,10 @@ int audit_bprm(struct linux_binprm *bprm)
  * @args: args array
  *
  */
-void audit_socketcall(int nargs, unsigned long *args)
+void __audit_socketcall(int nargs, unsigned long *args)
 {
        struct audit_context *context = current->audit_context;
 
-       if (likely(!context || context->dummy))
-               return;
-
        context->type = AUDIT_SOCKETCALL;
        context->socketcall.nargs = nargs;
        memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2503,10 @@ void __audit_fd_pair(int fd1, int fd2)
  *
  * Returns 0 on success or if there is no audit context; < 0 on error.
  */
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
 {
        struct audit_context *context = current->audit_context;
 
-       if (likely(!context || context->dummy))
-               return 0;
-
        if (!context->sockaddr) {
                void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
                if (!p)
@@ -2499,6 +2668,25 @@ void __audit_mmap_fd(int fd, int flags)
        context->type = AUDIT_MMAP;
 }
 
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+       uid_t auid, uid;
+       gid_t gid;
+       unsigned int sessionid;
+
+       auid = audit_get_loginuid(current);
+       sessionid = audit_get_sessionid(current);
+       current_uid_gid(&uid, &gid);
+
+       audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+                        auid, uid, gid, sessionid);
+       audit_log_task_context(ab);
+       audit_log_format(ab, " pid=%d comm=", current->pid);
+       audit_log_untrustedstring(ab, current->comm);
+       audit_log_format(ab, " reason=");
+       audit_log_string(ab, reason);
+       audit_log_format(ab, " sig=%ld", signr);
+}
+
 /**
  * audit_core_dumps - record information about processes that end abnormally
  * @signr: signal value
@@ -2509,10 +2697,6 @@ void __audit_mmap_fd(int fd, int flags)
 void audit_core_dumps(long signr)
 {
        struct audit_buffer *ab;
-       u32 sid;
-       uid_t auid = audit_get_loginuid(current), uid;
-       gid_t gid;
-       unsigned int sessionid = audit_get_sessionid(current);
 
        if (!audit_enabled)
                return;
@@ -2521,24 +2705,17 @@ void audit_core_dumps(long signr)
                return;
 
        ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
-       current_uid_gid(&uid, &gid);
-       audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
-                        auid, uid, gid, sessionid);
-       security_task_getsecid(current, &sid);
-       if (sid) {
-               char *ctx = NULL;
-               u32 len;
+       audit_log_abend(ab, "memory violation", signr);
+       audit_log_end(ab);
+}
 
-               if (security_secid_to_secctx(sid, &ctx, &len))
-                       audit_log_format(ab, " ssid=%u", sid);
-               else {
-                       audit_log_format(ab, " subj=%s", ctx);
-                       security_release_secctx(ctx, len);
-               }
-       }
-       audit_log_format(ab, " pid=%d comm=", current->pid);
-       audit_log_untrustedstring(ab, current->comm);
-       audit_log_format(ab, " sig=%ld", signr);
+void __audit_seccomp(unsigned long syscall)
+{
+       struct audit_buffer *ab;
+
+       ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+       audit_log_abend(ab, "seccomp", SIGKILL);
+       audit_log_format(ab, " syscall=%ld", syscall);
        audit_log_end(ab);
 }
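
Throughout kernel/auditsc.c above, audit_free(), audit_syscall_entry(), audit_syscall_exit(), audit_bprm(), audit_socketcall() and audit_sockaddr() gain a double-underscore prefix: the cheap "is auditing active?" test moves out of the .c file into static inline wrappers, which is also why the likely()/unlikely() hints disappear from the out-of-line bodies. A minimal sketch of the wrapper shape this series puts in <linux/audit.h> (the exact guards there may differ):

static inline void audit_free(struct task_struct *task)
{
	if (unlikely(task->audit_context))
		__audit_free(task);
}

static inline int audit_sockaddr(int len, void *addr)
{
	if (unlikely(!audit_dummy_context()))
		return __audit_sockaddr(len, addr);
	return 0;
}

This is also why the kernel/exit.c hunk below can drop do_exit()'s own "if (unlikely(tsk->audit_context))" check before calling audit_free().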
 
index 0fcf1c1..3f1adb6 100644 (file)
@@ -384,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
                BUG();
        }
 
-       if (has_ns_capability(current, ns, cap)) {
+       if (security_capable(current_cred(), ns, cap) == 0) {
                current->flags |= PF_SUPERPRIV;
                return true;
        }
index c447382..294b170 100644 (file)
@@ -964,8 +964,7 @@ void do_exit(long code)
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
-       if (unlikely(tsk->audit_context))
-               audit_free(tsk);
+       audit_free(tsk);
 
        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);
index f3fa188..051f090 100644 (file)
@@ -1527,8 +1527,6 @@ long do_fork(unsigned long clone_flags,
                        init_completion(&vfork);
                }
 
-               audit_finish_fork(p);
-
                /*
                 * We set PF_STARTING at creation in case tracing wants to
                 * use this to distinguish a fully live task from one that
index 57d4b13..e8d76c5 100644 (file)
@@ -6,6 +6,7 @@
  * This defines a simple but solid secure-computing mode.
  */
 
+#include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/sched.h>
 #include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
 #ifdef SECCOMP_DEBUG
        dump_stack();
 #endif
+       audit_seccomp(this_syscall);
        do_exit(SIGKILL);
 }
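
With this hunk a seccomp kill leaves an AUDIT_ANOM_ABEND record instead of exiting silently. Following the same convention as the other audit hooks, audit_seccomp() is presumably a static inline gate around the __audit_seccomp() added to auditsc.c above; a sketch under that assumption:

static inline void audit_seccomp(unsigned long syscall)
{
	if (unlikely(!audit_dummy_context()))
		__audit_seccomp(syscall);
}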
 
index 201e1b3..169eb7c 100644 (file)
@@ -286,25 +286,24 @@ config CORDIC
          calculations are in fixed point. Module will be called cordic.
 
 config MPILIB
-       tristate "Multiprecision maths library"
+       tristate
        help
          Multiprecision maths library from GnuPG.
          It is used to implement RSA digital signature verification,
          which is used by IMA/EVM digital signature extension.
 
 config MPILIB_EXTRA
-       bool "Multiprecision maths library - additional sources"
+       bool
        depends on MPILIB
        help
-         Multiprecision maths library from GnuPG.
-         It is used to implement RSA digital signature verification,
-         which is used by IMA/EVM digital signature extension.
-         This code in unnecessary for RSA digital signature verification,
-         and can be compiled if needed.
+         Additional sources of multiprecision maths library from GnuPG.
+         This code is unnecessary for RSA digital signature verification,
+         but can be compiled if needed.
 
-config DIGSIG
-       tristate "In-kernel signature checker"
-       depends on KEYS
+config SIGNATURE
+       tristate
+       depends on KEYS && CRYPTO
+       select CRYPTO_SHA1
        select MPILIB
        help
          Digital signature verification. Currently only RSA is supported.
index dace162..d71aae1 100644 (file)
@@ -119,7 +119,7 @@ obj-$(CONFIG_CORDIC) += cordic.o
 obj-$(CONFIG_DQL) += dynamic_queue_limits.o
 
 obj-$(CONFIG_MPILIB) += mpi/
-obj-$(CONFIG_DIGSIG) += digsig.o
+obj-$(CONFIG_SIGNATURE) += digsig.o
 
 hostprogs-y    := gen_crc32table
 clean-files    := crc32table.h
index d384ea9..5bd1cc1 100644 (file)
@@ -3,11 +3,11 @@ config INTEGRITY
        def_bool y
        depends on IMA || EVM
 
-config INTEGRITY_DIGSIG
+config INTEGRITY_SIGNATURE
        boolean "Digital signature verification using multiple keyrings"
        depends on INTEGRITY && KEYS
        default n
-       select DIGSIG
+       select SIGNATURE
        help
          This option enables digital signature verification support
          using multiple keyrings. It defines separate keyrings for each
index bece056..d43799c 100644 (file)
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_INTEGRITY) += integrity.o
-obj-$(CONFIG_INTEGRITY_DIGSIG) += digsig.o
+obj-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o
 
 integrity-y := iint.o
 
index c5c5a72..2ad942f 100644 (file)
@@ -56,9 +56,11 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
                audit_log_format(ab, " name=");
                audit_log_untrustedstring(ab, fname);
        }
-       if (inode)
-               audit_log_format(ab, " dev=%s ino=%lu",
-                                inode->i_sb->s_id, inode->i_ino);
+       if (inode) {
+               audit_log_format(ab, " dev=");
+               audit_log_untrustedstring(ab, inode->i_sb->s_id);
+               audit_log_format(ab, " ino=%lu", inode->i_ino);
+       }
        audit_log_format(ab, " res=%d", !result ? 0 : 1);
        audit_log_end(ab);
 }
index 4da6ba8..7a25ece 100644 (file)
@@ -51,7 +51,7 @@ struct integrity_iint_cache *integrity_iint_find(struct inode *inode);
 #define INTEGRITY_KEYRING_IMA          2
 #define INTEGRITY_KEYRING_MAX          3
 
-#ifdef CONFIG_INTEGRITY_DIGSIG
+#ifdef CONFIG_INTEGRITY_SIGNATURE
 
 int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
                                        const char *digest, int digestlen);
@@ -65,7 +65,7 @@ static inline int integrity_digsig_verify(const unsigned int id,
        return -EOPNOTSUPP;
 }
 
-#endif /* CONFIG_INTEGRITY_DIGSIG */
+#endif /* CONFIG_INTEGRITY_SIGNATURE */
 
 /* set during initialization */
 extern int iint_initialized;
index 41144f7..2d1bb8a 100644 (file)
@@ -314,7 +314,7 @@ static struct key *request_user_key(const char *master_desc, u8 **master_key,
                goto error;
 
        down_read(&ukey->sem);
-       upayload = rcu_dereference(ukey->payload.data);
+       upayload = ukey->payload.data;
        *master_key = upayload->data;
        *master_keylen = upayload->datalen;
 error:
@@ -810,7 +810,7 @@ static int encrypted_instantiate(struct key *key, const void *data,
                goto out;
        }
 
-       rcu_assign_pointer(key->payload.data, epayload);
+       rcu_assign_keypointer(key, epayload);
 out:
        kfree(datablob);
        return ret;
@@ -874,7 +874,7 @@ static int encrypted_update(struct key *key, const void *data, size_t datalen)
        memcpy(new_epayload->payload_data, epayload->payload_data,
               epayload->payload_datalen);
 
-       rcu_assign_pointer(key->payload.data, new_epayload);
+       rcu_assign_keypointer(key, new_epayload);
        call_rcu(&epayload->rcu, encrypted_rcu_free);
 out:
        kfree(buf);
index df87272..013f7e5 100644 (file)
@@ -18,6 +18,8 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <keys/trusted-type.h>
+#include <keys/encrypted-type.h>
+#include "encrypted.h"
 
 /*
  * request_trusted_key - request the trusted key
@@ -37,7 +39,7 @@ struct key *request_trusted_key(const char *trusted_desc,
                goto error;
 
        down_read(&tkey->sem);
-       tpayload = rcu_dereference(tkey->payload.data);
+       tpayload = tkey->payload.data;
        *master_key = tpayload->key;
        *master_keylen = tpayload->key_len;
 error:
index bf4d8da..a42b455 100644 (file)
@@ -145,7 +145,9 @@ static void key_gc_keyring(struct key *keyring, time_t limit)
        if (!klist)
                goto unlock_dont_gc;
 
-       for (loop = klist->nkeys - 1; loop >= 0; loop--) {
+       loop = klist->nkeys;
+       smp_rmb();
+       for (loop--; loop >= 0; loop--) {
                key = klist->keys[loop];
                if (test_bit(KEY_FLAG_DEAD, &key->flags) ||
                    (key->expiry > 0 && key->expiry <= limit))
index 37a7f3b..d605f75 100644 (file)
@@ -319,7 +319,7 @@ key_ref_t keyring_search_aux(key_ref_t keyring_ref,
        struct key *keyring, *key;
        key_ref_t key_ref;
        long err;
-       int sp, kix;
+       int sp, nkeys, kix;
 
        keyring = key_ref_to_ptr(keyring_ref);
        possessed = is_key_possessed(keyring_ref);
@@ -380,7 +380,9 @@ descend:
                goto not_this_keyring;
 
        /* iterate through the keys in this keyring first */
-       for (kix = 0; kix < keylist->nkeys; kix++) {
+       nkeys = keylist->nkeys;
+       smp_rmb();
+       for (kix = 0; kix < nkeys; kix++) {
                key = keylist->keys[kix];
                kflags = key->flags;
 
@@ -421,7 +423,9 @@ descend:
        /* search through the keyrings nested in this one */
        kix = 0;
 ascend:
-       for (; kix < keylist->nkeys; kix++) {
+       nkeys = keylist->nkeys;
+       smp_rmb();
+       for (; kix < nkeys; kix++) {
                key = keylist->keys[kix];
                if (key->type != &key_type_keyring)
                        continue;
@@ -515,7 +519,7 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
        struct keyring_list *klist;
        unsigned long possessed;
        struct key *keyring, *key;
-       int loop;
+       int nkeys, loop;
 
        keyring = key_ref_to_ptr(keyring_ref);
        possessed = is_key_possessed(keyring_ref);
@@ -524,7 +528,9 @@ key_ref_t __keyring_search_one(key_ref_t keyring_ref,
 
        klist = rcu_dereference(keyring->payload.subscriptions);
        if (klist) {
-               for (loop = 0; loop < klist->nkeys; loop++) {
+               nkeys = klist->nkeys;
+               smp_rmb();
+               for (loop = 0; loop < nkeys; loop++) {
                        key = klist->keys[loop];
 
                        if (key->type == ktype &&
@@ -622,7 +628,7 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 
        struct keyring_list *keylist;
        struct key *subtree, *key;
-       int sp, kix, ret;
+       int sp, nkeys, kix, ret;
 
        rcu_read_lock();
 
@@ -645,7 +651,9 @@ descend:
 
 ascend:
        /* iterate through the remaining keys in this keyring */
-       for (; kix < keylist->nkeys; kix++) {
+       nkeys = keylist->nkeys;
+       smp_rmb();
+       for (; kix < nkeys; kix++) {
                key = keylist->keys[kix];
 
                if (key == A)
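
Each search loop above now snapshots klist->nkeys into a local and issues smp_rmb() before indexing keys[]: the barrier keeps the count read from being reordered after the slot reads, and the local copy keeps the loop bound stable while a concurrent writer appends. For this to be safe, the update side must publish the key pointer before bumping the count; a sketch of the writer-side ordering this presumably pairs with (cf. __key_link() in security/keys/keyring.c):

	klist->keys[klist->nkeys] = key;	/* fill the slot first ... */
	smp_wmb();				/* ... order it before ... */
	klist->nkeys++;				/* ... making it visible */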
index 0ed5fdf..2d5d041 100644 (file)
@@ -993,7 +993,7 @@ out:
        kfree(datablob);
        kfree(options);
        if (!ret)
-               rcu_assign_pointer(key->payload.data, payload);
+               rcu_assign_keypointer(key, payload);
        else
                kfree(payload);
        return ret;
@@ -1067,7 +1067,7 @@ static int trusted_update(struct key *key, const void *data, size_t datalen)
                        goto out;
                }
        }
-       rcu_assign_pointer(key->payload.data, new_p);
+       rcu_assign_keypointer(key, new_p);
        call_rcu(&p->rcu, trusted_rcu_free);
 out:
        kfree(datablob);
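
The trusted and encrypted key types stop open-coding rcu_assign_pointer() on key->payload.data, and the reads taken under down_read(&key->sem) drop rcu_dereference(), since the semaphore already excludes updaters and a plain pointer read is safe there. The new helper is presumably defined along these lines in <linux/key.h> (the field name is an assumption inferred from the call sites):

/* assumed definition; the key payload union would gain an __rcu member */
#define rcu_assign_keypointer(KEY, PAYLOAD) \
	rcu_assign_pointer((KEY)->payload.rcudata, (PAYLOAD))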
index 7bd6f13..293b8c4 100644 (file)
@@ -232,13 +232,14 @@ static void dump_common_audit_data(struct audit_buffer *ab,
        case LSM_AUDIT_DATA_PATH: {
                struct inode *inode;
 
-               audit_log_d_path(ab, "path=", &a->u.path);
+               audit_log_d_path(ab, " path=", &a->u.path);
 
                inode = a->u.path.dentry->d_inode;
-               if (inode)
-                       audit_log_format(ab, " dev=%s ino=%lu",
-                                       inode->i_sb->s_id,
-                                       inode->i_ino);
+               if (inode) {
+                       audit_log_format(ab, " dev=");
+                       audit_log_untrustedstring(ab, inode->i_sb->s_id);
+                       audit_log_format(ab, " ino=%lu", inode->i_ino);
+               }
                break;
        }
        case LSM_AUDIT_DATA_DENTRY: {
@@ -248,10 +249,11 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                audit_log_untrustedstring(ab, a->u.dentry->d_name.name);
 
                inode = a->u.dentry->d_inode;
-               if (inode)
-                       audit_log_format(ab, " dev=%s ino=%lu",
-                                       inode->i_sb->s_id,
-                                       inode->i_ino);
+               if (inode) {
+                       audit_log_format(ab, " dev=");
+                       audit_log_untrustedstring(ab, inode->i_sb->s_id);
+                       audit_log_format(ab, " ino=%lu", inode->i_ino);
+               }
                break;
        }
        case LSM_AUDIT_DATA_INODE: {
@@ -266,8 +268,9 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                                         dentry->d_name.name);
                        dput(dentry);
                }
-               audit_log_format(ab, " dev=%s ino=%lu", inode->i_sb->s_id,
-                                inode->i_ino);
+               audit_log_format(ab, " dev=");
+               audit_log_untrustedstring(ab, inode->i_sb->s_id);
+               audit_log_format(ab, " ino=%lu", inode->i_ino);
                break;
        }
        case LSM_AUDIT_DATA_TASK:
@@ -315,7 +318,7 @@ static void dump_common_audit_data(struct audit_buffer *ab,
                                                .dentry = u->dentry,
                                                .mnt = u->mnt
                                        };
-                                       audit_log_d_path(ab, "path=", &path);
+                                       audit_log_d_path(ab, " path=", &path);
                                        break;
                                }
                                if (!u->addr)
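
Here and in integrity_audit_msg() above, the superblock id is no longer printed with a bare %s: s_id can contain characters chosen by whoever supplied the mount source, so it goes through audit_log_untrustedstring(), which quotes or hex-encodes anything that could break the key=value format of the record. The recurring shape, pulled out as a sketch (audit_log_dev_ino is a hypothetical helper, not part of the patch):

static void audit_log_dev_ino(struct audit_buffer *ab,
			      const struct inode *inode)
{
	audit_log_format(ab, " dev=");
	audit_log_untrustedstring(ab, inode->i_sb->s_id); /* possibly hostile */
	audit_log_format(ab, " ino=%lu", inode->i_ino);
}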
index 4a9b4b2..867558c 100644 (file)
@@ -492,13 +492,13 @@ static bool tomoyo_correct_word2(const char *string, size_t len)
                                if (d < '0' || d > '7' || e < '0' || e > '7')
                                        break;
                                c = tomoyo_make_byte(c, d, e);
-                               if (tomoyo_invalid(c))
-                                       continue; /* pattern is not \000 */
+                               if (c <= ' ' || c >= 127)
+                                       continue;
                        }
                        goto out;
                } else if (in_repetition && c == '/') {
                        goto out;
-               } else if (tomoyo_invalid(c)) {
+               } else if (c <= ' ' || c >= 127) {
                        goto out;
                }
        }
index 6fd9391..4fa1dbd 100644 (file)
@@ -133,7 +133,7 @@ static int atmel_abdac_prepare_dma(struct atmel_abdac *dac,
        period_len = frames_to_bytes(runtime, runtime->period_size);
 
        cdesc = dw_dma_cyclic_prep(chan, runtime->dma_addr, buffer_len,
-                       period_len, DMA_TO_DEVICE);
+                       period_len, DMA_MEM_TO_DEV);
        if (IS_ERR(cdesc)) {
                dev_dbg(&dac->pdev->dev, "could not prepare cyclic DMA\n");
                return PTR_ERR(cdesc);
index 73516f6..61dade6 100644 (file)
@@ -102,7 +102,7 @@ static void atmel_ac97c_dma_capture_period_done(void *arg)
 
 static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip,
                struct snd_pcm_substream *substream,
-               enum dma_data_direction direction)
+               enum dma_transfer_direction direction)
 {
        struct dma_chan                 *chan;
        struct dw_cyclic_desc           *cdesc;
@@ -118,7 +118,7 @@ static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip,
                return -EINVAL;
        }
 
-       if (direction == DMA_TO_DEVICE)
+       if (direction == DMA_MEM_TO_DEV)
                chan = chip->dma.tx_chan;
        else
                chan = chip->dma.rx_chan;
@@ -133,7 +133,7 @@ static int atmel_ac97c_prepare_dma(struct atmel_ac97c *chip,
                return PTR_ERR(cdesc);
        }
 
-       if (direction == DMA_TO_DEVICE) {
+       if (direction == DMA_MEM_TO_DEV) {
                cdesc->period_callback = atmel_ac97c_dma_playback_period_done;
                set_bit(DMA_TX_READY, &chip->flags);
        } else {
@@ -393,7 +393,7 @@ static int atmel_ac97c_playback_prepare(struct snd_pcm_substream *substream)
        if (cpu_is_at32ap7000()) {
                if (!test_bit(DMA_TX_READY, &chip->flags))
                        retval = atmel_ac97c_prepare_dma(chip, substream,
-                                       DMA_TO_DEVICE);
+                                       DMA_MEM_TO_DEV);
        } else {
                /* Initialize and start the PDC */
                writel(runtime->dma_addr, chip->regs + ATMEL_PDC_TPR);
@@ -484,7 +484,7 @@ static int atmel_ac97c_capture_prepare(struct snd_pcm_substream *substream)
        if (cpu_is_at32ap7000()) {
                if (!test_bit(DMA_RX_READY, &chip->flags))
                        retval = atmel_ac97c_prepare_dma(chip, substream,
-                                       DMA_FROM_DEVICE);
+                                       DMA_DEV_TO_MEM);
        } else {
                /* Initialize and start the PDC */
                writel(runtime->dma_addr, chip->regs + ATMEL_PDC_RPR);
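
The sound hunks here and below track the dmaengine API split: slave and cyclic transfer setup now takes the dedicated enum dma_transfer_direction (DMA_MEM_TO_DEV / DMA_DEV_TO_MEM) instead of overloading the DMA-mapping enum dma_data_direction (DMA_TO_DEVICE / DMA_FROM_DEVICE). For a driver that still holds a mapping direction, the translation is mechanical; a sketch, assuming only the two slave cases are possible:

static enum dma_transfer_direction
to_transfer_dir(enum dma_data_direction dir)
{
	return dir == DMA_TO_DEVICE ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM;
}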
index ad40938..b413ed0 100644 (file)
@@ -12,6 +12,9 @@ config SND_HWDEP
 config SND_RAWMIDI
        tristate
 
+config SND_COMPRESS_OFFLOAD
+       tristate
+
 # To be effective this also requires INPUT - users should say:
 #    select SND_JACK if INPUT=y || INPUT=SND
 # to avoid having to force INPUT on.
@@ -154,16 +157,6 @@ config SND_DYNAMIC_MINORS
 
          If you are unsure about this, say N here.
 
-config SND_COMPRESS_OFFLOAD
-       tristate "ALSA Compressed audio offload support"
-       default n
-       help
-         If you want support for offloading compressed audio and have such
-         a hardware, then you should say Y here and also to the DSP driver
-         of your platform.
-
-         If you are unsure about this, say N here.
-
 config SND_SUPPORT_OLD_API
        bool "Support old ALSA API"
        default y
index 762bb10..f13ad53 100644 (file)
@@ -268,8 +268,14 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
                card->shortname, chip->io, chip->irq);
 
        // (4) Alloc components.
+       err = snd_vortex_mixer(chip);
+       if (err < 0) {
+               snd_card_free(card);
+               return err;
+       }
        // ADB pcm.
-       if ((err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_ADB)) < 0) {
+       err = snd_vortex_new_pcm(chip, VORTEX_PCM_ADB, NR_PCM);
+       if (err < 0) {
                snd_card_free(card);
                return err;
        }
@@ -299,11 +305,6 @@ snd_vortex_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
                return err;
        }
 #endif
-       // snd_ac97_mixer and Vortex mixer.
-       if ((err = snd_vortex_mixer(chip)) < 0) {
-               snd_card_free(card);
-               return err;
-       }
        if ((err = snd_vortex_midi(chip)) < 0) {
                snd_card_free(card);
                return err;
index 02f6e08..bb93815 100644 (file)
 #define MIX_SPDIF(x) (vortex->mixspdif[x])
 
 #define NR_WTPB 0x20           /* WT channels per each bank. */
+#define NR_PCM 0x10
 
 /* Structs */
 typedef struct {
index 0488633..0ef2f97 100644 (file)
@@ -168,6 +168,7 @@ static int snd_vortex_pcm_open(struct snd_pcm_substream *substream)
                        runtime->hw = snd_vortex_playback_hw_adb;
 #ifdef CHIP_AU8830
                if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK &&
+                       VORTEX_IS_QUAD(vortex) &&
                        VORTEX_PCM_TYPE(substream->pcm) == VORTEX_PCM_ADB) {
                        runtime->hw.channels_max = 4;
                        snd_pcm_hw_constraint_list(runtime, 0,
index 0852e20..fb35474 100644 (file)
@@ -2498,6 +2498,7 @@ static struct snd_pci_quirk position_fix_list[] __devinitdata = {
        SND_PCI_QUIRK(0x1043, 0x81b3, "ASUS", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1043, 0x81e7, "ASUS M2V", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x104d, 0x9069, "Sony VPCS11V9E", POS_FIX_LPIB),
+       SND_PCI_QUIRK(0x10de, 0xcb89, "Macbook Pro 7,1", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1297, 0x3166, "Shuttle", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1458, 0xa022, "ga-ma770-ud3", POS_FIX_LPIB),
        SND_PCI_QUIRK(0x1462, 0x1002, "MSI Wind U115", POS_FIX_LPIB),
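
The added entry pins the MacBook Pro 7,1 (PCI SSID 10de:cb89, MCP89 HD-audio) to POS_FIX_LPIB, i.e. reading the stream position from the LPIB register because the DMA position buffer is unreliable there. SND_PCI_QUIRK() keys on the subsystem vendor/device pair; a simplified sketch of how the driver consults such a table (choose_position_fix is hypothetical, struct azx is the driver's chip type):

static int choose_position_fix(struct azx *chip)
{
	const struct snd_pci_quirk *q;

	q = snd_pci_quirk_lookup(chip->pci, position_fix_list);
	return q ? q->value : POS_FIX_AUTO;
}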
index 87e684f..3556408 100644 (file)
@@ -1596,7 +1596,7 @@ static const struct snd_pci_quirk stac92hd73xx_cfg_tbl[] = {
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02bd,
                                "Dell Studio 1557", STAC_DELL_M6_DMIC),
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x02fe,
-                               "Dell Studio XPS 1645", STAC_DELL_M6_BOTH),
+                               "Dell Studio XPS 1645", STAC_DELL_M6_DMIC),
        SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0413,
                                "Dell Studio 1558", STAC_DELL_M6_DMIC),
        {} /* terminator */
index 478303e..63cff90 100644 (file)
@@ -177,6 +177,7 @@ static void wm8776_registers_init(struct oxygen *chip)
        struct xonar_wm87x6 *data = chip->model_data;
 
        wm8776_write(chip, WM8776_RESET, 0);
+       wm8776_write(chip, WM8776_PHASESWAP, WM8776_PH_MASK);
        wm8776_write(chip, WM8776_DACCTRL1, WM8776_DZCEN |
                     WM8776_PL_LEFT_LEFT | WM8776_PL_RIGHT_RIGHT);
        wm8776_write(chip, WM8776_DACMUTE, chip->dac_mute ? WM8776_DMUTE : 0);
index 3fc9613..de83904 100644 (file)
@@ -113,9 +113,9 @@ static int ep93xx_pcm_open(struct snd_pcm_substream *substream)
        rtd->dma_data.name = dma_params->name;
 
        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
-               rtd->dma_data.direction = DMA_TO_DEVICE;
+               rtd->dma_data.direction = DMA_MEM_TO_DEV;
        else
-               rtd->dma_data.direction = DMA_FROM_DEVICE;
+               rtd->dma_data.direction = DMA_DEV_TO_MEM;
 
        rtd->dma_chan = dma_request_channel(mask, ep93xx_pcm_dma_filter,
                                            &rtd->dma_data);
index 1cf2fe8..aecdba9 100644 (file)
@@ -107,12 +107,12 @@ static int imx_ssi_dma_alloc(struct snd_pcm_substream *substream,
        }
 
        if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) {
-               slave_config.direction = DMA_TO_DEVICE;
+               slave_config.direction = DMA_MEM_TO_DEV;
                slave_config.dst_addr = dma_params->dma_addr;
                slave_config.dst_addr_width = buswidth;
                slave_config.dst_maxburst = dma_params->burstsize;
        } else {
-               slave_config.direction = DMA_FROM_DEVICE;
+               slave_config.direction = DMA_DEV_TO_MEM;
                slave_config.src_addr = dma_params->dma_addr;
                slave_config.src_addr_width = buswidth;
                slave_config.src_maxburst = dma_params->burstsize;
@@ -159,7 +159,7 @@ static int snd_imx_pcm_hw_params(struct snd_pcm_substream *substream,
                        iprtd->period_bytes * iprtd->periods,
                        iprtd->period_bytes,
                        substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
-                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+                       DMA_MEM_TO_DEV : DMA_DEV_TO_MEM);
        if (!iprtd->desc) {
                dev_err(&chan->dev->device, "cannot prepare slave dma\n");
                return -EINVAL;
index 0e12f4e..105f42a 100644 (file)
@@ -136,7 +136,7 @@ static int snd_mxs_pcm_hw_params(struct snd_pcm_substream *substream,
                        iprtd->period_bytes * iprtd->periods,
                        iprtd->period_bytes,
                        substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
-                       DMA_TO_DEVICE : DMA_FROM_DEVICE);
+                       DMA_MEM_TO_DEV : DMA_DEV_TO_MEM);
        if (!iprtd->desc) {
                dev_err(&chan->dev->device, "cannot prepare slave dma\n");
                return -EINVAL;
index 427ae0d..e4ba17c 100644 (file)
@@ -86,7 +86,7 @@ static void dma_enqueue(struct snd_pcm_substream *substream)
        dma_info.cap = (samsung_dma_has_circular() ? DMA_CYCLIC : DMA_SLAVE);
        dma_info.direction =
                (substream->stream == SNDRV_PCM_STREAM_PLAYBACK
-               ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+               ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM);
        dma_info.fp = audio_buffdone;
        dma_info.fp_param = substream;
        dma_info.period = prtd->dma_period;
@@ -171,7 +171,7 @@ static int dma_hw_params(struct snd_pcm_substream *substream,
                dma_info.client = prtd->params->client;
                dma_info.direction =
                        (substream->stream == SNDRV_PCM_STREAM_PLAYBACK
-                       ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+                       ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM);
                dma_info.width = prtd->params->dma_size;
                dma_info.fifo = prtd->params->dma_addr;
                prtd->params->ch = prtd->params->ops->request(
index f8f6816..0193e59 100644 (file)
@@ -131,7 +131,7 @@ static int siu_pcm_wr_set(struct siu_port *port_info,
        sg_dma_address(&sg) = buff;
 
        desc = siu_stream->chan->device->device_prep_slave_sg(siu_stream->chan,
-               &sg, 1, DMA_TO_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+               &sg, 1, DMA_MEM_TO_DEV, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                dev_err(dev, "Failed to allocate a dma descriptor\n");
                return -ENOMEM;
@@ -181,7 +181,7 @@ static int siu_pcm_rd_set(struct siu_port *port_info,
        sg_dma_address(&sg) = buff;
 
        desc = siu_stream->chan->device->device_prep_slave_sg(siu_stream->chan,
-               &sg, 1, DMA_FROM_DEVICE, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
+               &sg, 1, DMA_DEV_TO_MEM, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                dev_err(dev, "Failed to allocate dma descriptor\n");
                return -ENOMEM;
index 93931de..2155461 100644 (file)
@@ -134,7 +134,7 @@ txx9aclc_dma_submit(struct txx9aclc_dmadata *dmadata, dma_addr_t buf_dma_addr)
        sg_dma_address(&sg) = buf_dma_addr;
        desc = chan->device->device_prep_slave_sg(chan, &sg, 1,
                dmadata->substream->stream == SNDRV_PCM_STREAM_PLAYBACK ?
-               DMA_TO_DEVICE : DMA_FROM_DEVICE,
+               DMA_MEM_TO_DEV : DMA_DEV_TO_MEM,
                DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
        if (!desc) {
                dev_err(&chan->dev->device, "cannot prepare slave dma\n");